27 lines
908 B
Python
27 lines
908 B
Python
|
from typing import List
|
||
|
from haystack.nodes import PreProcessor
|
||
|
from haystack import Document
|
||
|
|
||
|
|
||
|
class TextPreprocessor:
    """Wrap a Haystack ``PreProcessor`` configured for cleaning and splitting.

    The configured instance is stored on ``self.preprocessor`` and used by
    :meth:`preprocess_docs_haystack_format`.
    """

    def __init__(
        self,
        *,
        split_by: str = "word",
        split_length: int = 100,
        clean_empty_lines: bool = True,
        clean_whitespace: bool = True,
        clean_header_footer: bool = True,
        split_respect_sentence_boundary: bool = True,
    ) -> None:
        """Create the underlying ``PreProcessor``.

        Every argument is keyword-only and is forwarded unchanged to the
        ``PreProcessor`` option of the same name. The defaults are the values
        this class used to hard-code, so ``TextPreprocessor()`` yields the
        same configuration as before.

        Args:
            split_by: Unit used to split documents.
            split_length: Maximum number of ``split_by`` units per chunk.
            clean_empty_lines: Forwarded as ``clean_empty_lines``.
            clean_whitespace: Forwarded as ``clean_whitespace``.
            clean_header_footer: Forwarded as ``clean_header_footer``.
            split_respect_sentence_boundary: Forwarded as
                ``split_respect_sentence_boundary``.
        """
        self.preprocessor = PreProcessor(
            split_by=split_by,
            split_length=split_length,
            clean_empty_lines=clean_empty_lines,
            clean_whitespace=clean_whitespace,
            clean_header_footer=clean_header_footer,
            split_respect_sentence_boundary=split_respect_sentence_boundary,
        )

    def preprocess_docs_haystack_format(self, docs: List[Document]) -> List[Document]:
        """Run ``docs`` through the configured preprocessor.

        Args:
            docs: Haystack ``Document`` objects to process.

        Returns:
            Whatever ``self.preprocessor.process(docs)`` returns, unmodified.
        """
        # NOTE: an extra pass that replaced "\n" with " " in each processed
        # document's content used to be sketched here but was disabled, so the
        # preprocessor's output is handed back as-is.
        return self.preprocessor.process(docs)
|