from typing import List

from haystack.nodes import PreProcessor
from haystack import Document


class TextPreprocessor:
    """Wrap a Haystack ``PreProcessor`` built with one fixed configuration.

    The wrapped instance is stored on ``self.preprocessor`` and is created
    once, when this object is constructed.
    """

    def __init__(self) -> None:
        """Create the underlying ``PreProcessor`` with the fixed settings."""
        # Settings are passed through as keyword arguments, unchanged:
        # word-based splitting with a split length of 100, sentence
        # boundaries respected, and all three cleaning flags switched on.
        settings = {
            "split_by": "word",
            "split_length": 100,
            "clean_empty_lines": True,
            "clean_whitespace": True,
            "clean_header_footer": True,
            "split_respect_sentence_boundary": True,
        }
        self.preprocessor = PreProcessor(**settings)

    def preprocess_docs_haystack_format(self, docs: List[Document]) -> List[Document]:
        """Run *docs* through the configured ``PreProcessor``.

        Args:
            docs: Haystack documents to process.

        Returns:
            Whatever ``PreProcessor.process`` returns for *docs*; no further
            post-processing is applied here.
        """
        # NOTE: an earlier post-processing step that replaced "\n" with " "
        # in each processed document's content is intentionally disabled, so
        # newlines are left exactly as the PreProcessor emits them.
        return self.preprocessor.process(docs)