# Source: BA-Chatbot/data_service/preprocessing/haystack_preprosessor.py (27 lines, 908 B, Python, executable file)

from typing import List
from haystack.nodes import PreProcessor
from haystack import Document
class TextPreprocessor:
    """Wrap a Haystack ``PreProcessor`` configured for word-based splitting.

    The wrapped instance is created once in ``__init__`` and stored on
    ``self.preprocessor``. It is built with empty-line, whitespace and
    header/footer cleaning enabled, and with
    ``split_respect_sentence_boundary=True``.
    """

    def __init__(self, *, split_length: int = 100) -> None:
        """Create the underlying ``PreProcessor``.

        Args:
            split_length: Forwarded unchanged as ``PreProcessor``'s
                ``split_length`` argument. ``split_by`` is fixed to
                ``"word"``, so per the Haystack documentation this is the
                chunk size in words. Defaults to 100, the value that used
                to be hard-coded here.
        """
        # split_by stays fixed: Haystack only supports
        # split_respect_sentence_boundary=True together with split_by="word".
        self.preprocessor = PreProcessor(
            split_by="word",
            split_length=split_length,
            clean_empty_lines=True,
            clean_whitespace=True,
            clean_header_footer=True,
            split_respect_sentence_boundary=True,
        )

    def preprocess_docs_haystack_format(
        self, docs: List[Document]
    ) -> List[Document]:
        """Run ``docs`` through the configured ``PreProcessor``.

        Args:
            docs: Haystack documents to clean and split.

        Returns:
            Whatever ``self.preprocessor.process(docs)`` returns, passed
            through without further modification.
        """
        return self.preprocessor.process(docs)