# BA-Chatbot_Ali_Thesis/data_service/data_indexer.py

from api.embeddingsServiceCaller import EmbeddingServiceCaller
from preprocessing.haystack_preprosessor import TextPreprocessor
from converter.pdf_converter import PDFConverter
from pipeline.indexing_pipeline import CustomPipeline

# from parser.module_handbook_parser.IB_parser import parse_ib_handbook
from parser.paper_parser import PaperParser
import os
import json
from typing import List
from haystack.schema import Document
from tqdm import tqdm
import copy


"""
This script performs comprehensive data parsing and indexing for a variety of documents including academic papers, module handbooks, and web crawled data.
It primarily focuses on processing PDF files to extract essential information, then generating embeddings and storing these documents in a database. 
The script utilizes several key components: CustomPipeline for embedding and indexing, TextPreprocessor for data preprocessing, PDFConverter for converting PDFs to text, and PaperParser for parsing academic papers. 

NOTE: It requires running Docker containers for services like Elasticsearch, Weaviate, and GROBID. 
NOTE: Before executing the script, ensure that necessary PDFs and crawled data are placed in the "./data" directory. 
The script handles the conversion of module handbooks and papers, preprocesses them, and finally indexes them in the appropriate document stores, making the information readily accessible for retrieval and analysis tasks.
"""
# Shared service objects used by the functions and the script section below.
#
# SECURITY: the API key must never be committed to source control. It is read
# from the environment instead. Any key that was previously hard-coded in this
# file has to be treated as leaked and revoked with the provider.
# NOTE(review): the variable name OPENAI_API_KEY assumes CustomPipeline expects
# an OpenAI key - confirm against pipeline/indexing_pipeline.py.
_api_key = os.environ.get("OPENAI_API_KEY")
if not _api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it before running data_indexer.py."
    )
pipeline = CustomPipeline(api_key=_api_key)
preprocessor = TextPreprocessor()
caller = EmbeddingServiceCaller()
paper_parser = PaperParser()


def parse_and_index_paper():
    """Parse the headers of all paper PDFs and store them in the database.

    Walks ``./data/paper/<author>/`` where every sub-directory name is used as
    the ``author`` value. For each file whose name ends in ``.pdf`` (checked
    case-insensitively) ``paper_parser.parseHeader`` is called. Papers for
    which no title or no abstract was extracted are skipped.

    The collected records are handed to ``index_data`` under the index
    ``"paper"`` and afterwards dumped to ``parsed_pdfs.json`` in the current
    working directory.
    """
    root = "./data/paper"
    records = []

    for author in os.listdir(root):
        author_dir = os.path.join(root, author)
        # Only directories are treated as author folders.
        if not os.path.isdir(author_dir):
            continue

        for entry in os.listdir(author_dir):
            if not entry.lower().endswith(".pdf"):
                continue

            pdf_path = os.path.join(author_dir, entry)
            print()
            print(f"Parse Paper: {pdf_path} .....")
            print()

            header = paper_parser.parseHeader(pdf_path)
            abstract = header.get("abstract")
            keywords = header.get("keywords")
            title = header.get("title")

            # Both title and abstract are required to build the content field.
            if not (abstract and title):
                continue

            record = {
                "content": f"{title}\n{abstract}",
                "file_path": pdf_path,
                "title": title,
                "abstract": abstract,
                "keywords": keywords,
                "author": author,
            }
            records.append(record)

    index_data(records, "paper")

    with open("parsed_pdfs.json", "w") as out_file:
        json.dump(records, out_file)


def index_data(data: List[Document], index):
    """Create embeddings for each document and store both together.

    For every item an embedding is requested from each of the three retrievers
    (ada, distilbert, mpnet) and stored on a dict representation of the item
    under the ``embedding_field`` of the matching document store. All resulting
    dicts are then written to ``pipeline.doc_store_mpnet``.

    Args:
        data: Items to index. Each item is either a dict (converted with
            ``Document.from_dict`` for embedding) or an object exposing
            ``to_dict()``.
        index: Name of the target index, forwarded as ``index=`` to
            ``write_documents``.

    The embedding keys are added to a deep copy of each item, not to the
    object the caller passed in.
    """
    # Previous approach, kept for reference:
    # pipeline.doc_store.write_documents(data, index)
    # pipeline.doc_store.update_embeddings(pipeline.emb_retriever, index)
    enriched = []
    for entry in data:
        if isinstance(entry, dict):
            record = copy.deepcopy(entry)
            document = Document.from_dict(entry)
        else:
            record = copy.deepcopy(entry).to_dict()
            document = entry

        # Order matters if two stores share the same embedding field name:
        # the later assignment overwrites the earlier one.
        for store, retriever in (
            (pipeline.doc_store_ada, pipeline.emb_retriever_ada),
            (pipeline.doc_store_distilbert, pipeline.retriever_distilbert),
            (pipeline.doc_store_mpnet, pipeline.emb_retriever_mpnet),
        ):
            vector = retriever.embed_documents([document])[0]
            record[store.embedding_field] = vector

        enriched.append(record)

    pipeline.doc_store_mpnet.write_documents(enriched, index=index)
    # NOTE: Uncomment to store LLaMA embeddings in Weaviate
    # indexed_data = pipeline.doc_store_mpnet.get_all_documents(index=index)
    # embedded_docs: List[Document] = []
    # for doc in tqdm(indexed_data):
    #     content = doc.content
    #     item = {
    #         "content": content,
    #         "embedding": caller.get_embeddings(content, embedding_type="last_layer"),
    #         "es_id": doc.id,
    #     }
    #     embedded_docs.append(item)
    # pipeline.vector_doc_store_llama.write_documents(embedded_docs, index=index)
    # pipeline.vector_doc_store_llama.update_embeddings(pipeline.llama_retriever, index=index)


# --- Script section: runs on execution/import and indexes all data sources ---
directory = "./data"
converter = PDFConverter()
parsed_wpms = {}
parsed_modules = {}
converted_stupo = None

# 1) Convert, preprocess and index the single regulations PDF (index "stupo").
#    The exact-name comparison already implies the ".pdf" suffix.
for filename in os.listdir(directory):
    full_path = os.path.join(directory, filename)
    if filename == "Juni_2023_SPO_Bachelor.pdf":
        converted_stupo = converter.convert_pdf_to_text_haystack(full_path)
        data = preprocessor.preprocess_docs_haystack_format(converted_stupo)
        index_data(data, "stupo")

# 2) Index the module handbook JSON as-is (index "ib").
#    JSON is read explicitly as UTF-8 so non-ASCII text is decoded the same way
#    on every platform; the file is closed before the long-running indexing.
with open(f"{directory}/adjusted_modulhandbuch_ib.json", "r", encoding="utf-8") as f:
    data = json.load(f)
index_data(data, "ib")

# 3) Preprocess and index the crawled web data (index "crawled_hsma").
with open(f"{directory}/crawled_hsma_web.json", "r", encoding="utf-8") as f:
    data = json.load(f)
index_data(preprocessor.preprocess_docs_haystack_format(data), "crawled_hsma")

# 4) Parse and index the papers below ./data/paper.
parse_and_index_paper()