127 lines
5.5 KiB
Python
127 lines
5.5 KiB
Python
|
from api.embeddingsServiceCaller import EmbeddingServiceCaller
|
|||
|
from preprocessing.haystack_preprosessor import TextPreprocessor
|
|||
|
from converter.pdf_converter import PDFConverter
|
|||
|
from pipeline.indexing_pipeline import CustomPipeline
|
|||
|
|
|||
|
# from parser.module_handbook_parser.IB_parser import parse_ib_handbook
|
|||
|
from parser.paper_parser import PaperParser
|
|||
|
import os
|
|||
|
import json
|
|||
|
from typing import List
|
|||
|
from haystack.schema import Document
|
|||
|
from tqdm import tqdm
|
|||
|
import copy
|
|||
|
|
|||
|
|
|||
|
"""
|
|||
|
This script performs comprehensive data parsing and indexing for a variety of documents including academic papers, module handbooks, and web crawled data.
|
|||
|
It primarily focuses on processing PDF files to extract essential information, then generating embeddings and storing these documents in a database.
|
|||
|
The script utilizes several key components: CustomPipeline for embedding and indexing, TextPreprocessor for data preprocessing, PDFConverter for converting PDFs to text, and PaperParser for parsing academic papers.
|
|||
|
|
|||
|
NOTE: It requires running Docker containers for services like Elasticsearch, Weaviate, and GROBID.
|
|||
|
NOTE: Before executing the script, ensure that necessary PDFs and crawled data are placed in the "./data" directory.
|
|||
|
The script handles the conversion of module handbooks and papers, preprocesses them, and finally indexes them in the appropriate document stores, making the information readily accessible for retrieval and analysis tasks.
|
|||
|
"""
|
|||
|
# Shared service objects used by the functions and the script body below.
#
# The API key is read from the environment instead of being hard-coded, so no
# credential is stored in the source file.
# NOTE(review): the key that was previously committed on this line has to be
# treated as leaked and should be revoked with the provider.
# NOTE(review): the variable name OPENAI_API_KEY assumes the key belongs to
# OpenAI (it has the "sk-" prefix) -- confirm against CustomPipeline.
_api_key = os.environ.get("OPENAI_API_KEY")
if not _api_key:
    raise RuntimeError(
        "The OPENAI_API_KEY environment variable must be set before running this script."
    )

pipeline = CustomPipeline(api_key=_api_key)
preprocessor = TextPreprocessor()
caller = EmbeddingServiceCaller()
paper_parser = PaperParser()
|
|||
|
|
|||
|
|
|||
|
def parse_and_index_paper():
    """Parse the paper PDFs, index them and dump the parsed records to JSON.

    Walks ``./data/paper/<author>/`` and, for every file whose name ends in
    ``.pdf`` (case-insensitive), asks ``paper_parser.parseHeader`` for the
    header information. Papers for which both a title and an abstract were
    returned are collected, passed to ``index_data`` under the index name
    ``"paper"`` and finally written to ``parsed_pdfs.json`` in the current
    working directory.
    """
    root = "./data/paper"
    records = []

    for author in os.listdir(root):
        author_dir = os.path.join(root, author)
        # Only sub-directories are treated as author folders.
        if not os.path.isdir(author_dir):
            continue

        for entry in os.listdir(author_dir):
            if not entry.lower().endswith(".pdf"):
                continue

            pdf_path = os.path.join(author_dir, entry)
            print()
            print(f"Parse Paper: {pdf_path} .....")
            print()

            header = paper_parser.parseHeader(pdf_path)
            abstract = header.get("abstract")
            keywords = header.get("keywords")
            title = header.get("title")

            # Papers without a title or an abstract are skipped entirely.
            if not (abstract and title):
                continue

            record = {
                "content": f"{title}\n{abstract}",
                "file_path": pdf_path,
                "title": title,
                "abstract": abstract,
                "keywords": keywords,
                "author": author,
            }
            records.append(record)

    index_data(records, "paper")

    # Keep a copy of everything that was parsed next to the script.
    with open("parsed_pdfs.json", "w") as out_file:
        json.dump(records, out_file)
|
|||
|
|
|||
|
|
|||
|
def index_data(data: List[Document], index):
    """Create embeddings for the documents and store them together with the documents.

    Every entry of ``data`` is embedded with the ada, distilbert and mpnet
    retrievers of the module-level ``pipeline``. Each embedding is stored in
    the record under the ``embedding_field`` name of the matching document
    store, and all records are then written with
    ``pipeline.doc_store_mpnet.write_documents``.

    Args:
        data: Documents to index. An entry may be a ``Document`` (converted
            with ``to_dict``) or a plain dict (converted with
            ``Document.from_dict`` for embedding).
        index: Index name forwarded to ``write_documents``.
    """
    # pipeline.doc_store.write_documents(data, index)
    # pipeline.doc_store.update_embeddings(pipeline.emb_retriever, index)
    records = []

    for doc in data:
        # Work on a copy so the caller's object is not modified.
        record = copy.deepcopy(doc)
        if isinstance(record, dict):
            # The retrievers are handed a Document, the store gets the dict.
            doc = Document.from_dict(doc)
        else:
            record = record.to_dict()

        ada_embedding = pipeline.emb_retriever_ada.embed_documents([doc])[0]
        record[pipeline.doc_store_ada.embedding_field] = ada_embedding

        distilbert_embedding = pipeline.retriever_distilbert.embed_documents([doc])[0]
        record[pipeline.doc_store_distilbert.embedding_field] = distilbert_embedding

        mpnet_embedding = pipeline.emb_retriever_mpnet.embed_documents([doc])[0]
        record[pipeline.doc_store_mpnet.embedding_field] = mpnet_embedding

        records.append(record)

    # NOTE(review): only doc_store_mpnet is written to, although each record
    # carries embeddings keyed by the fields of all three stores -- confirm
    # that this is intended.
    pipeline.doc_store_mpnet.write_documents(records, index=index)

    # NOTE: Uncomment to additionally store LLaMA embeddings in Weaviate.
    # indexed_data = pipeline.doc_store_mpnet.get_all_documents(index=index)
    # embedded_docs: List[Document] = []
    # for doc in tqdm(indexed_data):
    #     content = doc.content
    #     item = {
    #         "content": content,
    #         "embedding": caller.get_embeddings(content, embedding_type="last_layer"),
    #         "es_id": doc.id,
    #     }
    #     embedded_docs.append(item)
    # pipeline.vector_doc_store_llama.write_documents(embedded_docs, index=index)
    # pipeline.vector_doc_store_llama.update_embeddings(pipeline.llama_retriever, index=index)
|
|||
|
|
|||
|
|
|||
|
# Script body: index the regulations PDF, the module handbook JSON, the
# crawled website JSON and finally the papers. Runs on import/execution.
directory = "./data"
converter = PDFConverter()
parsed_wpms = {}
parsed_modules = {}
converted_stupo = None

# Only the file named "Juni_2023_SPO_Bachelor.pdf" is converted and indexed
# here; every other entry of the data directory is ignored by this loop.
# The exact-name comparison already implies the ".pdf" suffix, so no separate
# suffix check is needed.
for filename in os.listdir(directory):
    full_path = os.path.join(directory, filename)
    if filename == "Juni_2023_SPO_Bachelor.pdf":
        converted_stupo = converter.convert_pdf_to_text_haystack(full_path)
        data = preprocessor.preprocess_docs_haystack_format(converted_stupo)
        index_data(data, "stupo")

# The JSON inputs are opened with an explicit UTF-8 encoding so that reading
# them does not depend on the platform's default locale encoding.
# NOTE(review): assumes both files are stored as UTF-8 (the JSON default) --
# confirm against the tool that produced them.
with open(f"{directory}/adjusted_modulhandbuch_ib.json", "r", encoding="utf-8") as f:
    data = json.load(f)
index_data(data, "ib")

with open(f"{directory}/crawled_hsma_web.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# The crawled data is preprocessed before indexing; the handbook JSON above is
# indexed as loaded.
index_data(preprocessor.preprocess_docs_haystack_format(data), "crawled_hsma")

parse_and_index_paper()
|
|||
|
|
|||
|
|