# BA-Chatbot_Ali_Thesis/backend/expert_search.py

import json
from reranker import ReRanker
from reader import Reader
from retriever.retriever import Retriever
from retriever.retriever_pipeline import CustomPipeline
from helper.openai import openai_expert_search
from haystack.nodes import FARMReader

# Lookup table from the short author key stored in a paper's ``author``
# metadata field to the full display name including academic title.
# Keys appear to be surnames (note that "Dietrich" stands for
# "Roth-Dietrich"), so they must match the metadata value exactly --
# the lookup is a plain, case-sensitive dict access.
AUTHOR_MAPPING = {
    "Wolf": "Prof. Dr. Ivo Wolf",
    "Hummel": "Prof. Dr. Oliver Hummel",
    "Fimmel": "Prof. Dr. Elena Fimmel",
    "Eckert": "Prof. Dr. rer. nat. Kai Eckert",
    "Fischer": "Prof. Dr. Jörn Fischer",
    "Gröschel": "Prof. Dr. Michael Gröschel",
    "Gumbel": "Prof. Dr. Markus Gumbel",
    "Nagel": "Prof. Dr. Till Nagel",
    "Specht": "Prof. Dr. Thomas Specht",
    "Steinberger": "Prof. Dr. Jessica Steinberger",
    "Dietrich": "Prof. Dr. Gabriele Roth-Dietrich",
    "Dopatka": "Prof. Dr. rer. nat. Frank Dopatka",
    "Kraus": "Prof. Dr. Stefan Kraus",
    "Leuchter": "Prof. Dr.-Ing. Sandro Leuchter",
    "Paulus": "Prof. Dr. Sachar Paulus",
}

class ExpertSearch:
    """Search the ``paper`` index for documents relevant to a query.

    The class wires together a retriever, an optional GPT-3.5 based
    re-ranker and a reader that can turn the retrieved passages into a
    generated answer.
    """

    def __init__(
        self,
        pipeline: CustomPipeline,
        retriever: Retriever,
        reader: Reader,
        reRanker: ReRanker,
        farm_reader: FARMReader,
    ) -> None:
        """
        Initializes the ExpertSearch class with components for searching, retrieving, reranking, and reading documents.

        Args:
            pipeline (CustomPipeline): A pipeline for document retrieval and processing.
            retriever (Retriever): A component for retrieving documents based on queries.
            reader (Reader): A component for interpreting and processing documents.
            reRanker (ReRanker): A component for reranking documents based on relevance.
            farm_reader (FARMReader): A FARM-based reader for additional document processing.
        """
        self.pipeline = pipeline
        self.retriever = retriever
        self.reader = reader
        self.reRanker = reRanker
        self.farm_reader = farm_reader

    def search_experts(
        self,
        query: str,
        search_method: str = "classic_retriever_reader",
        retrieval_method: str = "mpnet",
        rerank_documents: bool = True,
        generate_anwser: bool = False,
    ):
        """
        Performs an expert search based on a given query and specified method.

        Args:
            query (str): The search query.
            search_method (str, optional): The method of search (e.g., 'classic_retriever_reader', 'sort_llm'). Defaults to "classic_retriever_reader".
            retrieval_method (str, optional): The retrieval method to be used. Defaults to "mpnet".
            rerank_documents (bool, optional): Flag to rerank documents post-retrieval. Defaults to True.
            generate_anwser (bool, optional): Flag to generate answers using a reader. Only honoured for
                'classic_retriever_reader'. Defaults to False.

        Returns:
            Varies by ``search_method``:

            * 'sort_llm': a list of ``{"title", "id", "author"}`` dicts, one per document in the
              ``paper`` index, in document-store order.
            * 'classic_retriever_reader': the reader's answer when ``generate_anwser`` is true,
              otherwise the (optionally re-ranked) retrieved documents.
            * any other value: the (optionally re-ranked) retrieved documents, with each document's
              ``author`` metadata replaced in place by its display name.
        """
        if search_method == "sort_llm":
            # NOTE: no LLM call is wired up for this mode. The previous code
            # assembled a prompt and request payload here but never sent
            # them, so the papers are returned unranked.
            papers = self.pipeline.doc_store.get_all_documents(index="paper")
            return [
                {"title": doc.meta["title"], "id": doc.id, "author": doc.meta["author"]}
                for doc in papers
            ]

        top_k_documents = self.retriever.get_top_k_passages(
            query=query, index="paper", method=retrieval_method
        )["documents"]

        final_references = top_k_documents
        if rerank_documents:
            reranked_top_k = self.reRanker.rerank_documents_with_gpt35(
                documents=top_k_documents, query=query
            )
            final_references = self.reRanker.get_final_references(
                reranked_documents=reranked_top_k,
                retrieved_documents=top_k_documents,
            )

        if search_method == "classic_retriever_reader":
            if generate_anwser:
                return self.reader.get_gpt_expert_search_answer(
                    prompt=openai_expert_search,
                    top_k_passages=final_references,
                    query=query,
                )
            return final_references

        for doc in final_references:
            current_author = doc.meta.get("author")
            # Prefer the mapped display name, then the author as stored in
            # the metadata; "Unbekannt" is only used when there is no author.
            # Passing "Unbekannt" as the .get() default (as before) made the
            # fallback to the stored author unreachable.
            doc.meta["author"] = (
                AUTHOR_MAPPING.get(current_author) or current_author or "Unbekannt"
            )
        return final_references