# BA-Chatbot_Ali_Thesis/backend/reranker.py

import ast
import json
import logging
from typing import Dict, List

from haystack.schema import Document

from api.embeddingsServiceCaller import EmbeddingServiceCaller
from helper.openai import (
    openai_doc_reference_prompt_v1,
    openai_doc_citation_prompt_v2,
    MAX_GPT4_TOKENS,
    GPT4_COMPLETION_TOKENS,
    MAX_GPT35_TURBO_TOKENS,
    RERANKING_TOKENS,
    count_prompt_tokens_gpt4,
    count_prompt_tokens_gpt35,
)


class ReRanker:
    """Reorders retrieved documents with the help of a GPT reranking call.

    The class sends document contents and a query to the model service via
    ``EmbeddingServiceCaller`` and turns the returned list of document ids
    back into an ordered list of ``Document`` objects.
    """

    _log = logging.getLogger(__name__)

    def __init__(self) -> None:
        """
        Initializes the ReRanker class with a caller for the model service.
        """
        self.caller = EmbeddingServiceCaller()

    def rerank_documents_with_gpt35(
        self, documents: List[Document], query: str
    ) -> List[Document]:
        """
        Reranks a list of documents using GPT-3.5 based on a given query.

        Documents are added to the request in their given order until the
        token budget (``MAX_GPT35_TURBO_TOKENS - RERANKING_TOKENS``) would be
        reached; the remaining documents are not sent.

        Args:
            documents (List[Document]): A list of Document objects to be reranked.
            query (str): The query string used for reranking.

        Returns:
            List[Document]: The documents whose ids were returned by the
            model, in the order the model returned them and without
            duplicates. An empty list if the response has an unexpected
            shape or its content is not a literal list.
        """
        token_budget = MAX_GPT35_TURBO_TOKENS - RERANKING_TOKENS
        used_tokens = count_prompt_tokens_gpt35(openai_doc_citation_prompt_v2)

        formatted_documents = []
        for doc in documents:
            used_tokens += count_prompt_tokens_gpt35(doc.content)
            if used_tokens >= token_budget:
                # The running total only grows, so no later document can fit
                # either; stop instead of counting tokens for nothing.
                break
            formatted_documents.append({"content": doc.content, "id": doc.id})

        payload = json.dumps(
            {
                "system_prompt": openai_doc_citation_prompt_v2,
                "documents": formatted_documents,
                "query": query,
            }
        )
        response = self.caller.rerank_documents_gpt(payload=payload)
        self._log.debug("Reranking response: %s", response)

        try:
            message_content = response["choices"][0]["message"]["content"]
        except (KeyError, IndexError, TypeError):
            # Treat a response without the expected structure like an
            # unparsable answer: the caller falls back to the retrieved order.
            self._log.warning("Unexpected reranking response shape: %r", response)
            return []

        ranked_ids = self._parse_ranked_ids(message_content)
        return self._order_by_ranked_ids(documents, ranked_ids)

    @staticmethod
    def _parse_ranked_ids(message_content) -> list:
        """Parse the model answer into a list of ids; return [] if it is not a literal list."""
        if not isinstance(message_content, str):
            return []
        try:
            parsed = ast.literal_eval(message_content.strip())
        except (SyntaxError, ValueError, TypeError, MemoryError, RecursionError):
            return []
        return parsed if isinstance(parsed, list) else []

    @staticmethod
    def _order_by_ranked_ids(documents: List[Document], ranked_ids: list) -> List[Document]:
        """Return the known documents in the order of ``ranked_ids``, each at most once."""
        documents_by_id = {}
        for doc in documents:
            # Keep the first document if several share the same id.
            documents_by_id.setdefault(doc.id, doc)

        ordered = []
        for ranked_id in ranked_ids:
            try:
                doc = documents_by_id.pop(ranked_id, None)
            except TypeError:
                # Unhashable entry (e.g. a nested list) cannot be a document id.
                continue
            # ``pop`` removes the entry, so unknown and repeated ids yield None.
            if doc is not None:
                ordered.append(doc)
        return ordered

    def get_final_references(
        self, reranked_documents: List[Document], retrieved_documents: List[Document]
    ) -> List[Document]:
        """
        Combines reranked and retrieved documents, ensuring no duplicates and maintaining order.

        Args:
            reranked_documents (List[Document]): The documents after reranking.
            retrieved_documents (List[Document]): The original set of retrieved documents.

        Returns:
            List[Document]: The reranked documents (first occurrence of each
            id only) followed by every retrieved document whose id was not
            reranked. ``retrieved_documents`` itself is returned when there
            are no reranked documents, or when there are more distinct
            reranked documents than retrieved ones.
        """
        if not reranked_documents:
            return retrieved_documents

        # Drop repeated ids so a duplicated answer from the model can neither
        # duplicate a reference nor hide a missing one.
        seen_ids = set()
        final_references = []
        for doc in reranked_documents:
            if doc.id not in seen_ids:
                seen_ids.add(doc.id)
                final_references.append(doc)

        # More distinct reranked documents than were retrieved means the two
        # lists do not belong together; trust the retriever in that case.
        if len(final_references) > len(retrieved_documents):
            return retrieved_documents

        # The model in the re-ranking process may not return all document ids.
        # The reranked documents come first, followed by the missing ones in
        # retriever order.
        final_references.extend(
            doc for doc in retrieved_documents if doc.id not in seen_ids
        )
        return final_references