# pylint: disable=ungrouped-imports
"""
---------------------------------------------------------------------------
NOTE: Custom implementation of a Retriever based on the LLaMA model, which is
compatible with the Haystack Retriever pipeline. Calls the MODEL SERVICE
under the hood.

NOTE: SEE functions embed_queries and embed_documents for pooling strategy
and layer extraction
---------------------------------------------------------------------------
"""
from typing import List, Dict, Union, Optional, Any, Literal, Callable
import logging
from pathlib import Path
from copy import deepcopy
from requests.exceptions import HTTPError
import numpy as np
from tqdm import tqdm
import pandas as pd
from huggingface_hub import hf_hub_download
from haystack.errors import HaystackError
from haystack.schema import Document, FilterType
from haystack.document_stores import BaseDocumentStore
from haystack.telemetry import send_event
from haystack.lazy_imports import LazyImport
from haystack.nodes.retriever import DenseRetriever

logger = logging.getLogger(__name__)

with LazyImport(message="Run 'pip install farm-haystack[inference]'") as torch_and_transformers_import:
    import torch
    from haystack.modeling.utils import initialize_device_settings  # pylint: disable=ungrouped-imports
    from transformers import AutoConfig

import sys

# NOTE(review): this path is relative to the current working directory, not to
# this file, so the import below only resolves when the process is started from
# the expected folder -- confirm against the deployment layout.
sys.path.append("../..")
from api.embeddingsServiceCaller import EmbeddingServiceCaller

# NOTE(review): the value stored for "llama" is an empty dict, not a callable,
# and nothing in this module reads the mapping; it looks like a placeholder.
_EMBEDDING_ENCODERS: Dict[str, Callable] = {
    "llama": {}
}


class LlamaRetriever(DenseRetriever):
    """
    Dense retriever that obtains its embeddings from ``EmbeddingServiceCaller``.

    Queries and documents are sent, one text at a time, to
    ``EmbeddingServiceCaller.get_embeddings``; the resulting vectors are handed
    to the document store's ``query_by_embedding`` / ``query_by_embedding_batch``.
    """

    def __init__(
        self,
        model_format: str = "llama",
        document_store: Optional[BaseDocumentStore] = None,
        model_version: Optional[str] = None,
        use_gpu: bool = True,
        batch_size: int = 32,
        max_seq_len: int = 512,
        pooling_strategy: str = "reduce_mean",
        emb_extraction_layer: int = -1,
        top_k: int = 10,
        progress_bar: bool = True,
        devices: Optional[List[Union[str, "torch.device"]]] = None,
        use_auth_token: Optional[Union[str, bool]] = None,
        scale_score: bool = True,
        embed_meta_fields: Optional[List[str]] = None,
        api_key: Optional[str] = None,
        azure_api_version: str = "2022-12-01",
        azure_base_url: Optional[str] = None,
        azure_deployment_name: Optional[str] = None,
        api_base: str = "https://api.openai.com/v1",
        openai_organization: Optional[str] = None,
    ):
        """
        Store the configuration and create the embedding service client.

        Most arguments are only stored as attributes of the same name
        (``azure_api_version`` is stored as ``api_version``); within this class
        only the following are read again:

        :param document_store: Default store used by ``retrieve`` and ``retrieve_batch``.
        :param top_k: Default number of documents returned per query.
        :param scale_score: Default value forwarded to the document store queries.
        :param embed_meta_fields: Meta keys whose values are prepended to the
            document content before embedding. ``None`` means no meta fields.
        :param devices: Forwarded to ``initialize_device_settings`` together with ``use_gpu``.
        :param use_gpu: Forwarded to ``initialize_device_settings`` as ``use_cuda``.
        :param batch_size: Compared with the number of resolved devices to emit a warning.
        """
        torch_and_transformers_import.check()

        # Resolve the default here so no mutable default is shared between instances.
        if embed_meta_fields is None:
            embed_meta_fields = []

        super().__init__()

        self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=True)
        if batch_size < len(self.devices):
            logger.warning("Batch size is less than the number of devices.All gpus will not be utilized.")

        self.document_store = document_store
        self.model_version = model_version
        self.use_gpu = use_gpu
        self.batch_size = batch_size
        self.max_seq_len = max_seq_len
        self.pooling_strategy = pooling_strategy
        self.emb_extraction_layer = emb_extraction_layer
        self.top_k = top_k
        self.progress_bar = progress_bar
        self.use_auth_token = use_auth_token
        self.scale_score = scale_score
        self.api_key = api_key
        self.api_base = api_base
        self.api_version = azure_api_version
        self.azure_base_url = azure_base_url
        self.azure_deployment_name = azure_deployment_name
        self.openai_organization = openai_organization
        self.model_format = model_format
        self.emb_caller = EmbeddingServiceCaller()
        self.embed_meta_fields = embed_meta_fields

    def retrieve(
        self,
        query: str,
        filters: Optional[FilterType] = None,
        top_k: Optional[int] = None,
        index: Optional[str] = None,
        headers: Optional[Dict[str, str]] = None,
        scale_score: Optional[bool] = None,
        document_store: Optional[BaseDocumentStore] = None,
    ) -> List[Document]:
        """
        Embed ``query`` and return the matching documents from the document store.

        :param query: The query text.
        :param filters: Forwarded unchanged to ``query_by_embedding``.
        :param top_k: Number of documents to return; defaults to ``self.top_k``.
        :param index: Index to search; defaults to ``document_store.index``.
        :param headers: Forwarded unchanged to ``query_by_embedding``.
        :param scale_score: Defaults to ``self.scale_score``.
        :param document_store: Store to query; defaults to ``self.document_store``.
        :return: Whatever ``document_store.query_by_embedding`` returns.
        :raises ValueError: If no document store is available.
        """
        document_store = document_store or self.document_store
        if document_store is None:
            raise ValueError(
                "This Retriever was not initialized with a Document Store. Provide one to the retrieve() method."
            )
        if top_k is None:
            top_k = self.top_k
        if index is None:
            index = document_store.index
        if scale_score is None:
            scale_score = self.scale_score

        # embed_queries returns one row per query; a single query yields one row.
        query_emb = self.embed_queries(queries=[query])
        documents = document_store.query_by_embedding(
            query_emb=query_emb[0],
            filters=filters,
            top_k=top_k,
            index=index,
            headers=headers,
            scale_score=scale_score,
        )
        return documents

    def retrieve_batch(
        self,
        queries: List[str],
        filters: Optional[Union[FilterType, List[Optional[FilterType]]]] = None,
        top_k: Optional[int] = None,
        index: Optional[str] = None,
        headers: Optional[Dict[str, str]] = None,
        batch_size: Optional[int] = None,
        scale_score: Optional[bool] = None,
        document_store: Optional[BaseDocumentStore] = None,
    ) -> List[List[Document]]:
        """
        Embed every query and run them against the document store in one call.

        :param queries: The query texts.
        :param filters: Forwarded unchanged to ``query_by_embedding_batch``.
        :param top_k: Number of documents per query; defaults to ``self.top_k``.
        :param index: Index to search; defaults to ``document_store.index``.
        :param headers: Forwarded unchanged to ``query_by_embedding_batch``.
        :param batch_size: Accepted for interface compatibility. It is not used:
            embeddings are requested one text at a time and all of them are sent
            to the store together.
        :param scale_score: Defaults to ``self.scale_score``.
        :param document_store: Store to query; defaults to ``self.document_store``.
        :return: Whatever ``document_store.query_by_embedding_batch`` returns.
        :raises ValueError: If no document store is available.
        """
        document_store = document_store or self.document_store
        if document_store is None:
            raise ValueError(
                "This Retriever was not initialized with a Document Store. Provide one to the retrieve_batch() method."
            )
        if top_k is None:
            top_k = self.top_k
        if index is None:
            index = document_store.index
        if scale_score is None:
            scale_score = self.scale_score

        query_embs: np.ndarray = self.embed_queries(queries=queries)
        # One entry per query, in input order.
        documents = document_store.query_by_embedding_batch(
            query_embs=list(query_embs),
            top_k=top_k,
            filters=filters,
            index=index,
            headers=headers,
            scale_score=scale_score,
        )
        return documents

    def _embed_texts(self, texts: List[str]) -> np.ndarray:
        """Request one embedding per text from the service and stack them in input order."""
        # assumes get_embeddings returns a flat sequence of floats per text -- TODO confirm
        return np.array([self.emb_caller.get_embeddings(text) for text in texts])

    def embed_queries(self, queries: List[str]) -> np.ndarray:
        """
        Create one embedding per query.

        Previously only ``queries[0]`` was embedded, so every additional query
        was silently dropped; now every query is embedded.

        :param queries: Query texts; a single string is treated as a one-item list.
        :return: Array with one row per query, in input order.
        """
        if isinstance(queries, str):
            queries = [queries]
        assert isinstance(queries, list), "Expecting a list of texts, i.e. create_embeddings(texts=['text1',...])"
        return self._embed_texts(queries)

    def embed_documents(self, documents: List[Document]) -> np.ndarray:
        """
        Create one embedding per document.

        Documents are first linearized by ``_preprocess_documents`` (tables to
        CSV, configured meta fields prepended); the input objects are not modified.

        :param documents: Documents to embed.
        :return: Array with one row per document, in input order.
        """
        linearized = self._preprocess_documents(documents)
        return self._embed_texts([doc.content for doc in linearized])

    def _preprocess_documents(self, docs: List[Document]) -> List[Document]:
        """
        Return copies of ``docs`` whose content is a single string ready for embedding.

        Table documents are converted to CSV text, and the values of the meta
        keys listed in ``self.embed_meta_fields`` are prepended to the content,
        one per line.

        :raises HaystackError: If a table document's content is not a ``pd.DataFrame``.
        """
        linearized_docs = []
        for doc in docs:
            # Work on a copy so the caller's documents keep their original content.
            doc = deepcopy(doc)
            if doc.content_type == "table":
                if isinstance(doc.content, pd.DataFrame):
                    doc.content = doc.content.to_csv(index=False)
                else:
                    raise HaystackError("Documents of type 'table' need to have a pd.DataFrame as content field")

            meta_data_fields = []
            for key in self.embed_meta_fields:
                # Skip keys that are missing or hold an empty/falsy value.
                if key in doc.meta and doc.meta[key]:
                    if isinstance(doc.meta[key], list):
                        meta_data_fields.extend(doc.meta[key])
                    else:
                        meta_data_fields.append(doc.meta[key])

            meta_data_fields = [str(field) for field in meta_data_fields]
            doc.content = "\n".join(meta_data_fields + [doc.content])
            linearized_docs.append(doc)
        return linearized_docs

    @staticmethod
    def _infer_model_format(model_name_or_path: str, use_auth_token: Optional[Union[str, bool]]) -> str:
        """
        Guess the embedding backend from a model name or local path.

        Not called anywhere in this class.

        :return: One of ``"openai"``, ``"cohere"``, ``"sentence_transformers"``,
            ``"retribert"`` or ``"farm"``.
        """
        valid_openai_model_name = model_name_or_path in ["ada", "babbage", "davinci", "curie"] or any(
            m in model_name_or_path for m in ["-ada-", "-babbage-", "-davinci-", "-curie-"]
        )
        if valid_openai_model_name:
            return "openai"
        if model_name_or_path in ["small", "medium", "large", "multilingual-22-12", "finance-sentiment"]:
            return "cohere"

        if Path(model_name_or_path).exists():
            # Local directory: look for the sentence-transformers marker file.
            if Path(f"{model_name_or_path}/config_sentence_transformers.json").exists():
                return "sentence_transformers"
        else:
            # Not a local path: try to fetch the marker file from the Hub.
            try:
                hf_hub_download(
                    repo_id=model_name_or_path,
                    filename="config_sentence_transformers.json",
                    use_auth_token=use_auth_token,
                )
                return "sentence_transformers"
            except HTTPError:
                # Download failed; fall through to the config inspection below.
                pass

        config = AutoConfig.from_pretrained(model_name_or_path, use_auth_token=use_auth_token)
        if config.model_type == "retribert":
            return "retribert"
        return "farm"

    def train(
        self,
        training_data: List[Dict[str, Any]],
        learning_rate: float = 2e-5,
        n_epochs: int = 1,
        num_warmup_steps: Optional[int] = None,
        batch_size: int = 16,
        train_loss: Literal["mnrl", "margin_mse"] = "mnrl",
        num_workers: int = 0,
        use_amp: bool = False,
        **kwargs,
    ) -> None:
        """No-op: this method accepts the arguments, does nothing and returns ``None``."""
        pass