import os
import sys

from dotenv import load_dotenv

load_dotenv()
sys_path = os.environ.get('SYS_PATH')
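# SYS_PATH is read from a local .env file and must point at the directory that
# provides the `embeddings` and `database` packages imported below.
# A minimal .env sketch (the path is illustrative, not part of this file):
#
#   SYS_PATH=/path/to/project/src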

sys.path.append(sys_path)

from embeddings.llama import Embedder
from transformers import LlamaForCausalLM, LlamaTokenizer
import torch
from database.es_handler import ElasticSearchData
from tqdm import tqdm
import pickle


class LlamaTransformerEmbeddings:
    def __init__(self, model_path, ggml=False, output_hidden_states=True):
        if ggml:
            # GGML path: delegate embedding to the llama.cpp-based Embedder.
            self.model_ggml = Embedder(model_path)
        else:
            # Hugging Face path: load tokenizer and model with hidden states
            # exposed so that embeddings can be pooled from them.
            self.tokenizer = LlamaTokenizer.from_pretrained(model_path)
            self.model = LlamaForCausalLM.from_pretrained(model_path, output_hidden_states=output_hidden_states)

    def get_embeddings_hf(self, text):
        """Return a sentence embedding pooled from the model's hidden states."""
        inputs = self.tokenizer(text, return_tensors="pt")
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Earlier pooling variants kept for reference:
        # embeddings = outputs.hidden_states[-1]
        # avg_embeddings = torch.mean(embeddings, dim=1)
        # return avg_embeddings[0]
        # embeddings, _ = torch.max(outputs.hidden_states[-1], dim=1)
        hidden_states = outputs.hidden_states
        # Mean-pool the token vectors of the second-to-last layer.
        token_vecs = hidden_states[-2][0]
        sentence_embedding = torch.mean(token_vecs, dim=0)
        return sentence_embedding

    def get_input_embeddings(self, text):
        """Return the mean of the model's input (token) embeddings for `text`."""
        inputs = self.tokenizer(text, return_tensors="pt")
        input_ids = inputs['input_ids']
        with torch.no_grad():
            input_embeddings = self.model.get_input_embeddings()
            embeddings = input_embeddings(input_ids)
            mean = torch.mean(embeddings[0], 0).cpu().numpy()
        return mean

    def get_embeddings_ggml(self, text):
        # GGML-backed embedding; only available when the instance was built
        # with ggml=True.
        return self.model_ggml.embed_text_llama(doc=text)

    def generate_answer(self, prompt):
        inputs = self.tokenizer(prompt, return_tensors="pt")
        generate_ids = self.model.generate(inputs.input_ids, max_length=700)
        return self.tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
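

# Minimal usage sketch (illustrative only; the model path and the embedding
# dimensionality depend on the checkpoint you load):
#
#   embedder = LlamaTransformerEmbeddings("models/llama-13b-hf")
#   vector = embedder.get_embeddings_hf("A sentence to embed.")        # torch.Tensor
#   answer = embedder.generate_answer("A prompt for text generation.")
#
# For a GGML checkpoint, construct with ggml=True and call
# get_embeddings_ggml() instead.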


# print("Program has ended.")


if __name__ == "__main__":
    # tokenizer = LlamaTokenizer.from_pretrained("../models/tmp/llama-13b-hf")
    # model = LlamaForCausalLM.from_pretrained("../models/tmp/llama-13b-hf", output_hidden_states=True)

    # text = "Your text for semantic search"
    # inputs = tokenizer(text, return_tensors="pt")

    # with torch.no_grad():
    #     outputs = model(**inputs)

    # embeddings = outputs.hidden_states
    # with open('hidden_states.pkl', 'wb') as f:
    #     pickle.dump(embeddings, f)
    # final_embeddings = embeddings[-1]

    # Load hidden states pickled in an earlier run (see the commented-out code
    # above).
    with open('hidden_states.pkl', 'rb') as f:
        hidden_states = pickle.load(f)
    print(len(hidden_states))

    ########### WORD EMBEDDINGS ##############################
    # Concatenate the tensors for all layers. We use `stack` here to
    # create a new dimension in the tensor.
    # token_embeddings = torch.stack(hidden_states, dim=0)
    # print(token_embeddings.size())

    # # Remove dimension 1, the "batches".
    # token_embeddings = torch.squeeze(token_embeddings, dim=1)
    # print(token_embeddings.size())

    # # Swap dimensions 0 and 1.
    # token_embeddings = token_embeddings.permute(1, 0, 2)
    # print(token_embeddings.size())

    # # Stores the token vectors, with shape [num_tokens x 20480]
    # token_vecs_cat = []
    # for token in token_embeddings:
    #     # `token` is a [40 x 5120] tensor

    #     # Concatenate the vectors (that is, append them together) from the
    #     # last four layers.
    #     # Each layer vector is 5120 values, so `cat_vec` is length 20480.
    #     cat_vec = torch.cat((token[-1], token[-2], token[-3], token[-4]), dim=0)

    #     # Use `cat_vec` to represent `token`.
    #     token_vecs_cat.append(cat_vec)
    # print('Shape is: %d x %d' % (len(token_vecs_cat), len(token_vecs_cat[0])))
    ########### WORD EMBEDDINGS ##############################

    ########### SENTENCE EMBEDDINGS ##############################
    # Mean-pool the token vectors of the second-to-last layer to obtain a
    # single sentence vector (same pooling as get_embeddings_hf above).
    token_vecs = hidden_states[-2][0]
    sentence_embedding = torch.mean(token_vecs, dim=0)
    # print("Our final sentence embedding vector of shape:", sentence_embedding.size())
    ########### SENTENCE EMBEDDINGS ##############################
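
# Note on shapes: `hidden_states`, as pickled above, is expected to be a tuple
# holding one tensor per transformer layer plus the input embeddings, each of
# shape [batch, seq_len, hidden_size] (hidden_size is 5120 for a 13B LLaMA
# checkpoint), so hidden_states[-2][0] picks the second-to-last layer's token
# vectors for the single batch item before mean-pooling.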