{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "\n",
    "# Make the project backend importable before the project-local imports below.\n",
    "sys.path.append('/root/home/BA_QA_HSMA/backendd')\n",
    "\n",
    "import pickle\n",
    "\n",
    "import torch\n",
    "from tqdm import tqdm\n",
    "from transformers import LlamaForCausalLM, LlamaTokenizer\n",
    "\n",
    "from embeddings.llama import Embedder\n",
    "from database.es_handler import ElasticSearchData\n",
    "from transformer_llama import LlamaTransformerEmbeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. \n",
      "The tokenizer class you load from this checkpoint is 'LLaMATokenizer'. \n",
      "The class this function is called from is 'LlamaTokenizer'.\n",
      "Loading checkpoint shards:   0%|          | 0/41 [00:00<?, ?it/s]/home/maydane/miniconda3/envs/backend/lib/python3.10/site-packages/torch/_utils.py:776: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n",
      "  return self.fget.__get__(instance, owner)()\n",
      "Loading checkpoint shards: 100%|██████████| 41/41 [05:15<00:00,  7.70s/it]\n"
     ]
    }
   ],
   "source": [
    "model_path = \"../models/tmp/llama-13b-hf\"\n",
    "embeddings_model = LlamaTransformerEmbeddings(model_path)"
   ]
  },
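  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`LlamaTransformerEmbeddings` wraps the `transformers` load. For reference, a minimal sketch of the equivalent direct load, guarded so it does not pull a second copy of the 13B weights into memory; the `direct_*` names are illustrative only, not part of the project API."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: load the same checkpoint directly with transformers.\n",
    "load_directly = False  # flip to True to try the direct route\n",
    "if load_directly:\n",
    "    direct_tokenizer = LlamaTokenizer.from_pretrained(model_path)\n",
    "    direct_model = LlamaForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16)\n",
    "    direct_model.eval()"
   ]
  },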
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# `LlamaTokenizer` has no `.vocab` attribute (this originally raised an\n",
    "# AttributeError); use the standard `get_vocab()` accessor instead.\n",
    "list(embeddings_model.tokenizer.get_vocab().keys())[5000:5020]"
   ]
  },
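  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Besides raw vocabulary entries, the special tokens are worth checking: LLaMA's tokenizer prepends a BOS token (`<s>`) by default, which shifts indices when encoding whole strings. These are standard `transformers` tokenizer attributes (a quick check; output not recorded here)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Standard PreTrainedTokenizer attributes: special tokens and vocab size.\n",
    "tok = embeddings_model.tokenizer\n",
    "print('bos:', tok.bos_token, '| eos:', tok.eos_token, '| unk:', tok.unk_token)\n",
    "print('vocab size:', tok.vocab_size)"
   ]
  },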
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = embeddings_model.tokenizer\n",
    "text = \"After stealing money from the bank vault, the bank robber was seen \" \\\n",
    "       \"fishing on the Mississippi river bank.\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split the sentence into tokens.\n",
    "tokenized_text = tokenizer.tokenize(text)\n",
    "\n",
    "# Map the token strings to their vocabulary indices.\n",
    "indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)"
   ]
  },
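  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The two steps above (tokenize, then map to ids) can also be done in one call. Note the difference: unlike `tokenize()`, the one-step call prepends the BOS token by default, so its id list is one token longer (a sketch using the standard `transformers` call)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One-step alternative: returns a [1 x seq_len] tensor of ids directly.\n",
    "encoded = tokenizer(text, return_tensors='pt')\n",
    "print(encoded.input_ids.shape)   # one longer than len(indexed_tokens): BOS\n",
    "print(encoded.input_ids[0, :5])"
   ]
  },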
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Segment IDs are a BERT concept (sentence A vs. sentence B);\n",
    "# LLaMA's forward pass does not take them.\n",
    "segments_ids = [1] * len(tokenized_text)\n",
    "segments_ids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Wrap the inputs as [1 x seq_len] tensors (batch size 1).\n",
    "tokens_tensor = torch.tensor([indexed_tokens])\n",
    "segments_tensors = torch.tensor([segments_ids])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 ▁After\n",
      "1 ▁ste\n",
      "2 aling\n",
      "3 ▁money\n",
      "4 ▁from\n",
      "5 ▁the\n",
      "6 ▁bank\n",
      "7 ▁v\n",
      "8 ault\n",
      "9 ,\n",
      "10 ▁the\n",
      "11 ▁bank\n",
      "12 ▁rob\n",
      "13 ber\n",
      "14 ▁was\n",
      "15 ▁seen\n",
      "16 ▁fish\n",
      "17 ing\n",
      "18 ▁on\n",
      "19 ▁the\n",
      "20 ▁Mississippi\n",
      "21 ▁river\n",
      "22 ▁bank\n",
      "23 .\n"
     ]
    }
   ],
   "source": [
    "# Inspect the tokenization; \"bank\" appears at indices 6, 11, and 22.\n",
    "for i, token_str in enumerate(tokenized_text):\n",
    "    print(i, token_str)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "from scipy.spatial.distance import cosine"
   ]
  },
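  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The pooling cells below need `token_embeddings`, which was never built (hence the original `NameError`). The sketch below constructs it from the model's hidden states. It assumes the wrapper exposes the underlying `LlamaForCausalLM` as `embeddings_model.model` (the attribute name is a guess; adjust it to the real wrapper) and that `tokens_tensor` lives on the same device as the model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build `token_embeddings` as a [tokens x layers x hidden_size] tensor.\n",
    "# ASSUMPTION: the wrapper exposes its LlamaForCausalLM as `.model`.\n",
    "model = embeddings_model.model\n",
    "model.eval()\n",
    "\n",
    "with torch.no_grad():\n",
    "    outputs = model(tokens_tensor, output_hidden_states=True)\n",
    "\n",
    "# `hidden_states` is a tuple of (num_layers + 1) tensors, each of shape\n",
    "# [batch_size x tokens x hidden_size]; for LLaMA-13B that is 41 tensors\n",
    "# (the embedding output plus 40 decoder layers) with hidden_size 5120.\n",
    "hidden_states = outputs.hidden_states\n",
    "\n",
    "# Stack to [layers x 1 x tokens x hidden], drop the batch dimension,\n",
    "# then permute to [tokens x layers x hidden] for per-token iteration.\n",
    "token_embeddings = torch.stack(hidden_states, dim=0).squeeze(1).permute(1, 0, 2)\n",
    "token_embeddings.size()"
   ]
  },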
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stores the token vectors, one [4 * hidden_size] vector per token.\n",
    "token_vecs_cat = []\n",
    "\n",
    "# `token_embeddings` is a [tokens x layers x hidden_size] tensor\n",
    "# (built in the cell above).\n",
    "\n",
    "# For each token in the sentence...\n",
    "for token in token_embeddings:\n",
    "\n",
    "    # `token` is a [layers x hidden_size] tensor.\n",
    "\n",
    "    # Concatenate the vectors (that is, append them together) from the\n",
    "    # last four layers. Each layer vector has hidden_size values\n",
    "    # (5120 for LLaMA-13B), so `cat_vec` has length 4 * hidden_size.\n",
    "    cat_vec = torch.cat((token[-1], token[-2], token[-3], token[-4]), dim=0)\n",
    "\n",
    "    # Use `cat_vec` to represent `token`.\n",
    "    token_vecs_cat.append(cat_vec)\n",
    "\n",
    "print('Shape is: %d x %d' % (len(token_vecs_cat), len(token_vecs_cat[0])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stores the token vectors, one [hidden_size] vector per token.\n",
    "token_vecs_sum = []\n",
    "\n",
    "# `token_embeddings` is a [tokens x layers x hidden_size] tensor.\n",
    "\n",
    "# For each token in the sentence...\n",
    "for token in token_embeddings:\n",
    "\n",
    "    # `token` is a [layers x hidden_size] tensor.\n",
    "\n",
    "    # Sum the vectors from the last four layers.\n",
    "    sum_vec = torch.sum(token[-4:], dim=0)\n",
    "\n",
    "    # Use `sum_vec` to represent `token`.\n",
    "    token_vecs_sum.append(sum_vec)\n",
    "\n",
    "print('Shape is: %d x %d' % (len(token_vecs_sum), len(token_vecs_sum[0])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per the tokenization printed above, \"bank\" occurs at index 6\n",
    "# (\"bank vault\"), index 11 (\"bank robber\"), and index 22 (\"river bank\").\n",
    "\n",
    "# Cosine similarity between the word bank\n",
    "# in \"bank robber\" vs \"river bank\" (different meanings).\n",
    "diff_bank = 1 - cosine(token_vecs_sum[11], token_vecs_sum[22])\n",
    "\n",
    "# Cosine similarity between the word bank\n",
    "# in \"bank robber\" vs \"bank vault\" (same meaning).\n",
    "same_bank = 1 - cosine(token_vecs_sum[11], token_vecs_sum[6])\n",
    "\n",
    "print('Vector similarity for *similar* meanings: %.2f' % same_bank)\n",
    "print('Vector similarity for *different* meanings: %.2f' % diff_bank)"
   ]
  },
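  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a cross-check, the same comparison with the concatenated vectors from `token_vecs_cat`; both pooling strategies should rank the two pairs the same way (a sketch; it assumes the vectors are on CPU, so call `.cpu()` on them first if the model ran on GPU, since `scipy` cannot read GPU tensors)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Repeat the comparison with the concatenated (last-four-layer) vectors.\n",
    "same_bank_cat = 1 - cosine(token_vecs_cat[11], token_vecs_cat[6])\n",
    "diff_bank_cat = 1 - cosine(token_vecs_cat[11], token_vecs_cat[22])\n",
    "\n",
    "print('Concatenated vectors, *similar* meanings: %.2f' % same_bank_cat)\n",
    "print('Concatenated vectors, *different* meanings: %.2f' % diff_bank_cat)"
   ]
  }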
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.10.11 ('backend')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.11"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "ec98c019f1befdeef47e250107e8ecbbb590b18e092be4f687ed7315b206d36b"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}