-
Notifications
You must be signed in to change notification settings - Fork 0
/
huggingface implementation.py
41 lines (31 loc) · 1.52 KB
/
huggingface implementation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
from transformers import LlamaTokenizer, LlamaForCausalLM
import torch
import torch.nn.functional as F


def mean_pool(hidden_states, attention_mask):
    """Average token vectors over the real (non-padding) positions.

    Args:
        hidden_states: Float tensor of shape (batch, seq_len, dim).
        attention_mask: Tensor of shape (batch, seq_len); non-zero marks a
            real token, zero marks padding.

    Returns:
        Tensor of shape (batch, dim) holding one pooled vector per row.
        A row whose mask is entirely zero yields a zero vector, not NaN.
    """
    mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
    summed = (hidden_states * mask).sum(dim=1)
    # Clamp the divisor so an all-padding row cannot divide by zero.
    token_counts = mask.sum(dim=1).clamp(min=1)
    return summed / token_counts


def embed_sentences(sentences, tokenizer, model):
    """Return one dense embedding per sentence from the model's last layer.

    Args:
        sentences: List of input strings.
        tokenizer: Tokenizer with a pad token already configured.
        model: Causal LM that accepts ``output_hidden_states=True``.

    Returns:
        Tensor of shape (len(sentences), hidden_dim).
    """
    batch = tokenizer(
        sentences,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=model.config.max_position_embeddings,
    )
    with torch.no_grad():
        # Forward the attention mask along with the ids so padding is ignored,
        # and request hidden states: `logits` are vocabulary scores, not
        # embeddings.
        outputs = model(**batch, output_hidden_states=True)
    last_layer = outputs.hidden_states[-1]
    # Llama has no [CLS] token. Position 0 is BOS, which under causal
    # attention sees only itself and is therefore identical for every
    # sentence; pool over all real tokens instead.
    return mean_pool(last_layer, batch["attention_mask"])


def main():
    """Load Llama-2, embed two sentences and print their cosine similarity."""
    # Load the pre-trained model and tokenizer
    model_base_name = "meta-llama/Llama-2-7b-hf"
    model = LlamaForCausalLM.from_pretrained(model_base_name)
    tokenizer = LlamaTokenizer.from_pretrained(model_base_name)

    # Check vocabulary size and maximum sequence length
    vocab_size = tokenizer.vocab_size
    max_seq_length = model.config.max_position_embeddings
    print("Vocabulary Size:", vocab_size)
    print("Max Sequence Length:", max_seq_length)

    # Reuse EOS as the padding token. Adding a brand-new '[PAD]' token would
    # create an id outside the model's embedding table (previously papered
    # over by clamping ids, which silently turned padding into a real token).
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Specify input sentences
    sentences = ["This is me", "A 2nd sentence"]

    embeddings = embed_sentences(sentences, tokenizer, model)

    # Compute cosine similarity between the two sentence embeddings
    similarity = F.cosine_similarity(embeddings[0], embeddings[1], dim=0)
    print("Cosine Similarity:", similarity.item())


if __name__ == "__main__":
    main()