-
Notifications
You must be signed in to change notification settings - Fork 26
/
synonym-finder.go
272 lines (232 loc) · 7.87 KB
/
synonym-finder.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
// A program to find synonyms for a given word in a Word2Vec model.
// It finds words in the model with similarity scores above a given threshold.
//
// Usage: synonym-finder [OPTIONS] [QUERY]
// QUERY is the word to find similar words for (required unless -f is used)
// Options:
// -model_path string
// Path to the Word2Vec model file (required)
// -threshold float
// Similarity threshold for matching (required) (default 0.7)
// -ignore-case
// Ignore case. Note: word2vec is case-sensitive. Ignoring case may lead to unexpected results
// -f string
// File containing patterns, one per line
// -o Print only matching tokens
//
// Example:
// synonym-finder -model_path ../models/glove/glove.6B.300d.bin -threshold 0.5 angry
package main
import (
"bufio"
"encoding/binary"
"flag"
"fmt"
"io"
"math"
"os"
"strings"
)
// Options holds the values parsed from the command-line flags in main.
type Options struct {
// ModelPath is the path to the model file, set by -model_path.
ModelPath string
// SimilarityThreshold is the minimum similarity a word must reach to be
// printed, set by -threshold.
SimilarityThreshold float64
// IgnoreCase is set by -ignore-case.
IgnoreCase bool
// PatternFile is a file of query words, one per line, set by -f.
PatternFile string
OnlyMatching bool // Set by -o: print only the matching tokens
}
// VectorModel is the interface that every vector model must implement.
type VectorModel interface {
// LoadModel reads the model stored at filename into the receiver and
// returns an error if it cannot be loaded.
LoadModel(filename string) error
// GetEmbedding returns the embedding for token as an untyped value.
// VecModel32bit, the implementation in this file, returns a []float32.
GetEmbedding(token string) interface{}
}
// VecModel32bit represents a 32-bit floating point Word2Vec model.
type VecModel32bit struct {
// Vectors maps each word read from the model file to its embedding.
Vectors map[string][]float32
// Size is the number of components per embedding, as declared in the
// model file header.
Size int
}
// LoadModel loads a 32-bit floating point Word2Vec model from filename.
//
// The file must start with a text header line "<vocabSize> <vectorSize>\n"
// followed by vocabSize records. Each record is the word, a single space,
// vectorSize little-endian float32 values and an optional trailing newline.
// Nothing may follow the last record.
//
// On success m.Vectors and m.Size are replaced with the loaded data. On any
// error the receiver is left untouched, and the returned error wraps the
// underlying cause so callers can inspect it with errors.Is / errors.As.
func (m *VecModel32bit) LoadModel(filename string) error {
	file, err := os.Open(filename)
	if err != nil {
		return fmt.Errorf("failed to open file: %w", err)
	}
	defer file.Close()

	reader := bufio.NewReader(file)

	// Read and validate the header.
	var vocabSize, vectorSize int
	if _, err := fmt.Fscanf(reader, "%d %d\n", &vocabSize, &vectorSize); err != nil {
		return fmt.Errorf("failed to read header: %w\nCheck that you have a valid model file", err)
	}
	if vocabSize <= 0 || vectorSize <= 0 {
		return fmt.Errorf("invalid header: vocabSize=%d, vectorSize=%d\nCheck that you have a valid model file", vocabSize, vectorSize)
	}

	// Build into a local map so a failure halfway through the file does not
	// leave the receiver holding a partially loaded model.
	vectors := make(map[string][]float32, vocabSize)
	for i := 0; i < vocabSize; i++ {
		word, err := reader.ReadString(' ')
		if err != nil {
			return fmt.Errorf("failed to read word: %w", err)
		}
		word = strings.TrimSpace(word)

		// Decode the whole vector with one call instead of one reflective
		// binary.Read per component.
		vector := make([]float32, vectorSize)
		if err := binary.Read(reader, binary.LittleEndian, vector); err != nil {
			return fmt.Errorf("failed to read vector: %w", err)
		}

		// Records may or may not be terminated by a newline; consume it
		// only when it is actually there.
		next, err := reader.Peek(1)
		if err != nil && err != io.EOF {
			return fmt.Errorf("unexpected error reading next byte: %w", err)
		}
		if len(next) > 0 && next[0] == '\n' {
			if _, err := reader.ReadByte(); err != nil {
				return fmt.Errorf("unexpected error reading next byte: %w", err)
			}
		}

		vectors[word] = vector
	}

	// The file must end right after the last record. Distinguish leftover
	// data from a genuine read error.
	_, err = reader.ReadByte()
	if err == nil {
		return fmt.Errorf("unexpected data at end of file.\nCheck that you have a valid model file")
	}
	if err != io.EOF {
		return fmt.Errorf("failed to check for end of file: %w", err)
	}

	m.Vectors = vectors
	m.Size = vectorSize
	return nil
}
// GetEmbedding looks token up in the 32-bit model and returns its vector as
// a []float32. A token that is not in the model yields a freshly allocated
// all-zero vector of length m.Size.
func (m *VecModel32bit) GetEmbedding(token string) interface{} {
	if embedding, found := m.Vectors[token]; found {
		return embedding
	}
	return make([]float32, m.Size)
}
// LoadVectorModel picks a model implementation from the file name and loads
// it. Only names ending in ".bin" are accepted, and they are loaded as a
// VecModel32bit; any other name is rejected as an unsupported format. A load
// failure is returned unchanged together with a nil model.
func LoadVectorModel(filename string) (VectorModel, error) {
	if !strings.HasSuffix(filename, ".bin") {
		return nil, fmt.Errorf("unsupported file format")
	}

	loaded := &VecModel32bit{}
	if err := loaded.LoadModel(filename); err != nil {
		return nil, err
	}
	return loaded, nil
}
// calculateSimilarity32bit returns the cosine similarity between vec1 and
// vec2: their dot product divided by the product of their magnitudes.
//
// Each component is widened to float64 before it is multiplied, so the
// products neither lose precision nor overflow the float32 range.
//
// The loop runs over vec1, so vec2 must be at least as long as vec1; a
// shorter vec2 panics with an index-out-of-range error and extra components
// of a longer vec2 are ignored.
//
// Cosine similarity is undefined when either vector has zero magnitude
// (which includes empty input). In that case 0 is returned instead of the
// NaN that 0/0 would produce.
func calculateSimilarity32bit(vec1, vec2 []float32) float64 {
	var dotProduct, norm1, norm2 float64
	for i, component := range vec1 {
		a, b := float64(component), float64(vec2[i])
		dotProduct += a * b
		norm1 += a * a
		norm2 += b * b
	}
	if norm1 == 0 || norm2 == 0 {
		return 0
	}
	return dotProduct / (math.Sqrt(norm1) * math.Sqrt(norm2))
}
// findSimilarWords prints every word in the model whose cosine similarity to
// query is at least threshold.
//
// With onlyMatching set, the bare query is printed first and then one
// matching word per line. Otherwise a header line is printed, followed by
// "word similarity" lines. The query word itself is never listed as a match.
// Matches are printed in map iteration order, which is not stable between
// runs.
//
// An error is returned, and nothing is printed, when model is not a
// *VecModel32bit or when query is not in the model. The query is looked up
// in the vector map directly because GetEmbedding answers unknown words with
// a zero vector, which cannot be told apart from a real embedding.
func findSimilarWords(model VectorModel, query string, threshold float64, onlyMatching bool) error {
	m32, ok := model.(*VecModel32bit)
	if !ok || m32 == nil {
		return fmt.Errorf("unsupported model type %T", model)
	}

	queryEmbedding, found := m32.Vectors[query]
	if !found || len(queryEmbedding) == 0 {
		return fmt.Errorf("query word %q not found in model", query)
	}

	if onlyMatching {
		fmt.Println(query) // Print the bare query
	} else {
		fmt.Printf("Words similar to '%s' with similarity >= %.2f:\n", query, threshold)
	}

	for word, embedding := range m32.Vectors {
		// Skip the query by name. Comparing its similarity against 1.0 is
		// unreliable: rounding can put a vector's similarity to itself just
		// below 1.
		if word == query {
			continue
		}
		similarity := calculateSimilarity32bit(queryEmbedding, embedding)
		if similarity < threshold {
			continue
		}
		if onlyMatching {
			fmt.Println(word)
		} else {
			fmt.Printf("%s %.4f\n", word, similarity)
		}
	}
	return nil
}
// findSimilarWordsForPatterns runs findSimilarWords for every pattern in
// patternFile.
//
// The file is read line by line. Each line is trimmed of surrounding
// whitespace and blank lines are skipped. A pattern that findSimilarWords
// rejects does not stop processing: a warning naming the pattern is written
// to standard error, so that standard output carries only results, and the
// next pattern is tried.
//
// An error is returned only when the pattern file cannot be opened or read;
// it wraps the underlying cause.
func findSimilarWordsForPatterns(model VectorModel, patternFile string, threshold float64, onlyMatching bool) error {
	file, err := os.Open(patternFile)
	if err != nil {
		return fmt.Errorf("failed to open pattern file: %w", err)
	}
	defer file.Close()

	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		pattern := strings.TrimSpace(scanner.Text())
		if pattern == "" {
			continue
		}
		if err := findSimilarWords(model, pattern, threshold, onlyMatching); err != nil {
			fmt.Fprintf(os.Stderr, "Warning: pattern %q: %v\n", pattern, err)
		}
	}
	if err := scanner.Err(); err != nil {
		return fmt.Errorf("error reading pattern file: %w", err)
	}
	return nil
}
// main parses the command line, loads the model named by -model_path and
// prints the words similar to either the single QUERY argument or, when -f
// is given, every pattern in that file. Any usage or processing error is
// reported on standard error and the program exits with status 1.
func main() {
	var opts Options
	flag.StringVar(&opts.ModelPath, "model_path", "", "Path to the Word2Vec model file (required)")
	// The default is not repeated in the help text: flag.PrintDefaults
	// already appends "(default 0.7)" on its own.
	flag.Float64Var(&opts.SimilarityThreshold, "threshold", 0.7, "Similarity threshold for matching (required)")
	// NOTE(review): opts.IgnoreCase is parsed here but never read anywhere in
	// this file, so -ignore-case currently has no effect.
	flag.BoolVar(&opts.IgnoreCase, "ignore-case", false, "Ignore case. Note: word2vec is case-sensitive. Ignoring case may lead to unexpected results")
	flag.StringVar(&opts.PatternFile, "f", "", "File containing patterns, one per line")
	flag.BoolVar(&opts.OnlyMatching, "o", false, "Print only matching tokens")

	// Custom usage message
	flag.Usage = func() {
		fmt.Fprintf(os.Stderr, "Usage: %s [OPTIONS] [QUERY]\n\n", os.Args[0])
		fmt.Fprintf(os.Stderr, "QUERY is the word to find similar words for (required if -f is not used)\n\n")
		fmt.Fprintf(os.Stderr, "Options:\n")
		flag.PrintDefaults()
		fmt.Fprintf(os.Stderr, "\nExamples:\n")
		fmt.Fprintf(os.Stderr, " %s -model_path path/to/model.bin -threshold 0.8 cat\n", os.Args[0])
		fmt.Fprintf(os.Stderr, " %s -model_path path/to/model.bin -threshold 0.8 -f patterns.txt\n", os.Args[0])
		fmt.Fprintf(os.Stderr, " %s -model_path path/to/model.bin -threshold 0.8 -o cat\n", os.Args[0])
	}
	flag.Parse()

	if opts.ModelPath == "" {
		fmt.Fprintln(os.Stderr, "Error: Model path is required. Please provide it via -model_path flag.")
		flag.Usage()
		os.Exit(1)
	}

	// flag.Visit only walks flags that were actually set on the command
	// line. Comparing the value with the default instead would reject an
	// explicit "-threshold 0.7".
	thresholdSet := false
	flag.Visit(func(f *flag.Flag) {
		if f.Name == "threshold" {
			thresholdSet = true
		}
	})
	if !thresholdSet {
		fmt.Fprintln(os.Stderr, "Error: Threshold is required. Please provide it via -threshold flag.")
		flag.Usage()
		os.Exit(1)
	}

	model, err := LoadVectorModel(opts.ModelPath)
	if err != nil {
		fmt.Fprintf(os.Stderr, "Error loading model: %v\n", err)
		os.Exit(1)
	}

	// Pattern-file mode: every line of the file is a query.
	if opts.PatternFile != "" {
		if err := findSimilarWordsForPatterns(model, opts.PatternFile, opts.SimilarityThreshold, opts.OnlyMatching); err != nil {
			fmt.Fprintf(os.Stderr, "Error processing pattern file: %v\n", err)
			os.Exit(1)
		}
		return
	}

	// Single-query mode: exactly one positional argument is expected.
	args := flag.Args()
	if len(args) != 1 {
		fmt.Fprintln(os.Stderr, "Error: Exactly one query word is required when not using -f")
		flag.Usage()
		os.Exit(1)
	}
	if err := findSimilarWords(model, args[0], opts.SimilarityThreshold, opts.OnlyMatching); err != nil {
		fmt.Fprintf(os.Stderr, "Error finding similar words: %v\n", err)
		os.Exit(1)
	}
}