-
Notifications
You must be signed in to change notification settings - Fork 455
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
initial implementation for hnsw (#1955)
* initial implementation for HNSW indexing and search * support multi-threading * corpora are currently stored in JSONL files
- Loading branch information
Showing
11 changed files
with
1,288 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
93 changes: 93 additions & 0 deletions
93
src/main/java/io/anserini/collection/VectorCollection.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
/* | ||
* Anserini: A Lucene toolkit for reproducible information retrieval research | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package io.anserini.collection; | ||
|
||
import com.fasterxml.jackson.databind.JsonNode; | ||
|
||
import java.io.IOException; | ||
import java.nio.file.Path; | ||
import java.util.HashMap; | ||
import java.util.Map; | ||
|
||
|
||
/** | ||
* A document collection for encoded dense vectors for ANN (HNSW) search. | ||
* The "vector" field are concatenated into the "contents" field for indexing. | ||
*/ | ||
public class VectorCollection extends DocumentCollection<VectorCollection.Document> { | ||
public VectorCollection(Path path) { | ||
this.path = path; | ||
} | ||
|
||
@Override | ||
public FileSegment<VectorCollection.Document> createFileSegment(Path p) throws IOException { | ||
return new VectorCollection.Segment<>(p); | ||
} | ||
|
||
public static class Segment<T extends VectorCollection.Document> extends JsonCollection.Segment<T> { | ||
public Segment(Path path) throws IOException { | ||
super(path); | ||
} | ||
|
||
@Override | ||
protected Document createNewDocument(JsonNode json) { | ||
return new Document(json); | ||
} | ||
} | ||
|
||
public static class Document extends JsonCollection.Document { | ||
private final String id; | ||
private final String contents; | ||
private final String raw; | ||
private Map<String, String> fields; | ||
|
||
public Document(JsonNode json) { | ||
super(); | ||
this.raw = json.toPrettyString(); | ||
this.id = json.get("docid").asText(); | ||
this.contents = json.get("vector").toString(); | ||
// We're not going to index any other fields, so just initialize an empty map. | ||
this.fields = new HashMap<>(); | ||
} | ||
|
||
@Override | ||
public String id() { | ||
if (id == null) { | ||
throw new RuntimeException("JSON document has no \"_id\" field!"); | ||
} | ||
return id; | ||
} | ||
|
||
@Override | ||
public String contents() { | ||
if (contents == null) { | ||
throw new RuntimeException("JSON document has no contents that could be parsed!"); | ||
} | ||
return contents; | ||
} | ||
|
||
@Override | ||
public String raw() { | ||
return raw; | ||
} | ||
|
||
@Override | ||
public Map<String, String> fields() { | ||
return fields; | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,120 @@ | ||
/* | ||
* Anserini: A Lucene toolkit for reproducible information retrieval research | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package io.anserini.index; | ||
|
||
import org.kohsuke.args4j.Option; | ||
import org.kohsuke.args4j.spi.StringArrayOptionHandler; | ||
|
||
|
||
public class IndexVectorArgs { | ||
|
||
// This is the name of the field in the Lucene document where the docid is stored. | ||
public static final String ID = "id"; | ||
|
||
// This is the name of the field in the Lucene document that should be searched by default. | ||
public static final String CONTENTS = "contents"; | ||
|
||
// This is the name of the field in the Lucene document where the raw document is stored. | ||
public static final String RAW = "raw"; | ||
|
||
// This is the name of the field in the Lucene document where the vector document is stored. | ||
public static final String VECTOR = "vector"; | ||
|
||
private static final int TIMEOUT = 600 * 1000; | ||
|
||
|
||
// required arguments | ||
|
||
@Option(name = "-input", metaVar = "[path]", required = true, | ||
usage = "Location of input collection.") | ||
public String input; | ||
|
||
@Option(name = "-threads", metaVar = "[num]", required = true, | ||
usage = "Number of indexing threads.") | ||
public int threads; | ||
|
||
@Option(name = "-collection", metaVar = "[class]", required = true, | ||
usage = "Collection class in package 'io.anserini.collection'.") | ||
public String collectionClass; | ||
|
||
@Option(name = "-generator", metaVar = "[class]", | ||
usage = "Document generator class in package 'io.anserini.index.generator'.") | ||
public String generatorClass = "DefaultLuceneDocumentGenerator"; | ||
|
||
// optional general arguments | ||
|
||
@Option(name = "-verbose", forbids = {"-quiet"}, | ||
usage = "Enables verbose logging for each indexing thread; can be noisy if collection has many small file segments.") | ||
public boolean verbose = false; | ||
|
||
@Option(name = "-quiet", forbids = {"-verbose"}, | ||
usage = "Turns off all logging.") | ||
public boolean quiet = false; | ||
|
||
// optional arguments | ||
|
||
@Option(name = "-index", metaVar = "[path]", usage = "Index path.") | ||
public String index; | ||
|
||
@Option(name = "-fields", handler = StringArrayOptionHandler.class, | ||
usage = "List of fields to index (space separated), in addition to the default 'contents' field.") | ||
public String[] fields = new String[]{}; | ||
|
||
@Option(name = "-storePositions", | ||
usage = "Boolean switch to index store term positions; needed for phrase queries.") | ||
public boolean storePositions = false; | ||
|
||
@Option(name = "-storeDocvectors", | ||
usage = "Boolean switch to store document vectors; needed for (pseudo) relevance feedback.") | ||
public boolean storeDocvectors = false; | ||
|
||
@Option(name = "-storeContents", | ||
usage = "Boolean switch to store document contents.") | ||
public boolean storeContents = false; | ||
|
||
@Option(name = "-storeRaw", | ||
usage = "Boolean switch to store raw source documents.") | ||
public boolean storeRaw = false; | ||
|
||
@Option(name = "-optimize", | ||
usage = "Boolean switch to optimize index (i.e., force merge) into a single segment; costly for large collections.") | ||
public boolean optimize = false; | ||
|
||
@Option(name = "-uniqueDocid", | ||
usage = "Removes duplicate documents with the same docid during indexing. This significantly slows indexing throughput " + | ||
"but may be needed for tweet collections since the streaming API might deliver a tweet multiple times.") | ||
public boolean uniqueDocid = false; | ||
|
||
@Option(name = "-memorybuffer", metaVar = "[mb]", | ||
usage = "Memory buffer size (in MB).") | ||
public int memorybufferSize = 2048; | ||
|
||
@Option(name = "-whitelist", metaVar = "[file]", | ||
usage = "File containing list of docids, one per line; only these docids will be indexed.") | ||
public String whitelist = null; | ||
|
||
|
||
// Sharding options | ||
|
||
@Option(name = "-shard.count", metaVar = "[n]", | ||
usage = "Number of shards to partition the document collection into.") | ||
public int shardCount = -1; | ||
|
||
@Option(name = "-shard.current", metaVar = "[n]", | ||
usage = "The current shard number to generate (indexed from 0).") | ||
public int shardCurrent = -1; | ||
} |
Oops, something went wrong.