initial implementation for hnsw (#1955)

* initial implementation for HNSW indexing and search * supports multi-threading * corpora are currently stored in JSONL files
castorini · Aug 17, 2022 · 02fa99d · 02fa99d
1 parent 5c7f455
commit 02fa99d
Show file tree

Hide file tree

Showing 11 changed files with 1,288 additions and 0 deletions.
diff --git a/pom.xml b/pom.xml
@@ -105,10 +105,18 @@
               <mainClass>io.anserini.index.IndexCollection</mainClass>
               <id>IndexCollection</id>
             </program>
+            <program>
+              <mainClass>io.anserini.index.IndexVectorCollection</mainClass>
+              <id>IndexVectorCollection</id>
+            </program>
             <program>
               <mainClass>io.anserini.search.SearchCollection</mainClass>
               <id>SearchCollection</id>
             </program>
+            <program>
+              <mainClass>io.anserini.search.SearchVectorCollection</mainClass>
+              <id>SearchVectorCollection</id>
+            </program>
             <program>
               <mainClass>io.anserini.search.SearchMsmarco</mainClass>
               <id>SearchMsmarco</id>

diff --git a/src/main/java/io/anserini/collection/VectorCollection.java b/src/main/java/io/anserini/collection/VectorCollection.java
@@ -0,0 +1,134 @@
+/*
+ * Anserini: A Lucene toolkit for reproducible information retrieval research
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.anserini.collection;
+
+import com.fasterxml.jackson.databind.JsonNode;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.util.HashMap;
+import java.util.Map;
+
+
+/**
+ * A document collection of encoded dense vectors for ANN (HNSW) search.
+ * Each JSON record supplies a "docid" field and a "vector" field; the JSON text of the
+ * "vector" field is exposed as the document's contents for indexing.
+ */
+public class VectorCollection extends DocumentCollection<VectorCollection.Document> {
+  /**
+   * Creates a collection rooted at the given path.
+   *
+   * @param path location of the collection
+   */
+  public VectorCollection(Path path) {
+    this.path = path;
+  }
+
+  /**
+   * Creates a new {@link Segment} over the given file.
+   *
+   * @param p path handed to the segment constructor
+   * @return the newly created segment
+   * @throws IOException if the segment constructor fails
+   */
+  @Override
+  public FileSegment<VectorCollection.Document> createFileSegment(Path p) throws IOException {
+    return new VectorCollection.Segment<>(p);
+  }
+
+  /**
+   * A file segment whose {@code createNewDocument} wraps a JSON node in a {@link Document}.
+   *
+   * @param <T> document type of this segment
+   */
+  public static class Segment<T extends VectorCollection.Document> extends JsonCollection.Segment<T> {
+    public Segment(Path path) throws IOException {
+      super(path);
+    }
+
+    @Override
+    protected Document createNewDocument(JsonNode json) {
+      return new Document(json);
+    }
+  }
+
+  /**
+   * A single vector record. A missing (or JSON null) "docid" or "vector" field is tolerated at
+   * construction time and reported when {@link #id()} or {@link #contents()} is called.
+   */
+  public static class Document extends JsonCollection.Document {
+    private final String id;
+    private final String contents;
+    private final String raw;
+    private final Map<String, String> fields;
+
+    /**
+     * Builds a document from one parsed JSON record.
+     *
+     * @param json the record; read for its "docid" and "vector" fields
+     */
+    public Document(JsonNode json) {
+      super();
+      this.raw = json.toPrettyString();
+      // JsonNode.get returns null for an absent field; guard so that the checks in id() and
+      // contents() are reachable instead of failing here with a NullPointerException.
+      JsonNode docid = json.get("docid");
+      this.id = (docid == null || docid.isNull()) ? null : docid.asText();
+      JsonNode vector = json.get("vector");
+      this.contents = (vector == null || vector.isNull()) ? null : vector.toString();
+      // We're not going to index any other fields, so just initialize an empty map.
+      this.fields = new HashMap<>();
+    }
+
+    /**
+     * @return the text value of the "docid" field
+     * @throws RuntimeException if the record had no "docid" field
+     */
+    @Override
+    public String id() {
+      if (id == null) {
+        throw new RuntimeException("JSON document has no \"docid\" field!");
+      }
+      return id;
+    }
+
+    /**
+     * @return the JSON text of the "vector" field
+     * @throws RuntimeException if the record had no "vector" field
+     */
+    @Override
+    public String contents() {
+      if (contents == null) {
+        throw new RuntimeException("JSON document has no contents that could be parsed!");
+      }
+      return contents;
+    }
+
+    /** @return the pretty-printed JSON of the whole record */
+    @Override
+    public String raw() {
+      return raw;
+    }
+
+    /** @return the field map of this document, created empty at construction */
+    @Override
+    public Map<String, String> fields() {
+      return fields;
+    }
+  }
+}
diff --git a/src/main/java/io/anserini/index/IndexArgs.java b/src/main/java/io/anserini/index/IndexArgs.java
@@ -36,9 +36,12 @@ public class IndexArgs {
   // This is the name of the field in the Lucene document where the entity document is stored.
   public static final String ENTITY = "entity";
 
+  // This is the name of the field in the Lucene document where the vector document is stored.
+  public static final String VECTOR = "vector";
 
   private static final int TIMEOUT = 600 * 1000;
 
+
   // required arguments
 
   @Option(name = "-input", metaVar = "[path]", required = true,

diff --git a/src/main/java/io/anserini/index/IndexVectorArgs.java b/src/main/java/io/anserini/index/IndexVectorArgs.java
@@ -0,0 +1,121 @@
+/*
+ * Anserini: A Lucene toolkit for reproducible information retrieval research
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.anserini.index;
+
+import org.kohsuke.args4j.Option;
+import org.kohsuke.args4j.spi.StringArrayOptionHandler;
+
+
+/**
+ * Command-line options (args4j {@code @Option} fields) for vector indexing, together with
+ * constants naming the Lucene document fields for the docid, contents, raw source and vector.
+ */
+public class IndexVectorArgs {
+
+  // This is the name of the field in the Lucene document where the docid is stored.
+  public static final String ID = "id";
+
+  // This is the name of the field in the Lucene document that should be searched by default.
+  public static final String CONTENTS = "contents";
+
+  // This is the name of the field in the Lucene document where the raw document is stored.
+  public static final String RAW = "raw";
+
+  // This is the name of the field in the Lucene document where the vector document is stored.
+  public static final String VECTOR = "vector";
+
+  // required arguments
+
+  @Option(name = "-input", metaVar = "[path]", required = true,
+      usage = "Location of input collection.")
+  public String input;
+
+  @Option(name = "-threads", metaVar = "[num]", required = true,
+      usage = "Number of indexing threads.")
+  public int threads;
+
+  @Option(name = "-collection", metaVar = "[class]", required = true,
+      usage = "Collection class in package 'io.anserini.collection'.")
+  public String collectionClass;
+
+  @Option(name = "-generator", metaVar = "[class]",
+      usage = "Document generator class in package 'io.anserini.index.generator'.")
+  public String generatorClass = "DefaultLuceneDocumentGenerator";
+
+  // optional general arguments
+
+  @Option(name = "-verbose", forbids = {"-quiet"},
+      usage = "Enables verbose logging for each indexing thread; can be noisy if collection has many small file segments.")
+  public boolean verbose = false;
+
+  @Option(name = "-quiet", forbids = {"-verbose"},
+      usage = "Turns off all logging.")
+  public boolean quiet = false;
+
+  // optional arguments
+
+  @Option(name = "-index", metaVar = "[path]", usage = "Index path.")
+  public String index;
+
+  @Option(name = "-fields", handler = StringArrayOptionHandler.class,
+      usage = "List of fields to index (space separated), in addition to the default 'contents' field.")
+  public String[] fields = new String[]{};
+
+  @Option(name = "-storePositions",
+      usage = "Boolean switch to index store term positions; needed for phrase queries.")
+  public boolean storePositions = false;
+
+  @Option(name = "-storeDocvectors",
+      usage = "Boolean switch to store document vectors; needed for (pseudo) relevance feedback.")
+  public boolean storeDocvectors = false;
+
+  @Option(name = "-storeContents",
+      usage = "Boolean switch to store document contents.")
+  public boolean storeContents = false;
+
+  @Option(name = "-storeRaw",
+      usage = "Boolean switch to store raw source documents.")
+  public boolean storeRaw = false;
+
+  @Option(name = "-optimize",
+      usage = "Boolean switch to optimize index (i.e., force merge) into a single segment; costly for large collections.")
+  public boolean optimize = false;
+
+  @Option(name = "-uniqueDocid",
+      usage = "Removes duplicate documents with the same docid during indexing. This significantly slows indexing throughput " +
+              "but may be needed for tweet collections since the streaming API might deliver a tweet multiple times.")
+  public boolean uniqueDocid = false;
+
+  @Option(name = "-memorybuffer", metaVar = "[mb]",
+      usage = "Memory buffer size (in MB).")
+  public int memorybufferSize = 2048;
+
+  @Option(name = "-whitelist", metaVar = "[file]",
+      usage = "File containing list of docids, one per line; only these docids will be indexed.")
+  public String whitelist = null;
+
+
+  // Sharding options
+
+  @Option(name = "-shard.count", metaVar = "[n]",
+      usage = "Number of shards to partition the document collection into.")
+  public int shardCount = -1;
+
+  @Option(name = "-shard.current", metaVar = "[n]",
+      usage = "The current shard number to generate (indexed from 0).")
+  public int shardCurrent = -1;
+}