Skip to content

Commit

Permalink
initial implementation for hnsw (#1955)
Browse files Browse the repository at this point in the history
* initial implementation for hnsw indexing and search
* support multi-thread
* the corpus is currently stored in JSONL files
  • Loading branch information
MXueguang authored Aug 17, 2022
1 parent 5c7f455 commit 02fa99d
Show file tree
Hide file tree
Showing 11 changed files with 1,288 additions and 0 deletions.
8 changes: 8 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -105,10 +105,18 @@
<mainClass>io.anserini.index.IndexCollection</mainClass>
<id>IndexCollection</id>
</program>
<program>
<mainClass>io.anserini.index.IndexVectorCollection</mainClass>
<id>IndexVectorCollection</id>
</program>
<program>
<mainClass>io.anserini.search.SearchCollection</mainClass>
<id>SearchCollection</id>
</program>
<program>
<mainClass>io.anserini.search.SearchVectorCollection</mainClass>
<id>SearchVectorCollection</id>
</program>
<program>
<mainClass>io.anserini.search.SearchMsmarco</mainClass>
<id>SearchMsmarco</id>
Expand Down
93 changes: 93 additions & 0 deletions src/main/java/io/anserini/collection/VectorCollection.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
/*
* Anserini: A Lucene toolkit for reproducible information retrieval research
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.anserini.collection;

import com.fasterxml.jackson.databind.JsonNode;

import java.io.IOException;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.Map;


/**
 * A document collection of pre-encoded dense vectors for ANN (HNSW) search.
 * Each JSON record is expected to carry a "docid" field and a "vector" field; the
 * "vector" field is serialized to its JSON string form and exposed as the document
 * contents for indexing.
 */
public class VectorCollection extends DocumentCollection<VectorCollection.Document> {
  /**
   * Creates a collection rooted at the given path.
   *
   * @param path location of the collection
   */
  public VectorCollection(Path path) {
    this.path = path;
  }

  /**
   * Creates a segment that reads vector documents from a single file.
   *
   * @param p path of the file segment
   * @return a new segment over {@code p}
   * @throws IOException if the underlying segment cannot be opened
   */
  @Override
  public FileSegment<VectorCollection.Document> createFileSegment(Path p) throws IOException {
    return new VectorCollection.Segment<>(p);
  }

  /**
   * A file segment whose JSON records are materialized as {@link VectorCollection.Document}s.
   *
   * @param <T> the document type produced by this segment
   */
  public static class Segment<T extends VectorCollection.Document> extends JsonCollection.Segment<T> {
    /**
     * Opens a segment over the given file.
     *
     * @param path path of the file segment
     * @throws IOException if the underlying segment cannot be opened
     */
    public Segment(Path path) throws IOException {
      super(path);
    }

    @Override
    protected Document createNewDocument(JsonNode json) {
      return new Document(json);
    }
  }

  /**
   * A single vector document: a docid plus the JSON-serialized vector as its contents.
   */
  public static class Document extends JsonCollection.Document {
    private final String id;
    private final String contents;
    private final String raw;
    private final Map<String, String> fields;

    /**
     * Builds a document from one JSON record.
     *
     * <p>Missing (or explicit JSON {@code null}) "docid" / "vector" fields do not fail here;
     * they are recorded as absent so that {@link #id()} and {@link #contents()} can report
     * the problem with a descriptive message instead of a bare NullPointerException.
     *
     * @param json the parsed JSON record
     */
    public Document(JsonNode json) {
      super();
      this.raw = json.toPrettyString();

      // JsonNode.get returns null for an absent field; a NullNode would otherwise be
      // rendered as the literal string "null", so both cases are treated as missing.
      JsonNode idNode = json.get("docid");
      this.id = (idNode == null || idNode.isNull()) ? null : idNode.asText();

      JsonNode vectorNode = json.get("vector");
      this.contents = (vectorNode == null || vectorNode.isNull()) ? null : vectorNode.toString();

      // We're not going to index any other fields, so just initialize an empty map.
      this.fields = new HashMap<>();
    }

    /**
     * Returns the document id read from the "docid" field.
     *
     * @throws RuntimeException if the record had no "docid" field
     */
    @Override
    public String id() {
      if (id == null) {
        throw new RuntimeException("JSON document has no \"docid\" field!");
      }
      return id;
    }

    /**
     * Returns the JSON string form of the "vector" field.
     *
     * @throws RuntimeException if the record had no "vector" field
     */
    @Override
    public String contents() {
      if (contents == null) {
        throw new RuntimeException("JSON document has no contents that could be parsed!");
      }
      return contents;
    }

    /** Returns the pretty-printed JSON of the whole source record. */
    @Override
    public String raw() {
      return raw;
    }

    /** Returns the additional fields of this document; always an empty map for vector documents. */
    @Override
    public Map<String, String> fields() {
      return fields;
    }
  }
}
3 changes: 3 additions & 0 deletions src/main/java/io/anserini/index/IndexArgs.java
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,12 @@ public class IndexArgs {
// This is the name of the field in the Lucene document where the entity document is stored.
public static final String ENTITY = "entity";

// This is the name of the field in the Lucene document where the vector document is stored.
public static final String VECTOR = "vector";

private static final int TIMEOUT = 600 * 1000;


// required arguments

@Option(name = "-input", metaVar = "[path]", required = true,
Expand Down
120 changes: 120 additions & 0 deletions src/main/java/io/anserini/index/IndexVectorArgs.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
/*
* Anserini: A Lucene toolkit for reproducible information retrieval research
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.anserini.index;

import org.kohsuke.args4j.Option;
import org.kohsuke.args4j.spi.StringArrayOptionHandler;


/**
 * Command-line arguments for vector indexing, declared as args4j {@code @Option} fields.
 * Each public field holds the parsed value of one option; the initializer on a field is the
 * value it keeps when the option is not supplied.
 *
 * NOTE(review): presumably consumed by IndexVectorCollection — confirm against the caller.
 * Several options below (e.g. -storePositions, -storeDocvectors) describe term-based indexing;
 * whether they have any effect on vector indexing cannot be determined from this class alone.
 */
public class IndexVectorArgs {

// This is the name of the field in the Lucene document where the docid is stored.
public static final String ID = "id";

// This is the name of the field in the Lucene document that should be searched by default.
public static final String CONTENTS = "contents";

// This is the name of the field in the Lucene document where the raw document is stored.
public static final String RAW = "raw";

// This is the name of the field in the Lucene document where the vector document is stored.
public static final String VECTOR = "vector";

// NOTE(review): private and not referenced anywhere inside this class, so it appears unused.
// Presumably a timeout in milliseconds (600 * 1000 = 10 minutes) — confirm before removing.
private static final int TIMEOUT = 600 * 1000;


// required arguments

@Option(name = "-input", metaVar = "[path]", required = true,
usage = "Location of input collection.")
public String input;

@Option(name = "-threads", metaVar = "[num]", required = true,
usage = "Number of indexing threads.")
public int threads;

@Option(name = "-collection", metaVar = "[class]", required = true,
usage = "Collection class in package 'io.anserini.collection'.")
public String collectionClass;

// Unlike the options above, this one is not marked required; it falls back to the default below.
@Option(name = "-generator", metaVar = "[class]",
usage = "Document generator class in package 'io.anserini.index.generator'.")
public String generatorClass = "DefaultLuceneDocumentGenerator";

// optional general arguments

// -verbose and -quiet are declared mutually exclusive via 'forbids'.
@Option(name = "-verbose", forbids = {"-quiet"},
usage = "Enables verbose logging for each indexing thread; can be noisy if collection has many small file segments.")
public boolean verbose = false;

@Option(name = "-quiet", forbids = {"-verbose"},
usage = "Turns off all logging.")
public boolean quiet = false;

// optional arguments

// No default is assigned, so this stays null when -index is not supplied.
@Option(name = "-index", metaVar = "[path]", usage = "Index path.")
public String index;

// Defaults to an empty array, i.e. no additional fields.
@Option(name = "-fields", handler = StringArrayOptionHandler.class,
usage = "List of fields to index (space separated), in addition to the default 'contents' field.")
public String[] fields = new String[]{};

@Option(name = "-storePositions",
usage = "Boolean switch to index store term positions; needed for phrase queries.")
public boolean storePositions = false;

@Option(name = "-storeDocvectors",
usage = "Boolean switch to store document vectors; needed for (pseudo) relevance feedback.")
public boolean storeDocvectors = false;

@Option(name = "-storeContents",
usage = "Boolean switch to store document contents.")
public boolean storeContents = false;

@Option(name = "-storeRaw",
usage = "Boolean switch to store raw source documents.")
public boolean storeRaw = false;

@Option(name = "-optimize",
usage = "Boolean switch to optimize index (i.e., force merge) into a single segment; costly for large collections.")
public boolean optimize = false;

@Option(name = "-uniqueDocid",
usage = "Removes duplicate documents with the same docid during indexing. This significantly slows indexing throughput " +
"but may be needed for tweet collections since the streaming API might deliver a tweet multiple times.")
public boolean uniqueDocid = false;

// Default buffer size is 2048 (MB, per the usage text).
@Option(name = "-memorybuffer", metaVar = "[mb]",
usage = "Memory buffer size (in MB).")
public int memorybufferSize = 2048;

// null means no whitelist file was given.
@Option(name = "-whitelist", metaVar = "[file]",
usage = "File containing list of docids, one per line; only these docids will be indexed.")
public String whitelist = null;


// Sharding options
// Both default to -1; presumably -1 means sharding is disabled — verify against the consumer.

@Option(name = "-shard.count", metaVar = "[n]",
usage = "Number of shards to partition the document collection into.")
public int shardCount = -1;

@Option(name = "-shard.current", metaVar = "[n]",
usage = "The current shard number to generate (indexed from 0).")
public int shardCurrent = -1;
}
Loading

0 comments on commit 02fa99d

Please sign in to comment.