initial implementation for hnsw #1955

Merged
merged 10 commits on Aug 17, 2022
8 changes: 8 additions & 0 deletions pom.xml
@@ -105,10 +105,18 @@
<mainClass>io.anserini.index.IndexCollection</mainClass>
<id>IndexCollection</id>
</program>
<program>
<mainClass>io.anserini.index.IndexVectorCollection</mainClass>
<id>IndexVectorCollection</id>
</program>
<program>
<mainClass>io.anserini.search.SearchCollection</mainClass>
<id>SearchCollection</id>
</program>
<program>
<mainClass>io.anserini.search.SearchVectorCollection</mainClass>
<id>SearchVectorCollection</id>
</program>
<program>
<mainClass>io.anserini.search.SearchMsmarco</mainClass>
<id>SearchMsmarco</id>
93 changes: 93 additions & 0 deletions src/main/java/io/anserini/collection/VectorCollection.java
@@ -0,0 +1,93 @@
/*
* Anserini: A Lucene toolkit for reproducible information retrieval research
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.anserini.collection;

import com.fasterxml.jackson.databind.JsonNode;

import java.io.IOException;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.Map;


/**
* A document collection for encoded dense vectors for ANN (HNSW) search.
* The "vector" field are concatenated into the "contents" field for indexing.
*/
public class VectorCollection extends DocumentCollection<VectorCollection.Document> {
public VectorCollection(Path path) {
this.path = path;
}

@Override
public FileSegment<VectorCollection.Document> createFileSegment(Path p) throws IOException {
return new VectorCollection.Segment<>(p);
}

public static class Segment<T extends VectorCollection.Document> extends JsonCollection.Segment<T> {
public Segment(Path path) throws IOException {
super(path);
}

@Override
protected Document createNewDocument(JsonNode json) {
return new Document(json);
}
}

public static class Document extends JsonCollection.Document {
private final String id;
private final String contents;
private final String raw;
private Map<String, String> fields;

public Document(JsonNode json) {
super();
this.raw = json.toPrettyString();
this.id = json.get("docid").asText();
this.contents = json.get("vector").toString();
// We're not going to index any other fields, so just initialize an empty map.
this.fields = new HashMap<>();
}

@Override
public String id() {
if (id == null) {
throw new RuntimeException("JSON document has no \"_id\" field!");
}
return id;
}

@Override
public String contents() {
if (contents == null) {
throw new RuntimeException("JSON document has no contents that could be parsed!");
}
return contents;
}

@Override
public String raw() {
return raw;
}

@Override
public Map<String, String> fields() {
return fields;
}
}
}
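For context, here is a minimal sketch (not part of this PR) of how a single input record would flow through the Document class above, using Jackson directly. The "docid" and "vector" field names match the parsing code in the diff; the example values and the wrapper class name are hypothetical.

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

import io.anserini.collection.VectorCollection;

public class VectorCollectionExample {
  public static void main(String[] args) throws Exception {
    // A hypothetical one-document JSON record with the two fields the
    // Document constructor reads: "docid" and "vector".
    String record = "{\"docid\": \"doc0\", \"vector\": [0.12, -0.34, 0.56]}";

    JsonNode json = new ObjectMapper().readTree(record);
    VectorCollection.Document doc = new VectorCollection.Document(json);

    System.out.println(doc.id());        // doc0
    System.out.println(doc.contents());  // [0.12,-0.34,0.56] -- the JSON array serialized as a string
  }
}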
3 changes: 3 additions & 0 deletions src/main/java/io/anserini/index/IndexArgs.java
@@ -36,9 +36,12 @@ public class IndexArgs {
// This is the name of the field in the Lucene document where the entity document is stored.
public static final String ENTITY = "entity";

// This is the name of the field in the Lucene document where the vector document is stored.
public static final String VECTOR = "vector";

private static final int TIMEOUT = 600 * 1000;


// required arguments

@Option(name = "-input", metaVar = "[path]", required = true,
120 changes: 120 additions & 0 deletions src/main/java/io/anserini/index/IndexVectorArgs.java
@@ -0,0 +1,120 @@
/*
* Anserini: A Lucene toolkit for reproducible information retrieval research
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.anserini.index;

import org.kohsuke.args4j.Option;
import org.kohsuke.args4j.spi.StringArrayOptionHandler;


public class IndexVectorArgs {

// This is the name of the field in the Lucene document where the docid is stored.
public static final String ID = "id";

// This is the name of the field in the Lucene document that should be searched by default.
public static final String CONTENTS = "contents";

// This is the name of the field in the Lucene document where the raw document is stored.
public static final String RAW = "raw";

// This is the name of the field in the Lucene document where the vector document is stored.
public static final String VECTOR = "vector";

private static final int TIMEOUT = 600 * 1000;


// required arguments

@Option(name = "-input", metaVar = "[path]", required = true,
usage = "Location of input collection.")
public String input;

@Option(name = "-threads", metaVar = "[num]", required = true,
usage = "Number of indexing threads.")
public int threads;

@Option(name = "-collection", metaVar = "[class]", required = true,
usage = "Collection class in package 'io.anserini.collection'.")
public String collectionClass;

@Option(name = "-generator", metaVar = "[class]",
usage = "Document generator class in package 'io.anserini.index.generator'.")
public String generatorClass = "DefaultLuceneDocumentGenerator";

// optional general arguments

@Option(name = "-verbose", forbids = {"-quiet"},
usage = "Enables verbose logging for each indexing thread; can be noisy if collection has many small file segments.")
public boolean verbose = false;

@Option(name = "-quiet", forbids = {"-verbose"},
usage = "Turns off all logging.")
public boolean quiet = false;

// optional arguments

@Option(name = "-index", metaVar = "[path]", usage = "Index path.")
public String index;

@Option(name = "-fields", handler = StringArrayOptionHandler.class,
usage = "List of fields to index (space separated), in addition to the default 'contents' field.")
public String[] fields = new String[]{};

@Option(name = "-storePositions",
usage = "Boolean switch to index store term positions; needed for phrase queries.")
public boolean storePositions = false;

@Option(name = "-storeDocvectors",
usage = "Boolean switch to store document vectors; needed for (pseudo) relevance feedback.")
public boolean storeDocvectors = false;

@Option(name = "-storeContents",
usage = "Boolean switch to store document contents.")
public boolean storeContents = false;

@Option(name = "-storeRaw",
usage = "Boolean switch to store raw source documents.")
public boolean storeRaw = false;

@Option(name = "-optimize",
usage = "Boolean switch to optimize index (i.e., force merge) into a single segment; costly for large collections.")
public boolean optimize = false;

@Option(name = "-uniqueDocid",
usage = "Removes duplicate documents with the same docid during indexing. This significantly slows indexing throughput " +
"but may be needed for tweet collections since the streaming API might deliver a tweet multiple times.")
public boolean uniqueDocid = false;

@Option(name = "-memorybuffer", metaVar = "[mb]",
usage = "Memory buffer size (in MB).")
public int memorybufferSize = 2048;

@Option(name = "-whitelist", metaVar = "[file]",
usage = "File containing list of docids, one per line; only these docids will be indexed.")
public String whitelist = null;


// Sharding options

@Option(name = "-shard.count", metaVar = "[n]",
usage = "Number of shards to partition the document collection into.")
public int shardCount = -1;

@Option(name = "-shard.current", metaVar = "[n]",
usage = "The current shard number to generate (indexed from 0).")
public int shardCurrent = -1;
}
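As a usage note (not from the PR): these options are declared with args4j, so a driver would typically bind them through CmdLineParser. Below is a minimal sketch with hypothetical argument values; the actual IndexVectorCollection entry point added in this PR may wire this up differently.

import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.ParserProperties;

import io.anserini.index.IndexVectorArgs;

public class IndexVectorArgsExample {
  public static void main(String[] argv) {
    // Example command line (hypothetical paths):
    //   -collection VectorCollection -input collections/vectors \
    //   -index indexes/sample-hnsw -threads 8
    IndexVectorArgs args = new IndexVectorArgs();
    CmdLineParser parser = new CmdLineParser(args, ParserProperties.defaults().withUsageWidth(100));
    try {
      parser.parseArgument(argv);
    } catch (CmdLineException e) {
      // Print the parse error and the auto-generated usage string for all @Option fields.
      System.err.println(e.getMessage());
      parser.printUsage(System.err);
      return;
    }
    System.out.println("Indexing " + args.input + " into " + args.index
        + " using " + args.threads + " threads");
  }
}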