Skip to content

Commit

Permalink
Update Pyserini bindings to Lucene 9 (#1961)
Browse files Browse the repository at this point in the history
+ Expose Lucene 8 backwards compatibility bindings in SimpleSearcher and SimpleImpactSearcher:
  Basically, if we detect Lucene 8 indexes, we disable consistent tie-breaking, which depends on docvalues; see #1952
+ General cleanup (fixed code formatting in SimpleImpactSearcher)
+ Remove main in SimpleSearcher
+ Change to Python method names (snake_case)
  • Loading branch information
lintool authored Aug 17, 2022
1 parent 02fa99d commit e475cc4
Show file tree
Hide file tree
Showing 77 changed files with 812 additions and 718 deletions.
589 changes: 288 additions & 301 deletions src/main/java/io/anserini/search/SimpleImpactSearcher.java

Large diffs are not rendered by default.

398 changes: 113 additions & 285 deletions src/main/java/io/anserini/search/SimpleSearcher.java

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/main/java/io/anserini/search/SimpleTweetSearcher.java
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ public static void main(String[] args) throws Exception {
PrintWriter out = new PrintWriter(Files.newBufferedWriter(Paths.get(searchArgs.output), StandardCharsets.US_ASCII));

if (searchArgs.useRM3) {
searcher.setRM3();
searcher.set_rm3();
}

for (Object id : topics.keySet()) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
/*
* Anserini: A Lucene toolkit for reproducible information retrieval research
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.anserini.search;

import org.junit.Test;

import java.util.HashMap;
import java.util.Map;

import static org.junit.Assert.assertEquals;

/**
 * Exercises {@link SimpleImpactSearcher} against the prebuilt index fixture whose directory
 * name is prefixed {@code lucene8-index}.
 */
public class SimpleImpactSearcherPrebuiltLucene8Test {

  /**
   * Opens the fixture index, checks its document count, then issues two single-term
   * weighted queries and checks the hit count, docid and integer-truncated score of each.
   *
   * @throws Exception if the index cannot be opened, searched or closed
   */
  @Test
  public void testSearch1() throws Exception {
    SimpleImpactSearcher searcher =
        new SimpleImpactSearcher("src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_json_collection_tokenized");
    try {
      assertEquals(2, searcher.get_total_num_docs());

      SimpleImpactSearcher.Result[] hits;

      // Query is a map from term to weight.
      Map<String, Float> query = new HashMap<>();
      query.put("##ing", 1.0f);

      hits = searcher.search(query, 10);
      assertEquals(1, hits.length);
      assertEquals("2000001", hits[0].docid);
      // The cast truncates the score, so only its integer part is compared.
      assertEquals(2, (int) hits[0].score);

      query = new HashMap<>();
      query.put("test", 1.0f);
      hits = searcher.search(query, 10);
      assertEquals(1, hits.length);
      assertEquals("2000000", hits[0].docid);
      assertEquals(1, (int) hits[0].score);
    } finally {
      // Close the searcher even when an assertion above fails, so the index is not left open.
      searcher.close();
    }
  }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
/*
* Anserini: A Lucene toolkit for reproducible information retrieval research
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.anserini.search;

import org.junit.Test;

import java.util.HashMap;
import java.util.Map;

import static org.junit.Assert.assertEquals;

/**
 * Exercises {@link SimpleImpactSearcher} against the prebuilt index fixture whose directory
 * name is prefixed {@code lucene9-index}.
 */
public class SimpleImpactSearcherPrebuiltLucene9Test {

  /**
   * Opens the fixture index, checks its document count, then issues two single-term
   * weighted queries and checks the hit count, docid and integer-truncated score of each.
   *
   * @throws Exception if the index cannot be opened, searched or closed
   */
  @Test
  public void testSearch1() throws Exception {
    SimpleImpactSearcher searcher =
        new SimpleImpactSearcher("src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_json_collection_tokenized");
    try {
      assertEquals(2, searcher.get_total_num_docs());

      SimpleImpactSearcher.Result[] hits;

      // Query is a map from term to weight.
      Map<String, Float> query = new HashMap<>();
      query.put("##ing", 1.0f);

      hits = searcher.search(query, 10);
      assertEquals(1, hits.length);
      assertEquals("2000001", hits[0].docid);
      // The cast truncates the score, so only its integer part is compared.
      assertEquals(2, (int) hits[0].score);

      query = new HashMap<>();
      query.put("test", 1.0f);
      hits = searcher.search(query, 10);
      assertEquals(1, hits.length);
      assertEquals("2000000", hits[0].docid);
      assertEquals(1, (int) hits[0].score);
    } finally {
      // Close the searcher even when an assertion above fails, so the index is not left open.
      searcher.close();
    }
  }

}
65 changes: 29 additions & 36 deletions src/test/java/io/anserini/search/SimpleImpactSearcherTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,22 +25,21 @@
import java.util.Map;

public class SimpleImpactSearcherTest extends IndexerTestBase {

@Test
public void testGetDoc() throws Exception {
SimpleImpactSearcher searcher = new SimpleImpactSearcher(super.tempDir1.toString());

assertEquals("here is some text here is some more text. city.",
searcher.document(0).get("contents"));
assertEquals("more texts", searcher.document(1).get("contents"));
assertEquals("here is a test", searcher.document(2).get("contents"));
assertNull(searcher.document(3));
searcher.doc(0).get("contents"));
assertEquals("more texts", searcher.doc(1).get("contents"));
assertEquals("here is a test", searcher.doc(2).get("contents"));
assertNull(searcher.doc(3));

assertEquals("here is some text here is some more text. city.",
searcher.document("doc1").get("contents"));
assertEquals("more texts", searcher.document("doc2").get("contents"));
assertEquals("here is a test", searcher.document("doc3").get("contents"));
assertNull(searcher.document(3));
searcher.doc("doc1").get("contents"));
assertEquals("more texts", searcher.doc("doc2").get("contents"));
assertEquals("here is a test", searcher.doc("doc3").get("contents"));
assertNull(searcher.doc(3));

searcher.close();
}
Expand All @@ -50,9 +49,9 @@ public void testGetDocByField() throws Exception {
SimpleImpactSearcher searcher = new SimpleImpactSearcher(super.tempDir1.toString());

assertEquals("here is some text here is some more text. city.",
searcher.documentByField("id", "doc1").get("contents"));
assertEquals("more texts", searcher.documentByField("id", "doc2").get("contents"));
assertEquals("here is a test", searcher.documentByField("id", "doc3").get("contents"));
searcher.doc_by_field("id", "doc1").get("contents"));
assertEquals("more texts", searcher.doc_by_field("id", "doc2").get("contents"));
assertEquals("here is a test", searcher.doc_by_field("id", "doc3").get("contents"));

searcher.close();
}
Expand All @@ -61,21 +60,15 @@ public void testGetDocByField() throws Exception {
public void testGetContents() throws Exception {
SimpleImpactSearcher searcher = new SimpleImpactSearcher(super.tempDir1.toString());

assertEquals("here is some text here is some more text. city.",
searcher.documentContents(0));
assertEquals("more texts",
searcher.documentContents(1));
assertEquals("here is a test",
searcher.documentContents(2));
assertNull(searcher.document(3));
assertEquals("here is some text here is some more text. city.", searcher.doc_contents(0));
assertEquals("more texts", searcher.doc_contents(1));
assertEquals("here is a test", searcher.doc_contents(2));
assertNull(searcher.doc(3));

assertEquals("here is some text here is some more text. city.",
searcher.documentContents("doc1"));
assertEquals("more texts",
searcher.documentContents("doc2"));
assertEquals("here is a test",
searcher.documentContents("doc3"));
assertNull(searcher.documentContents("doc42"));
assertEquals("here is some text here is some more text. city.", searcher.doc_contents("doc1"));
assertEquals("more texts", searcher.doc_contents("doc2"));
assertEquals("here is a test", searcher.doc_contents("doc3"));
assertNull(searcher.doc_contents("doc42"));

searcher.close();
}
Expand All @@ -85,20 +78,20 @@ public void testGetRaw() throws Exception {
SimpleImpactSearcher searcher = new SimpleImpactSearcher(super.tempDir1.toString());

assertEquals("{\"contents\": \"here is some text here is some more text. city.\"}",
searcher.documentRaw(0));
searcher.doc_raw(0));
assertEquals("{\"contents\": \"more texts\"}",
searcher.documentRaw(1));
searcher.doc_raw(1));
assertEquals("{\"contents\": \"here is a test\"}",
searcher.documentRaw(2));
assertNull(searcher.document(3));
searcher.doc_raw(2));
assertNull(searcher.doc(3));

assertEquals("{\"contents\": \"here is some text here is some more text. city.\"}",
searcher.documentRaw("doc1"));
searcher.doc_raw("doc1"));
assertEquals("{\"contents\": \"more texts\"}",
searcher.documentRaw("doc2"));
searcher.doc_raw("doc2"));
assertEquals("{\"contents\": \"here is a test\"}",
searcher.documentRaw("doc3"));
assertNull(searcher.documentContents("doc42"));
searcher.doc_raw("doc3"));
assertNull(searcher.doc_contents("doc42"));

searcher.close();
}
Expand Down Expand Up @@ -184,7 +177,7 @@ public void testBatchSearch() throws Exception {
qids.add("query_test");
qids.add("query_more");

Map<String, SimpleImpactSearcher.Result[]> hits = searcher.batchSearch(queries, qids, 10, 2);
Map<String, SimpleImpactSearcher.Result[]> hits = searcher.batch_search(queries, qids, 10, 2);
assertEquals(2, hits.size());

assertEquals(1, hits.get("query_test").length);
Expand All @@ -200,6 +193,6 @@ public void testBatchSearch() throws Exception {
@Test
public void testTotalNumDocuments() throws Exception {
SimpleImpactSearcher searcher = new SimpleImpactSearcher(super.tempDir1.toString());
assertEquals(3 ,searcher.getTotalNumDocuments());
assertEquals(3 ,searcher.get_total_num_docs());
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/*
* Anserini: A Lucene toolkit for reproducible information retrieval research
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.anserini.search;

import org.junit.Test;

import static org.junit.Assert.assertEquals;

/**
 * Exercises {@link SimpleSearcher} against the prebuilt index fixture whose directory
 * name is prefixed {@code lucene8-index}.
 */
public class SimpleSearcherPrebuiltLucene8Test {

  /**
   * Opens the fixture index, checks its document count, then runs two single-term queries
   * and checks the ranked docids and their scores.
   *
   * @throws Exception if the index cannot be opened, searched or closed
   */
  @Test
  public void testSearch1() throws Exception {
    SimpleSearcher searcher =
        new SimpleSearcher("src/test/resources/prebuilt_indexes/lucene8-index.sample_docs_trec_colletion2");
    try {
      assertEquals(3, searcher.get_total_num_docs());

      SimpleSearcher.Result[] hits;

      // Note: the score tolerance 10e-4 is 0.001 (not 0.0001).
      hits = searcher.search("text", 10);
      assertEquals(3, hits.length);
      assertEquals("DOC222", hits[0].docid);
      assertEquals(0.1015f, hits[0].score, 10e-4);
      assertEquals("TREC_DOC_1", hits[1].docid);
      assertEquals(0.0738f, hits[1].score, 10e-4);
      assertEquals("WSJ_1", hits[2].docid);
      assertEquals(0.0687f, hits[2].score, 10e-4);

      hits = searcher.search("simple", 10);
      assertEquals(2, hits.length);
      assertEquals("TREC_DOC_1", hits[0].docid);
      assertEquals(0.2597f, hits[0].score, 10e-4);
      assertEquals("DOC222", hits[1].docid);
      assertEquals(0.2416f, hits[1].score, 10e-4);
    } finally {
      // Close the searcher even when an assertion above fails, so the index is not left open.
      searcher.close();
    }
  }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/*
* Anserini: A Lucene toolkit for reproducible information retrieval research
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.anserini.search;

import org.junit.Test;

import static org.junit.Assert.assertEquals;

/**
 * Exercises {@link SimpleSearcher} against the prebuilt index fixture whose directory
 * name is prefixed {@code lucene9-index}.
 */
public class SimpleSearcherPrebuiltLucene9Test {

  /**
   * Opens the fixture index, checks its document count, then runs two single-term queries
   * and checks the ranked docids and their scores.
   *
   * @throws Exception if the index cannot be opened, searched or closed
   */
  @Test
  public void testSearch1() throws Exception {
    SimpleSearcher searcher =
        new SimpleSearcher("src/test/resources/prebuilt_indexes/lucene9-index.sample_docs_trec_colletion2");
    try {
      assertEquals(3, searcher.get_total_num_docs());

      SimpleSearcher.Result[] hits;

      // Note: the score tolerance 10e-4 is 0.001 (not 0.0001).
      hits = searcher.search("text", 10);
      assertEquals(3, hits.length);
      assertEquals("DOC222", hits[0].docid);
      assertEquals(0.1015f, hits[0].score, 10e-4);
      assertEquals("TREC_DOC_1", hits[1].docid);
      assertEquals(0.0738f, hits[1].score, 10e-4);
      assertEquals("WSJ_1", hits[2].docid);
      assertEquals(0.0687f, hits[2].score, 10e-4);

      hits = searcher.search("simple", 10);
      assertEquals(2, hits.length);
      assertEquals("TREC_DOC_1", hits[0].docid);
      assertEquals(0.2597f, hits[0].score, 10e-4);
      assertEquals("DOC222", hits[1].docid);
      assertEquals(0.2416f, hits[1].score, 10e-4);
    } finally {
      // Close the searcher even when an assertion above fails, so the index is not left open.
      searcher.close();
    }
  }

}
Loading

0 comments on commit e475cc4

Please sign in to comment.