query_toks support for terrier.Retriever (#466)

Co-authored-by: Sean MacAvaney <sean.macavaney@gmail.com>
terrier-org · Aug 23, 2024 · bcf3819 · bcf3819
1 parent 4988b70
commit bcf3819
Show file tree

Hide file tree

Showing 4 changed files with 157 additions and 23 deletions.
diff --git a/docs/terrier-retrieval.rst b/docs/terrier-retrieval.rst
@@ -35,14 +35,67 @@ Retriever
 
 
 
+Query Formats for Terrier retrievers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+By default Terrier assumes that queries can be parsed by its `standard query parser <https://github.com/terrier-org/terrier-core/blob/5.x/doc/querylanguage.md#user-query-language>`_,
+which is a standard search-engine-like query language. Queries provided by Dataset objects are assumed to be in this format, using the 
+standard `["qid", "query"]` dataframe columns. 
+
+Two alternative query formats are also supported:
+
+ - MatchOp - this is a `lower-level query language <https://github.com/terrier-org/terrier-core/blob/5.x/doc/querylanguage.md#matching-op-query-language>`_ supported by Terrier, which is Indri-like in nature, and supports operators like ``#1()`` (exact phrase) and ``#combine()`` (weighting). MatchOp queries are stored in the `"query"` column. 
+
+ - pre-tokenised queries - in this format, query terms are provided, with weights, in a dictionary. Query terms are assumed to be already stemmed. This
+ format is useful for techniques that weight query terms, such as for Learned Sparse Retrieval (e.g. see `pyterrier_splade <https://github.com/cmacdonald/pyt_splade>`_).
+
+The following query dataframes are therefore equivalent:
+
+ - Raw query:
+
+    =====  =============================
+    qid    query         
+    =====  =============================
+        1  chemical chemical reactions
+    =====  =============================
+
+ - Using Terrier's QL to express weights on query terms:
+
+    =====  =============================
+    qid    query         
+    =====  =============================
+        1  chemical^2 reactions
+    =====  =============================
+
+ - Using Terrier's MatchOpQL to express weights on stemmed and tokenised query terms:
+
+    =====  ======================================
+    qid    query         
+    =====  ======================================
+        1  #combine:0=2:1=1(chemic reaction)
+    =====  ======================================
+
+ - Use the query_toks column (the query column is ignored):
+
+    =====  ====================================== =============================
+    qid    query_toks                             query         
+    =====  ====================================== =============================
+        1  {'chemic' : 2.0, 'reaction' : 1}       chemical chemical reactions
+    =====  ====================================== =============================
+
+
+
 Terrier Configuration
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 When using PyTerrier, we have to be aware of the underlying Terrier configuration, 
-namely *properties* and *controls*. Properties are global configuration and were 
-traditionally configured by editing a `terrier.properties` file; In contrast, 
-controls are per-query configuration. In PyTerrier, we specify both when we construct
-the Retriever object:
+namely *properties* and *controls*. We aim to surface the most common configuration options 
+through the Python API, but occasionally it's necessary to resort to properties or controls
+directly. 
+
+Properties are global configuration and were traditionally configured by editing a 
+`terrier.properties` file; in contrast, controls are per-query configuration. In PyTerrier, 
+we specify both when we construct the Retriever object:
 
 Common controls:
  - `"wmodel"` - the name of the weighting model. (This can also be specified using the wmodel kwarg).
@@ -55,7 +108,8 @@ Common controls:
 Common properties:
  - `"termpipelines"` - the default Terrier term pipeline configuration is `"Stopwords,PorterStemmer"`.
    If you have created an index with a different configuration, you will need to set the  `"termpipelines"`
-   property for *each* Retriever constructed.
+   property for *each* Retriever constructed. NB: These are now configurable using ``stemming=`` and
+   ``stopwords=`` kwargs.
 
 **Examples**::
 
@@ -98,7 +152,7 @@ Good Practice::
     pl2 = pt.terrier.Retriever(index, wmodel="PL2")
     # here, we share the index between two instances of Retriever
 
-You can use the IndexFactory to specify that the index data structures to be loaded into memory::
+You can use the IndexFactory to specify that the index data structures should be loaded into memory, which can improve efficiency::
 
     # load all structures into memory
     inmemindex = pt.IndexFactory.of("/path/to/data.properties", memory=True)

diff --git a/pyterrier/terrier/retriever.py b/pyterrier/terrier/retriever.py
@@ -9,6 +9,7 @@
 import concurrent
 from concurrent.futures import ThreadPoolExecutor
 import pyterrier as pt
+from typing import Dict
 
 _matchops = ["#combine", "#uw", "#1", "#tag", "#prefix", "#band", "#base64", "#syn"]
 def _matchop(query):
@@ -17,6 +18,18 @@ def _matchop(query):
             return True
     return False
 
+def _querytoks2matchop(query_toks: Dict[str,float]) -> str:
+    def _matchop_tok(t, w):
+        import base64
+        import string
+        if not all(a in string.ascii_letters + string.digits for a in t):
+            encoded = base64.b64encode(t.encode('utf-8')).decode("utf-8") 
+            t = f'#base64({encoded})'
+        if w != 1:
+            t = f'#combine:0={w:f}({t})'
+        return t
+    return ' '.join([ _matchop_tok(t, w) for (t,w) in query_toks.items() ])
+
 @pt.java.required
 def _function2wmodel(function):
     from jnius import PythonJavaClass, java_method
@@ -301,13 +314,19 @@ def __setstate__(self, d):
     def _retrieve_one(self, row, input_results=None, docno_provided=False, docid_provided=False, scores_provided=False):
         rank = FIRST_RANK
         qid = str(row.qid)
-        query = row.query
-        if len(query) == 0:
-            warn("Skipping empty query for qid %s" % qid)
-            return []
 
-        srq = self.manager.newSearchRequest(qid, query)
-
+        # row is a namedtuple, whose fields are exposed in _fields
+        query_toks_present : bool = 'query_toks' in row._fields
+        if query_toks_present:
+            query = '' # Clear the query so it doesn't match the "applypipeline:off" or "_matchop" condictions below... The query_toks query is converted below.
+            srq = self.manager.newSearchRequest(qid)
+        else:
+            query = row.query
+            if len(query) == 0:
+                warn("Skipping empty query for qid %s" % qid)
+                return []
+            srq = self.manager.newSearchRequest(qid, query)
+
         for control, value in self.controls.items():
             srq.setControl(control, str(value))
 
@@ -326,6 +345,17 @@ def _retrieve_one(self, row, input_results=None, docno_provided=False, docid_pro
             srq.setControl("parseql", "off")
             srq.setControl("matchopql", "on")
 
+        if query_toks_present:
+            if len(row.query_toks) == 0:
+                warn("Skipping empty query_toks for qid %s" % qid)
+                return []
+            srq.setControl("terrierql", "off")
+            srq.setControl("parsecontrols", "off")
+            srq.setControl("parseql", "off")
+            srq.setControl("matchopql", "on")
+            query = _querytoks2matchop(row.query_toks)
+            srq.setOriginalQuery(query)
+
         #ask decorate only to grab what we need
         srq.setControl("decorate", ",".join(self.metadata))
 
@@ -719,12 +749,18 @@ def transform(self, queries):
         newscores=[]
         for row in pt.tqdm(queries.itertuples(), desc=str(self), total=queries.shape[0], unit="q") if self.verbose else queries.itertuples():
             qid = str(row.qid)
-            query = row.query
-            if len(query) == 0:
-                warn("Skipping empty query for qid %s" % qid)
-                continue
-
-            srq = self.manager.newSearchRequest(qid, query)
+            query_toks_present : bool = 'query_toks' in row._fields
+            if query_toks_present:
+                # Even though it might look like we should parse the query toks here, we don't want the resulting query to be caught by the conditions
+                # that come before the "if query_toks_present" check. So we set it to an empty string and handle the parsing below.
+                query = ''
+                srq = self.manager.newSearchRequest(qid)
+            else:
+                query = row.query
+                if len(query) == 0:
+                    warn("Skipping empty query for qid %s" % qid)
+                    continue
+                srq = self.manager.newSearchRequest(qid, query)
 
             for control, value in self.controls.items():
                 srq.setControl(control, str(value))
@@ -741,6 +777,17 @@ def transform(self, queries):
                 srq.setControl("parseql", "off")
                 srq.setControl("matchopql", "on")
 
+            if query_toks_present:
+                if len(row.query_toks) == 0:
+                    warn("Skipping empty query_toks for qid %s" % qid)
+                    return []
+                srq.setControl("terrierql", "off")
+                srq.setControl("parsecontrols", "off")
+                srq.setControl("parseql", "off")
+                srq.setControl("matchopql", "on")
+                query = _querytoks2matchop(row.query_toks)
+                srq.setOriginalQuery(query)
+
             # this handles the case that a candidate set of documents has been set. 
             if docno_provided or docid_provided:
                 # we use RequestContextMatching to make a ResultSet from the 

diff --git a/tests/test_br.py b/tests/test_br.py
@@ -56,6 +56,25 @@ def test_br_cutoff(self):
         result = retr.transform(input_set)
         self.assertEqual(10, len(result))
 
+    def test_br_query_toks(self):
+        indexloc = self.here + "/fixtures/index/data.properties"
+
+        retr = pt.terrier.Retriever(indexloc)
+        query_terrier = 'applytermpipeline:off chemic^2 reaction^0.5'
+        result_terrier = retr.search(query_terrier)
+
+        query_matchop = '#combine:0=2:1=0.5(chemic reaction)'
+        result_matchop = retr.search(query_matchop)
+
+        query_toks = { 'chemic' : 2, 'reaction' : 0.5}
+        result_toks = retr.transform(pd.DataFrame([['1', query_toks]], columns=['qid', 'query_toks']))
+
+        self.assertEqual(len(result_terrier), len(result_matchop))
+        self.assertEqual(len(result_terrier), len(result_toks))
+        from pandas.testing import assert_frame_equal
+        assert_frame_equal(result_terrier[["qid", "docno", "score", "rank"]], result_matchop[["qid", "docno", "score", "rank"]])
+        assert_frame_equal(result_terrier[["qid", "docno", "score", "rank"]], result_toks[["qid", "docno", "score", "rank"]])
+
     def test_br_cutoff_stability(self):
         indexloc = self.here + "/fixtures/index/data.properties"
         input_set = pd.DataFrame([
@@ -197,7 +216,6 @@ def test_num_manual_wmodel(self):
         except JavaException as ja:
             print(ja.stacktrace)
             raise ja
-
 
     def test_num_python_wmodel(self):
         indexref = self.here+"/fixtures/index/data.properties"

diff --git a/tests/test_fbr.py b/tests/test_fbr.py
@@ -90,10 +90,6 @@ def test_fbr_reranking2(self):
         result1F_map = { row.docno : row.feature0 for row in result1.itertuples() }
         result2_map = { row.docno : row.score for row in result2.itertuples() }
 
-        print(result1F_map)
-        print(result2_map)
-
-
         # check features scores
         # NB: places can go no less than 4, as two documents have similar PL2 scores
         for rank, row in enumerate(result0.itertuples()):
@@ -141,6 +137,25 @@ def test_fbr(self):
             retrBasic = pt.terrier.Retriever(indexref)
             if "matching" in retrBasic.controls:
                 self.assertNotEqual(retrBasic.controls["matching"], "FatFeaturedScoringMatching,org.terrier.matching.daat.FatFull")
+
+    def test_fbr_query_toks(self):
+        indexloc = self.here + "/fixtures/index/data.properties"
+
+        retr = pt.terrier.FeaturesRetriever(indexloc, ["WMODEL:PL2"], wmodel="DPH")
+        query_terrier = 'applytermpipeline:off chemic^2 reaction^0.5'
+        result_terrier = retr.search(query_terrier)
+
+        query_matchop = '#combine:0=2:1=0.5(chemic reaction)'
+        result_matchop = retr.search(query_matchop)
+
+        query_toks = { 'chemic' : 2, 'reaction' : 0.5}
+        result_toks = retr.transform(pd.DataFrame([['1', query_toks]], columns=['qid', 'query_toks']))
+
+        self.assertEqual(len(result_terrier), len(result_matchop))
+        self.assertEqual(len(result_terrier), len(result_toks))
+        from pandas.testing import assert_frame_equal
+        assert_frame_equal(result_terrier[["qid", "docno", "score", "rank", "features"]], result_matchop[["qid", "docno", "score", "rank", "features"]])
+        assert_frame_equal(result_terrier[["qid", "docno", "score", "rank", "features"]], result_toks[["qid", "docno", "score", "rank", "features"]])
 
     def test_fbr_example(self):
         JIR = pt.java.autoclass('org.terrier.querying.IndexRef')