terrier-org · cmacdonald · Oct 26, 2023 · Mar 28, 2023 · Apr 3, 2023 · Oct 11, 2023
diff --git a/docs/terrier-index-api.rst b/docs/terrier-index-api.rst
@@ -38,6 +38,13 @@ If you want to access the underlying data structures, you need to use IndexFacto
 
 NB: BatchRetrieve will accept anything "index-like", i.e. a string location of an index, an IndexRef or an Index.
 
+We can also ask for the index to be loaded into memory::
+
+    index = pt.IndexFactory.of("/path/to/data.properties", memory=True)
+
+.. autoclass:: pyterrier.IndexFactory
+    :members: of
+
 Whats in an Index
 =================
 

diff --git a/docs/terrier-retrieval.rst b/docs/terrier-retrieval.rst
@@ -98,6 +98,17 @@ Good Practice::
     pl2 = pt.BatchRetrieve(index, wmodel="PL2")
     # here, we share the index between two instances of BatchRetrieve
 
+You can use the IndexFactory to specify that the index data structures should be loaded into memory::
+
+    # load all structures into memory
+    inmemindex = pt.IndexFactory.of("/path/to/data.properties", memory=True)
+    bm25_fast = pt.BatchRetrieve(inmemindex, wmodel="BM25")
+
+    # load just inverted and lexicon into memory
+    inmem_inverted_index = pt.IndexFactory.of("/path/to/data.properties", memory=['inverted', 'lexicon'])
+    bm25_fast = pt.BatchRetrieve(inmem_inverted_index, wmodel="BM25")
+
+
 TextScorer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 

diff --git a/pyterrier/__init__.py b/pyterrier/__init__.py
@@ -145,6 +145,7 @@ def init(version=None, mem=None, packages=[], jvm_opts=[], redirect_io=True, log
 
     from .batchretrieve import BatchRetrieve, FeaturesBatchRetrieve
     from .utils import Utils
+    from .bootstrap import IndexFactory
     from .datasets import get_dataset, find_datasets, list_datasets
     from .index import Indexer, FilesIndexer, TRECCollectionIndexer, DFIndexer, DFIndexUtils, IterDictIndexer, IndexingType, TerrierStemmer, TerrierStopwords, TerrierTokeniser
     from .pipelines import Experiment, GridScan, GridSearch, KFoldGridSearch
@@ -193,7 +194,7 @@ def init(version=None, mem=None, packages=[], jvm_opts=[], redirect_io=True, log
     globals()["DFIndexer"] = DFIndexer
     globals()["DFIndexUtils"] = DFIndexUtils
     globals()["IterDictIndexer"] = IterDictIndexer
-    globals()["IndexFactory"] = autoclass("org.terrier.structures.IndexFactory")
+    globals()["IndexFactory"] = IndexFactory
     globals()["IndexRef"] = autoclass("org.terrier.querying.IndexRef")
     globals()["IndexingType"] = IndexingType
     globals()["TerrierStemmer"] = TerrierStemmer

diff --git a/pyterrier/bootstrap.py b/pyterrier/bootstrap.py
@@ -1,10 +1,129 @@
 from . import mavenresolver
+from typing import Union, List
 
 stdout_ref = None
 stderr_ref = None
 TERRIER_PKG = "org.terrier"
 SAVED_FNS=[]
 
+class IndexFactory:
+    """
+    The ``of()`` method of this factory class loads a Terrier `Index <http://terrier.org/docs/current/javadoc/org/terrier/structures/Index.html>`_.
+
+    NB: This class "shades" the native Terrier `IndexFactory <http://terrier.org/docs/current/javadoc/org/terrier/structures/IndexFactory.html>`_ class - it offers essentially the same API,
+    except that the ``of()`` method contains a memory kwarg, that can be used to load additional index data structures into memory.
+
+    Terrier data structures that can be loaded into memory:
+     - 'inverted' - the inverted index, contains posting lists for each term. In the default configuration, this is read in from disk in chunks.
+     - 'lexicon' - the dictionary. By default, a binary search of the on-disk structure is used, so loading into memory can enhance speed.
+     - 'meta' - metadata about documents. Used as the final stage of retrieval, one seek for each retrieved document.
+     - 'direct' - contains posting lists for each document. No speed advantage for loading into memory unless pseudo-relevance feedback is being used.
+     - 'document' - contains document lengths, which are anyway loaded into memory. No speed advantage for loading into memory unless pseudo-relevance feedback is being used.
+    """
+
+    @staticmethod
+    def _load_into_memory(index, structures=('lexicon', 'direct', 'inverted', 'meta'), load=False):
+        """
+        Rewrite the properties of ``index`` so that the named structures are served from memory.
+
+        Args:
+            index: an on-disk Terrier index (it is cast to ``IndexOnDisk``).
+            structures: names of structures to switch; names the index does not contain are skipped.
+            load(bool): if True, every changed structure is loaded before returning.
+
+        Returns the ``index`` passed in; raises ValueError if a structure cannot be switched.
+        """
+        # both metaindex implementations have the same properties
+        meta_in_mem = {'index.meta.index-source' : 'fileinmem', 'index.meta.data-source' : 'fileinmem'}
+        REWRITES = {
+            'meta' : {
+                'org.terrier.structures.ZstdCompressedMetaIndex' : meta_in_mem,
+                'org.terrier.structures.CompressingMetaIndex' : meta_in_mem,
+            },
+            'lexicon' : {
+                'org.terrier.structures.FSOMapFileLexicon' : {'index.lexicon.data-source' : 'fileinmem'},
+            },
+            'direct' : {
+                'org.terrier.structures.bit.BitPostingIndex' : {'index.direct.data-source' : 'fileinmem'},
+            },
+            'inverted' : {
+                # the property is named after the structure: index.inverted.*, not index.direct.*
+                'org.terrier.structures.bit.BitPostingIndex' : {'index.inverted.data-source' : 'fileinmem'},
+            },
+        }
+        if "direct" in structures:
+            REWRITES['document'] = {
+                # we have to be sensitive to the presence of fields or not
+                # NB: loading these structures into memory only benefit direct index access
+                'org.terrier.structures.FSADocumentIndex' : {'index.document.class' : 'FSADocumentIndexInMem'},
+                'org.terrier.structures.FSAFieldDocumentIndex' : {'index.document.class' : 'FSADocumentIndexInMemFields'},
+            }
+
+        from . import cast
+        pindex = cast("org.terrier.structures.IndexOnDisk", index)
+        dirty_structures = set()
+        for s in structures:
+            if not pindex.hasIndexStructure(s):
+                continue
+            if s not in REWRITES:
+                raise ValueError(f"Cannot load structure {s} into memory, no in-memory configuration is known for it")
+            clz = pindex.getIndexProperty(f"index.{s}.class", "notfound")
+            if clz not in REWRITES[s]:
+                raise ValueError(f"Cannot load structure {s} into memory, underlying class {clz} is not supported")
+
+            # we only reload an index structure if a property has changed
+            for k, v in REWRITES[s][clz].items():
+                if pindex.getIndexProperty(k, "notset") != v:
+                    pindex.setIndexProperty(k, v)
+                    dirty_structures.add(s)
+
+                    # if the document index is reloaded, the inverted index should be reloaded too
+                    # NB: the direct index needs reloaded too, but this option is only available IF
+                    # the direct index is setup
+                    if s == "document":
+                        dirty_structures.add("inverted")
+
+        # remove the old data structures from memory
+        for s in dirty_structures:
+            if pindex.structureCache.containsKey(s):
+                pindex.structureCache.remove(s)
+
+        # force the index structures to be loaded now
+        if load:
+            for s in dirty_structures:
+                pindex.getIndexStructure(s)
+
+        # dont allow the index properties to be rewritten
+        pindex.dirtyProperties = False
+        return index
+
+    @staticmethod
+    def of(indexlike, memory : Union[bool, List[str]] = False):
+        """
+        Loads an index. Returns a Terrier `Index <http://terrier.org/docs/current/javadoc/org/terrier/structures/Index.html>`_ object.
+
+        Args:
+            indexlike(str or IndexRef): Location of the index
+            memory(bool or List[str]): If the index should be loaded into memory. Use `True` for the lexicon, direct, inverted and meta structures, or a list of structure names.
+        """
+        from . import autoclass
+        IOD = autoclass("org.terrier.structures.IndexOnDisk")
+        load_profile = IOD.getIndexLoadingProfileAsRetrieval()
+
+        # the loading profile is set on the IndexOnDisk class itself, so restore it even if opening fails
+        if memory:
+            IOD.setIndexLoadingProfileAsRetrieval(False)
+        try:
+            index = autoclass("org.terrier.structures.IndexFactory").of(indexlike)
+        finally:
+            # noop if memory is False
+            IOD.setIndexLoadingProfileAsRetrieval(load_profile)
+        if not memory:
+            return index
+        if isinstance(memory, list):
+            return IndexFactory._load_into_memory(index, structures=memory)
+        return IndexFactory._load_into_memory(index)
+
 def logging(level):
     from jnius import autoclass
     autoclass("org.terrier.python.PTUtils").setLogLevel(level, None)

diff --git a/pyterrier/index.py b/pyterrier/index.py
@@ -39,7 +39,6 @@
 Properties = None
 CLITool = None
 IndexRef = None
-IndexFactory = None
 StructureMerger = None
 BlockStructureMerger = None
 
@@ -73,7 +72,6 @@ def run_autoclass():
     global Properties
     global CLITool
     global IndexRef
-    global IndexFactory
     global StructureMerger
     global BlockStructureMerger
 
@@ -96,7 +94,6 @@ def run_autoclass():
     Properties = autoclass('java.util.Properties')
     CLITool = autoclass("org.terrier.applications.CLITool")
     IndexRef = autoclass('org.terrier.querying.IndexRef')
-    IndexFactory = autoclass('org.terrier.structures.IndexFactory')
     StructureMerger = autoclass("org.terrier.structures.merging.StructureMerger")
     BlockStructureMerger = autoclass("org.terrier.structures.merging.BlockStructureMerger")
 

diff --git a/tests/test_br.py b/tests/test_br.py
@@ -86,6 +86,14 @@ def test_br_col_passthrough(self):
         result = retr.transform(input_set)
         self.assertIn("username", result.columns)
 
+    def test_br_mem(self):
+        """An index opened with memory=True reports an in-memory lexicon and can still be searched."""
+        in_memory = pt.IndexFactory.of(self.here + "/fixtures/index/data.properties", memory=True)
+        on_disk_view = pt.cast("org.terrier.structures.IndexOnDisk", in_memory)
+        lexicon_source = on_disk_view.getIndexProperty("index.lexicon.data-source", "notfound")
+        self.assertEqual("fileinmem", lexicon_source)
+        pt.BatchRetrieve(in_memory).search("chemical reactions")
+
     def test_br_empty(self):
         indexloc = self.here + "/fixtures/index/data.properties"