Skip to content

Commit

Permalink
support meta config in IterDictIndexer constructor #250
Browse files Browse the repository at this point in the history
Merge pull request #251 from terrier-org/iterdictmeta_250
  • Loading branch information
cmacdonald authored Dec 20, 2021
2 parents 2b60572 + 86254fe commit b7f1daf
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 13 deletions.
34 changes: 27 additions & 7 deletions pyterrier/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -597,10 +597,18 @@ def next(self):

from pyterrier.transformer import IterDictIndexerBase
class _BaseIterDictIndexer(Indexer, IterDictIndexerBase):
def __init__(self, index_path, *args, meta_reverse=['docno'], threads=1, **kwargs):
def __init__(self, index_path, *args, meta=None, meta_reverse=None, threads=1, **kwargs):
    """
    Configure an indexer that consumes an iterable of document dicts.

    Args:
        index_path(str): Directory to store index. Ignored for IndexingType.MEMORY.
        meta(Dict[str,int]): What metadata for each document to record in the index, and what length to reserve. Defaults to `{"docno" : 20}`.
        meta_reverse(List[str]): What metadata should we be able to resolve back to a docid. Defaults to `["docno"]`.
        threads(int): Number of indexing threads, stored as `self.threads`. Defaults to 1.

    Any further positional and keyword arguments are forwarded to `Indexer.__init__`.
    """
    IterDictIndexerBase.__init__(self)
    Indexer.__init__(self, index_path, *args, **kwargs)
    self.threads = threads
    # Build the defaults per instance: a dict/list literal in the signature would be
    # one object shared by every indexer constructed without these arguments.
    self.meta = {'docno': 20} if meta is None else meta
    self.meta_reverse = ['docno'] if meta_reverse is None else meta_reverse

def _setup(self, fields, meta, meta_lengths):
Expand Down Expand Up @@ -635,17 +643,23 @@ class _IterDictIndexer_nofifo(_BaseIterDictIndexer):
Use this Indexer if you wish to index an iter of dicts (possibly with multiple fields).
This version is used for Windows -- which doesn't support the faster fifo implementation.
"""
def index(self, it, fields=('text',), meta=('docno',), meta_lengths=None, threads=None):
def index(self, it, fields=('text',), meta=None, meta_lengths=None, threads=None):
"""
Index the specified iter of dicts with the (optional) specified fields
Args:
it(iter[dict]): an iter of document dict to be indexed
fields(list[str]): keys to be indexed as fields
meta(list[str]): keys to be considered as metdata
meta_lengths(list[int]): length of metadata, defaults to 512 characters
meta(list[str]): keys to be considered as metadata. Deprecated
meta_lengths(list[int]): length of metadata, defaults to 512 characters. Deprecated
"""
self._setup(fields, meta, meta_lengths)
if meta is not None:
    warn('specifying meta and meta_lengths in IterDictIndexer.index() is deprecated, use kwargs in constructor instead', DeprecationWarning, 2)
    self.meta = meta
    # meta_lengths only makes sense alongside meta, so it is handled inside this branch.
    if meta_lengths is not None:
        # Pair each meta key with its reserved length. A `{zip(...)}` set literal
        # would hold a single zip object rather than a key -> length mapping.
        self.meta = dict(zip(meta, meta_lengths))

self._setup(fields, self.meta, None)
assert self.threads == 1, 'IterDictIndexer does not support multiple threads on Windows'
# we need to prevent collectionIterator from being GCd
collectionIterator = FlatJSONDocumentIterator(iter(it)) # force it to be iter
Expand All @@ -667,7 +681,7 @@ class _IterDictIndexer_fifo(_BaseIterDictIndexer):
This version is optimized by using multiple threads and POSIX fifos to transfer data,
which ends up being much faster.
"""
def index(self, it, fields=('text',), meta=('docno',), meta_lengths=None):
def index(self, it, fields=('text',), meta=None, meta_lengths=None):
"""
Index the specified iter of dicts with the (optional) specified fields
Expand All @@ -681,7 +695,13 @@ def index(self, it, fields=('text',), meta=('docno',), meta_lengths=None):
JsonlDocumentIterator = autoclass("org.terrier.python.JsonlDocumentIterator")
ParallelIndexer = autoclass("org.terrier.python.ParallelIndexer")

self._setup(fields, meta, meta_lengths)
if meta is not None:
    warn('specifying meta and meta_lengths in IterDictIndexer.index() is deprecated, use constructor instead', DeprecationWarning, 2)
    self.meta = meta
    # meta_lengths only makes sense alongside meta, so it is handled inside this branch.
    if meta_lengths is not None:
        # Pair each meta key with its reserved length. A `{zip(...)}` set literal
        # would hold a single zip object rather than a key -> length mapping.
        self.meta = dict(zip(meta, meta_lengths))

self._setup(fields, self.meta, None)

os.makedirs(self.index_dir, exist_ok=True) # ParallelIndexer expects the directory to exist

Expand Down
30 changes: 24 additions & 6 deletions tests/test_iterdictindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,19 @@
class TestIterDictIndexer(TempDirTestCase):


def _create_index(self, it, fields, meta, type, indexer):
def _create_index(self, it, fields, type, indexer):
print("Writing index to " + self.test_dir)
indexref = indexer.index(it, fields, meta)
indexref = indexer.index(it, fields)
self.assertIsNotNone(indexref)
return indexref

def _make_check_index(self, n, index_type, fields=('text',), meta=('docno', 'url', 'title')):
from pyterrier.index import IndexingType
# Test both versions: _fifo (for UNIX) and _nofifo (for Windows)
indexers = [
pt.index._IterDictIndexer_fifo(self.test_dir, type=index_type),
pt.index._IterDictIndexer_fifo(self.test_dir, type=index_type, threads=4),
pt.index._IterDictIndexer_nofifo(self.test_dir, type=index_type),
pt.index._IterDictIndexer_fifo(self.test_dir, type=index_type, meta=meta),
pt.index._IterDictIndexer_fifo(self.test_dir, type=index_type, threads=4, meta=meta),
pt.index._IterDictIndexer_nofifo(self.test_dir, type=index_type, meta=meta),
]
if BaseTestCase.is_windows():
indexers = [indexers[-1]]
Expand All @@ -35,7 +35,7 @@ def _make_check_index(self, n, index_type, fields=('text',), meta=('docno', 'url
{'docno': '3', 'url': 'url3', 'text': 'The body may perhaps compensates for the loss', 'title': 'Best of Viktor Prowoll'},
)
it = itertools.islice(it, n)
indexref = self._create_index(it, fields, meta, index_type, indexer)
indexref = self._create_index(it, fields, index_type, indexer)
index = pt.IndexFactory.of(indexref)
self.assertIsNotNone(index)
self.assertEqual(n, index.getCollectionStatistics().getNumberOfDocuments())
Expand Down Expand Up @@ -150,6 +150,24 @@ def test_createindex3_single_pass_2fields(self):
from pyterrier.index import IndexingType
self._make_check_index(3, IndexingType.SINGLEPASS, fields=['text', 'title'])

def test_meta_init(self):
    """Meta keys and reverse-meta keys passed to the IterDictIndexer constructor must appear in the built index."""
    it = [
        {'docno': '1', 'url': 'url1', 'text': 'He ran out of money, so he had to stop playing', 'title': 'Woes of playing poker'},
        {'docno': '2', 'url': 'url2', 'text': 'The waves were crashing on the shore; it was a', 'title': 'Lovely sight'},
        {'docno': '3', 'url': 'url3', 'text': 'The body may perhaps compensates for the loss', 'title': 'Best of Viktor Prowoll'},
    ]
    meta = {'docno' : 10, 'url' : 10, 'text' : 100, 'title' : 100}
    meta_reverse = ['docno', 'url']

    indexer = pt.IterDictIndexer(self.test_dir, meta=meta, meta_reverse=meta_reverse)
    indexref = indexer.index(it)
    index = pt.IndexFactory.of(indexref)

    # Every configured meta key should be recorded, not just docno and text.
    meta_keys = index.getMetaIndex().getKeys()
    for key in meta:
        self.assertIn(key, meta_keys)

    # Every configured reverse key should be resolvable, not just url.
    reverse_keys = index.getMetaIndex().getReverseKeys()
    for key in meta_reverse:
        self.assertIn(key, reverse_keys)


def test_check_stemmer(self):
it = [
{'docno': '1', 'url': 'url1', 'text': 'He ran out of money, so he had to stop playing', 'title': 'Woes of playing poker'},
Expand Down

0 comments on commit b7f1daf

Please sign in to comment.