Skip to content

Commit

Permalink
ensure that all docs have a snippet, related to #264
Browse files Browse the repository at this point in the history
  • Loading branch information
cmacdonald committed Jan 11, 2022
1 parent 5c4259c commit 99e6751
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 5 deletions.
2 changes: 1 addition & 1 deletion pyterrier/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,7 @@ def snippets(
# use Tf as a passage scorer on sliding window passages
psg_scorer = (
pt.text.sliding(text_attr='text', length=15, prepend_attr=None)
>> pt.text.scorer(body_attr="text", wmodel='Tf', background_index=index)
>> pt.text.scorer(body_attr="text", wmodel='Tf', takes='docs')
)
# use psg_scorer for performing query-biased summarisation on docs retrieved by br
Expand Down
9 changes: 5 additions & 4 deletions tests/test_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,16 +37,17 @@ def test_scorer_rerank(self):
def test_snippets(self):
br = pt.BatchRetrieve.from_dataset("vaswani", "terrier_stemmed_text", metadata=["docno", "text"])
psg_scorer = (
pt.text.sliding(text_attr='text', length=5, prepend_attr=None)
>> pt.text.scorer(body_attr="text", wmodel='Tf')
>> pt.debug.print_rows(jupyter=False)
pt.text.sliding(text_attr='text', length=25, stride=12, prepend_attr=None)
>> pt.text.scorer(body_attr="text", wmodel='Tf', takes='docs')
)
pipe = br >> pt.text.snippets(psg_scorer)
dfOut = pipe.search("chemical reactions")
print(dfOut)
self.assertTrue("rank" in dfOut.columns)
self.assertTrue("score" in dfOut.columns)
self.assertTrue("summary" in dfOut.columns)
# .count() returns the number of non-NaN values,
# so let's count how many are NaN
self.assertEqual(0, len(dfOut) - dfOut.summary.count())


def test_fetch_text_docno(self):
Expand Down

0 comments on commit 99e6751

Please sign in to comment.