Skip to content

Commit

Permalink
Merge pull request #1954 from Sefaria/make-ref-part-type-more-lenient
Browse files (browse the repository at this point in the history)
Make ref part type more lenient
  • Loading branch information
nsantacruz committed Jul 7, 2024
2 parents bc5fffb + ab6f4b5 commit 17e22ab
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 5 deletions.
2 changes: 1 addition & 1 deletion sefaria/model/linker/ref_part.py
Original file line number Diff line number Diff line change
Expand Up @@ -443,7 +443,7 @@ def split_part(self, part: RawRefPart, str_end) -> Tuple['RawRef', RawRefPart, R
"""
start_char, end_char = span_char_inds(part.span)
pivot = len(part.text) - len(str_end) + start_char
aspan = part.span.doc.char_span(0, pivot, alignment_mode='contract')
aspan = part.span.doc.char_span(start_char, pivot, alignment_mode='contract')
bspan = part.span.doc.char_span(pivot, end_char, alignment_mode='contract')
if aspan is None or bspan is None:
raise InputError(f"Couldn't break on token boundaries for strings '{self.text[0:pivot]}' and '{self.text[pivot:end_char]}'")
Expand Down
12 changes: 8 additions & 4 deletions sefaria/model/linker/ref_resolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -427,24 +427,28 @@ def _apply_context_swaps(self, raw_ref: RawRef, context_swap_map: Dict[str, str]
raw_ref.parts_to_match = swapped_ref_parts

def _get_unrefined_ref_part_matches_recursive(self, raw_ref: RawRef, title_trie: MatchTemplateTrie = None, ref_parts: list = None, prev_ref_parts: list = None) -> List[ResolvedRef]:
"""
We now consider all part types for trie lookups (not just NAMED), since considering all types does not seem to produce any false positives.
In addition, the raw ref part type model sometimes misclassifies a part's type; relaxing the type requirement here allows resolution to recover.
The one exception: only NAMED parts are split, because splitting parts of other types produces odd splits. E.g. משנה א can be considered part of the title of a book once א is removed.
"""
title_trie = title_trie or self.get_ref_part_title_trie()
prev_ref_parts = prev_ref_parts or []
matches = []
for part in ref_parts:
temp_raw_ref = raw_ref
# no need to consider other types at root level
if part.type != RefPartType.NAMED: continue

temp_title_trie, partial_key_end = title_trie.get_continuations(part.key(), allow_partial=True)
if temp_title_trie is None: continue
if partial_key_end is None:
matched_part = part
else:
elif part.type == RefPartType.NAMED:
try:
temp_raw_ref, apart, bpart = raw_ref.split_part(part, partial_key_end)
matched_part = apart
except InputError:
matched_part = part # fallback on original part
else:
continue
temp_prev_ref_parts = prev_ref_parts + [matched_part]
if LEAF_TRIE_ENTRY in temp_title_trie:
for node in temp_title_trie[LEAF_TRIE_ENTRY]:
Expand Down
3 changes: 3 additions & 0 deletions sefaria/model/linker/tests/linker_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,9 @@ def test_resolved_raw_ref_clone():
# Base text context
[crrd(['@ובתוס\'', '#דכ"ז ע"ב', '*ד"ה והלכתא'], "Rashi on Berakhot 2a"), ("Tosafot on Berakhot 27b:14:2",)], # shared context child via graph context
# Mis-classified part types
[crrd(['@ושו"ע', "#אה״ע", "#סי׳ כ״ח", "#סעיף א"]), ("Shulchan Arukh, Even HaEzer 28:1",)],
# Ibid
[crrd(['&שם', '#ז'], prev_trefs=["Genesis 1"]), ["Genesis 7", "Genesis 1:7"]], # ambiguous ibid
[crrd(['&Ibid', '#12'], prev_trefs=["Exodus 1:7"], lang='en'), ["Exodus 1:12", "Exodus 12"]], # ambiguous ibid when context is segment level (not clear if this is really ambiguous. maybe should only have segment level result)
Expand Down

0 comments on commit 17e22ab

Please sign in to comment.