Merge pull request #1954 from Sefaria/make-ref-part-type-more-lenient

Make ref part type more lenient
Sefaria · Jul 7, 2024 · 17e22ab · 17e22ab
2 parents bc5fffb + ab6f4b5
commit 17e22ab
Show file tree

Hide file tree

Showing 3 changed files with 12 additions and 5 deletions.
diff --git a/sefaria/model/linker/ref_part.py b/sefaria/model/linker/ref_part.py
@@ -443,7 +443,7 @@ def split_part(self, part: RawRefPart, str_end) -> Tuple['RawRef', RawRefPart, R
         """
         start_char, end_char = span_char_inds(part.span)
         pivot = len(part.text) - len(str_end) + start_char
-        aspan = part.span.doc.char_span(0, pivot, alignment_mode='contract')
+        aspan = part.span.doc.char_span(start_char, pivot, alignment_mode='contract')
         bspan = part.span.doc.char_span(pivot, end_char, alignment_mode='contract')
         if aspan is None or bspan is None:
             raise InputError(f"Couldn't break on token boundaries for strings '{self.text[0:pivot]}' and '{self.text[pivot:end_char]}'")

diff --git a/sefaria/model/linker/ref_resolver.py b/sefaria/model/linker/ref_resolver.py
@@ -427,24 +427,28 @@ def _apply_context_swaps(self, raw_ref: RawRef, context_swap_map: Dict[str, str]
         raw_ref.parts_to_match = swapped_ref_parts
 
     def _get_unrefined_ref_part_matches_recursive(self, raw_ref: RawRef, title_trie: MatchTemplateTrie = None, ref_parts: list = None, prev_ref_parts: list = None) -> List[ResolvedRef]:
+        """
+        All part types (not just NAMED) are now considered for trie lookups, since considering every part type does not appear to produce false positives.
+        In addition, the raw ref part type model sometimes misclassifies a part, and relaxing the type requirement here allows resolution to recover.
+        The exception: only NAMED parts are split on a partial match, since splitting other part types produces odd parts, e.g. משנה א can be considered part of the title of a book when א is removed.
+        """
         title_trie = title_trie or self.get_ref_part_title_trie()
         prev_ref_parts = prev_ref_parts or []
         matches = []
         for part in ref_parts:
             temp_raw_ref = raw_ref
-            # no need to consider other types at root level
-            if part.type != RefPartType.NAMED: continue
-
             temp_title_trie, partial_key_end = title_trie.get_continuations(part.key(), allow_partial=True)
             if temp_title_trie is None: continue
             if partial_key_end is None:
                 matched_part = part
-            else:
+            elif part.type == RefPartType.NAMED:
                 try:
                     temp_raw_ref, apart, bpart = raw_ref.split_part(part, partial_key_end)
                     matched_part = apart
                 except InputError:
                     matched_part = part  # fallback on original part
+            else:
+                continue
             temp_prev_ref_parts = prev_ref_parts + [matched_part]
             if LEAF_TRIE_ENTRY in temp_title_trie:
                 for node in temp_title_trie[LEAF_TRIE_ENTRY]:

diff --git a/sefaria/model/linker/tests/linker_test.py b/sefaria/model/linker/tests/linker_test.py
@@ -105,6 +105,9 @@ def test_resolved_raw_ref_clone():
     # Base text context
     [crrd(['@ובתוס\'', '#דכ"ז ע"ב', '*ד"ה והלכתא'], "Rashi on Berakhot 2a"), ("Tosafot on Berakhot 27b:14:2",)],  # shared context child via graph context
 
+    # Mis-classified part types
+    [crrd(['@ושו"ע', "#אה״ע", "#סי׳ כ״ח", "#סעיף א"]), ("Shulchan Arukh, Even HaEzer 28:1",)],
+
     # Ibid
     [crrd(['&שם', '#ז'], prev_trefs=["Genesis 1"]), ["Genesis 7", "Genesis 1:7"]],  # ambiguous ibid
     [crrd(['&Ibid', '#12'], prev_trefs=["Exodus 1:7"], lang='en'), ["Exodus 1:12", "Exodus 12"]],  # ambiguous ibid when context is segment level (not clear if this is really ambiguous. maybe should only have segment level result)