From 873c49652722f6a9254a2c271e0018a137089fa0 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 26 Oct 2023 10:22:49 +0300 Subject: [PATCH] feat(linker): update tokenizer to include more punctuation. --- sefaria/spacy_function_registry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sefaria/spacy_function_registry.py b/sefaria/spacy_function_registry.py index 34a8366744..a1a91bb342 100644 --- a/sefaria/spacy_function_registry.py +++ b/sefaria/spacy_function_registry.py @@ -5,7 +5,7 @@ def inner_punct_tokenizer_factory(): def inner_punct_tokenizer(nlp): # infix_re = spacy.util.compile_infix_regex(nlp.Defaults.infixes) - infix_re = re.compile(r'''[\.\,\?\:\;…\‘\’\`\“\”\"\'~\–\-/\(\)]''') + infix_re = re.compile(r'''[.,?!:;…‘’`“”"'~–\-/()<>]''') prefix_re = spacy.util.compile_prefix_regex(nlp.Defaults.prefixes) suffix_re = spacy.util.compile_suffix_regex(nlp.Defaults.suffixes)