Skip to content

Commit

Permalink
test(normalizer): add and modify tests for new end calculation.
Browse files Browse the repository at this point in the history
  • Loading branch information
nsantacruz committed Oct 26, 2023
1 parent a567d4e commit 4a3797a
Showing 1 changed file with 20 additions and 4 deletions.
24 changes: 20 additions & 4 deletions sefaria/helper/tests/normalization_tests.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import pytest
import django
django.setup()
from sefaria.helper.normalization import *
Expand Down Expand Up @@ -55,9 +56,9 @@ def test_simpler_normalizer_composer():
nsc = NormalizerComposer(['brackets', 'double-space'])
assert nsc.normalize(text) == normalized
text_to_remove = nsc.find_text_to_remove(text)
assert len(text_to_remove) == 2
assert len(text_to_remove) == 1
(start0, end0), repl0 = text_to_remove[0]
assert text[start0:end0] == " "
assert text[start0:end0] == " ["
assert repl0 == ' '


Expand All @@ -67,12 +68,26 @@ def test_complicated_normalizer_composer():
nsc = NormalizerComposer(['html', "parens-plus-contents", 'brackets', 'double-space'])
assert nsc.normalize(text) == normalized
text_to_remove = nsc.find_text_to_remove(text)
assert len(text_to_remove) == 6
assert len(text_to_remove) == 5
(start0, end0), repl0 = text_to_remove[0]
assert text[start0:end0] == "(<i>hello</i> other stuff) "
assert text[start0:end0] == "(<i>hello</i> other stuff) ["
assert repl0 == ' '


def test_mapping():
    """Verify that indices of a word in normalized text map back onto the same word in the raw text.

    The raw string is normalized with the 'html' and 'double-space' steps; the
    span of "test" in the normalized string is then converted to raw-text
    indices and both slices are compared.
    """
    raw = """<b><i> test"""
    expected = """ test"""
    composer = NormalizerComposer(['html', 'double-space'])
    assert composer.normalize(raw) == expected

    index_map = composer.get_mapping_after_normalization(raw)
    word = "test"
    norm_start = expected.index(word)
    norm_span = (norm_start, norm_start + len(word))
    converted = composer.convert_normalized_indices_to_unnormalized_indices([norm_span], index_map)
    raw_span = converted[0]

    # The substring recovered from the raw text must equal the normalized substring.
    assert raw[slice(*raw_span)] == expected[slice(*norm_span)]


def test_html_normalizer_for_empty_prefix():
text = """It is written<sup>24</sup><i class="footnote"><i>1K</i>. 17:1.</i> <i>Elijah the Tisbite</i>"""
normalizer = NormalizerComposer(['html'])
Expand Down Expand Up @@ -102,6 +117,7 @@ def test_nested_itag():
assert text[s:e] == """<i class="footnote">bull<sup>nested</sup><i class="footnote">The</i>.</i>"""


@pytest.mark.xfail(reason="not clear we want to support char_indices_from_word_indices as it's unused")
def test_word_to_char():
test_string = 'some words go here\n\nhello world'
words = ['go', 'here', 'hello']
Expand Down

0 comments on commit 4a3797a

Please sign in to comment.