feat: Support upper/mixed case special characters

Closes #5
alexpovel · May 22, 2023 · 90111da · 90111da
1 parent f544e4b
commit 90111da
Show file tree

Hide file tree

Showing 35 changed files with 342 additions and 74 deletions.
diff --git a/data/word-lists/de/dev.txt b/data/word-lists/de/dev.txt
@@ -6,6 +6,7 @@ Maßstab
 Dominoeffekt
 dröge
 Poet
+übel
 Abenteuer
 Mauer
 Dübel

diff --git a/src/main.rs b/src/main.rs
@@ -12,10 +12,10 @@ use crate::{
 };
 
 mod cli;
-mod iteration;
 mod modules;
 #[cfg(test)]
 mod testing;
+mod util;
 
 const EXPECTABLE_MAXIMUM_WORD_LENGTH_BYTES: u8 = 64;
 const EXPECTABLE_MAXIMUM_MATCHES_PER_WORD: u8 = 8;

diff --git a/src/modules/german/machine.rs b/src/modules/german/machine.rs
@@ -1,11 +1,17 @@
 use log::{debug, trace};
 
 use crate::{
-    iteration::power_set,
     modules::{german::word::Replace, TextProcessor},
+    util::{
+        iteration::power_set,
+        strings::{first_char, lowercase_first_char, uppercase_first_char},
+    },
 };
 
-use super::{SpecialCharacter, Umlaut, Word};
+use super::{
+    Casing::Lower, Casing::Upper, SpecialCharacter, SpecialCharacter::Eszett,
+    SpecialCharacter::Umlaut, Umlaut::Ae, Umlaut::Oe, Umlaut::Ue, Word,
+};
 
 #[derive(Default, Debug)]
 enum State {
@@ -67,40 +73,7 @@ impl StateMachine {
         self.pre_transition();
 
         let next = match (&self.state, input) {
-            (
-                State::Word(None)
-                | State::Word(Some(Potential(SpecialCharacter::Umlaut(_))))
-                | State::Other,
-                'o',
-            ) => State::Word(Some(Potential(SpecialCharacter::Umlaut(Umlaut::Oe)))),
-            (
-                State::Word(None)
-                | State::Word(Some(Potential(SpecialCharacter::Umlaut(_))))
-                | State::Other,
-                'u',
-            ) => State::Word(Some(Potential(SpecialCharacter::Umlaut(Umlaut::Ue)))),
-            (
-                State::Word(None)
-                | State::Word(Some(Potential(SpecialCharacter::Umlaut(_))))
-                | State::Other,
-                'a',
-            ) => State::Word(Some(Potential(SpecialCharacter::Umlaut(Umlaut::Ae)))),
-            (
-                State::Word(None)
-                | State::Word(Some(Potential(SpecialCharacter::Umlaut(_))))
-                | State::Other,
-                's',
-            ) => State::Word(Some(Potential(SpecialCharacter::Eszett))),
-            (State::Word(Some(Potential(SpecialCharacter::Eszett))), c @ 's') => {
-                let pos = self.word.len();
-
-                let start = pos - c.len_utf8(); // Previous char same as current `c`
-                let end = pos + c.len_utf8();
-                self.word
-                    .add_replacement(start, end, SpecialCharacter::Eszett);
-                State::Word(None)
-            }
-            (State::Word(Some(Potential(SpecialCharacter::Umlaut(umlaut)))), c @ 'e') => {
+            (State::Word(Some(Potential(Umlaut(umlaut)))), c @ 'e' | c @ 'E') => {
                 let pos = self.word.len();
 
                 const LENGTH_OF_PREVIOUS_CHARACTER: usize = 1;
@@ -112,8 +85,29 @@ impl StateMachine {
 
                 let start = pos - LENGTH_OF_PREVIOUS_CHARACTER;
                 let end = pos + c.len_utf8();
-                self.word
-                    .add_replacement(start, end, SpecialCharacter::Umlaut(*umlaut));
+                self.word.add_replacement(start, end, Umlaut(*umlaut));
+                State::Word(None)
+            }
+            (State::Word(None) | State::Word(Some(Potential(Umlaut(_)))) | State::Other, c) => {
+                match c {
+                    'a' => State::Word(Some(Potential(Umlaut(Ae(Lower))))),
+                    'A' => State::Word(Some(Potential(Umlaut(Ae(Upper))))),
+                    'o' => State::Word(Some(Potential(Umlaut(Oe(Lower))))),
+                    'O' => State::Word(Some(Potential(Umlaut(Oe(Upper))))),
+                    'u' => State::Word(Some(Potential(Umlaut(Ue(Lower))))),
+                    'U' => State::Word(Some(Potential(Umlaut(Ue(Upper))))),
+                    's' => State::Word(Some(Potential(Eszett(Lower)))),
+                    'S' => State::Word(Some(Potential(Eszett(Upper)))),
+                    c if c.is_alphabetic() => State::Word(None),
+                    _ => State::Other,
+                }
+            }
+            (State::Word(Some(Potential(Eszett(casing)))), c @ 's' | c @ 'S') => {
+                let pos = self.word.len();
+
+                let start = pos - c.len_utf8(); // Previous char same as current `c`
+                let end = pos + c.len_utf8();
+                self.word.add_replacement(start, end, Eszett(*casing));
                 State::Word(None)
             }
             //
@@ -149,34 +143,44 @@ fn is_valid(word: &str, words: &[&str]) -> bool {
 
     trace!("Trying candidate '{}'...", word);
 
-    if words.binary_search(&word).is_ok() {
+    // Pretty much all ordinarily lowercase words *might* appear uppercased, e.g. at the
+    // beginning of sentences. For example: "Uebel!" -> "Übel!", even though only "übel"
+    // is in the dictionary.
+    if first_char(word).is_uppercase() && is_valid(&lowercase_first_char(word), words) {
+        trace!("Candidate '{}' is valid when lowercased.", word);
+        return true;
+    }
+
+    let search = |word| words.binary_search(&word).is_ok();
+
+    if search(word) {
         trace!("Found candidate '{}' in word list, is valid.", word);
         return true;
     }
 
-    // Skip initial, else initial `prefix` slice is empty.
-    for (i, _) in word.char_indices().skip(1) {
+    for (i, _) in word
+        .char_indices()
+        // Skip the first index, as `prefix` would be empty there, which is wasted work.
+        .skip(1)
+    {
         let prefix = &word[..i];
         trace!("Trying prefix '{}'", prefix);
 
-        if words.binary_search(&prefix).is_ok() {
+        if search(prefix) {
             let suffix = &word[i..];
 
-            // We cannot get around copying the whole string (`String.remove`), as the
-            // new, uppercased character might have a different byte length. It might
-            // therefore not fit into the newly open slot at index 0.
-            let mut uc_suffix = suffix.to_string();
-            uc_suffix = uc_suffix.remove(0).to_uppercase().to_string() + &uc_suffix;
-
             trace!(
-                "Prefix found in word list, seeing if either original '{}' or uppercased suffix '{}' is valid.",
-                suffix,
-                uc_suffix
+                "Prefix found in word list, seeing if (uppercased) suffix '{}' is valid.",
+                suffix
             );
 
-            // Recursively forks the search into two branches. The uppercase version is
-            // likelier to be a hit, hence try first in hopes of a short circuit.
-            return is_valid(&uc_suffix, words) || is_valid(suffix, words);
+            // We uppercase to detect e.g. `Mauerdübel`, where after the first iteration
+            // we'd have `Mauer` and `dübel`, with only `Dübel` being valid.
+            //
+            // The next recursion tests both this uppercased version and its lowercased
+            // form, so words like `Mauergrün` are also valid, where `grün` is in the
+            // dictionary but `Grün` *might* not be, for example.
+            return is_valid(&uppercase_first_char(suffix), words);
         }
 
         trace!("Prefix not found in word list, trying next.");
@@ -237,6 +241,8 @@ impl TextProcessor for German {
                 if is_valid(&candidate, WORDS) {
                     trace!("Candidate is valid word, exiting search.");
                     break;
+                } else {
+                    trace!("Candidate is invalid word, trying the next one.");
                 }
 
                 candidate = get_fresh_candidate();
@@ -284,12 +290,17 @@ mod tests {
         is_valid("Doesn't matter, this will panic.", words);
     }
 
+    #[test]
+    #[should_panic]
+    fn test_is_valid_panics_on_empty_input() {
+        is_valid("", WORDS);
+    }
+
     instrament! {
         #[rstest]
         fn test_is_valid(
             #[values(
                 "????",
-                "",
                 "\0",
                 "\0Dübel",
                 "\0Dübel\0",

diff --git a/src/modules/german/mod.rs b/src/modules/german/mod.rs
@@ -4,5 +4,5 @@ mod word;
 
 // Re-export symbols.
 pub use machine::German;
-pub(self) use special_characters::{SpecialCharacter, Umlaut};
+pub(self) use special_characters::{Casing, SpecialCharacter, Umlaut};
 pub(self) use word::Word;
diff --git a/...dules/german/snapshots/betterletter__modules__german__machine__tests__test_is_valid-.snap b/...dules/german/snapshots/betterletter__modules__german__machine__tests__test_is_valid-.snap
@@ -2,7 +2,7 @@
 source: src/modules/german/machine.rs
 expression: "is_valid(&word, WORDS)"
 info:
-  word: ""
+  word: "????"
 ---
 false
 
diff --git a/src/modules/german/special_characters.rs b/src/modules/german/special_characters.rs
@@ -1,10 +1,16 @@
 use std::fmt::Display;
 
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub(super) enum Casing {
+    Lower,
+    Upper,
+}
+
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub(super) enum Umlaut {
-    Ue,
-    Oe,
-    Ae,
+    Ae(Casing),
+    Oe(Casing),
+    Ue(Casing),
 }
 
 impl Display for Umlaut {
@@ -13,9 +19,12 @@ impl Display for Umlaut {
             f,
             "{}",
             match self {
-                Umlaut::Ue => 'ü',
-                Umlaut::Oe => 'ö',
-                Umlaut::Ae => 'ä',
+                Umlaut::Ae(Casing::Lower) => 'ä',
+                Umlaut::Ae(Casing::Upper) => 'Ä',
+                Umlaut::Oe(Casing::Lower) => 'ö',
+                Umlaut::Oe(Casing::Upper) => 'Ö',
+                Umlaut::Ue(Casing::Lower) => 'ü',
+                Umlaut::Ue(Casing::Upper) => 'Ü',
             }
         )
     }
@@ -24,7 +33,7 @@ impl Display for Umlaut {
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub(super) enum SpecialCharacter {
     Umlaut(Umlaut),
-    Eszett,
+    Eszett(Casing),
 }
 
 impl Display for SpecialCharacter {
@@ -34,7 +43,8 @@ impl Display for SpecialCharacter {
             "{}",
             match self {
                 SpecialCharacter::Umlaut(umlaut) => umlaut.to_string(),
-                SpecialCharacter::Eszett => String::from('ß'),
+                SpecialCharacter::Eszett(Casing::Lower) => String::from('ß'),
+                SpecialCharacter::Eszett(Casing::Upper) => String::from('ẞ'),
             }
         )
     }

diff --git a/src/iteration.rs → src/util/iteration.rs b/src/iteration.rs → src/util/iteration.rs
diff --git a/src/util/mod.rs b/src/util/mod.rs
@@ -0,0 +1,2 @@
+pub(crate) mod iteration;
+pub(crate) mod strings;
diff --git a/...tion__tests__test_power_set-[]-false.snap → ...tion__tests__test_power_set-[]-false.snap b/...tion__tests__test_power_set-[]-false.snap → ...tion__tests__test_power_set-[]-false.snap
@@ -1,5 +1,5 @@
 ---
-source: src/iteration.rs
+source: src/util/iteration.rs
 expression: result
 info:
   collection: []

diff --git a/...ation__tests__test_power_set-[]-true.snap → ...ation__tests__test_power_set-[]-true.snap b/...ation__tests__test_power_set-[]-true.snap → ...ation__tests__test_power_set-[]-true.snap
@@ -1,5 +1,5 @@
 ---
-source: src/iteration.rs
+source: src/util/iteration.rs
 expression: result
 info:
   collection: []

diff --git a/...s__test_power_set-[_1,_2,_3,_]-false.snap → ...s__test_power_set-[_1,_2,_3,_]-false.snap b/...s__test_power_set-[_1,_2,_3,_]-false.snap → ...s__test_power_set-[_1,_2,_3,_]-false.snap
@@ -1,5 +1,5 @@
 ---
-source: src/iteration.rs
+source: src/util/iteration.rs
 expression: result
 info:
   collection:

diff --git a/...ts__test_power_set-[_1,_2,_3,_]-true.snap → ...ts__test_power_set-[_1,_2,_3,_]-true.snap b/...ts__test_power_set-[_1,_2,_3,_]-true.snap → ...ts__test_power_set-[_1,_2,_3,_]-true.snap
@@ -1,5 +1,5 @@
 ---
-source: src/iteration.rs
+source: src/util/iteration.rs
 expression: result
 info:
   collection:

diff --git a/...ests__test_power_set-[_1,_2,_]-false.snap → ...ests__test_power_set-[_1,_2,_]-false.snap b/...ests__test_power_set-[_1,_2,_]-false.snap → ...ests__test_power_set-[_1,_2,_]-false.snap
@@ -1,5 +1,5 @@
 ---
-source: src/iteration.rs
+source: src/util/iteration.rs
 expression: result
 info:
   collection:

diff --git a/...tests__test_power_set-[_1,_2,_]-true.snap → ...tests__test_power_set-[_1,_2,_]-true.snap b/...tests__test_power_set-[_1,_2,_]-true.snap → ...tests__test_power_set-[_1,_2,_]-true.snap
@@ -1,5 +1,5 @@
 ---
-source: src/iteration.rs
+source: src/util/iteration.rs
 expression: result
 info:
   collection:

diff --git a/...__tests__test_power_set-[_1,_]-false.snap → ...__tests__test_power_set-[_1,_]-false.snap b/...__tests__test_power_set-[_1,_]-false.snap → ...__tests__test_power_set-[_1,_]-false.snap
@@ -1,5 +1,5 @@
 ---
-source: src/iteration.rs
+source: src/util/iteration.rs
 expression: result
 info:
   collection:

diff --git a/...n__tests__test_power_set-[_1,_]-true.snap → ...n__tests__test_power_set-[_1,_]-true.snap b/...n__tests__test_power_set-[_1,_]-true.snap → ...n__tests__test_power_set-[_1,_]-true.snap
@@ -1,5 +1,5 @@
 ---
-source: src/iteration.rs
+source: src/util/iteration.rs
 expression: result
 info:
   collection:

diff --git a/src/util/snapshots/betterletter__util__strings__tests__test_first_char-_Hello.snap b/src/util/snapshots/betterletter__util__strings__tests__test_first_char-_Hello.snap
@@ -0,0 +1,7 @@
+---
+source: src/util/strings.rs
+expression: first_char(&word).to_string()
+info:
+  word: Hello
+---
+H
diff --git a/src/util/snapshots/betterletter__util__strings__tests__test_first_char-_Uebel.snap b/src/util/snapshots/betterletter__util__strings__tests__test_first_char-_Uebel.snap
@@ -0,0 +1,7 @@
+---
+source: src/util/strings.rs
+expression: first_char(&word).to_string()
+info:
+  word: Uebel
+---
+U
diff --git a/src/util/snapshots/betterletter__util__strings__tests__test_first_char-_Übel.snap b/src/util/snapshots/betterletter__util__strings__tests__test_first_char-_Übel.snap
@@ -0,0 +1,7 @@
+---
+source: src/util/strings.rs
+expression: first_char(&word).to_string()
+info:
+  word: Übel
+---
+Ü
diff --git a/src/util/snapshots/betterletter__util__strings__tests__test_first_char-_ßuper.snap b/src/util/snapshots/betterletter__util__strings__tests__test_first_char-_ßuper.snap
@@ -0,0 +1,7 @@
+---
+source: src/util/strings.rs
+expression: first_char(&word).to_string()
+info:
+  word: ßuper
+---
+ß
diff --git a/src/util/snapshots/betterletter__util__strings__tests__test_first_char-_ẞuperduper.snap b/src/util/snapshots/betterletter__util__strings__tests__test_first_char-_ẞuperduper.snap
@@ -0,0 +1,7 @@
+---
+source: src/util/strings.rs
+expression: first_char(&word).to_string()
+info:
+  word: ẞuperduper
+---
+ẞ
diff --git a/src/util/snapshots/betterletter__util__strings__tests__test_first_char-_😀.snap b/src/util/snapshots/betterletter__util__strings__tests__test_first_char-_😀.snap
@@ -0,0 +1,7 @@
+---
+source: src/util/strings.rs
+expression: first_char(&word).to_string()
+info:
+  word: 😀
+---
+😀
diff --git a/src/util/snapshots/betterletter__util__strings__tests__test_lowercasing-_Hello.snap b/src/util/snapshots/betterletter__util__strings__tests__test_lowercasing-_Hello.snap
@@ -0,0 +1,7 @@
+---
+source: src/util/strings.rs
+expression: result
+info:
+  word: Hello
+---
+hello
diff --git a/src/util/snapshots/betterletter__util__strings__tests__test_lowercasing-_Uebel.snap b/src/util/snapshots/betterletter__util__strings__tests__test_lowercasing-_Uebel.snap
@@ -0,0 +1,7 @@
+---
+source: src/util/strings.rs
+expression: lowercase_first_char(&word)
+info:
+  word: Uebel
+---
+uebel