Skip to content

Commit

Permalink
feat: Support upper/mixed case special characters
Browse files Browse the repository at this point in the history
Closes #5
  • Loading branch information
alexpovel committed May 22, 2023
1 parent f544e4b commit 90111da
Show file tree
Hide file tree
Showing 35 changed files with 342 additions and 74 deletions.
1 change: 1 addition & 0 deletions data/word-lists/de/dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ Maßstab
Dominoeffekt
dröge
Poet
übel
Abenteuer
Mauer
Dübel
Expand Down
2 changes: 1 addition & 1 deletion src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,10 @@ use crate::{
};

mod cli;
mod iteration;
mod modules;
#[cfg(test)]
mod testing;
mod util;

const EXPECTABLE_MAXIMUM_WORD_LENGTH_BYTES: u8 = 64;
const EXPECTABLE_MAXIMUM_MATCHES_PER_WORD: u8 = 8;
Expand Down
121 changes: 66 additions & 55 deletions src/modules/german/machine.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,17 @@
use log::{debug, trace};

use crate::{
iteration::power_set,
modules::{german::word::Replace, TextProcessor},
util::{
iteration::power_set,
strings::{first_char, lowercase_first_char, uppercase_first_char},
},
};

use super::{SpecialCharacter, Umlaut, Word};
use super::{
Casing::Lower, Casing::Upper, SpecialCharacter, SpecialCharacter::Eszett,
SpecialCharacter::Umlaut, Umlaut::Ae, Umlaut::Oe, Umlaut::Ue, Word,
};

#[derive(Default, Debug)]
enum State {
Expand Down Expand Up @@ -67,40 +73,7 @@ impl StateMachine {
self.pre_transition();

let next = match (&self.state, input) {
(
State::Word(None)
| State::Word(Some(Potential(SpecialCharacter::Umlaut(_))))
| State::Other,
'o',
) => State::Word(Some(Potential(SpecialCharacter::Umlaut(Umlaut::Oe)))),
(
State::Word(None)
| State::Word(Some(Potential(SpecialCharacter::Umlaut(_))))
| State::Other,
'u',
) => State::Word(Some(Potential(SpecialCharacter::Umlaut(Umlaut::Ue)))),
(
State::Word(None)
| State::Word(Some(Potential(SpecialCharacter::Umlaut(_))))
| State::Other,
'a',
) => State::Word(Some(Potential(SpecialCharacter::Umlaut(Umlaut::Ae)))),
(
State::Word(None)
| State::Word(Some(Potential(SpecialCharacter::Umlaut(_))))
| State::Other,
's',
) => State::Word(Some(Potential(SpecialCharacter::Eszett))),
(State::Word(Some(Potential(SpecialCharacter::Eszett))), c @ 's') => {
let pos = self.word.len();

let start = pos - c.len_utf8(); // Previous char same as current `c`
let end = pos + c.len_utf8();
self.word
.add_replacement(start, end, SpecialCharacter::Eszett);
State::Word(None)
}
(State::Word(Some(Potential(SpecialCharacter::Umlaut(umlaut)))), c @ 'e') => {
(State::Word(Some(Potential(Umlaut(umlaut)))), c @ 'e' | c @ 'E') => {
let pos = self.word.len();

const LENGTH_OF_PREVIOUS_CHARACTER: usize = 1;
Expand All @@ -112,8 +85,29 @@ impl StateMachine {

let start = pos - LENGTH_OF_PREVIOUS_CHARACTER;
let end = pos + c.len_utf8();
self.word
.add_replacement(start, end, SpecialCharacter::Umlaut(*umlaut));
self.word.add_replacement(start, end, Umlaut(*umlaut));
State::Word(None)
}
(State::Word(None) | State::Word(Some(Potential(Umlaut(_)))) | State::Other, c) => {
match c {
'a' => State::Word(Some(Potential(Umlaut(Ae(Lower))))),
'A' => State::Word(Some(Potential(Umlaut(Ae(Upper))))),
'o' => State::Word(Some(Potential(Umlaut(Oe(Lower))))),
'O' => State::Word(Some(Potential(Umlaut(Oe(Upper))))),
'u' => State::Word(Some(Potential(Umlaut(Ue(Lower))))),
'U' => State::Word(Some(Potential(Umlaut(Ue(Upper))))),
's' => State::Word(Some(Potential(Eszett(Lower)))),
'S' => State::Word(Some(Potential(Eszett(Upper)))),
c if c.is_alphabetic() => State::Word(None),
_ => State::Other,
}
}
(State::Word(Some(Potential(Eszett(casing)))), c @ 's' | c @ 'S') => {
let pos = self.word.len();

let start = pos - c.len_utf8(); // Previous char same as current `c`
let end = pos + c.len_utf8();
self.word.add_replacement(start, end, Eszett(*casing));
State::Word(None)
}
//
Expand Down Expand Up @@ -149,34 +143,44 @@ fn is_valid(word: &str, words: &[&str]) -> bool {

trace!("Trying candidate '{}'...", word);

if words.binary_search(&word).is_ok() {
// Pretty much all ordinarily lowercase words *might* appear uppercased, e.g. at the
// beginning of sentences. For example: "Uebel!" -> "Übel!", even though only "übel"
// is in the dictionary.
if first_char(word).is_uppercase() && is_valid(&lowercase_first_char(word), words) {
trace!("Candidate '{}' is valid when lowercased.", word);
return true;
}

let search = |word| words.binary_search(&word).is_ok();

if search(word) {
trace!("Found candidate '{}' in word list, is valid.", word);
return true;
}

// Skip initial, else initial `prefix` slice is empty.
for (i, _) in word.char_indices().skip(1) {
for (i, _) in word
.char_indices()
// Skip, as `prefix` empty on first iteration otherwise, which is wasted work.
.skip(1)
{
let prefix = &word[..i];
trace!("Trying prefix '{}'", prefix);

if words.binary_search(&prefix).is_ok() {
if search(prefix) {
let suffix = &word[i..];

// We cannot get around copying the whole string (`String.remove`), as the
// new, uppercased character might have a different byte length. It might
// therefore not fit into the newly open slot at index 0.
let mut uc_suffix = suffix.to_string();
uc_suffix = uc_suffix.remove(0).to_uppercase().to_string() + &uc_suffix;

trace!(
"Prefix found in word list, seeing if either original '{}' or uppercased suffix '{}' is valid.",
suffix,
uc_suffix
"Prefix found in word list, seeing if (uppercased) suffix '{}' is valid.",
suffix
);

// Recursively forks the search into two branches. The uppercase version is
// likelier to be a hit, hence try first in hopes of a short circuit.
return is_valid(&uc_suffix, words) || is_valid(suffix, words);
// We uppercase to detect e.g. `Mauerdübel`, where after the first iteration
// we'd have `Mauer` and `dübel`, with only `Dübel` being valid.
//
// Next recursion will test both lower- and this uppercased version, so also
// words like `Mauergrün` are valid, where `grün` is in the dictionary but
// `Grün` *might* not be, for example.
return is_valid(&uppercase_first_char(suffix), words);
}

trace!("Prefix not found in word list, trying next.");
Expand Down Expand Up @@ -237,6 +241,8 @@ impl TextProcessor for German {
if is_valid(&candidate, WORDS) {
trace!("Candidate is valid word, exiting search.");
break;
} else {
trace!("Candidate is invalid word, trying the next one.");
}

candidate = get_fresh_candidate();
Expand Down Expand Up @@ -284,12 +290,17 @@ mod tests {
is_valid("Doesn't matter, this will panic.", words);
}

// Pins down that calling `is_valid` with an empty word panics instead of
// returning a verdict; `#[should_panic]` makes the test fail if it returns.
// NOTE(review): the statement inside `is_valid` that panics is not visible
// here -- presumably the first-character lookup on the empty string; confirm.
#[test]
#[should_panic]
fn test_is_valid_panics_on_empty_input() {
is_valid("", WORDS);
}

instrament! {
#[rstest]
fn test_is_valid(
#[values(
"????",
"",
"\0",
"\0Dübel",
"\0Dübel\0",
Expand Down
2 changes: 1 addition & 1 deletion src/modules/german/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,5 @@ mod word;

// Re-export symbols.
pub use machine::German;
pub(self) use special_characters::{SpecialCharacter, Umlaut};
pub(self) use special_characters::{Casing, SpecialCharacter, Umlaut};
pub(self) use word::Word;
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
source: src/modules/german/machine.rs
expression: "is_valid(&word, WORDS)"
info:
word: ""
word: "????"
---
false

26 changes: 18 additions & 8 deletions src/modules/german/special_characters.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
use std::fmt::Display;

/// Letter case of a special character.
///
/// Carried as the payload of every `Umlaut` variant and of
/// `SpecialCharacter::Eszett`; the `Display` implementations use it to pick
/// the lowercase or uppercase glyph.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(super) enum Casing {
/// Lowercase form, e.g. `ä`, `ö`, `ü`, `ß`.
Lower,
/// Uppercase form, e.g. `Ä`, `Ö`, `Ü`, `ẞ`.
Upper,
}

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(super) enum Umlaut {
Ue,
Oe,
Ae,
Ae(Casing),
Oe(Casing),
Ue(Casing),
}

impl Display for Umlaut {
Expand All @@ -13,9 +19,12 @@ impl Display for Umlaut {
f,
"{}",
match self {
Umlaut::Ue => 'ü',
Umlaut::Oe => 'ö',
Umlaut::Ae => 'ä',
Umlaut::Ae(Casing::Lower) => 'ä',
Umlaut::Ae(Casing::Upper) => 'Ä',
Umlaut::Oe(Casing::Lower) => 'ö',
Umlaut::Oe(Casing::Upper) => 'Ö',
Umlaut::Ue(Casing::Lower) => 'ü',
Umlaut::Ue(Casing::Upper) => 'Ü',
}
)
}
Expand All @@ -24,7 +33,7 @@ impl Display for Umlaut {
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(super) enum SpecialCharacter {
Umlaut(Umlaut),
Eszett,
Eszett(Casing),
}

impl Display for SpecialCharacter {
Expand All @@ -34,7 +43,8 @@ impl Display for SpecialCharacter {
"{}",
match self {
SpecialCharacter::Umlaut(umlaut) => umlaut.to_string(),
SpecialCharacter::Eszett => String::from('ß'),
SpecialCharacter::Eszett(Casing::Lower) => String::from('ß'),
SpecialCharacter::Eszett(Casing::Upper) => String::from('ẞ'),
}
)
}
Expand Down
File renamed without changes.
2 changes: 2 additions & 0 deletions src/util/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
pub(crate) mod iteration;
pub(crate) mod strings;
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
---
source: src/iteration.rs
source: src/util/iteration.rs
expression: result
info:
collection: []
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
---
source: src/iteration.rs
source: src/util/iteration.rs
expression: result
info:
collection: []
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
---
source: src/iteration.rs
source: src/util/iteration.rs
expression: result
info:
collection:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
---
source: src/iteration.rs
source: src/util/iteration.rs
expression: result
info:
collection:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
---
source: src/iteration.rs
source: src/util/iteration.rs
expression: result
info:
collection:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
---
source: src/iteration.rs
source: src/util/iteration.rs
expression: result
info:
collection:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
---
source: src/iteration.rs
source: src/util/iteration.rs
expression: result
info:
collection:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
---
source: src/iteration.rs
source: src/util/iteration.rs
expression: result
info:
collection:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
source: src/util/strings.rs
expression: first_char(&word).to_string()
info:
word: Hello
---
H
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
source: src/util/strings.rs
expression: first_char(&word).to_string()
info:
word: Uebel
---
U
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
source: src/util/strings.rs
expression: first_char(&word).to_string()
info:
word: Übel
---
Ü
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
source: src/util/strings.rs
expression: first_char(&word).to_string()
info:
word: ßuper
---
ß
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
source: src/util/strings.rs
expression: first_char(&word).to_string()
info:
word: ẞuperduper
---
ẞ
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
source: src/util/strings.rs
expression: first_char(&word).to_string()
info:
word: 😀
---
😀
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
source: src/util/strings.rs
expression: result
info:
word: Hello
---
hello
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
source: src/util/strings.rs
expression: lowercase_first_char(&word)
info:
word: Uebel
---
uebel
Loading

0 comments on commit 90111da

Please sign in to comment.