Skip to content

Commit

Permalink
Auto merge of #122013 - Swatinem:unicode-gen-fastpath, r=<try>
Browse files Browse the repository at this point in the history
Add a lower bound check to `unicode-table-generator` output

This adds a dedicated lower-bound check to the `lookup` functions emitted by the
`unicode-table-generator` tool. The check is only generated when the lower bound is outside of the ASCII range.

This generalizes the ASCII-only fast path, but for now it only takes effect for the `Grapheme_Extend` property, as that is the only one with a lower bound outside of the ASCII range.
  • Loading branch information
bors committed Mar 8, 2024
2 parents 9c3ad80 + 6d7daa0 commit 554f230
Show file tree
Hide file tree
Showing 4 changed files with 12 additions and 3 deletions.
2 changes: 1 addition & 1 deletion library/core/src/char/methods.rs
Original file line number Diff line number Diff line change
Expand Up @@ -927,7 +927,7 @@ impl char {
#[must_use]
#[inline]
pub(crate) fn is_grapheme_extended(self) -> bool {
self > '\x7f' && unicode::Grapheme_Extend(self)
unicode::Grapheme_Extend(self)
}

/// Returns `true` if this `char` has one of the general categories for numbers.
Expand Down
1 change: 1 addition & 0 deletions library/core/src/unicode/unicode_data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,7 @@ pub mod grapheme_extend {
128, 240, 0,
];
pub fn lookup(c: char) -> bool {
(c as u32) >= 0x300 &&
super::skip_search(
c as u32,
&SHORT_OFFSET_RUNS,
Expand Down
6 changes: 5 additions & 1 deletion src/tools/unicode-table-generator/src/raw_emitter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ impl RawEmitter {
}

fn emit_bitset(&mut self, ranges: &[Range<u32>]) -> Result<(), String> {
let first_code_point = ranges.first().unwrap().start;
let last_code_point = ranges.last().unwrap().end;
// bitset for every bit in the codepoint range
//
Expand Down Expand Up @@ -101,7 +102,10 @@ impl RawEmitter {
)
.unwrap();
writeln!(&mut self.file, "pub const fn lookup(c: char) -> bool {{").unwrap();
writeln!(&mut self.file, " super::bitset_search(",).unwrap();
if first_code_point > 0x7f {
writeln!(&mut self.file, " (c as u32) >= {first_code_point:#04x} &&").unwrap();
}
writeln!(&mut self.file, " super::bitset_search(").unwrap();
writeln!(&mut self.file, " c as u32,").unwrap();
writeln!(&mut self.file, " &BITSET_CHUNKS_MAP,").unwrap();
writeln!(&mut self.file, " &BITSET_INDEX_CHUNKS,").unwrap();
Expand Down
6 changes: 5 additions & 1 deletion src/tools/unicode-table-generator/src/skiplist.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,9 @@ impl ShortOffsetRunHeader {

impl RawEmitter {
pub fn emit_skiplist(&mut self, ranges: &[Range<u32>]) {
let first_code_point = ranges.first().unwrap().start;
let mut offsets = Vec::<u32>::new();
let points = ranges.iter().flat_map(|r| vec![r.start, r.end]).collect::<Vec<u32>>();
let points = ranges.iter().flat_map(|r| [r.start, r.end]).collect::<Vec<u32>>();
let mut offset = 0;
for pt in points {
let delta = pt - offset;
Expand Down Expand Up @@ -87,6 +88,9 @@ impl RawEmitter {
self.bytes_used += coded_offsets.len();

writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
if first_code_point > 0x7f {
writeln!(&mut self.file, " (c as u32) >= {first_code_point:#04x} &&").unwrap();
}
writeln!(&mut self.file, " super::skip_search(",).unwrap();
writeln!(&mut self.file, " c as u32,").unwrap();
writeln!(&mut self.file, " &SHORT_OFFSET_RUNS,").unwrap();
Expand Down

0 comments on commit 554f230

Please sign in to comment.