rewrite number parser

gfx-rs · May 31, 2022 · 3441dd6 · 3441dd6
1 parent ca5d52a
commit 3441dd6
Show file tree

Hide file tree

Showing 5 changed files with 527 additions and 311 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -32,7 +32,7 @@ serialize = ["serde", "indexmap/serde-1"]
 deserialize = ["serde", "indexmap/serde-1"]
 spv-in = ["petgraph", "spirv"]
 spv-out = ["spirv"]
-wgsl-in = ["codespan-reporting", "hexf-parse", "unicode-xid", "regex"]
+wgsl-in = ["codespan-reporting", "hexf-parse", "unicode-xid"]
 wgsl-out = []
 hlsl-out = []
 span = ["codespan-reporting"]
@@ -62,7 +62,6 @@ petgraph = { version ="0.6", optional = true }
 pp-rs = { version = "0.2.1", optional = true }
 hexf-parse = { version = "0.2.1", optional = true }
 unicode-xid = { version = "0.2.3", optional = true }
-regex = { version = "1", optional = true, default-features = false, features = ["std", "perf"] }
 
 [dev-dependencies]
 bincode = "1"

diff --git a/src/front/wgsl/lexer.rs b/src/front/wgsl/lexer.rs
@@ -1,288 +1,15 @@
-use std::borrow::Cow;
-
-use super::{conv, Error, ExpectedToken, NumberError, Span, Token, TokenSpan};
+use super::{
+    conv,
+    number::{consume_number, Number},
+    Error, ExpectedToken, NumberError, Span, Token, TokenSpan,
+};
 
 fn consume_any(input: &str, what: impl Fn(char) -> bool) -> (&str, &str) {
     let pos = input.find(|c| !what(c)).unwrap_or(input.len());
     input.split_at(pos)
 }
 
-/// When using this type assume no Abstract Int/Float for now
-#[derive(Copy, Clone, Debug, PartialEq)]
-pub enum Number {
-    /// Abstract Int (-2^63 ≤ i < 2^63)
-    Int(i64),
-    /// Abstract Float (IEEE-754 binary64)
-    Float(f64),
-    /// Concrete i32
-    I32(i32),
-    /// Concrete u32
-    U32(u32),
-    /// Concrete f32
-    F32(f32),
-}
-
-const PATTERNS: [&str; 5] = [
-    r"^(-)?    0[xX]    ([0-9a-fA-F]+\.[0-9a-fA-F]* | [0-9a-fA-F]*\.[0-9a-fA-F]+)    (?:([pP][+-]?[0-9]+)    ([fh]?))?",
-    r"^(-)?    0[xX]    ([0-9a-fA-F]+)                                                  ([pP][+-]?[0-9]+)    ([fh]?)",
-    r"^(-)?    0[xX]    ([0-9a-fA-F]+)                                                                       ([iu]?)",
-    r"^( -?              [0-9]+                                                          [eE][+-]?[0-9]+
-       | -?(?:           [0-9]+\.[0-9]* | [0-9]*\.[0-9]+)                             (?:[eE][+-]?[0-9]+)? ) ([fh]?)",
-    r"^((-)?           (?:0 | [1-9][0-9]*))                                                                  ([iufh]?)",
-];
-
-const NR_OF_CAPTURE_GROUPS: [usize; 5] = [4, 4, 3, 2, 3];
-
-pub(super) struct NumberRegexes {
-    set: regex::RegexSet,
-    regexes: [regex::Regex; 5],
-}
-
-impl NumberRegexes {
-    pub(super) fn new() -> Self {
-        let set = regex::RegexSetBuilder::new(PATTERNS)
-            .ignore_whitespace(true)
-            .build()
-            .unwrap();
-        let regexes = PATTERNS.map(|pattern| {
-            regex::RegexBuilder::new(pattern)
-                .ignore_whitespace(true)
-                .build()
-                .unwrap()
-        });
-        Self { set, regexes }
-    }
-}
-
-// TODO: when implementing Creation-Time Expressions, remove the ability to match the minus sign
-
-fn consume_number_impl<'a>(
-    input: &'a str,
-    number_regexes: &NumberRegexes,
-) -> (Result<Number, NumberError>, &'a str) {
-    // The following regexes will be matched:
-
-    // int_literal :
-    // | / 0                                                                [iu]?   /
-    // | / [1-9][0-9]*                                                      [iu]?   /
-    // | / 0[xX][0-9a-fA-F]+                                                [iu]?   /
-
-    // decimal_float_literal :
-    // | / 0                                                                [fh]    /
-    // | / [1-9][0-9]*                                                      [fh]    /
-    // | / [0-9]*               \.[0-9]+            ([eE][+-]?[0-9]+)?      [fh]?   /
-    // | / [0-9]+               \.[0-9]*            ([eE][+-]?[0-9]+)?      [fh]?   /
-    // | / [0-9]+                                    [eE][+-]?[0-9]+        [fh]?   /
-
-    // hex_float_literal :
-    // | / 0[xX][0-9a-fA-F]*    \.[0-9a-fA-F]+      ([pP][+-]?[0-9]+        [fh]?)? /
-    // | / 0[xX][0-9a-fA-F]+    \.[0-9a-fA-F]*      ([pP][+-]?[0-9]+        [fh]?)? /
-    // | / 0[xX][0-9a-fA-F]+                         [pP][+-]?[0-9]+        [fh]?   /
-
-    // Float parsing notes
-
-    // The following chapters of IEEE 754-2019 are relevant:
-    //
-    // 7.4 Overflow (largest finite number is exceeded by what would have been
-    //     the rounded floating-point result were the exponent range unbounded)
-    //
-    // 7.5 Underflow (tiny non-zero result is detected;
-    //     for decimal formats tininess is detected before rounding when a non-zero result
-    //     computed as though both the exponent range and the precision were unbounded
-    //     would lie strictly between ±b^emin, i.e. ±2^−126 for binary32)
-    //
-    // 7.6 Inexact (rounded result differs from what would have been computed
-    //     were both exponent range and precision unbounded)
-
-    // The WGSL spec requires us to error:
-    //   on overflow for decimal floating point literals
-    //   on overflow and inexact for hexadecimal floating point literals
-    // (underflow is not mentioned)
-
-    // hexf_parse errors on overflow, underflow, inexact
-    // rust std lib float from str handles overflow, underflow, inexact transparently (rounds and will not error)
-
-    // Therefore we only check for overflow manually for decimal floating point literals
-
-    fn parse_hex_float(input: &str, kind: &str) -> Result<Number, NumberError> {
-        match kind {
-            "" => match hexf_parse::parse_hexf64(input, false) {
-                Ok(num) => Ok(Number::Float(num)),
-                // can only be ParseHexfErrorKind::Inexact but we can't check since it's private
-                _ => Err(NumberError::NotRepresentable),
-            },
-            "f" => match hexf_parse::parse_hexf32(input, false) {
-                Ok(num) => Ok(Number::F32(num)),
-                // can only be ParseHexfErrorKind::Inexact but we can't check since it's private
-                _ => Err(NumberError::NotRepresentable),
-            },
-            "h" => Err(NumberError::UnimplementedF16),
-            _ => unreachable!(),
-        }
-    }
-
-    fn parse_dec_float(input: &str, kind: &str) -> Result<Number, NumberError> {
-        match kind {
-            "" => {
-                let num = input.parse::<f64>().unwrap(); // will never fail
-                num.is_finite()
-                    .then(|| Number::Float(num))
-                    .ok_or(NumberError::NotRepresentable)
-            }
-            "f" => {
-                let num = input.parse::<f32>().unwrap(); // will never fail
-                num.is_finite()
-                    .then(|| Number::F32(num))
-                    .ok_or(NumberError::NotRepresentable)
-            }
-            "h" => Err(NumberError::UnimplementedF16),
-            _ => unreachable!(),
-        }
-    }
-
-    fn parse_int(
-        input: &str,
-        kind: &str,
-        radix: u32,
-        is_negative: bool,
-    ) -> Result<Number, NumberError> {
-        fn map_err(e: core::num::ParseIntError) -> NumberError {
-            match *e.kind() {
-                core::num::IntErrorKind::PosOverflow | core::num::IntErrorKind::NegOverflow => {
-                    NumberError::NotRepresentable
-                }
-                _ => unreachable!(),
-            }
-        }
-        match kind {
-            "" => match i64::from_str_radix(input, radix) {
-                Ok(num) => Ok(Number::Int(num)),
-                Err(e) => Err(map_err(e)),
-            },
-            "i" => match i32::from_str_radix(input, radix) {
-                Ok(num) => Ok(Number::I32(num)),
-                Err(e) => Err(map_err(e)),
-            },
-            "u" if is_negative => Err(NumberError::NotRepresentable),
-            "u" => match u32::from_str_radix(input, radix) {
-                Ok(num) => Ok(Number::U32(num)),
-                Err(e) => Err(map_err(e)),
-            },
-            _ => unreachable!(),
-        }
-    }
-
-    macro_rules! regex_captures {
-        ($index:literal) => {{
-            let regex = &number_regexes.regexes[$index];
-            const COUNT: usize = NR_OF_CAPTURE_GROUPS[$index];
-            debug_assert_eq!(COUNT, regex.captures_len() - 1);
-            regex
-                .captures(input)
-                .map(|captures| {
-                    let mut iter = captures
-                        .iter()
-                        .skip(1)
-                        .map(|m| m.map_or("", |m| m.as_str()));
-
-                    let end = captures.iter().flatten().last().unwrap().end();
-
-                    ([(); COUNT].map(|_| iter.next().unwrap()), &input[end..])
-                })
-                .unwrap()
-        }};
-    }
-
-    match number_regexes.set.matches(input).iter().next() {
-        Some(0) => {
-            let ([sign, significand, exponent, kind], rest) = regex_captures!(0);
-            // | / 0[xX][0-9a-fA-F]+    \.[0-9a-fA-F]*      ([pP][+-]?[0-9]+        [fh]?)? /
-            // | / 0[xX][0-9a-fA-F]*    \.[0-9a-fA-F]+      ([pP][+-]?[0-9]+        [fh]?)? /
-
-            // 0[xX] and [pP] is required by hexf
-            let hexf_input = &format!(
-                "{}0x{}{}",
-                sign,
-                significand,
-                if exponent.is_empty() { "p0" } else { exponent }
-            );
-
-            (parse_hex_float(hexf_input, kind), rest)
-        }
-        Some(1) => {
-            let ([sign, significand, exponent, kind], rest) = regex_captures!(1);
-            // | / 0[xX][0-9a-fA-F]+                         [pP][+-]?[0-9]+        [fh]?   /
-
-            // 0[xX] and . is required by hexf
-            let hexf_input = &format!("{}0x{}.{}", sign, significand, exponent);
-
-            (parse_hex_float(hexf_input, kind), rest)
-        }
-        Some(2) => {
-            let ([sign, digits, kind], rest) = regex_captures!(2);
-            // | / 0[xX][0-9a-fA-F]+                                                [iu]?   /
-
-            let is_negative = sign == "-";
-            let digits_with_sign = if is_negative {
-                Cow::Owned(format!("-{}", digits))
-            } else {
-                Cow::Borrowed(digits)
-            };
-
-            (parse_int(&digits_with_sign, kind, 16, is_negative), rest)
-        }
-        Some(3) => {
-            let ([number, kind], rest) = regex_captures!(3);
-            // | / [0-9]+                                    [eE][+-]?[0-9]+        [fh]?   /
-            // | / [0-9]+               \.[0-9]*            ([eE][+-]?[0-9]+)?      [fh]?   /
-            // | / [0-9]*               \.[0-9]+            ([eE][+-]?[0-9]+)?      [fh]?   /
-
-            (parse_dec_float(number, kind), rest)
-        }
-        Some(4) => {
-            let ([digits_with_sign, sign, kind], rest) = regex_captures!(4);
-            // | / 0                                                                [iufh]? /
-            // | / [1-9][0-9]*                                                      [iufh]? /
-
-            let is_negative = sign == "-";
-
-            match kind {
-                "" | "i" | "u" => (parse_int(digits_with_sign, kind, 10, is_negative), rest),
-                "f" | "h" => (parse_dec_float(digits_with_sign, kind), rest),
-                _ => unreachable!(),
-            }
-        }
-        _ => (Err(NumberError::Invalid), input),
-    }
-}
-
-fn consume_number<'a>(input: &'a str, number_regexes: &NumberRegexes) -> (Token<'a>, &'a str) {
-    let res = consume_number_impl(input, number_regexes);
-    let num = match res.0 {
-        Ok(Number::Int(num)) => {
-            use std::convert::TryFrom;
-            i32::try_from(num)
-                .map(Number::I32)
-                .map_err(|_| NumberError::NotRepresentable)
-        }
-        Ok(Number::Float(num)) => {
-            let num = num as f32;
-            if num.is_finite() {
-                Ok(Number::F32(num))
-            } else {
-                Err(NumberError::NotRepresentable)
-            }
-        }
-        num => num,
-    };
-    (Token::Number(num), res.1)
-}
-
-fn consume_token<'a>(
-    input: &'a str,
-    generic: bool,
-    number_regexes: &NumberRegexes,
-) -> (Token<'a>, &'a str) {
+fn consume_token(input: &str, generic: bool) -> (Token<'_>, &str) {
     let mut chars = input.chars();
     let cur = match chars.next() {
         Some(c) => c,
@@ -293,7 +20,7 @@ fn consume_token<'a>(
         '.' => {
             let og_chars = chars.as_str();
             match chars.next() {
-                Some('0'..='9') => consume_number(input, number_regexes),
+                Some('0'..='9') => consume_number(input),
                 _ => (Token::Separator(cur), og_chars),
             }
         }
@@ -313,7 +40,7 @@ fn consume_token<'a>(
                 _ => (Token::Paren(cur), og_chars),
             }
         }
-        '0'..='9' => consume_number(input, number_regexes),
+        '0'..='9' => consume_number(input),
         '/' => {
             let og_chars = chars.as_str();
             match chars.next() {
@@ -354,7 +81,7 @@ fn consume_token<'a>(
             let og_chars = chars.as_str();
             match chars.next() {
                 Some('>') => (Token::Arrow, chars.as_str()),
-                Some('0'..='9' | '.') => consume_number(input, number_regexes),
+                Some('0'..='9' | '.') => consume_number(input),
                 Some('-') => (Token::DecrementOperation, chars.as_str()),
                 Some('=') => (Token::AssignmentOperation(cur), chars.as_str()),
                 _ => (Token::Operation(cur), og_chars),
@@ -440,15 +167,13 @@ fn is_word_part(c: char) -> bool {
 pub(super) struct Lexer<'a> {
     input: &'a str,
     pub(super) source: &'a str,
-    number_regexes: &'a NumberRegexes,
 }
 
 impl<'a> Lexer<'a> {
-    pub(super) const fn new(input: &'a str, number_regexes: &'a NumberRegexes) -> Self {
+    pub(super) const fn new(input: &'a str) -> Self {
         Lexer {
             input,
             source: input,
-            number_regexes,
         }
     }
 
@@ -494,7 +219,7 @@ impl<'a> Lexer<'a> {
     pub(super) fn next(&mut self) -> TokenSpan<'a> {
         let mut start_byte_offset = self.current_byte_offset();
         loop {
-            let (token, rest) = consume_token(self.input, false, self.number_regexes);
+            let (token, rest) = consume_token(self.input, false);
             self.input = rest;
             match token {
                 Token::Trivia => start_byte_offset = self.current_byte_offset(),
@@ -507,7 +232,7 @@ impl<'a> Lexer<'a> {
     pub(super) fn next_generic(&mut self) -> TokenSpan<'a> {
         let mut start_byte_offset = self.current_byte_offset();
         loop {
-            let (token, rest) = consume_token(self.input, true, self.number_regexes);
+            let (token, rest) = consume_token(self.input, true);
             self.input = rest;
             match token {
                 Token::Trivia => start_byte_offset = self.current_byte_offset(),
@@ -654,8 +379,7 @@ impl<'a> Lexer<'a> {
 
 #[cfg(test)]
 fn sub_test(source: &str, expected_tokens: &[Token]) {
-    let number_regexes = NumberRegexes::new();
-    let mut lex = Lexer::new(source, &number_regexes);
+    let mut lex = Lexer::new(source);
     for &token in expected_tokens {
         assert_eq!(lex.next().0, token);
     }