Skip to content

Commit

Permalink
rewrite number parser
Browse files Browse the repository at this point in the history
  • Loading branch information
teoxoy committed May 31, 2022
1 parent ca5d52a commit 3441dd6
Show file tree
Hide file tree
Showing 5 changed files with 527 additions and 311 deletions.
3 changes: 1 addition & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ serialize = ["serde", "indexmap/serde-1"]
deserialize = ["serde", "indexmap/serde-1"]
spv-in = ["petgraph", "spirv"]
spv-out = ["spirv"]
wgsl-in = ["codespan-reporting", "hexf-parse", "unicode-xid", "regex"]
wgsl-in = ["codespan-reporting", "hexf-parse", "unicode-xid"]
wgsl-out = []
hlsl-out = []
span = ["codespan-reporting"]
Expand Down Expand Up @@ -62,7 +62,6 @@ petgraph = { version ="0.6", optional = true }
pp-rs = { version = "0.2.1", optional = true }
hexf-parse = { version = "0.2.1", optional = true }
unicode-xid = { version = "0.2.3", optional = true }
regex = { version = "1", optional = true, default-features = false, features = ["std", "perf"] }

[dev-dependencies]
bincode = "1"
Expand Down
302 changes: 13 additions & 289 deletions src/front/wgsl/lexer.rs
Original file line number Diff line number Diff line change
@@ -1,288 +1,15 @@
use std::borrow::Cow;

use super::{conv, Error, ExpectedToken, NumberError, Span, Token, TokenSpan};
use super::{
conv,
number::{consume_number, Number},
Error, ExpectedToken, NumberError, Span, Token, TokenSpan,
};

/// Splits `input` into the longest leading run of characters accepted by
/// `what` and the remainder that follows it.
///
/// Returns `(accepted_prefix, rest)`. If every character is accepted, `rest`
/// is the empty slice at the end of `input`; if the first character is
/// rejected (or `input` is empty), the prefix is empty.
fn consume_any(input: &str, what: impl Fn(char) -> bool) -> (&str, &str) {
    // Byte offset of the first rejected character, or the full length when
    // nothing is rejected. `char_indices` yields char-boundary offsets, so the
    // split below can never land inside a multi-byte character.
    let split_at = input
        .char_indices()
        .find_map(|(offset, ch)| if what(ch) { None } else { Some(offset) })
        .unwrap_or_else(|| input.len());
    input.split_at(split_at)
}

/// A numeric literal value produced by the number lexer.
///
/// When using this type assume no Abstract Int/Float for now: `consume_number`
/// narrows `Int` to `I32` and `Float` to `F32` (or reports
/// `NumberError::NotRepresentable`) before the value is placed in a token.
#[derive(Copy, Clone, Debug, PartialEq)]
pub enum Number {
/// Abstract Int (-2^63 ≤ i < 2^63)
Int(i64),
/// Abstract Float (IEEE-754 binary64)
Float(f64),
/// Concrete i32
I32(i32),
/// Concrete u32
U32(u32),
/// Concrete f32
F32(f32),
}

// Regex sources for every numeric literal shape the lexer accepts.
//
// All patterns are compiled with `ignore_whitespace(true)`, so the spaces and
// line breaks inside them are purely cosmetic. The position of each pattern in
// this array is significant: `consume_number_impl` dispatches on the index of
// the first matching pattern.
const PATTERNS: [&str; 5] = [
// 0: hex float with a '.' in the significand and an optional exponent;
//    groups: sign, significand, exponent, kind
r"^(-)? 0[xX] ([0-9a-fA-F]+\.[0-9a-fA-F]* | [0-9a-fA-F]*\.[0-9a-fA-F]+) (?:([pP][+-]?[0-9]+) ([fh]?))?",
// 1: hex float without a '.', exponent required;
//    groups: sign, significand, exponent, kind
r"^(-)? 0[xX] ([0-9a-fA-F]+) ([pP][+-]?[0-9]+) ([fh]?)",
// 2: hex integer; groups: sign, digits, kind
r"^(-)? 0[xX] ([0-9a-fA-F]+) ([iu]?)",
// 3: decimal float (has an exponent and/or a '.');
//    groups: number (sign included), kind
r"^( -? [0-9]+ [eE][+-]?[0-9]+
| -?(?: [0-9]+\.[0-9]* | [0-9]*\.[0-9]+) (?:[eE][+-]?[0-9]+)? ) ([fh]?)",
// 4: decimal integer, optionally suffixed as int or float;
//    groups: digits with sign, sign, kind
r"^((-)? (?:0 | [1-9][0-9]*)) ([iufh]?)",
];

// Number of capture groups (excluding the implicit whole-match group) in the
// pattern at the same index of `PATTERNS`; checked with a `debug_assert_eq!`
// in `consume_number_impl`.
const NR_OF_CAPTURE_GROUPS: [usize; 5] = [4, 4, 3, 2, 3];

/// Pre-compiled regexes for number lexing.
///
/// `set` answers "which of `PATTERNS` match?" and `regexes` holds the same
/// patterns compiled individually (same order) for capture-group extraction.
pub(super) struct NumberRegexes {
    set: regex::RegexSet,
    regexes: [regex::Regex; 5],
}

impl NumberRegexes {
    /// Compiles every entry of `PATTERNS`, both as a set and individually.
    ///
    /// # Panics
    ///
    /// Panics if any pattern fails to compile.
    pub(super) fn new() -> Self {
        // Whitespace in the pattern sources is only there for readability,
        // so both builders must run in ignore-whitespace mode.
        let set = regex::RegexSetBuilder::new(PATTERNS)
            .ignore_whitespace(true)
            .build()
            .unwrap();

        let compile_single = |source: &str| -> regex::Regex {
            let mut builder = regex::RegexBuilder::new(source);
            builder.ignore_whitespace(true);
            builder.build().unwrap()
        };
        let regexes = PATTERNS.map(compile_single);

        NumberRegexes { set, regexes }
    }
}

// TODO: when implementing Creation-Time Expressions, remove the ability to match the minus sign

/// Parses one numeric literal from the start of `input`.
///
/// Returns the parse result together with the unconsumed remainder of `input`.
/// The literal shape is chosen by taking the first (lowest-index) pattern of
/// `number_regexes.set` that matches. When no pattern matches, the result is
/// `Err(NumberError::Invalid)` and the remainder is `input` itself, i.e.
/// nothing is consumed.
fn consume_number_impl<'a>(
input: &'a str,
number_regexes: &NumberRegexes,
) -> (Result<Number, NumberError>, &'a str) {
// The following regexes will be matched:

// int_literal :
// | / 0 [iu]? /
// | / [1-9][0-9]* [iu]? /
// | / 0[xX][0-9a-fA-F]+ [iu]? /

// decimal_float_literal :
// | / 0 [fh] /
// | / [1-9][0-9]* [fh] /
// | / [0-9]* \.[0-9]+ ([eE][+-]?[0-9]+)? [fh]? /
// | / [0-9]+ \.[0-9]* ([eE][+-]?[0-9]+)? [fh]? /
// | / [0-9]+ [eE][+-]?[0-9]+ [fh]? /

// hex_float_literal :
// | / 0[xX][0-9a-fA-F]* \.[0-9a-fA-F]+ ([pP][+-]?[0-9]+ [fh]?)? /
// | / 0[xX][0-9a-fA-F]+ \.[0-9a-fA-F]* ([pP][+-]?[0-9]+ [fh]?)? /
// | / 0[xX][0-9a-fA-F]+ [pP][+-]?[0-9]+ [fh]? /

// Float parsing notes

// The following chapters of IEEE 754-2019 are relevant:
//
// 7.4 Overflow (largest finite number is exceeded by what would have been
// the rounded floating-point result were the exponent range unbounded)
//
// 7.5 Underflow (tiny non-zero result is detected;
// tininess before rounding means that a non-zero result computed as though
// both the exponent range and the precision were unbounded would lie
// strictly between ±b^emin, e.g. ±2^−126 for binary32)
//
// 7.6 Inexact (rounded result differs from what would have been computed
// were both exponent range and precision unbounded)

// The WGSL spec requires us to error:
// on overflow for decimal floating point literals
// on overflow and inexact for hexadecimal floating point literals
// (underflow is not mentioned)

// hexf_parse errors on overflow, underflow, inexact
// rust std lib float from str handles overflow, underflow, inexact transparently (rounds and will not error)

// Therefore we only check for overflow manually for decimal floating point literals

// Parses a hex float already rewritten into the form hexf_parse expects.
// `kind` is the literal suffix captured by the regex: "", "f" or "h".
fn parse_hex_float(input: &str, kind: &str) -> Result<Number, NumberError> {
match kind {
"" => match hexf_parse::parse_hexf64(input, false) {
Ok(num) => Ok(Number::Float(num)),
// can only be ParseHexfErrorKind::Inexact but we can't check since it's private
_ => Err(NumberError::NotRepresentable),
},
"f" => match hexf_parse::parse_hexf32(input, false) {
Ok(num) => Ok(Number::F32(num)),
// can only be ParseHexfErrorKind::Inexact but we can't check since it's private
_ => Err(NumberError::NotRepresentable),
},
"h" => Err(NumberError::UnimplementedF16),
// the float patterns only capture `[fh]?` as the suffix
_ => unreachable!(),
}
}

// Parses a decimal float (or a decimal integer carrying a float suffix).
// A non-finite parse result is reported as not representable (overflow).
fn parse_dec_float(input: &str, kind: &str) -> Result<Number, NumberError> {
match kind {
"" => {
let num = input.parse::<f64>().unwrap(); // will never fail
num.is_finite()
.then(|| Number::Float(num))
.ok_or(NumberError::NotRepresentable)
}
"f" => {
let num = input.parse::<f32>().unwrap(); // will never fail
num.is_finite()
.then(|| Number::F32(num))
.ok_or(NumberError::NotRepresentable)
}
"h" => Err(NumberError::UnimplementedF16),
_ => unreachable!(),
}
}

// Parses an integer in the given radix. `input` carries the sign (if any);
// `is_negative` is passed separately so that a negative literal with a `u`
// suffix can be rejected up front.
fn parse_int(
input: &str,
kind: &str,
radix: u32,
is_negative: bool,
) -> Result<Number, NumberError> {
// Only overflow is expected here: the regexes guarantee well-formed digits.
fn map_err(e: core::num::ParseIntError) -> NumberError {
match *e.kind() {
core::num::IntErrorKind::PosOverflow | core::num::IntErrorKind::NegOverflow => {
NumberError::NotRepresentable
}
_ => unreachable!(),
}
}
match kind {
"" => match i64::from_str_radix(input, radix) {
Ok(num) => Ok(Number::Int(num)),
Err(e) => Err(map_err(e)),
},
"i" => match i32::from_str_radix(input, radix) {
Ok(num) => Ok(Number::I32(num)),
Err(e) => Err(map_err(e)),
},
"u" if is_negative => Err(NumberError::NotRepresentable),
"u" => match u32::from_str_radix(input, radix) {
Ok(num) => Ok(Number::U32(num)),
Err(e) => Err(map_err(e)),
},
_ => unreachable!(),
}
}

// Runs the individual regex at `$index` against `input` and yields
// `([capture; COUNT], rest)`. Capture groups that did not participate in the
// match are represented as "". `rest` starts at the end of the last
// participating group. The final `unwrap` relies on the regex set having
// already reported a match for this index.
macro_rules! regex_captures {
($index:literal) => {{
let regex = &number_regexes.regexes[$index];
const COUNT: usize = NR_OF_CAPTURE_GROUPS[$index];
debug_assert_eq!(COUNT, regex.captures_len() - 1);
regex
.captures(input)
.map(|captures| {
let mut iter = captures
.iter()
.skip(1)
.map(|m| m.map_or("", |m| m.as_str()));

let end = captures.iter().flatten().last().unwrap().end();

([(); COUNT].map(|_| iter.next().unwrap()), &input[end..])
})
.unwrap()
}};
}

// Dispatch on the lowest-index pattern that matched.
match number_regexes.set.matches(input).iter().next() {
Some(0) => {
let ([sign, significand, exponent, kind], rest) = regex_captures!(0);
// | / 0[xX][0-9a-fA-F]+ \.[0-9a-fA-F]* ([pP][+-]?[0-9]+ [fh]?)? /
// | / 0[xX][0-9a-fA-F]* \.[0-9a-fA-F]+ ([pP][+-]?[0-9]+ [fh]?)? /

// 0[xX] and [pP] is required by hexf
let hexf_input = &format!(
"{}0x{}{}",
sign,
significand,
if exponent.is_empty() { "p0" } else { exponent }
);

(parse_hex_float(hexf_input, kind), rest)
}
Some(1) => {
let ([sign, significand, exponent, kind], rest) = regex_captures!(1);
// | / 0[xX][0-9a-fA-F]+ [pP][+-]?[0-9]+ [fh]? /

// 0[xX] and . is required by hexf
let hexf_input = &format!("{}0x{}.{}", sign, significand, exponent);

(parse_hex_float(hexf_input, kind), rest)
}
Some(2) => {
let ([sign, digits, kind], rest) = regex_captures!(2);
// | / 0[xX][0-9a-fA-F]+ [iu]? /

// The `0x` prefix sits between the sign and the digits in the source, so
// the sign has to be re-attached to the bare digits before parsing.
let is_negative = sign == "-";
let digits_with_sign = if is_negative {
Cow::Owned(format!("-{}", digits))
} else {
Cow::Borrowed(digits)
};

(parse_int(&digits_with_sign, kind, 16, is_negative), rest)
}
Some(3) => {
let ([number, kind], rest) = regex_captures!(3);
// | / [0-9]+ [eE][+-]?[0-9]+ [fh]? /
// | / [0-9]+ \.[0-9]* ([eE][+-]?[0-9]+)? [fh]? /
// | / [0-9]* \.[0-9]+ ([eE][+-]?[0-9]+)? [fh]? /

(parse_dec_float(number, kind), rest)
}
Some(4) => {
let ([digits_with_sign, sign, kind], rest) = regex_captures!(4);
// | / 0 [iufh]? /
// | / [1-9][0-9]* [iufh]? /

let is_negative = sign == "-";

// A plain decimal integer becomes a float literal when it carries a
// float suffix.
match kind {
"" | "i" | "u" => (parse_int(digits_with_sign, kind, 10, is_negative), rest),
"f" | "h" => (parse_dec_float(digits_with_sign, kind), rest),
_ => unreachable!(),
}
}
// No pattern matched: report an invalid number and consume nothing.
_ => (Err(NumberError::Invalid), input),
}
}

/// Lexes a numeric literal from the start of `input` into a `Token::Number`.
///
/// Abstract results from `consume_number_impl` are narrowed to concrete types:
/// `Number::Int` becomes `Number::I32` and `Number::Float` becomes
/// `Number::F32`. A value that does not fit (an out-of-range integer, or a
/// float that is no longer finite after narrowing) is reported as
/// `NumberError::NotRepresentable`. Concrete numbers and errors pass through
/// unchanged. The second tuple element is the remaining, unconsumed input.
fn consume_number<'a>(input: &'a str, number_regexes: &NumberRegexes) -> (Token<'a>, &'a str) {
    let (parsed, rest) = consume_number_impl(input, number_regexes);

    let concrete = parsed.and_then(|number| match number {
        Number::Int(value) => {
            use std::convert::TryFrom;
            match i32::try_from(value) {
                Ok(narrowed) => Ok(Number::I32(narrowed)),
                Err(_) => Err(NumberError::NotRepresentable),
            }
        }
        Number::Float(value) => {
            // Narrowing an out-of-range f64 yields an infinity, which is
            // rejected by the finiteness check below.
            let narrowed = value as f32;
            if !narrowed.is_finite() {
                return Err(NumberError::NotRepresentable);
            }
            Ok(Number::F32(narrowed))
        }
        already_concrete => Ok(already_concrete),
    });

    (Token::Number(concrete), rest)
}

fn consume_token<'a>(
input: &'a str,
generic: bool,
number_regexes: &NumberRegexes,
) -> (Token<'a>, &'a str) {
fn consume_token(input: &str, generic: bool) -> (Token<'_>, &str) {
let mut chars = input.chars();
let cur = match chars.next() {
Some(c) => c,
Expand All @@ -293,7 +20,7 @@ fn consume_token<'a>(
'.' => {
let og_chars = chars.as_str();
match chars.next() {
Some('0'..='9') => consume_number(input, number_regexes),
Some('0'..='9') => consume_number(input),
_ => (Token::Separator(cur), og_chars),
}
}
Expand All @@ -313,7 +40,7 @@ fn consume_token<'a>(
_ => (Token::Paren(cur), og_chars),
}
}
'0'..='9' => consume_number(input, number_regexes),
'0'..='9' => consume_number(input),
'/' => {
let og_chars = chars.as_str();
match chars.next() {
Expand Down Expand Up @@ -354,7 +81,7 @@ fn consume_token<'a>(
let og_chars = chars.as_str();
match chars.next() {
Some('>') => (Token::Arrow, chars.as_str()),
Some('0'..='9' | '.') => consume_number(input, number_regexes),
Some('0'..='9' | '.') => consume_number(input),
Some('-') => (Token::DecrementOperation, chars.as_str()),
Some('=') => (Token::AssignmentOperation(cur), chars.as_str()),
_ => (Token::Operation(cur), og_chars),
Expand Down Expand Up @@ -440,15 +167,13 @@ fn is_word_part(c: char) -> bool {
pub(super) struct Lexer<'a> {
input: &'a str,
pub(super) source: &'a str,
number_regexes: &'a NumberRegexes,
}

impl<'a> Lexer<'a> {
pub(super) const fn new(input: &'a str, number_regexes: &'a NumberRegexes) -> Self {
pub(super) const fn new(input: &'a str) -> Self {
Lexer {
input,
source: input,
number_regexes,
}
}

Expand Down Expand Up @@ -494,7 +219,7 @@ impl<'a> Lexer<'a> {
pub(super) fn next(&mut self) -> TokenSpan<'a> {
let mut start_byte_offset = self.current_byte_offset();
loop {
let (token, rest) = consume_token(self.input, false, self.number_regexes);
let (token, rest) = consume_token(self.input, false);
self.input = rest;
match token {
Token::Trivia => start_byte_offset = self.current_byte_offset(),
Expand All @@ -507,7 +232,7 @@ impl<'a> Lexer<'a> {
pub(super) fn next_generic(&mut self) -> TokenSpan<'a> {
let mut start_byte_offset = self.current_byte_offset();
loop {
let (token, rest) = consume_token(self.input, true, self.number_regexes);
let (token, rest) = consume_token(self.input, true);
self.input = rest;
match token {
Token::Trivia => start_byte_offset = self.current_byte_offset(),
Expand Down Expand Up @@ -654,8 +379,7 @@ impl<'a> Lexer<'a> {

#[cfg(test)]
fn sub_test(source: &str, expected_tokens: &[Token]) {
let number_regexes = NumberRegexes::new();
let mut lex = Lexer::new(source, &number_regexes);
let mut lex = Lexer::new(source);
for &token in expected_tokens {
assert_eq!(lex.next().0, token);
}
Expand Down
Loading

0 comments on commit 3441dd6

Please sign in to comment.