Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DRAFT: MF2: fix various parser bugs and add more tests #3092

Draft
wants to merge 27 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
6920ffc
ICU-22834 Update tests to reflect MF2 schema in conformance repo
catamorphism Jul 19, 2024
48eff9c
Parse JSON tests according to schema
catamorphism Jul 19, 2024
8fb824f
ICU4C: Allow trailing whitespace for complex messages, due to spec ch…
catamorphism Jul 19, 2024
029cee8
ICU4C: Parse number literals correctly in Number::format
catamorphism Jul 22, 2024
696755f
ICU4J: Allow trailing whitespace after complex body, per spec change
catamorphism Jul 22, 2024
1555928
Add ignoreJava annotations for tests with runtime errors
catamorphism Jul 22, 2024
612387f
Update testdata/message2/README.txt
catamorphism Jul 24, 2024
724770a
Parse text and escape chars according to spec
catamorphism Jul 30, 2024
b070bb5
Handle markup with space after the initial left curly brace
catamorphism Jul 30, 2024
18655f2
ICU4J: Don't format unannotated number literals as numbers
catamorphism Jul 30, 2024
aba1a43
ICU4J: Treat whitespace after .input keyword as optional
catamorphism Jul 30, 2024
3ad4450
ICU4C: Fix bug that was assuming an .input variable can't have a rese…
catamorphism Jul 30, 2024
cb94e36
Escape curly braces when serializing and normalizing
catamorphism Jul 30, 2024
e46e8c4
ICU4C: Escape '|' in patterns
catamorphism Jul 30, 2024
3a022cc
ICU4C: Fix bug where unsupported '.i' was parsed as an '.input'
catamorphism Jul 30, 2024
6b73fe1
ICU4C: Make an invalid test valid (unsupported statement parsing)
catamorphism Jul 30, 2024
7ff3559
ICU4C: When normalizing input, escape optionally-escaped characters i…
catamorphism Jul 31, 2024
6d6135a
ICU4C/ICU4J: Allow trailing whitespace after a match
catamorphism Jul 31, 2024
aeeac80
ICU4J: handle trailing whitespace after a .match
catamorphism Aug 5, 2024
7dd36f3
ICU4C: Fix parser to iterate over code points, not code units
catamorphism Aug 3, 2024
4429088
ICU4J: Fix parser bug where backup() was counting in code units rathe…
catamorphism Aug 6, 2024
5c8ccb4
ICU4J: Fix bug where whitespace wasn't getting pushed back after read…
catamorphism Aug 7, 2024
4564da6
ICU4J: Fix reserved escape chars to match grammar
catamorphism Aug 7, 2024
a752a33
ICU4J: Fix bug with '||' in a reserved statement being misparsed
catamorphism Aug 7, 2024
bb4a03d
ICU4J: Allow any type of expression in an unsupported statement, per …
catamorphism Aug 7, 2024
cb3594b
ICU4J: Fix grammar rule in comment
catamorphism Aug 9, 2024
eee547c
ICU4J: Undo change to backup() and make uses of backup() and readCode…
catamorphism Aug 9, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 14 additions & 17 deletions icu4c/source/i18n/messageformat2_data_model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -829,31 +829,28 @@ const Expression& Binding::getValue() const {
} else {
const Operator* rator = rhs.getOperator(errorCode);
bool hasOperator = U_SUCCESS(errorCode);
if (hasOperator && rator->isReserved()) {
errorCode = U_INVALID_STATE_ERROR;
// Clear error code -- the "error" from the absent operator
// is handled
errorCode = U_ZERO_ERROR;
b = Binding(variableName, std::move(rhs));
b.local = false;
if (hasOperator) {
rator = b.getValue().getOperator(errorCode);
U_ASSERT(U_SUCCESS(errorCode));
b.annotation = &rator->contents;
} else {
// Clear error code -- the "error" from the absent operator
// is handled
errorCode = U_ZERO_ERROR;
b = Binding(variableName, std::move(rhs));
b.local = false;
if (hasOperator) {
rator = b.getValue().getOperator(errorCode);
U_ASSERT(U_SUCCESS(errorCode));
b.annotation = std::get_if<Callable>(&(rator->contents));
} else {
b.annotation = nullptr;
}
U_ASSERT(!hasOperator || b.annotation != nullptr);
b.annotation = nullptr;
}
U_ASSERT(!hasOperator || b.annotation != nullptr);
}
}
return b;
}

const OptionMap& Binding::getOptionsInternal() const {
U_ASSERT(annotation != nullptr);
return annotation->getOptions();
U_ASSERT(std::holds_alternative<Callable>(*annotation));
return std::get_if<Callable>(annotation)->getOptions();
}

void Binding::updateAnnotation() {
Expand All @@ -863,7 +860,7 @@ void Binding::updateAnnotation() {
return;
}
U_ASSERT(U_SUCCESS(localErrorCode) && !rator->isReserved());
annotation = std::get_if<Callable>(&(rator->contents));
annotation = &rator->contents;
}

Binding::Binding(const Binding& other) : var(other.var), expr(other.expr), local(other.local) {
Expand Down
176 changes: 169 additions & 7 deletions icu4c/source/i18n/messageformat2_function_registry.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@

#if !UCONFIG_NO_MF2

#include <math.h>

#include "unicode/dtptngen.h"
#include "unicode/messageformat2_data_model_names.h"
#include "unicode/messageformat2_function_registry.h"
Expand Down Expand Up @@ -414,25 +416,185 @@ static FormattedPlaceholder notANumber(const FormattedPlaceholder& input) {
return FormattedPlaceholder(input, FormattedValue(UnicodeString("NaN")));
}

static FormattedPlaceholder stringAsNumber(const number::LocalizedNumberFormatter& nf, const FormattedPlaceholder& input, UErrorCode& errorCode) {
// Returns true if `c` is in the interval [`first`, `last`]
static bool inRange(UChar32 c, UChar32 first, UChar32 last) {
U_ASSERT(first < last);
return c >= first && c <= last;
}

static bool isDigit(UChar32 c) { return inRange(c, 0x0030, 0x0039); }

static int32_t parseDigit(UChar32 c) {
switch(c) {
case DIGIT_ZERO: {
return 0;
}
case DIGIT_ONE: {
return 1;
}
case DIGIT_TWO: {
return 2;
}
case THREE: {
return 3;
}
case FOUR: {
return 4;
}
case FIVE: {
return 5;
}
case SIX: {
return 6;
}
case SEVEN: {
return 7;
}
case EIGHT: {
return 8;
}
case NINE: {
return 9;
}
default: {
// Should be unreachable
return 0;
}
}
}

static int32_t parseDigits(const UnicodeString& s, int32_t& i) {
int32_t firstDigit = i;
// Assumes that i < s.length() and s[i] is a digit
while (isDigit(s[i])) {
i++;
}
int32_t numDigits = i - firstDigit;
int32_t placeValue = 1;
int32_t result = 0;
for (int32_t j = firstDigit + numDigits - 1; j >= firstDigit; j--) {
result += placeValue * parseDigit(s[j]);
placeValue *= 10;
}
return result;
}

static double parseDecimalPart(const UnicodeString& s, int32_t& i) {
int32_t firstDigit = i;
// Assumes that i < s.length() and s[i] is a digit
while (isDigit(s[i])) {
i++;
}
int32_t numDigits = i - firstDigit;
double placeValue = 10;
double result = 0;
for (int32_t j = firstDigit; j < firstDigit + numDigits; j++) {
result += ((double) parseDigit(s[j])) / placeValue;
placeValue *= 10;
}
return result;
}

static double parseNumberLiteral(const FormattedPlaceholder& input, UErrorCode& errorCode) {
if (U_FAILURE(errorCode)) {
return {};
}

double numberValue;
// Copying string to avoid GCC dangling-reference warning
// (although the reference is safe)
UnicodeString inputStr = input.asFormattable().getString(errorCode);
// Precondition: `input`'s source Formattable has type string
if (U_FAILURE(errorCode)) {
return {};
}
UErrorCode localErrorCode = U_ZERO_ERROR;
strToDouble(inputStr, numberValue, localErrorCode);
if (U_FAILURE(localErrorCode)) {
errorCode = U_MF_OPERAND_MISMATCH_ERROR;

int32_t index = 0;

#define ERROR() errorCode = U_MF_OPERAND_MISMATCH_ERROR; return 0;
#define IN_BOUNDS(inputStr, index) (index < inputStr.length())
#define CHECK_BOUNDS(inputStr, index) if (!IN_BOUNDS(inputStr, index)) { ERROR(); }

CHECK_BOUNDS(inputStr, index);

// Parse the sign if present
double sign = 1;
if (inputStr[index] == HYPHEN) {
sign = -1;
index++;
}

CHECK_BOUNDS(inputStr, index);

// Parse the integer part
if (!isDigit(inputStr[index])) {
// Non-numeric first character after sign -- not valid
ERROR();
}

// First, check for leading zero with no decimal point
if (inputStr[index] == DIGIT_ZERO) {
bool isZero = inputStr.length() == index + 1;
bool hasDecimalPart = inputStr.length() > index && inputStr[index + 1] == PERIOD;
if (!(isZero || hasDecimalPart)) {
ERROR();
}
}

double result = parseDigits(inputStr, index);

if (IN_BOUNDS(inputStr, index) && inputStr[index] == PERIOD) {
index++;
CHECK_BOUNDS(inputStr, index);
if (isDigit(inputStr[index])) {
result += parseDecimalPart(inputStr, index);
} else {
// '.' not followed by a digit is an error
ERROR();
}
}

result *= sign;

if (IN_BOUNDS(inputStr, index) &&
(inputStr[index] == UPPERCASE_E || inputStr[index] == LOWERCASE_E)) {
double exponent;
bool positive = true;
index++;
CHECK_BOUNDS(inputStr, index);
// Parse sign if present
if (inputStr[index] == PLUS) {
index++;
} else if (inputStr[index] == HYPHEN) {
positive = false;
index++;
}
// Parse exponent digits
CHECK_BOUNDS(inputStr, index);
if (!isDigit(inputStr[index])) {
ERROR();
}
exponent = parseDigits(inputStr, index);
if (positive) {
result *= (exponent * 10);
} else {
result *= (exponent * -10);
}
}

// Make sure entire input is consumed
if (index != inputStr.length()) {
ERROR();
}

return result;
}

static FormattedPlaceholder tryParsingNumberLiteral(const number::LocalizedNumberFormatter& nf, const FormattedPlaceholder& input, UErrorCode& errorCode) {
double numberValue = parseNumberLiteral(input, errorCode);
if (U_FAILURE(errorCode)) {
return notANumber(input);
}

UErrorCode savedStatus = errorCode;
number::FormattedNumber result = nf.formatDouble(numberValue, errorCode);
// Ignore U_USING_DEFAULT_WARNING
Expand Down Expand Up @@ -583,7 +745,7 @@ FormattedPlaceholder StandardFunctions::Number::format(FormattedPlaceholder&& ar
}
case UFMT_STRING: {
// Try to parse the string as a number
return stringAsNumber(realFormatter, arg, errorCode);
return tryParsingNumberLiteral(realFormatter, arg, errorCode);
}
default: {
// Other types can't be parsed as a number
Expand Down
11 changes: 11 additions & 0 deletions icu4c/source/i18n/messageformat2_macros.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,17 @@ using namespace pluralimpl;
#define LOWERCASE_E ((UChar32)0x0065)
#define UPPERCASE_E ((UChar32)0x0045)

#define DIGIT_ZERO ((UChar32)0x0030)
#define DIGIT_ONE ((UChar32)0x0031)
#define DIGIT_TWO ((UChar32)0x0032)
#define THREE ((UChar32)0x0033)
#define FOUR ((UChar32)0x0034)
#define FIVE ((UChar32)0x0035)
#define SIX ((UChar32)0x0036)
#define SEVEN ((UChar32)0x0037)
#define EIGHT ((UChar32)0x0038)
#define NINE ((UChar32)0x0039)

// Reserved sigils
#define BANG ((UChar32)0x0021)
#define AT ((UChar32)0x0040)
Expand Down
Loading