Skip to content

Commit

Permalink
Fix regexp case insensitive flag (#531)
Browse files Browse the repository at this point in the history
  • Loading branch information
saghul committed Sep 13, 2024
1 parent ac958f1 commit f5c388d
Show file tree
Hide file tree
Showing 6 changed files with 516 additions and 253 deletions.
57 changes: 1 addition & 56 deletions libregexp.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,6 @@
/*
TODO:
- Add full unicode canonicalize rules for character ranges (not
really useful but needed for exact "ignorecase" compatibility).
- Add a lock step execution mode (=linear time execution guaranteed)
when the regular expression is "simple" i.e. no backreference nor
complicated lookahead. The opcodes are designed for this execution
Expand Down Expand Up @@ -123,33 +120,6 @@ static int dbuf_insert(DynBuf *s, int pos, int len)
return 0;
}

/* Return the canonical form of code point 'c' used for case-insensitive
   matching, following the JS regexp rules.

   - is_unicode set:   ASCII 'A'..'Z' become 'a'..'z'; any other code
                       point >= 128 is replaced by the first code point
                       produced by lre_case_conv() in mode 2.
   - is_unicode clear: ASCII 'a'..'z' become 'A'..'Z'; a code point
                       >= 128 is upper-cased only when the conversion
                       yields exactly one code point that is itself
                       >= 128 (legacy regexp rule).

   Any code point not covered above is returned unchanged. */
static uint32_t lre_canonicalize(uint32_t c, BOOL is_unicode)
{
    uint32_t conv[LRE_CC_RES_LEN_MAX];

    if (likely(c < 128)) {
        /* ASCII: only the letters are affected, and the direction
           depends on the mode */
        if (is_unicode) {
            if (c >= 'A' && c <= 'Z')
                return c + ('a' - 'A');
        } else {
            if (c >= 'a' && c <= 'z')
                return c - ('a' - 'A');
        }
        return c;
    }

    if (is_unicode) {
        /* NOTE(review): mode 2 is presumably case folding -- confirm
           against lre_case_conv(). Only the first result is kept. */
        lre_case_conv(conv, c, 2);
        return conv[0];
    }

    /* legacy regexp: to upper case if single char >= 128 */
    if (lre_case_conv(conv, c, FALSE) == 1 && conv[0] >= 128)
        return conv[0];
    return c;
}

static const uint16_t char_range_d[] = {
1,
0x0030, 0x0039 + 1,
Expand Down Expand Up @@ -248,31 +218,6 @@ static int cr_init_char_range(REParseState *s, CharRange *cr, uint32_t c)
return -1;
}

/* Add to 'cr' a copy of those of its members that lie in [first, last],
   with every interval bound shifted by 'delta'. The existing members of
   'cr' are kept.

   Returns 0 on success, or the non-zero status of the failing range
   operation (cr_op() / cr_union1()). */
static int cr_add_shifted_range(CharRange *cr, uint32_t first, uint32_t last,
                                int delta)
{
    CharRange part;
    uint32_t bounds[2];
    int i, ret;

    cr_init(&part, cr->mem_opaque, lre_realloc);
    bounds[0] = first;
    bounds[1] = last + 1; /* interval end is exclusive */
    ret = cr_op(&part, cr->points, cr->len, bounds, 2, CR_OP_INTER);
    if (ret == 0) {
        for (i = 0; i < part.len; i++)
            part.points[i] += delta;
        ret = cr_union1(cr, part.points, part.len);
    }
    cr_free(&part);
    return ret;
}

/* Extend the character class 'cr' so that it can be used for
   case-insensitive matching of ASCII letters.

   lre_canonicalize() maps ASCII letters to upper case in legacy mode but
   to lower case in unicode mode, and this function is not told which
   mode is active. Adding only the upper-case images of 'a'..'z' (as was
   done previously) leaves a class such as [A-Z] without 'a'..'z', which
   is wrong when the canonical form is lower case. Both directions are
   therefore added, which is correct for either canonical form.

   XXX: the generic unicode case would be much more complicated; only
   ASCII letters are handled here.

   Returns 0 on success, non-zero on failure (the caller treats this as
   a memory error). */
static int cr_canonicalize(CharRange *cr)
{
    int ret;

    /* lower case members -> add their upper case images */
    ret = cr_add_shifted_range(cr, 'a', 'z', 'A' - 'a');
    if (ret)
        return ret;
    /* upper case members -> add their lower case images. Running this on
       the already extended class is harmless: the images of the letters
       added by the first step are the original lower case members. */
    return cr_add_shifted_range(cr, 'A', 'Z', 'a' - 'A');
}

#ifdef DUMP_REOP
static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
int buf_len)
Expand Down Expand Up @@ -955,7 +900,7 @@ static int re_parse_char_class(REParseState *s, const uint8_t **pp)
}
}
if (s->ignore_case) {
if (cr_canonicalize(cr))
if (cr_regexp_canonicalize(cr, s->is_unicode))
goto memory_error;
}
if (invert) {
Expand Down
124 changes: 61 additions & 63 deletions libunicode-table.h
Original file line number Diff line number Diff line change
Expand Up @@ -3777,72 +3777,70 @@ static const uint8_t unicode_prop_Changes_When_Titlecased1_table[22] = {
0x8b, 0x80, 0x8e, 0x80, 0xae, 0x80,
};

static const uint8_t unicode_prop_Changes_When_Casefolded1_table[33] = {
0x40, 0xde, 0x80, 0xcf, 0x80, 0x97, 0x80, 0x44,
0x3c, 0x80, 0x59, 0x11, 0x80, 0x40, 0xe4, 0x3f,
0x3f, 0x87, 0x89, 0x11, 0x05, 0x02, 0x11, 0x80,
0xa9, 0x11, 0x80, 0x60, 0xdb, 0x07, 0x86, 0x8b,
0x84,
/* Encoded data for the Changes_When_Casefolded1 property table: 29 bytes,
   matching the declared array length.
   NOTE(review): the byte encoding is not visible here; presumably this
   table is produced by a generator and decoded elsewhere -- confirm
   before editing any value by hand. */
static const uint8_t unicode_prop_Changes_When_Casefolded1_table[29] = {
0x41, 0xef, 0x80, 0x41, 0x9e, 0x80, 0x9e, 0x80,
0x5a, 0xe4, 0x83, 0x40, 0xb5, 0x00, 0x00, 0x00,
0x80, 0xde, 0x06, 0x06, 0x80, 0x8a, 0x09, 0x81,
0x89, 0x10, 0x81, 0x8d, 0x80,
};

static const uint8_t unicode_prop_Changes_When_NFKC_Casefolded1_table[451] = {
static const uint8_t unicode_prop_Changes_When_NFKC_Casefolded1_table[447] = {
0x40, 0x9f, 0x06, 0x00, 0x01, 0x00, 0x01, 0x12,
0x10, 0x82, 0x9f, 0x80, 0xcf, 0x01, 0x80, 0x8b,
0x07, 0x80, 0xfb, 0x01, 0x01, 0x80, 0xa5, 0x80,
0x40, 0xbb, 0x88, 0x9e, 0x29, 0x84, 0xda, 0x08,
0x81, 0x89, 0x80, 0xa3, 0x04, 0x02, 0x04, 0x08,
0x80, 0xc9, 0x82, 0x9c, 0x80, 0x41, 0x93, 0x80,
0x40, 0x93, 0x80, 0xd7, 0x83, 0x42, 0xde, 0x87,
0xfb, 0x08, 0x80, 0xd2, 0x01, 0x80, 0xa1, 0x11,
0x80, 0x40, 0xfc, 0x81, 0x42, 0xd4, 0x80, 0xfe,
0x80, 0xa7, 0x81, 0xad, 0x80, 0xb5, 0x80, 0x88,
0x03, 0x03, 0x03, 0x80, 0x8b, 0x80, 0x88, 0x00,
0x26, 0x80, 0x90, 0x80, 0x88, 0x03, 0x03, 0x03,
0x80, 0x8b, 0x80, 0x41, 0x41, 0x80, 0xe1, 0x81,
0x46, 0x52, 0x81, 0xd4, 0x84, 0x45, 0x1b, 0x10,
0x8a, 0x80, 0x91, 0x80, 0x9b, 0x8c, 0x80, 0xa1,
0xa4, 0x40, 0xd9, 0x80, 0x40, 0xd5, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x01, 0x3f, 0x3f, 0x87,
0x89, 0x11, 0x04, 0x00, 0x29, 0x04, 0x12, 0x80,
0x88, 0x12, 0x80, 0x88, 0x11, 0x11, 0x04, 0x08,
0x8f, 0x00, 0x20, 0x8b, 0x12, 0x2a, 0x08, 0x0b,
0x00, 0x07, 0x82, 0x8c, 0x06, 0x92, 0x81, 0x9a,
0x80, 0x8c, 0x8a, 0x80, 0xd6, 0x18, 0x10, 0x8a,
0x01, 0x0c, 0x0a, 0x00, 0x10, 0x11, 0x02, 0x06,
0x05, 0x1c, 0x85, 0x8f, 0x8f, 0x8f, 0x88, 0x80,
0x40, 0xa1, 0x08, 0x81, 0x40, 0xf7, 0x81, 0x41,
0x34, 0xd5, 0x99, 0x9a, 0x45, 0x20, 0x80, 0xe6,
0x82, 0xe4, 0x80, 0x41, 0x9e, 0x81, 0x40, 0xf0,
0x80, 0x41, 0x2e, 0x80, 0xd2, 0x80, 0x8b, 0x40,
0xd5, 0xa9, 0x80, 0xb4, 0x00, 0x82, 0xdf, 0x09,
0x80, 0xde, 0x80, 0xb0, 0xdd, 0x82, 0x8d, 0xdf,
0x9e, 0x80, 0xa7, 0x87, 0xae, 0x80, 0x41, 0x7f,
0x60, 0x72, 0x9b, 0x81, 0x40, 0xd1, 0x80, 0x40,
0x80, 0x12, 0x81, 0x43, 0x61, 0x83, 0x88, 0x80,
0x60, 0x4d, 0x95, 0x41, 0x0d, 0x08, 0x00, 0x81,
0x89, 0x00, 0x00, 0x09, 0x82, 0xc3, 0x81, 0xe9,
0xa5, 0x86, 0x8b, 0x24, 0x00, 0x97, 0x04, 0x00,
0x01, 0x01, 0x80, 0xeb, 0xa0, 0x41, 0x6a, 0x91,
0xbf, 0x81, 0xb5, 0xa7, 0x8c, 0x82, 0x99, 0x95,
0x94, 0x81, 0x8b, 0x80, 0x92, 0x03, 0x1a, 0x00,
0x80, 0x40, 0x86, 0x08, 0x80, 0x9f, 0x99, 0x40,
0x83, 0x15, 0x0d, 0x0d, 0x0a, 0x16, 0x06, 0x80,
0x88, 0x47, 0x87, 0x20, 0xa9, 0x80, 0x88, 0x60,
0xb4, 0xe4, 0x83, 0x54, 0xb9, 0x86, 0x8d, 0x87,
0xbf, 0x85, 0x42, 0x3e, 0xd4, 0x80, 0xc6, 0x01,
0x08, 0x09, 0x0b, 0x80, 0x8b, 0x00, 0x06, 0x80,
0xc0, 0x03, 0x0f, 0x06, 0x80, 0x9b, 0x03, 0x04,
0x00, 0x16, 0x80, 0x41, 0x53, 0x81, 0x41, 0x23,
0x81, 0xb1, 0x48, 0x2f, 0xbd, 0x4d, 0x91, 0x18,
0x9a, 0x01, 0x00, 0x08, 0x80, 0x89, 0x03, 0x00,
0x00, 0x28, 0x18, 0x00, 0x00, 0x02, 0x01, 0x00,
0x08, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x0b,
0x06, 0x03, 0x03, 0x00, 0x80, 0x89, 0x80, 0x90,
0x22, 0x04, 0x80, 0x90, 0x42, 0x43, 0x8a, 0x84,
0x9e, 0x80, 0x9f, 0x99, 0x82, 0xa2, 0x80, 0xee,
0x82, 0x8c, 0xab, 0x83, 0x88, 0x31, 0x49, 0x9d,
0x89, 0x60, 0xfc, 0x05, 0x42, 0x1d, 0x6b, 0x05,
0xe1, 0x4f, 0xff,
0x10, 0x82, 0xf3, 0x80, 0x8b, 0x80, 0x40, 0x84,
0x01, 0x01, 0x80, 0xa2, 0x01, 0x80, 0x40, 0xbb,
0x88, 0x9e, 0x29, 0x84, 0xda, 0x08, 0x81, 0x89,
0x80, 0xa3, 0x04, 0x02, 0x04, 0x08, 0x07, 0x80,
0x9e, 0x80, 0xa0, 0x82, 0x9c, 0x80, 0x42, 0x28,
0x80, 0xd7, 0x83, 0x42, 0xde, 0x87, 0xfb, 0x08,
0x80, 0xd2, 0x01, 0x80, 0xa1, 0x11, 0x80, 0x40,
0xfc, 0x81, 0x42, 0xd4, 0x80, 0xfe, 0x80, 0xa7,
0x81, 0xad, 0x80, 0xb5, 0x80, 0x88, 0x03, 0x03,
0x03, 0x80, 0x8b, 0x80, 0x88, 0x00, 0x26, 0x80,
0x90, 0x80, 0x88, 0x03, 0x03, 0x03, 0x80, 0x8b,
0x80, 0x41, 0x41, 0x80, 0xe1, 0x81, 0x46, 0x52,
0x81, 0xd4, 0x84, 0x45, 0x1b, 0x10, 0x8a, 0x80,
0x91, 0x80, 0x9b, 0x8c, 0x80, 0xa1, 0xa4, 0x40,
0xd5, 0x83, 0x40, 0xb5, 0x00, 0x00, 0x00, 0x80,
0x99, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80,
0xb7, 0x05, 0x00, 0x13, 0x05, 0x11, 0x02, 0x0c,
0x11, 0x00, 0x00, 0x0c, 0x15, 0x05, 0x08, 0x8f,
0x00, 0x20, 0x8b, 0x12, 0x2a, 0x08, 0x0b, 0x00,
0x07, 0x82, 0x8c, 0x06, 0x92, 0x81, 0x9a, 0x80,
0x8c, 0x8a, 0x80, 0xd6, 0x18, 0x10, 0x8a, 0x01,
0x0c, 0x0a, 0x00, 0x10, 0x11, 0x02, 0x06, 0x05,
0x1c, 0x85, 0x8f, 0x8f, 0x8f, 0x88, 0x80, 0x40,
0xa1, 0x08, 0x81, 0x40, 0xf7, 0x81, 0x41, 0x34,
0xd5, 0x99, 0x9a, 0x45, 0x20, 0x80, 0xe6, 0x82,
0xe4, 0x80, 0x41, 0x9e, 0x81, 0x40, 0xf0, 0x80,
0x41, 0x2e, 0x80, 0xd2, 0x80, 0x8b, 0x40, 0xd5,
0xa9, 0x80, 0xb4, 0x00, 0x82, 0xdf, 0x09, 0x80,
0xde, 0x80, 0xb0, 0xdd, 0x82, 0x8d, 0xdf, 0x9e,
0x80, 0xa7, 0x87, 0xae, 0x80, 0x41, 0x7f, 0x60,
0x72, 0x9b, 0x81, 0x40, 0xd1, 0x80, 0x40, 0x80,
0x12, 0x81, 0x43, 0x61, 0x83, 0x88, 0x80, 0x60,
0x4d, 0x95, 0x41, 0x0d, 0x08, 0x00, 0x81, 0x89,
0x00, 0x00, 0x09, 0x82, 0xc3, 0x81, 0xe9, 0xc2,
0x00, 0x97, 0x04, 0x00, 0x01, 0x01, 0x80, 0xeb,
0xa0, 0x41, 0x6a, 0x91, 0xbf, 0x81, 0xb5, 0xa7,
0x8c, 0x82, 0x99, 0x95, 0x94, 0x81, 0x8b, 0x80,
0x92, 0x03, 0x1a, 0x00, 0x80, 0x40, 0x86, 0x08,
0x80, 0x9f, 0x99, 0x40, 0x83, 0x15, 0x0d, 0x0d,
0x0a, 0x16, 0x06, 0x80, 0x88, 0x47, 0x87, 0x20,
0xa9, 0x80, 0x88, 0x60, 0xb4, 0xe4, 0x83, 0x54,
0xb9, 0x86, 0x8d, 0x87, 0xbf, 0x85, 0x42, 0x3e,
0xd4, 0x80, 0xc6, 0x01, 0x08, 0x09, 0x0b, 0x80,
0x8b, 0x00, 0x06, 0x80, 0xc0, 0x03, 0x0f, 0x06,
0x80, 0x9b, 0x03, 0x04, 0x00, 0x16, 0x80, 0x41,
0x53, 0x81, 0x41, 0x23, 0x81, 0xb1, 0x48, 0x2f,
0xbd, 0x4d, 0x91, 0x18, 0x9a, 0x01, 0x00, 0x08,
0x80, 0x89, 0x03, 0x00, 0x00, 0x28, 0x18, 0x00,
0x00, 0x02, 0x01, 0x00, 0x08, 0x00, 0x00, 0x00,
0x00, 0x01, 0x00, 0x0b, 0x06, 0x03, 0x03, 0x00,
0x80, 0x89, 0x80, 0x90, 0x22, 0x04, 0x80, 0x90,
0x42, 0x43, 0x8a, 0x84, 0x9e, 0x80, 0x9f, 0x99,
0x82, 0xa2, 0x80, 0xee, 0x82, 0x8c, 0xab, 0x83,
0x88, 0x31, 0x49, 0x9d, 0x89, 0x60, 0xfc, 0x05,
0x42, 0x1d, 0x6b, 0x05, 0xe1, 0x4f, 0xff,
};

static const uint8_t unicode_prop_ASCII_Hex_Digit_table[5] = {
Expand Down
Loading

0 comments on commit f5c388d

Please sign in to comment.