regex.c

/* Copyright 2009-2024
 * Kaz Kylheku <kaz@kylheku.com>
 * Vancouver, Canada
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
#include <assert.h>
#include <limits.h>
#include <signal.h>
#include <stdarg.h>
#include "config.h"
#include "alloca.h"
#include "lib.h"
#include "parser.h"
#include "signal.h"
#include "unwind.h"
#include "stream.h"
#include "gc.h"
#include "eval.h"
#include "cadr.h"
#include "itypes.h"
#include "regex.h"
#include "txr.h"

#if WCHAR_MAX > 65535
#define FULL_UNICODE
#endif

typedef union nfa_state nfa_state_t;

typedef struct nfa {
  nfa_state_t *start;
  nfa_state_t *accept;
} nfa_t;

typedef enum { REGEX_NFA, REGEX_DV } regex_kind_t;

typedef struct regex {
  regex_kind_t kind;
  union {
    struct nfa nfa;
    val dv;
  } r;
  int nstates;
  val source;
} regex_t;

/*
 * Result from regex_machine_feed.
 * These values have two meanings, based on whether
 * the matching is still open (characters are being fed)
 * or finalized.
 *
 * When feeding characters:
 * REGM_INCOMPLETE: no match at this character, but matching can continue.
 * REGM_FAIL: no more state transitions are possible.
 * REGM_MATCH: match (accept state) for this character.
 * REGM_MATCH_DONE: match (accept state) for this character, and no more
 * moves are possible.
 *
 * When the end of the input is encountered, or a REGM_FAIL,
 * then regex_machine_feed is called one more time with
 * the null character. It then reports:
 * REGM_INCOMPLETE: there was a partial match for the input.
 * REGM_FAIL: none of the input matched.
 * REGM_MATCH: the input was completely matched
 * REGM_MATCH_DONE: not returned.
 *
 * Note that a REGM_FAIL (no transitions) during the character feeding phase
 * can turn into REGM_INCOMPLETE (partial match) when the match is sealed with
 * the null character signal!
 */
typedef enum regm_result {
  REGM_INCOMPLETE,
  REGM_FAIL,
  REGM_MATCH,
  REGM_MATCH_DONE
} regm_result_t;

typedef union regex_machine regex_machine_t;

typedef unsigned int bitcell_t;

#define CHAR_SET_SIZE (256 / (sizeof (bitcell_t) * CHAR_BIT))

#define BITCELL_BIT (sizeof (bitcell_t) * CHAR_BIT)

#define CHAR_SET_INDEX(CH) ((CH) / BITCELL_BIT)
#define CHAR_SET_BIT(CH) ((CH) % BITCELL_BIT)

#define CHAR_SET_L0(CH) ((CH) & 0xFF)
#define CHAR_SET_L1(CH) (((CH) >> 8) & 0xF)
#define CHAR_SET_L2(CH) (((CH) >> 12) & 0xF)
#ifdef FULL_UNICODE
#define CHAR_SET_L3(CH) (((CH) >> 16) & 0x1F)
#endif

#ifdef FULL_UNICODE
#define CHAR_SET_L2_LO(CH) ((CH) & ~convert(wchar_t, 0xFFFF))
#define CHAR_SET_L2_HI(CH) ((CH) | convert(wchar_t, 0xFFFF))
#endif

#define CHAR_SET_L1_LO(CH) ((CH) & ~convert(wchar_t, 0xFFF))
#define CHAR_SET_L1_HI(CH) ((CH) | convert(wchar_t, 0xFFF))

#define CHAR_SET_L0_LO(CH) ((CH) & ~convert(wchar_t, 0xFF))
#define CHAR_SET_L0_HI(CH) ((CH) | convert(wchar_t, 0xFF))

typedef enum {
  CHSET_SMALL, CHSET_DISPLACED, CHSET_LARGE,
#ifdef FULL_UNICODE
  CHSET_XLARGE
#endif
} chset_type_t;

typedef bitcell_t cset_L0_t[CHAR_SET_SIZE];
typedef cset_L0_t *cset_L1_t[16];
typedef cset_L1_t *cset_L2_t[16];
#ifdef FULL_UNICODE
typedef cset_L2_t *cset_L3_t[17];
#endif

struct any_char_set {
  unsigned type : 3;
  unsigned comp : 1;
  unsigned stat : 1;
};

struct small_char_set {
  unsigned type : 3;
  unsigned comp : 1;
  unsigned stat : 1;
  cset_L0_t bitcell;
};

struct displaced_char_set {
  unsigned type : 3;
  unsigned comp : 1;
  unsigned stat : 1;
  cset_L0_t bitcell;
  wchar_t base;
};


struct large_char_set {
  unsigned type : 3;
  unsigned comp : 1;
  unsigned stat : 1;
  cset_L2_t dir;
};

#ifdef FULL_UNICODE
struct xlarge_char_set {
  unsigned type : 3;
  unsigned comp : 1;
  unsigned stat : 1;
  cset_L3_t dir;
};
#endif

typedef union char_set {
  struct any_char_set any;
  struct small_char_set s;
  struct displaced_char_set d;
  struct large_char_set l;
#ifdef FULL_UNICODE
  struct xlarge_char_set xl;
#endif
} char_set_t;

typedef enum {
  nfa_empty, nfa_accept, nfa_wild, nfa_single, nfa_set
} nfa_kind_t;

struct nfa_state_any {
  nfa_kind_t kind;
  unsigned visited;
};

struct nfa_state_empty {
  nfa_kind_t kind;
  unsigned visited;
  nfa_state_t *trans0;
  nfa_state_t *trans1;
};

struct nfa_state_single {
  nfa_kind_t kind;
  unsigned visited;
  nfa_state_t *trans;
  wchar_t ch;
};

struct nfa_state_set {
  nfa_kind_t kind;
  unsigned visited;
  nfa_state_t *trans;
  char_set_t *set;
};

union nfa_state {
  struct nfa_state_any a;
  struct nfa_state_empty e;
  struct nfa_state_single o;
  struct nfa_state_set s;
};

#define nfa_accept_state_p(s) ((s)->a.kind == nfa_accept)
#define nfa_empty_state_p(s) ((s)->a.kind == nfa_accept || \
                              (s)->a.kind == nfa_empty)
#define nfa_has_transitions(s) ((s)->a.kind != nfa_empty || \
                                (s)->e.trans0 || (s)->e.trans1)

struct nfa_machine {
  int is_nfa;           /* common member */
  cnum last_accept_pos; /* common member */
  cnum count;           /* common member */
  unsigned visited;
  nfa_state_t **set, **stack;
  int nclos;
  nfa_t nfa;
  int nstates;
};

struct dv_machine {
  int is_nfa;           /* common member */
  cnum last_accept_pos; /* common member */
  cnum count;           /* common member */
  val deriv;
  val regex;
};

union regex_machine {
  struct nfa_machine n;
  struct dv_machine d;
};

int opt_derivative_regex = 0;

struct cobj_class *regex_cls;
static struct cobj_class *chset_cls;

wchar_t spaces[] = {
  0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x0020, 0x00a0, 0x1680, 0x180e,
  0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008,
  0x2009, 0x200a, 0x2028, 0x2029, 0x205f, 0x3000, 0
};

static int L0_full(cset_L0_t *L0)
{
  int i;

  for (i = 0; i < convert(int, CHAR_SET_SIZE); i++)
    if ((*L0)[i] != convert(bitcell_t, -1))
      return 0;
  return 1;
}

static void L0_fill_range(cset_L0_t *L0, wchar_t ch0, wchar_t ch1)
{
  int i;
  int bt0 = CHAR_SET_BIT(ch0);
  int bc0 = CHAR_SET_INDEX(ch0);
  bitcell_t mask0 = ~((convert(bitcell_t, 1) << bt0) - 1);
  int bt1 = CHAR_SET_BIT(ch1);
  int bc1 = CHAR_SET_INDEX(ch1);
  bitcell_t mask1 = (bt1 == (BITCELL_BIT - 1))
                     ? convert(bitcell_t, -1)
                     : (convert(bitcell_t, 1) << (bt1 + 1)) - 1;

  if (bc1 == bc0) {
    (*L0)[bc0] |= (mask0 & mask1);
  } else {
    (*L0)[bc0] |= mask0;
    (*L0)[bc1] |= mask1;
    for (i = bc0 + 1; i < bc1; i++)
      (*L0)[i] = convert(bitcell_t, -1);
  }
}

static int L0_contains(cset_L0_t *L0, wchar_t ch)
{
  return ((*L0)[CHAR_SET_INDEX(ch)] &
          (convert(bitcell_t, 1) << CHAR_SET_BIT(ch))) != 0;
}

static int L1_full(cset_L1_t *L1)
{
  int i;
  for (i = 0; i < 16; i++)
    if ((*L1)[i] != coerce(cset_L0_t *, -1))
      return 0;
  return 1;
}

static void L1_fill_range(cset_L1_t *L1, wchar_t ch0, wchar_t ch1)
{
  int i1, i10, i11;

  i10 = CHAR_SET_L1(ch0);
  i11 = CHAR_SET_L1(ch1);

  for (i1 = i10; i1 <= i11; i1++) {
    wchar_t c0 = 0, c1 = 0;
    cset_L0_t *L0;

    if (i1 > i10 && i1 < i11) {
      if ((*L1)[i1] != coerce(cset_L0_t *, -1)) {
        free((*L1)[i1]);
        (*L1)[i1] = coerce(cset_L0_t *, -1);
      }
      continue;
    } else if (i10 == i11) {
      c0 = ch0;
      c1 = ch1;
    } else if (i1 == i10) {
      c0 = ch0;
      c1 = CHAR_SET_L0_HI(ch0);
    } else if (i1 == i11) {
      c0 = CHAR_SET_L0_LO(ch1);
      c1 = ch1;
    }

    if ((L0 = (*L1)[i1]) == coerce(cset_L0_t *, -1))
      continue;

    if (L0 == 0) {
      static cset_L0_t blank;
      L0 = (*L1)[i1] = coerce(cset_L0_t *, chk_malloc(sizeof *L0));
      memcpy(L0, &blank, sizeof *L0);
    }

    L0_fill_range(L0, CHAR_SET_L0(c0), CHAR_SET_L0(c1));

    if (L0_full(L0)) {
      free(L0);
      (*L1)[i1] = coerce(cset_L0_t *, -1);
    }
  }
}

static int L1_contains(cset_L1_t *L1, wchar_t ch)
{
  int i1 = CHAR_SET_L1(ch);
  cset_L0_t *L0 = (*L1)[i1];

  if (L0 == 0)
    return 0;
  else if (L0 == coerce(cset_L0_t *, -1))
    return 1;
  else
    return L0_contains(L0, CHAR_SET_L0(ch));
}


static void L1_free(cset_L1_t *L1)
{
  int i1;

  if (L1 == coerce(cset_L1_t *, -1))
    return;

  for (i1 = 0; i1 < 16; i1++)
    if ((*L1)[i1] != coerce(cset_L0_t *, -1))
      free((*L1)[i1]);
}

#ifdef FULL_UNICODE
static int L2_full(cset_L2_t *L2)
{
  int i;
  for (i = 0; i < 16; i++)
    if ((*L2)[i] != coerce(cset_L1_t *, -1))
      return 0;
  return 1;
}
#endif

static void L2_fill_range(cset_L2_t *L2, wchar_t ch0, wchar_t ch1)
{
  int i2, i20, i21;

  i20 = CHAR_SET_L2(ch0);
  i21 = CHAR_SET_L2(ch1);

  for (i2 = i20; i2 <= i21; i2++) {
    wchar_t c0 = 0, c1 = 0;
    cset_L1_t *L1;

    if (i2 > i20 && i2 < i21) {
      if ((*L2)[i2] != coerce(cset_L1_t *, -1)) {
        free((*L2)[i2]);
        (*L2)[i2] = coerce(cset_L1_t *, -1);
      }
      continue;
    } else if (i20 == i21) {
      c0 = ch0;
      c1 = ch1;
    } else if (i2 == i20) {
      c0 = ch0;
      c1 = CHAR_SET_L1_HI(ch0);
    } else if (i2 == i21) {
      c0 = CHAR_SET_L1_LO(ch1);
      c1 = ch1;
    }

    if ((L1 = (*L2)[i2]) == coerce(cset_L1_t *, -1))
      continue;

    if (L1 == 0) {
      static cset_L1_t blank;
      L1 = (*L2)[i2] = coerce(cset_L1_t *, chk_malloc(sizeof *L1));
      memcpy(L1, &blank, sizeof *L1);
    }

    L1_fill_range(L1, c0, c1);

    if (L1_full(L1)) {
      free(L1);
      (*L2)[i2] = coerce(cset_L1_t *, -1);
    }
  }
}

static int L2_contains(cset_L2_t *L2, wchar_t ch)
{
  int i2 = CHAR_SET_L2(ch);
  cset_L1_t *L1 = (*L2)[i2];

  if (L1 == 0)
    return 0;
  else if (L1 == coerce(cset_L1_t *, -1))
    return 1;
  else
    return L1_contains(L1, ch);
}

static void L2_free(cset_L2_t *L2)
{
  int i2;

  for (i2 = 0; i2 < 16; i2++) {
    cset_L1_t *L1 = (*L2)[i2];
    if (L1 != 0 && L1 != coerce(cset_L1_t *, -1)) {
      L1_free((*L2)[i2]);
      free((*L2)[i2]);
    }
  }
}

#ifdef FULL_UNICODE

static void L3_fill_range(cset_L3_t *L3, wchar_t ch0, wchar_t ch1)
{
  int i3, i30, i31;

  i30 = CHAR_SET_L3(ch0);
  i31 = CHAR_SET_L3(ch1);

  for (i3 = i30; i3 <= i31; i3++) {
    wchar_t c0 = 0, c1 = 0;
    cset_L2_t *L2;

    if (i3 > i30 && i3 < i31) {
      if ((*L3)[i3] != coerce(cset_L2_t *, -1)) {
        free((*L3)[i3]);
        (*L3)[i3] = coerce(cset_L2_t *, -1);
      }
      continue;
    } else if (i30 == i31) {
      c0 = ch0;
      c1 = ch1;
    } else if (i3 == i30) {
      c0 = ch0;
      c1 = CHAR_SET_L2_HI(ch0);
    } else if (i3 == i31) {
      c0 = CHAR_SET_L2_LO(ch1);
      c1 = ch1;
    }

    if ((L2 = (*L3)[i3]) == coerce(cset_L2_t *, -1))
      continue;

    if (L2 == 0) {
      static cset_L2_t blank;
      L2 = (*L3)[i3] = coerce(cset_L2_t *, chk_malloc(sizeof *L2));
      memcpy(L2, &blank, sizeof *L2);
    }

    L2_fill_range(L2, c0, c1);
    if (L2_full(L2)) {
      free(L2);
      (*L3)[i3] = coerce(cset_L2_t *, -1);
    }
  }
}


static int L3_contains(cset_L3_t *L3, wchar_t ch)
{
  int i3 = CHAR_SET_L3(ch);
  cset_L2_t *L2 = (*L3)[i3];

  if (L2 == 0)
    return 0;
  else if (L2 == coerce(cset_L2_t *, -1))
    return 1;
  else
    return L2_contains(L2, ch);
}

static void L3_free(cset_L3_t *L3)
{
  int i3;

  for (i3 = 0; i3 < 17; i3++) {
    cset_L2_t *L2 = (*L3)[i3];
    if (L2 != 0 && L2 != coerce(cset_L2_t *, -1)) {
      L2_free((*L3)[i3]);
      free((*L3)[i3]);
    }
  }
}

#endif

static char_set_t *char_set_create(chset_type_t type, wchar_t base, unsigned st)
{
  static char_set_t blank;
  char_set_t *cs = coerce(char_set_t *, chk_malloc(sizeof *cs));
  *cs = blank;
  cs->any.type = type;
  cs->any.stat = st;

  if (type == CHSET_DISPLACED)
    cs->d.base = base;

  return cs;
}

static void char_set_destroy(char_set_t *set, int force)
{
  if (!set)
    return;

  if (set->any.stat && !force)
    return;

  switch (set->any.type) {
  case CHSET_DISPLACED:
  case CHSET_SMALL:
    free(set);
    break;
  case CHSET_LARGE:
    L2_free(&set->l.dir);
    free(set);
    break;
#ifdef FULL_UNICODE
  case CHSET_XLARGE:
    L3_free(&set->xl.dir);
    free(set);
    break;
#endif
  }
}

static void char_set_compl(char_set_t *set)
{
  set->any.comp = 1;
}

static void char_set_add(char_set_t *set, wchar_t ch)
{
  switch (set->any.type) {
  case CHSET_DISPLACED:
    assert (ch >= set->d.base && ch < set->d.base + 256);
    ch -= set->d.base;
    /* fallthrough */
  case CHSET_SMALL:
    assert (ch < 256);
    set->s.bitcell[CHAR_SET_INDEX(ch)] |= (convert(bitcell_t, 1)
                                           << CHAR_SET_BIT(ch));
    break;
  case CHSET_LARGE:
    assert (ch < 0x10000);
    L2_fill_range(&set->l.dir, ch, ch);
    break;
#ifdef FULL_UNICODE
  case CHSET_XLARGE:
    assert (ch < 0x110000);
    L3_fill_range(&set->xl.dir, ch, ch);
    break;
#endif
  }
}

static void char_set_add_range(char_set_t *set, wchar_t ch0, wchar_t ch1)
{
  if (ch0 >= ch1)
    return;

  switch (set->any.type) {
  case CHSET_DISPLACED:
    assert (ch0 >= set->d.base && ch1 < set->d.base + 256);
    ch0 -= set->d.base;
    ch1 -= set->d.base;
    /* fallthrough */
  case CHSET_SMALL:
    assert (ch1 < 256);
    L0_fill_range(&set->s.bitcell, ch0, ch1);
    break;
  case CHSET_LARGE:
    assert (ch1 < 0x10000);
    L2_fill_range(&set->l.dir, ch0, ch1);
    break;
#ifdef FULL_UNICODE
  case CHSET_XLARGE:
    assert (ch1 < 0x110000);
    L3_fill_range(&set->xl.dir, ch0, ch1);
    break;
#endif
  }
}

static void char_set_add_str(char_set_t *set, wchar_t *str)
{
  while (*str != 0)
    char_set_add(set, *str++);
}

static int char_set_contains(char_set_t *set, wchar_t ch)
{
  int result = 0;

  switch (set->any.type) {
  case CHSET_DISPLACED:
    if (ch < set->d.base)
      break;
    ch -= set->d.base;
    /* fallthrough */
  case CHSET_SMALL:
    if (ch >= 256)
      break;
    result = L0_contains(&set->s.bitcell, ch);
    break;
  case CHSET_LARGE:
    if (ch >= 0x10000)
      break;
    result = L2_contains(&set->l.dir, ch);
    break;
#ifdef FULL_UNICODE
  case CHSET_XLARGE:
    if (ch >= 0x110000)
      break;
    result = L3_contains(&set->xl.dir, ch);
    break;
#endif
  }

  return set->any.comp ? !result : result;
}

static char_set_t *char_set_compile(val args, val comp)
{
  val iter;
  wchar_t min = WCHAR_MAX;
  wchar_t max = 0;
  chset_type_t cst;

  for (iter = args; iter; iter = rest(iter)) {
    val item = first(iter);

    if (consp(item)) {
      val from = car(item);
      val to = cdr(item);

      assert (is_chr(from) && is_chr(to));

      if (c_chr(from) < min)
        min = c_chr(from);
      if (c_chr(from) > max)
        max = c_chr(from);

      if (c_chr(to) < min)
        min = c_chr(to);
      if (c_chr(to) > max)
        max = c_chr(to);
    } else if (is_chr(item)) {
      if (c_chr(item) < min)
        min = c_chr(item);
      if (c_chr(item) > max)
        max = c_chr(item);
    } else if (item == space_k) {
      if (max < 0x3000)
        max = 0x3000;
      if (min > 0x9)
        min = 0x9;
    } else if (item == digit_k) {
      if (max < '9')
        max = 9;
      if (min > '0')
        min = 0;
    } else if (item == word_char_k) {
      if (min > 'A')
        min = 'A';
      if (max < 'z')
        max = 'z';
    } else if (item == cspace_k || item == cdigit_k || item == cword_char_k) {
      uw_throwf(error_s, lit("bad object in character class syntax: ~s"),
                item, nao);
    } else {
      assert(0 && "bad regex set");
    }
  }

  if (max < 0x100)
    cst = CHSET_SMALL;
  else if (max - min < 0x100)
    cst = CHSET_DISPLACED;
  else if (max < 0x10000)
    cst = CHSET_LARGE;
  else
#ifdef FULL_UNICODE
    cst = CHSET_XLARGE;
#else
    cst = CHSET_LARGE;
#endif


  {
    char_set_t *set = char_set_create(cst, min, 0);

    for (iter = args; iter; iter = rest(iter)) {
      val item = first(iter);

      if (consp(item)) {
        val from = car(item);
        val to = cdr(item);

        assert (is_chr(from) && is_chr(to));
        char_set_add_range(set, c_chr(from), c_chr(to));
      } else if (is_chr(item)) {
        char_set_add(set, c_chr(item));
      } else if (item == space_k) {
        char_set_add_str(set, spaces);
      } else if (item == digit_k) {
        char_set_add_range(set, '0', '9');
      } else if (item == word_char_k) {
        char_set_add_range(set, 'A', 'Z');
        char_set_add_range(set, 'a', 'z');
        char_set_add(set, '_');
      } else {
        assert(0 && "bad regex set");
      }
    }

    if (comp)
      char_set_compl(set);

    return set;
  }
}

static char_set_t *space_cs, *digit_cs, *word_cs;
static char_set_t *cspace_cs, *cdigit_cs, *cword_cs;

static void init_special_char_sets(void)
{
  space_cs = char_set_create(CHSET_LARGE, 0, 1);
  cspace_cs = char_set_create(CHSET_LARGE, 0, 1);
  digit_cs = char_set_create(CHSET_SMALL, 0, 1);
  cdigit_cs = char_set_create(CHSET_SMALL, 0, 1);
  word_cs = char_set_create(CHSET_SMALL, 0, 1);
  cword_cs = char_set_create(CHSET_SMALL, 0, 1);

  char_set_compl(cspace_cs);
  char_set_compl(cdigit_cs);
  char_set_compl(cword_cs);

  char_set_add_str(space_cs, spaces);
  char_set_add_str(cspace_cs, spaces);

  char_set_add_range(digit_cs, '0', '9');
  char_set_add_range(cdigit_cs, '0', '9');

  char_set_add_range(word_cs, 'A', 'Z');
  char_set_add_range(cword_cs, 'A', 'Z');
  char_set_add_range(word_cs, 'a', 'z');
  char_set_add_range(cword_cs, 'a', 'z');
  char_set_add(word_cs, '_');
  char_set_add(cword_cs, '_');
}

static void char_set_cobj_destroy(val chset)
{
  char_set_t *set = coerce(char_set_t *, chset->co.handle);
  char_set_destroy(set, 0);
  chset->co.handle = 0;
}

static struct cobj_ops char_set_obj_ops = cobj_ops_init(eq,
                                                        cobj_print_op,
                                                        char_set_cobj_destroy,
                                                        cobj_mark_op,
                                                        cobj_eq_hash_op,
                                                        0);

static nfa_state_t *nfa_state_accept(void)
{
  nfa_state_t *st = coerce(nfa_state_t *, chk_malloc(sizeof *st));
  st->e.kind = nfa_accept;
  st->e.visited = 0;
  st->e.trans0 = st->e.trans1 = 0;
  return st;
}

static nfa_state_t *nfa_state_empty(nfa_state_t *t0, nfa_state_t *t1)
{
  nfa_state_t *st = coerce(nfa_state_t *, chk_malloc(sizeof *st));
  st->e.kind = nfa_empty;
  st->e.visited = 0;
  st->e.trans0 = t0;
  st->e.trans1 = t1;
  return st;
}

static nfa_state_t *nfa_state_single(nfa_state_t *t, wchar_t ch)
{
  nfa_state_t *st = coerce(nfa_state_t *, chk_malloc(sizeof *st));
  st->o.kind = nfa_single;
  st->o.visited = 0;
  st->o.trans = t;
  st->o.ch = ch;
  return st;
}

static nfa_state_t *nfa_state_wild(nfa_state_t *t)
{
  nfa_state_t *st = coerce(nfa_state_t *, chk_malloc(sizeof *st));
  st->o.kind = nfa_wild;
  st->o.visited = 0;
  st->o.trans = t;
  st->o.ch = 0;
  return st;
}

static void nfa_state_free(nfa_state_t *st)
{
  if (st->a.kind == nfa_set)
    char_set_destroy(st->s.set, 0);
  free(st);
}

static void nfa_state_shallow_free(nfa_state_t *st)
{
  free(st);
}

static nfa_state_t *nfa_state_set(nfa_state_t *t, char_set_t *cs)
{
  nfa_state_t *st = coerce(nfa_state_t *, chk_malloc(sizeof *st));
  st->s.kind = nfa_set;
  st->s.visited = 0;
  st->s.trans = t;
  st->s.set = cs;
  return st;
}

/*
 * An acceptance state is converted to an empty transition
 * state with specified transitions. It thereby loses
 * its acceptance state status. This is used during
 * compilation to hook new output paths into an inner NFA,
 * either back to itself, or to a new state in the
 * surrounding new NFA.
 */
static void nfa_state_empty_convert(nfa_state_t *acc, nfa_state_t *t0,
                                    nfa_state_t *t1)
{
  assert (nfa_accept_state_p(acc));
  acc->e.kind = nfa_empty;
  acc->e.trans0 = t0;
  acc->e.trans1 = t1;
}

/*
 * Acceptance state takes on the kind of st, and all associated
 * data. I.e. we merge the identity of accept,
 * with the contents of st, such that the new state has
 * all of the outgoing arrows of st, and
 * all of the incoming arrows of acc.
 * This is easily done with an assignment, provided
 * that st doesn't have any incoming arrows.
 * We ensure that start states don't have any incoming
 * arrows in the compiler, by ensuring that repetition
 * operators terminate their backwards arrows on an
 * existing start state, and allocate a new start
 * state in front of it.
 */
static void nfa_state_merge(nfa_state_t *acc, nfa_state_t *st)
{
  assert (nfa_accept_state_p(acc));
  *acc = *st;
}

static nfa_t nfa_make(nfa_state_t *s, nfa_state_t *acc)
{
  nfa_t ret;
  ret.start = s;
  ret.accept = acc;
  return ret;
}

/*
 * Combine two NFA's representing regexps that are catenated.
 * The acceptance state of the predecessor is merged with the start state of
 * the successor.
 */
static nfa_t nfa_combine(nfa_t pred, nfa_t succ)
{
  nfa_t ret;
  ret.start = pred.start;
  ret.accept = succ.accept;
  nfa_state_merge(pred.accept, succ.start);
  nfa_state_shallow_free(succ.start); /* No longer needed. */
  return ret;
}

static nfa_t nfa_compile_set(val args, val comp)
{
  char_set_t *set = char_set_compile(args, comp);
  nfa_state_t *acc = nfa_state_accept();
  nfa_state_t *s = nfa_state_set(acc, set);
  return nfa_make(s, acc);
}

static nfa_t nfa_compile_given_set(char_set_t *set)
{
  nfa_state_t *acc = nfa_state_accept();
  nfa_state_t *s = nfa_state_set(acc, set);
  return nfa_make(s, acc);
}

static nfa_t nfa_compile_regex(val regex);

/*
 * Helper to nfa_compile_regex for compiling the argument list of
 * a compound regex.
 */

static nfa_t nfa_compile_list(val exp_list)
{
  nfa_t nfa_first = nfa_compile_regex(first(exp_list));

  if (rest(exp_list)) {
    nfa_t nfa_rest = nfa_compile_list(rest(exp_list));
    return nfa_combine(nfa_first, nfa_rest);
  } else {
    return nfa_first;
  }
}

/*
 * Input is the items from a regex form,
 * not including the regex symbol.
 * I.e.  (rest '(regex ...)) not '(regex ...).
 */
static nfa_t nfa_compile_regex(val exp)
{
  if (nilp(exp)) {
    nfa_state_t *acc = nfa_state_accept();
    nfa_state_t *s = nfa_state_empty(acc, 0);
    return nfa_make(s, acc);
  } else if (chrp(exp)) {
    nfa_state_t *acc = nfa_state_accept();
    nfa_state_t *s = nfa_state_single(acc, c_chr(exp));
    return nfa_make(s, acc);
  } else if (stringp(exp)) {
    return nfa_compile_regex(cons(compound_s, list_str(exp)));
  } else if (exp == wild_s) {
    nfa_state_t *acc = nfa_state_accept();
    nfa_state_t *s = nfa_state_wild(acc);
    return nfa_make(s, acc);
  } else if (exp == space_k) {
    return nfa_compile_given_set(space_cs);
  } else if (exp == digit_k) {
    return nfa_compile_given_set(digit_cs);
  } else if (exp == word_char_k) {
    return nfa_compile_given_set(word_cs);
  } else if (exp == cspace_k) {
    return nfa_compile_given_set(cspace_cs);
  } else if (exp == cdigit_k) {
    return nfa_compile_given_set(cdigit_cs);
  } else if (exp == cword_char_k) {
    return nfa_compile_given_set(cword_cs);
  } else if (consp(exp)) {
    val sym = first(exp), args = rest(exp);

    if (sym == set_s) {
      return nfa_compile_set(args, nil);
    } else if (sym == cset_s) {
      return nfa_compile_set(args, t);
    } else if (sym == compound_s) {
      return nfa_compile_list(args);
    } else if (sym == zeroplus_s) {
      nfa_t nfa_arg = nfa_compile_regex(first(args));
      nfa_state_t *acc = nfa_state_accept();
      /* New start state has empty transitions going through
         the inner NFA, or skipping it right to the new acceptance state. */
      nfa_state_t *s = nfa_state_empty(nfa_arg.start, acc);
      /* Convert acceptance state of inner NFA to one which has
         an empty transition back to the start state, and
         an empty transition to the new acceptance state. */
      nfa_state_empty_convert(nfa_arg.accept, nfa_arg.start, acc);
      return nfa_make(s, acc);
    } else if (sym == oneplus_s) {
      /* One-plus case differs from zero-plus in that the new start state
         does not have an empty transition to the acceptance state.
         So the inner NFA must be traversed once. */
      nfa_t nfa_arg = nfa_compile_regex(first(args));
      nfa_state_t *acc = nfa_state_accept();
      nfa_state_t *s = nfa_state_empty(nfa_arg.start, 0); /* <-- diff */
      nfa_state_empty_convert(nfa_arg.accept, nfa_arg.start, acc);
      return nfa_make(s, acc);
    } else if (sym == optional_s) {
      /* In this case, we can keep the acceptance state of the inner
         NFA as the acceptance state of the new NFA. We simply add
         a new start state which can short-circuit to it via an empty
         transition.  */
      nfa_t nfa_arg = nfa_compile_regex(first(args));
      nfa_state_t *s = nfa_state_empty(nfa_arg.start, nfa_arg.accept);
      return nfa_make(s, nfa_arg.accept);
    } else if (sym == or_s) {
      /* Simple: make a new start and acceptance state, which form
         the ends of a spindle that goes through two branches. */
      nfa_t nfa_first = nfa_compile_regex(first(args));
      nfa_t nfa_second = nfa_compile_regex(second(args));
      nfa_state_t *acc = nfa_state_accept();
      /* New state s has empty transitions into each inner NFA. */
      nfa_state_t *s = nfa_state_empty(nfa_first.start, nfa_second.start);
      /* Acceptance state of each inner NFA converted to empty
         transition to new combined acceptance state. */
      nfa_state_empty_convert(nfa_first.accept, acc, 0);
      nfa_state_empty_convert(nfa_second.accept, acc, 0);
      return nfa_make(s, acc);
    } else {
      uw_throwf(error_s, lit("bad operator in regex syntax: ~s"), sym, nao);
    }
  } else if (exp == t) {
    return nfa_make(0, 0);
  } else {
    uw_throwf(error_s, lit("bad object in regex syntax: ~s"), exp, nao);
  }
}

INLINE int nfa_test_set_visited(nfa_state_t *s, unsigned visited)
{
  if (s && s->a.visited != visited) {
    s->a.visited = visited;
    return 1;
  }
  return 0;
}

static void nfa_map_states(nfa_state_t *s,
                           mem_t *ctx, void (*fun)(nfa_state_t *,
                                                   mem_t *ctx),
                           unsigned visited)
{
  if (nfa_test_set_visited(s, visited)) {
    fun(s, ctx);

    switch (s->a.kind) {
    case nfa_empty:
    case nfa_accept:
      nfa_map_states(s->e.trans0, ctx, fun, visited);
      nfa_map_states(s->e.trans1, ctx, fun, visited);
      break;
    case nfa_wild:
    case nfa_single:
    case nfa_set:
      nfa_map_states(s->o.trans, ctx, fun, visited);
      break;
    }
  }
}

static void nfa_count_one(nfa_state_t *s, mem_t *ctx)
{
  int *pcount = coerce(int *, ctx);
  (void) s;
  (*pcount)++;
}

static int nfa_count_states(nfa_state_t *s)
{
  int count = 0;
  if (s) {
    unsigned visited = s->a.visited + 1;
    nfa_map_states(s, coerce(mem_t *, &count), nfa_count_one, visited);
  }
  return count;
}

static void nfa_handle_wraparound(nfa_state_t *s, unsigned *pvisited)
{
  if (s && *pvisited > UINT_MAX - 8) {
    s->a.visited = UINT_MAX - 1;
    (void) nfa_count_states(s);
    s->a.visited = UINT_MAX;
    (void) nfa_count_states(s);
    *pvisited = 1;
  }
}

static void nfa_collect_one(nfa_state_t *s, mem_t *ctx)
{
  nfa_state_t ***ppel = coerce(nfa_state_t ***, ctx);
  *(*ppel)++ = s;
}

static void nfa_free(nfa_t nfa, int nstates)
{
  nfa_state_t **all = coerce(nfa_state_t **, alloca(nstates * sizeof *all));
  nfa_state_t **pelem = all, *s = nfa.start;
  unsigned visited = s->a.visited + 1;
  int i;

  /* We don't care if visited has reached UINT_MAX here, because the regex is
   * going away, so we don't bother with nfa_handle_wraparound.
   */
  nfa_map_states(s, coerce(mem_t *, &pelem), nfa_collect_one, visited);

  assert (pelem - all == nstates);

  for (i = 0; i < nstates; i++)
    nfa_state_free(all[i]);
}

static void nfa_thread_epsilons(nfa_state_t **ps, unsigned visited)
{
  nfa_state_t *s = *ps, **ps0 = 0, **ps1 = 0;

  if (!s)
    return;

  switch (s->a.kind) {
  case nfa_empty:
    if (s->e.trans0 && s->e.trans1) {
      ps0 = &s->e.trans0;
      ps1 = &s->e.trans1;
    } else if (s->e.trans0) {
      *ps = s->e.trans0;
      ps0 = ps;
    } else if (s->e.trans1) {
      *ps = s->e.trans1;
      ps0 = ps;
    } else {
      *ps = 0;
    }
    break;
  case nfa_accept:
    ps0 = &s->e.trans0;
    ps1 = &s->e.trans1;
    break;
  case nfa_single:
  case nfa_wild:
  case nfa_set:
    ps0 = &s->o.trans;
    break;
  }

  if (nfa_test_set_visited(s, visited)) {
    if (ps1)
      nfa_thread_epsilons(ps1, visited);
    if (ps0)
      nfa_thread_epsilons(ps0, visited);
  }
}

static void nfa_fold_accept(nfa_state_t *s, mem_t *ctx)
{
  (void) ctx;

  if (s->a.kind == nfa_empty) {
    nfa_state_t *e0 = s->e.trans0;
    nfa_state_t *e1 = s->e.trans1;

    if (e0 && nfa_accept_state_p(e0)) {
      s->a.kind = nfa_accept;
      s->e.trans0 = 0;
    }

    if (e1 && nfa_accept_state_p(e1)) {
      s->a.kind = nfa_accept;
      s->e.trans1 = 0;
    }
  }
}

static void nfa_noop(nfa_state_t *s, mem_t *ctx)
{
  (void) s;
  (void) ctx;
}

static nfa_t nfa_optimize(nfa_t nfa)
{
  if (nfa.start) {
    int nstates = nfa_count_states(nfa.start), i;
    nfa_state_t **all = coerce(nfa_state_t **, alloca(nstates * sizeof *all));
    nfa_state_t **pelem = all;
    unsigned visited;

    /* Get all states in flat array. */
    nfa_map_states(nfa.start, coerce(mem_t *, &pelem), nfa_collect_one, nfa.start->a.visited + 1);

    /* Eliminate useless epsilons from graph. */
    nfa_thread_epsilons(&nfa.start, nfa.start->a.visited + 1);

    /* Fold accept states into empty transitions which reference them. */
    nfa_map_states(nfa.start, 0, nfa_fold_accept, nfa.start->a.visited + 1);

    /* Visit all reachable states. */
    nfa_map_states(nfa.start, 0, nfa_noop, nfa.start->a.visited + 1);

    /* Garbage-collect unreachable states. */
    visited = nfa.start->a.visited;

    for (i = 0; i < nstates; i++) {
      nfa_state_t *s = all[i];
      if (s->a.visited != visited)
        nfa_state_free(s);
    }
  }
  return nfa;
}


/*
 * Compute the epsilon-closure of the NFA states stored in the set, whose
 * size is given by nin. The results are stored back in the same array, the
 * size of which is returned. The stack parameter provides storage used by the
 * algorithm, so it doesn't have to be allocated and freed repeatedly.
 * The visited parameter is a stamp used for marking states which are added
 * to the epsilon-closure set, so that sets are not added twice.
 * If any of the states added to the closure are acceptance states,
 * the accept parameter is used to store the flag 1.
 *
 * An epsilon-closure is the set of all input states, plus all additional
 * states which are reachable from that set with empty (epsilon) transitions.
 * (Transitions that don't do not consume and match an input character).
 */
static int nfa_closure(nfa_state_t **stack, nfa_state_t **set, int nin,
                       int nstates, unsigned visited, int *accept, int *more)
{
  int i, nout = 0;
  int stackp = 0;
  int moreset = 0;

  /* First, add all states in the input set to the closure,
     push them on the stack, and mark them as visited. */
  for (i = 0; i < nin; i++) {
    nfa_state_t *s = set[i];
    bug_unless (stackp < nstates);

    s->a.visited = visited;
    stack[stackp++] = s;
    set[nout++] = s;
    if (nfa_accept_state_p(s))
      *accept = 1;
    if (!moreset && nfa_has_transitions(s))
      moreset = *more = 1;
  }

  while (stackp) {
    nfa_state_t *top = stack[--stackp];

    /* Only states of type nfa_empty are interesting.
       Each such state at most two epsilon transitions. */

    if (nfa_empty_state_p(top)) {
      nfa_state_t *e0 = top->e.trans0;
      nfa_state_t *e1 = top->e.trans1;

      if (nfa_test_set_visited(e0, visited)) {
        stack[stackp++] = e0;
        set[nout++] = e0;
        if (nfa_accept_state_p(e0))
          *accept = 1;
        if (!moreset && nfa_has_transitions(e0))
          *more = 1;
      }

      if (nfa_test_set_visited(e1, visited)) {
        stack[stackp++] = e1;
        set[nout++] = e1;
        if (nfa_accept_state_p(e1))
          *accept = 1;
        if (!moreset && nfa_has_transitions(e1))
          *more = 1;
      }
    }
  }

  bug_unless (nout <= nstates);

  return nout;
}

/*
 * The nfa_move_closure function combines nfa_move and nfa_closure into a
 * single operation, elminating an intermediate array.
 */
static int nfa_move_closure(nfa_state_t **stack, nfa_state_t **set, int nin,
                            int nstates, wchar_t ch,
                            unsigned visited, int *accept, int *more)
{
  int i, nout, stackp, moreset = 0;

  /*
   * Compute the move set from a given set of NFA states. The move
   * set is the set of states which are reachable from the set of
   * input states on the consumpion of the input character given by ch.
   */
  for (nout = 0, stackp = 0, i = 0; i < nin; i++) {
    nfa_state_t *s = set[i];

    switch (s->a.kind) {
    case nfa_wild:
      /* Unconditional match; don't have to look at ch. */
      break;
    case nfa_single:
      if (s->o.ch == ch) /* Character match. */
        break;
      continue; /* no match */
    case nfa_set:
      if (char_set_contains(s->s.set, ch)) /* Set match. */
        break;
      continue; /* no match */
    default:
      /* Epsilon-transition and acceptance states have no character moves. */
      continue;
    }

    /* The state matches the character, so add it to the move set.
       C trick: all character-transitioning state types have the
       pointer to the next state in the same position,
       among a common set of leading struct members in the union,
       so we can use s->o.trans. */
    {
      nfa_state_t *mov = s->o.trans;

      bug_unless (stackp < nstates);

      if (nfa_test_set_visited(mov, visited)) {
        stack[stackp++] = mov;
        set[nout++] = mov;
        if (nfa_accept_state_p(mov))
          *accept = 1;
        if (!moreset && nfa_has_transitions(mov))
          *more = 1;
      }
    }
  }

  while (stackp) {
    nfa_state_t *top = stack[--stackp];

    /* Only states of type nfa_empty are interesting.
       Each such state at most two epsilon transitions. */

    if (nfa_empty_state_p(top)) {
      nfa_state_t *e0 = top->e.trans0;
      nfa_state_t *e1 = top->e.trans1;

      if (nfa_test_set_visited(e0, visited)) {
        stack[stackp++] = e0;
        set[nout++] = e0;
        if (nfa_accept_state_p(e0))
          *accept = 1;
        if (!moreset && nfa_has_transitions(e0))
          *more = 1;
      }

      if (nfa_test_set_visited(e1, visited)) {
        stack[stackp++] = e1;
        set[nout++] = e1;
        if (nfa_accept_state_p(e1))
          *accept = 1;
        if (!moreset && nfa_has_transitions(e0))
          *more = 1;
      }
    }
  }

  bug_unless (nout <= nstates);

  return nout;
}

/*
 * Match regex against the string in. The match is
 * anchored to the front of the string; to search
 * within the string, a .* must be added to the front
 * of the regex.
 *
 * Returns the length of the prefix of the string
 * which matches the regex.  Or, if you will,
 * the position of the first mismatching
 * character.
 *
 * If the regex does not match at all, zero is
 * returned.
 *
 * Matching stops when a state is reached from which
 * there are no transitions on the next input character,
 * or when the string runs out of characters.
 * The most recently visited acceptance state then
 * determines the match length (defaulting to zero
 * if no acceptance states were encountered).
 */
static cnum nfa_run(nfa_t nfa, int nstates, const wchar_t *str)
{
  const wchar_t *last_accept_pos = 0, *ptr = str;
  unsigned visited = nfa.start ? nfa.start->a.visited : 0;
  nfa_state_t **set = coerce(nfa_state_t **, alloca(nstates * sizeof *set));
  nfa_state_t **stack = coerce(nfa_state_t **, alloca(nstates * sizeof *stack));
  int nclos = 0;
  int accept = 0, more = 0;

  nfa_handle_wraparound(nfa.start, &visited);

  if (nfa.start) {
    set[0] = nfa.start;
    nclos = nfa_closure(stack, set, 1, nstates, ++visited, &accept, &more);
  }

  if (accept)
    last_accept_pos = ptr;

  for (; *ptr != 0; ptr++) {
    wchar_t ch = *ptr;

    accept = more = 0;

    nfa_handle_wraparound(nfa.start, &visited);

    nclos = nfa_move_closure(stack, set, nclos,
                             nstates, ch, ++visited, &accept, &more);

    if (accept)
      last_accept_pos = ptr + 1;

    if (nclos == 0 || more == 0) /* dead end */
      break;
  }

  if (nfa.start)
    nfa.start->a.visited = visited;
  return last_accept_pos ? last_accept_pos - str : -1;
}

static cnum regex_machine_match_span(regex_machine_t *regm)
{
  return regm->n.last_accept_pos;
}

static void regex_destroy(val obj)
{
  regex_t *regex = coerce(regex_t *, obj->co.handle);
  if (regex->kind == REGEX_NFA)
    nfa_free(regex->r.nfa, regex->nstates);
  free(regex);
  obj->co.handle = 0;
}

static void regex_mark(val obj)
{
  regex_t *regex = coerce(regex_t *, obj->co.handle);
  if (regex->kind == REGEX_DV)
    gc_mark(regex->r.dv);
  gc_mark(regex->source);
}

static void regex_print(val obj, val stream, val pretty, struct strm_ctx *);

static struct cobj_ops regex_obj_ops = cobj_ops_init(eq,
                                                     regex_print,
                                                     regex_destroy,
                                                     regex_mark,
                                                     cobj_eq_hash_op,
                                                     0);

static val reg_nullable(val);

static val reg_expand_nongreedy(val exp)
{
  if (atom(exp)) {
    return exp;
  } else if (consp(exp)) {
    val sym = first(exp);
    val args = rest(exp);

    if (sym == set_s || sym == cset_s) {
      return exp;
    } else if (sym == compound_s || sym == zeroplus_s || sym == oneplus_s ||
               sym == optional_s || sym == compl_s ||
               sym == or_s || sym == and_s)
    {
      list_collect_decl (out, iter);
      iter = list_collect(iter, sym);
      for (; args; args = cdr(args))
        iter = list_collect(iter, reg_expand_nongreedy(first(args)));
      return out;
    } else if (sym == nongreedy_s) {
      val xfirst = reg_expand_nongreedy(first(args));
      val xsecond = reg_expand_nongreedy(second(args));
      val zplus = cons(zeroplus_s, cons(xfirst, nil));

      if (xsecond == nil) {
        return zplus;
      } else {
        val any = list(zeroplus_s, wild_s, nao);
        val notempty = list(oneplus_s, wild_s, nao);

        return list(compound_s,
                    list(and_s,
                         zplus,
                         list(compl_s,
                              list(compound_s,
                                   any,
                                   if3(reg_nullable(xsecond),
                                       list(and_s, xsecond, notempty, nao),
                                       xsecond),
                                   any, nao),
                              nao),
                         nao),
                    xsecond, nao);
      }
    } else {
      uw_throwf(error_s, lit("bad operator in regex syntax: ~s"), sym, nao);
    }
  } else {
    uw_throwf(error_s, lit("bad object in regex syntax: ~s"), exp, nao);
  }
}

static val reg_nary_to_bin(val regex);

static val reg_nary_unfold(val sym, val args, val orig)
{
  if (atom(args)) {
    return t; /* Nullary intersection and union are both empty. */
  } else if (!cdr(args)) {
    return reg_nary_to_bin(car(args));
  } else if (!cddr(args)) {
    val rx1 = pop(&args);
    val rx2 = pop(&args);
    val newrx1 = reg_nary_to_bin(rx1);
    val newrx2 = reg_nary_to_bin(rx2);
    if (!orig || rx1 != newrx1 || rx2 != newrx2)
      return list(sym, newrx1, newrx2, nao);
    return orig;
  } else {
    return list(sym,
                reg_nary_to_bin(car(args)),
                reg_nary_unfold(sym, cdr(args), nil), nao);
  }
}

static val reg_nary_to_bin(val regex)
{
  if (atom(regex)) {
    return regex;
  } else {
    val sym = first(regex);
    val args = rest(regex);

    if (sym == or_s || sym == and_s) {
      return reg_nary_unfold(sym, args, regex);
    } else if (sym == compound_s || sym == zeroplus_s || sym == oneplus_s ||
               sym == optional_s || sym == compl_s || sym == nongreedy_s)
    {
     list_collect_decl (out, ptail);
      val args_orig = args;
      val nochange = t;

      for (; args; args = cdr(args))  {
        val rx = car(args);
        val newrx = reg_nary_to_bin(car(args));

        if (nochange && rx != newrx) {
          ptail = list_collect_nconc(ptail, ldiff(args_orig, args));
          nochange = nil;
        }

        if (!nochange)
          ptail = list_collect(ptail, newrx);
      }

      if (!nochange)
        return cons(sym, out);
    }

    return regex;
  }
}

static val reg_compile_csets(val exp)
{
  if (exp == space_k) {
    return cobj(coerce(mem_t *, space_cs), chset_cls, &char_set_obj_ops);
  } else if (exp == digit_k) {
    return cobj(coerce(mem_t *, digit_cs), chset_cls, &char_set_obj_ops);
  } else if (exp == word_char_k) {
    return cobj(coerce(mem_t *, word_cs), chset_cls, &char_set_obj_ops);
  } else if (exp == cspace_k) {
    return cobj(coerce(mem_t *, cspace_cs), chset_cls, &char_set_obj_ops);
  } else if (exp == cdigit_k) {
    return cobj(coerce(mem_t *, cdigit_cs), chset_cls, &char_set_obj_ops);
  } else if (exp == cword_char_k) {
    return cobj(coerce(mem_t *, cword_cs), chset_cls, &char_set_obj_ops);
  } else if (symbolp(exp) || chrp(exp)) {
    return exp;
  } else if (stringp(exp)) {
    return cons(compound_s, list_str(exp));
  } else if (consp(exp)) {
    val sym = first(exp);
    val args = rest(exp);

    if (sym == set_s || sym == cset_s) {
      char_set_t *set = char_set_compile(args, eq(sym, cset_s));
      return cobj(coerce(mem_t *, set), chset_cls, &char_set_obj_ops);
    } else if (sym == compound_s || sym == zeroplus_s || sym == oneplus_s ||
               sym == optional_s || sym == compl_s || sym == nongreedy_s ||
               sym == or_s || sym == and_s)
    {
      list_collect_decl (out, iter);
      iter = list_collect(iter, sym);
      for (; args; args = cdr(args))
        iter = list_collect(iter, reg_compile_csets(first(args)));
      return out;
    } else {
      uw_throwf(error_s, lit("bad operator in regex syntax: ~s"), sym, nao);
    }
  } else {
    uw_throwf(error_s, lit("bad object in regex syntax: ~s"), exp, nao);
  }
}

/*
 * Helper to reg_nullable for recursing over
 * contents of a compound expression.
 */
static val reg_nullable_list(val exp_list)
{
  if (rest(exp_list)) {
    return if2(reg_nullable(first(exp_list)) &&
               reg_nullable_list(rest(exp_list)),
               t);
  } else {
    return reg_nullable(first(exp_list));
  }
}

/*
 * Determine whether the given regular expression is nullable: that is
 * to say, can the regular expression match the empty string?
 */
static val reg_nullable(val exp)
{
  if (exp == nil) {
    return t;
  } else if (atom(exp)) {
    return nil;
  } else {
    val sym = first(exp), args = rest(exp);

    if (sym == set_s || sym == cset_s) {
      return nil;
    } else if (sym == compound_s) {
      return reg_nullable_list(args);
    } else if (sym == oneplus_s) {
      return reg_nullable(first(args));
    } else if (sym == zeroplus_s || sym == optional_s) {
      return t;
    } else if (sym == compl_s) {
      return if3(reg_nullable(first(args)), nil, t);
    } else if (sym == or_s) {
      return if2((reg_nullable(first(args)) || reg_nullable(second(args))), t);
    } else if (sym == and_s) {
      return if2((reg_nullable(first(args)) && reg_nullable(second(args))), t);
    } else {
      uw_throwf(error_s, lit("bad operator in regex syntax: ~s"), sym, nao);
    }
  }
}

static val reg_matches_all(val exp)
{
  if (atom(exp)) {
    return nil;
  } else {
    val sym = first(exp), args = rest(exp);

    if (sym == set_s || sym == cset_s) {
      return nil;
    } else if (sym == compound_s) {
      val am = nil;
      for (; args; args = cdr(args)) {
        val arg = car(args);
        if (!reg_nullable(arg))
          return nil;
        if (!am && reg_matches_all(arg))
          am = t;
      }
      return am;
    } else if (sym == oneplus_s) {
      return reg_matches_all(car(args));
    } else if (sym == zeroplus_s) {
      val arg = car(args);
      if (arg == wild_s || reg_matches_all(arg))
        return t;
      return nil;
    } else if (sym == optional_s) {
      return reg_matches_all(car(args));
    } else if (sym == compl_s) {
      val arg = car(args);
      return if2(arg == t, t);
    } else if (sym == or_s) {
      return tnil(reg_matches_all(pop(&args)) || reg_matches_all(pop(&args)));
    } else if (sym == and_s) {
      return tnil(reg_matches_all(pop(&args)) && reg_matches_all(pop(&args)));
    } else {
      uw_throwf(error_s, lit("bad operator in regex syntax: ~s"), sym, nao);
    }
  }
}

static val flatten_or(val or_expr)
{
  if (atom(or_expr) || car(or_expr) != or_s) {
    return cons(or_expr, nil);
  } else {
    val left = second(or_expr);
    val right = third(or_expr);
    return nappend2(flatten_or(left), flatten_or(right));
  }
}

static val unflatten_or(val exlist)
{
  val f = first(exlist);
  val r = rest(exlist);

  if (r) {
    return cons(or_s, cons(f, cons(unflatten_or(r), nil)));
  } else {
    return f;
  }
}

static val unique_first(val exlist)
{
  val f = first(exlist);
  val r = rest(exlist);

  if (!memqual(f, r))
    return cons(first(exlist), nil);
  return nil;
}

static val reduce_or(val or_expr)
{
  val left = second(or_expr);
  val right = third(or_expr);

  /*
   * Do optimization only if this is an or of two or expressions.
   */

  if (consp(left) && first(left) == or_s &&
      consp(right) && first(right) == or_s)
  {
    val exlist = flatten_or(or_expr);
    val repeats_removed = mapcon(func_n1(unique_first), exlist);
    return unflatten_or(repeats_removed);
  } else {
    return or_expr;
  }
}

static val reg_derivative(val, val);

static val reg_derivative_list(val exp_list, val ch)
{
  if (rest(exp_list)) {
    if (reg_nullable(first(exp_list))) {
      val d_first = reg_derivative(first(exp_list), ch);
      val d_rest = reg_derivative_list(rest(exp_list), ch);

      if (d_rest == t && d_first == t)
        return t;

      if (d_rest == t)
        return if3(d_first == nil,
                   cons(compound_s, rest(exp_list)),
                   cons(compound_s, cons(d_first, rest(exp_list))));

      if (d_first == t)
        return d_rest;

      return list(or_s,
                  if3(d_first == nil,
                      cons(compound_s, rest(exp_list)),
                      cons(compound_s, cons(d_first, rest(exp_list)))),
                  d_rest,
                  nao);
    } else {
      val d_first = reg_derivative(first(exp_list), ch);

      if (d_first == t)
        return t;
      else if (d_first == nil)
        return cons(compound_s, rest(exp_list));
      else
        return cons(compound_s,
                    cons(d_first, rest(exp_list)));
    }
  } else {
    return reg_derivative(first(exp_list), ch);
  }
}

/*
 * Determine derivative of regex with respect to character.
 */
static val reg_derivative(val exp, val ch)
{
  if (exp == nil || exp == t) {
    return t;
  } else if (chrp(exp)) {
    return null(eq(exp, ch));
  } else if (cobjclassp(exp, chset_cls)) {
    char_set_t *set = coerce(char_set_t *, exp->co.handle);
    return if3(char_set_contains(set, c_chr(ch)), nil, t);
  } else if (exp == wild_s) {
    return nil;
  } else {
    val sym = first(exp);
    val args = rest(exp);

    if (sym == compound_s) {
      return reg_derivative_list(args, ch);
    } else if (sym == optional_s) {
      return reg_derivative(first(args), ch);
    } else if (sym == oneplus_s) {
      val arg = first(args);
      val d_arg = reg_derivative(arg, ch);
      if (d_arg == t)
        return t;
      if (d_arg == nil)
        return cons(zeroplus_s, cons(arg, nil));
      return cons(compound_s, cons(d_arg,
                                   cons(cons(zeroplus_s,
                                             cons(arg, nil)), nil)));
    } else if (sym == zeroplus_s) {
      val arg = first(args);
      val d_arg = reg_derivative(arg, ch);
      if (d_arg == t)
        return t;
      if (d_arg == nil)
        return exp;
      return cons(compound_s, cons(d_arg, cons(exp, nil)));
    } else if (sym == compl_s) {
      val d_arg = reg_derivative(first(args), ch);
      if (reg_matches_all(d_arg))
        return t;
      if (d_arg == nil)
        return cons(oneplus_s, cons(wild_s, nil));
      if (d_arg == t)
        return cons(zeroplus_s, cons(wild_s, nil));
      return cons(sym, cons(d_arg, nil));
    } else if (sym == or_s) {
      val d_arg1 = reg_derivative(first(args), ch);
      val d_arg2 = reg_derivative(second(args), ch);

      if (d_arg1 == t)
        return d_arg2;

      if (d_arg2 == t)
        return d_arg1;

      return reduce_or(cons(or_s, cons(d_arg1, cons(d_arg2, nil))));
    } else if (sym == and_s) {
      val d_arg1 = reg_derivative(first(args), ch);
      val d_arg2 = nil;

      if (d_arg1 == t)
        return t;

      d_arg2 = reg_derivative(second(args), ch);

      if (d_arg2 == t)
        return t;

      return cons(and_s, cons(d_arg1, cons(d_arg2, nil)));
    } else if (sym == set_s || sym == cset_s) {
      uw_throwf(error_s, lit("uncompiled regex passed to reg_derivative"), nao);
    } else {
      uw_throwf(error_s, lit("bad operator in regex syntax: ~s"), sym, nao);
    }
  }
}

static cnum dv_run(val regex, const wchar_t *str)
{
  const wchar_t *last_accept_pos = 0, *ptr = str;

  for (; *ptr != 0; ptr++) {
    wchar_t ch = *ptr;
    val nullable = reg_nullable(regex);
    val deriv = reg_derivative(regex, chr(ch));

    if (nullable)
      last_accept_pos = ptr;

    if (deriv == t)
      return last_accept_pos ? last_accept_pos - str : -1;
  }

  if (reg_nullable(regex))
    return ptr - str;
  return last_accept_pos ? last_accept_pos - str : -1;
}

static val reg_single_char_p(val exp)
{
  if (chrp(exp))
    return t;
  if (consp(exp) && (car(exp) == set_s || car(exp) == cset_s))
    return t;
  if (exp == space_k || exp == word_char_k || exp == digit_k)
    return t;
  if (exp == cspace_k || exp == cword_char_k || exp == cdigit_k)
    return t;
  return nil;
}

static val reg_compl_char_p(val exp)
{
  if (consp(exp) && car(exp) == cset_s)
    return t;
  if (exp == cspace_k || exp == cword_char_k || exp == cdigit_k)
    return t;
  return nil;
}

static val reg_optimize(val exp);

static val invert_single(val exp)
{
  if (chrp(exp))
    return cons(cset_s, cons(exp, nil));
  if (consp(exp)) {
    if (car(exp) == set_s)
      return cons(cset_s, cdr(exp));
    if (car(exp) == cset_s)
      return reg_optimize(cons(set_s, cdr(exp)));
  }
  if (exp == space_k)
    return cspace_k;
  if (exp == cspace_k)
    return space_k;
  if (exp == word_char_k)
    return cword_char_k;
  if (exp == cword_char_k)
    return word_char_k;
  if (exp == digit_k)
    return cdigit_k;
  if (exp == cdigit_k)
    return digit_k;

  abort();
}

static val reg_optimize(val exp)
{
  if (atom(exp)) {
    return exp;
  } else if (reg_matches_all(exp)) {
    return cons(zeroplus_s, cons(wild_s, nil));
  } else {
    val sym = first(exp);
    val args = rest(exp);

    if (sym == set_s) {
      if (!args)
        return t;
      if (!rest(args) && chrp(first(args)))
        return first(args);
      return exp;
    } else if (sym == cset_s) {
      return if3(rest(exp), exp, wild_s);
    } else if (sym == compound_s) {
      val xargs = mapcar(func_n1(reg_optimize), args);
      while (rest(xargs)) {
        if (reg_matches_all(first(xargs)) &&
            reg_matches_all(second(xargs)))
        {
          pop(&xargs);
          continue;
        }
        break;
      }
      if (!xargs)
        return nil;
      if (!cdr(xargs))
        return car(xargs);
      if (memq(t, xargs))
        return t;
      return cons(sym, xargs);
    } else if (sym == zeroplus_s) {
      val arg = reg_optimize(first(args));
      if (consp(arg)) {
        val arg2 = first(arg);
        if (arg2 == zeroplus_s)
          return arg;
        if (arg2 == oneplus_s || arg2 == optional_s)
          return cons(zeroplus_s, cdr(arg));
      }
      if (reg_matches_all(arg))
        return arg;
      return cons(sym, cons(arg, nil));
    } else if (sym == oneplus_s) {
      val arg = reg_optimize(first(args));
      if (reg_matches_all(arg))
        return cons(zeroplus_s, cons(wild_s, nil));
      if (consp(arg)) {
        val arg2 = first(arg);
        if (arg2 == zeroplus_s || arg2 == oneplus_s)
          return arg;
        if (arg2 == optional_s)
          return cons(zeroplus_s, cdr(arg));
      }
      return cons(sym, cons(arg, nil));
    } else if (sym == optional_s) {
      val arg = reg_optimize(first(args));
      if (reg_matches_all(arg))
        return arg;
      if (consp(arg)) {
        val arg2 = first(arg);
        if (arg2 == zeroplus_s || arg2 == optional_s)
          return arg;
        if (arg2 == oneplus_s)
          return cons(zeroplus_s, cdr(arg));
      }
      return cons(sym, cons(arg, nil));
    } else if (sym == compl_s) {
      val arg_unopt = car(args);

      if (consp(arg_unopt) && car(arg_unopt) == compl_s) {
        return reg_optimize(cadr(arg_unopt));
      } else {
        val arg = reg_optimize(arg_unopt);
        if (reg_matches_all(arg))
          return t;
        if (arg == nil)
          return cons(oneplus_s, cons(wild_s, nil));
        if (arg == t)
          return cons(zeroplus_s, cons(wild_s, nil));
        if (reg_single_char_p(arg))
          return list(or_s,
                      list(optional_s, invert_single(arg), nao),
                      list(compound_s, wild_s,
                           list(oneplus_s, wild_s, nao), nao), nao);
        if (consp(arg)) {
          val sym2 = first(arg);
          if (sym2 == compound_s) {
            val args2 = rest(arg);
            if (cddr(args2) && !cdddr(args2)) {
              if (reg_matches_all(first(args2)) &&
                  reg_single_char_p(second(args2)) &&
                  reg_matches_all(third(args2)))
              {
                return cons(zeroplus_s,
                            cons(invert_single(second(args2)), nil));
              }
            }

            if (cdr(args2) && !cddr(args2)) {
              if (reg_matches_all(first(args2)) &&
                  reg_single_char_p(second(args2)))
              {
                return list(optional_s,
                            list(compound_s,
                                 cons(zeroplus_s, cons(wild_s, nil)),
                                 invert_single(second(args2)), nao), nao);
              }

              if (reg_single_char_p(first(args2)) &&
                  reg_matches_all(second(args2))) {
                return list(optional_s,
                            list(compound_s,
                                 invert_single(first(args2)),
                                 cons(zeroplus_s, cons(wild_s, nil)), nao), nao);
              }
            }
          }
        }
        return cons(sym, cons(arg, nil));
      }
    } else if (sym == or_s) {
      val arg1 = reg_optimize(pop(&args));
      val arg2 = reg_optimize(pop(&args));

      if (arg1 == t || reg_matches_all(arg2))
        return arg2;

      if (arg2 == t || reg_matches_all(arg1))
        return arg1;

      if (arg1 == nil)
        return cons(optional_s, cons(arg2, nil));

      if (arg2 == nil)
        return cons(optional_s, cons(arg1, nil));

      if (reg_single_char_p(arg1) && reg_single_char_p(arg2)) {
        if (!reg_compl_char_p(arg1) && !reg_compl_char_p(arg2)) {
          if (atom(arg1) && atom(arg2))
            return list(set_s, arg1, arg2, nao);
          if (consp(arg1) && car(arg1) == set_s && atom(arg2))
            return cons(set_s, cons(arg2, cdr(arg1)));
          if (consp(arg2) && car(arg2) == set_s && atom(arg1))
            return cons(set_s, cons(arg1, cdr(arg2)));
          if (consp(arg1) && car(arg1) == set_s &&
              consp(arg1) && car(arg1) == set_s)
            return cons(set_s, append2(cdr(arg1), cdr(arg2)));
        }
        if (reg_compl_char_p(arg1) && reg_compl_char_p(arg2)) {
          arg1 = invert_single(arg1);
          arg2 = invert_single(arg2);

          if (atom(arg1) && atom(arg2))
            return list(cset_s, arg1, arg2, nao);
          if (consp(arg1) && car(arg1) == set_s && atom(arg2))
            return cons(cset_s, cons(arg2, cdr(arg1)));
          if (consp(arg2) && car(arg2) == set_s && atom(arg1))
            return cons(cset_s, cons(arg1, cdr(arg2)));
          if (consp(arg1) && car(arg1) == set_s &&
              consp(arg1) && car(arg1) == set_s)
            return cons(cset_s, append2(cdr(arg1), cdr(arg2)));
        }
      }

      return cons(sym, cons(arg1, cons(arg2, nil)));
    } else if (sym == and_s) {
      val arg1 = reg_optimize(pop(&args));
      val arg2 = reg_optimize(pop(&args));

      if (arg1 == t || arg2 == t)
        return t;

      if (arg1 == nil)
        return null(reg_nullable(arg2));

      if (arg2 == nil)
        return null(reg_nullable(arg1));

      if (reg_matches_all(arg1))
        return arg2;

      if (reg_matches_all(arg2))
        return arg2;

      return cons(sym, cons(arg1, cons(arg2, nil)));
    } else {
      uw_throwf(error_s, lit("bad operator in regex syntax: ~s"), sym, nao);
    }
  }
}

static val regex_requires_dv(val exp)
{
  if (atom(exp)) {
    return nil;
  } else {
    val sym = first(exp);
    val args = rest(exp);

    if (sym == set_s || sym == cset_s) {
      return nil;
    } else if (sym == compound_s) {
      return some_satisfy(args, func_n1(regex_requires_dv), nil);
    } else if (sym == zeroplus_s || sym == oneplus_s ||
               sym == optional_s) {
      return regex_requires_dv(first(args));
    } else if (sym == compl_s) {
      return t;
    } else if (sym == or_s) {
      return if2(regex_requires_dv(first(args)) ||
                 regex_requires_dv(second(args)), t);
    } else if (sym == and_s || sym == nongreedy_s) {
      return t;
    } else {
      uw_throwf(error_s, lit("bad operator in regex syntax: ~s"), sym, nao);
    }
  }
}

static val regex_optimize(val regex_sexp)
{
  return reg_optimize(reg_expand_nongreedy(reg_nary_to_bin(regex_sexp)));
}

val regex_compile(val regex_sexp, val error_stream)
{
  val regex_source;

  if (stringp(regex_sexp)) {
    regex_sexp = regex_parse(regex_sexp, error_stream);
    return if2(regex_sexp, regex_compile(regex_sexp, error_stream));
  }

  regex_source = reg_nary_to_bin(regex_sexp);

  regex_sexp = reg_optimize(reg_expand_nongreedy(regex_source));

  if (opt_derivative_regex || regex_requires_dv(regex_sexp)) {
    regex_t *regex = coerce(regex_t *, chk_malloc(sizeof *regex));
    val ret;
    val dv = reg_compile_csets(regex_sexp);
    regex->kind = REGEX_DV;
    regex->nstates = 0;
    regex->source = nil;
    ret = cobj(coerce(mem_t *, regex), regex_cls, &regex_obj_ops);
    regex->r.dv = dv;
    regex->source = regex_source;
    return ret;
  } else {
    regex_t *regex = coerce(regex_t *, chk_malloc(sizeof *regex));
    val ret;
    regex->kind = REGEX_NFA;
    regex->source = nil;
    ret = cobj(coerce(mem_t *, regex), regex_cls, &regex_obj_ops);
    regex->r.nfa = nfa_optimize(nfa_compile_regex(regex_sexp));
    regex->nstates = nfa_count_states(regex->r.nfa.start);
    regex->source = regex_source;
    return ret;
  }
}

val regexp(val obj)
{
  return cobjclassp(obj, regex_cls);
}

val regex_source(val compiled_regex)
{
  val self = lit("regex-source");
  regex_t *regex = coerce(regex_t *,
                          cobj_handle(self, compiled_regex, regex_cls));
  return regex->source;
}

static void puts_clear_flag(val str, val stream, int *semi_flag)
{
  *semi_flag = 0;
  put_string(str, stream);
}

static void putc_clear_flag(val ch, val stream, int *semi_flag)
{
  *semi_flag = 0;
  put_char(ch, stream);
}

static void print_class_char(val ch, val first_p, val stream, int *semi_flag)
{
  wchar_t c = c_chr(ch);
  switch (c) {
  case '^':
    if (!first_p)
      break;
    /* fallthrough */
  case '-': case '[': case ']':
    putc_clear_flag(chr('\\'), stream, semi_flag);
    put_char(ch, stream);
    return;
  }
  out_str_char(c_chr(ch), stream, semi_flag, 1);
}

static void print_rec(val exp, val stream, int *semi_flag);

static void paren_print_rec(val exp, val stream, int *semi_flag)
{
  putc_clear_flag(chr('('), stream, semi_flag);
  print_rec(exp, stream, semi_flag);
  putc_clear_flag(chr(')'), stream, semi_flag);
}

static void print_rec(val exp, val stream, int *semi_flag)
{
  val self = lit("regex-print");

  if (exp == space_k) {
    puts_clear_flag(lit("\\s"), stream, semi_flag);
  } else if (exp == digit_k) {
    puts_clear_flag(lit("\\d"), stream, semi_flag);
  } else if (exp == word_char_k) {
    puts_clear_flag(lit("\\w"), stream, semi_flag);
  } else if (exp == cspace_k) {
    puts_clear_flag(lit("\\S"), stream, semi_flag);
  } else if (exp == cdigit_k) {
    puts_clear_flag(lit("\\D"), stream, semi_flag);
  } else if (exp == cword_char_k) {
    puts_clear_flag(lit("\\W"), stream, semi_flag);
  } else if (exp == wild_s) {
    putc_clear_flag(chr('.'), stream, semi_flag);
  } else if (chrp(exp)) {
    wchar_t ch = c_chr(exp);
    switch (ch) {
    case '?': case '.': case '*': case '+':
    case '(': case ')': case '|': case '~':
    case '&': case '%': case '/':
    case '[': case ']': case '\\':
      putc_clear_flag(chr('\\'), stream, semi_flag);
      put_char(exp, stream);
      break;
    default:
      out_str_char(ch, stream, semi_flag, 1);
    }
  } else if (stringp(exp)) {
    cnum i;
    cnum l = c_num(length(exp), self);
    for (i = 0; i < l; i++)
      print_rec(chr_str(exp, num(i)), stream, semi_flag);
  } else if (consp(exp)) {
    val sym = first(exp);
    val args = rest(exp);

    if (sym == set_s || sym == cset_s) {
      val first_p = t;

      putc_clear_flag(chr('['), stream, semi_flag);

      if (sym == cset_s) {
        put_char(chr('^'), stream);
        first_p = nil;
      }

      while (args) {
        val arg = pop(&args);
        if (consp(arg)) {
          print_class_char(car(arg), first_p, stream, semi_flag);
          putc_clear_flag(chr('-'), stream, semi_flag);
          print_class_char(cdr(arg), nil, stream, semi_flag);
        } else if (symbolp(arg)) {
          print_rec(arg, stream, semi_flag);
        } else {
          print_class_char(arg, first_p, stream, semi_flag);
        }
        first_p = nil;
      }
      putc_clear_flag(chr(']'), stream, semi_flag);
    } else if (sym == compound_s) {
      for (; args; args = cdr(args)) {
        val arg = car(args);
        if (consp(arg) && (car(arg) == compl_s || car(arg) == and_s ||
                           car(arg) == or_s || car(arg) == nongreedy_s))
          paren_print_rec(arg, stream, semi_flag);
        else
          print_rec(arg, stream, semi_flag);
      }
    } else if (sym == zeroplus_s || sym == oneplus_s || sym == optional_s) {
      val arg = pop(&args);
      if (consp(arg) && car(arg) != set_s && car(arg) != cset_s)
        paren_print_rec(arg, stream, semi_flag);
      else
        print_rec(arg, stream, semi_flag);
      if (sym == zeroplus_s)
        putc_clear_flag(chr('*'), stream, semi_flag);
      else if (sym == oneplus_s)
        putc_clear_flag(chr('+'), stream, semi_flag);
      else
        putc_clear_flag(chr('?'), stream, semi_flag);
    } else if (sym == compl_s) {
      val arg = pop(&args);
      putc_clear_flag(chr('~'), stream, semi_flag);
      if (consp(arg) && (car(arg) == or_s || car(arg) == and_s))
        paren_print_rec(arg, stream, semi_flag);
      else
        print_rec(arg, stream, semi_flag);
    } else if (sym == and_s) {
      val arg1 = pop(&args);
      val arg2 = pop(&args);
      if (consp(arg1) && car(arg1) == or_s)
        paren_print_rec(arg1, stream, semi_flag);
      else
        print_rec(arg1, stream, semi_flag);
      putc_clear_flag(chr('&'), stream, semi_flag);
      if (consp(arg2) && car(arg2) == or_s)
        paren_print_rec(arg2, stream, semi_flag);
      else
        print_rec(arg2, stream, semi_flag);
    } else if (sym == or_s) {
      print_rec(pop(&args), stream, semi_flag);
      putc_clear_flag(chr('|'), stream, semi_flag);
      print_rec(pop(&args), stream, semi_flag);
    } else if (sym == nongreedy_s) {
      val arg1 = pop(&args);
      val arg2 = pop(&args);
      if (consp(arg1) && car(arg1) != set_s && car(arg1) != cset_s)
        paren_print_rec(arg1, stream, semi_flag);
      else
        print_rec(arg1, stream, semi_flag);
      putc_clear_flag(chr('%'), stream, semi_flag);
      if (consp(arg2) && (car(arg2) == and_s && car(arg2) == or_s))
        paren_print_rec(arg2, stream, semi_flag);
      else
        print_rec(arg2, stream, semi_flag);
    } else {
      uw_throwf(error_s, lit("bad operator in regex syntax: ~s"), sym, nao);
    }
  } else if (exp == t) {
    puts_clear_flag(lit("[]"), stream, semi_flag);
  } else if (exp != nil) {
    uw_throwf(error_s, lit("bad object in regex syntax: ~s"), exp, nao);
  }
}

static void regex_print(val obj, val stream, val pretty, struct strm_ctx *ctx)
{
  val self = lit("regex-print");
  regex_t *regex = coerce(regex_t *, cobj_handle(self, obj, regex_cls));
  int semi_flag = 0;

  (void) pretty;
  (void) ctx;

  put_string(lit("#/"), stream);
  print_rec(regex->source, stream, &semi_flag);
  put_char(chr('/'), stream);
}

static cnum regex_run(val compiled_regex, const wchar_t *str)
{
  val self = lit("regex-run");
  regex_t *regex = coerce(regex_t *, cobj_handle(self, compiled_regex, regex_cls));

  return if3(regex->kind == REGEX_DV,
             dv_run(regex->r.dv, str),
             nfa_run(regex->r.nfa, regex->nstates, str));
}

/*
 * Regex machine: represents the logic of the regex_run function as state
 * machine object which can be fed one character at a time.
 */

static void regex_machine_reset(regex_machine_t *regm)
{
  int accept = 0, more = 0;

  regm->n.last_accept_pos = -1;
  regm->n.count = 0;

  if (regm->n.is_nfa) {
    nfa_state_t *s = regm->n.nfa.start;

    if (s) {
      regm->n.visited = s->a.visited + 1;
      nfa_handle_wraparound(s, &regm->n.visited);
      regm->n.set[0] = s;
      regm->n.nclos = nfa_closure(regm->n.stack, regm->n.set, 1,
                                  regm->n.nstates,
                                  regm->n.visited, &accept, &more);
      s->a.visited = regm->n.visited;
    } else {
      regm->n.nclos = 0;
    }
  } else {
    regm->d.deriv = regm->d.regex;
    accept = (reg_nullable(regm->d.regex) != nil);
  }

  if (accept)
    regm->n.last_accept_pos = regm->n.count;
}

static void regex_machine_init(val self, regex_machine_t *regm, val reg)
{
  regex_t *regex = coerce(regex_t *, cobj_handle(self, reg, regex_cls));

  if (regex->kind == REGEX_DV) {
    regm->n.is_nfa = 0;
    regm->d.regex = regex->r.dv;
  } else {
    regm->n.is_nfa = 1;
    regm->n.nfa = regex->r.nfa;
    regm->n.nstates = regex->nstates;
    regm->n.visited = 0;
    regm->n.set = coerce(nfa_state_t **,
                          chk_malloc(regex->nstates * sizeof *regm->n.set));
    regm->n.stack = coerce(nfa_state_t **,
                           chk_malloc(regex->nstates * sizeof *regm->n.stack));
  }

  regex_machine_reset(regm);
}

static void regex_machine_cleanup(regex_machine_t *regm)
{
  if (regm->n.is_nfa) {
    free(regm->n.stack);
    free(regm->n.set);
    regm->n.stack = 0;
    regm->n.set = 0;
    regm->n.nfa.start = 0;
    regm->n.nfa.accept = 0;
  }
}

static regm_result_t regex_machine_infer_init_state(regex_machine_t *regm)
{
  if (regm->n.is_nfa)
    return (regm->n.nclos != 0) ? REGM_INCOMPLETE : REGM_FAIL;
  else
    return (regm->d.deriv != t) ? REGM_INCOMPLETE : REGM_FAIL;
}

static regm_result_t regex_machine_feed(regex_machine_t *regm, wchar_t ch)
{
  int accept = 0, more = 0;

  if (regm->n.is_nfa) {
    nfa_handle_wraparound(regm->n.nfa.start, &regm->n.visited);

    if (ch != 0) {
      regm->n.count++;

      regm->n.nclos = nfa_move_closure(regm->n.stack,
                                       regm->n.set, regm->n.nclos,
                                       regm->n.nstates, ch, ++regm->n.visited,
                                       &accept, &more);

      if (regm->n.nfa.start)
        regm->n.nfa.start->a.visited = regm->n.visited;

      if (accept) {
        regm->n.last_accept_pos = regm->n.count;
        return more ? REGM_MATCH : REGM_MATCH_DONE;
      }

      return (regm->n.nclos != 0) ? REGM_INCOMPLETE : REGM_FAIL;
    }
  } else {
    val accept = nil;

    if (ch != 0) {
      regm->d.count++;
      regm->d.deriv = reg_derivative(regm->d.deriv, chr(ch));

      if ((accept = reg_nullable(regm->d.deriv))) {
        regm->d.last_accept_pos = regm->d.count;
        return regm->d.deriv ? REGM_MATCH : REGM_MATCH_DONE;
      }

      return (regm->d.deriv != t) ? REGM_INCOMPLETE : REGM_FAIL;
    }
  }

  /* Reached if the null character is consumed. */

  if (regm->n.last_accept_pos == regm->n.count)
    return REGM_MATCH;
  if (regm->n.last_accept_pos == -1)
    return REGM_FAIL;
  return REGM_INCOMPLETE;
}

val search_regex(val haystack, val needle_regex, val start,
                 val from_end)
{
  val self = lit("search-regex");
  val slen = nil;
  start = default_arg(start, zero);
  from_end = default_null_arg(from_end);

  if (minusp(start)) {
    slen = length_str(haystack);
    start = plus(start, slen);
    if (minusp(start))
      start = zero;
  }

  if (from_end) {
    cnum i;
    cnum s = c_num(start, self);
    const wchar_t *h = c_str(haystack, self);

    slen = (slen ? slen : length_str(haystack));

    if (regex_run(needle_regex, L"") >= 0)
      return cons(slen, zero);

    for (i = c_num(slen, self) - 1; i >= s; i--) {
      cnum span = regex_run(needle_regex, h + i);
      if (span >= 0)
        return cons(num(i), num(span));
    }

    gc_hint(haystack);
  } else {
    regex_machine_t regm;
    val i, pos = start, retval;
    regm_result_t last_res = REGM_INCOMPLETE;

    if (length_str_lt(haystack, pos))
      return nil;

    regex_machine_init(self, &regm, needle_regex);

again:
    for (i = pos; length_str_gt(haystack, i); i = plus(i, one)) {
      last_res = regex_machine_feed(&regm, c_chr(chr_str(haystack, i)));

      if (last_res == REGM_FAIL) {
        last_res = regex_machine_feed(&regm, 0);
        if (last_res == REGM_FAIL) {
          regex_machine_reset(&regm);
          pos = plus(pos, one);
          goto again;
        }
        break;
      }

      if (last_res == REGM_MATCH_DONE)
        break;
    }

    if (last_res != REGM_MATCH_DONE)
      last_res = regex_machine_feed(&regm, 0);

    switch (last_res) {
    case REGM_INCOMPLETE:
    case REGM_MATCH:
    case REGM_MATCH_DONE:
      retval = cons(pos, num(regex_machine_match_span(&regm)));
      regex_machine_cleanup(&regm);
      return retval;
    case REGM_FAIL:
      regex_machine_cleanup(&regm);
      return nil;
    }
  }

  return nil;
}

val range_regex(val haystack, val needle_regex, val start,
                val from_end)
{
  val result = search_regex(haystack, needle_regex, start, from_end);

  if (result) {
    cons_bind (pos, len, result);
    return rcons(pos, plus(pos, len));
  }

  return result;
}

val range_regex_all(val haystack, val needle_regex, val start, val end)
{
  list_collect_decl (out, ptail);
  val slen = length_str(haystack);

  if (null_or_missing_p(start)) {
    start = zero;
  } else if (minusp(start)) {
    start = minus(start, slen);
    if (minusp(start))
      start = zero;
  }

  if (null_or_missing_p(end)) {
    end = slen;
  } else if (minusp(end)) {
    end = minus(start, slen);
    if (minusp(end))
      return nil;
  }

  for (;;) {
    val range = range_regex(haystack, needle_regex, start, nil);

    if (!range)
      break;

    ptail = list_collect(ptail, range);

    {
      val rt = to(range);
      start = if3(eql(start, rt), succ(start), rt);
    }
  }

  return out;
}

val match_regex(val str, val reg, val pos)
{
  val self = lit("match-regex");
  regex_machine_t regm;
  val i, retval;
  regm_result_t last_res = REGM_INCOMPLETE;

  if (null_or_missing_p(pos)) {
    pos = zero;
  } else if (minusp(pos)) {
    pos = plus(pos, length_str(str));
    if (minusp(pos))
      return nil;
  } else if (length_str_lt(str, pos)) {
    return nil;
  }

  regex_machine_init(self, &regm, reg);

  for (i = pos; length_str_gt(str, i); i = plus(i, one)) {
    last_res = regex_machine_feed(&regm, c_chr(chr_str(str, i)));
    if (last_res == REGM_FAIL || last_res == REGM_MATCH_DONE)
      break;
  }

  if (last_res != REGM_MATCH_DONE)
    last_res = regex_machine_feed(&regm, 0);

  switch (last_res) {
  case REGM_INCOMPLETE:
  case REGM_MATCH:
  case REGM_MATCH_DONE:
    retval = plus(pos, num(regex_machine_match_span(&regm)));
    regex_machine_cleanup(&regm);
    return retval;
  case REGM_FAIL:
    regex_machine_cleanup(&regm);
    return nil;
  }

  return nil;
}

val match_regex_len(val str, val regex, val pos)
{
  if (null_or_missing_p(pos)) {
    return match_regex(str, regex, pos);
  } else {
    val new_pos = match_regex(str, regex, pos);
    return if2(new_pos, minus(new_pos, pos));
  }
}

static val match_regex_right_old(val str, val regex, val end)
{
  val pos = zero;
  val slen = length(str);

  if (null_or_missing_p(end) || gt(end, slen))
    end = slen;
  else if (minusp(end))
    end = plus(end, slen);

  while (le(pos, end)) {
    cons_bind (from, len, search_regex(str, regex, pos, nil));

    if (!from)
      return nil;

    if (eql(plus(from, len), end))
      return len;

    pos = plus(pos, one);
  }

  return nil;
}

val match_regex_right(val str, val regex, val end)
{
  val self = lit("match-regex-right");
  val pos = zero;
  val len = length(str);

  if (null_or_missing_p(end)) {
    end = len;
  } else if (minusp(end)) {
    end = plus(end, len);
    if (minusp(end))
      return nil;
  } else if (gt(end, len)) {
    return nil;
  }

  while (le(pos, end)) {
    regex_machine_t regm;
    val i;
    regm_result_t last_res = REGM_INCOMPLETE;

    regex_machine_init(self, &regm, regex);

    for (i = pos; lt(i, end); i = plus(i, one)) {
      last_res = regex_machine_feed(&regm, c_chr(chr_str(str, i)));
      if (last_res == REGM_FAIL || last_res == REGM_MATCH_DONE)
        break;
    }

    if (last_res != REGM_MATCH_DONE)
      last_res = regex_machine_feed(&regm, 0);

    switch (last_res) {
    case REGM_MATCH_DONE:
      if (!lt(plus(i, one), end)) {
    case REGM_MATCH:
        regex_machine_cleanup(&regm);
        return minus(end, pos);
      }
      /* fallthrough */
    case REGM_INCOMPLETE:
    case REGM_FAIL:
      regex_machine_cleanup(&regm);
      break;
    }

    pos = succ(pos);
  }

  return nil;
}

val regex_prefix_match(val reg, val str, val pos)
{
  val self = lit("regex-prefix-match");
  regex_machine_t regm;
  val i;
  regm_result_t last_res;

  if (null_or_missing_p(pos)) {
    pos = zero;
  } else if (minusp(pos)) {
    pos = plus(pos, length_str(str));
    if (minusp(pos))
      return nil;
  } else if (length_str_lt(str, pos)) {
    return nil;
  }

  regex_machine_init(self, &regm, reg);

  last_res = regex_machine_infer_init_state(&regm);

  for (i = pos; length_str_gt(str, i); i = plus(i, one)) {
    last_res = regex_machine_feed(&regm, c_chr(chr_str(str, i)));
    if (last_res == REGM_FAIL || last_res == REGM_MATCH_DONE)
      break;
  }

  regex_machine_cleanup(&regm);

  switch (last_res) {
  case REGM_INCOMPLETE:
  case REGM_MATCH:
  case REGM_MATCH_DONE:
    return t;
  default:
  case REGM_FAIL:
    return nil;
  }
}

val regsub(val regex, val repl, val str)
{
  val isfunc = functionp(repl);

  if (functionp(regex)) {
    val range = funcall1(regex, str);

    if (!range)
      return str;

    {
      val rf = from(range);
      val rt = to(range);
      val scopy = copy_str(str);

      return replace_str(scopy, if3(isfunc,
                                    funcall1(repl, sub_str(scopy, rf, rt)),
                                    repl),
                         rf, rt);
    }
  } else {
    val pos = zero;
    val out = mkustring(zero);
    val slen = if2(stringp(regex), length(regex));

    do {
      val find, len;

      if (slen) {
        len = slen;
        find = search_str(str, regex, pos, nil);
      } else {
        cons_bind (a, d, search_regex(str, regex, pos, nil));
        find = a;
        len = d;
      }

      if (!find) {
        if (pos == zero)
          return str;
        return string_extend(out, sub_str(str, pos, nil), t);
      }
      string_extend(out, sub_str(str, pos, find), nil);
      string_extend(out, if3(isfunc,
                             funcall1(repl, sub_str(str, find,
                                                    plus(find, len))),
                             repl),
                    nil);
      if (len == zero && eql(find, pos)) {
        if (lt(pos, length_str(str))) {
          string_extend(out, chr_str(str, pos), nil);
          pos = plus(pos, one);
        }
      } else {
        pos = plus(find, len);
      }
    } while (lt(pos, length_str(str)));

    return string_finish(out);
  }
}

val search_regst(val haystack, val needle_regex, val start_num, val from_end)
{
  val range = range_regex(haystack, needle_regex, start_num, from_end);
  return if2(range, sub_str(haystack, from(range), to(range)));
}

val match_regst(val str, val regex, val pos_in)
{
  val pos = if3(null_or_missing_p(pos_in),
                zero,
                if3(minusp(pos_in),
                    plus(pos_in, length_str(str)), pos_in));
  val new_pos = if3(minusp(pos), nil, match_regex(str, regex, pos));
  return if2(new_pos, sub_str(str, pos, new_pos));
}

static val match_regst_right_old(val str, val regex, val end)
{
  val len = match_regex_right_old(str, regex, end);
  return if2(len, if3(null_or_missing_p(end),
                      sub_str(str, neg(len), zero),
                      sub_str(str, minus(end, len), end)));
}

val match_regst_right(val str, val regex, val end)
{
  val len = match_regex_right(str, regex, end);
  return if2(len, if3(null_or_missing_p(end),
                      sub_str(str, neg(len), zero),
                      sub_str(str, minus(end, len), end)));
}
static val do_match_full(val regex, val str)
{
   return if2(eql(match_regex(str, regex, zero), length_str(str)), str);
}

static val do_match_full_offs(val env, val str)
{
  cons_bind (regex, pos_in, env);
  val len = length_str(str);
  val pos = if3(minusp(pos_in), plus(pos_in, len), pos_in);
  return if3(minusp(pos),
             nil,
             if2(eql(match_regex(str, regex, pos), len),
                 sub_str(str, pos, t)));
}

val regex_match_full_fun(val regex, val pos)
{
  if (null_or_missing_p(pos))
    return func_f1(regex, do_match_full);
  return func_f1(cons(regex, pos), do_match_full_offs);
}

static val do_match_left(val regex, val str)
{
  return match_regst(str, regex, zero);
}

static val do_match_left_offs(val env, val str)
{
  cons_bind (regex, pos, env);
  return match_regst(str, regex, pos);
}

val regex_match_left_fun(val regex, val pos)
{
  if (null_or_missing_p(pos))
    return func_f1(regex, do_match_left);
  return func_f1(cons(regex, pos), do_match_left_offs);
}

static val do_match_right(val regex, val str)
{
  return match_regst_right(str, regex, nil);
}

static val do_match_right_offs(val env, val str)
{
  cons_bind (regex, end, env);
  return match_regst_right(str, regex, end);
}

val regex_match_right_fun(val regex, val end)
{
  if (null_or_missing_p(end))
    return func_f1(regex, do_match_right);
  return func_f1(cons(regex, end), do_match_right_offs);
}

val regex_match_full(val regex, val arg1, val arg2)
{
  if (null_or_missing_p(arg2)) {
    val str = arg1;
    return if2(eql(match_regex(arg1, regex, arg2), length_str(str)), str);
  } else {
    val str = arg2;
    val len = length_str(str);
    val pos = if3(minusp(arg1), plus(len, arg1), arg1);
    return if3(minusp(pos),
               nil,
               if2(eql(match_regex(str, regex, pos), len),
                   sub_str(str, pos, t)));
  }
}

val regex_match_left(val regex, val arg1, val arg2)
{
  if (null_or_missing_p(arg2))
    return match_regst(arg1, regex, arg2);
  return match_regst(arg2, regex, arg1);
}

val regex_match_right(val regex, val arg1, val arg2)
{
  if (null_or_missing_p(arg2))
    return match_regst_right(arg1, regex, arg2);
  return match_regst_right(arg2, regex, arg1);
}

val regex_range_full(val regex, val arg1, val arg2)
{
  if (null_or_missing_p(arg2)) {
    val str = arg1;
    val len = length_str(str);
    return if2(eql(match_regex(str, regex, zero), len), rcons(zero, len));
  } else {
    val str = arg2;
    val len = length_str(str);
    val pos = if3(minusp(arg1), plus(len, arg1), arg1);
    return if3(minusp(pos), nil,
               if2(eql(match_regex(str, regex, pos), len), rcons(pos, len)));
  }
}

val regex_range_left(val regex, val arg1, val arg2)
{
  if (null_or_missing_p(arg2)) {
    val len = match_regex(arg1, regex, arg2);
    return if2(len, rcons(zero, len));
  } else {
    val pos = if3(minusp(arg1), plus(arg1, length_str(arg2)), arg1);
    val new_pos = if3(minusp(pos), nil, match_regex(arg2, regex, pos));
    return if2(new_pos, rcons(pos, new_pos));
  }
}

val regex_range_right(val regex, val arg1, val arg2)
{
  if (null_or_missing_p(arg2)) {
    val len = match_regex_right(arg1, regex, arg2);
    if (len) {
      val slen = length_str(arg1);
      return rcons(minus(slen, len), slen);
    } else {
      return nil;
    }
  } else {
    val end = if3(minusp(arg1), plus(arg1, length_str(arg2)), arg1);
    val len = match_regex_right(arg2, regex, end);
    return if2(len, rcons(minus(end, len), end));
  }
}

val regex_range_search(val regex, val arg1, val arg2, val arg3)
{
  if (missingp(arg2)) {
    return range_regex(arg1, regex, zero, nil);
  } else if (missingp(arg3)) {
    return range_regex(arg2, regex, arg1, nil);
  } else {
    return range_regex(arg3, regex, arg1, arg2);
  }
}

val regex_range_all(val regex, val arg1, val arg2, val arg3)
{
  if (missingp(arg2)) {
    return range_regex_all(arg1, regex, zero, nil);
  } else if (missingp(arg3)) {
    return range_regex_all(arg2, regex, arg1, nil);
  } else {
    return range_regex_all(arg3, regex, arg1, arg2);
  }
}

val regex_range_full_fun(val regex, val pos)
{
  return pa_123_3(func_n3(regex_range_full),
                  regex, default_arg(pos, zero));
}

val regex_range_left_fun(val regex, val pos)
{
  return pa_123_3(func_n3(regex_range_left),
                  regex, default_arg(pos, zero));
}

val regex_range_right_fun(val regex, val end)
{
  if (null_or_missing_p(end))
    return pa_123_2(func_n3(regex_range_right), regex, end);
  return pa_123_3(func_n3(regex_range_left), regex, end);
}

val regex_range_search_fun(val regex, val start, val from_end)
{
  return pa_1234_1(func_n4(range_regex), regex, start, from_end);
}

static val scan_until_common(val self, val regex, val stream_in,
                             val include_match_in, val accum)
{
  regex_machine_t regm;
  val out = nil;
  u64_t count = 0;
  val stack = nil;
  val match = nil;
  val stream = default_arg(stream_in, std_input);
  val include_match = default_null_arg(include_match_in);

  regex_machine_init(self, &regm, regex);

  for (;;) {
    val ch = get_char(stream);

    if (!ch) {
      switch (regex_machine_feed(&regm, 0)) {
      case REGM_FAIL:
      case REGM_INCOMPLETE:
        if (match)
          goto out_match;
        break;
      case REGM_MATCH:
      case REGM_MATCH_DONE:
        goto out_match;
      }
      break;
    }

    switch (regex_machine_feed(&regm, c_chr(ch))) {
    case REGM_FAIL:
      unget_char(ch, stream);

      if (match)
        goto out_match;

      while (stack)
        unget_char(rcyc_pop(&stack), stream);

      ch = get_char(stream);

      if (accum) {
        if (!out)
          out = mkstring(one, ch);
        else
          string_extend(out, ch, nil);
      } else {
        count++;
      }

      regex_machine_reset(&regm);
      continue;
    case REGM_MATCH_DONE:
      match = stack;
      goto out_match;
    case REGM_MATCH:
      match = stack;
      continue;
    case REGM_INCOMPLETE:
      push(ch, &stack);
      continue;
    }

    break;
  }

  if (nil) {
out_match:
    while (stack && stack != match)
      unget_char(rcyc_pop(&stack), stream);
    if (accum && !out)
      out = null_string;
    if (include_match)
      out = cat_str(cons(out, stack = nreverse(stack)), nil);
    if (!accum && match) {
      val c = unum_64(count);
      out = if3(include_match, cons(c, out), c);
    }
  }

  regex_machine_cleanup(&regm);

  while (stack)
    rcyc_pop(&stack);

  return out;
}

val read_until_match(val regex, val stream_in, val include_match_in)
{
  return scan_until_common(lit("read-until-match"), regex, stream_in,
                           include_match_in, t);
}

val scan_until_match(val regex, val stream_in)
{
  return scan_until_common(lit("scan-until-match"), regex, stream_in, t, nil);
}

val count_until_match(val regex, val stream_in)
{
  return scan_until_common(lit("count-until-match"), regex, stream_in, nil, nil);
}

static val trim_left(val regex, val string)
{
  if (regexp(regex)) {
    val pos = match_regex(string, regex, nil);
    if (pos)
      return sub_str(string, pos, t);
  } else if (starts_with(regex, string, nil, nil)) {
    return sub_str(string, length(regex), t);
  }

  return string;
}

static val trim_right(val regex, val string)
{
  if (regexp(regex)) {
    val pos = match_regex_right(string, regex, nil);
    if (pos)
      return sub_str(string, zero, minus(length(string), pos));
  } else if (ends_with(regex, string, nil, nil)) {
    return sub_str(string, zero, minus(length(string), length(regex)));
  }

  return string;
}

static char_set_t *create_wide_cs(void)
{
#ifdef FULL_UNICODE
  chset_type_t cst = CHSET_XLARGE;
#else
  chset_type_t cst = CHSET_LARGE;
#endif

  char_set_t *cs = char_set_create(cst, 0, 1);

  char_set_add_range(cs, 0x1100, 0x11F9);
  char_set_add_range(cs, 0x2329, 0x232A);
  char_set_add_range(cs, 0x2E80, 0x2E99);
  char_set_add_range(cs, 0x2E9B, 0x2EF3);
  char_set_add_range(cs, 0x2F00, 0x2FD5);
  char_set_add_range(cs, 0x2FF0, 0x2FFB);
  char_set_add_range(cs, 0x3000, 0x303F);
  char_set_add_range(cs, 0x3041, 0x3096);
  char_set_add_range(cs, 0x3099, 0x30FF);
  char_set_add_range(cs, 0x3105, 0x312D);
  char_set_add_range(cs, 0x3131, 0x318E);
  char_set_add_range(cs, 0x3190, 0x31BA);
  char_set_add_range(cs, 0x31C0, 0x31E3);
  char_set_add_range(cs, 0x31F0, 0x321E);
  char_set_add_range(cs, 0x3220, 0x3247);
  char_set_add_range(cs, 0x3250, 0x4DBF);
  char_set_add_range(cs, 0x4E00, 0x9FFF);
  char_set_add_range(cs, 0xA000, 0xA48C);
  char_set_add_range(cs, 0xA490, 0xA4C6);
  char_set_add_range(cs, 0xA960, 0xA97C);
  char_set_add_range(cs, 0xAC00, 0xD7A3);
  char_set_add_range(cs, 0xE000, 0xE757);
  char_set_add_range(cs, 0xF900, 0xFAFF);
  char_set_add_range(cs, 0xFE10, 0xFE19);
  char_set_add_range(cs, 0xFE30, 0xFE52);
  char_set_add_range(cs, 0xFE54, 0xFE6B);
  char_set_add_range(cs, 0xFF01, 0xFF60);
  char_set_add_range(cs, 0xFFE0, 0xFFE6);

#ifdef FULL_UNICODE
  char_set_add_range(cs, 0x1B000, 0x1B001);
  char_set_add_range(cs, 0x1F004, 0x1F004);
  char_set_add_range(cs, 0x1F0CF, 0x1F0CF);
  char_set_add_range(cs, 0x1F170, 0x1F171);
  char_set_add_range(cs, 0x1F17E, 0x1F17F);
  char_set_add_range(cs, 0x1F191, 0x1F19A);
  char_set_add_range(cs, 0x1F200, 0x1F202);
  char_set_add_range(cs, 0x1F210, 0x1F23A);
  char_set_add_range(cs, 0x1F240, 0x1F248);
  char_set_add_range(cs, 0x1F250, 0x1F251);
  char_set_add_range(cs, 0x1F300, 0x1F7FF);
  char_set_add_range(cs, 0x1F900, 0x1FAFF);
  char_set_add_range(cs, 0x20000, 0x2FFFF);
  char_set_add_range(cs, 0x30000, 0x3FFFF);
#endif

  return cs;
}

static char_set_t *wide_cs;

int wide_display_char_p(wchar_t ch)
{
  if (ch < 0x1100)
    return 0;

  if (!wide_cs)
    wide_cs = create_wide_cs();

  return char_set_contains(wide_cs, ch);
}

val space_k, digit_k, word_char_k;
val cspace_k, cdigit_k, cword_char_k;

void regex_init(void)
{
  space_k = intern(lit("space"), keyword_package);
  digit_k = intern(lit("digit"), keyword_package);
  word_char_k = intern(lit("word-char"), keyword_package);
  cspace_k = intern(lit("cspace"), keyword_package);
  cdigit_k = intern(lit("cdigit"), keyword_package);
  cword_char_k = intern(lit("cword-char"), keyword_package);

  regex_cls = cobj_register(regex_s);
  chset_cls = cobj_register(chset_s);

  reg_fun(intern(lit("regex-compile"), user_package), func_n2o(regex_compile, 1));
  reg_fun(intern(lit("regexp"), user_package), func_n1(regexp));
  reg_fun(intern(lit("regex-source"), user_package), func_n1(regex_source));
  reg_fun(intern(lit("search-regex"), user_package), func_n4o(search_regex, 2));
  reg_fun(intern(lit("range-regex"), user_package), func_n4o(range_regex, 2));
  reg_fun(intern(lit("search-regst"), user_package), func_n4o(search_regst, 2));
  reg_fun(intern(lit("match-regex"), user_package), func_n3o(match_regex_len, 2));
  reg_fun(intern(lit("match-regst"), user_package), func_n3o(match_regst, 2));
  reg_fun(intern(lit("match-regex-right"), user_package),
          func_n3o(match_regex_right, 2));
  reg_fun(intern(lit("match-regst-right"), user_package),
          func_n3o(match_regst_right, 2));
  reg_fun(intern(lit("regex-prefix-match"), user_package),
          func_n3o(regex_prefix_match, 2));
  reg_fun(intern(lit("regsub"), user_package), func_n3(regsub));
  reg_fun(intern(lit("regex-parse"), user_package), func_n2o(regex_parse, 1));

  reg_fun(intern(lit("reg-expand-nongreedy"), system_package),
          func_n1(reg_expand_nongreedy));
  reg_fun(intern(lit("regex-optimize"), user_package), func_n1(regex_optimize));
  reg_fun(intern(lit("read-until-match"), user_package), func_n3o(read_until_match, 1));
  reg_fun(intern(lit("scan-until-match"), user_package), func_n2(scan_until_match));
  reg_fun(intern(lit("count-until-match"), user_package), func_n2(count_until_match));
  reg_fun(intern(lit("f^$"), user_package), func_n2o(regex_match_full_fun, 1));
  reg_fun(intern(lit("f^"), user_package), func_n2o(regex_match_left_fun, 1));
  reg_fun(intern(lit("f$"), user_package), func_n2o(regex_match_right_fun, 1));
  reg_fun(intern(lit("m^$"), user_package), func_n3o(regex_match_full, 2));
  reg_fun(intern(lit("m^"), user_package), func_n3o(regex_match_left, 2));
  reg_fun(intern(lit("m$"), user_package), func_n3o(regex_match_right, 2));
  reg_fun(intern(lit("r^$"), user_package), func_n3o(regex_range_full, 2));
  reg_fun(intern(lit("r^"), user_package), func_n3o(regex_range_left, 2));
  reg_fun(intern(lit("r$"), user_package), func_n3o(regex_range_right, 2));
  reg_fun(intern(lit("rr"), user_package), func_n4o(regex_range_search, 2));
  reg_fun(intern(lit("rra"), user_package), func_n4o(regex_range_all, 2));
  reg_fun(intern(lit("fr^$"), user_package), func_n2o(regex_range_full_fun, 1));
  reg_fun(intern(lit("fr^"), user_package), func_n2o(regex_range_left_fun, 1));
  reg_fun(intern(lit("fr$"), user_package), func_n2o(regex_range_right_fun, 1));
  reg_fun(intern(lit("frr"), user_package), func_n3o(regex_range_search_fun, 1));
  reg_fun(intern(lit("trim-left"), user_package), func_n2(trim_left));
  reg_fun(intern(lit("trim-right"), user_package), func_n2(trim_right));
  init_special_char_sets();
}

void regex_compat_fixup(int compat_ver)
{
  if (compat_ver <= 150) {
    reg_fun(intern(lit("match-regex"), user_package), func_n3o(match_regex, 2));
    reg_fun(intern(lit("match-regex-right"), user_package),
            func_n3o(match_regex_right_old, 2));
    reg_fun(intern(lit("match-regst-right"), user_package),
            func_n3o(match_regst_right_old, 2));
  }
}

void regex_free_all(void)
{
  char_set_destroy(space_cs, 1);
  char_set_destroy(digit_cs, 1);
  char_set_destroy(word_cs, 1);
  char_set_destroy(cspace_cs, 1);
  char_set_destroy(cdigit_cs, 1);
  char_set_destroy(cword_cs, 1);
  char_set_destroy(wide_cs, 1);
}