From 037595438947eeb4c63a0185065b0143a7f2ac2c Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 30 Dec 2017 10:16:10 -0500 Subject: [PATCH] regex_macros: delete it The regex_macros crate hasn't been maintained in quite some time, and has been broken. Nobody has complained. Given the fact that there are no immediate plans to improve the situation, and the fact that it is slower than the runtime engine, we simply remove it. --- HACKING.md | 67 ++--- README.md | 31 --- bench/Cargo.toml | 1 - bench/run | 5 +- bench/src/bench.rs | 15 +- bench/src/main.rs | 1 - bench/src/misc.rs | 1 - ci/run-kcov | 14 - regex_macros/Cargo.toml | 35 --- regex_macros/src/lib.rs | 600 ---------------------------------------- src/input.rs | 9 - src/lib.rs | 9 +- src/re_plugin.rs | 93 ------- src/re_unicode.rs | 242 +++------------- tests/test_plugin.rs | 31 --- 15 files changed, 73 insertions(+), 1081 deletions(-) delete mode 100644 regex_macros/Cargo.toml delete mode 100644 regex_macros/src/lib.rs delete mode 100644 src/re_plugin.rs delete mode 100644 tests/test_plugin.rs diff --git a/HACKING.md b/HACKING.md index 9556de6ecc..718076c9c6 100644 --- a/HACKING.md +++ b/HACKING.md @@ -185,37 +185,36 @@ A regular expression program is essentially a sequence of opcodes produced by the compiler plus various facts about the regular expression (such as whether it is anchored, its capture names, etc.). -### The regex! macro (or why `regex::internal` exists) - -The `regex!` macro is defined in the `regex_macros` crate as a compiler plugin, -which is maintained in this repository. The `regex!` macro compiles a regular -expression at compile time into specialized Rust code. - -The `regex!` macro was written when this library was first conceived and -unfortunately hasn't changed much since then. In particular, it encodes the -entire Pike VM into stack allocated space (no heap allocation is done). 
When -`regex!` was first written, this provided a substantial speed boost over -so-called "dynamic" regexes compiled at runtime, and in particular had much -lower overhead per match. This was because the only matching engine at the -time was the Pike VM. The addition of other matching engines has inverted -the relationship; the `regex!` macro is almost never faster than the dynamic -variant. (In fact, it is typically substantially slower.) - -In order to build the `regex!` macro this way, it must have access to some -internals of the regex library, which is in a distinct crate. (Compiler plugins -must be part of a distinct crate.) Namely, it must be able to compile a regular -expression and access its opcodes. The necessary internals are exported as part -of the top-level `internal` module in the regex library, but is hidden from -public documentation. In order to present a uniform API between programs build -by the `regex!` macro and their dynamic analoges, the `Regex` type is an enum -whose variants are hidden from public documentation. - -In the future, the `regex!` macro should probably work more like Ragel, but -it's not clear how hard this is. In particular, the `regex!` macro should be -able to support all the features of dynamic regexes, which may be hard to do -with a Ragel-style implementation approach. (Which somewhat suggests that the -`regex!` macro may also need to grow conditional execution logic like the -dynamic variants, which seems rather grotesque.) +### The regex! macro + +The `regex!` macro no longer exists. It was developed in a bygone era as a +compiler plugin during the infancy of the regex crate. Back then, the only +matching engine in the crate was the Pike VM. The `regex!` macro was, itself, +also a Pike VM. The only advantages it offered over the dynamic Pike VM that +was built at runtime were the following: + + 1. Syntax checking was done at compile time. Your Rust program wouldn't + compile if your regex didn't compile. + 2. 
Reduction of overhead that was proportional to the size of the regex. + For the most part, this overhead consisted of heap allocation, which + was nearly eliminated in the compiler plugin. + +The main takeaway here is that the compiler plugin was a marginally faster +version of a slow regex engine. As the regex crate evolved, it grew other regex +engines (DFA, bounded backtracker) and sophisticated literal optimizations. +The regex macro didn't keep pace, and it therefore became (dramatically) slower +than the dynamic engines. The only reason left to use it was for the compile +time guarantee that your regex is correct. Fortunately, Clippy (the Rust lint +tool) has a lint that checks your regular expression validity, which mostly +replaces that use case. + +Additionally, the regex compiler plugin stopped receiving maintenance. Nobody +complained. At that point, it seemed prudent to just remove it. + +Will a compiler plugin be brought back? The future is murky, but there is +definitely an opportunity there to build something that is faster than the +dynamic engines in some cases. But it will be challenging! As of now, there +are no plans to work on this. ## Testing @@ -236,7 +235,6 @@ the AT&T test suite) and code generate tests for each matching engine. The approach we use in this library is to create a Cargo.toml entry point for each matching engine we want to test. The entry points are: -* `tests/test_plugin.rs` - tests the `regex!` macro * `tests/test_default.rs` - tests `Regex::new` * `tests/test_default_bytes.rs` - tests `bytes::Regex::new` * `tests/test_nfa.rs` - tests `Regex::new`, forced to use the NFA @@ -261,10 +259,6 @@ entry points, it can take a while to compile everything. To reduce compile times slightly, try using `cargo test --test default`, which will only use the `tests/test_default.rs` entry point. -N.B. 
To run tests for the `regex!` macro, use: - - cargo test --manifest-path regex_macros/Cargo.toml - ## Benchmarking @@ -284,7 +278,6 @@ separately from the main regex crate. Benchmarking follows a similarly wonky setup as tests. There are multiple entry points: -* `bench_rust_plugin.rs` - benchmarks the `regex!` macro * `bench_rust.rs` - benchmarks `Regex::new` * `bench_rust_bytes.rs` benchmarks `bytes::Regex::new` * `bench_pcre.rs` - benchmarks PCRE diff --git a/README.md b/README.md index ebffe39d2c..1f6eddbadf 100644 --- a/README.md +++ b/README.md @@ -188,37 +188,6 @@ assert!(!matches.matched(5)); assert!(matches.matched(6)); ``` -### Usage: `regex!` compiler plugin - -**WARNING**: The `regex!` compiler plugin is orders of magnitude slower than -the normal `Regex::new(...)` usage. You should not use the compiler plugin -unless you have a very special reason for doing so. The performance difference -may be the temporary, but the path forward at this point isn't clear. - -The `regex!` compiler plugin will compile your regexes at compile time. **This -only works with a nightly compiler.** - -Here is a small example: - -```rust -#![feature(plugin)] - -#![plugin(regex_macros)] -extern crate regex; - -fn main() { - let re = regex!(r"(\d{4})-(\d{2})-(\d{2})"); - let caps = re.captures("2010-03-14").unwrap(); - - assert_eq!("2010", caps[1]); - assert_eq!("03", caps[2]); - assert_eq!("14", caps[3]); -} -``` - -Notice that we never `unwrap` the result of `regex!`. This is because your -*program* won't compile if the regex doesn't compile. (Try `regex!("(")`.) 
- ### Usage: a regular expression parser diff --git a/bench/Cargo.toml b/bench/Cargo.toml index 01544d2525..bebac3ee5e 100644 --- a/bench/Cargo.toml +++ b/bench/Cargo.toml @@ -49,7 +49,6 @@ re-onig = ["onig"] re-re2 = [] re-rust = [] re-rust-bytes = [] -re-rust-plugin = ["regex_macros"] re-tcl = [] [[bench]] diff --git a/bench/run b/bench/run index 1d0321c1db..40004d9ad3 100755 --- a/bench/run +++ b/bench/run @@ -1,7 +1,7 @@ #!/bin/bash usage() { - echo "Usage: $(basename $0) [rust | rust-bytes | rust-plugin | pcre1 | pcre2 | re2 | onig | tcl ]" >&2 + echo "Usage: $(basename $0) [rust | rust-bytes | pcre1 | pcre2 | re2 | onig | tcl ]" >&2 exit 1 } @@ -22,9 +22,6 @@ case $which in rust-bytes) exec cargo bench --bench bench --features re-rust-bytes "$@" ;; - rust-plugin) - exec cargo bench --bench bench --features re-rust-plugin "$@" - ;; re2) exec cargo bench --bench bench --features re-re2 "$@" ;; diff --git a/bench/src/bench.rs b/bench/src/bench.rs index a45079edc0..6ddadec8c9 100644 --- a/bench/src/bench.rs +++ b/bench/src/bench.rs @@ -11,11 +11,6 @@ // Enable the benchmarking harness. #![feature(test)] -// If we're benchmarking the Rust regex plugin, then pull that in. -// This will bring a `regex!` macro into scope. 
-#![cfg_attr(feature = "re-rust-plugin", feature(plugin))] -#![cfg_attr(feature = "re-rust-plugin", plugin(regex_macros))] - #[macro_use] extern crate lazy_static; #[cfg(not(any(feature = "re-rust", feature = "re-rust-bytes")))] @@ -27,7 +22,6 @@ extern crate onig; #[cfg(any( feature = "re-rust", feature = "re-rust-bytes", - feature = "re-rust-plugin", ))] extern crate regex; #[cfg(feature = "re-rust")] @@ -43,7 +37,7 @@ pub use ffi::pcre1::Regex; pub use ffi::pcre2::Regex; #[cfg(feature = "re-re2")] pub use ffi::re2::Regex; -#[cfg(any(feature = "re-rust", feature = "re-rust-plugin"))] +#[cfg(feature = "re-rust")] pub use regex::Regex; #[cfg(feature = "re-rust-bytes")] pub use regex::bytes::Regex; @@ -52,14 +46,11 @@ pub use ffi::tcl::Regex; // Usage: regex!(pattern) // -// Builds a ::Regex from a borrowed string. This is used in every regex -// engine except for the Rust plugin, because the plugin itself defines the -// same macro. +// Builds a ::Regex from a borrowed string. // // Due to macro scoping rules, this definition only applies for the modules // defined below. Effectively, it allows us to use the same tests for both // native and dynamic regexes. -#[cfg(not(feature = "re-rust-plugin"))] macro_rules! regex { ($re:expr) => { ::Regex::new(&$re.to_owned()).unwrap() } } @@ -99,7 +90,6 @@ macro_rules! text { feature = "re-pcre2", feature = "re-re2", feature = "re-rust", - feature = "re-rust-plugin", ))] macro_rules! 
text { ($text:expr) => { $text } @@ -116,7 +106,6 @@ type Text = Vec; feature = "re-pcre2", feature = "re-re2", feature = "re-rust", - feature = "re-rust-plugin", ))] type Text = String; diff --git a/bench/src/main.rs b/bench/src/main.rs index 7b9abb437f..2837b82105 100644 --- a/bench/src/main.rs +++ b/bench/src/main.rs @@ -18,7 +18,6 @@ extern crate onig; #[cfg(any( feature = "re-rust", feature = "re-rust-bytes", - feature = "re-rust-plugin", ))] extern crate regex; #[cfg(feature = "re-rust")] diff --git a/bench/src/misc.rs b/bench/src/misc.rs index 86f93c4878..edb274e9c7 100644 --- a/bench/src/misc.rs +++ b/bench/src/misc.rs @@ -19,7 +19,6 @@ use {Regex, Text}; #[cfg(not(feature = "re-onig"))] #[cfg(not(feature = "re-pcre1"))] #[cfg(not(feature = "re-pcre2"))] -#[cfg(not(feature = "re-rust-plugin"))] bench_match!(no_exponential, { format!( "{}{}", diff --git a/ci/run-kcov b/ci/run-kcov index 092123775d..0ef842c319 100755 --- a/ci/run-kcov +++ b/ci/run-kcov @@ -14,15 +14,10 @@ tests=( regex ) tmpdir=$(mktemp -d) -with_plugin= coveralls_id= while true; do case "$1" in - --with-plugin) - with_plugin=yes - shift - ;; --coveralls-id) coveralls_id="$2" shift 2 @@ -33,15 +28,6 @@ while true; do esac done -if [ -n "$with_plugin" ]; then - cargo test --manifest-path regex_macros/Cargo.toml --no-run --verbose - kcov \ - --verify \ - --include-pattern '/regex/src/' \ - "$tmpdir/plugin" \ - $(ls -t ./regex_macros/target/debug/plugin-* | head -n1) -fi - cargo test --no-run --verbose --jobs 4 for t in ${tests[@]}; do kcov \ diff --git a/regex_macros/Cargo.toml b/regex_macros/Cargo.toml deleted file mode 100644 index 7fd4ecd7a5..0000000000 --- a/regex_macros/Cargo.toml +++ /dev/null @@ -1,35 +0,0 @@ -[package] -name = "regex_macros" -version = "0.2.0" -authors = ["The Rust Project Developers"] -license = "MIT/Apache-2.0" -repository = "https://github.com/rust-lang/regex" -homepage = "https://github.com/rust-lang/regex" -description = """ -An implementation of statically 
compiled regular expressions for Rust. - -Unless you specifically need compile time regular expressions or a matching -engine that is guaranteed not to allocate, you should temporarily prefer using -the plain regex crate (since it is almost always faster). -""" - -[lib] -name = "regex_macros" -plugin = true - -[dependencies.regex] -path = ".." -version = "0.2.0" -features = ["pattern"] - -[dependencies.regex-syntax] -path = "../regex-syntax" -version = "0.4.0" - -[dev-dependencies] -# For generating random test data. -rand = "0.3.15" - -[[test]] -path = "../tests/test_plugin.rs" -name = "plugin" diff --git a/regex_macros/src/lib.rs b/regex_macros/src/lib.rs deleted file mode 100644 index c8353f1780..0000000000 --- a/regex_macros/src/lib.rs +++ /dev/null @@ -1,600 +0,0 @@ -// Copyright 2014 The Rust Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution and at -// http://rust-lang.org/COPYRIGHT. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -//! This crate provides the `regex!` macro. Its use is documented in the -//! `regex` crate. 
- -#![doc(html_logo_url = "http://www.rust-lang.org/logos/rust-logo-128x128-blk-v2.png", - html_favicon_url = "http://www.rust-lang.org/favicon.ico", - html_root_url = "http://doc.rust-lang.org/nightly/")] - -#![feature(plugin_registrar, quote, rustc_private)] - -extern crate regex; -extern crate regex_syntax; -extern crate rustc_plugin; -extern crate syntax; - -use std::collections::BTreeMap; -use std::usize; - -use syntax::ast; -use syntax::codemap; -use syntax::tokenstream; -use syntax::ext::build::AstBuilder; -use syntax::ext::base::{ExtCtxt, MacResult, MacEager, DummyResult}; -use syntax::parse::token; -use syntax::print::pprust; -use syntax::fold::Folder; -use syntax::ptr::P; - -use rustc_plugin::Registry; - -use regex::internal::{Compiler, EmptyLook, Inst, Program}; -use regex_syntax::Expr; - -/// For the `regex!` syntax extension. Do not use. -#[plugin_registrar] -#[doc(hidden)] -pub fn plugin_registrar(reg: &mut Registry) { - reg.register_macro("regex", native); -} - -/// Generates specialized code for the Pike VM for a particular regular -/// expression. -/// -/// There are two primary differences between the code generated here and the -/// general code in vm.rs. -/// -/// 1. All heap allocation is removed. Sized vector types are used instead. -/// Care must be taken to make sure that these vectors are not copied -/// gratuitously. (If you're not sure, run the benchmarks. They will yell -/// at you if you do.) -/// 2. The main `match instruction { ... }` expressions are replaced with more -/// direct `match pc { ... }`. The generators can be found in -/// `step_insts` and `add_insts`. -/// -/// It is strongly recommended to read the dynamic implementation in vm.rs -/// first before trying to understand the code generator. The implementation -/// strategy is identical and vm.rs has comments and will be easier to follow. 
-fn native(cx: &mut ExtCtxt, sp: codemap::Span, tts: &[tokenstream::TokenTree]) - -> Box { - let regex = match parse(cx, tts) { - Some(r) => r, - // error is logged in 'parse' with cx.span_err - None => return DummyResult::any(sp), - }; - // We use the largest possible size limit because this is happening at - // compile time. We trust the programmer. - let expr = match Expr::parse(®ex) { - Ok(expr) => expr, - Err(err) => { - cx.span_err(sp, &err.to_string()); - return DummyResult::any(sp) - } - }; - let prog = match Compiler::new().size_limit(usize::MAX).compile(&[expr]) { - Ok(re) => re, - Err(err) => { - cx.span_err(sp, &err.to_string()); - return DummyResult::any(sp) - } - }; - let names = prog.captures.iter().cloned().collect(); - let mut gen = NfaGen { - cx: cx, - sp: sp, - prog: prog, - names: names, - original: regex, - }; - MacEager::expr(gen.code()) -} - -struct NfaGen<'cx, 'a: 'cx> { - cx: &'cx ExtCtxt<'a>, - sp: codemap::Span, - prog: Program, - names: Vec>, - original: String, -} - -impl<'a, 'cx> NfaGen<'a, 'cx> { - fn code(&mut self) -> P { - // Most or all of the following things are used in the quasiquoted - // expression returned. 
- let num_cap_locs = 2 * self.prog.captures.len(); - let num_insts = self.prog.len(); - let cap_names = self.vec_expr(self.names.iter(), - &mut |cx, name| match *name { - Some(ref name) => { - let name = &**name; - quote_expr!(cx, Some($name)) - } - None => cx.expr_none(self.sp), - } - ); - let capture_name_idx = { - let mut capture_name_idx = BTreeMap::new(); - for (i, name) in self.names.iter().enumerate() { - if let Some(ref name) = *name { - capture_name_idx.insert(name.to_owned(), i); - } - } - self.vec_expr(capture_name_idx.iter(), - &mut |cx, (name, group_idx)| - quote_expr!(cx, ($name, $group_idx)) - ) - }; - - let is_anchored_start = self.prog.is_anchored_start; - let step_insts = self.step_insts(); - let add_insts = self.add_insts(); - let regex = &*self.original; - - quote_expr!(self.cx, { -// When `regex!` is bound to a name that is not used, we have to make sure -// that dead_code warnings don't bubble up to the user from the generated -// code. Therefore, we suppress them by allowing dead_code. The effect is that -// the user is only warned about *their* unused variable/code, and not the -// unused code generated by regex!. See #14185 for an example. 
-#[allow(dead_code)] -static CAPTURES: &'static [Option<&'static str>] = &$cap_names; -#[allow(dead_code)] -static CAPTURE_NAME_IDX: &'static [(&'static str, usize)] = &$capture_name_idx; - -#[allow(dead_code)] -fn exec<'t>( - mut caps: &mut [Option], - input: &'t str, - start: usize, -) -> bool { - #![allow(unused_imports)] - #![allow(unused_mut)] - - use regex::internal::{Char, CharInput, InputAt, Input, Inst}; - - let input = CharInput::new(input.as_bytes()); - let at = input.at(start); - return Nfa { - input: input, - ncaps: caps.len(), - }.exec(&mut NfaThreads::new(), &mut caps, at); - - struct Nfa<'t> { - input: CharInput<'t>, - ncaps: usize, - } - - impl<'t> Nfa<'t> { - #[allow(unused_variables)] - fn exec( - &mut self, - mut q: &mut NfaThreads, - mut caps: &mut [Option], - mut at: InputAt, - ) -> bool { - let mut matched = false; - let (mut clist, mut nlist) = (&mut q.clist, &mut q.nlist); - clist.empty(); nlist.empty(); -'LOOP: loop { - if clist.size == 0 { - if matched || (!at.is_start() && $is_anchored_start) { - break; - } - // TODO: Prefix matching... Hmm. - // Prefix matching now uses a DFA, so I think this is - // going to require encoding that DFA statically. - } - if clist.size == 0 || (!$is_anchored_start && !matched) { - self.add(clist, &mut caps, 0, at); - } - let at_next = self.input.at(at.next_pos()); - for i in 0..clist.size { - let pc = clist.pc(i); - let tcaps = clist.caps(i); - if self.step(nlist, caps, tcaps, pc, at, at_next) { - matched = true; - if caps.len() == 0 { - break 'LOOP; - } - break; - } - } - if at.char().is_none() { - break; - } - at = at_next; - ::std::mem::swap(&mut clist, &mut nlist); - nlist.empty(); - } - matched - } - - // Sometimes `nlist` is never used (for empty regexes). 
- #[allow(unused_variables)] - #[inline] - fn step( - &self, - nlist: &mut Threads, - caps: &mut [Option], - thread_caps: &mut [Option], - pc: usize, - at: InputAt, - at_next: InputAt, - ) -> bool { - $step_insts; - false - } - - fn add( - &self, - nlist: &mut Threads, - thread_caps: &mut [Option], - pc: usize, - at: InputAt, - ) { - if nlist.contains(pc) { - return; - } - let ti = nlist.add(pc); - $add_insts - } - } - - struct NfaThreads { - clist: Threads, - nlist: Threads, - } - - struct Threads { - dense: [Thread; $num_insts], - sparse: [usize; $num_insts], - size: usize, - } - - struct Thread { - pc: usize, - caps: [Option; $num_cap_locs], - } - - impl NfaThreads { - fn new() -> NfaThreads { - NfaThreads { - clist: Threads::new(), - nlist: Threads::new(), - } - } - - fn swap(&mut self) { - ::std::mem::swap(&mut self.clist, &mut self.nlist); - } - } - - impl Threads { - fn new() -> Threads { - Threads { - // These unsafe blocks are used for performance reasons, as it - // gives us a zero-cost initialization of a sparse set. The - // trick is described in more detail here: - // http://research.swtch.com/sparse - // The idea here is to avoid initializing threads that never - // need to be initialized, particularly for larger regexs with - // a lot of instructions. 
- dense: unsafe { ::std::mem::uninitialized() }, - sparse: unsafe { ::std::mem::uninitialized() }, - size: 0, - } - } - - #[inline] - fn add(&mut self, pc: usize) -> usize { - let i = self.size; - self.dense[i].pc = pc; - self.sparse[pc] = i; - self.size += 1; - i - } - - #[inline] - fn thread(&mut self, i: usize) -> &mut Thread { - &mut self.dense[i] - } - - #[inline] - fn contains(&self, pc: usize) -> bool { - let s = unsafe { ::std::ptr::read_volatile(&self.sparse[pc]) }; - s < self.size && self.dense[s].pc == pc - } - - #[inline] - fn empty(&mut self) { - self.size = 0; - } - - #[inline] - fn pc(&self, i: usize) -> usize { - self.dense[i].pc - } - - #[inline] - fn caps<'r>(&'r mut self, i: usize) -> &'r mut [Option] { - &mut self.dense[i].caps - } - } -} - -::regex::Regex(::regex::internal::_Regex::Plugin(::regex::internal::Plugin { - original: $regex, - names: &CAPTURES, - groups: &CAPTURE_NAME_IDX, - prog: exec, -})) - }) - } - - // Generates code for the `add` method, which is responsible for adding - // zero-width states to the next queue of states to visit. 
- fn add_insts(&self) -> P { - let arms = self.prog.iter().enumerate().map(|(pc, inst)| { - let body = match *inst { - Inst::EmptyLook(ref inst) => { - let nextpc = inst.goto; - match inst.look { - EmptyLook::StartLine => { - quote_expr!(self.cx, { - let prev = self.input.previous_char(at); - if prev.is_none() || prev == '\n' { - self.add(nlist, thread_caps, $nextpc, at); - } - }) - } - EmptyLook::EndLine => { - quote_expr!(self.cx, { - if at.char().is_none() || at.char() == '\n' { - self.add(nlist, thread_caps, $nextpc, at); - } - }) - } - EmptyLook::StartText => { - quote_expr!(self.cx, { - let prev = self.input.previous_char(at); - if prev.is_none() { - self.add(nlist, thread_caps, $nextpc, at); - } - }) - } - EmptyLook::EndText => { - quote_expr!(self.cx, { - if at.char().is_none() { - self.add(nlist, thread_caps, $nextpc, at); - } - }) - } - EmptyLook::WordBoundary - | EmptyLook::NotWordBoundary => { - let m = if inst.look == EmptyLook::WordBoundary { - quote_expr!(self.cx, { w1 ^ w2 }) - } else { - quote_expr!(self.cx, { !(w1 ^ w2) }) - }; - quote_expr!(self.cx, { - let prev = self.input.previous_char(at); - let w1 = prev.is_word_char(); - let w2 = at.char().is_word_char(); - if $m { - self.add(nlist, thread_caps, $nextpc, at); - } - }) - } - EmptyLook::WordBoundaryAscii - | EmptyLook::NotWordBoundaryAscii => { - unreachable!() - } - } - } - Inst::Save(ref inst) => { - let nextpc = inst.goto; - let slot = inst.slot; - quote_expr!(self.cx, { - if $slot >= self.ncaps { - self.add(nlist, thread_caps, $nextpc, at); - } else { - let old = thread_caps[$slot]; - thread_caps[$slot] = Some(at.pos()); - self.add(nlist, thread_caps, $nextpc, at); - thread_caps[$slot] = old; - } - }) - } - Inst::Split(ref inst) => { - let (x, y) = (inst.goto1, inst.goto2); - quote_expr!(self.cx, { - self.add(nlist, thread_caps, $x, at); - self.add(nlist, thread_caps, $y, at); - }) - } - // For Match, Char, Ranges - _ => quote_expr!(self.cx, { - let mut t = &mut nlist.thread(ti); - for 
(slot, val) in t.caps.iter_mut().zip(thread_caps.iter()) { - *slot = *val; - } - }), - }; - self.arm_inst(pc, body) - }).collect::>(); - self.match_insts(arms) - } - - // Generates the code for the `step` method, which processes all states - // in the current queue that consume a single character. - fn step_insts(&self) -> P { - let arms = self.prog.iter().enumerate().map(|(pc, inst)| { - let body = match *inst { - Inst::Match(_) => quote_expr!(self.cx, { - for (slot, val) in caps.iter_mut().zip(thread_caps.iter()) { - *slot = *val; - } - return true; - }), - Inst::Char(ref inst) => { - let nextpc = inst.goto; - let c = inst.c; - quote_expr!(self.cx, { - if $c == at.char() { - self.add(nlist, thread_caps, $nextpc, at_next); - } - return false; - }) - } - Inst::Ranges(ref inst) => { - let match_class = self.match_class(&inst.ranges); - let nextpc = inst.goto; - quote_expr!(self.cx, { - let mut c = at.char(); - if let Some(c) = c.as_char() { - if $match_class { - self.add(nlist, thread_caps, $nextpc, at_next); - } - } - return false; - }) - } - // EmptyLook, Save, Jump, Split - _ => quote_expr!(self.cx, { return false; }), - }; - self.arm_inst(pc, body) - }).collect::>(); - - self.match_insts(arms) - } - - // Translates a character class into a match expression. - // This avoids a binary search (and is hopefully replaced by a jump - // table). - fn match_class(&self, ranges: &[(char, char)]) -> P { - let mut arms = ranges.iter().map(|&(start, end)| { - let pat = self.cx.pat( - self.sp, ast::PatKind::Range( - quote_expr!(self.cx, $start), quote_expr!(self.cx, $end))); - self.cx.arm(self.sp, vec!(pat), quote_expr!(self.cx, true)) - }).collect::>(); - - arms.push(self.wild_arm_expr(quote_expr!(self.cx, false))); - let match_on = quote_expr!(self.cx, c); - self.cx.expr_match(self.sp, match_on, arms) - } - - // Generates code for checking a literal prefix of the search string. - // The code is only generated if the regex *has* a literal prefix. 
- // Otherwise, a no-op is returned. - // fn check_prefix(&self) -> P { - // if self.prog.prefixes.len() == 0 { - // self.empty_block() - // } else { - // quote_expr!(self.cx, - // if clist.size == 0 { - // let haystack = &self.input.as_bytes()[self.ic..]; - // match find_prefix(prefix_bytes, haystack) { - // None => break, - // Some(i) => { - // self.ic += i; - // next_ic = self.chars.set(self.ic); - // } - // } - // } - // ) - // } - // } - - // Builds a `match pc { ... }` expression from a list of arms, specifically - // for matching the current program counter with an instruction. - // A wild-card arm is automatically added that executes a no-op. It will - // never be used, but is added to satisfy the compiler complaining about - // non-exhaustive patterns. - fn match_insts(&self, mut arms: Vec) -> P { - arms.push(self.wild_arm_expr(self.empty_block())); - self.cx.expr_match(self.sp, quote_expr!(self.cx, pc), arms) - } - - fn empty_block(&self) -> P { - quote_expr!(self.cx, {}) - } - - // Creates a match arm for the instruction at `pc` with the expression - // `body`. - fn arm_inst(&self, pc: usize, body: P) -> ast::Arm { - let pc_pat = self.cx.pat_lit(self.sp, quote_expr!(self.cx, $pc)); - - self.cx.arm(self.sp, vec!(pc_pat), body) - } - - // Creates a wild-card match arm with the expression `body`. - fn wild_arm_expr(&self, body: P) -> ast::Arm { - ast::Arm { - attrs: vec!(), - pats: vec!(P(ast::Pat{ - id: ast::DUMMY_NODE_ID, - span: self.sp, - node: ast::PatKind::Wild, - })), - guard: None, - body: body, - } - } - - // Converts `xs` to a `[x1, x2, .., xN]` expression by calling `to_expr` - // on each element in `xs`. - fn vec_expr>( - &self, - xs: It, - to_expr: &mut FnMut(&ExtCtxt, T) -> P, - ) -> P { - let exprs = xs.map(|x| to_expr(self.cx, x)).collect(); - self.cx.expr_vec(self.sp, exprs) - } -} - -/// Looks for a single string literal and returns it. -/// Otherwise, logs an error with cx.span_err and returns None. 
-fn parse(cx: &mut ExtCtxt, tts: &[tokenstream::TokenTree]) -> Option { - let mut parser = cx.new_parser_from_tts(tts); - if let Ok(expr) = parser.parse_expr() { - let entry = cx.expander().fold_expr(expr); - let regex = match entry.node { - ast::ExprKind::Lit(ref lit) => { - match lit.node { - ast::LitKind::Str(ref s, _) => s.to_string(), - _ => { - cx.span_err(entry.span, &format!( - "expected string literal but got `{}`", - pprust::lit_to_string(&**lit))); - return None - } - } - } - _ => { - cx.span_err(entry.span, &format!( - "expected string literal but got `{}`", - pprust::expr_to_string(&*entry))); - return None - } - }; - if !parser.eat(&token::Eof) { - cx.span_err(parser.span, "only one string literal allowed"); - return None; - } - Some(regex) - } else { - cx.parse_sess().span_diagnostic.err("failure parsing token tree"); - None - } -} diff --git a/src/input.rs b/src/input.rs index 87bf72d72f..3d87257c01 100644 --- a/src/input.rs +++ b/src/input.rs @@ -383,15 +383,6 @@ impl Char { None | Some(_) => false, } } - - /// Converts the character to a real primitive `char`. - /// - /// If the character is absent, then `None` is returned. - pub fn as_char(self) -> Option { - // This is only used in the `regex!` macro because it expands char - // classes into `match` expressions (instead of binary search). - char::from_u32(self.0) - } } impl From for Char { diff --git a/src/lib.rs b/src/lib.rs index 075ac3f097..73d2264dd5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -641,7 +641,6 @@ mod pikevm; mod prog; mod re_builder; mod re_bytes; -mod re_plugin; mod re_set; mod re_trait; mod re_unicode; @@ -652,9 +651,9 @@ mod simd_accel; mod simd_accel; mod sparse; -/// The `internal` module exists to support the `regex!` macro and other -/// suspicious activity, such as testing different matching engines and -/// supporting the `regex-debug` CLI utility. 
+/// The `internal` module exists to support suspicious activity, such as +/// testing different matching engines and supporting the `regex-debug` CLI +/// utility. #[doc(hidden)] pub mod internal { pub use compile::Compiler; @@ -662,6 +661,4 @@ pub mod internal { pub use input::{Char, Input, CharInput, InputAt}; pub use literals::LiteralSearcher; pub use prog::{Program, Inst, EmptyLook, InstRanges}; - pub use re_plugin::Plugin; - pub use re_unicode::_Regex; } diff --git a/src/re_plugin.rs b/src/re_plugin.rs deleted file mode 100644 index afd828921b..0000000000 --- a/src/re_plugin.rs +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution and at -// http://rust-lang.org/COPYRIGHT. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -use re_trait::{RegularExpression, Slot, Locations, as_slots}; - -/// Plugin is the compiler plugin's data structure. It declare some static -/// data (like capture groups and the original regex string), but defines its -/// matching engine as a simple function. 
-#[doc(hidden)] -pub struct Plugin { - #[doc(hidden)] - pub original: &'static str, - #[doc(hidden)] - pub names: &'static &'static [Option<&'static str>], - #[doc(hidden)] - pub groups: &'static &'static [(&'static str, usize)], - #[doc(hidden)] - pub prog: fn(&mut [Slot], &str, usize) -> bool, -} - -impl Copy for Plugin {} - -impl Clone for Plugin { - fn clone(&self) -> Plugin { - *self - } -} - -impl RegularExpression for Plugin { - type Text = str; - - fn slots_len(&self) -> usize { - self.names.len() * 2 - } - - fn next_after_empty(&self, text: &str, i: usize) -> usize { - let b = match text.as_bytes().get(i) { - None => return text.len() + 1, - Some(&b) => b, - }; - let inc = if b <= 0x7F { - 1 - } else if b <= 0b110_11111 { - 2 - } else if b <= 0b1110_1111 { - 3 - } else { - 4 - }; - i + inc - } - - fn shortest_match_at(&self, text: &str, start: usize) -> Option { - self.find_at(text, start).map(|(_, e)| e) - } - - fn is_match_at(&self, text: &str, start: usize) -> bool { - (self.prog)(&mut [], text, start) - } - - fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> { - let mut slots = [None, None]; - (self.prog)(&mut slots, text, start); - match (slots[0], slots[1]) { - (Some(s), Some(e)) => Some((s, e)), - _ => None, - } - } - - fn read_captures_at<'t>( - &self, - locs: &mut Locations, - text: &'t str, - start: usize, - ) -> Option<(usize, usize)> { - let slots = as_slots(locs); - for slot in slots.iter_mut() { - *slot = None; - } - (self.prog)(slots, text, start); - match (slots[0], slots[1]) { - (Some(s), Some(e)) => Some((s, e)), - _ => None, - } - } -} diff --git a/src/re_unicode.rs b/src/re_unicode.rs index 7129dfa4c7..7fb68257bb 100644 --- a/src/re_unicode.rs +++ b/src/re_unicode.rs @@ -22,7 +22,6 @@ use error::Error; use exec::{Exec, ExecNoSyncStr}; use expand::expand_str; use re_builder::unicode::RegexBuilder; -use re_plugin::Plugin; use re_trait::{self, RegularExpression, Locations, SubCapturesPosIter}; /// Escapes all regular 
expression meta characters in `text`. @@ -135,21 +134,7 @@ impl<'t> From<Match<'t>> for &'t str { /// assert_eq!(haystack.split(&re).collect::<Vec<_>>(), vec!["a", "b", "c"]); /// ``` #[derive(Clone)] -pub struct Regex(#[doc(hidden)] pub _Regex); - -#[derive(Clone)] -#[doc(hidden)] -pub enum _Regex { - // The representation of `Regex` is exported to support the `regex!` - // syntax extension. Do not rely on it. - // - // See the comments for the `internal` module in `lib.rs` for a more - // detailed explanation for what `regex!` requires. - #[doc(hidden)] - Dynamic(Exec), - #[doc(hidden)] - Plugin(Plugin), -} +pub struct Regex(Exec); impl fmt::Display for Regex { /// Shows the original regular expression. @@ -168,7 +153,7 @@ impl fmt::Debug for Regex { #[doc(hidden)] impl From<Exec> for Regex { fn from(exec: Exec) -> Regex { - Regex(_Regex::Dynamic(exec)) + Regex(exec) } } @@ -257,16 +242,7 @@ impl Regex { /// # } /// ``` pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> Matches<'r, 't> { - match self.0 { - _Regex::Dynamic(ref exec) => { - let it = exec.searcher_str().find_iter(text); - Matches(MatchesInner::Dynamic(it)) - } - _Regex::Plugin(ref plug) => { - let it = plug.find_iter(text); - Matches(MatchesInner::Plugin(it)) - } - } + Matches(self.0.searcher_str().find_iter(text)) } /// Returns the capture groups corresponding to the leftmost-first @@ -337,7 +313,7 @@ impl Regex { self.read_captures_at(&mut locs, text, 0).map(|_| Captures { text: text, locs: locs, - named_groups: NamedGroups::from_regex(self) + named_groups: self.0.capture_name_idx().clone(), }) } @@ -370,16 +346,7 @@ impl Regex { &'r self, text: &'t str, ) -> CaptureMatches<'r, 't> { - match self.0 { - _Regex::Dynamic(ref exec) => { - let it = exec.searcher_str().captures_iter(text); - CaptureMatches(CaptureMatchesInner::Dynamic(it)) - } - _Regex::Plugin(ref plug) => { - let it = plug.captures_iter(text); - CaptureMatches(CaptureMatchesInner::Plugin(it)) - } - } +
CaptureMatches(self.0.searcher_str().captures_iter(text)) } /// Returns an iterator of substrings of `text` delimited by a match of the @@ -663,12 +630,7 @@ impl Regex { text: &str, start: usize, ) -> Option<usize> { - match self.0 { - _Regex::Dynamic(ref exec) => { - exec.searcher_str().shortest_match_at(text, start) - } - _Regex::Plugin(ref plug) => plug.shortest_match_at(text, start), - } + self.0.searcher_str().shortest_match_at(text, start) } /// Returns the same as is_match, but starts the search at the given @@ -694,16 +656,9 @@ impl Regex { text: &'t str, start: usize, ) -> Option<Match<'t>> { - match self.0 { - _Regex::Dynamic(ref exec) => { - exec.searcher_str().find_at(text, start).map(|(s, e)| { - Match::new(text, s, e) - }) - } - _Regex::Plugin(ref plug) => { - plug.find_at(text, start).map(|(s, e)| Match::new(text, s, e)) - } - } + self.0.searcher_str().find_at(text, start).map(|(s, e)| { + Match::new(text, s, e) + }) } /// Returns the same as captures, but starts the search at the given @@ -719,16 +674,10 @@ impl Regex { text: &'t str, start: usize, ) -> Option<Match<'t>> { - match self.0 { - _Regex::Dynamic(ref exec) => { - exec.searcher_str().read_captures_at(locs, text, start) - .map(|(s, e)| Match::new(text, s, e)) - } - _Regex::Plugin(ref plug) => { - plug.read_captures_at(locs, text, start) - .map(|(s, e)| Match::new(text, s, e)) - } - } + self.0 + .searcher_str() + .read_captures_at(locs, text, start) + .map(|(s, e)| Match::new(text, s, e)) } } @@ -736,40 +685,24 @@ impl Regex { impl Regex { /// Returns the original string of this regex. pub fn as_str(&self) -> &str { - match self.0 { - _Regex::Dynamic(ref exec) => &exec.regex_strings()[0], - _Regex::Plugin(ref plug) => plug.original, - } + &self.0.regex_strings()[0] } /// Returns an iterator over the capture names.
pub fn capture_names(&self) -> CaptureNames { - CaptureNames(match self.0 { - _Regex::Plugin(ref n) => _CaptureNames::Plugin(n.names.iter()), - _Regex::Dynamic(ref d) => { - _CaptureNames::Dynamic(d.capture_names().iter()) - } - }) + CaptureNames(self.0.capture_names().iter()) } /// Returns the number of captures. pub fn captures_len(&self) -> usize { - match self.0 { - _Regex::Plugin(ref n) => n.names.len(), - _Regex::Dynamic(ref d) => d.capture_names().len() - } + self.0.capture_names().len() } /// Returns an empty set of locations that can be reused in multiple calls /// to `read_captures`. #[doc(hidden)] pub fn locations(&self) -> Locations { - match self.0 { - _Regex::Dynamic(ref exec) => { - exec.searcher_str().locations() - } - _Regex::Plugin(ref plug) => plug.locations(), - } + self.0.searcher_str().locations() } } @@ -779,30 +712,20 @@ impl Regex { /// whole matched region) is always unnamed. /// /// `'r` is the lifetime of the compiled regular expression. -pub struct CaptureNames<'r>(_CaptureNames<'r>); - -enum _CaptureNames<'r> { - Plugin(::std::slice::Iter<'r, Option<&'static str>>), - Dynamic(::std::slice::Iter<'r, Option<String>>) -} +pub struct CaptureNames<'r>(::std::slice::Iter<'r, Option<String>>); impl<'r> Iterator for CaptureNames<'r> { type Item = Option<&'r str>; fn next(&mut self) -> Option<Option<&'r str>> { - match self.0 { - _CaptureNames::Plugin(ref mut i) => i.next().cloned(), - _CaptureNames::Dynamic(ref mut i) => { - i.next().as_ref().map(|o| o.as_ref().map(|s| s.as_ref())) - } - } + self.0 + .next() + .as_ref() + .map(|slot| slot.as_ref().map(|name| name.as_ref())) } fn size_hint(&self) -> (usize, Option<usize>) { - match self.0 { - _CaptureNames::Plugin(ref i) => i.size_hint(), - _CaptureNames::Dynamic(ref i) => i.size_hint(), - } + self.0.size_hint() } } @@ -819,7 +742,7 @@ impl<'r, 't> Iterator for Split<'r, 't> { type Item = &'t str; fn next(&mut self) -> Option<&'t str> { - let text = self.finder.text(); + let text = self.finder.0.text(); match self.finder.next() {
None => { if self.last >= text.len() { @@ -859,7 +782,7 @@ impl<'r, 't> Iterator for SplitN<'r, 't> { } self.n -= 1; if self.n == 0 { - let text = self.splits.finder.text(); + let text = self.splits.finder.0.text(); Some(&text[self.splits.last..]) } else { self.splits.next() @@ -867,59 +790,6 @@ impl<'r, 't> Iterator for SplitN<'r, 't> { } } -enum NamedGroups { - Plugin(&'static [(&'static str, usize)]), - Dynamic(Arc<HashMap<String, usize>>), -} - -impl NamedGroups { - fn from_regex(regex: &Regex) -> NamedGroups { - match regex.0 { - _Regex::Plugin(ref plug) => NamedGroups::Plugin(plug.groups), - _Regex::Dynamic(ref exec) => { - NamedGroups::Dynamic(exec.capture_name_idx().clone()) - } - } - } - - fn pos(&self, name: &str) -> Option<usize> { - match *self { - NamedGroups::Plugin(groups) => { - groups.binary_search_by(|&(n, _)| n.cmp(name)) - .ok().map(|i| groups[i].1) - }, - NamedGroups::Dynamic(ref groups) => { - groups.get(name).cloned() - }, - } - } - - fn iter(& self) -> NamedGroupsIter { - match *self { - NamedGroups::Plugin(g) => NamedGroupsIter::Plugin(g.iter()), - NamedGroups::Dynamic(ref g) => NamedGroupsIter::Dynamic(g.iter()), - } - } -} - -enum NamedGroupsIter<'n> { - Plugin(::std::slice::Iter<'static, (&'static str, usize)>), - Dynamic(::std::collections::hash_map::Iter<'n, String, usize>), -} - -impl<'n> Iterator for NamedGroupsIter<'n> { - type Item = (&'n str, usize); - - fn next(&mut self) -> Option<Self::Item> { - match *self { - NamedGroupsIter::Plugin(ref mut it) => it.next().cloned(), - NamedGroupsIter::Dynamic(ref mut it) => { - it.next().map(|(s, i)| (s.as_ref(), *i)) - } - } - } -} - /// Captures represents a group of captured strings for a single match. /// /// The 0th capture always corresponds to the entire match.
Each subsequent @@ -934,7 +804,7 @@ impl<'n> Iterator for NamedGroupsIter<'n> { pub struct Captures<'t> { text: &'t str, locs: Locations, - named_groups: NamedGroups, + named_groups: Arc<HashMap<String, usize>>, } impl<'t> Captures<'t> { @@ -964,7 +834,7 @@ impl<'t> Captures<'t> { /// Returns the match for the capture group named `name`. If `name` isn't a /// valid capture group or didn't match anything, then `None` is returned. pub fn name(&self, name: &str) -> Option<Match<'t>> { - self.named_groups.pos(name).and_then(|i| self.get(i)) + self.named_groups.get(name).and_then(|&i| self.get(i)) } /// An iterator that yields all capturing matches in the order in which @@ -1021,7 +891,7 @@ impl<'c, 't> fmt::Debug for CapturesDebug<'c, 't> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { // We'd like to show something nice here, even if it means an // allocation to build a reverse index. - let slot_to_name: HashMap<usize, &str> = + let slot_to_name: HashMap<&usize, &String> = self.0.named_groups.iter().map(|(a, b)| (b, a)).collect(); let mut map = f.debug_map(); for (slot, m) in self.0.locs.iter().enumerate() { @@ -1107,34 +977,17 @@ impl<'c, 't> Iterator for SubCaptureMatches<'c, 't> { /// /// `'r` is the lifetime of the compiled regular expression and `'t` is the /// lifetime of the matched string.
-pub struct CaptureMatches<'r, 't>(CaptureMatchesInner<'r, 't>); - -enum CaptureMatchesInner<'r, 't> { - Dynamic(re_trait::CaptureMatches<'t, ExecNoSyncStr<'r>>), - Plugin(re_trait::CaptureMatches<'t, Plugin>), -} +pub struct CaptureMatches<'r, 't>(re_trait::CaptureMatches<'t, ExecNoSyncStr<'r>>); impl<'r, 't> Iterator for CaptureMatches<'r, 't> { type Item = Captures<'t>; fn next(&mut self) -> Option<Captures<'t>> { - match self.0 { - CaptureMatchesInner::Dynamic(ref mut it) => { - let named = it.regex().capture_name_idx().clone(); - it.next().map(|locs| Captures { - text: it.text(), - locs: locs, - named_groups: NamedGroups::Dynamic(named), - }) - } - CaptureMatchesInner::Plugin(ref mut it) => { - it.next().map(|locs| Captures { - text: it.text(), - locs: locs, - named_groups: NamedGroups::Plugin(it.regex().groups), - }) - } - } + self.0.next().map(|locs| Captures { + text: self.0.text(), + locs: locs, + named_groups: self.0.regex().capture_name_idx().clone(), + }) } } @@ -1145,35 +998,14 @@ impl<'r, 't> Iterator for CaptureMatches<'r, 't> { /// /// `'r` is the lifetime of the compiled regular expression and `'t` is the /// lifetime of the matched string.
-pub struct Matches<'r, 't>(MatchesInner<'r, 't>); - -enum MatchesInner<'r, 't> { - Dynamic(re_trait::Matches<'t, ExecNoSyncStr<'r>>), - Plugin(re_trait::Matches<'t, Plugin>), -} - -impl<'r, 't> Matches<'r, 't> { - fn text(&self) -> &'t str { - match self.0 { - MatchesInner::Dynamic(ref it) => it.text(), - MatchesInner::Plugin(ref it) => it.text(), - } - } -} +pub struct Matches<'r, 't>(re_trait::Matches<'t, ExecNoSyncStr<'r>>); impl<'r, 't> Iterator for Matches<'r, 't> { type Item = Match<'t>; fn next(&mut self) -> Option<Match<'t>> { - let text = self.text(); - match self.0 { - MatchesInner::Dynamic(ref mut it) => { - it.next().map(|(s, e)| Match::new(text, s, e)) - } - MatchesInner::Plugin(ref mut it) => { - it.next().map(|(s, e)| Match::new(text, s, e)) - } - } + let text = self.0.text(); + self.0.next().map(|(s, e)| Match::new(text, s, e)) } } diff --git a/tests/test_plugin.rs b/tests/test_plugin.rs deleted file mode 100644 index b4bc973433..0000000000 --- a/tests/test_plugin.rs +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2014 The Rust Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution and at -// http://rust-lang.org/COPYRIGHT. -// -// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or -// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license -// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -#![feature(plugin, test)] -#![plugin(regex_macros)] - -extern crate rand; -extern crate regex; -extern crate test; - -// Must come before other module definitions. -include!("macros_str.rs"); -include!("macros.rs"); - -mod api; -mod api_str; -mod crazy; -mod flags; -mod fowler; -mod multiline; -mod plugin; -mod replace; -mod suffix_reverse; -mod unicode;