diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml index a3e1189..4166bd1 100644 --- a/.github/FUNDING.yml +++ b/.github/FUNDING.yml @@ -1 +1 @@ -github: [raphlinus, robinst] +github: [raphlinus, robinst, keith-hall] diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4cdac1a..8506698 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -17,6 +17,8 @@ jobs: - uses: dtolnay/rust-toolchain@stable - run: cargo check - run: cargo check --no-default-features + - run: cargo check --examples + - run: cargo check --benches test: name: test @@ -47,6 +49,7 @@ jobs: run: cp Cargo.lock.msrv Cargo.lock - run: cargo test + - run: cargo test --no-default-features example: name: example @@ -55,6 +58,7 @@ jobs: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@stable - run: cargo run --example toy trace '\d*' '1122 33' + - run: cargo test --examples fmt: name: rustfmt @@ -73,13 +77,13 @@ jobs: name: coverage runs-on: ubuntu-latest container: - image: xd009642/tarpaulin:0.31.2 + image: xd009642/tarpaulin:0.31.5 options: --security-opt seccomp=unconfined steps: - uses: actions/checkout@v4 - name: Generate code coverage - run: cargo tarpaulin --verbose --all-features --workspace --timeout 120 --out Xml + run: cargo tarpaulin --verbose --all-features --workspace --timeout 120 --out Xml --engine llvm - name: Upload to codecov.io uses: codecov/codecov-action@v4 diff --git a/CHANGELOG.md b/CHANGELOG.md index 68b0c57..cb3974d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,10 +6,34 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). This project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html), with the exception that 0.x versions can break between minor versions. +## [0.16.0] - 2025-08-01 +### Added +- Add an optimization step after the pattern is parsed but before it is analyzed. 
+ Currently it only optimizes one specific use-case - where the expression is easy except + for a trailing positive lookahead whose contents are also easy. The optimization is to delegate to the regex crate instead of using the backtracking VM. (#171) +### Changed +- Patterns which are anchored to the start of the text (i.e. with `^` when not in multiline mode) should now fail faster when there is no match, because `fancy-regex` no longer tries to match at other positions. (#170) +- The `CompileError` for an invalid (numbered) backref has been updated to mention which backref was invalid (#170) +- Removed dependency on derivative (#169) +### Fixed +- Fixed a bug whereby sometimes a capture group containing a backref to itself would cause a compile error, when it is valid - this fixes a few Oniguruma test cases (#170) + +## [0.15.0] - 2025-07-06 +### Added +- Support `\Z` - anchor to the end of the text before any trailing newlines. (#148) +- Support `\O` - any character including newlines. (#158) +- The parser can now parse subroutine calls and relative backreferences (but execution is still unsupported). This is preparation for future work. Some new error variants have been added for features which can be parsed but are still otherwise unsupported. +- Backreferences can now be case insensitive. (#160) +- `RegexBuilder`: Add options for `multi_line`, `ignore_whitespace`, `dot_matches_new_line` (#165) +### Fixed +- Fix infinite loop when backtracking limit is hit (#153) +- Fix `RegexBuilder.case_insensitive` not always applying when it should. (#163) +- The `toy` example has had various bugfixes, and unit tests added. 
(#152, #159) + ## [0.14.0] - 2024-10-24 ### Added - Add `split`, `splitn` methods to `Regex` to split a string into substrings (#140) -- Add `case_insensitive` method to `RegexBuilder` to force case-insensitive mode +- Add `case_insensitive` method to `RegexBuilder` to force case-insensitive mode (#132) ### Changed - Bump bit-set dependency to 0.8 (#139) @@ -22,6 +46,10 @@ with the exception that 0.x versions can break between minor versions. ### Changed - Switch from regex crate to regex-automata and regex-syntax (lower level APIs) to simplify internals (#121) +- **Note:** Due to the above change, more backtracking is done in fancy-regex itself + instead of regex-automata, and you might get a `BacktrackLimitExceeded` with + some patterns that you didn't get before. You can increase the backtrack limit + using `RegexBuilder::backtrack_limit` to help with that. - Allow escaping some letters in character classes, e.g. `[\A]` used to error but now matches the same as `[A]` (for compatibility with Oniguruma) - MSRV (minimum supported Rust version) is now 1.66.1 (from 1.61.0) @@ -183,6 +211,8 @@ with the exception that 0.x versions can break between minor versions. 
- Initial release +[0.16.0]: https://github.com/fancy-regex/fancy-regex/compare/0.15.0...0.16.0 +[0.15.0]: https://github.com/fancy-regex/fancy-regex/compare/0.14.0...0.15.0 [0.14.0]: https://github.com/fancy-regex/fancy-regex/compare/0.13.0...0.14.0 [0.13.0]: https://github.com/fancy-regex/fancy-regex/compare/0.12.0...0.13.0 [0.12.0]: https://github.com/fancy-regex/fancy-regex/compare/0.11.0...0.12.0 diff --git a/Cargo.toml b/Cargo.toml index 7184e40..31ea018 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,8 +1,7 @@ [package] name = "fancy-regex" -# remember to update html_root_url -version = "0.14.0" -authors = ["Raph Levien ", "Robin Stocker "] +version = "0.16.0" +authors = ["Raph Levien ", "Robin Stocker ", "Keith Hall "] edition = "2018" license = "MIT" description = "An implementation of regexes, supporting a relatively rich set of features, including backreferences and look-around." diff --git a/PERFORMANCE.md b/PERFORMANCE.md index 07f0773..adb5c84 100644 --- a/PERFORMANCE.md +++ b/PERFORMANCE.md @@ -12,7 +12,9 @@ For a good explanation of that, read Let's look at the regex from the README again: +```regex (a|b|ab)*bc +``` And the input text: @@ -47,15 +49,28 @@ delegate to the regex crate which matches it in linear runtime. Let's look at another regex, one that makes use of a "fancy" look-ahead: +```regex (a|b|ab)*(?=c) - +``` When fancy-regex matches it against this input: abababababababababababababababababababababababababababab -It's slow! The reason is that `(?=c)` is not supported by the regex -crate, so we need to handle it with backtracking. And because -`(a|b|ab)*` is before it, we need to do it with backtracking as well. +It's still fast! 
The reason is that although `(?=c)` is not supported +by the regex crate, fancy-regex detects the trailing positive lookahead +and is able to essentially rewrite the pattern into + +```regex + ((a|b|ab)*)c +``` + +and thus delegate the whole thing to the regex crate, and fix up the +captures/match boundaries after a match is found. + +If, however, the lookahead didn't come at the end of the pattern, +it would be slow! The reason is that fancy-regex would need to handle +the lookahead with backtracking. And because `(a|b|ab)*` is before it, +that needs to be done with backtracking as well. Oniguruma doesn't have a problem with this particular case because its optimization saves it again: It checks if there's a `c` in the input @@ -70,8 +85,10 @@ inner part of the look-ahead can be delegated to regex entirely. ### Summary -* If the regex doesn't use fancy features, fancy-regex should have - linear runtime compared to Oniguruma's exponential worst-case. +* If the regex doesn't use fancy features, or the features used in the + pattern can be identified as being syntactic sugar and slightly + rewritten, fancy-regex should have linear runtime compared to + Oniguruma's exponential worst-case. * Even if the regex doesn't use any fancy features, Oniguruma can be faster because it is a mature and highly optimized engine. * With fancy features, Oniguruma can be faster because of optimizations. diff --git a/README.md b/README.md index ccbedcd..dbbce4d 100644 --- a/README.md +++ b/README.md @@ -134,7 +134,8 @@ creating the excellent regex crate. ## Authors -The main author is Raph Levien, with many contributions from Robin Stocker. +The main author is Raph Levien, with many contributions from Robin Stocker +and Keith Hall. 
## Contributions diff --git a/benches/bench.rs b/benches/bench.rs index c6e7ea0..bb29e4e 100644 --- a/benches/bench.rs +++ b/benches/bench.rs @@ -55,13 +55,15 @@ fn parse_misc(c: &mut Criterion) { fn analyze_literal_re(c: &mut Criterion) { let re = "^\\\\([!-/:-@\\[-`\\{-~aftnrv]|[0-7]{1,3}|x[0-9a-fA-F]{2}|x\\{[0-9a-fA-F]{1,6}\\})"; let tree = Expr::parse_tree(re).unwrap(); - c.bench_function("analyze_literal_re", |b| b.iter(|| analyze(&tree).unwrap())); + c.bench_function("analyze_literal_re", |b| { + b.iter(|| analyze(&tree, 1).unwrap()) + }); } fn run_backtrack(c: &mut Criterion) { let tree = Expr::parse_tree("^.*?(([ab]+)\\1b)").unwrap(); - let a = analyze(&tree).unwrap(); - let p = compile(&a).unwrap(); + let a = analyze(&tree, 0).unwrap(); + let p = compile(&a, true).unwrap(); c.bench_function("run_backtrack", |b| { b.iter(|| { let result = run_default(&p, "babab", 0).unwrap(); @@ -75,8 +77,8 @@ fn run_backtrack(c: &mut Criterion) { // implementations, see README.md: fn run_tricky(c: &mut Criterion) { let tree = Expr::parse_tree("(a|b|ab)*bc").unwrap(); - let a = analyze(&tree).unwrap(); - let p = compile(&a).unwrap(); + let a = analyze(&tree, 1).unwrap(); + let p = compile(&a, false).unwrap(); let mut s = String::new(); for _ in 0..28 { s.push_str("ab"); @@ -86,9 +88,9 @@ fn run_tricky(c: &mut Criterion) { } fn run_backtrack_limit(c: &mut Criterion) { - let tree = Expr::parse_tree("(?i)(a|b|ab)*(?=c)").unwrap(); - let a = analyze(&tree).unwrap(); - let p = compile(&a).unwrap(); + let tree = Expr::parse_tree("(?i)(a|b|ab)*(?>c)").unwrap(); + let a = analyze(&tree, 1).unwrap(); + let p = compile(&a, false).unwrap(); let s = "abababababababababababababababababababababababababababab"; c.bench_function("run_backtrack_limit", |b| { b.iter(|| run_default(&p, &s, 0).unwrap_err()) diff --git a/examples/toy.rs b/examples/toy.rs index 7dacff5..c927bbe 100644 --- a/examples/toy.rs +++ b/examples/toy.rs @@ -20,30 +20,41 @@ //! 
A simple test app for exercising and debugging the regex engine. -use fancy_regex::internal::{analyze, compile, run_trace, Insn, Prog}; +use fancy_regex::internal::{ + analyze, can_compile_as_anchored, compile, optimize, run_trace, Insn, Prog, +}; use fancy_regex::*; use std::env; +use std::fmt::{Display, Formatter, Result}; +use std::io; +use std::io::Write; use std::str::FromStr; fn main() { let mut args = env::args().skip(1); if let Some(cmd) = args.next() { if cmd == "parse" { - if let Some(re) = args.next() { - let e = Expr::parse_tree(&re); - println!("{:#?}", e); - } + let re = args.next().expect("expected regexp argument"); + let e = Expr::parse_tree(&re); + println!("{:#?}", e); + } else if cmd == "optimize" { + let re = args.next().expect("expected regexp argument"); + let mut e = Expr::parse_tree(&re).expect("expected regexp to be parsed successfully"); + let explicit_capture_group0 = optimize(&mut e); + println!("explicit capture group 0: {:?}", explicit_capture_group0); + println!("{:#?}", e); } else if cmd == "analyze" { - if let Some(re) = args.next() { - let tree = Expr::parse_tree(&re).unwrap(); - let a = analyze(&tree); - println!("{:#?}", a); - } + let re = args.next().expect("expected regexp argument"); + let stdout = io::stdout(); + let mut handle = stdout.lock(); + write!(&mut handle, "{}", AnalyzeFormatterWrapper { regex: &re }) + .expect("error analyzing regexp"); } else if cmd == "compile" { - if let Some(re) = args.next() { - let r = Regex::new(&re).unwrap(); - r.debug_print(); - } + let re = args.next().expect("expected regexp argument"); + let stdout = io::stdout(); + let mut handle = stdout.lock(); + write!(&mut handle, "{}", CompileFormatterWrapper { regex: &re }) + .expect("error compiling regexp"); } else if cmd == "run" { let re = args.next().expect("expected regexp argument"); let r = Regex::new(&re).unwrap(); @@ -70,57 +81,244 @@ fn main() { println!("no match"); } } else if cmd == "trace" { - if let Some(re) = args.next() { - let 
prog = prog(&re); - if let Some(s) = args.next() { - run_trace(&prog, &s, 0).unwrap(); - } - } + let re = args.next().expect("expected regexp argument"); + let prog = prog(&re, 1); + let text = args.next().expect("expected text argument"); + run_trace(&prog, &text, 0).unwrap(); } else if cmd == "trace-inner" { - if let Some(re) = args.next() { - let tree = Expr::parse_tree(&re).unwrap(); - let a = analyze(&tree).unwrap(); - let p = compile(&a).unwrap(); - if let Some(s) = args.next() { - run_trace(&p, &s, 0).unwrap(); - } - } + let re = args.next().expect("expected regexp argument"); + let tree = Expr::parse_tree(&re).unwrap(); + let text = args.next().expect("expected text argument"); + let a = analyze(&tree, 1).unwrap(); + let p = compile(&a, true).unwrap(); + run_trace(&p, &text, 0).unwrap(); } else if cmd == "graph" { let re = args.next().expect("expected regexp argument"); - graph(&re); + graph(&re, &mut io::stdout()).expect("error making graph"); } else { println!("commands: parse|analyze|compile|graph , run|trace|trace-inner "); } } } -fn graph(re: &str) { - let prog = prog(re); - println!("digraph G {{"); +fn graph(re: &str, writer: &mut dyn std::io::Write) -> std::io::Result<()> { + let prog = prog(re, 1); + write!(writer, "digraph G {{\n")?; for (i, insn) in prog.body.iter().enumerate() { let label = format!("{:?}", insn) .replace(r#"\"#, r#"\\"#) .replace(r#"""#, r#"\""#); - println!(r#"{:3} [label="{}: {}"];"#, i, i, label); + write!(writer, r#"{:3} [label="{}: {}"];{}"#, i, i, label, "\n")?; match *insn { Insn::Split(a, b) => { - println!("{:3} -> {};", i, a); - println!("{:3} -> {};", i, b); + write!(writer, "{:3} -> {};\n", i, a)?; + write!(writer, "{:3} -> {};\n", i, b)?; } Insn::Jmp(target) => { - println!("{:3} -> {};", i, target); + write!(writer, "{:3} -> {};\n", i, target)?; } Insn::End => {} _ => { - println!("{:3} -> {};", i, i + 1); + write!(writer, "{:3} -> {};\n", i, i + 1)?; } } } - println!("}}"); + write!(writer, "}}\n")?; + Ok(()) } 
-fn prog(re: &str) -> Prog { - let tree = Expr::parse_tree(re).expect("Expected parsing regex to work"); - let result = analyze(&tree).expect("Expected analyze to succeed"); - compile(&result).expect("Expected compile to succeed") +fn show_analysis(re: &str, writer: &mut Formatter<'_>) -> Result { + let mut tree = Expr::parse_tree(&re).unwrap(); + optimize(&mut tree); + let a = analyze(&tree, 1); + write!(writer, "{:#?}\n", a) +} + +fn show_compiled_program(re: &str, writer: &mut Formatter<'_>) -> Result { + let r = Regex::new(&re).unwrap(); + r.debug_print(writer) +} + +fn prog(re: &str, start_group: usize) -> Prog { + // one thing to note here is that we want the prog, but in lib.rs, + // constructing a regex might not produce a prog - it may be wrapped Regex instead, + // which means that "toy" behaves differently to tests etc. + let mut tree = Expr::parse_tree(re).expect("Expected parsing regex to work"); + optimize(&mut tree); + let result = analyze(&tree, start_group).expect("Expected analyze to succeed"); + compile(&result, can_compile_as_anchored(&tree.expr)).expect("Expected compile to succeed") +} + +struct AnalyzeFormatterWrapper<'a> { + regex: &'a str, +} + +impl<'a> Display for AnalyzeFormatterWrapper<'a> { + fn fmt(&self, f: &mut Formatter) -> Result { + show_analysis(self.regex, f) + } +} + +struct CompileFormatterWrapper<'a> { + regex: &'a str, +} + +impl<'a> Display for CompileFormatterWrapper<'a> { + fn fmt(&self, f: &mut Formatter) -> Result { + show_compiled_program(self.regex, f) + } +} + +#[cfg(test)] +mod tests { + use crate::Write; + + #[test] + fn test_simple_graph() { + assert_graph( + "^a+bc?", + "\ +digraph G { + 0 [label=\"0: Save(0)\"]; + 0 -> 1; + 1 [label=\"1: Delegate(Delegate { pattern: \\\"^a+bc?\\\", start_group: 1, end_group: 1 })\"]; + 1 -> 2; + 2 [label=\"2: Save(1)\"]; + 2 -> 3; + 3 [label=\"3: End\"]; +} +", + ); + } + + #[test] + fn test_backref_graph() { + assert_graph( + r"a+(?b*)(?=c)\k", + "\ +digraph G { + 0 
[label=\"0: Split(3, 1)\"]; + 0 -> 3; + 0 -> 1; + 1 [label=\"1: Any\"]; + 1 -> 2; + 2 [label=\"2: Jmp(0)\"]; + 2 -> 0; + 3 [label=\"3: Save(0)\"]; + 3 -> 4; + 4 [label=\"4: Lit(\\\"a\\\")\"]; + 4 -> 5; + 5 [label=\"5: Split(4, 6)\"]; + 5 -> 4; + 5 -> 6; + 6 [label=\"6: Save(2)\"]; + 6 -> 7; + 7 [label=\"7: Split(8, 10)\"]; + 7 -> 8; + 7 -> 10; + 8 [label=\"8: Lit(\\\"b\\\")\"]; + 8 -> 9; + 9 [label=\"9: Jmp(7)\"]; + 9 -> 7; + 10 [label=\"10: Save(3)\"]; + 10 -> 11; + 11 [label=\"11: Save(4)\"]; + 11 -> 12; + 12 [label=\"12: Lit(\\\"c\\\")\"]; + 12 -> 13; + 13 [label=\"13: Restore(4)\"]; + 13 -> 14; + 14 [label=\"14: Backref { slot: 2, casei: false }\"]; + 14 -> 15; + 15 [label=\"15: Save(1)\"]; + 15 -> 16; + 16 [label=\"16: End\"]; +} +", + ); + } + + #[test] + fn test_simple_analysis() { + assert_analysis_succeeds("a+bc?"); + } + + #[test] + fn test_backref_analysis() { + assert_analysis_succeeds("a+(?b*)(?=c)\\k"); + } + + #[test] + fn test_compilation_fancy_debug_output() { + let expected = " ".to_owned() + + "\ + 0: Split(3, 1) + 1: Any + 2: Jmp(0) + 3: Save(0) + 4: Lit(\"a\") + 5: Split(4, 6) + 6: Save(2) + 7: Split(8, 10) + 8: Lit(\"b\") + 9: Jmp(7) + 10: Save(3) + 11: Save(4) + 12: Lit(\"c\") + 13: Restore(4) + 14: Backref { slot: 2, casei: false } + 15: Save(1) + 16: End +"; + + assert_compiled_prog("a+(?b*)(?=c)\\k", &expected); + } + + #[test] + fn test_compilation_wrapped_debug_output() { + let expected = "wrapped Regex \"a+bc?\", explicit_capture_group_0: false"; + + assert_compiled_prog("a+bc?", &expected); + } + + #[test] + fn test_compilation_wrapped_debug_output_explict_capture_group_zero() { + let expected = "wrapped Regex \"(a+b)c\", explicit_capture_group_0: true"; + + assert_compiled_prog("a+b(?=c)", &expected); + } + + #[test] + fn test_compilation_wrapped_debug_output_explict_capture_group_zero_with_non_capture_group() { + let expected = "wrapped Regex \"(a+b)(?:c|d)\", explicit_capture_group_0: true"; + + assert_compiled_prog("a+b(?=c|d)", 
&expected); + } + + fn assert_graph(re: &str, expected: &str) { + use crate::graph; + let mut buf = Vec::new(); + graph(&re, &mut buf).expect("error compiling regexp and building graph"); + let output = String::from_utf8(buf).expect("string not utf8"); + assert_eq!(&output, &expected); + } + + fn assert_compiled_prog(re: &str, expected: &str) { + use crate::CompileFormatterWrapper; + let mut buf = Vec::new(); + write!(&mut buf, "{}", CompileFormatterWrapper { regex: &re }) + .expect("error compiling regexp"); + let output = String::from_utf8(buf).expect("string not utf8"); + assert_eq!(&output, &expected); + } + + fn assert_analysis_succeeds(re: &str) { + use crate::AnalyzeFormatterWrapper; + let mut buf = Vec::new(); + write!(&mut buf, "{}", AnalyzeFormatterWrapper { regex: &re }) + .expect("error analyzing regexp"); + let output = String::from_utf8(buf).expect("string not utf8"); + println!("{}", output); + assert!(&output.starts_with("Ok(\n Info {")); + } } diff --git a/src/analyze.rs b/src/analyze.rs index c4f700b..e2ce9b1 100644 --- a/src/analyze.rs +++ b/src/analyze.rs @@ -26,6 +26,7 @@ use core::cmp::min; use bit_set::BitSet; +use crate::alloc::string::ToString; use crate::parse::ExprTree; use crate::{CompileError, Error, Expr, Result}; @@ -149,9 +150,9 @@ impl<'a> Analyzer<'a> { min_size = size; const_size = true; } - Expr::Backref(group) => { - if group >= self.group_ix { - return Err(Error::CompileError(CompileError::InvalidBackref)); + Expr::Backref { group, .. 
} => { + if group == 0 { + return Err(Error::CompileError(CompileError::InvalidBackref(group))); } hard = true; } @@ -170,10 +171,7 @@ impl<'a> Analyzer<'a> { hard = true; const_size = true; } - Expr::BackrefExistsCondition(group) => { - if group >= self.group_ix { - return Err(Error::CompileError(CompileError::InvalidBackref)); - } + Expr::BackrefExistsCondition(_) => { hard = true; const_size = true; } @@ -200,6 +198,21 @@ impl<'a> Analyzer<'a> { children.push(child_info_truth); children.push(child_info_false); } + Expr::SubroutineCall(_) => { + return Err(Error::CompileError(CompileError::FeatureNotYetSupported( + "Subroutine Call".to_string(), + ))); + } + Expr::UnresolvedNamedSubroutineCall { ref name, ix } => { + return Err(Error::CompileError( + CompileError::SubroutineCallTargetNotFound(name.to_string(), ix), + )); + } + Expr::BackrefWithRelativeRecursionLevel { .. } => { + return Err(Error::CompileError(CompileError::FeatureNotYetSupported( + "Backref at recursion level".to_string(), + ))); + } }; Ok(Info { @@ -222,20 +235,42 @@ fn literal_const_size(_: &str, _: bool) -> bool { } /// Analyze the parsed expression to determine whether it requires fancy features. -pub fn analyze<'a>(tree: &'a ExprTree) -> Result> { +pub fn analyze<'a>(tree: &'a ExprTree, start_group: usize) -> Result> { let mut analyzer = Analyzer { backrefs: &tree.backrefs, - group_ix: 0, + group_ix: start_group, }; - analyzer.visit(&tree.expr) + let analyzed = analyzer.visit(&tree.expr); + if analyzer.backrefs.len() > analyzer.group_ix { + return Err(Error::CompileError(CompileError::InvalidBackref( + analyzer.backrefs.len() - 1, + ))); + } + analyzed +} + +/// Determine if the expression will always only ever match at position 0. +/// Note that false negatives are possible - it can return false even if it could be anchored. +/// This should therefore only be treated as an optimization. 
+pub fn can_compile_as_anchored(root_expr: &Expr) -> bool { + use crate::Assertion; + + match root_expr { + Expr::Concat(ref children) => match children[0] { + Expr::Assertion(ref assertion) => *assertion == Assertion::StartText, + _ => false, + }, + Expr::Assertion(ref assertion) => *assertion == Assertion::StartText, + _ => false, + } } #[cfg(test)] mod tests { use super::analyze; // use super::literal_const_size; - use crate::Expr; + use crate::{can_compile_as_anchored, CompileError, Error, Expr}; // #[test] // fn case_folding_safe() { @@ -252,31 +287,87 @@ mod tests { // } #[test] - fn invalid_backref_1() { - assert!(analyze(&Expr::parse_tree(".\\0").unwrap()).is_err()); + fn invalid_backref_self_zero() { + assert!(analyze(&Expr::parse_tree(".\\0").unwrap(), 1).is_err()); } #[test] - fn invalid_backref_2() { - assert!(analyze(&Expr::parse_tree("(.\\1)").unwrap()).is_err()); + fn allow_analysis_of_self_backref() { + // even if it will never match, see issue 103 + assert!(!analyze(&Expr::parse_tree("(.\\1)").unwrap(), 1).is_err()); + assert!(!analyze(&Expr::parse_tree(r"(([ab]+)\1b)").unwrap(), 1).is_err()); + // in the following scenario it can match + assert!(!analyze(&Expr::parse_tree(r"(([ab]+?)(?(1)\1| )c)+").unwrap(), 1).is_err()); } #[test] - fn invalid_backref_3() { - assert!(analyze(&Expr::parse_tree("\\1(.)").unwrap()).is_err()); + fn allow_backref_even_when_capture_group_occurs_after_backref() { + assert!(!analyze(&Expr::parse_tree("\\1(.)").unwrap(), 1).is_err()); + } + + #[test] + fn valid_backref_occurs_after_capture_group() { + assert!(!analyze(&Expr::parse_tree("(.)\\1").unwrap(), 1).is_err()); + } + + #[test] + fn feature_not_yet_supported() { + let tree = &Expr::parse_tree(r"(a)\g<1>").unwrap(); + let result = analyze(tree, 1); + assert!(result.is_err()); + assert!(matches!( + result.err(), + Some(Error::CompileError(CompileError::FeatureNotYetSupported(_))) + )); + + let tree = &Expr::parse_tree(r"(a)\k<1-0>").unwrap(); + let result = 
analyze(tree, 1); + assert!(result.is_err()); + assert!(matches!( + result.err(), + Some(Error::CompileError(CompileError::FeatureNotYetSupported(_))) + )); + } + + #[test] + fn subroutine_call_undefined() { + let tree = &Expr::parse_tree(r"\g(?a)").unwrap(); + let result = analyze(tree, 1); + assert!(result.is_err()); + assert!(matches!( + result.err(), + Some(Error::CompileError( + CompileError::SubroutineCallTargetNotFound(_, _) + )) + )); } #[test] fn is_literal() { let tree = Expr::parse_tree("abc").unwrap(); - let info = analyze(&tree).unwrap(); + let info = analyze(&tree, 1).unwrap(); assert_eq!(info.is_literal(), true); } #[test] fn is_literal_with_repeat() { let tree = Expr::parse_tree("abc*").unwrap(); - let info = analyze(&tree).unwrap(); + let info = analyze(&tree, 1).unwrap(); assert_eq!(info.is_literal(), false); } + + #[test] + fn anchored_for_starttext_assertions() { + let tree = Expr::parse_tree(r"^(\w+)\1").unwrap(); + assert_eq!(can_compile_as_anchored(&tree.expr), true); + + let tree = Expr::parse_tree(r"^").unwrap(); + assert_eq!(can_compile_as_anchored(&tree.expr), true); + } + + #[test] + fn not_anchored_for_startline_assertions() { + let tree = Expr::parse_tree(r"(?m)^(\w+)\1").unwrap(); + assert_eq!(can_compile_as_anchored(&tree.expr), false); + } } diff --git a/src/compile.rs b/src/compile.rs index bc63293..bf36f2a 100644 --- a/src/compile.rs +++ b/src/compile.rs @@ -20,7 +20,7 @@ //! Compilation of regexes to VM. 
-use alloc::string::String; +use alloc::string::{String, ToString}; use alloc::vec::Vec; use core::usize; use regex_automata::meta::Regex as RaRegex; @@ -29,7 +29,7 @@ use regex_automata::meta::{Builder as RaBuilder, Config as RaConfig}; use std::{collections::BTreeMap, sync::RwLock}; use crate::analyze::Info; -use crate::vm::{Insn, Prog}; +use crate::vm::{Delegate, Insn, Prog}; use crate::LookAround::*; use crate::{CompileError, Error, Expr, LookAround, RegexOptions, Result}; @@ -145,8 +145,11 @@ impl Compiler { Expr::LookAround(_, la) => { self.compile_lookaround(info, la)?; } - Expr::Backref(group) => { - self.b.add(Insn::Backref(group * 2)); + Expr::Backref { group, casei } => { + self.b.add(Insn::Backref { + slot: group * 2, + casei, + }); } Expr::BackrefExistsCondition(group) => { self.b.add(Insn::BackrefExistsCondition(group)); @@ -174,6 +177,13 @@ impl Compiler { Expr::Conditional { .. } => { self.compile_conditional(|compiler, i| compiler.visit(&info.children[i], hard))?; } + Expr::SubroutineCall(_) => { + return Err(Error::CompileError(CompileError::FeatureNotYetSupported( + "Subroutine Call".to_string(), + ))); + } + Expr::UnresolvedNamedSubroutineCall { .. } => unreachable!(), + Expr::BackrefWithRelativeRecursionLevel { .. } => unreachable!(), } Ok(()) } @@ -514,9 +524,26 @@ pub(crate) fn compile_inner(inner_re: &str, options: &RegexOptions) -> Result) -> Result { +pub fn compile(info: &Info<'_>, anchored: bool) -> Result { let mut c = Compiler::new(info.end_group); + if !anchored { + // add instructions as if \O*? 
was used at the start of the expression + // so that we bump the haystack index by one when failing to match at the current position + let current_pc = c.b.pc(); + // we are adding 3 instructions, so the current program counter plus 3 gives us the first real instruction + c.b.add(Insn::Split(current_pc + 3, current_pc + 1)); + c.b.add(Insn::Any); + c.b.add(Insn::Jmp(current_pc)); + } + if info.start_group == 1 { + // add implicit capture group 0 begin + c.b.add(Insn::Save(0)); + } c.visit(info, false)?; + if info.start_group == 1 { + // add implicit capture group 0 end + c.b.add(Insn::Save(1)); + } c.b.add(Insn::End); Ok(c.b.build()) } @@ -571,11 +598,12 @@ impl DelegateBuilder { let compiled = compile_inner(&self.re, options)?; - Ok(Insn::Delegate { + Ok(Insn::Delegate(Delegate { inner: compiled, + pattern: self.re.clone(), start_group, end_group, - }) + })) } } @@ -608,8 +636,10 @@ mod tests { ]), backrefs: BitSet::new(), named_groups: Default::default(), + contains_subroutines: false, + self_recursive: false, }; - let info = analyze(&tree).unwrap(); + let info = analyze(&tree, 1).unwrap(); let mut c = Compiler::new(0); // Force "hard" so that compiler doesn't just delegate @@ -702,17 +732,36 @@ mod tests { assert_matches!(prog[7], End); } + #[test] + fn lazy_any_can_be_compiled_explicit_capture_group_zero() { + let prog = compile_prog(r"\O*?((?!a))"); + + assert_eq!(prog.len(), 9, "prog: {:?}", prog); + + assert_matches!(prog[0], Split(3, 1)); + assert_matches!(prog[1], Any); + assert_matches!(prog[2], Jmp(0)); + assert_matches!(prog[3], Save(0)); + assert_matches!(prog[4], Split(5, 7)); + assert_matches!(prog[5], Lit(ref l) if l == "a"); + assert_matches!(prog[6], FailNegativeLookAround); + assert_matches!(prog[7], Save(1)); + assert_matches!(prog[8], End); + } + fn compile_prog(re: &str) -> Vec { let tree = Expr::parse_tree(re).unwrap(); - let info = analyze(&tree).unwrap(); - let prog = compile(&info).unwrap(); + let info = analyze(&tree, 0).unwrap(); + let 
prog = compile(&info, true).unwrap(); prog.body } #[cfg(feature = "std")] fn assert_delegate(insn: &Insn, re: &str) { + use crate::vm::Delegate; + match insn { - Insn::Delegate { inner, .. } => { + Insn::Delegate(Delegate { inner, .. }) => { assert_eq!( PATTERN_MAPPING .read() diff --git a/src/error.rs b/src/error.rs index 9acba42..be42474 100644 --- a/src/error.rs +++ b/src/error.rs @@ -70,9 +70,13 @@ pub enum CompileError { /// Invalid group id in escape sequence InvalidGroupNameBackref(String), /// Invalid back reference - InvalidBackref, + InvalidBackref(usize), /// Once named groups are used you cannot refer to groups by number NamedBackrefOnly, + /// Feature not supported yet + FeatureNotYetSupported(String), + /// Subroutine call to non-existent group + SubroutineCallTargetNotFound(String, usize), } /// An error as the result of executing a regex. @@ -128,8 +132,12 @@ impl fmt::Display for CompileError { }, CompileError::InvalidGroupName => write!(f, "Could not parse group name"), CompileError::InvalidGroupNameBackref(s) => write!(f, "Invalid group name in back reference: {}", s), - CompileError::InvalidBackref => write!(f, "Invalid back reference"), + CompileError::InvalidBackref(g) => write!(f, "Invalid back reference to group {}", g), CompileError::NamedBackrefOnly => write!(f, "Numbered backref/call not allowed because named group was used, use a named backref instead"), + CompileError::FeatureNotYetSupported(s) => write!(f, "Regex uses currently unimplemented feature: {}", s), + CompileError::SubroutineCallTargetNotFound(s, ix) => { + write!(f, "Subroutine call target not found at position {}: {}", ix, s) + } } } } diff --git a/src/expand.rs b/src/expand.rs index 2574402..438c489 100644 --- a/src/expand.rs +++ b/src/expand.rs @@ -2,7 +2,7 @@ use alloc::borrow::Cow; use alloc::string::{String, ToString}; use alloc::vec::Vec; -use crate::parse::{parse_decimal, parse_id}; +use crate::parse::{parse_decimal, parse_id, ParsedId}; use crate::{Captures, 
CompileError, Error, ParseError, Regex}; /// A set of options for expanding a template string using the contents @@ -76,7 +76,7 @@ impl Expander { } else if num < regex.captures_len() { Ok(()) } else { - Err(Error::CompileError(CompileError::InvalidBackref)) + Err(Error::CompileError(CompileError::InvalidBackref(num))) } }; self.exec(template, |step| match step { @@ -87,7 +87,9 @@ impl Expander { } else if let Ok(num) = name.parse() { on_group_num(num) } else { - Err(Error::CompileError(CompileError::InvalidBackref)) + Err(Error::CompileError(CompileError::InvalidGroupNameBackref( + name.to_string(), + ))) } } Step::GroupNum(num) => on_group_num(num), @@ -227,15 +229,17 @@ impl Expander { let skip = if tail.starts_with(self.sub_char) { f(Step::Char(self.sub_char))?; 1 - } else if let Some((id, skip)) = parse_id(tail, self.open, self.close, false) - .or_else(|| { - if self.allow_undelimited_name { - parse_id(tail, "", "", false) - } else { - None - } - }) - { + } else if let Some(ParsedId { + id, + relative: None, + skip, + }) = parse_id(tail, self.open, self.close, false).or_else(|| { + if self.allow_undelimited_name { + parse_id(tail, "", "", false) + } else { + None + } + }) { f(Step::GroupName(id))?; skip } else if let Some((skip, num)) = parse_decimal(tail, 0) { diff --git a/src/flags.rs b/src/flags.rs new file mode 100644 index 0000000..79a93e7 --- /dev/null +++ b/src/flags.rs @@ -0,0 +1,6 @@ +pub const FLAG_CASEI: u32 = 1; +pub const FLAG_MULTI: u32 = 1 << 1; +pub const FLAG_DOTNL: u32 = 1 << 2; +pub const FLAG_SWAP_GREED: u32 = 1 << 3; +pub const FLAG_IGNORE_SPACE: u32 = 1 << 4; +pub const FLAG_UNICODE: u32 = 1 << 5; diff --git a/src/lib.rs b/src/lib.rs index 19b0284..f36ddee 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -115,7 +115,11 @@ Escapes: `\K` : keep text matched so far out of the overall match ([docs](https://www.regular-expressions.info/keep.html))\ `\G` -: anchor to where the previous match ended 
([docs](https://www.regular-expressions.info/continue.html)) +: anchor to where the previous match ended ([docs](https://www.regular-expressions.info/continue.html))\ +`\Z` +: anchor to the end of the text before any trailing newlines\ +`\O` +: any character including newline Backreferences: @@ -170,14 +174,13 @@ Conditionals - if/then/else: [regex]: https://crates.io/crates/regex */ -#![doc(html_root_url = "https://docs.rs/fancy-regex/0.14.0")] #![deny(missing_docs)] #![deny(missing_debug_implementations)] #![cfg_attr(not(feature = "std"), no_std)] extern crate alloc; -use alloc::borrow::{Cow, ToOwned}; +use alloc::borrow::Cow; use alloc::boxed::Box; use alloc::string::{String, ToString}; use alloc::sync::Arc; @@ -198,12 +201,17 @@ mod analyze; mod compile; mod error; mod expand; +mod flags; +mod optimize; mod parse; mod replacer; mod vm; use crate::analyze::analyze; +use crate::analyze::can_compile_as_anchored; use crate::compile::compile; +use crate::flags::*; +use crate::optimize::optimize; use crate::parse::{ExprTree, NamedGroups, Parser}; use crate::vm::{Prog, OPTION_SKIPPED_EMPTY_MATCH}; @@ -233,6 +241,8 @@ enum RegexImpl { Wrap { inner: RaRegex, options: RegexOptions, + /// Some optimizations avoid the VM, but need to use an extra capture group to represent the match boundaries + explicit_capture_group_0: bool, }, Fancy { prog: Prog, @@ -300,7 +310,12 @@ impl<'r, 't> Iterator for Matches<'r, 't> { .re .find_from_pos_with_option_flags(self.text, self.last_end, option_flags) { - Err(error) => return Some(Err(error)), + Err(error) => { + // Stop on first error: If an error is encountered, return it, and set the "last match position" + // to the string length, so that the next next() call will return None, to prevent an infinite loop. 
+ self.last_end = self.text.len() + 1; + return Some(Err(error)); + } Ok(None) => return None, Ok(Some(mat)) => mat, }; @@ -358,7 +373,12 @@ impl<'r, 't> Iterator for CaptureMatches<'r, 't> { } let captures = match self.0.re.captures_from_pos(self.0.text, self.0.last_end) { - Err(error) => return Some(Err(error)), + Err(error) => { + // Stop on first error: If an error is encountered, return it, and set the "last match position" + // to the string length, so that the next next() call will return None, to prevent an infinite loop. + self.0.last_end = self.0.text.len() + 1; + return Some(Err(error)); + } Ok(None) => return None, Ok(Some(captures)) => captures, }; @@ -393,6 +413,10 @@ enum CapturesImpl<'t> { Wrap { text: &'t str, locations: RaCaptures, + /// Some optimizations avoid the VM but need an extra capture group to represent the match boundaries. + /// Therefore what is actually capture group 1 should be treated as capture group 0, and all other + /// capture groups should have their index reduced by one as well to line up with what the pattern specifies. 
+ explicit_capture_group_0: bool, }, Fancy { text: &'t str, @@ -526,6 +550,28 @@ struct RegexOptions { delegate_dfa_size_limit: Option, } +impl RegexOptions { + fn get_flag_value(flag_value: bool, enum_value: u32) -> u32 { + if flag_value { + enum_value + } else { + 0 + } + } + + fn compute_flags(&self) -> u32 { + let insensitive = Self::get_flag_value(self.syntaxc.get_case_insensitive(), FLAG_CASEI); + let multiline = Self::get_flag_value(self.syntaxc.get_multi_line(), FLAG_MULTI); + let whitespace = + Self::get_flag_value(self.syntaxc.get_ignore_whitespace(), FLAG_IGNORE_SPACE); + let dotnl = Self::get_flag_value(self.syntaxc.get_dot_matches_new_line(), FLAG_DOTNL); + let unicode = Self::get_flag_value(self.syntaxc.get_unicode(), FLAG_UNICODE); + + let all_flags = insensitive | multiline | whitespace | dotnl | unicode | unicode; + all_flags + } +} + impl Default for RegexOptions { fn default() -> Self { RegexOptions { @@ -555,15 +601,67 @@ impl RegexBuilder { Regex::new_options(self.0.clone()) } + fn set_config(&mut self, func: impl Fn(SyntaxConfig) -> SyntaxConfig) -> &mut Self { + self.0.syntaxc = func(self.0.syntaxc); + self + } + /// Override default case insensitive /// this is to enable/disable casing via builder instead of a flag within /// the raw string provided to the regex builder /// /// Default is false pub fn case_insensitive(&mut self, yes: bool) -> &mut Self { - let syntaxc = self.0.syntaxc.to_owned(); - self.0.syntaxc = syntaxc.case_insensitive(yes); - self + self.set_config(|x| x.case_insensitive(yes)) + } + + /// Enable multi-line regex + pub fn multi_line(&mut self, yes: bool) -> &mut Self { + self.set_config(|x| x.multi_line(yes)) + } + + /// Allow ignore whitespace + pub fn ignore_whitespace(&mut self, yes: bool) -> &mut Self { + self.set_config(|x| x.ignore_whitespace(yes)) + } + + /// Enable or disable the "dot matches any character" flag. + /// When this is enabled, `.` will match any character. 
When it's disabled, then `.` will match any character + /// except for a new line character. + pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut Self { + self.set_config(|x| x.dot_matches_new_line(yes)) + } + + /// Enable verbose mode in the regular expression. + /// + /// The same as ignore_whitespace + /// + /// When enabled, verbose mode permits insigificant whitespace in many + /// places in the regular expression, as well as comments. Comments are + /// started using `#` and continue until the end of the line. + /// + /// By default, this is disabled. It may be selectively enabled in the + /// regular expression by using the `x` flag regardless of this setting. + pub fn verbose_mode(&mut self, yes: bool) -> &mut Self { + self.set_config(|x| x.ignore_whitespace(yes)) + } + + /// Enable or disable the Unicode flag (`u`) by default. + /// + /// By default this is **enabled**. It may alternatively be selectively + /// disabled in the regular expression itself via the `u` flag. + /// + /// Note that unless "allow invalid UTF-8" is enabled (it's disabled by + /// default), a regular expression will fail to parse if Unicode mode is + /// disabled and a sub-expression could possibly match invalid UTF-8. + /// + /// **WARNING**: Unicode mode can greatly increase the size of the compiled + /// DFA, which can noticeably impact both memory usage and compilation + /// time. This is especially noticeable if your regex contains character + /// classes like `\w` that are impacted by whether Unicode is enabled or + /// not. If Unicode is not necessary, you are encouraged to disable it. 
+ pub fn unicode_mode(&mut self, yes: bool) -> &mut Self { + self.set_config(|x| x.unicode(yes)) } /// Limit for how many times backtracking should be attempted for fancy regexes (where @@ -635,48 +733,34 @@ impl Regex { } fn new_options(options: RegexOptions) -> Result { - let raw_tree = Expr::parse_tree(&options.pattern)?; - - // wrapper to search for re at arbitrary start position, - // and to capture the match bounds - let tree = ExprTree { - expr: Expr::Concat(vec![ - Expr::Repeat { - child: Box::new(Expr::Any { newline: true }), - lo: 0, - hi: usize::MAX, - greedy: false, - }, - Expr::Group(Box::new(raw_tree.expr)), - ]), - ..raw_tree - }; + let mut tree = Expr::parse_tree_with_flags(&options.pattern, options.compute_flags())?; - let info = analyze(&tree)?; + // try to optimize the expression tree + let requires_capture_group_fixup = optimize(&mut tree); + let info = analyze(&tree, if requires_capture_group_fixup { 0 } else { 1 })?; - let inner_info = &info.children[1].children[0]; // references inner expr - if !inner_info.hard { + if !info.hard { // easy case, wrap regex // we do our own to_str because escapes are different + // NOTE: there is a good opportunity here to use Hir to avoid regex-automata re-parsing it let mut re_cooked = String::new(); - // same as raw_tree.expr above, but it was moved, so traverse to find it - let raw_e = match tree.expr { - Expr::Concat(ref v) => match v[1] { - Expr::Group(ref child) => child, - _ => unreachable!(), - }, - _ => unreachable!(), - }; - raw_e.to_str(&mut re_cooked, 0); + tree.expr.to_str(&mut re_cooked, 0); let inner = compile::compile_inner(&re_cooked, &options)?; return Ok(Regex { - inner: RegexImpl::Wrap { inner, options }, + inner: RegexImpl::Wrap { + inner, + options: RegexOptions { + pattern: re_cooked.clone(), + ..options + }, + explicit_capture_group_0: requires_capture_group_fixup, + }, named_groups: Arc::new(tree.named_groups), }); } - let prog = compile(&info)?; + let prog = compile(&info, 
can_compile_as_anchored(&tree.expr))?; Ok(Regex { inner: RegexImpl::Fancy { prog, @@ -795,9 +879,27 @@ impl Regex { option_flags: u32, ) -> Result>> { match &self.inner { - RegexImpl::Wrap { inner, .. } => Ok(inner - .search(&RaInput::new(text).span(pos..text.len())) - .map(|m| Match::new(text, m.start(), m.end()))), + RegexImpl::Wrap { + inner, + explicit_capture_group_0, + .. + } => { + if !*explicit_capture_group_0 { + Ok(inner + .search(&RaInput::new(text).span(pos..text.len())) + .map(|m| Match::new(text, m.start(), m.end()))) + } else { + let mut locations = inner.create_captures(); + inner.captures(RaInput::new(text).span(pos..text.len()), &mut locations); + Ok(locations.is_match().then(|| { + Match::new( + text, + locations.get_group(1).unwrap().start, + locations.get_group(1).unwrap().end, + ) + })) + } + } RegexImpl::Fancy { prog, options, .. } => { let result = vm::run(prog, text, pos, option_flags, options)?; Ok(result.map(|saves| Match::new(text, saves[0], saves[1]))) @@ -895,11 +997,19 @@ impl Regex { pub fn captures_from_pos<'t>(&self, text: &'t str, pos: usize) -> Result>> { let named_groups = self.named_groups.clone(); match &self.inner { - RegexImpl::Wrap { inner, .. } => { + RegexImpl::Wrap { + inner, + explicit_capture_group_0, + .. + } => { let mut locations = inner.create_captures(); inner.captures(RaInput::new(text).span(pos..text.len()), &mut locations); Ok(locations.is_match().then(|| Captures { - inner: CapturesImpl::Wrap { text, locations }, + inner: CapturesImpl::Wrap { + text, + locations, + explicit_capture_group_0: *explicit_capture_group_0, + }, named_groups, })) } @@ -924,7 +1034,11 @@ impl Regex { /// Returns the number of captures, including the implicit capture of the entire expression. pub fn captures_len(&self) -> usize { match &self.inner { - RegexImpl::Wrap { inner, .. } => inner.captures_len(), + RegexImpl::Wrap { + inner, + explicit_capture_group_0, + .. 
+ } => inner.captures_len() - if *explicit_capture_group_0 { 1 } else { 0 }, RegexImpl::Fancy { n_groups, .. } => *n_groups, } } @@ -941,13 +1055,20 @@ impl Regex { // for debugging only #[doc(hidden)] - pub fn debug_print(&self) { + pub fn debug_print(&self, writer: &mut Formatter<'_>) -> fmt::Result { match &self.inner { - #[cfg(feature = "std")] - RegexImpl::Wrap { inner, .. } => println!("wrapped {:?}", inner), - #[cfg(not(feature = "std"))] - RegexImpl::Wrap { .. } => {} - RegexImpl::Fancy { prog, .. } => prog.debug_print(), + RegexImpl::Wrap { + options, + explicit_capture_group_0, + .. + } => { + write!( + writer, + "wrapped Regex {:?}, explicit_capture_group_0: {:}", + options.pattern, *explicit_capture_group_0 + ) + } + RegexImpl::Fancy { prog, .. } => prog.debug_print(writer), } } @@ -1261,11 +1382,17 @@ impl<'t> Captures<'t> { /// returned. The index 0 returns the whole match. pub fn get(&self, i: usize) -> Option> { match &self.inner { - CapturesImpl::Wrap { text, locations } => locations.get_group(i).map(|span| Match { + CapturesImpl::Wrap { text, - start: span.start, - end: span.end, - }), + locations, + explicit_capture_group_0, + } => locations + .get_group(i + if *explicit_capture_group_0 { 1 } else { 0 }) + .map(|span| Match { + text, + start: span.start, + end: span.end, + }), CapturesImpl::Fancy { text, ref saves } => { let slot = i * 2; if slot >= saves.len() { @@ -1325,7 +1452,11 @@ impl<'t> Captures<'t> { /// match. pub fn len(&self) -> usize { match &self.inner { - CapturesImpl::Wrap { locations, .. } => locations.group_len(), + CapturesImpl::Wrap { + locations, + explicit_capture_group_0, + .. + } => locations.group_len() - if *explicit_capture_group_0 { 1 } else { 0 }, CapturesImpl::Fancy { saves, .. } => saves.len() / 2, } } @@ -1391,7 +1522,7 @@ impl<'c, 't> Iterator for SubCaptureMatches<'c, 't> { // TODO: might be nice to implement ExactSizeIterator etc for SubCaptures /// Regular expression AST. This is public for now but may change. 
-#[derive(Debug, PartialEq, Eq)] +#[derive(Debug, PartialEq, Eq, Clone)] pub enum Expr { /// An empty expression, e.g. the last branch in `(a|b|)` Empty, @@ -1445,7 +1576,21 @@ pub enum Expr { }, /// Back reference to a capture group, e.g. `\1` in `(abc|def)\1` references the captured group /// and the whole regex matches either `abcabc` or `defdef`. - Backref(usize), + Backref { + /// The capture group number being referenced + group: usize, + /// Whether the matching is case-insensitive or not + casei: bool, + }, + /// Back reference to a capture group at the given specified relative recursion level. + BackrefWithRelativeRecursionLevel { + /// The capture group number being referenced + group: usize, + /// Relative recursion level + relative_level: isize, + /// Whether the matching is case-insensitive or not + casei: bool, + }, /// Atomic non-capturing group, e.g. `(?>ab|a)` in text that contains `ab` will match `ab` and /// never backtrack and try `a`, even if matching fails after the atomic group. AtomicGroup(Box), @@ -1464,6 +1609,15 @@ pub enum Expr { /// What to execute if the condition is false false_branch: Box, }, + /// Subroutine call to the specified group number + SubroutineCall(usize), + /// Unresolved subroutine call to the specified group name + UnresolvedNamedSubroutineCall { + /// The capture group name + name: String, + /// The position in the original regex pattern where the subroutine call is made + ix: usize, + }, } /// Type of look-around assertion as used for a look-around expression. @@ -1587,6 +1741,12 @@ impl Expr { Parser::parse(re) } + /// Parse the regex and return an expression (AST) + /// Flags should be bit based based on flags + pub fn parse_tree_with_flags(re: &str, flags: u32) -> Result { + Parser::parse_with_flags(re, flags) + } + /// Convert expression to a regex string in the regex crate's syntax. 
/// /// # Panics @@ -1686,7 +1846,7 @@ impl Expr { buf.push_str(")"); } } - _ => panic!("attempting to format hard expr"), + _ => panic!("attempting to format hard expr {:?}", self), } } } @@ -1759,8 +1919,9 @@ pub fn detect_possible_backref(re: &str) -> bool { /// experimenting. #[doc(hidden)] pub mod internal { - pub use crate::analyze::analyze; + pub use crate::analyze::{analyze, can_compile_as_anchored}; pub use crate::compile::compile; + pub use crate::optimize::optimize; pub use crate::vm::{run_default, run_trace, Insn, Prog}; } @@ -1772,7 +1933,7 @@ mod tests { use alloc::{format, vec}; use crate::parse::make_literal; - use crate::{Expr, Regex}; + use crate::{Expr, Regex, RegexImpl}; //use detect_possible_backref; @@ -1876,6 +2037,35 @@ mod tests { assert_eq!(crate::escape("fø*ø").into_owned(), "fø\\*ø"); } + #[test] + fn trailing_positive_lookahead_wrap_capture_group_fixup() { + let s = r"(a+)(?=c)"; + let regex = s.parse::().unwrap(); + assert!(matches!(regex.inner, + RegexImpl::Wrap { explicit_capture_group_0: true, .. }), + "trailing positive lookahead for an otherwise easy pattern should avoid going through the VM"); + } + + #[test] + fn easy_regex() { + let s = r"(a+)b"; + let regex = s.parse::().unwrap(); + assert!( + matches!(regex.inner, RegexImpl::Wrap { explicit_capture_group_0: false, .. }), + "easy pattern should avoid going through the VM, and capture group 0 should be implicit" + ); + } + + #[test] + fn hard_regex() { + let s = r"(a+)(?>c)"; + let regex = s.parse::().unwrap(); + assert!( + matches!(regex.inner, RegexImpl::Fancy { .. }), + "hard regex should be compiled into a VM" + ); + } + /* #[test] fn detect_backref() { diff --git a/src/optimize.rs b/src/optimize.rs new file mode 100644 index 0000000..4f0a1b1 --- /dev/null +++ b/src/optimize.rs @@ -0,0 +1,183 @@ +// Copyright 2025 The Fancy Regex Authors. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +//! Optimization of regex expressions. + +use crate::parse::ExprTree; +use crate::Expr; +use crate::LookAround; + +use alloc::boxed::Box; +use alloc::vec; +use core::mem; + +/// Rewrite the expression tree to help the VM compile an efficient program. +/// Returns a boolean to say whether the new tree explicitly contains capture group 0. +pub fn optimize(tree: &mut ExprTree) -> bool { + // self recursion prevents us from moving the trailing lookahead out of group 0 + if !tree.self_recursive { + let requires_capture_group_fixup = optimize_trailing_lookahead(tree); + requires_capture_group_fixup + } else { + false + } +} + +fn optimize_trailing_lookahead(tree: &mut ExprTree) -> bool { + // returns a boolean to say whether the optimization was applied. 
+ // - if it was applied, capture group 0 is no longer implicit, but explicit + // if/when the whole expression gets delegated to regex-automata + // converts i.e. original pattern `a(?=b)` when wrapped in the capture group 0 + // as `(a(?=b))` + // to `(a)b` + + if let Expr::Concat(ref mut root_concat_children) = tree.expr { + // we get the last child if it is a positive lookahead + if let Some(Expr::LookAround(_, LookAround::LookAhead)) = root_concat_children.last() { + // then pop the lookahead + let lookahead_expr = root_concat_children + .pop() + .expect("lookaround should be popped"); + // take the rest of the children from the original Concat + let group0_children = mem::take(root_concat_children); + + // extract the inner expression from the lookahead + if let Expr::LookAround(inner, LookAround::LookAhead) = lookahead_expr { + let group0 = Expr::Group(Box::new(Expr::Concat(group0_children))); + // compose new Concat: [Group0, lookahead inner expr] + let new_concat = Expr::Concat(vec![group0, *inner]); + tree.expr = new_concat; + return true; + } else { + unreachable!("already checked it is a lookahead"); + } + } + } else if let Expr::LookAround(ref mut inner, LookAround::LookAhead) = &mut tree.expr { + let group0 = Expr::Group(Box::new(Expr::Empty)); + let mut swap = Expr::Empty; + mem::swap(&mut swap, inner); + // compose new Concat: [Group0, lookahead inner expr] + tree.expr = Expr::Concat(vec![group0, swap]); + return true; + } + false +} + +#[cfg(test)] +mod tests { + use super::optimize; + use super::vec; + use super::Box; + use crate::parse::make_literal; + use crate::Expr; + use alloc::string::String; + + #[test] + fn trailing_positive_lookahead_optimized() { + let mut tree = Expr::parse_tree("a(?=b)").unwrap(); + let requires_capture_group_fixup = optimize(&mut tree); + assert_eq!(requires_capture_group_fixup, true); + let mut s = String::new(); + tree.expr.to_str(&mut s, 0); + assert_eq!(s, "(a)b"); + } + + #[test] + fn 
standalone_positive_lookahead_optimized() { + let mut tree = Expr::parse_tree("(?=b)").unwrap(); + let requires_capture_group_fixup = optimize(&mut tree); + assert_eq!(requires_capture_group_fixup, true); + let mut s = String::new(); + tree.expr.to_str(&mut s, 0); + assert_eq!(s, "()b"); + } + + #[test] + fn trailing_positive_lookahead_with_alternative_optimized() { + let mut tree = Expr::parse_tree("a(?=b|c)").unwrap(); + let requires_capture_group_fixup = optimize(&mut tree); + assert_eq!(requires_capture_group_fixup, true); + let mut s = String::new(); + tree.expr.to_str(&mut s, 0); + assert_eq!(s, "(a)(?:b|c)"); + } + + #[test] + fn trailing_positive_lookahead_moved_even_if_not_easy() { + let mut tree = Expr::parse_tree(r"(a)\1(?=c)").unwrap(); + let requires_capture_group_fixup = optimize(&mut tree); + assert_eq!(requires_capture_group_fixup, true); + assert_eq!( + tree.expr, + Expr::Concat(vec![ + Expr::Group(Box::new(Expr::Concat(vec![ + Expr::Group(Box::new(make_literal("a"))), + Expr::Backref { + group: 1, + casei: false + } + ]))), + make_literal("c"), + ]) + ); + } + + #[test] + fn trailing_positive_lookahead_left_alone_when_self_recursive() { + let tree = Expr::parse_tree(r"ab?\g<0>?(?=a|$)").unwrap(); + let mut optimized_tree = tree.clone(); + let requires_capture_group_fixup = optimize(&mut optimized_tree); + assert_eq!(requires_capture_group_fixup, false); + assert_eq!(&optimized_tree.expr, &tree.expr); + } + + #[test] + fn trailing_negative_lookahead_left_alone() { + let tree = Expr::parse_tree(r"a(?!b)").unwrap(); + let mut optimized_tree = tree.clone(); + let requires_capture_group_fixup = optimize(&mut optimized_tree); + assert_eq!(requires_capture_group_fixup, false); + assert_eq!(&optimized_tree.expr, &tree.expr); + } + + #[test] + fn trailing_positive_lookbehind_left_alone() { + let tree = Expr::parse_tree(r"(?<=b)").unwrap(); + let mut optimized_tree = tree.clone(); + let requires_capture_group_fixup = optimize(&mut optimized_tree); + 
assert_eq!(requires_capture_group_fixup, false); + assert_eq!(&optimized_tree.expr, &tree.expr); + } + + #[test] + fn non_trailing_positive_lookahead_left_alone() { + let tree = Expr::parse_tree(r"a(?=(b))\1").unwrap(); + let mut optimized_tree = tree.clone(); + let requires_capture_group_fixup = optimize(&mut optimized_tree); + assert_eq!(requires_capture_group_fixup, false); + assert_eq!(&optimized_tree.expr, &tree.expr); + + let tree = Expr::parse_tree(r"(?=(b))\1").unwrap(); + let mut optimized_tree = tree.clone(); + let requires_capture_group_fixup = optimize(&mut optimized_tree); + assert_eq!(requires_capture_group_fixup, false); + assert_eq!(&optimized_tree.expr, &tree.expr); + } +} diff --git a/src/parse.rs b/src/parse.rs index c04a426..fa0bb17 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -20,36 +20,32 @@ //! A regex parser yielding an AST. +use crate::RegexOptions; use alloc::boxed::Box; use alloc::string::{String, ToString}; use alloc::vec::Vec; use alloc::{format, vec}; use bit_set::BitSet; -use core::convert::TryInto; use core::usize; use regex_syntax::escape_into; +use crate::flags::*; use crate::{codepoint_len, CompileError, Error, Expr, ParseError, Result, MAX_RECURSION}; use crate::{Assertion, LookAround::*}; -const FLAG_CASEI: u32 = 1; -const FLAG_MULTI: u32 = 1 << 1; -const FLAG_DOTNL: u32 = 1 << 2; -const FLAG_SWAP_GREED: u32 = 1 << 3; -const FLAG_IGNORE_SPACE: u32 = 1 << 4; -const FLAG_UNICODE: u32 = 1 << 5; - #[cfg(not(feature = "std"))] pub(crate) type NamedGroups = alloc::collections::BTreeMap; #[cfg(feature = "std")] pub(crate) type NamedGroups = std::collections::HashMap; -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct ExprTree { pub expr: Expr, pub backrefs: BitSet, pub named_groups: NamedGroups, + pub(crate) contains_subroutines: bool, + pub(crate) self_recursive: bool, } #[derive(Debug)] @@ -60,35 +56,60 @@ pub(crate) struct Parser<'a> { named_groups: NamedGroups, numeric_backrefs: bool, curr_group: usize, // need to keep track 
of which group number we're parsing + contains_subroutines: bool, + has_unresolved_subroutines: bool, + self_recursive: bool, +} + +struct NamedBackrefOrSubroutine<'a> { + ix: usize, + group_ix: Option, + group_name: Option<&'a str>, + recursion_level: Option, } impl<'a> Parser<'a> { - /// Parse the regex and return an expression (AST) and a bit set with the indexes of groups - /// that are referenced by backrefs. - pub(crate) fn parse(re: &str) -> Result { - let mut p = Parser::new(re); - let (ix, expr) = p.parse_re(0, 0)?; + pub(crate) fn parse_with_flags(re: &str, flags: u32) -> Result { + let mut p = Parser::new(re, flags); + let (ix, mut expr) = p.parse_re(0, 0)?; if ix < re.len() { return Err(Error::ParseError( ix, ParseError::GeneralParseError("end of string not reached".to_string()), )); } + + if p.has_unresolved_subroutines { + p.has_unresolved_subroutines = false; + p.resolve_named_subroutine_calls(&mut expr); + } + Ok(ExprTree { expr, - backrefs: Default::default(), + backrefs: p.backrefs, named_groups: p.named_groups, + contains_subroutines: p.contains_subroutines, + self_recursive: p.self_recursive, }) } - fn new(re: &str) -> Parser<'_> { + pub(crate) fn parse(re: &str) -> Result { + Self::parse_with_flags(re, RegexOptions::default().compute_flags()) + } + + fn new(re: &str, flags: u32) -> Parser<'_> { + let flags = flags | FLAG_UNICODE; + Parser { re, backrefs: Default::default(), named_groups: Default::default(), numeric_backrefs: false, - flags: FLAG_UNICODE, + flags, curr_group: 0, + contains_subroutines: false, + has_unresolved_subroutines: false, + self_recursive: false, } } @@ -186,6 +207,9 @@ impl<'a> Parser<'a> { Expr::LookAround(_, _) => false, Expr::Empty => false, Expr::Assertion(_) => false, + Expr::KeepOut => false, + Expr::ContinueFromPreviousMatchEnd => false, + Expr::BackrefExistsCondition(_) => false, _ => true, } } @@ -262,13 +286,7 @@ impl<'a> Parser<'a> { }, )), b'(' => self.parse_group(ix, depth), - b'\\' => { - let (next, expr) = 
self.parse_escape(ix, false)?; - if let Expr::Backref(group) = expr { - self.backrefs.insert(group); - } - Ok((next, expr)) - } + b'\\' => self.parse_escape(ix, false), b'+' | b'*' | b'?' | b'|' | b')' => Ok((ix, Expr::Empty)), b'[' => self.parse_class(ix), b => { @@ -286,34 +304,129 @@ impl<'a> Parser<'a> { } fn parse_named_backref( - &self, + &mut self, + ix: usize, + open: &str, + close: &str, + allow_relative: bool, + ) -> Result<(usize, Expr)> { + let NamedBackrefOrSubroutine { + ix: end, + group_ix, + group_name, + recursion_level, + } = self.parse_named_backref_or_subroutine(ix, open, close, allow_relative)?; + if let Some(group) = group_ix { + self.backrefs.insert(group); + return Ok(( + end, + if let Some(recursion_level) = recursion_level { + Expr::BackrefWithRelativeRecursionLevel { + group, + relative_level: recursion_level, + casei: self.flag(FLAG_CASEI), + } + } else { + Expr::Backref { + group, + casei: self.flag(FLAG_CASEI), + } + }, + )); + } + if let Some(group_name) = group_name { + // here the name was parsed but doesn't match a capture group we have already parsed + return Err(Error::ParseError( + ix, + ParseError::InvalidGroupNameBackref(group_name.to_string()), + )); + } + unreachable!() + } + + fn parse_named_subroutine_call( + &mut self, ix: usize, open: &str, close: &str, allow_relative: bool, ) -> Result<(usize, Expr)> { - if let Some((id, skip)) = parse_id(&self.re[ix..], open, close, allow_relative) { + let NamedBackrefOrSubroutine { + ix: end, + group_ix, + group_name, + recursion_level, + } = self.parse_named_backref_or_subroutine(ix, open, close, allow_relative)?; + if let Some(_) = recursion_level { + return Err(Error::ParseError(ix, ParseError::InvalidGroupName)); + } + if let Some(group) = group_ix { + self.contains_subroutines = true; + if group == 0 { + self.self_recursive = true; + } + return Ok((end, Expr::SubroutineCall(group))); + } + if let Some(group_name) = group_name { + // here the name was parsed but doesn't match a 
capture group we have already parsed + let expr = Expr::UnresolvedNamedSubroutineCall { + name: group_name.to_string(), + ix, + }; + self.has_unresolved_subroutines = true; + self.contains_subroutines = true; + return Ok((end, expr)); + } + unreachable!() + } + + fn parse_named_backref_or_subroutine( + &self, + ix: usize, + open: &str, + close: &str, + allow_relative: bool, + ) -> Result { + if let Some(ParsedId { + id, + mut relative, + skip, + }) = parse_id(&self.re[ix..], open, close, allow_relative) + { let group = if let Some(group) = self.named_groups.get(id) { Some(*group) - } else if let Ok(group) = id.parse::() { - group.try_into().map_or_else( - |_| { - // relative backref - self.curr_group.checked_add_signed(group + 1) - }, - |group| Some(group), - ) + } else if let Ok(group) = id.parse::() { + Some(group) + } else if let Some(relative_group) = relative { + if id.is_empty() { + relative = None; + self.curr_group.checked_add_signed(if relative_group < 0 { + relative_group + 1 + } else { + relative_group + }) + } else { + None + } } else { None }; if let Some(group) = group { - return Ok((ix + skip, Expr::Backref(group))); + Ok(NamedBackrefOrSubroutine { + ix: ix + skip, + group_ix: Some(group), + group_name: None, + recursion_level: relative, + }) + } else { + // here the name was parsed but doesn't match a capture group we have already parsed + Ok(NamedBackrefOrSubroutine { + ix: ix + skip, + group_ix: None, + group_name: Some(id), + recursion_level: relative, + }) } - // here the name is parsed but it is invalid - Err(Error::ParseError( - ix, - ParseError::InvalidGroupNameBackref(id.to_string()), - )) } else { // in this case the name can't be parsed Err(Error::ParseError(ix, ParseError::InvalidGroupName)) @@ -321,11 +434,33 @@ impl<'a> Parser<'a> { } fn parse_numbered_backref(&mut self, ix: usize) -> Result<(usize, Expr)> { + let (end, group) = self.parse_numbered_backref_or_subroutine_call(ix)?; + self.numeric_backrefs = true; + 
self.backrefs.insert(group); + Ok(( + end, + Expr::Backref { + group, + casei: self.flag(FLAG_CASEI), + }, + )) + } + + fn parse_numbered_subroutine_call(&mut self, ix: usize) -> Result<(usize, Expr)> { + let (end, group) = self.parse_numbered_backref_or_subroutine_call(ix)?; + self.numeric_backrefs = true; + self.contains_subroutines = true; + if group == 0 { + self.self_recursive = true; + } + Ok((end, Expr::SubroutineCall(group))) + } + + fn parse_numbered_backref_or_subroutine_call(&self, ix: usize) -> Result<(usize, usize)> { if let Some((end, group)) = parse_decimal(self.re, ix) { // protect BitSet against unreasonably large value if group < self.re.len() / 2 { - self.numeric_backrefs = true; - return Ok((end, Expr::Backref(group))); + return Ok((end, group)); } } return Err(Error::ParseError(ix, ParseError::InvalidBackref)); @@ -351,6 +486,18 @@ impl<'a> Parser<'a> { (end, Expr::Assertion(Assertion::StartText)) } else if b == b'z' && !in_class { (end, Expr::Assertion(Assertion::EndText)) + } else if b == b'Z' && !in_class { + ( + end, + Expr::LookAround( + Box::new(Expr::Delegate { + inner: "\\n*$".to_string(), + size: 0, + casei: false, + }), + LookAhead, + ), + ) } else if b == b'b' && !in_class { if bytes.get(end) == Some(&b'{') { // Support for \b{...} is not implemented yet @@ -431,6 +578,23 @@ impl<'a> Parser<'a> { (end, Expr::KeepOut) } else if b == b'G' && !in_class { (end, Expr::ContinueFromPreviousMatchEnd) + } else if b == b'O' && !in_class { + (end, Expr::Any { newline: true }) + } else if b == b'g' && !in_class { + if end == self.re.len() { + return Err(Error::ParseError( + ix, + ParseError::InvalidEscape("\\g".to_string()), + )); + } + let b = bytes[end]; + if is_digit(b) { + self.parse_numbered_subroutine_call(end)? + } else if b == b'\'' { + self.parse_named_subroutine_call(end, "'", "'", true)? + } else { + self.parse_named_subroutine_call(end, "<", ">", true)? 
+ } } else { // printable ASCII (including space, see issue #29) ( @@ -601,10 +765,20 @@ impl<'a> Parser<'a> { (Some(LookBehind), 3) } else if self.re[ix..].starts_with("?...) + } else if self.re[ix..].starts_with("?<") || self.re[ix..].starts_with("?'") { + // Named capture group using Oniguruma syntax: (?...) or (?'name'...) self.curr_group += 1; - if let Some((id, skip)) = parse_id(&self.re[ix + 1..], "<", ">", false) { + let (open, close) = if self.re[ix..].starts_with("?<") { + ("<", ">") + } else { + ("'", "'") + }; + if let Some(ParsedId { + id, + relative: None, + skip, + }) = parse_id(&self.re[ix + 1..], open, close, false) + { self.named_groups.insert(id.to_string(), self.curr_group); (None, skip + 1) } else { @@ -613,7 +787,12 @@ impl<'a> Parser<'a> { } else if self.re[ix..].starts_with("?P<") { // Named capture group using Python syntax: (?P...) self.curr_group += 1; // this is a capture group - if let Some((id, skip)) = parse_id(&self.re[ix + 2..], "<", ">", false) { + if let Some(ParsedId { + id, + relative: None, + skip, + }) = parse_id(&self.re[ix + 2..], "<", ">", false) + { self.named_groups.insert(id.to_string(), self.curr_group); (None, skip + 2) } else { @@ -626,6 +805,8 @@ impl<'a> Parser<'a> { (None, 2) } else if self.re[ix..].starts_with("?(") { return self.parse_conditional(ix + 2, depth); + } else if self.re[ix..].starts_with("?P>") { + return self.parse_named_subroutine_call(ix + 3, "", ")", false); } else if self.re[ix..].starts_with('?') { return self.parse_flags(ix, depth); } else { @@ -729,20 +910,20 @@ impl<'a> Parser<'a> { let bytes = self.re.as_bytes(); // get the character after the open paren let b = bytes[ix]; - let (mut next, condition) = if is_digit(b) { - self.parse_numbered_backref(ix)? - } else if b == b'\'' { - self.parse_named_backref(ix, "'", "'", true)? + let (next, condition) = if b == b'\'' { + self.parse_named_backref(ix, "'", "')", true)? } else if b == b'<' { - self.parse_named_backref(ix, "<", ">", true)? 
+ self.parse_named_backref(ix, "<", ">)", true)? + } else if b == b'+' || b == b'-' || is_digit(b) { + self.parse_named_backref(ix, "", ")", true)? } else { - self.parse_re(ix, depth)? + let (next, condition) = self.parse_re(ix, depth)?; + (self.check_for_close_paren(next)?, condition) }; - next = self.check_for_close_paren(next)?; let (end, child) = self.parse_re(next, depth)?; if end == next { // Backreference validity checker - if let Expr::Backref(group) = condition { + if let Expr::Backref { group, .. } = condition { let after = self.check_for_close_paren(end)?; return Ok((after, Expr::BackrefExistsCondition(group))); } else { @@ -770,7 +951,7 @@ impl<'a> Parser<'a> { // there is only one branch - the truth branch. i.e. "if" without "else" if_true = child; } - let inner_condition = if let Expr::Backref(group) = condition { + let inner_condition = if let Expr::Backref { group, .. } = condition { Expr::BackrefExistsCondition(group) } else { condition @@ -792,7 +973,8 @@ impl<'a> Parser<'a> { } fn flag(&self, flag: u32) -> bool { - (self.flags & flag) != 0 + let v = self.flags & flag; + v == flag } fn update_flag(&mut self, flag: u32, neg: bool) { @@ -837,6 +1019,40 @@ impl<'a> Parser<'a> { } } } + + fn resolve_named_subroutine_calls(&mut self, expr: &mut Expr) { + match expr { + Expr::UnresolvedNamedSubroutineCall { name, .. } => { + if let Some(group) = self.named_groups.get(name) { + *expr = Expr::SubroutineCall(*group); + } else { + self.has_unresolved_subroutines = true; + } + } + // recursively resolve in inner expressions + Expr::Group(inner) | Expr::LookAround(inner, _) | Expr::AtomicGroup(inner) => { + self.resolve_named_subroutine_calls(inner); + } + Expr::Concat(children) | Expr::Alt(children) => { + for child in children { + self.resolve_named_subroutine_calls(child); + } + } + Expr::Repeat { child, .. 
} => { + self.resolve_named_subroutine_calls(child); + } + Expr::Conditional { + condition, + true_branch, + false_branch, + } => { + self.resolve_named_subroutine_calls(condition); + self.resolve_named_subroutine_calls(true_branch); + self.resolve_named_subroutine_calls(false_branch); + } + _ => {} + } + } } // return (ix, value) @@ -850,41 +1066,69 @@ pub(crate) fn parse_decimal(s: &str, ix: usize) -> Option<(usize, usize)> { .map(|val| (end, val)) } -/// Attempts to parse an identifier between the specified opening and closing -/// delimiters. On success, returns `Some((id, skip))`, where `skip` is how much -/// of the string was used. +#[derive(Debug, PartialEq)] +pub(crate) struct ParsedId<'a> { + pub id: &'a str, + pub relative: Option, + pub skip: usize, +} + +/// Attempts to parse an identifier, optionally followed by a relative number between the +/// specified opening and closing delimiters. On success, returns +/// `Some((id, relative, skip))`, where `skip` is how much of the string was used. 
pub(crate) fn parse_id<'a>( s: &'a str, open: &'_ str, close: &'_ str, allow_relative: bool, -) -> Option<(&'a str, usize)> { +) -> Option> { debug_assert!(!close.starts_with(is_id_char)); - if !s.starts_with(open) { + if !s.starts_with(open) || s.len() <= open.len() + close.len() { return None; } let id_start = open.len(); let mut iter = s[id_start..].char_indices().peekable(); - let after_id = if allow_relative && iter.next_if(|(_, ch)| *ch == '-').is_some() { - iter.find(|(_, ch)| !ch.is_ascii_digit()) - } else { - iter.find(|(_, ch)| !is_id_char(*ch)) - }; + let after_id = iter.find(|(_, ch)| !is_id_char(*ch)); + let id_len = match after_id.map(|(i, _)| i) { - Some(id_len) if s[id_start + id_len..].starts_with(close) => Some(id_len), - None if close.is_empty() => Some(s.len()), - _ => None, + Some(id_len) => id_len, + None if close.is_empty() => s.len(), + _ => 0, }; - match id_len { - Some(0) => None, - Some(id_len) => { - let id_end = id_start + id_len; - Some((&s[id_start..id_end], id_end + close.len())) + + let id_end = id_start + id_len; + if id_len > 0 && s[id_end..].starts_with(close) { + return Some(ParsedId { + id: &s[id_start..id_end], + relative: None, + skip: id_end + close.len(), + }); + } else if !allow_relative { + return None; + } + let relative_sign = s.as_bytes()[id_end]; + if relative_sign == b'+' || relative_sign == b'-' { + if let Some((end, relative_amount)) = parse_decimal(&s, id_end + 1) { + if s[end..].starts_with(close) { + if relative_amount == 0 && id_len == 0 { + return None; + } + let relative_amount_signed = if relative_sign == b'-' { + -(relative_amount as isize) + } else { + relative_amount as isize + }; + return Some(ParsedId { + id: &s[id_start..id_end], + relative: Some(relative_amount_signed), + skip: end + close.len(), + }); + } } - _ => None, } + None } fn is_id_char(c: char) -> bool { @@ -900,9 +1144,13 @@ fn is_hex_digit(b: u8) -> bool { } pub(crate) fn make_literal(s: &str) -> Expr { + make_literal_case_insensitive(s, 
false) +} + +pub(crate) fn make_literal_case_insensitive(s: &str, case_insensitive: bool) -> Expr { Expr::Literal { val: String::from(s), - casei: false, + casei: case_insensitive, } } @@ -912,9 +1160,9 @@ mod tests { use alloc::string::{String, ToString}; use alloc::{format, vec}; - use crate::parse::{make_literal, parse_id}; - use crate::LookAround::*; + use crate::parse::{make_literal, make_literal_case_insensitive, parse_id}; use crate::{Assertion, Expr}; + use crate::{LookAround::*, RegexOptions, SyntaxConfig}; fn p(s: &str) -> Expr { Expr::parse_tree(s).unwrap().expr @@ -961,6 +1209,21 @@ mod tests { assert_eq!(p("$"), Expr::Assertion(Assertion::EndText)); } + #[test] + fn end_text_before_empty_lines() { + assert_eq!( + p("\\Z"), + Expr::LookAround( + Box::new(Expr::Delegate { + inner: "\\n*$".to_string(), + size: 0, + casei: false, + }), + LookAhead, + ) + ); + } + #[test] fn literal() { assert_eq!(p("a"), make_literal("a")); @@ -974,15 +1237,65 @@ mod tests { #[test] fn parse_id_test() { - assert_eq!(parse_id("foo.", "", "", true), Some(("foo", 3))); - assert_eq!(parse_id("{foo}", "{", "}", true), Some(("foo", 5))); + use crate::parse::ParsedId; + fn create_id(id: &str, relative: Option, skip: usize) -> Option { + Some(ParsedId { id, relative, skip }) + } + assert_eq!(parse_id("foo.", "", "", true), create_id("foo", None, 3)); + assert_eq!(parse_id("1.", "", "", true), create_id("1", None, 1)); + assert_eq!(parse_id("{foo}", "{", "}", true), create_id("foo", None, 5)); assert_eq!(parse_id("{foo.", "{", "}", true), None); assert_eq!(parse_id("{foo", "{", "}", true), None); assert_eq!(parse_id("{}", "{", "}", true), None); assert_eq!(parse_id("", "", "", true), None); - assert_eq!(parse_id("{-1}", "{", "}", true), Some(("-1", 4))); + assert_eq!(parse_id("{-1}", "{", "}", true), create_id("", Some(-1), 4)); assert_eq!(parse_id("{-1}", "{", "}", false), None); assert_eq!(parse_id("{-a}", "{", "}", true), None); + assert_eq!(parse_id("{-a}", "{", "}", false), 
None); + assert_eq!(parse_id("{+a}", "{", "}", false), None); + assert_eq!(parse_id("+a", "", "", false), None); + assert_eq!(parse_id("-a", "", "", false), None); + assert_eq!(parse_id("2+a", "", "", false), create_id("2", None, 1)); + assert_eq!(parse_id("2-a", "", "", false), create_id("2", None, 1)); + + assert_eq!(parse_id("<+1>", "<", ">", true), create_id("", Some(1), 4)); + assert_eq!(parse_id("<-3>", "<", ">", true), create_id("", Some(-3), 4)); + assert_eq!( + parse_id("", "<", ">", true), + create_id("n", Some(1), 5) + ); + assert_eq!( + parse_id("", "<", ">", true), + create_id("n", Some(-1), 5) + ); + assert_eq!(parse_id("<>", "<", ">", true), None); + assert_eq!(parse_id("<", "<", ">", true), None); + assert_eq!(parse_id("<+0>", "<", ">", true), None); + assert_eq!(parse_id("<-0>", "<", ">", true), None); + assert_eq!( + parse_id("", "<", ">", true), + create_id("n", Some(0), 5) + ); + assert_eq!( + parse_id("", "<", ">", true), + create_id("n", Some(0), 5) + ); + assert_eq!( + parse_id("<2-0>", "<", ">", true), + create_id("2", Some(0), 5) + ); + assert_eq!( + parse_id("<2+0>", "<", ">", true), + create_id("2", Some(0), 5) + ); + assert_eq!( + parse_id("<2+1>", "<", ">", true), + create_id("2", Some(1), 5) + ); + assert_eq!( + parse_id("<2-1>", "<", ">", true), + create_id("2", Some(-1), 5) + ); } #[test] @@ -1091,6 +1404,12 @@ mod tests { assert_eq!(p("(a)"), Expr::Group(Box::new(make_literal("a"),))); } + #[test] + fn named_group() { + assert_eq!(p("(?'name'a)"), Expr::Group(Box::new(make_literal("a"),))); + assert_eq!(p("(?a)"), Expr::Group(Box::new(make_literal("a"),))); + } + #[test] fn group_repeat() { assert_eq!( @@ -1205,6 +1524,29 @@ mod tests { make_literal(","), ]) ); + assert_eq!( + p("a{1,A}"), + Expr::Concat(vec![ + make_literal("a"), + make_literal("{"), + make_literal("1"), + make_literal(","), + make_literal("A"), + make_literal("}"), + ]) + ); + assert_eq!( + p("a{1,2A}"), + Expr::Concat(vec![ + make_literal("a"), + 
make_literal("{"), + make_literal("1"), + make_literal(","), + make_literal("2"), + make_literal("A"), + make_literal("}"), + ]) + ); } #[test] @@ -1263,7 +1605,10 @@ mod tests { p("(.)\\1"), Expr::Concat(vec![ Expr::Group(Box::new(Expr::Any { newline: false })), - Expr::Backref(1), + Expr::Backref { + group: 1, + casei: false, + }, ]) ); } @@ -1274,7 +1619,10 @@ mod tests { p("(?.)\\k"), Expr::Concat(vec![ Expr::Group(Box::new(Expr::Any { newline: false })), - Expr::Backref(1), + Expr::Backref { + group: 1, + casei: false, + }, ]) ); } @@ -1282,15 +1630,135 @@ mod tests { #[test] fn relative_backref() { assert_eq!( - p("(a)(.)\\k<-1>"), + p(r"(a)(.)\k<-1>"), Expr::Concat(vec![ Expr::Group(Box::new(make_literal("a"))), Expr::Group(Box::new(Expr::Any { newline: false })), - Expr::Backref(2) + Expr::Backref { + group: 2, + casei: false, + }, ]) ); + + assert_eq!( + p(r"(a)\k<+1>(.)"), + Expr::Concat(vec![ + Expr::Group(Box::new(make_literal("a"))), + Expr::Backref { + group: 2, + casei: false, + }, + Expr::Group(Box::new(Expr::Any { newline: false })), + ]) + ); + fail("(?P<->.)"); - fail("(.)(?P=-)") + fail("(.)(?P=-)"); + fail(r"(a)\k<-0>(.)"); + fail(r"(a)\k<+0>(.)"); + fail(r"(a)\k<+>(.)"); + fail(r"(a)\k<->(.)"); + fail(r"(a)\k<>(.)"); + } + + #[test] + fn relative_backref_with_recursion_level() { + assert_eq!( + p(r"()\k<1+3>"), + Expr::Concat(vec![ + Expr::Group(Box::new(Expr::Empty)), + Expr::BackrefWithRelativeRecursionLevel { + group: 1, + relative_level: 3, + casei: false, + }, + ]), + ); + + assert_eq!( + p(r"()\k<1-0>"), + Expr::Concat(vec![ + Expr::Group(Box::new(Expr::Empty)), + Expr::BackrefWithRelativeRecursionLevel { + group: 1, + relative_level: 0, + casei: false, + }, + ]), + ); + + assert_eq!( + p(r"(?)\k"), + Expr::Concat(vec![ + Expr::Group(Box::new(Expr::Empty)), + Expr::BackrefWithRelativeRecursionLevel { + group: 1, + relative_level: 3, + casei: false, + }, + ]), + ); + + assert_eq!( + p(r"(?)\k"), + Expr::Concat(vec![ + 
Expr::Group(Box::new(Expr::Empty)), + Expr::BackrefWithRelativeRecursionLevel { + group: 1, + relative_level: -3, + casei: false, + } + ]), + ); + + assert_eq!( + p(r"\A(?|.|(?:(?.)\g\k))\z"), + Expr::Concat(vec![ + Expr::Assertion(Assertion::StartText), + Expr::Group(Box::new(Expr::Alt(vec![ + Expr::Empty, + Expr::Any { newline: false }, + Expr::Concat(vec![ + Expr::Group(Box::new(Expr::Any { newline: false })), + Expr::SubroutineCall(1), + Expr::BackrefWithRelativeRecursionLevel { + group: 2, + relative_level: 0, + casei: false, + }, + ]) + ]))), + Expr::Assertion(Assertion::EndText) + ]), + ); + } + + #[test] + fn relative_subroutine_call() { + assert_eq!( + p(r"(a)(.)\g<-1>"), + Expr::Concat(vec![ + Expr::Group(Box::new(make_literal("a"))), + Expr::Group(Box::new(Expr::Any { newline: false })), + Expr::SubroutineCall(2), + ]) + ); + + assert_eq!( + p(r"(a)\g<+1>(.)"), + Expr::Concat(vec![ + Expr::Group(Box::new(make_literal("a"))), + Expr::SubroutineCall(2), + Expr::Group(Box::new(Expr::Any { newline: false })), + ]) + ); + + fail(r"(a)\g<-0>(.)"); + fail(r"(a)\g<+0>(.)"); + fail(r"(a)\g<+>(.)"); + fail(r"(a)\g<->(.)"); + fail(r"(a)\g<>(.)"); } #[test] @@ -1513,9 +1981,9 @@ mod tests { "\\kxxx", "Parsing error at position 2: Could not parse group name", ); - // "-" can only be at the start for a relative backref + // "-" can only be used after a name for relative recursion level, so must be followed by a number assert_error( - "\\k", + "\\k", "Parsing error at position 2: Could not parse group name", ); assert_error( @@ -1593,6 +2061,22 @@ mod tests { ); } + #[test] + fn no_quantifiers_on_other_non_repeatable_expressions() { + assert_error( + r"\K?", + "Parsing error at position 2: Target of repeat operator is invalid", + ); + assert_error( + r"\G*", + "Parsing error at position 2: Target of repeat operator is invalid", + ); + assert_error( + r"\b+", + "Parsing error at position 2: Target of repeat operator is invalid", + ); + } + #[test] fn 
backref_exists_condition() { assert_eq!( @@ -1637,6 +2121,69 @@ mod tests { ); } + #[test] + fn conditional_unclosed_at_end_of_pattern() { + assert_error( + r"(?(", + "Parsing error at position 3: Opening parenthesis without closing parenthesis", + ); + } + + #[test] + fn subroutine_call_unclosed_at_end_of_pattern() { + assert_error( + r"\g<", + "Parsing error at position 2: Could not parse group name", + ); + + assert_error( + r"\g", + "Parsing error at position 2: Could not parse group name", + ); + + assert_error(r"\g", "Parsing error at position 0: Invalid escape: \\g"); + + assert_error( + r"\g test", + "Parsing error at position 2: Could not parse group name", + ); + } + + #[test] + fn subroutine_call_missing_subroutine_reference() { + assert_error( + r"\g test", + "Parsing error at position 2: Could not parse group name", + ); + } + + #[test] + fn subroutine_call_name_includes_dash() { + assert_error( + r"\g<1-0>(a)", + "Parsing error at position 2: Could not parse group name", + ); + assert_error( + r"\g(?'name'a)", + "Parsing error at position 2: Could not parse group name", + ); + } + #[test] fn backref_condition_with_one_two_or_three_branches() { assert_eq!( @@ -1728,7 +2275,10 @@ mod tests { true_branch: Box::new(make_literal("b")), false_branch: Box::new(make_literal("c")) })), - Expr::Group(Box::new(Expr::Backref(1))) + Expr::Group(Box::new(Expr::Backref { + group: 1, + casei: false, + },)) ]) ); @@ -1793,6 +2343,164 @@ mod tests { ); } + #[test] + fn subroutines() { + assert_eq!( + p(r"(a)\g1"), + Expr::Concat(vec![ + Expr::Group(Box::new(make_literal("a"))), + Expr::SubroutineCall(1) + ]) + ); + + assert_eq!( + p(r"(a)\g<1>"), + Expr::Concat(vec![ + Expr::Group(Box::new(make_literal("a"))), + Expr::SubroutineCall(1) + ]) + ); + + assert_eq!( + p(r"(?a)\g"), + Expr::Concat(vec![ + Expr::Group(Box::new(make_literal("a"))), + Expr::SubroutineCall(1) + ]) + ); + + assert_eq!( + p(r"(?a)\g'group_name'"), + Expr::Concat(vec![ + 
Expr::Group(Box::new(make_literal("a"))), + Expr::SubroutineCall(1) + ]) + ); + + assert_eq!( + p(r"(?a)(?P>group_name)"), + Expr::Concat(vec![ + Expr::Group(Box::new(make_literal("a"))), + Expr::SubroutineCall(1) + ]) + ); + } + + #[test] + fn subroutine_defined_later() { + assert_eq!( + p(r"\g(?a)"), + Expr::Concat(vec![ + Expr::SubroutineCall(1), + Expr::Group(Box::new(make_literal("a"))), + ]) + ); + + assert_eq!( + p(r"\g(?:a|b|(?c)?)"), + Expr::Concat(vec![ + Expr::SubroutineCall(1), + Expr::Alt(vec![ + make_literal("a"), + make_literal("b"), + Expr::Repeat { + child: Box::new(Expr::Group(Box::new(make_literal("c")))), + lo: 0, + hi: 1, + greedy: true + } + ]) + ]) + ); + + assert_eq!( + p(r"(?a)?\g(?()(?b)|c)"), + Expr::Concat(vec![ + Expr::Repeat { + child: Box::new(Expr::Group(Box::new(make_literal("a")))), + lo: 0, + hi: 1, + greedy: true + }, + Expr::SubroutineCall(2), + Expr::Conditional { + condition: Box::new(Expr::BackrefExistsCondition(1)), + true_branch: Box::new(Expr::Group(Box::new(make_literal("b")))), + false_branch: Box::new(make_literal("c")), + } + ]) + ); + + assert_eq!( + p(r"\g<1>(a)"), + Expr::Concat(vec![ + Expr::SubroutineCall(1), + Expr::Group(Box::new(make_literal("a"))), + ]) + ); + } + + #[test] + fn recursive_subroutine_call() { + assert_eq!( + p(r"\A(?|.|(?:(?.)\g\k))\z"), + Expr::Concat(vec![ + Expr::Assertion(Assertion::StartText,), + Expr::Group(Box::new(Expr::Alt(vec![ + Expr::Empty, + Expr::Any { newline: false }, + Expr::Concat(vec![ + Expr::Group(Box::new(Expr::Any { newline: false },)), + Expr::SubroutineCall(1,), + Expr::Backref { + group: 2, + casei: false, + }, + ],), + ],),)), + Expr::Assertion(Assertion::EndText,), + ],) + ); + } + + #[test] + fn self_recursive_subroutine_call() { + let tree = Expr::parse_tree(r"hello\g<0>?world").unwrap(); + assert_eq!(tree.self_recursive, true); + + let tree = Expr::parse_tree(r"hello\g0?world").unwrap(); + assert_eq!(tree.self_recursive, true); + + let tree = 
Expr::parse_tree(r"hello world").unwrap(); + assert_eq!(tree.self_recursive, false); + + let tree = Expr::parse_tree(r"hello\g1world").unwrap(); + assert_eq!(tree.self_recursive, false); + + let tree = Expr::parse_tree(r"hello\g<1>world").unwrap(); + assert_eq!(tree.self_recursive, false); + + let tree = Expr::parse_tree(r"(hello\g1?world)").unwrap(); + assert_eq!(tree.self_recursive, false); + + let tree = Expr::parse_tree(r"(?hello\gworld)").unwrap(); + assert_eq!(tree.self_recursive, false); + } + + #[test] + fn named_subroutine_not_defined_later() { + assert_eq!( + p(r"\g(?a)"), + Expr::Concat(vec![ + Expr::UnresolvedNamedSubroutineCall { + name: "wrong_name".to_string(), + ix: 2 + }, + Expr::Group(Box::new(make_literal("a"))), + ]) + ); + } + // found by cargo fuzz, then minimized #[test] fn fuzz_1() { @@ -1814,4 +2522,197 @@ mod tests { fn fuzz_4() { fail(r"\u{2}(?(2)"); } + + fn get_options(pattern: &str, func: impl Fn(SyntaxConfig) -> SyntaxConfig) -> RegexOptions { + let mut options = RegexOptions::default(); + options.syntaxc = func(options.syntaxc); + options.pattern = String::from(pattern); + options + } + + #[test] + fn parse_with_case_insensitive_in_pattern() { + let tree = Expr::parse_tree("(?i)hello"); + let expr = tree.unwrap().expr; + + assert_eq!( + expr, + Expr::Concat(vec![ + make_literal_case_insensitive("h", true), + make_literal_case_insensitive("e", true), + make_literal_case_insensitive("l", true), + make_literal_case_insensitive("l", true), + make_literal_case_insensitive("o", true) + ]) + ); + } + + #[test] + fn parse_with_case_insensitive_option() { + let options = get_options("hello", |x| x.case_insensitive(true)); + + let tree = Expr::parse_tree_with_flags(&options.pattern, options.compute_flags()); + let expr = tree.unwrap().expr; + + assert_eq!( + expr, + Expr::Concat(vec![ + make_literal_case_insensitive("h", true), + make_literal_case_insensitive("e", true), + make_literal_case_insensitive("l", true), + 
make_literal_case_insensitive("l", true), + make_literal_case_insensitive("o", true) + ]) + ); + } + + #[test] + fn parse_with_multiline_in_pattern() { + let options = get_options("(?m)^hello$", |x| x); + + let tree = Expr::parse_tree_with_flags(&options.pattern, options.compute_flags()); + let expr = tree.unwrap().expr; + + assert_eq!( + expr, + Expr::Concat(vec![ + Expr::Assertion(Assertion::StartLine { crlf: false }), + make_literal("h"), + make_literal("e"), + make_literal("l"), + make_literal("l"), + make_literal("o"), + Expr::Assertion(Assertion::EndLine { crlf: false }) + ]) + ); + } + + #[test] + fn pparse_with_multiline_option() { + let options = get_options("^hello$", |x| x.multi_line(true)); + + let tree = Expr::parse_tree_with_flags(&options.pattern, options.compute_flags()); + let expr = tree.unwrap().expr; + + assert_eq!( + expr, + Expr::Concat(vec![ + Expr::Assertion(Assertion::StartLine { crlf: false }), + make_literal("h"), + make_literal("e"), + make_literal("l"), + make_literal("l"), + make_literal("o"), + Expr::Assertion(Assertion::EndLine { crlf: false }) + ]) + ); + } + + #[test] + fn parse_with_dot_matches_new_line_in_pattern() { + let options = get_options("(?s)(.*)", |x| x); + + let tree = Expr::parse_tree_with_flags(&options.pattern, options.compute_flags()); + let expr = tree.unwrap().expr; + + assert_eq!( + expr, + Expr::Group(Box::new(Expr::Repeat { + child: Box::new(Expr::Any { newline: true }), + lo: 0, + hi: usize::MAX, + greedy: true + })) + ); + } + + #[test] + fn parse_with_dot_matches_new_line_option() { + let options = get_options("(.*)", |x| x.dot_matches_new_line(true)); + + let tree = Expr::parse_tree_with_flags(&options.pattern, options.compute_flags()); + let expr = tree.unwrap().expr; + + assert_eq!( + expr, + Expr::Group(Box::new(Expr::Repeat { + child: Box::new(Expr::Any { newline: true }), + lo: 0, + hi: usize::MAX, + greedy: true + })) + ); + } + + #[test] + fn parse_fancy_with_dot_matches_new_line_in_pattern() { + let 
options = get_options("(.*)(?<=hugo)", |x| x.dot_matches_new_line(true)); + + let tree = Expr::parse_tree_with_flags(&options.pattern, options.compute_flags()); + let expr = tree.unwrap().expr; + + assert_eq!( + expr, + Expr::Concat(vec![ + Expr::Group(Box::new(Expr::Repeat { + child: Box::new(Expr::Any { newline: true }), + lo: 0, + hi: usize::MAX, + greedy: true + })), + Expr::LookAround( + Box::new(Expr::Concat(vec![ + make_literal("h"), + make_literal("u"), + make_literal("g"), + make_literal("o") + ])), + LookBehind + ) + ]) + ); + } + + #[test] + fn parse_with_case_insensitre_from_pattern_and_multi_line_option() { + let options = get_options("(?i)^hello$", |x| x.multi_line(true)); + + let tree = Expr::parse_tree_with_flags(&options.pattern, options.compute_flags()); + let expr = tree.unwrap().expr; + + assert_eq!( + expr, + Expr::Concat(vec![ + Expr::Assertion(Assertion::StartLine { crlf: false }), + make_literal_case_insensitive("h", true), + make_literal_case_insensitive("e", true), + make_literal_case_insensitive("l", true), + make_literal_case_insensitive("l", true), + make_literal_case_insensitive("o", true), + Expr::Assertion(Assertion::EndLine { crlf: false }) + ]) + ); + } + + #[test] + fn parse_with_multi_line_and_case_insensitive_options() { + let mut options = get_options("^hello$", |x| x.multi_line(true)); + options.syntaxc = options.syntaxc.case_insensitive(true); + + let tree = Expr::parse_tree_with_flags(&options.pattern, options.compute_flags()); + let expr = tree.unwrap().expr; + + assert_eq!( + expr, + Expr::Concat(vec![ + Expr::Assertion(Assertion::StartLine { crlf: false }), + make_literal_case_insensitive("h", true), + make_literal_case_insensitive("e", true), + make_literal_case_insensitive("l", true), + make_literal_case_insensitive("l", true), + make_literal_case_insensitive("o", true), + Expr::Assertion(Assertion::EndLine { crlf: false }) + ]) + ); + } } diff --git a/src/vm.rs b/src/vm.rs index 06a5db0..5de21d4 100644 --- a/src/vm.rs 
+++ b/src/vm.rs @@ -84,6 +84,7 @@ use crate::error::RuntimeError; use crate::prev_codepoint_ix; use crate::Assertion; use crate::Error; +use crate::Formatter; use crate::Result; use crate::{codepoint_len, RegexOptions}; @@ -100,8 +101,39 @@ pub(crate) const OPTION_SKIPPED_EMPTY_MATCH: u32 = 1 << 1; // TODO: make configurable const MAX_STACK: usize = 1_000_000; +#[derive(Clone)] +/// Delegate matching to the regex crate +pub struct Delegate { + /// The regex + pub inner: Regex, + /// The regex pattern as a string + pub pattern: String, + /// The first group number that this regex captures (if it contains groups) + pub start_group: usize, + /// The last group number + pub end_group: usize, +} + +impl core::fmt::Debug for Delegate { + fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result { + // Ensures it fails to compile if the struct changes + let Self { + inner: _, + pattern, + start_group, + end_group, + } = self; + + f.debug_struct("Delegate") + .field("pattern", pattern) + .field("start_group", start_group) + .field("end_group", end_group) + .finish() + } +} + /// Instruction of the VM. 
-#[derive(Debug, Clone)] +#[derive(Clone, Debug)] pub enum Insn { /// Successful end of program End, @@ -173,20 +205,18 @@ pub enum Insn { /// Set IX back by the specified number of characters GoBack(usize), /// Back reference to a group number to check - Backref(usize), + Backref { + /// The save slot representing the start of the capture group + slot: usize, + /// Whether the backref should be matched case insensitively + casei: bool, + }, /// Begin of atomic group BeginAtomic, /// End of atomic group EndAtomic, /// Delegate matching to the regex crate - Delegate { - /// The regex - inner: Regex, - /// The first group number that this regex captures (if it contains groups) - start_group: usize, - /// The last group number - end_group: usize, - }, + Delegate(Delegate), /// Anchor to match at the position where the previous match ended ContinueFromPreviousMatchEnd, /// Continue only if the specified capture group has already been populated as part of the match @@ -207,11 +237,11 @@ impl Prog { } #[doc(hidden)] - pub(crate) fn debug_print(&self) { - #[cfg(feature = "std")] + pub(crate) fn debug_print(&self, writer: &mut Formatter<'_>) -> core::fmt::Result { for (i, insn) in self.body.iter().enumerate() { - println!("{:3}: {:?}", i, insn); + write!(writer, "{:3}: {:?}\n", i, insn)?; } + Ok(()) } } @@ -413,6 +443,47 @@ fn matches_literal(s: &str, ix: usize, end: usize, literal: &str) -> bool { end <= s.len() && &s.as_bytes()[ix..end] == literal.as_bytes() } +fn matches_literal_casei(s: &str, ix: usize, end: usize, literal: &str) -> bool { + if end > s.len() { + return false; + } + if matches_literal(s, ix, end, literal) { + return true; + } + if s.is_char_boundary(ix) && s.is_char_boundary(end) && s.is_ascii() { + return s[ix..end].eq_ignore_ascii_case(literal); + } + + // text captured and being backreferenced is not ascii, so we utilize regex-automata's case insensitive matching + use regex_syntax::ast::*; + let span = Span::splat(Position::new(0, 0, 0)); + let 
literals = literal + .chars() + .map(|c| { + Ast::literal(Literal { + span, + kind: LiteralKind::Verbatim, + c, + }) + }) + .collect(); + let ast = Ast::concat(Concat { + span, + asts: literals, + }); + + let mut translator = regex_syntax::hir::translate::TranslatorBuilder::new() + .case_insensitive(true) + .build(); + let hir = translator.translate(literal, &ast).unwrap(); + + use regex_automata::meta::Builder as RaBuilder; + let re = RaBuilder::new() + .build_from_hir(&hir) + .expect("literal hir should get built successfully"); + re.find(&s[ix..end]).is_some() +} + /// Run the program with trace printing for debugging. pub fn run_trace(prog: &Prog, s: &str, pos: usize) -> Result>> { run(prog, s, pos, OPTION_TRACE, &RegexOptions::default()) @@ -629,7 +700,7 @@ pub(crate) fn run( } break 'fail; } - Insn::Backref(slot) => { + Insn::Backref { slot, casei } => { let lo = state.get(slot); if lo == usize::MAX { // Referenced group hasn't matched, so the backref doesn't match either @@ -642,7 +713,11 @@ pub(crate) fn run( } let ref_text = &s[lo..hi]; let ix_end = ix + ref_text.len(); - if !matches_literal(s, ix, ix_end, ref_text) { + if casei { + if !matches_literal_casei(s, ix, ix_end, ref_text) { + break 'fail; + } + } else if !matches_literal(s, ix, ix_end, ref_text) { break 'fail; } ix = ix_end; @@ -662,11 +737,12 @@ pub(crate) fn run( let count = state.stack_pop(); state.backtrack_cut(count); } - Insn::Delegate { + Insn::Delegate(Delegate { ref inner, + pattern: _, start_group, end_group, - } => { + }) => { let input = Input::new(s).span(ix..s.len()).anchored(Anchored::Yes); if start_group == end_group { // No groups, so we can use faster methods diff --git a/tests/captures.rs b/tests/captures.rs index dc559ff..688d056 100644 --- a/tests/captures.rs +++ b/tests/captures.rs @@ -198,6 +198,19 @@ fn captures_from_pos_looking_left() { assert_match(captures.get(1), "x", 1, 2); } +#[test] +fn captures_iter_collect_when_backtrack_limit_hit() { + use 
fancy_regex::RegexBuilder; + let r = RegexBuilder::new("(x+x+)+(?>y)") + .backtrack_limit(1) + .build() + .unwrap(); + let result: Vec<_> = r.captures_iter("xxxxxxxxxxy").collect(); + println!("{:?}", result); + assert_eq!(result.len(), 1); + assert!(result[0].is_err()); +} + #[cfg_attr(feature = "track_caller", track_caller)] fn captures<'a>(re: &str, text: &'a str) -> Captures<'a> { let regex = common::regex(re); @@ -352,20 +365,18 @@ fn expander_errors() { // Unmatched group number. assert_err!( exp.check("$2", &without_names), - Error::CompileError(CompileError::InvalidBackref) + Error::CompileError(CompileError::InvalidBackref(2)) ); assert_err!( exp.check("${2}", &without_names), - Error::CompileError(CompileError::InvalidBackref) + Error::CompileError(CompileError::InvalidBackref(2)) ); // Unmatched group name. - assert_err!( - exp.check("$xx", &with_names), - Error::CompileError(CompileError::InvalidBackref) + assert!( + matches!(exp.check("$xx", &with_names), Err(Error::CompileError(CompileError::InvalidGroupNameBackref(ref name))) if name == "xx"), ); - assert_err!( + assert!(matches!( exp.check("${xx}", &with_names), - Error::CompileError(CompileError::InvalidBackref) - ); + Err(Error::CompileError(CompileError::InvalidGroupNameBackref(ref name))) if name == "xx")); } diff --git a/tests/finding.rs b/tests/finding.rs index 1645445..1b2cd64 100644 --- a/tests/finding.rs +++ b/tests/finding.rs @@ -260,6 +260,17 @@ fn find_iter_attributes() { assert_eq!(regex.as_str(), matches.regex().as_str()); } +#[test] +fn find_iter_collect_when_backtrack_limit_hit() { + use fancy_regex::RegexBuilder; + let r = RegexBuilder::new("(x+x+)+(?=y)") + .backtrack_limit(1) + .build() + .unwrap(); + let result: Vec<_> = r.find_iter("xxxxxxxxxxy").collect(); + assert_eq!(result.len(), 1); +} + #[test] fn find_conditional() { assert_eq!(find(r"(?(ab)c|d)", "acd"), Some((2, 3))); @@ -267,6 +278,11 @@ fn find_conditional() { assert_eq!(find(r"(a)?b(?(1)c|d)", "abd"), Some((1, 3))); 
} +#[test] +fn find_endtext_before_newlines() { + assert_eq!(find(r"\Z", "hello\nworld\n\n\n"), Some((11, 11))); +} + fn find(re: &str, text: &str) -> Option<(usize, usize)> { find_match(re, text).map(|m| (m.start(), m.end())) } diff --git a/tests/matching.rs b/tests/matching.rs index 0686ff3..e3ca26d 100644 --- a/tests/matching.rs +++ b/tests/matching.rs @@ -92,7 +92,7 @@ fn atomic_group() { #[test] fn backtrack_limit() { - let re = RegexBuilder::new("(?i)(a|b|ab)*(?=c)") + let re = RegexBuilder::new("(?i)(a|b|ab)*(?>c)") .backtrack_limit(100_000) .build() .unwrap(); @@ -154,6 +154,49 @@ fn conditional_with_lookaround_condition() { assert_no_match(r"^(?((?=\d))\wabc|\d!)$", "5!"); } +#[test] +fn backrefs() { + assert_match(r"(abc)\1", "abcabc"); + assert_match(r"(abc|def)\1", "abcabc"); + assert_no_match(r"(abc|def)\1", "abcdef"); + assert_match(r"(abc|def)\1", "defdef"); + + assert_no_match(r"(abc|def)\1", "abcABC"); + assert_match(r"(abc|def)(?i:\1)", "abcABC"); + assert_match(r"(abc|def)(?i:\1)", "abcAbc"); + assert_no_match(r"(abc|def)(?i:\1)", "abcAB"); + assert_no_match(r"(abc|def)(?i:\1)", "abcdef"); + + assert_match(r"(δ)(?i:\1)", "δΔ"); + assert_no_match(r"(δ)\1", "δΔ"); + assert_no_match(r"(δδ)\1", "δΔfoo"); + assert_no_match(r"(δδ)\1", "δΔ"); + + assert_match(r"(.)(?i:\1)", "\\\\"); + assert_match(r"(.)(?i:\1)", "(("); + + assert_match(r"(.)(?i:\1)", "įĮ"); + assert_no_match(r"(.)(?i:\1)", "įi"); + assert_no_match(r"(.)(?i:\1)", "įĖ"); +} + +#[test] +fn easy_trailing_positive_lookaheads() { + assert_match(r"(?=c)", "abcabc"); + assert_match(r"abc(?=abc)", "abcabc"); + assert_no_match(r"abc(?=abc)", "abcdef"); + assert_match(r"abc(?=a|b)", "abcabc"); + assert_no_match(r"abc(?=a|f)", "f"); +} + +#[test] +fn hard_trailing_positive_lookaheads() { + assert_match(r"(abc|def)(?=\1)", "defdef"); + assert_match(r"(abc|def)(?=a(?!b))", "abca"); + assert_match(r"(abc|def)(?=a(?!b))", "abcaa"); + assert_no_match(r"(abc|def)(?=a(?!b))", "abcabc"); +} + 
#[cfg_attr(feature = "track_caller", track_caller)] fn assert_match(re: &str, text: &str) { let result = match_text(re, text); diff --git a/tests/oniguruma.rs b/tests/oniguruma.rs index d9e4482..b1a66c2 100644 --- a/tests/oniguruma.rs +++ b/tests/oniguruma.rs @@ -5,7 +5,7 @@ use std::panic; use regex::Regex; -use fancy_regex::Regex as FancyRegex; +use fancy_regex::RegexBuilder; #[derive(Debug, Eq, Hash, PartialEq)] struct Test { @@ -151,7 +151,7 @@ fn run_test(test: &Test) -> Option { .. } = test; - let compile_result = FancyRegex::new(pattern); + let compile_result = RegexBuilder::new(pattern).multi_line(true).build(); let Ok(regex) = compile_result else { let error = format!("{:?}", compile_result.unwrap_err()); return Some(format!("Compile failed: {}", error)); @@ -181,7 +181,6 @@ fn run_test(test: &Test) -> Option { } } Assertion::NoMatch => { - let regex = FancyRegex::new(pattern).unwrap(); let result = regex.find(&text).unwrap(); if result.is_some() { Some("Match found".to_string()) diff --git a/tests/oniguruma/test_utf8_ignore.c b/tests/oniguruma/test_utf8_ignore.c index 889d0f4..a320632 100644 --- a/tests/oniguruma/test_utf8_ignore.c +++ b/tests/oniguruma/test_utf8_ignore.c @@ -4,15 +4,6 @@ // x3 tests have an additional argument which is the group number to check. 
- // No match found - x2("^a", "\na", 1, 2); - - // Compile failed: ParseError(1, InvalidEscape("\\O")) - x2("$\\O", "bb\n", 2, 3); - - // Compile failed: ParseError(0, InvalidEscape("\\Z")) - x2("\\Z", "", 0, 0); - // Compile failed: ParseError(0, InvalidEscape("\\c")) x2("\\ca", "\001", 0, 1); @@ -37,15 +28,6 @@ // Compile failed: CompileError(InnerError(BuildError { kind: Syntax { pid: PatternID(0), err: Parse(Error { kind: ClassRangeInvalid, pattern: "[a-&&-a]", span: Span(Position(o: 1, l: 1, c: 2), Position(o: 4, l: 1, c: 5)) }) } })) x2("[a-&&-a]", "-", 0, 1); - // Compile failed: ParseError(2, InvalidEscape("\\Z")) - x2("\\A\\Z", "", 0, 0); - - // Compile failed: ParseError(3, InvalidEscape("\\Z")) - x2("xyz\\Z", "xyz", 0, 3); - - // Compile failed: ParseError(1, InvalidEscape("\\Z")) - x2("a\\Z", "a", 0, 1); - // No match found x2("(?i:ss)", "\xc3\x9f", 0, 2); @@ -79,12 +61,6 @@ // No match found x2("(?m:.b)", "a\nb", 1, 3); - // Compile failed: ParseError(3, InvalidEscape("\\Z")) - x2("a|b\\Z", "ba", 1, 2); - - // Compile failed: ParseError(3, InvalidEscape("\\Z")) - x2("a|b\\Z", "b", 0, 1); - // Match found at start 1 and end 2 (expected 0 and 2) x2("a(?i)b|c", "aC", 0, 2); @@ -103,49 +79,37 @@ // No match found x2("(?:abc)+?{2}", "abcabcabc", 0, 6); - // No match found - x2("(abc)(?i:\\1)", "abcABC", 0, 6); - // No match found x3("((?m:a.c))", "a\nc", 0, 3, 1); - // Compile failed: CompileError(InvalidBackref) - x2("(?:(?:\\1|z)(a))+$", "zaaa", 0, 4); - - // Compile failed: ParseError(3, InvalidEscape("\\Z")) - x2("(a*\\Z)\\1", "a", 1, 1); - - // Compile failed: ParseError(4, InvalidEscape("\\Z")) - x2(".(a*\\Z)\\1", "ba", 1, 2); - - // Compile failed: ParseError(3, InvalidEscape("\\g")) + // Compile failed: CompileError(FeatureNotYetSupported("Subroutine Call")) x2("(a)\\g<1>", "aa", 0, 2); - // Compile failed: ParseError(13, InvalidEscape("\\g")) + // Compile failed: CompileError(FeatureNotYetSupported("Subroutine Call")) x2("(?ab)\\g", "abab", 0, 4); 
- // Compile failed: ParseError(4, InvalidEscape("\\g")) + // Compile failed: CompileError(FeatureNotYetSupported("Subroutine Call")) x2("(?<=\\g)|-\\zEND (?XyZ)", "XyZ", 3, 3); - // Compile failed: ParseError(7, InvalidEscape("\\g")) + // Compile failed: CompileError(FeatureNotYetSupported("Subroutine Call")) x2("(?|a\\g)+", "", 0, 0); - // Compile failed: ParseError(8, InvalidEscape("\\g")) + // Compile failed: CompileError(FeatureNotYetSupported("Subroutine Call")) x2("(?|\\(\\g\\))+$", "()(())", 0, 6); - // Compile failed: ParseError(0, InvalidEscape("\\g")) + // Compile failed: CompileError(FeatureNotYetSupported("Subroutine Call") x3("\\g(?.){0}", "X", 0, 1, 1); - // Compile failed: ParseError(0, InvalidEscape("\\g")) + // Compile failed: CompileError(FeatureNotYetSupported("Subroutine Call") x2("\\g(abc|df(?.YZ){2,8}){0}", "XYZ", 0, 3); - // Compile failed: ParseError(9, InvalidEscape("\\g")) + // Compile failed: CompileError(FeatureNotYetSupported("Subroutine Call")) x2("\\A(?(a\\g)|)\\z", "aaaa", 0, 4); - // Compile failed: ParseError(6, InvalidEscape("\\g")) + // Compile failed: CompileError(FeatureNotYetSupported("Subroutine Call") x2("(?|\\g\\g)\\z|\\zEND (?a|(b)\\g)", "bbbbabba", 0, 8); - // Compile failed: ParseError(15, InvalidEscape("\\g")) + // Compile failed: CompileError(FeatureNotYetSupported("Subroutine Call")) x3("(z)()()(?<_9>a)\\g<_9>", "zaa", 2, 3, 1); // No match found @@ -154,37 +118,37 @@ // No match found x2("(?:(?.)|(?..)|(?...)|(?....)|(?.....)|(?......)|(?.......)|(?........)|(?.........)|(?..........)|(?...........)|(?............)|(?.............)|(?..............))\\k$", "a-pyumpyum", 2, 10); - // Compile failed: ParseError(11, InvalidEscape("\\g")) + // Compile failed: CompileError(FeatureNotYetSupported("Subroutine Call")) x2("(?a|\\(\\g\\))", "a", 0, 1); - // Compile failed: ParseError(11, InvalidEscape("\\g")) + // Compile failed: CompileError(FeatureNotYetSupported("Subroutine Call")) x2("(?a|\\(\\g\\))", "((((((a))))))", 0, 
13); - // Compile failed: ParseError(11, InvalidEscape("\\g")) + // Compile failed: CompileError(FeatureNotYetSupported("Subroutine Call")) x3("(?a|\\(\\g\\))", "((((((((a))))))))", 0, 17, 1); - // Compile failed: ParseError(0, InvalidEscape("\\g")) + // Compile failed: CompileError(FeatureNotYetSupported("Subroutine Call") x2("\\g|\\zEND(?.*abc$)", "abcxxxabc", 0, 9); - // Compile failed: ParseError(0, InvalidEscape("\\g")) + // Compile failed: CompileError(FeatureNotYetSupported("Subroutine Call")) x2("\\g<1>|\\zEND(.a.)", "bac", 0, 3); - // Compile failed: ParseError(0, InvalidEscape("\\g")) + // Compile failed: CompileError(FeatureNotYetSupported("Subroutine Call") x3("\\g<_A>\\g<_A>|\\zEND(.a.)(?<_A>.b.)", "xbxyby", 3, 6, 1); - // Compile failed: ParseError(5, InvalidEscape("\\g")) + // Compile failed: CompileError(FeatureNotYetSupported("Subroutine Call") x2("\\A(?:\\g|\\g|\\zEND (?a|c\\gc)(?b|d\\gd))$", "cdcbcdc", 0, 7); - // Compile failed: ParseError(9, InvalidEscape("\\g")) + // Compile failed: CompileError(FeatureNotYetSupported("Subroutine Call") x2("\\A(?|a\\g)\\z|\\zEND (?\\g)", "aaaa", 0, 4); - // Compile failed: ParseError(9, InvalidEscape("\\g")) + // Compile failed: CompileError(FeatureNotYetSupported("Subroutine Call")) x2("(?(a|b\\gc){3,5})", "baaaaca", 1, 5); - // Compile failed: ParseError(9, InvalidEscape("\\g")) + // Compile failed: CompileError(FeatureNotYetSupported("Subroutine Call")) x2("(?(a|b\\gc){3,5})", "baaaacaaaaa", 0, 10); - // Compile failed: ParseError(21, InvalidEscape("\\g")) + // Compile failed: CompileError(FeatureNotYetSupported("Subroutine Call")) x2("(?\\(([^\\(\\)]++|\\g)*+\\))", "((a))", 0, 5); // No match found @@ -193,57 +157,42 @@ // No match found x2("(?:()|())*\\1\\2", "", 0, 0); - // Compile failed: CompileError(InvalidBackref) + // Expected group to exist x3("(?:\\1a|())*", "a", 0, 0, 1); - // Compile failed: ParseError(16, InvalidEscape("\\Z")) - x2("x((.)*)*x(?i:\\1)\\Z", "0x1x2x1X2", 1, 9); - // No match found 
x2("(?:()|()|()|()|()|())*\\2\\5", "", 0, 0); // No match found x2("(?:()|()|()|(x)|()|())*\\2b\\5", "b", 0, 1); - // Compile failed: ParseError(12, InvalidEscape("\\g")) + // Compile failed: CompileError(FeatureNotYetSupported("Subroutine Call")) x3("(\\(((?:[^(]|\\g<1>)*)\\))", "(abc)(abc)", 1, 4, 2); // Compile failed: ParseError(0, InvalidEscape("\\o")) x2("\\o{101}", "A", 0, 1); - // Compile failed: ParseError(6, InvalidEscape("\\g")) + // Compile failed: CompileError(FeatureNotYetSupported("Subroutine Call")) x2("\\A(a|b\\g<1>c)\\k<1+3>\\z", "bbacca", 0, 6); - // Compile failed: ParseError(10, InvalidEscape("\\g")) + // Compile failed: CompileError(FeatureNotYetSupported("Subroutine Call")) x2("(?i)\\A(a|b\\g<1>c)\\k<1+2>\\z", "bBACcbac", 0, 8); - // No match found - x2("(?i)(?aa)|(?bb)\\k", "BBbb", 0, 4); - - // Compile failed: ParseError(5, InvalidGroupName) - x2("(?:\\k'+1'B|(A)C)*", "ACAB", 0, 4); - - // Compile failed: ParseError(0, InvalidEscape("\\g")) + // Compile failed: CompileError(FeatureNotYetSupported("Subroutine Call")) x2("\\g<+2>(abc)(ABC){0}", "ABCabc", 0, 6); - // Compile failed: ParseError(1, InvalidEscape("\\g")) + // Compile failed: CompileError(FeatureNotYetSupported("Subroutine Call")) x2("A\\g'0'|B()", "AAAAB", 0, 5); - // Compile failed: ParseError(2, InvalidEscape("\\g")) + // Compile failed: CompileError(FeatureNotYetSupported("Subroutine Call")) x3("(A\\g'0')|B", "AAAAB", 0, 5, 1); - // Compile failed: ParseError(10, GeneralParseError("expected conditional to be a backreference or at least an expression for when the condition is true")) - x2("(a*)(?(-1))aa", "aaaaa", 0, 5); - - // Compile failed: ParseError(7, GeneralParseError("expected close paren")) + // Compile failed: CompileError(FeatureNotYetSupported("Backref at recursion level")) x2("(a)(?(1+0)b|c)d", "abd", 0, 3); - // Compile failed: ParseError(5, UnknownFlag("(?'")) + // No match found x2("(?:(?'name'a)|(?'name'b))(?('name')c|d)e", "ace", 0, 3); - // Compile failed: 
ParseError(5, UnknownFlag("(?'")) - x2("(?:(?'name'a)|(?'name'b))(?('name')c|d)e", "bce", 0, 3); - // Compile failed: ParseError(0, InvalidEscape("\\R")) x2("\\R", "\r\n", 0, 2); @@ -262,18 +211,6 @@ // Compile failed: ParseError(0, InvalidEscape("\\N")) x2("\\N", "a", 0, 1); - // Compile failed: ParseError(0, InvalidEscape("\\O")) - x2("\\O", "a", 0, 1); - - // Compile failed: ParseError(0, InvalidEscape("\\O")) - x2("\\O", "\n", 0, 1); - - // Compile failed: ParseError(4, InvalidEscape("\\O")) - x2("(?m:\\O)", "\n", 0, 1); - - // Compile failed: ParseError(5, InvalidEscape("\\O")) - x2("(?-m:\\O)", "\n", 0, 1); - // No match found x2("(?:()|())*\\1", "abc", 0, 0); @@ -283,19 +220,19 @@ // No match found x2("(?:()|()|())*\\3\\1", "abc", 0, 0); - // Compile failed: ParseError(9, InvalidEscape("\\g")) + // Compile failed: CompileError(FeatureNotYetSupported("Subroutine Call")) x2("(|(?:a(?:\\g'1')*))b|", "abc", 0, 2); - // Compile failed: ParseError(14, InvalidEscape("\\g")) + // Compile failed: CompileError(FeatureNotYetSupported("Subroutine Call")) x2("((?abc){0}a\\gd)+", "aabcd", 0, 5); // Match found at start 0 and end 3 (expected 0 and 6) x2("(?a)(?b)(\\k)+", "abbaab", 0, 6); - // Compile failed: ParseError(8, InvalidEscape("\\g")) + // Compile failed: CompileError(FeatureNotYetSupported("Subroutine Call")) x2("(?$|b\\g)", "bbb", 0, 3); - // Compile failed: ParseError(16, InvalidEscape("\\g")) + // Compile failed: CompileError(FeatureNotYetSupported("Subroutine Call")) x2("(?(?(a)a|b)|c\\g)", "cccb", 0, 4); // Compile failed: ParseError(1, InvalidEscape("\\o")) @@ -442,27 +379,12 @@ // No match found x2("\\xca\\xb8", "\xca\xb8", 0, 2); - // Compile failed: ParseError(9, InvalidEscape("\\Z")) - x2("むめも\\Z", "むめも", 0, 9); - - // Compile failed: ParseError(9, InvalidEscape("\\Z")) - x2("かきく\\Z", "かきく\n", 0, 9); - // No match found x2("(?m:よ.)", "よ\n", 0, 4); // No match found x2("(?m:.め)", "ま\nめ", 3, 7); - // Compile failed: ParseError(7, InvalidEscape("\\Z")) - 
x2("鬼|車\\Z", "車鬼", 3, 6); - - // Compile failed: ParseError(7, InvalidEscape("\\Z")) - x2("鬼|車\\Z", "車", 0, 3); - - // Compile failed: ParseError(7, InvalidEscape("\\Z")) - x2("鬼|車\\Z", "車\n", 0, 3); - // No match found x2("(?:あい)?{2}", "", 0, 0); @@ -481,16 +403,10 @@ // No match found x3("((?m:あ.う))", "あ\nう", 0, 7, 1); - // Compile failed: ParseError(5, InvalidEscape("\\Z")) - x2("(あ*\\Z)\\1", "あ", 3, 3); - - // Compile failed: ParseError(6, InvalidEscape("\\Z")) - x2(".(あ*\\Z)\\1", "いあ", 3, 6); - - // Compile failed: ParseError(16, InvalidEscape("\\g")) + // Compile failed: CompileError(FeatureNotYetSupported("Subroutine Call")) x2("(?<愚か>変|\\(\\g<愚か>\\))", "((((((変))))))", 0, 15); - // Compile failed: ParseError(5, InvalidEscape("\\g")) + // Compile failed: CompileError(FeatureNotYetSupported("Subroutine Call") x2("\\A(?:\\g<阿_1>|\\g<云_2>|\\z終了 (?<阿_1>観|自\\g<云_2>自)(?<云_2>在|菩薩\\g<阿_1>菩薩))$", "菩薩自菩薩自在自菩薩自菩薩", 0, 39); // Compile failed: CompileError(InnerError(BuildError { kind: Syntax { pid: PatternID(0), err: Parse(Error { kind: ClassRangeInvalid, pattern: "[あ-&&-あ]", span: Span(Position(o: 1, l: 1, c: 2), Position(o: 6, l: 1, c: 5)) }) } })) @@ -734,16 +650,16 @@ // Compile failed: ParseError(3, TargetNotRepeatable) x2("(?(*FAIL)123|456)", "456", 0, 3); - // Compile failed: ParseError(0, InvalidEscape("\\g")) + // Compile failed: CompileError(FeatureNotYetSupported("Subroutine Call")) x2("\\g'0'++{,0}", "abcdefgh", 0, 0); - // Compile failed: ParseError(0, InvalidEscape("\\g")) + // Compile failed: CompileError(FeatureNotYetSupported("Subroutine Call")) x2("\\g'0'++{,0}?", "abcdefgh", 0, 0); - // Compile failed: ParseError(0, InvalidEscape("\\g")) + // Compile failed: CompileError(FeatureNotYetSupported("Subroutine Call")) x2("\\g'0'++{,0}b", "abcdefgh", 1, 2); - // Compile failed: ParseError(0, InvalidEscape("\\g")) + // Compile failed: CompileError(FeatureNotYetSupported("Subroutine Call")) x2("\\g'0'++{,0}?def", "abcdefgh", 3, 6); // Compile failed: 
CompileError(InnerError(BuildError { kind: Syntax { pid: PatternID(0), err: Parse(Error { kind: RepetitionCountInvalid, pattern: "a{3,2}b", span: Span(Position(o: 1, l: 1, c: 2), Position(o: 6, l: 1, c: 7)) }) } })) diff --git a/tests/regex_options.rs b/tests/regex_options.rs index 9fbd4d1..1cf56ff 100644 --- a/tests/regex_options.rs +++ b/tests/regex_options.rs @@ -1,44 +1,114 @@ +use fancy_regex::Regex; use fancy_regex::RegexBuilder; +pub fn build_regex(builder: &RegexBuilder) -> Regex { + let result = builder.build(); + assert!( + result.is_ok(), + "Expected regex to build successfully, got {:?}", + result.err() + ); + result.unwrap() +} + #[test] fn check_casing_option() { - let builder = RegexBuilder::new(r"TEST foo") - .case_insensitive(false) - .build(); + let regex = build_regex(RegexBuilder::new(r"TEST foo").case_insensitive(false)); - match builder { - Ok(regex) => { - assert!(regex.is_match(r"TEST foo").unwrap_or_default()); - assert!(!regex.is_match(r"test foo").unwrap_or_default()); - } - _ => panic!("builder should be able to compile with casing options"), - } + assert!(regex.is_match(r"TEST foo").unwrap_or_default()); + assert!(!regex.is_match(r"test foo").unwrap_or_default()); } #[test] fn check_override_casing_option() { - let builder = RegexBuilder::new(r"FOO(?i:bar)quux") - .case_insensitive(false) - .build(); + let regex = build_regex(RegexBuilder::new(r"FOO(?i:bar)quux").case_insensitive(false)); - match builder { - Ok(regex) => { - assert!(!regex.is_match("FoObarQuUx").unwrap_or_default()); - assert!(!regex.is_match("fooBARquux").unwrap_or_default()); - assert!(regex.is_match("FOObarquux").unwrap_or_default()); - } - _ => panic!("builder should be able to compile with casing options"), - } + assert!(!regex.is_match("FoObarQuUx").unwrap_or_default()); + assert!(!regex.is_match("fooBARquux").unwrap_or_default()); + assert!(regex.is_match("FOObarquux").unwrap_or_default()); } #[test] fn check_casing_insensitive_option() { - let builder = 
RegexBuilder::new(r"TEST FOO") - .case_insensitive(true) - .build(); - - match builder { - Ok(regex) => assert!(regex.is_match(r"test foo").unwrap_or_default()), - _ => panic!("builder should be able to compile with casing options"), - } + let regex = build_regex(RegexBuilder::new(r"TEST FOO").case_insensitive(true)); + + assert!(regex.is_match(r"test foo").unwrap_or_default()); +} + +#[test] +fn check_multi_line_option() { + let test_text = r"test +hugo +test"; + + let regex = build_regex(RegexBuilder::new(r"^test$").multi_line(true)); + assert!(regex.is_match(test_text).unwrap_or_default()); +} + +#[test] +fn check_ignore_whitespace_option() { + let regex = build_regex(RegexBuilder::new(r"test foo").ignore_whitespace(true)); + + let test_text = r"testfoo"; + assert!(regex.is_match(test_text).unwrap_or_default()); +} + +#[test] +fn check_dot_matches_new_line_option() { + let regex = build_regex(RegexBuilder::new(r"
<div>(.*?)<\/div>").dot_matches_new_line(true)); + + let test_text = r"<div>
+ hello</div>
"; + + assert!(regex.is_match(test_text).unwrap_or_default()); +} + +#[test] +fn check_casing_insensitive_option_hard() { + let regex = build_regex(RegexBuilder::new(r"[a-z](?<=[^f])").case_insensitive(true)); + + assert!(regex.is_match(r"J").unwrap_or_default()); + assert!(!regex.is_match(r"F").unwrap_or_default()); + assert!(regex.is_match(r"j").unwrap_or_default()); +} + +#[test] +fn check_ignore_whitespace_option_fancy() { + let regex = build_regex(RegexBuilder::new(r"(?=test foo)").ignore_whitespace(true)); + + let test_text = r"testfoo"; + + assert!(regex.is_match(test_text).unwrap_or_default()); +} + +#[test] +fn check_ignore_whitespace_with_lookahead_matches() { + let regex = build_regex(RegexBuilder::new(r"(?=test foo)").ignore_whitespace(true)); + + let test_text = r"test foo"; + + assert!(!regex.is_match(test_text).unwrap_or_default()); +} + +#[test] +fn check_verbose_mode_option() { + let pattern = " +test foo #hugo +"; + let regex = build_regex(RegexBuilder::new(pattern).verbose_mode(true)); + + let test_text = r"test foo"; + + assert!(!regex.is_match(test_text).unwrap_or_default()); +} + +#[test] +fn issue_163_fancy_email_test() { + let pattern = + r"^(?!\.)(?!.*\.\.)([a-z0-9_'+\-\.]*)[a-z0-9_'+\-]@([a-z0-9][a-z0-9\-]*\.)+[a-z]{2,}$"; + + let regex = build_regex(RegexBuilder::new(pattern).case_insensitive(true)); + + let test_text = "VALID@domain.com"; + assert!(regex.is_match(test_text).unwrap()); }