mirror of
https://git.proxmox.com/git/rustc
synced 2025-10-30 21:19:57 +00:00
281 lines
9.2 KiB
Rust
281 lines
9.2 KiB
Rust
use regex_automata::{
|
|
dfa::{self, dense, regex::Regex, sparse, Automaton},
|
|
nfa::thompson,
|
|
MatchKind, SyntaxConfig,
|
|
};
|
|
use regex_syntax as syntax;
|
|
|
|
use regex_test::{
|
|
bstr::{BString, ByteSlice},
|
|
CompiledRegex, Match, MatchKind as TestMatchKind, RegexTest, RegexTests,
|
|
SearchKind as TestSearchKind, TestResult, TestRunner,
|
|
};
|
|
|
|
use crate::{suite, Result};
|
|
|
|
/// Runs the test suite with the default configuration.
|
|
#[test]
|
|
fn unminimized_default() -> Result<()> {
|
|
let builder = Regex::builder();
|
|
TestRunner::new()?
|
|
.test_iter(suite()?.iter(), dense_compiler(builder))
|
|
.assert();
|
|
Ok(())
|
|
}
|
|
|
|
/// Runs the test suite with byte classes disabled.
|
|
#[test]
|
|
fn unminimized_no_byte_class() -> Result<()> {
|
|
let mut builder = Regex::builder();
|
|
builder.dense(dense::Config::new().byte_classes(false));
|
|
|
|
TestRunner::new()?
|
|
.test_iter(suite()?.iter(), dense_compiler(builder))
|
|
.assert();
|
|
Ok(())
|
|
}
|
|
|
|
/// Runs the test suite with NFA shrinking disabled.
|
|
#[test]
|
|
fn unminimized_no_nfa_shrink() -> Result<()> {
|
|
let mut builder = Regex::builder();
|
|
builder.thompson(thompson::Config::new().shrink(false));
|
|
|
|
TestRunner::new()?
|
|
.test_iter(suite()?.iter(), dense_compiler(builder))
|
|
.assert();
|
|
Ok(())
|
|
}
|
|
|
|
/// Runs the test suite on a minimized DFA with an otherwise default
|
|
/// configuration.
|
|
#[test]
|
|
fn minimized_default() -> Result<()> {
|
|
let mut builder = Regex::builder();
|
|
builder.dense(dense::Config::new().minimize(true));
|
|
TestRunner::new()?
|
|
// These regexes tend to be too big. Minimization takes... forever.
|
|
.blacklist("expensive")
|
|
.test_iter(suite()?.iter(), dense_compiler(builder))
|
|
.assert();
|
|
Ok(())
|
|
}
|
|
|
|
/// Runs the test suite on a minimized DFA with byte classes disabled.
|
|
#[test]
|
|
fn minimized_no_byte_class() -> Result<()> {
|
|
let mut builder = Regex::builder();
|
|
builder.dense(dense::Config::new().minimize(true).byte_classes(false));
|
|
|
|
TestRunner::new()?
|
|
// These regexes tend to be too big. Minimization takes... forever.
|
|
.blacklist("expensive")
|
|
.test_iter(suite()?.iter(), dense_compiler(builder))
|
|
.assert();
|
|
Ok(())
|
|
}
|
|
|
|
/// Runs the test suite on a sparse unminimized DFA.
|
|
#[test]
|
|
fn sparse_unminimized_default() -> Result<()> {
|
|
let builder = Regex::builder();
|
|
TestRunner::new()?
|
|
.test_iter(suite()?.iter(), sparse_compiler(builder))
|
|
.assert();
|
|
Ok(())
|
|
}
|
|
|
|
/// Another basic sanity test that checks we can serialize and then deserialize
|
|
/// a regex, and that the resulting regex can be used for searching correctly.
|
|
#[test]
|
|
fn serialization_unminimized_default() -> Result<()> {
|
|
let builder = Regex::builder();
|
|
let my_compiler = |builder| {
|
|
compiler(builder, |builder, re| {
|
|
let builder = builder.clone();
|
|
let (fwd_bytes, _) = re.forward().to_bytes_native_endian();
|
|
let (rev_bytes, _) = re.reverse().to_bytes_native_endian();
|
|
Ok(CompiledRegex::compiled(move |test| -> Vec<TestResult> {
|
|
let fwd: dense::DFA<&[u32]> =
|
|
dense::DFA::from_bytes(&fwd_bytes).unwrap().0;
|
|
let rev: dense::DFA<&[u32]> =
|
|
dense::DFA::from_bytes(&rev_bytes).unwrap().0;
|
|
let re = builder.build_from_dfas(fwd, rev);
|
|
|
|
run_test(&re, test)
|
|
}))
|
|
})
|
|
};
|
|
TestRunner::new()?
|
|
.test_iter(suite()?.iter(), my_compiler(builder))
|
|
.assert();
|
|
Ok(())
|
|
}
|
|
|
|
/// A basic sanity test that checks we can serialize and then deserialize a
|
|
/// regex using sparse DFAs, and that the resulting regex can be used for
|
|
/// searching correctly.
|
|
#[test]
|
|
fn sparse_serialization_unminimized_default() -> Result<()> {
|
|
let builder = Regex::builder();
|
|
let my_compiler = |builder| {
|
|
compiler(builder, |builder, re| {
|
|
let builder = builder.clone();
|
|
let fwd_bytes = re.forward().to_sparse()?.to_bytes_native_endian();
|
|
let rev_bytes = re.reverse().to_sparse()?.to_bytes_native_endian();
|
|
Ok(CompiledRegex::compiled(move |test| -> Vec<TestResult> {
|
|
let fwd: sparse::DFA<&[u8]> =
|
|
sparse::DFA::from_bytes(&fwd_bytes).unwrap().0;
|
|
let rev: sparse::DFA<&[u8]> =
|
|
sparse::DFA::from_bytes(&rev_bytes).unwrap().0;
|
|
let re = builder.build_from_dfas(fwd, rev);
|
|
run_test(&re, test)
|
|
}))
|
|
})
|
|
};
|
|
TestRunner::new()?
|
|
.test_iter(suite()?.iter(), my_compiler(builder))
|
|
.assert();
|
|
Ok(())
|
|
}
|
|
|
|
fn dense_compiler(
|
|
builder: dfa::regex::Builder,
|
|
) -> impl FnMut(&RegexTest, &[BString]) -> Result<CompiledRegex> {
|
|
compiler(builder, |_, re| {
|
|
Ok(CompiledRegex::compiled(move |test| -> Vec<TestResult> {
|
|
run_test(&re, test)
|
|
}))
|
|
})
|
|
}
|
|
|
|
fn sparse_compiler(
|
|
builder: dfa::regex::Builder,
|
|
) -> impl FnMut(&RegexTest, &[BString]) -> Result<CompiledRegex> {
|
|
compiler(builder, |builder, re| {
|
|
let fwd = re.forward().to_sparse()?;
|
|
let rev = re.reverse().to_sparse()?;
|
|
let re = builder.build_from_dfas(fwd, rev);
|
|
Ok(CompiledRegex::compiled(move |test| -> Vec<TestResult> {
|
|
run_test(&re, test)
|
|
}))
|
|
})
|
|
}
|
|
|
|
fn compiler(
|
|
mut builder: dfa::regex::Builder,
|
|
mut create_matcher: impl FnMut(
|
|
&dfa::regex::Builder,
|
|
Regex,
|
|
) -> Result<CompiledRegex>,
|
|
) -> impl FnMut(&RegexTest, &[BString]) -> Result<CompiledRegex> {
|
|
move |test, regexes| {
|
|
let regexes = regexes
|
|
.iter()
|
|
.map(|r| r.to_str().map(|s| s.to_string()))
|
|
.collect::<std::result::Result<Vec<String>, _>>()?;
|
|
|
|
// Check if our regex contains things that aren't supported by DFAs.
|
|
// That is, Unicode word boundaries when searching non-ASCII text.
|
|
let mut thompson = thompson::Builder::new();
|
|
thompson.configure(config_thompson(test));
|
|
// TODO: Modify Hir to report facts like this, instead of needing to
|
|
// build an NFA to do it.
|
|
if let Ok(nfa) = thompson.build_many(®exes) {
|
|
let non_ascii = test.input().iter().any(|&b| !b.is_ascii());
|
|
if nfa.has_word_boundary_unicode() && non_ascii {
|
|
return Ok(CompiledRegex::skip());
|
|
}
|
|
}
|
|
if !configure_regex_builder(test, &mut builder) {
|
|
return Ok(CompiledRegex::skip());
|
|
}
|
|
create_matcher(&builder, builder.build_many(®exes)?)
|
|
}
|
|
}
|
|
|
|
fn run_test<A: Automaton>(re: &Regex<A>, test: &RegexTest) -> Vec<TestResult> {
|
|
let is_match = if re.is_match(test.input()) {
|
|
TestResult::matched()
|
|
} else {
|
|
TestResult::no_match()
|
|
};
|
|
let is_match = is_match.name("is_match");
|
|
|
|
let find_matches = match test.search_kind() {
|
|
TestSearchKind::Earliest => {
|
|
let it = re
|
|
.find_earliest_iter(test.input())
|
|
.take(test.match_limit().unwrap_or(std::usize::MAX))
|
|
.map(|m| Match {
|
|
id: m.pattern().as_usize(),
|
|
start: m.start(),
|
|
end: m.end(),
|
|
});
|
|
TestResult::matches(it).name("find_earliest_iter")
|
|
}
|
|
TestSearchKind::Leftmost => {
|
|
let it = re
|
|
.find_leftmost_iter(test.input())
|
|
.take(test.match_limit().unwrap_or(std::usize::MAX))
|
|
.map(|m| Match {
|
|
id: m.pattern().as_usize(),
|
|
start: m.start(),
|
|
end: m.end(),
|
|
});
|
|
TestResult::matches(it).name("find_leftmost_iter")
|
|
}
|
|
TestSearchKind::Overlapping => {
|
|
let it = re
|
|
.find_overlapping_iter(test.input())
|
|
.take(test.match_limit().unwrap_or(std::usize::MAX))
|
|
.map(|m| Match {
|
|
id: m.pattern().as_usize(),
|
|
start: m.start(),
|
|
end: m.end(),
|
|
});
|
|
TestResult::matches(it).name("find_overlapping_iter")
|
|
}
|
|
};
|
|
|
|
vec![is_match, find_matches]
|
|
}
|
|
|
|
/// Configures the given regex builder with all relevant settings on the given
|
|
/// regex test.
|
|
///
|
|
/// If the regex test has a setting that is unsupported, then this returns
|
|
/// false (implying the test should be skipped).
|
|
fn configure_regex_builder(
|
|
test: &RegexTest,
|
|
builder: &mut dfa::regex::Builder,
|
|
) -> bool {
|
|
let match_kind = match test.match_kind() {
|
|
TestMatchKind::All => MatchKind::All,
|
|
TestMatchKind::LeftmostFirst => MatchKind::LeftmostFirst,
|
|
TestMatchKind::LeftmostLongest => return false,
|
|
};
|
|
|
|
let syntax_config = SyntaxConfig::new()
|
|
.case_insensitive(test.case_insensitive())
|
|
.unicode(test.unicode())
|
|
.utf8(test.utf8());
|
|
let dense_config = dense::Config::new()
|
|
.anchored(test.anchored())
|
|
.match_kind(match_kind)
|
|
.unicode_word_boundary(true);
|
|
let regex_config = Regex::config().utf8(test.utf8());
|
|
|
|
builder
|
|
.configure(regex_config)
|
|
.syntax(syntax_config)
|
|
.thompson(config_thompson(test))
|
|
.dense(dense_config);
|
|
true
|
|
}
|
|
|
|
/// Configuration of a Thompson NFA compiler from a regex test.
|
|
fn config_thompson(test: &RegexTest) -> thompson::Config {
|
|
thompson::Config::new().utf8(test.utf8())
|
|
}
|