rustc/vendor/regex-automata-0.2.0/tests/hybrid/suite.rs
2024-05-29 11:23:22 +02:00

213 lines
7.2 KiB
Rust

use regex_automata::{
hybrid::{
dfa::DFA,
regex::{self, Regex},
},
nfa::thompson,
MatchKind, SyntaxConfig,
};
use regex_syntax as syntax;
use regex_test::{
bstr::{BString, ByteSlice},
CompiledRegex, Match, MatchKind as TestMatchKind, RegexTest, RegexTests,
SearchKind as TestSearchKind, TestResult, TestRunner,
};
use crate::{suite, Result};
/// Tests the default configuration of the hybrid NFA/DFA.
#[test]
fn default() -> Result<()> {
let builder = Regex::builder();
TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert();
Ok(())
}
/// Tests the hybrid NFA/DFA with NFA shrinking disabled.
///
/// This is actually the typical configuration one wants for a lazy DFA. NFA
/// shrinking is mostly only advantageous when building a full DFA since it
/// can sharply decrease the amount of time determinization takes. But NFA
/// shrinking is itself otherwise fairly expensive. Since a lazy DFA has
/// no compilation time (other than for building the NFA of course) before
/// executing a search, it's usually worth it to forgo NFA shrinking.
#[test]
fn no_nfa_shrink() -> Result<()> {
let mut builder = Regex::builder();
builder.thompson(thompson::Config::new().shrink(false));
TestRunner::new()?
// Without NFA shrinking, this test blows the default cache capacity.
.blacklist("expensive/regression-many-repeat-no-stack-overflow")
.test_iter(suite()?.iter(), compiler(builder))
.assert();
Ok(())
}
/// Tests the hybrid NFA/DFA when 'starts_for_each_pattern' is enabled.
#[test]
fn starts_for_each_pattern() -> Result<()> {
let mut builder = Regex::builder();
builder.dfa(DFA::config().starts_for_each_pattern(true));
TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert();
Ok(())
}
/// Tests the hybrid NFA/DFA when byte classes are disabled.
///
/// N.B. Disabling byte classes doesn't avoid any indirection at search time.
/// All it does is cause every byte value to be its own distinct equivalence
/// class.
#[test]
fn no_byte_classes() -> Result<()> {
let mut builder = Regex::builder();
builder.dfa(DFA::config().byte_classes(false));
TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert();
Ok(())
}
/// Tests that hybrid NFA/DFA never clears its cache for any test with the
/// default capacity.
///
/// N.B. If a regex suite test is added that causes the cache to be cleared,
/// then this should just skip that test. (Which can be done by calling the
/// 'blacklist' method on 'TestRunner'.)
#[test]
fn no_cache_clearing() -> Result<()> {
let mut builder = Regex::builder();
builder.dfa(DFA::config().minimum_cache_clear_count(Some(0)));
TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert();
Ok(())
}
/// Tests the hybrid NFA/DFA when the minimum cache capacity is set.
#[test]
fn min_cache_capacity() -> Result<()> {
let mut builder = Regex::builder();
builder
.dfa(DFA::config().cache_capacity(0).skip_cache_capacity_check(true));
TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert();
Ok(())
}
fn compiler(
mut builder: regex::Builder,
) -> impl FnMut(&RegexTest, &[BString]) -> Result<CompiledRegex> {
move |test, regexes| {
let regexes = regexes
.iter()
.map(|r| r.to_str().map(|s| s.to_string()))
.collect::<std::result::Result<Vec<String>, _>>()?;
// Check if our regex contains things that aren't supported by DFAs.
// That is, Unicode word boundaries when searching non-ASCII text.
let mut thompson = thompson::Builder::new();
thompson.syntax(config_syntax(test)).configure(config_thompson(test));
if let Ok(nfa) = thompson.build_many(&regexes) {
let non_ascii = test.input().iter().any(|&b| !b.is_ascii());
if nfa.has_word_boundary_unicode() && non_ascii {
return Ok(CompiledRegex::skip());
}
}
if !configure_regex_builder(test, &mut builder) {
return Ok(CompiledRegex::skip());
}
let re = builder.build_many(&regexes)?;
let mut cache = re.create_cache();
Ok(CompiledRegex::compiled(move |test| -> Vec<TestResult> {
run_test(&re, &mut cache, test)
}))
}
}
fn run_test(
re: &Regex,
cache: &mut regex::Cache,
test: &RegexTest,
) -> Vec<TestResult> {
let is_match = if re.is_match(cache, test.input()) {
TestResult::matched()
} else {
TestResult::no_match()
};
let is_match = is_match.name("is_match");
let find_matches = match test.search_kind() {
TestSearchKind::Earliest => {
let it = re
.find_earliest_iter(cache, test.input())
.take(test.match_limit().unwrap_or(std::usize::MAX))
.map(|m| Match {
id: m.pattern().as_usize(),
start: m.start(),
end: m.end(),
});
TestResult::matches(it).name("find_earliest_iter")
}
TestSearchKind::Leftmost => {
let it = re
.find_leftmost_iter(cache, test.input())
.take(test.match_limit().unwrap_or(std::usize::MAX))
.map(|m| Match {
id: m.pattern().as_usize(),
start: m.start(),
end: m.end(),
});
TestResult::matches(it).name("find_leftmost_iter")
}
TestSearchKind::Overlapping => {
let it = re
.find_overlapping_iter(cache, test.input())
.take(test.match_limit().unwrap_or(std::usize::MAX))
.map(|m| Match {
id: m.pattern().as_usize(),
start: m.start(),
end: m.end(),
});
TestResult::matches(it).name("find_overlapping_iter")
}
};
vec![is_match, find_matches]
}
/// Configures the given regex builder with all relevant settings on the given
/// regex test.
///
/// If the regex test has a setting that is unsupported, then this returns
/// false (implying the test should be skipped).
fn configure_regex_builder(
test: &RegexTest,
builder: &mut regex::Builder,
) -> bool {
let match_kind = match test.match_kind() {
TestMatchKind::All => MatchKind::All,
TestMatchKind::LeftmostFirst => MatchKind::LeftmostFirst,
TestMatchKind::LeftmostLongest => return false,
};
let dense_config = DFA::config()
.anchored(test.anchored())
.match_kind(match_kind)
.unicode_word_boundary(true);
let regex_config = Regex::config().utf8(test.utf8());
builder
.configure(regex_config)
.syntax(config_syntax(test))
.thompson(config_thompson(test))
.dfa(dense_config);
true
}
/// Configuration of a Thompson NFA compiler from a regex test.
fn config_thompson(test: &RegexTest) -> thompson::Config {
thompson::Config::new().utf8(test.utf8())
}
/// Configuration of the regex parser from a regex test.
fn config_syntax(test: &RegexTest) -> SyntaxConfig {
SyntaxConfig::new()
.case_insensitive(test.case_insensitive())
.unicode(test.unicode())
.utf8(test.utf8())
}