mirror of
https://git.proxmox.com/git/rustc
synced 2025-10-29 22:30:29 +00:00
196 lines
6.7 KiB
Rust
196 lines
6.7 KiB
Rust
use std::error::Error;
|
|
|
|
use regex_automata::{
|
|
hybrid::{
|
|
dfa::{self, DFA},
|
|
regex::Regex,
|
|
OverlappingState,
|
|
},
|
|
nfa::thompson,
|
|
HalfMatch, MatchError, MatchKind, MultiMatch,
|
|
};
|
|
|
|
use crate::util::{BunkPrefilter, SubstringPrefilter};
|
|
|
|
// Tests that too many cache resets cause the lazy DFA to quit.
|
|
//
|
|
// We only test this on 64-bit because the test is gingerly crafted based on
|
|
// implementation details of cache sizes. It's not a great test because of
|
|
// that, but it does check some interesting properties around how positions are
|
|
// reported when a search "gives up."
|
|
#[test]
|
|
#[cfg(target_pointer_width = "64")]
|
|
fn too_many_cache_resets_cause_quit() -> Result<(), Box<dyn Error>> {
|
|
// This is a carefully chosen regex. The idea is to pick one that requires
|
|
// some decent number of states (hence the bounded repetition). But we
|
|
// specifically choose to create a class with an ASCII letter and a
|
|
// non-ASCII letter so that we can check that no new states are created
|
|
// once the cache is full. Namely, if we fill up the cache on a haystack
|
|
// of 'a's, then in order to match one 'β', a new state will need to be
|
|
// created since a 'β' is encoded with multiple bytes. Since there's no
|
|
// room for this state, the search should quit at the very first position.
|
|
let pattern = r"[aβ]{100}";
|
|
let dfa = DFA::builder()
|
|
.configure(
|
|
// Configure it so that we have the minimum cache capacity
|
|
// possible. And that if any resets occur, the search quits.
|
|
DFA::config()
|
|
.skip_cache_capacity_check(true)
|
|
.cache_capacity(0)
|
|
.minimum_cache_clear_count(Some(0)),
|
|
)
|
|
.build(pattern)?;
|
|
let mut cache = dfa.create_cache();
|
|
|
|
let haystack = "a".repeat(101).into_bytes();
|
|
let err = MatchError::GaveUp { offset: 25 };
|
|
assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err.clone()));
|
|
assert_eq!(dfa.find_leftmost_fwd(&mut cache, &haystack), Err(err.clone()));
|
|
assert_eq!(
|
|
dfa.find_overlapping_fwd(
|
|
&mut cache,
|
|
&haystack,
|
|
&mut OverlappingState::start()
|
|
),
|
|
Err(err.clone())
|
|
);
|
|
|
|
let haystack = "β".repeat(101).into_bytes();
|
|
let err = MatchError::GaveUp { offset: 0 };
|
|
assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err));
|
|
// no need to test that other find routines quit, since we did that above
|
|
|
|
// OK, if we reset the cache, then we should be able to create more states
|
|
// and make more progress with searching for betas.
|
|
cache.reset(&dfa);
|
|
let err = MatchError::GaveUp { offset: 26 };
|
|
assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err));
|
|
|
|
// ... switching back to ASCII still makes progress since it just needs to
|
|
// set transitions on existing states!
|
|
let haystack = "a".repeat(101).into_bytes();
|
|
let err = MatchError::GaveUp { offset: 13 };
|
|
assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err));
|
|
|
|
Ok(())
|
|
}
|
|
|
|
// Tests that quit bytes in the forward direction work correctly.
|
|
#[test]
|
|
fn quit_fwd() -> Result<(), Box<dyn Error>> {
|
|
let dfa = DFA::builder()
|
|
.configure(DFA::config().quit(b'x', true))
|
|
.build("[[:word:]]+$")?;
|
|
let mut cache = dfa.create_cache();
|
|
|
|
assert_eq!(
|
|
dfa.find_earliest_fwd(&mut cache, b"abcxyz"),
|
|
Err(MatchError::Quit { byte: b'x', offset: 3 })
|
|
);
|
|
assert_eq!(
|
|
dfa.find_leftmost_fwd(&mut cache, b"abcxyz"),
|
|
Err(MatchError::Quit { byte: b'x', offset: 3 })
|
|
);
|
|
assert_eq!(
|
|
dfa.find_overlapping_fwd(
|
|
&mut cache,
|
|
b"abcxyz",
|
|
&mut OverlappingState::start()
|
|
),
|
|
Err(MatchError::Quit { byte: b'x', offset: 3 })
|
|
);
|
|
|
|
Ok(())
|
|
}
|
|
|
|
// Tests that quit bytes in the reverse direction work correctly.
|
|
#[test]
|
|
fn quit_rev() -> Result<(), Box<dyn Error>> {
|
|
let dfa = DFA::builder()
|
|
.configure(DFA::config().quit(b'x', true))
|
|
.thompson(thompson::Config::new().reverse(true))
|
|
.build("^[[:word:]]+")?;
|
|
let mut cache = dfa.create_cache();
|
|
|
|
assert_eq!(
|
|
dfa.find_earliest_rev(&mut cache, b"abcxyz"),
|
|
Err(MatchError::Quit { byte: b'x', offset: 3 })
|
|
);
|
|
assert_eq!(
|
|
dfa.find_leftmost_rev(&mut cache, b"abcxyz"),
|
|
Err(MatchError::Quit { byte: b'x', offset: 3 })
|
|
);
|
|
|
|
Ok(())
|
|
}
|
|
|
|
// Tests that if we heuristically enable Unicode word boundaries but then
|
|
// instruct that a non-ASCII byte should NOT be a quit byte, then the builder
|
|
// will panic.
|
|
#[test]
|
|
#[should_panic]
|
|
fn quit_panics() {
|
|
DFA::config().unicode_word_boundary(true).quit(b'\xFF', false);
|
|
}
|
|
|
|
// This tests an intesting case where even if the Unicode word boundary option
|
|
// is disabled, setting all non-ASCII bytes to be quit bytes will cause Unicode
|
|
// word boundaries to be enabled.
|
|
#[test]
|
|
fn unicode_word_implicitly_works() -> Result<(), Box<dyn Error>> {
|
|
let mut config = DFA::config();
|
|
for b in 0x80..=0xFF {
|
|
config = config.quit(b, true);
|
|
}
|
|
let dfa = DFA::builder().configure(config).build(r"\b")?;
|
|
let mut cache = dfa.create_cache();
|
|
let expected = HalfMatch::must(0, 1);
|
|
assert_eq!(dfa.find_leftmost_fwd(&mut cache, b" a"), Ok(Some(expected)));
|
|
Ok(())
|
|
}
|
|
|
|
// Tests that we can provide a prefilter to a Regex, and the search reports
|
|
// correct results.
|
|
#[test]
|
|
fn prefilter_works() -> Result<(), Box<dyn Error>> {
|
|
let mut re = Regex::new(r"a[0-9]+").unwrap();
|
|
re.set_prefilter(Some(Box::new(SubstringPrefilter::new("a"))));
|
|
let mut cache = re.create_cache();
|
|
|
|
let text = b"foo abc foo a1a2a3 foo a123 bar aa456";
|
|
let matches: Vec<(usize, usize)> = re
|
|
.find_leftmost_iter(&mut cache, text)
|
|
.map(|m| (m.start(), m.end()))
|
|
.collect();
|
|
assert_eq!(
|
|
matches,
|
|
vec![(12, 14), (14, 16), (16, 18), (23, 27), (33, 37),]
|
|
);
|
|
Ok(())
|
|
}
|
|
|
|
// This test confirms that a prefilter is active by using a prefilter that
|
|
// reports false negatives.
|
|
#[test]
|
|
fn prefilter_is_active() -> Result<(), Box<dyn Error>> {
|
|
let text = b"za123";
|
|
let mut re = Regex::new(r"a[0-9]+").unwrap();
|
|
let mut cache = re.create_cache();
|
|
|
|
re.set_prefilter(Some(Box::new(SubstringPrefilter::new("a"))));
|
|
assert_eq!(
|
|
re.find_leftmost(&mut cache, b"za123"),
|
|
Some(MultiMatch::must(0, 1, 5))
|
|
);
|
|
assert_eq!(
|
|
re.find_leftmost(&mut cache, b"a123"),
|
|
Some(MultiMatch::must(0, 0, 4))
|
|
);
|
|
re.set_prefilter(Some(Box::new(BunkPrefilter::new())));
|
|
assert_eq!(re.find_leftmost(&mut cache, b"za123"), None);
|
|
// This checks that the prefilter is used when first starting the search,
|
|
// instead of waiting until at least one transition has occurred.
|
|
assert_eq!(re.find_leftmost(&mut cache, b"a123"), None);
|
|
Ok(())
|
|
}
|