commit d5c0e44a9dd774fd5f2c65a6ba3b228209f24575 Author: Wolfgang Bumiller Date: Fri Apr 24 17:48:01 2020 +0200 initial import Signed-off-by: Wolfgang Bumiller diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..96ef6c0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +/target +Cargo.lock diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..91a938f --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "pathpatterns" +version = "0.1.0" +authors = ["Wolfgang Bumiller "] +edition = "2018" + +[dependencies] +bitflags = "1.2" +libc = "0.2" diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..7fe0c3e --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,98 @@ +//! Include/Exclude file pattern matching. +//! +//! This implements a glob `Pattern` similar to `git`'s matching done in include/exclude files, and +//! some helper methods to figure out whether a file should be included given its designated path +//! and a list of include/exclude patterns. +//! +//! Here's a rather long matching example: +//! +//! ``` +//! # use pathpatterns::*; +//! # fn test() -> Result<(), Box<dyn std::error::Error>> { +//! let file_list: &'static [&'static [u8]] = &[ +//! b"/things", +//! b"/things/file1.dat", +//! b"/things/file2.dat", +//! b"/things/shop", +//! b"/things/shop/info.txt", +//! b"/things/shop/apples", +//! b"/things/shop/apples/gala.txt", +//! b"/things/shop/apples/golden-delicious.txt", +//! b"/things/shop/bananas", +//! b"/things/shop/bananas/straight.txt", +//! b"/things/shop/bananas/curved.txt", +//! b"/things/shop/bananas/curved.bak", +//! b"/things/shop/bananas/other.txt", +//! ]; +//! +//! let mut list = MatchList::new(vec![ +//! MatchEntry::include(Pattern::path("shop")?), +//! MatchEntry::exclude(Pattern::path("bananas")?), +//! MatchEntry::include(Pattern::path("bananas/curved.*")?), +//! ]); +//! +//! assert_eq!(list.matches("/things", None), None); +//! assert_eq!(list.matches("/things/shop", None), Some(MatchType::Include)); +//! 
assert_eq!(list.matches("/things/shop/bananas", None), Some(MatchType::Exclude)); +//! assert_eq!(list.matches("/things/shop/bananas/curved.txt", None), Some(MatchType::Include)); +//! assert_eq!(list.matches("/things/shop/bananas/curved.bak", None), Some(MatchType::Include)); +//! +//! // this will exclude the curved.bak file +//! list.push(MatchEntry::exclude(Pattern::path("curved.bak")?)); +//! assert_eq!(list.matches("/things/shop/bananas/curved.bak", None), Some(MatchType::Exclude)); +//! list.pop(); +//! # assert_eq!(list.matches("/things/shop/bananas/curved.bak", None), Some(MatchType::Include)); +//! +//! // but this will not: +//! list.push(MatchEntry::new(Pattern::path("curved.bak")?, MatchType::Exclude, MatchFlag::ANCHORED)); +//! assert_eq!(list.matches("/things/shop/bananas/curved.bak", None), Some(MatchType::Include)); +//! list.pop(); +//! +//! // let's check some patterns, anything starting with a 'c', 'f' or 's': +//! let mut list = MatchList::new(vec![ +//! MatchEntry::include(Pattern::path("[cfs]*")?), +//! ]); +//! assert_eq!(list.matches("/things", None), None); +//! assert_eq!(list.matches("/things/file1.dat", None), Some(MatchType::Include)); +//! assert_eq!(list.matches("/things/file2.dat", None), Some(MatchType::Include)); +//! assert_eq!(list.matches("/things/shop", None), Some(MatchType::Include)); +//! assert_eq!(list.matches("/things/shop/info.txt", None), None); +//! assert_eq!(list.matches("/things/shop/apples", None), None); +//! assert_eq!(list.matches("/things/shop/apples/gala.txt", None), None); +//! assert_eq!(list.matches("/things/shop/apples/golden-delicious.txt", None), None); +//! assert_eq!(list.matches("/things/shop/bananas", None), None); +//! assert_eq!(list.matches("/things/shop/bananas/straight.txt", None), Some(MatchType::Include)); +//! assert_eq!(list.matches("/things/shop/bananas/curved.txt", None), Some(MatchType::Include)); +//! 
assert_eq!(list.matches("/shop/bananas/curved.bak", None), Some(MatchType::Include)); +//! assert_eq!(list.matches("/things/shop/bananas/other.txt", None), None); +//! +//! // If we add `**` we end up including the entire `shop/` subtree: +//! list.push(MatchEntry::include(Pattern::path("[cfs]*/**")?)); +//! assert_eq!(list.matches("/things", None), None); +//! assert_eq!(list.matches("/things/file1.dat", None), Some(MatchType::Include)); +//! assert_eq!(list.matches("/things/file2.dat", None), Some(MatchType::Include)); +//! assert_eq!(list.matches("/things/shop", None), Some(MatchType::Include)); +//! assert_eq!(list.matches("/things/shop/info.txt", None), Some(MatchType::Include)); +//! assert_eq!(list.matches("/things/shop/apples", None), Some(MatchType::Include)); +//! assert_eq!(list.matches("/things/shop/apples/gala.txt", None), Some(MatchType::Include)); +//! assert_eq!(list.matches("/shop/apples/golden-delicious.txt", None), Some(MatchType::Include)); +//! assert_eq!(list.matches("/things/shop/bananas", None), Some(MatchType::Include)); +//! assert_eq!(list.matches("/things/shop/bananas/straight.txt", None), Some(MatchType::Include)); +//! assert_eq!(list.matches("/things/shop/bananas/curved.txt", None), Some(MatchType::Include)); +//! assert_eq!(list.matches("/shop/bananas/curved.bak", None), Some(MatchType::Include)); +//! assert_eq!(list.matches("/shop/bananas/other.txt", None), Some(MatchType::Include)); +//! +//! // +//! # Ok(()) +//! # } +//! # test().unwrap() +//! ``` + +mod match_list; +mod pattern; + +#[doc(inline)] +pub use match_list::{MatchEntry, MatchFlag, MatchList, MatchListRef, MatchPattern, MatchType}; + +#[doc(inline)] +pub use pattern::{ParseError, Pattern, PatternFlag}; diff --git a/src/match_list.rs b/src/match_list.rs new file mode 100644 index 0000000..4c43d23 --- /dev/null +++ b/src/match_list.rs @@ -0,0 +1,293 @@ +//! Helpers for include/exclude lists. + +use bitflags::bitflags; + +#[rustfmt::skip] +bitflags! 
{ + /// These flags influence what kind of paths should be matched. + pub struct MatchFlag: u16 { + /// Match only a complete entry. The pattern `bar` will not match `/foo/bar`. + const ANCHORED = 0x00_01; + + const MATCH_DIRECTORIES = 0x01_00; + const MATCH_REGULAR_FILES = 0x02_00; + const MATCH_SYMLINKS = 0x04_00; + const MATCH_SOCKETS = 0x08_00; + const MATCH_FIFOS = 0x10_00; + const MATCH_CHARDEVS = 0x20_00; + const MATCH_BLOCKDEVS = 0x40_00; + const MATCH_DEVICES = + MatchFlag::MATCH_CHARDEVS.bits() | MatchFlag::MATCH_BLOCKDEVS.bits(); + + /// This is the default. + const ANY_FILE_TYPE = + MatchFlag::MATCH_DIRECTORIES.bits() + | MatchFlag::MATCH_REGULAR_FILES.bits() + | MatchFlag::MATCH_SYMLINKS.bits() + | MatchFlag::MATCH_SOCKETS.bits() + | MatchFlag::MATCH_FIFOS.bits() + | MatchFlag::MATCH_CHARDEVS.bits() + | MatchFlag::MATCH_BLOCKDEVS.bits(); + } +} + +impl Default for MatchFlag { + fn default() -> Self { + Self::ANY_FILE_TYPE + } +} + +/// A pattern entry. For now this only contains glob patterns, but we may want to add regex +/// patterns or user defined callback functions later on as well. +/// +/// For regex we'd likely use the POSIX extended REs via `regexec(3)`, since we're targetting +/// command line interfaces and want something command line users are used to. +#[derive(Clone, Debug)] +pub enum MatchPattern { + /// A glob pattern. + Pattern(crate::Pattern), +} + +impl From for MatchPattern { + fn from(pattern: crate::Pattern) -> Self { + MatchPattern::Pattern(pattern) + } +} + +/// A pattern can be used as an include or an exclude pattern. In a list of `MatchEntry`s, later +/// patterns take precedence over earlier patterns and the order of includes vs excludes makes a +/// difference. 
+#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum MatchType { + Include, + Exclude, +} + +/// Convenience helpers +impl MatchType { + pub fn is_include(self) -> bool { + self == MatchType::Include + } + + pub fn is_exclude(self) -> bool { + self == MatchType::Exclude + } +} + +/// A single entry in a `MatchList`. +#[derive(Clone, Debug)] +pub struct MatchEntry { + pattern: MatchPattern, + ty: MatchType, + flags: MatchFlag, +} + +impl MatchEntry { + /// Create a new match entry. + pub fn new>(pattern: T, ty: MatchType, flags: MatchFlag) -> Self { + Self { + pattern: pattern.into(), + ty, + flags, + } + } + + /// Create a new include-type match entry with default flags. + pub fn include>(pattern: T) -> Self { + Self::new(pattern.into(), MatchType::Include, MatchFlag::default()) + } + + /// Create a new exclude-type match entry with default flags. + pub fn exclude>(pattern: T) -> Self { + Self::new(pattern.into(), MatchType::Exclude, MatchFlag::default()) + } + + #[inline] + pub fn match_type(&self) -> MatchType { + self.ty + } + + /// Test this entry's file type restrictions against a file mode retrieved from `stat()`. + pub fn matches_mode(&self, file_mode: u32) -> bool { + // bitflags' `.contains` means ALL bits must be set, if they are all set we don't + // need to check the mode... + if self.flags.contains(MatchFlag::ANY_FILE_TYPE) { + return true; + } + + let flag = match file_mode & libc::S_IFMT { + libc::S_IFDIR => MatchFlag::MATCH_DIRECTORIES, + libc::S_IFREG => MatchFlag::MATCH_REGULAR_FILES, + libc::S_IFLNK => MatchFlag::MATCH_SYMLINKS, + libc::S_IFSOCK => MatchFlag::MATCH_SOCKETS, + libc::S_IFIFO => MatchFlag::MATCH_FIFOS, + libc::S_IFCHR => MatchFlag::MATCH_CHARDEVS, + libc::S_IFBLK => MatchFlag::MATCH_BLOCKDEVS, + _unknown => return false, + }; + self.flags.intersects(flag) + } + + /// Test whether this entry's pattern matches any complete suffix of a path. 
+ /// + /// For the path `/foo/bar/baz`, this tests whether `baz`, `bar/baz` or `foo/bar/baz` is + /// matched. + pub fn matches_path_suffix>(&self, path: T) -> bool { + self.matches_path_suffix_do(path.as_ref()) + } + + fn matches_path_suffix_do(&self, path: &[u8]) -> bool { + if self.flags.intersects(MatchFlag::ANCHORED) { + return self.matches_path_exact(path); + } + + if path.is_empty() { + return false; + } + + for start in (0..path.len()).rev() { + if path[start] == b'/' && self.matches_path_exact(&path[(start + 1)..]) { + return true; + } + } + + if path[0] != b'/' { + // we had "foo/bar", so we haven't yet tried to match the whole string: + self.matches_path_exact(path) + } else { + false + } + } + + /// Test whether this entry's pattern matches a path exactly. + pub fn matches_path_exact>(&self, path: T) -> bool { + self.matches_path_exact_do(path.as_ref()) + } + + fn matches_path_exact_do(&self, path: &[u8]) -> bool { + match &self.pattern { + MatchPattern::Pattern(pattern) => pattern.matches(path), + } + } + + /// Check whether the path contains a matching suffix and the file mode match the expected file modes. + /// This is a combination of using `.matches_mode()` and `.matches_path_suffix()`. + pub fn matches>(&self, path: T, file_mode: Option) -> bool { + self.matches_do(path.as_ref(), file_mode) + } + + fn matches_do(&self, path: &[u8], file_mode: Option) -> bool { + if let Some(mode) = file_mode { + if !self.matches_mode(mode) { + return false; + } + } + + self.matches_path_suffix(path) + } + + /// Check whether the path contains a matching suffix and the file mode match the expected file modes. + /// This is a combination of using `.matches_mode()` and `.matches_path_exact()`. 
+ pub fn matches_exact>(&self, path: T, file_mode: Option) -> bool { + self.matches_exact_do(path.as_ref(), file_mode) + } + + fn matches_exact_do(&self, path: &[u8], file_mode: Option) -> bool { + if let Some(mode) = file_mode { + if !self.matches_mode(mode) { + return false; + } + } + + self.matches_path_exact(path) + } +} + +/// Convenience type for an ordered list of `MatchEntry`s. This is just a `Vec`. +#[derive(Clone, Debug, Default)] +pub struct MatchList { + list: Vec, +} + +impl MatchList { + pub fn new>>(list: T) -> Self { + Self { list: list.into() } + } + + pub fn push(&mut self, entry: MatchEntry) { + self.list.push(entry) + } + + pub fn pop(&mut self) -> Option { + self.list.pop() + } +} + +impl From> for MatchList { + fn from(list: Vec) -> Self { + Self { list } + } +} + +impl std::ops::Deref for MatchList { + type Target = MatchListRef; + + fn deref(&self) -> &Self::Target { + (&self.list[..]).into() + } +} + +/// Helper to provide the `matches` method on slices of `MatchEntry`s. +#[repr(transparent)] +pub struct MatchListRef([MatchEntry]); + +impl std::ops::Deref for MatchListRef { + type Target = [MatchEntry]; + + fn deref(&self) -> &Self::Target { + &self.0[..] + } +} + +impl<'a> From<&'a [MatchEntry]> for &'a MatchListRef { + fn from(entries: &'a [MatchEntry]) -> &'a MatchListRef { + unsafe { &*(entries as *const [MatchEntry] as *const MatchListRef) } + } +} + +impl MatchListRef { + /// Check whether this list contains anything matching a prefix of the specified path, and the + /// specified file mode. + pub fn matches>(&self, path: T, file_mode: Option) -> Option { + self.matches_do(path.as_ref(), file_mode) + } + + fn matches_do(&self, path: &[u8], file_mode: Option) -> Option { + for m in self.iter().rev() { + if m.matches(path, file_mode) { + return Some(m.match_type()); + } + } + + None + } + + /// Check whether this list contains anything exactly matching the path and mode. 
+ pub fn matches_exact>( + &self, + path: T, + file_mode: Option, + ) -> Option { + self.matches_exact_do(path.as_ref(), file_mode) + } + + fn matches_exact_do(&self, path: &[u8], file_mode: Option) -> Option { + for m in self.iter().rev() { + if m.matches_exact(path, file_mode) { + return Some(m.match_type()); + } + } + + None + } +} diff --git a/src/pattern.rs b/src/pattern.rs new file mode 100644 index 0000000..8dbd4ee --- /dev/null +++ b/src/pattern.rs @@ -0,0 +1,704 @@ +//! This implements the pattern matching algorithm found in git's `wildmatch.c` + +use std::fmt; +use std::mem; +use std::ops::RangeInclusive; + +use bitflags::bitflags; + +/// A character class can be a list of characters `[abc]`, ranges of characters `[a-z]`, or named +/// classes `[[:digit:]]`, or a combination of them all. Additionally they can be negated with a +/// `^` at the beginning. +#[derive(Clone, Debug, Default)] +struct CharacterClass { + negated: bool, + named: Vec bool>, + listed: Vec, + ranges: Vec>, +} + +impl CharacterClass { + /// Check if a byte match this character class. + pub fn matches(&self, ch: u8) -> bool { + self.matches_do(ch) != self.negated + } + + fn matches_do(&self, ch: u8) -> bool { + self.listed.contains(&ch) + || self.ranges.iter().any(|range| range.contains(&ch)) + || self.named.iter().any(|func| func(ch)) + } +} + +/// One component of a pattern. +#[derive(Clone, Debug)] +enum Component { + /// A literal match. The `/a` and `.txt` in `/a*.txt`. + Literal(Vec), + + /// A question mark should match exactly one byte. If desired it may also match slashes, but + /// this property is part of the whole pattern, not the component. + QuestionMark, + + /// A "star" normally matches everything except when matching path names, where it does not + /// match slashes. + Star, + + /// A double star always matches everything, even slashes. + StarStar, + + // SlashStarStarSlash, // maybe? + /// A character class matches one byte out of a set of allowed or disallowed bytes. 
+ Class(CharacterClass), +} + +impl Component { + /// Check if this is a literal component ending with a slash. + fn ends_with_slash(&self) -> bool { + match self { + Component::Literal(lit) => lit.last().copied() == Some(b'/'), + _ => false, + } + } + + /// Check if this is a literal component starting with a slash. + fn starts_with_slash(&self) -> bool { + match self { + Component::Literal(lit) => { + lit.first().copied() == Some(b'/') + || (lit.first().copied() == Some(b'\\') && lit.get(1).copied() == Some(b'/')) + } + _ => false, + } + } +} + +bitflags! { + /// Flags affecting how a pattern should match. + pub struct PatternFlag: u8 { + /// Ignore upper/lower case on the pattern. Note that this only affects ascii characters. + /// We do not normalize/casefold unicode here. If you need this, case-fold your input + /// strings and patterns first. + const IGNORE_CASE = 0x01; + + /// This pattern is used for paths, meaning that `*` and `?` do not match slashes. Only + /// explicit slashes and `**` can match slashes. + const PATH_NAME = 0x02; + } +} + +/// Error cases which may happen while parsing a pattern. +#[derive(Clone, Debug)] +pub enum ParseError { + NulByteError, + TrailingBackslash, + UnclosedCharacterClass(usize), + MalformedNamedCharacterClass(usize), +} + +impl std::error::Error for ParseError {} + +impl fmt::Display for ParseError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + ParseError::NulByteError => write!(f, "null-byte in pattern"), + ParseError::TrailingBackslash => write!(f, "trailing backslash in pattern"), + ParseError::UnclosedCharacterClass(begin) => write!( + f, + "unclosed character class in pattern, starting at byte {}", + begin + ), + ParseError::MalformedNamedCharacterClass(begin) => write!( + f, + "malformed named character class in pattern, starting at byte {}", + begin + ), + } + } +} + +/// See the match function for the algorithm. 
+enum MatchResult { + Match, + NoMatch, + AbortAll, + AbortToStarStar, +} + +/// An `fnmatch`-like pattern working like in `git`, with support for `*` vs `**` distinction for +/// paths. +/// +/// Note that patterns are treated as ASCII strings, so unicode characters have no special effect. +/// You can use it for UTF-8 paths, but there's no direct support for `PatternFlag::IGNORE_CASE` on +/// unicode text. +#[derive(Clone, Debug)] +pub struct Pattern { + /// Original pattern the user provided. + pattern: std::ffi::CString, + + /// Matching components we parsed out of the string. + components: Vec, + + /// Flags used for the pattern. + /// + /// In the future we may want to optimize the `components` based on this, so `IGNORE_CASE` + /// happens at "compile time" already. + flags: PatternFlag, +} + +impl Pattern { + /// Get the original input pattern. + pub fn pattern(&self) -> &std::ffi::CStr { + &self.pattern + } + + /// Create a new pattern. + pub fn new>(pattern: T, flags: PatternFlag) -> Result { + Self::new_do(pattern.as_ref(), flags) + } + + /// Convenience shortcut to create a new pattern with `PatternFlag::PATH_NAME`. + pub fn path>(pattern: T) -> Result { + Self::new_do(pattern.as_ref(), PatternFlag::PATH_NAME) + } + + fn new_do(pattern: &[u8], flags: PatternFlag) -> Result { + let c_pattern = std::ffi::CString::new(pattern).map_err(|_| ParseError::NulByteError)?; + + let mut components = Vec::::new(); + let mut literal = Vec::::new(); + + fn push_literal( + literal: &mut Vec, + components: &mut Vec, + flags: PatternFlag, + ) { + if !literal.is_empty() { + if flags.intersects(PatternFlag::IGNORE_CASE) { + for b in &mut literal[..] 
{ + *b = b.to_ascii_lowercase(); + } + } + components.push(Component::Literal(mem::take(literal))); + } + } + + let mut i = 0; + while i != pattern.len() { + match pattern[i] { + 0 => return Err(ParseError::NulByteError), + b'\\' => { + i += 1; + let mut ch = *pattern.get(i).ok_or(ParseError::TrailingBackslash)?; + if flags.intersects(PatternFlag::IGNORE_CASE) { + ch = ch.to_ascii_lowercase() + } + literal.push(ch); + } + b'?' => { + push_literal(&mut literal, &mut components, flags); + components.push(Component::QuestionMark); + } + b'*' => { + push_literal(&mut literal, &mut components, flags); + if pattern.get(i + 1).copied() == Some(b'*') { + let beg = i; + i += 1; + // swallow following stars as well: + while pattern.get(i + 1).copied() == Some(b'*') { + i += 1; + } + + // git doesn't allow `**` attached to anything other than slashes to match + // subdirectories, so only `**`, `.../**`, `.../**/...` and `**/...` are + // valid. + if (beg == 0 || pattern[beg - 1] == b'/') + && ((i + 1) == pattern.len() || pattern[i + 1] == b'/') + { + components.push(Component::StarStar) + } else { + components.push(Component::Star) + } + } else { + components.push(Component::Star) + } + } + b'[' => { + push_literal(&mut literal, &mut components, flags); + let (component, new_i) = Self::parse_char_class(pattern, i, flags)?; + i = new_i; + components.push(component); + } + ch => literal.push(if flags.intersects(PatternFlag::IGNORE_CASE) { + ch.to_ascii_lowercase() + } else { + ch + }), + } + + i += 1; + } + + push_literal(&mut literal, &mut components, flags); + Ok(Self { + pattern: c_pattern, + components, + flags, + }) + } + + fn parse_char_class( + pattern: &[u8], + begin_i: usize, + flags: PatternFlag, + ) -> Result<(Component, usize), ParseError> { + let mut i = begin_i + 1; + + let negated = if pattern.get(i).copied() == Some(b'^') { + i += 1; + true + } else { + false + }; + + if i == pattern.len() { + return Err(ParseError::UnclosedCharacterClass(begin_i)); + } + + let 
mut class = CharacterClass { + negated, + ..Default::default() + }; + let mut prev = None; + while i != pattern.len() { + let mut new_prev = None; + match pattern[i] { + 0 => return Err(ParseError::NulByteError), + b'[' if pattern[(i + 1)..].starts_with(b":alnum:]") => { + i += 8; + class.named.push(|b| b.is_ascii_alphanumeric()); + } + b'[' if pattern[(i + 1)..].starts_with(b":alpha:]") => { + i += 8; + class.named.push(|b| b.is_ascii_alphabetic()); + } + b'[' if pattern[(i + 1)..].starts_with(b":blank:]") => { + i += 8; + class.named.push(|b| b == b' ' || b == b'\t'); + } + b'[' if pattern[(i + 1)..].starts_with(b":cntrl:]") => { + i += 8; + class.named.push(|b| b.is_ascii_control()); + } + b'[' if pattern[(i + 1)..].starts_with(b":digit:]") => { + i += 8; + class.named.push(|b| b.is_ascii_digit()); + } + b'[' if pattern[(i + 1)..].starts_with(b":graph:]") => { + i += 8; + class.named.push(|b| b.is_ascii_graphic()); + } + b'[' if pattern[(i + 1)..].starts_with(b":lower:]") => { + i += 8; + class.named.push(|b| b.is_ascii_lowercase()); + } + b'[' if pattern[(i + 1)..].starts_with(b":print:]") => { + i += 8; + class.named.push(|b| b >= 0x20 && b <= 0x7f); + } + b'[' if pattern[(i + 1)..].starts_with(b":punct:]") => { + i += 8; + class.named.push(|b| b.is_ascii_punctuation()); + } + b'[' if pattern[(i + 1)..].starts_with(b":space:]") => { + i += 8; + class.named.push(|b| b.is_ascii_whitespace()); + } + b'[' if pattern[(i + 1)..].starts_with(b":upper:]") => { + i += 8; + if flags.intersects(PatternFlag::IGNORE_CASE) { + class.named.push(|b| b.is_ascii_lowercase()); + } else { + class.named.push(|b| b.is_ascii_uppercase()); + } + } + b'[' if pattern[(i + 1)..].starts_with(b":xdigit:]") => { + i += 9; + class.named.push(|b| b.is_ascii_hexdigit()); + } + b'[' if pattern.get(i + 1).copied() == Some(b':') => { + return Err(ParseError::MalformedNamedCharacterClass(begin_i)); + } + b']' => break, + b'\\' => { + i += 1; + let ch = 
*pattern.get(i).ok_or(ParseError::TrailingBackslash)?; + class.listed.push(ch); + new_prev = Some(ch); + } + b'-' => match prev { + None => { + new_prev = Some(b'-'); + class.listed.push(b'-'); + } + Some(beg) => { + // The previous character was also pushed to `class.listed`, so remove it: + class.listed.pop(); + + i += 1; + let mut end = *pattern + .get(i) + .ok_or(ParseError::UnclosedCharacterClass(begin_i))?; + if end == b'\\' { + i += 1; + end = *pattern + .get(i) + .ok_or(ParseError::UnclosedCharacterClass(begin_i))?; + } + + if flags.intersects(PatternFlag::IGNORE_CASE) { + end = end.to_ascii_lowercase(); + } + + if beg <= end { + class.ranges.push(beg..=end); + } else { + class.ranges.push(end..=beg); + } + } + }, + mut ch => { + if flags.intersects(PatternFlag::IGNORE_CASE) { + ch = ch.to_ascii_lowercase(); + } + new_prev = Some(ch); + class.listed.push(ch); + } + } + prev = new_prev; + i += 1; + } + + Ok((Component::Class(class), i)) + } + + /// Check whether this pattern matches a text. + pub fn matches>(&self, text: T) -> bool { + match self.do_matches(&self.components, 0, text.as_ref()) { + MatchResult::Match => true, + _ => false, + } + } + + // The algorithm is ported from git's wildmatch.c. + fn do_matches(&self, components: &[Component], mut ci: usize, mut text: &[u8]) -> MatchResult { + if self.flags.intersects(PatternFlag::PATH_NAME) { + // If we match a path then we want the pattern `"foo"` to match the path `"/foo"`. + } + + while ci != components.len() { + //eprintln!("Matching: {:?} at text: {:?}", components[ci], unsafe { + // std::str::from_utf8_unchecked(text) + //},); + match &components[ci] { + Component::Literal(literal) => { + if text.is_empty() { + // The '*' implementation is NON-greedy, so if the text is empty, we + // already tried all shorter possible matches, so anything other than a '*' + // match can `AbortAll` if the text is empty. 
+ return MatchResult::AbortAll; + } + + if !starts_with(text, &literal, self.flags) { + return MatchResult::NoMatch; + } + + text = &text[literal.len()..]; + } + Component::QuestionMark => { + if text.is_empty() { + // See Literal case + return MatchResult::AbortAll; + } + + if text[0] == b'/' && self.flags.intersects(PatternFlag::PATH_NAME) { + return MatchResult::NoMatch; + } + + text = &text[1..]; + } + Component::Class(class) => { + if text.is_empty() { + // See Literal case + return MatchResult::AbortAll; + } + + let mut ch = text[0]; + if self.flags.intersects(PatternFlag::IGNORE_CASE) { + ch = ch.to_ascii_lowercase(); + } + if !class.matches(ch) { + return MatchResult::NoMatch; + } + + text = &text[1..]; + } + Component::Star if self.flags.intersects(PatternFlag::PATH_NAME) => { + // FIXME: Optimization: Instead of .contains, fast-skip to its index, like git + // does. + if (ci + 1) == components.len() && !text.contains(&b'/') { + return MatchResult::Match; + } + + loop { + if text.is_empty() { + // We still abort all here, but git has some optimizations we could + // do instead before reaching this. + return MatchResult::AbortAll; + } + + // FIXME: Optimization: Add the "try to advance faster" optimization from + // git here. + + match self.do_matches(components, ci + 1, text) { + MatchResult::NoMatch => { + if text[0] == b'/' { + return MatchResult::AbortToStarStar; + } + } + other => return other, + } + + text = &text[1..]; + } + } + Component::Star | Component::StarStar => { + if (ci + 1) == components.len() { + return MatchResult::Match; + } + + if let Component::StarStar = components[ci] { + if ci > 0 + && components[ci - 1].ends_with_slash() + && ((ci + 1) == components.len() + || components[ci + 1].starts_with_slash()) + { + // Assuming we matched `foo/` and are at `/` `**` `/`, see if we an let + // it match nothing, so that `foo/` `**` `/bar` can match `foo/bar`. 
+ // + // Under the condition that the previous component ended with a slash + // (`components[ci - 1].ends_with_slash()`) we can safely move back by + // a byte in `text`. + let text = unsafe { + std::slice::from_raw_parts(text.as_ptr().offset(-1), text.len() + 1) + }; + #[allow(clippy::single_match)] + match self.do_matches(components, ci + 1, text) { + MatchResult::Match => return MatchResult::Match, + _ => (), // or just continue regularly + } + } + } + + loop { + if text.is_empty() { + // See Literal case + return MatchResult::AbortAll; + } + + match self.do_matches(components, ci + 1, text) { + MatchResult::NoMatch => (), + MatchResult::AbortToStarStar => (), // continue from here + other => return other, + } + + text = &text[1..]; + } + } + } + ci += 1; + } + + if text.is_empty() { + MatchResult::Match + } else { + MatchResult::NoMatch + } + } +} + +fn starts_with(text: &[u8], with: &[u8], flags: PatternFlag) -> bool { + if flags.intersects(PatternFlag::IGNORE_CASE) { + starts_with_caseless(text, with) + } else { + text.starts_with(with) + } +} + +fn starts_with_caseless(text: &[u8], with: &[u8]) -> bool { + if text.len() < with.len() { + return false; + } + + for i in 0..with.len() { + if text[i].to_ascii_lowercase() != with[i].to_ascii_lowercase() { + return false; + } + } + + true +} + +#[test] +fn test() { + let pattern = Pattern::new("/hey/*/you", PatternFlag::PATH_NAME).unwrap(); + assert!(pattern.matches("/hey/asdf/you")); + assert!(!pattern.matches("/hey/asdf/more/you")); + assert!(!pattern.matches("/heyasdf/you")); + assert!(!pattern.matches("/heyasdfyou")); + assert!(!pattern.matches("/hey/asdfyou")); + assert!(!pattern.matches("/hey/you")); + assert!(pattern.matches("/hey//you")); + + let pattern = Pattern::new("/hey/*/you", PatternFlag::empty()).unwrap(); + assert!(pattern.matches("/hey/asdf/you")); + assert!(pattern.matches("/hey/asdf/more/you")); // different to PATH_NAME + assert!(!pattern.matches("/heyasdf/you")); + 
assert!(!pattern.matches("/heyasdfyou")); + assert!(!pattern.matches("/hey/asdfyou")); + assert!(!pattern.matches("/hey/you")); + assert!(pattern.matches("/hey//you")); + + let pattern = Pattern::new("/hey/**/you", PatternFlag::PATH_NAME).unwrap(); + assert!(pattern.matches("/hey/asdf/you")); + assert!(pattern.matches("/hey/asdf/more/you")); + assert!(!pattern.matches("/heyasdf/you")); + assert!(!pattern.matches("/heyasdfyou")); + assert!(!pattern.matches("/hey/asdfyou")); + assert!(pattern.matches("/hey/you")); + assert!(pattern.matches("/hey//you")); + + let pattern = Pattern::new("/he[yx]/**/you", PatternFlag::PATH_NAME).unwrap(); + assert!(pattern.matches("/hey/asdf/you")); + assert!(pattern.matches("/hey/asdf/more/you")); + assert!(!pattern.matches("/heyasdf/you")); + assert!(!pattern.matches("/heyasdfyou")); + assert!(!pattern.matches("/hey/asdfyou")); + assert!(pattern.matches("/hey/you")); + assert!(pattern.matches("/hey//you")); + + assert!(pattern.matches("/hex/asdf/you")); + assert!(pattern.matches("/hex/asdf/more/you")); + assert!(!pattern.matches("/hexasdf/you")); + assert!(!pattern.matches("/hexasdfyou")); + assert!(!pattern.matches("/hex/asdfyou")); + assert!(pattern.matches("/hex/you")); + assert!(pattern.matches("/hex//you")); + + assert!(!pattern.matches("/hez/asdf/you")); + assert!(!pattern.matches("/hez/asdf/more/you")); + assert!(!pattern.matches("/hezasdf/you")); + assert!(!pattern.matches("/hezasdfyou")); + assert!(!pattern.matches("/hez/asdfyou")); + assert!(!pattern.matches("/hez/you")); + assert!(!pattern.matches("/hez//you")); + + let pattern = Pattern::new("/he[^yx]/**/you", PatternFlag::PATH_NAME).unwrap(); + assert!(!pattern.matches("/hey/asdf/you")); + assert!(!pattern.matches("/hey/asdf/more/you")); + assert!(!pattern.matches("/heyasdf/you")); + assert!(!pattern.matches("/heyasdfyou")); + assert!(!pattern.matches("/hey/asdfyou")); + assert!(!pattern.matches("/hey/you")); + assert!(!pattern.matches("/hey//you")); + + 
assert!(!pattern.matches("/hex/asdf/you")); + assert!(!pattern.matches("/hex/asdf/more/you")); + assert!(!pattern.matches("/hexasdf/you")); + assert!(!pattern.matches("/hexasdfyou")); + assert!(!pattern.matches("/hex/asdfyou")); + assert!(!pattern.matches("/hex/you")); + assert!(!pattern.matches("/hex//you")); + + assert!(pattern.matches("/hez/asdf/you")); + assert!(pattern.matches("/hez/asdf/more/you")); + assert!(!pattern.matches("/hezasdf/you")); + assert!(!pattern.matches("/hezasdfyou")); + assert!(!pattern.matches("/hez/asdfyou")); + assert!(pattern.matches("/hez/you")); + assert!(pattern.matches("/hez//you")); + + let wrong = b"/hez/"; + for i in 0..wrong.len() { + assert!(!pattern.matches(&wrong[..i])); + } + + let pattern = Pattern::new("/tes[a-t]", PatternFlag::PATH_NAME).unwrap(); + assert!(!pattern.matches("/testoolong")); + assert!(!pattern.matches("/tes")); + assert!(!pattern.matches("/t")); + assert!(!pattern.matches("/")); + assert!(!pattern.matches("")); + assert!(pattern.matches("/tesa")); + assert!(pattern.matches("/test")); + assert!(!pattern.matches("/tesu")); + + let pattern_path = Pattern::new("/tes[a-t]/a?a", PatternFlag::PATH_NAME).unwrap(); + let pattern_nopath = Pattern::new("/tes[a-t]/a?a", PatternFlag::empty()).unwrap(); + assert!(!pattern_path.matches("/tesu")); + assert!(!pattern_nopath.matches("/tesu")); + assert!(!pattern_path.matches("/tesu/aaa")); + assert!(!pattern_nopath.matches("/tesu/aaa")); + assert!(!pattern_path.matches("/tesu/xax")); + assert!(!pattern_nopath.matches("/tesu/xax")); + assert!(!pattern_path.matches("/test/xax")); + assert!(!pattern_nopath.matches("/test/xax")); + assert!(!pattern_path.matches("/test/a")); + assert!(!pattern_nopath.matches("/test/a")); + assert!(!pattern_path.matches("/test/ab")); + assert!(!pattern_nopath.matches("/test/ab")); + assert!(pattern_path.matches("/test/aba")); + assert!(pattern_nopath.matches("/test/aba")); + assert!(pattern_path.matches("/test/aaa")); + 
assert!(pattern_nopath.matches("/test/aaa")); + assert!(pattern_path.matches("/test/aba")); + assert!(pattern_nopath.matches("/test/aba")); + // the difference is here: + assert!(!pattern_path.matches("/test/a/a")); + assert!(pattern_nopath.matches("/test/a/a")); + + let pattern = Pattern::new("a*b*c", PatternFlag::PATH_NAME).unwrap(); + assert!(pattern.matches("axxbxxc")); + assert!(!pattern.matches("axxbxxcxx")); + assert!(pattern.matches("axxbxxbxxc")); + assert!(!pattern.matches("axxbxxbxxcxx")); + assert!(pattern.matches("axxbxxbxxcxxc")); + assert!(!pattern.matches("axxbxxbxxcxxcxx")); + + let pattern = Pattern::new("a*b*c*", PatternFlag::PATH_NAME).unwrap(); + assert!(pattern.matches("axxbxxc")); + assert!(pattern.matches("axxbxxcxx")); + assert!(pattern.matches("axxbxxbxxc")); + assert!(pattern.matches("axxbxxbxxcxx")); + assert!(pattern.matches("axxbxxbxxcxxc")); + assert!(pattern.matches("axxbxxbxxcxxcxx")); + + let pattern = Pattern::new( + "aB[c-fX-Z][[:upper:]][[:lower:]][[:digit:]k]", + PatternFlag::PATH_NAME | PatternFlag::IGNORE_CASE, + ) + .unwrap(); + eprintln!("{:#?}", pattern); + assert!(pattern.matches("aBcUl3")); + assert!(pattern.matches("AbCuL9")); + assert!(!pattern.matches("aBgUl3")); + assert!(!pattern.matches("aBgUl3")); + assert!(!pattern.matches("aBcUlx")); + assert!(pattern.matches("abculk")); + assert!(pattern.matches("abxulk")); + assert!(!pattern.matches("abxul")); + + let pattern = Pattern::new("a/b**/c", PatternFlag::PATH_NAME).unwrap(); + assert!(pattern.matches("a/bxx/c")); + assert!(!pattern.matches("a/bxx/yy/c")); +}