mirror of
https://git.proxmox.com/git/rustc
synced 2025-08-18 14:24:05 +00:00
307 lines
10 KiB
Rust
307 lines
10 KiB
Rust
// Copyright 2014-2017 The html5ever Project Developers. See the
|
|
// COPYRIGHT file at the top-level directory of this distribution.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
|
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
|
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
|
// option. This file may not be copied, modified, or distributed
|
|
// except according to those terms.
|
|
|
|
//! The `BufferQueue` struct and helper types.
|
|
//!
|
|
//! This type is designed for the efficient parsing of string data, especially where many
|
|
//! significant characters are from the ascii range 0-63. This includes, for example, important
|
|
//! characters in xml/html parsing.
|
|
//!
|
|
//! Good and predictable performance is achieved by avoiding allocation where possible (a.k.a. zero
|
|
//! copy).
|
|
//!
|
|
//! [`BufferQueue`]: struct.BufferQueue.html
|
|
|
|
use std::collections::VecDeque;
|
|
|
|
use tendril::StrTendril;
|
|
|
|
pub use self::SetResult::{FromSet, NotFromSet};
|
|
use crate::util::smallcharset::SmallCharSet;
|
|
|
|
/// Result from [`pop_except_from`] containing either a character from a [`SmallCharSet`], or a
|
|
/// string buffer of characters not from the set.
|
|
///
|
|
/// [`pop_except_from`]: struct.BufferQueue.html#method.pop_except_from
|
|
/// [`SmallCharSet`]: ../struct.SmallCharSet.html
|
|
#[derive(PartialEq, Eq, Debug)]
|
|
pub enum SetResult {
|
|
/// A character from the `SmallCharSet`.
|
|
FromSet(char),
|
|
/// A string buffer containing no characters from the `SmallCharSet`.
|
|
NotFromSet(StrTendril),
|
|
}
|
|
|
|
/// A queue of owned string buffers, which supports incrementally consuming characters.
|
|
///
|
|
/// Internally it uses [`VecDeque`] and has the same complexity properties.
|
|
///
|
|
/// [`VecDeque`]: https://doc.rust-lang.org/std/collections/struct.VecDeque.html
|
|
#[derive(Debug)]
|
|
pub struct BufferQueue {
|
|
/// Buffers to process.
|
|
buffers: VecDeque<StrTendril>,
|
|
}
|
|
|
|
impl Default for BufferQueue {
|
|
/// Create an empty BufferQueue.
|
|
#[inline]
|
|
fn default() -> Self {
|
|
Self {
|
|
buffers: VecDeque::with_capacity(16),
|
|
}
|
|
}
|
|
}
|
|
|
|
impl BufferQueue {
|
|
/// Returns whether the queue is empty.
|
|
#[inline]
|
|
pub fn is_empty(&self) -> bool {
|
|
self.buffers.is_empty()
|
|
}
|
|
|
|
/// Get the buffer at the beginning of the queue.
|
|
#[inline]
|
|
pub fn pop_front(&mut self) -> Option<StrTendril> {
|
|
self.buffers.pop_front()
|
|
}
|
|
|
|
/// Add a buffer to the beginning of the queue.
|
|
///
|
|
/// If the buffer is empty, it will be skipped.
|
|
pub fn push_front(&mut self, buf: StrTendril) {
|
|
if buf.len32() == 0 {
|
|
return;
|
|
}
|
|
self.buffers.push_front(buf);
|
|
}
|
|
|
|
/// Add a buffer to the end of the queue.
|
|
///
|
|
/// If the buffer is empty, it will be skipped.
|
|
pub fn push_back(&mut self, buf: StrTendril) {
|
|
if buf.len32() == 0 {
|
|
return;
|
|
}
|
|
self.buffers.push_back(buf);
|
|
}
|
|
|
|
/// Look at the next available character without removing it, if the queue is not empty.
|
|
pub fn peek(&self) -> Option<char> {
|
|
debug_assert!(
|
|
!self.buffers.iter().any(|el| el.len32() == 0),
|
|
"invariant \"all buffers in the queue are non-empty\" failed"
|
|
);
|
|
self.buffers.front().map(|b| b.chars().next().unwrap())
|
|
}
|
|
|
|
/// Pops and returns either a single character from the given set, or
|
|
/// a buffer of characters none of which are in the set.
|
|
///
|
|
/// # Examples
|
|
///
|
|
/// ```
|
|
/// # #[macro_use] extern crate markup5ever;
|
|
/// # #[macro_use] extern crate tendril;
|
|
/// # fn main() {
|
|
/// use markup5ever::buffer_queue::{BufferQueue, SetResult};
|
|
///
|
|
/// let mut queue = BufferQueue::default();
|
|
/// queue.push_back(format_tendril!(r#"<some_tag attr="text">SomeText</some_tag>"#));
|
|
/// let set = small_char_set!(b'<' b'>' b' ' b'=' b'"' b'/');
|
|
/// let tag = format_tendril!("some_tag");
|
|
/// let attr = format_tendril!("attr");
|
|
/// let attr_val = format_tendril!("text");
|
|
/// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet('<')));
|
|
/// assert_eq!(queue.pop_except_from(set), Some(SetResult::NotFromSet(tag)));
|
|
/// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet(' ')));
|
|
/// assert_eq!(queue.pop_except_from(set), Some(SetResult::NotFromSet(attr)));
|
|
/// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet('=')));
|
|
/// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet('"')));
|
|
/// assert_eq!(queue.pop_except_from(set), Some(SetResult::NotFromSet(attr_val)));
|
|
/// // ...
|
|
/// # }
|
|
/// ```
|
|
pub fn pop_except_from(&mut self, set: SmallCharSet) -> Option<SetResult> {
|
|
let (result, now_empty) = match self.buffers.front_mut() {
|
|
None => (None, false),
|
|
Some(buf) => {
|
|
let n = set.nonmember_prefix_len(buf);
|
|
if n > 0 {
|
|
let out;
|
|
unsafe {
|
|
out = buf.unsafe_subtendril(0, n);
|
|
buf.unsafe_pop_front(n);
|
|
}
|
|
(Some(NotFromSet(out)), buf.is_empty())
|
|
} else {
|
|
let c = buf.pop_front_char().expect("empty buffer in queue");
|
|
(Some(FromSet(c)), buf.is_empty())
|
|
}
|
|
},
|
|
};
|
|
|
|
// Unborrow self for this part.
|
|
if now_empty {
|
|
self.buffers.pop_front();
|
|
}
|
|
|
|
result
|
|
}
|
|
|
|
/// Consume bytes matching the pattern, using a custom comparison function `eq`.
|
|
///
|
|
/// Returns `Some(true)` if there is a match, `Some(false)` if there is no match, or `None` if
|
|
/// it wasn't possible to know (more data is needed).
|
|
///
|
|
/// The custom comparison function is used elsewhere to compare ascii-case-insensitively.
|
|
///
|
|
/// # Examples
|
|
///
|
|
/// ```
|
|
/// # extern crate markup5ever;
|
|
/// # #[macro_use] extern crate tendril;
|
|
/// # fn main() {
|
|
/// use markup5ever::buffer_queue::BufferQueue;
|
|
///
|
|
/// let mut queue = BufferQueue::default();
|
|
/// queue.push_back(format_tendril!("testtext"));
|
|
/// let test_str = "test";
|
|
/// assert_eq!(queue.eat("test", |&a, &b| a == b), Some(true));
|
|
/// assert_eq!(queue.eat("text", |&a, &b| a == b), Some(true));
|
|
/// assert!(queue.is_empty());
|
|
/// # }
|
|
/// ```
|
|
pub fn eat<F: Fn(&u8, &u8) -> bool>(&mut self, pat: &str, eq: F) -> Option<bool> {
|
|
let mut buffers_exhausted = 0;
|
|
let mut consumed_from_last = 0;
|
|
|
|
self.buffers.front()?;
|
|
|
|
for pattern_byte in pat.bytes() {
|
|
if buffers_exhausted >= self.buffers.len() {
|
|
return None;
|
|
}
|
|
let buf = &self.buffers[buffers_exhausted];
|
|
|
|
if !eq(&buf.as_bytes()[consumed_from_last], &pattern_byte) {
|
|
return Some(false);
|
|
}
|
|
|
|
consumed_from_last += 1;
|
|
if consumed_from_last >= buf.len() {
|
|
buffers_exhausted += 1;
|
|
consumed_from_last = 0;
|
|
}
|
|
}
|
|
|
|
// We have a match. Commit changes to the BufferQueue.
|
|
for _ in 0..buffers_exhausted {
|
|
self.buffers.pop_front();
|
|
}
|
|
|
|
match self.buffers.front_mut() {
|
|
None => assert_eq!(consumed_from_last, 0),
|
|
Some(ref mut buf) => buf.pop_front(consumed_from_last as u32),
|
|
}
|
|
|
|
Some(true)
|
|
}
|
|
}
|
|
|
|
impl Iterator for BufferQueue {
|
|
type Item = char;
|
|
|
|
/// Get the next character if one is available, removing it from the queue.
|
|
///
|
|
/// This function manages the buffers, removing them as they become empty.
|
|
fn next(&mut self) -> Option<char> {
|
|
let (result, now_empty) = match self.buffers.front_mut() {
|
|
None => (None, false),
|
|
Some(buf) => {
|
|
let c = buf.pop_front_char().expect("empty buffer in queue");
|
|
(Some(c), buf.is_empty())
|
|
},
|
|
};
|
|
|
|
if now_empty {
|
|
self.buffers.pop_front();
|
|
}
|
|
|
|
result
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
|
|
#[allow(non_snake_case)]
|
|
mod test {
|
|
use tendril::SliceExt;
|
|
|
|
use super::BufferQueue;
|
|
use super::SetResult::{FromSet, NotFromSet};
|
|
|
|
#[test]
|
|
fn smoke_test() {
|
|
let mut bq = BufferQueue::default();
|
|
assert_eq!(bq.peek(), None);
|
|
assert_eq!(bq.next(), None);
|
|
|
|
bq.push_back("abc".to_tendril());
|
|
assert_eq!(bq.peek(), Some('a'));
|
|
assert_eq!(bq.next(), Some('a'));
|
|
assert_eq!(bq.peek(), Some('b'));
|
|
assert_eq!(bq.peek(), Some('b'));
|
|
assert_eq!(bq.next(), Some('b'));
|
|
assert_eq!(bq.peek(), Some('c'));
|
|
assert_eq!(bq.next(), Some('c'));
|
|
assert_eq!(bq.peek(), None);
|
|
assert_eq!(bq.next(), None);
|
|
}
|
|
|
|
#[test]
|
|
fn can_unconsume() {
|
|
let mut bq = BufferQueue::default();
|
|
bq.push_back("abc".to_tendril());
|
|
assert_eq!(bq.next(), Some('a'));
|
|
|
|
bq.push_front("xy".to_tendril());
|
|
assert_eq!(bq.next(), Some('x'));
|
|
assert_eq!(bq.next(), Some('y'));
|
|
assert_eq!(bq.next(), Some('b'));
|
|
assert_eq!(bq.next(), Some('c'));
|
|
assert_eq!(bq.next(), None);
|
|
}
|
|
|
|
#[test]
|
|
fn can_pop_except_set() {
|
|
let mut bq = BufferQueue::default();
|
|
bq.push_back("abc&def".to_tendril());
|
|
let mut pop = || bq.pop_except_from(small_char_set!('&'));
|
|
assert_eq!(pop(), Some(NotFromSet("abc".to_tendril())));
|
|
assert_eq!(pop(), Some(FromSet('&')));
|
|
assert_eq!(pop(), Some(NotFromSet("def".to_tendril())));
|
|
assert_eq!(pop(), None);
|
|
}
|
|
|
|
#[test]
|
|
fn can_eat() {
|
|
// This is not very comprehensive. We rely on the tokenizer
|
|
// integration tests for more thorough testing with many
|
|
// different input buffer splits.
|
|
let mut bq = BufferQueue::default();
|
|
bq.push_back("a".to_tendril());
|
|
bq.push_back("bc".to_tendril());
|
|
assert_eq!(bq.eat("abcd", u8::eq_ignore_ascii_case), None);
|
|
assert_eq!(bq.eat("ax", u8::eq_ignore_ascii_case), Some(false));
|
|
assert_eq!(bq.eat("ab", u8::eq_ignore_ascii_case), Some(true));
|
|
assert_eq!(bq.next(), Some('c'));
|
|
assert_eq!(bq.next(), None);
|
|
}
|
|
}
|