rustc/vendor/syntect-5.2.0/examples/syntest.rs

//! An example of using syntect for testing syntax definitions.
//! Basically exactly the same as what Sublime Text can do,
//! but without needing ST installed
// To run tests only for a particular package, while showing the operations, you could use:
// cargo run --example syntest -- --debug testdata/Packages/Makefile/
// To specify that the syntax definitions should be parsed instead of loaded from the dump file,
// tell it where to parse them from. The following will execute only one syntax test after
// parsing the sublime-syntax files in the JavaScript folder:
// cargo run --example syntest testdata/Packages/JavaScript/syntax_test_json.json testdata/Packages/JavaScript/

use syntect::easy::ScopeRegionIterator;
use syntect::highlighting::ScopeSelectors;
use syntect::parsing::{ParseState, Scope, ScopeStack, SyntaxSet, SyntaxSetBuilder};

use std::cmp::{max, min};
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;
use std::str::FromStr;
use std::time::Instant;

use getopts::Options;
use once_cell::sync::Lazy;
use regex::Regex;
use walkdir::{DirEntry, WalkDir};

/// Reasons a syntax test file cannot be run at all.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SyntaxTestHeaderError {
    /// The file is empty, or its first line does not match
    /// `SYNTAX_TEST_HEADER_PATTERN`.
    MalformedHeader,
    /// The syntax definition file named in the header could not be found in the
    /// loaded syntax set.
    SyntaxDefinitionNotFound,
}

/// Outcome of running all assertions in a single syntax test file.
///
/// Counts are in asserted columns rather than assertion lines: an assertion
/// spanning three columns contributes three to the totals.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SyntaxTestFileResult {
    /// At least one assertion failed: `(failed columns, total asserted columns)`.
    FailedAssertions(usize, usize),
    /// Every assertion passed; holds the total number of asserted columns.
    Success(usize),
}

/// Matches the header line of a syntax test file, which has the shape
/// `<start token> SYNTAX TEST "<syntax file>" <optional end token>`.
///
/// Named groups:
/// - `testtoken_start`: the first run of non-whitespace on the line, including
///   any whitespace that precedes it
/// - `syntax_file`: the text between the double quotes
/// - `testtoken_end`: optional trailing run of non-whitespace
pub static SYNTAX_TEST_HEADER_PATTERN: Lazy<Regex> = Lazy::new(|| {
    Regex::new(
        r#"(?xm)
        ^(?P<testtoken_start>\s*\S+)
        \s+SYNTAX\sTEST\s+
        "(?P<syntax_file>[^"]+)"
        \s*(?P<testtoken_end>\S+)?$
    "#,
    )
    .unwrap()
});
/// Matches the assertion part of a line, i.e. what follows the start token.
///
/// Capture groups, by index:
/// 1. `begin_of_token`: the literal `<-`
/// 2. `range`: one or more `^` characters
/// 3. (unnamed) the rest of the line after the marker
///
/// Exactly one of groups 1 and 2 participates in a match. The pattern is not
/// anchored at the start, so the marker may be found anywhere in the searched text.
pub static SYNTAX_TEST_ASSERTION_PATTERN: Lazy<Regex> = Lazy::new(|| {
    Regex::new(
        r#"(?xm)
    \s*(?:
        (?P<begin_of_token><-)|(?P<range>\^+)
    )(.*)$"#,
    )
    .unwrap()
});

/// Output-related command line switches.
#[derive(Clone, Copy)]
struct OutputOptions {
    /// Print the elapsed time in milliseconds for each tested file.
    time: bool,
    /// Print the scope stack and the parse operations for each parsed line.
    debug: bool,
    /// Suppress per-file and per-assertion output; only print a `FAILED` line
    /// for files with failing assertions.
    summary: bool,
}

/// An assertion found on a line of a syntax test file.
#[derive(Debug)]
struct AssertionRange<'a> {
    /// First column the assertion applies to (inclusive).
    begin_char: usize,
    /// Column at which the assertion stops applying (exclusive).
    end_char: usize,
    /// The scope selector text following the assertion marker, cut off at the
    /// end token when one is present on the line.
    scope_selector_text: &'a str,
    /// `true` when only whitespace surrounds the test tokens on the line.
    is_pure_assertion_line: bool,
}

/// A non-empty piece of a parsed line together with the scopes that apply to it.
#[derive(Debug)]
struct ScopedText {
    /// Snapshot of the scope stack in effect for this piece of text.
    scope: Vec<Scope>,
    /// Column, counted in chars, at which the text starts on its line.
    char_start: usize,
    /// Length of the text, counted in chars.
    text_len: usize,
}

/// Result of checking an assertion's selector against one piece of scoped text.
#[derive(Debug)]
struct RangeTestResult {
    /// First column covered by this result (inclusive).
    column_begin: usize,
    /// Column at which this result stops (exclusive).
    column_end: usize,
    /// Whether the selector matched the scopes of the covered text.
    success: bool,
}

/// Extracts the assertion, if any, from a line of a syntax test file.
///
/// * `testtoken_start` - the start token declared in the test file's header
/// * `testtoken_end` - the optional end token declared in the header
/// * `line` - the line to inspect
///
/// Returns `None` when the start token is not on the line, or when the text
/// following it does not match `SYNTAX_TEST_ASSERTION_PATTERN`.
///
/// The returned columns are counted in chars (not bytes), so that they line up
/// with the char-based columns recorded in `ScopedText`. A `<-` assertion covers
/// the single column at which the start token begins; a `^^^` assertion covers
/// the columns under the carets.
fn get_line_assertion_details<'a>(
    testtoken_start: &str,
    testtoken_end: Option<&str>,
    line: &'a str,
) -> Option<AssertionRange<'a>> {
    // the line can only carry an assertion if it contains the start token from the header
    let index = line.find(testtoken_start)?;
    let (before_token_start, token_and_rest_of_line) = line.split_at(index);
    let after_token_start = &token_and_rest_of_line[testtoken_start.len()..];
    let captures = SYNTAX_TEST_ASSERTION_PATTERN.captures(after_token_start)?;

    // group 3 is the rest of the line after the marker: the scope selector text
    let mut sst = captures.get(3).map_or("", |m| m.as_str());
    let mut only_whitespace_after_token_end = true;

    if let Some(token) = testtoken_end {
        // if there is an end token defined in the test file header
        if let Some(end_token_pos) = sst.find(token) {
            // and there is an end token in the line, the scope selector text ends there
            let (selector_text, token_and_after) = sst.split_at(end_token_pos);
            sst = selector_text;
            // skip over the end token itself before checking what follows it,
            // otherwise the token would always count as trailing non-whitespace
            only_whitespace_after_token_end = token_and_after[token.len()..].trim().is_empty();
        }
    }

    // `find` and the regex report byte offsets; convert them to char columns
    let token_column = before_token_start.chars().count();
    let (begin_char, end_char) = match captures.get(2) {
        // group 2 is the run of `^` characters
        Some(range) => {
            let begin = token_column
                + testtoken_start.chars().count()
                + after_token_start[..range.start()].chars().count();
            (begin, begin + range.as_str().chars().count())
        }
        // otherwise it is a `<-` assertion, which targets the start token's own column
        None => (token_column, token_column + 1),
    };

    Some(AssertionRange {
        begin_char,
        end_char,
        scope_selector_text: sst,
        // if only whitespace surrounds the test tokens on the line, then it is a pure assertion line
        is_pure_assertion_line: before_token_start.trim_start().is_empty()
            && only_whitespace_after_token_end,
    })
}

/// Checks an assertion's scope selector against the scoped text of the line
/// under test.
///
/// One `RangeTestResult` is produced for every piece of scoped text that
/// overlaps the asserted column range, clipped to that range. If the range
/// extends past the end of the line, one extra result covers the overhang and
/// is checked against the scopes of the last piece of text on the line.
///
/// # Panics
///
/// Panics if the selector text cannot be parsed, or if
/// `test_against_line_scopes` is empty.
fn process_assertions(
    assertion: &AssertionRange<'_>,
    test_against_line_scopes: &[ScopedText],
) -> Vec<RangeTestResult> {
    // ScopeSelector currently expects excludes to begin with " -", but syntax tests
    // sometimes write them as ^^^-comment, so prefix the selector text with a space
    let selector_text = format!(" {}", &assertion.scope_selector_text);
    let selector = ScopeSelectors::from_str(&selector_text).unwrap();

    // skip everything that ends before the asserted range begins, then check each
    // piece of text until one starts at or beyond the end of the range
    let mut results: Vec<RangeTestResult> = test_against_line_scopes
        .iter()
        .skip_while(|piece| piece.char_start + piece.text_len <= assertion.begin_char)
        .take_while(|piece| piece.char_start < assertion.end_char)
        .map(|piece| RangeTestResult {
            column_begin: max(piece.char_start, assertion.begin_char),
            column_end: min(piece.char_start + piece.text_len, assertion.end_char),
            success: selector.does_match(piece.scope.as_slice()).is_some(),
        })
        .collect();

    // assertions reaching beyond the newline are not ignored: they are treated
    // as though they assert against the newline
    let last = test_against_line_scopes.last().unwrap();
    let line_end = last.char_start + last.text_len;
    if line_end < assertion.end_char {
        results.push(RangeTestResult {
            column_begin: max(line_end, assertion.begin_char),
            column_end: assertion.end_char,
            success: selector.does_match(last.scope.as_slice()).is_some(),
        });
    }

    results
}

/// Runs the syntax test in the file at `path` against the syntaxes in `ss`.
///
/// The first line of the file must be a header matching
/// `SYNTAX_TEST_HEADER_PATTERN`. It supplies the assertion start token, the
/// optional end token and the path of the syntax definition used to parse the
/// file. Every line (the header included) is then checked for an assertion;
/// assertions are tested against the scopes recorded for the most recent line
/// that had no assertion on it.
///
/// If `parse_test_lines` is `false` then lines that only contain assertions are not parsed
///
/// Progress, failures and the final result are printed to stdout; what is
/// printed depends on `out_opts`.
///
/// # Errors
///
/// * `SyntaxTestHeaderError::MalformedHeader` if the file is empty or the first
///   line does not match the header pattern
/// * `SyntaxTestHeaderError::SyntaxDefinitionNotFound` if `ss` has no syntax for
///   the path named in the header
///
/// # Panics
///
/// Panics if the file cannot be opened or read, or if parsing a line or applying
/// a scope stack operation fails.
fn test_file(
    ss: &SyntaxSet,
    path: &Path,
    parse_test_lines: bool,
    out_opts: OutputOptions,
) -> Result<SyntaxTestFileResult, SyntaxTestHeaderError> {
    use syntect::util::debug_print_ops;
    let f = File::open(path).unwrap();
    let mut reader = BufReader::new(f);
    let mut line = String::new();

    // read the first line from the file - if we have reached EOF already, it's an invalid file
    if reader.read_line(&mut line).unwrap() == 0 {
        return Err(SyntaxTestHeaderError::MalformedHeader);
    }

    // drop carriage returns so that CRLF files are handled like LF files
    line = line.replace('\r', "");

    // parse the syntax test header in the first line of the file
    // (a copy is matched against, because the captures borrow from it for the
    // rest of the function while `line` is reused as the read buffer)
    let header_line = line.clone();
    let search_result = SYNTAX_TEST_HEADER_PATTERN.captures(&header_line);
    let captures = search_result.ok_or(SyntaxTestHeaderError::MalformedHeader)?;

    let testtoken_start = captures.name("testtoken_start").unwrap().as_str();
    let testtoken_end = captures.name("testtoken_end").map(|c| c.as_str());
    let syntax_file = captures.name("syntax_file").unwrap().as_str();

    // find the relevant syntax definition to parse the file with - case is important!
    if !out_opts.summary {
        println!(
            "The test file references syntax definition file: {}",
            syntax_file
        );
    }
    let syntax = ss
        .find_syntax_by_path(syntax_file)
        .ok_or(SyntaxTestHeaderError::SyntaxDefinitionNotFound)?;

    // iterate over the lines of the file, testing them
    let mut state = ParseState::new(syntax);
    let mut stack = ScopeStack::new();

    // 1-based number of the line currently held in `line`
    let mut current_line_number = 1;
    // 1-based number of the line that assertions are currently tested against
    let mut test_against_line_number = 1;
    // scoped text of the line that assertions are currently tested against
    let mut scopes_on_line_being_tested = Vec::new();
    // text of that line, kept for the failure messages
    let mut previous_non_assertion_line = line.to_string();

    // both counters are in asserted columns, not in assertion lines
    let mut assertion_failures: usize = 0;
    let mut total_assertions: usize = 0;

    loop {
        // over lines of file, starting with the header line
        let mut line_only_has_assertion = false;
        let mut line_has_assertion = false;
        if let Some(assertion) = get_line_assertion_details(testtoken_start, testtoken_end, &line) {
            let result = process_assertions(&assertion, &scopes_on_line_being_tested);
            total_assertions += assertion.end_char - assertion.begin_char;
            for failure in result.iter().filter(|r| !r.success) {
                let length = failure.column_end - failure.column_begin;
                // the text of the tested line that the failed range covers
                let text: String = previous_non_assertion_line
                    .chars()
                    .skip(failure.column_begin)
                    .take(length)
                    .collect();
                if !out_opts.summary {
                    println!(
                        "  Assertion selector {:?} \
                        from line {:?} failed against line {:?}, column range {:?}-{:?} \
                        (with text {:?}) \
                        has scope {:?}",
                        assertion.scope_selector_text.trim(),
                        current_line_number,
                        test_against_line_number,
                        failure.column_begin,
                        failure.column_end,
                        text,
                        // report the scopes of the text covering the first failed column,
                        // falling back to the last text on the line for ranges beyond its end
                        scopes_on_line_being_tested
                            .iter()
                            .find(|s| s.char_start + s.text_len > failure.column_begin)
                            .unwrap_or_else(|| scopes_on_line_being_tested.last().unwrap())
                            .scope
                    );
                }
                assertion_failures += failure.column_end - failure.column_begin;
            }
            line_only_has_assertion = assertion.is_pure_assertion_line;
            line_has_assertion = true;
        }
        // pure assertion lines are only parsed when `parse_test_lines` is set
        if !line_only_has_assertion || parse_test_lines {
            if !line_has_assertion {
                // ST seems to ignore lines that have assertions when calculating which line the assertion tests against
                scopes_on_line_being_tested.clear();
                test_against_line_number = current_line_number;
                previous_non_assertion_line = line.to_string();
            }
            if out_opts.debug && !line_only_has_assertion {
                println!(
                    "-- debugging line {} -- scope stack: {:?}",
                    current_line_number, stack
                );
            }
            let ops = state.parse_line(&line, ss).unwrap();
            if out_opts.debug && !line_only_has_assertion {
                if ops.is_empty() && !line.is_empty() {
                    println!("no operations for this line...");
                } else {
                    debug_print_ops(&line, &ops);
                }
            }
            // column, in chars, at which the next piece of text starts
            let mut col: usize = 0;
            for (s, op) in ScopeRegionIterator::new(&ops, &line) {
                // the stack is updated for every parsed line so that it stays in
                // step with the parse state, even when the scopes are not recorded
                stack.apply(op).unwrap();
                if s.is_empty() {
                    // in this case we don't care about blank tokens
                    continue;
                }
                if !line_has_assertion {
                    // if the line has no assertions on it, remember the scopes on the line so we can test against them later
                    let len = s.chars().count();
                    scopes_on_line_being_tested.push(ScopedText {
                        char_start: col,
                        text_len: len,
                        scope: stack.as_slice().to_vec(),
                    });
                    // TODO: warn when there are duplicate adjacent (non-meta?) scopes, as it is almost always undesired
                    col += len;
                }
            }
        }

        // read the next line into the reused buffer, stopping at EOF
        line.clear();
        current_line_number += 1;
        if reader.read_line(&mut line).unwrap() == 0 {
            break;
        }
        line = line.replace('\r', "");
    }
    let res = if assertion_failures > 0 {
        Ok(SyntaxTestFileResult::FailedAssertions(
            assertion_failures,
            total_assertions,
        ))
    } else {
        Ok(SyntaxTestFileResult::Success(total_assertions))
    };

    if out_opts.summary {
        if let Ok(SyntaxTestFileResult::FailedAssertions(failures, _)) = res {
            // Don't print total assertion count so that diffs don't pick up new succeeding tests
            println!("FAILED {}: {}", path.display(), failures);
        }
    } else {
        println!("{:?}", res);
    }

    res
}

/// Parses the command line, loads the syntax definitions and runs every syntax
/// test found under the requested path, exiting with the resulting code.
///
/// Usage: `syntest [-d|--debug] [-t|--time] [-s|--summary] [TESTS_PATH [SYNTAXES_PATH]]`
///
/// * `TESTS_PATH` - file or directory to search for syntax tests; defaults to `.`
/// * `SYNTAXES_PATH` - directory to parse syntax definitions from; when omitted,
///   the default syntax set is used
///
/// # Panics
///
/// Panics if the options cannot be parsed or the syntax definitions in
/// `SYNTAXES_PATH` fail to load.
fn main() {
    let args: Vec<String> = std::env::args().collect();
    let mut opts = Options::new();
    opts.optflag("d", "debug", "Show parsing results for each test line");
    opts.optflag(
        "t",
        "time",
        "Time execution as a more broad-ranging benchmark",
    );
    opts.optflag("s", "summary", "Print only summary of test failures");

    let matches = match opts.parse(&args[1..]) {
        Ok(m) => m,
        Err(f) => panic!("{}", f),
    };

    // Take the paths from the free (non-option) arguments rather than from fixed
    // positions in the raw argument list, so that flags given before the paths
    // (e.g. `--debug testdata/Packages/Makefile/`) are not mistaken for a path.
    let tests_path = matches.free.first().map_or(".", String::as_str);
    let syntaxes_path = matches.free.get(1).map_or("", String::as_str);

    // load the syntaxes from disk if told to
    // (as opposed to from the binary dumps)
    // this helps to ensure that a recompile isn't needed
    // when using this for syntax development
    let ss = if syntaxes_path.is_empty() {
        SyntaxSet::load_defaults_newlines() // note we load the version with newlines
    } else {
        println!("loading syntax definitions from {}", syntaxes_path);
        let mut builder = SyntaxSetBuilder::new();
        builder.add_from_folder(syntaxes_path, true).unwrap(); // note that we load the version with newlines
        builder.build()
    };

    let out_opts = OutputOptions {
        debug: matches.opt_present("debug"),
        time: matches.opt_present("time"),
        summary: matches.opt_present("summary"),
    };

    let exit_code = recursive_walk(&ss, tests_path, out_opts);
    println!("exiting with code {}", exit_code);
    std::process::exit(exit_code);
}

/// Runs every syntax test file found at or below `path` and returns the process
/// exit code.
///
/// The exit code is `0` when all files pass, `1` when at least one file has
/// failing assertions, and `2` when at least one file could not be tested
/// because of a header error. `2` takes precedence over `1`.
///
/// # Panics
///
/// Panics if walking the directory tree yields an error.
fn recursive_walk(ss: &SyntaxSet, path: &str, out_opts: OutputOptions) -> i32 {
    // gather all test files up front and sort them, so that the output order
    // (and therefore diffs of it) is consistent across machines
    let mut test_files: Vec<_> = WalkDir::new(path)
        .into_iter()
        .filter_entry(|e| e.file_type().is_dir() || is_a_syntax_test_file(e))
        .map(|entry| entry.unwrap())
        .filter(|entry| entry.file_type().is_file())
        .map(|entry| entry.path().to_owned())
        .collect();
    test_files.sort();

    // 0 unless a file fails (1) or cannot be tested at all (2)
    let mut exit_code: i32 = 0;
    for test_path in &test_files {
        if !out_opts.summary {
            println!("Testing file {}", test_path.display());
        }

        let started_at = Instant::now();
        let outcome = test_file(ss, test_path, true, out_opts);
        let elapsed = started_at.elapsed();
        if out_opts.time {
            let ms = elapsed.as_secs() * 1_000 + u64::from(elapsed.subsec_millis());
            println!("{} ms for file {}", ms, test_path.display());
        }

        exit_code = match (exit_code, &outcome) {
            // once an error has been seen, the exit code stays at 2
            (2, _) | (_, Err(_)) => 2,
            // failed assertions yield 1, even if earlier files succeeded
            (_, Ok(SyntaxTestFileResult::FailedAssertions(_, _))) => 1,
            // a passing file leaves the exit code as it was
            (code, Ok(SyntaxTestFileResult::Success(_))) => code,
        };
    }

    exit_code
}

/// Returns `true` when the entry's file name starts with `syntax_test_`.
///
/// Entries whose file name is not valid Unicode are never treated as syntax
/// test files.
fn is_a_syntax_test_file(entry: &DirEntry) -> bool {
    match entry.file_name().to_str() {
        Some(name) => name.starts_with("syntax_test_"),
        None => false,
    }
}