From b05b9672e64b3430861e276c64a6c2c3eeabbda7 Mon Sep 17 00:00:00 2001
From: Carson McManus
Date: Sun, 2 Nov 2025 15:01:56 -0500
Subject: [PATCH] perf(parse/tailwind): use compact trie for lexing base names
 instead of linear search

---
 .../src/lexer/base_name_store.rs              | 206 ++++++++++++++++++
 crates/biome_tailwind_parser/src/lexer/mod.rs |  35 +--
 2 files changed, 212 insertions(+), 29 deletions(-)
 create mode 100644 crates/biome_tailwind_parser/src/lexer/base_name_store.rs

diff --git a/crates/biome_tailwind_parser/src/lexer/base_name_store.rs b/crates/biome_tailwind_parser/src/lexer/base_name_store.rs
new file mode 100644
index 000000000000..d0950733a56c
--- /dev/null
+++ b/crates/biome_tailwind_parser/src/lexer/base_name_store.rs
@@ -0,0 +1,206 @@
+use std::sync::LazyLock;
+
+use biome_tailwind_syntax::metadata::BASENAMES_WITH_DASHES;
+
+/// A global store of dashed basenames for efficient reuse across lexing.
+///
+/// Ideally, this will be built by parsing a project's Tailwind config, since it can add custom utilities and variants.
+pub static BASENAME_STORE: LazyLock<BaseNameStore> =
+    LazyLock::new(|| BaseNameStore::new(BASENAMES_WITH_DASHES));
+
+/// A compact trie storing all dashed basenames, used to efficiently match the longest valid basename
+/// at the beginning of a byte slice.
+///
+/// Build it once from `BASENAMES_WITH_DASHES` and reuse it across lexing.
+pub(crate) struct BaseNameStore {
+    nodes: Vec<Node>,
+}
+
+#[derive(Default)]
+struct Node {
+    terminal: bool,
+    // Children are stored as (byte, child_index) pairs.
+    children: Vec<(u8, usize)>,
+}
+
+impl BaseNameStore {
+    /// Creates a store from a list of ASCII basenames.
+    pub(crate) fn new(list: &[&str]) -> Self {
+        let mut store = Self {
+            nodes: vec![Node::default()], // root
+        };
+
+        for name in list {
+            store.insert(name.as_bytes());
+        }
+
+        // Optional: sort children to enable binary search (currently linear search is fine).
+        for i in 0..store.nodes.len() {
+            let children = &mut store.nodes[i].children;
+            children.sort_unstable_by_key(|(b, _)| *b);
+            // Deduplicate in case of construction anomalies (shouldn't happen, but cheap insurance).
+            children.dedup_by_key(|(b, _)| *b);
+        }
+
+        store
+    }
+
+    fn insert(&mut self, word: &[u8]) {
+        let mut node_idx = 0usize;
+
+        for &b in word {
+            node_idx = match self.find_child(node_idx, b) {
+                Some(next) => next,
+                None => {
+                    let next = self.new_node();
+                    self.nodes[node_idx].children.push((b, next));
+                    next
+                }
+            };
+        }
+
+        self.nodes[node_idx].terminal = true;
+    }
+
+    #[inline]
+    fn new_node(&mut self) -> usize {
+        let next_index = self.nodes.len();
+        self.nodes.push(Node::default());
+        next_index
+    }
+
+    #[inline]
+    fn find_child(&self, node: usize, byte: u8) -> Option<usize> {
+        // Linear search is fine because the children fan-out is small in practice.
+        self.nodes[node]
+            .children
+            .iter()
+            .find_map(|(b, idx)| (*b == byte).then_some(*idx))
+    }
+
+    /// Creates a matcher for the provided text slice, starting at offset 0.
+    #[inline]
+    pub(crate) fn matcher<'s, 't>(&'s self, text: &'t [u8]) -> BaseNameMatcher<'s, 't> {
+        BaseNameMatcher { store: self, text }
+    }
+}
+
+/// A scanning helper that walks the provided text slice from the beginning
+/// and returns the end of the longest dashed basename prefix, if any.
+///
+/// It stops scanning when:
+/// - it encounters a byte with no matching edge in the trie, or
+/// - it hits a delimiter (whitespace, `!`, `:`).
+///
+/// The returned `usize` is the number of bytes consumed for the matched basename.
+/// The caller is responsible for verifying the boundary (e.g. the next byte is `-`, `:`, whitespace, or end-of-input).
+pub(crate) struct BaseNameMatcher<'s, 't> {
+    store: &'s BaseNameStore,
+    text: &'t [u8],
+}
+
+impl<'s, 't> BaseNameMatcher<'s, 't> {
+    pub(crate) fn base_end(&self) -> usize {
+        let mut node_idx = 0usize;
+        let mut best_end: Option<usize> = None;
+
+        // Scan for a dashed basename using the trie until a delimiter or mismatch.
+        let mut i = 0usize;
+        while i < self.text.len() {
+            let b = self.text[i];
+            if is_delimiter(b) {
+                break;
+            }
+
+            match self.store.find_child(node_idx, b) {
+                Some(next) => {
+                    node_idx = next;
+                    i += 1;
+                    // `next` is always a valid index into `self.store.nodes`;
+                    // if it isn't, the trie is malformed.
+                    if self.store.nodes[node_idx].terminal {
+                        best_end = Some(i);
+                    }
+                }
+                None => {
+                    break;
+                }
+            }
+        }
+
+        // If we found a dashed match, accept it only if the next byte is a valid boundary or end-of-input.
+        if let Some(end) = best_end
+            && self.text.get(end).is_none_or(|b| is_boundary_byte(*b))
+        {
+            return end;
+        }
+
+        // Fallback: the naive basename ends at the first '-', whitespace, '!' or ':'.
+        let mut j = 0usize;
+        while j < self.text.len() {
+            let b = self.text[j];
+            if b == b'-' || is_delimiter(b) {
+                break;
+            }
+            j += 1;
+        }
+        j
+    }
+}
+
+#[inline]
+const fn is_delimiter(b: u8) -> bool {
+    // Delimiters that cannot be part of a basename (excluding '-', which may appear inside dashed basenames):
+    // - whitespace
+    // - '!' important modifier
+    // - ':' variant separator
+    matches!(b, b' ' | b'\n' | b'\r' | b'\t' | b'!' | b':')
+}
+
+#[inline]
+const fn is_boundary_byte(b: u8) -> bool {
+    // Valid boundary after a dashed basename:
+    // - '-' indicates a value follows
+    // - ':' indicates a variant boundary
+    // - whitespace
+    b == b'-' || b == b':' || matches!(b, b' ' | b'\n' | b'\r' | b'\t')
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn matches_longest_prefix() {
+        let store = BaseNameStore::new(&["border-t", "border", "mask-radial-at"]);
+        assert_eq!(
+            store.matcher(b"border-t-red-300").base_end(),
+            "border-t".len()
+        );
+        assert_eq!(
+            store.matcher(b"mask-radial-at-top").base_end(),
+            "mask-radial-at".len()
+        );
+        // Undashed names that aren't in the store fall back to the naive scan.
+        assert_eq!(store.matcher(b"bg-red-500").base_end(), "bg".len());
+    }
+
+    #[test]
+    fn respects_delimiters() {
+        let store = BaseNameStore::new(&["nth-last", "rounded-bl"]);
+        assert_eq!(store.matcher(b"nth-last:odd").base_end(), "nth-last".len());
+        assert_eq!(
+            store.matcher(b"rounded-bl-lg").base_end(),
+            "rounded-bl".len()
+        );
+    }
+
+    #[test]
+    fn stops_on_non_edge() {
+        let store = BaseNameStore::new(&["border-t"]);
+        // No edge for 'f' after "border-t", so it shouldn't match "border-tf".
+        assert_eq!(store.matcher(b"border-tfoo").base_end(), "border".len());
+        // When the trie match fails early, the naive fallback determines the end.
+        assert_eq!(store.matcher(b"bg-red-500").base_end(), "bg".len());
+    }
+}
diff --git a/crates/biome_tailwind_parser/src/lexer/mod.rs b/crates/biome_tailwind_parser/src/lexer/mod.rs
index a26416f0b2c4..18d18a3c3f63 100644
--- a/crates/biome_tailwind_parser/src/lexer/mod.rs
+++ b/crates/biome_tailwind_parser/src/lexer/mod.rs
@@ -1,12 +1,13 @@
+mod base_name_store;
 mod tests;

+use crate::lexer::base_name_store::BASENAME_STORE;
 use crate::token_source::TailwindLexContext;
 use biome_parser::diagnostic::ParseDiagnostic;
 use biome_parser::lexer::{Lexer, LexerCheckpoint, LexerWithCheckpoint, ReLexer, TokenFlags};
 use biome_rowan::{SyntaxKind, TextLen};
 use biome_tailwind_syntax::T;
 use biome_tailwind_syntax::TailwindSyntaxKind::*;
-use biome_tailwind_syntax::metadata::BASENAMES_WITH_DASHES;
 use biome_tailwind_syntax::{TailwindSyntaxKind, TextSize};

 pub(crate) struct TailwindLexer<'src> {
@@ -132,34 +133,10 @@ impl<'src> TailwindLexer<'src> {
     fn consume_base(&mut self) -> TailwindSyntaxKind {
         self.assert_current_char_boundary();

-        // Find the longest matching base name
-        let source_from_position = &self.source[self.position..];
-        let base_name = BASENAMES_WITH_DASHES
-            .iter()
-            .rfind(|&name| source_from_position.starts_with(name));
-
-        if let Some(base_name) = base_name {
-            // we need to make sure that either the base is followed by a `-` to signify a value is coming, or a whitespace/end of input to signify the end of the base
-            match self.byte_at(base_name.len()) {
-                Some(b'-') | None => {
-                    self.advance(base_name.len());
-                    return TW_BASE;
-                }
-                Some(b) if b.is_ascii_whitespace() => {
-                    self.advance(base_name.len());
-                    return TW_BASE;
-                }
-                _ => {}
-            }
-        }
-
-        while let Some(byte) = self.current_byte() {
-            let char = self.current_char_unchecked();
-            if char.is_whitespace() || byte == b'-' || byte == b'!' || byte == b':' {
-                break;
-            }
-            self.advance(char.len_utf8());
-        }
+        let bytes = self.source.as_bytes();
+        let slice = &bytes[self.position..];
+        let end = BASENAME_STORE.matcher(slice).base_end();
+        self.advance(end);

         TW_BASE
     }
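
A minimal sketch of the call pattern the lexer now relies on. The input string is made up, and it assumes `mask-radial-at` is present in the generated `BASENAMES_WITH_DASHES` list (the unit test above only exercises a hand-built store, so treat that as an assumption):

    let text = b"mask-radial-at-top";
    // The trie picks the longest dashed basename; a naive scan would have stopped at "mask".
    let end = BASENAME_STORE.matcher(text).base_end();
    assert_eq!(&text[..end], b"mask-radial-at");
    // consume_base then advances `end` bytes and emits TW_BASE; the trailing
    // "-top" is handled by the value lexing that follows.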