From b05b9672e64b3430861e276c64a6c2c3eeabbda7 Mon Sep 17 00:00:00 2001
From: Carson McManus
Date: Sun, 2 Nov 2025 15:01:56 -0500
Subject: [PATCH] perf(parse/tailwind): use compact trie for lexing base names
 instead of linear search

---
 .../src/lexer/base_name_store.rs              | 206 ++++++++++++++++++
 crates/biome_tailwind_parser/src/lexer/mod.rs |  35 +--
 2 files changed, 212 insertions(+), 29 deletions(-)
 create mode 100644 crates/biome_tailwind_parser/src/lexer/base_name_store.rs

diff --git a/crates/biome_tailwind_parser/src/lexer/base_name_store.rs b/crates/biome_tailwind_parser/src/lexer/base_name_store.rs
new file mode 100644
index 000000000000..d0950733a56c
--- /dev/null
+++ b/crates/biome_tailwind_parser/src/lexer/base_name_store.rs
@@ -0,0 +1,206 @@
+use std::sync::LazyLock;
+
+use biome_tailwind_syntax::metadata::BASENAMES_WITH_DASHES;
+
+/// A global store of dashed basenames for efficient reuse across lexing.
+///
+/// Ideally, this will be built by parsing a project's Tailwind config, since it can add custom utilities and variants.
+pub static BASENAME_STORE: LazyLock<BaseNameStore> =
+    LazyLock::new(|| BaseNameStore::new(BASENAMES_WITH_DASHES));
+
+/// A compact trie storing all dashed basenames, used to efficiently match the longest valid basename
+/// at the beginning of a byte slice.
+///
+/// Build it once from `BASENAMES_WITH_DASHES` and reuse it across lexing.
+pub(crate) struct BaseNameStore {
+    nodes: Vec<Node>,
+}
+
+#[derive(Default)]
+struct Node {
+    terminal: bool,
+    // Children are stored as (byte, child_index) pairs.
+    children: Vec<(u8, usize)>,
+}
+
+impl BaseNameStore {
+    /// Creates a store from a list of ASCII basenames.
+    pub(crate) fn new(list: &[&str]) -> Self {
+        let mut store = Self {
+            nodes: vec![Node::default()], // root
+        };
+
+        for name in list {
+            store.insert(name.as_bytes());
+        }
+
+        // Optional: sort children to enable binary search (currently linear search is fine).
+        for i in 0..store.nodes.len() {
+            let children = &mut store.nodes[i].children;
+            children.sort_unstable_by_key(|(b, _)| *b);
+            // Deduplicate in case of construction anomalies (shouldn't happen, but cheap insurance).
+            children.dedup_by_key(|(b, _)| *b);
+        }
+
+        store
+    }
+
+    fn insert(&mut self, word: &[u8]) {
+        let mut node_idx = 0usize;
+
+        for &b in word {
+            node_idx = match self.find_child(node_idx, b) {
+                Some(next) => next,
+                None => {
+                    let next = self.new_node();
+                    self.nodes[node_idx].children.push((b, next));
+                    next
+                }
+            };
+        }
+
+        self.nodes[node_idx].terminal = true;
+    }
+
+    #[inline]
+    fn new_node(&mut self) -> usize {
+        let next_index = self.nodes.len();
+        self.nodes.push(Node::default());
+        next_index
+    }
+
+    #[inline]
+    fn find_child(&self, node: usize, byte: u8) -> Option<usize> {
+        // Linear search is fine because the children fan-out is small in practice.
+        self.nodes[node]
+            .children
+            .iter()
+            .find_map(|(b, idx)| (*b == byte).then_some(*idx))
+    }
+
+    /// Creates a matcher for the provided text slice, starting at offset 0.
+    #[inline]
+    pub(crate) fn matcher<'s, 't>(&'s self, text: &'t [u8]) -> BaseNameMatcher<'s, 't> {
+        BaseNameMatcher { store: self, text }
+    }
+}
+
+/// A scanning helper that walks the provided text slice from the beginning
+/// and returns the end of the longest dashed basename prefix, if any.
+///
+/// It stops scanning when:
+/// - it encounters a byte with no matching edge in the trie, or
+/// - it hits a delimiter (whitespace, `!`, `:`).
+///
+/// The returned `usize` is the number of bytes consumed for the matched basename.
+/// The caller is responsible for verifying the boundary (e.g. the next byte is `-`, `:`, whitespace, or end-of-input).
+pub(crate) struct BaseNameMatcher<'s, 't> {
+    store: &'s BaseNameStore,
+    text: &'t [u8],
+}
+
+impl<'s, 't> BaseNameMatcher<'s, 't> {
+    pub(crate) fn base_end(&self) -> usize {
+        let mut node_idx = 0usize;
+        let mut best_end: Option<usize> = None;
+
+        // Scan for a dashed basename using the trie until a delimiter or mismatch.
+        let mut i = 0usize;
+        while i < self.text.len() {
+            let b = self.text[i];
+            if is_delimiter(b) {
+                break;
+            }
+
+            match self.store.find_child(node_idx, b) {
+                Some(next) => {
+                    node_idx = next;
+                    i += 1;
+                    // `next` is always a valid index into `self.store.nodes`;
+                    // if it isn't, the trie is malformed.
+                    if self.store.nodes[node_idx].terminal {
+                        best_end = Some(i);
+                    }
+                }
+                None => {
+                    break;
+                }
+            }
+        }
+
+        // If we found a dashed match, accept it only if the next byte is a valid boundary or end-of-input.
+        if let Some(end) = best_end
+            && self.text.get(end).is_none_or(|b| is_boundary_byte(*b))
+        {
+            return end;
+        }
+
+        // Fallback: the naive basename ends at the first '-', whitespace, '!' or ':'.
+        let mut j = 0usize;
+        while j < self.text.len() {
+            let b = self.text[j];
+            if b == b'-' || is_delimiter(b) {
+                break;
+            }
+            j += 1;
+        }
+        j
+    }
+}
+
+#[inline]
+const fn is_delimiter(b: u8) -> bool {
+    // Delimiters that cannot be part of a basename (excluding '-', which may appear inside dashed basenames):
+    // - whitespace
+    // - '!' important modifier
+    // - ':' variant separator
+    matches!(b, b' ' | b'\n' | b'\r' | b'\t' | b'!' | b':')
+}
+
+#[inline]
+const fn is_boundary_byte(b: u8) -> bool {
+    // Valid boundary after a dashed basename:
+    // - '-' indicates a value follows
+    // - ':' indicates a variant boundary
+    // - whitespace
+    b == b'-' || b == b':' || matches!(b, b' ' | b'\n' | b'\r' | b'\t')
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn matches_longest_prefix() {
+        let store = BaseNameStore::new(&["border-t", "border", "mask-radial-at"]);
+        assert_eq!(
+            store.matcher(b"border-t-red-300").base_end(),
+            "border-t".len()
+        );
+        assert_eq!(
+            store.matcher(b"mask-radial-at-top").base_end(),
+            "mask-radial-at".len()
+        );
+        // Undashed names that aren't in the store fall back to the naive scan.
+        assert_eq!(store.matcher(b"bg-red-500").base_end(), "bg".len());
+    }
+
+    #[test]
+    fn respects_delimiters() {
+        let store = BaseNameStore::new(&["nth-last", "rounded-bl"]);
+        assert_eq!(store.matcher(b"nth-last:odd").base_end(), "nth-last".len());
+        assert_eq!(
+            store.matcher(b"rounded-bl-lg").base_end(),
+            "rounded-bl".len()
+        );
+    }
+
+    #[test]
+    fn stops_on_non_edge() {
+        let store = BaseNameStore::new(&["border-t"]);
+        // No edge for 'f' after "border-t", so it shouldn't match "border-tf".
+        assert_eq!(store.matcher(b"border-tfoo").base_end(), "border".len());
+        // When the trie match fails early, the naive fallback determines the end.
+        assert_eq!(store.matcher(b"bg-red-500").base_end(), "bg".len());
+    }
+}
diff --git a/crates/biome_tailwind_parser/src/lexer/mod.rs b/crates/biome_tailwind_parser/src/lexer/mod.rs
index a26416f0b2c4..18d18a3c3f63 100644
--- a/crates/biome_tailwind_parser/src/lexer/mod.rs
+++ b/crates/biome_tailwind_parser/src/lexer/mod.rs
@@ -1,12 +1,13 @@
+mod base_name_store;
 mod tests;

+use crate::lexer::base_name_store::BASENAME_STORE;
 use crate::token_source::TailwindLexContext;
 use biome_parser::diagnostic::ParseDiagnostic;
 use biome_parser::lexer::{Lexer, LexerCheckpoint, LexerWithCheckpoint, ReLexer, TokenFlags};
 use biome_rowan::{SyntaxKind, TextLen};
 use biome_tailwind_syntax::T;
 use biome_tailwind_syntax::TailwindSyntaxKind::*;
-use biome_tailwind_syntax::metadata::BASENAMES_WITH_DASHES;
 use biome_tailwind_syntax::{TailwindSyntaxKind, TextSize};

 pub(crate) struct TailwindLexer<'src> {
@@ -132,34 +133,10 @@ impl<'src> TailwindLexer<'src> {
     fn consume_base(&mut self) -> TailwindSyntaxKind {
         self.assert_current_char_boundary();

-        // Find the longest matching base name
-        let source_from_position = &self.source[self.position..];
-        let base_name = BASENAMES_WITH_DASHES
-            .iter()
-            .rfind(|&name| source_from_position.starts_with(name));
-
-        if let Some(base_name) = base_name {
-            // we need to make sure that either the base is followed by a `-` to signify a value is coming, or a whitespace/end of input to signify the end of the base
-            match self.byte_at(base_name.len()) {
-                Some(b'-') | None => {
-                    self.advance(base_name.len());
-                    return TW_BASE;
-                }
-                Some(b) if b.is_ascii_whitespace() => {
-                    self.advance(base_name.len());
-                    return TW_BASE;
-                }
-                _ => {}
-            }
-        }
-
-        while let Some(byte) = self.current_byte() {
-            let char = self.current_char_unchecked();
-            if char.is_whitespace() || byte == b'-' || byte == b'!' || byte == b':' {
-                break;
-            }
-            self.advance(char.len_utf8());
-        }
+        let bytes = self.source.as_bytes();
+        let slice = &bytes[self.position..];
+        let end = BASENAME_STORE.matcher(slice).base_end();
+        self.advance(end);

         TW_BASE
     }
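
A minimal sketch of the call pattern the lexer now relies on. The input string is made up, and it assumes `mask-radial-at` is present in the generated `BASENAMES_WITH_DASHES` list (the unit test above only exercises a hand-built store, so treat that as an assumption):

    let text = b"mask-radial-at-top";
    // The trie picks the longest dashed basename; a naive scan would have stopped at "mask".
    let end = BASENAME_STORE.matcher(text).base_end();
    assert_eq!(&text[..end], b"mask-radial-at");
    // consume_base then advances `end` bytes and emits TW_BASE; the trailing
    // "-top" is handled by the value lexing that follows.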