chore(bench): add index acceleration and query implementations harness

zolero · zolero · commit 6cf7dc018afb · 2026-05-10T21:54:50.000+02:00
diff --git a/crates/lora-database/benches/README.md b/crates/lora-database/benches/README.md
@@ -0,0 +1,26 @@
+# lora-database Benchmarks
+
+Benchmarks are split by intent:
+
+| Target | Purpose |
+| --- | --- |
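+| `query_implementations` | Coverage-oriented query-language suite. Add representative benches here when a tested query implementation changes or lands. |
+| `index_acceleration` | Paired with-index / without-index runs of the same read queries, isolating the speedup from property and text indexes. |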
+| `scale` | Same query families across larger graph sizes. |
+| `realistic` | End-to-end domain-shaped workloads that combine several operators. |
+| `perf_smoke` | Short CI canary for large regressions. |
+| `wal` | Durability and recovery overhead. |
+| `concurrent` | Concurrent read/write workload behavior. |
+| `concurrency_guard` | Focused guardrail suite for snapshot, OCC, and WAL concurrency changes. |
+| `engine`, `advanced`, `temporal_spatial` | Older deep-dive suites kept for historical comparison and detailed performance docs. Prefer `query_implementations` for new query-feature coverage. |
+
+Run the coverage suite:
+
+```bash
+cargo bench -p lora-database --bench query_implementations
+```
+
+Run every registered database benchmark:
+
+```bash
+cargo bench -p lora-database --benches
+```
diff --git a/crates/lora-database/benches/concurrency_guard.rs b/crates/lora-database/benches/concurrency_guard.rs
@@ -116,6 +116,61 @@ fn bench_concurrency_guard(c: &mut Criterion) {
         });
     }
 
+    // Large materialized read paths. These catch whether intra-query read
+    // parallelism is actually paying off instead of just adding scheduling
+    // overhead to normal `execute()`.
+    {
+        let db = build_node_graph(Scale::LARGE);
+        group.bench_function("read_scan_50k", |b| {
+            b.iter(|| {
+                black_box(db.service.execute("MATCH (n) RETURN n.id", opts()).unwrap());
+            });
+        });
+
+        group.bench_function("read_label_project_50k", |b| {
+            b.iter(|| {
+                black_box(
+                    db.service
+                        .execute("MATCH (n:Node) RETURN n.id, n.name, n.value", opts())
+                        .unwrap(),
+                );
+            });
+        });
+
+        group.bench_function("read_scan_filter_project_50k", |b| {
+            b.iter(|| {
+                black_box(
+                    db.service
+                        .execute("MATCH (n:Node) WHERE n.value >= 0 RETURN n.id", opts())
+                        .unwrap(),
+                );
+            });
+        });
+
+        group.bench_function("read_scan_filter_half_project_50k", |b| {
+            b.iter(|| {
+                black_box(
+                    db.service
+                        .execute("MATCH (n:Node) WHERE n.value >= 50 RETURN n.id", opts())
+                        .unwrap(),
+                );
+            });
+        });
+
+        group.bench_function("read_map_projection_50k", |b| {
+            b.iter(|| {
+                black_box(
+                    db.service
+                        .execute(
+                            "MATCH (n:Node) RETURN n { .id, .name, .value } AS node",
+                            opts(),
+                        )
+                        .unwrap(),
+                );
+            });
+        });
+    }
+
     // Live read stream: pins an Arc snapshot and drops after one row.
     {
         let db = build_node_graph(Scale::SMALL);
diff --git a/crates/lora-database/benches/index_acceleration.rs b/crates/lora-database/benches/index_acceleration.rs
@@ -0,0 +1,189 @@
+//! Index-acceleration benchmarks.
+//!
+//! Pairs every read query against two seeded copies of the same graph —
+//! one with a property or text index, one without — so the runtime
+//! delta directly attributes to the cost-based rewrite picking up the
+//! index. Covers the four operators added in v0.8 — two node-side scans
+//! and their rel-side mirrors:
+//!
+//! * `NodeByPropertyRangeScan` ← `WHERE n.prop > X`
+//! * `NodeByTextScan`          ← `WHERE n.prop STARTS WITH …`
+//! * `RelByPropertyRangeScan`  ← `MATCH ()-[r:T]->() WHERE r.prop > X`
+//! * `RelByTextScan`           ← `MATCH ()-[r:T]->() WHERE r.prop STARTS WITH …`
+//!
+//! Run with:
+//!   `cargo bench -p lora-database --bench index_acceleration`
+//!
+//! Each scenario seeds once and is reused across iterations; only the
+//! query is measured. Set `LORA_BENCH_NODES` / `LORA_BENCH_RELS` in the
+//! environment to override the defaults (10k / 50k) for a quick local
+//! sweep, e.g. `LORA_BENCH_NODES=2000 LORA_BENCH_RELS=8000 cargo bench …`.
+
+mod fixtures;
+
+use std::env;
+use std::hint::black_box;
+use std::time::Duration;
+
+use criterion::{criterion_group, criterion_main, Criterion};
+use fixtures::BenchDb;
+use lora_database::{ExecuteOptions, ResultFormat};
+
+// Graph size used when `LORA_BENCH_NODES` / `LORA_BENCH_RELS` are unset
+// or do not parse as `usize`.
+const DEFAULT_NODES: usize = 10_000;
+const DEFAULT_RELS: usize = 50_000;
+// Number of ids covered by each seeding statement issued by `seed_graph`.
+const SEED_BATCH: usize = 2_000;
+
+/// Execution options shared by every query in this file: results are
+/// requested in `ResultFormat::Rows` form.
+fn opts() -> Option<ExecuteOptions> {
+    let format = ResultFormat::Rows;
+    Some(ExecuteOptions { format })
+}
+
+/// Read the environment variable `key` as a `usize`.
+///
+/// Falls back to `default` when the variable cannot be read (unset or
+/// not valid Unicode) or when its value does not parse as a `usize`.
+fn env_usize(key: &str, default: usize) -> usize {
+    match env::var(key) {
+        Ok(raw) => raw.parse().unwrap_or(default),
+        Err(_) => default,
+    }
+}
+
+/// Criterion settings for this suite: 500 ms warm-up, a 4 s measurement
+/// window, and 30 samples per benchmark.
+fn bench_config() -> Criterion {
+    // Rel-side scenarios at default scale need ~3 ms per iter, so a 4 s
+    // window lets Criterion reach the 30-sample target without emitting
+    // a warning. Pass `--measurement-time` on the command line for
+    // shorter runs.
+    let warm_up = Duration::from_millis(500);
+    let measurement = Duration::from_millis(4_000);
+
+    Criterion::default()
+        .sample_size(30)
+        .warm_up_time(warm_up)
+        .measurement_time(measurement)
+}
+
+/// Seed `nodes` `:Person` nodes and `rels` `:KNOWS` relationships between
+/// deterministically chosen endpoints. Properties are chosen so:
+///
+/// * `n.age` is `i % 100` (0..=99) → a range predicate keeps a fixed
+///   fraction of the corpus, e.g. `n.age > 95` keeps 4 of every 100 nodes.
+/// * `n.name` is `'p_<i>'` → STARTS WITH 'p_5' matches ~1/10 of the
+///   corpus, exercising the trigram path under load.
+/// * `r.since` is `1990 + i % 41` (1990..=2030) → range queries split
+///   the rel set, e.g. `r.since > 2025` keeps 5 of every 41 rels.
+/// * `r.note` is `'note_<i % 100>'` → STARTS WITH 'note_5' matches 11 of
+///   every 100 rels (`note_5` plus `note_50`..`note_59`).
+/// * `r.idx` is the relationship's own sequence number `i`.
+///
+/// Relationship `i` runs from `Person {id: i % nodes}` to
+/// `Person {id: (i * 7 + 3) % nodes}`. Nothing here is random, so two
+/// builds of the same `(nodes, rels)` produce identical edges — the
+/// indexed and non-indexed databases see exactly the same data.
+///
+/// Statements are issued in batches of `SEED_BATCH` ids. When `nodes`
+/// is 0 no relationships are seeded: there would be no endpoints to
+/// match, and the generated `% {nodes}` would be a modulo by zero.
+fn seed_graph(db: &BenchDb, nodes: usize, rels: usize) {
+    for start in (0..nodes).step_by(SEED_BATCH) {
+        // The second argument to `range` is the last id of this batch.
+        let last = (start + SEED_BATCH).min(nodes) - 1;
+        db.run(&format!(
+            "UNWIND range({start}, {last}) AS i \
+             CREATE (:Person {{id: i, age: i % 100, name: 'p_' + toString(i)}})"
+        ));
+    }
+
+    // Without any `:Person` nodes the endpoint lookup below cannot match
+    // anything, and `i % 0` must never reach the query engine.
+    if nodes == 0 {
+        return;
+    }
+
+    for start in (0..rels).step_by(SEED_BATCH) {
+        let last = (start + SEED_BATCH).min(rels) - 1;
+        db.run(&format!(
+            "UNWIND range({start}, {last}) AS i \
+             MATCH (a:Person {{id: i % {nodes}}}), (b:Person {{id: (i * 7 + 3) % {nodes}}}) \
+             CREATE (a)-[:KNOWS {{since: 1990 + i % 41, note: 'note_' + toString(i % 100), idx: i}}]->(b)"
+        ));
+    }
+}
+
+/// Build two databases with identical data but different index
+/// catalogs: `(without_index, with_index)`.
+///
+/// Each database is created with `BenchDb::with_capacity_hint(nodes, rels)`
+/// and filled by `seed_graph`. `install_index` is called exactly once,
+/// on the second database only, and only *after* the seed, so the index
+/// is built once over the existing corpus rather than incrementally per
+/// CREATE. Since it is invoked a single time the bound is `FnOnce`;
+/// `Fn` and `FnMut` closures satisfy it as well.
+fn build_pair<F>(nodes: usize, rels: usize, install_index: F) -> (BenchDb, BenchDb)
+where
+    F: FnOnce(&BenchDb),
+{
+    // Both copies go through the identical create-then-seed path.
+    let seeded = || {
+        let db = BenchDb::with_capacity_hint(nodes, rels);
+        seed_graph(&db, nodes, rels);
+        db
+    };
+
+    let plain = seeded();
+    let indexed = seeded();
+    install_index(&indexed);
+
+    (plain, indexed)
+}
+
+/// Execute `query` on `db` with the suite's `opts()` and feed the result
+/// through `black_box`. Panics if execution returns an error.
+fn run(db: &BenchDb, query: &str) {
+    let outcome = db.service.execute(query, opts());
+    black_box(outcome.unwrap());
+}
+
+/// Node range predicate: runs `n.age > 95` over `:Person` against an
+/// unindexed graph and an identical copy carrying the `person_age` index.
+fn bench_node_range(c: &mut Criterion) {
+    let node_count = env_usize("LORA_BENCH_NODES", DEFAULT_NODES);
+    let rel_count = env_usize("LORA_BENCH_RELS", DEFAULT_RELS);
+
+    let add_index = |db: &BenchDb| {
+        db.run("CREATE INDEX person_age FOR (n:Person) ON (n.age)");
+    };
+    let (plain, indexed) = build_pair(node_count, rel_count, add_index);
+
+    let cypher = "MATCH (n:Person) WHERE n.age > 95 RETURN n.id";
+
+    let mut group = c.benchmark_group("index_acceleration/node_range");
+    // Same query, same data; only the index catalog differs per entry.
+    for (label, db) in [("without_index", &plain), ("with_index", &indexed)] {
+        group.bench_function(label, |b| b.iter(|| run(db, cypher)));
+    }
+    group.finish();
+}
+
+/// Node text predicate: runs `n.name STARTS WITH 'p_99'` over `:Person`
+/// against an unindexed graph and an identical copy carrying the
+/// `person_name` text index.
+fn bench_node_text(c: &mut Criterion) {
+    let node_count = env_usize("LORA_BENCH_NODES", DEFAULT_NODES);
+    let rel_count = env_usize("LORA_BENCH_RELS", DEFAULT_RELS);
+
+    let add_index = |db: &BenchDb| {
+        db.run("CREATE TEXT INDEX person_name FOR (n:Person) ON (n.name)");
+    };
+    let (plain, indexed) = build_pair(node_count, rel_count, add_index);
+
+    let cypher = "MATCH (n:Person) WHERE n.name STARTS WITH 'p_99' RETURN n.id";
+
+    let mut group = c.benchmark_group("index_acceleration/node_text");
+    // Same query, same data; only the index catalog differs per entry.
+    for (label, db) in [("without_index", &plain), ("with_index", &indexed)] {
+        group.bench_function(label, |b| b.iter(|| run(db, cypher)));
+    }
+    group.finish();
+}
+
+/// Relationship range predicate: runs `r.since > 2025` over `:KNOWS`
+/// against an unindexed graph and an identical copy carrying the
+/// `knows_since` index.
+fn bench_rel_range(c: &mut Criterion) {
+    let node_count = env_usize("LORA_BENCH_NODES", DEFAULT_NODES);
+    let rel_count = env_usize("LORA_BENCH_RELS", DEFAULT_RELS);
+
+    let add_index = |db: &BenchDb| {
+        db.run("CREATE INDEX knows_since FOR ()-[r:KNOWS]-() ON (r.since)");
+    };
+    let (plain, indexed) = build_pair(node_count, rel_count, add_index);
+
+    let cypher = "MATCH ()-[r:KNOWS]->() WHERE r.since > 2025 RETURN r.idx";
+
+    let mut group = c.benchmark_group("index_acceleration/rel_range");
+    // Same query, same data; only the index catalog differs per entry.
+    for (label, db) in [("without_index", &plain), ("with_index", &indexed)] {
+        group.bench_function(label, |b| b.iter(|| run(db, cypher)));
+    }
+    group.finish();
+}
+
+/// Relationship text predicate: runs `r.note STARTS WITH 'note_9'` over
+/// `:KNOWS` against an unindexed graph and an identical copy carrying
+/// the `knows_note` text index.
+fn bench_rel_text(c: &mut Criterion) {
+    let node_count = env_usize("LORA_BENCH_NODES", DEFAULT_NODES);
+    let rel_count = env_usize("LORA_BENCH_RELS", DEFAULT_RELS);
+
+    let add_index = |db: &BenchDb| {
+        db.run("CREATE TEXT INDEX knows_note FOR ()-[r:KNOWS]-() ON (r.note)");
+    };
+    let (plain, indexed) = build_pair(node_count, rel_count, add_index);
+
+    let cypher = "MATCH ()-[r:KNOWS]->() WHERE r.note STARTS WITH 'note_9' RETURN r.idx";
+
+    let mut group = c.benchmark_group("index_acceleration/rel_text");
+    // Same query, same data; only the index catalog differs per entry.
+    for (label, db) in [("without_index", &plain), ("with_index", &indexed)] {
+        group.bench_function(label, |b| b.iter(|| run(db, cypher)));
+    }
+    group.finish();
+}
+
+// Register the four scenarios as a single Criterion group named
+// `index_acceleration`, configured by `bench_config()`, and generate the
+// benchmark binary's entry point from that group.
+criterion_group! {
+    name = index_acceleration;
+    config = bench_config();
+    targets =
+        bench_node_range,
+        bench_node_text,
+        bench_rel_range,
+        bench_rel_text,
+}
+criterion_main!(index_acceleration);
diff --git a/crates/lora-database/benches/perf_smoke.rs b/crates/lora-database/benches/perf_smoke.rs
@@ -3,8 +3,8 @@
 //! This is a deliberately tiny Criterion suite used as a CI "canary": it is
 //! meant to detect obvious, large performance regressions (≥3× slower) in
 //! core engine paths. It is **not** a source of truth for performance
-//! numbers — see `engine`, `scale`,
-//! `advanced`, and `temporal_spatial` for that.
+//! numbers — see `query_implementations`, `scale`, `realistic`, and the
+//! WAL/concurrency suites for that.
 //!
 //! Run locally with:
 //!   `cargo bench -p lora-database --bench perf_smoke`
diff --git a/crates/lora-database/benches/query_implementations.rs b/crates/lora-database/benches/query_implementations.rs