Thanks for visiting codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions gateway/src/routes/internal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,10 @@ pub fn build_internal_non_otel_enabled_routes() -> Router<AppStateData> {
"/internal/evaluations/run-stats",
get(endpoints::internal::evaluations::get_evaluation_run_stats_handler),
)
.route(
"/internal/evaluations/runs",
get(endpoints::internal::evaluations::list_evaluation_runs_handler),
)
.route(
"/internal/models/usage",
get(endpoints::internal::models::get_model_usage_handler),
Expand Down
13 changes: 13 additions & 0 deletions internal/tensorzero-node/lib/bindings/EvaluationRunInfo.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually.
// NOTE(review): generated binding — change the Rust `EvaluationRunInfoRow` struct
// and regenerate rather than editing this file directly.

/**
 * Information about a single evaluation run.
 */
export type EvaluationRunInfo = {
  // UUID of the evaluation run, serialized as a string.
  evaluation_run_id: string;
  evaluation_name: string;
  dataset_name: string;
  function_name: string;
  variant_name: string;
  // ISO-8601 timestamp (UTC) of the most recent inference in the run.
  last_inference_timestamp: string;
};
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
// This file was generated by [ts-rs](https://github.com/Aleph-Alpha/ts-rs). Do not edit this file manually.
// NOTE(review): generated binding — regenerate from the Rust source instead of editing.
import type { EvaluationRunInfo } from "./EvaluationRunInfo";

/**
 * Response containing a list of evaluation runs.
 */
export type ListEvaluationRunsResponse = { runs: Array<EvaluationRunInfo> };
2 changes: 2 additions & 0 deletions internal/tensorzero-node/lib/bindings/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ export * from "./EvaluationRunCompleteEvent";
export * from "./EvaluationRunErrorEvent";
export * from "./EvaluationRunEvent";
export * from "./EvaluationRunFatalErrorEvent";
export * from "./EvaluationRunInfo";
export * from "./EvaluationRunStartEvent";
export * from "./EvaluationRunStatsResponse";
export * from "./EvaluationRunSuccessEvent";
Expand Down Expand Up @@ -154,6 +155,7 @@ export * from "./LaunchOptimizationParams";
export * from "./LaunchOptimizationWorkflowParams";
export * from "./ListDatapointsRequest";
export * from "./ListDatasetsResponse";
export * from "./ListEvaluationRunsResponse";
export * from "./ListInferenceMetadataResponse";
export * from "./ListInferencesRequest";
export * from "./MetricConfig";
Expand Down
222 changes: 221 additions & 1 deletion tensorzero-core/src/db/clickhouse/evaluation_queries.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@ use std::collections::HashMap;
use async_trait::async_trait;

use super::ClickHouseConnectionInfo;
use super::select_queries::parse_count;
use super::select_queries::{parse_count, parse_json_rows};
use crate::db::evaluation_queries::EvaluationQueries;
use crate::db::evaluation_queries::EvaluationRunInfoRow;
use crate::error::Error;

#[async_trait]
Expand All @@ -21,4 +22,223 @@ impl EvaluationQueries for ClickHouseConnectionInfo {
let response = self.run_query_synchronous(query, &HashMap::new()).await?;
parse_count(&response.response)
}

async fn list_evaluation_runs(
&self,
limit: u32,
offset: u32,
) -> Result<Vec<EvaluationRunInfoRow>, Error> {
let query = r"
SELECT
evaluation_run_id,
any(evaluation_name) AS evaluation_name,
any(inference_function_name) AS function_name,
any(variant_name) AS variant_name,
any(dataset_name) AS dataset_name,
formatDateTime(UUIDv7ToDateTime(uint_to_uuid(max(max_inference_id))), '%Y-%m-%dT%H:%i:%SZ') AS last_inference_timestamp
FROM (
SELECT
maxIf(value, key = 'tensorzero::evaluation_run_id') AS evaluation_run_id,
maxIf(value, key = 'tensorzero::evaluation_name') AS evaluation_name,
maxIf(value, key = 'tensorzero::dataset_name') AS dataset_name,
any(function_name) AS inference_function_name,
any(variant_name) AS variant_name,
max(toUInt128(inference_id)) AS max_inference_id
FROM TagInference FINAL
WHERE key IN ('tensorzero::evaluation_run_id', 'tensorzero::evaluation_name', 'tensorzero::dataset_name')
GROUP BY inference_id
)
WHERE NOT startsWith(inference_function_name, 'tensorzero::')
GROUP BY evaluation_run_id
ORDER BY toUInt128(toUUID(evaluation_run_id)) DESC
LIMIT {limit:UInt32}
OFFSET {offset:UInt32}
FORMAT JSONEachRow
"
.to_string();

let limit_str = limit.to_string();
let offset_str = offset.to_string();
let mut params = HashMap::new();
params.insert("limit", limit_str.as_str());
params.insert("offset", offset_str.as_str());

let response = self.run_query_synchronous(query, &params).await?;

parse_json_rows(response.response.as_str())
}
}

#[cfg(test)]
mod tests {
    //! Unit tests for the ClickHouse implementation of `EvaluationQueries`,
    //! using a mocked ClickHouse client so no database is required.

    use std::sync::Arc;

    use crate::db::{
        clickhouse::{
            ClickHouseConnectionInfo, ClickHouseResponse, ClickHouseResponseMetadata,
            clickhouse_client::MockClickHouseClient,
            query_builder::test_util::assert_query_contains,
        },
        evaluation_queries::EvaluationQueries,
    };

    /// Verifies the count query shape (distinct run-id tag values) and that it
    /// is issued with no bound parameters, then checks the parsed count.
    #[tokio::test]
    async fn test_count_total_evaluation_runs() {
        let mut mock_clickhouse_client = MockClickHouseClient::new();

        mock_clickhouse_client
            .expect_run_query_synchronous()
            .withf(|query, params| {
                assert_query_contains(
                    query,
                    "SELECT toUInt32(uniqExact(value)) as count
                    FROM TagInference
                    WHERE key = 'tensorzero::evaluation_run_id'
                    FORMAT JSONEachRow",
                );
                assert_eq!(params.len(), 0, "Should have no parameters");
                true
            })
            .returning(|_, _| {
                Ok(ClickHouseResponse {
                    response: r#"{"count":42}"#.to_string(),
                    metadata: ClickHouseResponseMetadata {
                        read_rows: 1,
                        written_rows: 0,
                    },
                })
            });

        let conn = ClickHouseConnectionInfo::new_mock(Arc::new(mock_clickhouse_client));

        let result = conn.count_total_evaluation_runs().await.unwrap();

        assert_eq!(result, 42, "Should return count of 42");
    }

    /// Checks the list query structure, the default pagination parameters
    /// (limit=100, offset=0), and deserialization of a single JSONEachRow row.
    #[tokio::test]
    async fn test_list_evaluation_runs_with_defaults() {
        let mut mock_clickhouse_client = MockClickHouseClient::new();

        mock_clickhouse_client
            .expect_run_query_synchronous()
            .withf(|query, params| {
                // Verify the query contains the expected structure
                assert_query_contains(query, "SELECT");
                assert_query_contains(query, "evaluation_run_id");
                assert_query_contains(query, "FROM TagInference FINAL");
                assert_query_contains(query, "LIMIT {limit:UInt32}");
                assert_query_contains(query, "OFFSET {offset:UInt32}");

                // Verify parameters
                assert_eq!(params.get("limit"), Some(&"100"));
                assert_eq!(params.get("offset"), Some(&"0"));
                true
            })
            .returning(|_, _| {
                Ok(ClickHouseResponse {
                    response: r#"{"evaluation_run_id":"0196ee9c-d808-74f3-8000-02ec7409b95d","evaluation_name":"test_eval","function_name":"test_func","variant_name":"test_variant","dataset_name":"test_dataset","last_inference_timestamp":"2025-05-20T16:52:58Z"}"#.to_string(),
                    metadata: ClickHouseResponseMetadata {
                        read_rows: 1,
                        written_rows: 0,
                    },
                })
            });

        let conn = ClickHouseConnectionInfo::new_mock(Arc::new(mock_clickhouse_client));

        let result = conn.list_evaluation_runs(100, 0).await.unwrap();

        assert_eq!(result.len(), 1, "Should return one evaluation run");
        assert_eq!(result[0].evaluation_name, "test_eval");
        assert_eq!(result[0].function_name, "test_func");
        assert_eq!(result[0].variant_name, "test_variant");
        assert_eq!(result[0].dataset_name, "test_dataset");
    }

    /// Ensures caller-supplied limit/offset values are forwarded verbatim as
    /// query parameters, and that an empty response yields an empty Vec.
    #[tokio::test]
    async fn test_list_evaluation_runs_with_custom_pagination() {
        let mut mock_clickhouse_client = MockClickHouseClient::new();

        mock_clickhouse_client
            .expect_run_query_synchronous()
            .withf(|_query, params| {
                // Verify custom pagination parameters
                assert_eq!(params.get("limit"), Some(&"50"));
                assert_eq!(params.get("offset"), Some(&"100"));
                true
            })
            .returning(|_, _| {
                Ok(ClickHouseResponse {
                    response: String::new(),
                    metadata: ClickHouseResponseMetadata {
                        read_rows: 0,
                        written_rows: 0,
                    },
                })
            });

        let conn = ClickHouseConnectionInfo::new_mock(Arc::new(mock_clickhouse_client));

        let result = conn.list_evaluation_runs(50, 100).await.unwrap();

        assert_eq!(result.len(), 0, "Should return empty results");
    }

    /// Verifies multi-line JSONEachRow output (one JSON object per line) is
    /// parsed into multiple rows, preserving order.
    #[tokio::test]
    async fn test_list_evaluation_runs_multiple_results() {
        let mut mock_clickhouse_client = MockClickHouseClient::new();

        mock_clickhouse_client
            .expect_run_query_synchronous()
            .returning(|_, _| {
                Ok(ClickHouseResponse {
                    response: r#"{"evaluation_run_id":"0196ee9c-d808-74f3-8000-02ec7409b95d","evaluation_name":"eval1","function_name":"func1","variant_name":"variant1","dataset_name":"dataset1","last_inference_timestamp":"2025-05-20T16:52:58Z"}
{"evaluation_run_id":"0196ee9c-d808-74f3-8000-02ec7409b95e","evaluation_name":"eval2","function_name":"func2","variant_name":"variant2","dataset_name":"dataset2","last_inference_timestamp":"2025-05-20T17:52:58Z"}
{"evaluation_run_id":"0196ee9c-d808-74f3-8000-02ec7409b95f","evaluation_name":"eval3","function_name":"func3","variant_name":"variant3","dataset_name":"dataset3","last_inference_timestamp":"2025-05-20T18:52:58Z"}"#.to_string(),
                    metadata: ClickHouseResponseMetadata {
                        read_rows: 3,
                        written_rows: 0,
                    },
                })
            });

        let conn = ClickHouseConnectionInfo::new_mock(Arc::new(mock_clickhouse_client));

        let result = conn.list_evaluation_runs(100, 0).await.unwrap();

        assert_eq!(result.len(), 3, "Should return three evaluation runs");
        assert_eq!(result[0].evaluation_name, "eval1");
        assert_eq!(result[1].evaluation_name, "eval2");
        assert_eq!(result[2].evaluation_name, "eval3");
    }

    /// Confirms the generated SQL contains the filter that hides runs whose
    /// function name is an internal `tensorzero::` function.
    #[tokio::test]
    async fn test_list_evaluation_runs_filters_out_tensorzero_functions() {
        let mut mock_clickhouse_client = MockClickHouseClient::new();

        mock_clickhouse_client
            .expect_run_query_synchronous()
            .withf(|query, _params| {
                // Verify the query filters out tensorzero:: functions
                assert_query_contains(
                    query,
                    "NOT startsWith(inference_function_name, 'tensorzero::')",
                );
                true
            })
            .returning(|_, _| {
                Ok(ClickHouseResponse {
                    response: String::new(),
                    metadata: ClickHouseResponseMetadata {
                        read_rows: 0,
                        written_rows: 0,
                    },
                })
            });

        let conn = ClickHouseConnectionInfo::new_mock(Arc::new(mock_clickhouse_client));

        let _result = conn.list_evaluation_runs(100, 0).await.unwrap();
    }
}
25 changes: 25 additions & 0 deletions tensorzero-core/src/db/evaluation_queries.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,36 @@

use async_trait::async_trait;

use chrono::{DateTime, Utc};
#[cfg(test)]
use mockall::automock;
use serde::Deserialize;
use uuid::Uuid;

use crate::error::Error;

/// Database struct for deserializing evaluation run info from ClickHouse.
///
/// Rows come back as JSONEachRow from the `list_evaluation_runs` query, so
/// every field name here must match the corresponding SQL column alias.
#[derive(Debug, Deserialize)]
pub struct EvaluationRunInfoRow {
    // Unique id of the evaluation run (the 'tensorzero::evaluation_run_id' tag).
    pub evaluation_run_id: Uuid,
    // Evaluation name (the 'tensorzero::evaluation_name' tag).
    pub evaluation_name: String,
    // Name of the function that was evaluated.
    pub function_name: String,
    // Name of the variant that was evaluated.
    pub variant_name: String,
    // Dataset name (the 'tensorzero::dataset_name' tag).
    pub dataset_name: String,
    // Timestamp of the most recent inference in the run (UTC, parsed from the
    // ISO-8601 string emitted by the query's formatDateTime).
    pub last_inference_timestamp: DateTime<Utc>,
}

/// Trait for evaluation-related queries.
#[async_trait]
#[cfg_attr(test, automock)]
pub trait EvaluationQueries {
    /// Counts the total number of unique evaluation runs across all functions.
    ///
    /// # Errors
    ///
    /// Returns an [`Error`] if the underlying database query fails.
    async fn count_total_evaluation_runs(&self) -> Result<u64, Error>;

    /// Lists evaluation runs with pagination.
    ///
    /// `limit` is the maximum number of runs to return; `offset` is the number
    /// of runs to skip from the start of the result set.
    ///
    /// # Errors
    ///
    /// Returns an [`Error`] if the query fails or a row cannot be deserialized.
    async fn list_evaluation_runs(
        &self,
        limit: u32,
        offset: u32,
    ) -> Result<Vec<EvaluationRunInfoRow>, Error>;
}
Loading
Loading