diff --git a/crates/adapters/src/controller.rs b/crates/adapters/src/controller.rs
index b0c2cf6335..76daada843 100644
--- a/crates/adapters/src/controller.rs
+++ b/crates/adapters/src/controller.rs
@@ -580,10 +580,6 @@ impl Controller {
         })
     }
 
-    pub(crate) fn last_checkpoint(&self) -> LastCheckpoint {
-        self.inner.last_checkpoint()
-    }
-
     pub(crate) fn last_checkpoint_sync(&self) -> LastCheckpoint {
         self.inner.last_checkpoint_sync()
     }
diff --git a/crates/adapters/src/server.rs b/crates/adapters/src/server.rs
index 7976824195..f4c93d32be 100644
--- a/crates/adapters/src/server.rs
+++ b/crates/adapters/src/server.rs
@@ -86,8 +86,8 @@ use rustls::pki_types::{CertificateDer, PrivateKeyDer};
 use serde::{Deserialize, Serialize};
 use serde_json::json;
 use std::cell::RefCell;
-use std::collections::HashMap;
 use std::collections::hash_map::Entry;
+use std::collections::{HashMap, VecDeque};
 use std::convert::Infallible;
 use std::ffi::OsStr;
 use std::hash::{BuildHasherDefault, DefaultHasher};
@@ -1195,7 +1195,7 @@ where
         .service(lir)
         .service(checkpoint)
         .service(checkpoint_status)
-        .service(checkpoint_list)
+        .service(checkpoints)
         .service(checkpoint_sync)
         .service(sync_checkpoint_status)
         .service(suspend)
@@ -1828,11 +1828,20 @@ fn samply_profile_response(last_profile: &SamplyProfile) -> HttpResponse {
     }
 }
 
+fn get_checkpoints(state: &ServerState) -> Result<VecDeque<CheckpointMetadata>, PipelineError> {
+    Ok(match &state.storage {
+        Some(backend) => {
+            Checkpointer::read_checkpoints(&**backend).map_err(ControllerError::dbsp_error)?
+        }
+        None => Default::default(),
+    })
+}
+
 #[post("/checkpoint/sync")]
 async fn checkpoint_sync(state: WebData<ServerState>) -> Result<HttpResponse, PipelineError> {
     let controller = state.controller()?;
 
-    let Some(last_checkpoint) = controller.last_checkpoint().id else {
+    let Some(last_checkpoint) = get_checkpoints(&state)?.back().map(|c| c.uuid) else {
         return Ok(HttpResponse::BadRequest().json(ErrorResponse {
             message: "no checkpoints found; make a POST request to `/checkpoint` to make a new checkpoint".to_string(),
             error_code: "400".into(),
@@ -1874,15 +1883,9 @@ async fn checkpoint_status(state: WebData<ServerState>) -> impl Responder {
     HttpResponse::Ok().json(state.checkpoint_state.lock().unwrap().status.clone())
 }
 
-#[get("/checkpoint_list")]
-async fn checkpoint_list(state: WebData<ServerState>) -> Result<HttpResponse, PipelineError> {
-    let checkpoints = match &state.storage {
-        Some(backend) => {
-            Checkpointer::read_checkpoints(&**backend).map_err(ControllerError::dbsp_error)?
-        }
-        None => Default::default(),
-    };
-    Ok(HttpResponse::Ok().json(checkpoints))
+#[get("/checkpoints")]
+async fn checkpoints(state: WebData<ServerState>) -> Result<HttpResponse, PipelineError> {
+    Ok(HttpResponse::Ok().json(get_checkpoints(&state)?))
 }
 
 #[get("/checkpoint/sync_status")]
diff --git a/crates/feldera-types/src/checkpoint.rs b/crates/feldera-types/src/checkpoint.rs
index 8f99c432eb..f67ab37053 100644
--- a/crates/feldera-types/src/checkpoint.rs
+++ b/crates/feldera-types/src/checkpoint.rs
@@ -75,7 +75,7 @@ pub struct CheckpointSyncFailure {
 
 /// Holds meta-data about a checkpoint that was taken for persistent storage
 /// and recovery of a circuit's state.
-#[derive(Debug, Clone, Default, Serialize, Deserialize)]
+#[derive(Debug, Clone, Default, Serialize, Deserialize, ToSchema)]
 pub struct CheckpointMetadata {
     /// A unique identifier for the given checkpoint.
     ///
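With the adapter-level endpoint renamed from `/checkpoint_list` to `/checkpoints`, clients of the pipeline's own HTTP server list checkpoints at the new path and receive an array of `CheckpointMetadata` objects. A minimal sketch of such a call follows; the host and port are illustrative, since in most deployments the pipeline's embedded web server is only reachable from inside its container or pod:

# Illustrative only: list checkpoints straight from a running pipeline's embedded HTTP API.
# The base URL is an assumption; substitute wherever the pipeline's web server is exposed.
import requests

resp = requests.get("http://localhost:8080/checkpoints", timeout=10)
resp.raise_for_status()
for chk in resp.json():
    # Fields follow the CheckpointMetadata schema: uuid and fingerprint are always present;
    # identifier, size, steps, and processed_records may be null.
    print(chk["uuid"], chk.get("processed_records"))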
diff --git a/crates/pipeline-manager/src/api/endpoints/pipeline_interaction.rs b/crates/pipeline-manager/src/api/endpoints/pipeline_interaction.rs
index f34805f083..24e81bcc3c 100644
--- a/crates/pipeline-manager/src/api/endpoints/pipeline_interaction.rs
+++ b/crates/pipeline-manager/src/api/endpoints/pipeline_interaction.rs
@@ -1157,6 +1157,60 @@ pub(crate) async fn get_checkpoint_sync_status(
         .await
 }
 
+/// Get the checkpoints for a pipeline
+///
+/// Retrieve the current checkpoints made by a pipeline.
+#[utoipa::path(
+    context_path = "/v0",
+    security(("JSON web token (JWT) or API key" = [])),
+    params(
+        ("pipeline_name" = String, Path, description = "Unique pipeline name"),
+    ),
+    responses(
+        (status = OK
+            , description = "Checkpoints retrieved successfully"
+            , content_type = "application/json"
+            , body = CheckpointMetadata),
+        (status = NOT_FOUND
+            , description = "Pipeline with that name does not exist"
+            , body = ErrorResponse
+            , example = json!(examples::error_unknown_pipeline_name())),
+        (status = SERVICE_UNAVAILABLE
+            , body = ErrorResponse
+            , examples(
+                ("Pipeline is not deployed" = (value = json!(examples::error_pipeline_interaction_not_deployed()))),
+                ("Pipeline is currently unavailable" = (value = json!(examples::error_pipeline_interaction_currently_unavailable()))),
+                ("Disconnected during response" = (value = json!(examples::error_pipeline_interaction_disconnected()))),
+                ("Response timeout" = (value = json!(examples::error_pipeline_interaction_timeout())))
+            )
+        ),
+        (status = INTERNAL_SERVER_ERROR, body = ErrorResponse),
+    ),
+    tag = "Pipeline Lifecycle"
+)]
+#[get("/pipelines/{pipeline_name}/checkpoints")]
+pub(crate) async fn get_checkpoints(
+    state: WebData<ServerState>,
+    client: WebData<awc::Client>,
+    tenant_id: ReqData<TenantId>,
+    path: web::Path<String>,
+    request: HttpRequest,
+) -> Result<HttpResponse, ManagerError> {
+    let pipeline_name = path.into_inner();
+    state
+        .runner
+        .forward_http_request_to_pipeline_by_name(
+            client.as_ref(),
+            *tenant_id,
+            &pipeline_name,
+            Method::GET,
+            "checkpoints",
+            request.query_string(),
+            None,
+        )
+        .await
+}
+
 /// Start a Samply profile
 ///
 /// Profile the pipeline using the Samply profiler for the next `duration_secs` seconds.
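Because the manager endpoint only forwards the request to the pipeline, callers reach the same data through the public `/v0` API with their usual credentials. A rough sketch using `requests`; the base URL and token are placeholders, and the Authorization header can be dropped entirely on unsecured deployments:

# Illustrative only: list a pipeline's checkpoints via the pipeline-manager REST API.
# BASE_URL and TOKEN are placeholders for your deployment's address and API key/JWT.
import requests

BASE_URL = "http://localhost:8080"
TOKEN = "..."  # placeholder: API key or JWT; omit the header if auth is disabled

resp = requests.get(
    f"{BASE_URL}/v0/pipelines/my_pipeline/checkpoints",
    headers={"Authorization": f"Bearer {TOKEN}"},
    timeout=10,
)
resp.raise_for_status()
print(resp.json())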
diff --git a/crates/pipeline-manager/src/api/main.rs b/crates/pipeline-manager/src/api/main.rs
index 63b5e1e8fe..2fec3d9f43 100644
--- a/crates/pipeline-manager/src/api/main.rs
+++ b/crates/pipeline-manager/src/api/main.rs
@@ -215,6 +215,7 @@ It contains the following fields:
         endpoints::pipeline_interaction::get_checkpoint_status,
         endpoints::pipeline_interaction::sync_checkpoint,
         endpoints::pipeline_interaction::get_checkpoint_sync_status,
+        endpoints::pipeline_interaction::get_checkpoints,
         endpoints::pipeline_interaction::post_pipeline_pause,
         endpoints::pipeline_interaction::post_pipeline_resume,
         endpoints::pipeline_interaction::post_pipeline_activate,
@@ -423,6 +424,7 @@ It contains the following fields:
         feldera_types::checkpoint::CheckpointStatus,
         feldera_types::checkpoint::CheckpointResponse,
         feldera_types::checkpoint::CheckpointFailure,
+        feldera_types::checkpoint::CheckpointMetadata,
         feldera_types::transaction::StartTransactionResponse,
         feldera_types::time_series::TimeSeries,
         feldera_types::time_series::SampleStatistics,
@@ -546,6 +548,7 @@ fn api_scope() -> Scope {
         .service(endpoints::pipeline_interaction::sync_checkpoint)
         .service(endpoints::pipeline_interaction::get_checkpoint_status)
         .service(endpoints::pipeline_interaction::get_checkpoint_sync_status)
+        .service(endpoints::pipeline_interaction::get_checkpoints)
         .service(endpoints::pipeline_interaction::post_pipeline_pause)
         .service(endpoints::pipeline_interaction::post_pipeline_resume)
         .service(endpoints::pipeline_interaction::post_pipeline_activate)
diff --git a/openapi.json b/openapi.json
index 3b239c15a4..db64fffedb 100644
--- a/openapi.json
+++ b/openapi.json
@@ -2509,6 +2509,127 @@
         ]
       }
     },
+    "/v0/pipelines/{pipeline_name}/checkpoints": {
+      "get": {
+        "tags": [
+          "Pipeline Lifecycle"
+        ],
+        "summary": "Get the checkpoints for a pipeline",
+        "description": "Retrieve the current checkpoints made by a pipeline.",
+        "operationId": "get_checkpoints",
+        "parameters": [
+          {
+            "name": "pipeline_name",
+            "in": "path",
+            "description": "Unique pipeline name",
+            "required": true,
+            "schema": {
+              "type": "string"
+            }
+          }
+        ],
+        "responses": {
+          "200": {
+            "description": "Checkpoints retrieved successfully",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/CheckpointMetadata"
+                }
+              }
+            }
+          },
+          "404": {
+            "description": "Pipeline with that name does not exist",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                },
+                "example": {
+                  "message": "Unknown pipeline name 'non-existent-pipeline'",
+                  "error_code": "UnknownPipelineName",
+                  "details": {
+                    "pipeline_name": "non-existent-pipeline"
+                  }
+                }
+              }
+            }
+          },
+          "500": {
+            "description": "",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                }
+              }
+            }
+          },
+          "503": {
+            "description": "",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                },
+                "examples": {
+                  "Disconnected during response": {
+                    "value": {
+                      "message": "Error sending HTTP request to pipeline: the pipeline disconnected while it was processing this HTTP request. This could be because the pipeline either (a) encountered a fatal error or panic, (b) was stopped, or (c) experienced network issues -- retrying might help in the last case. Alternatively, check the pipeline logs. Failed request: /pause pipeline-id=N/A pipeline-name=\"my_pipeline\"",
+                      "error_code": "PipelineInteractionUnreachable",
+                      "details": {
+                        "pipeline_name": "my_pipeline",
+                        "request": "/pause",
+                        "error": "the pipeline disconnected while it was processing this HTTP request. This could be because the pipeline either (a) encountered a fatal error or panic, (b) was stopped, or (c) experienced network issues -- retrying might help in the last case. Alternatively, check the pipeline logs."
+                      }
+                    }
+                  },
+                  "Pipeline is currently unavailable": {
+                    "value": {
+                      "message": "Error sending HTTP request to pipeline: deployment status is currently 'unavailable' -- wait for it to become 'running' or 'paused' again Failed request: /pause pipeline-id=N/A pipeline-name=\"my_pipeline\"",
+                      "error_code": "PipelineInteractionUnreachable",
+                      "details": {
+                        "pipeline_name": "my_pipeline",
+                        "request": "/pause",
+                        "error": "deployment status is currently 'unavailable' -- wait for it to become 'running' or 'paused' again"
+                      }
+                    }
+                  },
+                  "Pipeline is not deployed": {
+                    "value": {
+                      "message": "Unable to interact with pipeline because the deployment status (stopped) indicates it is not (yet) fully provisioned pipeline-id=N/A pipeline-name=\"my_pipeline\"",
+                      "error_code": "PipelineInteractionNotDeployed",
+                      "details": {
+                        "pipeline_name": "my_pipeline",
+                        "status": "Stopped",
+                        "desired_status": "Provisioned"
+                      }
+                    }
+                  },
+                  "Response timeout": {
+                    "value": {
+                      "message": "Error sending HTTP request to pipeline: timeout (10s) was reached: this means the pipeline took too long to respond -- this can simply be because the request was too difficult to process in time, or other reasons (e.g., deadlock): the pipeline logs might contain additional information (original send request error: Timeout while waiting for response) Failed request: /pause pipeline-id=N/A pipeline-name=\"my_pipeline\"",
+                      "error_code": "PipelineInteractionUnreachable",
+                      "details": {
+                        "pipeline_name": "my_pipeline",
+                        "request": "/pause",
+                        "error": "timeout (10s) was reached: this means the pipeline took too long to respond -- this can simply be because the request was too difficult to process in time, or other reasons (e.g., deadlock): the pipeline logs might contain additional information (original send request error: Timeout while waiting for response)"
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        },
+        "security": [
+          {
+            "JSON web token (JWT) or API key": []
+          }
+        ]
+      }
+    },
     "/v0/pipelines/{pipeline_name}/circuit_json_profile": {
       "get": {
         "tags": [
@@ -6509,6 +6630,53 @@
           }
         }
       },
+      "CheckpointMetadata": {
+        "type": "object",
+        "description": "Holds meta-data about a checkpoint that was taken for persistent storage\nand recovery of a circuit's state.",
+        "required": [
+          "uuid",
+          "fingerprint"
+        ],
+        "properties": {
+          "fingerprint": {
+            "type": "integer",
+            "format": "int64",
+            "description": "Fingerprint of the circuit at the time of the checkpoint.",
+            "minimum": 0
+          },
+          "identifier": {
+            "type": "string",
+            "description": "An optional name for the checkpoint.",
+            "nullable": true
+          },
+          "processed_records": {
+            "type": "integer",
+            "format": "int64",
+            "description": "Total number of records processed.",
+            "nullable": true,
+            "minimum": 0
+          },
+          "size": {
+            "type": "integer",
+            "format": "int64",
+            "description": "Total size of the checkpoint files in bytes.",
+            "nullable": true,
+            "minimum": 0
+          },
+          "steps": {
+            "type": "integer",
+            "format": "int64",
+            "description": "Total number of steps made.",
+            "nullable": true,
+            "minimum": 0
+          },
+          "uuid": {
+            "type": "string",
+            "format": "uuid",
+            "description": "A unique identifier for the given checkpoint.\n\nThis is used to identify the checkpoint in the file-system hierarchy."
+          }
+        }
+      },
     "CheckpointResponse": {
       "type": "object",
       "description": "Response to a checkpoint request.",
diff --git a/python/feldera/pipeline.py b/python/feldera/pipeline.py
index ad767d3b69..49dc154757 100644
--- a/python/feldera/pipeline.py
+++ b/python/feldera/pipeline.py
@@ -34,6 +34,7 @@ from feldera.rest.sql_view import SQLView
 from feldera.runtime_config import RuntimeConfig
 from feldera.stats import PipelineStatistics
+from feldera.types import CheckpointMetadata
 
 
 class Pipeline:
@@ -1493,3 +1494,13 @@ def wait_for_token(self, token: str):
         """
         self.client.wait_for_token(self.name, token)
+
+    def checkpoints(self) -> List[CheckpointMetadata]:
+        """
+        Returns the list of checkpoints for this pipeline.
+        """
+
+        return [
+            CheckpointMetadata.from_dict(chk)
+            for chk in self.client.get_checkpoints(self.name)
+        ]
diff --git a/python/feldera/rest/feldera_client.py b/python/feldera/rest/feldera_client.py
index ffcc545c1a..8363807743 100644
--- a/python/feldera/rest/feldera_client.py
+++ b/python/feldera/rest/feldera_client.py
@@ -1328,3 +1328,6 @@ def get_cluster_event(self, event_id: str, selector: str = "status") -> dict:
 
     def rebalance_pipeline(self, pipeline_name: str):
         self.http.post(path=f"/pipelines/{pipeline_name}/rebalance")
+
+    def get_checkpoints(self, pipeline_name: str):
+        return self.http.get(path=f"/pipelines/{pipeline_name}/checkpoints")
diff --git a/python/feldera/types.py b/python/feldera/types.py
new file mode 100644
index 0000000000..a83a8bb905
--- /dev/null
+++ b/python/feldera/types.py
@@ -0,0 +1,39 @@
+class CheckpointMetadata:
+    def __init__(
+        self,
+        uuid: str,
+        size: int,
+        steps: int,
+        processed_records: int,
+        fingerprint: int,
+        identifier: str | None = None,
+    ):
+        self.uuid = uuid
+        self.size = size
+        self.steps = steps
+        self.processed_records = processed_records
+        self.fingerprint = fingerprint
+        self.identifier = identifier
+
+    @classmethod
+    def from_dict(cls, chk_dict: dict):
+        return CheckpointMetadata(
+            uuid=chk_dict["uuid"],
+            size=chk_dict["size"],
+            steps=chk_dict["steps"],
+            processed_records=chk_dict["processed_records"],
+            fingerprint=chk_dict["fingerprint"],
+            identifier=chk_dict.get("identifier"),
+        )
+
+    def to_dict(self) -> dict:
+        chk_dict = {
+            "uuid": self.uuid,
+            "size": self.size,
+            "steps": self.steps,
+            "processed_records": self.processed_records,
+            "fingerprint": self.fingerprint,
+        }
+        if self.identifier is not None:
+            chk_dict["identifier"] = self.identifier
+        return chk_dict
diff --git a/python/tests/platform/test_checkpoint_sync.py b/python/tests/platform/test_checkpoint_sync.py
index 567d40ad7a..d58233493c 100644
--- a/python/tests/platform/test_checkpoint_sync.py
+++ b/python/tests/platform/test_checkpoint_sync.py
@@ -7,9 +7,9 @@
 from feldera.enums import FaultToleranceModel, PipelineStatus
 from feldera.runtime_config import RuntimeConfig, Storage
+from feldera.testutils import FELDERA_TEST_NUM_HOSTS, FELDERA_TEST_NUM_WORKERS
 from tests import enterprise_only
 from tests.shared_test_pipeline import SharedTestPipeline
-from feldera.testutils import FELDERA_TEST_NUM_WORKERS, FELDERA_TEST_NUM_HOSTS
 
 DEFAULT_ENDPOINT = os.environ.get(
     "DEFAULT_MINIO_ENDPOINT", "http://minio.extra.svc.cluster.local:9000"
 )
@@ -122,14 +122,31 @@ def test_checkpoint_sync(
                 f"adhoc query returned {len(got_before)} but {processed} records were processed: {got_before}"
             )
 
+        chk_timeout = time.monotonic() + 30
+        chk_uuid = None
+
         if not automated_checkpoint:
             self.pipeline.checkpoint(wait=True)
         else:
-            time.sleep(ft_interval)
+            # wait for at least one automated checkpoint to be created with current data
+            while True:
+                chks = self.pipeline.checkpoints()
+                chk = next((x for x in chks if x.processed_records == processed), None)
+
+                if chk is not None:
+                    chk_uuid = chk.uuid
+                    break
+
+                if time.monotonic() > chk_timeout:
+                    raise TimeoutError(
+                        "timed out waiting for automated checkpoint to be created"
+                    )
+                time.sleep(0.5)
+
+        print("Checkpoint UUID:", chk_uuid, file=sys.stderr)
 
         if automated_sync_interval is not None:
-            time.sleep(automated_sync_interval + 1)
-            timeout = time.monotonic() + 15
+            timeout = time.monotonic() + 30
             success = None
             while time.monotonic() < timeout and success is None:
                 try:
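Taken together, the SDK additions above (`FelderaClient.get_checkpoints()`, `Pipeline.checkpoints()`, and the `CheckpointMetadata` wrapper) let application and test code poll for checkpoints the same way the updated test does. A small sketch built only on the methods introduced in this diff; the 30-second budget mirrors the test and is an arbitrary choice:

import time

from feldera.pipeline import Pipeline
from feldera.types import CheckpointMetadata


def wait_for_checkpoint(
    pipeline: Pipeline, processed: int, timeout_s: float = 30.0
) -> CheckpointMetadata:
    """Poll until some checkpoint covers exactly `processed` records, as the test above does."""
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        chk = next(
            (c for c in pipeline.checkpoints() if c.processed_records == processed),
            None,
        )
        if chk is not None:
            return chk
        time.sleep(0.5)
    raise TimeoutError("timed out waiting for a matching checkpoint")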