diff --git a/Cargo.lock b/Cargo.lock
index e08fe31f..e9b2d1c6 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1337,6 +1337,7 @@ dependencies = [
  "schemars 0.8.22",
  "serde",
  "serde_json",
+ "serde_path_to_error",
  "serde_with",
  "sqlx",
  "time",
diff --git a/Cargo.toml b/Cargo.toml
index 6638c324..f7a99e85 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -144,3 +144,4 @@ azure_storage_blobs = { version = "0.21.0", default-features = false, features = [
     "enable_reqwest_rustls",
     "hmac_rust",
 ] }
+serde_path_to_error = "0.1.17"
diff --git a/examples/amazon_s3_embedding/pyproject.toml b/examples/amazon_s3_embedding/pyproject.toml
index 6245bac0..3c21f5b9 100644
--- a/examples/amazon_s3_embedding/pyproject.toml
+++ b/examples/amazon_s3_embedding/pyproject.toml
@@ -4,7 +4,7 @@ version = "0.1.0"
 description = "Simple example for cocoindex: build embedding index based on Amazon S3 files."
 requires-python = ">=3.11"
 dependencies = [
-    "cocoindex[embeddings]>=0.2.4",
+    "cocoindex[embeddings]>=0.2.8",
     "python-dotenv>=1.0.1",
     "psycopg[binary,pool]",
 ]
diff --git a/examples/azure_blob_embedding/pyproject.toml b/examples/azure_blob_embedding/pyproject.toml
index de2d7f9d..b46c8a54 100644
--- a/examples/azure_blob_embedding/pyproject.toml
+++ b/examples/azure_blob_embedding/pyproject.toml
@@ -4,7 +4,7 @@ version = "0.1.0"
 description = "Simple example for cocoindex: build embedding index based on Azure Blob Storage files."
 requires-python = ">=3.11"
 dependencies = [
-    "cocoindex[embeddings]>=0.2.4",
+    "cocoindex[embeddings]>=0.2.8",
     "python-dotenv>=1.0.1",
     "psycopg[binary,pool]",
 ]
diff --git a/examples/code_embedding/pyproject.toml b/examples/code_embedding/pyproject.toml
index 9629ef42..4b3868d6 100644
--- a/examples/code_embedding/pyproject.toml
+++ b/examples/code_embedding/pyproject.toml
@@ -4,7 +4,7 @@ version = "0.1.0"
 description = "Simple example for cocoindex: build embedding index based on source code."
 requires-python = ">=3.11"
 dependencies = [
-    "cocoindex[embeddings]>=0.2.4",
+    "cocoindex[embeddings]>=0.2.8",
     "python-dotenv>=1.0.1",
     "pgvector>=0.4.1",
     "psycopg[binary,pool]",
diff --git a/examples/custom_output_files/pyproject.toml b/examples/custom_output_files/pyproject.toml
index 907647df..d4fb06af 100644
--- a/examples/custom_output_files/pyproject.toml
+++ b/examples/custom_output_files/pyproject.toml
@@ -3,7 +3,7 @@ name = "custom-output-files"
 version = "0.1.0"
 description = "Simple example for cocoindex: convert markdown files to HTML files and save them to a local directory."
 requires-python = ">=3.11"
-dependencies = ["cocoindex>=0.2.4", "markdown-it-py[linkify,plugins]"]
+dependencies = ["cocoindex>=0.2.8", "markdown-it-py[linkify,plugins]"]
 
 [tool.setuptools]
 packages = []
diff --git a/examples/docs_to_knowledge_graph/pyproject.toml b/examples/docs_to_knowledge_graph/pyproject.toml
index 6669bac9..40c85a98 100644
--- a/examples/docs_to_knowledge_graph/pyproject.toml
+++ b/examples/docs_to_knowledge_graph/pyproject.toml
@@ -3,7 +3,7 @@ name = "manuals-to-kg"
 version = "0.1.0"
 description = "Simple example for cocoindex: extract triples from files and build knowledge graph."
 requires-python = ">=3.11"
-dependencies = ["cocoindex>=0.2.4"]
+dependencies = ["cocoindex>=0.2.8"]
 
 [tool.setuptools]
 packages = []
diff --git a/examples/face_recognition/pyproject.toml b/examples/face_recognition/pyproject.toml
index c3700a07..fd338f67 100644
--- a/examples/face_recognition/pyproject.toml
+++ b/examples/face_recognition/pyproject.toml
@@ -4,7 +4,7 @@ version = "0.1.0"
 description = "Build index for papers with both metadata and content embeddings"
 requires-python = ">=3.11"
 dependencies = [
-    "cocoindex>=0.2.4",
+    "cocoindex>=0.2.8",
     "face-recognition>=1.3.0",
     "pillow>=10.0.0",
     "numpy>=1.26.0",
diff --git a/examples/fastapi_server_docker/requirements.txt b/examples/fastapi_server_docker/requirements.txt
index 4b9adc86..07ae58c5 100644
--- a/examples/fastapi_server_docker/requirements.txt
+++ b/examples/fastapi_server_docker/requirements.txt
@@ -1,4 +1,4 @@
-cocoindex[embeddings]>=0.2.4
+cocoindex[embeddings]>=0.2.8
 python-dotenv>=1.0.1
 fastapi==0.115.12
 fastapi-cli==0.0.7
diff --git a/examples/gdrive_text_embedding/pyproject.toml b/examples/gdrive_text_embedding/pyproject.toml
index e372e905..66a5d827 100644
--- a/examples/gdrive_text_embedding/pyproject.toml
+++ b/examples/gdrive_text_embedding/pyproject.toml
@@ -4,7 +4,7 @@ version = "0.1.0"
 description = "Simple example for cocoindex: build embedding index based on Google Drive files."
 requires-python = ">=3.11"
 dependencies = [
-    "cocoindex[embeddings]>=0.2.4",
+    "cocoindex[embeddings]>=0.2.8",
     "python-dotenv>=1.0.1",
     "psycopg[binary,pool]",
 ]
diff --git a/examples/image_search/pyproject.toml b/examples/image_search/pyproject.toml
index 8ff11e39..f248a49d 100644
--- a/examples/image_search/pyproject.toml
+++ b/examples/image_search/pyproject.toml
@@ -4,7 +4,7 @@ version = "0.1.0"
 description = "Image search examples for cocoindex: CLIP and ColPali-based embedding."
 requires-python = ">=3.11"
 dependencies = [
-    "cocoindex[colpali]>=0.2.4",
+    "cocoindex[colpali]>=0.2.8",
     "python-dotenv>=1.0.1",
     "fastapi>=0.100.0",
     "torch>=2.0.0",
diff --git a/examples/live_updates/pyproject.toml b/examples/live_updates/pyproject.toml
index c7cb6cdc..f9061599 100644
--- a/examples/live_updates/pyproject.toml
+++ b/examples/live_updates/pyproject.toml
@@ -3,7 +3,7 @@ name = "live-updates-example"
 version = "0.1.0"
 description = "Simple example for cocoindex: perform live updates based on local markdown files."
 requires-python = ">=3.11"
-dependencies = ["cocoindex>=0.2.4", "python-dotenv>=1.1.0"]
+dependencies = ["cocoindex>=0.2.8", "python-dotenv>=1.1.0"]
 
 [tools.setuptools]
 packages = []
diff --git a/examples/manuals_llm_extraction/pyproject.toml b/examples/manuals_llm_extraction/pyproject.toml
index 0012a4a3..c3fb97d6 100644
--- a/examples/manuals_llm_extraction/pyproject.toml
+++ b/examples/manuals_llm_extraction/pyproject.toml
@@ -3,7 +3,7 @@ name = "manuals-llm-extraction"
 version = "0.1.0"
 description = "Simple example for cocoindex: extract structured information from a Markdown file using LLM."
 requires-python = ">=3.11"
-dependencies = ["cocoindex>=0.2.4", "marker-pdf>=1.8.5"]
+dependencies = ["cocoindex>=0.2.8", "marker-pdf>=1.8.5"]
 
 [tool.setuptools]
 packages = []
diff --git a/examples/multi_format_indexing/pyproject.toml b/examples/multi_format_indexing/pyproject.toml
index a8912d93..fe92da67 100644
--- a/examples/multi_format_indexing/pyproject.toml
+++ b/examples/multi_format_indexing/pyproject.toml
@@ -4,7 +4,7 @@ version = "0.1.0"
 description = "Simple example for cocoindex: build embedding index based on local PDF files."
 requires-python = ">=3.11"
 dependencies = [
-    "cocoindex[colpali]>=0.2.4",
+    "cocoindex[colpali]>=0.2.8",
     "python-dotenv>=1.0.1",
     "pdf2image>=1.17.0",
     "qdrant-client>=1.15.0",
diff --git a/examples/paper_metadata/pyproject.toml b/examples/paper_metadata/pyproject.toml
index 060c3cc2..47ce78f5 100644
--- a/examples/paper_metadata/pyproject.toml
+++ b/examples/paper_metadata/pyproject.toml
@@ -4,7 +4,7 @@ version = "0.1.0"
 description = "Build index for papers with both metadata and content embeddings"
 requires-python = ">=3.11"
 dependencies = [
-    "cocoindex[embeddings]>=0.2.4",
+    "cocoindex[embeddings]>=0.2.8",
     "pypdf>=5.7.0",
     "marker-pdf>=1.8.5",
 ]
diff --git a/examples/patient_intake_extraction/pyproject.toml b/examples/patient_intake_extraction/pyproject.toml
index 3e405276..2aae43bc 100644
--- a/examples/patient_intake_extraction/pyproject.toml
+++ b/examples/patient_intake_extraction/pyproject.toml
@@ -4,7 +4,7 @@ version = "0.1.0"
 description = "Extract structured information from patient intake forms using LLM."
 requires-python = ">=3.10"
 dependencies = [
-    "cocoindex>=0.2.4",
+    "cocoindex>=0.2.8",
     "python-dotenv>=1.0.1",
     "markitdown>=0.1.2",
     "openai>=1.68.2",
diff --git a/examples/pdf_embedding/pyproject.toml b/examples/pdf_embedding/pyproject.toml
index cd278b2a..7ec0e546 100644
--- a/examples/pdf_embedding/pyproject.toml
+++ b/examples/pdf_embedding/pyproject.toml
@@ -4,7 +4,7 @@ version = "0.1.0"
 description = "Simple example for cocoindex: build embedding index based on local PDF files."
 requires-python = ">=3.11"
 dependencies = [
-    "cocoindex[embeddings]>=0.2.4",
+    "cocoindex[embeddings]>=0.2.8",
     "python-dotenv>=1.0.1",
     "marker-pdf>=1.8.5",
     "psycopg[binary,pool]",
diff --git a/examples/postgres_source/pyproject.toml b/examples/postgres_source/pyproject.toml
index 47783e66..9ddd070e 100644
--- a/examples/postgres_source/pyproject.toml
+++ b/examples/postgres_source/pyproject.toml
@@ -4,7 +4,7 @@ version = "0.1.0"
 description = "Demonstrate how to use Postgres tables as the source for CocoIndex."
 requires-python = ">=3.11"
 dependencies = [
-    "cocoindex[embeddings]>=0.2.4",
+    "cocoindex[embeddings]>=0.2.8",
     "python-dotenv>=1.0.1",
     "pgvector>=0.4.1",
     "psycopg[binary,pool]",
diff --git a/examples/product_recommendation/pyproject.toml b/examples/product_recommendation/pyproject.toml
index 9faca26b..7b7e8ed7 100644
--- a/examples/product_recommendation/pyproject.toml
+++ b/examples/product_recommendation/pyproject.toml
@@ -3,7 +3,7 @@ name = "cocoindex-ecommerce-taxonomy"
 version = "0.1.0"
 description = "Simple example for CocoIndex: extract taxonomy from e-commerce products and build knowledge graph."
 requires-python = ">=3.11"
-dependencies = ["cocoindex>=0.2.4", "jinja2>=3.1.6"]
+dependencies = ["cocoindex>=0.2.8", "jinja2>=3.1.6"]
 
 [tool.setuptools]
 packages = []
diff --git a/examples/text_embedding/pyproject.toml b/examples/text_embedding/pyproject.toml
index 2b00fe1a..3ad848bd 100644
--- a/examples/text_embedding/pyproject.toml
+++ b/examples/text_embedding/pyproject.toml
@@ -4,7 +4,7 @@ version = "0.1.0"
 description = "Simple example for cocoindex: build embedding index based on local text files."
 requires-python = ">=3.11"
 dependencies = [
-    "cocoindex[embeddings]>=0.2.4",
+    "cocoindex[embeddings]>=0.2.8",
     "python-dotenv>=1.0.1",
     "pgvector>=0.4.1",
     "psycopg[binary,pool]",
diff --git a/examples/text_embedding_qdrant/pyproject.toml b/examples/text_embedding_qdrant/pyproject.toml
index 3ae7d660..43878927 100644
--- a/examples/text_embedding_qdrant/pyproject.toml
+++ b/examples/text_embedding_qdrant/pyproject.toml
@@ -4,7 +4,7 @@ version = "0.1.0"
 description = "Simple example for cocoindex: build embedding index based on local text files."
 requires-python = ">=3.11"
 dependencies = [
-    "cocoindex[embeddings]>=0.2.4",
+    "cocoindex[embeddings]>=0.2.8",
     "python-dotenv>=1.0.1",
     "qdrant-client>=1.6.0",
 ]
diff --git a/python/cocoindex/setting.py b/python/cocoindex/setting.py
index 95e8c682..8b9817e3 100644
--- a/python/cocoindex/setting.py
+++ b/python/cocoindex/setting.py
@@ -94,8 +94,7 @@ def from_env(cls) -> Self:
 
         database_url = os.getenv("COCOINDEX_DATABASE_URL")
         if database_url is not None:
-            db_kwargs: dict[str, Any] = dict()
-            _load_field(db_kwargs, "url", "COCOINDEX_DATABASE_URL", required=True)
+            db_kwargs: dict[str, Any] = {"url": database_url}
             _load_field(db_kwargs, "user", "COCOINDEX_DATABASE_USER")
             _load_field(db_kwargs, "password", "COCOINDEX_DATABASE_PASSWORD")
             _load_field(
diff --git a/src/base/value.rs b/src/base/value.rs
index 2fad98c5..0015a06d 100644
--- a/src/base/value.rs
+++ b/src/base/value.rs
@@ -1,13 +1,12 @@
+use crate::prelude::*;
+
 use super::schema::*;
 use crate::base::duration::parse_duration;
-use crate::{api_bail, api_error};
-use anyhow::Result;
 use base64::prelude::*;
 use bytes::Bytes;
 use chrono::Offset;
 use log::warn;
 use serde::{
-    Deserialize, Serialize,
     de::{SeqAccess, Visitor},
     ser::{SerializeMap, SerializeSeq, SerializeTuple},
 };
@@ -1014,7 +1013,8 @@ where
         Ok(Self {
             fields: fields
                 .map(|(s, v)| {
-                    let value = Value::<VS>::from_json(v, &s.value_type.typ)?;
+                    let value = Value::<VS>::from_json(v, &s.value_type.typ)
+                        .with_context(|| format!("while deserializing field `{}`", s.name))?;
                     if value.is_null() && !s.value_type.nullable {
                         api_bail!("expected non-null value for `{}`", s.name);
                     }
@@ -1033,9 +1033,10 @@ where
             fields: fields_schema
                 .map(|field| {
                     let value = match values.get_mut(&field.name) {
-                        Some(v) => {
-                            Value::<VS>::from_json(std::mem::take(v), &field.value_type.typ)?
-                        }
+                        Some(v) => Value::<VS>::from_json(std::mem::take(v), &field.value_type.typ)
+                            .with_context(|| {
+                                format!("while deserializing field `{}`", field.name)
+                            })?,
                         None => Value::<VS>::default(),
                     };
                     if value.is_null() && !field.value_type.nullable {
@@ -1137,7 +1138,7 @@ impl BasicValue {
                 v.as_f64()
                     .ok_or_else(|| anyhow::anyhow!("invalid fp64 value {v}"))?,
             ),
-            (v, BasicValueType::Range) => BasicValue::Range(serde_json::from_value(v)?),
+            (v, BasicValueType::Range) => BasicValue::Range(utils::deser::from_json_value(v)?),
             (serde_json::Value::String(v), BasicValueType::Uuid) => BasicValue::Uuid(v.parse()?),
             (serde_json::Value::String(v), BasicValueType::Date) => BasicValue::Date(v.parse()?),
             (serde_json::Value::String(v), BasicValueType::Time) => BasicValue::Time(v.parse()?),
@@ -1170,7 +1171,11 @@
             ) => {
                 let vec = v
                     .into_iter()
-                    .map(|v| BasicValue::from_json(v, element_type))
+                    .enumerate()
+                    .map(|(i, v)| {
+                        BasicValue::from_json(v, element_type)
+                            .with_context(|| format!("while deserializing Vector element #{i}"))
+                    })
                     .collect::<Result<Vec<_>>>()?;
                 BasicValue::Vector(Arc::from(vec))
             }
@@ -1267,7 +1272,11 @@ where
             TableKind::UTable => {
                 let rows = v
                     .into_iter()
-                    .map(|v| Ok(FieldValues::from_json(v, &s.row.fields)?.into()))
+                    .map(|v| {
+                        Ok(FieldValues::from_json(v, &s.row.fields)
+                            .with_context(|| format!("while deserializing UTable row"))?
+                            .into())
+                    })
                     .collect::<Result<Vec<_>>>()?;
                 Value::LTable(rows)
             }
@@ -1289,10 +1298,13 @@
                 let mut field_vals_iter = v.into_iter();
                 let keys: Box<[KeyPart]> = (0..num_key_parts)
                     .map(|_| {
+                        let field_schema = fields_iter.next().unwrap();
                         Self::from_json(
                             field_vals_iter.next().unwrap(),
-                            &fields_iter.next().unwrap().value_type.typ,
-                        )?
+                            &field_schema.value_type.typ,
+                        ).with_context(|| {
+                            format!("while deserializing key part `{}`", field_schema.name)
+                        })?
                         .into_key()
                     })
                     .collect::<Result<_>>()?;
@@ -1328,7 +1340,14 @@ where
             TableKind::LTable => {
                 let rows = v
                     .into_iter()
-                    .map(|v| Ok(FieldValues::from_json(v, &s.row.fields)?.into()))
+                    .enumerate()
+                    .map(|(i, v)| {
+                        Ok(FieldValues::from_json(v, &s.row.fields)
+                            .with_context(|| {
+                                format!("while deserializing LTable row #{i}")
+                            })?
+                            .into())
+                    })
                     .collect::<Result<Vec<_>>>()?;
                 Value::LTable(rows)
             }
diff --git a/src/builder/analyzed_flow.rs b/src/builder/analyzed_flow.rs
index 9dab7d2e..64a5522c 100644
--- a/src/builder/analyzed_flow.rs
+++ b/src/builder/analyzed_flow.rs
@@ -20,7 +20,9 @@ impl AnalyzedFlow {
         flow_instance_ctx: Arc<FlowInstanceContext>,
     ) -> Result<Self> {
         let (data_schema, setup_state, execution_plan_fut) =
-            analyzer::analyze_flow(&flow_instance, flow_instance_ctx.clone()).await?;
+            analyzer::analyze_flow(&flow_instance, flow_instance_ctx.clone())
+                .await
+                .with_context(|| format!("analyzing flow `{}`", flow_instance.name))?;
         let execution_plan = async move {
             shared_ok(Arc::new(
                 execution_plan_fut.await.map_err(SharedError::new)?,
diff --git a/src/builder/analyzer.rs b/src/builder/analyzer.rs
index 9e36b014..64a59055 100644
--- a/src/builder/analyzer.rs
+++ b/src/builder/analyzer.rs
@@ -674,7 +674,9 @@ impl AnalyzerContext {
         let global_concurrency_controller = self.lib_ctx.global_concurrency_controller.clone();
         let result_fut = async move {
             trace!("Start building executor for source op `{op_name}`");
-            let executor = executor.await?;
+            let executor = executor
+                .await
+                .with_context(|| format!("Preparing for source op: {op_name}"))?;
             trace!("Finished building executor for source op `{op_name}`");
             Ok(AnalyzedImportOp {
                 executor,
@@ -700,10 +702,7 @@ impl AnalyzerContext {
             ReactiveOpSpec::Transform(op) => {
                 let input_field_schemas =
                     analyze_input_fields(&op.inputs, op_scope).with_context(|| {
-                        format!(
-                            "Failed to analyze inputs for transform op: {}",
-                            reactive_op.name
-                        )
+                        format!("Preparing inputs for transform op: {}", reactive_op.name)
                     })?;
 
                 let spec = serde_json::Value::Object(op.op.spec.clone());
@@ -725,7 +724,7 @@ impl AnalyzerContext {
                     async move {
                         trace!("Start building executor for transform op `{op_name}`");
                         let executor = executor.await.with_context(|| {
-                            format!("Failed to build executor for transform op: {op_name}")
+                            format!("Preparing for transform op: {op_name}")
                         })?;
                         let enable_cache = executor.enable_cache();
                         let behavior_version = executor.behavior_version();
@@ -784,7 +783,7 @@ impl AnalyzerContext {
                         local_field_ref,
                         op_scope: analyzed_op_scope_fut
                             .await
-                            .with_context(|| format!("Analyzing foreach op: {op_name}"))?,
+                            .with_context(|| format!("Preparing for foreach op: {op_name}"))?,
                         name: op_name,
                         concurrency_controller: concur_control::ConcurrencyController::new(
                             &concur_control_options,
@@ -920,7 +919,7 @@ impl AnalyzerContext {
                 let export_context = data_coll_output
                     .export_context
                     .await
-                    .with_context(|| format!("Analyzing export op: {op_name}"))?;
+                    .with_context(|| format!("Preparing for export op: {op_name}"))?;
                 trace!("Finished building executor for export op `{op_name}`");
                 Ok(AnalyzedExportOp {
                     name: op_name,
@@ -1006,7 +1005,8 @@ pub async fn analyze_flow(
         import_ops_futs.push(
             analyzer_ctx
                 .analyze_import_op(&root_op_scope, import_op.clone())
-                .await?,
+                .await
+                .with_context(|| format!("Preparing for import op: {}", import_op.name))?,
         );
     }
     let op_scope_fut = analyzer_ctx
@@ -1059,7 +1059,8 @@ pub async fn analyze_flow(
             &mut targets_analyzed_ss,
             &mut declarations_analyzed_ss,
         )
-        .await?,
+        .await
+        .with_context(|| format!("Analyzing export ops for target `{target_kind}`"))?,
         );
         analyzed_target_op_groups.push(analyzed_target_op_group);
     }
diff --git a/src/execution/source_indexer.rs b/src/execution/source_indexer.rs
index b1a403c7..bea3e737 100644
--- a/src/execution/source_indexer.rs
+++ b/src/execution/source_indexer.rs
@@ -382,7 +382,8 @@ impl SourceIndexingContext {
                 error!(
                     "{:?}",
                     e.context(format!(
-                        "Error in processing row from source `{source}` with key: {key}",
+                        "Error in processing row from flow `{flow}` source `{source}` with key: {key}",
+                        flow = self.flow.flow_instance.name,
                         source = self.flow.flow_instance.import_ops[self.source_idx].name,
                         key = row_input.key,
                     ))
diff --git a/src/lib_context.rs b/src/lib_context.rs
index 4899c01d..3465102a 100644
--- a/src/lib_context.rs
+++ b/src/lib_context.rs
@@ -250,7 +250,7 @@ impl LibContext {
     pub fn require_persistence_ctx(&self) -> Result<&PersistenceContext> {
         self.persistence_ctx
             .as_ref()
-            .ok_or_else(|| anyhow!("Database is required for this operation. Please set COCOINDEX_DATABASE_URL environment variable and call cocoindex.init() with database settings."))
+            .ok_or_else(|| anyhow!("Database is required for this operation. Please set COCOINDEX_DATABASE_URL environment variable OR call `cocoindex.init()` with database settings."))
     }
 
     pub fn require_builtin_db_pool(&self) -> Result<&PgPool> {
diff --git a/src/llm/anthropic.rs b/src/llm/anthropic.rs
index 1ce3bc4f..bfb3c311 100644
--- a/src/llm/anthropic.rs
+++ b/src/llm/anthropic.rs
@@ -138,7 +138,7 @@ impl LlmGenerationClient for Client {
         match &mut resp_json["content"][0]["text"] {
             serde_json::Value::String(s) => {
                 // Try strict JSON parsing first
-                match serde_json::from_str::<serde_json::Value>(s) {
+                match utils::deser::from_json_str::<serde_json::Value>(s) {
                     Ok(_) => std::mem::take(s),
                     Err(e) => {
                         // Try permissive json5 parsing as fallback
diff --git a/src/llm/gemini.rs b/src/llm/gemini.rs
index 9586a249..d8cec607 100644
--- a/src/llm/gemini.rs
+++ b/src/llm/gemini.rs
@@ -276,7 +276,7 @@ impl LlmGenerationClient for VertexAiClient {
             generation_config = Some(
                 GenerationConfig::new()
                     .set_response_mime_type("application/json".to_string())
-                    .set_response_schema(serde_json::from_value::<Schema>(schema_json)?),
+                    .set_response_schema(utils::deser::from_json_value::<Schema>(schema_json)?),
             );
         }
 
@@ -358,7 +358,7 @@ impl LlmEmbeddingClient for VertexAiClient {
             .next()
             .and_then(|mut e| e.get_mut("embeddings").map(|v| v.take()))
             .ok_or_else(|| anyhow::anyhow!("No embeddings in response"))?;
-        let embedding: ContentEmbedding = serde_json::from_value(embeddings)?;
+        let embedding: ContentEmbedding = utils::deser::from_json_value(embeddings)?;
         Ok(super::LlmEmbeddingResponse {
             embedding: embedding.values,
         })
diff --git a/src/ops/factory_bases.rs b/src/ops/factory_bases.rs
index dd8bbc23..d85bcf6d 100644
--- a/src/ops/factory_bases.rs
+++ b/src/ops/factory_bases.rs
@@ -245,7 +245,8 @@ impl<T: SourceFactoryBase> SourceFactory for T {
         EnrichedValueType,
         BoxFuture<'static, Result<Box<dyn SourceExecutor>>>,
     )> {
-        let spec: T::Spec = serde_json::from_value(spec)?;
+        let spec: T::Spec = utils::deser::from_json_value(spec)
+            .with_context(|| format!("Failed in parsing spec for source `{source_name}`"))?;
         let output_schema = self.get_output_schema(&spec, &context).await?;
         let source_name = source_name.to_string();
         let executor = async move { self.build_executor(&source_name, spec, context).await };
@@ -323,7 +324,8 @@ impl<T: SimpleFunctionFactoryBase> SimpleFunctionFactory for T {
         EnrichedValueType,
         BoxFuture<'static, Result<Box<dyn SimpleFunctionExecutor>>>,
     )> {
-        let spec: T::Spec = serde_json::from_value(spec)?;
+        let spec: T::Spec = utils::deser::from_json_value(spec)
+            .with_context(|| format!("Failed in parsing spec for function `{}`", self.name()))?;
         let mut nonnull_args_idx = vec![];
         let mut may_nullify_output = false;
         let mut args_resolver = OpArgsResolver::new(
@@ -397,7 +399,7 @@ pub trait TargetFactoryBase: TargetFactory + Send + Sync + 'static {
     /// Deserialize the setup key from a JSON value.
     /// You can override this method to provide a custom deserialization logic, e.g. to perform backward compatible deserialization.
     fn deserialize_setup_key(key: serde_json::Value) -> Result<Self::SetupKey> {
-        Ok(serde_json::from_value(key)?)
+        Ok(utils::deser::from_json_value(key)?)
     }
 
     /// Will not be called if it's setup by user.
@@ -466,8 +468,10 @@ impl<T: TargetFactoryBase> TargetFactory for T {
                 .into_iter()
                 .map(|d| {
                     anyhow::Ok(TypedExportDataCollectionSpec {
+                        spec: utils::deser::from_json_value(d.spec).with_context(|| {
+                            format!("Failed in parsing spec for target `{}`", d.name)
+                        })?,
                         name: d.name,
-                        spec: serde_json::from_value(d.spec)?,
                         key_fields_schema: d.key_fields_schema,
                         value_fields_schema: d.value_fields_schema,
                         index_options: d.index_options,
                     })
                 .collect::<Result<Vec<_>>>()?,
             declarations
                 .into_iter()
-                .map(|d| anyhow::Ok(serde_json::from_value(d)?))
+                .map(|d| anyhow::Ok(utils::deser::from_json_value(d)?))
                 .collect::<Result<Vec<_>>>()?,
             context,
         )
@@ -511,7 +515,7 @@ impl<T: TargetFactoryBase> TargetFactory for T {
     ) -> Result<Box<dyn setup::ResourceSetupChange>> {
         let key: T::SetupKey = Self::deserialize_setup_key(key.clone())?;
         let desired_state: Option<T::SetupState> = desired_state
-            .map(|v| serde_json::from_value(v.clone()))
+            .map(|v| utils::deser::from_json_value(v.clone()))
             .transpose()?;
         let existing_states = from_json_combined_state(existing_states)?;
         let setup_change = TargetFactoryBase::diff_setup_states(
@@ -542,8 +546,8 @@ impl<T: TargetFactoryBase> TargetFactory for T {
     ) -> Result<SetupStateCompatibility> {
         let result = TargetFactoryBase::check_state_compatibility(
             self,
-            &serde_json::from_value(desired_state.clone())?,
-            &serde_json::from_value(existing_state.clone())?,
+            &utils::deser::from_json_value(desired_state.clone())?,
+            &utils::deser::from_json_value(existing_state.clone())?,
         )?;
         Ok(result)
     }
@@ -596,7 +600,7 @@ impl<T: TargetFactoryBase> TargetFactory for T {
             .into_iter()
             .map(|item| -> anyhow::Result<_> {
                 Ok(TypedResourceSetupChangeItem {
-                    key: serde_json::from_value(item.key.clone())?,
+                    key: utils::deser::from_json_value(item.key.clone())?,
                     setup_change: (item.setup_change as &dyn Any)
                         .downcast_ref::<T::SetupChange>()
                         .ok_or_else(invariance_violation)?,
@@ -614,7 +618,7 @@ fn from_json_combined_state(
     Ok(setup::CombinedState {
         current: existing_states
             .current
-            .map(|v| serde_json::from_value(v))
+            .map(|v| utils::deser::from_json_value(v))
            .transpose()?,
         staging: existing_states
             .staging
             .map(|v| {
                 anyhow::Ok(match v {
                     setup::StateChange::Upsert(v) => {
-                        setup::StateChange::Upsert(serde_json::from_value(v)?)
+                        setup::StateChange::Upsert(utils::deser::from_json_value(v)?)
                     }
                     setup::StateChange::Delete => setup::StateChange::Delete,
                 })
diff --git a/src/ops/functions/extract_by_llm.rs b/src/ops/functions/extract_by_llm.rs
index 627cb93c..4dfe9d4d 100644
--- a/src/ops/functions/extract_by_llm.rs
+++ b/src/ops/functions/extract_by_llm.rs
@@ -113,7 +113,7 @@ impl SimpleFunctionExecutor for Executor {
             }),
         };
         let res = self.client.generate(req).await?;
-        let json_value: serde_json::Value = serde_json::from_str(res.text.as_str())?;
+        let json_value: serde_json::Value = utils::deser::from_json_str(res.text.as_str())?;
         let value = self.value_extractor.extract_value(json_value)?;
         Ok(value)
     }
diff --git a/src/ops/functions/parse_json.rs b/src/ops/functions/parse_json.rs
index 84900710..d1052439 100644
--- a/src/ops/functions/parse_json.rs
+++ b/src/ops/functions/parse_json.rs
@@ -29,7 +29,7 @@ fn add_language(
 }
 
 fn parse_json(text: &str) -> Result<serde_json::Value> {
-    Ok(serde_json::from_str(text)?)
+    Ok(utils::deser::from_json_str(text)?)
 }
 
 static PARSE_FN_BY_LANG: LazyLock<HashMap<UniCase<&'static str>, Arc<ParseFn>>> =
diff --git a/src/ops/sources/amazon_s3.rs b/src/ops/sources/amazon_s3.rs
index d07d5185..e81ed772 100644
--- a/src/ops/sources/amazon_s3.rs
+++ b/src/ops/sources/amazon_s3.rs
@@ -241,7 +241,7 @@ impl Executor {
         let mut change_messages = vec![];
         for message in messages.into_iter() {
             if let Some(body) = message.body {
-                let notification: S3EventNotification = serde_json::from_str(&body)?;
+                let notification: S3EventNotification = utils::deser::from_json_str(&body)?;
                 let mut changes = vec![];
                 for record in notification.records {
                     let s3 = if let Some(s3) = record.s3 {
diff --git a/src/ops/sources/postgres.rs b/src/ops/sources/postgres.rs
index 303cbe9d..a78e0aa7 100644
--- a/src/ops/sources/postgres.rs
+++ b/src/ops/sources/postgres.rs
@@ -600,7 +600,7 @@ impl PostgresSourceExecutor {
     }
 
     fn parse_notification_payload(&self, notification: &PgNotification) -> Result {
-        let mut payload: serde_json::Value = serde_json::from_str(notification.payload())?;
+        let mut payload: serde_json::Value = utils::deser::from_json_str(notification.payload())?;
         let payload = payload
             .as_object_mut()
             .ok_or_else(|| anyhow::anyhow!("'fields' field is not an object"))?;
diff --git a/src/ops/targets/qdrant.rs b/src/ops/targets/qdrant.rs
index f58e0320..cb54e343 100644
--- a/src/ops/targets/qdrant.rs
+++ b/src/ops/targets/qdrant.rs
@@ -481,7 +481,7 @@ impl TargetFactoryBase for Factory {
                     connection: None,
                 }
             }
-            _ => serde_json::from_value(key)?,
+            _ => utils::deser::from_json_value(key)?,
         })
     }
diff --git a/src/service/flows.rs b/src/service/flows.rs
index 04cd2cfa..0e35e805 100644
--- a/src/service/flows.rs
+++ b/src/service/flows.rs
@@ -164,7 +164,7 @@ impl<'a> SourceRowKeyContextHolder<'a> {
         let key = value::KeyValue::decode_from_strs(source_row_key.key, key_schema)?;
         let key_aux_info = source_row_key
             .key_aux
-            .map(|s| serde_json::from_str(&s))
+            .map(|s| utils::deser::from_json_str(&s))
             .transpose()?
             .unwrap_or_default();
         Ok(Self {
diff --git a/src/setup/auth_registry.rs b/src/setup/auth_registry.rs
index 0698d2fb..e9424da2 100644
--- a/src/setup/auth_registry.rs
+++ b/src/setup/auth_registry.rs
@@ -35,7 +35,7 @@ impl AuthRegistry {
     pub fn get<T: DeserializeOwned>(&self, entry_ref: &spec::AuthEntryReference<T>) -> Result<T> {
         let entries = self.entries.read().unwrap();
         match entries.get(&entry_ref.key) {
-            Some(value) => Ok(serde_json::from_value(value.clone())?),
+            Some(value) => Ok(utils::deser::from_json_value(value.clone())?),
             None => api_bail!(
                 "Auth entry `{key}` not found.\n\
                  Hint: If you're not referencing `{key}` in your flow, it will likely be caused by a previously persisted target using it. \
diff --git a/src/setup/driver.rs b/src/setup/driver.rs
index 6e4f520f..38dc22e4 100644
--- a/src/setup/driver.rs
+++ b/src/setup/driver.rs
@@ -63,11 +63,11 @@ fn from_metadata_record<State: DeserializeOwned>(
     staging_changes: sqlx::types::Json<Vec<StateChange<serde_json::Value>>>,
     legacy_state_key: Option<String>,
 ) -> Result<CombinedState<State>> {
-    let current: Option<State> = state.map(serde_json::from_value).transpose()?;
+    let current: Option<State> = state.map(utils::deser::from_json_value).transpose()?;
     let staging: Vec<StateChange<State>> = (staging_changes.0.into_iter())
         .map(|sc| -> Result<_> {
             Ok(match sc {
-                StateChange::Upsert(v) => StateChange::Upsert(serde_json::from_value(v)?),
+                StateChange::Upsert(v) => StateChange::Upsert(utils::deser::from_json_value(v)?),
                 StateChange::Delete => StateChange::Delete,
             })
         })
diff --git a/src/utils/deser.rs b/src/utils/deser.rs
new file mode 100644
index 00000000..0ad3696d
--- /dev/null
+++ b/src/utils/deser.rs
@@ -0,0 +1,25 @@
+use anyhow::{Result, anyhow};
+use serde::de::DeserializeOwned;
+
+fn map_serde_path_err<T>(
+    err: serde_path_to_error::Error<serde_json::Error>,
+) -> anyhow::Error {
+    let ty = std::any::type_name::<T>().replace("::", ".");
+    let path = err.path();
+    let full_path = if path.iter().next().is_none() {
+        format!("<{ty}>")
+    } else {
+        format!("<{ty}>.{path}")
+    };
+    let inner = err.into_inner();
+    anyhow!("while deserializing `{full_path}`: {inner}")
+}
+
+pub fn from_json_value<T: DeserializeOwned>(value: serde_json::Value) -> Result<T> {
+    serde_path_to_error::deserialize::<_, T>(value).map_err(map_serde_path_err::<T>)
+}
+
+pub fn from_json_str<T: DeserializeOwned>(s: &str) -> Result<T> {
+    let mut de = serde_json::Deserializer::from_str(s);
+    serde_path_to_error::deserialize::<_, T>(&mut de).map_err(map_serde_path_err::<T>)
+}
diff --git a/src/utils/mod.rs b/src/utils/mod.rs
index 41f8de9e..9aa1655f 100644
--- a/src/utils/mod.rs
+++ b/src/utils/mod.rs
@@ -1,5 +1,6 @@
 pub mod concur_control;
 pub mod db;
+pub mod deser;
 pub mod fingerprint;
 pub mod immutable;
 pub mod retryable;
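
For reference, a minimal standalone sketch (not part of the patch) of what serde_path_to_error adds over plain serde_json, in the spirit of the new utils::deser helpers above; the Spec struct here is hypothetical:

use serde::Deserialize;

// Hypothetical target type for deserialization.
#[derive(Debug, Deserialize)]
struct Spec {
    name: String,
    retries: u32,
}

fn main() {
    // `retries` has the wrong type, so deserialization fails.
    let json = serde_json::json!({ "name": "s3", "retries": "three" });

    // serde_path_to_error wraps the deserializer and tracks the JSON path,
    // so the error can name the offending field instead of only the type error.
    match serde_path_to_error::deserialize::<_, Spec>(json) {
        Ok(spec) => println!("parsed: {spec:?}"),
        Err(err) => {
            let path = err.path().to_string(); // e.g. "retries"
            println!("while deserializing `{path}`: {}", err.into_inner());
        }
    }
}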