From 40a61c150adee6beb9961302fece81c33639082e Mon Sep 17 00:00:00 2001 From: Chongchen Chen Date: Sun, 16 Feb 2025 02:31:00 +0800 Subject: [PATCH 01/22] add to_timestamp_nanos (#1020) --- python/datafusion/functions.py | 1 + python/tests/test_functions.py | 4 ++++ src/functions.rs | 2 ++ 3 files changed, 7 insertions(+) diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 7c2fa9a8f..5c260aade 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -252,6 +252,7 @@ "to_hex", "to_timestamp", "to_timestamp_micros", + "to_timestamp_nanos", "to_timestamp_millis", "to_timestamp_seconds", "to_unixtime", diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index 796b1f76e..b1a739b49 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -871,6 +871,7 @@ def test_temporal_functions(df): f.to_timestamp_millis(literal("2023-09-07 05:06:14.523952")), f.to_timestamp_micros(literal("2023-09-07 05:06:14.523952")), f.extract(literal("day"), column("d")), + f.to_timestamp_nanos(literal("2023-09-07 05:06:14.523952")), ) result = df.collect() assert len(result) == 1 @@ -909,6 +910,9 @@ def test_temporal_functions(df): [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("us") ) assert result.column(10) == pa.array([31, 26, 2], type=pa.int32()) + assert result.column(11) == pa.array( + [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("ns") + ) def test_arrow_cast(df): diff --git a/src/functions.rs b/src/functions.rs index 46c748cf8..6a8abb18d 100644 --- a/src/functions.rs +++ b/src/functions.rs @@ -553,6 +553,7 @@ expr_fn!( expr_fn!(now); expr_fn_vec!(to_timestamp); expr_fn_vec!(to_timestamp_millis); +expr_fn_vec!(to_timestamp_nanos); expr_fn_vec!(to_timestamp_micros); expr_fn_vec!(to_timestamp_seconds); expr_fn_vec!(to_unixtime); @@ -977,6 +978,7 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(to_hex))?; m.add_wrapped(wrap_pyfunction!(to_timestamp))?; m.add_wrapped(wrap_pyfunction!(to_timestamp_millis))?; + m.add_wrapped(wrap_pyfunction!(to_timestamp_nanos))?; m.add_wrapped(wrap_pyfunction!(to_timestamp_micros))?; m.add_wrapped(wrap_pyfunction!(to_timestamp_seconds))?; m.add_wrapped(wrap_pyfunction!(to_unixtime))?; From 3584bec8900bcfb33bcae4b85a3c47a46b82c72e Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Wed, 19 Feb 2025 20:50:31 -0500 Subject: [PATCH 02/22] [infra] Fail Clippy on rust build warnings (#1029) * pyo3 update required changes to deprecated interfaces * Substrait feature clippy updates * PyTuple was called twice * add -D warnings option --------- Co-authored-by: Tim Saucer --- .github/workflows/test.yaml | 2 +- .pre-commit-config.yaml | 2 +- src/config.rs | 10 +++--- src/context.rs | 12 +++---- src/dataframe.rs | 17 +++++---- src/dataset.rs | 2 +- src/dataset_exec.rs | 8 ++--- src/errors.rs | 4 +++ src/expr.rs | 61 ++++++++++++++++---------------- src/expr/aggregate.rs | 6 ++-- src/expr/analyze.rs | 6 ++-- src/expr/create_memory_table.rs | 6 ++-- src/expr/create_view.rs | 6 ++-- src/expr/distinct.rs | 6 ++-- src/expr/drop_table.rs | 6 ++-- src/expr/empty_relation.rs | 6 ++-- src/expr/explain.rs | 6 ++-- src/expr/extension.rs | 6 ++-- src/expr/filter.rs | 6 ++-- src/expr/join.rs | 6 ++-- src/expr/limit.rs | 6 ++-- src/expr/literal.rs | 6 ++-- src/expr/logical_node.rs | 4 +-- src/expr/projection.rs | 6 ++-- src/expr/repartition.rs | 6 ++-- src/expr/sort.rs | 6 ++-- src/expr/subquery.rs | 6 ++-- 
src/expr/subquery_alias.rs | 6 ++-- src/expr/table_scan.rs | 6 ++-- src/expr/union.rs | 6 ++-- src/expr/unnest.rs | 6 ++-- src/expr/window.rs | 6 ++-- src/lib.rs | 10 +++--- src/physical_plan.rs | 2 +- src/pyarrow_filter_expression.rs | 36 ++++++++++--------- src/pyarrow_util.rs | 4 +-- src/sql/logical.rs | 4 +-- src/substrait.rs | 4 +-- src/udaf.rs | 5 +-- src/udf.rs | 5 +-- src/udwf.rs | 44 +++++++++++------------ 41 files changed, 188 insertions(+), 180 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index c93d4c06f..c1d9ac838 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -71,7 +71,7 @@ jobs: - name: Run Clippy if: ${{ matrix.python-version == '3.10' && matrix.toolchain == 'stable' }} - run: cargo clippy --all-targets --all-features -- -D clippy::all -A clippy::redundant_closure + run: cargo clippy --all-targets --all-features -- -D clippy::all -D warnings -A clippy::redundant_closure - name: Install dependencies and build uses: astral-sh/setup-uv@v5 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e20fedf5c..b548ff18f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -40,7 +40,7 @@ repos: - id: rust-clippy name: Rust clippy description: Run cargo clippy on files included in the commit. clippy should be installed before-hand. - entry: cargo clippy --all-targets --all-features -- -Dclippy::all -Aclippy::redundant_closure + entry: cargo clippy --all-targets --all-features -- -Dclippy::all -D warnings -Aclippy::redundant_closure pass_filenames: false types: [file, rust] language: system diff --git a/src/config.rs b/src/config.rs index cc725b9a3..667d5c590 100644 --- a/src/config.rs +++ b/src/config.rs @@ -47,14 +47,14 @@ impl PyConfig { } /// Get a configuration option - pub fn get(&mut self, key: &str, py: Python) -> PyResult { + pub fn get<'py>(&mut self, key: &str, py: Python<'py>) -> PyResult> { let options = self.config.to_owned(); for entry in options.entries() { if entry.key == key { - return Ok(entry.value.into_py(py)); + return Ok(entry.value.into_pyobject(py)?); } } - Ok(None::.into_py(py)) + Ok(None::.into_pyobject(py)?) 
} /// Set a configuration option @@ -66,10 +66,10 @@ impl PyConfig { /// Get all configuration options pub fn get_all(&mut self, py: Python) -> PyResult { - let dict = PyDict::new_bound(py); + let dict = PyDict::new(py); let options = self.config.to_owned(); for entry in options.entries() { - dict.set_item(entry.key, entry.value.clone().into_py(py))?; + dict.set_item(entry.key, entry.value.clone().into_pyobject(py)?)?; } Ok(dict.into()) } diff --git a/src/context.rs b/src/context.rs index ebe7db230..0f962638e 100644 --- a/src/context.rs +++ b/src/context.rs @@ -458,8 +458,8 @@ impl PySessionContext { let py = data.py(); // Instantiate pyarrow Table object & convert to Arrow Table - let table_class = py.import_bound("pyarrow")?.getattr("Table")?; - let args = PyTuple::new_bound(py, &[data]); + let table_class = py.import("pyarrow")?.getattr("Table")?; + let args = PyTuple::new(py, &[data])?; let table = table_class.call_method1("from_pylist", args)?; // Convert Arrow Table to datafusion DataFrame @@ -478,8 +478,8 @@ impl PySessionContext { let py = data.py(); // Instantiate pyarrow Table object & convert to Arrow Table - let table_class = py.import_bound("pyarrow")?.getattr("Table")?; - let args = PyTuple::new_bound(py, &[data]); + let table_class = py.import("pyarrow")?.getattr("Table")?; + let args = PyTuple::new(py, &[data])?; let table = table_class.call_method1("from_pydict", args)?; // Convert Arrow Table to datafusion DataFrame @@ -533,8 +533,8 @@ impl PySessionContext { let py = data.py(); // Instantiate pyarrow Table object & convert to Arrow Table - let table_class = py.import_bound("pyarrow")?.getattr("Table")?; - let args = PyTuple::new_bound(py, &[data]); + let table_class = py.import("pyarrow")?.getattr("Table")?; + let args = PyTuple::new(py, &[data])?; let table = table_class.call_method1("from_pandas", args)?; // Convert Arrow Table to datafusion DataFrame diff --git a/src/dataframe.rs b/src/dataframe.rs index 13d7ae838..ed9578a71 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -545,12 +545,12 @@ impl PyDataFrame { /// Convert to Arrow Table /// Collect the batches and pass to Arrow Table fn to_arrow_table(&self, py: Python<'_>) -> PyResult { - let batches = self.collect(py)?.to_object(py); - let schema: PyObject = self.schema().into_pyobject(py)?.to_object(py); + let batches = self.collect(py)?.into_pyobject(py)?; + let schema = self.schema().into_pyobject(py)?; // Instantiate pyarrow Table object and use its from_batches method - let table_class = py.import_bound("pyarrow")?.getattr("Table")?; - let args = PyTuple::new_bound(py, &[batches, schema]); + let table_class = py.import("pyarrow")?.getattr("Table")?; + let args = PyTuple::new(py, &[batches, schema])?; let table: PyObject = table_class.call_method1("from_batches", args)?.into(); Ok(table) } @@ -585,8 +585,7 @@ impl PyDataFrame { let ffi_stream = FFI_ArrowArrayStream::new(reader); let stream_capsule_name = CString::new("arrow_array_stream").unwrap(); - PyCapsule::new_bound(py, ffi_stream, Some(stream_capsule_name)) - .map_err(PyDataFusionError::from) + PyCapsule::new(py, ffi_stream, Some(stream_capsule_name)).map_err(PyDataFusionError::from) } fn execute_stream(&self, py: Python) -> PyDataFusionResult { @@ -649,8 +648,8 @@ impl PyDataFrame { /// Collect the batches, pass to Arrow Table & then convert to polars DataFrame fn to_polars(&self, py: Python<'_>) -> PyResult { let table = self.to_arrow_table(py)?; - let dataframe = py.import_bound("polars")?.getattr("DataFrame")?; - let args = 
PyTuple::new_bound(py, &[table]); + let dataframe = py.import("polars")?.getattr("DataFrame")?; + let args = PyTuple::new(py, &[table])?; let result: PyObject = dataframe.call1(args)?.into(); Ok(result) } @@ -673,7 +672,7 @@ fn print_dataframe(py: Python, df: DataFrame) -> PyDataFusionResult<()> { // Import the Python 'builtins' module to access the print function // Note that println! does not print to the Python debug console and is not visible in notebooks for instance - let print = py.import_bound("builtins")?.getattr("print")?; + let print = py.import("builtins")?.getattr("print")?; print.call1((result,))?; Ok(()) } diff --git a/src/dataset.rs b/src/dataset.rs index a8fa21ec5..0baf4da2a 100644 --- a/src/dataset.rs +++ b/src/dataset.rs @@ -48,7 +48,7 @@ impl Dataset { // Creates a Python PyArrow.Dataset pub fn new(dataset: &Bound<'_, PyAny>, py: Python) -> PyResult { // Ensure that we were passed an instance of pyarrow.dataset.Dataset - let ds = PyModule::import_bound(py, "pyarrow.dataset")?; + let ds = PyModule::import(py, "pyarrow.dataset")?; let ds_attr = ds.getattr("Dataset")?; let ds_type = ds_attr.downcast::()?; if dataset.is_instance(ds_type)? { diff --git a/src/dataset_exec.rs b/src/dataset_exec.rs index ace42115b..445e4fe74 100644 --- a/src/dataset_exec.rs +++ b/src/dataset_exec.rs @@ -104,7 +104,7 @@ impl DatasetExec { }) .transpose()?; - let kwargs = PyDict::new_bound(py); + let kwargs = PyDict::new(py); kwargs.set_item("columns", columns.clone())?; kwargs.set_item( @@ -121,7 +121,7 @@ impl DatasetExec { .0, ); - let builtins = Python::import_bound(py, "builtins")?; + let builtins = Python::import(py, "builtins")?; let pylist = builtins.getattr("list")?; // Get the fragments or partitions of the dataset @@ -198,7 +198,7 @@ impl ExecutionPlan for DatasetExec { let dataset_schema = dataset .getattr("schema") .map_err(|err| InnerDataFusionError::External(Box::new(err)))?; - let kwargs = PyDict::new_bound(py); + let kwargs = PyDict::new(py); kwargs .set_item("columns", self.columns.clone()) .map_err(|err| InnerDataFusionError::External(Box::new(err)))?; @@ -223,7 +223,7 @@ impl ExecutionPlan for DatasetExec { let record_batches: Bound<'_, PyIterator> = scanner .call_method0("to_batches") .map_err(|err| InnerDataFusionError::External(Box::new(err)))? 
- .iter() + .try_iter() .map_err(|err| InnerDataFusionError::External(Box::new(err)))?; let record_batches = PyArrowBatchesAdapter { diff --git a/src/errors.rs b/src/errors.rs index b02b754a2..f1d5aeb23 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -91,3 +91,7 @@ pub fn py_datafusion_err(e: impl Debug) -> PyErr { pub fn py_unsupported_variant_err(e: impl Debug) -> PyErr { PyErr::new::(format!("{e:?}")) } + +pub fn to_datafusion_err(e: impl Debug) -> InnerDataFusionError { + InnerDataFusionError::Execution(format!("{e:?}")) +} diff --git a/src/expr.rs b/src/expr.rs index 1e9983d42..e750be6a4 100644 --- a/src/expr.rs +++ b/src/expr.rs @@ -19,6 +19,7 @@ use datafusion::logical_expr::utils::exprlist_to_fields; use datafusion::logical_expr::{ ExprFuncBuilder, ExprFunctionExt, LogicalPlan, WindowFunctionDefinition, }; +use pyo3::IntoPyObjectExt; use pyo3::{basic::CompareOp, prelude::*}; use std::convert::{From, Into}; use std::sync::Arc; @@ -126,35 +127,35 @@ pub fn py_expr_list(expr: &[Expr]) -> PyResult> { #[pymethods] impl PyExpr { /// Return the specific expression - fn to_variant(&self, py: Python) -> PyResult { + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { Python::with_gil(|_| { match &self.expr { - Expr::Alias(alias) => Ok(PyAlias::from(alias.clone()).into_py(py)), - Expr::Column(col) => Ok(PyColumn::from(col.clone()).into_py(py)), + Expr::Alias(alias) => Ok(PyAlias::from(alias.clone()).into_bound_py_any(py)?), + Expr::Column(col) => Ok(PyColumn::from(col.clone()).into_bound_py_any(py)?), Expr::ScalarVariable(data_type, variables) => { - Ok(PyScalarVariable::new(data_type, variables).into_py(py)) + Ok(PyScalarVariable::new(data_type, variables).into_bound_py_any(py)?) } - Expr::Like(value) => Ok(PyLike::from(value.clone()).into_py(py)), - Expr::Literal(value) => Ok(PyLiteral::from(value.clone()).into_py(py)), - Expr::BinaryExpr(expr) => Ok(PyBinaryExpr::from(expr.clone()).into_py(py)), - Expr::Not(expr) => Ok(PyNot::new(*expr.clone()).into_py(py)), - Expr::IsNotNull(expr) => Ok(PyIsNotNull::new(*expr.clone()).into_py(py)), - Expr::IsNull(expr) => Ok(PyIsNull::new(*expr.clone()).into_py(py)), - Expr::IsTrue(expr) => Ok(PyIsTrue::new(*expr.clone()).into_py(py)), - Expr::IsFalse(expr) => Ok(PyIsFalse::new(*expr.clone()).into_py(py)), - Expr::IsUnknown(expr) => Ok(PyIsUnknown::new(*expr.clone()).into_py(py)), - Expr::IsNotTrue(expr) => Ok(PyIsNotTrue::new(*expr.clone()).into_py(py)), - Expr::IsNotFalse(expr) => Ok(PyIsNotFalse::new(*expr.clone()).into_py(py)), - Expr::IsNotUnknown(expr) => Ok(PyIsNotUnknown::new(*expr.clone()).into_py(py)), - Expr::Negative(expr) => Ok(PyNegative::new(*expr.clone()).into_py(py)), + Expr::Like(value) => Ok(PyLike::from(value.clone()).into_bound_py_any(py)?), + Expr::Literal(value) => Ok(PyLiteral::from(value.clone()).into_bound_py_any(py)?), + Expr::BinaryExpr(expr) => Ok(PyBinaryExpr::from(expr.clone()).into_bound_py_any(py)?), + Expr::Not(expr) => Ok(PyNot::new(*expr.clone()).into_bound_py_any(py)?), + Expr::IsNotNull(expr) => Ok(PyIsNotNull::new(*expr.clone()).into_bound_py_any(py)?), + Expr::IsNull(expr) => Ok(PyIsNull::new(*expr.clone()).into_bound_py_any(py)?), + Expr::IsTrue(expr) => Ok(PyIsTrue::new(*expr.clone()).into_bound_py_any(py)?), + Expr::IsFalse(expr) => Ok(PyIsFalse::new(*expr.clone()).into_bound_py_any(py)?), + Expr::IsUnknown(expr) => Ok(PyIsUnknown::new(*expr.clone()).into_bound_py_any(py)?), + Expr::IsNotTrue(expr) => Ok(PyIsNotTrue::new(*expr.clone()).into_bound_py_any(py)?), + Expr::IsNotFalse(expr) => 
Ok(PyIsNotFalse::new(*expr.clone()).into_bound_py_any(py)?), + Expr::IsNotUnknown(expr) => Ok(PyIsNotUnknown::new(*expr.clone()).into_bound_py_any(py)?), + Expr::Negative(expr) => Ok(PyNegative::new(*expr.clone()).into_bound_py_any(py)?), Expr::AggregateFunction(expr) => { - Ok(PyAggregateFunction::from(expr.clone()).into_py(py)) + Ok(PyAggregateFunction::from(expr.clone()).into_bound_py_any(py)?) } - Expr::SimilarTo(value) => Ok(PySimilarTo::from(value.clone()).into_py(py)), - Expr::Between(value) => Ok(between::PyBetween::from(value.clone()).into_py(py)), - Expr::Case(value) => Ok(case::PyCase::from(value.clone()).into_py(py)), - Expr::Cast(value) => Ok(cast::PyCast::from(value.clone()).into_py(py)), - Expr::TryCast(value) => Ok(cast::PyTryCast::from(value.clone()).into_py(py)), + Expr::SimilarTo(value) => Ok(PySimilarTo::from(value.clone()).into_bound_py_any(py)?), + Expr::Between(value) => Ok(between::PyBetween::from(value.clone()).into_bound_py_any(py)?), + Expr::Case(value) => Ok(case::PyCase::from(value.clone()).into_bound_py_any(py)?), + Expr::Cast(value) => Ok(cast::PyCast::from(value.clone()).into_bound_py_any(py)?), + Expr::TryCast(value) => Ok(cast::PyTryCast::from(value.clone()).into_bound_py_any(py)?), Expr::ScalarFunction(value) => Err(py_unsupported_variant_err(format!( "Converting Expr::ScalarFunction to a Python object is not implemented: {:?}", value @@ -163,29 +164,29 @@ impl PyExpr { "Converting Expr::WindowFunction to a Python object is not implemented: {:?}", value ))), - Expr::InList(value) => Ok(in_list::PyInList::from(value.clone()).into_py(py)), - Expr::Exists(value) => Ok(exists::PyExists::from(value.clone()).into_py(py)), + Expr::InList(value) => Ok(in_list::PyInList::from(value.clone()).into_bound_py_any(py)?), + Expr::Exists(value) => Ok(exists::PyExists::from(value.clone()).into_bound_py_any(py)?), Expr::InSubquery(value) => { - Ok(in_subquery::PyInSubquery::from(value.clone()).into_py(py)) + Ok(in_subquery::PyInSubquery::from(value.clone()).into_bound_py_any(py)?) } Expr::ScalarSubquery(value) => { - Ok(scalar_subquery::PyScalarSubquery::from(value.clone()).into_py(py)) + Ok(scalar_subquery::PyScalarSubquery::from(value.clone()).into_bound_py_any(py)?) } Expr::Wildcard { qualifier, options } => Err(py_unsupported_variant_err(format!( "Converting Expr::Wildcard to a Python object is not implemented : {:?} {:?}", qualifier, options ))), Expr::GroupingSet(value) => { - Ok(grouping_set::PyGroupingSet::from(value.clone()).into_py(py)) + Ok(grouping_set::PyGroupingSet::from(value.clone()).into_bound_py_any(py)?) } Expr::Placeholder(value) => { - Ok(placeholder::PyPlaceholder::from(value.clone()).into_py(py)) + Ok(placeholder::PyPlaceholder::from(value.clone()).into_bound_py_any(py)?) 
} Expr::OuterReferenceColumn(data_type, column) => Err(py_unsupported_variant_err(format!( "Converting Expr::OuterReferenceColumn to a Python object is not implemented: {:?} - {:?}", data_type, column ))), - Expr::Unnest(value) => Ok(unnest_expr::PyUnnestExpr::from(value.clone()).into_py(py)), + Expr::Unnest(value) => Ok(unnest_expr::PyUnnestExpr::from(value.clone()).into_bound_py_any(py)?), } }) } diff --git a/src/expr/aggregate.rs b/src/expr/aggregate.rs index 389bfb332..8fc9da5b0 100644 --- a/src/expr/aggregate.rs +++ b/src/expr/aggregate.rs @@ -19,7 +19,7 @@ use datafusion::common::DataFusionError; use datafusion::logical_expr::expr::{AggregateFunction, Alias}; use datafusion::logical_expr::logical_plan::Aggregate; use datafusion::logical_expr::Expr; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use std::fmt::{self, Display, Formatter}; use super::logical_node::LogicalNode; @@ -151,7 +151,7 @@ impl LogicalNode for PyAggregate { vec![PyLogicalPlan::from((*self.aggregate.input).clone())] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/analyze.rs b/src/expr/analyze.rs index 084513971..62f93cd26 100644 --- a/src/expr/analyze.rs +++ b/src/expr/analyze.rs @@ -16,7 +16,7 @@ // under the License. use datafusion::logical_expr::logical_plan::Analyze; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use std::fmt::{self, Display, Formatter}; use super::logical_node::LogicalNode; @@ -78,7 +78,7 @@ impl LogicalNode for PyAnalyze { vec![PyLogicalPlan::from((*self.analyze.input).clone())] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/create_memory_table.rs b/src/expr/create_memory_table.rs index 01ebb66b0..8872b2d47 100644 --- a/src/expr/create_memory_table.rs +++ b/src/expr/create_memory_table.rs @@ -18,7 +18,7 @@ use std::fmt::{self, Display, Formatter}; use datafusion::logical_expr::CreateMemoryTable; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use crate::sql::logical::PyLogicalPlan; @@ -91,7 +91,7 @@ impl LogicalNode for PyCreateMemoryTable { vec![PyLogicalPlan::from((*self.create.input).clone())] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/create_view.rs b/src/expr/create_view.rs index d119f5c21..87bb76876 100644 --- a/src/expr/create_view.rs +++ b/src/expr/create_view.rs @@ -18,7 +18,7 @@ use std::fmt::{self, Display, Formatter}; use datafusion::logical_expr::{CreateView, DdlStatement, LogicalPlan}; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use crate::{errors::py_type_err, sql::logical::PyLogicalPlan}; @@ -88,8 +88,8 @@ impl LogicalNode for PyCreateView { vec![PyLogicalPlan::from((*self.create.input).clone())] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/distinct.rs b/src/expr/distinct.rs index 061ab4824..b62b776f8 100644 --- a/src/expr/distinct.rs +++ b/src/expr/distinct.rs @@ -18,7 +18,7 @@ use std::fmt::{self, Display, Formatter}; use datafusion::logical_expr::Distinct; -use 
pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use crate::sql::logical::PyLogicalPlan; @@ -89,7 +89,7 @@ impl LogicalNode for PyDistinct { } } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/drop_table.rs b/src/expr/drop_table.rs index 330156abe..96983c1cf 100644 --- a/src/expr/drop_table.rs +++ b/src/expr/drop_table.rs @@ -18,7 +18,7 @@ use std::fmt::{self, Display, Formatter}; use datafusion::logical_expr::logical_plan::DropTable; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use crate::sql::logical::PyLogicalPlan; @@ -83,7 +83,7 @@ impl LogicalNode for PyDropTable { vec![] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/empty_relation.rs b/src/expr/empty_relation.rs index ce7163466..a1534ac15 100644 --- a/src/expr/empty_relation.rs +++ b/src/expr/empty_relation.rs @@ -17,7 +17,7 @@ use crate::{common::df_schema::PyDFSchema, sql::logical::PyLogicalPlan}; use datafusion::logical_expr::EmptyRelation; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use std::fmt::{self, Display, Formatter}; use super::logical_node::LogicalNode; @@ -79,7 +79,7 @@ impl LogicalNode for PyEmptyRelation { vec![] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/explain.rs b/src/expr/explain.rs index 8e7fb8843..fc02fe2b5 100644 --- a/src/expr/explain.rs +++ b/src/expr/explain.rs @@ -18,7 +18,7 @@ use std::fmt::{self, Display, Formatter}; use datafusion::logical_expr::{logical_plan::Explain, LogicalPlan}; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use crate::{common::df_schema::PyDFSchema, errors::py_type_err, sql::logical::PyLogicalPlan}; @@ -104,7 +104,7 @@ impl LogicalNode for PyExplain { vec![] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/extension.rs b/src/expr/extension.rs index a29802b0b..1e3fbb199 100644 --- a/src/expr/extension.rs +++ b/src/expr/extension.rs @@ -16,7 +16,7 @@ // under the License. use datafusion::logical_expr::Extension; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use crate::sql::logical::PyLogicalPlan; @@ -46,7 +46,7 @@ impl LogicalNode for PyExtension { vec![] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/filter.rs b/src/expr/filter.rs index a6d8aa7ee..9bdb667cd 100644 --- a/src/expr/filter.rs +++ b/src/expr/filter.rs @@ -16,7 +16,7 @@ // under the License. 
use datafusion::logical_expr::logical_plan::Filter; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use std::fmt::{self, Display, Formatter}; use crate::common::df_schema::PyDFSchema; @@ -81,7 +81,7 @@ impl LogicalNode for PyFilter { vec![PyLogicalPlan::from((*self.filter.input).clone())] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/join.rs b/src/expr/join.rs index 66e677f8a..76ec532e7 100644 --- a/src/expr/join.rs +++ b/src/expr/join.rs @@ -16,7 +16,7 @@ // under the License. use datafusion::logical_expr::logical_plan::{Join, JoinConstraint, JoinType}; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use std::fmt::{self, Display, Formatter}; use crate::common::df_schema::PyDFSchema; @@ -193,7 +193,7 @@ impl LogicalNode for PyJoin { ] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/limit.rs b/src/expr/limit.rs index 84ad7d68b..c2a33ff89 100644 --- a/src/expr/limit.rs +++ b/src/expr/limit.rs @@ -16,7 +16,7 @@ // under the License. use datafusion::logical_expr::logical_plan::Limit; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use std::fmt::{self, Display, Formatter}; use crate::common::df_schema::PyDFSchema; @@ -90,7 +90,7 @@ impl LogicalNode for PyLimit { vec![PyLogicalPlan::from((*self.limit.input).clone())] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/literal.rs b/src/expr/literal.rs index 2cb2079f1..a660ac914 100644 --- a/src/expr/literal.rs +++ b/src/expr/literal.rs @@ -17,7 +17,7 @@ use crate::errors::PyDataFusionError; use datafusion::common::ScalarValue; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; #[pyclass(name = "Literal", module = "datafusion.expr", subclass)] #[derive(Clone)] @@ -144,8 +144,8 @@ impl PyLiteral { } #[allow(clippy::wrong_self_convention)] - fn into_type(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn into_type<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } fn __repr__(&self) -> PyResult { diff --git a/src/expr/logical_node.rs b/src/expr/logical_node.rs index 757e4f94b..5aff70059 100644 --- a/src/expr/logical_node.rs +++ b/src/expr/logical_node.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use pyo3::{PyObject, PyResult, Python}; +use pyo3::{Bound, PyAny, PyResult, Python}; use crate::sql::logical::PyLogicalPlan; @@ -25,5 +25,5 @@ pub trait LogicalNode { /// The input plan to the current logical node instance. 
fn inputs(&self) -> Vec; - fn to_variant(&self, py: Python) -> PyResult; + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult>; } diff --git a/src/expr/projection.rs b/src/expr/projection.rs index 36534fdb2..dc7e5e3c1 100644 --- a/src/expr/projection.rs +++ b/src/expr/projection.rs @@ -17,7 +17,7 @@ use datafusion::logical_expr::logical_plan::Projection; use datafusion::logical_expr::Expr; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use std::fmt::{self, Display, Formatter}; use crate::common::df_schema::PyDFSchema; @@ -113,7 +113,7 @@ impl LogicalNode for PyProjection { vec![PyLogicalPlan::from((*self.projection.input).clone())] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/repartition.rs b/src/expr/repartition.rs index 4e680e181..3e782d6af 100644 --- a/src/expr/repartition.rs +++ b/src/expr/repartition.rs @@ -18,7 +18,7 @@ use std::fmt::{self, Display, Formatter}; use datafusion::logical_expr::{logical_plan::Repartition, Expr, Partitioning}; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use crate::{errors::py_type_err, sql::logical::PyLogicalPlan}; @@ -121,7 +121,7 @@ impl LogicalNode for PyRepartition { vec![PyLogicalPlan::from((*self.repartition.input).clone())] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/sort.rs b/src/expr/sort.rs index a1803ccaf..ed4947591 100644 --- a/src/expr/sort.rs +++ b/src/expr/sort.rs @@ -17,7 +17,7 @@ use datafusion::common::DataFusionError; use datafusion::logical_expr::logical_plan::Sort; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use std::fmt::{self, Display, Formatter}; use crate::common::df_schema::PyDFSchema; @@ -96,7 +96,7 @@ impl LogicalNode for PySort { vec![PyLogicalPlan::from((*self.sort.input).clone())] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/subquery.rs b/src/expr/subquery.rs index dac8d0a2b..5ebfe6927 100644 --- a/src/expr/subquery.rs +++ b/src/expr/subquery.rs @@ -18,7 +18,7 @@ use std::fmt::{self, Display, Formatter}; use datafusion::logical_expr::Subquery; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use crate::sql::logical::PyLogicalPlan; @@ -75,7 +75,7 @@ impl LogicalNode for PySubquery { vec![] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/subquery_alias.rs b/src/expr/subquery_alias.rs index a83cff96d..267a4d485 100644 --- a/src/expr/subquery_alias.rs +++ b/src/expr/subquery_alias.rs @@ -18,7 +18,7 @@ use std::fmt::{self, Display, Formatter}; use datafusion::logical_expr::SubqueryAlias; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use crate::{common::df_schema::PyDFSchema, sql::logical::PyLogicalPlan}; @@ -85,7 +85,7 @@ impl LogicalNode for PySubqueryAlias { vec![PyLogicalPlan::from((*self.subquery_alias.input).clone())] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff 
--git a/src/expr/table_scan.rs b/src/expr/table_scan.rs index f61be7fe4..6a0d53f0f 100644 --- a/src/expr/table_scan.rs +++ b/src/expr/table_scan.rs @@ -17,7 +17,7 @@ use datafusion::common::TableReference; use datafusion::logical_expr::logical_plan::TableScan; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use std::fmt::{self, Display, Formatter}; use crate::expr::logical_node::LogicalNode; @@ -146,7 +146,7 @@ impl LogicalNode for PyTableScan { vec![] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/union.rs b/src/expr/union.rs index 62488d9a1..5a08ccc13 100644 --- a/src/expr/union.rs +++ b/src/expr/union.rs @@ -16,7 +16,7 @@ // under the License. use datafusion::logical_expr::logical_plan::Union; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use std::fmt::{self, Display, Formatter}; use crate::common::df_schema::PyDFSchema; @@ -83,7 +83,7 @@ impl LogicalNode for PyUnion { .collect() } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/unnest.rs b/src/expr/unnest.rs index adc705035..8e70e0990 100644 --- a/src/expr/unnest.rs +++ b/src/expr/unnest.rs @@ -16,7 +16,7 @@ // under the License. use datafusion::logical_expr::logical_plan::Unnest; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use std::fmt::{self, Display, Formatter}; use crate::common::df_schema::PyDFSchema; @@ -79,7 +79,7 @@ impl LogicalNode for PyUnnest { vec![PyLogicalPlan::from((*self.unnest_.input).clone())] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/expr/window.rs b/src/expr/window.rs index 4dc6cb9c9..13deaec25 100644 --- a/src/expr/window.rs +++ b/src/expr/window.rs @@ -18,7 +18,7 @@ use datafusion::common::{DataFusionError, ScalarValue}; use datafusion::logical_expr::expr::WindowFunction; use datafusion::logical_expr::{Expr, Window, WindowFrame, WindowFrameBound, WindowFrameUnits}; -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use std::fmt::{self, Display, Formatter}; use crate::common::data_type::PyScalarValue; @@ -289,7 +289,7 @@ impl LogicalNode for PyWindowExpr { vec![self.window.input.as_ref().clone().into()] } - fn to_variant(&self, py: Python) -> PyResult { - Ok(self.clone().into_py(py)) + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) } } diff --git a/src/lib.rs b/src/lib.rs index 317c3a49a..ce93ff0c3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -94,21 +94,21 @@ fn _internal(py: Python, m: Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; // Register `common` as a submodule. Matching `datafusion-common` https://docs.rs/datafusion-common/latest/datafusion_common/ - let common = PyModule::new_bound(py, "common")?; + let common = PyModule::new(py, "common")?; common::init_module(&common)?; m.add_submodule(&common)?; // Register `expr` as a submodule. 
Matching `datafusion-expr` https://docs.rs/datafusion-expr/latest/datafusion_expr/ - let expr = PyModule::new_bound(py, "expr")?; + let expr = PyModule::new(py, "expr")?; expr::init_module(&expr)?; m.add_submodule(&expr)?; // Register the functions as a submodule - let funcs = PyModule::new_bound(py, "functions")?; + let funcs = PyModule::new(py, "functions")?; functions::init_module(&funcs)?; m.add_submodule(&funcs)?; - let store = PyModule::new_bound(py, "object_store")?; + let store = PyModule::new(py, "object_store")?; store::init_module(&store)?; m.add_submodule(&store)?; @@ -121,7 +121,7 @@ fn _internal(py: Python, m: Bound<'_, PyModule>) -> PyResult<()> { #[cfg(feature = "substrait")] fn setup_substrait_module(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { - let substrait = PyModule::new_bound(py, "substrait")?; + let substrait = PyModule::new(py, "substrait")?; substrait::init_module(&substrait)?; m.add_submodule(&substrait)?; Ok(()) diff --git a/src/physical_plan.rs b/src/physical_plan.rs index 295908dc7..f0be45c6a 100644 --- a/src/physical_plan.rs +++ b/src/physical_plan.rs @@ -66,7 +66,7 @@ impl PyExecutionPlan { )?; let bytes = proto.encode_to_vec(); - Ok(PyBytes::new_bound(py, &bytes)) + Ok(PyBytes::new(py, &bytes)) } #[staticmethod] diff --git a/src/pyarrow_filter_expression.rs b/src/pyarrow_filter_expression.rs index 314eebf4f..4b4c86597 100644 --- a/src/pyarrow_filter_expression.rs +++ b/src/pyarrow_filter_expression.rs @@ -16,7 +16,7 @@ // under the License. /// Converts a Datafusion logical plan expression (Expr) into a PyArrow compute expression -use pyo3::prelude::*; +use pyo3::{prelude::*, IntoPyObjectExt}; use std::convert::TryFrom; use std::result::Result; @@ -53,24 +53,28 @@ fn operator_to_py<'py>( Ok(py_op) } -fn extract_scalar_list(exprs: &[Expr], py: Python) -> PyDataFusionResult> { +fn extract_scalar_list<'py>( + exprs: &[Expr], + py: Python<'py>, +) -> PyDataFusionResult>> { let ret = exprs .iter() .map(|expr| match expr { // TODO: should we also leverage `ScalarValue::to_pyarrow` here? 
Expr::Literal(v) => match v { - ScalarValue::Boolean(Some(b)) => Ok(b.into_py(py)), - ScalarValue::Int8(Some(i)) => Ok(i.into_py(py)), - ScalarValue::Int16(Some(i)) => Ok(i.into_py(py)), - ScalarValue::Int32(Some(i)) => Ok(i.into_py(py)), - ScalarValue::Int64(Some(i)) => Ok(i.into_py(py)), - ScalarValue::UInt8(Some(i)) => Ok(i.into_py(py)), - ScalarValue::UInt16(Some(i)) => Ok(i.into_py(py)), - ScalarValue::UInt32(Some(i)) => Ok(i.into_py(py)), - ScalarValue::UInt64(Some(i)) => Ok(i.into_py(py)), - ScalarValue::Float32(Some(f)) => Ok(f.into_py(py)), - ScalarValue::Float64(Some(f)) => Ok(f.into_py(py)), - ScalarValue::Utf8(Some(s)) => Ok(s.into_py(py)), + // The unwraps here are for infallible conversions + ScalarValue::Boolean(Some(b)) => Ok(b.into_bound_py_any(py)?), + ScalarValue::Int8(Some(i)) => Ok(i.into_bound_py_any(py)?), + ScalarValue::Int16(Some(i)) => Ok(i.into_bound_py_any(py)?), + ScalarValue::Int32(Some(i)) => Ok(i.into_bound_py_any(py)?), + ScalarValue::Int64(Some(i)) => Ok(i.into_bound_py_any(py)?), + ScalarValue::UInt8(Some(i)) => Ok(i.into_bound_py_any(py)?), + ScalarValue::UInt16(Some(i)) => Ok(i.into_bound_py_any(py)?), + ScalarValue::UInt32(Some(i)) => Ok(i.into_bound_py_any(py)?), + ScalarValue::UInt64(Some(i)) => Ok(i.into_bound_py_any(py)?), + ScalarValue::Float32(Some(f)) => Ok(f.into_bound_py_any(py)?), + ScalarValue::Float64(Some(f)) => Ok(f.into_bound_py_any(py)?), + ScalarValue::Utf8(Some(s)) => Ok(s.into_bound_py_any(py)?), _ => Err(PyDataFusionError::Common(format!( "PyArrow can't handle ScalarValue: {v:?}" ))), @@ -98,8 +102,8 @@ impl TryFrom<&Expr> for PyArrowFilterExpression { // https://arrow.apache.org/docs/python/generated/pyarrow.dataset.Expression.html#pyarrow-dataset-expression fn try_from(expr: &Expr) -> Result { Python::with_gil(|py| { - let pc = Python::import_bound(py, "pyarrow.compute")?; - let op_module = Python::import_bound(py, "operator")?; + let pc = Python::import(py, "pyarrow.compute")?; + let op_module = Python::import(py, "operator")?; let pc_expr: PyDataFusionResult> = match expr { Expr::Column(Column { name, .. 
}) => Ok(pc.getattr("field")?.call1((name,))?), Expr::Literal(scalar) => Ok(scalar_to_pyarrow(scalar, py)?.into_bound(py)), diff --git a/src/pyarrow_util.rs b/src/pyarrow_util.rs index 2b31467f8..cab708458 100644 --- a/src/pyarrow_util.rs +++ b/src/pyarrow_util.rs @@ -33,8 +33,8 @@ impl FromPyArrow for PyScalarValue { let val = value.call_method0("as_py")?; // construct pyarrow array from the python value and pyarrow type - let factory = py.import_bound("pyarrow")?.getattr("array")?; - let args = PyList::new_bound(py, [val]); + let factory = py.import("pyarrow")?.getattr("array")?; + let args = PyList::new(py, [val])?; let array = factory.call1((args, typ))?; // convert the pyarrow array to rust array using C data interface diff --git a/src/sql/logical.rs b/src/sql/logical.rs index 1be33b75f..96561c434 100644 --- a/src/sql/logical.rs +++ b/src/sql/logical.rs @@ -64,7 +64,7 @@ impl PyLogicalPlan { #[pymethods] impl PyLogicalPlan { /// Return the specific logical operator - pub fn to_variant(&self, py: Python) -> PyResult { + pub fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { match self.plan.as_ref() { LogicalPlan::Aggregate(plan) => PyAggregate::from(plan.clone()).to_variant(py), LogicalPlan::Analyze(plan) => PyAnalyze::from(plan.clone()).to_variant(py), @@ -132,7 +132,7 @@ impl PyLogicalPlan { datafusion_proto::protobuf::LogicalPlanNode::try_from_logical_plan(&self.plan, &codec)?; let bytes = proto.encode_to_vec(); - Ok(PyBytes::new_bound(py, &bytes)) + Ok(PyBytes::new(py, &bytes)) } #[staticmethod] diff --git a/src/substrait.rs b/src/substrait.rs index 8dcf3e8a7..1fefc0bbd 100644 --- a/src/substrait.rs +++ b/src/substrait.rs @@ -40,7 +40,7 @@ impl PyPlan { self.plan .encode(&mut proto_bytes) .map_err(PyDataFusionError::EncodeError)?; - Ok(PyBytes::new_bound(py, &proto_bytes).unbind().into()) + Ok(PyBytes::new(py, &proto_bytes).into()) } } @@ -95,7 +95,7 @@ impl PySubstraitSerializer { py: Python, ) -> PyDataFusionResult { let proto_bytes: Vec = wait_for_future(py, serializer::serialize_bytes(sql, &ctx.ctx))?; - Ok(PyBytes::new_bound(py, &proto_bytes).unbind().into()) + Ok(PyBytes::new(py, &proto_bytes).into()) } #[staticmethod] diff --git a/src/udaf.rs b/src/udaf.rs index 5f21533e0..34a9cd51d 100644 --- a/src/udaf.rs +++ b/src/udaf.rs @@ -29,6 +29,7 @@ use datafusion::logical_expr::{ }; use crate::common::data_type::PyScalarValue; +use crate::errors::to_datafusion_err; use crate::expr::PyExpr; use crate::utils::parse_volatility; @@ -73,7 +74,7 @@ impl Accumulator for RustAccumulator { .iter() .map(|arg| arg.into_data().to_pyarrow(py).unwrap()) .collect::>(); - let py_args = PyTuple::new_bound(py, py_args); + let py_args = PyTuple::new(py, py_args).map_err(to_datafusion_err)?; // 2. call function self.accum @@ -119,7 +120,7 @@ impl Accumulator for RustAccumulator { .iter() .map(|arg| arg.into_data().to_pyarrow(py).unwrap()) .collect::>(); - let py_args = PyTuple::new_bound(py, py_args); + let py_args = PyTuple::new(py, py_args).map_err(to_datafusion_err)?; // 2. 
call function self.accum diff --git a/src/udf.rs b/src/udf.rs index 4570e77a6..574c9d7b5 100644 --- a/src/udf.rs +++ b/src/udf.rs @@ -28,6 +28,7 @@ use datafusion::logical_expr::function::ScalarFunctionImplementation; use datafusion::logical_expr::ScalarUDF; use datafusion::logical_expr::{create_udf, ColumnarValue}; +use crate::errors::to_datafusion_err; use crate::expr::PyExpr; use crate::utils::parse_volatility; @@ -46,11 +47,11 @@ fn pyarrow_function_to_rust( .map_err(|e| DataFusionError::Execution(format!("{e:?}"))) }) .collect::, _>>()?; - let py_args = PyTuple::new_bound(py, py_args); + let py_args = PyTuple::new(py, py_args).map_err(to_datafusion_err)?; // 2. call function let value = func - .call_bound(py, py_args, None) + .call(py, py_args, None) .map_err(|e| DataFusionError::Execution(format!("{e:?}")))?; // 3. cast to arrow::array::Array diff --git a/src/udwf.rs b/src/udwf.rs index 04a4a1640..defd9c522 100644 --- a/src/udwf.rs +++ b/src/udwf.rs @@ -27,6 +27,7 @@ use pyo3::exceptions::PyValueError; use pyo3::prelude::*; use crate::common::data_type::PyScalarValue; +use crate::errors::to_datafusion_err; use crate::expr::PyExpr; use crate::utils::parse_volatility; use datafusion::arrow::datatypes::DataType; @@ -56,8 +57,8 @@ impl PartitionEvaluator for RustPartitionEvaluator { fn get_range(&self, idx: usize, n_rows: usize) -> Result> { Python::with_gil(|py| { - let py_args = vec![idx.to_object(py), n_rows.to_object(py)]; - let py_args = PyTuple::new_bound(py, py_args); + let py_args = vec![idx.into_pyobject(py)?, n_rows.into_pyobject(py)?]; + let py_args = PyTuple::new(py, py_args)?; self.evaluator .bind(py) @@ -93,17 +94,14 @@ impl PartitionEvaluator for RustPartitionEvaluator { fn evaluate_all(&mut self, values: &[ArrayRef], num_rows: usize) -> Result { println!("evaluate all called with number of values {}", values.len()); Python::with_gil(|py| { - let py_values = PyList::new_bound( + let py_values = PyList::new( py, values .iter() .map(|arg| arg.into_data().to_pyarrow(py).unwrap()), - ); - let py_num_rows = num_rows.to_object(py).into_bound(py); - let py_args = PyTuple::new_bound( - py, - PyTuple::new_bound(py, vec![py_values.as_any(), &py_num_rows]), - ); + )?; + let py_num_rows = num_rows.into_pyobject(py)?; + let py_args = PyTuple::new(py, vec![py_values.as_any(), &py_num_rows])?; self.evaluator .bind(py) @@ -112,32 +110,28 @@ impl PartitionEvaluator for RustPartitionEvaluator { let array_data = ArrayData::from_pyarrow_bound(&v).unwrap(); make_array(array_data) }) - .map_err(|e| DataFusionError::Execution(format!("{e}"))) }) + .map_err(to_datafusion_err) } fn evaluate(&mut self, values: &[ArrayRef], range: &Range) -> Result { Python::with_gil(|py| { - let py_values = PyList::new_bound( + let py_values = PyList::new( py, values .iter() .map(|arg| arg.into_data().to_pyarrow(py).unwrap()), - ); - let range_tuple = - PyTuple::new_bound(py, vec![range.start.to_object(py), range.end.to_object(py)]); - let py_args = PyTuple::new_bound( - py, - PyTuple::new_bound(py, vec![py_values.as_any(), range_tuple.as_any()]), - ); + )?; + let range_tuple = PyTuple::new(py, vec![range.start, range.end])?; + let py_args = PyTuple::new(py, vec![py_values.as_any(), range_tuple.as_any()])?; self.evaluator .bind(py) .call_method1("evaluate", py_args) .and_then(|v| v.extract::()) .map(|v| v.0) - .map_err(|e| DataFusionError::Execution(format!("{e}"))) }) + .map_err(to_datafusion_err) } fn evaluate_all_with_rank( @@ -148,23 +142,27 @@ impl PartitionEvaluator for RustPartitionEvaluator { 
Python::with_gil(|py| { let ranks = ranks_in_partition .iter() - .map(|r| PyTuple::new_bound(py, vec![r.start, r.end])); + .map(|r| PyTuple::new(py, vec![r.start, r.end])) + .collect::>>()?; // 1. cast args to Pyarrow array - let py_args = vec![num_rows.to_object(py), PyList::new_bound(py, ranks).into()]; + let py_args = vec![ + num_rows.into_pyobject(py)?.into_any(), + PyList::new(py, ranks)?.into_any(), + ]; - let py_args = PyTuple::new_bound(py, py_args); + let py_args = PyTuple::new(py, py_args)?; // 2. call function self.evaluator .bind(py) .call_method1("evaluate_all_with_rank", py_args) - .map_err(|e| DataFusionError::Execution(format!("{e}"))) .map(|v| { let array_data = ArrayData::from_pyarrow_bound(&v).unwrap(); make_array(array_data) }) }) + .map_err(to_datafusion_err) } fn supports_bounded_execution(&self) -> bool { From e6f6e66c1d180246ad933f8bcc0d40faa8426dfa Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 21 Feb 2025 16:03:36 -0500 Subject: [PATCH 03/22] Add user documentation for the FFI approach (#1031) * Initial commit for FFI user documentation * Update readme to point to the online documentation. Fix a small typo. * Small text adjustments for clarity and formatting --- README.md | 11 +- docs/source/contributor-guide/ffi.rst | 212 ++++++++++++++++++++++++++ docs/source/index.rst | 1 + 3 files changed, 220 insertions(+), 4 deletions(-) create mode 100644 docs/source/contributor-guide/ffi.rst diff --git a/README.md b/README.md index 5aaf7f5f3..9c56b62dd 100644 --- a/README.md +++ b/README.md @@ -30,10 +30,8 @@ DataFusion's Python bindings can be used as a foundation for building new data s planning, and logical plan optimizations, and then transpiles the logical plan to Dask operations for execution. - [DataFusion Ballista](https://github.com/apache/datafusion-ballista) is a distributed SQL query engine that extends DataFusion's Python bindings for distributed use cases. - -It is also possible to use these Python bindings directly for DataFrame and SQL operations, but you may find that -[Polars](http://pola.rs/) and [DuckDB](http://www.duckdb.org/) are more suitable for this use case, since they have -more of an end-user focus and are more actively maintained than these Python bindings. +- [DataFusion Ray](https://github.com/apache/datafusion-ray) is another distributed query engine that uses + DataFusion's Python bindings. ## Features @@ -114,6 +112,11 @@ Printing the context will show the current configuration settings. print(ctx) ``` +## Extensions + +For information about how to extend DataFusion Python, please see the extensions page of the +[online documentation](https://datafusion.apache.org/python/). + ## More Examples See [examples](examples/README.md) for more information. diff --git a/docs/source/contributor-guide/ffi.rst b/docs/source/contributor-guide/ffi.rst new file mode 100644 index 000000000..c1f9806b3 --- /dev/null +++ b/docs/source/contributor-guide/ffi.rst @@ -0,0 +1,212 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. 
software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+Python Extensions
+=================
+
+The DataFusion in Python project is designed to allow users to extend its functionality in a few core
+areas. Ideally many users would like to package their extensions as a Python package and easily
+integrate that package with this project. This page serves to describe some of the challenges we face
+when doing these integrations and the approach our project uses.
+
+The Primary Issue
+-----------------
+
+Suppose you wish to use DataFusion and you have a custom data source that can produce tables that
+can then be queried against, similar to how you can register a :ref:`CSV ` or
+:ref:`Parquet ` file. In DataFusion terminology, you likely want to implement a
+:ref:`Custom Table Provider `. In an effort to make your data source
+as performant as possible and to utilize the features of DataFusion, you may decide to write
+your source in Rust and then expose it through `PyO3 `_ as a Python library.
+
+At first glance, it may appear the best way to do this is to add the ``datafusion-python``
+crate as a dependency, provide a ``PyTable``, and then to register it with the
+``SessionContext``. Unfortunately, this will not work.
+
+When you produce your code as a Python library and it needs to interact with the DataFusion
+library, at the lowest level they communicate through an Application Binary Interface (ABI).
+The acronym sounds similar to API (Application Programming Interface), but it is distinctly
+different.
+
+The ABI sets the standard for how these libraries can share data and functions between each
+other. One of the key differences between Rust and other programming languages is that Rust
+does not have a stable ABI. What this means in practice is that if you compile a Rust library
+with one version of the ``rustc`` compiler and I compile another library to interface with it
+but I use a different version of the compiler, there is no guarantee the interface will be
+the same.
+
+In practice, this means that a Python library built with ``datafusion-python`` as a Rust
+dependency will generally **not** be compatible with the DataFusion Python package, even
+if they reference the same version of ``datafusion-python``. If you attempt to do this, it may
+work on your local computer if you have built both packages with the same optimizations.
+This can sometimes lead to a false expectation that the code will work, but it frequently
+breaks the moment you try to use your package against the released packages.
+
+You can find more information about the Rust ABI in their
+`online documentation `_.
+
+The FFI Approach
+----------------
+
+Rust supports interacting with other programming languages through its Foreign Function
+Interface (FFI). The advantage of using the FFI is that it enables you to write data structures
+and functions that have a stable ABI. This allows you to use Rust code with C, Python, and
+other languages. In fact, the `PyO3 `_ library uses the FFI to share data
+and functions between Python and Rust.
+
+The approach we are taking in the DataFusion in Python project is to incrementally expose
+more portions of the DataFusion project via FFI interfaces. This allows users to write Rust
+code that does **not** require the ``datafusion-python`` crate as a dependency, expose their
+code in Python via PyO3, and have it interact with the DataFusion Python package.
+
+Early adopters of this approach include `delta-rs `_,
+which has adapted its Table Provider for use in ``datafusion-python`` with only a few lines
+of code. Also, the DataFusion Python project uses the existing definitions from the
+`Apache Arrow CStream Interface `_
+to support importing **and** exporting tables. Any Python package that supports reading
+the Arrow C Stream interface can work with DataFusion Python out of the box! You can read
+more about working with Arrow sources in the :ref:`Data Sources `
+page.
+
+To learn more about the Foreign Function Interface in Rust, the
+`Rustonomicon `_ is a good resource.
+
+Inspiration from Arrow
+----------------------
+
+DataFusion is built upon `Apache Arrow `_. The canonical Python
+Arrow implementation, `pyarrow `_, provides
+an excellent way to share Arrow data between Python projects without performing any copy
+operations on the data. They do this by using a well-defined set of interfaces. You can
+find the details about their stream interface
+`here `_. The
+`Rust Arrow Implementation `_ also supports these
+``C`` style definitions via the Foreign Function Interface.
+
+In addition to using these interfaces to transfer Arrow data between libraries, ``pyarrow``
+goes one step further to make sharing the interfaces easier in Python. They do this
+by exposing PyCapsules that contain the expected functionality.
+
+You can learn more about PyCapsules from the official
+`Python online documentation `_. PyCapsules
+have excellent support in PyO3 already. The
+`PyO3 online documentation `_ is a good source
+for more details on using PyCapsules in Rust.
+
+Two lessons we leverage from the Arrow project in DataFusion Python are:
+
+- We reuse the existing Arrow FFI functionality wherever possible.
+- We expose PyCapsules that contain an FFI-stable struct.
+
+Implementation Details
+----------------------
+
+The bulk of the code necessary to perform our FFI operations is in the upstream
+`DataFusion `_ core repository. You can review the code and
+documentation in the `datafusion-ffi`_ crate.
+
+Our FFI implementation is narrowly focused on sharing data and functions with Rust-backed
+libraries. This allows us to use the `abi_stable crate `_.
+This is an excellent crate that allows for easy conversion between Rust native types
+and FFI-safe alternatives. For example, if you needed to pass a ``Vec`` via FFI,
+you can simply convert it to an ``RVec`` in an intuitive manner. It also supports
+features like ``RResult`` and ``ROption`` that do not have an obvious translation to a
+C equivalent.
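+
+As a short illustration of the kind of conversions ``abi_stable`` enables, consider the
+following minimal sketch. The variable names and values here are purely hypothetical; the
+point is that moving between native and FFI-safe types is a plain ``From``/``Into`` call:
+
+.. code-block:: rust
+
+    use abi_stable::std_types::{ROption, RVec};
+
+    // FFI-safe equivalents of Vec and Option
+    let bytes: Vec<u8> = vec![1, 2, 3];
+    let ffi_bytes: RVec<u8> = bytes.into();
+
+    let maybe_rows: Option<usize> = Some(3);
+    let ffi_rows: ROption<usize> = maybe_rows.into();
+
+    // Converting back to the native types is just as direct.
+    let bytes_again: Vec<u8> = ffi_bytes.into_vec();
+    let rows_again: Option<usize> = ffi_rows.into();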
+
+The `datafusion-ffi`_ crate has been designed to make it easy to convert from DataFusion
+traits into their FFI counterparts. For example, if you have defined a custom
+`TableProvider `_
+and you want to create a sharable FFI counterpart, you could write:
+
+.. code-block:: rust
+
+    let my_provider = MyTableProvider::default();
+    let ffi_provider = FFI_TableProvider::new(Arc::new(my_provider), false, None);
+
+If you were interfacing with a library that provided the above ``FFI_TableProvider`` and
+you needed to turn it back into a ``TableProvider``, you can turn it into a
+``ForeignTableProvider``, which implements the ``TableProvider`` trait.
+
+.. code-block:: rust
+
+    let foreign_provider: ForeignTableProvider = ffi_provider.into();
+
+If you review the code in `datafusion-ffi`_ you will find that each of the traits we share
+across the boundary has two portions, one with a ``FFI_`` prefix and one with a ``Foreign``
+prefix. This is used to distinguish which side of the FFI boundary that struct is
+designed to be used on. The structures with the ``FFI_`` prefix are to be used by the
+**provider** of the structure. In the example we're showing, this means the code that has
+written the underlying ``TableProvider`` implementation to access your custom data source.
+The structures with the ``Foreign`` prefix are to be used by the receiver. In this case,
+it is the ``datafusion-python`` library.
+
+In order to share these FFI structures, we need to wrap them in some kind of Python object
+that can be used to interface from one package to another. As described in the above
+section on our inspiration from Arrow, we use ``PyCapsule``. We can create a ``PyCapsule``
+for our provider as follows:
+
+.. code-block:: rust
+
+    let name = CString::new("datafusion_table_provider")?;
+    let my_capsule = PyCapsule::new_bound(py, provider, Some(name))?;
+
+On the receiving side, to turn this ``PyCapsule`` object back into an ``FFI_TableProvider``,
+which can then be turned into a ``ForeignTableProvider``, the associated code is:
+
+.. code-block:: rust
+
+    let capsule = capsule.downcast::<PyCapsule>()?;
+    let provider = unsafe { capsule.reference::<FFI_TableProvider>() };
+
+By convention the ``datafusion-python`` library expects a Python object that has a
+``TableProvider`` PyCapsule to have this capsule accessible by calling a function named
+``__datafusion_table_provider__``. You can see a complete working example of how to
+share a ``TableProvider`` from one Python library to DataFusion Python in the
+`repository examples folder `_.
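+
+To sketch what satisfying that convention can look like on the provider side, suppose a
+hypothetical ``MyTableProviderClass`` pyclass wraps a concrete, cloneable ``TableProvider``
+in an ``inner`` field. A minimal, illustrative method (one possible shape, not the only
+one) could be:
+
+.. code-block:: rust
+
+    use std::ffi::CString;
+    use std::sync::Arc;
+
+    use datafusion_ffi::table_provider::FFI_TableProvider;
+    use pyo3::prelude::*;
+    use pyo3::types::PyCapsule;
+
+    #[pymethods]
+    impl MyTableProviderClass {
+        fn __datafusion_table_provider__<'py>(
+            &self,
+            py: Python<'py>,
+        ) -> PyResult<Bound<'py, PyCapsule>> {
+            let name = CString::new("datafusion_table_provider")?;
+            // Wrap the underlying TableProvider in its FFI-stable counterpart
+            // and hand it to Python inside a named PyCapsule.
+            let provider = FFI_TableProvider::new(Arc::new(self.inner.clone()), false, None);
+            PyCapsule::new_bound(py, provider, Some(name))
+        }
+    }
+
+Any Python object that exposes this method can then be handed to DataFusion Python, which
+reads the capsule back out by this name to reconstruct the provider on its side of the
+boundary.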
+
+This section has been written using ``TableProvider`` as an example. It is the first
+extension that has been written using this approach and the most thoroughly implemented.
+As we continue to expose more of the DataFusion features, we intend to follow this same
+design pattern.
+
+Alternative Approach
+--------------------
+
+Suppose you needed to expose some other features of DataFusion and you could not wait
+for the upstream repository to implement the FFI approach we describe. In this case
+you may decide to create a dependency on the ``datafusion-python`` crate instead.
+
+As we discussed, this is not guaranteed to work across different compiler versions and
+optimization levels. If you wish to go down this route, there are two approaches we
+have identified you can use.
+
+#. Re-export all of ``datafusion-python`` yourself with your extensions built in.
+#. Carefully synchronize your software releases with the ``datafusion-python`` CI build
+   system so that your libraries use the exact same compiler, features, and
+   optimization level.
+
+We currently do not recommend either of these approaches as they are difficult to
+maintain over a long period. Additionally, they require a tight version coupling
+between libraries.
+
+Status of Work
+--------------
+
+At the time of this writing, the FFI features are under active development. To see
+the latest status, we recommend reviewing the code in the `datafusion-ffi`_ crate.
+
+..
_datafusion-ffi: https://crates.io/crates/datafusion-ffi diff --git a/docs/source/index.rst b/docs/source/index.rst index 34eb23b28..558b2d572 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -85,6 +85,7 @@ Example :caption: CONTRIBUTOR GUIDE contributor-guide/introduction + contributor-guide/ffi .. _toc.api: .. toctree:: From 3f3983cc86ffe267cff97480241e8a588ac38fa3 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 23 Feb 2025 08:00:52 -0500 Subject: [PATCH 04/22] build(deps): bump arrow from 54.1.0 to 54.2.0 (#1035) Bumps [arrow](https://github.com/apache/arrow-rs) from 54.1.0 to 54.2.0. - [Release notes](https://github.com/apache/arrow-rs/releases) - [Changelog](https://github.com/apache/arrow-rs/blob/main/CHANGELOG-old.md) - [Commits](https://github.com/apache/arrow-rs/compare/54.1.0...54.2.0) --- updated-dependencies: - dependency-name: arrow dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 56 +++++++++++++++++++++++++++--------------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f1b1ed50a..d23ed6169 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -179,9 +179,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6422e12ac345a0678d7a17e316238e3a40547ae7f92052b77bd86d5e0239f3fc" +checksum = "755b6da235ac356a869393c23668c663720b8749dd6f15e52b6c214b4b964cc7" dependencies = [ "arrow-arith", "arrow-array", @@ -201,9 +201,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23cf34bb1f48c41d3475927bcc7be498665b8e80b379b88f62a840337f8b8248" +checksum = "64656a1e0b13ca766f8440752e9a93e11014eec7b67909986f83ed0ab1fe37b8" dependencies = [ "arrow-array", "arrow-buffer", @@ -215,9 +215,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb4a06d507f54b70a277be22a127c8ffe0cec6cd98c0ad8a48e77779bbda8223" +checksum = "57a4a6d2896083cfbdf84a71a863b22460d0708f8206a8373c52e326cc72ea1a" dependencies = [ "ahash", "arrow-buffer", @@ -232,9 +232,9 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d69d326d5ad1cb82dcefa9ede3fee8fdca98f9982756b16f9cb142f4aa6edc89" +checksum = "cef870583ce5e4f3b123c181706f2002fb134960f9a911900f64ba4830c7a43a" dependencies = [ "bytes", "half", @@ -243,9 +243,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "626e65bd42636a84a238bed49d09c8777e3d825bf81f5087a70111c2831d9870" +checksum = "1ac7eba5a987f8b4a7d9629206ba48e19a1991762795bbe5d08497b7736017ee" dependencies = [ "arrow-array", "arrow-buffer", @@ -264,9 +264,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "71c8f959f7a1389b1dbd883cdcd37c3ed12475329c111912f7f69dad8195d8c6" 
+checksum = "90f12542b8164398fc9ec595ff783c4cf6044daa89622c5a7201be920e4c0d4c" dependencies = [ "arrow-array", "arrow-cast", @@ -280,9 +280,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1858e7c7d01c44cf71c21a85534fd1a54501e8d60d1195d0d6fbcc00f4b10754" +checksum = "b095e8a4f3c309544935d53e04c3bfe4eea4e71c3de6fe0416d1f08bb4441a83" dependencies = [ "arrow-buffer", "arrow-schema", @@ -292,9 +292,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6bb3f727f049884c7603f0364bc9315363f356b59e9f605ea76541847e06a1e" +checksum = "65c63da4afedde2b25ef69825cd4663ca76f78f79ffe2d057695742099130ff6" dependencies = [ "arrow-array", "arrow-buffer", @@ -306,9 +306,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35de94f165ed8830aede72c35f238763794f0d49c69d30c44d49c9834267ff8c" +checksum = "9551d9400532f23a370cabbea1dc5a53c49230397d41f96c4c8eedf306199305" dependencies = [ "arrow-array", "arrow-buffer", @@ -326,9 +326,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8aa06e5f267dc53efbacb933485c79b6fc1685d3ffbe870a16ce4e696fb429da" +checksum = "6c07223476f8219d1ace8cd8d85fa18c4ebd8d945013f25ef5c72e85085ca4ee" dependencies = [ "arrow-array", "arrow-buffer", @@ -339,9 +339,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "66f1144bb456a2f9d82677bd3abcea019217e572fc8f07de5a7bac4b2c56eb2c" +checksum = "91b194b38bfd89feabc23e798238989c6648b2506ad639be42ec8eb1658d82c4" dependencies = [ "arrow-array", "arrow-buffer", @@ -352,18 +352,18 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "105f01ec0090259e9a33a9263ec18ff223ab91a0ea9fbc18042f7e38005142f6" +checksum = "0f40f6be8f78af1ab610db7d9b236e21d587b7168e368a36275d2e5670096735" dependencies = [ "bitflags 2.8.0", ] [[package]] name = "arrow-select" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f690752fdbd2dee278b5f1636fefad8f2f7134c85e20fd59c4199e15a39a6807" +checksum = "ac265273864a820c4a179fc67182ccc41ea9151b97024e1be956f0f2369c2539" dependencies = [ "ahash", "arrow-array", @@ -375,9 +375,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0fff9cd745a7039b66c47ecaf5954460f9fa12eed628f65170117ea93e64ee0" +checksum = "d44c8eed43be4ead49128370f7131f054839d3d6003e52aebf64322470b8fbd0" dependencies = [ "arrow-array", "arrow-buffer", From 69ebf70bd821d0ae516d2f61d96058e2252a7a1f Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Mon, 24 Feb 2025 21:30:52 +0100 Subject: [PATCH 05/22] Chore: Release datafusion-python 45 (#1024) * Bump version number to prepare for release * Add changelog 45.0.0 * Add deprecated marker from either typing or typing_extensions based on the python version * Limit pyarrow version per issue # 1023 * Bumping the version number to support new 
release candidate * There was no guarantee that the record batches would be returned in a single partition, so update the unit test to check all partitions. * Revert "Limit pyarrow version per issue # 1023" This reverts commit b48d5872661017ec21ea71f7dbb9569f2f0bf797. * Correct import for python 3.13 and above * Bump minor version due to pypi requirement * Update cargo lock --- Cargo.lock | 113 +++++++++++++-------------------- Cargo.toml | 2 +- dev/changelog/45.0.0.md | 42 ++++++++++++ python/datafusion/context.py | 5 +- python/datafusion/dataframe.py | 5 +- python/datafusion/expr.py | 6 +- python/datafusion/substrait.py | 5 +- python/tests/test_dataframe.py | 21 ++++-- 8 files changed, 118 insertions(+), 81 deletions(-) create mode 100644 dev/changelog/45.0.0.md diff --git a/Cargo.lock b/Cargo.lock index d23ed6169..5c7f2bf3c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -606,19 +606,18 @@ dependencies = [ [[package]] name = "bzip2" -version = "0.5.0" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bafdbf26611df8c14810e268ddceda071c297570a5fb360ceddf617fe417ef58" +checksum = "75b89e7c29231c673a61a46e722602bcd138298f6b9e81e71119693534585f5c" dependencies = [ "bzip2-sys", - "libc", ] [[package]] name = "bzip2-sys" -version = "0.1.11+1.0.8" +version = "0.1.12+1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" +checksum = "72ebc2f1a417f01e1da30ef264ee86ae31d2dcd2d603ea283d3c244a883ca2a9" dependencies = [ "cc", "libc", @@ -627,9 +626,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.12" +version = "1.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "755717a7de9ec452bf7f3f1a3099085deabd7f2962b861dae91ecd7a365903d2" +checksum = "0c3d1b2e905a3a7b00a6141adb0e4c0bb941d11caf55349d863942a1cc44e3c9" dependencies = [ "jobserver", "libc", @@ -684,21 +683,20 @@ dependencies = [ [[package]] name = "cmake" -version = "0.1.53" +version = "0.1.54" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e24a03c8b52922d68a1589ad61032f2c1aa5a8158d2aa0d93c6e9534944bbad6" +checksum = "e7caa3f9de89ddbe2c607f4101924c5abec803763ae9534e4f4d7d8f84aa81f0" dependencies = [ "cc", ] [[package]] name = "comfy-table" -version = "7.1.3" +version = "7.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24f165e7b643266ea80cb858aed492ad9280e3e05ce24d4a99d7d7b889b6a4d9" +checksum = "4a65ebfec4fb190b6f90e944a817d60499ee0744e582530e2c9900a22e591d9a" dependencies = [ - "strum", - "strum_macros", + "unicode-segmentation", "unicode-width", ] @@ -837,9 +835,9 @@ dependencies = [ [[package]] name = "csv-core" -version = "0.1.11" +version = "0.1.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70" +checksum = "7d02f3b0da4c6504f86e9cd789d8dbafab48c2321be74e9987593de5a894d93d" dependencies = [ "memchr", ] @@ -878,7 +876,7 @@ dependencies = [ "async-compression", "async-trait", "bytes", - "bzip2 0.5.0", + "bzip2 0.5.1", "chrono", "datafusion-catalog", "datafusion-common", @@ -1240,7 +1238,7 @@ dependencies = [ "itertools 0.14.0", "log", "paste", - "petgraph 0.7.1", + "petgraph", ] [[package]] @@ -1341,7 +1339,7 @@ dependencies = [ [[package]] name = "datafusion-python" -version = "44.0.0" +version = "45.2.0" dependencies = [ "arrow", "async-trait", @@ -1436,9 +1434,9 @@ checksum = 
"60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" [[package]] name = "equivalent" -version = "1.0.1" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" [[package]] name = "errno" @@ -1456,12 +1454,6 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" -[[package]] -name = "fixedbitset" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" - [[package]] name = "fixedbitset" version = "0.5.7" @@ -2269,9 +2261,9 @@ checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" [[package]] name = "miniz_oxide" -version = "0.8.3" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8402cab7aefae129c6977bb0ff1b8fd9a04eb5b51efc50a70bea51cda0c7924" +checksum = "b3b1c9bd4fe1f0f8b387f6eb9eb3b4a1aa26185e5750efb9140301703f62cd1b" dependencies = [ "adler2", ] @@ -2548,23 +2540,13 @@ version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" -[[package]] -name = "petgraph" -version = "0.6.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db" -dependencies = [ - "fixedbitset 0.4.2", - "indexmap", -] - [[package]] name = "petgraph" version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772" dependencies = [ - "fixedbitset 0.5.7", + "fixedbitset", "indexmap", ] @@ -2660,9 +2642,9 @@ dependencies = [ [[package]] name = "prost" -version = "0.13.4" +version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c0fef6c4230e4ccf618a35c59d7ede15dea37de8427500f50aff708806e42ec" +checksum = "2796faa41db3ec313a31f7624d9286acf277b52de526150b7e69f3debf891ee5" dependencies = [ "bytes", "prost-derive", @@ -2670,16 +2652,16 @@ dependencies = [ [[package]] name = "prost-build" -version = "0.13.4" +version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0f3e5beed80eb580c68e2c600937ac2c4eedabdfd5ef1e5b7ea4f3fba84497b" +checksum = "be769465445e8c1474e9c5dac2018218498557af32d9ed057325ec9a41ae81bf" dependencies = [ "heck", - "itertools 0.13.0", + "itertools 0.14.0", "log", "multimap", "once_cell", - "petgraph 0.6.5", + "petgraph", "prettyplease", "prost", "prost-types", @@ -2690,12 +2672,12 @@ dependencies = [ [[package]] name = "prost-derive" -version = "0.13.4" +version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "157c5a9d7ea5c2ed2d9fb8f495b64759f7816c7eaea54ba3978f0d63000162e3" +checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d" dependencies = [ "anyhow", - "itertools 0.13.0", + "itertools 0.14.0", "proc-macro2", "quote", "syn 2.0.98", @@ -2703,9 +2685,9 @@ dependencies = [ [[package]] name = "prost-types" -version = "0.13.4" +version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc2f1e56baa61e93533aebc21af4d2134b70f66275e0fcdf3cbe43d77ff7e8fc" +checksum = 
"52c2c1bf36ddb1a1c396b3601a3cec27c2462e45f07c386894ec3ccf5332bd16" dependencies = [ "prost", ] @@ -2721,9 +2703,9 @@ dependencies = [ [[package]] name = "psm" -version = "0.1.24" +version = "0.1.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "200b9ff220857e53e184257720a14553b2f4aa02577d2ed9842d45d4b9654810" +checksum = "f58e5423e24c18cc840e1c98370b3993c6649cd1678b4d24318bcf0a083cbe88" dependencies = [ "cc", ] @@ -2860,9 +2842,9 @@ dependencies = [ [[package]] name = "quinn-udp" -version = "0.5.9" +version = "0.5.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c40286217b4ba3a71d644d752e6a0b71f13f1b6a2c5311acfcbe0c2418ed904" +checksum = "e46f3055866785f6b92bc6164b76be02ca8f2eb4b002c0354b28cf4c119e5944" dependencies = [ "cfg_aliases", "libc", @@ -3042,15 +3024,14 @@ dependencies = [ [[package]] name = "ring" -version = "0.17.8" +version = "0.17.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c17fa4cb658e3583423e915b9f3acc01cceaee1860e33d59ebae66adc3a2dc0d" +checksum = "e75ec5e92c4d8aede845126adc388046234541629e76029599ed35a003c7ed24" dependencies = [ "cc", "cfg-if", "getrandom 0.2.15", "libc", - "spin", "untrusted", "windows-sys 0.52.0", ] @@ -3097,9 +3078,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.22" +version = "0.23.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fb9263ab4eb695e42321db096e3b8fbd715a59b154d5c88d82db2175b681ba7" +checksum = "47796c98c480fce5406ef69d1c76378375492c3b0a0de587be0c1d9feb12f395" dependencies = [ "once_cell", "ring", @@ -3377,9 +3358,9 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.13.2" +version = "1.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" +checksum = "7fcf8323ef1faaee30a44a340193b1ac6814fd9b7b4e88e9d4519a3e4abe1cfd" [[package]] name = "snafu" @@ -3418,12 +3399,6 @@ dependencies = [ "windows-sys 0.52.0", ] -[[package]] -name = "spin" -version = "0.9.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" - [[package]] name = "sqlparser" version = "0.53.0" @@ -3453,9 +3428,9 @@ checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" [[package]] name = "stacker" -version = "0.1.17" +version = "0.1.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "799c883d55abdb5e98af1a7b3f23b9b6de8ecada0ecac058672d7635eb48ca7b" +checksum = "1d08feb8f695b465baed819b03c128dc23f57a694510ab1f06c77f763975685e" dependencies = [ "cc", "cfg-if", diff --git a/Cargo.toml b/Cargo.toml index d18e0e8f0..5358b1836 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "datafusion-python" -version = "44.0.0" +version = "45.2.0" homepage = "https://datafusion.apache.org/python" repository = "https://github.com/apache/datafusion-python" authors = ["Apache DataFusion "] diff --git a/dev/changelog/45.0.0.md b/dev/changelog/45.0.0.md new file mode 100644 index 000000000..93659b171 --- /dev/null +++ b/dev/changelog/45.0.0.md @@ -0,0 +1,42 @@ + + +# Apache DataFusion Python 45.0.0 Changelog + +This release consists of 2 commits from 2 contributors. See credits at the end of this changelog for more information. 
+ +**Fixed bugs:** + +- fix: add to_timestamp_nanos [#1020](https://github.com/apache/datafusion-python/pull/1020) (chenkovsky) + +**Other:** + +- Chore/upgrade datafusion 45 [#1010](https://github.com/apache/datafusion-python/pull/1010) (kevinjqliu) + +## Credits + +Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor. + +``` + 1 Kevin Liu + 1 Tim Saucer +``` + +Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release. + diff --git a/python/datafusion/context.py b/python/datafusion/context.py index 864ef1c8b..21955b6d1 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -21,7 +21,10 @@ from typing import TYPE_CHECKING, Any, Protocol -from typing_extensions import deprecated +try: + from warnings import deprecated # Python 3.13+ +except ImportError: + from typing_extensions import deprecated # Python 3.12 from datafusion.catalog import Catalog, Table from datafusion.dataframe import DataFrame diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 7413a5fa3..23b5d630b 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -33,7 +33,10 @@ overload, ) -from typing_extensions import deprecated +try: + from warnings import deprecated # Python 3.13+ +except ImportError: + from typing_extensions import deprecated # Python 3.12 from datafusion.plan import ExecutionPlan, LogicalPlan from datafusion.record_batch import RecordBatchStream diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index 68ddd7c9a..e3d7158eb 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -25,7 +25,11 @@ from typing import TYPE_CHECKING, Any, Optional, Type import pyarrow as pa -from typing_extensions import deprecated + +try: + from warnings import deprecated # Python 3.13+ +except ImportError: + from typing_extensions import deprecated # Python 3.12 from datafusion.common import DataTypeMap, NullTreatment, RexType diff --git a/python/datafusion/substrait.py b/python/datafusion/substrait.py index 402184d3f..06302fe38 100644 --- a/python/datafusion/substrait.py +++ b/python/datafusion/substrait.py @@ -26,7 +26,10 @@ import pathlib from typing import TYPE_CHECKING -from typing_extensions import deprecated +try: + from warnings import deprecated # Python 3.13+ +except ImportError: + from typing_extensions import deprecated # Python 3.12 from datafusion.plan import LogicalPlan diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 5bc3fb094..c636e896a 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -755,13 +755,20 @@ def test_execution_plan(aggregate_df): assert "CsvExec:" in indent ctx = SessionContext() - stream = ctx.execute(plan, 0) - # get the one and only batch - batch = stream.next() - assert batch is not None - # there should be no more batches - with pytest.raises(StopIteration): - stream.next() + rows_returned = 0 + for idx in range(0, plan.partition_count): + stream = ctx.execute(plan, idx) + try: + batch = stream.next() + assert batch is not None + rows_returned += len(batch.to_pyarrow()[0]) + except StopIteration: + # This is one of the partitions with no values + pass + with pytest.raises(StopIteration): + stream.next() + + assert rows_returned == 5 def test_repartition(df): From a80a788f69cf46ef002b3c537837548cc103748c Mon Sep 17 00:00:00 2001 From: kosiew Date: Sat, 8 Mar 2025 21:22:36 +0800 
Subject: [PATCH 06/22] Enable Dataframe to be converted into views which can be used in register_table (#1016) * add test_view * feat: add into_view method to register DataFrame as a view * add pytableprovider * feat: add as_table method to PyTableProvider and update into_view to return PyTable * refactor: simplify as_table method and update documentation for into_view * test: improve test_register_filtered_dataframe by removing redundant comments and assertions * test: enhance test_register_filtered_dataframe with additional assertions for DataFrame results * ruff formatted * cleanup: remove unused imports from test_view.py * docs: add example for registering a DataFrame as a view in README.md * docs: update docstring for into_view method to clarify usage as ViewTable * chore: add license header to test_view.py * ruff correction * refactor: rename into_view method to _into_view * ruff lint * refactor: simplify into_view method and update Rust binding convention * docs: add views section to user guide with example on registering views * feat: add register_view method to SessionContext for DataFrame registration * docs: update README and user guide to reflect register_view method for DataFrame registration * docs: remove some documentation from PyDataFrame --- README.md | 40 +++++++++++++ .../user-guide/common-operations/index.rst | 1 + .../user-guide/common-operations/views.rst | 58 +++++++++++++++++++ python/datafusion/context.py | 12 ++++ python/datafusion/dataframe.py | 4 ++ python/tests/test_view.py | 49 ++++++++++++++++ src/dataframe.rs | 39 +++++++++++++ 7 files changed, 203 insertions(+) create mode 100644 docs/source/user-guide/common-operations/views.rst create mode 100644 python/tests/test_view.py diff --git a/README.md b/README.md index 9c56b62dd..4f80dbe18 100644 --- a/README.md +++ b/README.md @@ -79,6 +79,46 @@ This produces the following chart: ![Chart](examples/chart.png) +## Registering a DataFrame as a View + +You can use SessionContext's `register_view` method to convert a DataFrame into a view and register it with the context. + +```python +from datafusion import SessionContext, col, literal + +# Create a DataFusion context +ctx = SessionContext() + +# Create sample data +data = {"a": [1, 2, 3, 4, 5], "b": [10, 20, 30, 40, 50]} + +# Create a DataFrame from the dictionary +df = ctx.from_pydict(data, "my_table") + +# Filter the DataFrame (for example, keep rows where a > 2) +df_filtered = df.filter(col("a") > literal(2)) + +# Register the dataframe as a view with the context +ctx.register_view("view1", df_filtered) + +# Now run a SQL query against the registered view +df_view = ctx.sql("SELECT * FROM view1") + +# Collect the results +results = df_view.collect() + +# Convert results to a list of dictionaries for display +result_dicts = [batch.to_pydict() for batch in results] + +print(result_dicts) +``` + +This will output: + +```python +[{'a': [3, 4, 5], 'b': [30, 40, 50]}] +``` + ## Configuration It is possible to configure runtime (memory and disk settings) and configuration settings when creating a context. diff --git a/docs/source/user-guide/common-operations/index.rst b/docs/source/user-guide/common-operations/index.rst index d7c708c21..7abd1f138 100644 --- a/docs/source/user-guide/common-operations/index.rst +++ b/docs/source/user-guide/common-operations/index.rst @@ -23,6 +23,7 @@ The contents of this section are designed to guide a new user through how to use .. 
toctree::
    :maxdepth: 2
 
+   views
    basic-info
    select-and-filter
    expressions
diff --git a/docs/source/user-guide/common-operations/views.rst b/docs/source/user-guide/common-operations/views.rst
new file mode 100644
index 000000000..df11e3abe
--- /dev/null
+++ b/docs/source/user-guide/common-operations/views.rst
@@ -0,0 +1,58 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+======================
+Registering Views
+======================
+
+You can use the context's ``register_view`` method to register a DataFrame as a view:
+
+.. code-block:: python
+
+    from datafusion import SessionContext, col, literal
+
+    # Create a DataFusion context
+    ctx = SessionContext()
+
+    # Create sample data
+    data = {"a": [1, 2, 3, 4, 5], "b": [10, 20, 30, 40, 50]}
+
+    # Create a DataFrame from the dictionary
+    df = ctx.from_pydict(data, "my_table")
+
+    # Filter the DataFrame (for example, keep rows where a > 2)
+    df_filtered = df.filter(col("a") > literal(2))
+
+    # Register the dataframe as a view with the context
+    ctx.register_view("view1", df_filtered)
+
+    # Now run a SQL query against the registered view
+    df_view = ctx.sql("SELECT * FROM view1")
+
+    # Collect the results
+    results = df_view.collect()
+
+    # Convert results to a list of dictionaries for display
+    result_dicts = [batch.to_pydict() for batch in results]
+
+    print(result_dicts)
+
+This will output:
+
+.. code-block:: python
+
+    [{'a': [3, 4, 5], 'b': [30, 40, 50]}]
diff --git a/python/datafusion/context.py b/python/datafusion/context.py
index 21955b6d1..befc4dce6 100644
--- a/python/datafusion/context.py
+++ b/python/datafusion/context.py
@@ -707,6 +707,18 @@ def from_polars(self, data: polars.DataFrame, name: str | None = None) -> DataFr
         """
         return DataFrame(self.ctx.from_polars(data, name))
 
+    # https://github.com/apache/datafusion-python/pull/1016#discussion_r1983239116
+    # is the discussion on how we arrived at adding register_view
+    def register_view(self, name: str, df: DataFrame) -> None:
+        """Register a :py:class:`~datafusion.dataframe.DataFrame` as a view.
+
+        Args:
+            name (str): The name to register the view under.
+            df (DataFrame): The DataFrame to be converted into a view and registered.
+        """
+        view = df.into_view()
+        self.ctx.register_table(name, view)
+
     def register_table(self, name: str, table: Table) -> None:
         """Register a :py:class: `~datafusion.catalog.Table` as a table.
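Taken together, ``register_view`` and the ``into_view`` method below give the following end-to-end flow. This is a minimal sketch, not part of the patch itself, using only the public API introduced above:

```python
from datafusion import SessionContext, col, literal

ctx = SessionContext()
df = ctx.from_pydict({"a": [1, 2, 3, 4, 5]}, "my_table")

# register_view() converts the DataFrame into a view table via into_view()
# and registers it, so the filter below runs whenever the view is queried.
ctx.register_view("big_a", df.filter(col("a") > literal(3)))

batches = ctx.sql("SELECT a FROM big_a").collect()
assert batches[0].column(0).to_pylist() == [4, 5]
```
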
diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 23b5d630b..85a179ec9 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -124,6 +124,10 @@ def __init__(self, df: DataFrameInternal) -> None: """ self.df = df + def into_view(self) -> pa.Table: + """Convert DataFrame as a ViewTable which can be used in register_table.""" + return self.df.into_view() + def __getitem__(self, key: str | List[str]) -> DataFrame: """Return a new :py:class`DataFrame` with the specified column or columns. diff --git a/python/tests/test_view.py b/python/tests/test_view.py new file mode 100644 index 000000000..1d92cc0d4 --- /dev/null +++ b/python/tests/test_view.py @@ -0,0 +1,49 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +from datafusion import SessionContext, col, literal + + +def test_register_filtered_dataframe(): + ctx = SessionContext() + + data = {"a": [1, 2, 3, 4, 5], "b": [10, 20, 30, 40, 50]} + + df = ctx.from_pydict(data, "my_table") + + df_filtered = df.filter(col("a") > literal(2)) + + ctx.register_view("view1", df_filtered) + + df_view = ctx.sql("SELECT * FROM view1") + + filtered_results = df_view.collect() + + result_dicts = [batch.to_pydict() for batch in filtered_results] + + expected_results = [{"a": [3, 4, 5], "b": [30, 40, 50]}] + + assert result_dicts == expected_results + + df_results = df.collect() + + df_result_dicts = [batch.to_pydict() for batch in df_results] + + expected_df_results = [{"a": [1, 2, 3, 4, 5], "b": [10, 20, 30, 40, 50]}] + + assert df_result_dicts == expected_df_results diff --git a/src/dataframe.rs b/src/dataframe.rs index ed9578a71..243e2e14f 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -30,6 +30,7 @@ use datafusion::arrow::util::pretty; use datafusion::common::UnnestOptions; use datafusion::config::{CsvOptions, TableParquetOptions}; use datafusion::dataframe::{DataFrame, DataFrameWriteOptions}; +use datafusion::datasource::TableProvider; use datafusion::execution::SendableRecordBatchStream; use datafusion::parquet::basic::{BrotliLevel, Compression, GzipLevel, ZstdLevel}; use datafusion::prelude::*; @@ -39,6 +40,7 @@ use pyo3::pybacked::PyBackedStr; use pyo3::types::{PyCapsule, PyTuple, PyTupleMethods}; use tokio::task::JoinHandle; +use crate::catalog::PyTable; use crate::errors::{py_datafusion_err, PyDataFusionError}; use crate::expr::sort_expr::to_sort_expressions; use crate::physical_plan::PyExecutionPlan; @@ -50,6 +52,25 @@ use crate::{ expr::{sort_expr::PySortExpr, PyExpr}, }; +// https://github.com/apache/datafusion-python/pull/1016#discussion_r1983239116 +// - we have not decided on the table_provider approach yet +// this is an interim implementation +#[pyclass(name = "TableProvider", module = "datafusion")] +pub struct PyTableProvider { + 
provider: Arc<dyn TableProvider>,
+}
+
+impl PyTableProvider {
+    pub fn new(provider: Arc<dyn TableProvider>) -> Self {
+        Self { provider }
+    }
+
+    pub fn as_table(&self) -> PyTable {
+        let table_provider: Arc<dyn TableProvider> = self.provider.clone();
+        PyTable::new(table_provider)
+    }
+}
+
 /// A PyDataFrame is a representation of a logical plan and an API to compose statements.
 /// Use it to build a plan and `.collect()` to execute the plan and collect the result.
 /// The actual execution of a plan runs natively on Rust and Arrow on a multi-threaded environment.
@@ -156,6 +177,24 @@ impl PyDataFrame {
         PyArrowType(self.df.schema().into())
     }
 
+    /// Convert this DataFrame into a Table that can be used in register_table.
+    /// By convention, into_... methods consume self and return the new object.
+    /// Disabling the clippy lint, so we can use &self, because we are working
+    /// with Python bindings where objects are shared.
+    /// https://github.com/apache/datafusion-python/pull/1016#discussion_r1983239116
+    /// - we have not decided on the table_provider approach yet
+    #[allow(clippy::wrong_self_convention)]
+    fn into_view(&self) -> PyDataFusionResult<PyTable> {
+        // Call the underlying Rust DataFrame::into_view method.
+        // Note that the Rust method consumes self; here we clone the inner Arc
+        // so that we don’t invalidate this PyDataFrame.
+        let table_provider = self.df.as_ref().clone().into_view();
+        let table_provider = PyTableProvider::new(table_provider);
+
+        Ok(table_provider.as_table())
+    }
+
     #[pyo3(signature = (*args))]
     fn select_columns(&self, args: Vec<PyBackedStr>) -> PyDataFusionResult<Self> {
         let args = args.iter().map(|s| s.as_ref()).collect::<Vec<&str>>();

From 9027b4d79fdd7a41dd9c1f25c2ecebc1fabf50f2 Mon Sep 17 00:00:00 2001
From: Chen Chongchen
Date: Sat, 8 Mar 2025 21:24:02 +0800
Subject: [PATCH 07/22] fix: type checking (#993)

* fix: type checking

* update license

* format

* format

* update catalog

* revert type annotation

* format

* format

* update

---
 python/datafusion/catalog.py        |  5 +++--
 python/datafusion/context.py        | 19 ++++++++++++------
 python/datafusion/dataframe.py      |  3 ++-
 python/datafusion/expr.py           |  8 ++++----
 python/datafusion/functions.py      | 10 +++++++---
 python/datafusion/input/location.py | 10 +++++-----
 python/datafusion/udf.py            |  7 ++++---
 python/tests/test_functions.py      | 30 +++++++++++++++++++++++++++++
 8 files changed, 68 insertions(+), 24 deletions(-)

diff --git a/python/datafusion/catalog.py b/python/datafusion/catalog.py
index 703037665..0560f4704 100644
--- a/python/datafusion/catalog.py
+++ b/python/datafusion/catalog.py
@@ -66,11 +66,12 @@ def __init__(self, table: df_internal.Table) -> None:
         """This constructor is not typically called by the end user."""
         self.table = table
 
+    @property
     def schema(self) -> pyarrow.Schema:
         """Returns the schema associated with this table."""
-        return self.table.schema()
+        return self.table.schema
 
     @property
     def kind(self) -> str:
         """Returns the kind of table."""
-        return self.table.kind()
+        return self.table.kind
diff --git a/python/datafusion/context.py b/python/datafusion/context.py
index befc4dce6..282b2a477 100644
--- a/python/datafusion/context.py
+++ b/python/datafusion/context.py
@@ -728,7 +728,7 @@ def register_table(self, name: str, table: Table) -> None:
             name: Name of the resultant table.
             table: DataFusion table to add to the session context.
""" - self.ctx.register_table(name, table) + self.ctx.register_table(name, table.table) def deregister_table(self, name: str) -> None: """Remove a table from the session.""" @@ -767,7 +767,7 @@ def register_parquet( file_extension: str = ".parquet", skip_metadata: bool = True, schema: pyarrow.Schema | None = None, - file_sort_order: list[list[Expr]] | None = None, + file_sort_order: list[list[SortExpr]] | None = None, ) -> None: """Register a Parquet file as a table. @@ -798,7 +798,9 @@ def register_parquet( file_extension, skip_metadata, schema, - file_sort_order, + [sort_list_to_raw_sort_list(exprs) for exprs in file_sort_order] + if file_sort_order is not None + else None, ) def register_csv( @@ -934,7 +936,7 @@ def register_udwf(self, udwf: WindowUDF) -> None: def catalog(self, name: str = "datafusion") -> Catalog: """Retrieve a catalog by name.""" - return self.ctx.catalog(name) + return Catalog(self.ctx.catalog(name)) @deprecated( "Use the catalog provider interface ``SessionContext.Catalog`` to " @@ -1054,7 +1056,7 @@ def read_parquet( file_extension: str = ".parquet", skip_metadata: bool = True, schema: pyarrow.Schema | None = None, - file_sort_order: list[list[Expr]] | None = None, + file_sort_order: list[list[Expr | SortExpr]] | None = None, ) -> DataFrame: """Read a Parquet source into a :py:class:`~datafusion.dataframe.Dataframe`. @@ -1078,6 +1080,11 @@ def read_parquet( """ if table_partition_cols is None: table_partition_cols = [] + file_sort_order = ( + [sort_list_to_raw_sort_list(f) for f in file_sort_order] + if file_sort_order is not None + else None + ) return DataFrame( self.ctx.read_parquet( str(path), @@ -1121,7 +1128,7 @@ def read_table(self, table: Table) -> DataFrame: :py:class:`~datafusion.catalog.ListingTable`, create a :py:class:`~datafusion.dataframe.DataFrame`. 
""" - return DataFrame(self.ctx.read_table(table)) + return DataFrame(self.ctx.read_table(table.table)) def execute(self, plan: ExecutionPlan, partitions: int) -> RecordBatchStream: """Execute the ``plan`` and return the results.""" diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 85a179ec9..de5d8376e 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -52,6 +52,7 @@ from enum import Enum from datafusion._internal import DataFrame as DataFrameInternal +from datafusion._internal import expr as expr_internal from datafusion.expr import Expr, SortExpr, sort_or_default @@ -277,7 +278,7 @@ def with_columns( def _simplify_expression( *exprs: Expr | Iterable[Expr], **named_exprs: Expr - ) -> list[Expr]: + ) -> list[expr_internal.Expr]: expr_list = [] for expr in exprs: if isinstance(expr, Expr): diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index e3d7158eb..3639abec6 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -176,7 +176,7 @@ def sort_or_default(e: Expr | SortExpr) -> expr_internal.SortExpr: """Helper function to return a default Sort if an Expr is provided.""" if isinstance(e, SortExpr): return e.raw_sort - return SortExpr(e.expr, True, True).raw_sort + return SortExpr(e, True, True).raw_sort def sort_list_to_raw_sort_list( @@ -231,7 +231,7 @@ def variant_name(self) -> str: def __richcmp__(self, other: Expr, op: int) -> Expr: """Comparison operator.""" - return Expr(self.expr.__richcmp__(other, op)) + return Expr(self.expr.__richcmp__(other.expr, op)) def __repr__(self) -> str: """Generate a string representation of this expression.""" @@ -417,7 +417,7 @@ def sort(self, ascending: bool = True, nulls_first: bool = True) -> SortExpr: ascending: If true, sort in ascending order. nulls_first: Return null values first. """ - return SortExpr(self.expr, ascending=ascending, nulls_first=nulls_first) + return SortExpr(self, ascending=ascending, nulls_first=nulls_first) def is_null(self) -> Expr: """Returns ``True`` if this expression is null.""" @@ -789,7 +789,7 @@ class SortExpr: def __init__(self, expr: Expr, ascending: bool, nulls_first: bool) -> None: """This constructor should not be called by the end user.""" - self.raw_sort = expr_internal.SortExpr(expr, ascending, nulls_first) + self.raw_sort = expr_internal.SortExpr(expr.expr, ascending, nulls_first) def expr(self) -> Expr: """Return the raw expr backing the SortExpr.""" diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 5c260aade..b449c4868 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -366,7 +366,7 @@ def concat_ws(separator: str, *args: Expr) -> Expr: def order_by(expr: Expr, ascending: bool = True, nulls_first: bool = True) -> SortExpr: """Creates a new sort expression.""" - return SortExpr(expr.expr, ascending=ascending, nulls_first=nulls_first) + return SortExpr(expr, ascending=ascending, nulls_first=nulls_first) def alias(expr: Expr, name: str) -> Expr: @@ -942,6 +942,7 @@ def to_timestamp_millis(arg: Expr, *formatters: Expr) -> Expr: See :py:func:`to_timestamp` for a description on how to use formatters. """ + formatters = [f.expr for f in formatters] return Expr(f.to_timestamp_millis(arg.expr, *formatters)) @@ -950,6 +951,7 @@ def to_timestamp_micros(arg: Expr, *formatters: Expr) -> Expr: See :py:func:`to_timestamp` for a description on how to use formatters. 
""" + formatters = [f.expr for f in formatters] return Expr(f.to_timestamp_micros(arg.expr, *formatters)) @@ -958,6 +960,7 @@ def to_timestamp_nanos(arg: Expr, *formatters: Expr) -> Expr: See :py:func:`to_timestamp` for a description on how to use formatters. """ + formatters = [f.expr for f in formatters] return Expr(f.to_timestamp_nanos(arg.expr, *formatters)) @@ -966,6 +969,7 @@ def to_timestamp_seconds(arg: Expr, *formatters: Expr) -> Expr: See :py:func:`to_timestamp` for a description on how to use formatters. """ + formatters = [f.expr for f in formatters] return Expr(f.to_timestamp_seconds(arg.expr, *formatters)) @@ -1078,9 +1082,9 @@ def range(start: Expr, stop: Expr, step: Expr) -> Expr: return Expr(f.range(start.expr, stop.expr, step.expr)) -def uuid(arg: Expr) -> Expr: +def uuid() -> Expr: """Returns uuid v4 as a string value.""" - return Expr(f.uuid(arg.expr)) + return Expr(f.uuid()) def struct(*args: Expr) -> Expr: diff --git a/python/datafusion/input/location.py b/python/datafusion/input/location.py index a8252b53c..517cd1578 100644 --- a/python/datafusion/input/location.py +++ b/python/datafusion/input/location.py @@ -37,12 +37,12 @@ def is_correct_input(self, input_item: Any, table_name: str, **kwargs): def build_table( self, - input_file: str, + input_item: str, table_name: str, **kwargs, ) -> SqlTable: """Create a table from the input source.""" - _, extension = os.path.splitext(input_file) + _, extension = os.path.splitext(input_item) format = extension.lstrip(".").lower() num_rows = 0 # Total number of rows in the file. Used for statistics columns = [] @@ -50,7 +50,7 @@ def build_table( import pyarrow.parquet as pq # Read the Parquet metadata - metadata = pq.read_metadata(input_file) + metadata = pq.read_metadata(input_item) num_rows = metadata.num_rows # Iterate through the schema and build the SqlTable for col in metadata.schema: @@ -69,7 +69,7 @@ def build_table( # to get that information. However, this should only be occurring # at table creation time and therefore shouldn't # slow down query performance. - with open(input_file, "r") as file: + with open(input_item, "r") as file: reader = csv.reader(file) header_row = next(reader) print(header_row) @@ -84,6 +84,6 @@ def build_table( ) # Input could possibly be multiple files. 
Create a list if so - input_files = glob.glob(input_file) + input_files = glob.glob(input_item) return SqlTable(table_name, columns, num_rows, input_files) diff --git a/python/datafusion/udf.py b/python/datafusion/udf.py index c97f453d0..0bba3d723 100644 --- a/python/datafusion/udf.py +++ b/python/datafusion/udf.py @@ -85,7 +85,7 @@ class ScalarUDF: def __init__( self, - name: Optional[str], + name: str, func: Callable[..., _R], input_types: pyarrow.DataType | list[pyarrow.DataType], return_type: _R, @@ -182,7 +182,7 @@ class AggregateUDF: def __init__( self, - name: Optional[str], + name: str, accumulator: Callable[[], Accumulator], input_types: list[pyarrow.DataType], return_type: pyarrow.DataType, @@ -277,6 +277,7 @@ def sum_bias_10() -> Summarize: ) if name is None: name = accum.__call__().__class__.__qualname__.lower() + assert name is not None if isinstance(input_types, pyarrow.DataType): input_types = [input_types] return AggregateUDF( @@ -462,7 +463,7 @@ class WindowUDF: def __init__( self, - name: Optional[str], + name: str, func: Callable[[], WindowEvaluator], input_types: list[pyarrow.DataType], return_type: pyarrow.DataType, diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index b1a739b49..fca05bb8f 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -871,7 +871,22 @@ def test_temporal_functions(df): f.to_timestamp_millis(literal("2023-09-07 05:06:14.523952")), f.to_timestamp_micros(literal("2023-09-07 05:06:14.523952")), f.extract(literal("day"), column("d")), + f.to_timestamp( + literal("2023-09-07 05:06:14.523952000"), literal("%Y-%m-%d %H:%M:%S.%f") + ), + f.to_timestamp_seconds( + literal("2023-09-07 05:06:14.523952000"), literal("%Y-%m-%d %H:%M:%S.%f") + ), + f.to_timestamp_millis( + literal("2023-09-07 05:06:14.523952000"), literal("%Y-%m-%d %H:%M:%S.%f") + ), + f.to_timestamp_micros( + literal("2023-09-07 05:06:14.523952000"), literal("%Y-%m-%d %H:%M:%S.%f") + ), f.to_timestamp_nanos(literal("2023-09-07 05:06:14.523952")), + f.to_timestamp_nanos( + literal("2023-09-07 05:06:14.523952000"), literal("%Y-%m-%d %H:%M:%S.%f") + ), ) result = df.collect() assert len(result) == 1 @@ -913,6 +928,21 @@ def test_temporal_functions(df): assert result.column(11) == pa.array( [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("ns") ) + assert result.column(12) == pa.array( + [datetime(2023, 9, 7, 5, 6, 14)] * 3, type=pa.timestamp("s") + ) + assert result.column(13) == pa.array( + [datetime(2023, 9, 7, 5, 6, 14, 523000)] * 3, type=pa.timestamp("ms") + ) + assert result.column(14) == pa.array( + [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("us") + ) + assert result.column(15) == pa.array( + [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("ns") + ) + assert result.column(16) == pa.array( + [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("ns") + ) def test_arrow_cast(df): From acd70409f73f299a144e7ff4115c6e6035c3ffb5 Mon Sep 17 00:00:00 2001 From: Ion Koutsouris <15728914+ion-elgreco@users.noreply.github.com> Date: Sat, 8 Mar 2025 16:37:10 +0100 Subject: [PATCH 08/22] feat: reads using global ctx (#982) * feat: reads using global ctx * Add text to io methods to describe the context they are using --------- Co-authored-by: Tim Saucer --- python/datafusion/__init__.py | 5 + python/datafusion/io.py | 199 ++++++++++++++++++++++++++ python/tests/test_io.py | 95 ++++++++++++ python/tests/test_wrapper_coverage.py | 2 + src/context.rs | 12 +- src/utils.rs | 8 ++ 6 files 
changed, 319 insertions(+), 2 deletions(-) create mode 100644 python/datafusion/io.py create mode 100644 python/tests/test_io.py diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index 85aefcce7..f11ce54a6 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -45,6 +45,7 @@ Expr, WindowFrame, ) +from .io import read_avro, read_csv, read_json, read_parquet from .plan import ExecutionPlan, LogicalPlan from .record_batch import RecordBatch, RecordBatchStream from .udf import Accumulator, AggregateUDF, ScalarUDF, WindowUDF @@ -81,6 +82,10 @@ "functions", "object_store", "substrait", + "read_parquet", + "read_avro", + "read_csv", + "read_json", ] diff --git a/python/datafusion/io.py b/python/datafusion/io.py new file mode 100644 index 000000000..7f3b77efa --- /dev/null +++ b/python/datafusion/io.py @@ -0,0 +1,199 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""IO read functions using global context.""" + +import pathlib + +import pyarrow + +from datafusion.dataframe import DataFrame +from datafusion.expr import Expr + +from ._internal import SessionContext as SessionContextInternal + + +def read_parquet( + path: str | pathlib.Path, + table_partition_cols: list[tuple[str, str]] | None = None, + parquet_pruning: bool = True, + file_extension: str = ".parquet", + skip_metadata: bool = True, + schema: pyarrow.Schema | None = None, + file_sort_order: list[list[Expr]] | None = None, +) -> DataFrame: + """Read a Parquet source into a :py:class:`~datafusion.dataframe.Dataframe`. + + This function will use the global context. Any functions or tables registered + with another context may not be accessible when used with a DataFrame created + using this function. + + Args: + path: Path to the Parquet file. + table_partition_cols: Partition columns. + parquet_pruning: Whether the parquet reader should use the predicate + to prune row groups. + file_extension: File extension; only files with this extension are + selected for data input. + skip_metadata: Whether the parquet reader should skip any metadata + that may be in the file schema. This can help avoid schema + conflicts due to metadata. + schema: An optional schema representing the parquet files. If None, + the parquet reader will try to infer it based on data in the + file. + file_sort_order: Sort order for the file. 
+ + Returns: + DataFrame representation of the read Parquet files + """ + if table_partition_cols is None: + table_partition_cols = [] + return DataFrame( + SessionContextInternal._global_ctx().read_parquet( + str(path), + table_partition_cols, + parquet_pruning, + file_extension, + skip_metadata, + schema, + file_sort_order, + ) + ) + + +def read_json( + path: str | pathlib.Path, + schema: pyarrow.Schema | None = None, + schema_infer_max_records: int = 1000, + file_extension: str = ".json", + table_partition_cols: list[tuple[str, str]] | None = None, + file_compression_type: str | None = None, +) -> DataFrame: + """Read a line-delimited JSON data source. + + This function will use the global context. Any functions or tables registered + with another context may not be accessible when used with a DataFrame created + using this function. + + Args: + path: Path to the JSON file. + schema: The data source schema. + schema_infer_max_records: Maximum number of rows to read from JSON + files for schema inference if needed. + file_extension: File extension; only files with this extension are + selected for data input. + table_partition_cols: Partition columns. + file_compression_type: File compression type. + + Returns: + DataFrame representation of the read JSON files. + """ + if table_partition_cols is None: + table_partition_cols = [] + return DataFrame( + SessionContextInternal._global_ctx().read_json( + str(path), + schema, + schema_infer_max_records, + file_extension, + table_partition_cols, + file_compression_type, + ) + ) + + +def read_csv( + path: str | pathlib.Path | list[str] | list[pathlib.Path], + schema: pyarrow.Schema | None = None, + has_header: bool = True, + delimiter: str = ",", + schema_infer_max_records: int = 1000, + file_extension: str = ".csv", + table_partition_cols: list[tuple[str, str]] | None = None, + file_compression_type: str | None = None, +) -> DataFrame: + """Read a CSV data source. + + This function will use the global context. Any functions or tables registered + with another context may not be accessible when used with a DataFrame created + using this function. + + Args: + path: Path to the CSV file + schema: An optional schema representing the CSV files. If None, the + CSV reader will try to infer it based on data in file. + has_header: Whether the CSV file have a header. If schema inference + is run on a file with no headers, default column names are + created. + delimiter: An optional column delimiter. + schema_infer_max_records: Maximum number of rows to read from CSV + files for schema inference if needed. + file_extension: File extension; only files with this extension are + selected for data input. + table_partition_cols: Partition columns. + file_compression_type: File compression type. + + Returns: + DataFrame representation of the read CSV files + """ + if table_partition_cols is None: + table_partition_cols = [] + + path = [str(p) for p in path] if isinstance(path, list) else str(path) + + return DataFrame( + SessionContextInternal._global_ctx().read_csv( + path, + schema, + has_header, + delimiter, + schema_infer_max_records, + file_extension, + table_partition_cols, + file_compression_type, + ) + ) + + +def read_avro( + path: str | pathlib.Path, + schema: pyarrow.Schema | None = None, + file_partition_cols: list[tuple[str, str]] | None = None, + file_extension: str = ".avro", +) -> DataFrame: + """Create a :py:class:`DataFrame` for reading Avro data source. + + This function will use the global context. 
Any functions or tables registered + with another context may not be accessible when used with a DataFrame created + using this function. + + Args: + path: Path to the Avro file. + schema: The data source schema. + file_partition_cols: Partition columns. + file_extension: File extension to select. + + Returns: + DataFrame representation of the read Avro file + """ + if file_partition_cols is None: + file_partition_cols = [] + return DataFrame( + SessionContextInternal._global_ctx().read_avro( + str(path), schema, file_partition_cols, file_extension + ) + ) diff --git a/python/tests/test_io.py b/python/tests/test_io.py new file mode 100644 index 000000000..21ad188ee --- /dev/null +++ b/python/tests/test_io.py @@ -0,0 +1,95 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import os +import pathlib + +import pyarrow as pa +from datafusion import column +from datafusion.io import read_avro, read_csv, read_json, read_parquet + + +def test_read_json_global_ctx(ctx): + path = os.path.dirname(os.path.abspath(__file__)) + + # Default + test_data_path = os.path.join(path, "data_test_context", "data.json") + df = read_json(test_data_path) + result = df.collect() + + assert result[0].column(0) == pa.array(["a", "b", "c"]) + assert result[0].column(1) == pa.array([1, 2, 3]) + + # Schema + schema = pa.schema( + [ + pa.field("A", pa.string(), nullable=True), + ] + ) + df = read_json(test_data_path, schema=schema) + result = df.collect() + + assert result[0].column(0) == pa.array(["a", "b", "c"]) + assert result[0].schema == schema + + # File extension + test_data_path = os.path.join(path, "data_test_context", "data.json") + df = read_json(test_data_path, file_extension=".json") + result = df.collect() + + assert result[0].column(0) == pa.array(["a", "b", "c"]) + assert result[0].column(1) == pa.array([1, 2, 3]) + + +def test_read_parquet_global(): + parquet_df = read_parquet(path="parquet/data/alltypes_plain.parquet") + parquet_df.show() + assert parquet_df is not None + + path = pathlib.Path.cwd() / "parquet/data/alltypes_plain.parquet" + parquet_df = read_parquet(path=path) + assert parquet_df is not None + + +def test_read_csv(): + csv_df = read_csv(path="testing/data/csv/aggregate_test_100.csv") + csv_df.select(column("c1")).show() + + +def test_read_csv_list(): + csv_df = read_csv(path=["testing/data/csv/aggregate_test_100.csv"]) + expected = csv_df.count() * 2 + + double_csv_df = read_csv( + path=[ + "testing/data/csv/aggregate_test_100.csv", + "testing/data/csv/aggregate_test_100.csv", + ] + ) + actual = double_csv_df.count() + + double_csv_df.select(column("c1")).show() + assert actual == expected + + +def test_read_avro(): + avro_df = read_avro(path="testing/data/avro/alltypes_plain.avro") + avro_df.show() + assert avro_df is not None + + path = 
pathlib.Path.cwd() / "testing/data/avro/alltypes_plain.avro"
+    avro_df = read_avro(path=path)
+    assert avro_df is not None
diff --git a/python/tests/test_wrapper_coverage.py b/python/tests/test_wrapper_coverage.py
index 86f2d57f2..ac064ba95 100644
--- a/python/tests/test_wrapper_coverage.py
+++ b/python/tests/test_wrapper_coverage.py
@@ -34,6 +34,8 @@ def missing_exports(internal_obj, wrapped_obj) -> None:
         return
 
     for attr in dir(internal_obj):
+        if attr in ["_global_ctx"]:
+            continue
         assert attr in dir(wrapped_obj)
 
         internal_attr = getattr(internal_obj, attr)
diff --git a/src/context.rs b/src/context.rs
index 0f962638e..9ba87eb8a 100644
--- a/src/context.rs
+++ b/src/context.rs
@@ -44,7 +44,7 @@ use crate::store::StorageContexts;
 use crate::udaf::PyAggregateUDF;
 use crate::udf::PyScalarUDF;
 use crate::udwf::PyWindowUDF;
-use crate::utils::{get_tokio_runtime, validate_pycapsule, wait_for_future};
+use crate::utils::{get_global_ctx, get_tokio_runtime, validate_pycapsule, wait_for_future};
 use datafusion::arrow::datatypes::{DataType, Schema, SchemaRef};
 use datafusion::arrow::pyarrow::PyArrowType;
 use datafusion::arrow::record_batch::RecordBatch;
@@ -69,7 +69,7 @@ use datafusion::prelude::{
     AvroReadOptions, CsvReadOptions, DataFrame, NdJsonReadOptions, ParquetReadOptions,
 };
 use datafusion_ffi::table_provider::{FFI_TableProvider, ForeignTableProvider};
-use pyo3::types::{PyCapsule, PyDict, PyList, PyTuple};
+use pyo3::types::{PyCapsule, PyDict, PyList, PyTuple, PyType};
 use tokio::task::JoinHandle;
 
 /// Configuration options for a SessionContext
@@ -306,6 +306,14 @@ impl PySessionContext {
         })
     }
 
+    #[classmethod]
+    #[pyo3(signature = ())]
+    fn _global_ctx(_cls: &Bound<'_, PyType>) -> PyResult<Self> {
+        Ok(Self {
+            ctx: get_global_ctx().clone(),
+        })
+    }
+
     /// Register an object store with the given name
     #[pyo3(signature = (scheme, store, host=None))]
    pub fn register_object_store(
diff --git a/src/utils.rs b/src/utils.rs
index ed224b364..999aad755 100644
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -17,6 +17,7 @@
 use crate::errors::{PyDataFusionError, PyDataFusionResult};
 use crate::TokioRuntime;
+use datafusion::execution::context::SessionContext;
 use datafusion::logical_expr::Volatility;
 use pyo3::exceptions::PyValueError;
 use pyo3::prelude::*;
@@ -37,6 +38,13 @@ pub(crate) fn get_tokio_runtime() -> &'static TokioRuntime {
     RUNTIME.get_or_init(|| TokioRuntime(tokio::runtime::Runtime::new().unwrap()))
 }
 
+/// Utility to get the global DataFusion context
+#[inline]
+pub(crate) fn get_global_ctx() -> &'static SessionContext {
+    static CTX: OnceLock<SessionContext> = OnceLock::new();
+    CTX.get_or_init(|| SessionContext::new())
+}
+
 /// Utility to collect rust futures with GIL released
 pub fn wait_for_future<F>(py: Python, f: F) -> F::Output
 where

From 973d7ec4a8196a78bc4fb32db4f24e523997ba4c Mon Sep 17 00:00:00 2001
From: Crystal Zhou <45134936+CrystalZhou0529@users.noreply.github.com>
Date: Sat, 8 Mar 2025 16:23:54 -0500
Subject: [PATCH 09/22] feat: Implementation of udf and udaf decorator (#1040)

* Implementation of udf and udaf decorator

* Rename decorators back to udf and udaf, update documentation

* Minor typo fixes

* Fixing linting errors

* ruff formatting

---------

Co-authored-by: Tim Saucer

---
 python/datafusion/udf.py  | 257 +++++++++++++++++++++++++++-----------
 python/tests/test_udaf.py |  42 +++++++
 python/tests/test_udf.py  |  42 ++++++-
 3 files changed, 265 insertions(+), 76 deletions(-)

diff --git a/python/datafusion/udf.py b/python/datafusion/udf.py
index 0bba3d723..af7bcf2ed 100644
--- a/python/datafusion/udf.py
+++ 
From 973d7ec4a8196a78bc4fb32db4f24e523997ba4c Mon Sep 17 00:00:00 2001 From: Crystal Zhou <45134936+CrystalZhou0529@users.noreply.github.com> Date: Sat, 8 Mar 2025 16:23:54 -0500 Subject: [PATCH 09/22] feat: Implementation of udf and udaf decorator (#1040) * Implementation of udf and udaf decorator * Rename decorators back to udf and udaf, update documentations * Minor typo fixes * Fixing linting errors * ruff formatting --------- Co-authored-by: Tim Saucer --- python/datafusion/udf.py | 257 +++++++++++++++++++++++++++----------- python/tests/test_udaf.py | 42 +++++++ python/tests/test_udf.py | 42 ++++++- 3 files changed, 265 insertions(+), 76 deletions(-) diff --git a/python/datafusion/udf.py b/python/datafusion/udf.py index 0bba3d723..af7bcf2ed 100644 --- a/python/datafusion/udf.py +++ b/python/datafusion/udf.py @@ -19,6 +19,7 @@ from __future__ import annotations +import functools from abc import ABCMeta, abstractmethod from enum import Enum from typing import TYPE_CHECKING, Callable, List, Optional, TypeVar @@ -110,43 +111,102 @@ def __call__(self, *args: Expr) -> Expr: args_raw = [arg.expr for arg in args] return Expr(self._udf.__call__(*args_raw)) - @staticmethod - def udf( - func: Callable[..., _R], - input_types: list[pyarrow.DataType], - return_type: _R, - volatility: Volatility | str, - name: Optional[str] = None, - ) -> ScalarUDF: - """Create a new User-Defined Function. + class udf: + """Create a new User-Defined Function (UDF). + + This class can be used both as a **function** and as a **decorator**. + + Usage: + - **As a function**: Call `udf(func, input_types, return_type, volatility, + name)`. + - **As a decorator**: Use `@udf(input_types, return_type, volatility, + name)`. In this case, do **not** pass `func` explicitly. Args: - func: A callable python function. - input_types: The data types of the arguments to ``func``. This list - must be of the same length as the number of arguments. - return_type: The data type of the return value from the python - function. - volatility: See ``Volatility`` for allowed values. - name: A descriptive name for the function. + func (Callable, optional): **Only needed when calling as a function.** + Skip this argument when using `udf` as a decorator. + input_types (list[pyarrow.DataType]): The data types of the arguments + to `func`. This list must be of the same length as the number of + arguments. + return_type (_R): The data type of the return value from the function. + volatility (Volatility | str): See `Volatility` for allowed values. + name (Optional[str]): A descriptive name for the function. Returns: - A user-defined aggregate function, which can be used in either data - aggregation or window function calls. + A user-defined function that can be used in SQL expressions, + data aggregation, or window function calls. + + Example: + **Using `udf` as a function:** + ``` + def double_func(x): + return x * 2 + double_udf = udf(double_func, [pyarrow.int32()], pyarrow.int32(), + "volatile", "double_it") + ``` + + **Using `udf` as a decorator:** + ``` + @udf([pyarrow.int32()], pyarrow.int32(), "volatile", "double_it") + def double_udf(x): + return x * 2 + ``` """ - if not callable(func): - raise TypeError("`func` argument must be callable") - if name is None: - if hasattr(func, "__qualname__"): - name = func.__qualname__.lower() + + def __new__(cls, *args, **kwargs): + """Create a new UDF. 
+ + Dispatch to the UDF function or decorator depending on whether the first argument is callable. + """ + if args and callable(args[0]): + # Case 1: Used as a function; the first argument must be callable + return cls._function(*args, **kwargs) else: - name = func.__class__.__name__.lower() - return ScalarUDF( - name=name, - func=func, - input_types=input_types, - return_type=return_type, - volatility=volatility, - ) + # Case 2: Used as a decorator with parameters + return cls._decorator(*args, **kwargs) + + @staticmethod + def _function( + func: Callable[..., _R], + input_types: list[pyarrow.DataType], + return_type: _R, + volatility: Volatility | str, + name: Optional[str] = None, + ) -> ScalarUDF: + if not callable(func): + raise TypeError("`func` argument must be callable") + if name is None: + if hasattr(func, "__qualname__"): + name = func.__qualname__.lower() + else: + name = func.__class__.__name__.lower() + return ScalarUDF( + name=name, + func=func, + input_types=input_types, + return_type=return_type, + volatility=volatility, + ) + + @staticmethod + def _decorator( + input_types: list[pyarrow.DataType], + return_type: _R, + volatility: Volatility | str, + name: Optional[str] = None, + ): + def decorator(func): + udf_caller = ScalarUDF.udf( + func, input_types, return_type, volatility, name + ) + + @functools.wraps(func) + def wrapper(*args, **kwargs): + return udf_caller(*args, **kwargs) + + return wrapper + + return decorator class Accumulator(metaclass=ABCMeta): @@ -212,25 +272,27 @@ def __call__(self, *args: Expr) -> Expr: args_raw = [arg.expr for arg in args] return Expr(self._udaf.__call__(*args_raw)) - @staticmethod - def udaf( - accum: Callable[[], Accumulator], - input_types: pyarrow.DataType | list[pyarrow.DataType], - return_type: pyarrow.DataType, - state_type: list[pyarrow.DataType], - volatility: Volatility | str, - name: Optional[str] = None, - ) -> AggregateUDF: - """Create a new User-Defined Aggregate Function. + class udaf: + """Create a new User-Defined Aggregate Function (UDAF). - If your :py:class:`Accumulator` can be instantiated with no arguments, you - can simply pass it's type as ``accum``. If you need to pass additional arguments - to it's constructor, you can define a lambda or a factory method. During runtime - the :py:class:`Accumulator` will be constructed for every instance in - which this UDAF is used. The following examples are all valid. + This class allows you to define an **aggregate function** that can be used in + data aggregation or window function calls. - .. code-block:: python + Usage: + - **As a function**: Call `udaf(accum, input_types, return_type, state_type, + volatility, name)`. + - **As a decorator**: Use `@udaf(input_types, return_type, state_type, + volatility, name)`. + When using `udaf` as a decorator, **do not pass `accum` explicitly**. + **Function example:** + + If your :py:class:`Accumulator` can be instantiated with no arguments, you + can simply pass its type as `accum`. If you need to pass additional + arguments to its constructor, you can define a lambda or a factory method. + During runtime the :py:class:`Accumulator` will be constructed for every + instance in which this UDAF is used. The following examples are all valid. 
+ ``` + import pyarrow as pa + import pyarrow.compute as pc @@ -253,12 +315,24 @@ def evaluate(self) -> pa.Scalar: def sum_bias_10() -> Summarize: return Summarize(10.0) - udaf1 = udaf(Summarize, pa.float64(), pa.float64(), [pa.float64()], "immutable") - udaf2 = udaf(sum_bias_10, pa.float64(), pa.float64(), [pa.float64()], "immutable") - udaf3 = udaf(lambda: Summarize(20.0), pa.float64(), pa.float64(), [pa.float64()], "immutable") + udaf1 = udaf(Summarize, pa.float64(), pa.float64(), [pa.float64()], + "immutable") + udaf2 = udaf(sum_bias_10, pa.float64(), pa.float64(), [pa.float64()], + "immutable") + udaf3 = udaf(lambda: Summarize(20.0), pa.float64(), pa.float64(), + [pa.float64()], "immutable") + ``` + + **Decorator example:** + ``` + @udaf(pa.float64(), pa.float64(), [pa.float64()], "immutable") + def udaf4() -> Summarize: + return Summarize(10.0) + ``` Args: - accum: The accumulator python function. + accum: The accumulator python function. **Only needed when calling as a + function. Skip this argument when using `udaf` as a decorator.** input_types: The data types of the arguments to ``accum``. return_type: The data type of the return value. state_type: The data types of the intermediate accumulation. @@ -268,26 +342,69 @@ def sum_bias_10() -> Summarize: Returns: A user-defined aggregate function, which can be used in either data aggregation or window function calls. - """ # noqa W505 - if not callable(accum): - raise TypeError("`func` must be callable.") - if not isinstance(accum.__call__(), Accumulator): - raise TypeError( - "Accumulator must implement the abstract base class Accumulator" + """ + + def __new__(cls, *args, **kwargs): + """Create a new UDAF. + + Dispatch to the UDAF function or decorator depending on whether the + first argument is callable. + """ + if args and callable(args[0]): + # Case 1: Used as a function; the first argument must be callable + return cls._function(*args, **kwargs) + else: + # Case 2: Used as a decorator with parameters + return cls._decorator(*args, **kwargs) + + @staticmethod + def _function( + accum: Callable[[], Accumulator], + input_types: pyarrow.DataType | list[pyarrow.DataType], + return_type: pyarrow.DataType, + state_type: list[pyarrow.DataType], + volatility: Volatility | str, + name: Optional[str] = None, + ) -> AggregateUDF: + if not callable(accum): + raise TypeError("`func` must be callable.") + if not isinstance(accum.__call__(), Accumulator): + raise TypeError( + "Accumulator must implement the abstract base class Accumulator" + ) + if name is None: + name = accum.__call__().__class__.__qualname__.lower() + if isinstance(input_types, pyarrow.DataType): + input_types = [input_types] + return AggregateUDF( + name=name, + accumulator=accum, + input_types=input_types, + return_type=return_type, + state_type=state_type, + volatility=volatility, ) - if name is None: - name = accum.__call__().__class__.__qualname__.lower() - assert name is not None - if isinstance(input_types, pyarrow.DataType): - input_types = [input_types] - return AggregateUDF( - name=name, - accumulator=accum, - input_types=input_types, - return_type=return_type, - state_type=state_type, - volatility=volatility, - ) + + @staticmethod + def _decorator( + input_types: pyarrow.DataType | list[pyarrow.DataType], + return_type: pyarrow.DataType, + state_type: list[pyarrow.DataType], + volatility: Volatility | str, + name: Optional[str] = None, + ): + def decorator(accum: Callable[[], Accumulator]): + udaf_caller = AggregateUDF.udaf( + accum, input_types, return_type, state_type, volatility, name + ) + + @functools.wraps(accum) + def wrapper(*args, **kwargs): + return udaf_caller(*args, **kwargs) + + return wrapper + + return decorator class WindowEvaluator(metaclass=ABCMeta):
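The `__new__` dispatch above is what lets a single name support both call styles: `udaf(accum, ...)` builds the aggregate immediately, while `udaf(...)` without a leading callable returns a decorator that finishes the construction once it receives the function. A stripped-down sketch of the pattern with no DataFusion dependencies; `make_udf`, `_function`, `_decorator`, and `label` are illustrative stand-ins, not part of the API:

from typing import Callable

class make_udf:
    """Dual-use constructor: make_udf(func, label) or @make_udf(label)."""

    def __new__(cls, *args, **kwargs):
        if args and callable(args[0]):
            # Function style: the callable arrives up front.
            return cls._function(*args, **kwargs)
        # Decorator style: defer until the callable is applied.
        return cls._decorator(*args, **kwargs)

    @staticmethod
    def _function(func: Callable, label: str) -> Callable:
        func.label = label  # stand-in for wrapping func into a real UDF object
        return func

    @staticmethod
    def _decorator(label: str):
        def decorator(func: Callable) -> Callable:
            return make_udf._function(func, label)
        return decorator

doubled = make_udf(lambda x: x * 2, "double_it")  # function style

@make_udf("triple_it")  # decorator style
def triple(x):
    return x * 3

assert doubled.label == "double_it" and triple.label == "triple_it"

Because `__new__` returns whatever the chosen branch produces, no instance of the class is ever created; the class acts as a callable namespace. That is also why the real decorator path wraps the user function with `functools.wraps`, so the returned wrapper keeps the original function's name and docstring.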
diff --git a/python/tests/test_udaf.py b/python/tests/test_udaf.py index 0005a3da8..e69c77d3c 100644 --- a/python/tests/test_udaf.py +++ b/python/tests/test_udaf.py @@ -117,6 +117,26 @@ def test_udaf_aggregate(df): assert result.column(0) == pa.array([1.0 + 2.0 + 3.0]) +def test_udaf_decorator_aggregate(df): + @udaf(pa.float64(), pa.float64(), [pa.float64()], "immutable") + def summarize(): + return Summarize() + + df1 = df.aggregate([], [summarize(column("a"))]) + + # execute and collect the first (and only) batch + result = df1.collect()[0] + + assert result.column(0) == pa.array([1.0 + 2.0 + 3.0]) + + df2 = df.aggregate([], [summarize(column("a"))]) + + # Run a second time to ensure the state is properly reset + result = df2.collect()[0] + + assert result.column(0) == pa.array([1.0 + 2.0 + 3.0]) + + def test_udaf_aggregate_with_arguments(df): bias = 10.0 @@ -143,6 +163,28 @@ def test_udaf_aggregate_with_arguments(df): assert result.column(0) == pa.array([bias + 1.0 + 2.0 + 3.0]) +def test_udaf_decorator_aggregate_with_arguments(df): + bias = 10.0 + + @udaf(pa.float64(), pa.float64(), [pa.float64()], "immutable") + def summarize(): + return Summarize(bias) + + df1 = df.aggregate([], [summarize(column("a"))]) + + # execute and collect the first (and only) batch + result = df1.collect()[0] + + assert result.column(0) == pa.array([bias + 1.0 + 2.0 + 3.0]) + + df2 = df.aggregate([], [summarize(column("a"))]) + + # Run a second time to ensure the state is properly reset + result = df2.collect()[0] + + assert result.column(0) == pa.array([bias + 1.0 + 2.0 + 3.0]) + + def test_group_by(df): summarize = udaf( Summarize, diff --git a/python/tests/test_udf.py b/python/tests/test_udf.py index 3a5dce6d6..a6c047552 100644 --- a/python/tests/test_udf.py +++ b/python/tests/test_udf.py @@ -24,7 +24,7 @@ def df(ctx): # create a RecordBatch and a new DataFrame from it batch = pa.RecordBatch.from_arrays( - [pa.array([1, 2, 3]), pa.array([4, 4, 6])], + [pa.array([1, 2, 3]), pa.array([4, 4, None])], names=["a", "b"], ) return ctx.create_dataframe([[batch]], name="test_table") @@ -39,10 +39,20 @@ def test_udf(df): volatility="immutable", ) - df = df.select(is_null(column("a"))) + df = df.select(is_null(column("b"))) result = df.collect()[0].column(0) - assert result == pa.array([False, False, False]) + assert result == pa.array([False, False, True]) + + +def test_udf_decorator(df): + @udf([pa.int64()], pa.bool_(), "immutable") + def is_null(x: pa.Array) -> pa.Array: + return x.is_null() + + df = df.select(is_null(column("b"))) + result = df.collect()[0].column(0) + assert result == pa.array([False, False, True]) def test_register_udf(ctx, df) -> None: @@ -56,10 +66,10 @@ def test_register_udf(ctx, df) -> None: ctx.register_udf(is_null) - df_result = ctx.sql("select is_null(a) from test_table") + df_result = ctx.sql("select is_null(b) from test_table") result = df_result.collect()[0].column(0) - assert result == pa.array([False, False, False]) + assert result == pa.array([False, False, True]) class OverThresholdUDF: @@ -70,7 +80,7 @@ def __call__(self, values: pa.Array) -> pa.Array: return pa.array(v.as_py() >= self.threshold for v in values) -def test_udf_with_parameters(df) -> None: +def 
test_udf_with_parameters(df) -> None: result = df2.collect()[0].column(0) assert result == pa.array([False, True, True]) + + +def test_udf_with_parameters_decorator(df) -> None: + @udf([pa.int64()], pa.bool_(), "immutable") + def udf_no_param(values: pa.Array) -> pa.Array: + return OverThresholdUDF()(values) + + df1 = df.select(udf_no_param(column("a"))) + result = df1.collect()[0].column(0) + + assert result == pa.array([True, True, True]) + + @udf([pa.int64()], pa.bool_(), "immutable") + def udf_with_param(values: pa.Array) -> pa.Array: + return OverThresholdUDF(2)(values) + + df2 = df.select(udf_with_param(column("a"))) + result = df2.collect()[0].column(0) + + assert result == pa.array([False, True, True]) From d72f5605b3d523585d04857505793920f96242ba Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Mon, 10 Mar 2025 06:56:12 -0400 Subject: [PATCH 10/22] Enable FA ruff lint (#1052) --- examples/python-udwf.py | 2 ++ pyproject.toml | 2 +- python/datafusion/io.py | 2 ++ python/tests/test_udaf.py | 2 ++ python/tests/test_udwf.py | 2 ++ 5 files changed, 9 insertions(+), 1 deletion(-) diff --git a/examples/python-udwf.py b/examples/python-udwf.py index 32f8fadaa..7d39dc1b8 100644 --- a/examples/python-udwf.py +++ b/examples/python-udwf.py @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. +from __future__ import annotations + import datafusion import pyarrow as pa from datafusion import col, lit, udwf diff --git a/pyproject.toml b/pyproject.toml index f416e02a5..d16a18aa6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -66,7 +66,7 @@ features = ["substrait"] # Enable docstring linting using the google style guide [tool.ruff.lint] -select = ["E4", "E7", "E9", "F", "D", "W", "I"] +select = ["E4", "E7", "E9", "F", "FA", "D", "W", "I"] [tool.ruff.lint.pydocstyle] convention = "google" diff --git a/python/datafusion/io.py b/python/datafusion/io.py index 7f3b77efa..3b6264948 100644 --- a/python/datafusion/io.py +++ b/python/datafusion/io.py @@ -17,6 +17,8 @@ """IO read functions using global context.""" +from __future__ import annotations + import pathlib import pyarrow diff --git a/python/tests/test_udaf.py b/python/tests/test_udaf.py index e69c77d3c..97cf81f3c 100644 --- a/python/tests/test_udaf.py +++ b/python/tests/test_udaf.py @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. +from __future__ import annotations + from typing import List import pyarrow as pa diff --git a/python/tests/test_udwf.py b/python/tests/test_udwf.py index 0ffa04179..2fea34aa3 100644 --- a/python/tests/test_udwf.py +++ b/python/tests/test_udwf.py @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. +from __future__ import annotations + import pyarrow as pa import pytest from datafusion import SessionContext, column, lit, udwf From 0002372ccdb780e011631c797ec9613174cf0a94 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Mon, 10 Mar 2025 14:22:42 -0400 Subject: [PATCH 11/22] Enable take comments to assign issues to users (#1058) --- .github/workflows/take.yml | 41 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 .github/workflows/take.yml diff --git a/.github/workflows/take.yml b/.github/workflows/take.yml new file mode 100644 index 000000000..86dc190ad --- /dev/null +++ b/.github/workflows/take.yml @@ -0,0 +1,41 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +name: Assign the issue via a `take` comment +on: + issue_comment: + types: created + +permissions: + issues: write + +jobs: + issue_assign: + runs-on: ubuntu-latest + if: (!github.event.issue.pull_request) && github.event.comment.body == 'take' + concurrency: + group: ${{ github.actor }}-issue-assign + steps: + - run: | + CODE=$(curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -LI https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees/${{ github.event.comment.user.login }} -o /dev/null -w '%{http_code}\n' -s) + if [ "$CODE" -eq "204" ] + then + echo "Assigning issue ${{ github.event.issue.number }} to ${{ github.event.comment.user.login }}" + curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"assignees": ["${{ github.event.comment.user.login }}"]}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees + else + echo "Cannot assign issue ${{ github.event.issue.number }} to ${{ github.event.comment.user.login }}" + fi \ No newline at end of file From 9d634de6df2f8b76bd303ab1f5972f01deb2210d Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Mon, 10 Mar 2025 14:24:40 -0400 Subject: [PATCH 12/22] Update python min version to 3.9 (#1043) * 3.8 -> 3.9 * upgrade pyo3 abi3-py38 -> abi3-py39 --- Cargo.toml | 2 +- .../source/contributor-guide/introduction.rst | 2 +- examples/ffi-table-provider/Cargo.lock | 75 +- examples/ffi-table-provider/Cargo.toml | 2 +- examples/ffi-table-provider/pyproject.toml | 2 +- pyproject.toml | 3 +- uv.lock | 707 ++---------------- 7 files changed, 121 insertions(+), 672 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 5358b1836..50967a219 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -35,7 +35,7 @@ substrait = ["dep:datafusion-substrait"] [dependencies] tokio = { version = "1.42", features = ["macros", "rt", "rt-multi-thread", "sync"] } -pyo3 = { version = "0.23", features = ["extension-module", "abi3", "abi3-py38"] } +pyo3 = { version = "0.23", features = ["extension-module", "abi3", "abi3-py39"] } pyo3-async-runtimes = { version = "0.23", features = ["tokio-runtime"]} arrow = { version = "54", features = ["pyarrow"] } datafusion = { version = "45.0.0", features = ["avro", "unicode_expressions"] } diff --git a/docs/source/contributor-guide/introduction.rst b/docs/source/contributor-guide/introduction.rst index 25f2c21a4..2fba64111 100644 --- a/docs/source/contributor-guide/introduction.rst +++ b/docs/source/contributor-guide/introduction.rst @@ -118,7 +118,7 @@ be ignored by ``git``. .. 
code-block:: implementation=CPython - version=3.8 + version=3.9 shared=true abi3=true lib_name=python3.12 diff --git a/examples/ffi-table-provider/Cargo.lock b/examples/ffi-table-provider/Cargo.lock index 32af85180..8d0edd515 100644 --- a/examples/ffi-table-provider/Cargo.lock +++ b/examples/ffi-table-provider/Cargo.lock @@ -766,7 +766,8 @@ dependencies = [ [[package]] name = "datafusion" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eae420e7a5b0b7f1c39364cc76cbcd0f5fdc416b2514ae3847c2676bbd60702a" dependencies = [ "arrow", "arrow-array", @@ -816,7 +817,8 @@ dependencies = [ [[package]] name = "datafusion-catalog" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f27987bc22b810939e8dfecc55571e9d50355d6ea8ec1c47af8383a76a6d0e1" dependencies = [ "arrow", "async-trait", @@ -836,7 +838,8 @@ dependencies = [ [[package]] name = "datafusion-common" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3f6d5b8c9408cc692f7c194b8aa0c0f9b253e065a8d960ad9cdc2a13e697602" dependencies = [ "ahash", "arrow", @@ -862,7 +865,8 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d4603c8e8a4baf77660ab7074cc66fc15cc8a18f2ce9dfadb755fc6ee294e48" dependencies = [ "log", "tokio", @@ -871,12 +875,14 @@ dependencies = [ [[package]] name = "datafusion-doc" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5bf4bc68623a5cf231eed601ed6eb41f46a37c4d15d11a0bff24cbc8396cd66" [[package]] name = "datafusion-execution" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88b491c012cdf8e051053426013429a76f74ee3c2db68496c79c323ca1084d27" dependencies = [ "arrow", "dashmap", @@ -894,7 +900,8 @@ dependencies = [ [[package]] name = "datafusion-expr" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5a181408d4fc5dc22f9252781a8f39f2d0e5d1b33ec9bde242844980a2689c1" dependencies = [ "arrow", "chrono", @@ -914,7 +921,8 @@ dependencies = [ [[package]] name = "datafusion-expr-common" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1129b48e8534d8c03c6543bcdccef0b55c8ac0c1272a15a56c67068b6eb1885" dependencies = [ "arrow", "datafusion-common", @@ -925,7 +933,8 @@ dependencies = [ [[package]] name = "datafusion-ffi" version = "45.0.0" -source = 
"git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff47a79d442207c168c6e3e1d970c248589c148e4800e5b285ac1b2cb1a230f8" dependencies = [ "abi_stable", "arrow", @@ -945,7 +954,8 @@ dependencies = [ [[package]] name = "datafusion-functions" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6125874e4856dfb09b59886784fcb74cde5cfc5930b3a80a1a728ef7a010df6b" dependencies = [ "arrow", "arrow-buffer", @@ -974,7 +984,8 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3add7b1d3888e05e7c95f2b281af900ca69ebdcb21069ba679b33bde8b3b9d6" dependencies = [ "ahash", "arrow", @@ -996,7 +1007,8 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate-common" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e18baa4cfc3d2f144f74148ed68a1f92337f5072b6dde204a0dbbdf3324989c" dependencies = [ "ahash", "arrow", @@ -1008,7 +1020,8 @@ dependencies = [ [[package]] name = "datafusion-functions-nested" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3ec5ee8cecb0dc370291279673097ddabec03a011f73f30d7f1096457127e03e" dependencies = [ "arrow", "arrow-array", @@ -1031,7 +1044,8 @@ dependencies = [ [[package]] name = "datafusion-functions-table" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c403ddd473bbb0952ba880008428b3c7febf0ed3ce1eec35a205db20efb2a36" dependencies = [ "arrow", "async-trait", @@ -1046,7 +1060,8 @@ dependencies = [ [[package]] name = "datafusion-functions-window" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ab18c2fb835614d06a75f24a9e09136d3a8c12a92d97c95a6af316a1787a9c5" dependencies = [ "datafusion-common", "datafusion-doc", @@ -1062,7 +1077,8 @@ dependencies = [ [[package]] name = "datafusion-functions-window-common" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a77b73bc15e7d1967121fdc7a55d819bfb9d6c03766a6c322247dce9094a53a4" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", @@ -1071,7 +1087,8 @@ dependencies = [ [[package]] name = "datafusion-macros" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09369b8d962291e808977cf94d495fd8b5b38647232d7ef562c27ac0f495b0af" dependencies = [ "datafusion-expr", "quote", @@ 
-1081,7 +1098,8 @@ dependencies = [ [[package]] name = "datafusion-optimizer" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2403a7e4a84637f3de7d8d4d7a9ccc0cc4be92d89b0161ba3ee5be82f0531c54" dependencies = [ "arrow", "chrono", @@ -1099,7 +1117,8 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86ff72ac702b62dbf2650c4e1d715ebd3e4aab14e3885e72e8549e250307347c" dependencies = [ "ahash", "arrow", @@ -1123,7 +1142,8 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60982b7d684e25579ee29754b4333057ed62e2cc925383c5f0bd8cab7962f435" dependencies = [ "ahash", "arrow", @@ -1137,7 +1157,8 @@ dependencies = [ [[package]] name = "datafusion-physical-optimizer" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac5e85c189d5238a5cf181a624e450c4cd4c66ac77ca551d6f3ff9080bac90bb" dependencies = [ "arrow", "arrow-schema", @@ -1158,7 +1179,8 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c36bf163956d7e2542657c78b3383fdc78f791317ef358a359feffcdb968106f" dependencies = [ "ahash", "arrow", @@ -1189,7 +1211,8 @@ dependencies = [ [[package]] name = "datafusion-proto" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2db5d79f0c974041787b899d24dc91bdab2ff112d1942dd71356a4ce3b407e6c" dependencies = [ "arrow", "chrono", @@ -1204,7 +1227,8 @@ dependencies = [ [[package]] name = "datafusion-proto-common" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de21bde1603aac0ff32cf478e47081be6e3583c6861fe8f57034da911efe7578" dependencies = [ "arrow", "datafusion-common", @@ -1214,7 +1238,8 @@ dependencies = [ [[package]] name = "datafusion-sql" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=1a29bd3#1a29bd3b62f1759c557aca9eed937ac38f5a5602" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e13caa4daede211ecec53c78b13c503b592794d125f9a3cc3afe992edf9e7f43" dependencies = [ "arrow", "arrow-array", diff --git a/examples/ffi-table-provider/Cargo.toml b/examples/ffi-table-provider/Cargo.toml index 0e558fdd0..f4e4fda79 100644 --- a/examples/ffi-table-provider/Cargo.toml +++ b/examples/ffi-table-provider/Cargo.toml @@ -23,7 +23,7 @@ edition = "2021" [dependencies] datafusion = { version = "45.0.0" } datafusion-ffi = { version = "45.0.0" } -pyo3 = { version = "0.23", features = 
["extension-module", "abi3", "abi3-py38"] } +pyo3 = { version = "0.23", features = ["extension-module", "abi3", "abi3-py39"] } arrow = { version = "54" } arrow-array = { version = "54" } arrow-schema = { version = "54" } diff --git a/examples/ffi-table-provider/pyproject.toml b/examples/ffi-table-provider/pyproject.toml index 116efae9c..9cd25b423 100644 --- a/examples/ffi-table-provider/pyproject.toml +++ b/examples/ffi-table-provider/pyproject.toml @@ -21,7 +21,7 @@ build-backend = "maturin" [project] name = "ffi_table_provider" -requires-python = ">=3.8" +requires-python = ">=3.9" classifiers = [ "Programming Language :: Rust", "Programming Language :: Python :: Implementation :: CPython", diff --git a/pyproject.toml b/pyproject.toml index d16a18aa6..1c2733677 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,7 +24,7 @@ name = "datafusion" description = "Build and run queries against data" readme = "README.md" license = { file = "LICENSE.txt" } -requires-python = ">=3.8" +requires-python = ">=3.9" keywords = ["datafusion", "dataframe", "rust", "query-engine"] classifiers = [ "Development Status :: 2 - Pre-Alpha", @@ -35,7 +35,6 @@ classifiers = [ "Operating System :: Microsoft :: Windows", "Operating System :: POSIX :: Linux", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", diff --git a/uv.lock b/uv.lock index 587ddc8b7..619b92856 100644 --- a/uv.lock +++ b/uv.lock @@ -1,23 +1,10 @@ version = 1 -requires-python = ">=3.8" +requires-python = ">=3.9" resolution-markers = [ "python_full_version >= '3.12'", "python_full_version == '3.11.*'", "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", - "python_full_version < '3.9'", -] - -[[package]] -name = "alabaster" -version = "0.7.13" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/94/71/a8ee96d1fd95ca04a0d2e2d9c4081dac4c2d2b12f7ddb899c8cb9bfd1532/alabaster-0.7.13.tar.gz", hash = "sha256:a27a4a084d5e690e16e01e03ad2b2e552c61a65469419b907243193de1a84ae2", size = 11454 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/64/88/c7083fc61120ab661c5d0b82cb77079fc1429d3f913a456c1c82cf4658f7/alabaster-0.7.13-py3-none-any.whl", hash = "sha256:1ee19aca801bbabb5ba3f5f258e4422dfa86f82f3e9cefb0859b283cdd7f62a3", size = 13857 }, + "python_full_version < '3.10'", ] [[package]] @@ -25,7 +12,7 @@ name = "alabaster" version = "0.7.16" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version == '3.9.*'", + "python_full_version < '3.10'", ] sdist = { url = "https://files.pythonhosted.org/packages/c9/3e/13dd8e5ed9094e734ac430b5d0eb4f2bb001708a8b7856cbf8e084e001ba/alabaster-0.7.16.tar.gz", hash = "sha256:75a8b99c28a5dad50dd7f8ccdd447a121ddb3892da9e53d1ca5cca3106d58d65", size = 23776 } wheels = [ @@ -46,42 +33,12 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7e/b3/6b4067be973ae96ba0d615946e314c5ae35f9f993eca561b356540bb0c2b/alabaster-1.0.0-py3-none-any.whl", hash = "sha256:fc6786402dc3fcb2de3cabd5fe455a2db534b371124f1f21de8731783dec828b", size = 13929 }, ] -[[package]] -name = "appnope" -version = "0.1.4" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/35/5d/752690df9ef5b76e169e68d6a129fa6d08a7100ca7f754c89495db3c6019/appnope-0.1.4.tar.gz", 
hash = "sha256:1de3860566df9caf38f01f86f65e0e13e379af54f9e4bee1e66b48f2efffd1ee", size = 4170 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/81/29/5ecc3a15d5a33e31b26c11426c45c501e439cb865d0bff96315d86443b78/appnope-0.1.4-py2.py3-none-any.whl", hash = "sha256:502575ee11cd7a28c0205f379b525beefebab9d161b7c964670864014ed7213c", size = 4321 }, -] - -[[package]] -name = "astroid" -version = "3.2.4" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -dependencies = [ - { name = "typing-extensions", marker = "python_full_version < '3.9'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/9e/53/1067e1113ecaf58312357f2cd93063674924119d80d173adc3f6f2387aa2/astroid-3.2.4.tar.gz", hash = "sha256:0e14202810b30da1b735827f78f5157be2bbd4a7a59b7707ca0bfc2fb4c0063a", size = 397576 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/80/96/b32bbbb46170a1c8b8b1f28c794202e25cfe743565e9d3469b8eb1e0cc05/astroid-3.2.4-py3-none-any.whl", hash = "sha256:413658a61eeca6202a59231abb473f932038fbcbf1666587f66d482083413a25", size = 276348 }, -] - [[package]] name = "astroid" version = "3.3.8" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] dependencies = [ - { name = "typing-extensions", marker = "python_full_version >= '3.9' and python_full_version < '3.11'" }, + { name = "typing-extensions", marker = "python_full_version < '3.11'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/80/c5/5c83c48bbf547f3dd8b587529db7cf5a265a3368b33e85e76af8ff6061d3/astroid-3.3.8.tar.gz", hash = "sha256:a88c7994f914a4ea8572fac479459f4955eeccc877be3f2d959a33273b0cf40b", size = 398196 } wheels = [ @@ -101,23 +58,11 @@ wheels = [ name = "babel" version = "2.16.0" source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "pytz", marker = "python_full_version < '3.9'" }, -] sdist = { url = "https://files.pythonhosted.org/packages/2a/74/f1bc80f23eeba13393b7222b11d95ca3af2c1e28edca18af487137eefed9/babel-2.16.0.tar.gz", hash = "sha256:d1f3554ca26605fe173f3de0c65f750f5a42f924499bf134de6423582298e316", size = 9348104 } wheels = [ { url = "https://files.pythonhosted.org/packages/ed/20/bc79bc575ba2e2a7f70e8a1155618bb1301eaa5132a8271373a6903f73f8/babel-2.16.0-py3-none-any.whl", hash = "sha256:368b5b98b37c06b7daf6696391c3240c938b37767d4584413e8438c5c435fa8b", size = 9587599 }, ] -[[package]] -name = "backcall" -version = "0.2.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/a2/40/764a663805d84deee23043e1426a9175567db89c8b3287b5c2ad9f71aa93/backcall-0.2.0.tar.gz", hash = "sha256:5cbdbf27be5e7cfadb448baf0aa95508f91f2bbc6c6437cd9cd06e2a4c215e1e", size = 18041 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/4c/1c/ff6546b6c12603d8dd1070aa3c3d273ad4c07f5771689a7b69a550e8c951/backcall-0.2.0-py2.py3-none-any.whl", hash = "sha256:fbbce6a29f263178a1f7915c1940bde0ec2b2a967566fe1c65c1dfb7422bd255", size = 11157 }, -] - [[package]] name = "beautifulsoup4" version = "4.12.3" @@ -194,14 +139,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f1/47/d7145bf2dc04684935d57d67dff9d6d795b2ba2796806bb109864be3a151/cffi-1.17.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:72e72408cad3d5419375fc87d289076ee319835bdfa2caad331e377589aebba9", size = 488469 }, { url = 
"https://files.pythonhosted.org/packages/bf/ee/f94057fa6426481d663b88637a9a10e859e492c73d0384514a17d78ee205/cffi-1.17.1-cp313-cp313-win32.whl", hash = "sha256:e03eab0a8677fa80d646b5ddece1cbeaf556c313dcfac435ba11f107ba117b5d", size = 172475 }, { url = "https://files.pythonhosted.org/packages/7c/fc/6a8cb64e5f0324877d503c854da15d76c1e50eb722e320b15345c4d0c6de/cffi-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:f6a16c31041f09ead72d69f583767292f750d24913dadacf5756b966aacb3f1a", size = 182009 }, - { url = "https://files.pythonhosted.org/packages/48/08/15bf6b43ae9bd06f6b00ad8a91f5a8fe1069d4c9fab550a866755402724e/cffi-1.17.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:636062ea65bd0195bc012fea9321aca499c0504409f413dc88af450b57ffd03b", size = 182457 }, - { url = "https://files.pythonhosted.org/packages/c2/5b/f1523dd545f92f7df468e5f653ffa4df30ac222f3c884e51e139878f1cb5/cffi-1.17.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c7eac2ef9b63c79431bc4b25f1cd649d7f061a28808cbc6c47b534bd789ef964", size = 425932 }, - { url = "https://files.pythonhosted.org/packages/53/93/7e547ab4105969cc8c93b38a667b82a835dd2cc78f3a7dad6130cfd41e1d/cffi-1.17.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e221cf152cff04059d011ee126477f0d9588303eb57e88923578ace7baad17f9", size = 448585 }, - { url = "https://files.pythonhosted.org/packages/56/c4/a308f2c332006206bb511de219efeff090e9d63529ba0a77aae72e82248b/cffi-1.17.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:31000ec67d4221a71bd3f67df918b1f88f676f1c3b535a7eb473255fdc0b83fc", size = 456268 }, - { url = "https://files.pythonhosted.org/packages/ca/5b/b63681518265f2f4060d2b60755c1c77ec89e5e045fc3773b72735ddaad5/cffi-1.17.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6f17be4345073b0a7b8ea599688f692ac3ef23ce28e5df79c04de519dbc4912c", size = 436592 }, - { url = "https://files.pythonhosted.org/packages/bb/19/b51af9f4a4faa4a8ac5a0e5d5c2522dcd9703d07fac69da34a36c4d960d3/cffi-1.17.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e2b1fac190ae3ebfe37b979cc1ce69c81f4e4fe5746bb401dca63a9062cdaf1", size = 446512 }, - { url = "https://files.pythonhosted.org/packages/e2/63/2bed8323890cb613bbecda807688a31ed11a7fe7afe31f8faaae0206a9a3/cffi-1.17.1-cp38-cp38-win32.whl", hash = "sha256:7596d6620d3fa590f677e9ee430df2958d2d6d6de2feeae5b20e82c00b76fbf8", size = 171576 }, - { url = "https://files.pythonhosted.org/packages/2f/70/80c33b044ebc79527447fd4fbc5455d514c3bb840dede4455de97da39b4d/cffi-1.17.1-cp38-cp38-win_amd64.whl", hash = "sha256:78122be759c3f8a014ce010908ae03364d00a1f81ab5c7f4a7a5120607ea56e1", size = 181229 }, { url = "https://files.pythonhosted.org/packages/b9/ea/8bb50596b8ffbc49ddd7a1ad305035daa770202a6b782fc164647c2673ad/cffi-1.17.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b2ab587605f4ba0bf81dc0cb08a41bd1c0a5906bd59243d56bad7668a6fc6c16", size = 182220 }, { url = "https://files.pythonhosted.org/packages/ae/11/e77c8cd24f58285a82c23af484cf5b124a376b32644e445960d1a4654c3a/cffi-1.17.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:28b16024becceed8c6dfbc75629e27788d8a3f9030691a1dbf9821a128b22c36", size = 178605 }, { url = "https://files.pythonhosted.org/packages/ed/65/25a8dc32c53bf5b7b6c2686b42ae2ad58743f7ff644844af7cdb29b49361/cffi-1.17.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:1d599671f396c4723d016dbddb72fe8e0397082b0a77a4fab8028923bec050e8", size = 424910 }, @@ -274,19 +211,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e4/93/946a86ce20790e11312c87c75ba68d5f6ad2208cfb52b2d6a2c32840d922/charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:fa88b843d6e211393a37219e6a1c1df99d35e8fd90446f1118f4216e307e48cd", size = 145732 }, { url = "https://files.pythonhosted.org/packages/cd/e5/131d2fb1b0dddafc37be4f3a2fa79aa4c037368be9423061dccadfd90091/charset_normalizer-3.4.1-cp313-cp313-win32.whl", hash = "sha256:eb8178fe3dba6450a3e024e95ac49ed3400e506fd4e9e5c32d30adda88cbd407", size = 95391 }, { url = "https://files.pythonhosted.org/packages/27/f2/4f9a69cc7712b9b5ad8fdb87039fd89abba997ad5cbe690d1835d40405b0/charset_normalizer-3.4.1-cp313-cp313-win_amd64.whl", hash = "sha256:b1ac5992a838106edb89654e0aebfc24f5848ae2547d22c2c3f66454daa11971", size = 102702 }, - { url = "https://files.pythonhosted.org/packages/10/bd/6517ea94f2672e801011d50b5d06be2a0deaf566aea27bcdcd47e5195357/charset_normalizer-3.4.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:ecddf25bee22fe4fe3737a399d0d177d72bc22be6913acfab364b40bce1ba83c", size = 195653 }, - { url = "https://files.pythonhosted.org/packages/e5/0d/815a2ba3f283b4eeaa5ece57acade365c5b4135f65a807a083c818716582/charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c60ca7339acd497a55b0ea5d506b2a2612afb2826560416f6894e8b5770d4a9", size = 140701 }, - { url = "https://files.pythonhosted.org/packages/aa/17/c94be7ee0d142687e047fe1de72060f6d6837f40eedc26e87e6e124a3fc6/charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b7b2d86dd06bfc2ade3312a83a5c364c7ec2e3498f8734282c6c3d4b07b346b8", size = 150495 }, - { url = "https://files.pythonhosted.org/packages/f7/33/557ac796c47165fc141e4fb71d7b0310f67e05cb420756f3a82e0a0068e0/charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dd78cfcda14a1ef52584dbb008f7ac81c1328c0f58184bf9a84c49c605002da6", size = 142946 }, - { url = "https://files.pythonhosted.org/packages/1e/0d/38ef4ae41e9248d63fc4998d933cae22473b1b2ac4122cf908d0f5eb32aa/charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e27f48bcd0957c6d4cb9d6fa6b61d192d0b13d5ef563e5f2ae35feafc0d179c", size = 144737 }, - { url = "https://files.pythonhosted.org/packages/43/01/754cdb29dd0560f58290aaaa284d43eea343ad0512e6ad3b8b5c11f08592/charset_normalizer-3.4.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:01ad647cdd609225c5350561d084b42ddf732f4eeefe6e678765636791e78b9a", size = 147471 }, - { url = "https://files.pythonhosted.org/packages/ba/cd/861883ba5160c7a9bd242c30b2c71074cda2aefcc0addc91118e0d4e0765/charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:619a609aa74ae43d90ed2e89bdd784765de0a25ca761b93e196d938b8fd1dbbd", size = 140801 }, - { url = "https://files.pythonhosted.org/packages/6f/7f/0c0dad447819e90b93f8ed238cc8f11b91353c23c19e70fa80483a155bed/charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:89149166622f4db9b4b6a449256291dc87a99ee53151c74cbd82a53c8c2f6ccd", size = 149312 }, - { url = "https://files.pythonhosted.org/packages/8e/09/9f8abcc6fff60fb727268b63c376c8c79cc37b833c2dfe1f535dfb59523b/charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = 
"sha256:7709f51f5f7c853f0fb938bcd3bc59cdfdc5203635ffd18bf354f6967ea0f824", size = 152347 }, - { url = "https://files.pythonhosted.org/packages/be/e5/3f363dad2e24378f88ccf63ecc39e817c29f32e308ef21a7a6d9c1201165/charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:345b0426edd4e18138d6528aed636de7a9ed169b4aaf9d61a8c19e39d26838ca", size = 149888 }, - { url = "https://files.pythonhosted.org/packages/e4/10/a78c0e91f487b4ad0ef7480ac765e15b774f83de2597f1b6ef0eaf7a2f99/charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:0907f11d019260cdc3f94fbdb23ff9125f6b5d1039b76003b5b0ac9d6a6c9d5b", size = 145169 }, - { url = "https://files.pythonhosted.org/packages/d3/81/396e7d7f5d7420da8273c91175d2e9a3f569288e3611d521685e4b9ac9cc/charset_normalizer-3.4.1-cp38-cp38-win32.whl", hash = "sha256:ea0d8d539afa5eb2728aa1932a988a9a7af94f18582ffae4bc10b3fbdad0626e", size = 95094 }, - { url = "https://files.pythonhosted.org/packages/40/bb/20affbbd9ea29c71ea123769dc568a6d42052ff5089c5fe23e21e21084a6/charset_normalizer-3.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:329ce159e82018d646c7ac45b01a430369d526569ec08516081727a20e9e4af4", size = 102139 }, { url = "https://files.pythonhosted.org/packages/7f/c0/b913f8f02836ed9ab32ea643c6fe4d3325c3d8627cf6e78098671cafff86/charset_normalizer-3.4.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:b97e690a2118911e39b4042088092771b4ae3fc3aa86518f84b8cf6888dbdb41", size = 197867 }, { url = "https://files.pythonhosted.org/packages/0f/6c/2bee440303d705b6fb1e2ec789543edec83d32d258299b16eed28aad48e0/charset_normalizer-3.4.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:78baa6d91634dfb69ec52a463534bc0df05dbd546209b79a3880a34487f4b84f", size = 141385 }, { url = "https://files.pythonhosted.org/packages/3d/04/cb42585f07f6f9fd3219ffb6f37d5a39b4fd2db2355b23683060029c35f7/charset_normalizer-3.4.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1a2bc9f351a75ef49d664206d51f8e5ede9da246602dc2d2726837620ea034b2", size = 151367 }, @@ -351,11 +275,9 @@ wheels = [ [[package]] name = "datafusion" -version = "44.0.0" source = { editable = "." 
} dependencies = [ - { name = "pyarrow", version = "17.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "pyarrow", version = "18.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, + { name = "pyarrow" }, { name = "typing-extensions", marker = "python_full_version < '3.13'" }, ] @@ -369,20 +291,16 @@ dev = [ { name = "toml" }, ] docs = [ - { name = "ipython", version = "8.12.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "ipython", version = "8.18.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, + { name = "ipython", version = "8.18.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, { name = "ipython", version = "8.31.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, { name = "jinja2" }, { name = "myst-parser", version = "3.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, { name = "myst-parser", version = "4.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, - { name = "pandas", version = "2.0.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "pandas", version = "2.2.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, + { name = "pandas" }, { name = "pickleshare" }, { name = "pydata-sphinx-theme" }, - { name = "setuptools", version = "75.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "setuptools", version = "75.8.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, - { name = "sphinx", version = "7.1.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "sphinx", version = "7.4.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, + { name = "setuptools" }, + { name = "sphinx", version = "7.4.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, { name = "sphinx", version = "8.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, { name = "sphinx-autoapi" }, ] @@ -435,28 +353,10 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6e/c6/ac0b6c1e2d138f1002bcf799d330bd6d85084fece321e662a14223794041/Deprecated-1.2.18-py2.py3-none-any.whl", hash = "sha256:bd5011788200372a32418f888e326a09ff80d0214bd961147cfed01b5c018eec", size = 9998 }, ] -[[package]] -name = "docutils" -version = "0.20.1" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/1f/53/a5da4f2c5739cf66290fac1431ee52aff6851c7c8ffd8264f13affd7bcdd/docutils-0.20.1.tar.gz", hash = "sha256:f08a4e276c3a1583a86dce3e34aba3fe04d02bba2dd51ed16106244e8a923e3b", size = 2058365 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/26/87/f238c0670b94533ac0353a4e2a1a771a0cc73277b88bff23d3ae35a256c1/docutils-0.20.1-py3-none-any.whl", hash = "sha256:96f387a2c5562db4476f09f13bbab2192e764cac08ebbf3a34a95d9b1e4a59d6", size = 572666 }, -] - [[package]] name = "docutils" version = "0.21.2" source = { registry = 
"https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/ae/ed/aefcc8cd0ba62a0560c3c18c33925362d46c6075480bfa4df87b28e169a9/docutils-0.21.2.tar.gz", hash = "sha256:3a6b18732edf182daa3cd12775bbb338cf5691468f91eeeb109deff6ebfa986f", size = 2204444 } wheels = [ { url = "https://files.pythonhosted.org/packages/8f/d7/9322c609343d929e75e7e5e6255e614fcc67572cfd083959cdef3b7aad79/docutils-0.21.2-py3-none-any.whl", hash = "sha256:dafca5b9e384f0e419294eb4d2ff9fa826435bf15f15b7bd45723e8ad76811b2", size = 587408 }, @@ -503,8 +403,7 @@ name = "importlib-metadata" version = "8.5.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "zipp", version = "3.20.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "zipp", version = "3.21.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, + { name = "zipp", marker = "python_full_version < '3.10'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/cd/12/33e59336dca5be0c398a7482335911a33aa0e20776128f038019f1a95f1b/importlib_metadata-8.5.0.tar.gz", hash = "sha256:71522656f0abace1d072b9e5481a48f07c138e00f079c38c8f883823f9c26bd7", size = 55304 } wheels = [ @@ -520,52 +419,25 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ef/a6/62565a6e1cf69e10f5727360368e451d4b7f58beeac6173dc9db836a5b46/iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374", size = 5892 }, ] -[[package]] -name = "ipython" -version = "8.12.3" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -dependencies = [ - { name = "appnope", marker = "python_full_version < '3.9' and sys_platform == 'darwin'" }, - { name = "backcall", marker = "python_full_version < '3.9'" }, - { name = "colorama", marker = "python_full_version < '3.9' and sys_platform == 'win32'" }, - { name = "decorator", marker = "python_full_version < '3.9'" }, - { name = "jedi", marker = "python_full_version < '3.9'" }, - { name = "matplotlib-inline", marker = "python_full_version < '3.9'" }, - { name = "pexpect", marker = "python_full_version < '3.9' and sys_platform != 'win32'" }, - { name = "pickleshare", marker = "python_full_version < '3.9'" }, - { name = "prompt-toolkit", marker = "python_full_version < '3.9'" }, - { name = "pygments", marker = "python_full_version < '3.9'" }, - { name = "stack-data", marker = "python_full_version < '3.9'" }, - { name = "traitlets", marker = "python_full_version < '3.9'" }, - { name = "typing-extensions", marker = "python_full_version < '3.9'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/9e/6a/44ef299b1762f5a73841e87fae8a73a8cc8aee538d6dc8c77a5afe1fd2ce/ipython-8.12.3.tar.gz", hash = "sha256:3910c4b54543c2ad73d06579aa771041b7d5707b033bd488669b4cf544e3b363", size = 5470171 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/8d/97/8fe103906cd81bc42d3b0175b5534a9f67dccae47d6451131cf8d0d70bb2/ipython-8.12.3-py3-none-any.whl", hash = "sha256:b0340d46a933d27c657b211a329d0be23793c36595acf9e6ef4164bc01a1804c", size = 798307 }, -] - [[package]] name = "ipython" version = "8.18.1" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version == '3.9.*'", + 
"python_full_version < '3.10'", ] dependencies = [ - { name = "colorama", marker = "python_full_version == '3.9.*' and sys_platform == 'win32'" }, - { name = "decorator", marker = "python_full_version == '3.9.*'" }, - { name = "exceptiongroup", marker = "python_full_version == '3.9.*'" }, - { name = "jedi", marker = "python_full_version == '3.9.*'" }, - { name = "matplotlib-inline", marker = "python_full_version == '3.9.*'" }, - { name = "pexpect", marker = "python_full_version == '3.9.*' and sys_platform != 'win32'" }, - { name = "prompt-toolkit", marker = "python_full_version == '3.9.*'" }, - { name = "pygments", marker = "python_full_version == '3.9.*'" }, - { name = "stack-data", marker = "python_full_version == '3.9.*'" }, - { name = "traitlets", marker = "python_full_version == '3.9.*'" }, - { name = "typing-extensions", marker = "python_full_version == '3.9.*'" }, + { name = "colorama", marker = "python_full_version < '3.10' and sys_platform == 'win32'" }, + { name = "decorator", marker = "python_full_version < '3.10'" }, + { name = "exceptiongroup", marker = "python_full_version < '3.10'" }, + { name = "jedi", marker = "python_full_version < '3.10'" }, + { name = "matplotlib-inline", marker = "python_full_version < '3.10'" }, + { name = "pexpect", marker = "python_full_version < '3.10' and sys_platform != 'win32'" }, + { name = "prompt-toolkit", marker = "python_full_version < '3.10'" }, + { name = "pygments", marker = "python_full_version < '3.10'" }, + { name = "stack-data", marker = "python_full_version < '3.10'" }, + { name = "traitlets", marker = "python_full_version < '3.10'" }, + { name = "typing-extensions", marker = "python_full_version < '3.10'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/b1/b9/3ba6c45a6df813c09a48bac313c22ff83efa26cbb55011218d925a46e2ad/ipython-8.18.1.tar.gz", hash = "sha256:ca6f079bb33457c66e233e4580ebfc4128855b4cf6370dddd73842a9563e8a27", size = 5486330 } wheels = [ @@ -616,8 +488,7 @@ name = "jinja2" version = "3.1.5" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "markupsafe", version = "2.1.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "markupsafe", version = "3.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, + { name = "markupsafe" }, ] sdist = { url = "https://files.pythonhosted.org/packages/af/92/b3130cbbf5591acf9ade8708c365f3238046ac7cb8ccba6e81abccb0ccff/jinja2-3.1.5.tar.gz", hash = "sha256:8fefff8dc3034e27bb80d67c671eb8a9bc424c0ef4c0826edbff304cceff43bb", size = 244674 } wheels = [ @@ -636,77 +507,10 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/42/d7/1ec15b46af6af88f19b8e5ffea08fa375d433c998b8a7639e76935c14f1f/markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1", size = 87528 }, ] -[[package]] -name = "markupsafe" -version = "2.1.5" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/87/5b/aae44c6655f3801e81aa3eef09dbbf012431987ba564d7231722f68df02d/MarkupSafe-2.1.5.tar.gz", hash = "sha256:d283d37a890ba4c1ae73ffadf8046435c76e7bc2247bbb63c00bd1a709c6544b", size = 19384 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e4/54/ad5eb37bf9d51800010a74e4665425831a9db4e7c4e0fde4352e391e808e/MarkupSafe-2.1.5-cp310-cp310-macosx_10_9_universal2.whl", hash = 
"sha256:a17a92de5231666cfbe003f0e4b9b3a7ae3afb1ec2845aadc2bacc93ff85febc", size = 18206 }, - { url = "https://files.pythonhosted.org/packages/6a/4a/a4d49415e600bacae038c67f9fecc1d5433b9d3c71a4de6f33537b89654c/MarkupSafe-2.1.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:72b6be590cc35924b02c78ef34b467da4ba07e4e0f0454a2c5907f473fc50ce5", size = 14079 }, - { url = "https://files.pythonhosted.org/packages/0a/7b/85681ae3c33c385b10ac0f8dd025c30af83c78cec1c37a6aa3b55e67f5ec/MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e61659ba32cf2cf1481e575d0462554625196a1f2fc06a1c777d3f48e8865d46", size = 26620 }, - { url = "https://files.pythonhosted.org/packages/7c/52/2b1b570f6b8b803cef5ac28fdf78c0da318916c7d2fe9402a84d591b394c/MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2174c595a0d73a3080ca3257b40096db99799265e1c27cc5a610743acd86d62f", size = 25818 }, - { url = "https://files.pythonhosted.org/packages/29/fe/a36ba8c7ca55621620b2d7c585313efd10729e63ef81e4e61f52330da781/MarkupSafe-2.1.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ae2ad8ae6ebee9d2d94b17fb62763125f3f374c25618198f40cbb8b525411900", size = 25493 }, - { url = "https://files.pythonhosted.org/packages/60/ae/9c60231cdfda003434e8bd27282b1f4e197ad5a710c14bee8bea8a9ca4f0/MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:075202fa5b72c86ad32dc7d0b56024ebdbcf2048c0ba09f1cde31bfdd57bcfff", size = 30630 }, - { url = "https://files.pythonhosted.org/packages/65/dc/1510be4d179869f5dafe071aecb3f1f41b45d37c02329dfba01ff59e5ac5/MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:598e3276b64aff0e7b3451b72e94fa3c238d452e7ddcd893c3ab324717456bad", size = 29745 }, - { url = "https://files.pythonhosted.org/packages/30/39/8d845dd7d0b0613d86e0ef89549bfb5f61ed781f59af45fc96496e897f3a/MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:fce659a462a1be54d2ffcacea5e3ba2d74daa74f30f5f143fe0c58636e355fdd", size = 30021 }, - { url = "https://files.pythonhosted.org/packages/c7/5c/356a6f62e4f3c5fbf2602b4771376af22a3b16efa74eb8716fb4e328e01e/MarkupSafe-2.1.5-cp310-cp310-win32.whl", hash = "sha256:d9fad5155d72433c921b782e58892377c44bd6252b5af2f67f16b194987338a4", size = 16659 }, - { url = "https://files.pythonhosted.org/packages/69/48/acbf292615c65f0604a0c6fc402ce6d8c991276e16c80c46a8f758fbd30c/MarkupSafe-2.1.5-cp310-cp310-win_amd64.whl", hash = "sha256:bf50cd79a75d181c9181df03572cdce0fbb75cc353bc350712073108cba98de5", size = 17213 }, - { url = "https://files.pythonhosted.org/packages/11/e7/291e55127bb2ae67c64d66cef01432b5933859dfb7d6949daa721b89d0b3/MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:629ddd2ca402ae6dbedfceeba9c46d5f7b2a61d9749597d4307f943ef198fc1f", size = 18219 }, - { url = "https://files.pythonhosted.org/packages/6b/cb/aed7a284c00dfa7c0682d14df85ad4955a350a21d2e3b06d8240497359bf/MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5b7b716f97b52c5a14bffdf688f971b2d5ef4029127f1ad7a513973cfd818df2", size = 14098 }, - { url = "https://files.pythonhosted.org/packages/1c/cf/35fe557e53709e93feb65575c93927942087e9b97213eabc3fe9d5b25a55/MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6ec585f69cec0aa07d945b20805be741395e28ac1627333b1c5b0105962ffced", size = 29014 }, - { url = 
"https://files.pythonhosted.org/packages/97/18/c30da5e7a0e7f4603abfc6780574131221d9148f323752c2755d48abad30/MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b91c037585eba9095565a3556f611e3cbfaa42ca1e865f7b8015fe5c7336d5a5", size = 28220 }, - { url = "https://files.pythonhosted.org/packages/0c/40/2e73e7d532d030b1e41180807a80d564eda53babaf04d65e15c1cf897e40/MarkupSafe-2.1.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7502934a33b54030eaf1194c21c692a534196063db72176b0c4028e140f8f32c", size = 27756 }, - { url = "https://files.pythonhosted.org/packages/18/46/5dca760547e8c59c5311b332f70605d24c99d1303dd9a6e1fc3ed0d73561/MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:0e397ac966fdf721b2c528cf028494e86172b4feba51d65f81ffd65c63798f3f", size = 33988 }, - { url = "https://files.pythonhosted.org/packages/6d/c5/27febe918ac36397919cd4a67d5579cbbfa8da027fa1238af6285bb368ea/MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:c061bb86a71b42465156a3ee7bd58c8c2ceacdbeb95d05a99893e08b8467359a", size = 32718 }, - { url = "https://files.pythonhosted.org/packages/f8/81/56e567126a2c2bc2684d6391332e357589a96a76cb9f8e5052d85cb0ead8/MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:3a57fdd7ce31c7ff06cdfbf31dafa96cc533c21e443d57f5b1ecc6cdc668ec7f", size = 33317 }, - { url = "https://files.pythonhosted.org/packages/00/0b/23f4b2470accb53285c613a3ab9ec19dc944eaf53592cb6d9e2af8aa24cc/MarkupSafe-2.1.5-cp311-cp311-win32.whl", hash = "sha256:397081c1a0bfb5124355710fe79478cdbeb39626492b15d399526ae53422b906", size = 16670 }, - { url = "https://files.pythonhosted.org/packages/b7/a2/c78a06a9ec6d04b3445a949615c4c7ed86a0b2eb68e44e7541b9d57067cc/MarkupSafe-2.1.5-cp311-cp311-win_amd64.whl", hash = "sha256:2b7c57a4dfc4f16f7142221afe5ba4e093e09e728ca65c51f5620c9aaeb9a617", size = 17224 }, - { url = "https://files.pythonhosted.org/packages/53/bd/583bf3e4c8d6a321938c13f49d44024dbe5ed63e0a7ba127e454a66da974/MarkupSafe-2.1.5-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:8dec4936e9c3100156f8a2dc89c4b88d5c435175ff03413b443469c7c8c5f4d1", size = 18215 }, - { url = "https://files.pythonhosted.org/packages/48/d6/e7cd795fc710292c3af3a06d80868ce4b02bfbbf370b7cee11d282815a2a/MarkupSafe-2.1.5-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:3c6b973f22eb18a789b1460b4b91bf04ae3f0c4234a0a6aa6b0a92f6f7b951d4", size = 14069 }, - { url = "https://files.pythonhosted.org/packages/51/b5/5d8ec796e2a08fc814a2c7d2584b55f889a55cf17dd1a90f2beb70744e5c/MarkupSafe-2.1.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ac07bad82163452a6884fe8fa0963fb98c2346ba78d779ec06bd7a6262132aee", size = 29452 }, - { url = "https://files.pythonhosted.org/packages/0a/0d/2454f072fae3b5a137c119abf15465d1771319dfe9e4acbb31722a0fff91/MarkupSafe-2.1.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f5dfb42c4604dddc8e4305050aa6deb084540643ed5804d7455b5df8fe16f5e5", size = 28462 }, - { url = "https://files.pythonhosted.org/packages/2d/75/fd6cb2e68780f72d47e6671840ca517bda5ef663d30ada7616b0462ad1e3/MarkupSafe-2.1.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ea3d8a3d18833cf4304cd2fc9cbb1efe188ca9b5efef2bdac7adc20594a0e46b", size = 27869 }, - { url = 
"https://files.pythonhosted.org/packages/b0/81/147c477391c2750e8fc7705829f7351cf1cd3be64406edcf900dc633feb2/MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:d050b3361367a06d752db6ead6e7edeb0009be66bc3bae0ee9d97fb326badc2a", size = 33906 }, - { url = "https://files.pythonhosted.org/packages/8b/ff/9a52b71839d7a256b563e85d11050e307121000dcebc97df120176b3ad93/MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bec0a414d016ac1a18862a519e54b2fd0fc8bbfd6890376898a6c0891dd82e9f", size = 32296 }, - { url = "https://files.pythonhosted.org/packages/88/07/2dc76aa51b481eb96a4c3198894f38b480490e834479611a4053fbf08623/MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:58c98fee265677f63a4385256a6d7683ab1832f3ddd1e66fe948d5880c21a169", size = 33038 }, - { url = "https://files.pythonhosted.org/packages/96/0c/620c1fb3661858c0e37eb3cbffd8c6f732a67cd97296f725789679801b31/MarkupSafe-2.1.5-cp312-cp312-win32.whl", hash = "sha256:8590b4ae07a35970728874632fed7bd57b26b0102df2d2b233b6d9d82f6c62ad", size = 16572 }, - { url = "https://files.pythonhosted.org/packages/3f/14/c3554d512d5f9100a95e737502f4a2323a1959f6d0d01e0d0997b35f7b10/MarkupSafe-2.1.5-cp312-cp312-win_amd64.whl", hash = "sha256:823b65d8706e32ad2df51ed89496147a42a2a6e01c13cfb6ffb8b1e92bc910bb", size = 17127 }, - { url = "https://files.pythonhosted.org/packages/f8/ff/2c942a82c35a49df5de3a630ce0a8456ac2969691b230e530ac12314364c/MarkupSafe-2.1.5-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:656f7526c69fac7f600bd1f400991cc282b417d17539a1b228617081106feb4a", size = 18192 }, - { url = "https://files.pythonhosted.org/packages/4f/14/6f294b9c4f969d0c801a4615e221c1e084722ea6114ab2114189c5b8cbe0/MarkupSafe-2.1.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:97cafb1f3cbcd3fd2b6fbfb99ae11cdb14deea0736fc2b0952ee177f2b813a46", size = 14072 }, - { url = "https://files.pythonhosted.org/packages/81/d4/fd74714ed30a1dedd0b82427c02fa4deec64f173831ec716da11c51a50aa/MarkupSafe-2.1.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f3fbcb7ef1f16e48246f704ab79d79da8a46891e2da03f8783a5b6fa41a9532", size = 26928 }, - { url = "https://files.pythonhosted.org/packages/c7/bd/50319665ce81bb10e90d1cf76f9e1aa269ea6f7fa30ab4521f14d122a3df/MarkupSafe-2.1.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa9db3f79de01457b03d4f01b34cf91bc0048eb2c3846ff26f66687c2f6d16ab", size = 26106 }, - { url = "https://files.pythonhosted.org/packages/4c/6f/f2b0f675635b05f6afd5ea03c094557bdb8622fa8e673387444fe8d8e787/MarkupSafe-2.1.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffee1f21e5ef0d712f9033568f8344d5da8cc2869dbd08d87c84656e6a2d2f68", size = 25781 }, - { url = "https://files.pythonhosted.org/packages/51/e0/393467cf899b34a9d3678e78961c2c8cdf49fb902a959ba54ece01273fb1/MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:5dedb4db619ba5a2787a94d877bc8ffc0566f92a01c0ef214865e54ecc9ee5e0", size = 30518 }, - { url = "https://files.pythonhosted.org/packages/f6/02/5437e2ad33047290dafced9df741d9efc3e716b75583bbd73a9984f1b6f7/MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:30b600cf0a7ac9234b2638fbc0fb6158ba5bdcdf46aeb631ead21248b9affbc4", size = 29669 }, - { url = "https://files.pythonhosted.org/packages/0e/7d/968284145ffd9d726183ed6237c77938c021abacde4e073020f920e060b2/MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = 
"sha256:8dd717634f5a044f860435c1d8c16a270ddf0ef8588d4887037c5028b859b0c3", size = 29933 }, - { url = "https://files.pythonhosted.org/packages/bf/f3/ecb00fc8ab02b7beae8699f34db9357ae49d9f21d4d3de6f305f34fa949e/MarkupSafe-2.1.5-cp38-cp38-win32.whl", hash = "sha256:daa4ee5a243f0f20d528d939d06670a298dd39b1ad5f8a72a4275124a7819eff", size = 16656 }, - { url = "https://files.pythonhosted.org/packages/92/21/357205f03514a49b293e214ac39de01fadd0970a6e05e4bf1ddd0ffd0881/MarkupSafe-2.1.5-cp38-cp38-win_amd64.whl", hash = "sha256:619bc166c4f2de5caa5a633b8b7326fbe98e0ccbfacabd87268a2b15ff73a029", size = 17206 }, - { url = "https://files.pythonhosted.org/packages/0f/31/780bb297db036ba7b7bbede5e1d7f1e14d704ad4beb3ce53fb495d22bc62/MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7a68b554d356a91cce1236aa7682dc01df0edba8d043fd1ce607c49dd3c1edcf", size = 18193 }, - { url = "https://files.pythonhosted.org/packages/6c/77/d77701bbef72892affe060cdacb7a2ed7fd68dae3b477a8642f15ad3b132/MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:db0b55e0f3cc0be60c1f19efdde9a637c32740486004f20d1cff53c3c0ece4d2", size = 14073 }, - { url = "https://files.pythonhosted.org/packages/d9/a7/1e558b4f78454c8a3a0199292d96159eb4d091f983bc35ef258314fe7269/MarkupSafe-2.1.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e53af139f8579a6d5f7b76549125f0d94d7e630761a2111bc431fd820e163b8", size = 26486 }, - { url = "https://files.pythonhosted.org/packages/5f/5a/360da85076688755ea0cceb92472923086993e86b5613bbae9fbc14136b0/MarkupSafe-2.1.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:17b950fccb810b3293638215058e432159d2b71005c74371d784862b7e4683f3", size = 25685 }, - { url = "https://files.pythonhosted.org/packages/6a/18/ae5a258e3401f9b8312f92b028c54d7026a97ec3ab20bfaddbdfa7d8cce8/MarkupSafe-2.1.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4c31f53cdae6ecfa91a77820e8b151dba54ab528ba65dfd235c80b086d68a465", size = 25338 }, - { url = "https://files.pythonhosted.org/packages/0b/cc/48206bd61c5b9d0129f4d75243b156929b04c94c09041321456fd06a876d/MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:bff1b4290a66b490a2f4719358c0cdcd9bafb6b8f061e45c7a2460866bf50c2e", size = 30439 }, - { url = "https://files.pythonhosted.org/packages/d1/06/a41c112ab9ffdeeb5f77bc3e331fdadf97fa65e52e44ba31880f4e7f983c/MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:bc1667f8b83f48511b94671e0e441401371dfd0f0a795c7daa4a3cd1dde55bea", size = 29531 }, - { url = "https://files.pythonhosted.org/packages/02/8c/ab9a463301a50dab04d5472e998acbd4080597abc048166ded5c7aa768c8/MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5049256f536511ee3f7e1b3f87d1d1209d327e818e6ae1365e8653d7e3abb6a6", size = 29823 }, - { url = "https://files.pythonhosted.org/packages/bc/29/9bc18da763496b055d8e98ce476c8e718dcfd78157e17f555ce6dd7d0895/MarkupSafe-2.1.5-cp39-cp39-win32.whl", hash = "sha256:00e046b6dd71aa03a41079792f8473dc494d564611a8f89bbbd7cb93295ebdcf", size = 16658 }, - { url = "https://files.pythonhosted.org/packages/f6/f8/4da07de16f10551ca1f640c92b5f316f9394088b183c6a57183df6de5ae4/MarkupSafe-2.1.5-cp39-cp39-win_amd64.whl", hash = "sha256:fa173ec60341d6bb97a89f5ea19c85c5643c1e7dedebc22f5181eb73573142c5", size = 17211 }, -] - [[package]] name = "markupsafe" version = "3.0.2" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - 
"python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/b2/97/5d42485e71dfc078108a86d6de8fa46db44a1a9295e89c5d6d4a06e23a62/markupsafe-3.0.2.tar.gz", hash = "sha256:ee55d3edf80167e48ea11a923c7386f4669df67d7994554387f84e7d8b0a2bf0", size = 20537 } wheels = [ { url = "https://files.pythonhosted.org/packages/04/90/d08277ce111dd22f77149fd1a5d4653eeb3b3eaacbdfcbae5afb2600eebd/MarkupSafe-3.0.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7e94c425039cde14257288fd61dcfb01963e658efbc0ff54f5306b06054700f8", size = 14357 }, @@ -832,18 +636,15 @@ name = "myst-parser" version = "3.0.1" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version == '3.9.*'", - "python_full_version < '3.9'", + "python_full_version < '3.10'", ] dependencies = [ - { name = "docutils", version = "0.20.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "docutils", version = "0.21.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, + { name = "docutils", marker = "python_full_version < '3.10'" }, { name = "jinja2", marker = "python_full_version < '3.10'" }, { name = "markdown-it-py", marker = "python_full_version < '3.10'" }, { name = "mdit-py-plugins", marker = "python_full_version < '3.10'" }, { name = "pyyaml", marker = "python_full_version < '3.10'" }, - { name = "sphinx", version = "7.1.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "sphinx", version = "7.4.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, + { name = "sphinx", version = "7.4.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/49/64/e2f13dac02f599980798c01156393b781aec983b52a6e4057ee58f07c43a/myst_parser-3.0.1.tar.gz", hash = "sha256:88f0cb406cb363b077d176b51c476f62d60604d68a8dcdf4832e080441301a87", size = 92392 } wheels = [ @@ -860,7 +661,7 @@ resolution-markers = [ "python_full_version == '3.10.*'", ] dependencies = [ - { name = "docutils", version = "0.21.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "docutils", marker = "python_full_version >= '3.10'" }, { name = "jinja2", marker = "python_full_version >= '3.10'" }, { name = "markdown-it-py", marker = "python_full_version >= '3.10'" }, { name = "mdit-py-plugins", marker = "python_full_version >= '3.10'" }, @@ -872,50 +673,12 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ca/b4/b036f8fdb667587bb37df29dc6644681dd78b7a2a6321a34684b79412b28/myst_parser-4.0.0-py3-none-any.whl", hash = "sha256:b9317997552424448c6096c2558872fdb6f81d3ecb3a40ce84a7518798f3f28d", size = 84563 }, ] -[[package]] -name = "numpy" -version = "1.24.4" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/a4/9b/027bec52c633f6556dba6b722d9a0befb40498b9ceddd29cbe67a45a127c/numpy-1.24.4.tar.gz", hash = "sha256:80f5e3a4e498641401868df4208b74581206afbee7cf7b8329daae82676d9463", size = 10911229 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/6b/80/6cdfb3e275d95155a34659163b83c09e3a3ff9f1456880bec6cc63d71083/numpy-1.24.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = 
"sha256:c0bfb52d2169d58c1cdb8cc1f16989101639b34c7d3ce60ed70b19c63eba0b64", size = 19789140 }, - { url = "https://files.pythonhosted.org/packages/64/5f/3f01d753e2175cfade1013eea08db99ba1ee4bdb147ebcf3623b75d12aa7/numpy-1.24.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ed094d4f0c177b1b8e7aa9cba7d6ceed51c0e569a5318ac0ca9a090680a6a1b1", size = 13854297 }, - { url = "https://files.pythonhosted.org/packages/5a/b3/2f9c21d799fa07053ffa151faccdceeb69beec5a010576b8991f614021f7/numpy-1.24.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79fc682a374c4a8ed08b331bef9c5f582585d1048fa6d80bc6c35bc384eee9b4", size = 13995611 }, - { url = "https://files.pythonhosted.org/packages/10/be/ae5bf4737cb79ba437879915791f6f26d92583c738d7d960ad94e5c36adf/numpy-1.24.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ffe43c74893dbf38c2b0a1f5428760a1a9c98285553c89e12d70a96a7f3a4d6", size = 17282357 }, - { url = "https://files.pythonhosted.org/packages/c0/64/908c1087be6285f40e4b3e79454552a701664a079321cff519d8c7051d06/numpy-1.24.4-cp310-cp310-win32.whl", hash = "sha256:4c21decb6ea94057331e111a5bed9a79d335658c27ce2adb580fb4d54f2ad9bc", size = 12429222 }, - { url = "https://files.pythonhosted.org/packages/22/55/3d5a7c1142e0d9329ad27cece17933b0e2ab4e54ddc5c1861fbfeb3f7693/numpy-1.24.4-cp310-cp310-win_amd64.whl", hash = "sha256:b4bea75e47d9586d31e892a7401f76e909712a0fd510f58f5337bea9572c571e", size = 14841514 }, - { url = "https://files.pythonhosted.org/packages/a9/cc/5ed2280a27e5dab12994c884f1f4d8c3bd4d885d02ae9e52a9d213a6a5e2/numpy-1.24.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f136bab9c2cfd8da131132c2cf6cc27331dd6fae65f95f69dcd4ae3c3639c810", size = 19775508 }, - { url = "https://files.pythonhosted.org/packages/c0/bc/77635c657a3668cf652806210b8662e1aff84b818a55ba88257abf6637a8/numpy-1.24.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e2926dac25b313635e4d6cf4dc4e51c8c0ebfed60b801c799ffc4c32bf3d1254", size = 13840033 }, - { url = "https://files.pythonhosted.org/packages/a7/4c/96cdaa34f54c05e97c1c50f39f98d608f96f0677a6589e64e53104e22904/numpy-1.24.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:222e40d0e2548690405b0b3c7b21d1169117391c2e82c378467ef9ab4c8f0da7", size = 13991951 }, - { url = "https://files.pythonhosted.org/packages/22/97/dfb1a31bb46686f09e68ea6ac5c63fdee0d22d7b23b8f3f7ea07712869ef/numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7215847ce88a85ce39baf9e89070cb860c98fdddacbaa6c0da3ffb31b3350bd5", size = 17278923 }, - { url = "https://files.pythonhosted.org/packages/35/e2/76a11e54139654a324d107da1d98f99e7aa2a7ef97cfd7c631fba7dbde71/numpy-1.24.4-cp311-cp311-win32.whl", hash = "sha256:4979217d7de511a8d57f4b4b5b2b965f707768440c17cb70fbf254c4b225238d", size = 12422446 }, - { url = "https://files.pythonhosted.org/packages/d8/ec/ebef2f7d7c28503f958f0f8b992e7ce606fb74f9e891199329d5f5f87404/numpy-1.24.4-cp311-cp311-win_amd64.whl", hash = "sha256:b7b1fc9864d7d39e28f41d089bfd6353cb5f27ecd9905348c24187a768c79694", size = 14834466 }, - { url = "https://files.pythonhosted.org/packages/11/10/943cfb579f1a02909ff96464c69893b1d25be3731b5d3652c2e0cf1281ea/numpy-1.24.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1452241c290f3e2a312c137a9999cdbf63f78864d63c79039bda65ee86943f61", size = 19780722 }, - { url = "https://files.pythonhosted.org/packages/a7/ae/f53b7b265fdc701e663fbb322a8e9d4b14d9cb7b2385f45ddfabfc4327e4/numpy-1.24.4-cp38-cp38-macosx_11_0_arm64.whl", hash = 
"sha256:04640dab83f7c6c85abf9cd729c5b65f1ebd0ccf9de90b270cd61935eef0197f", size = 13843102 }, - { url = "https://files.pythonhosted.org/packages/25/6f/2586a50ad72e8dbb1d8381f837008a0321a3516dfd7cb57fc8cf7e4bb06b/numpy-1.24.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5425b114831d1e77e4b5d812b69d11d962e104095a5b9c3b641a218abcc050e", size = 14039616 }, - { url = "https://files.pythonhosted.org/packages/98/5d/5738903efe0ecb73e51eb44feafba32bdba2081263d40c5043568ff60faf/numpy-1.24.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd80e219fd4c71fc3699fc1dadac5dcf4fd882bfc6f7ec53d30fa197b8ee22dc", size = 17316263 }, - { url = "https://files.pythonhosted.org/packages/d1/57/8d328f0b91c733aa9aa7ee540dbc49b58796c862b4fbcb1146c701e888da/numpy-1.24.4-cp38-cp38-win32.whl", hash = "sha256:4602244f345453db537be5314d3983dbf5834a9701b7723ec28923e2889e0bb2", size = 12455660 }, - { url = "https://files.pythonhosted.org/packages/69/65/0d47953afa0ad569d12de5f65d964321c208492064c38fe3b0b9744f8d44/numpy-1.24.4-cp38-cp38-win_amd64.whl", hash = "sha256:692f2e0f55794943c5bfff12b3f56f99af76f902fc47487bdfe97856de51a706", size = 14868112 }, - { url = "https://files.pythonhosted.org/packages/9a/cd/d5b0402b801c8a8b56b04c1e85c6165efab298d2f0ab741c2406516ede3a/numpy-1.24.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2541312fbf09977f3b3ad449c4e5f4bb55d0dbf79226d7724211acc905049400", size = 19816549 }, - { url = "https://files.pythonhosted.org/packages/14/27/638aaa446f39113a3ed38b37a66243e21b38110d021bfcb940c383e120f2/numpy-1.24.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9667575fb6d13c95f1b36aca12c5ee3356bf001b714fc354eb5465ce1609e62f", size = 13879950 }, - { url = "https://files.pythonhosted.org/packages/8f/27/91894916e50627476cff1a4e4363ab6179d01077d71b9afed41d9e1f18bf/numpy-1.24.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3a86ed21e4f87050382c7bc96571755193c4c1392490744ac73d660e8f564a9", size = 14030228 }, - { url = "https://files.pythonhosted.org/packages/7a/7c/d7b2a0417af6428440c0ad7cb9799073e507b1a465f827d058b826236964/numpy-1.24.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d11efb4dbecbdf22508d55e48d9c8384db795e1b7b51ea735289ff96613ff74d", size = 17311170 }, - { url = "https://files.pythonhosted.org/packages/18/9d/e02ace5d7dfccee796c37b995c63322674daf88ae2f4a4724c5dd0afcc91/numpy-1.24.4-cp39-cp39-win32.whl", hash = "sha256:6620c0acd41dbcb368610bb2f4d83145674040025e5536954782467100aa8835", size = 12454918 }, - { url = "https://files.pythonhosted.org/packages/63/38/6cc19d6b8bfa1d1a459daf2b3fe325453153ca7019976274b6f33d8b5663/numpy-1.24.4-cp39-cp39-win_amd64.whl", hash = "sha256:befe2bf740fd8373cf56149a5c23a0f601e82869598d41f8e188a0e9869926f8", size = 14867441 }, - { url = "https://files.pythonhosted.org/packages/a4/fd/8dff40e25e937c94257455c237b9b6bf5a30d42dd1cc11555533be099492/numpy-1.24.4-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:31f13e25b4e304632a4619d0e0777662c2ffea99fcae2029556b17d8ff958aef", size = 19156590 }, - { url = "https://files.pythonhosted.org/packages/42/e7/4bf953c6e05df90c6d351af69966384fed8e988d0e8c54dad7103b59f3ba/numpy-1.24.4-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95f7ac6540e95bc440ad77f56e520da5bf877f87dca58bd095288dce8940532a", size = 16705744 }, - { url = "https://files.pythonhosted.org/packages/fc/dd/9106005eb477d022b60b3817ed5937a43dad8fd1f20b0610ea8a32fcb407/numpy-1.24.4-pp38-pypy38_pp73-win_amd64.whl", 
hash = "sha256:e98f220aa76ca2a977fe435f5b04d7b3470c0a2e6312907b37ba6068f26787f2", size = 14734290 }, -] - [[package]] name = "numpy" version = "2.0.2" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version == '3.9.*'", + "python_full_version < '3.10'", ] sdist = { url = "https://files.pythonhosted.org/packages/a9/75/10dd1f8116a8b796cb2c737b674e02d02e80454bda953fa7e65d8c12b016/numpy-2.0.2.tar.gz", hash = "sha256:883c987dee1880e2a864ab0dc9892292582510604156762362d9326444636e78", size = 18902015 } wheels = [ @@ -1041,63 +804,16 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/88/ef/eb23f262cca3c0c4eb7ab1933c3b1f03d021f2c48f54763065b6f0e321be/packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759", size = 65451 }, ] -[[package]] -name = "pandas" -version = "2.0.3" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -dependencies = [ - { name = "numpy", version = "1.24.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "python-dateutil", marker = "python_full_version < '3.9'" }, - { name = "pytz", marker = "python_full_version < '3.9'" }, - { name = "tzdata", marker = "python_full_version < '3.9'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/b1/a7/824332581e258b5aa4f3763ecb2a797e5f9a54269044ba2e50ac19936b32/pandas-2.0.3.tar.gz", hash = "sha256:c02f372a88e0d17f36d3093a644c73cfc1788e876a7c4bcb4020a77512e2043c", size = 5284455 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/3c/b2/0d4a5729ce1ce11630c4fc5d5522a33b967b3ca146c210f58efde7c40e99/pandas-2.0.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e4c7c9f27a4185304c7caf96dc7d91bc60bc162221152de697c98eb0b2648dd8", size = 11760908 }, - { url = "https://files.pythonhosted.org/packages/4a/f6/f620ca62365d83e663a255a41b08d2fc2eaf304e0b8b21bb6d62a7390fe3/pandas-2.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f167beed68918d62bffb6ec64f2e1d8a7d297a038f86d4aed056b9493fca407f", size = 10823486 }, - { url = "https://files.pythonhosted.org/packages/c2/59/cb4234bc9b968c57e81861b306b10cd8170272c57b098b724d3de5eda124/pandas-2.0.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce0c6f76a0f1ba361551f3e6dceaff06bde7514a374aa43e33b588ec10420183", size = 11571897 }, - { url = "https://files.pythonhosted.org/packages/e3/59/35a2892bf09ded9c1bf3804461efe772836a5261ef5dfb4e264ce813ff99/pandas-2.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba619e410a21d8c387a1ea6e8a0e49bb42216474436245718d7f2e88a2f8d7c0", size = 12306421 }, - { url = "https://files.pythonhosted.org/packages/94/71/3a0c25433c54bb29b48e3155b959ac78f4c4f2f06f94d8318aac612cb80f/pandas-2.0.3-cp310-cp310-win32.whl", hash = "sha256:3ef285093b4fe5058eefd756100a367f27029913760773c8bf1d2d8bebe5d210", size = 9540792 }, - { url = "https://files.pythonhosted.org/packages/ed/30/b97456e7063edac0e5a405128065f0cd2033adfe3716fb2256c186bd41d0/pandas-2.0.3-cp310-cp310-win_amd64.whl", hash = "sha256:9ee1a69328d5c36c98d8e74db06f4ad518a1840e8ccb94a4ba86920986bb617e", size = 10664333 }, - { url = "https://files.pythonhosted.org/packages/b3/92/a5e5133421b49e901a12e02a6a7ef3a0130e10d13db8cb657fdd0cba3b90/pandas-2.0.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b084b91d8d66ab19f5bb3256cbd5ea661848338301940e17f4492b2ce0801fe8", size = 11645672 }, - { url = 
"https://files.pythonhosted.org/packages/8f/bb/aea1fbeed5b474cb8634364718abe9030d7cc7a30bf51f40bd494bbc89a2/pandas-2.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:37673e3bdf1551b95bf5d4ce372b37770f9529743d2498032439371fc7b7eb26", size = 10693229 }, - { url = "https://files.pythonhosted.org/packages/d6/90/e7d387f1a416b14e59290baa7a454a90d719baebbf77433ff1bdcc727800/pandas-2.0.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b9cb1e14fdb546396b7e1b923ffaeeac24e4cedd14266c3497216dd4448e4f2d", size = 11581591 }, - { url = "https://files.pythonhosted.org/packages/d0/28/88b81881c056376254618fad622a5e94b5126db8c61157ea1910cd1c040a/pandas-2.0.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d9cd88488cceb7635aebb84809d087468eb33551097d600c6dad13602029c2df", size = 12219370 }, - { url = "https://files.pythonhosted.org/packages/e4/a5/212b9039e25bf8ebb97e417a96660e3dc925dacd3f8653d531b8f7fd9be4/pandas-2.0.3-cp311-cp311-win32.whl", hash = "sha256:694888a81198786f0e164ee3a581df7d505024fbb1f15202fc7db88a71d84ebd", size = 9482935 }, - { url = "https://files.pythonhosted.org/packages/9e/71/756a1be6bee0209d8c0d8c5e3b9fc72c00373f384a4017095ec404aec3ad/pandas-2.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:6a21ab5c89dcbd57f78d0ae16630b090eec626360085a4148693def5452d8a6b", size = 10607692 }, - { url = "https://files.pythonhosted.org/packages/78/a8/07dd10f90ca915ed914853cd57f79bfc22e1ef4384ab56cb4336d2fc1f2a/pandas-2.0.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9e4da0d45e7f34c069fe4d522359df7d23badf83abc1d1cef398895822d11061", size = 11653303 }, - { url = "https://files.pythonhosted.org/packages/53/c3/f8e87361f7fdf42012def602bfa2a593423c729f5cb7c97aed7f51be66ac/pandas-2.0.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:32fca2ee1b0d93dd71d979726b12b61faa06aeb93cf77468776287f41ff8fdc5", size = 10710932 }, - { url = "https://files.pythonhosted.org/packages/a7/87/828d50c81ce0f434163bf70b925a0eec6076808e0bca312a79322b141f66/pandas-2.0.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:258d3624b3ae734490e4d63c430256e716f488c4fcb7c8e9bde2d3aa46c29089", size = 11684018 }, - { url = "https://files.pythonhosted.org/packages/f8/7f/5b047effafbdd34e52c9e2d7e44f729a0655efafb22198c45cf692cdc157/pandas-2.0.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9eae3dc34fa1aa7772dd3fc60270d13ced7346fcbcfee017d3132ec625e23bb0", size = 12353723 }, - { url = "https://files.pythonhosted.org/packages/ea/ae/26a2eda7fa581347d69e51f93892493b2074ef3352ac71033c9f32c52389/pandas-2.0.3-cp38-cp38-win32.whl", hash = "sha256:f3421a7afb1a43f7e38e82e844e2bca9a6d793d66c1a7f9f0ff39a795bbc5e02", size = 9646403 }, - { url = "https://files.pythonhosted.org/packages/c3/6c/ea362eef61f05553aaf1a24b3e96b2d0603f5dc71a3bd35688a24ed88843/pandas-2.0.3-cp38-cp38-win_amd64.whl", hash = "sha256:69d7f3884c95da3a31ef82b7618af5710dba95bb885ffab339aad925c3e8ce78", size = 10777638 }, - { url = "https://files.pythonhosted.org/packages/f8/c7/cfef920b7b457dff6928e824896cb82367650ea127d048ee0b820026db4f/pandas-2.0.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5247fb1ba347c1261cbbf0fcfba4a3121fbb4029d95d9ef4dc45406620b25c8b", size = 11834160 }, - { url = "https://files.pythonhosted.org/packages/6c/1c/689c9d99bc4e5d366a5fd871f0bcdee98a6581e240f96b78d2d08f103774/pandas-2.0.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:81af086f4543c9d8bb128328b5d32e9986e0c84d3ee673a2ac6fb57fd14f755e", size = 10862752 }, - { url = 
"https://files.pythonhosted.org/packages/cc/b8/4d082f41c27c95bf90485d1447b647cc7e5680fea75e315669dc6e4cb398/pandas-2.0.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1994c789bf12a7c5098277fb43836ce090f1073858c10f9220998ac74f37c69b", size = 11715852 }, - { url = "https://files.pythonhosted.org/packages/9e/0d/91a9fd2c202f2b1d97a38ab591890f86480ecbb596cbc56d035f6f23fdcc/pandas-2.0.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ec591c48e29226bcbb316e0c1e9423622bc7a4eaf1ef7c3c9fa1a3981f89641", size = 12398496 }, - { url = "https://files.pythonhosted.org/packages/26/7d/d8aa0a2c4f3f5f8ea59fb946c8eafe8f508090ca73e2b08a9af853c1103e/pandas-2.0.3-cp39-cp39-win32.whl", hash = "sha256:04dbdbaf2e4d46ca8da896e1805bc04eb85caa9a82e259e8eed00254d5e0c682", size = 9630766 }, - { url = "https://files.pythonhosted.org/packages/9a/f2/0ad053856debbe90c83de1b4f05915f85fd2146f20faf9daa3b320d36df3/pandas-2.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:1168574b036cd8b93abc746171c9b4f1b83467438a5e45909fed645cf8692dbc", size = 10755902 }, -] - [[package]] name = "pandas" version = "2.2.3" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] dependencies = [ - { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, + { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, { name = "numpy", version = "2.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, - { name = "python-dateutil", marker = "python_full_version >= '3.9'" }, - { name = "pytz", marker = "python_full_version >= '3.9'" }, - { name = "tzdata", marker = "python_full_version >= '3.9'" }, + { name = "python-dateutil" }, + { name = "pytz" }, + { name = "tzdata" }, ] sdist = { url = "https://files.pythonhosted.org/packages/9c/d6/9f8431bacc2e19dca897724cd097b1bb224a6ad5433784a44b587c7c13af/pandas-2.2.3.tar.gz", hash = "sha256:4f18ba62b61d7e192368b84517265a99b4d7ee8912f8708660fb4a366cc82667", size = 4399213 } wheels = [ @@ -1213,65 +929,10 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0", size = 11842 }, ] -[[package]] -name = "pyarrow" -version = "17.0.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -dependencies = [ - { name = "numpy", version = "1.24.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/27/4e/ea6d43f324169f8aec0e57569443a38bab4b398d09769ca64f7b4d467de3/pyarrow-17.0.0.tar.gz", hash = "sha256:4beca9521ed2c0921c1023e68d097d0299b62c362639ea315572a58f3f50fd28", size = 1112479 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/39/5d/78d4b040bc5ff2fc6c3d03e80fca396b742f6c125b8af06bcf7427f931bc/pyarrow-17.0.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:a5c8b238d47e48812ee577ee20c9a2779e6a5904f1708ae240f53ecbee7c9f07", size = 28994846 }, - { url = 
"https://files.pythonhosted.org/packages/3b/73/8ed168db7642e91180330e4ea9f3ff8bab404678f00d32d7df0871a4933b/pyarrow-17.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:db023dc4c6cae1015de9e198d41250688383c3f9af8f565370ab2b4cb5f62655", size = 27165908 }, - { url = "https://files.pythonhosted.org/packages/81/36/e78c24be99242063f6d0590ef68c857ea07bdea470242c361e9a15bd57a4/pyarrow-17.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da1e060b3876faa11cee287839f9cc7cdc00649f475714b8680a05fd9071d545", size = 39264209 }, - { url = "https://files.pythonhosted.org/packages/18/4c/3db637d7578f683b0a8fb8999b436bdbedd6e3517bd4f90c70853cf3ad20/pyarrow-17.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75c06d4624c0ad6674364bb46ef38c3132768139ddec1c56582dbac54f2663e2", size = 39862883 }, - { url = "https://files.pythonhosted.org/packages/81/3c/0580626896c842614a523e66b351181ed5bb14e5dfc263cd68cea2c46d90/pyarrow-17.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:fa3c246cc58cb5a4a5cb407a18f193354ea47dd0648194e6265bd24177982fe8", size = 38723009 }, - { url = "https://files.pythonhosted.org/packages/ee/fb/c1b47f0ada36d856a352da261a44d7344d8f22e2f7db3945f8c3b81be5dd/pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:f7ae2de664e0b158d1607699a16a488de3d008ba99b3a7aa5de1cbc13574d047", size = 39855626 }, - { url = "https://files.pythonhosted.org/packages/19/09/b0a02908180a25d57312ab5919069c39fddf30602568980419f4b02393f6/pyarrow-17.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:5984f416552eea15fd9cee03da53542bf4cddaef5afecefb9aa8d1010c335087", size = 25147242 }, - { url = "https://files.pythonhosted.org/packages/f9/46/ce89f87c2936f5bb9d879473b9663ce7a4b1f4359acc2f0eb39865eaa1af/pyarrow-17.0.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:1c8856e2ef09eb87ecf937104aacfa0708f22dfeb039c363ec99735190ffb977", size = 29028748 }, - { url = "https://files.pythonhosted.org/packages/8d/8e/ce2e9b2146de422f6638333c01903140e9ada244a2a477918a368306c64c/pyarrow-17.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e19f569567efcbbd42084e87f948778eb371d308e137a0f97afe19bb860ccb3", size = 27190965 }, - { url = "https://files.pythonhosted.org/packages/3b/c8/5675719570eb1acd809481c6d64e2136ffb340bc387f4ca62dce79516cea/pyarrow-17.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b244dc8e08a23b3e352899a006a26ae7b4d0da7bb636872fa8f5884e70acf15", size = 39269081 }, - { url = "https://files.pythonhosted.org/packages/5e/78/3931194f16ab681ebb87ad252e7b8d2c8b23dad49706cadc865dff4a1dd3/pyarrow-17.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b72e87fe3e1db343995562f7fff8aee354b55ee83d13afba65400c178ab2597", size = 39864921 }, - { url = "https://files.pythonhosted.org/packages/d8/81/69b6606093363f55a2a574c018901c40952d4e902e670656d18213c71ad7/pyarrow-17.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:dc5c31c37409dfbc5d014047817cb4ccd8c1ea25d19576acf1a001fe07f5b420", size = 38740798 }, - { url = "https://files.pythonhosted.org/packages/4c/21/9ca93b84b92ef927814cb7ba37f0774a484c849d58f0b692b16af8eebcfb/pyarrow-17.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:e3343cb1e88bc2ea605986d4b94948716edc7a8d14afd4e2c097232f729758b4", size = 39871877 }, - { url = "https://files.pythonhosted.org/packages/30/d1/63a7c248432c71c7d3ee803e706590a0b81ce1a8d2b2ae49677774b813bb/pyarrow-17.0.0-cp311-cp311-win_amd64.whl", hash = 
"sha256:a27532c38f3de9eb3e90ecab63dfda948a8ca859a66e3a47f5f42d1e403c4d03", size = 25151089 }, - { url = "https://files.pythonhosted.org/packages/d4/62/ce6ac1275a432b4a27c55fe96c58147f111d8ba1ad800a112d31859fae2f/pyarrow-17.0.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:9b8a823cea605221e61f34859dcc03207e52e409ccf6354634143e23af7c8d22", size = 29019418 }, - { url = "https://files.pythonhosted.org/packages/8e/0a/dbd0c134e7a0c30bea439675cc120012337202e5fac7163ba839aa3691d2/pyarrow-17.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f1e70de6cb5790a50b01d2b686d54aaf73da01266850b05e3af2a1bc89e16053", size = 27152197 }, - { url = "https://files.pythonhosted.org/packages/cb/05/3f4a16498349db79090767620d6dc23c1ec0c658a668d61d76b87706c65d/pyarrow-17.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0071ce35788c6f9077ff9ecba4858108eebe2ea5a3f7cf2cf55ebc1dbc6ee24a", size = 39263026 }, - { url = "https://files.pythonhosted.org/packages/c2/0c/ea2107236740be8fa0e0d4a293a095c9f43546a2465bb7df34eee9126b09/pyarrow-17.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:757074882f844411fcca735e39aae74248a1531367a7c80799b4266390ae51cc", size = 39880798 }, - { url = "https://files.pythonhosted.org/packages/f6/b0/b9164a8bc495083c10c281cc65064553ec87b7537d6f742a89d5953a2a3e/pyarrow-17.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:9ba11c4f16976e89146781a83833df7f82077cdab7dc6232c897789343f7891a", size = 38715172 }, - { url = "https://files.pythonhosted.org/packages/f1/c4/9625418a1413005e486c006e56675334929fad864347c5ae7c1b2e7fe639/pyarrow-17.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b0c6ac301093b42d34410b187bba560b17c0330f64907bfa4f7f7f2444b0cf9b", size = 39874508 }, - { url = "https://files.pythonhosted.org/packages/ae/49/baafe2a964f663413be3bd1cf5c45ed98c5e42e804e2328e18f4570027c1/pyarrow-17.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:392bc9feabc647338e6c89267635e111d71edad5fcffba204425a7c8d13610d7", size = 25099235 }, - { url = "https://files.pythonhosted.org/packages/8d/bd/8f52c1d7b430260f80a349cffa2df351750a737b5336313d56dcadeb9ae1/pyarrow-17.0.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:af5ff82a04b2171415f1410cff7ebb79861afc5dae50be73ce06d6e870615204", size = 28999345 }, - { url = "https://files.pythonhosted.org/packages/64/d9/51e35550f2f18b8815a2ab25948f735434db32000c0e91eba3a32634782a/pyarrow-17.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:edca18eaca89cd6382dfbcff3dd2d87633433043650c07375d095cd3517561d8", size = 27168441 }, - { url = "https://files.pythonhosted.org/packages/18/d8/7161d87d07ea51be70c49f615004c1446d5723622a18b2681f7e4b71bf6e/pyarrow-17.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7c7916bff914ac5d4a8fe25b7a25e432ff921e72f6f2b7547d1e325c1ad9d155", size = 39363163 }, - { url = "https://files.pythonhosted.org/packages/3f/08/bc497130789833de09e345e3ce4647e3ce86517c4f70f2144f0367ca378b/pyarrow-17.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f553ca691b9e94b202ff741bdd40f6ccb70cdd5fbf65c187af132f1317de6145", size = 39965253 }, - { url = "https://files.pythonhosted.org/packages/d3/2e/493dd7db889402b4c7871ca7dfdd20f2c5deedbff802d3eb8576359930f9/pyarrow-17.0.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:0cdb0e627c86c373205a2f94a510ac4376fdc523f8bb36beab2e7f204416163c", size = 38805378 }, - { url = 
"https://files.pythonhosted.org/packages/e6/c1/4c6bcdf7a820034aa91a8b4d25fef38809be79b42ca7aaa16d4680b0bbac/pyarrow-17.0.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:d7d192305d9d8bc9082d10f361fc70a73590a4c65cf31c3e6926cd72b76bc35c", size = 39958364 }, - { url = "https://files.pythonhosted.org/packages/d1/db/42ac644453cfdfc60fe002b46d647fe7a6dfad753ef7b28e99b4c936ad5d/pyarrow-17.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:02dae06ce212d8b3244dd3e7d12d9c4d3046945a5933d28026598e9dbbda1fca", size = 25229211 }, - { url = "https://files.pythonhosted.org/packages/43/e0/a898096d35be240aa61fb2d54db58b86d664b10e1e51256f9300f47565e8/pyarrow-17.0.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:13d7a460b412f31e4c0efa1148e1d29bdf18ad1411eb6757d38f8fbdcc8645fb", size = 29007881 }, - { url = "https://files.pythonhosted.org/packages/59/22/f7d14907ed0697b5dd488d393129f2738629fa5bcba863e00931b7975946/pyarrow-17.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9b564a51fbccfab5a04a80453e5ac6c9954a9c5ef2890d1bcf63741909c3f8df", size = 27178117 }, - { url = "https://files.pythonhosted.org/packages/bf/ee/661211feac0ed48467b1d5c57298c91403809ec3ab78b1d175e1d6ad03cf/pyarrow-17.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:32503827abbc5aadedfa235f5ece8c4f8f8b0a3cf01066bc8d29de7539532687", size = 39273896 }, - { url = "https://files.pythonhosted.org/packages/af/61/bcd9b58e38ead6ad42b9ed00da33a3f862bc1d445e3d3164799c25550ac2/pyarrow-17.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a155acc7f154b9ffcc85497509bcd0d43efb80d6f733b0dc3bb14e281f131c8b", size = 39875438 }, - { url = "https://files.pythonhosted.org/packages/75/63/29d1bfcc57af73cde3fc3baccab2f37548de512dbe0ab294b033cd203516/pyarrow-17.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:dec8d129254d0188a49f8a1fc99e0560dc1b85f60af729f47de4046015f9b0a5", size = 38735092 }, - { url = "https://files.pythonhosted.org/packages/39/f4/90258b4de753df7cc61cefb0312f8abcf226672e96cc64996e66afce817a/pyarrow-17.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:a48ddf5c3c6a6c505904545c25a4ae13646ae1f8ba703c4df4a1bfe4f4006bda", size = 39867610 }, - { url = "https://files.pythonhosted.org/packages/e7/f6/b75d4816c32f1618ed31a005ee635dd1d91d8164495d94f2ea092f594661/pyarrow-17.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:42bf93249a083aca230ba7e2786c5f673507fa97bbd9725a1e2754715151a204", size = 25148611 }, -] - [[package]] name = "pyarrow" version = "18.1.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/7f/7b/640785a9062bb00314caa8a387abce547d2a420cf09bd6c715fe659ccffb/pyarrow-18.1.0.tar.gz", hash = "sha256:9386d3ca9c145b5539a1cfc75df07757dff870168c959b473a0bccbc3abc8c73", size = 1118671 } wheels = [ { url = "https://files.pythonhosted.org/packages/1a/bb/8d4a1573f66e0684f190dd2b55fd0b97a7214de8882d58a3867e777bf640/pyarrow-18.1.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:e21488d5cfd3d8b500b3238a6c4b075efabc18f0f6d80b29239737ebd69caa6c", size = 29531620 }, @@ -1332,10 +993,8 @@ version = "0.8.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "beautifulsoup4" }, - { name = "docutils", version = "0.20.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "docutils", 
version = "0.21.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, - { name = "sphinx", version = "7.1.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "sphinx", version = "7.4.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, + { name = "docutils" }, + { name = "sphinx", version = "7.4.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, { name = "sphinx", version = "8.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/fc/d6/3921de802cf1ee771f0e76c9068b52498aeb8eeec6b830ff931c81c7ecf3/pydata_sphinx_theme-0.8.0.tar.gz", hash = "sha256:9f72015d9c572ea92e3007ab221a8325767c426783b6b9941813e65fa988dc90", size = 1123746 } @@ -1349,13 +1008,11 @@ version = "2.5.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "deprecated" }, - { name = "pyjwt", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, extra = ["crypto"], marker = "python_full_version < '3.9'" }, - { name = "pyjwt", version = "2.10.1", source = { registry = "https://pypi.org/simple" }, extra = ["crypto"], marker = "python_full_version >= '3.9'" }, + { name = "pyjwt", extra = ["crypto"] }, { name = "pynacl" }, { name = "requests" }, { name = "typing-extensions" }, - { name = "urllib3", version = "2.2.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "urllib3", version = "2.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, + { name = "urllib3" }, ] sdist = { url = "https://files.pythonhosted.org/packages/16/ce/aa91d30040d9552c274e7ea8bd10a977600d508d579a4bb262b95eccf961/pygithub-2.5.0.tar.gz", hash = "sha256:e1613ac508a9be710920d26eb18b1905ebd9926aa49398e88151c1b526aad3cf", size = 3552804 } wheels = [ @@ -1371,33 +1028,10 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8a/0b/9fcc47d19c48b59121088dd6da2488a49d5f72dacf8262e2790a1d2c7d15/pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c", size = 1225293 }, ] -[[package]] -name = "pyjwt" -version = "2.9.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/fb/68/ce067f09fca4abeca8771fe667d89cc347d1e99da3e093112ac329c6020e/pyjwt-2.9.0.tar.gz", hash = "sha256:7e1e5b56cc735432a7369cbfa0efe50fa113ebecdc04ae6922deba8b84582d0c", size = 78825 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/79/84/0fdf9b18ba31d69877bd39c9cd6052b47f3761e9910c15de788e519f079f/PyJWT-2.9.0-py3-none-any.whl", hash = "sha256:3b02fb0f44517787776cf48f2ae25d8e14f300e6d7545a4315cee571a415e850", size = 22344 }, -] - -[package.optional-dependencies] -crypto = [ - { name = "cryptography", marker = "python_full_version < '3.9'" }, -] - [[package]] name = "pyjwt" version = "2.10.1" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/e7/46/bd74733ff231675599650d3e47f361794b22ef3e3770998dda30d3b63726/pyjwt-2.10.1.tar.gz", hash = 
"sha256:3cc5772eb20009233caf06e9d8a0577824723b44e6648ee0a2aedb6cf9381953", size = 87785 } wheels = [ { url = "https://files.pythonhosted.org/packages/61/ad/689f02752eeec26aed679477e80e632ef1b682313be70793d798c1d5fc8f/PyJWT-2.10.1-py3-none-any.whl", hash = "sha256:dcdd193e30abefd5debf142f9adfcdd2b58004e644f25406ffaebd50bd98dacb", size = 22997 }, @@ -1405,7 +1039,7 @@ wheels = [ [package.optional-dependencies] crypto = [ - { name = "cryptography", marker = "python_full_version >= '3.9'" }, + { name = "cryptography" }, ] [[package]] @@ -1508,13 +1142,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fe/0f/25911a9f080464c59fab9027482f822b86bf0608957a5fcc6eaac85aa515/PyYAML-6.0.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652", size = 751597 }, { url = "https://files.pythonhosted.org/packages/14/0d/e2c3b43bbce3cf6bd97c840b46088a3031085179e596d4929729d8d68270/PyYAML-6.0.2-cp313-cp313-win32.whl", hash = "sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183", size = 140527 }, { url = "https://files.pythonhosted.org/packages/fa/de/02b54f42487e3d3c6efb3f89428677074ca7bf43aae402517bc7cca949f3/PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563", size = 156446 }, - { url = "https://files.pythonhosted.org/packages/74/d9/323a59d506f12f498c2097488d80d16f4cf965cee1791eab58b56b19f47a/PyYAML-6.0.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:24471b829b3bf607e04e88d79542a9d48bb037c2267d7927a874e6c205ca7e9a", size = 183218 }, - { url = "https://files.pythonhosted.org/packages/74/cc/20c34d00f04d785f2028737e2e2a8254e1425102e730fee1d6396f832577/PyYAML-6.0.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7fded462629cfa4b685c5416b949ebad6cec74af5e2d42905d41e257e0869f5", size = 728067 }, - { url = "https://files.pythonhosted.org/packages/20/52/551c69ca1501d21c0de51ddafa8c23a0191ef296ff098e98358f69080577/PyYAML-6.0.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d84a1718ee396f54f3a086ea0a66d8e552b2ab2017ef8b420e92edbc841c352d", size = 757812 }, - { url = "https://files.pythonhosted.org/packages/fd/7f/2c3697bba5d4aa5cc2afe81826d73dfae5f049458e44732c7a0938baa673/PyYAML-6.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9056c1ecd25795207ad294bcf39f2db3d845767be0ea6e6a34d856f006006083", size = 746531 }, - { url = "https://files.pythonhosted.org/packages/8c/ab/6226d3df99900e580091bb44258fde77a8433511a86883bd4681ea19a858/PyYAML-6.0.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:82d09873e40955485746739bcb8b4586983670466c23382c19cffecbf1fd8706", size = 800820 }, - { url = "https://files.pythonhosted.org/packages/a0/99/a9eb0f3e710c06c5d922026f6736e920d431812ace24aae38228d0d64b04/PyYAML-6.0.2-cp38-cp38-win32.whl", hash = "sha256:43fa96a3ca0d6b1812e01ced1044a003533c47f6ee8aca31724f78e93ccc089a", size = 145514 }, - { url = "https://files.pythonhosted.org/packages/75/8a/ee831ad5fafa4431099aa4e078d4c8efd43cd5e48fbc774641d233b683a9/PyYAML-6.0.2-cp38-cp38-win_amd64.whl", hash = "sha256:01179a4a8559ab5de078078f37e5c1a30d76bb88519906844fd7bdea1b7729ff", size = 162702 }, { url = "https://files.pythonhosted.org/packages/65/d8/b7a1db13636d7fb7d4ff431593c510c8b8fca920ade06ca8ef20015493c5/PyYAML-6.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:688ba32a1cffef67fd2e9398a2efebaea461578b0923624778664cc1c914db5d", size = 184777 }, { url = 
"https://files.pythonhosted.org/packages/0a/02/6ec546cd45143fdf9840b2c6be8d875116a64076218b61d68e12548e5839/PyYAML-6.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8786accb172bd8afb8be14490a16625cbc387036876ab6ba70912730faf8e1f", size = 172318 }, { url = "https://files.pythonhosted.org/packages/0e/9a/8cc68be846c972bda34f6c2a93abb644fb2476f4dcc924d52175786932c9/PyYAML-6.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8e03406cac8513435335dbab54c0d385e4a49e4945d2909a581c83647ca0290", size = 720891 }, @@ -1534,8 +1161,7 @@ dependencies = [ { name = "certifi" }, { name = "charset-normalizer" }, { name = "idna" }, - { name = "urllib3", version = "2.2.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "urllib3", version = "2.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, + { name = "urllib3" }, ] sdist = { url = "https://files.pythonhosted.org/packages/63/70/2bf7780ad2d390a8d301ad0b550f1581eadbd9a20f896afe06353c2a2913/requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760", size = 131218 } wheels = [ @@ -1567,28 +1193,10 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b2/94/0498cdb7316ed67a1928300dd87d659c933479f44dec51b4f62bfd1f8028/ruff-0.9.1-py3-none-win_arm64.whl", hash = "sha256:1cd76c7f9c679e6e8f2af8f778367dca82b95009bc7b1a85a47f1521ae524fa7", size = 9145708 }, ] -[[package]] -name = "setuptools" -version = "75.3.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/ed/22/a438e0caa4576f8c383fa4d35f1cc01655a46c75be358960d815bfbb12bd/setuptools-75.3.0.tar.gz", hash = "sha256:fba5dd4d766e97be1b1681d98712680ae8f2f26d7881245f2ce9e40714f1a686", size = 1351577 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/90/12/282ee9bce8b58130cb762fbc9beabd531549952cac11fc56add11dcb7ea0/setuptools-75.3.0-py3-none-any.whl", hash = "sha256:f2504966861356aa38616760c0f66568e535562374995367b4e69c7143cf6bcd", size = 1251070 }, -] - [[package]] name = "setuptools" version = "75.8.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/92/ec/089608b791d210aec4e7f97488e67ab0d33add3efccb83a056cbafe3a2a6/setuptools-75.8.0.tar.gz", hash = "sha256:c5afc8f407c626b8313a86e10311dd3f661c6cd9c09d4bf8c15c0e11f9f2b0e6", size = 1343222 } wheels = [ { url = "https://files.pythonhosted.org/packages/69/8a/b9dc7678803429e4a3bc9ba462fa3dd9066824d3c607490235c6a796be5a/setuptools-75.8.0-py3-none-any.whl", hash = "sha256:e3982f444617239225d675215d51f6ba05f845d4eec313da4418fdbb56fb27e3", size = 1228782 }, @@ -1621,63 +1229,32 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/c2/fe97d779f3ef3b15f05c94a2f1e3d21732574ed441687474db9d342a7315/soupsieve-2.6-py3-none-any.whl", hash = "sha256:e72c4ff06e4fb6e4b5a9f0f55fe6e81514581fca1515028625d0f299c602ccc9", size = 36186 }, ] -[[package]] -name = "sphinx" -version = "7.1.2" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -dependencies = [ - { name = "alabaster", version = "0.7.13", source = { registry = "https://pypi.org/simple" }, marker = 
"python_full_version < '3.9'" }, - { name = "babel", marker = "python_full_version < '3.9'" }, - { name = "colorama", marker = "python_full_version < '3.9' and sys_platform == 'win32'" }, - { name = "docutils", version = "0.20.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "imagesize", marker = "python_full_version < '3.9'" }, - { name = "importlib-metadata", marker = "python_full_version < '3.9'" }, - { name = "jinja2", marker = "python_full_version < '3.9'" }, - { name = "packaging", marker = "python_full_version < '3.9'" }, - { name = "pygments", marker = "python_full_version < '3.9'" }, - { name = "requests", marker = "python_full_version < '3.9'" }, - { name = "snowballstemmer", marker = "python_full_version < '3.9'" }, - { name = "sphinxcontrib-applehelp", version = "1.0.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "sphinxcontrib-devhelp", version = "1.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "sphinxcontrib-htmlhelp", version = "2.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "sphinxcontrib-jsmath", marker = "python_full_version < '3.9'" }, - { name = "sphinxcontrib-qthelp", version = "1.0.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "sphinxcontrib-serializinghtml", version = "1.1.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/dc/01/688bdf9282241dca09fe6e3a1110eda399fa9b10d0672db609e37c2e7a39/sphinx-7.1.2.tar.gz", hash = "sha256:780f4d32f1d7d1126576e0e5ecc19dc32ab76cd24e950228dcf7b1f6d3d9e22f", size = 6828258 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/48/17/325cf6a257d84751a48ae90752b3d8fe0be8f9535b6253add61c49d0d9bc/sphinx-7.1.2-py3-none-any.whl", hash = "sha256:d170a81825b2fcacb6dfd5a0d7f578a053e45d3f2b153fecc948c37344eb4cbe", size = 3169543 }, -] - [[package]] name = "sphinx" version = "7.4.7" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version == '3.9.*'", + "python_full_version < '3.10'", ] dependencies = [ - { name = "alabaster", version = "0.7.16", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, - { name = "babel", marker = "python_full_version == '3.9.*'" }, - { name = "colorama", marker = "python_full_version == '3.9.*' and sys_platform == 'win32'" }, - { name = "docutils", version = "0.21.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, - { name = "imagesize", marker = "python_full_version == '3.9.*'" }, - { name = "importlib-metadata", marker = "python_full_version == '3.9.*'" }, - { name = "jinja2", marker = "python_full_version == '3.9.*'" }, - { name = "packaging", marker = "python_full_version == '3.9.*'" }, - { name = "pygments", marker = "python_full_version == '3.9.*'" }, - { name = "requests", marker = "python_full_version == '3.9.*'" }, - { name = "snowballstemmer", marker = "python_full_version == '3.9.*'" }, - { name = "sphinxcontrib-applehelp", version = "2.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, - { name = "sphinxcontrib-devhelp", version = "2.0.0", source = { registry = "https://pypi.org/simple" }, marker = 
"python_full_version == '3.9.*'" }, - { name = "sphinxcontrib-htmlhelp", version = "2.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, - { name = "sphinxcontrib-jsmath", marker = "python_full_version == '3.9.*'" }, - { name = "sphinxcontrib-qthelp", version = "2.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, - { name = "sphinxcontrib-serializinghtml", version = "2.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, - { name = "tomli", marker = "python_full_version == '3.9.*'" }, + { name = "alabaster", version = "0.7.16", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, + { name = "babel", marker = "python_full_version < '3.10'" }, + { name = "colorama", marker = "python_full_version < '3.10' and sys_platform == 'win32'" }, + { name = "docutils", marker = "python_full_version < '3.10'" }, + { name = "imagesize", marker = "python_full_version < '3.10'" }, + { name = "importlib-metadata", marker = "python_full_version < '3.10'" }, + { name = "jinja2", marker = "python_full_version < '3.10'" }, + { name = "packaging", marker = "python_full_version < '3.10'" }, + { name = "pygments", marker = "python_full_version < '3.10'" }, + { name = "requests", marker = "python_full_version < '3.10'" }, + { name = "snowballstemmer", marker = "python_full_version < '3.10'" }, + { name = "sphinxcontrib-applehelp", marker = "python_full_version < '3.10'" }, + { name = "sphinxcontrib-devhelp", marker = "python_full_version < '3.10'" }, + { name = "sphinxcontrib-htmlhelp", marker = "python_full_version < '3.10'" }, + { name = "sphinxcontrib-jsmath", marker = "python_full_version < '3.10'" }, + { name = "sphinxcontrib-qthelp", marker = "python_full_version < '3.10'" }, + { name = "sphinxcontrib-serializinghtml", marker = "python_full_version < '3.10'" }, + { name = "tomli", marker = "python_full_version < '3.10'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/5b/be/50e50cb4f2eff47df05673d361095cafd95521d2a22521b920c67a372dcb/sphinx-7.4.7.tar.gz", hash = "sha256:242f92a7ea7e6c5b406fdc2615413890ba9f699114a9c09192d7dfead2ee9cfe", size = 8067911 } wheels = [ @@ -1697,19 +1274,19 @@ dependencies = [ { name = "alabaster", version = "1.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, { name = "babel", marker = "python_full_version >= '3.10'" }, { name = "colorama", marker = "python_full_version >= '3.10' and sys_platform == 'win32'" }, - { name = "docutils", version = "0.21.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "docutils", marker = "python_full_version >= '3.10'" }, { name = "imagesize", marker = "python_full_version >= '3.10'" }, { name = "jinja2", marker = "python_full_version >= '3.10'" }, { name = "packaging", marker = "python_full_version >= '3.10'" }, { name = "pygments", marker = "python_full_version >= '3.10'" }, { name = "requests", marker = "python_full_version >= '3.10'" }, { name = "snowballstemmer", marker = "python_full_version >= '3.10'" }, - { name = "sphinxcontrib-applehelp", version = "2.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, - { name = "sphinxcontrib-devhelp", version = "2.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, - { name = 
"sphinxcontrib-htmlhelp", version = "2.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "sphinxcontrib-applehelp", marker = "python_full_version >= '3.10'" }, + { name = "sphinxcontrib-devhelp", marker = "python_full_version >= '3.10'" }, + { name = "sphinxcontrib-htmlhelp", marker = "python_full_version >= '3.10'" }, { name = "sphinxcontrib-jsmath", marker = "python_full_version >= '3.10'" }, - { name = "sphinxcontrib-qthelp", version = "2.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, - { name = "sphinxcontrib-serializinghtml", version = "2.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "sphinxcontrib-qthelp", marker = "python_full_version >= '3.10'" }, + { name = "sphinxcontrib-serializinghtml", marker = "python_full_version >= '3.10'" }, { name = "tomli", marker = "python_full_version == '3.10.*'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/6f/6d/be0b61178fe2cdcb67e2a92fc9ebb488e3c51c4f74a36a7824c0adf23425/sphinx-8.1.3.tar.gz", hash = "sha256:43c1911eecb0d3e161ad78611bc905d1ad0e523e4ddc202a58a821773dc4c927", size = 8184611 } @@ -1722,97 +1299,40 @@ name = "sphinx-autoapi" version = "3.4.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "astroid", version = "3.2.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "astroid", version = "3.3.8", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, + { name = "astroid" }, { name = "jinja2" }, { name = "pyyaml" }, - { name = "sphinx", version = "7.1.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "sphinx", version = "7.4.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, + { name = "sphinx", version = "7.4.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, { name = "sphinx", version = "8.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, - { name = "stdlib-list", version = "0.10.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, - { name = "stdlib-list", version = "0.11.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, + { name = "stdlib-list", marker = "python_full_version < '3.10'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/4a/eb/cc243583bb1d518ca3b10998c203d919a8ed90affd4831f2b61ad09043d2/sphinx_autoapi-3.4.0.tar.gz", hash = "sha256:e6d5371f9411bbb9fca358c00a9e57aef3ac94cbfc5df4bab285946462f69e0c", size = 29292 } wheels = [ { url = "https://files.pythonhosted.org/packages/de/d6/f2acdc2567337fd5f5dc091a4e58d8a0fb14927b9779fc1e5ecee96d9824/sphinx_autoapi-3.4.0-py3-none-any.whl", hash = "sha256:4027fef2875a22c5f2a57107c71641d82f6166bf55beb407a47aaf3ef14e7b92", size = 34095 }, ] -[[package]] -name = "sphinxcontrib-applehelp" -version = "1.0.4" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/32/df/45e827f4d7e7fcc84e853bcef1d836effd762d63ccb86f43ede4e98b478c/sphinxcontrib-applehelp-1.0.4.tar.gz", hash = "sha256:828f867945bbe39817c210a1abfd1bc4895c8b73fcaade56d45357a348a07d7e", size = 24766 
} -wheels = [ - { url = "https://files.pythonhosted.org/packages/06/c1/5e2cafbd03105ce50d8500f9b4e8a6e8d02e22d0475b574c3b3e9451a15f/sphinxcontrib_applehelp-1.0.4-py3-none-any.whl", hash = "sha256:29d341f67fb0f6f586b23ad80e072c8e6ad0b48417db2bde114a4c9746feb228", size = 120601 }, -] - [[package]] name = "sphinxcontrib-applehelp" version = "2.0.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/ba/6e/b837e84a1a704953c62ef8776d45c3e8d759876b4a84fe14eba2859106fe/sphinxcontrib_applehelp-2.0.0.tar.gz", hash = "sha256:2f29ef331735ce958efa4734873f084941970894c6090408b079c61b2e1c06d1", size = 20053 } wheels = [ { url = "https://files.pythonhosted.org/packages/5d/85/9ebeae2f76e9e77b952f4b274c27238156eae7979c5421fba91a28f4970d/sphinxcontrib_applehelp-2.0.0-py3-none-any.whl", hash = "sha256:4cd3f0ec4ac5dd9c17ec65e9ab272c9b867ea77425228e68ecf08d6b28ddbdb5", size = 119300 }, ] -[[package]] -name = "sphinxcontrib-devhelp" -version = "1.0.2" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/98/33/dc28393f16385f722c893cb55539c641c9aaec8d1bc1c15b69ce0ac2dbb3/sphinxcontrib-devhelp-1.0.2.tar.gz", hash = "sha256:ff7f1afa7b9642e7060379360a67e9c41e8f3121f2ce9164266f61b9f4b338e4", size = 17398 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c5/09/5de5ed43a521387f18bdf5f5af31d099605c992fd25372b2b9b825ce48ee/sphinxcontrib_devhelp-1.0.2-py2.py3-none-any.whl", hash = "sha256:8165223f9a335cc1af7ffe1ed31d2871f325254c0423bc0c4c7cd1c1e4734a2e", size = 84690 }, -] - [[package]] name = "sphinxcontrib-devhelp" version = "2.0.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/f6/d2/5beee64d3e4e747f316bae86b55943f51e82bb86ecd325883ef65741e7da/sphinxcontrib_devhelp-2.0.0.tar.gz", hash = "sha256:411f5d96d445d1d73bb5d52133377b4248ec79db5c793ce7dbe59e074b4dd1ad", size = 12967 } wheels = [ { url = "https://files.pythonhosted.org/packages/35/7a/987e583882f985fe4d7323774889ec58049171828b58c2217e7f79cdf44e/sphinxcontrib_devhelp-2.0.0-py3-none-any.whl", hash = "sha256:aefb8b83854e4b0998877524d1029fd3e6879210422ee3780459e28a1f03a8a2", size = 82530 }, ] -[[package]] -name = "sphinxcontrib-htmlhelp" -version = "2.0.1" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/b3/47/64cff68ea3aa450c373301e5bebfbb9fce0a3e70aca245fcadd4af06cd75/sphinxcontrib-htmlhelp-2.0.1.tar.gz", hash = "sha256:0cbdd302815330058422b98a113195c9249825d681e18f11e8b1f78a2f11efff", size = 27967 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/6e/ee/a1f5e39046cbb5f8bc8fba87d1ddf1c6643fbc9194e58d26e606de4b9074/sphinxcontrib_htmlhelp-2.0.1-py3-none-any.whl", hash = "sha256:c38cb46dccf316c79de6e5515e1770414b797162b23cd3d06e67020e1d2a6903", size = 99833 }, -] - [[package]] name = "sphinxcontrib-htmlhelp" version = "2.1.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == 
'3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/43/93/983afd9aa001e5201eab16b5a444ed5b9b0a7a010541e0ddfbbfd0b2470c/sphinxcontrib_htmlhelp-2.1.0.tar.gz", hash = "sha256:c9e2916ace8aad64cc13a0d233ee22317f2b9025b9cf3295249fa985cc7082e9", size = 22617 } wheels = [ { url = "https://files.pythonhosted.org/packages/0a/7b/18a8c0bcec9182c05a0b3ec2a776bba4ead82750a55ff798e8d406dae604/sphinxcontrib_htmlhelp-2.1.0-py3-none-any.whl", hash = "sha256:166759820b47002d22914d64a075ce08f4c46818e17cfc9470a9786b759b19f8", size = 98705 }, @@ -1827,55 +1347,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c2/42/4c8646762ee83602e3fb3fbe774c2fac12f317deb0b5dbeeedd2d3ba4b77/sphinxcontrib_jsmath-1.0.1-py2.py3-none-any.whl", hash = "sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178", size = 5071 }, ] -[[package]] -name = "sphinxcontrib-qthelp" -version = "1.0.3" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/b1/8e/c4846e59f38a5f2b4a0e3b27af38f2fcf904d4bfd82095bf92de0b114ebd/sphinxcontrib-qthelp-1.0.3.tar.gz", hash = "sha256:4c33767ee058b70dba89a6fc5c1892c0d57a54be67ddd3e7875a18d14cba5a72", size = 21658 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/2b/14/05f9206cf4e9cfca1afb5fd224c7cd434dcc3a433d6d9e4e0264d29c6cdb/sphinxcontrib_qthelp-1.0.3-py2.py3-none-any.whl", hash = "sha256:bd9fc24bcb748a8d51fd4ecaade681350aa63009a347a8c14e637895444dfab6", size = 90609 }, -] - [[package]] name = "sphinxcontrib-qthelp" version = "2.0.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/68/bc/9104308fc285eb3e0b31b67688235db556cd5b0ef31d96f30e45f2e51cae/sphinxcontrib_qthelp-2.0.0.tar.gz", hash = "sha256:4fe7d0ac8fc171045be623aba3e2a8f613f8682731f9153bb2e40ece16b9bbab", size = 17165 } wheels = [ { url = "https://files.pythonhosted.org/packages/27/83/859ecdd180cacc13b1f7e857abf8582a64552ea7a061057a6c716e790fce/sphinxcontrib_qthelp-2.0.0-py3-none-any.whl", hash = "sha256:b18a828cdba941ccd6ee8445dbe72ffa3ef8cbe7505d8cd1fa0d42d3f2d5f3eb", size = 88743 }, ] -[[package]] -name = "sphinxcontrib-serializinghtml" -version = "1.1.5" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/b5/72/835d6fadb9e5d02304cf39b18f93d227cd93abd3c41ebf58e6853eeb1455/sphinxcontrib-serializinghtml-1.1.5.tar.gz", hash = "sha256:aa5f6de5dfdf809ef505c4895e51ef5c9eac17d0f287933eb49ec495280b6952", size = 21019 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c6/77/5464ec50dd0f1c1037e3c93249b040c8fc8078fdda97530eeb02424b6eea/sphinxcontrib_serializinghtml-1.1.5-py2.py3-none-any.whl", hash = "sha256:352a9a00ae864471d3a7ead8d7d79f5fc0b57e8b3f95e9867eb9eb28999b92fd", size = 94021 }, -] - [[package]] name = "sphinxcontrib-serializinghtml" version = "2.0.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] sdist = { url = 
"https://files.pythonhosted.org/packages/3b/44/6716b257b0aa6bfd51a1b31665d1c205fb12cb5ad56de752dfa15657de2f/sphinxcontrib_serializinghtml-2.0.0.tar.gz", hash = "sha256:e9d912827f872c029017a53f0ef2180b327c3f7fd23c87229f7a8e8b70031d4d", size = 16080 } wheels = [ { url = "https://files.pythonhosted.org/packages/52/a7/d2782e4e3f77c8450f727ba74a8f12756d5ba823d81b941f1b04da9d033a/sphinxcontrib_serializinghtml-2.0.0-py3-none-any.whl", hash = "sha256:6e2cb0eef194e10c27ec0023bfeb25badbbb5868244cf5bc5bdc04e4464bf331", size = 92072 }, @@ -1895,25 +1379,10 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl", hash = "sha256:d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695", size = 24521 }, ] -[[package]] -name = "stdlib-list" -version = "0.10.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/39/bb/1cdbc326a5ab0026602e0489cbf02357e78140253c4b57cd866d380eb355/stdlib_list-0.10.0.tar.gz", hash = "sha256:6519c50d645513ed287657bfe856d527f277331540691ddeaf77b25459964a14", size = 59447 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/13/d9/9085375f0d23a4896b307bf14dcc61b49ec8cc67cb33e06cf95bf3af3966/stdlib_list-0.10.0-py3-none-any.whl", hash = "sha256:b3a911bc441d03e0332dd1a9e7d0870ba3bb0a542a74d7524f54fb431256e214", size = 79814 }, -] - [[package]] name = "stdlib-list" version = "0.11.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version == '3.9.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/5d/04/6b37a71e92ddca16b190b7df62494ac4779d58ced4787f73584eb32c8f03/stdlib_list-0.11.0.tar.gz", hash = "sha256:b74a7b643a77a12637e907f3f62f0ab9f67300bce4014f6b2d3c8b4c8fd63c66", size = 60335 } wheels = [ { url = "https://files.pythonhosted.org/packages/16/fe/e07300c027a868d32d8ed7a425503401e91a03ff90e7ca525c115c634ffb/stdlib_list-0.11.0-py3-none-any.whl", hash = "sha256:8bf8decfffaaf273d4cfeb5bd852b910a00dec1037dcf163576803622bccf597", size = 83617 }, @@ -1994,28 +1463,10 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a6/ab/7e5f53c3b9d14972843a647d8d7a853969a58aecc7559cb3267302c94774/tzdata-2024.2-py2.py3-none-any.whl", hash = "sha256:a48093786cdcde33cad18c2555e8532f34422074448fbc874186f0abd79565cd", size = 346586 }, ] -[[package]] -name = "urllib3" -version = "2.2.3" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/ed/63/22ba4ebfe7430b76388e7cd448d5478814d3032121827c12a2cc287e2260/urllib3-2.2.3.tar.gz", hash = "sha256:e7d814a81dad81e6caf2ec9fdedb284ecc9c73076b62654547cc64ccdcae26e9", size = 300677 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ce/d9/5f4c13cecde62396b0d3fe530a50ccea91e7dfc1ccf0e09c228841bb5ba8/urllib3-2.2.3-py3-none-any.whl", hash = "sha256:ca899ca043dcb1bafa3e262d73aa25c465bfb49e0bd9dd5d59f1d0acba2f8fac", size = 126338 }, -] - [[package]] name = "urllib3" version = "2.3.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", -] sdist = { url = 
"https://files.pythonhosted.org/packages/aa/63/e53da845320b757bf29ef6a9062f5c669fe997973f966045cb019c3f4b66/urllib3-2.3.0.tar.gz", hash = "sha256:f8c5449b3cf0861679ce7e0503c7b44b5ec981bec0d1d3795a07f1ba96f0204d", size = 307268 } wheels = [ { url = "https://files.pythonhosted.org/packages/c8/19/4ec628951a74043532ca2cf5d97b7b14863931476d117c471e8e2b1eb39f/urllib3-2.3.0-py3-none-any.whl", hash = "sha256:1cee9ad369867bfdbbb48b7dd50374c0967a0bb7710050facf0dd6911440e3df", size = 128369 }, @@ -2091,17 +1542,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a7/b1/0bb11e29aa5139d90b770ebbfa167267b1fc548d2302c30c8f7572851738/wrapt-1.17.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:4c82b8785d98cdd9fed4cac84d765d234ed3251bd6afe34cb7ac523cb93e8b4f", size = 106377 }, { url = "https://files.pythonhosted.org/packages/6a/e1/0122853035b40b3f333bbb25f1939fc1045e21dd518f7f0922b60c156f7c/wrapt-1.17.2-cp313-cp313t-win32.whl", hash = "sha256:13e6afb7fe71fe7485a4550a8844cc9ffbe263c0f1a1eea569bc7091d4898555", size = 37986 }, { url = "https://files.pythonhosted.org/packages/09/5e/1655cf481e079c1f22d0cabdd4e51733679932718dc23bf2db175f329b76/wrapt-1.17.2-cp313-cp313t-win_amd64.whl", hash = "sha256:eaf675418ed6b3b31c7a989fd007fa7c3be66ce14e5c3b27336383604c9da85c", size = 40750 }, - { url = "https://files.pythonhosted.org/packages/0c/66/95b9e90e6e1274999b183c9c3f984996d870e933ca9560115bd1cd1d6f77/wrapt-1.17.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5c803c401ea1c1c18de70a06a6f79fcc9c5acfc79133e9869e730ad7f8ad8ef9", size = 53234 }, - { url = "https://files.pythonhosted.org/packages/a4/b6/6eced5e2db5924bf6d9223d2bb96b62e00395aae77058e6a9e11bf16b3bd/wrapt-1.17.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:f917c1180fdb8623c2b75a99192f4025e412597c50b2ac870f156de8fb101119", size = 38462 }, - { url = "https://files.pythonhosted.org/packages/5d/a4/c8472fe2568978b5532df84273c53ddf713f689d408a4335717ab89547e0/wrapt-1.17.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ecc840861360ba9d176d413a5489b9a0aff6d6303d7e733e2c4623cfa26904a6", size = 38730 }, - { url = "https://files.pythonhosted.org/packages/3c/70/1d259c6b1ad164eb23ff70e3e452dd1950f96e6473f72b7207891d0fd1f0/wrapt-1.17.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb87745b2e6dc56361bfde481d5a378dc314b252a98d7dd19a651a3fa58f24a9", size = 86225 }, - { url = "https://files.pythonhosted.org/packages/a9/68/6b83367e1afb8de91cbea4ef8e85b58acdf62f034f05d78c7b82afaa23d8/wrapt-1.17.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:58455b79ec2661c3600e65c0a716955adc2410f7383755d537584b0de41b1d8a", size = 78055 }, - { url = "https://files.pythonhosted.org/packages/0d/21/09573d2443916705c57fdab85d508f592c0a58d57becc53e15755d67fba2/wrapt-1.17.2-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b4e42a40a5e164cbfdb7b386c966a588b1047558a990981ace551ed7e12ca9c2", size = 85592 }, - { url = "https://files.pythonhosted.org/packages/45/ce/700e17a852dd5dec894e241c72973ea82363486bcc1fb05d47b4fbd1d683/wrapt-1.17.2-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:91bd7d1773e64019f9288b7a5101f3ae50d3d8e6b1de7edee9c2ccc1d32f0c0a", size = 83906 }, - { url = "https://files.pythonhosted.org/packages/37/14/bd210faf0a66faeb8529d42b6b45a25d6aa6ce25ddfc19168e4161aed227/wrapt-1.17.2-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:bb90fb8bda722a1b9d48ac1e6c38f923ea757b3baf8ebd0c82e09c5c1a0e7a04", size = 
76763 }, - { url = "https://files.pythonhosted.org/packages/34/0c/85af70d291f44659c422416f0272046109e785bf6db8c081cfeeae5715c5/wrapt-1.17.2-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:08e7ce672e35efa54c5024936e559469436f8b8096253404faeb54d2a878416f", size = 83573 }, - { url = "https://files.pythonhosted.org/packages/f8/1e/b215068e824878f69ea945804fa26c176f7c2735a3ad5367d78930bd076a/wrapt-1.17.2-cp38-cp38-win32.whl", hash = "sha256:410a92fefd2e0e10d26210e1dfb4a876ddaf8439ef60d6434f21ef8d87efc5b7", size = 36408 }, - { url = "https://files.pythonhosted.org/packages/52/27/3dd9ad5f1097b33c95d05929e409cc86d7c765cb5437b86694dc8f8e9af0/wrapt-1.17.2-cp38-cp38-win_amd64.whl", hash = "sha256:95c658736ec15602da0ed73f312d410117723914a5c91a14ee4cdd72f1d790b3", size = 38737 }, { url = "https://files.pythonhosted.org/packages/8a/f4/6ed2b8f6f1c832933283974839b88ec7c983fd12905e01e97889dadf7559/wrapt-1.17.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:99039fa9e6306880572915728d7f6c24a86ec57b0a83f6b2491e1d8ab0235b9a", size = 53308 }, { url = "https://files.pythonhosted.org/packages/a2/a9/712a53f8f4f4545768ac532619f6e56d5d0364a87b2212531685e89aeef8/wrapt-1.17.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2696993ee1eebd20b8e4ee4356483c4cb696066ddc24bd70bcbb80fa56ff9061", size = 38489 }, { url = "https://files.pythonhosted.org/packages/fa/9b/e172c8f28a489a2888df18f953e2f6cb8d33b1a2e78c9dfc52d8bf6a5ead/wrapt-1.17.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:612dff5db80beef9e649c6d803a8d50c409082f1fedc9dbcdfde2983b2025b82", size = 38776 }, @@ -2116,25 +1556,10 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2d/82/f56956041adef78f849db6b289b282e72b55ab8045a75abad81898c28d19/wrapt-1.17.2-py3-none-any.whl", hash = "sha256:b18f2d1533a71f069c7f82d524a52599053d4c7166e9dd374ae2136b7f40f7c8", size = 23594 }, ] -[[package]] -name = "zipp" -version = "3.20.2" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/54/bf/5c0000c44ebc80123ecbdddba1f5dcd94a5ada602a9c225d84b5aaa55e86/zipp-3.20.2.tar.gz", hash = "sha256:bc9eb26f4506fda01b81bcde0ca78103b6e62f991b381fec825435c836edbc29", size = 24199 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/62/8b/5ba542fa83c90e09eac972fc9baca7a88e7e7ca4b221a89251954019308b/zipp-3.20.2-py3-none-any.whl", hash = "sha256:a817ac80d6cf4b23bf7f2828b7cabf326f15a001bea8b1f9b49631780ba28350", size = 9200 }, -] - [[package]] name = "zipp" version = "3.21.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version == '3.9.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/3f/50/bad581df71744867e9468ebd0bcd6505de3b275e06f202c2cb016e3ff56f/zipp-3.21.0.tar.gz", hash = "sha256:2c9958f6430a2040341a52eb608ed6dd93ef4392e02ffe219417c1b28b5dd1f4", size = 24545 } wheels = [ { url = "https://files.pythonhosted.org/packages/b7/1a/7e4798e9339adc931158c9d69ecc34f5e6791489d469f5e50ec15e35f458/zipp-3.21.0-py3-none-any.whl", hash = "sha256:ac1bbe05fd2991f160ebce24ffbac5f6d11d83dc90891255885223d42b3cd931", size = 9630 }, From b194a8772e58ccefc697e11671113127a8038716 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Wed, 12 Mar 2025 14:25:32 -0400 Subject: [PATCH 13/22] feat/improve ruff test coverage (#1055) * Run python tests on all currently supported python versions * Update ruff checks to select all * Ruff auto fix * Applying ruff suggestions * noqa rules updates per ruff checks * Working 
through more ruff suggestions * Working through more ruff suggestions * update timestamps on tests * More ruff updates * More ruff updates * Instead of importing udf static functions as variables, import them directly * More ruff formatting suggestions * more ruff formatting suggestions * More ruff formatting * More ruff formatting * Cut off lint errors for this PR * Working through more ruff checks and disabling a bunch for now * Address CI difference from local ruff * UDWF isn't a proper abstract base class right now since users can opt in to all methods * Update pre-commit to match the version of ruff used in CI * To enable testing in python 3.9 we need numpy. Also going to the current minimal supported version * Update min required version of python to 3.9 in pyproject.toml. The other changes will come in #1043 that is soon to be merged. * Suppress UP035 * ruff format --- .github/workflows/test.yaml | 2 + .pre-commit-config.yaml | 2 +- benchmarks/tpch/tpch.py | 14 +- dev/release/check-rat-report.py | 2 +- dev/release/generate-changelog.py | 10 +- docs/source/conf.py | 2 +- examples/python-udwf.py | 2 +- examples/tpch/_tests.py | 15 +- pyproject.toml | 76 +++++- python/datafusion/__init__.py | 50 ++-- python/datafusion/common.py | 14 +- python/datafusion/context.py | 4 +- python/datafusion/dataframe.py | 15 +- python/datafusion/expr.py | 94 +++---- python/datafusion/functions.py | 46 ++-- python/datafusion/input/__init__.py | 2 +- python/datafusion/input/base.py | 6 +- python/datafusion/input/location.py | 40 +-- python/datafusion/io.py | 20 +- python/datafusion/object_store.py | 2 +- python/datafusion/plan.py | 8 +- python/datafusion/record_batch.py | 8 +- python/datafusion/substrait.py | 21 +- python/datafusion/udf.py | 236 +++++++++-------- python/tests/generic.py | 19 +- python/tests/test_aggregation.py | 16 +- python/tests/test_catalog.py | 9 +- python/tests/test_context.py | 53 ++-- python/tests/test_dataframe.py | 38 ++- python/tests/test_expr.py | 11 +- python/tests/test_functions.py | 358 ++++++++++++++------------ python/tests/test_imports.py | 7 +- python/tests/test_input.py | 12 +- python/tests/test_io.py | 13 +- python/tests/test_sql.py | 35 +-- python/tests/test_store.py | 13 +- python/tests/test_substrait.py | 2 +- python/tests/test_udaf.py | 10 +- python/tests/test_udwf.py | 2 +- python/tests/test_wrapper_coverage.py | 7 +- 40 files changed, 697 insertions(+), 599 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index c1d9ac838..da3582766 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -33,9 +33,11 @@ jobs: fail-fast: false matrix: python-version: + - "3.9" - "3.10" - "3.11" - "3.12" + - "3.13" toolchain: - "stable" diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b548ff18f..abcfcf321 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -22,7 +22,7 @@ repos: - id: actionlint-docker - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. - rev: v0.3.0 + rev: v0.9.10 hooks: # Run the linter.
- id: ruff diff --git a/benchmarks/tpch/tpch.py b/benchmarks/tpch/tpch.py index fb86b12b6..bfb9ac398 100644 --- a/benchmarks/tpch/tpch.py +++ b/benchmarks/tpch/tpch.py @@ -59,13 +59,13 @@ def bench(data_path, query_path): end = time.time() time_millis = (end - start) * 1000 total_time_millis += time_millis - print("setup,{}".format(round(time_millis, 1))) - results.write("setup,{}\n".format(round(time_millis, 1))) + print(f"setup,{round(time_millis, 1)}") + results.write(f"setup,{round(time_millis, 1)}\n") results.flush() # run queries for query in range(1, 23): - with open("{}/q{}.sql".format(query_path, query)) as f: + with open(f"{query_path}/q{query}.sql") as f: text = f.read() tmp = text.split(";") queries = [] @@ -83,14 +83,14 @@ def bench(data_path, query_path): end = time.time() time_millis = (end - start) * 1000 total_time_millis += time_millis - print("q{},{}".format(query, round(time_millis, 1))) - results.write("q{},{}\n".format(query, round(time_millis, 1))) + print(f"q{query},{round(time_millis, 1)}") + results.write(f"q{query},{round(time_millis, 1)}\n") results.flush() except Exception as e: print("query", query, "failed", e) - print("total,{}".format(round(total_time_millis, 1))) - results.write("total,{}\n".format(round(total_time_millis, 1))) + print(f"total,{round(total_time_millis, 1)}") + results.write(f"total,{round(total_time_millis, 1)}\n") if __name__ == "__main__": diff --git a/dev/release/check-rat-report.py b/dev/release/check-rat-report.py index d3dd7c5dd..0c9f4c326 100644 --- a/dev/release/check-rat-report.py +++ b/dev/release/check-rat-report.py @@ -29,7 +29,7 @@ exclude_globs_filename = sys.argv[1] xml_filename = sys.argv[2] -globs = [line.strip() for line in open(exclude_globs_filename, "r")] +globs = [line.strip() for line in open(exclude_globs_filename)] tree = ET.parse(xml_filename) root = tree.getroot() diff --git a/dev/release/generate-changelog.py b/dev/release/generate-changelog.py index 2564eea86..e30e2def2 100755 --- a/dev/release/generate-changelog.py +++ b/dev/release/generate-changelog.py @@ -26,15 +26,11 @@ def print_pulls(repo_name, title, pulls): if len(pulls) > 0: - print("**{}:**".format(title)) + print(f"**{title}:**") print() for pull, commit in pulls: - url = "https://github.com/{}/pull/{}".format(repo_name, pull.number) - print( - "- {} [#{}]({}) ({})".format( - pull.title, pull.number, url, commit.author.login - ) - ) + url = f"https://github.com/{repo_name}/pull/{pull.number}" + print(f"- {pull.title} [#{pull.number}]({url}) ({commit.author.login})") print() diff --git a/docs/source/conf.py b/docs/source/conf.py index 2e5a41339..c82a189e0 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -73,7 +73,7 @@ autoapi_python_class_content = "both" -def autoapi_skip_member_fn(app, what, name, obj, skip, options): +def autoapi_skip_member_fn(app, what, name, obj, skip, options): # noqa: ARG001 skip_contents = [ # Re-exports ("class", "datafusion.DataFrame"), diff --git a/examples/python-udwf.py b/examples/python-udwf.py index 7d39dc1b8..98d118bf2 100644 --- a/examples/python-udwf.py +++ b/examples/python-udwf.py @@ -59,7 +59,7 @@ def __init__(self, alpha: float) -> None: def supports_bounded_execution(self) -> bool: return True - def get_range(self, idx: int, num_rows: int) -> tuple[int, int]: + def get_range(self, idx: int, num_rows: int) -> tuple[int, int]: # noqa: ARG002 # Override the default range of current row since uses_window_frame is False # So for the purpose of this test we just smooth from the previous row to # 
current. diff --git a/examples/tpch/_tests.py b/examples/tpch/_tests.py index c4d872085..2be4dfabd 100644 --- a/examples/tpch/_tests.py +++ b/examples/tpch/_tests.py @@ -27,28 +27,25 @@ def df_selection(col_name, col_type): if col_type == pa.float64() or isinstance(col_type, pa.Decimal128Type): return F.round(col(col_name), lit(2)).alias(col_name) - elif col_type == pa.string() or col_type == pa.string_view(): + if col_type == pa.string() or col_type == pa.string_view(): return F.trim(col(col_name)).alias(col_name) - else: - return col(col_name) + return col(col_name) def load_schema(col_name, col_type): if col_type == pa.int64() or col_type == pa.int32(): return col_name, pa.string() - elif isinstance(col_type, pa.Decimal128Type): + if isinstance(col_type, pa.Decimal128Type): return col_name, pa.float64() - else: - return col_name, col_type + return col_name, col_type def expected_selection(col_name, col_type): if col_type == pa.int64() or col_type == pa.int32(): return F.trim(col(col_name)).cast(col_type).alias(col_name) - elif col_type == pa.string() or col_type == pa.string_view(): + if col_type == pa.string() or col_type == pa.string_view(): return F.trim(col(col_name)).alias(col_name) - else: - return col(col_name) + return col(col_name) def selections_and_schema(original_schema): diff --git a/pyproject.toml b/pyproject.toml index 1c2733677..060e3b80a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,7 +65,57 @@ features = ["substrait"] # Enable docstring linting using the google style guide [tool.ruff.lint] -select = ["E4", "E7", "E9", "F", "FA", "D", "W", "I"] +select = ["ALL" ] +ignore = [ + "A001", # Allow using words like min as variable names + "A002", # Allow using words like filter as variable names + "ANN401", # Allow Any for wrapper classes + "COM812", # Recommended to ignore these rules when using with ruff-format + "FIX002", # Allow TODO lines - consider removing at some point + "FBT001", # Allow boolean positional args + "FBT002", # Allow boolean positional args + "ISC001", # Recommended to ignore these rules when using with ruff-format + "SLF001", # Allow accessing private members + "TD002", + "TD003", # Allow TODO lines + "UP007", # Disallowing Union is pedantic + # TODO: Enable all of the following, but this PR is getting too large already + "PT001", + "ANN204", + "B008", + "EM101", + "PLR0913", + "PLR1714", + "ANN201", + "C400", + "TRY003", + "B904", + "UP006", + "RUF012", + "FBT003", + "C416", + "SIM102", + "PGH003", + "PLR2004", + "PERF401", + "PD901", + "EM102", + "ERA001", + "SIM108", + "ICN001", + "ANN001", + "ANN202", + "PTH", + "N812", + "INP001", + "DTZ007", + "PLW2901", + "RET503", + "RUF015", + "A005", + "TC001", + "UP035", +] [tool.ruff.lint.pydocstyle] convention = "google" @@ -75,16 +125,30 @@ max-doc-length = 88 # Disable docstring checking for these directories [tool.ruff.lint.per-file-ignores] -"python/tests/*" = ["D"] -"examples/*" = ["D", "W505"] -"dev/*" = ["D"] -"benchmarks/*" = ["D", "F"] +"python/tests/*" = [ + "ANN", + "ARG", + "BLE001", + "D", + "S101", + "SLF", + "PD", + "PLR2004", + "PT011", + "RUF015", + "S608", + "PLR0913", + "PT004", +] +"examples/*" = ["D", "W505", "E501", "T201", "S101"] +"dev/*" = ["D", "E", "T", "S", "PLR", "C", "SIM", "UP", "EXE", "N817"] +"benchmarks/*" = ["D", "F", "T", "BLE", "FURB", "PLR", "E", "TD", "TRY", "S", "SIM", "EXE", "UP"] "docs/*" = ["D"] [dependency-groups] dev = [ "maturin>=1.8.1", - "numpy>1.24.4 ; python_full_version >= '3.10'", + "numpy>1.25.0", "pytest>=7.4.4", "ruff>=0.9.1", "toml>=0.10.2", 
diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index f11ce54a6..286e5dc31 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -48,44 +48,47 @@ from .io import read_avro, read_csv, read_json, read_parquet from .plan import ExecutionPlan, LogicalPlan from .record_batch import RecordBatch, RecordBatchStream -from .udf import Accumulator, AggregateUDF, ScalarUDF, WindowUDF +from .udf import Accumulator, AggregateUDF, ScalarUDF, WindowUDF, udaf, udf, udwf __version__ = importlib_metadata.version(__name__) __all__ = [ "Accumulator", + "AggregateUDF", + "Catalog", "Config", - "DataFrame", - "SessionContext", - "SessionConfig", - "SQLOptions", - "RuntimeEnvBuilder", - "Expr", - "ScalarUDF", - "WindowFrame", - "column", - "col", - "literal", - "lit", "DFSchema", - "Catalog", + "DataFrame", "Database", - "Table", - "AggregateUDF", - "WindowUDF", - "LogicalPlan", "ExecutionPlan", + "Expr", + "LogicalPlan", "RecordBatch", "RecordBatchStream", + "RuntimeEnvBuilder", + "SQLOptions", + "ScalarUDF", + "SessionConfig", + "SessionContext", + "Table", + "WindowFrame", + "WindowUDF", + "col", + "column", "common", "expr", "functions", + "lit", + "literal", "object_store", - "substrait", - "read_parquet", "read_avro", "read_csv", "read_json", + "read_parquet", + "substrait", + "udaf", + "udf", + "udwf", ] @@ -120,10 +123,3 @@ def str_lit(value): def lit(value): """Create a literal expression.""" return Expr.literal(value) - - -udf = ScalarUDF.udf - -udaf = AggregateUDF.udaf - -udwf = WindowUDF.udwf diff --git a/python/datafusion/common.py b/python/datafusion/common.py index a2298c634..e762a993b 100644 --- a/python/datafusion/common.py +++ b/python/datafusion/common.py @@ -20,7 +20,7 @@ from ._internal import common as common_internal -# TODO these should all have proper wrapper classes +# TODO: these should all have proper wrapper classes DFSchema = common_internal.DFSchema DataType = common_internal.DataType @@ -38,15 +38,15 @@ "DFSchema", "DataType", "DataTypeMap", - "RexType", - "PythonType", - "SqlType", "NullTreatment", - "SqlTable", + "PythonType", + "RexType", + "SqlFunction", "SqlSchema", - "SqlView", "SqlStatistics", - "SqlFunction", + "SqlTable", + "SqlType", + "SqlView", ] diff --git a/python/datafusion/context.py b/python/datafusion/context.py index 282b2a477..0ab1a908a 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -393,8 +393,6 @@ def with_temp_file_path(self, path: str | pathlib.Path) -> RuntimeEnvBuilder: class RuntimeConfig(RuntimeEnvBuilder): """See `RuntimeEnvBuilder`.""" - pass - class SQLOptions: """Options to be used when performing SQL queries.""" @@ -498,7 +496,7 @@ def __init__( self.ctx = SessionContextInternal(config, runtime) - def enable_url_table(self) -> "SessionContext": + def enable_url_table(self) -> SessionContext: """Control if local files can be queried as tables. 
Returns: diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index de5d8376e..d1c71c2bb 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -29,6 +29,7 @@ List, Literal, Optional, + Type, Union, overload, ) @@ -49,10 +50,11 @@ import polars as pl import pyarrow as pa + from datafusion._internal import DataFrame as DataFrameInternal + from datafusion._internal import expr as expr_internal + from enum import Enum -from datafusion._internal import DataFrame as DataFrameInternal -from datafusion._internal import expr as expr_internal from datafusion.expr import Expr, SortExpr, sort_or_default @@ -73,7 +75,7 @@ class Compression(Enum): LZ4_RAW = "lz4_raw" @classmethod - def from_str(cls, value: str) -> "Compression": + def from_str(cls: Type[Compression], value: str) -> Compression: """Convert a string to a Compression enum value. Args: @@ -88,8 +90,9 @@ def from_str(cls, value: str) -> "Compression": try: return cls(value.lower()) except ValueError: + valid_values = str([item.value for item in Compression]) raise ValueError( - f"{value} is not a valid Compression. Valid values are: {[item.value for item in Compression]}" + f"{value} is not a valid Compression. Valid values are: {valid_values}" ) def get_default_level(self) -> Optional[int]: @@ -104,9 +107,9 @@ def get_default_level(self) -> Optional[int]: # https://github.com/apache/datafusion-python/pull/981#discussion_r1904789223 if self == Compression.GZIP: return 6 - elif self == Compression.BROTLI: + if self == Compression.BROTLI: return 1 - elif self == Compression.ZSTD: + if self == Compression.ZSTD: return 4 return None diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index 3639abec6..702f75aed 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -101,63 +101,63 @@ WindowExpr = expr_internal.WindowExpr __all__ = [ - "Expr", - "Column", - "Literal", - "BinaryExpr", - "Literal", + "Aggregate", "AggregateFunction", - "Not", - "IsNotNull", - "IsNull", - "IsTrue", - "IsFalse", - "IsUnknown", - "IsNotTrue", - "IsNotFalse", - "IsNotUnknown", - "Negative", - "Like", - "ILike", - "SimilarTo", - "ScalarVariable", "Alias", - "InList", - "Exists", - "Subquery", - "InSubquery", - "ScalarSubquery", - "Placeholder", - "GroupingSet", + "Analyze", + "Between", + "BinaryExpr", "Case", "CaseBuilder", "Cast", - "TryCast", - "Between", + "Column", + "CreateMemoryTable", + "CreateView", + "Distinct", + "DropTable", + "EmptyRelation", + "Exists", "Explain", + "Expr", + "Extension", + "Filter", + "GroupingSet", + "ILike", + "InList", + "InSubquery", + "IsFalse", + "IsNotFalse", + "IsNotNull", + "IsNotTrue", + "IsNotUnknown", + "IsNull", + "IsTrue", + "IsUnknown", + "Join", + "JoinConstraint", + "JoinType", + "Like", "Limit", - "Aggregate", + "Literal", + "Literal", + "Negative", + "Not", + "Partitioning", + "Placeholder", + "Projection", + "Repartition", + "ScalarSubquery", + "ScalarVariable", + "SimilarTo", "Sort", "SortExpr", - "Analyze", - "EmptyRelation", - "Join", - "JoinType", - "JoinConstraint", + "Subquery", + "SubqueryAlias", + "TableScan", + "TryCast", "Union", "Unnest", "UnnestExpr", - "Extension", - "Filter", - "Projection", - "TableScan", - "CreateMemoryTable", - "CreateView", - "Distinct", - "SubqueryAlias", - "DropTable", - "Partitioning", - "Repartition", "Window", "WindowExpr", "WindowFrame", @@ -311,7 +311,7 @@ def __getitem__(self, key: str | int) -> Expr: ) return Expr(self.expr.__getitem__(key)) - def __eq__(self, rhs: Any) -> Expr: + def 
__eq__(self, rhs: object) -> Expr: """Equal to. Accepts either an expression or any valid PyArrow scalar literal value. @@ -320,7 +320,7 @@ def __eq__(self, rhs: Any) -> Expr: rhs = Expr.literal(rhs) return Expr(self.expr.__eq__(rhs.expr)) - def __ne__(self, rhs: Any) -> Expr: + def __ne__(self, rhs: object) -> Expr: """Not equal to. Accepts either an expression or any valid PyArrow scalar literal value. diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index b449c4868..0cc7434cf 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -18,13 +18,12 @@ from __future__ import annotations -from typing import Any, Optional +from typing import TYPE_CHECKING, Any, Optional import pyarrow as pa from datafusion._internal import functions as f from datafusion.common import NullTreatment -from datafusion.context import SessionContext from datafusion.expr import ( CaseBuilder, Expr, @@ -34,6 +33,9 @@ sort_list_to_raw_sort_list, ) +if TYPE_CHECKING: + from datafusion.context import SessionContext + __all__ = [ "abs", "acos", @@ -81,8 +83,8 @@ "array_sort", "array_to_string", "array_union", - "arrow_typeof", "arrow_cast", + "arrow_typeof", "ascii", "asin", "asinh", @@ -97,6 +99,7 @@ "bool_and", "bool_or", "btrim", + "cardinality", "case", "cbrt", "ceil", @@ -116,6 +119,7 @@ "covar", "covar_pop", "covar_samp", + "cume_dist", "current_date", "current_time", "date_bin", @@ -125,17 +129,17 @@ "datetrunc", "decode", "degrees", + "dense_rank", "digest", "empty", "encode", "ends_with", - "extract", "exp", + "extract", "factorial", "find_in_set", "first_value", "flatten", - "cardinality", "floor", "from_unixtime", "gcd", @@ -143,8 +147,10 @@ "initcap", "isnan", "iszero", + "lag", "last_value", "lcm", + "lead", "left", "length", "levenshtein", @@ -166,10 +172,10 @@ "list_prepend", "list_push_back", "list_push_front", - "list_repeat", "list_remove", "list_remove_all", "list_remove_n", + "list_repeat", "list_replace", "list_replace_all", "list_replace_n", @@ -180,14 +186,14 @@ "list_union", "ln", "log", - "log10", "log2", + "log10", "lower", "lpad", "ltrim", "make_array", - "make_list", "make_date", + "make_list", "max", "md5", "mean", @@ -195,19 +201,22 @@ "min", "named_struct", "nanvl", - "nvl", "now", "nth_value", + "ntile", "nullif", + "nvl", "octet_length", "order_by", "overlay", + "percent_rank", "pi", "pow", "power", "radians", "random", "range", + "rank", "regexp_like", "regexp_match", "regexp_replace", @@ -225,6 +234,7 @@ "reverse", "right", "round", + "row_number", "rpad", "rtrim", "sha224", @@ -252,8 +262,8 @@ "to_hex", "to_timestamp", "to_timestamp_micros", - "to_timestamp_nanos", "to_timestamp_millis", + "to_timestamp_nanos", "to_timestamp_seconds", "to_unixtime", "translate", @@ -268,14 +278,6 @@ "when", # Window Functions "window", - "lead", - "lag", - "row_number", - "rank", - "dense_rank", - "percent_rank", - "cume_dist", - "ntile", ] @@ -292,14 +294,14 @@ def nullif(expr1: Expr, expr2: Expr) -> Expr: return Expr(f.nullif(expr1.expr, expr2.expr)) -def encode(input: Expr, encoding: Expr) -> Expr: +def encode(expr: Expr, encoding: Expr) -> Expr: """Encode the ``input``, using the ``encoding``. encoding can be base64 or hex.""" - return Expr(f.encode(input.expr, encoding.expr)) + return Expr(f.encode(expr.expr, encoding.expr)) -def decode(input: Expr, encoding: Expr) -> Expr: +def decode(expr: Expr, encoding: Expr) -> Expr: """Decode the ``input``, using the ``encoding``. 
encoding can be base64 or hex.""" - return Expr(f.decode(input.expr, encoding.expr)) + return Expr(f.decode(expr.expr, encoding.expr)) def array_to_string(expr: Expr, delimiter: Expr) -> Expr: diff --git a/python/datafusion/input/__init__.py b/python/datafusion/input/__init__.py index f85ce21f0..f0c1f42b4 100644 --- a/python/datafusion/input/__init__.py +++ b/python/datafusion/input/__init__.py @@ -23,5 +23,5 @@ from .location import LocationInputPlugin __all__ = [ - LocationInputPlugin, + "LocationInputPlugin", ] diff --git a/python/datafusion/input/base.py b/python/datafusion/input/base.py index 4eba19784..f67dde2a1 100644 --- a/python/datafusion/input/base.py +++ b/python/datafusion/input/base.py @@ -38,11 +38,9 @@ class BaseInputSource(ABC): """ @abstractmethod - def is_correct_input(self, input_item: Any, table_name: str, **kwargs) -> bool: + def is_correct_input(self, input_item: Any, table_name: str, **kwargs: Any) -> bool: """Returns `True` if the input is valid.""" - pass @abstractmethod - def build_table(self, input_item: Any, table_name: str, **kwarg) -> SqlTable: + def build_table(self, input_item: Any, table_name: str, **kwarg: Any) -> SqlTable: # type: ignore[invalid-type-form] """Create a table from the input source.""" - pass diff --git a/python/datafusion/input/location.py b/python/datafusion/input/location.py index 517cd1578..08d98d115 100644 --- a/python/datafusion/input/location.py +++ b/python/datafusion/input/location.py @@ -18,7 +18,7 @@ """The default input source for DataFusion.""" import glob -import os +from pathlib import Path from typing import Any from datafusion.common import DataTypeMap, SqlTable @@ -31,7 +31,7 @@ class LocationInputPlugin(BaseInputSource): This can be read in from a file (on disk, remote etc.). """ - def is_correct_input(self, input_item: Any, table_name: str, **kwargs): + def is_correct_input(self, input_item: Any, table_name: str, **kwargs: Any) -> bool: # noqa: ARG002 """Returns `True` if the input is valid.""" return isinstance(input_item, str) @@ -39,27 +39,28 @@ def build_table( self, input_item: str, table_name: str, - **kwargs, - ) -> SqlTable: + **kwargs: Any, # noqa: ARG002 + ) -> SqlTable: # type: ignore[invalid-type-form] """Create a table from the input source.""" - _, extension = os.path.splitext(input_item) - format = extension.lstrip(".").lower() + extension = Path(input_item).suffix + file_format = extension.lstrip(".").lower() num_rows = 0 # Total number of rows in the file. Used for statistics columns = [] - if format == "parquet": + if file_format == "parquet": import pyarrow.parquet as pq # Read the Parquet metadata metadata = pq.read_metadata(input_item) num_rows = metadata.num_rows # Iterate through the schema and build the SqlTable - for col in metadata.schema: - columns.append( - ( - col.name, - DataTypeMap.from_parquet_type_str(col.physical_type), - ) + columns = [ + ( + col.name, + DataTypeMap.from_parquet_type_str(col.physical_type), ) + for col in metadata.schema + ] + elif format == "csv": import csv @@ -69,19 +70,18 @@ def build_table( # to get that information. However, this should only be occurring # at table creation time and therefore shouldn't # slow down query performance. 
-        with open(input_item, "r") as file: +        with Path(input_item).open() as file: reader = csv.reader(file) -            header_row = next(reader) -            print(header_row) +            _header_row = next(reader) for _ in reader: num_rows += 1 # TODO: Need to actually consume this row into reasonable columns -            raise RuntimeError("TODO: Currently unable to support CSV input files.") +            msg = "TODO: Currently unable to support CSV input files." +            raise RuntimeError(msg) else: -            raise RuntimeError( -                f"Input of format: `{format}` is currently not supported.\ +            msg = f"Input of format: `{file_format}` is currently not supported.\ Only Parquet and CSV." -            ) +            raise RuntimeError(msg) # Input could possibly be multiple files. Create a list if so input_files = glob.glob(input_item) diff --git a/python/datafusion/io.py b/python/datafusion/io.py index 3b6264948..3e39703e3 100644 --- a/python/datafusion/io.py +++ b/python/datafusion/io.py @@ -19,15 +19,19 @@ from __future__ import annotations -import pathlib - -import pyarrow +from typing import TYPE_CHECKING from datafusion.dataframe import DataFrame -from datafusion.expr import Expr from ._internal import SessionContext as SessionContextInternal +if TYPE_CHECKING: + import pathlib + + import pyarrow as pa + + from datafusion.expr import Expr + def read_parquet( path: str | pathlib.Path, @@ -35,7 +39,7 @@ def read_parquet( parquet_pruning: bool = True, file_extension: str = ".parquet", skip_metadata: bool = True, - schema: pyarrow.Schema | None = None, + schema: pa.Schema | None = None, file_sort_order: list[list[Expr]] | None = None, ) -> DataFrame: """Read a Parquet source into a :py:class:`~datafusion.dataframe.Dataframe`. @@ -79,7 +83,7 @@ def read_parquet( def read_json( path: str | pathlib.Path, - schema: pyarrow.Schema | None = None, + schema: pa.Schema | None = None, schema_infer_max_records: int = 1000, file_extension: str = ".json", table_partition_cols: list[tuple[str, str]] | None = None, @@ -120,7 +124,7 @@ def read_json( def read_csv( path: str | pathlib.Path | list[str] | list[pathlib.Path], - schema: pyarrow.Schema | None = None, + schema: pa.Schema | None = None, has_header: bool = True, delimiter: str = ",", schema_infer_max_records: int = 1000, @@ -173,7 +177,7 @@ def read_csv( def read_avro( path: str | pathlib.Path, - schema: pyarrow.Schema | None = None, + schema: pa.Schema | None = None, file_partition_cols: list[tuple[str, str]] | None = None, file_extension: str = ".avro", ) -> DataFrame: diff --git a/python/datafusion/object_store.py b/python/datafusion/object_store.py index 7cc17506f..6298526f5 100644 --- a/python/datafusion/object_store.py +++ b/python/datafusion/object_store.py @@ -24,4 +24,4 @@ MicrosoftAzure = object_store.MicrosoftAzure Http = object_store.Http -__all__ = ["AmazonS3", "GoogleCloud", "LocalFileSystem", "MicrosoftAzure", "Http"] +__all__ = ["AmazonS3", "GoogleCloud", "Http", "LocalFileSystem", "MicrosoftAzure"] diff --git a/python/datafusion/plan.py b/python/datafusion/plan.py index 133fc446d..0b7bebcb3 100644 --- a/python/datafusion/plan.py +++ b/python/datafusion/plan.py @@ -19,7 +19,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, List +from typing import TYPE_CHECKING, Any import datafusion._internal as df_internal @@ -27,8 +27,8 @@ from datafusion.context import SessionContext __all__ = [ - "LogicalPlan", "ExecutionPlan", + "LogicalPlan", ] @@ -54,7 +54,7 @@ def to_variant(self) -> Any: """Convert the logical plan into its specific variant.""" return self._raw_plan.to_variant() - def inputs(self) ->
List[LogicalPlan]: + def inputs(self) -> list[LogicalPlan]: """Returns the list of inputs to the logical plan.""" return [LogicalPlan(p) for p in self._raw_plan.inputs()] @@ -106,7 +106,7 @@ def __init__(self, plan: df_internal.ExecutionPlan) -> None: """This constructor should not be called by the end user.""" self._raw_plan = plan - def children(self) -> List[ExecutionPlan]: + def children(self) -> list[ExecutionPlan]: """Get a list of children `ExecutionPlan` that act as inputs to this plan. The returned list will be empty for leaf nodes such as scans, will contain a diff --git a/python/datafusion/record_batch.py b/python/datafusion/record_batch.py index 772cd9089..556eaa786 100644 --- a/python/datafusion/record_batch.py +++ b/python/datafusion/record_batch.py @@ -26,14 +26,14 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: - import pyarrow + import pyarrow as pa import typing_extensions import datafusion._internal as df_internal class RecordBatch: - """This class is essentially a wrapper for :py:class:`pyarrow.RecordBatch`.""" + """This class is essentially a wrapper for :py:class:`pa.RecordBatch`.""" def __init__(self, record_batch: df_internal.RecordBatch) -> None: """This constructor is generally not called by the end user. @@ -42,8 +42,8 @@ def __init__(self, record_batch: df_internal.RecordBatch) -> None: """ self.record_batch = record_batch - def to_pyarrow(self) -> pyarrow.RecordBatch: - """Convert to :py:class:`pyarrow.RecordBatch`.""" + def to_pyarrow(self) -> pa.RecordBatch: + """Convert to :py:class:`pa.RecordBatch`.""" return self.record_batch.to_pyarrow() diff --git a/python/datafusion/substrait.py b/python/datafusion/substrait.py index 06302fe38..f10adfb0c 100644 --- a/python/datafusion/substrait.py +++ b/python/datafusion/substrait.py @@ -23,7 +23,6 @@ from __future__ import annotations -import pathlib from typing import TYPE_CHECKING try: @@ -36,11 +35,13 @@ from ._internal import substrait as substrait_internal if TYPE_CHECKING: + import pathlib + from datafusion.context import SessionContext __all__ = [ - "Plan", "Consumer", + "Plan", "Producer", "Serde", ] @@ -68,11 +69,9 @@ def encode(self) -> bytes: @deprecated("Use `Plan` instead.") -class plan(Plan): +class plan(Plan): # noqa: N801 """See `Plan`.""" - pass - class Serde: """Provides the ``Substrait`` serialization and deserialization.""" @@ -140,11 +139,9 @@ def deserialize_bytes(proto_bytes: bytes) -> Plan: @deprecated("Use `Serde` instead.") -class serde(Serde): +class serde(Serde): # noqa: N801 """See `Serde` instead.""" - pass - class Producer: """Generates substrait plans from a logical plan.""" @@ -168,11 +165,9 @@ def to_substrait_plan(logical_plan: LogicalPlan, ctx: SessionContext) -> Plan: @deprecated("Use `Producer` instead.") -class producer(Producer): +class producer(Producer): # noqa: N801 """Use `Producer` instead.""" - pass - class Consumer: """Generates a logical plan from a substrait plan.""" @@ -194,7 +189,5 @@ def from_substrait_plan(ctx: SessionContext, plan: Plan) -> LogicalPlan: @deprecated("Use `Consumer` instead.") -class consumer(Consumer): +class consumer(Consumer): # noqa: N801 """Use `Consumer` instead.""" - - pass diff --git a/python/datafusion/udf.py b/python/datafusion/udf.py index af7bcf2ed..603b7063d 100644 --- a/python/datafusion/udf.py +++ b/python/datafusion/udf.py @@ -22,15 +22,15 @@ import functools from abc import ABCMeta, abstractmethod from enum import Enum -from typing import TYPE_CHECKING, Callable, List, Optional, TypeVar +from typing import TYPE_CHECKING, Any, 
Callable, Optional, TypeVar, overload -import pyarrow +import pyarrow as pa import datafusion._internal as df_internal from datafusion.expr import Expr if TYPE_CHECKING: - _R = TypeVar("_R", bound=pyarrow.DataType) + _R = TypeVar("_R", bound=pa.DataType) class Volatility(Enum): @@ -72,7 +72,7 @@ class Volatility(Enum): for each output row, resulting in a unique random value for each row. """ - def __str__(self): + def __str__(self) -> str: """Returns the string equivalent.""" return self.name.lower() @@ -88,7 +88,7 @@ def __init__( self, name: str, func: Callable[..., _R], - input_types: pyarrow.DataType | list[pyarrow.DataType], + input_types: pa.DataType | list[pa.DataType], return_type: _R, volatility: Volatility | str, ) -> None: @@ -96,7 +96,7 @@ def __init__( See helper method :py:func:`udf` for argument details. """ - if isinstance(input_types, pyarrow.DataType): + if isinstance(input_types, pa.DataType): input_types = [input_types] self._udf = df_internal.ScalarUDF( name, func, input_types, return_type, str(volatility) @@ -111,7 +111,27 @@ def __call__(self, *args: Expr) -> Expr: args_raw = [arg.expr for arg in args] return Expr(self._udf.__call__(*args_raw)) - class udf: + @overload + @staticmethod + def udf( + input_types: list[pa.DataType], + return_type: _R, + volatility: Volatility | str, + name: Optional[str] = None, + ) -> Callable[..., ScalarUDF]: ... + + @overload + @staticmethod + def udf( + func: Callable[..., _R], + input_types: list[pa.DataType], + return_type: _R, + volatility: Volatility | str, + name: Optional[str] = None, + ) -> ScalarUDF: ... + + @staticmethod + def udf(*args: Any, **kwargs: Any): # noqa: D417 """Create a new User-Defined Function (UDF). This class can be used both as a **function** and as a **decorator**. @@ -125,7 +145,7 @@ class udf: Args: func (Callable, optional): **Only needed when calling as a function.** Skip this argument when using `udf` as a decorator. - input_types (list[pyarrow.DataType]): The data types of the arguments + input_types (list[pa.DataType]): The data types of the arguments to `func`. This list must be of the same length as the number of arguments. return_type (_R): The data type of the return value from the function. @@ -141,40 +161,28 @@ class udf: ``` def double_func(x): return x * 2 - double_udf = udf(double_func, [pyarrow.int32()], pyarrow.int32(), + double_udf = udf(double_func, [pa.int32()], pa.int32(), "volatile", "double_it") ``` **Using `udf` as a decorator:** ``` - @udf([pyarrow.int32()], pyarrow.int32(), "volatile", "double_it") + @udf([pa.int32()], pa.int32(), "volatile", "double_it") def double_udf(x): return x * 2 ``` """ - def __new__(cls, *args, **kwargs): - """Create a new UDF. 
- - Trigger UDF function or decorator depending on if the first args is callable - """ - if args and callable(args[0]): - # Case 1: Used as a function, require the first parameter to be callable - return cls._function(*args, **kwargs) - else: - # Case 2: Used as a decorator with parameters - return cls._decorator(*args, **kwargs) - - @staticmethod def _function( func: Callable[..., _R], - input_types: list[pyarrow.DataType], + input_types: list[pa.DataType], return_type: _R, volatility: Volatility | str, name: Optional[str] = None, ) -> ScalarUDF: if not callable(func): - raise TypeError("`func` argument must be callable") + msg = "`func` argument must be callable" + raise TypeError(msg) if name is None: if hasattr(func, "__qualname__"): name = func.__qualname__.lower() @@ -188,49 +196,50 @@ def _function( volatility=volatility, ) - @staticmethod def _decorator( - input_types: list[pyarrow.DataType], + input_types: list[pa.DataType], return_type: _R, volatility: Volatility | str, name: Optional[str] = None, - ): - def decorator(func): + ) -> Callable: + def decorator(func: Callable): udf_caller = ScalarUDF.udf( func, input_types, return_type, volatility, name ) @functools.wraps(func) - def wrapper(*args, **kwargs): + def wrapper(*args: Any, **kwargs: Any): return udf_caller(*args, **kwargs) return wrapper return decorator + if args and callable(args[0]): + # Case 1: Used as a function, require the first parameter to be callable + return _function(*args, **kwargs) + # Case 2: Used as a decorator with parameters + return _decorator(*args, **kwargs) + class Accumulator(metaclass=ABCMeta): """Defines how an :py:class:`AggregateUDF` accumulates values.""" @abstractmethod - def state(self) -> List[pyarrow.Scalar]: + def state(self) -> list[pa.Scalar]: """Return the current state.""" - pass @abstractmethod - def update(self, *values: pyarrow.Array) -> None: + def update(self, *values: pa.Array) -> None: """Evaluate an array of values and update state.""" - pass @abstractmethod - def merge(self, states: List[pyarrow.Array]) -> None: + def merge(self, states: list[pa.Array]) -> None: """Merge a set of states.""" - pass @abstractmethod - def evaluate(self) -> pyarrow.Scalar: + def evaluate(self) -> pa.Scalar: """Return the resultant value.""" - pass class AggregateUDF: @@ -244,9 +253,9 @@ def __init__( self, name: str, accumulator: Callable[[], Accumulator], - input_types: list[pyarrow.DataType], - return_type: pyarrow.DataType, - state_type: list[pyarrow.DataType], + input_types: list[pa.DataType], + return_type: pa.DataType, + state_type: list[pa.DataType], volatility: Volatility | str, ) -> None: """Instantiate a user-defined aggregate function (UDAF). @@ -272,7 +281,29 @@ def __call__(self, *args: Expr) -> Expr: args_raw = [arg.expr for arg in args] return Expr(self._udaf.__call__(*args_raw)) - class udaf: + @overload + @staticmethod + def udaf( + input_types: pa.DataType | list[pa.DataType], + return_type: pa.DataType, + state_type: list[pa.DataType], + volatility: Volatility | str, + name: Optional[str] = None, + ) -> Callable[..., AggregateUDF]: ... + + @overload + @staticmethod + def udaf( + accum: Callable[[], Accumulator], + input_types: pa.DataType | list[pa.DataType], + return_type: pa.DataType, + state_type: list[pa.DataType], + volatility: Volatility | str, + name: Optional[str] = None, + ) -> AggregateUDF: ... + + @staticmethod + def udaf(*args: Any, **kwargs: Any): # noqa: D417 """Create a new User-Defined Aggregate Function (UDAF). 
This class allows you to define an **aggregate function** that can be used in @@ -300,13 +331,13 @@ class Summarize(Accumulator): def __init__(self, bias: float = 0.0): self._sum = pa.scalar(bias) - def state(self) -> List[pa.Scalar]: + def state(self) -> list[pa.Scalar]: return [self._sum] def update(self, values: pa.Array) -> None: self._sum = pa.scalar(self._sum.as_py() + pc.sum(values).as_py()) - def merge(self, states: List[pa.Array]) -> None: + def merge(self, states: list[pa.Array]) -> None: self._sum = pa.scalar(self._sum.as_py() + pc.sum(states[0]).as_py()) def evaluate(self) -> pa.Scalar: @@ -344,37 +375,23 @@ def udf4() -> Summarize: aggregation or window function calls. """ - def __new__(cls, *args, **kwargs): - """Create a new UDAF. - - Trigger UDAF function or decorator depending on if the first args is - callable - """ - if args and callable(args[0]): - # Case 1: Used as a function, require the first parameter to be callable - return cls._function(*args, **kwargs) - else: - # Case 2: Used as a decorator with parameters - return cls._decorator(*args, **kwargs) - - @staticmethod def _function( accum: Callable[[], Accumulator], - input_types: pyarrow.DataType | list[pyarrow.DataType], - return_type: pyarrow.DataType, - state_type: list[pyarrow.DataType], + input_types: pa.DataType | list[pa.DataType], + return_type: pa.DataType, + state_type: list[pa.DataType], volatility: Volatility | str, name: Optional[str] = None, ) -> AggregateUDF: if not callable(accum): - raise TypeError("`func` must be callable.") - if not isinstance(accum.__call__(), Accumulator): - raise TypeError( - "Accumulator must implement the abstract base class Accumulator" - ) + msg = "`func` must be callable." + raise TypeError(msg) + if not isinstance(accum(), Accumulator): + msg = "Accumulator must implement the abstract base class Accumulator" + raise TypeError(msg) if name is None: - name = accum.__call__().__class__.__qualname__.lower() - if isinstance(input_types, pyarrow.DataType): + name = accum().__class__.__qualname__.lower() + if isinstance(input_types, pa.DataType): input_types = [input_types] return AggregateUDF( name=name, @@ -385,29 +402,34 @@ def _function( volatility=volatility, ) - @staticmethod def _decorator( - input_types: pyarrow.DataType | list[pyarrow.DataType], - return_type: pyarrow.DataType, - state_type: list[pyarrow.DataType], + input_types: pa.DataType | list[pa.DataType], + return_type: pa.DataType, + state_type: list[pa.DataType], volatility: Volatility | str, name: Optional[str] = None, - ): - def decorator(accum: Callable[[], Accumulator]): + ) -> Callable[..., Callable[..., Expr]]: + def decorator(accum: Callable[[], Accumulator]) -> Callable[..., Expr]: udaf_caller = AggregateUDF.udaf( accum, input_types, return_type, state_type, volatility, name ) @functools.wraps(accum) - def wrapper(*args, **kwargs): + def wrapper(*args: Any, **kwargs: Any) -> Expr: return udaf_caller(*args, **kwargs) return wrapper return decorator + if args and callable(args[0]): + # Case 1: Used as a function, require the first parameter to be callable + return _function(*args, **kwargs) + # Case 2: Used as a decorator with parameters + return _decorator(*args, **kwargs) + -class WindowEvaluator(metaclass=ABCMeta): +class WindowEvaluator: """Evaluator class for user-defined window functions (UDWF). It is up to the user to decide which evaluate function is appropriate. 
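(Aside, not part of the patch: the refactor above replaces the old `__new__`-based dispatch with typed `@overload` declarations, but both call styles keep working. A minimal sketch under those assumptions, using the `udf` convenience export added at the end of this file; `double_func` and `triple_udf` are illustrative names only:)

    import pyarrow as pa
    import pyarrow.compute as pc

    from datafusion import udf

    # Style 1: direct call. The first positional argument is callable, so the
    # dispatch at the bottom of ScalarUDF.udf routes to _function().
    def double_func(x: pa.Array) -> pa.Array:
        return pc.multiply(x, 2)

    double_udf = udf(double_func, [pa.int64()], pa.int64(), "immutable")

    # Style 2: decorator. No leading callable, so the dispatch routes to
    # _decorator(), which builds the UDF and preserves the wrapped function's
    # metadata via functools.wraps.
    @udf([pa.int64()], pa.int64(), "immutable")
    def triple_udf(x: pa.Array) -> pa.Array:
        return pc.multiply(x, 3)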
@@ -423,7 +445,7 @@ class WindowEvaluator(metaclass=ABCMeta):
     +------------------------+--------------------------------+------------------+---------------------------+
     | True                   | True/False                     | True/False       | ``evaluate``              |
     +------------------------+--------------------------------+------------------+---------------------------+
-    """  # noqa: W505
+    """  # noqa: W505, E501
 
     def memoize(self) -> None:
         """Perform a memoize operation to improve performance.
@@ -436,9 +458,8 @@ def memoize(self) -> None:
         `memoize` is called after each input batch is processed, and
         such functions can save whatever they need
         """
-        pass
 
-    def get_range(self, idx: int, num_rows: int) -> tuple[int, int]:
+    def get_range(self, idx: int, num_rows: int) -> tuple[int, int]:  # noqa: ARG002
         """Return the range for the window function.
 
         If the `uses_window_frame` flag is `false`, this method is used to
@@ -460,14 +481,17 @@ def is_causal(self) -> bool:
         """Get whether evaluator needs future data for its result."""
         return False
 
-    def evaluate_all(self, values: list[pyarrow.Array], num_rows: int) -> pyarrow.Array:
+    def evaluate_all(self, values: list[pa.Array], num_rows: int) -> pa.Array:
         """Evaluate a window function on an entire input partition.
 
         This function is called once per input *partition* for window
         functions that *do not use* values from the window frame, such as
-        :py:func:`~datafusion.functions.row_number`, :py:func:`~datafusion.functions.rank`,
-        :py:func:`~datafusion.functions.dense_rank`, :py:func:`~datafusion.functions.percent_rank`,
-        :py:func:`~datafusion.functions.cume_dist`, :py:func:`~datafusion.functions.lead`,
+        :py:func:`~datafusion.functions.row_number`,
+        :py:func:`~datafusion.functions.rank`,
+        :py:func:`~datafusion.functions.dense_rank`,
+        :py:func:`~datafusion.functions.percent_rank`,
+        :py:func:`~datafusion.functions.cume_dist`,
+        :py:func:`~datafusion.functions.lead`,
         and :py:func:`~datafusion.functions.lag`.
 
         It produces the result of all rows in a single pass. It
@@ -499,12 +523,11 @@ def evaluate_all(self, values: list[pyarrow.Array], num_rows: int) -> pyarrow.Ar
         .. code-block:: text
 
             avg(x) OVER (PARTITION BY y ORDER BY z ROWS BETWEEN 2 PRECEDING AND 3 FOLLOWING)
-        """  # noqa: W505
-        pass
+        """  # noqa: W505, E501
 
     def evaluate(
-        self, values: list[pyarrow.Array], eval_range: tuple[int, int]
-    ) -> pyarrow.Scalar:
+        self, values: list[pa.Array], eval_range: tuple[int, int]
+    ) -> pa.Scalar:
         """Evaluate window function on a range of rows in an input partition.
 
         This is the simplest and most general function to implement
@@ -519,11 +542,10 @@ def evaluate(
         and evaluation results of ORDER BY expressions. If function has
         a single argument, `values[1..]` will contain ORDER BY expression results.
         """
-        pass
 
     def evaluate_all_with_rank(
         self, num_rows: int, ranks_in_partition: list[tuple[int, int]]
-    ) -> pyarrow.Array:
+    ) -> pa.Array:
         """Called for window functions that only need the rank of a row.
 
         Evaluate the partition evaluator against the partition using
@@ -552,7 +574,6 @@ def evaluate_all_with_rank(
 
         The user must implement this method if ``include_rank`` returns True.
         """
-        pass
 
     def supports_bounded_execution(self) -> bool:
         """Can the window function be incrementally computed using bounded memory?"""
@@ -567,10 +588,6 @@ def include_rank(self) -> bool:
         return False
 
 
-if TYPE_CHECKING:
-    _W = TypeVar("_W", bound=WindowEvaluator)
-
-
 class WindowUDF:
     """Class for performing window user-defined functions (UDF).
@@ -582,8 +599,8 @@ def __init__(
         self,
         name: str,
         func: Callable[[], WindowEvaluator],
-        input_types: list[pyarrow.DataType],
-        return_type: pyarrow.DataType,
+        input_types: list[pa.DataType],
+        return_type: pa.DataType,
         volatility: Volatility | str,
     ) -> None:
         """Instantiate a user-defined window function (UDWF).
@@ -607,8 +624,8 @@ def __call__(self, *args: Expr) -> Expr:
     @staticmethod
     def udwf(
         func: Callable[[], WindowEvaluator],
-        input_types: pyarrow.DataType | list[pyarrow.DataType],
-        return_type: pyarrow.DataType,
+        input_types: pa.DataType | list[pa.DataType],
+        return_type: pa.DataType,
         volatility: Volatility | str,
         name: Optional[str] = None,
     ) -> WindowUDF:
@@ -648,16 +665,16 @@ def bias_10() -> BiasedNumbers:
 
         Returns:
             A user-defined window function.
-        """  # noqa W505
+        """  # noqa: W505, E501
         if not callable(func):
-            raise TypeError("`func` must be callable.")
-        if not isinstance(func.__call__(), WindowEvaluator):
-            raise TypeError(
-                "`func` must implement the abstract base class WindowEvaluator"
-            )
+            msg = "`func` must be callable."
+            raise TypeError(msg)
+        if not isinstance(func(), WindowEvaluator):
+            msg = "`func` must implement the abstract base class WindowEvaluator"
+            raise TypeError(msg)
         if name is None:
-            name = func.__call__().__class__.__qualname__.lower()
-        if isinstance(input_types, pyarrow.DataType):
+            name = func().__class__.__qualname__.lower()
+        if isinstance(input_types, pa.DataType):
             input_types = [input_types]
         return WindowUDF(
             name=name,
@@ -666,3 +683,10 @@ def bias_10() -> BiasedNumbers:
             return_type=return_type,
             volatility=volatility,
         )
+
+
+# Convenience exports so these can be imported directly instead of
+# being treated as variables at the package root
+udf = ScalarUDF.udf
+udaf = AggregateUDF.udaf
+udwf = WindowUDF.udwf
diff --git a/python/tests/generic.py b/python/tests/generic.py
index 0177e2df0..1b98fdf9e 100644
--- a/python/tests/generic.py
+++ b/python/tests/generic.py
@@ -16,6 +16,7 @@
 # under the License.
import datetime
+from datetime import timezone
 
 import numpy as np
 import pyarrow as pa
@@ -26,29 +27,29 @@
 
 
 def data():
-    np.random.seed(1)
+    rng = np.random.default_rng(1)
    data = np.concatenate(
        [
-            np.random.normal(0, 0.01, size=50),
-            np.random.normal(50, 0.01, size=50),
+            rng.normal(0, 0.01, size=50),
+            rng.normal(50, 0.01, size=50),
        ]
    )
    return pa.array(data)
 
 
 def data_with_nans():
-    np.random.seed(0)
-    data = np.random.normal(0, 0.01, size=50)
-    mask = np.random.randint(0, 2, size=50)
+    rng = np.random.default_rng(0)
+    data = rng.normal(0, 0.01, size=50)
+    mask = rng.integers(0, 2, size=50)
    data[mask == 0] = np.nan
    return data
 
 
 def data_datetime(f):
    data = [
-        datetime.datetime.now(),
-        datetime.datetime.now() - datetime.timedelta(days=1),
-        datetime.datetime.now() + datetime.timedelta(days=1),
+        datetime.datetime.now(tz=timezone.utc),
+        datetime.datetime.now(tz=timezone.utc) - datetime.timedelta(days=1),
+        datetime.datetime.now(tz=timezone.utc) + datetime.timedelta(days=1),
    ]
    return pa.array(data, type=pa.timestamp(f), mask=np.array([False, True, False]))
 
diff --git a/python/tests/test_aggregation.py b/python/tests/test_aggregation.py
index 5ef46131b..61b1c7d80 100644
--- a/python/tests/test_aggregation.py
+++ b/python/tests/test_aggregation.py
@@ -66,7 +66,7 @@ def df_aggregate_100():
 
 
 @pytest.mark.parametrize(
-    "agg_expr, calc_expected",
+    ("agg_expr", "calc_expected"),
    [
        (f.avg(column("a")), lambda a, b, c, d: np.array(np.average(a))),
        (
@@ -114,7 +114,7 @@ def test_aggregation_stats(df, agg_expr, calc_expected):
 
 
 @pytest.mark.parametrize(
-    "agg_expr, expected, array_sort",
+    ("agg_expr", "expected", "array_sort"),
    [
        (f.approx_distinct(column("b")), pa.array([2], type=pa.uint64()), False),
        (
@@ -182,12 +182,11 @@ def test_aggregation(df, agg_expr, expected, array_sort):
 
    agg_df.show()
    result = agg_df.collect()[0]
-    print(result)
 
    assert result.column(0) == expected
 
 
 @pytest.mark.parametrize(
-    "name,expr,expected",
+    ("name", "expr", "expected"),
    [
        (
            "approx_percentile_cont",
@@ -299,7 +298,9 @@ def test_aggregate_100(df_aggregate_100, name, expr, expected):
 ]
 
 
-@pytest.mark.parametrize("name,expr,result", data_test_bitwise_and_boolean_functions)
+@pytest.mark.parametrize(
+    ("name", "expr", "result"), data_test_bitwise_and_boolean_functions
+)
 def test_bit_and_bool_fns(df, name, expr, result):
    df = df.aggregate([], [expr.alias(name)])
 
@@ -311,7 +312,7 @@ def test_bit_and_bool_fns(df, name, expr, result):
 
 
 @pytest.mark.parametrize(
-    "name,expr,result",
+    ("name", "expr", "result"),
    [
        ("first_value", f.first_value(column("a")), [0, 4]),
        (
@@ -361,7 +362,6 @@ def test_bit_and_bool_fns(df, name, expr, result):
            ),
            [8, 9],
        ),
-        ("first_value", f.first_value(column("a")), [0, 4]),
        (
            "nth_value_ordered",
            f.nth_value(column("a"), 2, order_by=[column("a").sort(ascending=False)]),
@@ -401,7 +401,7 @@ def test_first_last_value(df_partitioned, name, expr, result) -> None:
 
 
 @pytest.mark.parametrize(
-    "name,expr,result",
+    ("name", "expr", "result"),
    [
        ("string_agg", f.string_agg(column("a"), ","), "one,two,three,two"),
        ("string_agg", f.string_agg(column("b"), ""), "03124"),
diff --git a/python/tests/test_catalog.py b/python/tests/test_catalog.py
index 214f6b165..23b328458 100644
--- a/python/tests/test_catalog.py
+++ b/python/tests/test_catalog.py
@@ -19,6 +19,9 @@
 import pytest
 
 
+# Note we accept `database` as a parameter even though we don't use
+# it: requesting the fixture sets up the context with the tables we
+# need.
def test_basic(ctx, database): with pytest.raises(KeyError): ctx.catalog("non-existent") @@ -26,10 +29,10 @@ def test_basic(ctx, database): default = ctx.catalog() assert default.names() == ["public"] - for database in [default.database("public"), default.database()]: - assert database.names() == {"csv1", "csv", "csv2"} + for db in [default.database("public"), default.database()]: + assert db.names() == {"csv1", "csv", "csv2"} - table = database.table("csv") + table = db.table("csv") assert table.kind == "physical" assert table.schema == pa.schema( [ diff --git a/python/tests/test_context.py b/python/tests/test_context.py index 91046e6b8..7a0a7aa08 100644 --- a/python/tests/test_context.py +++ b/python/tests/test_context.py @@ -16,7 +16,6 @@ # under the License. import datetime as dt import gzip -import os import pathlib import pyarrow as pa @@ -45,7 +44,7 @@ def test_create_context_runtime_config_only(): SessionContext(runtime=RuntimeEnvBuilder()) -@pytest.mark.parametrize("path_to_str", (True, False)) +@pytest.mark.parametrize("path_to_str", [True, False]) def test_runtime_configs(tmp_path, path_to_str): path1 = tmp_path / "dir1" path2 = tmp_path / "dir2" @@ -62,7 +61,7 @@ def test_runtime_configs(tmp_path, path_to_str): assert db is not None -@pytest.mark.parametrize("path_to_str", (True, False)) +@pytest.mark.parametrize("path_to_str", [True, False]) def test_temporary_files(tmp_path, path_to_str): path = str(tmp_path) if path_to_str else tmp_path @@ -79,14 +78,14 @@ def test_create_context_with_all_valid_args(): runtime = RuntimeEnvBuilder().with_disk_manager_os().with_fair_spill_pool(10000000) config = ( SessionConfig() - .with_create_default_catalog_and_schema(True) + .with_create_default_catalog_and_schema(enabled=True) .with_default_catalog_and_schema("foo", "bar") .with_target_partitions(1) - .with_information_schema(True) - .with_repartition_joins(False) - .with_repartition_aggregations(False) - .with_repartition_windows(False) - .with_parquet_pruning(False) + .with_information_schema(enabled=True) + .with_repartition_joins(enabled=False) + .with_repartition_aggregations(enabled=False) + .with_repartition_windows(enabled=False) + .with_parquet_pruning(enabled=False) ) ctx = SessionContext(config, runtime) @@ -167,7 +166,7 @@ def test_from_arrow_table(ctx): def record_batch_generator(num_batches: int): schema = pa.schema([("a", pa.int64()), ("b", pa.int64())]) - for i in range(num_batches): + for _i in range(num_batches): yield pa.RecordBatch.from_arrays( [pa.array([1, 2, 3]), pa.array([4, 5, 6])], schema=schema ) @@ -492,10 +491,10 @@ def test_table_not_found(ctx): def test_read_json(ctx): - path = os.path.dirname(os.path.abspath(__file__)) + path = pathlib.Path(__file__).parent.resolve() # Default - test_data_path = os.path.join(path, "data_test_context", "data.json") + test_data_path = path / "data_test_context" / "data.json" df = ctx.read_json(test_data_path) result = df.collect() @@ -515,7 +514,7 @@ def test_read_json(ctx): assert result[0].schema == schema # File extension - test_data_path = os.path.join(path, "data_test_context", "data.json") + test_data_path = path / "data_test_context" / "data.json" df = ctx.read_json(test_data_path, file_extension=".json") result = df.collect() @@ -524,15 +523,17 @@ def test_read_json(ctx): def test_read_json_compressed(ctx, tmp_path): - path = os.path.dirname(os.path.abspath(__file__)) - test_data_path = os.path.join(path, "data_test_context", "data.json") + path = pathlib.Path(__file__).parent.resolve() + test_data_path = path / 
"data_test_context" / "data.json" # File compression type gzip_path = tmp_path / "data.json.gz" - with open(test_data_path, "rb") as csv_file: - with gzip.open(gzip_path, "wb") as gzipped_file: - gzipped_file.writelines(csv_file) + with ( + pathlib.Path.open(test_data_path, "rb") as csv_file, + gzip.open(gzip_path, "wb") as gzipped_file, + ): + gzipped_file.writelines(csv_file) df = ctx.read_json(gzip_path, file_extension=".gz", file_compression_type="gz") result = df.collect() @@ -563,14 +564,16 @@ def test_read_csv_list(ctx): def test_read_csv_compressed(ctx, tmp_path): - test_data_path = "testing/data/csv/aggregate_test_100.csv" + test_data_path = pathlib.Path("testing/data/csv/aggregate_test_100.csv") # File compression type gzip_path = tmp_path / "aggregate_test_100.csv.gz" - with open(test_data_path, "rb") as csv_file: - with gzip.open(gzip_path, "wb") as gzipped_file: - gzipped_file.writelines(csv_file) + with ( + pathlib.Path.open(test_data_path, "rb") as csv_file, + gzip.open(gzip_path, "wb") as gzipped_file, + ): + gzipped_file.writelines(csv_file) csv_df = ctx.read_csv(gzip_path, file_extension=".gz", file_compression_type="gz") csv_df.select(column("c1")).show() @@ -603,7 +606,7 @@ def test_create_sql_options(): def test_sql_with_options_no_ddl(ctx): sql = "CREATE TABLE IF NOT EXISTS valuetable AS VALUES(1,'HELLO'),(12,'DATAFUSION')" ctx.sql(sql) - options = SQLOptions().with_allow_ddl(False) + options = SQLOptions().with_allow_ddl(allow=False) with pytest.raises(Exception, match="DDL"): ctx.sql_with_options(sql, options=options) @@ -618,7 +621,7 @@ def test_sql_with_options_no_dml(ctx): ctx.register_dataset(table_name, dataset) sql = f'INSERT INTO "{table_name}" VALUES (1, 2), (2, 3);' ctx.sql(sql) - options = SQLOptions().with_allow_dml(False) + options = SQLOptions().with_allow_dml(allow=False) with pytest.raises(Exception, match="DML"): ctx.sql_with_options(sql, options=options) @@ -626,6 +629,6 @@ def test_sql_with_options_no_dml(ctx): def test_sql_with_options_no_statements(ctx): sql = "SET time zone = 1;" ctx.sql(sql) - options = SQLOptions().with_allow_statements(False) + options = SQLOptions().with_allow_statements(allow=False) with pytest.raises(Exception, match="SetVariable"): ctx.sql_with_options(sql, options=options) diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index c636e896a..d084f12dd 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -339,7 +339,7 @@ def test_join(): # Verify we don't make a breaking change to pre-43.0.0 # where users would pass join_keys as a positional argument - df2 = df.join(df1, (["a"], ["a"]), how="inner") # type: ignore + df2 = df.join(df1, (["a"], ["a"]), how="inner") df2.show() df2 = df2.sort(column("l.a")) table = pa.Table.from_batches(df2.collect()) @@ -375,17 +375,17 @@ def test_join_invalid_params(): with pytest.raises( ValueError, match=r"`left_on` or `right_on` should not provided with `on`" ): - df2 = df.join(df1, on="a", how="inner", right_on="test") # type: ignore + df2 = df.join(df1, on="a", how="inner", right_on="test") with pytest.raises( ValueError, match=r"`left_on` and `right_on` should both be provided." ): - df2 = df.join(df1, left_on="a", how="inner") # type: ignore + df2 = df.join(df1, left_on="a", how="inner") with pytest.raises( ValueError, match=r"either `on` or `left_on` and `right_on` should be provided." 
): - df2 = df.join(df1, how="inner") # type: ignore + df2 = df.join(df1, how="inner") def test_join_on(): @@ -567,7 +567,7 @@ def test_distinct(): ] -@pytest.mark.parametrize("name,expr,result", data_test_window_functions) +@pytest.mark.parametrize(("name", "expr", "result"), data_test_window_functions) def test_window_functions(partitioned_df, name, expr, result): df = partitioned_df.select( column("a"), column("b"), column("c"), f.alias(expr, name) @@ -731,7 +731,7 @@ def test_execution_plan(aggregate_df): plan = aggregate_df.execution_plan() expected = ( - "AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1], aggr=[sum(test.c2)]\n" # noqa: E501 + "AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1], aggr=[sum(test.c2)]\n" ) assert expected == plan.display() @@ -756,7 +756,7 @@ def test_execution_plan(aggregate_df): ctx = SessionContext() rows_returned = 0 - for idx in range(0, plan.partition_count): + for idx in range(plan.partition_count): stream = ctx.execute(plan, idx) try: batch = stream.next() @@ -885,7 +885,7 @@ def test_union_distinct(ctx): ) df_c = ctx.create_dataframe([[batch]]).sort(column("a")) - df_a_u_b = df_a.union(df_b, True).sort(column("a")) + df_a_u_b = df_a.union(df_b, distinct=True).sort(column("a")) assert df_c.collect() == df_a_u_b.collect() assert df_c.collect() == df_a_u_b.collect() @@ -954,8 +954,6 @@ def test_to_arrow_table(df): def test_execute_stream(df): stream = df.execute_stream() - for s in stream: - print(type(s)) assert all(batch is not None for batch in stream) assert not list(stream) # after one iteration the generator must be exhausted @@ -969,7 +967,7 @@ def test_execute_stream_to_arrow_table(df, schema): (batch.to_pyarrow() for batch in stream), schema=df.schema() ) else: - pyarrow_table = pa.Table.from_batches((batch.to_pyarrow() for batch in stream)) + pyarrow_table = pa.Table.from_batches(batch.to_pyarrow() for batch in stream) assert isinstance(pyarrow_table, pa.Table) assert pyarrow_table.shape == (3, 3) @@ -1033,7 +1031,7 @@ def test_describe(df): } -@pytest.mark.parametrize("path_to_str", (True, False)) +@pytest.mark.parametrize("path_to_str", [True, False]) def test_write_csv(ctx, df, tmp_path, path_to_str): path = str(tmp_path) if path_to_str else tmp_path @@ -1046,7 +1044,7 @@ def test_write_csv(ctx, df, tmp_path, path_to_str): assert result == expected -@pytest.mark.parametrize("path_to_str", (True, False)) +@pytest.mark.parametrize("path_to_str", [True, False]) def test_write_json(ctx, df, tmp_path, path_to_str): path = str(tmp_path) if path_to_str else tmp_path @@ -1059,7 +1057,7 @@ def test_write_json(ctx, df, tmp_path, path_to_str): assert result == expected -@pytest.mark.parametrize("path_to_str", (True, False)) +@pytest.mark.parametrize("path_to_str", [True, False]) def test_write_parquet(df, tmp_path, path_to_str): path = str(tmp_path) if path_to_str else tmp_path @@ -1071,7 +1069,7 @@ def test_write_parquet(df, tmp_path, path_to_str): @pytest.mark.parametrize( - "compression, compression_level", + ("compression", "compression_level"), [("gzip", 6), ("brotli", 7), ("zstd", 15)], ) def test_write_compressed_parquet(df, tmp_path, compression, compression_level): @@ -1082,7 +1080,7 @@ def test_write_compressed_parquet(df, tmp_path, compression, compression_level): ) # test that the actual compression scheme is the one written - for root, dirs, files in os.walk(path): + for _root, _dirs, files in os.walk(path): for file in files: if file.endswith(".parquet"): metadata = pq.ParquetFile(tmp_path / file).metadata.to_dict() @@ 
-1097,7 +1095,7 @@ def test_write_compressed_parquet(df, tmp_path, compression, compression_level): @pytest.mark.parametrize( - "compression, compression_level", + ("compression", "compression_level"), [("gzip", 12), ("brotli", 15), ("zstd", 23), ("wrong", 12)], ) def test_write_compressed_parquet_wrong_compression_level( @@ -1152,7 +1150,7 @@ def test_dataframe_export(df) -> None: table = pa.table(df, schema=desired_schema) assert table.num_columns == 1 assert table.num_rows == 3 - for i in range(0, 3): + for i in range(3): assert table[0][i].as_py() is None # Expect an error when we cannot convert schema @@ -1186,8 +1184,8 @@ def add_with_parameter(df_internal, value: Any) -> DataFrame: result = df.to_pydict() assert result["a"] == [1, 2, 3] - assert result["string_col"] == ["string data" for _i in range(0, 3)] - assert result["new_col"] == [3 for _i in range(0, 3)] + assert result["string_col"] == ["string data" for _i in range(3)] + assert result["new_col"] == [3 for _i in range(3)] def test_dataframe_repr_html(df) -> None: diff --git a/python/tests/test_expr.py b/python/tests/test_expr.py index 354c7e180..926e69845 100644 --- a/python/tests/test_expr.py +++ b/python/tests/test_expr.py @@ -85,18 +85,14 @@ def test_limit(test_ctx): plan = plan.to_variant() assert isinstance(plan, Limit) - # TODO: Upstream now has expressions for skip and fetch - # REF: https://github.com/apache/datafusion/pull/12836 - # assert plan.skip() == 0 + assert "Skip: None" in str(plan) df = test_ctx.sql("select c1 from test LIMIT 10 OFFSET 5") plan = df.logical_plan() plan = plan.to_variant() assert isinstance(plan, Limit) - # TODO: Upstream now has expressions for skip and fetch - # REF: https://github.com/apache/datafusion/pull/12836 - # assert plan.skip() == 5 + assert "Skip: Some(Literal(Int64(5)))" in str(plan) def test_aggregate_query(test_ctx): @@ -165,6 +161,7 @@ def traverse_logical_plan(plan): res = traverse_logical_plan(input_plan) if res is not None: return res + return None ctx = SessionContext() data = {"id": [1, 2, 3], "name": ["Alice", "Bob", "Charlie"]} @@ -176,7 +173,7 @@ def traverse_logical_plan(plan): assert variant.expr().to_variant().qualified_name() == "table1.name" assert ( str(variant.list()) - == '[Expr(Utf8("dfa")), Expr(Utf8("ad")), Expr(Utf8("dfre")), Expr(Utf8("vsa"))]' + == '[Expr(Utf8("dfa")), Expr(Utf8("ad")), Expr(Utf8("dfre")), Expr(Utf8("vsa"))]' # noqa: E501 ) assert not variant.negated() diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index fca05bb8f..ed88a16e3 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
import math -from datetime import datetime +from datetime import datetime, timezone import numpy as np import pyarrow as pa @@ -25,6 +25,8 @@ np.seterr(invalid="ignore") +DEFAULT_TZ = timezone.utc + @pytest.fixture def df(): @@ -37,9 +39,9 @@ def df(): pa.array(["hello ", " world ", " !"], type=pa.string_view()), pa.array( [ - datetime(2022, 12, 31), - datetime(2027, 6, 26), - datetime(2020, 7, 2), + datetime(2022, 12, 31, tzinfo=DEFAULT_TZ), + datetime(2027, 6, 26, tzinfo=DEFAULT_TZ), + datetime(2020, 7, 2, tzinfo=DEFAULT_TZ), ] ), pa.array([False, True, True]), @@ -221,12 +223,12 @@ def py_indexof(arr, v): def py_arr_remove(arr, v, n=None): new_arr = arr[:] found = 0 - while found != n: - try: + try: + while found != n: new_arr.remove(v) found += 1 - except ValueError: - break + except ValueError: + pass return new_arr @@ -234,13 +236,13 @@ def py_arr_remove(arr, v, n=None): def py_arr_replace(arr, from_, to, n=None): new_arr = arr[:] found = 0 - while found != n: - try: + try: + while found != n: idx = new_arr.index(from_) new_arr[idx] = to found += 1 - except ValueError: - break + except ValueError: + pass return new_arr @@ -268,266 +270,266 @@ def py_flatten(arr): @pytest.mark.parametrize( ("stmt", "py_expr"), [ - [ + ( lambda col: f.array_append(col, literal(99.0)), lambda data: [np.append(arr, 99.0) for arr in data], - ], - [ + ), + ( lambda col: f.array_push_back(col, literal(99.0)), lambda data: [np.append(arr, 99.0) for arr in data], - ], - [ + ), + ( lambda col: f.list_append(col, literal(99.0)), lambda data: [np.append(arr, 99.0) for arr in data], - ], - [ + ), + ( lambda col: f.list_push_back(col, literal(99.0)), lambda data: [np.append(arr, 99.0) for arr in data], - ], - [ + ), + ( lambda col: f.array_concat(col, col), lambda data: [np.concatenate([arr, arr]) for arr in data], - ], - [ + ), + ( lambda col: f.array_cat(col, col), lambda data: [np.concatenate([arr, arr]) for arr in data], - ], - [ + ), + ( lambda col: f.list_cat(col, col), lambda data: [np.concatenate([arr, arr]) for arr in data], - ], - [ + ), + ( lambda col: f.list_concat(col, col), lambda data: [np.concatenate([arr, arr]) for arr in data], - ], - [ + ), + ( lambda col: f.array_dims(col), lambda data: [[len(r)] for r in data], - ], - [ + ), + ( lambda col: f.array_distinct(col), lambda data: [list(set(r)) for r in data], - ], - [ + ), + ( lambda col: f.list_distinct(col), lambda data: [list(set(r)) for r in data], - ], - [ + ), + ( lambda col: f.list_dims(col), lambda data: [[len(r)] for r in data], - ], - [ + ), + ( lambda col: f.array_element(col, literal(1)), lambda data: [r[0] for r in data], - ], - [ + ), + ( lambda col: f.array_empty(col), lambda data: [len(r) == 0 for r in data], - ], - [ + ), + ( lambda col: f.empty(col), lambda data: [len(r) == 0 for r in data], - ], - [ + ), + ( lambda col: f.array_extract(col, literal(1)), lambda data: [r[0] for r in data], - ], - [ + ), + ( lambda col: f.list_element(col, literal(1)), lambda data: [r[0] for r in data], - ], - [ + ), + ( lambda col: f.list_extract(col, literal(1)), lambda data: [r[0] for r in data], - ], - [ + ), + ( lambda col: f.array_length(col), lambda data: [len(r) for r in data], - ], - [ + ), + ( lambda col: f.list_length(col), lambda data: [len(r) for r in data], - ], - [ + ), + ( lambda col: f.array_has(col, literal(1.0)), lambda data: [1.0 in r for r in data], - ], - [ + ), + ( lambda col: f.array_has_all( col, f.make_array(*[literal(v) for v in [1.0, 3.0, 5.0]]) ), lambda data: [np.all([v in r for v in [1.0, 3.0, 5.0]]) for r in data], - 
], - [ + ), + ( lambda col: f.array_has_any( col, f.make_array(*[literal(v) for v in [1.0, 3.0, 5.0]]) ), lambda data: [np.any([v in r for v in [1.0, 3.0, 5.0]]) for r in data], - ], - [ + ), + ( lambda col: f.array_position(col, literal(1.0)), lambda data: [py_indexof(r, 1.0) for r in data], - ], - [ + ), + ( lambda col: f.array_indexof(col, literal(1.0)), lambda data: [py_indexof(r, 1.0) for r in data], - ], - [ + ), + ( lambda col: f.list_position(col, literal(1.0)), lambda data: [py_indexof(r, 1.0) for r in data], - ], - [ + ), + ( lambda col: f.list_indexof(col, literal(1.0)), lambda data: [py_indexof(r, 1.0) for r in data], - ], - [ + ), + ( lambda col: f.array_positions(col, literal(1.0)), lambda data: [[i + 1 for i, _v in enumerate(r) if _v == 1.0] for r in data], - ], - [ + ), + ( lambda col: f.list_positions(col, literal(1.0)), lambda data: [[i + 1 for i, _v in enumerate(r) if _v == 1.0] for r in data], - ], - [ + ), + ( lambda col: f.array_ndims(col), lambda data: [np.array(r).ndim for r in data], - ], - [ + ), + ( lambda col: f.list_ndims(col), lambda data: [np.array(r).ndim for r in data], - ], - [ + ), + ( lambda col: f.array_prepend(literal(99.0), col), lambda data: [np.insert(arr, 0, 99.0) for arr in data], - ], - [ + ), + ( lambda col: f.array_push_front(literal(99.0), col), lambda data: [np.insert(arr, 0, 99.0) for arr in data], - ], - [ + ), + ( lambda col: f.list_prepend(literal(99.0), col), lambda data: [np.insert(arr, 0, 99.0) for arr in data], - ], - [ + ), + ( lambda col: f.list_push_front(literal(99.0), col), lambda data: [np.insert(arr, 0, 99.0) for arr in data], - ], - [ + ), + ( lambda col: f.array_pop_back(col), lambda data: [arr[:-1] for arr in data], - ], - [ + ), + ( lambda col: f.array_pop_front(col), lambda data: [arr[1:] for arr in data], - ], - [ + ), + ( lambda col: f.array_remove(col, literal(3.0)), lambda data: [py_arr_remove(arr, 3.0, 1) for arr in data], - ], - [ + ), + ( lambda col: f.list_remove(col, literal(3.0)), lambda data: [py_arr_remove(arr, 3.0, 1) for arr in data], - ], - [ + ), + ( lambda col: f.array_remove_n(col, literal(3.0), literal(2)), lambda data: [py_arr_remove(arr, 3.0, 2) for arr in data], - ], - [ + ), + ( lambda col: f.list_remove_n(col, literal(3.0), literal(2)), lambda data: [py_arr_remove(arr, 3.0, 2) for arr in data], - ], - [ + ), + ( lambda col: f.array_remove_all(col, literal(3.0)), lambda data: [py_arr_remove(arr, 3.0) for arr in data], - ], - [ + ), + ( lambda col: f.list_remove_all(col, literal(3.0)), lambda data: [py_arr_remove(arr, 3.0) for arr in data], - ], - [ + ), + ( lambda col: f.array_repeat(col, literal(2)), lambda data: [[arr] * 2 for arr in data], - ], - [ + ), + ( lambda col: f.list_repeat(col, literal(2)), lambda data: [[arr] * 2 for arr in data], - ], - [ + ), + ( lambda col: f.array_replace(col, literal(3.0), literal(4.0)), lambda data: [py_arr_replace(arr, 3.0, 4.0, 1) for arr in data], - ], - [ + ), + ( lambda col: f.list_replace(col, literal(3.0), literal(4.0)), lambda data: [py_arr_replace(arr, 3.0, 4.0, 1) for arr in data], - ], - [ + ), + ( lambda col: f.array_replace_n(col, literal(3.0), literal(4.0), literal(1)), lambda data: [py_arr_replace(arr, 3.0, 4.0, 1) for arr in data], - ], - [ + ), + ( lambda col: f.list_replace_n(col, literal(3.0), literal(4.0), literal(2)), lambda data: [py_arr_replace(arr, 3.0, 4.0, 2) for arr in data], - ], - [ + ), + ( lambda col: f.array_replace_all(col, literal(3.0), literal(4.0)), lambda data: [py_arr_replace(arr, 3.0, 4.0) for arr in data], - ], - [ + ), + ( 
lambda col: f.list_replace_all(col, literal(3.0), literal(4.0)), lambda data: [py_arr_replace(arr, 3.0, 4.0) for arr in data], - ], - [ + ), + ( lambda col: f.array_sort(col, descending=True, null_first=True), lambda data: [np.sort(arr)[::-1] for arr in data], - ], - [ + ), + ( lambda col: f.list_sort(col, descending=False, null_first=False), lambda data: [np.sort(arr) for arr in data], - ], - [ + ), + ( lambda col: f.array_slice(col, literal(2), literal(4)), lambda data: [arr[1:4] for arr in data], - ], + ), pytest.param( lambda col: f.list_slice(col, literal(-1), literal(2)), lambda data: [arr[-1:2] for arr in data], ), - [ + ( lambda col: f.array_intersect(col, literal([3.0, 4.0])), lambda data: [np.intersect1d(arr, [3.0, 4.0]) for arr in data], - ], - [ + ), + ( lambda col: f.list_intersect(col, literal([3.0, 4.0])), lambda data: [np.intersect1d(arr, [3.0, 4.0]) for arr in data], - ], - [ + ), + ( lambda col: f.array_union(col, literal([12.0, 999.0])), lambda data: [np.union1d(arr, [12.0, 999.0]) for arr in data], - ], - [ + ), + ( lambda col: f.list_union(col, literal([12.0, 999.0])), lambda data: [np.union1d(arr, [12.0, 999.0]) for arr in data], - ], - [ + ), + ( lambda col: f.array_except(col, literal([3.0])), lambda data: [np.setdiff1d(arr, [3.0]) for arr in data], - ], - [ + ), + ( lambda col: f.list_except(col, literal([3.0])), lambda data: [np.setdiff1d(arr, [3.0]) for arr in data], - ], - [ + ), + ( lambda col: f.array_resize(col, literal(10), literal(0.0)), lambda data: [py_arr_resize(arr, 10, 0.0) for arr in data], - ], - [ + ), + ( lambda col: f.list_resize(col, literal(10), literal(0.0)), lambda data: [py_arr_resize(arr, 10, 0.0) for arr in data], - ], - [ + ), + ( lambda col: f.range(literal(1), literal(5), literal(2)), lambda data: [np.arange(1, 5, 2)], - ], + ), ], ) def test_array_functions(stmt, py_expr): @@ -611,22 +613,22 @@ def test_make_array_functions(make_func): @pytest.mark.parametrize( ("stmt", "py_expr"), [ - [ + ( f.array_to_string(column("arr"), literal(",")), lambda data: [",".join([str(int(v)) for v in r]) for r in data], - ], - [ + ), + ( f.array_join(column("arr"), literal(",")), lambda data: [",".join([str(int(v)) for v in r]) for r in data], - ], - [ + ), + ( f.list_to_string(column("arr"), literal(",")), lambda data: [",".join([str(int(v)) for v in r]) for r in data], - ], - [ + ), + ( f.list_join(column("arr"), literal(",")), lambda data: [",".join([str(int(v)) for v in r]) for r in data], - ], + ), ], ) def test_array_function_obj_tests(stmt, py_expr): @@ -640,7 +642,7 @@ def test_array_function_obj_tests(stmt, py_expr): @pytest.mark.parametrize( - "function, expected_result", + ("function", "expected_result"), [ ( f.ascii(column("a")), @@ -894,54 +896,72 @@ def test_temporal_functions(df): assert result.column(0) == pa.array([12, 6, 7], type=pa.int32()) assert result.column(1) == pa.array([2022, 2027, 2020], type=pa.int32()) assert result.column(2) == pa.array( - [datetime(2022, 12, 1), datetime(2027, 6, 1), datetime(2020, 7, 1)], - type=pa.timestamp("us"), + [ + datetime(2022, 12, 1, tzinfo=DEFAULT_TZ), + datetime(2027, 6, 1, tzinfo=DEFAULT_TZ), + datetime(2020, 7, 1, tzinfo=DEFAULT_TZ), + ], + type=pa.timestamp("ns", tz=DEFAULT_TZ), ) assert result.column(3) == pa.array( - [datetime(2022, 12, 31), datetime(2027, 6, 26), datetime(2020, 7, 2)], - type=pa.timestamp("us"), + [ + datetime(2022, 12, 31, tzinfo=DEFAULT_TZ), + datetime(2027, 6, 26, tzinfo=DEFAULT_TZ), + datetime(2020, 7, 2, tzinfo=DEFAULT_TZ), + ], + type=pa.timestamp("ns", 
tz=DEFAULT_TZ), ) assert result.column(4) == pa.array( [ - datetime(2022, 12, 30, 23, 47, 30), - datetime(2027, 6, 25, 23, 47, 30), - datetime(2020, 7, 1, 23, 47, 30), + datetime(2022, 12, 30, 23, 47, 30, tzinfo=DEFAULT_TZ), + datetime(2027, 6, 25, 23, 47, 30, tzinfo=DEFAULT_TZ), + datetime(2020, 7, 1, 23, 47, 30, tzinfo=DEFAULT_TZ), ], - type=pa.timestamp("ns"), + type=pa.timestamp("ns", tz=DEFAULT_TZ), ) assert result.column(5) == pa.array( - [datetime(2023, 1, 10, 20, 52, 54)] * 3, type=pa.timestamp("s") + [datetime(2023, 1, 10, 20, 52, 54, tzinfo=DEFAULT_TZ)] * 3, + type=pa.timestamp("s"), ) assert result.column(6) == pa.array( - [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("ns") + [datetime(2023, 9, 7, 5, 6, 14, 523952, tzinfo=DEFAULT_TZ)] * 3, + type=pa.timestamp("ns"), ) assert result.column(7) == pa.array( - [datetime(2023, 9, 7, 5, 6, 14)] * 3, type=pa.timestamp("s") + [datetime(2023, 9, 7, 5, 6, 14, tzinfo=DEFAULT_TZ)] * 3, type=pa.timestamp("s") ) assert result.column(8) == pa.array( - [datetime(2023, 9, 7, 5, 6, 14, 523000)] * 3, type=pa.timestamp("ms") + [datetime(2023, 9, 7, 5, 6, 14, 523000, tzinfo=DEFAULT_TZ)] * 3, + type=pa.timestamp("ms"), ) assert result.column(9) == pa.array( - [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("us") + [datetime(2023, 9, 7, 5, 6, 14, 523952, tzinfo=DEFAULT_TZ)] * 3, + type=pa.timestamp("us"), ) assert result.column(10) == pa.array([31, 26, 2], type=pa.int32()) assert result.column(11) == pa.array( - [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("ns") + [datetime(2023, 9, 7, 5, 6, 14, 523952, tzinfo=DEFAULT_TZ)] * 3, + type=pa.timestamp("ns"), ) assert result.column(12) == pa.array( - [datetime(2023, 9, 7, 5, 6, 14)] * 3, type=pa.timestamp("s") + [datetime(2023, 9, 7, 5, 6, 14, tzinfo=DEFAULT_TZ)] * 3, + type=pa.timestamp("s"), ) assert result.column(13) == pa.array( - [datetime(2023, 9, 7, 5, 6, 14, 523000)] * 3, type=pa.timestamp("ms") + [datetime(2023, 9, 7, 5, 6, 14, 523000, tzinfo=DEFAULT_TZ)] * 3, + type=pa.timestamp("ms"), ) assert result.column(14) == pa.array( - [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("us") + [datetime(2023, 9, 7, 5, 6, 14, 523952, tzinfo=DEFAULT_TZ)] * 3, + type=pa.timestamp("us"), ) assert result.column(15) == pa.array( - [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("ns") + [datetime(2023, 9, 7, 5, 6, 14, 523952, tzinfo=DEFAULT_TZ)] * 3, + type=pa.timestamp("ns"), ) assert result.column(16) == pa.array( - [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("ns") + [datetime(2023, 9, 7, 5, 6, 14, 523952, tzinfo=DEFAULT_TZ)] * 3, + type=pa.timestamp("ns"), ) @@ -1057,7 +1077,7 @@ def test_regr_funcs_sql_2(): @pytest.mark.parametrize( - "func, expected", + ("func", "expected"), [ pytest.param(f.regr_slope(column("c2"), column("c1")), [4.6], id="regr_slope"), pytest.param( @@ -1160,7 +1180,7 @@ def test_binary_string_functions(df): @pytest.mark.parametrize( - "python_datatype, name, expected", + ("python_datatype", "name", "expected"), [ pytest.param(bool, "e", pa.bool_(), id="bool"), pytest.param(int, "b", pa.int64(), id="int"), @@ -1179,7 +1199,7 @@ def test_cast(df, python_datatype, name: str, expected): @pytest.mark.parametrize( - "negated, low, high, expected", + ("negated", "low", "high", "expected"), [ pytest.param(False, 3, 5, {"filtered": [4, 5]}), pytest.param(False, 4, 5, {"filtered": [4, 5]}), diff --git a/python/tests/test_imports.py b/python/tests/test_imports.py index 0c155cbde..9ef7ed89a 100644 
--- a/python/tests/test_imports.py +++ b/python/tests/test_imports.py @@ -169,14 +169,15 @@ def test_class_module_is_datafusion(): def test_import_from_functions_submodule(): - from datafusion.functions import abs, sin # noqa + from datafusion.functions import abs as df_abs + from datafusion.functions import sin - assert functions.abs is abs + assert functions.abs is df_abs assert functions.sin is sin msg = "cannot import name 'foobar' from 'datafusion.functions'" with pytest.raises(ImportError, match=msg): - from datafusion.functions import foobar # noqa + from datafusion.functions import foobar # noqa: F401 def test_classes_are_inheritable(): diff --git a/python/tests/test_input.py b/python/tests/test_input.py index 806471357..4663f6148 100644 --- a/python/tests/test_input.py +++ b/python/tests/test_input.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -import os +import pathlib from datafusion.input.location import LocationInputPlugin @@ -23,10 +23,10 @@ def test_location_input(): location_input = LocationInputPlugin() - cwd = os.getcwd() - input_file = cwd + "/testing/data/parquet/generated_simple_numerics/blogs.parquet" + cwd = pathlib.Path.cwd() + input_file = cwd / "testing/data/parquet/generated_simple_numerics/blogs.parquet" table_name = "blog" - tbl = location_input.build_table(input_file, table_name) - assert "blog" == tbl.name - assert 3 == len(tbl.columns) + tbl = location_input.build_table(str(input_file), table_name) + assert tbl.name == "blog" + assert len(tbl.columns) == 3 assert "blogs.parquet" in tbl.filepaths[0] diff --git a/python/tests/test_io.py b/python/tests/test_io.py index 21ad188ee..7ca509689 100644 --- a/python/tests/test_io.py +++ b/python/tests/test_io.py @@ -14,8 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -import os -import pathlib +from pathlib import Path import pyarrow as pa from datafusion import column @@ -23,10 +22,10 @@ def test_read_json_global_ctx(ctx): - path = os.path.dirname(os.path.abspath(__file__)) + path = Path(__file__).parent.resolve() # Default - test_data_path = os.path.join(path, "data_test_context", "data.json") + test_data_path = Path(path) / "data_test_context" / "data.json" df = read_json(test_data_path) result = df.collect() @@ -46,7 +45,7 @@ def test_read_json_global_ctx(ctx): assert result[0].schema == schema # File extension - test_data_path = os.path.join(path, "data_test_context", "data.json") + test_data_path = Path(path) / "data_test_context" / "data.json" df = read_json(test_data_path, file_extension=".json") result = df.collect() @@ -59,7 +58,7 @@ def test_read_parquet_global(): parquet_df.show() assert parquet_df is not None - path = pathlib.Path.cwd() / "parquet/data/alltypes_plain.parquet" + path = Path.cwd() / "parquet/data/alltypes_plain.parquet" parquet_df = read_parquet(path=path) assert parquet_df is not None @@ -90,6 +89,6 @@ def test_read_avro(): avro_df.show() assert avro_df is not None - path = pathlib.Path.cwd() / "testing/data/avro/alltypes_plain.avro" + path = Path.cwd() / "testing/data/avro/alltypes_plain.avro" avro_df = read_avro(path=path) assert avro_df is not None diff --git a/python/tests/test_sql.py b/python/tests/test_sql.py index 862f745bf..b6348e3a0 100644 --- a/python/tests/test_sql.py +++ b/python/tests/test_sql.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
import gzip -import os +from pathlib import Path import numpy as np import pyarrow as pa @@ -47,9 +47,8 @@ def test_register_csv(ctx, tmp_path): ) write_csv(table, path) - with open(path, "rb") as csv_file: - with gzip.open(gzip_path, "wb") as gzipped_file: - gzipped_file.writelines(csv_file) + with Path.open(path, "rb") as csv_file, gzip.open(gzip_path, "wb") as gzipped_file: + gzipped_file.writelines(csv_file) ctx.register_csv("csv", path) ctx.register_csv("csv1", str(path)) @@ -158,7 +157,7 @@ def test_register_parquet(ctx, tmp_path): assert result.to_pydict() == {"cnt": [100]} -@pytest.mark.parametrize("path_to_str", (True, False)) +@pytest.mark.parametrize("path_to_str", [True, False]) def test_register_parquet_partitioned(ctx, tmp_path, path_to_str): dir_root = tmp_path / "dataset_parquet_partitioned" dir_root.mkdir(exist_ok=False) @@ -194,7 +193,7 @@ def test_register_parquet_partitioned(ctx, tmp_path, path_to_str): assert dict(zip(rd["grp"], rd["cnt"])) == {"a": 3, "b": 1} -@pytest.mark.parametrize("path_to_str", (True, False)) +@pytest.mark.parametrize("path_to_str", [True, False]) def test_register_dataset(ctx, tmp_path, path_to_str): path = helpers.write_parquet(tmp_path / "a.parquet", helpers.data()) path = str(path) if path_to_str else path @@ -209,13 +208,15 @@ def test_register_dataset(ctx, tmp_path, path_to_str): def test_register_json(ctx, tmp_path): - path = os.path.dirname(os.path.abspath(__file__)) - test_data_path = os.path.join(path, "data_test_context", "data.json") + path = Path(__file__).parent.resolve() + test_data_path = Path(path) / "data_test_context" / "data.json" gzip_path = tmp_path / "data.json.gz" - with open(test_data_path, "rb") as json_file: - with gzip.open(gzip_path, "wb") as gzipped_file: - gzipped_file.writelines(json_file) + with ( + Path.open(test_data_path, "rb") as json_file, + gzip.open(gzip_path, "wb") as gzipped_file, + ): + gzipped_file.writelines(json_file) ctx.register_json("json", test_data_path) ctx.register_json("json1", str(test_data_path)) @@ -470,16 +471,18 @@ def test_simple_select(ctx, tmp_path, arr): # In DF 43.0.0 we now default to having BinaryView and StringView # so the array that is saved to the parquet is slightly different # than the array read. Convert to values for comparison. 
- if isinstance(result, pa.BinaryViewArray) or isinstance(result, pa.StringViewArray): + if isinstance(result, (pa.BinaryViewArray, pa.StringViewArray)): arr = arr.tolist() result = result.tolist() np.testing.assert_equal(result, arr) -@pytest.mark.parametrize("file_sort_order", (None, [[col("int").sort(True, True)]])) -@pytest.mark.parametrize("pass_schema", (True, False)) -@pytest.mark.parametrize("path_to_str", (True, False)) +@pytest.mark.parametrize( + "file_sort_order", [None, [[col("int").sort(ascending=True, nulls_first=True)]]] +) +@pytest.mark.parametrize("pass_schema", [True, False]) +@pytest.mark.parametrize("path_to_str", [True, False]) def test_register_listing_table( ctx, tmp_path, pass_schema, file_sort_order, path_to_str ): @@ -528,7 +531,7 @@ def test_register_listing_table( assert dict(zip(rd["grp"], rd["count"])) == {"a": 5, "b": 2} result = ctx.sql( - "SELECT grp, COUNT(*) AS count FROM my_table WHERE date_id=20201005 GROUP BY grp" + "SELECT grp, COUNT(*) AS count FROM my_table WHERE date_id=20201005 GROUP BY grp" # noqa: E501 ).collect() result = pa.Table.from_batches(result) diff --git a/python/tests/test_store.py b/python/tests/test_store.py index 53ffc3acf..ac9af98f3 100644 --- a/python/tests/test_store.py +++ b/python/tests/test_store.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -import os +from pathlib import Path import pytest from datafusion import SessionContext @@ -23,17 +23,16 @@ @pytest.fixture def ctx(): - ctx = SessionContext() - return ctx + return SessionContext() def test_read_parquet(ctx): ctx.register_parquet( "test", - f"file://{os.getcwd()}/parquet/data/alltypes_plain.parquet", - [], - True, - ".parquet", + f"file://{Path.cwd()}/parquet/data/alltypes_plain.parquet", + table_partition_cols=[], + parquet_pruning=True, + file_extension=".parquet", ) df = ctx.sql("SELECT * FROM test") assert isinstance(df.collect(), list) diff --git a/python/tests/test_substrait.py b/python/tests/test_substrait.py index feada7cde..f367a447d 100644 --- a/python/tests/test_substrait.py +++ b/python/tests/test_substrait.py @@ -50,7 +50,7 @@ def test_substrait_serialization(ctx): substrait_plan = ss.Producer.to_substrait_plan(df.logical_plan(), ctx) -@pytest.mark.parametrize("path_to_str", (True, False)) +@pytest.mark.parametrize("path_to_str", [True, False]) def test_substrait_file_serialization(ctx, tmp_path, path_to_str): batch = pa.RecordBatch.from_arrays( [pa.array([1, 2, 3]), pa.array([4, 5, 6])], diff --git a/python/tests/test_udaf.py b/python/tests/test_udaf.py index 97cf81f3c..453ff6f4f 100644 --- a/python/tests/test_udaf.py +++ b/python/tests/test_udaf.py @@ -17,8 +17,6 @@ from __future__ import annotations -from typing import List - import pyarrow as pa import pyarrow.compute as pc import pytest @@ -31,7 +29,7 @@ class Summarize(Accumulator): def __init__(self, initial_value: float = 0.0): self._sum = pa.scalar(initial_value) - def state(self) -> List[pa.Scalar]: + def state(self) -> list[pa.Scalar]: return [self._sum] def update(self, values: pa.Array) -> None: @@ -39,7 +37,7 @@ def update(self, values: pa.Array) -> None: # This breaks on `None` self._sum = pa.scalar(self._sum.as_py() + pc.sum(values).as_py()) - def merge(self, states: List[pa.Array]) -> None: + def merge(self, states: list[pa.Array]) -> None: # Not nice since pyarrow scalars can't be summed yet. 
# This breaks on `None` self._sum = pa.scalar(self._sum.as_py() + pc.sum(states[0]).as_py()) @@ -56,7 +54,7 @@ class MissingMethods(Accumulator): def __init__(self): self._sum = pa.scalar(0) - def state(self) -> List[pa.Scalar]: + def state(self) -> list[pa.Scalar]: return [self._sum] @@ -86,7 +84,7 @@ def test_errors(df): "evaluate, merge, update)" ) with pytest.raises(Exception, match=msg): - accum = udaf( # noqa F841 + accum = udaf( # noqa: F841 MissingMethods, pa.int64(), pa.int64(), diff --git a/python/tests/test_udwf.py b/python/tests/test_udwf.py index 2fea34aa3..3d6dcf9d8 100644 --- a/python/tests/test_udwf.py +++ b/python/tests/test_udwf.py @@ -298,7 +298,7 @@ def test_udwf_errors(df): ] -@pytest.mark.parametrize("name,expr,expected", data_test_udwf_functions) +@pytest.mark.parametrize(("name", "expr", "expected"), data_test_udwf_functions) def test_udwf_functions(df, name, expr, expected): df = df.select("a", "b", f.round(expr, lit(3)).alias(name)) diff --git a/python/tests/test_wrapper_coverage.py b/python/tests/test_wrapper_coverage.py index ac064ba95..d7f6f6e35 100644 --- a/python/tests/test_wrapper_coverage.py +++ b/python/tests/test_wrapper_coverage.py @@ -19,6 +19,7 @@ import datafusion.functions import datafusion.object_store import datafusion.substrait +import pytest # EnumType introduced in 3.11. 3.10 and prior it was called EnumMeta. try: @@ -41,10 +42,8 @@ def missing_exports(internal_obj, wrapped_obj) -> None: internal_attr = getattr(internal_obj, attr) wrapped_attr = getattr(wrapped_obj, attr) - if internal_attr is not None: - if wrapped_attr is None: - print("Missing attribute: ", attr) - assert False + if internal_attr is not None and wrapped_attr is None: + pytest.fail(f"Missing attribute: {attr}") if attr in ["__self__", "__class__"]: continue From 3dcf7c7e5c0af0eb3c5e3bdf9c6e33fd4541b070 Mon Sep 17 00:00:00 2001 From: jsai28 <54253219+jsai28@users.noreply.github.com> Date: Thu, 13 Mar 2025 04:09:03 -0600 Subject: [PATCH 14/22] feat/making global context accessible for users (#1060) * Rename _global_ctx to global_ctx * Add global context to python wrapper code * Update context.py * singleton for global context * formatting * remove udf from import * remove _global_instance * formatting * formatting * unnecessary test * fix test_io.py * ran ruff * ran ruff format --- python/datafusion/context.py | 12 +++++++ python/datafusion/io.py | 63 ++++++++++++++++-------------------- python/tests/test_context.py | 18 +++++++++++ src/context.rs | 2 +- 4 files changed, 58 insertions(+), 37 deletions(-) diff --git a/python/datafusion/context.py b/python/datafusion/context.py index 0ab1a908a..58ad9a943 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -496,6 +496,18 @@ def __init__( self.ctx = SessionContextInternal(config, runtime) + @classmethod + def global_ctx(cls) -> SessionContext: + """Retrieve the global context as a `SessionContext` wrapper. + + Returns: + A `SessionContext` object that wraps the global `SessionContextInternal`. + """ + internal_ctx = SessionContextInternal.global_ctx() + wrapper = cls() + wrapper.ctx = internal_ctx + return wrapper + def enable_url_table(self) -> SessionContext: """Control if local files can be queried as tables. 
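(Aside, not part of the patch: a minimal sketch of how user code can reach the new public entry point once this commit is applied. The table name `t` is hypothetical; both wrappers share the same underlying global context, so state registered through one is visible through the other:)

    import pyarrow as pa

    from datafusion import SessionContext

    ctx_a = SessionContext.global_ctx()
    ctx_b = SessionContext.global_ctx()

    # Register a table through one wrapper and query it through the other.
    batch = pa.RecordBatch.from_arrays([pa.array([1, 2, 3])], names=["a"])
    ctx_a.register_record_batches("t", [[batch]])
    result = ctx_b.sql("SELECT SUM(a) AS total FROM t").collect()
    assert result[0].column(0).to_pylist() == [6]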
diff --git a/python/datafusion/io.py b/python/datafusion/io.py index 3e39703e3..ef5ebf96f 100644 --- a/python/datafusion/io.py +++ b/python/datafusion/io.py @@ -21,10 +21,9 @@ from typing import TYPE_CHECKING +from datafusion.context import SessionContext from datafusion.dataframe import DataFrame -from ._internal import SessionContext as SessionContextInternal - if TYPE_CHECKING: import pathlib @@ -68,16 +67,14 @@ def read_parquet( """ if table_partition_cols is None: table_partition_cols = [] - return DataFrame( - SessionContextInternal._global_ctx().read_parquet( - str(path), - table_partition_cols, - parquet_pruning, - file_extension, - skip_metadata, - schema, - file_sort_order, - ) + return SessionContext.global_ctx().read_parquet( + str(path), + table_partition_cols, + parquet_pruning, + file_extension, + skip_metadata, + schema, + file_sort_order, ) @@ -110,15 +107,13 @@ def read_json( """ if table_partition_cols is None: table_partition_cols = [] - return DataFrame( - SessionContextInternal._global_ctx().read_json( - str(path), - schema, - schema_infer_max_records, - file_extension, - table_partition_cols, - file_compression_type, - ) + return SessionContext.global_ctx().read_json( + str(path), + schema, + schema_infer_max_records, + file_extension, + table_partition_cols, + file_compression_type, ) @@ -161,17 +156,15 @@ def read_csv( path = [str(p) for p in path] if isinstance(path, list) else str(path) - return DataFrame( - SessionContextInternal._global_ctx().read_csv( - path, - schema, - has_header, - delimiter, - schema_infer_max_records, - file_extension, - table_partition_cols, - file_compression_type, - ) + return SessionContext.global_ctx().read_csv( + path, + schema, + has_header, + delimiter, + schema_infer_max_records, + file_extension, + table_partition_cols, + file_compression_type, ) @@ -198,8 +191,6 @@ def read_avro( """ if file_partition_cols is None: file_partition_cols = [] - return DataFrame( - SessionContextInternal._global_ctx().read_avro( - str(path), schema, file_partition_cols, file_extension - ) + return SessionContext.global_ctx().read_avro( + str(path), schema, file_partition_cols, file_extension ) diff --git a/python/tests/test_context.py b/python/tests/test_context.py index 7a0a7aa08..4a15ac9cf 100644 --- a/python/tests/test_context.py +++ b/python/tests/test_context.py @@ -632,3 +632,21 @@ def test_sql_with_options_no_statements(ctx): options = SQLOptions().with_allow_statements(allow=False) with pytest.raises(Exception, match="SetVariable"): ctx.sql_with_options(sql, options=options) + + +@pytest.fixture +def batch(): + return pa.RecordBatch.from_arrays( + [pa.array([4, 5, 6])], + names=["a"], + ) + + +def test_create_dataframe_with_global_ctx(batch): + ctx = SessionContext.global_ctx() + + df = ctx.create_dataframe([[batch]]) + + result = df.collect()[0].column(0) + + assert result == pa.array([4, 5, 6]) diff --git a/src/context.rs b/src/context.rs index 9ba87eb8a..0db0f4d7e 100644 --- a/src/context.rs +++ b/src/context.rs @@ -308,7 +308,7 @@ impl PySessionContext { #[classmethod] #[pyo3(signature = ())] - fn _global_ctx(_cls: &Bound<'_, PyType>) -> PyResult { + fn global_ctx(_cls: &Bound<'_, PyType>) -> PyResult { Ok(Self { ctx: get_global_ctx().clone(), }) From 55141bad7c2270c14742e962d8bab1d4f1be27f5 Mon Sep 17 00:00:00 2001 From: Spaarsh <67336892+Spaarsh@users.noreply.github.com> Date: Fri, 14 Mar 2025 18:26:31 +0530 Subject: [PATCH 15/22] Renaming Internal Structs (#1059) * Renamed Expr to RawExpr * Fixed CI test for exported classes to 
include RawExpr as well

* Fixed CI test for exported classes to check if Expr class covers RawExpr

* Generalized Raw* class checking

* fixes

* fixes

* fixed the CI test to not look for Raw classes in the datafusion module

* Add additional text to unit test describing operation and ensure wrapped Raw classes are checked

* New ruff rule on main

* Resolve ruff errors

---------

Co-authored-by: Tim Saucer
---
 python/datafusion/expr.py             |  8 ++--
 python/tests/test_wrapper_coverage.py | 55 +++++++++++++++++++--------
 src/expr.rs                           |  2 +-
 3 files changed, 45 insertions(+), 20 deletions(-)

diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py
index 702f75aed..77b6c272d 100644
--- a/python/datafusion/expr.py
+++ b/python/datafusion/expr.py
@@ -193,7 +193,7 @@ class Expr:
     :ref:`Expressions` in the online documentation for more information.
     """
 
-    def __init__(self, expr: expr_internal.Expr) -> None:
+    def __init__(self, expr: expr_internal.RawExpr) -> None:
         """This constructor should not be called by the end user."""
         self.expr = expr
 
@@ -383,7 +383,7 @@ def literal(value: Any) -> Expr:
             value = pa.scalar(value, type=pa.string_view())
         if not isinstance(value, pa.Scalar):
             value = pa.scalar(value)
-        return Expr(expr_internal.Expr.literal(value))
+        return Expr(expr_internal.RawExpr.literal(value))
 
     @staticmethod
     def string_literal(value: str) -> Expr:
@@ -398,13 +398,13 @@ def string_literal(value: str) -> Expr:
         """
         if isinstance(value, str):
             value = pa.scalar(value, type=pa.string())
-            return Expr(expr_internal.Expr.literal(value))
+            return Expr(expr_internal.RawExpr.literal(value))
         return Expr.literal(value)
 
     @staticmethod
     def column(value: str) -> Expr:
         """Creates a new expression representing a column."""
-        return Expr(expr_internal.Expr.column(value))
+        return Expr(expr_internal.RawExpr.column(value))
 
     def alias(self, name: str) -> Expr:
         """Assign a name to the expression."""
diff --git a/python/tests/test_wrapper_coverage.py b/python/tests/test_wrapper_coverage.py
index d7f6f6e35..a2de2d32b 100644
--- a/python/tests/test_wrapper_coverage.py
+++ b/python/tests/test_wrapper_coverage.py
@@ -28,37 +28,62 @@
 from enum import EnumMeta as EnumType
 
 
-def missing_exports(internal_obj, wrapped_obj) -> None:
-    # Special case enums - just make sure they exist since dir()
-    # and other functions get overridden.
+def missing_exports(internal_obj, wrapped_obj) -> None:  # noqa: C901
+    """
+    Identify if any of the Rust-exposed structs or functions do not have wrappers.
+
+    Special handling for:
+    - Raw* classes: Internal implementation details that shouldn't be exposed
+    - _global_ctx: Internal implementation detail
+    - __self__, __class__: Python special attributes
+    """
+    # Special case enums - EnumType overrides some of the internal functions,
+    # so check that all of the values exist and move on
     if isinstance(wrapped_obj, EnumType):
+        expected_values = [v for v in dir(internal_obj) if not v.startswith("__")]
+        for value in expected_values:
+            assert value in dir(wrapped_obj)
         return
 
-    for attr in dir(internal_obj):
-        if attr in ["_global_ctx"]:
-            continue
-        assert attr in dir(wrapped_obj)
+    for internal_attr_name in dir(internal_obj):
+        wrapped_attr_name = internal_attr_name.removeprefix("Raw")
+        assert wrapped_attr_name in dir(wrapped_obj)
 
-        internal_attr = getattr(internal_obj, attr)
-        wrapped_attr = getattr(wrapped_obj, attr)
+        internal_attr = getattr(internal_obj, internal_attr_name)
+        wrapped_attr = getattr(wrapped_obj, wrapped_attr_name)
 
-        if internal_attr is not None and wrapped_attr is None:
-            pytest.fail(f"Missing attribute: {attr}")
+        # There are some auto-generated attributes that can be None, such as
+        # __kwdefaults__ and __doc__. As long as these are None on the internal
+        # object, it's okay to skip them. However, if they do exist on the
+        # internal object they must also exist on the wrapped object.
+        if internal_attr is not None:
+            if wrapped_attr is None:
+                pytest.fail(f"Missing attribute: {internal_attr_name}")
 
-        if attr in ["__self__", "__class__"]:
+        if internal_attr_name in ["__self__", "__class__"]:
             continue
+
         if isinstance(internal_attr, list):
             assert isinstance(wrapped_attr, list)
+
+            # We have cases like __all__ that are a list, and we want to be
+            # certain that every value in the internal object's list is also
+            # present in the wrapper's list
             for val in internal_attr:
-                assert val in wrapped_attr
+                if isinstance(val, str) and val.startswith("Raw"):
+                    assert val[3:] in wrapped_attr
+                else:
+                    assert val in wrapped_attr
         elif hasattr(internal_attr, "__dict__"):
+            # Check all submodules recursively
            missing_exports(internal_attr, wrapped_attr)
 
 
 def test_datafusion_missing_exports() -> None:
     """Check for any missing python exports.
 
-    This test verifies that every exposed class, attribute, and function in
-    the internal (pyo3) module is also exposed in our python wrappers.
+    This test verifies that every class, attribute, and function exposed by
+    the internal (pyo3) module, datafusion._internal, is also exposed in the
+    public Python wrappers, i.e. the datafusion package itself.
""" missing_exports(datafusion._internal, datafusion) diff --git a/src/expr.rs b/src/expr.rs index e750be6a4..d3c528eb4 100644 --- a/src/expr.rs +++ b/src/expr.rs @@ -101,7 +101,7 @@ pub mod window; use sort_expr::{to_sort_expressions, PySortExpr}; /// A PyExpr that can be used on a DataFrame -#[pyclass(name = "Expr", module = "datafusion.expr", subclass)] +#[pyclass(name = "RawExpr", module = "datafusion.expr", subclass)] #[derive(Debug, Clone)] pub struct PyExpr { pub expr: Expr, From 4f457030f171a26d0c4cce4d55cf541519956fcc Mon Sep 17 00:00:00 2001 From: jsai28 <54253219+jsai28@users.noreply.github.com> Date: Sat, 15 Mar 2025 04:57:38 -0600 Subject: [PATCH 16/22] added pytest asyncio tests (#1063) --- pyproject.toml | 1 + python/tests/test_dataframe.py | 54 ++++++++++++++++++++++++++++++++++ uv.lock | 17 ++++++++++- 3 files changed, 71 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 060e3b80a..a4ed18c4c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -150,6 +150,7 @@ dev = [ "maturin>=1.8.1", "numpy>1.25.0", "pytest>=7.4.4", + "pytest-asyncio>=0.23.3", "ruff>=0.9.1", "toml>=0.10.2", "pygithub==2.5.0", diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index d084f12dd..384b17878 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -771,6 +771,16 @@ def test_execution_plan(aggregate_df): assert rows_returned == 5 +@pytest.mark.asyncio +async def test_async_iteration_of_df(aggregate_df): + rows_returned = 0 + async for batch in aggregate_df.execute_stream(): + assert batch is not None + rows_returned += len(batch.to_pyarrow()[0]) + + assert rows_returned == 5 + + def test_repartition(df): df.repartition(2) @@ -958,6 +968,18 @@ def test_execute_stream(df): assert not list(stream) # after one iteration the generator must be exhausted +@pytest.mark.asyncio +async def test_execute_stream_async(df): + stream = df.execute_stream() + batches = [batch async for batch in stream] + + assert all(batch is not None for batch in batches) + + # After consuming all batches, the stream should be exhausted + remaining_batches = [batch async for batch in stream] + assert not remaining_batches + + @pytest.mark.parametrize("schema", [True, False]) def test_execute_stream_to_arrow_table(df, schema): stream = df.execute_stream() @@ -974,6 +996,25 @@ def test_execute_stream_to_arrow_table(df, schema): assert set(pyarrow_table.column_names) == {"a", "b", "c"} +@pytest.mark.asyncio +@pytest.mark.parametrize("schema", [True, False]) +async def test_execute_stream_to_arrow_table_async(df, schema): + stream = df.execute_stream() + + if schema: + pyarrow_table = pa.Table.from_batches( + [batch.to_pyarrow() async for batch in stream], schema=df.schema() + ) + else: + pyarrow_table = pa.Table.from_batches( + [batch.to_pyarrow() async for batch in stream] + ) + + assert isinstance(pyarrow_table, pa.Table) + assert pyarrow_table.shape == (3, 3) + assert set(pyarrow_table.column_names) == {"a", "b", "c"} + + def test_execute_stream_partitioned(df): streams = df.execute_stream_partitioned() assert all(batch is not None for stream in streams for batch in stream) @@ -982,6 +1023,19 @@ def test_execute_stream_partitioned(df): ) # after one iteration all generators must be exhausted +@pytest.mark.asyncio +async def test_execute_stream_partitioned_async(df): + streams = df.execute_stream_partitioned() + + for stream in streams: + batches = [batch async for batch in stream] + assert all(batch is not None for batch in batches) + + # Ensure 
the stream is exhausted after iteration + remaining_batches = [batch async for batch in stream] + assert not remaining_batches + + def test_empty_to_arrow_table(df): # Convert empty datafusion dataframe to pyarrow Table pyarrow_table = df.limit(0).to_arrow_table() diff --git a/uv.lock b/uv.lock index 619b92856..7e4bc4c6b 100644 --- a/uv.lock +++ b/uv.lock @@ -284,9 +284,11 @@ dependencies = [ [package.dev-dependencies] dev = [ { name = "maturin" }, + { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, { name = "numpy", version = "2.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, { name = "pygithub" }, { name = "pytest" }, + { name = "pytest-asyncio" }, { name = "ruff" }, { name = "toml" }, ] @@ -314,9 +316,10 @@ requires-dist = [ [package.metadata.requires-dev] dev = [ { name = "maturin", specifier = ">=1.8.1" }, - { name = "numpy", marker = "python_full_version >= '3.10'", specifier = ">1.24.4" }, + { name = "numpy", specifier = ">1.25.0" }, { name = "pygithub", specifier = "==2.5.0" }, { name = "pytest", specifier = ">=7.4.4" }, + { name = "pytest-asyncio", specifier = ">=0.23.3" }, { name = "ruff", specifier = ">=0.9.1" }, { name = "toml", specifier = ">=0.10.2" }, ] @@ -1079,6 +1082,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/11/92/76a1c94d3afee238333bc0a42b82935dd8f9cf8ce9e336ff87ee14d9e1cf/pytest-8.3.4-py3-none-any.whl", hash = "sha256:50e16d954148559c9a74109af1eaf0c945ba2d8f30f0a3d3335edde19788b6f6", size = 343083 }, ] +[[package]] +name = "pytest-asyncio" +version = "0.25.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pytest" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f2/a8/ecbc8ede70921dd2f544ab1cadd3ff3bf842af27f87bbdea774c7baa1d38/pytest_asyncio-0.25.3.tar.gz", hash = "sha256:fc1da2cf9f125ada7e710b4ddad05518d4cee187ae9412e9ac9271003497f07a", size = 54239 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/67/17/3493c5624e48fd97156ebaec380dcaafee9506d7e2c46218ceebbb57d7de/pytest_asyncio-0.25.3-py3-none-any.whl", hash = "sha256:9e89518e0f9bd08928f97a3482fdc4e244df17529460bc038291ccaf8f85c7c3", size = 19467 }, +] + [[package]] name = "python-dateutil" version = "2.9.0.post0" From 2f52688d76e84794343c17ffaf3002534ecfd716 Mon Sep 17 00:00:00 2001 From: kosiew Date: Sat, 15 Mar 2025 19:00:50 +0800 Subject: [PATCH 17/22] Add decorator for udwf (#1061) * feat: Introduce create_udwf method for User-Defined Window Functions - Added `create_udwf` static method to `WindowUDF` class, allowing users to create User-Defined Window Functions (UDWF) as both a function and a decorator. - Updated type hinting for `_R` using `TypeAlias` for better clarity. - Enhanced documentation with usage examples for both function and decorator styles, improving usability and understanding. * refactor: Simplify UDWF test suite and introduce SimpleWindowCount evaluator - Removed multiple exponential smoothing classes to streamline the code. - Introduced SimpleWindowCount class for basic row counting functionality. - Updated test cases to validate the new SimpleWindowCount evaluator. - Refactored fixture and test functions for clarity and consistency. - Enhanced error handling in UDWF creation tests. 
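
The net effect of the changes below: `udwf` now works either as a plain
factory call or as a decorator. A sketch, assuming a WindowEvaluator subclass
named `SimpleWindowCount` like the one added to the tests in this commit:

    @udwf(pa.int64(), pa.int64(), "immutable")
    def window_count() -> WindowEvaluator:
        return SimpleWindowCount()

    expr = window_count(column("a"))  # returns an Expr, usable like any UDWF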
* fix: Update type alias import to use typing_extensions for compatibility * Add udwf tests for multiple input types and decorator syntax * replace old def udwf * refactor: Simplify df fixture by passing ctx as an argument * refactor: Rename DataFrame fixtures and update test functions - Renamed `df` fixture to `complex_window_df` for clarity. - Renamed `simple_df` fixture to `count_window_df` to better reflect its purpose. - Updated test functions to use the new fixture names, enhancing readability and maintainability. * refactor: Update udwf calls in WindowUDF to use BiasedNumbers directly - Changed udwf1 to use BiasedNumbers instead of bias_10. - Added udwf2 to call udwf with bias_10. - Introduced udwf3 to demonstrate a lambda function returning BiasedNumbers(20). * feat: Add overloads for udwf function to support multiple input types and decorator syntax * refactor: Simplify udwf method signature by removing redundant type hints * refactor: Remove state_type from udwf method signature and update return type handling - Eliminated the state_type parameter from the udwf method to simplify the function signature. - Updated return type handling in the _function and _decorator methods to use a generic type _R for better type flexibility. - Enhanced the decorator to wrap the original function, allowing for improved argument handling and expression return. * refactor: Update volatility parameter type in udwf method signature to support Volatility enum * Fix ruff errors * fix C901 for def udwf * refactor: Update udwf method signature and simplify input handling - Changed the type hint for the return type in the _create_window_udf_decorator method to use pa.DataType directly instead of a TypeVar. - Simplified the handling of input types by removing redundant checks and directly using the input types list. - Removed unnecessary comments and cleaned up the code for better readability. - Updated the test for udwf to use parameterized tests for better coverage and maintainability. * refactor: Rename input_type to input_types in udwf method signature for clarity * refactor: Enhance typing in udf.py by introducing Protocol for WindowEvaluator and improving import organization * Revert "refactor: Enhance typing in udf.py by introducing Protocol for WindowEvaluator and improving import organization" This reverts commit 16dbe5f3fd88f42d0a304384b162009bd9e49a35. --- python/datafusion/udf.py | 123 +++++++++++++++++++++------ python/tests/test_udwf.py | 170 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 264 insertions(+), 29 deletions(-) diff --git a/python/datafusion/udf.py b/python/datafusion/udf.py index 603b7063d..e93a34ca5 100644 --- a/python/datafusion/udf.py +++ b/python/datafusion/udf.py @@ -621,6 +621,16 @@ def __call__(self, *args: Expr) -> Expr: args_raw = [arg.expr for arg in args] return Expr(self._udwf.__call__(*args_raw)) + @overload + @staticmethod + def udwf( + input_types: pa.DataType | list[pa.DataType], + return_type: pa.DataType, + volatility: Volatility | str, + name: Optional[str] = None, + ) -> Callable[..., WindowUDF]: ... + + @overload @staticmethod def udwf( func: Callable[[], WindowEvaluator], @@ -628,24 +638,31 @@ def udwf( return_type: pa.DataType, volatility: Volatility | str, name: Optional[str] = None, - ) -> WindowUDF: - """Create a new User-Defined Window Function. + ) -> WindowUDF: ... - If your :py:class:`WindowEvaluator` can be instantiated with no arguments, you - can simply pass it's type as ``func``. 
If you need to pass additional arguments - to it's constructor, you can define a lambda or a factory method. During runtime - the :py:class:`WindowEvaluator` will be constructed for every instance in - which this UDWF is used. The following examples are all valid. + @staticmethod + def udwf(*args: Any, **kwargs: Any): # noqa: D417 + """Create a new User-Defined Window Function (UDWF). - .. code-block:: python + This class can be used both as a **function** and as a **decorator**. + + Usage: + - **As a function**: Call `udwf(func, input_types, return_type, volatility, + name)`. + - **As a decorator**: Use `@udwf(input_types, return_type, volatility, + name)`. When using `udwf` as a decorator, **do not pass `func` + explicitly**. + **Function example:** + ``` import pyarrow as pa class BiasedNumbers(WindowEvaluator): def __init__(self, start: int = 0) -> None: self.start = start - def evaluate_all(self, values: list[pa.Array], num_rows: int) -> pa.Array: + def evaluate_all(self, values: list[pa.Array], + num_rows: int) -> pa.Array: return pa.array([self.start + i for i in range(num_rows)]) def bias_10() -> BiasedNumbers: @@ -655,35 +672,93 @@ def bias_10() -> BiasedNumbers: udwf2 = udwf(bias_10, pa.int64(), pa.int64(), "immutable") udwf3 = udwf(lambda: BiasedNumbers(20), pa.int64(), pa.int64(), "immutable") + ``` + + **Decorator example:** + ``` + @udwf(pa.int64(), pa.int64(), "immutable") + def biased_numbers() -> BiasedNumbers: + return BiasedNumbers(10) + ``` + Args: - func: A callable to create the window function. - input_types: The data types of the arguments to ``func``. + func: **Only needed when calling as a function. Skip this argument when + using `udwf` as a decorator.** + input_types: The data types of the arguments. return_type: The data type of the return value. volatility: See :py:class:`Volatility` for allowed values. - arguments: A list of arguments to pass in to the __init__ method for accum. name: A descriptive name for the function. Returns: - A user-defined window function. - """ # noqa: W505, E501 + A user-defined window function that can be used in window function calls. + """ + if args and callable(args[0]): + # Case 1: Used as a function, require the first parameter to be callable + return WindowUDF._create_window_udf(*args, **kwargs) + # Case 2: Used as a decorator with parameters + return WindowUDF._create_window_udf_decorator(*args, **kwargs) + + @staticmethod + def _create_window_udf( + func: Callable[[], WindowEvaluator], + input_types: pa.DataType | list[pa.DataType], + return_type: pa.DataType, + volatility: Volatility | str, + name: Optional[str] = None, + ) -> WindowUDF: + """Create a WindowUDF instance from function arguments.""" if not callable(func): msg = "`func` must be callable." 
raise TypeError(msg) if not isinstance(func(), WindowEvaluator): msg = "`func` must implement the abstract base class WindowEvaluator" raise TypeError(msg) - if name is None: - name = func().__class__.__qualname__.lower() - if isinstance(input_types, pa.DataType): - input_types = [input_types] - return WindowUDF( - name=name, - func=func, - input_types=input_types, - return_type=return_type, - volatility=volatility, + + name = name or func.__qualname__.lower() + input_types = ( + [input_types] if isinstance(input_types, pa.DataType) else input_types ) + return WindowUDF(name, func, input_types, return_type, volatility) + + @staticmethod + def _get_default_name(func: Callable) -> str: + """Get the default name for a function based on its attributes.""" + if hasattr(func, "__qualname__"): + return func.__qualname__.lower() + return func.__class__.__name__.lower() + + @staticmethod + def _normalize_input_types( + input_types: pa.DataType | list[pa.DataType], + ) -> list[pa.DataType]: + """Convert a single DataType to a list if needed.""" + if isinstance(input_types, pa.DataType): + return [input_types] + return input_types + + @staticmethod + def _create_window_udf_decorator( + input_types: pa.DataType | list[pa.DataType], + return_type: pa.DataType, + volatility: Volatility | str, + name: Optional[str] = None, + ) -> Callable[[Callable[[], WindowEvaluator]], Callable[..., Expr]]: + """Create a decorator for a WindowUDF.""" + + def decorator(func: Callable[[], WindowEvaluator]) -> Callable[..., Expr]: + udwf_caller = WindowUDF._create_window_udf( + func, input_types, return_type, volatility, name + ) + + @functools.wraps(func) + def wrapper(*args: Any, **kwargs: Any) -> Expr: + return udwf_caller(*args, **kwargs) + + return wrapper + + return decorator + # Convenience exports so we can import instead of treating as # variables at the package root diff --git a/python/tests/test_udwf.py b/python/tests/test_udwf.py index 3d6dcf9d8..4190e7d64 100644 --- a/python/tests/test_udwf.py +++ b/python/tests/test_udwf.py @@ -162,14 +162,27 @@ def evaluate_all(self, values: list[pa.Array], num_rows: int) -> pa.Array: return pa.array(results) +class SimpleWindowCount(WindowEvaluator): + """A simple window evaluator that counts rows.""" + + def __init__(self, base: int = 0) -> None: + self.base = base + + def evaluate_all(self, values: list[pa.Array], num_rows: int) -> pa.Array: + return pa.array([self.base + i for i in range(num_rows)]) + + class NotSubclassOfWindowEvaluator: pass @pytest.fixture -def df(): - ctx = SessionContext() +def ctx(): + return SessionContext() + +@pytest.fixture +def complex_window_df(ctx): # create a RecordBatch and a new DataFrame from it batch = pa.RecordBatch.from_arrays( [ @@ -182,7 +195,17 @@ def df(): return ctx.create_dataframe([[batch]]) -def test_udwf_errors(df): +@pytest.fixture +def count_window_df(ctx): + # create a RecordBatch and a new DataFrame from it + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array([4, 4, 6])], + names=["a", "b"], + ) + return ctx.create_dataframe([[batch]], name="test_table") + + +def test_udwf_errors(complex_window_df): with pytest.raises(TypeError): udwf( NotSubclassOfWindowEvaluator, @@ -192,6 +215,103 @@ def test_udwf_errors(df): ) +def test_udwf_errors_with_message(): + """Test error cases for UDWF creation.""" + with pytest.raises( + TypeError, match="`func` must implement the abstract base class WindowEvaluator" + ): + udwf( + NotSubclassOfWindowEvaluator, pa.int64(), pa.int64(), volatility="immutable" + ) + + +def 
test_udwf_basic_usage(count_window_df): + """Test basic UDWF usage with a simple counting window function.""" + simple_count = udwf( + SimpleWindowCount, pa.int64(), pa.int64(), volatility="immutable" + ) + + df = count_window_df.select( + simple_count(column("a")) + .window_frame(WindowFrame("rows", None, None)) + .build() + .alias("count") + ) + result = df.collect()[0] + assert result.column(0) == pa.array([0, 1, 2]) + + +def test_udwf_with_args(count_window_df): + """Test UDWF with constructor arguments.""" + count_base10 = udwf( + lambda: SimpleWindowCount(10), pa.int64(), pa.int64(), volatility="immutable" + ) + + df = count_window_df.select( + count_base10(column("a")) + .window_frame(WindowFrame("rows", None, None)) + .build() + .alias("count") + ) + result = df.collect()[0] + assert result.column(0) == pa.array([10, 11, 12]) + + +def test_udwf_decorator_basic(count_window_df): + """Test UDWF used as a decorator.""" + + @udwf([pa.int64()], pa.int64(), "immutable") + def window_count() -> WindowEvaluator: + return SimpleWindowCount() + + df = count_window_df.select( + window_count(column("a")) + .window_frame(WindowFrame("rows", None, None)) + .build() + .alias("count") + ) + result = df.collect()[0] + assert result.column(0) == pa.array([0, 1, 2]) + + +def test_udwf_decorator_with_args(count_window_df): + """Test UDWF decorator with constructor arguments.""" + + @udwf([pa.int64()], pa.int64(), "immutable") + def window_count_base10() -> WindowEvaluator: + return SimpleWindowCount(10) + + df = count_window_df.select( + window_count_base10(column("a")) + .window_frame(WindowFrame("rows", None, None)) + .build() + .alias("count") + ) + result = df.collect()[0] + assert result.column(0) == pa.array([10, 11, 12]) + + +def test_register_udwf(ctx, count_window_df): + """Test registering and using UDWF in SQL context.""" + window_count = udwf( + SimpleWindowCount, + [pa.int64()], + pa.int64(), + volatility="immutable", + name="window_count", + ) + + ctx.register_udwf(window_count) + result = ctx.sql( + """ + SELECT window_count(a) + OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED + FOLLOWING) FROM test_table + """ + ).collect()[0] + assert result.column(0) == pa.array([0, 1, 2]) + + smooth_default = udwf( ExponentialSmoothDefault, pa.float64(), @@ -299,10 +419,50 @@ def test_udwf_errors(df): @pytest.mark.parametrize(("name", "expr", "expected"), data_test_udwf_functions) -def test_udwf_functions(df, name, expr, expected): - df = df.select("a", "b", f.round(expr, lit(3)).alias(name)) +def test_udwf_functions(complex_window_df, name, expr, expected): + df = complex_window_df.select("a", "b", f.round(expr, lit(3)).alias(name)) # execute and collect the first (and only) batch result = df.sort(column("a")).select(column(name)).collect()[0] assert result.column(0) == pa.array(expected) + + +@pytest.mark.parametrize( + "udwf_func", + [ + udwf(SimpleWindowCount, pa.int64(), pa.int64(), "immutable"), + udwf(SimpleWindowCount, [pa.int64()], pa.int64(), "immutable"), + udwf([pa.int64()], pa.int64(), "immutable")(lambda: SimpleWindowCount()), + udwf(pa.int64(), pa.int64(), "immutable")(lambda: SimpleWindowCount()), + ], +) +def test_udwf_overloads(udwf_func, count_window_df): + df = count_window_df.select( + udwf_func(column("a")) + .window_frame(WindowFrame("rows", None, None)) + .build() + .alias("count") + ) + result = df.collect()[0] + assert result.column(0) == pa.array([0, 1, 2]) + + +def test_udwf_named_function(ctx, count_window_df): + """Test UDWF with explicit name parameter.""" + 
window_count = udwf(
+        SimpleWindowCount,
+        pa.int64(),
+        pa.int64(),
+        volatility="immutable",
+        name="my_custom_counter",
+    )
+
+    ctx.register_udwf(window_count)
+    result = ctx.sql(
+        """
+        SELECT my_custom_counter(a)
+        OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED
+        FOLLOWING) FROM test_table"""
+    ).collect()[0]
+    assert result.column(0) == pa.array([0, 1, 2])

From 7c1c08f8617ac97a2568eb0664e9d4ee30fceba9 Mon Sep 17 00:00:00 2001
From: Nirnay Roy <32942494+nirnayroy@users.noreply.github.com>
Date: Sat, 15 Mar 2025 17:05:05 +0530
Subject: [PATCH 18/22] feat: expose regex_count function (#1066)

* Added wrapper for regex_count function

* fix comment

---------

Co-authored-by: Nirnay Roy
---
 python/datafusion/functions.py | 18 ++++++++++++++++++
 python/tests/test_functions.py |  4 ++++
 src/functions.rs               | 20 ++++++++++++++++++++
 3 files changed, 42 insertions(+)

diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py
index 0cc7434cf..26bac149c 100644
--- a/python/datafusion/functions.py
+++ b/python/datafusion/functions.py
@@ -217,6 +217,7 @@
     "random",
     "range",
     "rank",
+    "regexp_count",
     "regexp_like",
     "regexp_match",
     "regexp_replace",
@@ -779,6 +780,23 @@ def regexp_replace(
     return Expr(f.regexp_replace(string.expr, pattern.expr, replacement.expr, flags))
 
 
+def regexp_count(
+    string: Expr, pattern: Expr, start: Expr | None = None, flags: Expr | None = None
+) -> Expr:
+    """Returns the number of matches of a pattern in a string.
+
+    Optional start position (the first position is 1) to search for the regular
+    expression.
+    """
+    if flags is not None:
+        flags = flags.expr
+    if start is not None:
+        start = start.expr
+    return Expr(f.regexp_count(string.expr, pattern.expr, start, flags))
+
+
 def repeat(string: Expr, n: Expr) -> Expr:
     """Repeats the ``string`` to ``n`` times."""
     return Expr(f.repeat(string.expr, n.expr))
diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py
index ed88a16e3..161e1e3bb 100644
--- a/python/tests/test_functions.py
+++ b/python/tests/test_functions.py
@@ -740,6 +740,10 @@ def test_array_function_obj_tests(stmt, py_expr):
             f.regexp_replace(column("a"), literal("(ell|orl)"), literal("-")),
             pa.array(["H-o", "W-d", "!"]),
         ),
+        (
+            f.regexp_count(column("a"), literal("(ell|orl)"), literal(1)),
+            pa.array([1, 1, 0], type=pa.int64()),
+        ),
     ],
 )
 def test_string_functions(df, function, expected_result):
diff --git a/src/functions.rs b/src/functions.rs
index 6a8abb18d..8fac239b4 100644
--- a/src/functions.rs
+++ b/src/functions.rs
@@ -173,6 +173,25 @@ fn regexp_replace(
     )
     .into())
 }
+
+#[pyfunction]
+#[pyo3(signature = (string, pattern, start, flags=None))]
+/// Returns the number of matches found in the string.
+fn regexp_count(
+    string: PyExpr,
+    pattern: PyExpr,
+    start: Option<PyExpr>,
+    flags: Option<PyExpr>,
+) -> PyResult<PyExpr> {
+    Ok(functions::expr_fn::regexp_count(
+        string.expr,
+        pattern.expr,
+        start.map(|x| x.expr),
+        flags.map(|x| x.expr),
+    )
+    .into())
+}
+
 /// Creates a new Sort Expr
 #[pyfunction]
 fn order_by(expr: PyExpr, asc: bool, nulls_first: bool) -> PyResult<PyExpr> {
@@ -943,6 +962,7 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_wrapped(wrap_pyfunction!(power))?;
     m.add_wrapped(wrap_pyfunction!(radians))?;
     m.add_wrapped(wrap_pyfunction!(random))?;
+    m.add_wrapped(wrap_pyfunction!(regexp_count))?;
     m.add_wrapped(wrap_pyfunction!(regexp_like))?;
     m.add_wrapped(wrap_pyfunction!(regexp_match))?;
     m.add_wrapped(wrap_pyfunction!(regexp_replace))?;

From b8dd97bc8eefcfecfa8dcc864c4898c654b236a9 Mon Sep 17 00:00:00 2001
From: Spaarsh <67336892+Spaarsh@users.noreply.github.com>
Date: Mon, 17 Mar 2025 20:08:16 +0530
Subject: [PATCH 19/22] Add additional ruff suggestions (#1062)

* Enabled ruff rule PT001 and ANN204

* Enabled ruff rule B008

* Enabled ruff rule EM101

* Enabled ruff rule PLR1714

* Enabled ruff rule ANN201

* Enabled ruff rule C400

* Enabled ruff rule B904

* Enabled ruff rule UP006

* Enabled ruff rule RUF012

* Enabled ruff rule FBT003

* Enabled ruff rule C416

* Enabled ruff rule SIM102

* Enabled ruff rule PGH003

* Enabled ruff rule PERF401

* Enabled ruff rule EM102

* Enabled ruff rule SIM108

* Enabled ruff rule ICN001

* Enabled ruff rule ICN001

* implemented reviews

* Update pyproject.toml to ignore `SIM102`

* Enabled ruff rule PLW2901

* Enabled ruff rule RET503

* Fixed failing ruff tests
---
 benchmarks/db-benchmark/groupby-datafusion.py |  24 ++--
 benchmarks/db-benchmark/join-datafusion.py    |   5 +-
 benchmarks/tpch/tpch.py                       |   7 +-
 dev/release/generate-changelog.py             |   6 +-
 docs/source/conf.py                           |   4 +-
 examples/create-context.py                    |  12 +-
 examples/python-udaf.py                       |  36 +++--
 examples/python-udf-comparisons.py            |   9 +-
 examples/python-udf.py                        |  12 +-
 examples/query-pyarrow-data.py                |  10 +-
 examples/sql-using-python-udaf.py             |   2 +-
 examples/tpch/_tests.py                       |   4 +-
 examples/tpch/convert_data_to_parquet.py      | 134 +++++++++---------
 examples/tpch/q08_market_share.py             |   2 +-
 examples/tpch/q19_discounted_revenue.py       |   4 +-
 .../tpch/q21_suppliers_kept_orders_waiting.py |   2 +-
 pyproject.toml                                |  20 ---
 python/datafusion/__init__.py                 |   8 +-
 python/datafusion/catalog.py                  |   4 +-
 python/datafusion/context.py                  |  51 +++----
 python/datafusion/dataframe.py                |  55 +++----
 python/datafusion/expr.py                     |  31 ++--
 python/datafusion/functions.py                |   9 +-
 python/tests/test_functions.py                |   2 +-
 python/tests/test_wrapper_coverage.py         |   7 +-
 25 files changed, 213 insertions(+), 247 deletions(-)

diff --git a/benchmarks/db-benchmark/groupby-datafusion.py b/benchmarks/db-benchmark/groupby-datafusion.py
index 04bf7a149..f9e8d638b 100644
--- a/benchmarks/db-benchmark/groupby-datafusion.py
+++ b/benchmarks/db-benchmark/groupby-datafusion.py
@@ -20,7 +20,7 @@
 import timeit
 
 import datafusion as df
-import pyarrow
+import pyarrow as pa
 from datafusion import (
     RuntimeEnvBuilder,
     SessionConfig,
@@ -37,7 +37,7 @@
 exec(open("./_helpers/helpers.py").read())
 
 
-def ans_shape(batches):
+def ans_shape(batches) -> tuple[int, int]:
     rows, cols = 0, 0
     for batch in batches:
         rows += batch.num_rows
@@ -48,7 +48,7 @@ def ans_shape(batches):
     return rows, cols
 
 
-def execute(df):
+def execute(df) -> list:
     print(df.execution_plan().display_indent())
     return df.collect()
 
@@ -68,14 +68,14 @@ def execute(df):
 src_grp = os.path.join("data", data_name + ".csv")
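# (A usage sketch for the regexp_count wrapper from the previous commit,
# mirroring the new test in python/tests/test_functions.py; values are
# illustrative:
#     f.regexp_count(column("a"), literal("(ell|orl)"), literal(1))
# counts regex matches per row, with an optional 1-based start position.)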
print("loading dataset %s" % src_grp, flush=True) -schema = pyarrow.schema( +schema = pa.schema( [ - ("id4", pyarrow.int32()), - ("id5", pyarrow.int32()), - ("id6", pyarrow.int32()), - ("v1", pyarrow.int32()), - ("v2", pyarrow.int32()), - ("v3", pyarrow.float64()), + ("id4", pa.int32()), + ("id5", pa.int32()), + ("id6", pa.int32()), + ("v1", pa.int32()), + ("v2", pa.int32()), + ("v3", pa.float64()), ] ) @@ -93,8 +93,8 @@ def execute(df): ) config = ( SessionConfig() - .with_repartition_joins(False) - .with_repartition_aggregations(False) + .with_repartition_joins(enabled=False) + .with_repartition_aggregations(enabled=False) .set("datafusion.execution.coalesce_batches", "false") ) ctx = SessionContext(config, runtime) diff --git a/benchmarks/db-benchmark/join-datafusion.py b/benchmarks/db-benchmark/join-datafusion.py index b45ebf632..039868031 100755 --- a/benchmarks/db-benchmark/join-datafusion.py +++ b/benchmarks/db-benchmark/join-datafusion.py @@ -29,7 +29,7 @@ exec(open("./_helpers/helpers.py").read()) -def ans_shape(batches): +def ans_shape(batches) -> tuple[int, int]: rows, cols = 0, 0 for batch in batches: rows += batch.num_rows @@ -57,7 +57,8 @@ def ans_shape(batches): os.path.join("data", y_data_name[2] + ".csv"), ] if len(src_jn_y) != 3: - raise Exception("Something went wrong in preparing files used for join") + error_msg = "Something went wrong in preparing files used for join" + raise Exception(error_msg) print( "loading datasets " diff --git a/benchmarks/tpch/tpch.py b/benchmarks/tpch/tpch.py index bfb9ac398..2d1bbae5b 100644 --- a/benchmarks/tpch/tpch.py +++ b/benchmarks/tpch/tpch.py @@ -21,7 +21,7 @@ from datafusion import SessionContext -def bench(data_path, query_path): +def bench(data_path, query_path) -> None: with open("results.csv", "w") as results: # register tables start = time.time() @@ -68,10 +68,7 @@ def bench(data_path, query_path): with open(f"{query_path}/q{query}.sql") as f: text = f.read() tmp = text.split(";") - queries = [] - for str in tmp: - if len(str.strip()) > 0: - queries.append(str.strip()) + queries = [s.strip() for s in tmp if len(s.strip()) > 0] try: start = time.time() diff --git a/dev/release/generate-changelog.py b/dev/release/generate-changelog.py index e30e2def2..d86736773 100755 --- a/dev/release/generate-changelog.py +++ b/dev/release/generate-changelog.py @@ -24,7 +24,7 @@ from github import Github -def print_pulls(repo_name, title, pulls): +def print_pulls(repo_name, title, pulls) -> None: if len(pulls) > 0: print(f"**{title}:**") print() @@ -34,7 +34,7 @@ def print_pulls(repo_name, title, pulls): print() -def generate_changelog(repo, repo_name, tag1, tag2, version): +def generate_changelog(repo, repo_name, tag1, tag2, version) -> None: # get a list of commits between two tags print(f"Fetching list of commits between {tag1} and {tag2}", file=sys.stderr) comparison = repo.compare(tag1, tag2) @@ -154,7 +154,7 @@ def generate_changelog(repo, repo_name, tag1, tag2, version): ) -def cli(args=None): +def cli(args=None) -> None: """Process command line arguments.""" if not args: args = sys.argv[1:] diff --git a/docs/source/conf.py b/docs/source/conf.py index c82a189e0..0be03d81d 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -73,7 +73,7 @@ autoapi_python_class_content = "both" -def autoapi_skip_member_fn(app, what, name, obj, skip, options): # noqa: ARG001 +def autoapi_skip_member_fn(app, what, name, obj, skip, options) -> bool: # noqa: ARG001 skip_contents = [ # Re-exports ("class", "datafusion.DataFrame"), @@ -93,7 +93,7 @@ def 
autoapi_skip_member_fn(app, what, name, obj, skip, options): # noqa: ARG001 return skip -def setup(sphinx): +def setup(sphinx) -> None: sphinx.connect("autoapi-skip-member", autoapi_skip_member_fn) diff --git a/examples/create-context.py b/examples/create-context.py index 760c8513e..0026d6162 100644 --- a/examples/create-context.py +++ b/examples/create-context.py @@ -25,14 +25,14 @@ runtime = RuntimeEnvBuilder().with_disk_manager_os().with_fair_spill_pool(10000000) config = ( SessionConfig() - .with_create_default_catalog_and_schema(True) + .with_create_default_catalog_and_schema(enabled=True) .with_default_catalog_and_schema("foo", "bar") .with_target_partitions(8) - .with_information_schema(True) - .with_repartition_joins(False) - .with_repartition_aggregations(False) - .with_repartition_windows(False) - .with_parquet_pruning(False) + .with_information_schema(enabled=True) + .with_repartition_joins(enabled=False) + .with_repartition_aggregations(enabled=False) + .with_repartition_windows(enabled=False) + .with_parquet_pruning(enabled=False) .set("datafusion.execution.parquet.pushdown_filters", "true") ) ctx = SessionContext(config, runtime) diff --git a/examples/python-udaf.py b/examples/python-udaf.py index 538f69571..6655edb0a 100644 --- a/examples/python-udaf.py +++ b/examples/python-udaf.py @@ -16,7 +16,7 @@ # under the License. import datafusion -import pyarrow +import pyarrow as pa import pyarrow.compute from datafusion import Accumulator, col, udaf @@ -26,25 +26,21 @@ class MyAccumulator(Accumulator): Interface of a user-defined accumulation. """ - def __init__(self): - self._sum = pyarrow.scalar(0.0) + def __init__(self) -> None: + self._sum = pa.scalar(0.0) - def update(self, values: pyarrow.Array) -> None: + def update(self, values: pa.Array) -> None: # not nice since pyarrow scalars can't be summed yet. This breaks on `None` - self._sum = pyarrow.scalar( - self._sum.as_py() + pyarrow.compute.sum(values).as_py() - ) + self._sum = pa.scalar(self._sum.as_py() + pa.compute.sum(values).as_py()) - def merge(self, states: pyarrow.Array) -> None: + def merge(self, states: pa.Array) -> None: # not nice since pyarrow scalars can't be summed yet. 
This breaks on `None` - self._sum = pyarrow.scalar( - self._sum.as_py() + pyarrow.compute.sum(states).as_py() - ) + self._sum = pa.scalar(self._sum.as_py() + pa.compute.sum(states).as_py()) - def state(self) -> pyarrow.Array: - return pyarrow.array([self._sum.as_py()]) + def state(self) -> pa.Array: + return pa.array([self._sum.as_py()]) - def evaluate(self) -> pyarrow.Scalar: + def evaluate(self) -> pa.Scalar: return self._sum @@ -52,17 +48,17 @@ def evaluate(self) -> pyarrow.Scalar: ctx = datafusion.SessionContext() # create a RecordBatch and a new DataFrame from it -batch = pyarrow.RecordBatch.from_arrays( - [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])], +batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array([4, 5, 6])], names=["a", "b"], ) df = ctx.create_dataframe([[batch]]) my_udaf = udaf( MyAccumulator, - pyarrow.float64(), - pyarrow.float64(), - [pyarrow.float64()], + pa.float64(), + pa.float64(), + [pa.float64()], "stable", ) @@ -70,4 +66,4 @@ def evaluate(self) -> pyarrow.Scalar: result = df.collect()[0] -assert result.column(0) == pyarrow.array([6.0]) +assert result.column(0) == pa.array([6.0]) diff --git a/examples/python-udf-comparisons.py b/examples/python-udf-comparisons.py index c5d5ec8dd..eb0825011 100644 --- a/examples/python-udf-comparisons.py +++ b/examples/python-udf-comparisons.py @@ -112,8 +112,8 @@ def is_of_interest_impl( returnflag_arr: pa.Array, ) -> pa.Array: result = [] - for idx, partkey in enumerate(partkey_arr): - partkey = partkey.as_py() + for idx, partkey_val in enumerate(partkey_arr): + partkey = partkey_val.as_py() suppkey = suppkey_arr[idx].as_py() returnflag = returnflag_arr[idx].as_py() value = (partkey, suppkey, returnflag) @@ -162,10 +162,7 @@ def udf_using_pyarrow_compute_impl( resultant_arr = pc.and_(filtered_partkey_arr, filtered_suppkey_arr) resultant_arr = pc.and_(resultant_arr, filtered_returnflag_arr) - if results is None: - results = resultant_arr - else: - results = pc.or_(results, resultant_arr) + results = resultant_arr if results is None else pc.or_(results, resultant_arr) return results diff --git a/examples/python-udf.py b/examples/python-udf.py index fb2bc253e..1c08acd1a 100644 --- a/examples/python-udf.py +++ b/examples/python-udf.py @@ -15,23 +15,23 @@ # specific language governing permissions and limitations # under the License. -import pyarrow +import pyarrow as pa from datafusion import SessionContext, udf from datafusion import functions as f -def is_null(array: pyarrow.Array) -> pyarrow.Array: +def is_null(array: pa.Array) -> pa.Array: return array.is_null() -is_null_arr = udf(is_null, [pyarrow.int64()], pyarrow.bool_(), "stable") +is_null_arr = udf(is_null, [pa.int64()], pa.bool_(), "stable") # create a context ctx = SessionContext() # create a RecordBatch and a new DataFrame from it -batch = pyarrow.RecordBatch.from_arrays( - [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])], +batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array([4, 5, 6])], names=["a", "b"], ) df = ctx.create_dataframe([[batch]]) @@ -40,4 +40,4 @@ def is_null(array: pyarrow.Array) -> pyarrow.Array: result = df.collect()[0] -assert result.column(0) == pyarrow.array([False] * 3) +assert result.column(0) == pa.array([False] * 3) diff --git a/examples/query-pyarrow-data.py b/examples/query-pyarrow-data.py index e3456fb5b..9cfe8a62b 100644 --- a/examples/query-pyarrow-data.py +++ b/examples/query-pyarrow-data.py @@ -16,15 +16,15 @@ # under the License. 
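# The example edits in this commit apply ruff's ICN001 convention: pyarrow is
# always imported as `import pyarrow as pa`, so the call sites below read
# `pa.array(...)` instead of `pyarrow.array(...)`.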
import datafusion -import pyarrow +import pyarrow as pa from datafusion import col # create a context ctx = datafusion.SessionContext() # create a RecordBatch and a new DataFrame from it -batch = pyarrow.RecordBatch.from_arrays( - [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])], +batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array([4, 5, 6])], names=["a", "b"], ) df = ctx.create_dataframe([[batch]]) @@ -38,5 +38,5 @@ # execute and collect the first (and only) batch result = df.collect()[0] -assert result.column(0) == pyarrow.array([5, 7, 9]) -assert result.column(1) == pyarrow.array([-3, -3, -3]) +assert result.column(0) == pa.array([5, 7, 9]) +assert result.column(1) == pa.array([-3, -3, -3]) diff --git a/examples/sql-using-python-udaf.py b/examples/sql-using-python-udaf.py index 60ab8d134..32ce38900 100644 --- a/examples/sql-using-python-udaf.py +++ b/examples/sql-using-python-udaf.py @@ -25,7 +25,7 @@ class MyAccumulator(Accumulator): Interface of a user-defined accumulation. """ - def __init__(self): + def __init__(self) -> None: self._sum = pa.scalar(0.0) def update(self, values: pa.Array) -> None: diff --git a/examples/tpch/_tests.py b/examples/tpch/_tests.py index 2be4dfabd..80ff80244 100644 --- a/examples/tpch/_tests.py +++ b/examples/tpch/_tests.py @@ -91,7 +91,7 @@ def check_q17(df): ("q22_global_sales_opportunity", "q22"), ], ) -def test_tpch_query_vs_answer_file(query_code: str, answer_file: str): +def test_tpch_query_vs_answer_file(query_code: str, answer_file: str) -> None: module = import_module(query_code) df: DataFrame = module.df @@ -122,3 +122,5 @@ def test_tpch_query_vs_answer_file(query_code: str, answer_file: str): assert df.join(df_expected, on=cols, how="anti").count() == 0 assert df.count() == df_expected.count() + + return None diff --git a/examples/tpch/convert_data_to_parquet.py b/examples/tpch/convert_data_to_parquet.py index 73097fac5..fd0fcca49 100644 --- a/examples/tpch/convert_data_to_parquet.py +++ b/examples/tpch/convert_data_to_parquet.py @@ -25,112 +25,112 @@ import os import datafusion -import pyarrow +import pyarrow as pa ctx = datafusion.SessionContext() all_schemas = {} all_schemas["customer"] = [ - ("C_CUSTKEY", pyarrow.int64()), - ("C_NAME", pyarrow.string()), - ("C_ADDRESS", pyarrow.string()), - ("C_NATIONKEY", pyarrow.int64()), - ("C_PHONE", pyarrow.string()), - ("C_ACCTBAL", pyarrow.decimal128(15, 2)), - ("C_MKTSEGMENT", pyarrow.string()), - ("C_COMMENT", pyarrow.string()), + ("C_CUSTKEY", pa.int64()), + ("C_NAME", pa.string()), + ("C_ADDRESS", pa.string()), + ("C_NATIONKEY", pa.int64()), + ("C_PHONE", pa.string()), + ("C_ACCTBAL", pa.decimal128(15, 2)), + ("C_MKTSEGMENT", pa.string()), + ("C_COMMENT", pa.string()), ] all_schemas["lineitem"] = [ - ("L_ORDERKEY", pyarrow.int64()), - ("L_PARTKEY", pyarrow.int64()), - ("L_SUPPKEY", pyarrow.int64()), - ("L_LINENUMBER", pyarrow.int32()), - ("L_QUANTITY", pyarrow.decimal128(15, 2)), - ("L_EXTENDEDPRICE", pyarrow.decimal128(15, 2)), - ("L_DISCOUNT", pyarrow.decimal128(15, 2)), - ("L_TAX", pyarrow.decimal128(15, 2)), - ("L_RETURNFLAG", pyarrow.string()), - ("L_LINESTATUS", pyarrow.string()), - ("L_SHIPDATE", pyarrow.date32()), - ("L_COMMITDATE", pyarrow.date32()), - ("L_RECEIPTDATE", pyarrow.date32()), - ("L_SHIPINSTRUCT", pyarrow.string()), - ("L_SHIPMODE", pyarrow.string()), - ("L_COMMENT", pyarrow.string()), + ("L_ORDERKEY", pa.int64()), + ("L_PARTKEY", pa.int64()), + ("L_SUPPKEY", pa.int64()), + ("L_LINENUMBER", pa.int32()), + ("L_QUANTITY", pa.decimal128(15, 2)), + 
("L_EXTENDEDPRICE", pa.decimal128(15, 2)), + ("L_DISCOUNT", pa.decimal128(15, 2)), + ("L_TAX", pa.decimal128(15, 2)), + ("L_RETURNFLAG", pa.string()), + ("L_LINESTATUS", pa.string()), + ("L_SHIPDATE", pa.date32()), + ("L_COMMITDATE", pa.date32()), + ("L_RECEIPTDATE", pa.date32()), + ("L_SHIPINSTRUCT", pa.string()), + ("L_SHIPMODE", pa.string()), + ("L_COMMENT", pa.string()), ] all_schemas["nation"] = [ - ("N_NATIONKEY", pyarrow.int64()), - ("N_NAME", pyarrow.string()), - ("N_REGIONKEY", pyarrow.int64()), - ("N_COMMENT", pyarrow.string()), + ("N_NATIONKEY", pa.int64()), + ("N_NAME", pa.string()), + ("N_REGIONKEY", pa.int64()), + ("N_COMMENT", pa.string()), ] all_schemas["orders"] = [ - ("O_ORDERKEY", pyarrow.int64()), - ("O_CUSTKEY", pyarrow.int64()), - ("O_ORDERSTATUS", pyarrow.string()), - ("O_TOTALPRICE", pyarrow.decimal128(15, 2)), - ("O_ORDERDATE", pyarrow.date32()), - ("O_ORDERPRIORITY", pyarrow.string()), - ("O_CLERK", pyarrow.string()), - ("O_SHIPPRIORITY", pyarrow.int32()), - ("O_COMMENT", pyarrow.string()), + ("O_ORDERKEY", pa.int64()), + ("O_CUSTKEY", pa.int64()), + ("O_ORDERSTATUS", pa.string()), + ("O_TOTALPRICE", pa.decimal128(15, 2)), + ("O_ORDERDATE", pa.date32()), + ("O_ORDERPRIORITY", pa.string()), + ("O_CLERK", pa.string()), + ("O_SHIPPRIORITY", pa.int32()), + ("O_COMMENT", pa.string()), ] all_schemas["part"] = [ - ("P_PARTKEY", pyarrow.int64()), - ("P_NAME", pyarrow.string()), - ("P_MFGR", pyarrow.string()), - ("P_BRAND", pyarrow.string()), - ("P_TYPE", pyarrow.string()), - ("P_SIZE", pyarrow.int32()), - ("P_CONTAINER", pyarrow.string()), - ("P_RETAILPRICE", pyarrow.decimal128(15, 2)), - ("P_COMMENT", pyarrow.string()), + ("P_PARTKEY", pa.int64()), + ("P_NAME", pa.string()), + ("P_MFGR", pa.string()), + ("P_BRAND", pa.string()), + ("P_TYPE", pa.string()), + ("P_SIZE", pa.int32()), + ("P_CONTAINER", pa.string()), + ("P_RETAILPRICE", pa.decimal128(15, 2)), + ("P_COMMENT", pa.string()), ] all_schemas["partsupp"] = [ - ("PS_PARTKEY", pyarrow.int64()), - ("PS_SUPPKEY", pyarrow.int64()), - ("PS_AVAILQTY", pyarrow.int32()), - ("PS_SUPPLYCOST", pyarrow.decimal128(15, 2)), - ("PS_COMMENT", pyarrow.string()), + ("PS_PARTKEY", pa.int64()), + ("PS_SUPPKEY", pa.int64()), + ("PS_AVAILQTY", pa.int32()), + ("PS_SUPPLYCOST", pa.decimal128(15, 2)), + ("PS_COMMENT", pa.string()), ] all_schemas["region"] = [ - ("r_REGIONKEY", pyarrow.int64()), - ("r_NAME", pyarrow.string()), - ("r_COMMENT", pyarrow.string()), + ("r_REGIONKEY", pa.int64()), + ("r_NAME", pa.string()), + ("r_COMMENT", pa.string()), ] all_schemas["supplier"] = [ - ("S_SUPPKEY", pyarrow.int64()), - ("S_NAME", pyarrow.string()), - ("S_ADDRESS", pyarrow.string()), - ("S_NATIONKEY", pyarrow.int32()), - ("S_PHONE", pyarrow.string()), - ("S_ACCTBAL", pyarrow.decimal128(15, 2)), - ("S_COMMENT", pyarrow.string()), + ("S_SUPPKEY", pa.int64()), + ("S_NAME", pa.string()), + ("S_ADDRESS", pa.string()), + ("S_NATIONKEY", pa.int32()), + ("S_PHONE", pa.string()), + ("S_ACCTBAL", pa.decimal128(15, 2)), + ("S_COMMENT", pa.string()), ] curr_dir = os.path.dirname(os.path.abspath(__file__)) -for filename, curr_schema in all_schemas.items(): +for filename, curr_schema_val in all_schemas.items(): # For convenience, go ahead and convert the schema column names to lowercase - curr_schema = [(s[0].lower(), s[1]) for s in curr_schema] + curr_schema = [(s[0].lower(), s[1]) for s in curr_schema_val] # Pre-collect the output columns so we can ignore the null field we add # in to handle the trailing | in the file output_cols = [r[0] for r in curr_schema] - 
curr_schema = [pyarrow.field(r[0], r[1], nullable=False) for r in curr_schema] + curr_schema = [pa.field(r[0], r[1], nullable=False) for r in curr_schema] # Trailing | requires extra field for in processing - curr_schema.append(("some_null", pyarrow.null())) + curr_schema.append(("some_null", pa.null())) - schema = pyarrow.schema(curr_schema) + schema = pa.schema(curr_schema) source_file = os.path.abspath( os.path.join(curr_dir, f"../../benchmarks/tpch/data/{filename}.csv") diff --git a/examples/tpch/q08_market_share.py b/examples/tpch/q08_market_share.py index d46df30f2..4bf50efba 100644 --- a/examples/tpch/q08_market_share.py +++ b/examples/tpch/q08_market_share.py @@ -150,7 +150,7 @@ df = df.with_column( "national_volume", F.case(col("s_suppkey").is_null()) - .when(lit(False), col("volume")) + .when(lit(value=False), col("volume")) .otherwise(lit(0.0)), ) diff --git a/examples/tpch/q19_discounted_revenue.py b/examples/tpch/q19_discounted_revenue.py index 2b87e1120..bd492aac0 100644 --- a/examples/tpch/q19_discounted_revenue.py +++ b/examples/tpch/q19_discounted_revenue.py @@ -89,8 +89,8 @@ def is_of_interest( same number of rows in the output. """ result = [] - for idx, brand in enumerate(brand_arr): - brand = brand.as_py() + for idx, brand_val in enumerate(brand_arr): + brand = brand_val.as_py() if brand in items_of_interest: values_of_interest = items_of_interest[brand] diff --git a/examples/tpch/q21_suppliers_kept_orders_waiting.py b/examples/tpch/q21_suppliers_kept_orders_waiting.py index 9bbaad779..619c4406b 100644 --- a/examples/tpch/q21_suppliers_kept_orders_waiting.py +++ b/examples/tpch/q21_suppliers_kept_orders_waiting.py @@ -65,7 +65,7 @@ df = df.with_column( "failed_supp", F.case(col("l_receiptdate") > col("l_commitdate")) - .when(lit(True), col("l_suppkey")) + .when(lit(value=True), col("l_suppkey")) .end(), ) diff --git a/pyproject.toml b/pyproject.toml index a4ed18c4c..d86b657ec 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -80,37 +80,17 @@ ignore = [ "TD003", # Allow TODO lines "UP007", # Disallowing Union is pedantic # TODO: Enable all of the following, but this PR is getting too large already - "PT001", - "ANN204", - "B008", - "EM101", "PLR0913", - "PLR1714", - "ANN201", - "C400", "TRY003", - "B904", - "UP006", - "RUF012", - "FBT003", - "C416", - "SIM102", - "PGH003", "PLR2004", - "PERF401", "PD901", - "EM102", "ERA001", - "SIM108", - "ICN001", "ANN001", "ANN202", "PTH", "N812", "INP001", "DTZ007", - "PLW2901", - "RET503", "RUF015", "A005", "TC001", diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index 286e5dc31..d871fdb71 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -92,17 +92,17 @@ ] -def column(value: str): +def column(value: str) -> Expr: """Create a column expression.""" return Expr.column(value) -def col(value: str): +def col(value: str) -> Expr: """Create a column expression.""" return Expr.column(value) -def literal(value): +def literal(value) -> Expr: """Create a literal expression.""" return Expr.literal(value) @@ -120,6 +120,6 @@ def str_lit(value): return string_literal(value) -def lit(value): +def lit(value) -> Expr: """Create a literal expression.""" return Expr.literal(value) diff --git a/python/datafusion/catalog.py b/python/datafusion/catalog.py index 0560f4704..6c3f188cc 100644 --- a/python/datafusion/catalog.py +++ b/python/datafusion/catalog.py @@ -24,7 +24,7 @@ import datafusion._internal as df_internal if TYPE_CHECKING: - import pyarrow + import pyarrow as pa class Catalog: @@ 
-67,7 +67,7 @@ def __init__(self, table: df_internal.Table) -> None: self.table = table @property - def schema(self) -> pyarrow.Schema: + def schema(self) -> pa.Schema: """Returns the schema associated with this table.""" return self.table.schema diff --git a/python/datafusion/context.py b/python/datafusion/context.py index 58ad9a943..1429a4975 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -40,9 +40,9 @@ if TYPE_CHECKING: import pathlib - import pandas - import polars - import pyarrow + import pandas as pd + import polars as pl + import pyarrow as pa from datafusion.plan import ExecutionPlan, LogicalPlan @@ -537,7 +537,7 @@ def register_listing_table( path: str | pathlib.Path, table_partition_cols: list[tuple[str, str]] | None = None, file_extension: str = ".parquet", - schema: pyarrow.Schema | None = None, + schema: pa.Schema | None = None, file_sort_order: list[list[Expr | SortExpr]] | None = None, ) -> None: """Register multiple files as a single table. @@ -606,14 +606,14 @@ def sql_with_options(self, query: str, options: SQLOptions) -> DataFrame: def create_dataframe( self, - partitions: list[list[pyarrow.RecordBatch]], + partitions: list[list[pa.RecordBatch]], name: str | None = None, - schema: pyarrow.Schema | None = None, + schema: pa.Schema | None = None, ) -> DataFrame: """Create and return a dataframe using the provided partitions. Args: - partitions: :py:class:`pyarrow.RecordBatch` partitions to register. + partitions: :py:class:`pa.RecordBatch` partitions to register. name: Resultant dataframe name. schema: Schema for the partitions. @@ -684,16 +684,14 @@ def from_arrow( return DataFrame(self.ctx.from_arrow(data, name)) @deprecated("Use ``from_arrow`` instead.") - def from_arrow_table( - self, data: pyarrow.Table, name: str | None = None - ) -> DataFrame: + def from_arrow_table(self, data: pa.Table, name: str | None = None) -> DataFrame: """Create a :py:class:`~datafusion.dataframe.DataFrame` from an Arrow table. This is an alias for :py:func:`from_arrow`. """ return self.from_arrow(data, name) - def from_pandas(self, data: pandas.DataFrame, name: str | None = None) -> DataFrame: + def from_pandas(self, data: pd.DataFrame, name: str | None = None) -> DataFrame: """Create a :py:class:`~datafusion.dataframe.DataFrame` from a Pandas DataFrame. Args: @@ -705,7 +703,7 @@ def from_pandas(self, data: pandas.DataFrame, name: str | None = None) -> DataFr """ return DataFrame(self.ctx.from_pandas(data, name)) - def from_polars(self, data: polars.DataFrame, name: str | None = None) -> DataFrame: + def from_polars(self, data: pl.DataFrame, name: str | None = None) -> DataFrame: """Create a :py:class:`~datafusion.dataframe.DataFrame` from a Polars DataFrame. Args: @@ -719,7 +717,7 @@ def from_polars(self, data: polars.DataFrame, name: str | None = None) -> DataFr # https://github.com/apache/datafusion-python/pull/1016#discussion_r1983239116 # is the discussion on how we arrived at adding register_view - def register_view(self, name: str, df: DataFrame): + def register_view(self, name: str, df: DataFrame) -> None: """Register a :py:class: `~datafusion.detaframe.DataFrame` as a view. Args: @@ -755,7 +753,7 @@ def register_table_provider( self.ctx.register_table_provider(name, provider) def register_record_batches( - self, name: str, partitions: list[list[pyarrow.RecordBatch]] + self, name: str, partitions: list[list[pa.RecordBatch]] ) -> None: """Register record batches as a table. 
@@ -776,7 +774,7 @@ def register_parquet( parquet_pruning: bool = True, file_extension: str = ".parquet", skip_metadata: bool = True, - schema: pyarrow.Schema | None = None, + schema: pa.Schema | None = None, file_sort_order: list[list[SortExpr]] | None = None, ) -> None: """Register a Parquet file as a table. @@ -817,7 +815,7 @@ def register_csv( self, name: str, path: str | pathlib.Path | list[str | pathlib.Path], - schema: pyarrow.Schema | None = None, + schema: pa.Schema | None = None, has_header: bool = True, delimiter: str = ",", schema_infer_max_records: int = 1000, @@ -843,10 +841,7 @@ def register_csv( selected for data input. file_compression_type: File compression type. """ - if isinstance(path, list): - path = [str(p) for p in path] - else: - path = str(path) + path = [str(p) for p in path] if isinstance(path, list) else str(path) self.ctx.register_csv( name, @@ -863,7 +858,7 @@ def register_json( self, name: str, path: str | pathlib.Path, - schema: pyarrow.Schema | None = None, + schema: pa.Schema | None = None, schema_infer_max_records: int = 1000, file_extension: str = ".json", table_partition_cols: list[tuple[str, str]] | None = None, @@ -901,7 +896,7 @@ def register_avro( self, name: str, path: str | pathlib.Path, - schema: pyarrow.Schema | None = None, + schema: pa.Schema | None = None, file_extension: str = ".avro", table_partition_cols: list[tuple[str, str]] | None = None, ) -> None: @@ -923,8 +918,8 @@ def register_avro( name, str(path), schema, file_extension, table_partition_cols ) - def register_dataset(self, name: str, dataset: pyarrow.dataset.Dataset) -> None: - """Register a :py:class:`pyarrow.dataset.Dataset` as a table. + def register_dataset(self, name: str, dataset: pa.dataset.Dataset) -> None: + """Register a :py:class:`pa.dataset.Dataset` as a table. Args: name: Name of the table to register. @@ -975,7 +970,7 @@ def session_id(self) -> str: def read_json( self, path: str | pathlib.Path, - schema: pyarrow.Schema | None = None, + schema: pa.Schema | None = None, schema_infer_max_records: int = 1000, file_extension: str = ".json", table_partition_cols: list[tuple[str, str]] | None = None, @@ -1012,7 +1007,7 @@ def read_json( def read_csv( self, path: str | pathlib.Path | list[str] | list[pathlib.Path], - schema: pyarrow.Schema | None = None, + schema: pa.Schema | None = None, has_header: bool = True, delimiter: str = ",", schema_infer_max_records: int = 1000, @@ -1065,7 +1060,7 @@ def read_parquet( parquet_pruning: bool = True, file_extension: str = ".parquet", skip_metadata: bool = True, - schema: pyarrow.Schema | None = None, + schema: pa.Schema | None = None, file_sort_order: list[list[Expr | SortExpr]] | None = None, ) -> DataFrame: """Read a Parquet source into a :py:class:`~datafusion.dataframe.Dataframe`. 
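
The context.py hunks here all follow one pattern: pandas, polars, and pyarrow
are imported only under TYPE_CHECKING, so the ruff pass just renames them to
their conventional aliases. A self-contained sketch of that pattern, where
`describe_schema` is a hypothetical function and not part of this patch:

    from __future__ import annotations

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        import pyarrow as pa  # needed only while type checking

    def describe_schema(schema: pa.Schema | None = None) -> None:
        # With postponed annotation evaluation, `pa.Schema` is never resolved
        # at runtime, so pyarrow is not imported when this module loads.
        print(schema)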
@@ -1110,7 +1105,7 @@ def read_parquet(
     def read_avro(
         self,
         path: str | pathlib.Path,
-        schema: pyarrow.Schema | None = None,
+        schema: pa.Schema | None = None,
         file_partition_cols: list[tuple[str, str]] | None = None,
         file_extension: str = ".avro",
     ) -> DataFrame:
diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py
index d1c71c2bb..26fe8f453 100644
--- a/python/datafusion/dataframe.py
+++ b/python/datafusion/dataframe.py
@@ -26,10 +26,8 @@
     TYPE_CHECKING,
     Any,
     Iterable,
-    List,
     Literal,
     Optional,
-    Type,
     Union,
     overload,
 )
@@ -75,7 +73,7 @@ class Compression(Enum):
     LZ4_RAW = "lz4_raw"
 
     @classmethod
-    def from_str(cls: Type[Compression], value: str) -> Compression:
+    def from_str(cls: type[Compression], value: str) -> Compression:
         """Convert a string to a Compression enum value.
 
         Args:
@@ -89,11 +87,13 @@ def from_str(cls: Type[Compression], value: str) -> Compression:
         """
         try:
             return cls(value.lower())
-        except ValueError:
+        except ValueError as err:
             valid_values = str([item.value for item in Compression])
-            raise ValueError(
-                f"{value} is not a valid Compression. Valid values are: {valid_values}"
-            )
+            error_msg = f"""
+                {value} is not a valid Compression.
+                Valid values are: {valid_values}
+                """
+            raise ValueError(error_msg) from err
 
     def get_default_level(self) -> Optional[int]:
         """Get the default compression level for the compression type.
@@ -132,7 +132,7 @@ def into_view(self) -> pa.Table:
         """Convert DataFrame as a ViewTable which can be used in register_table."""
         return self.df.into_view()
 
-    def __getitem__(self, key: str | List[str]) -> DataFrame:
+    def __getitem__(self, key: str | list[str]) -> DataFrame:
         """Return a new :py:class:`DataFrame` with the specified column or columns.
 
         Args:
@@ -287,8 +287,7 @@ def _simplify_expression(
             if isinstance(expr, Expr):
                 expr_list.append(expr.expr)
             elif isinstance(expr, Iterable):
-                for inner_expr in expr:
-                    expr_list.append(inner_expr.expr)
+                expr_list.extend(inner_expr.expr for inner_expr in expr)
             else:
                 raise NotImplementedError
         if named_exprs:
@@ -513,10 +512,15 @@ def join(
         # This check is to prevent breaking API changes where users prior to
         # DF 43.0.0 would pass the join_keys as a positional argument instead
         # of a keyword argument.
-        if isinstance(on, tuple) and len(on) == 2:
-            if isinstance(on[0], list) and isinstance(on[1], list):
-                join_keys = on  # type: ignore
-                on = None
+        if (
+            isinstance(on, tuple)
+            and len(on) == 2
+            and isinstance(on[0], list)
+            and isinstance(on[1], list)
+        ):
+            # We know this is safe because we've checked the types
+            join_keys = on  # type: ignore[assignment]
+            on = None
 
         if join_keys is not None:
             warnings.warn(
@@ -529,18 +533,17 @@ def join(
 
         if on is not None:
             if left_on is not None or right_on is not None:
-                raise ValueError(
-                    "`left_on` or `right_on` should not provided with `on`"
-                )
+                error_msg = "`left_on` or `right_on` should not be provided with `on`"
+                raise ValueError(error_msg)
             left_on = on
             right_on = on
         elif left_on is not None or right_on is not None:
             if left_on is None or right_on is None:
-                raise ValueError("`left_on` and `right_on` should both be provided.")
+                error_msg = "`left_on` and `right_on` should both be provided."
+                raise ValueError(error_msg)
         else:
-            raise ValueError(
-                "either `on` or `left_on` and `right_on` should be provided."
-            )
+            error_msg = "either `on` or `left_on` and `right_on` should be provided."
+            raise ValueError(error_msg)
         if isinstance(left_on, str):
             left_on = [left_on]
         if isinstance(right_on, str):
             right_on = [right_on]
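The validation restructured above enforces that callers pick exactly one way to spell the join keys. A hedged sketch of the two accepted call styles, assuming `left_df` and `right_df` are existing DataFrames with illustrative column names:

```python
# Same key name on both sides
joined = left_df.join(right_df, on="id", how="inner")

# Different key names per side
joined = left_df.join(right_df, left_on="id", right_on="user_id", how="left")

# Passing both `on` and `left_on`/`right_on` raises the ValueError above
```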
@@ -726,9 +729,11 @@ def write_parquet(
         if isinstance(compression, str):
             compression = Compression.from_str(compression)
 
-        if compression in {Compression.GZIP, Compression.BROTLI, Compression.ZSTD}:
-            if compression_level is None:
-                compression_level = compression.get_default_level()
+        if (
+            compression in {Compression.GZIP, Compression.BROTLI, Compression.ZSTD}
+            and compression_level is None
+        ):
+            compression_level = compression.get_default_level()
 
         self.df.write_parquet(str(path), compression.value, compression_level)
 
@@ -824,7 +829,7 @@ def unnest_columns(self, *columns: str, preserve_nulls: bool = True) -> DataFram
         Returns:
             A DataFrame with the columns expanded.
         """
-        columns = [c for c in columns]
+        columns = list(columns)
         return DataFrame(self.df.unnest_columns(columns, preserve_nulls=preserve_nulls))
 
     def __arrow_c_stream__(self, requested_schema: pa.Schema) -> Any:
diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py
index 77b6c272d..2697d8143 100644
--- a/python/datafusion/expr.py
+++ b/python/datafusion/expr.py
@@ -22,7 +22,7 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any, Optional, Type
+from typing import TYPE_CHECKING, Any, ClassVar, Optional
 
 import pyarrow as pa
 
@@ -176,7 +176,7 @@ def sort_or_default(e: Expr | SortExpr) -> expr_internal.SortExpr:
     """Helper function to return a default Sort if an Expr is provided."""
     if isinstance(e, SortExpr):
         return e.raw_sort
-    return SortExpr(e, True, True).raw_sort
+    return SortExpr(e, ascending=True, nulls_first=True).raw_sort
 
 
 def sort_list_to_raw_sort_list(
@@ -439,24 +439,21 @@ def fill_null(self, value: Any | Expr | None = None) -> Expr:
             value = Expr.literal(value)
         return Expr(functions_internal.nvl(self.expr, value.expr))
 
-    _to_pyarrow_types = {
+    _to_pyarrow_types: ClassVar[dict[type, pa.DataType]] = {
         float: pa.float64(),
         int: pa.int64(),
         str: pa.string(),
         bool: pa.bool_(),
     }
 
-    def cast(
-        self, to: pa.DataType[Any] | Type[float] | Type[int] | Type[str] | Type[bool]
-    ) -> Expr:
+    def cast(self, to: pa.DataType[Any] | type[float | int | str | bool]) -> Expr:
         """Cast to a new data type."""
         if not isinstance(to, pa.DataType):
             try:
                 to = self._to_pyarrow_types[to]
-            except KeyError:
-                raise TypeError(
-                    "Expected instance of pyarrow.DataType or builtins.type"
-                )
+            except KeyError as err:
+                error_msg = "Expected instance of pyarrow.DataType or builtins.type"
+                raise TypeError(error_msg) from err
 
         return Expr(self.expr.cast(to))
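`Expr.cast` accepts plain Python types as shorthand for the Arrow types in `_to_pyarrow_types`. A quick sketch of the equivalence, with an illustrative column name:

```python
import pyarrow as pa
from datafusion import col

col("price").cast(float)         # shorthand, resolved via _to_pyarrow_types
col("price").cast(pa.float64())  # explicit Arrow type, same result
```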
@@ -565,9 +562,7 @@ def partition_by(self, *partition_by: Expr) -> ExprFuncBuilder:
         set parameters for either window or aggregate functions. If used on any other
         type of expression, an error will be generated when ``build()`` is called.
         """
-        return ExprFuncBuilder(
-            self.expr.partition_by(list(e.expr for e in partition_by))
-        )
+        return ExprFuncBuilder(self.expr.partition_by([e.expr for e in partition_by]))
 
     def window_frame(self, window_frame: WindowFrame) -> ExprFuncBuilder:
         """Set the frame for a window function.
@@ -610,7 +605,7 @@ def over(self, window: Window) -> Expr:
 
 
 class ExprFuncBuilder:
-    def __init__(self, builder: expr_internal.ExprFuncBuilder):
+    def __init__(self, builder: expr_internal.ExprFuncBuilder) -> None:
         self.builder = builder
 
     def order_by(self, *exprs: Expr) -> ExprFuncBuilder:
@@ -638,7 +633,7 @@ def null_treatment(self, null_treatment: NullTreatment) -> ExprFuncBuilder:
     def partition_by(self, *partition_by: Expr) -> ExprFuncBuilder:
         """Set partitioning for window functions."""
         return ExprFuncBuilder(
-            self.builder.partition_by(list(e.expr for e in partition_by))
+            self.builder.partition_by([e.expr for e in partition_by])
         )
 
     def window_frame(self, window_frame: WindowFrame) -> ExprFuncBuilder:
@@ -693,11 +688,11 @@ def __init__(
         """
         if not isinstance(start_bound, pa.Scalar) and start_bound is not None:
             start_bound = pa.scalar(start_bound)
-            if units == "rows" or units == "groups":
+            if units in ("rows", "groups"):
                 start_bound = start_bound.cast(pa.uint64())
         if not isinstance(end_bound, pa.Scalar) and end_bound is not None:
             end_bound = pa.scalar(end_bound)
-            if units == "rows" or units == "groups":
+            if units in ("rows", "groups"):
                 end_bound = end_bound.cast(pa.uint64())
         self.window_frame = expr_internal.WindowFrame(units, start_bound, end_bound)
 
@@ -709,7 +704,7 @@ def get_lower_bound(self) -> WindowFrameBound:
         """Returns starting bound."""
         return WindowFrameBound(self.window_frame.get_lower_bound())
 
-    def get_upper_bound(self):
+    def get_upper_bound(self) -> WindowFrameBound:
         """Returns end bound."""
         return WindowFrameBound(self.window_frame.get_upper_bound())
 
diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py
index 26bac149c..5cf914e16 100644
--- a/python/datafusion/functions.py
+++ b/python/datafusion/functions.py
@@ -790,10 +790,7 @@ def regexp_count(
     """
     if flags is not None:
         flags = flags.expr
-    if start is not None:
-        start = start.expr
-    else:
-        start = Expr.expr
+    start = start.expr if start is not None else Expr.expr
     return Expr(f.regexp_count(string.expr, pattern.expr, start, flags))
 
 
@@ -817,13 +814,15 @@ def right(string: Expr, n: Expr) -> Expr:
     return Expr(f.right(string.expr, n.expr))
 
 
-def round(value: Expr, decimal_places: Expr = Expr.literal(0)) -> Expr:
+def round(value: Expr, decimal_places: Expr | None = None) -> Expr:
     """Round the argument to the nearest integer.
 
     If the optional ``decimal_places`` is specified, round to the nearest number of
    decimal places. You can specify a negative number of decimal places. For example
     ``round(lit(125.2345), lit(-2))`` would yield a value of ``100.0``.
     """
+    if decimal_places is None:
+        decimal_places = Expr.literal(0)
     return Expr(f.round(value.expr, decimal_places.expr))
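The `round` change above is worth a note: Python evaluates default argument values once at import time, so the old signature baked a single shared `Expr` object into the function (the pattern ruff flags as a function call in a default). The `None` sentinel defers construction to call time without changing the API:

```python
from datafusion import functions as f, lit

f.round(lit(125.2345))           # defaults to 0 decimal places
f.round(lit(125.2345), lit(-2))  # rounds to the nearest hundred -> 100.0
```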
diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py
index 161e1e3bb..37f2075f5 100644
--- a/python/tests/test_functions.py
+++ b/python/tests/test_functions.py
@@ -81,7 +81,7 @@ def test_literal(df):
         literal("1"),
         literal("OK"),
         literal(3.14),
-        literal(True),
+        literal(value=True),
         literal(b"hello world"),
     )
     result = df.collect()
diff --git a/python/tests/test_wrapper_coverage.py b/python/tests/test_wrapper_coverage.py
index a2de2d32b..926a65961 100644
--- a/python/tests/test_wrapper_coverage.py
+++ b/python/tests/test_wrapper_coverage.py
@@ -28,7 +28,7 @@
     from enum import EnumMeta as EnumType
 
 
-def missing_exports(internal_obj, wrapped_obj) -> None:  # noqa: C901
+def missing_exports(internal_obj, wrapped_obj) -> None:
     """
     Identify if any of the rust exposed structs or functions do not have wrappers.
@@ -56,9 +56,8 @@ def missing_exports(internal_obj, wrapped_obj) -> None:  # noqa: C901
     # __kwdefaults__ and __doc__. As long as these are None on the internal
     # object, it's okay to skip them. However if they do exist on the internal
     # object they must also exist on the wrapped object.
-        if internal_attr is not None:
-            if wrapped_attr is None:
-                pytest.fail(f"Missing attribute: {internal_attr_name}")
+        if internal_attr is not None and wrapped_attr is None:
+            pytest.fail(f"Missing attribute: {internal_attr_name}")
 
         if internal_attr_name in ["__self__", "__class__"]:
             continue

From 42982dad27ad03e7e9395d4c3ae3064c2b489434 Mon Sep 17 00:00:00 2001
From: Tim Saucer
Date: Sat, 22 Mar 2025 10:14:55 -0400
Subject: [PATCH 20/22] Improve collection during repr and repr_html (#1036)

* Improve table readout of a dataframe in Jupyter notebooks by making the table scrollable and displaying the first record batch up to 2MB

* Add an option to display only a portion of a cell's data, with a button the user can click to toggle showing more or less

* We cannot expect that the first non-empty batch is sufficient for our 2MB limit, so switch over to collecting until we run out or use up the size

* Update python unit test to allow the additional formatting data to exist and only check the table contents

* Combine collection for repr and repr_html into one function

* Small clippy suggestion

* Collect was occurring twice on repr

* Switch to execute_stream_partitioned
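Roughly, the collection heuristic is: keep pulling record batches until a minimum row count is met, then stop once a byte budget or a row cap is exceeded. As an illustrative Python sketch of that idea only (the real implementation is the Rust helper in this patch, which additionally slices batches to fit the budget):

```python
MAX_BYTES = 2 * 1024 * 1024  # 2 MB display budget
MIN_ROWS = 20

def collect_for_display(batches, max_rows):
    """Toy model: each batch exposes .num_rows and .nbytes."""
    it = iter(batches)
    shown, rows, size = [], 0, 0
    while (size < MAX_BYTES and rows < max_rows) or rows < MIN_ROWS:
        batch = next(it, None)
        if batch is None:
            return shown, False  # stream exhausted, nothing truncated
        shown.append(batch)
        rows += batch.num_rows
        size += batch.nbytes
    # Anything left on the stream means the readout is truncated
    return shown, next(it, None) is not None
```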
---
 python/tests/test_dataframe.py |  23 ++--
 src/dataframe.rs               | 240 ++++++++++++++++++++++++++++-----
 src/utils.rs                   |   2 +-
 3 files changed, 225 insertions(+), 40 deletions(-)

diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py
index 384b17878..718ebf69d 100644
--- a/python/tests/test_dataframe.py
+++ b/python/tests/test_dataframe.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 import os
+import re
 from typing import Any
 
 import pyarrow as pa
@@ -1245,13 +1246,17 @@ def add_with_parameter(df_internal, value: Any) -> DataFrame:
 
 def test_dataframe_repr_html(df) -> None:
     output = df._repr_html_()
 
-    ref_html = """<table border='1'>
-<tr><th>a</td><th>b</td><th>c</td></tr>
-<tr><td>1</td><td>4</td><td>8</td></tr>
-<tr><td>2</td><td>5</td><td>5</td></tr>
-<tr><td>3</td><td>6</td><td>8</td></tr>
-</table>
- """ + # Since we've added a fair bit of processing to the html output, lets just verify + # the values we are expecting in the table exist. Use regex and ignore everything + # between the and . We also don't want the closing > on the + # td and th segments because that is where the formatting data is written. - # Ignore whitespace just to make this test look cleaner - assert output.replace(" ", "") == ref_html.replace(" ", "") + headers = ["a", "b", "c"] + headers = [f"{v}" for v in headers] + header_pattern = "(.*?)".join(headers) + assert len(re.findall(header_pattern, output, re.DOTALL)) == 1 + + body_data = [[1, 4, 8], [2, 5, 5], [3, 6, 8]] + body_lines = [f"{v}" for inner in body_data for v in inner] + body_pattern = "(.*?)".join(body_lines) + assert len(re.findall(body_pattern, output, re.DOTALL)) == 1 diff --git a/src/dataframe.rs b/src/dataframe.rs index 243e2e14f..be10b8c28 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -31,9 +31,11 @@ use datafusion::common::UnnestOptions; use datafusion::config::{CsvOptions, TableParquetOptions}; use datafusion::dataframe::{DataFrame, DataFrameWriteOptions}; use datafusion::datasource::TableProvider; +use datafusion::error::DataFusionError; use datafusion::execution::SendableRecordBatchStream; use datafusion::parquet::basic::{BrotliLevel, Compression, GzipLevel, ZstdLevel}; use datafusion::prelude::*; +use futures::{StreamExt, TryStreamExt}; use pyo3::exceptions::PyValueError; use pyo3::prelude::*; use pyo3::pybacked::PyBackedStr; @@ -70,6 +72,9 @@ impl PyTableProvider { PyTable::new(table_provider) } } +const MAX_TABLE_BYTES_TO_DISPLAY: usize = 2 * 1024 * 1024; // 2 MB +const MIN_TABLE_ROWS_TO_DISPLAY: usize = 20; +const MAX_LENGTH_CELL_WITHOUT_MINIMIZE: usize = 25; /// A PyDataFrame is a representation of a logical plan and an API to compose statements. /// Use it to build a plan and `.collect()` to execute the plan and collect the result. @@ -111,56 +116,151 @@ impl PyDataFrame { } fn __repr__(&self, py: Python) -> PyDataFusionResult { - let df = self.df.as_ref().clone().limit(0, Some(10))?; - let batches = wait_for_future(py, df.collect())?; - let batches_as_string = pretty::pretty_format_batches(&batches); - match batches_as_string { - Ok(batch) => Ok(format!("DataFrame()\n{batch}")), - Err(err) => Ok(format!("Error: {:?}", err.to_string())), + let (batches, has_more) = wait_for_future( + py, + collect_record_batches_to_display(self.df.as_ref().clone(), 10, 10), + )?; + if batches.is_empty() { + // This should not be reached, but do it for safety since we index into the vector below + return Ok("No data to display".to_string()); } - } - fn _repr_html_(&self, py: Python) -> PyDataFusionResult { - let mut html_str = "\n".to_string(); + let batches_as_displ = + pretty::pretty_format_batches(&batches).map_err(py_datafusion_err)?; + + let additional_str = match has_more { + true => "\nData truncated.", + false => "", + }; - let df = self.df.as_ref().clone().limit(0, Some(10))?; - let batches = wait_for_future(py, df.collect())?; + Ok(format!("DataFrame()\n{batches_as_displ}{additional_str}")) + } + fn _repr_html_(&self, py: Python) -> PyDataFusionResult { + let (batches, has_more) = wait_for_future( + py, + collect_record_batches_to_display( + self.df.as_ref().clone(), + MIN_TABLE_ROWS_TO_DISPLAY, + usize::MAX, + ), + )?; if batches.is_empty() { - html_str.push_str("
\n"); - return Ok(html_str); + // This should not be reached, but do it for safety since we index into the vector below + return Ok("No data to display".to_string()); } + let table_uuid = uuid::Uuid::new_v4().to_string(); + + let mut html_str = " + + +
+ + \n".to_string(); + let schema = batches[0].schema(); let mut header = Vec::new(); for field in schema.fields() { - header.push(format!("", field.name())); } let header_str = header.join(""); - html_str.push_str(&format!("{}\n", header_str)); - - for batch in batches { - let formatters = batch - .columns() - .iter() - .map(|c| ArrayFormatter::try_new(c.as_ref(), &FormatOptions::default())) - .map(|c| { - c.map_err(|e| PyValueError::new_err(format!("Error: {:?}", e.to_string()))) - }) - .collect::, _>>()?; - - for row in 0..batch.num_rows() { + html_str.push_str(&format!("{}\n", header_str)); + + let batch_formatters = batches + .iter() + .map(|batch| { + batch + .columns() + .iter() + .map(|c| ArrayFormatter::try_new(c.as_ref(), &FormatOptions::default())) + .map(|c| { + c.map_err(|e| PyValueError::new_err(format!("Error: {:?}", e.to_string()))) + }) + .collect::, _>>() + }) + .collect::, _>>()?; + + let rows_per_batch = batches.iter().map(|batch| batch.num_rows()); + + // We need to build up row by row for html + let mut table_row = 0; + for (batch_formatter, num_rows_in_batch) in batch_formatters.iter().zip(rows_per_batch) { + for batch_row in 0..num_rows_in_batch { + table_row += 1; let mut cells = Vec::new(); - for formatter in &formatters { - cells.push(format!("", formatter.value(row))); + for (col, formatter) in batch_formatter.iter().enumerate() { + let cell_data = formatter.value(batch_row).to_string(); + // From testing, primitive data types do not typically get larger than 21 characters + if cell_data.len() > MAX_LENGTH_CELL_WITHOUT_MINIMIZE { + let short_cell_data = &cell_data[0..MAX_LENGTH_CELL_WITHOUT_MINIMIZE]; + cells.push(format!(" + ")); + } else { + cells.push(format!("", formatter.value(batch_row))); + } } let row_str = cells.join(""); html_str.push_str(&format!("{}\n", row_str)); } } + html_str.push_str("
{}", field.name())); + header.push(format!("{}
{} +
+ {short_cell_data} + {cell_data} + +
+
{}
\n"); + + html_str.push_str(" + + "); - html_str.push_str("\n"); + if has_more { + html_str.push_str("Data truncated due to size."); + } Ok(html_str) } @@ -771,3 +871,83 @@ fn record_batch_into_schema( RecordBatch::try_new(schema, data_arrays) } + +/// This is a helper function to return the first non-empty record batch from executing a DataFrame. +/// It additionally returns a bool, which indicates if there are more record batches available. +/// We do this so we can determine if we should indicate to the user that the data has been +/// truncated. This collects until we have achived both of these two conditions +/// +/// - We have collected our minimum number of rows +/// - We have reached our limit, either data size or maximum number of rows +/// +/// Otherwise it will return when the stream has exhausted. If you want a specific number of +/// rows, set min_rows == max_rows. +async fn collect_record_batches_to_display( + df: DataFrame, + min_rows: usize, + max_rows: usize, +) -> Result<(Vec, bool), DataFusionError> { + let partitioned_stream = df.execute_stream_partitioned().await?; + let mut stream = futures::stream::iter(partitioned_stream).flatten(); + let mut size_estimate_so_far = 0; + let mut rows_so_far = 0; + let mut record_batches = Vec::default(); + let mut has_more = false; + + while (size_estimate_so_far < MAX_TABLE_BYTES_TO_DISPLAY && rows_so_far < max_rows) + || rows_so_far < min_rows + { + let mut rb = match stream.next().await { + None => { + break; + } + Some(Ok(r)) => r, + Some(Err(e)) => return Err(e), + }; + + let mut rows_in_rb = rb.num_rows(); + if rows_in_rb > 0 { + size_estimate_so_far += rb.get_array_memory_size(); + + if size_estimate_so_far > MAX_TABLE_BYTES_TO_DISPLAY { + let ratio = MAX_TABLE_BYTES_TO_DISPLAY as f32 / size_estimate_so_far as f32; + let total_rows = rows_in_rb + rows_so_far; + + let mut reduced_row_num = (total_rows as f32 * ratio).round() as usize; + if reduced_row_num < min_rows { + reduced_row_num = min_rows.min(total_rows); + } + + let limited_rows_this_rb = reduced_row_num - rows_so_far; + if limited_rows_this_rb < rows_in_rb { + rows_in_rb = limited_rows_this_rb; + rb = rb.slice(0, limited_rows_this_rb); + has_more = true; + } + } + + if rows_in_rb + rows_so_far > max_rows { + rb = rb.slice(0, max_rows - rows_so_far); + has_more = true; + } + + rows_so_far += rb.num_rows(); + record_batches.push(rb); + } + } + + if record_batches.is_empty() { + return Ok((Vec::default(), false)); + } + + if !has_more { + // Data was not already truncated, so check to see if more record batches remain + has_more = match stream.try_next().await { + Ok(None) => false, // reached end + Ok(Some(_)) => true, + Err(_) => false, // Stream disconnected + }; + } + + Ok((record_batches, has_more)) +} diff --git a/src/utils.rs b/src/utils.rs index 999aad755..3487de21b 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -42,7 +42,7 @@ pub(crate) fn get_tokio_runtime() -> &'static TokioRuntime { #[inline] pub(crate) fn get_global_ctx() -> &'static SessionContext { static CTX: OnceLock = OnceLock::new(); - CTX.get_or_init(|| SessionContext::new()) + CTX.get_or_init(SessionContext::new) } /// Utility to collect rust futures with GIL released From d0315ffa704aba467f769f444208b7ce26d83037 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Sat, 22 Mar 2025 14:37:24 -0400 Subject: [PATCH 21/22] feat: Update DataFusion dependency to 46 (#1079) * Update DataFusion dependency to 46 * There was an update upstream in the exec but it is not a breaking change and only needs unit test 
updates --- Cargo.lock | 296 +++++++++++++++++++-------------- Cargo.toml | 18 +- python/tests/test_dataframe.py | 3 +- src/expr.rs | 39 +++-- src/expr/aggregate.rs | 10 +- src/expr/aggregate_expr.rs | 11 +- src/expr/window.rs | 24 ++- src/functions.rs | 34 ++-- 8 files changed, 252 insertions(+), 183 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5c7f2bf3c..3a4915f23 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -179,9 +179,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "54.2.0" +version = "54.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "755b6da235ac356a869393c23668c663720b8749dd6f15e52b6c214b4b964cc7" +checksum = "84ef243634a39fb6e9d1710737e7a5ef96c9bacabd2326859ff889bc9ef755e5" dependencies = [ "arrow-arith", "arrow-array", @@ -201,9 +201,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "54.2.0" +version = "54.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64656a1e0b13ca766f8440752e9a93e11014eec7b67909986f83ed0ab1fe37b8" +checksum = "8f420c6aef51dad2e4a96ce29c0ec90ad84880bdb60b321c74c652a6be07b93f" dependencies = [ "arrow-array", "arrow-buffer", @@ -215,9 +215,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "54.2.0" +version = "54.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57a4a6d2896083cfbdf84a71a863b22460d0708f8206a8373c52e326cc72ea1a" +checksum = "24bda5ff6461a4ff9739959b3d57b377f45e3f878f7be1a4f28137c0a8f339fa" dependencies = [ "ahash", "arrow-buffer", @@ -232,9 +232,9 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "54.2.0" +version = "54.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cef870583ce5e4f3b123c181706f2002fb134960f9a911900f64ba4830c7a43a" +checksum = "bc6ed265c73f134a583d02c3cab5e16afab9446d8048ede8707e31f85fad58a0" dependencies = [ "bytes", "half", @@ -243,9 +243,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "54.2.0" +version = "54.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ac7eba5a987f8b4a7d9629206ba48e19a1991762795bbe5d08497b7736017ee" +checksum = "01c648572391edcef10e5fd458db70ba27ed6f71bcaee04397d0cfb100b34f8b" dependencies = [ "arrow-array", "arrow-buffer", @@ -264,9 +264,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "54.2.0" +version = "54.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90f12542b8164398fc9ec595ff783c4cf6044daa89622c5a7201be920e4c0d4c" +checksum = "a02fb265a6d8011a7d3ad1a36f25816ad0a3bb04cb8e9fe7929c165b98c0cbcd" dependencies = [ "arrow-array", "arrow-cast", @@ -280,9 +280,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "54.2.0" +version = "54.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b095e8a4f3c309544935d53e04c3bfe4eea4e71c3de6fe0416d1f08bb4441a83" +checksum = "5f2cebf504bb6a92a134a87fff98f01b14fbb3a93ecf7aef90cd0f888c5fffa4" dependencies = [ "arrow-buffer", "arrow-schema", @@ -292,9 +292,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "54.2.0" +version = "54.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65c63da4afedde2b25ef69825cd4663ca76f78f79ffe2d057695742099130ff6" +checksum = "8e6405b287671c88846e7751f7291f717b164911474cabac6d3d8614d5aa7374" dependencies = [ "arrow-array", "arrow-buffer", @@ -306,9 +306,9 @@ dependencies = [ [[package]] name = 
"arrow-json" -version = "54.2.0" +version = "54.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9551d9400532f23a370cabbea1dc5a53c49230397d41f96c4c8eedf306199305" +checksum = "5329bf9e7390cbb6b117ddd4d82e94c5362ea4cab5095697139429f36a38350c" dependencies = [ "arrow-array", "arrow-buffer", @@ -319,16 +319,18 @@ dependencies = [ "half", "indexmap", "lexical-core", + "memchr", "num", "serde", "serde_json", + "simdutf8", ] [[package]] name = "arrow-ord" -version = "54.2.0" +version = "54.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c07223476f8219d1ace8cd8d85fa18c4ebd8d945013f25ef5c72e85085ca4ee" +checksum = "e103c13d4b80da28339c1d7aa23dd85bd59f42158acc45d39eeb6770627909ce" dependencies = [ "arrow-array", "arrow-buffer", @@ -339,9 +341,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "54.2.0" +version = "54.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91b194b38bfd89feabc23e798238989c6648b2506ad639be42ec8eb1658d82c4" +checksum = "170549a11b8534f3097a0619cfe89c42812345dc998bcf81128fc700b84345b8" dependencies = [ "arrow-array", "arrow-buffer", @@ -352,18 +354,18 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "54.2.0" +version = "54.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f40f6be8f78af1ab610db7d9b236e21d587b7168e368a36275d2e5670096735" +checksum = "a5c53775bba63f319189f366d2b86e9a8889373eb198f07d8544938fc9f8ed9a" dependencies = [ "bitflags 2.8.0", ] [[package]] name = "arrow-select" -version = "54.2.0" +version = "54.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac265273864a820c4a179fc67182ccc41ea9151b97024e1be956f0f2369c2539" +checksum = "0a99003b2eb562b8d9c99dfb672306f15e94b20d3734179d596895703e821dcf" dependencies = [ "ahash", "arrow-array", @@ -375,9 +377,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "54.2.0" +version = "54.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d44c8eed43be4ead49128370f7131f054839d3d6003e52aebf64322470b8fbd0" +checksum = "90fdb130ee8325f4cd8262e19bb6baa3cbcef2b2573c4bee8c6fda7ea08199d7" dependencies = [ "arrow-array", "arrow-buffer", @@ -535,9 +537,9 @@ dependencies = [ [[package]] name = "blake3" -version = "1.5.5" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8ee0c1824c4dea5b5f81736aff91bae041d2c07ee1192bec91054e10e3e601e" +checksum = "b17679a8d69b6d7fd9cd9801a536cec9fa5e5970b69f9d4747f70b39b031f5e7" dependencies = [ "arrayref", "arrayvec", @@ -649,15 +651,15 @@ checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" [[package]] name = "chrono" -version = "0.4.39" +version = "0.4.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e36cc9d416881d2e24f9a963be5fb1cd90966419ac844274161d10488b3e825" +checksum = "1a7964611d71df112cb1730f2ee67324fcf4d0fc6606acbbe9bfe06df124637c" dependencies = [ "android-tzdata", "iana-time-zone", "num-traits", "serde", - "windows-targets", + "windows-link", ] [[package]] @@ -864,30 +866,32 @@ dependencies = [ [[package]] name = "datafusion" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eae420e7a5b0b7f1c39364cc76cbcd0f5fdc416b2514ae3847c2676bbd60702a" +checksum = "914e6f9525599579abbd90b0f7a55afcaaaa40350b9e9ed52563f126dfe45fd3" dependencies = [ "apache-avro", "arrow", - "arrow-array", 
"arrow-ipc", "arrow-schema", - "async-compression", "async-trait", "bytes", "bzip2 0.5.1", "chrono", "datafusion-catalog", + "datafusion-catalog-listing", "datafusion-common", "datafusion-common-runtime", + "datafusion-datasource", "datafusion-execution", "datafusion-expr", + "datafusion-expr-common", "datafusion-functions", "datafusion-functions-aggregate", "datafusion-functions-nested", "datafusion-functions-table", "datafusion-functions-window", + "datafusion-macros", "datafusion-optimizer", "datafusion-physical-expr", "datafusion-physical-expr-common", @@ -896,7 +900,6 @@ dependencies = [ "datafusion-sql", "flate2", "futures", - "glob", "itertools 0.14.0", "log", "num-traits", @@ -908,7 +911,6 @@ dependencies = [ "sqlparser", "tempfile", "tokio", - "tokio-util", "url", "uuid", "xz2", @@ -917,9 +919,9 @@ dependencies = [ [[package]] name = "datafusion-catalog" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f27987bc22b810939e8dfecc55571e9d50355d6ea8ec1c47af8383a76a6d0e1" +checksum = "998a6549e6ee4ee3980e05590b2960446a56b343ea30199ef38acd0e0b9036e2" dependencies = [ "arrow", "async-trait", @@ -933,22 +935,40 @@ dependencies = [ "itertools 0.14.0", "log", "parking_lot", - "sqlparser", +] + +[[package]] +name = "datafusion-catalog-listing" +version = "46.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5ac10096a5b3c0d8a227176c0e543606860842e943594ccddb45cf42a526e43" +dependencies = [ + "arrow", + "async-trait", + "datafusion-catalog", + "datafusion-common", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "futures", + "log", + "object_store", + "tokio", ] [[package]] name = "datafusion-common" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3f6d5b8c9408cc692f7c194b8aa0c0f9b253e065a8d960ad9cdc2a13e697602" +checksum = "1f53d7ec508e1b3f68bd301cee3f649834fad51eff9240d898a4b2614cfd0a7a" dependencies = [ "ahash", "apache-avro", "arrow", - "arrow-array", - "arrow-buffer", "arrow-ipc", - "arrow-schema", "base64 0.22.1", "half", "hashbrown 0.14.5", @@ -966,25 +986,59 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d4603c8e8a4baf77660ab7074cc66fc15cc8a18f2ce9dfadb755fc6ee294e48" +checksum = "e0fcf41523b22e14cc349b01526e8b9f59206653037f2949a4adbfde5f8cb668" dependencies = [ "log", "tokio", ] +[[package]] +name = "datafusion-datasource" +version = "46.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf7f37ad8b6e88b46c7eeab3236147d32ea64b823544f498455a8d9042839c92" +dependencies = [ + "arrow", + "async-compression", + "async-trait", + "bytes", + "bzip2 0.5.1", + "chrono", + "datafusion-catalog", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "flate2", + "futures", + "glob", + "itertools 0.14.0", + "log", + "object_store", + "rand", + "tokio", + "tokio-util", + "url", + "xz2", + "zstd", +] + [[package]] name = "datafusion-doc" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"e5bf4bc68623a5cf231eed601ed6eb41f46a37c4d15d11a0bff24cbc8396cd66" +checksum = "7db7a0239fd060f359dc56c6e7db726abaa92babaed2fb2e91c3a8b2fff8b256" [[package]] name = "datafusion-execution" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88b491c012cdf8e051053426013429a76f74ee3c2db68496c79c323ca1084d27" +checksum = "0938f9e5b6bc5782be4111cdfb70c02b7b5451bf34fd57e4de062a7f7c4e31f1" dependencies = [ "arrow", "dashmap", @@ -1001,9 +1055,9 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5a181408d4fc5dc22f9252781a8f39f2d0e5d1b33ec9bde242844980a2689c1" +checksum = "b36c28b00b00019a8695ad7f1a53ee1673487b90322ecbd604e2cf32894eb14f" dependencies = [ "arrow", "chrono", @@ -1022,26 +1076,25 @@ dependencies = [ [[package]] name = "datafusion-expr-common" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d1129b48e8534d8c03c6543bcdccef0b55c8ac0c1272a15a56c67068b6eb1885" +checksum = "18f0a851a436c5a2139189eb4617a54e6a9ccb9edc96c4b3c83b3bb7c58b950e" dependencies = [ "arrow", "datafusion-common", + "indexmap", "itertools 0.14.0", "paste", ] [[package]] name = "datafusion-ffi" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff47a79d442207c168c6e3e1d970c248589c148e4800e5b285ac1b2cb1a230f8" +checksum = "d740dd9f32a4f4ed1b907e6934201bb059efe6c877532512c661771d973c7b21" dependencies = [ "abi_stable", "arrow", - "arrow-array", - "arrow-schema", "async-ffi", "async-trait", "datafusion", @@ -1055,9 +1108,9 @@ dependencies = [ [[package]] name = "datafusion-functions" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6125874e4856dfb09b59886784fcb74cde5cfc5930b3a80a1a728ef7a010df6b" +checksum = "e3196e37d7b65469fb79fee4f05e5bb58a456831035f9a38aa5919aeb3298d40" dependencies = [ "arrow", "arrow-buffer", @@ -1071,7 +1124,6 @@ dependencies = [ "datafusion-expr", "datafusion-expr-common", "datafusion-macros", - "hashbrown 0.14.5", "hex", "itertools 0.14.0", "log", @@ -1085,14 +1137,12 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3add7b1d3888e05e7c95f2b281af900ca69ebdcb21069ba679b33bde8b3b9d6" +checksum = "adfc2d074d5ee4d9354fdcc9283d5b2b9037849237ddecb8942a29144b77ca05" dependencies = [ "ahash", "arrow", - "arrow-buffer", - "arrow-schema", "datafusion-common", "datafusion-doc", "datafusion-execution", @@ -1108,9 +1158,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate-common" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e18baa4cfc3d2f144f74148ed68a1f92337f5072b6dde204a0dbbdf3324989c" +checksum = "1cbceba0f98d921309a9121b702bcd49289d383684cccabf9a92cda1602f3bbb" dependencies = [ "ahash", "arrow", @@ -1121,15 +1171,12 @@ dependencies = [ [[package]] name = "datafusion-functions-nested" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ec5ee8cecb0dc370291279673097ddabec03a011f73f30d7f1096457127e03e" +checksum = "170e27ce4baa27113ddf5f77f1a7ec484b0dbeda0c7abbd4bad3fc609c8ab71a" dependencies = [ "arrow", - "arrow-array", - 
"arrow-buffer", "arrow-ord", - "arrow-schema", "datafusion-common", "datafusion-doc", "datafusion-execution", @@ -1145,9 +1192,9 @@ dependencies = [ [[package]] name = "datafusion-functions-table" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c403ddd473bbb0952ba880008428b3c7febf0ed3ce1eec35a205db20efb2a36" +checksum = "7d3a06a7f0817ded87b026a437e7e51de7f59d48173b0a4e803aa896a7bd6bb5" dependencies = [ "arrow", "async-trait", @@ -1161,9 +1208,9 @@ dependencies = [ [[package]] name = "datafusion-functions-window" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ab18c2fb835614d06a75f24a9e09136d3a8c12a92d97c95a6af316a1787a9c5" +checksum = "d6c608b66496a1e05e3d196131eb9bebea579eed1f59e88d962baf3dda853bc6" dependencies = [ "datafusion-common", "datafusion-doc", @@ -1178,9 +1225,9 @@ dependencies = [ [[package]] name = "datafusion-functions-window-common" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a77b73bc15e7d1967121fdc7a55d819bfb9d6c03766a6c322247dce9094a53a4" +checksum = "da2f9d83348957b4ad0cd87b5cb9445f2651863a36592fe5484d43b49a5f8d82" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", @@ -1188,9 +1235,9 @@ dependencies = [ [[package]] name = "datafusion-macros" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09369b8d962291e808977cf94d495fd8b5b38647232d7ef562c27ac0f495b0af" +checksum = "4800e1ff7ecf8f310887e9b54c9c444b8e215ccbc7b21c2f244cfae373b1ece7" dependencies = [ "datafusion-expr", "quote", @@ -1199,9 +1246,9 @@ dependencies = [ [[package]] name = "datafusion-optimizer" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2403a7e4a84637f3de7d8d4d7a9ccc0cc4be92d89b0161ba3ee5be82f0531c54" +checksum = "971c51c54cd309001376fae752fb15a6b41750b6d1552345c46afbfb6458801b" dependencies = [ "arrow", "chrono", @@ -1218,15 +1265,12 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86ff72ac702b62dbf2650c4e1d715ebd3e4aab14e3885e72e8549e250307347c" +checksum = "e1447c2c6bc8674a16be4786b4abf528c302803fafa186aa6275692570e64d85" dependencies = [ "ahash", "arrow", - "arrow-array", - "arrow-buffer", - "arrow-schema", "datafusion-common", "datafusion-expr", "datafusion-expr-common", @@ -1243,13 +1287,12 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60982b7d684e25579ee29754b4333057ed62e2cc925383c5f0bd8cab7962f435" +checksum = "69f8c25dcd069073a75b3d2840a79d0f81e64bdd2c05f2d3d18939afb36a7dcb" dependencies = [ "ahash", "arrow", - "arrow-buffer", "datafusion-common", "datafusion-expr-common", "hashbrown 0.14.5", @@ -1258,12 +1301,11 @@ dependencies = [ [[package]] name = "datafusion-physical-optimizer" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac5e85c189d5238a5cf181a624e450c4cd4c66ac77ca551d6f3ff9080bac90bb" +checksum = "68da5266b5b9847c11d1b3404ee96b1d423814e1973e1ad3789131e5ec912763" dependencies = [ "arrow", - "arrow-schema", "datafusion-common", "datafusion-execution", "datafusion-expr", @@ 
-1271,23 +1313,19 @@ dependencies = [ "datafusion-physical-expr", "datafusion-physical-expr-common", "datafusion-physical-plan", - "futures", "itertools 0.14.0", "log", "recursive", - "url", ] [[package]] name = "datafusion-physical-plan" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c36bf163956d7e2542657c78b3383fdc78f791317ef358a359feffcdb968106f" +checksum = "88cc160df00e413e370b3b259c8ea7bfbebc134d32de16325950e9e923846b7f" dependencies = [ "ahash", "arrow", - "arrow-array", - "arrow-buffer", "arrow-ord", "arrow-schema", "async-trait", @@ -1312,9 +1350,9 @@ dependencies = [ [[package]] name = "datafusion-proto" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2db5d79f0c974041787b899d24dc91bdab2ff112d1942dd71356a4ce3b407e6c" +checksum = "6f6ef4c6eb52370cb48639e25e2331a415aac0b2b0a0a472b36e26603bdf184f" dependencies = [ "arrow", "chrono", @@ -1328,9 +1366,9 @@ dependencies = [ [[package]] name = "datafusion-proto-common" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de21bde1603aac0ff32cf478e47081be6e3583c6861fe8f57034da911efe7578" +checksum = "5faf4a9bbb0d0a305fea8a6db21ba863286b53e53a212e687d2774028dd6f03f" dependencies = [ "arrow", "datafusion-common", @@ -1362,13 +1400,11 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e13caa4daede211ecec53c78b13c503b592794d125f9a3cc3afe992edf9e7f43" +checksum = "325a212b67b677c0eb91447bf9a11b630f9fc4f62d8e5d145bf859f5a6b29e64" dependencies = [ "arrow", - "arrow-array", - "arrow-schema", "bigdecimal", "datafusion-common", "datafusion-expr", @@ -1381,11 +1417,10 @@ dependencies = [ [[package]] name = "datafusion-substrait" -version = "45.0.0" +version = "46.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1634405abd8bd3c64c352f2da2f2aec6d80a815930257e0db0ce4ff5daf00944" +checksum = "2c2be3226a683e02cff65181e66e62eba9f812ed0e9b7ec8fe11ac8dabf1a73f" dependencies = [ - "arrow-buffer", "async-recursion", "async-trait", "chrono", @@ -1395,6 +1430,7 @@ dependencies = [ "pbjson-types", "prost", "substrait", + "tokio", "url", ] @@ -1472,9 +1508,9 @@ dependencies = [ [[package]] name = "flate2" -version = "1.0.35" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c936bfdafb507ebbf50b8074c54fa31c5be9a1e7e5f467dd659697041407d07c" +checksum = "11faaf5a5236997af9848be0bef4db95824b1d534ebc64d0f0c6cf3e67bd38dc" dependencies = [ "crc32fast", "miniz_oxide", @@ -2117,9 +2153,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.169" +version = "0.2.171" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" +checksum = "c19937216e9d3aa9956d9bb8dfc0b0c8beb6058fc4f7a4dc4d850edf86a237d6" [[package]] name = "libflate" @@ -2447,9 +2483,9 @@ dependencies = [ [[package]] name = "parquet" -version = "54.1.0" +version = "54.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a01a0efa30bbd601ae85b375c728efdb211ade54390281628a7b16708beb235" +checksum = "94243778210509a5a5e9e012872127180c155d73a9cd6e2df9243d213e81e100" dependencies = [ "ahash", "arrow-array", @@ -2479,7 +2515,6 @@ dependencies = [ "tokio", "twox-hash", "zstd", - "zstd-sys", ] 
[[package]] @@ -3401,11 +3436,12 @@ dependencies = [ [[package]] name = "sqlparser" -version = "0.53.0" +version = "0.54.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05a528114c392209b3264855ad491fcce534b94a38771b0a0b97a79379275ce8" +checksum = "c66e3b7374ad4a6af849b08b3e7a6eda0edbd82f0fd59b57e22671bf16979899" dependencies = [ "log", + "recursive", "sqlparser_derive", ] @@ -3466,9 +3502,9 @@ dependencies = [ [[package]] name = "substrait" -version = "0.52.3" +version = "0.53.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5db15789cecbfdf6b1fcf2db807e767c92273bdc407ac057c2194b070c597756" +checksum = "6fac3d70185423235f37b889764e184b81a5af4bb7c95833396ee9bd92577e1b" dependencies = [ "heck", "pbjson", @@ -3922,12 +3958,14 @@ checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" [[package]] name = "uuid" -version = "1.13.1" +version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ced87ca4be083373936a67f8de945faa23b6b42384bd5b64434850802c6dccd0" +checksum = "458f7a779bf54acc9f347480ac654f68407d3aab21269a6e3c9f922acd9e2da9" dependencies = [ "getrandom 0.3.1", + "js-sys", "serde", + "wasm-bindgen", ] [[package]] @@ -4114,6 +4152,12 @@ dependencies = [ "windows-targets", ] +[[package]] +name = "windows-link" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76840935b766e1b0a05c0066835fb9ec80071d4c09a16f6bd5f7e655e3c14c38" + [[package]] name = "windows-registry" version = "0.2.0" diff --git a/Cargo.toml b/Cargo.toml index 50967a219..8afabdd82 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,24 +34,24 @@ protoc = [ "datafusion-substrait/protoc" ] substrait = ["dep:datafusion-substrait"] [dependencies] -tokio = { version = "1.42", features = ["macros", "rt", "rt-multi-thread", "sync"] } +tokio = { version = "1.43", features = ["macros", "rt", "rt-multi-thread", "sync"] } pyo3 = { version = "0.23", features = ["extension-module", "abi3", "abi3-py39"] } pyo3-async-runtimes = { version = "0.23", features = ["tokio-runtime"]} -arrow = { version = "54", features = ["pyarrow"] } -datafusion = { version = "45.0.0", features = ["avro", "unicode_expressions"] } -datafusion-substrait = { version = "45.0.0", optional = true } -datafusion-proto = { version = "45.0.0" } -datafusion-ffi = { version = "45.0.0" } -prost = "0.13" # keep in line with `datafusion-substrait` +arrow = { version = "54.2.1", features = ["pyarrow"] } +datafusion = { version = "46.0.1", features = ["avro", "unicode_expressions"] } +datafusion-substrait = { version = "46.0.1", optional = true } +datafusion-proto = { version = "46.0.1" } +datafusion-ffi = { version = "46.0.1" } +prost = "0.13.1" # keep in line with `datafusion-substrait` uuid = { version = "1.12", features = ["v4"] } mimalloc = { version = "0.1", optional = true, default-features = false, features = ["local_dynamic_tls"] } -async-trait = "0.1" +async-trait = "0.1.73" futures = "0.3" object_store = { version = "0.11.0", features = ["aws", "gcp", "azure", "http"] } url = "2" [build-dependencies] -prost-types = "0.13" # keep in line with `datafusion-substrait` +prost-types = "0.13.1" # keep in line with `datafusion-substrait` pyo3-build-config = "0.23" [lib] diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 718ebf69d..eda13930d 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -753,7 +753,8 @@ def test_execution_plan(aggregate_df): 
assert "AggregateExec:" in indent assert "CoalesceBatchesExec:" in indent assert "RepartitionExec:" in indent - assert "CsvExec:" in indent + assert "DataSourceExec:" in indent + assert "file_type=csv" in indent ctx = SessionContext() rows_returned = 0 diff --git a/src/expr.rs b/src/expr.rs index d3c528eb4..561170289 100644 --- a/src/expr.rs +++ b/src/expr.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +use datafusion::logical_expr::expr::{AggregateFunctionParams, WindowFunctionParams}; use datafusion::logical_expr::utils::exprlist_to_fields; use datafusion::logical_expr::{ ExprFuncBuilder, ExprFunctionExt, LogicalPlan, WindowFunctionDefinition, @@ -172,6 +173,7 @@ impl PyExpr { Expr::ScalarSubquery(value) => { Ok(scalar_subquery::PyScalarSubquery::from(value.clone()).into_bound_py_any(py)?) } + #[allow(deprecated)] Expr::Wildcard { qualifier, options } => Err(py_unsupported_variant_err(format!( "Converting Expr::Wildcard to a Python object is not implemented : {:?} {:?}", qualifier, options @@ -332,7 +334,6 @@ impl PyExpr { | Expr::AggregateFunction { .. } | Expr::WindowFunction { .. } | Expr::InList { .. } - | Expr::Wildcard { .. } | Expr::Exists { .. } | Expr::InSubquery { .. } | Expr::GroupingSet(..) @@ -346,6 +347,10 @@ impl PyExpr { | Expr::Unnest(_) | Expr::IsNotUnknown(_) => RexType::Call, Expr::ScalarSubquery(..) => RexType::ScalarSubquery, + #[allow(deprecated)] + Expr::Wildcard { .. } => { + return Err(py_unsupported_variant_err("Expr::Wildcard is unsupported")) + } }) } @@ -394,11 +399,15 @@ impl PyExpr { | Expr::InSubquery(InSubquery { expr, .. }) => Ok(vec![PyExpr::from(*expr.clone())]), // Expr variants containing a collection of Expr(s) for operands - Expr::AggregateFunction(AggregateFunction { args, .. }) + Expr::AggregateFunction(AggregateFunction { + params: AggregateFunctionParams { args, .. }, + .. + }) | Expr::ScalarFunction(ScalarFunction { args, .. }) - | Expr::WindowFunction(WindowFunction { args, .. }) => { - Ok(args.iter().map(|arg| PyExpr::from(arg.clone())).collect()) - } + | Expr::WindowFunction(WindowFunction { + params: WindowFunctionParams { args, .. }, + .. + }) => Ok(args.iter().map(|arg| PyExpr::from(arg.clone())).collect()), // Expr(s) that require more specific processing Expr::Case(Case { @@ -465,13 +474,17 @@ impl PyExpr { Expr::GroupingSet(..) | Expr::Unnest(_) | Expr::OuterReferenceColumn(_, _) - | Expr::Wildcard { .. } | Expr::ScalarSubquery(..) | Expr::Placeholder { .. } | Expr::Exists { .. } => Err(py_runtime_err(format!( "Unimplemented Expr type: {}", self.expr ))), + + #[allow(deprecated)] + Expr::Wildcard { .. } => { + Err(py_unsupported_variant_err("Expr::Wildcard is unsupported")) + } } } @@ -575,7 +588,7 @@ impl PyExpr { Expr::AggregateFunction(agg_fn) => { let window_fn = Expr::WindowFunction(WindowFunction::new( WindowFunctionDefinition::AggregateUDF(agg_fn.func.clone()), - agg_fn.args.clone(), + agg_fn.params.args.clone(), )); add_builder_fns_to_window( @@ -663,16 +676,8 @@ impl PyExpr { /// Create a [Field] representing an [Expr], given an input [LogicalPlan] to resolve against pub fn expr_to_field(expr: &Expr, input_plan: &LogicalPlan) -> PyDataFusionResult> { - match expr { - Expr::Wildcard { .. 
-            // Since * could be any of the valid column names just return the first one
-            Ok(Arc::new(input_plan.schema().field(0).clone()))
-        }
-        _ => {
-            let fields = exprlist_to_fields(&[expr.clone()], input_plan)?;
-            Ok(fields[0].1.clone())
-        }
-    }
+    let fields = exprlist_to_fields(&[expr.clone()], input_plan)?;
+    Ok(fields[0].1.clone())
 }
 
 fn _types(expr: &Expr) -> PyResult<DataTypeMap> {
     match expr {
diff --git a/src/expr/aggregate.rs b/src/expr/aggregate.rs
index 8fc9da5b0..a99d83d23 100644
--- a/src/expr/aggregate.rs
+++ b/src/expr/aggregate.rs
@@ -16,7 +16,7 @@
 // under the License.
 
 use datafusion::common::DataFusionError;
-use datafusion::logical_expr::expr::{AggregateFunction, Alias};
+use datafusion::logical_expr::expr::{AggregateFunction, AggregateFunctionParams, Alias};
 use datafusion::logical_expr::logical_plan::Aggregate;
 use datafusion::logical_expr::Expr;
 use pyo3::{prelude::*, IntoPyObjectExt};
@@ -126,9 +126,11 @@ impl PyAggregate {
         match expr {
             // TODO: This Alias logic seems to be returning some strange results that we should investigate
             Expr::Alias(Alias { expr, .. }) => self._aggregation_arguments(expr.as_ref()),
-            Expr::AggregateFunction(AggregateFunction { func: _, args, .. }) => {
-                Ok(args.iter().map(|e| PyExpr::from(e.clone())).collect())
-            }
+            Expr::AggregateFunction(AggregateFunction {
+                func: _,
+                params: AggregateFunctionParams { args, .. },
+                ..
+            }) => Ok(args.iter().map(|e| PyExpr::from(e.clone())).collect()),
             _ => Err(py_type_err(
                 "Encountered a non Aggregate type in aggregation_arguments",
             )),
diff --git a/src/expr/aggregate_expr.rs b/src/expr/aggregate_expr.rs
index 09471097f..c09f116e3 100644
--- a/src/expr/aggregate_expr.rs
+++ b/src/expr/aggregate_expr.rs
@@ -40,7 +40,13 @@ impl From<AggregateFunction> for PyAggregateFunction {
 
 impl Display for PyAggregateFunction {
     fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
-        let args: Vec<String> = self.aggr.args.iter().map(|expr| expr.to_string()).collect();
+        let args: Vec<String> = self
+            .aggr
+            .params
+            .args
+            .iter()
+            .map(|expr| expr.to_string())
+            .collect();
         write!(f, "{}({})", self.aggr.func.name(), args.join(", "))
     }
 }
@@ -54,12 +60,13 @@ impl PyAggregateFunction {
 
     /// is this a distinct aggregate such as `COUNT(DISTINCT expr)`
     fn is_distinct(&self) -> bool {
-        self.aggr.distinct
+        self.aggr.params.distinct
     }
 
     /// Get the arguments to the aggregate function
     fn args(&self) -> Vec<PyExpr> {
         self.aggr
+            .params
             .args
             .iter()
             .map(|expr| PyExpr::from(expr.clone()))
diff --git a/src/expr/window.rs b/src/expr/window.rs
index 13deaec25..c5467bf94 100644
--- a/src/expr/window.rs
+++ b/src/expr/window.rs
@@ -16,7 +16,7 @@
 // under the License.
 
 use datafusion::common::{DataFusionError, ScalarValue};
-use datafusion::logical_expr::expr::WindowFunction;
+use datafusion::logical_expr::expr::{WindowFunction, WindowFunctionParams};
 use datafusion::logical_expr::{Expr, Window, WindowFrame, WindowFrameBound, WindowFrameUnits};
 use pyo3::{prelude::*, IntoPyObjectExt};
 use std::fmt::{self, Display, Formatter};
@@ -118,7 +118,10 @@ impl PyWindowExpr {
     /// Returns order by columns in a window function expression
     pub fn get_sort_exprs(&self, expr: PyExpr) -> PyResult<Vec<PySortExpr>> {
         match expr.expr.unalias() {
-            Expr::WindowFunction(WindowFunction { order_by, .. }) => py_sort_expr_list(&order_by),
+            Expr::WindowFunction(WindowFunction {
+                params: WindowFunctionParams { order_by, .. },
+                ..
+            }) => py_sort_expr_list(&order_by),
             other => Err(not_window_function_err(other)),
         }
     }
 
@@ -126,9 +129,10 @@
     /// Return partition by columns in a window function expression
     pub fn get_partition_exprs(&self, expr: PyExpr) -> PyResult<Vec<PyExpr>> {
         match expr.expr.unalias() {
-            Expr::WindowFunction(WindowFunction { partition_by, .. }) => {
-                py_expr_list(&partition_by)
-            }
+            Expr::WindowFunction(WindowFunction {
+                params: WindowFunctionParams { partition_by, .. },
+                ..
+            }) => py_expr_list(&partition_by),
             other => Err(not_window_function_err(other)),
         }
     }
 
@@ -136,7 +140,10 @@
     /// Return input args for window function
     pub fn get_args(&self, expr: PyExpr) -> PyResult<Vec<PyExpr>> {
         match expr.expr.unalias() {
-            Expr::WindowFunction(WindowFunction { args, .. }) => py_expr_list(&args),
+            Expr::WindowFunction(WindowFunction {
+                params: WindowFunctionParams { args, .. },
+                ..
+            }) => py_expr_list(&args),
             other => Err(not_window_function_err(other)),
         }
     }
@@ -152,7 +159,10 @@
     /// Returns a PyWindowFrame for a given window function expression
     pub fn get_frame(&self, expr: PyExpr) -> Option<PyWindowFrame> {
         match expr.expr.unalias() {
-            Expr::WindowFunction(WindowFunction { window_frame, .. }) => Some(window_frame.into()),
+            Expr::WindowFunction(WindowFunction {
+                params: WindowFunctionParams { window_frame, .. },
+                ..
+            }) => Some(window_frame.into()),
             _ => None,
         }
     }
diff --git a/src/functions.rs b/src/functions.rs
index 8fac239b4..9c406b95a 100644
--- a/src/functions.rs
+++ b/src/functions.rs
@@ -17,6 +17,7 @@
 
 use datafusion::functions_aggregate::all_default_aggregate_functions;
 use datafusion::functions_window::all_default_window_functions;
+use datafusion::logical_expr::expr::WindowFunctionParams;
 use datafusion::logical_expr::ExprFunctionExt;
 use datafusion::logical_expr::WindowFrame;
 use pyo3::{prelude::*, wrap_pyfunction};
@@ -215,10 +216,7 @@
 #[pyfunction]
 fn col(name: &str) -> PyResult<PyExpr> {
     Ok(PyExpr {
-        expr: datafusion::logical_expr::Expr::Column(Column {
-            relation: None,
-            name: name.to_string(),
-        }),
+        expr: datafusion::logical_expr::Expr::Column(Column::new_unqualified(name)),
     })
 }
 
@@ -333,19 +331,21 @@ fn window(
     Ok(PyExpr {
         expr: datafusion::logical_expr::Expr::WindowFunction(WindowFunction {
             fun,
-            args: args.into_iter().map(|x| x.expr).collect::<Vec<_>>(),
-            partition_by: partition_by
-                .unwrap_or_default()
-                .into_iter()
-                .map(|x| x.expr)
-                .collect::<Vec<_>>(),
-            order_by: order_by
-                .unwrap_or_default()
-                .into_iter()
-                .map(|x| x.into())
-                .collect::<Vec<_>>(),
-            window_frame,
-            null_treatment: None,
+            params: WindowFunctionParams {
+                args: args.into_iter().map(|x| x.expr).collect::<Vec<_>>(),
+                partition_by: partition_by
+                    .unwrap_or_default()
+                    .into_iter()
+                    .map(|x| x.expr)
+                    .collect::<Vec<_>>(),
+                order_by: order_by
+                    .unwrap_or_default()
+                    .into_iter()
+                    .map(|x| x.into())
+                    .collect::<Vec<_>>(),
+                window_frame,
+                null_treatment: None,
+            },
         }),
     })
 }

From 1aa9cb98bc2cc070048027ac96658c69958db516 Mon Sep 17 00:00:00 2001
From: Tim Saucer
Date: Sat, 22 Mar 2025 14:46:09 -0400
Subject: [PATCH 22/22] Update changelog and version number

---
 Cargo.lock              |  2 +-
 Cargo.toml              |  2 +-
 dev/changelog/46.0.0.md | 73 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 75 insertions(+), 2 deletions(-)
 create mode 100644 dev/changelog/46.0.0.md

diff --git a/Cargo.lock b/Cargo.lock
index 3a4915f23..f90038c50 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1377,7 +1377,7 @@ dependencies = [
 
 [[package]]
 name = "datafusion-python"
-version = "45.2.0"
+version = "46.0.0"
"46.0.0" dependencies = [ "arrow", "async-trait", diff --git a/Cargo.toml b/Cargo.toml index 8afabdd82..bc8639d4c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "datafusion-python" -version = "45.2.0" +version = "46.0.0" homepage = "https://datafusion.apache.org/python" repository = "https://github.com/apache/datafusion-python" authors = ["Apache DataFusion "] diff --git a/dev/changelog/46.0.0.md b/dev/changelog/46.0.0.md new file mode 100644 index 000000000..3e5768099 --- /dev/null +++ b/dev/changelog/46.0.0.md @@ -0,0 +1,73 @@ + + +# Apache DataFusion Python 46.0.0 Changelog + +This release consists of 21 commits from 11 contributors. See credits at the end of this changelog for more information. + +**Implemented enhancements:** + +- feat: reads using global ctx [#982](https://github.com/apache/datafusion-python/pull/982) (ion-elgreco) +- feat: Implementation of udf and udaf decorator [#1040](https://github.com/apache/datafusion-python/pull/1040) (CrystalZhou0529) +- feat: expose regex_count function [#1066](https://github.com/apache/datafusion-python/pull/1066) (nirnayroy) +- feat: Update DataFusion dependency to 46 [#1079](https://github.com/apache/datafusion-python/pull/1079) (timsaucer) + +**Fixed bugs:** + +- fix: add to_timestamp_nanos [#1020](https://github.com/apache/datafusion-python/pull/1020) (chenkovsky) +- fix: type checking [#993](https://github.com/apache/datafusion-python/pull/993) (chenkovsky) + +**Other:** + +- [infra] Fail Clippy on rust build warnings [#1029](https://github.com/apache/datafusion-python/pull/1029) (kevinjqliu) +- Add user documentation for the FFI approach [#1031](https://github.com/apache/datafusion-python/pull/1031) (timsaucer) +- build(deps): bump arrow from 54.1.0 to 54.2.0 [#1035](https://github.com/apache/datafusion-python/pull/1035) (dependabot[bot]) +- Chore: Release datafusion-python 45 [#1024](https://github.com/apache/datafusion-python/pull/1024) (timsaucer) +- Enable Dataframe to be converted into views which can be used in register_table [#1016](https://github.com/apache/datafusion-python/pull/1016) (kosiew) +- Add ruff check for missing futures import [#1052](https://github.com/apache/datafusion-python/pull/1052) (timsaucer) +- Enable take comments to assign issues to users [#1058](https://github.com/apache/datafusion-python/pull/1058) (timsaucer) +- Update python min version to 3.9 [#1043](https://github.com/apache/datafusion-python/pull/1043) (kevinjqliu) +- feat/improve ruff test coverage [#1055](https://github.com/apache/datafusion-python/pull/1055) (timsaucer) +- feat/making global context accessible for users [#1060](https://github.com/apache/datafusion-python/pull/1060) (jsai28) +- Renaming Internal Structs [#1059](https://github.com/apache/datafusion-python/pull/1059) (Spaarsh) +- test: add pytest asyncio tests [#1063](https://github.com/apache/datafusion-python/pull/1063) (jsai28) +- Add decorator for udwf [#1061](https://github.com/apache/datafusion-python/pull/1061) (kosiew) +- Add additional ruff suggestions [#1062](https://github.com/apache/datafusion-python/pull/1062) (Spaarsh) +- Improve collection during repr and repr_html [#1036](https://github.com/apache/datafusion-python/pull/1036) (timsaucer) + +## Credits + +Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor. 
+ +``` + 7 Tim Saucer + 2 Kevin Liu + 2 Spaarsh + 2 jsai28 + 2 kosiew + 1 Chen Chongchen + 1 Chongchen Chen + 1 Crystal Zhou + 1 Ion Koutsouris + 1 Nirnay Roy + 1 dependabot[bot] +``` + +Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release. +