datafusion/dataframe/mod.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! [`DataFrame`] API for building and executing query plans.
19
20#[cfg(feature = "parquet")]
21mod parquet;
22
23use crate::arrow::record_batch::RecordBatch;
24use crate::arrow::util::pretty;
25use crate::datasource::file_format::csv::CsvFormatFactory;
26use crate::datasource::file_format::format_as_file_type;
27use crate::datasource::file_format::json::JsonFormatFactory;
28use crate::datasource::{
29 DefaultTableSource, MemTable, TableProvider, provider_as_source,
30};
31use crate::error::Result;
32use crate::execution::FunctionRegistry;
33use crate::execution::context::{SessionState, TaskContext};
34use crate::logical_expr::utils::find_window_exprs;
35use crate::logical_expr::{
36 Expr, JoinType, LogicalPlan, LogicalPlanBuilder, LogicalPlanBuilderOptions,
37 Partitioning, TableType, col, ident,
38};
39use crate::physical_plan::{
40 ExecutionPlan, SendableRecordBatchStream, collect, collect_partitioned,
41 execute_stream, execute_stream_partitioned,
42};
43use crate::prelude::SessionContext;
44use std::any::Any;
45use std::borrow::Cow;
46use std::collections::{HashMap, HashSet};
47use std::sync::Arc;
48
49use arrow::array::{Array, ArrayRef, Int64Array, StringArray};
50use arrow::compute::{cast, concat};
51use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
52use arrow_schema::FieldRef;
53use datafusion_common::config::{CsvOptions, JsonOptions};
54use datafusion_common::{
55 Column, DFSchema, DataFusionError, ParamValues, ScalarValue, SchemaError,
56 TableReference, UnnestOptions, exec_err, internal_datafusion_err, not_impl_err,
57 plan_datafusion_err, plan_err, unqualified_field_not_found,
58};
59use datafusion_expr::select_expr::SelectExpr;
60use datafusion_expr::{
61 ExplainOption, SortExpr, TableProviderFilterPushDown, UNNAMED_TABLE, case,
62 dml::InsertOp,
63 expr::{Alias, ScalarFunction},
64 is_null, lit,
65 utils::COUNT_STAR_EXPANSION,
66};
67use datafusion_functions::core::coalesce;
68use datafusion_functions_aggregate::expr_fn::{
69 avg, count, max, median, min, stddev, sum,
70};
71
72use async_trait::async_trait;
73use datafusion_catalog::Session;
74
75/// Contains options that control how data is
76/// written out from a DataFrame
77pub struct DataFrameWriteOptions {
78 /// Controls how new data should be written to the table, determining whether
79 /// to append, overwrite, or replace existing data.
80 insert_op: InsertOp,
81 /// Controls if all partitions should be coalesced into a single output file.
82 /// - `None`: Use automatic mode (extension-based heuristic)
83 /// - `Some(true)`: Force single file output at exact path
84 /// - `Some(false)`: Force directory output with generated filenames
85 single_file_output: Option<bool>,
86 /// Sets which columns should be used for hive-style partitioned writes by name.
87 /// Can be set to empty vec![] for non-partitioned writes.
88 partition_by: Vec<String>,
89 /// Sets which columns should be used for sorting the output by name.
90 /// Can be set to empty vec![] for non-sorted writes.
91 sort_by: Vec<SortExpr>,
92}
93
94impl DataFrameWriteOptions {
95 /// Create a new DataFrameWriteOptions with default values
96 pub fn new() -> Self {
97 DataFrameWriteOptions {
98 insert_op: InsertOp::Append,
99 single_file_output: None,
100 partition_by: vec![],
101 sort_by: vec![],
102 }
103 }
104
105 /// Set the insert operation
106 pub fn with_insert_operation(mut self, insert_op: InsertOp) -> Self {
107 self.insert_op = insert_op;
108 self
109 }
110
111 /// Set the single_file_output value to true or false
112 ///
113 /// - `true`: Force single file output at the exact path specified
114 /// - `false`: Force directory output with generated filenames
115 ///
116 /// When not called, automatic mode is used (extension-based heuristic).
117 /// When set to true, an output file will always be created even if the DataFrame is empty.
118 pub fn with_single_file_output(mut self, single_file_output: bool) -> Self {
119 self.single_file_output = Some(single_file_output);
120 self
121 }
122
123 /// Sets the partition_by columns for output partitioning
124 pub fn with_partition_by(mut self, partition_by: Vec<String>) -> Self {
125 self.partition_by = partition_by;
126 self
127 }
128
129 /// Sets the sort_by columns for output sorting
130 pub fn with_sort_by(mut self, sort_by: Vec<SortExpr>) -> Self {
131 self.sort_by = sort_by;
132 self
133 }
134
135 /// Build the options HashMap to pass to CopyTo for sink configuration.
136 fn build_sink_options(&self) -> HashMap<String, String> {
137 let mut options = HashMap::new();
138 if let Some(single_file) = self.single_file_output {
139 options.insert("single_file_output".to_string(), single_file.to_string());
140 }
141 options
142 }
143}
144
145impl Default for DataFrameWriteOptions {
146 fn default() -> Self {
147 Self::new()
148 }
149}
150
/// Represents a logical set of rows with the same named columns.
///
/// Similar to a [Pandas DataFrame] or [Spark DataFrame], a DataFusion DataFrame
/// represents a 2 dimensional table of rows and columns.
///
/// The typical workflow using DataFrames looks like
///
/// 1. Create a DataFrame via methods on [SessionContext], such as [`read_csv`]
///    and [`read_parquet`].
///
/// 2. Build a desired calculation by calling methods such as [`filter`],
///    [`select`], [`aggregate`], and [`limit`]
///
/// 3. Execute into [`RecordBatch`]es by calling [`collect`]
///
/// A `DataFrame` is a wrapper around a [`LogicalPlan`] and the [`SessionState`]
/// required for execution.
///
/// DataFrames are "lazy" in the sense that most methods do not actually compute
/// anything, they just build up a plan. Calling [`collect`] executes the plan
/// using the same DataFusion planning and execution process used to execute SQL
/// and other queries.
///
/// [Pandas DataFrame]: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
/// [Spark DataFrame]: https://spark.apache.org/docs/latest/sql-programming-guide.html
/// [`read_csv`]: SessionContext::read_csv
/// [`read_parquet`]: SessionContext::read_parquet
/// [`filter`]: DataFrame::filter
/// [`select`]: DataFrame::select
/// [`aggregate`]: DataFrame::aggregate
/// [`limit`]: DataFrame::limit
/// [`collect`]: DataFrame::collect
///
/// # Example
/// ```
/// # use std::sync::Arc;
/// # use datafusion::prelude::*;
/// # use datafusion::error::Result;
/// # use datafusion::functions_aggregate::expr_fn::min;
/// # use datafusion::arrow::array::{Int32Array, RecordBatch, StringArray};
/// # use datafusion::arrow::datatypes::{DataType, Field, Schema};
/// # #[tokio::main]
/// # async fn main() -> Result<()> {
/// let ctx = SessionContext::new();
/// // Read the data from a csv file
/// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
/// // create a new dataframe that computes the equivalent of
/// // `SELECT a, MIN(b) FROM df WHERE a <= b GROUP BY a LIMIT 100;`
/// let df = df.filter(col("a").lt_eq(col("b")))?
///     .aggregate(vec![col("a")], vec![min(col("b"))])?
///     .limit(0, Some(100))?;
/// // Perform the actual computation (`collect` is async, so it only runs
/// // once it is awaited)
/// let results = df.collect().await?;
///
/// // Create a new dataframe with in-memory data
/// let schema = Schema::new(vec![
///     Field::new("id", DataType::Int32, true),
///     Field::new("name", DataType::Utf8, true),
/// ]);
/// let batch = RecordBatch::try_new(
///     Arc::new(schema),
///     vec![
///         Arc::new(Int32Array::from(vec![1, 2, 3])),
///         Arc::new(StringArray::from(vec!["foo", "bar", "baz"])),
///     ],
/// )?;
/// let df = ctx.read_batch(batch)?;
/// df.show().await?;
///
/// // Create a new dataframe with in-memory data using macro
/// let df = dataframe!(
///     "id" => [1, 2, 3],
///     "name" => ["foo", "bar", "baz"]
/// )?;
/// df.show().await?;
/// # Ok(())
/// # }
/// ```
#[derive(Debug, Clone)]
pub struct DataFrame {
    // Box the (large) SessionState to reduce the size of DataFrame on the stack
    session_state: Box<SessionState>,
    // The logical plan this DataFrame wraps; builder methods replace it with
    // an extended plan.
    plan: LogicalPlan,
    // Whether projection operations must validate their expressions.
    //
    // When this flag is `false`, the `with_column` and `with_column_renamed`
    // functions can skip the recursive work required to columnize and
    // normalize expressions. Since these function calls are often chained or
    // called many times in dataframe operations, this can result in a
    // significant performance gain.
    //
    // The flag may be set to `false` only when the dataframe function call
    // results in the last operation being a
    // `LogicalPlanBuilder::from(plan).project(fields)?.build()` or
    // `LogicalPlanBuilder::from(plan).project_with_validation(fields)?.build()`
    // call. This requirement guarantees that the plan has had all columnization
    // and normalization applied to existing expressions, so only new
    // expressions will require that work. Any operation that updates the plan
    // in any way via anything other than a `project` call should set this to
    // `true`.
    projection_requires_validation: bool,
}
251
252impl DataFrame {
253 /// Create a new `DataFrame ` based on an existing `LogicalPlan`
254 ///
255 /// This is a low-level method and is not typically used by end users. See
256 /// [`SessionContext::read_csv`] and other methods for creating a
257 /// `DataFrame` from an existing datasource.
258 pub fn new(session_state: SessionState, plan: LogicalPlan) -> Self {
259 Self {
260 session_state: Box::new(session_state),
261 plan,
262 projection_requires_validation: true,
263 }
264 }
265
266 /// Creates logical expression from a SQL query text.
267 /// The expression is created and processed against the current schema.
268 ///
269 /// # Example: Parsing SQL queries
270 /// ```
271 /// # use arrow::datatypes::{DataType, Field, Schema};
272 /// # use datafusion::prelude::*;
273 /// # use datafusion_common::{DFSchema, Result};
274 /// # #[tokio::main]
275 /// # async fn main() -> Result<()> {
276 /// // datafusion will parse number as i64 first.
277 /// let sql = "a > 1 and b in (1, 10)";
278 /// let expected = col("a")
279 /// .gt(lit(1 as i64))
280 /// .and(col("b").in_list(vec![lit(1 as i64), lit(10 as i64)], false));
281 /// let ctx = SessionContext::new();
282 /// let df = ctx
283 /// .read_csv("tests/data/example.csv", CsvReadOptions::new())
284 /// .await?;
285 /// let expr = df.parse_sql_expr(sql)?;
286 /// assert_eq!(expected, expr);
287 /// # Ok(())
288 /// # }
289 /// ```
290 #[cfg(feature = "sql")]
291 pub fn parse_sql_expr(&self, sql: &str) -> Result<Expr> {
292 let df_schema = self.schema();
293
294 self.session_state.create_logical_expr(sql, df_schema)
295 }
296
297 /// Consume the DataFrame and produce a physical plan
298 pub async fn create_physical_plan(self) -> Result<Arc<dyn ExecutionPlan>> {
299 self.session_state.create_physical_plan(&self.plan).await
300 }
301
302 /// Filter the DataFrame by column. Returns a new DataFrame only containing the
303 /// specified columns.
304 ///
305 /// ```
306 /// # use datafusion::prelude::*;
307 /// # use datafusion::error::Result;
308 /// # use datafusion_common::assert_batches_sorted_eq;
309 /// # #[tokio::main]
310 /// # async fn main() -> Result<()> {
311 /// let ctx = SessionContext::new();
312 /// let df = ctx
313 /// .read_csv("tests/data/example.csv", CsvReadOptions::new())
314 /// .await?;
315 /// let df = df.select_columns(&["a", "b"])?;
316 /// let expected = vec![
317 /// "+---+---+",
318 /// "| a | b |",
319 /// "+---+---+",
320 /// "| 1 | 2 |",
321 /// "+---+---+",
322 /// ];
323 /// # assert_batches_sorted_eq!(expected, &df.collect().await?);
324 /// # Ok(())
325 /// # }
326 /// ```
327 pub fn select_columns(self, columns: &[&str]) -> Result<DataFrame> {
328 let fields = columns
329 .iter()
330 .map(|name| {
331 let fields = self
332 .plan
333 .schema()
334 .qualified_fields_with_unqualified_name(name);
335 if fields.is_empty() {
336 Err(unqualified_field_not_found(name, self.plan.schema()))
337 } else {
338 Ok(fields)
339 }
340 })
341 .collect::<Result<Vec<_>, _>>()?
342 .into_iter()
343 .flatten()
344 .collect::<Vec<_>>();
345 let expr: Vec<Expr> = fields
346 .into_iter()
347 .map(|(qualifier, field)| Expr::Column(Column::from((qualifier, field))))
348 .collect();
349 self.select(expr)
350 }
351 /// Project arbitrary list of expression strings into a new `DataFrame`.
352 /// Method will parse string expressions into logical plan expressions.
353 ///
354 /// The output `DataFrame` has one column for each element in `exprs`.
355 ///
356 /// # Example
357 /// ```
358 /// # use datafusion::prelude::*;
359 /// # use datafusion::error::Result;
360 /// # #[tokio::main]
361 /// # async fn main() -> Result<()> {
362 /// let ctx = SessionContext::new();
363 /// let df = ctx
364 /// .read_csv("tests/data/example.csv", CsvReadOptions::new())
365 /// .await?;
366 /// let df: DataFrame = df.select_exprs(&["a * b", "c"])?;
367 /// # Ok(())
368 /// # }
369 /// ```
370 #[cfg(feature = "sql")]
371 pub fn select_exprs(self, exprs: &[&str]) -> Result<DataFrame> {
372 let expr_list = exprs
373 .iter()
374 .map(|e| self.parse_sql_expr(e))
375 .collect::<Result<Vec<_>>>()?;
376
377 self.select(expr_list)
378 }
379
380 /// Project arbitrary expressions (like SQL SELECT expressions) into a new
381 /// `DataFrame`.
382 ///
383 /// The output `DataFrame` has one column for each element in `expr_list`.
384 ///
385 /// # Example
386 /// ```
387 /// # use datafusion::prelude::*;
388 /// # use datafusion::error::Result;
389 /// # use datafusion_common::assert_batches_sorted_eq;
390 /// # #[tokio::main]
391 /// # async fn main() -> Result<()> {
392 /// let ctx = SessionContext::new();
393 /// let df = ctx
394 /// .read_csv("tests/data/example.csv", CsvReadOptions::new())
395 /// .await?;
396 /// let df = df.select(vec![col("a"), col("b") * col("c")])?;
397 /// let expected = vec![
398 /// "+---+-----------------------+",
399 /// "| a | ?table?.b * ?table?.c |",
400 /// "+---+-----------------------+",
401 /// "| 1 | 6 |",
402 /// "+---+-----------------------+",
403 /// ];
404 /// # assert_batches_sorted_eq!(expected, &df.collect().await?);
405 /// # Ok(())
406 /// # }
407 /// ```
408 pub fn select(
409 self,
410 expr_list: impl IntoIterator<Item = impl Into<SelectExpr>>,
411 ) -> Result<DataFrame> {
412 let expr_list: Vec<SelectExpr> =
413 expr_list.into_iter().map(|e| e.into()).collect::<Vec<_>>();
414
415 let expressions = expr_list.iter().filter_map(|e| match e {
416 SelectExpr::Expression(expr) => Some(expr),
417 _ => None,
418 });
419
420 let window_func_exprs = find_window_exprs(expressions);
421 let plan = if window_func_exprs.is_empty() {
422 self.plan
423 } else {
424 LogicalPlanBuilder::window_plan(self.plan, window_func_exprs)?
425 };
426
427 let project_plan = LogicalPlanBuilder::from(plan).project(expr_list)?.build()?;
428
429 Ok(DataFrame {
430 session_state: self.session_state,
431 plan: project_plan,
432 projection_requires_validation: false,
433 })
434 }
435
436 /// Returns a new DataFrame containing all columns except the specified columns.
437 ///
438 /// ```
439 /// # use datafusion::prelude::*;
440 /// # use datafusion::error::Result;
441 /// # use datafusion_common::assert_batches_sorted_eq;
442 /// # #[tokio::main]
443 /// # async fn main() -> Result<()> {
444 /// let ctx = SessionContext::new();
445 /// let df = ctx
446 /// .read_csv("tests/data/example.csv", CsvReadOptions::new())
447 /// .await?;
448 /// // +----+----+----+
449 /// // | a | b | c |
450 /// // +----+----+----+
451 /// // | 1 | 2 | 3 |
452 /// // +----+----+----+
453 /// let df = df.drop_columns(&["a"])?;
454 /// let expected = vec![
455 /// "+---+---+",
456 /// "| b | c |",
457 /// "+---+---+",
458 /// "| 2 | 3 |",
459 /// "+---+---+",
460 /// ];
461 /// # assert_batches_sorted_eq!(expected, &df.collect().await?);
462 /// # Ok(())
463 /// # }
464 /// ```
465 pub fn drop_columns<T>(self, columns: &[T]) -> Result<DataFrame>
466 where
467 T: Into<Column> + Clone,
468 {
469 let fields_to_drop = columns
470 .iter()
471 .flat_map(|col| {
472 let column: Column = col.clone().into();
473 match column.relation.as_ref() {
474 Some(_) => {
475 // qualified_field_from_column returns Result<(Option<&TableReference>, &FieldRef)>
476 vec![self.plan.schema().qualified_field_from_column(&column)]
477 }
478 None => {
479 // qualified_fields_with_unqualified_name returns Vec<(Option<&TableReference>, &FieldRef)>
480 self.plan
481 .schema()
482 .qualified_fields_with_unqualified_name(&column.name)
483 .into_iter()
484 .map(Ok)
485 .collect::<Vec<_>>()
486 }
487 }
488 })
489 .collect::<Result<Vec<_>, _>>()?;
490 let expr: Vec<Expr> = self
491 .plan
492 .schema()
493 .fields()
494 .into_iter()
495 .enumerate()
496 .map(|(idx, _)| self.plan.schema().qualified_field(idx))
497 .filter(|(qualifier, f)| !fields_to_drop.contains(&(*qualifier, f)))
498 .map(|(qualifier, field)| Expr::Column(Column::from((qualifier, field))))
499 .collect();
500 self.select(expr)
501 }
502
503 /// Expand multiple list/struct columns into a set of rows and new columns.
504 ///
505 /// See also: [`UnnestOptions`] documentation for the behavior of `unnest`
506 ///
507 /// # Example
508 /// ```
509 /// # use datafusion::prelude::*;
510 /// # use datafusion::error::Result;
511 /// # use datafusion_common::assert_batches_sorted_eq;
512 /// # #[tokio::main]
513 /// # async fn main() -> Result<()> {
514 /// let ctx = SessionContext::new();
515 /// let df = ctx.read_json("tests/data/unnest.json", JsonReadOptions::default()).await?;
516 /// // expand into multiple columns if it's json array, flatten field name if it's nested structure
517 /// let df = df.unnest_columns(&["b","c","d"])?;
518 /// let expected = vec![
519 /// "+---+------+-------+-----+-----+",
520 /// "| a | b | c | d.e | d.f |",
521 /// "+---+------+-------+-----+-----+",
522 /// "| 1 | 2.0 | false | 1 | 2 |",
523 /// "| 1 | 1.3 | true | 1 | 2 |",
524 /// "| 1 | -6.1 | | 1 | 2 |",
525 /// "| 2 | 3.0 | false | | |",
526 /// "| 2 | 2.3 | true | | |",
527 /// "| 2 | -7.1 | | | |",
528 /// "+---+------+-------+-----+-----+"
529 /// ];
530 /// # assert_batches_sorted_eq!(expected, &df.collect().await?);
531 /// # Ok(())
532 /// # }
533 /// ```
534 pub fn unnest_columns(self, columns: &[&str]) -> Result<DataFrame> {
535 self.unnest_columns_with_options(columns, UnnestOptions::new())
536 }
537
538 /// Expand multiple list columns into a set of rows, with
539 /// behavior controlled by [`UnnestOptions`].
540 ///
541 /// Please see the documentation on [`UnnestOptions`] for more
542 /// details about the meaning of unnest.
543 pub fn unnest_columns_with_options(
544 self,
545 columns: &[&str],
546 options: UnnestOptions,
547 ) -> Result<DataFrame> {
548 let columns = columns.iter().map(|c| Column::from(*c)).collect();
549 let plan = LogicalPlanBuilder::from(self.plan)
550 .unnest_columns_with_options(columns, options)?
551 .build()?;
552 Ok(DataFrame {
553 session_state: self.session_state,
554 plan,
555 projection_requires_validation: true,
556 })
557 }
558
559 /// Return a DataFrame with only rows for which `predicate` evaluates to
560 /// `true`.
561 ///
562 /// Rows for which `predicate` evaluates to `false` or `null`
563 /// are filtered out.
564 ///
565 /// # Example
566 /// ```
567 /// # use datafusion::prelude::*;
568 /// # use datafusion::error::Result;
569 /// # use datafusion_common::assert_batches_sorted_eq;
570 /// # #[tokio::main]
571 /// # async fn main() -> Result<()> {
572 /// let ctx = SessionContext::new();
573 /// let df = ctx
574 /// .read_csv("tests/data/example_long.csv", CsvReadOptions::new())
575 /// .await?;
576 /// let df = df.filter(col("a").lt_eq(col("b")))?;
577 /// // all rows where a <= b are returned
578 /// let expected = vec![
579 /// "+---+---+---+",
580 /// "| a | b | c |",
581 /// "+---+---+---+",
582 /// "| 1 | 2 | 3 |",
583 /// "| 4 | 5 | 6 |",
584 /// "| 7 | 8 | 9 |",
585 /// "+---+---+---+",
586 /// ];
587 /// # assert_batches_sorted_eq!(expected, &df.collect().await?);
588 /// # Ok(())
589 /// # }
590 /// ```
591 pub fn filter(self, predicate: Expr) -> Result<DataFrame> {
592 let plan = LogicalPlanBuilder::from(self.plan)
593 .filter(predicate)?
594 .build()?;
595 Ok(DataFrame {
596 session_state: self.session_state,
597 plan,
598 projection_requires_validation: true,
599 })
600 }
601
602 /// Return a new `DataFrame` that aggregates the rows of the current
603 /// `DataFrame`, first optionally grouping by the given expressions.
604 ///
605 /// # Example
606 /// ```
607 /// # use datafusion::prelude::*;
608 /// # use datafusion::error::Result;
609 /// # use datafusion::functions_aggregate::expr_fn::min;
610 /// # use datafusion_common::assert_batches_sorted_eq;
611 /// # #[tokio::main]
612 /// # async fn main() -> Result<()> {
613 /// let ctx = SessionContext::new();
614 /// let df = ctx
615 /// .read_csv("tests/data/example_long.csv", CsvReadOptions::new())
616 /// .await?;
617 ///
618 /// // The following use is the equivalent of "SELECT MIN(b) GROUP BY a"
619 /// let df1 = df.clone().aggregate(vec![col("a")], vec![min(col("b"))])?;
620 /// let expected1 = vec![
621 /// "+---+----------------+",
622 /// "| a | min(?table?.b) |",
623 /// "+---+----------------+",
624 /// "| 1 | 2 |",
625 /// "| 4 | 5 |",
626 /// "| 7 | 8 |",
627 /// "+---+----------------+",
628 /// ];
629 /// assert_batches_sorted_eq!(expected1, &df1.collect().await?);
630 /// // The following use is the equivalent of "SELECT MIN(b)"
631 /// let df2 = df.aggregate(vec![], vec![min(col("b"))])?;
632 /// let expected2 = vec![
633 /// "+----------------+",
634 /// "| min(?table?.b) |",
635 /// "+----------------+",
636 /// "| 2 |",
637 /// "+----------------+",
638 /// ];
639 /// # assert_batches_sorted_eq!(expected2, &df2.collect().await?);
640 /// # Ok(())
641 /// # }
642 /// ```
643 pub fn aggregate(
644 self,
645 group_expr: Vec<Expr>,
646 aggr_expr: Vec<Expr>,
647 ) -> Result<DataFrame> {
648 let is_grouping_set = matches!(group_expr.as_slice(), [Expr::GroupingSet(_)]);
649 let aggr_expr_len = aggr_expr.len();
650 let options =
651 LogicalPlanBuilderOptions::new().with_add_implicit_group_by_exprs(true);
652 let plan = LogicalPlanBuilder::from(self.plan)
653 .with_options(options)
654 .aggregate(group_expr, aggr_expr)?
655 .build()?;
656 let plan = if is_grouping_set {
657 let grouping_id_pos = plan.schema().fields().len() - 1 - aggr_expr_len;
658 // For grouping sets we do a project to not expose the internal grouping id
659 let exprs = plan
660 .schema()
661 .columns()
662 .into_iter()
663 .enumerate()
664 .filter(|(idx, _)| *idx != grouping_id_pos)
665 .map(|(_, column)| Expr::Column(column))
666 .collect::<Vec<_>>();
667 LogicalPlanBuilder::from(plan).project(exprs)?.build()?
668 } else {
669 plan
670 };
671 Ok(DataFrame {
672 session_state: self.session_state,
673 plan,
674 projection_requires_validation: !is_grouping_set,
675 })
676 }
677
678 /// Return a new DataFrame that adds the result of evaluating one or more
679 /// window functions ([`Expr::WindowFunction`]) to the existing columns
680 pub fn window(self, window_exprs: Vec<Expr>) -> Result<DataFrame> {
681 let plan = LogicalPlanBuilder::from(self.plan)
682 .window(window_exprs)?
683 .build()?;
684 Ok(DataFrame {
685 session_state: self.session_state,
686 plan,
687 projection_requires_validation: true,
688 })
689 }
690
691 /// Returns a new `DataFrame` with a limited number of rows.
692 ///
693 /// # Arguments
694 /// `skip` - Number of rows to skip before fetch any row
695 /// `fetch` - Maximum number of rows to return, after skipping `skip` rows.
696 ///
697 /// # Example
698 /// ```
699 /// # use datafusion::prelude::*;
700 /// # use datafusion::error::Result;
701 /// # use datafusion_common::assert_batches_sorted_eq;
702 /// # #[tokio::main]
703 /// # async fn main() -> Result<()> {
704 /// let ctx = SessionContext::new();
705 /// let df = ctx
706 /// .read_csv("tests/data/example_long.csv", CsvReadOptions::new())
707 /// .await?;
708 /// let df = df.limit(1, Some(2))?;
709 /// let expected = vec![
710 /// "+---+---+---+",
711 /// "| a | b | c |",
712 /// "+---+---+---+",
713 /// "| 4 | 5 | 6 |",
714 /// "| 7 | 8 | 9 |",
715 /// "+---+---+---+",
716 /// ];
717 /// # assert_batches_sorted_eq!(expected, &df.collect().await?);
718 /// # Ok(())
719 /// # }
720 /// ```
721 pub fn limit(self, skip: usize, fetch: Option<usize>) -> Result<DataFrame> {
722 let plan = LogicalPlanBuilder::from(self.plan)
723 .limit(skip, fetch)?
724 .build()?;
725 Ok(DataFrame {
726 session_state: self.session_state,
727 plan,
728 projection_requires_validation: self.projection_requires_validation,
729 })
730 }
731
732 /// Calculate the union of two [`DataFrame`]s, preserving duplicate rows.
733 ///
734 /// The two [`DataFrame`]s must have exactly the same schema
735 ///
736 /// # Example
737 /// ```
738 /// # use datafusion::prelude::*;
739 /// # use datafusion::error::Result;
740 /// # use datafusion_common::assert_batches_sorted_eq;
741 /// # #[tokio::main]
742 /// # async fn main() -> Result<()> {
743 /// let ctx = SessionContext::new();
744 /// let df = ctx
745 /// .read_csv("tests/data/example.csv", CsvReadOptions::new())
746 /// .await?;
747 /// let d2 = df.clone();
748 /// let df = df.union(d2)?;
749 /// let expected = vec![
750 /// "+---+---+---+",
751 /// "| a | b | c |",
752 /// "+---+---+---+",
753 /// "| 1 | 2 | 3 |",
754 /// "| 1 | 2 | 3 |",
755 /// "+---+---+---+",
756 /// ];
757 /// # assert_batches_sorted_eq!(expected, &df.collect().await?);
758 /// # Ok(())
759 /// # }
760 /// ```
761 pub fn union(self, dataframe: DataFrame) -> Result<DataFrame> {
762 let plan = LogicalPlanBuilder::from(self.plan)
763 .union(dataframe.plan)?
764 .build()?;
765 Ok(DataFrame {
766 session_state: self.session_state,
767 plan,
768 projection_requires_validation: true,
769 })
770 }
771
772 /// Calculate the union of two [`DataFrame`]s using column names, preserving duplicate rows.
773 ///
774 /// The two [`DataFrame`]s are combined using column names rather than position,
775 /// filling missing columns with null.
776 ///
777 ///
778 /// # Example
779 /// ```
780 /// # use datafusion::prelude::*;
781 /// # use datafusion::error::Result;
782 /// # use datafusion_common::assert_batches_sorted_eq;
783 /// # #[tokio::main]
784 /// # async fn main() -> Result<()> {
785 /// let ctx = SessionContext::new();
786 /// let df = ctx
787 /// .read_csv("tests/data/example.csv", CsvReadOptions::new())
788 /// .await?;
789 /// let d2 = df
790 /// .clone()
791 /// .select_columns(&["b", "c", "a"])?
792 /// .with_column("d", lit("77"))?;
793 /// let df = df.union_by_name(d2)?;
794 /// let expected = vec![
795 /// "+---+---+---+----+",
796 /// "| a | b | c | d |",
797 /// "+---+---+---+----+",
798 /// "| 1 | 2 | 3 | |",
799 /// "| 1 | 2 | 3 | 77 |",
800 /// "+---+---+---+----+",
801 /// ];
802 /// # assert_batches_sorted_eq!(expected, &df.collect().await?);
803 /// # Ok(())
804 /// # }
805 /// ```
806 pub fn union_by_name(self, dataframe: DataFrame) -> Result<DataFrame> {
807 let plan = LogicalPlanBuilder::from(self.plan)
808 .union_by_name(dataframe.plan)?
809 .build()?;
810 Ok(DataFrame {
811 session_state: self.session_state,
812 plan,
813 projection_requires_validation: true,
814 })
815 }
816
817 /// Calculate the distinct union of two [`DataFrame`]s.
818 ///
819 /// The two [`DataFrame`]s must have exactly the same schema. Any duplicate
820 /// rows are discarded.
821 ///
822 /// # Example
823 /// ```
824 /// # use datafusion::prelude::*;
825 /// # use datafusion::error::Result;
826 /// # use datafusion_common::assert_batches_sorted_eq;
827 /// # #[tokio::main]
828 /// # async fn main() -> Result<()> {
829 /// let ctx = SessionContext::new();
830 /// let df = ctx
831 /// .read_csv("tests/data/example.csv", CsvReadOptions::new())
832 /// .await?;
833 /// let d2 = df.clone();
834 /// let df = df.union_distinct(d2)?;
835 /// // df2 are duplicate of df
836 /// let expected = vec![
837 /// "+---+---+---+",
838 /// "| a | b | c |",
839 /// "+---+---+---+",
840 /// "| 1 | 2 | 3 |",
841 /// "+---+---+---+",
842 /// ];
843 /// # assert_batches_sorted_eq!(expected, &df.collect().await?);
844 /// # Ok(())
845 /// # }
846 /// ```
847 pub fn union_distinct(self, dataframe: DataFrame) -> Result<DataFrame> {
848 let plan = LogicalPlanBuilder::from(self.plan)
849 .union_distinct(dataframe.plan)?
850 .build()?;
851 Ok(DataFrame {
852 session_state: self.session_state,
853 plan,
854 projection_requires_validation: true,
855 })
856 }
857
858 /// Calculate the union of two [`DataFrame`]s using column names with all duplicated rows removed.
859 ///
860 /// The two [`DataFrame`]s are combined using column names rather than position,
861 /// filling missing columns with null.
862 ///
863 ///
864 /// # Example
865 /// ```
866 /// # use datafusion::prelude::*;
867 /// # use datafusion::error::Result;
868 /// # use datafusion_common::assert_batches_sorted_eq;
869 /// # #[tokio::main]
870 /// # async fn main() -> Result<()> {
871 /// let ctx = SessionContext::new();
872 /// let df = ctx
873 /// .read_csv("tests/data/example.csv", CsvReadOptions::new())
874 /// .await?;
875 /// let d2 = df.clone().select_columns(&["b", "c", "a"])?;
876 /// let df = df.union_by_name_distinct(d2)?;
877 /// let expected = vec![
878 /// "+---+---+---+",
879 /// "| a | b | c |",
880 /// "+---+---+---+",
881 /// "| 1 | 2 | 3 |",
882 /// "+---+---+---+",
883 /// ];
884 /// # assert_batches_sorted_eq!(expected, &df.collect().await?);
885 /// # Ok(())
886 /// # }
887 /// ```
888 pub fn union_by_name_distinct(self, dataframe: DataFrame) -> Result<DataFrame> {
889 let plan = LogicalPlanBuilder::from(self.plan)
890 .union_by_name_distinct(dataframe.plan)?
891 .build()?;
892 Ok(DataFrame {
893 session_state: self.session_state,
894 plan,
895 projection_requires_validation: true,
896 })
897 }
898
899 /// Return a new `DataFrame` with all duplicated rows removed.
900 ///
901 /// # Example
902 /// ```
903 /// # use datafusion::prelude::*;
904 /// # use datafusion::error::Result;
905 /// # use datafusion_common::assert_batches_sorted_eq;
906 /// # #[tokio::main]
907 /// # async fn main() -> Result<()> {
908 /// let ctx = SessionContext::new();
909 /// let df = ctx
910 /// .read_csv("tests/data/example.csv", CsvReadOptions::new())
911 /// .await?;
912 /// let df = df.distinct()?;
913 /// let expected = vec![
914 /// "+---+---+---+",
915 /// "| a | b | c |",
916 /// "+---+---+---+",
917 /// "| 1 | 2 | 3 |",
918 /// "+---+---+---+",
919 /// ];
920 /// # assert_batches_sorted_eq!(expected, &df.collect().await?);
921 /// # Ok(())
922 /// # }
923 /// ```
924 pub fn distinct(self) -> Result<DataFrame> {
925 let plan = LogicalPlanBuilder::from(self.plan).distinct()?.build()?;
926 Ok(DataFrame {
927 session_state: self.session_state,
928 plan,
929 projection_requires_validation: true,
930 })
931 }
932
933 /// Return a new `DataFrame` with duplicated rows removed as per the specified expression list
934 /// according to the provided sorting expressions grouped by the `DISTINCT ON` clause
935 /// expressions.
936 ///
937 /// # Example
938 /// ```
939 /// # use datafusion::prelude::*;
940 /// # use datafusion::error::Result;
941 /// # use datafusion_common::assert_batches_sorted_eq;
942 /// # #[tokio::main]
943 /// # async fn main() -> Result<()> {
944 /// let ctx = SessionContext::new();
945 /// let df = ctx
946 /// .read_csv("tests/data/example.csv", CsvReadOptions::new())
947 /// .await?
948 /// // Return a single row (a, b) for each distinct value of a
949 /// .distinct_on(vec![col("a")], vec![col("a"), col("b")], None)?;
950 /// let expected = vec![
951 /// "+---+---+",
952 /// "| a | b |",
953 /// "+---+---+",
954 /// "| 1 | 2 |",
955 /// "+---+---+",
956 /// ];
957 /// # assert_batches_sorted_eq!(expected, &df.collect().await?);
958 /// # Ok(())
959 /// # }
960 /// ```
961 pub fn distinct_on(
962 self,
963 on_expr: Vec<Expr>,
964 select_expr: Vec<Expr>,
965 sort_expr: Option<Vec<SortExpr>>,
966 ) -> Result<DataFrame> {
967 let plan = LogicalPlanBuilder::from(self.plan)
968 .distinct_on(on_expr, select_expr, sort_expr)?
969 .build()?;
970 Ok(DataFrame {
971 session_state: self.session_state,
972 plan,
973 projection_requires_validation: true,
974 })
975 }
976
977 /// Return a new `DataFrame` that has statistics for a DataFrame.
978 ///
979 /// Only summarizes numeric datatypes at the moment and returns nulls for
980 /// non numeric datatypes. The output format is modeled after pandas
981 ///
982 /// # Example
983 /// ```
984 /// # use datafusion::prelude::*;
985 /// # use datafusion::error::Result;
986 /// # use arrow::util::pretty;
987 /// # use datafusion_common::assert_batches_sorted_eq;
988 /// # #[tokio::main]
989 /// # async fn main() -> Result<()> {
990 /// let ctx = SessionContext::new();
991 /// let df = ctx.read_csv("tests/tpch-csv/customer.csv", CsvReadOptions::new()).await?;
992 /// let stat = df.describe().await?;
993 /// # // some output column are ignored
994 /// let expected = vec![
995 /// "+------------+--------------------+--------------------+------------------------------------+--------------------+-----------------+--------------------+--------------+----------------------------------------------------------------------------------------------------------+",
996 /// "| describe | c_custkey | c_name | c_address | c_nationkey | c_phone | c_acctbal | c_mktsegment | c_comment |",
997 /// "+------------+--------------------+--------------------+------------------------------------+--------------------+-----------------+--------------------+--------------+----------------------------------------------------------------------------------------------------------+",
998 /// "| count | 9.0 | 9 | 9 | 9.0 | 9 | 9.0 | 9 | 9 |",
999 /// "| max | 10.0 | Customer#000000010 | xKiAFTjUsCuxfeleNqefumTrjS | 20.0 | 30-114-968-4951 | 9561.95 | MACHINERY | tions. even deposits boost according to the slyly bold packages. final accounts cajole requests. furious |",
1000 /// "| mean | 6.0 | null | null | 9.88888888888889 | null | 5153.2155555555555 | null | null |",
1001 /// "| median | 6.0 | null | null | 8.0 | null | 6819.74 | null | null |",
1002 /// "| min | 2.0 | Customer#000000002 | 6LrEaV6KR6PLVcgl2ArL Q3rqzLzcT1 v2 | 1.0 | 11-719-748-3364 | 121.65 | AUTOMOBILE | deposits eat slyly ironic, even instructions. express foxes detect slyly. blithely even accounts abov |",
1003 /// "| null_count | 0.0 | 0 | 0 | 0.0 | 0 | 0.0 | 0 | 0 |",
1004 /// "| std | 2.7386127875258306 | null | null | 7.2188026092359046 | null | 3522.169804254585 | null | null |",
1005 /// "+------------+--------------------+--------------------+------------------------------------+--------------------+-----------------+--------------------+--------------+----------------------------------------------------------------------------------------------------------+"];
1006 /// assert_batches_sorted_eq!(expected, &stat.collect().await?);
1007 /// # Ok(())
1008 /// # }
1009 /// ```
1010 pub async fn describe(self) -> Result<Self> {
1011 //the functions now supported
1012 let supported_describe_functions =
1013 vec!["count", "null_count", "mean", "std", "min", "max", "median"];
1014
1015 let original_schema_fields = self.schema().fields().iter();
1016
1017 //define describe column
1018 let mut describe_schemas = vec![Field::new("describe", DataType::Utf8, false)];
1019 describe_schemas.extend(original_schema_fields.clone().map(|field| {
1020 if field.data_type().is_numeric() {
1021 Field::new(field.name(), DataType::Float64, true)
1022 } else {
1023 Field::new(field.name(), DataType::Utf8, true)
1024 }
1025 }));
1026
1027 //collect recordBatch
1028 let describe_record_batch = [
1029 // count aggregation
1030 self.clone().aggregate(
1031 vec![],
1032 original_schema_fields
1033 .clone()
1034 .map(|f| count(ident(f.name())).alias(f.name()))
1035 .collect::<Vec<_>>(),
1036 ),
1037 // null_count aggregation
1038 self.clone().aggregate(
1039 vec![],
1040 original_schema_fields
1041 .clone()
1042 .map(|f| {
1043 sum(case(is_null(ident(f.name())))
1044 .when(lit(true), lit(1))
1045 .otherwise(lit(0))
1046 .unwrap())
1047 .alias(f.name())
1048 })
1049 .collect::<Vec<_>>(),
1050 ),
1051 // mean aggregation
1052 self.clone().aggregate(
1053 vec![],
1054 original_schema_fields
1055 .clone()
1056 .filter(|f| f.data_type().is_numeric())
1057 .map(|f| avg(ident(f.name())).alias(f.name()))
1058 .collect::<Vec<_>>(),
1059 ),
1060 // std aggregation
1061 self.clone().aggregate(
1062 vec![],
1063 original_schema_fields
1064 .clone()
1065 .filter(|f| f.data_type().is_numeric())
1066 .map(|f| stddev(ident(f.name())).alias(f.name()))
1067 .collect::<Vec<_>>(),
1068 ),
1069 // min aggregation
1070 self.clone().aggregate(
1071 vec![],
1072 original_schema_fields
1073 .clone()
1074 .filter(|f| {
1075 !matches!(f.data_type(), DataType::Binary | DataType::Boolean)
1076 })
1077 .map(|f| min(ident(f.name())).alias(f.name()))
1078 .collect::<Vec<_>>(),
1079 ),
1080 // max aggregation
1081 self.clone().aggregate(
1082 vec![],
1083 original_schema_fields
1084 .clone()
1085 .filter(|f| {
1086 !matches!(f.data_type(), DataType::Binary | DataType::Boolean)
1087 })
1088 .map(|f| max(ident(f.name())).alias(f.name()))
1089 .collect::<Vec<_>>(),
1090 ),
1091 // median aggregation
1092 self.clone().aggregate(
1093 vec![],
1094 original_schema_fields
1095 .clone()
1096 .filter(|f| f.data_type().is_numeric())
1097 .map(|f| median(ident(f.name())).alias(f.name()))
1098 .collect::<Vec<_>>(),
1099 ),
1100 ];
1101
1102 // first column with function names
1103 let mut array_ref_vec: Vec<ArrayRef> = vec![Arc::new(StringArray::from(
1104 supported_describe_functions.clone(),
1105 ))];
1106 for field in original_schema_fields {
1107 let mut array_datas = vec![];
1108 for result in describe_record_batch.iter() {
1109 let array_ref = match result {
1110 Ok(df) => {
1111 let batches = df.clone().collect().await;
1112 match batches {
1113 Ok(batches)
1114 if batches.len() == 1
1115 && batches[0]
1116 .column_by_name(field.name())
1117 .is_some() =>
1118 {
1119 let column =
1120 batches[0].column_by_name(field.name()).unwrap();
1121
1122 if column.data_type().is_null() {
1123 Arc::new(StringArray::from(vec!["null"]))
1124 } else if field.data_type().is_numeric() {
1125 cast(column, &DataType::Float64)?
1126 } else {
1127 cast(column, &DataType::Utf8)?
1128 }
1129 }
1130 _ => Arc::new(StringArray::from(vec!["null"])),
1131 }
1132 }
1133 //Handling error when only boolean/binary column, and in other cases
1134 Err(err)
1135 if err.to_string().contains(
1136 "Error during planning: \
1137 Aggregate requires at least one grouping \
1138 or aggregate expression",
1139 ) =>
1140 {
1141 Arc::new(StringArray::from(vec!["null"]))
1142 }
1143 Err(e) => return exec_err!("{}", e),
1144 };
1145 array_datas.push(array_ref);
1146 }
1147 array_ref_vec.push(concat(
1148 array_datas
1149 .iter()
1150 .map(|af| af.as_ref())
1151 .collect::<Vec<_>>()
1152 .as_slice(),
1153 )?);
1154 }
1155
1156 let describe_record_batch =
1157 RecordBatch::try_new(Arc::new(Schema::new(describe_schemas)), array_ref_vec)?;
1158
1159 let provider = MemTable::try_new(
1160 describe_record_batch.schema(),
1161 vec![vec![describe_record_batch]],
1162 )?;
1163
1164 let plan = LogicalPlanBuilder::scan(
1165 UNNAMED_TABLE,
1166 provider_as_source(Arc::new(provider)),
1167 None,
1168 )?
1169 .build()?;
1170
1171 Ok(DataFrame {
1172 session_state: self.session_state,
1173 plan,
1174 projection_requires_validation: self.projection_requires_validation,
1175 })
1176 }
1177
1178 /// Apply a sort by provided expressions with default direction
1179 pub fn sort_by(self, expr: Vec<Expr>) -> Result<DataFrame> {
1180 self.sort(
1181 expr.into_iter()
1182 .map(|e| e.sort(true, false))
1183 .collect::<Vec<SortExpr>>(),
1184 )
1185 }
1186
1187 /// Sort the DataFrame by the specified sorting expressions.
1188 ///
1189 /// Note that any expression can be turned into
1190 /// a sort expression by calling its [sort](Expr::sort) method.
1191 ///
1192 /// # Example
1193 ///
1194 /// ```
1195 /// # use datafusion::prelude::*;
1196 /// # use datafusion::error::Result;
1197 /// # use datafusion_common::assert_batches_sorted_eq;
1198 /// # #[tokio::main]
1199 /// # async fn main() -> Result<()> {
1200 /// let ctx = SessionContext::new();
1201 /// let df = ctx
1202 /// .read_csv("tests/data/example_long.csv", CsvReadOptions::new())
1203 /// .await?;
1204 /// let df = df.sort(vec![
1205 /// col("a").sort(false, true), // a DESC, nulls first
1206 /// col("b").sort(true, false), // b ASC, nulls last
1207 /// ])?;
1208 /// let expected = vec![
1209 /// "+---+---+---+",
1210 /// "| a | b | c |",
1211 /// "+---+---+---+",
1212 /// "| 1 | 2 | 3 |",
1213 /// "| 4 | 5 | 6 |",
1214 /// "| 7 | 8 | 9 |",
1215 /// "+---+---+---+",
1216 /// ];
1217 /// # assert_batches_sorted_eq!(expected, &df.collect().await?);
1218 /// # Ok(())
1219 /// # }
1220 /// ```
1221 pub fn sort(self, expr: Vec<SortExpr>) -> Result<DataFrame> {
1222 let plan = LogicalPlanBuilder::from(self.plan).sort(expr)?.build()?;
1223 Ok(DataFrame {
1224 session_state: self.session_state,
1225 plan,
1226 projection_requires_validation: self.projection_requires_validation,
1227 })
1228 }
1229
1230 /// Join this `DataFrame` with another `DataFrame` using explicitly specified
1231 /// columns and an optional filter expression.
1232 ///
1233 /// See [`join_on`](Self::join_on) for a more concise way to specify the
1234 /// join condition. Since DataFusion will automatically identify and
1235 /// optimize equality predicates there is no performance difference between
1236 /// this function and `join_on`
1237 ///
1238 /// `left_cols` and `right_cols` are used to form "equijoin" predicates (see
1239 /// example below), which are then combined with the optional `filter`
1240 /// expression. If `left_cols` and `right_cols` contain ambiguous column
1241 /// references, they will be disambiguated by prioritizing the left relation
1242 /// for `left_cols` and the right relation for `right_cols`.
1243 ///
1244 /// Note that in case of outer join, the `filter` is applied to only matched rows.
1245 ///
1246 /// # Example
1247 /// ```
1248 /// # use datafusion::prelude::*;
1249 /// # use datafusion::error::Result;
1250 /// # use datafusion_common::assert_batches_sorted_eq;
1251 /// # #[tokio::main]
1252 /// # async fn main() -> Result<()> {
1253 /// let ctx = SessionContext::new();
1254 /// let left = ctx
1255 /// .read_csv("tests/data/example.csv", CsvReadOptions::new())
1256 /// .await?;
1257 /// let right = ctx
1258 /// .read_csv("tests/data/example.csv", CsvReadOptions::new())
1259 /// .await?
1260 /// .select(vec![
1261 /// col("a").alias("a2"),
1262 /// col("b").alias("b2"),
1263 /// col("c").alias("c2"),
1264 /// ])?;
1265 /// // Perform the equivalent of `left INNER JOIN right ON (a = a2 AND b = b2)`
1266 /// // finding all pairs of rows from `left` and `right` where `a = a2` and `b = b2`.
1267 /// let join = left.join(right, JoinType::Inner, &["a", "b"], &["a2", "b2"], None)?;
1268 /// let expected = vec![
1269 /// "+---+---+---+----+----+----+",
1270 /// "| a | b | c | a2 | b2 | c2 |",
1271 /// "+---+---+---+----+----+----+",
1272 /// "| 1 | 2 | 3 | 1 | 2 | 3 |",
1273 /// "+---+---+---+----+----+----+",
1274 /// ];
1275 /// assert_batches_sorted_eq!(expected, &join.collect().await?);
1276 /// # Ok(())
1277 /// # }
1278 /// ```
1279 pub fn join(
1280 self,
1281 right: DataFrame,
1282 join_type: JoinType,
1283 left_cols: &[&str],
1284 right_cols: &[&str],
1285 filter: Option<Expr>,
1286 ) -> Result<DataFrame> {
1287 let plan = LogicalPlanBuilder::from(self.plan)
1288 .join(
1289 right.plan,
1290 join_type,
1291 (left_cols.to_vec(), right_cols.to_vec()),
1292 filter,
1293 )?
1294 .build()?;
1295 Ok(DataFrame {
1296 session_state: self.session_state,
1297 plan,
1298 projection_requires_validation: true,
1299 })
1300 }
1301
1302 /// Join this `DataFrame` with another `DataFrame` using the specified
1303 /// expressions.
1304 ///
1305 /// Note that DataFusion automatically optimizes joins, including
1306 /// identifying and optimizing equality predicates.
1307 ///
1308 /// # Example
1309 /// ```
1310 /// # use datafusion::prelude::*;
1311 /// # use datafusion::error::Result;
1312 /// # use datafusion_common::assert_batches_sorted_eq;
1313 /// # #[tokio::main]
1314 /// # async fn main() -> Result<()> {
1315 /// let ctx = SessionContext::new();
1316 /// let left = ctx
1317 /// .read_csv("tests/data/example.csv", CsvReadOptions::new())
1318 /// .await?;
1319 /// let right = ctx
1320 /// .read_csv("tests/data/example.csv", CsvReadOptions::new())
1321 /// .await?
1322 /// .select(vec![
1323 /// col("a").alias("a2"),
1324 /// col("b").alias("b2"),
1325 /// col("c").alias("c2"),
1326 /// ])?;
1327 ///
1328 /// // Perform the equivalent of `left INNER JOIN right ON (a != a2 AND b != b2)`
1329 /// // finding all pairs of rows from `left` and `right` where
1330 /// // where `a != a2` and `b != b2`.
1331 /// let join_on = left.join_on(
1332 /// right,
1333 /// JoinType::Inner,
1334 /// [col("a").not_eq(col("a2")), col("b").not_eq(col("b2"))],
1335 /// )?;
1336 /// let expected = vec![
1337 /// "+---+---+---+----+----+----+",
1338 /// "| a | b | c | a2 | b2 | c2 |",
1339 /// "+---+---+---+----+----+----+",
1340 /// "+---+---+---+----+----+----+",
1341 /// ];
1342 /// # assert_batches_sorted_eq!(expected, &join_on.collect().await?);
1343 /// # Ok(())
1344 /// # }
1345 /// ```
1346 pub fn join_on(
1347 self,
1348 right: DataFrame,
1349 join_type: JoinType,
1350 on_exprs: impl IntoIterator<Item = Expr>,
1351 ) -> Result<DataFrame> {
1352 let plan = LogicalPlanBuilder::from(self.plan)
1353 .join_on(right.plan, join_type, on_exprs)?
1354 .build()?;
1355 Ok(DataFrame {
1356 session_state: self.session_state,
1357 plan,
1358 projection_requires_validation: true,
1359 })
1360 }
1361
1362 /// Repartition a DataFrame based on a logical partitioning scheme.
1363 ///
1364 /// # Example
1365 /// ```
1366 /// # use datafusion::prelude::*;
1367 /// # use datafusion::error::Result;
1368 /// # use datafusion_common::assert_batches_sorted_eq;
1369 /// # #[tokio::main]
1370 /// # async fn main() -> Result<()> {
1371 /// let ctx = SessionContext::new();
1372 /// let df = ctx
1373 /// .read_csv("tests/data/example_long.csv", CsvReadOptions::new())
1374 /// .await?;
1375 /// let df1 = df.repartition(Partitioning::RoundRobinBatch(4))?;
1376 /// let expected = vec![
1377 /// "+---+---+---+",
1378 /// "| a | b | c |",
1379 /// "+---+---+---+",
1380 /// "| 1 | 2 | 3 |",
1381 /// "| 4 | 5 | 6 |",
1382 /// "| 7 | 8 | 9 |",
1383 /// "+---+---+---+",
1384 /// ];
1385 /// # assert_batches_sorted_eq!(expected, &df1.collect().await?);
1386 /// # Ok(())
1387 /// # }
1388 /// ```
1389 pub fn repartition(self, partitioning_scheme: Partitioning) -> Result<DataFrame> {
1390 let plan = LogicalPlanBuilder::from(self.plan)
1391 .repartition(partitioning_scheme)?
1392 .build()?;
1393 Ok(DataFrame {
1394 session_state: self.session_state,
1395 plan,
1396 projection_requires_validation: true,
1397 })
1398 }
1399
1400 /// Return the total number of rows in this `DataFrame`.
1401 ///
1402 /// Note that this method will actually run a plan to calculate the count,
1403 /// which may be slow for large or complicated DataFrames.
1404 ///
1405 /// # Example
1406 /// ```
1407 /// # use datafusion::prelude::*;
1408 /// # use datafusion::error::Result;
1409 /// # #[tokio::main]
1410 /// # async fn main() -> Result<()> {
1411 /// let ctx = SessionContext::new();
1412 /// let df = ctx
1413 /// .read_csv("tests/data/example.csv", CsvReadOptions::new())
1414 /// .await?;
1415 /// let count = df.count().await?; // 1
1416 /// # assert_eq!(count, 1);
1417 /// # Ok(())
1418 /// # }
1419 /// ```
1420 pub async fn count(self) -> Result<usize> {
1421 let rows = self
1422 .aggregate(
1423 vec![],
1424 vec![count(Expr::Literal(COUNT_STAR_EXPANSION, None))],
1425 )?
1426 .collect()
1427 .await?;
1428 let len = *rows
1429 .first()
1430 .and_then(|r| r.columns().first())
1431 .and_then(|c| c.as_any().downcast_ref::<Int64Array>())
1432 .and_then(|a| a.values().first())
1433 .ok_or_else(|| {
1434 internal_datafusion_err!("Unexpected output when collecting for count()")
1435 })? as usize;
1436 Ok(len)
1437 }
1438
1439 /// Execute this `DataFrame` and buffer all resulting `RecordBatch`es into memory.
1440 ///
1441 /// Prior to calling `collect`, modifying a DataFrame simply updates a plan
1442 /// (no actual computation is performed). `collect` triggers the computation.
1443 ///
1444 /// See [`Self::execute_stream`] to execute a DataFrame without buffering.
1445 ///
1446 /// # Example
1447 /// ```
1448 /// # use datafusion::prelude::*;
1449 /// # use datafusion::error::Result;
1450 /// # #[tokio::main]
1451 /// # async fn main() -> Result<()> {
1452 /// let ctx = SessionContext::new();
1453 /// let df = ctx
1454 /// .read_csv("tests/data/example.csv", CsvReadOptions::new())
1455 /// .await?;
1456 /// let batches = df.collect().await?;
1457 /// # Ok(())
1458 /// # }
1459 /// ```
1460 pub async fn collect(self) -> Result<Vec<RecordBatch>> {
1461 let task_ctx = Arc::new(self.task_ctx());
1462 let plan = self.create_physical_plan().await?;
1463 collect(plan, task_ctx).await
1464 }
1465
1466 /// Execute the `DataFrame` and print the results to the console.
1467 ///
1468 /// # Example
1469 /// ```
1470 /// # use datafusion::prelude::*;
1471 /// # use datafusion::error::Result;
1472 /// # #[tokio::main]
1473 /// # async fn main() -> Result<()> {
1474 /// let ctx = SessionContext::new();
1475 /// let df = ctx
1476 /// .read_csv("tests/data/example.csv", CsvReadOptions::new())
1477 /// .await?;
1478 /// df.show().await?;
1479 /// # Ok(())
1480 /// # }
1481 /// ```
1482 pub async fn show(self) -> Result<()> {
1483 println!("{}", self.to_string().await?);
1484 Ok(())
1485 }
1486
1487 /// Execute the `DataFrame` and return a string representation of the results.
1488 ///
1489 /// # Example
1490 /// ```
1491 /// # use datafusion::prelude::*;
1492 /// # use datafusion::error::Result;
1493 /// # use datafusion::execution::SessionStateBuilder;
1494 ///
1495 /// # #[tokio::main]
1496 /// # async fn main() -> Result<()> {
1497 /// let cfg = SessionConfig::new()
1498 /// .set_str("datafusion.format.null", "no-value");
1499 /// let session_state = SessionStateBuilder::new()
1500 /// .with_config(cfg)
1501 /// .with_default_features()
1502 /// .build();
1503 /// let ctx = SessionContext::new_with_state(session_state);
1504 /// let df = ctx.sql("select null as 'null-column'").await?;
1505 /// let result = df.to_string().await?;
1506 /// assert_eq!(result,
1507 /// "+-------------+
1508 /// | null-column |
1509 /// +-------------+
1510 /// | no-value |
1511 /// +-------------+"
1512 /// );
1513 /// # Ok(())
1514 /// # }
1515 pub async fn to_string(self) -> Result<String> {
1516 let options = self.session_state.config().options().format.clone();
1517 let arrow_options: arrow::util::display::FormatOptions = (&options).try_into()?;
1518
1519 let results = self.collect().await?;
1520 Ok(
1521 pretty::pretty_format_batches_with_options(&results, &arrow_options)?
1522 .to_string(),
1523 )
1524 }
1525
1526 /// Execute the `DataFrame` and print only the first `num` rows of the
1527 /// result to the console.
1528 ///
1529 /// # Example
1530 /// ```
1531 /// # use datafusion::prelude::*;
1532 /// # use datafusion::error::Result;
1533 /// # #[tokio::main]
1534 /// # async fn main() -> Result<()> {
1535 /// let ctx = SessionContext::new();
1536 /// let df = ctx
1537 /// .read_csv("tests/data/example.csv", CsvReadOptions::new())
1538 /// .await?;
1539 /// df.show_limit(10).await?;
1540 /// # Ok(())
1541 /// # }
1542 /// ```
1543 pub async fn show_limit(self, num: usize) -> Result<()> {
1544 let results = self.limit(0, Some(num))?.collect().await?;
1545 Ok(pretty::print_batches(&results)?)
1546 }
1547
1548 /// Return a new [`TaskContext`] which would be used to execute this DataFrame
1549 pub fn task_ctx(&self) -> TaskContext {
1550 TaskContext::from(self.session_state.as_ref())
1551 }
1552
1553 /// Executes this DataFrame and returns a stream over a single partition
1554 ///
1555 /// See [Self::collect] to buffer the `RecordBatch`es in memory.
1556 ///
1557 /// # Example
1558 /// ```
1559 /// # use datafusion::prelude::*;
1560 /// # use datafusion::error::Result;
1561 /// # #[tokio::main]
1562 /// # async fn main() -> Result<()> {
1563 /// let ctx = SessionContext::new();
1564 /// let df = ctx
1565 /// .read_csv("tests/data/example.csv", CsvReadOptions::new())
1566 /// .await?;
1567 /// let stream = df.execute_stream().await?;
1568 /// # Ok(())
1569 /// # }
1570 /// ```
1571 ///
1572 /// # Aborting Execution
1573 ///
1574 /// Dropping the stream will abort the execution of the query, and free up
1575 /// any allocated resources
1576 pub async fn execute_stream(self) -> Result<SendableRecordBatchStream> {
1577 let task_ctx = Arc::new(self.task_ctx());
1578 let plan = self.create_physical_plan().await?;
1579 execute_stream(plan, task_ctx)
1580 }
1581
1582 /// Executes this DataFrame and collects all results into a vector of vector of RecordBatch
1583 /// maintaining the input partitioning.
1584 ///
1585 /// # Example
1586 /// ```
1587 /// # use datafusion::prelude::*;
1588 /// # use datafusion::error::Result;
1589 /// # #[tokio::main]
1590 /// # async fn main() -> Result<()> {
1591 /// let ctx = SessionContext::new();
1592 /// let df = ctx
1593 /// .read_csv("tests/data/example.csv", CsvReadOptions::new())
1594 /// .await?;
1595 /// let batches = df.collect_partitioned().await?;
1596 /// # Ok(())
1597 /// # }
1598 /// ```
1599 pub async fn collect_partitioned(self) -> Result<Vec<Vec<RecordBatch>>> {
1600 let task_ctx = Arc::new(self.task_ctx());
1601 let plan = self.create_physical_plan().await?;
1602 collect_partitioned(plan, task_ctx).await
1603 }
1604
1605 /// Executes this DataFrame and returns one stream per partition.
1606 ///
1607 /// # Example
1608 /// ```
1609 /// # use datafusion::prelude::*;
1610 /// # use datafusion::error::Result;
1611 /// # #[tokio::main]
1612 /// # async fn main() -> Result<()> {
1613 /// let ctx = SessionContext::new();
1614 /// let df = ctx
1615 /// .read_csv("tests/data/example.csv", CsvReadOptions::new())
1616 /// .await?;
1617 /// let batches = df.execute_stream_partitioned().await?;
1618 /// # Ok(())
1619 /// # }
1620 /// ```
1621 /// # Aborting Execution
1622 ///
1623 /// Dropping the stream will abort the execution of the query, and free up
1624 /// any allocated resources
1625 pub async fn execute_stream_partitioned(
1626 self,
1627 ) -> Result<Vec<SendableRecordBatchStream>> {
1628 let task_ctx = Arc::new(self.task_ctx());
1629 let plan = self.create_physical_plan().await?;
1630 execute_stream_partitioned(plan, task_ctx)
1631 }
1632
1633 /// Returns the `DFSchema` describing the output of this DataFrame.
1634 ///
1635 /// The output `DFSchema` contains information on the name, data type, and
1636 /// nullability for each column.
1637 ///
1638 /// # Example
1639 /// ```
1640 /// # use datafusion::prelude::*;
1641 /// # use datafusion::error::Result;
1642 /// # #[tokio::main]
1643 /// # async fn main() -> Result<()> {
1644 /// let ctx = SessionContext::new();
1645 /// let df = ctx
1646 /// .read_csv("tests/data/example.csv", CsvReadOptions::new())
1647 /// .await?;
1648 /// let schema = df.schema();
1649 /// # Ok(())
1650 /// # }
1651 /// ```
1652 pub fn schema(&self) -> &DFSchema {
1653 self.plan.schema()
1654 }
1655
1656 /// Return a reference to the unoptimized [`LogicalPlan`] that comprises
1657 /// this DataFrame.
1658 ///
1659 /// See [`Self::into_unoptimized_plan`] for more details.
1660 pub fn logical_plan(&self) -> &LogicalPlan {
1661 &self.plan
1662 }
1663
1664 /// Returns both the [`LogicalPlan`] and [`SessionState`] that comprise this [`DataFrame`]
1665 pub fn into_parts(self) -> (SessionState, LogicalPlan) {
1666 (*self.session_state, self.plan)
1667 }
1668
1669 /// Return the [`LogicalPlan`] represented by this DataFrame without running
1670 /// any optimizers
1671 ///
1672 /// Note: This method should not be used outside testing, as it loses the
1673 /// snapshot of the [`SessionState`] attached to this [`DataFrame`] and
1674 /// consequently subsequent operations may take place against a different
1675 /// state (e.g. a different value of `now()`)
1676 ///
1677 /// See [`Self::into_parts`] to retrieve the owned [`LogicalPlan`] and
1678 /// corresponding [`SessionState`].
1679 pub fn into_unoptimized_plan(self) -> LogicalPlan {
1680 self.plan
1681 }
1682
1683 /// Return the optimized [`LogicalPlan`] represented by this DataFrame.
1684 ///
1685 /// Note: This method should not be used outside testing -- see
1686 /// [`Self::into_unoptimized_plan`] for more details.
1687 pub fn into_optimized_plan(self) -> Result<LogicalPlan> {
1688 // Optimize the plan first for better UX
1689 self.session_state.optimize(&self.plan)
1690 }
1691
1692 /// Converts this [`DataFrame`] into a [`TableProvider`] that can be registered
1693 /// as a table view using [`SessionContext::register_table`].
1694 ///
1695 /// Note: This discards the [`SessionState`] associated with this
1696 /// [`DataFrame`] in favour of the one passed to [`TableProvider::scan`]
1697 pub fn into_view(self) -> Arc<dyn TableProvider> {
1698 Arc::new(DataFrameTableProvider {
1699 plan: self.plan,
1700 table_type: TableType::View,
1701 })
1702 }
1703
1704 /// See [`Self::into_view`]. The returned [`TableProvider`] will
1705 /// create a transient table.
1706 pub fn into_temporary_view(self) -> Arc<dyn TableProvider> {
1707 Arc::new(DataFrameTableProvider {
1708 plan: self.plan,
1709 table_type: TableType::Temporary,
1710 })
1711 }
1712
1713 /// Return a DataFrame with the explanation of its plan so far.
1714 ///
1715 /// if `analyze` is specified, runs the plan and reports metrics
1716 /// if `verbose` is true, prints out additional details.
1717 /// The default format is Indent format.
1718 ///
1719 /// ```
1720 /// # use datafusion::prelude::*;
1721 /// # use datafusion::error::Result;
1722 /// # #[tokio::main]
1723 /// # async fn main() -> Result<()> {
1724 /// let ctx = SessionContext::new();
1725 /// let df = ctx
1726 /// .read_csv("tests/data/example.csv", CsvReadOptions::new())
1727 /// .await?;
1728 /// let batches = df
1729 /// .limit(0, Some(100))?
1730 /// .explain(false, false)?
1731 /// .collect()
1732 /// .await?;
1733 /// # Ok(())
1734 /// # }
1735 /// ```
1736 pub fn explain(self, verbose: bool, analyze: bool) -> Result<DataFrame> {
1737 // Set the default format to Indent to keep the previous behavior
1738 let opts = ExplainOption::default()
1739 .with_verbose(verbose)
1740 .with_analyze(analyze);
1741 self.explain_with_options(opts)
1742 }
1743
1744 /// Return a DataFrame with the explanation of its plan so far.
1745 ///
1746 /// `opt` is used to specify the options for the explain operation.
1747 /// Details of the options can be found in [`ExplainOption`].
1748 /// ```
1749 /// # use datafusion::prelude::*;
1750 /// # use datafusion::error::Result;
1751 /// # #[tokio::main]
1752 /// # async fn main() -> Result<()> {
1753 /// use datafusion_expr::{Explain, ExplainOption};
1754 /// let ctx = SessionContext::new();
1755 /// let df = ctx
1756 /// .read_csv("tests/data/example.csv", CsvReadOptions::new())
1757 /// .await?;
1758 /// let batches = df
1759 /// .limit(0, Some(100))?
1760 /// .explain_with_options(
1761 /// ExplainOption::default()
1762 /// .with_verbose(false)
1763 /// .with_analyze(false),
1764 /// )?
1765 /// .collect()
1766 /// .await?;
1767 /// # Ok(())
1768 /// # }
1769 /// ```
1770 pub fn explain_with_options(
1771 self,
1772 explain_option: ExplainOption,
1773 ) -> Result<DataFrame> {
1774 if matches!(self.plan, LogicalPlan::Explain(_)) {
1775 return plan_err!("Nested EXPLAINs are not supported");
1776 }
1777 let plan = LogicalPlanBuilder::from(self.plan)
1778 .explain_option_format(explain_option)?
1779 .build()?;
1780 Ok(DataFrame {
1781 session_state: self.session_state,
1782 plan,
1783 projection_requires_validation: self.projection_requires_validation,
1784 })
1785 }
1786
1787 /// Return a `FunctionRegistry` used to plan udf's calls
1788 ///
1789 /// # Example
1790 /// ```
1791 /// # use datafusion::prelude::*;
1792 /// # use datafusion::error::Result;
1793 /// # #[tokio::main]
1794 /// # async fn main() -> Result<()> {
1795 /// let ctx = SessionContext::new();
1796 /// let df = ctx
1797 /// .read_csv("tests/data/example.csv", CsvReadOptions::new())
1798 /// .await?;
1799 /// let f = df.registry();
1800 /// // use f.udf("name", vec![...]) to use the udf
1801 /// # Ok(())
1802 /// # }
1803 /// ```
1804 pub fn registry(&self) -> &dyn FunctionRegistry {
1805 self.session_state.as_ref()
1806 }
1807
1808 /// Calculate the intersection of two [`DataFrame`]s. The two [`DataFrame`]s must have exactly the same schema
1809 ///
1810 /// ```
1811 /// # use datafusion::prelude::*;
1812 /// # use datafusion::error::Result;
1813 /// # use datafusion_common::assert_batches_sorted_eq;
1814 /// # #[tokio::main]
1815 /// # async fn main() -> Result<()> {
1816 /// let ctx = SessionContext::new();
1817 /// let df = ctx
1818 /// .read_csv("tests/data/example.csv", CsvReadOptions::new())
1819 /// .await?;
1820 /// let d2 = ctx
1821 /// .read_csv("tests/data/example_long.csv", CsvReadOptions::new())
1822 /// .await?;
1823 /// let df = df.intersect(d2)?;
1824 /// let expected = vec![
1825 /// "+---+---+---+",
1826 /// "| a | b | c |",
1827 /// "+---+---+---+",
1828 /// "| 1 | 2 | 3 |",
1829 /// "+---+---+---+",
1830 /// ];
1831 /// # assert_batches_sorted_eq!(expected, &df.collect().await?);
1832 /// # Ok(())
1833 /// # }
1834 /// ```
1835 pub fn intersect(self, dataframe: DataFrame) -> Result<DataFrame> {
1836 let left_plan = self.plan;
1837 let right_plan = dataframe.plan;
1838 let plan = LogicalPlanBuilder::intersect(left_plan, right_plan, true)?;
1839 Ok(DataFrame {
1840 session_state: self.session_state,
1841 plan,
1842 projection_requires_validation: true,
1843 })
1844 }
1845
1846 /// Calculate the distinct intersection of two [`DataFrame`]s. The two [`DataFrame`]s must have exactly the same schema
1847 ///
1848 /// ```
1849 /// # use datafusion::prelude::*;
1850 /// # use datafusion::error::Result;
1851 /// # use datafusion_common::assert_batches_sorted_eq;
1852 /// # #[tokio::main]
1853 /// # async fn main() -> Result<()> {
1854 /// let ctx = SessionContext::new();
1855 /// let df = ctx
1856 /// .read_csv("tests/data/example.csv", CsvReadOptions::new())
1857 /// .await?;
1858 /// let d2 = ctx
1859 /// .read_csv("tests/data/example_long.csv", CsvReadOptions::new())
1860 /// .await?;
1861 /// let df = df.intersect_distinct(d2)?;
1862 /// let expected = vec![
1863 /// "+---+---+---+",
1864 /// "| a | b | c |",
1865 /// "+---+---+---+",
1866 /// "| 1 | 2 | 3 |",
1867 /// "+---+---+---+",
1868 /// ];
1869 /// # assert_batches_sorted_eq!(expected, &df.collect().await?);
1870 /// # Ok(())
1871 /// # }
1872 /// ```
1873 pub fn intersect_distinct(self, dataframe: DataFrame) -> Result<DataFrame> {
1874 let left_plan = self.plan;
1875 let right_plan = dataframe.plan;
1876 let plan = LogicalPlanBuilder::intersect(left_plan, right_plan, false)?;
1877 Ok(DataFrame {
1878 session_state: self.session_state,
1879 plan,
1880 projection_requires_validation: true,
1881 })
1882 }
1883
1884 /// Calculate the exception of two [`DataFrame`]s. The two [`DataFrame`]s must have exactly the same schema
1885 ///
1886 /// ```
1887 /// # use datafusion::prelude::*;
1888 /// # use datafusion::error::Result;
1889 /// # use datafusion_common::assert_batches_sorted_eq;
1890 /// # #[tokio::main]
1891 /// # async fn main() -> Result<()> {
1892 /// let ctx = SessionContext::new();
1893 /// let df = ctx
1894 /// .read_csv("tests/data/example_long.csv", CsvReadOptions::new())
1895 /// .await?;
1896 /// let d2 = ctx
1897 /// .read_csv("tests/data/example.csv", CsvReadOptions::new())
1898 /// .await?;
1899 /// let result = df.except(d2)?;
1900 /// // those columns are not in example.csv, but in example_long.csv
1901 /// let expected = vec![
1902 /// "+---+---+---+",
1903 /// "| a | b | c |",
1904 /// "+---+---+---+",
1905 /// "| 4 | 5 | 6 |",
1906 /// "| 7 | 8 | 9 |",
1907 /// "+---+---+---+",
1908 /// ];
1909 /// # assert_batches_sorted_eq!(expected, &result.collect().await?);
1910 /// # Ok(())
1911 /// # }
1912 /// ```
1913 pub fn except(self, dataframe: DataFrame) -> Result<DataFrame> {
1914 let left_plan = self.plan;
1915 let right_plan = dataframe.plan;
1916 let plan = LogicalPlanBuilder::except(left_plan, right_plan, true)?;
1917 Ok(DataFrame {
1918 session_state: self.session_state,
1919 plan,
1920 projection_requires_validation: true,
1921 })
1922 }
1923
1924 /// Calculate the distinct exception of two [`DataFrame`]s. The two [`DataFrame`]s must have exactly the same schema
1925 ///
1926 /// ```
1927 /// # use datafusion::prelude::*;
1928 /// # use datafusion::error::Result;
1929 /// # use datafusion_common::assert_batches_sorted_eq;
1930 /// # #[tokio::main]
1931 /// # async fn main() -> Result<()> {
1932 /// let ctx = SessionContext::new();
1933 /// let df = ctx
1934 /// .read_csv("tests/data/example_long.csv", CsvReadOptions::new())
1935 /// .await?;
1936 /// let d2 = ctx
1937 /// .read_csv("tests/data/example.csv", CsvReadOptions::new())
1938 /// .await?;
1939 /// let result = df.except_distinct(d2)?;
1940 /// // those columns are not in example.csv, but in example_long.csv
1941 /// let expected = vec![
1942 /// "+---+---+---+",
1943 /// "| a | b | c |",
1944 /// "+---+---+---+",
1945 /// "| 4 | 5 | 6 |",
1946 /// "| 7 | 8 | 9 |",
1947 /// "+---+---+---+",
1948 /// ];
1949 /// # assert_batches_sorted_eq!(expected, &result.collect().await?);
1950 /// # Ok(())
1951 /// # }
1952 /// ```
1953 pub fn except_distinct(self, dataframe: DataFrame) -> Result<DataFrame> {
1954 let left_plan = self.plan;
1955 let right_plan = dataframe.plan;
1956 let plan = LogicalPlanBuilder::except(left_plan, right_plan, false)?;
1957 Ok(DataFrame {
1958 session_state: self.session_state,
1959 plan,
1960 projection_requires_validation: true,
1961 })
1962 }
1963
1964 /// Execute this `DataFrame` and write the results to `table_name`.
1965 ///
1966 /// Returns a single [RecordBatch] containing a single column and
1967 /// row representing the count of total rows written.
1968 ///
1969 /// Unlike most other `DataFrame` methods, this method executes eagerly.
1970 /// Data is written to the table using the [`TableProvider::insert_into`]
1971 /// method. This is the same underlying implementation used by SQL `INSERT
1972 /// INTO` statements.
1973 pub async fn write_table(
1974 self,
1975 table_name: &str,
1976 write_options: DataFrameWriteOptions,
1977 ) -> Result<Vec<RecordBatch>, DataFusionError> {
1978 let plan = if write_options.sort_by.is_empty() {
1979 self.plan
1980 } else {
1981 LogicalPlanBuilder::from(self.plan)
1982 .sort(write_options.sort_by)?
1983 .build()?
1984 };
1985
1986 let table_ref: TableReference = table_name.into();
1987 let table_schema = self.session_state.schema_for_ref(table_ref.clone())?;
1988 let target = match table_schema.table(table_ref.table()).await? {
1989 Some(ref provider) => Ok(Arc::clone(provider)),
1990 _ => plan_err!("No table named '{table_name}'"),
1991 }?;
1992
1993 let target = Arc::new(DefaultTableSource::new(target));
1994
1995 let plan = LogicalPlanBuilder::insert_into(
1996 plan,
1997 table_ref,
1998 target,
1999 write_options.insert_op,
2000 )?
2001 .build()?;
2002
2003 DataFrame {
2004 session_state: self.session_state,
2005 plan,
2006 projection_requires_validation: self.projection_requires_validation,
2007 }
2008 .collect()
2009 .await
2010 }
2011
2012 /// Execute the `DataFrame` and write the results to CSV file(s).
2013 ///
2014 /// # Example
2015 /// ```
2016 /// # use datafusion::prelude::*;
2017 /// # use datafusion::error::Result;
2018 /// # use std::fs;
2019 /// # #[tokio::main]
2020 /// # async fn main() -> Result<()> {
2021 /// use datafusion::dataframe::DataFrameWriteOptions;
2022 /// let ctx = SessionContext::new();
2023 /// // Sort the data by column "b" and write it to a new location
2024 /// ctx.read_csv("tests/data/example.csv", CsvReadOptions::new())
2025 /// .await?
2026 /// .sort(vec![col("b").sort(true, true)])? // sort by b asc, nulls first
2027 /// .write_csv(
2028 /// "output.csv",
2029 /// DataFrameWriteOptions::new(),
2030 /// None, // can also specify CSV writing options here
2031 /// )
2032 /// .await?;
2033 /// # fs::remove_file("output.csv")?;
2034 /// # Ok(())
2035 /// # }
2036 /// ```
2037 pub async fn write_csv(
2038 self,
2039 path: &str,
2040 options: DataFrameWriteOptions,
2041 writer_options: Option<CsvOptions>,
2042 ) -> Result<Vec<RecordBatch>, DataFusionError> {
2043 if options.insert_op != InsertOp::Append {
2044 return not_impl_err!(
2045 "{} is not implemented for DataFrame::write_csv.",
2046 options.insert_op
2047 );
2048 }
2049
2050 let format = if let Some(csv_opts) = writer_options {
2051 Arc::new(CsvFormatFactory::new_with_options(csv_opts))
2052 } else {
2053 Arc::new(CsvFormatFactory::new())
2054 };
2055
2056 let file_type = format_as_file_type(format);
2057
2058 let copy_options = options.build_sink_options();
2059
2060 let plan = if options.sort_by.is_empty() {
2061 self.plan
2062 } else {
2063 LogicalPlanBuilder::from(self.plan)
2064 .sort(options.sort_by)?
2065 .build()?
2066 };
2067
2068 let plan = LogicalPlanBuilder::copy_to(
2069 plan,
2070 path.into(),
2071 file_type,
2072 copy_options,
2073 options.partition_by,
2074 )?
2075 .build()?;
2076
2077 DataFrame {
2078 session_state: self.session_state,
2079 plan,
2080 projection_requires_validation: self.projection_requires_validation,
2081 }
2082 .collect()
2083 .await
2084 }
2085
2086 /// Execute the `DataFrame` and write the results to JSON file(s).
2087 ///
2088 /// # Example
2089 /// ```
2090 /// # use datafusion::prelude::*;
2091 /// # use datafusion::error::Result;
2092 /// # use std::fs;
2093 /// # #[tokio::main]
2094 /// # async fn main() -> Result<()> {
2095 /// use datafusion::dataframe::DataFrameWriteOptions;
2096 /// let ctx = SessionContext::new();
2097 /// // Sort the data by column "b" and write it to a new location
2098 /// ctx.read_csv("tests/data/example.csv", CsvReadOptions::new())
2099 /// .await?
2100 /// .sort(vec![col("b").sort(true, true)])? // sort by b asc, nulls first
2101 /// .write_json("output.json", DataFrameWriteOptions::new(), None)
2102 /// .await?;
2103 /// # fs::remove_file("output.json")?;
2104 /// # Ok(())
2105 /// # }
2106 /// ```
2107 pub async fn write_json(
2108 self,
2109 path: &str,
2110 options: DataFrameWriteOptions,
2111 writer_options: Option<JsonOptions>,
2112 ) -> Result<Vec<RecordBatch>, DataFusionError> {
2113 if options.insert_op != InsertOp::Append {
2114 return not_impl_err!(
2115 "{} is not implemented for DataFrame::write_json.",
2116 options.insert_op
2117 );
2118 }
2119
2120 let format = if let Some(json_opts) = writer_options {
2121 Arc::new(JsonFormatFactory::new_with_options(json_opts))
2122 } else {
2123 Arc::new(JsonFormatFactory::new())
2124 };
2125
2126 let file_type = format_as_file_type(format);
2127
2128 let copy_options = options.build_sink_options();
2129
2130 let plan = if options.sort_by.is_empty() {
2131 self.plan
2132 } else {
2133 LogicalPlanBuilder::from(self.plan)
2134 .sort(options.sort_by)?
2135 .build()?
2136 };
2137
2138 let plan = LogicalPlanBuilder::copy_to(
2139 plan,
2140 path.into(),
2141 file_type,
2142 copy_options,
2143 options.partition_by,
2144 )?
2145 .build()?;
2146
2147 DataFrame {
2148 session_state: self.session_state,
2149 plan,
2150 projection_requires_validation: self.projection_requires_validation,
2151 }
2152 .collect()
2153 .await
2154 }
2155
2156 /// Add or replace a column in the DataFrame.
2157 ///
2158 /// # Example
2159 /// ```
2160 /// # use datafusion::prelude::*;
2161 /// # use datafusion::error::Result;
2162 /// # #[tokio::main]
2163 /// # async fn main() -> Result<()> {
2164 /// let ctx = SessionContext::new();
2165 /// let df = ctx
2166 /// .read_csv("tests/data/example.csv", CsvReadOptions::new())
2167 /// .await?;
2168 /// let df = df.with_column("ab_sum", col("a") + col("b"))?;
2169 /// # Ok(())
2170 /// # }
2171 /// ```
2172 pub fn with_column(self, name: &str, expr: Expr) -> Result<DataFrame> {
2173 let window_func_exprs = find_window_exprs([&expr]);
2174
2175 let original_names: HashSet<String> = self
2176 .plan
2177 .schema()
2178 .iter()
2179 .map(|(_, f)| f.name().clone())
2180 .collect();
2181
2182 // Maybe build window plan
2183 let plan = if window_func_exprs.is_empty() {
2184 self.plan
2185 } else {
2186 LogicalPlanBuilder::window_plan(self.plan, window_func_exprs)?
2187 };
2188
2189 let new_column = expr.alias(name);
2190 let mut col_exists = false;
2191
2192 let mut fields: Vec<(Expr, bool)> = plan
2193 .schema()
2194 .iter()
2195 .filter_map(|(qualifier, field)| {
2196 // Skip new fields introduced by window_plan
2197 if !original_names.contains(field.name()) {
2198 return None;
2199 }
2200
2201 if field.name() == name {
2202 col_exists = true;
2203 Some((new_column.clone(), true))
2204 } else {
2205 let e = col(Column::from((qualifier, field)));
2206 Some((e, self.projection_requires_validation))
2207 }
2208 })
2209 .collect();
2210
2211 if !col_exists {
2212 fields.push((new_column, true));
2213 }
2214
2215 let project_plan = LogicalPlanBuilder::from(plan)
2216 .project_with_validation(fields)?
2217 .build()?;
2218
2219 Ok(DataFrame {
2220 session_state: self.session_state,
2221 plan: project_plan,
2222 projection_requires_validation: false,
2223 })
2224 }
2225
2226 /// Rename one column by applying a new projection. This is a no-op if the column to be
2227 /// renamed does not exist.
2228 ///
2229 /// The method supports case sensitive rename with wrapping column name into one of following symbols ( " or ' or ` )
2230 ///
2231 /// Alternatively setting DataFusion param `datafusion.sql_parser.enable_ident_normalization` to `false` will enable
2232 /// case sensitive rename without need to wrap column name into special symbols
2233 ///
2234 /// # Example
2235 /// ```
2236 /// # use datafusion::prelude::*;
2237 /// # use datafusion::error::Result;
2238 /// # #[tokio::main]
2239 /// # async fn main() -> Result<()> {
2240 /// let ctx = SessionContext::new();
2241 /// let df = ctx
2242 /// .read_csv("tests/data/example.csv", CsvReadOptions::new())
2243 /// .await?;
2244 /// let df = df.with_column_renamed("ab_sum", "total")?;
2245 ///
2246 /// # Ok(())
2247 /// # }
2248 /// ```
2249 pub fn with_column_renamed(
2250 self,
2251 old_name: impl Into<String>,
2252 new_name: &str,
2253 ) -> Result<DataFrame> {
2254 let ident_opts = self
2255 .session_state
2256 .config_options()
2257 .sql_parser
2258 .enable_ident_normalization;
2259 let old_column: Column = if ident_opts {
2260 Column::from_qualified_name(old_name)
2261 } else {
2262 Column::from_qualified_name_ignore_case(old_name)
2263 };
2264
2265 let (qualifier_rename, field_rename) =
2266 match self.plan.schema().qualified_field_from_column(&old_column) {
2267 Ok(qualifier_and_field) => qualifier_and_field,
2268 // no-op if field not found
2269 Err(DataFusionError::SchemaError(e, _))
2270 if matches!(*e, SchemaError::FieldNotFound { .. }) =>
2271 {
2272 return Ok(self);
2273 }
2274 Err(err) => return Err(err),
2275 };
2276 let projection = self
2277 .plan
2278 .schema()
2279 .iter()
2280 .map(|(qualifier, field)| {
2281 if qualifier.eq(&qualifier_rename) && field == field_rename {
2282 (
2283 col(Column::from((qualifier, field)))
2284 .alias_qualified(qualifier.cloned(), new_name),
2285 false,
2286 )
2287 } else {
2288 (col(Column::from((qualifier, field))), false)
2289 }
2290 })
2291 .collect::<Vec<_>>();
2292 let project_plan = LogicalPlanBuilder::from(self.plan)
2293 .project_with_validation(projection)?
2294 .build()?;
2295 Ok(DataFrame {
2296 session_state: self.session_state,
2297 plan: project_plan,
2298 projection_requires_validation: false,
2299 })
2300 }
2301
2302 /// Replace all parameters in logical plan with the specified
2303 /// values, in preparation for execution.
2304 ///
2305 /// # Example
2306 ///
2307 /// ```
2308 /// use datafusion::prelude::*;
2309 /// # use datafusion::{error::Result, assert_batches_eq};
2310 /// # #[tokio::main]
2311 /// # async fn main() -> Result<()> {
2312 /// # use datafusion_common::ScalarValue;
2313 /// let ctx = SessionContext::new();
2314 /// # ctx.register_csv("example", "tests/data/example.csv", CsvReadOptions::new()).await?;
2315 /// let results = ctx
2316 /// .sql("SELECT a FROM example WHERE b = $1")
2317 /// .await?
2318 /// // replace $1 with value 2
2319 /// .with_param_values(vec![
2320 /// // value at index 0 --> $1
2321 /// ScalarValue::from(2i64)
2322 /// ])?
2323 /// .collect()
2324 /// .await?;
2325 /// assert_batches_eq!(
2326 /// &[
2327 /// "+---+",
2328 /// "| a |",
2329 /// "+---+",
2330 /// "| 1 |",
2331 /// "+---+",
2332 /// ],
2333 /// &results
2334 /// );
2335 /// // Note you can also provide named parameters
2336 /// let results = ctx
2337 /// .sql("SELECT a FROM example WHERE b = $my_param")
2338 /// .await?
2339 /// // replace $my_param with value 2
2340 /// // Note you can also use a HashMap as well
2341 /// .with_param_values(vec![
2342 /// ("my_param", ScalarValue::from(2i64))
2343 /// ])?
2344 /// .collect()
2345 /// .await?;
2346 /// assert_batches_eq!(
2347 /// &[
2348 /// "+---+",
2349 /// "| a |",
2350 /// "+---+",
2351 /// "| 1 |",
2352 /// "+---+",
2353 /// ],
2354 /// &results
2355 /// );
2356 /// # Ok(())
2357 /// # }
2358 /// ```
2359 pub fn with_param_values(self, query_values: impl Into<ParamValues>) -> Result<Self> {
2360 let plan = self.plan.with_param_values(query_values)?;
2361 Ok(DataFrame {
2362 session_state: self.session_state,
2363 plan,
2364 projection_requires_validation: self.projection_requires_validation,
2365 })
2366 }
2367
    /// Cache DataFrame as a memory table.
    ///
    /// Default behavior could be changed using
    /// a [`crate::execution::session_state::CacheFactory`]
    /// configured via [`SessionState`].
    ///
    /// Without a cache factory, the plan is executed immediately, the
    /// resulting partitions are stored in a [`MemTable`], and the returned
    /// `DataFrame` reads from that table.
    ///
    /// # Errors
    /// Propagates errors from the cache factory, or from planning, executing,
    /// and materializing the `DataFrame`.
    ///
    /// ```
    /// # use datafusion::prelude::*;
    /// # use datafusion::error::Result;
    /// # #[tokio::main]
    /// # async fn main() -> Result<()> {
    /// let ctx = SessionContext::new();
    /// let df = ctx
    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
    ///     .await?;
    /// let df = df.cache().await?;
    /// # Ok(())
    /// # }
    /// ```
    pub async fn cache(self) -> Result<DataFrame> {
        if let Some(cache_factory) = self.session_state.cache_factory() {
            // A configured factory decides how the plan is cached; this
            // branch itself executes nothing.
            let new_plan =
                cache_factory.create(self.plan, self.session_state.as_ref())?;
            Ok(Self::new(*self.session_state, new_plan))
        } else {
            // Default path: run the query now and keep the batches in memory.
            let context = SessionContext::new_with_state((*self.session_state).clone());
            // The schema is consistent with the output
            let plan = self.clone().create_physical_plan().await?;
            let schema = plan.schema();
            let task_ctx = Arc::new(self.task_ctx());
            let partitions = collect_partitioned(plan, task_ctx).await?;
            let mem_table = MemTable::try_new(schema, partitions)?;
            // The new DataFrame scans the in-memory table.
            context.read_table(Arc::new(mem_table))
        }
    }
2403
2404 /// Apply an alias to the DataFrame.
2405 ///
2406 /// This method replaces the qualifiers of output columns with the given alias.
2407 pub fn alias(self, alias: &str) -> Result<DataFrame> {
2408 let plan = LogicalPlanBuilder::from(self.plan).alias(alias)?.build()?;
2409 Ok(DataFrame {
2410 session_state: self.session_state,
2411 plan,
2412 projection_requires_validation: self.projection_requires_validation,
2413 })
2414 }
2415
2416 /// Fill null values in specified columns with a given value
2417 /// If no columns are specified (empty vector), applies to all columns
2418 /// Only fills if the value can be cast to the column's type
2419 ///
2420 /// # Arguments
2421 /// * `value` - Value to fill nulls with
2422 /// * `columns` - List of column names to fill. If empty, fills all columns.
2423 ///
2424 /// # Example
2425 /// ```
2426 /// # use datafusion::prelude::*;
2427 /// # use datafusion::error::Result;
2428 /// # use datafusion_common::ScalarValue;
2429 /// # #[tokio::main]
2430 /// # async fn main() -> Result<()> {
2431 /// let ctx = SessionContext::new();
2432 /// let df = ctx
2433 /// .read_csv("tests/data/example.csv", CsvReadOptions::new())
2434 /// .await?;
2435 /// // Fill nulls in only columns "a" and "c":
2436 /// let df = df.fill_null(ScalarValue::from(0), vec!["a".to_owned(), "c".to_owned()])?;
2437 /// // Fill nulls across all columns:
2438 /// let df = df.fill_null(ScalarValue::from(0), vec![])?;
2439 /// # Ok(())
2440 /// # }
2441 /// ```
2442 #[expect(clippy::needless_pass_by_value)]
2443 pub fn fill_null(
2444 &self,
2445 value: ScalarValue,
2446 columns: Vec<String>,
2447 ) -> Result<DataFrame> {
2448 let cols = if columns.is_empty() {
2449 self.logical_plan()
2450 .schema()
2451 .fields()
2452 .iter()
2453 .map(Arc::clone)
2454 .collect()
2455 } else {
2456 self.find_columns(&columns)?
2457 };
2458
2459 // Create projections for each column
2460 let projections = self
2461 .logical_plan()
2462 .schema()
2463 .fields()
2464 .iter()
2465 .map(|field| {
2466 if cols.contains(field) {
2467 // Try to cast fill value to column type. If the cast fails, fallback to the original column.
2468 match value.clone().cast_to(field.data_type()) {
2469 Ok(fill_value) => Expr::Alias(Alias {
2470 expr: Box::new(Expr::ScalarFunction(ScalarFunction {
2471 func: coalesce(),
2472 args: vec![col(field.name()), lit(fill_value)],
2473 })),
2474 relation: None,
2475 name: field.name().to_string(),
2476 metadata: None,
2477 }),
2478 Err(_) => col(field.name()),
2479 }
2480 } else {
2481 col(field.name())
2482 }
2483 })
2484 .collect::<Vec<_>>();
2485
2486 self.clone().select(projections)
2487 }
2488
2489 // Helper to find columns from names
2490 fn find_columns(&self, names: &[String]) -> Result<Vec<FieldRef>> {
2491 let schema = self.logical_plan().schema();
2492 names
2493 .iter()
2494 .map(|name| {
2495 schema
2496 .field_with_name(None, name)
2497 .cloned()
2498 .map_err(|_| plan_datafusion_err!("Column '{}' not found", name))
2499 })
2500 .collect()
2501 }
2502
2503 /// Find qualified columns for this dataframe from names
2504 ///
2505 /// # Arguments
2506 /// * `names` - Unqualified names to find.
2507 ///
2508 /// # Example
2509 /// ```
2510 /// # use datafusion::prelude::*;
2511 /// # use datafusion::error::Result;
2512 /// # use datafusion_common::ScalarValue;
2513 /// # #[tokio::main]
2514 /// # async fn main() -> Result<()> {
2515 /// let ctx = SessionContext::new();
2516 /// ctx.register_csv("first_table", "tests/data/example.csv", CsvReadOptions::new())
2517 /// .await?;
2518 /// let df = ctx.table("first_table").await?;
2519 /// ctx.register_csv("second_table", "tests/data/example.csv", CsvReadOptions::new())
2520 /// .await?;
2521 /// let df2 = ctx.table("second_table").await?;
2522 /// let join_expr = df.find_qualified_columns(&["a"])?.iter()
2523 /// .zip(df2.find_qualified_columns(&["a"])?.iter())
2524 /// .map(|(col1, col2)| col(*col1).eq(col(*col2)))
2525 /// .collect::<Vec<Expr>>();
2526 /// let df3 = df.join_on(df2, JoinType::Inner, join_expr)?;
2527 /// # Ok(())
2528 /// # }
2529 /// ```
2530 pub fn find_qualified_columns(
2531 &self,
2532 names: &[&str],
2533 ) -> Result<Vec<(Option<&TableReference>, &FieldRef)>> {
2534 let schema = self.logical_plan().schema();
2535 names
2536 .iter()
2537 .map(|name| {
2538 schema
2539 .qualified_field_from_column(&Column::from_name(*name))
2540 .map_err(|_| plan_datafusion_err!("Column '{}' not found", name))
2541 })
2542 .collect()
2543 }
2544
2545 /// Helper for creating DataFrame.
2546 /// # Example
2547 /// ```
2548 /// use arrow::array::{ArrayRef, Int32Array, StringArray};
2549 /// use datafusion::prelude::DataFrame;
2550 /// use std::sync::Arc;
2551 /// let id: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));
2552 /// let name: ArrayRef = Arc::new(StringArray::from(vec!["foo", "bar", "baz"]));
2553 /// let df = DataFrame::from_columns(vec![("id", id), ("name", name)]).unwrap();
2554 /// // +----+------+,
2555 /// // | id | name |,
2556 /// // +----+------+,
2557 /// // | 1 | foo |,
2558 /// // | 2 | bar |,
2559 /// // | 3 | baz |,
2560 /// // +----+------+,
2561 /// ```
2562 pub fn from_columns(columns: Vec<(&str, ArrayRef)>) -> Result<Self> {
2563 let fields = columns
2564 .iter()
2565 .map(|(name, array)| Field::new(*name, array.data_type().clone(), true))
2566 .collect::<Vec<_>>();
2567
2568 let arrays = columns
2569 .into_iter()
2570 .map(|(_, array)| array)
2571 .collect::<Vec<_>>();
2572
2573 let schema = Arc::new(Schema::new(fields));
2574 let batch = RecordBatch::try_new(schema, arrays)?;
2575 let ctx = SessionContext::new();
2576 let df = ctx.read_batch(batch)?;
2577 Ok(df)
2578 }
2579}
2580
/// Macro for creating DataFrame.
///
/// Called with no arguments it yields an empty `DataFrame` (no columns, no
/// rows); called with `name => data` pairs it converts each `data` into an
/// array and forwards the columns to `DataFrame::from_columns`. Both forms
/// evaluate to a `Result`.
///
/// All paths inside the macro are anchored with `$crate` / `::std`, so the
/// macro keeps working when the crate is imported under a different name.
///
/// # Example
/// ```
/// use datafusion::prelude::dataframe;
/// # use datafusion::error::Result;
/// # #[tokio::main]
/// # async fn main() -> Result<()> {
/// let df = dataframe!(
///    "id" => [1, 2, 3],
///    "name" => ["foo", "bar", "baz"]
///  )?;
/// df.show().await?;
/// // +----+------+,
/// // | id | name |,
/// // +----+------+,
/// // | 1  | foo  |,
/// // | 2  | bar  |,
/// // | 3  | baz  |,
/// // +----+------+,
/// let df_empty = dataframe!()?; // empty DataFrame
/// assert_eq!(df_empty.schema().fields().len(), 0);
/// assert_eq!(df_empty.count().await?, 0);
/// # Ok(())
/// # }
/// ```
#[macro_export]
macro_rules! dataframe {
    // No columns: read an empty batch with an empty schema.
    () => {{
        use ::std::sync::Arc;

        use $crate::prelude::SessionContext;
        use $crate::arrow::array::RecordBatch;
        use $crate::arrow::datatypes::Schema;

        let ctx = SessionContext::new();
        let batch = RecordBatch::new_empty(Arc::new(Schema::empty()));
        ctx.read_batch(batch)
    }};

    // One or more `name => data` pairs, with an optional trailing comma.
    ($($name:expr => $data:expr),+ $(,)?) => {{
        use $crate::prelude::DataFrame;
        use $crate::common::test_util::IntoArrayRef;

        let columns = vec![
            $(
                ($name, $data.into_array_ref()),
            )+
        ];

        DataFrame::from_columns(columns)
    }};
}
2633
/// [`TableProvider`] backed by a logical plan, letting the plan be scanned
/// like a table.
#[derive(Debug)]
struct DataFrameTableProvider {
    /// Logical plan that produces the table's rows; its schema is the
    /// table's schema.
    plan: LogicalPlan,
    /// Table type reported by [`TableProvider::table_type`].
    table_type: TableType,
}
2639
2640#[async_trait]
2641impl TableProvider for DataFrameTableProvider {
2642 fn as_any(&self) -> &dyn Any {
2643 self
2644 }
2645
2646 fn get_logical_plan(&self) -> Option<Cow<'_, LogicalPlan>> {
2647 Some(Cow::Borrowed(&self.plan))
2648 }
2649
2650 fn supports_filters_pushdown(
2651 &self,
2652 filters: &[&Expr],
2653 ) -> Result<Vec<TableProviderFilterPushDown>> {
2654 // A filter is added on the DataFrame when given
2655 Ok(vec![TableProviderFilterPushDown::Exact; filters.len()])
2656 }
2657
2658 fn schema(&self) -> SchemaRef {
2659 Arc::clone(self.plan.schema().inner())
2660 }
2661
2662 fn table_type(&self) -> TableType {
2663 self.table_type
2664 }
2665
2666 async fn scan(
2667 &self,
2668 state: &dyn Session,
2669 projection: Option<&Vec<usize>>,
2670 filters: &[Expr],
2671 limit: Option<usize>,
2672 ) -> Result<Arc<dyn ExecutionPlan>> {
2673 let mut expr = LogicalPlanBuilder::from(self.plan.clone());
2674 // Add filter when given
2675 let filter = filters.iter().cloned().reduce(|acc, new| acc.and(new));
2676 if let Some(filter) = filter {
2677 expr = expr.filter(filter)?
2678 }
2679
2680 if let Some(p) = projection {
2681 expr = expr.select(p.iter().copied())?
2682 }
2683
2684 // add a limit if given
2685 if let Some(l) = limit {
2686 expr = expr.limit(0, Some(l))?
2687 }
2688 let plan = expr.build()?;
2689 state.create_physical_plan(&plan).await
2690 }
2691}
2692
2693// see tests in datafusion/core/tests/dataframe/mod.rs:2816