datafusion/dataframe/
mod.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! [`DataFrame`] API for building and executing query plans.
19
20#[cfg(feature = "parquet")]
21mod parquet;
22
23use crate::arrow::record_batch::RecordBatch;
24use crate::arrow::util::pretty;
25use crate::datasource::file_format::csv::CsvFormatFactory;
26use crate::datasource::file_format::format_as_file_type;
27use crate::datasource::file_format::json::JsonFormatFactory;
28use crate::datasource::{
29    DefaultTableSource, MemTable, TableProvider, provider_as_source,
30};
31use crate::error::Result;
32use crate::execution::FunctionRegistry;
33use crate::execution::context::{SessionState, TaskContext};
34use crate::logical_expr::utils::find_window_exprs;
35use crate::logical_expr::{
36    Expr, JoinType, LogicalPlan, LogicalPlanBuilder, LogicalPlanBuilderOptions,
37    Partitioning, TableType, col, ident,
38};
39use crate::physical_plan::{
40    ExecutionPlan, SendableRecordBatchStream, collect, collect_partitioned,
41    execute_stream, execute_stream_partitioned,
42};
43use crate::prelude::SessionContext;
44use std::any::Any;
45use std::borrow::Cow;
46use std::collections::{HashMap, HashSet};
47use std::sync::Arc;
48
49use arrow::array::{Array, ArrayRef, Int64Array, StringArray};
50use arrow::compute::{cast, concat};
51use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
52use arrow_schema::FieldRef;
53use datafusion_common::config::{CsvOptions, JsonOptions};
54use datafusion_common::{
55    Column, DFSchema, DataFusionError, ParamValues, ScalarValue, SchemaError,
56    TableReference, UnnestOptions, exec_err, internal_datafusion_err, not_impl_err,
57    plan_datafusion_err, plan_err, unqualified_field_not_found,
58};
59use datafusion_expr::select_expr::SelectExpr;
60use datafusion_expr::{
61    ExplainOption, SortExpr, TableProviderFilterPushDown, UNNAMED_TABLE, case,
62    dml::InsertOp,
63    expr::{Alias, ScalarFunction},
64    is_null, lit,
65    utils::COUNT_STAR_EXPANSION,
66};
67use datafusion_functions::core::coalesce;
68use datafusion_functions_aggregate::expr_fn::{
69    avg, count, max, median, min, stddev, sum,
70};
71
72use async_trait::async_trait;
73use datafusion_catalog::Session;
74
/// Options that control how the contents of a `DataFrame` are written out.
///
/// All fields are private; values are set through the `with_*` builder
/// methods defined on this type.
pub struct DataFrameWriteOptions {
    /// Controls how new data should be written to the table, determining whether
    /// to append, overwrite, or replace existing data.
    insert_op: InsertOp,
    /// Controls if all partitions should be coalesced into a single output file.
    /// - `None`: Use automatic mode (extension-based heuristic)
    /// - `Some(true)`: Force single file output at exact path
    /// - `Some(false)`: Force directory output with generated filenames
    single_file_output: Option<bool>,
    /// Names of the columns used for hive-style partitioned writes.
    /// An empty `vec![]` means the write is not partitioned.
    partition_by: Vec<String>,
    /// Sort expressions that define the ordering of the written output.
    /// An empty `vec![]` means the output is not sorted.
    sort_by: Vec<SortExpr>,
}
93
94impl DataFrameWriteOptions {
95    /// Create a new DataFrameWriteOptions with default values
96    pub fn new() -> Self {
97        DataFrameWriteOptions {
98            insert_op: InsertOp::Append,
99            single_file_output: None,
100            partition_by: vec![],
101            sort_by: vec![],
102        }
103    }
104
105    /// Set the insert operation
106    pub fn with_insert_operation(mut self, insert_op: InsertOp) -> Self {
107        self.insert_op = insert_op;
108        self
109    }
110
111    /// Set the single_file_output value to true or false
112    ///
113    /// - `true`: Force single file output at the exact path specified
114    /// - `false`: Force directory output with generated filenames
115    ///
116    /// When not called, automatic mode is used (extension-based heuristic).
117    /// When set to true, an output file will always be created even if the DataFrame is empty.
118    pub fn with_single_file_output(mut self, single_file_output: bool) -> Self {
119        self.single_file_output = Some(single_file_output);
120        self
121    }
122
123    /// Sets the partition_by columns for output partitioning
124    pub fn with_partition_by(mut self, partition_by: Vec<String>) -> Self {
125        self.partition_by = partition_by;
126        self
127    }
128
129    /// Sets the sort_by columns for output sorting
130    pub fn with_sort_by(mut self, sort_by: Vec<SortExpr>) -> Self {
131        self.sort_by = sort_by;
132        self
133    }
134
135    /// Build the options HashMap to pass to CopyTo for sink configuration.
136    fn build_sink_options(&self) -> HashMap<String, String> {
137        let mut options = HashMap::new();
138        if let Some(single_file) = self.single_file_output {
139            options.insert("single_file_output".to_string(), single_file.to_string());
140        }
141        options
142    }
143}
144
145impl Default for DataFrameWriteOptions {
146    fn default() -> Self {
147        Self::new()
148    }
149}
150
151/// Represents a logical set of rows with the same named columns.
152///
153/// Similar to a [Pandas DataFrame] or [Spark DataFrame], a DataFusion DataFrame
154/// represents a 2 dimensional table of rows and columns.
155///
156/// The typical workflow using DataFrames looks like
157///
158/// 1. Create a DataFrame via methods on [SessionContext], such as [`read_csv`]
159///    and [`read_parquet`].
160///
161/// 2. Build a desired calculation by calling methods such as [`filter`],
162///    [`select`], [`aggregate`], and [`limit`]
163///
164/// 3. Execute into [`RecordBatch`]es by calling [`collect`]
165///
166/// A `DataFrame` is a wrapper around a [`LogicalPlan`] and the [`SessionState`]
167///    required for execution.
168///
169/// DataFrames are "lazy" in the sense that most methods do not actually compute
170/// anything, they just build up a plan. Calling [`collect`] executes the plan
171/// using the same DataFusion planning and execution process used to execute SQL
172/// and other queries.
173///
174/// [Pandas DataFrame]: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
175/// [Spark DataFrame]: https://spark.apache.org/docs/latest/sql-programming-guide.html
176/// [`read_csv`]: SessionContext::read_csv
177/// [`read_parquet`]: SessionContext::read_parquet
178/// [`filter`]: DataFrame::filter
179/// [`select`]: DataFrame::select
180/// [`aggregate`]: DataFrame::aggregate
181/// [`limit`]: DataFrame::limit
182/// [`collect`]: DataFrame::collect
183///
184/// # Example
185/// ```
186/// # use std::sync::Arc;
187/// # use datafusion::prelude::*;
188/// # use datafusion::error::Result;
189/// # use datafusion::functions_aggregate::expr_fn::min;
190/// # use datafusion::arrow::array::{Int32Array, RecordBatch, StringArray};
191/// # use datafusion::arrow::datatypes::{DataType, Field, Schema};
192/// # #[tokio::main]
193/// # async fn main() -> Result<()> {
194/// let ctx = SessionContext::new();
195/// // Read the data from a csv file
196/// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
197/// // create a new dataframe that computes the equivalent of
198/// // `SELECT a, MIN(b) FROM df WHERE a <= b GROUP BY a LIMIT 100;`
199/// let df = df.filter(col("a").lt_eq(col("b")))?
200///            .aggregate(vec![col("a")], vec![min(col("b"))])?
201///            .limit(0, Some(100))?;
202/// // Perform the actual computation
/// let results = df.collect().await?;
204///
205/// // Create a new dataframe with in-memory data
206/// let schema = Schema::new(vec![
207///     Field::new("id", DataType::Int32, true),
208///     Field::new("name", DataType::Utf8, true),
209/// ]);
210/// let batch = RecordBatch::try_new(
211///     Arc::new(schema),
212///     vec![
213///         Arc::new(Int32Array::from(vec![1, 2, 3])),
214///         Arc::new(StringArray::from(vec!["foo", "bar", "baz"])),
215///     ],
216/// )?;
217/// let df = ctx.read_batch(batch)?;
218/// df.show().await?;
219///
220/// // Create a new dataframe with in-memory data using macro
221/// let df = dataframe!(
222///     "id" => [1, 2, 3],
223///     "name" => ["foo", "bar", "baz"]
224///  )?;
225/// df.show().await?;
226/// # Ok(())
227/// # }
228/// ```
#[derive(Debug, Clone)]
pub struct DataFrame {
    // Box the (large) SessionState to reduce the size of DataFrame on the stack
    session_state: Box<SessionState>,
    // The logical plan that this DataFrame wraps.
    plan: LogicalPlan,
    // Whether the next projection must run full validation.
    //
    // When this flag is `false`, the `with_column` and `with_column_renamed`
    // functions can skip the recursive work required to columnize and
    // normalize expressions. Since these function calls are often chained or
    // called many times in dataframe operations, skipping that work can
    // result in a significant performance gain.
    //
    // The flag may be set to `false` only when the dataframe function call
    // results in the last operation being a
    // `LogicalPlanBuilder::from(plan).project(fields)?.build()` or
    // `LogicalPlanBuilder::from(plan).project_with_validation(fields)?.build()`
    // call. This requirement guarantees that the plan has had all columnization
    // and normalization applied to existing expressions, so only new expressions
    // will require that work. Any operation that updates the plan in any way
    // other than through a `project` call should set this to `true`.
    projection_requires_validation: bool,
}
251
252impl DataFrame {
253    /// Create a new `DataFrame ` based on an existing `LogicalPlan`
254    ///
255    /// This is a low-level method and is not typically used by end users. See
256    /// [`SessionContext::read_csv`] and other methods for creating a
257    /// `DataFrame` from an existing datasource.
258    pub fn new(session_state: SessionState, plan: LogicalPlan) -> Self {
259        Self {
260            session_state: Box::new(session_state),
261            plan,
262            projection_requires_validation: true,
263        }
264    }
265
266    /// Creates logical expression from a SQL query text.
267    /// The expression is created and processed against the current schema.
268    ///
269    /// # Example: Parsing SQL queries
270    /// ```
271    /// # use arrow::datatypes::{DataType, Field, Schema};
272    /// # use datafusion::prelude::*;
273    /// # use datafusion_common::{DFSchema, Result};
274    /// # #[tokio::main]
275    /// # async fn main() -> Result<()> {
276    /// // datafusion will parse number as i64 first.
277    /// let sql = "a > 1 and b in (1, 10)";
278    /// let expected = col("a")
279    ///     .gt(lit(1 as i64))
280    ///     .and(col("b").in_list(vec![lit(1 as i64), lit(10 as i64)], false));
281    /// let ctx = SessionContext::new();
282    /// let df = ctx
283    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
284    ///     .await?;
285    /// let expr = df.parse_sql_expr(sql)?;
286    /// assert_eq!(expected, expr);
287    /// # Ok(())
288    /// # }
289    /// ```
290    #[cfg(feature = "sql")]
291    pub fn parse_sql_expr(&self, sql: &str) -> Result<Expr> {
292        let df_schema = self.schema();
293
294        self.session_state.create_logical_expr(sql, df_schema)
295    }
296
297    /// Consume the DataFrame and produce a physical plan
298    pub async fn create_physical_plan(self) -> Result<Arc<dyn ExecutionPlan>> {
299        self.session_state.create_physical_plan(&self.plan).await
300    }
301
302    /// Filter the DataFrame by column. Returns a new DataFrame only containing the
303    /// specified columns.
304    ///
305    /// ```
306    /// # use datafusion::prelude::*;
307    /// # use datafusion::error::Result;
308    /// # use datafusion_common::assert_batches_sorted_eq;
309    /// # #[tokio::main]
310    /// # async fn main() -> Result<()> {
311    /// let ctx = SessionContext::new();
312    /// let df = ctx
313    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
314    ///     .await?;
315    /// let df = df.select_columns(&["a", "b"])?;
316    /// let expected = vec![
317    ///     "+---+---+",
318    ///     "| a | b |",
319    ///     "+---+---+",
320    ///     "| 1 | 2 |",
321    ///     "+---+---+",
322    /// ];
323    /// # assert_batches_sorted_eq!(expected, &df.collect().await?);
324    /// # Ok(())
325    /// # }
326    /// ```
327    pub fn select_columns(self, columns: &[&str]) -> Result<DataFrame> {
328        let fields = columns
329            .iter()
330            .map(|name| {
331                let fields = self
332                    .plan
333                    .schema()
334                    .qualified_fields_with_unqualified_name(name);
335                if fields.is_empty() {
336                    Err(unqualified_field_not_found(name, self.plan.schema()))
337                } else {
338                    Ok(fields)
339                }
340            })
341            .collect::<Result<Vec<_>, _>>()?
342            .into_iter()
343            .flatten()
344            .collect::<Vec<_>>();
345        let expr: Vec<Expr> = fields
346            .into_iter()
347            .map(|(qualifier, field)| Expr::Column(Column::from((qualifier, field))))
348            .collect();
349        self.select(expr)
350    }
351    /// Project arbitrary list of expression strings into a new `DataFrame`.
352    /// Method will parse string expressions into logical plan expressions.
353    ///
354    /// The output `DataFrame` has one column for each element in `exprs`.
355    ///
356    /// # Example
357    /// ```
358    /// # use datafusion::prelude::*;
359    /// # use datafusion::error::Result;
360    /// # #[tokio::main]
361    /// # async fn main() -> Result<()> {
362    /// let ctx = SessionContext::new();
363    /// let df = ctx
364    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
365    ///     .await?;
366    /// let df: DataFrame = df.select_exprs(&["a * b", "c"])?;
367    /// # Ok(())
368    /// # }
369    /// ```
370    #[cfg(feature = "sql")]
371    pub fn select_exprs(self, exprs: &[&str]) -> Result<DataFrame> {
372        let expr_list = exprs
373            .iter()
374            .map(|e| self.parse_sql_expr(e))
375            .collect::<Result<Vec<_>>>()?;
376
377        self.select(expr_list)
378    }
379
380    /// Project arbitrary expressions (like SQL SELECT expressions) into a new
381    /// `DataFrame`.
382    ///
383    /// The output `DataFrame` has one column for each element in `expr_list`.
384    ///
385    /// # Example
386    /// ```
387    /// # use datafusion::prelude::*;
388    /// # use datafusion::error::Result;
389    /// # use datafusion_common::assert_batches_sorted_eq;
390    /// # #[tokio::main]
391    /// # async fn main() -> Result<()> {
392    /// let ctx = SessionContext::new();
393    /// let df = ctx
394    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
395    ///     .await?;
396    /// let df = df.select(vec![col("a"), col("b") * col("c")])?;
397    /// let expected = vec![
398    ///     "+---+-----------------------+",
399    ///     "| a | ?table?.b * ?table?.c |",
400    ///     "+---+-----------------------+",
401    ///     "| 1 | 6                     |",
402    ///     "+---+-----------------------+",
403    /// ];
404    /// # assert_batches_sorted_eq!(expected, &df.collect().await?);
405    /// # Ok(())
406    /// # }
407    /// ```
408    pub fn select(
409        self,
410        expr_list: impl IntoIterator<Item = impl Into<SelectExpr>>,
411    ) -> Result<DataFrame> {
412        let expr_list: Vec<SelectExpr> =
413            expr_list.into_iter().map(|e| e.into()).collect::<Vec<_>>();
414
415        let expressions = expr_list.iter().filter_map(|e| match e {
416            SelectExpr::Expression(expr) => Some(expr),
417            _ => None,
418        });
419
420        let window_func_exprs = find_window_exprs(expressions);
421        let plan = if window_func_exprs.is_empty() {
422            self.plan
423        } else {
424            LogicalPlanBuilder::window_plan(self.plan, window_func_exprs)?
425        };
426
427        let project_plan = LogicalPlanBuilder::from(plan).project(expr_list)?.build()?;
428
429        Ok(DataFrame {
430            session_state: self.session_state,
431            plan: project_plan,
432            projection_requires_validation: false,
433        })
434    }
435
436    /// Returns a new DataFrame containing all columns except the specified columns.
437    ///
438    /// ```
439    /// # use datafusion::prelude::*;
440    /// # use datafusion::error::Result;
441    /// # use datafusion_common::assert_batches_sorted_eq;
442    /// # #[tokio::main]
443    /// # async fn main() -> Result<()> {
444    /// let ctx = SessionContext::new();
445    /// let df = ctx
446    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
447    ///     .await?;
448    /// // +----+----+----+
449    /// // | a  | b  | c  |
450    /// // +----+----+----+
451    /// // | 1  | 2  | 3  |
452    /// // +----+----+----+
453    /// let df = df.drop_columns(&["a"])?;
454    /// let expected = vec![
455    ///     "+---+---+",
456    ///     "| b | c |",
457    ///     "+---+---+",
458    ///     "| 2 | 3 |",
459    ///     "+---+---+",
460    /// ];
461    /// # assert_batches_sorted_eq!(expected, &df.collect().await?);
462    /// # Ok(())
463    /// # }
464    /// ```
465    pub fn drop_columns<T>(self, columns: &[T]) -> Result<DataFrame>
466    where
467        T: Into<Column> + Clone,
468    {
469        let fields_to_drop = columns
470            .iter()
471            .flat_map(|col| {
472                let column: Column = col.clone().into();
473                match column.relation.as_ref() {
474                    Some(_) => {
475                        // qualified_field_from_column returns Result<(Option<&TableReference>, &FieldRef)>
476                        vec![self.plan.schema().qualified_field_from_column(&column)]
477                    }
478                    None => {
479                        // qualified_fields_with_unqualified_name returns Vec<(Option<&TableReference>, &FieldRef)>
480                        self.plan
481                            .schema()
482                            .qualified_fields_with_unqualified_name(&column.name)
483                            .into_iter()
484                            .map(Ok)
485                            .collect::<Vec<_>>()
486                    }
487                }
488            })
489            .collect::<Result<Vec<_>, _>>()?;
490        let expr: Vec<Expr> = self
491            .plan
492            .schema()
493            .fields()
494            .into_iter()
495            .enumerate()
496            .map(|(idx, _)| self.plan.schema().qualified_field(idx))
497            .filter(|(qualifier, f)| !fields_to_drop.contains(&(*qualifier, f)))
498            .map(|(qualifier, field)| Expr::Column(Column::from((qualifier, field))))
499            .collect();
500        self.select(expr)
501    }
502
503    /// Expand multiple list/struct columns into a set of rows and new columns.
504    ///
505    /// See also: [`UnnestOptions`] documentation for the behavior of `unnest`
506    ///
507    /// # Example
508    /// ```
509    /// # use datafusion::prelude::*;
510    /// # use datafusion::error::Result;
511    /// # use datafusion_common::assert_batches_sorted_eq;
512    /// # #[tokio::main]
513    /// # async fn main() -> Result<()> {
514    /// let ctx = SessionContext::new();
515    /// let df = ctx.read_json("tests/data/unnest.json", JsonReadOptions::default()).await?;
516    /// // expand into multiple columns if it's json array, flatten field name if it's nested structure
517    /// let df = df.unnest_columns(&["b","c","d"])?;
518    /// let expected = vec![
519    ///     "+---+------+-------+-----+-----+",
520    ///     "| a | b    | c     | d.e | d.f |",
521    ///     "+---+------+-------+-----+-----+",
522    ///     "| 1 | 2.0  | false | 1   | 2   |",
523    ///     "| 1 | 1.3  | true  | 1   | 2   |",
524    ///     "| 1 | -6.1 |       | 1   | 2   |",
525    ///     "| 2 | 3.0  | false |     |     |",
526    ///     "| 2 | 2.3  | true  |     |     |",
527    ///     "| 2 | -7.1 |       |     |     |",
528    ///     "+---+------+-------+-----+-----+"
529    /// ];
530    /// # assert_batches_sorted_eq!(expected, &df.collect().await?);
531    /// # Ok(())
532    /// # }
533    /// ```
534    pub fn unnest_columns(self, columns: &[&str]) -> Result<DataFrame> {
535        self.unnest_columns_with_options(columns, UnnestOptions::new())
536    }
537
538    /// Expand multiple list columns into a set of rows, with
539    /// behavior controlled by [`UnnestOptions`].
540    ///
541    /// Please see the documentation on [`UnnestOptions`] for more
542    /// details about the meaning of unnest.
543    pub fn unnest_columns_with_options(
544        self,
545        columns: &[&str],
546        options: UnnestOptions,
547    ) -> Result<DataFrame> {
548        let columns = columns.iter().map(|c| Column::from(*c)).collect();
549        let plan = LogicalPlanBuilder::from(self.plan)
550            .unnest_columns_with_options(columns, options)?
551            .build()?;
552        Ok(DataFrame {
553            session_state: self.session_state,
554            plan,
555            projection_requires_validation: true,
556        })
557    }
558
559    /// Return a DataFrame with only rows for which `predicate` evaluates to
560    /// `true`.
561    ///
562    /// Rows for which `predicate` evaluates to `false` or `null`
563    /// are filtered out.
564    ///
565    /// # Example
566    /// ```
567    /// # use datafusion::prelude::*;
568    /// # use datafusion::error::Result;
569    /// # use datafusion_common::assert_batches_sorted_eq;
570    /// # #[tokio::main]
571    /// # async fn main() -> Result<()> {
572    /// let ctx = SessionContext::new();
573    /// let df = ctx
574    ///     .read_csv("tests/data/example_long.csv", CsvReadOptions::new())
575    ///     .await?;
576    /// let df = df.filter(col("a").lt_eq(col("b")))?;
577    /// // all rows where a <= b are returned
578    /// let expected = vec![
579    ///     "+---+---+---+",
580    ///     "| a | b | c |",
581    ///     "+---+---+---+",
582    ///     "| 1 | 2 | 3 |",
583    ///     "| 4 | 5 | 6 |",
584    ///     "| 7 | 8 | 9 |",
585    ///     "+---+---+---+",
586    /// ];
587    /// # assert_batches_sorted_eq!(expected, &df.collect().await?);
588    /// # Ok(())
589    /// # }
590    /// ```
591    pub fn filter(self, predicate: Expr) -> Result<DataFrame> {
592        let plan = LogicalPlanBuilder::from(self.plan)
593            .filter(predicate)?
594            .build()?;
595        Ok(DataFrame {
596            session_state: self.session_state,
597            plan,
598            projection_requires_validation: true,
599        })
600    }
601
602    /// Return a new `DataFrame` that aggregates the rows of the current
603    /// `DataFrame`, first optionally grouping by the given expressions.
604    ///
605    /// # Example
606    /// ```
607    /// # use datafusion::prelude::*;
608    /// # use datafusion::error::Result;
609    /// # use datafusion::functions_aggregate::expr_fn::min;
610    /// # use datafusion_common::assert_batches_sorted_eq;
611    /// # #[tokio::main]
612    /// # async fn main() -> Result<()> {
613    /// let ctx = SessionContext::new();
614    /// let df = ctx
615    ///     .read_csv("tests/data/example_long.csv", CsvReadOptions::new())
616    ///     .await?;
617    ///
618    /// // The following use is the equivalent of "SELECT MIN(b) GROUP BY a"
619    /// let df1 = df.clone().aggregate(vec![col("a")], vec![min(col("b"))])?;
620    /// let expected1 = vec![
621    ///     "+---+----------------+",
622    ///     "| a | min(?table?.b) |",
623    ///     "+---+----------------+",
624    ///     "| 1 | 2              |",
625    ///     "| 4 | 5              |",
626    ///     "| 7 | 8              |",
627    ///     "+---+----------------+",
628    /// ];
629    /// assert_batches_sorted_eq!(expected1, &df1.collect().await?);
630    /// // The following use is the equivalent of "SELECT MIN(b)"
631    /// let df2 = df.aggregate(vec![], vec![min(col("b"))])?;
632    /// let expected2 = vec![
633    ///     "+----------------+",
634    ///     "| min(?table?.b) |",
635    ///     "+----------------+",
636    ///     "| 2              |",
637    ///     "+----------------+",
638    /// ];
639    /// # assert_batches_sorted_eq!(expected2, &df2.collect().await?);
640    /// # Ok(())
641    /// # }
642    /// ```
643    pub fn aggregate(
644        self,
645        group_expr: Vec<Expr>,
646        aggr_expr: Vec<Expr>,
647    ) -> Result<DataFrame> {
648        let is_grouping_set = matches!(group_expr.as_slice(), [Expr::GroupingSet(_)]);
649        let aggr_expr_len = aggr_expr.len();
650        let options =
651            LogicalPlanBuilderOptions::new().with_add_implicit_group_by_exprs(true);
652        let plan = LogicalPlanBuilder::from(self.plan)
653            .with_options(options)
654            .aggregate(group_expr, aggr_expr)?
655            .build()?;
656        let plan = if is_grouping_set {
657            let grouping_id_pos = plan.schema().fields().len() - 1 - aggr_expr_len;
658            // For grouping sets we do a project to not expose the internal grouping id
659            let exprs = plan
660                .schema()
661                .columns()
662                .into_iter()
663                .enumerate()
664                .filter(|(idx, _)| *idx != grouping_id_pos)
665                .map(|(_, column)| Expr::Column(column))
666                .collect::<Vec<_>>();
667            LogicalPlanBuilder::from(plan).project(exprs)?.build()?
668        } else {
669            plan
670        };
671        Ok(DataFrame {
672            session_state: self.session_state,
673            plan,
674            projection_requires_validation: !is_grouping_set,
675        })
676    }
677
678    /// Return a new DataFrame that adds the result of evaluating one or more
679    /// window functions ([`Expr::WindowFunction`]) to the existing columns
680    pub fn window(self, window_exprs: Vec<Expr>) -> Result<DataFrame> {
681        let plan = LogicalPlanBuilder::from(self.plan)
682            .window(window_exprs)?
683            .build()?;
684        Ok(DataFrame {
685            session_state: self.session_state,
686            plan,
687            projection_requires_validation: true,
688        })
689    }
690
691    /// Returns a new `DataFrame` with a limited number of rows.
692    ///
693    /// # Arguments
694    /// `skip` - Number of rows to skip before fetch any row
695    /// `fetch` - Maximum number of rows to return, after skipping `skip` rows.
696    ///
697    /// # Example
698    /// ```
699    /// # use datafusion::prelude::*;
700    /// # use datafusion::error::Result;
701    /// # use datafusion_common::assert_batches_sorted_eq;
702    /// # #[tokio::main]
703    /// # async fn main() -> Result<()> {
704    /// let ctx = SessionContext::new();
705    /// let df = ctx
706    ///     .read_csv("tests/data/example_long.csv", CsvReadOptions::new())
707    ///     .await?;
708    /// let df = df.limit(1, Some(2))?;
709    /// let expected = vec![
710    ///     "+---+---+---+",
711    ///     "| a | b | c |",
712    ///     "+---+---+---+",
713    ///     "| 4 | 5 | 6 |",
714    ///     "| 7 | 8 | 9 |",
715    ///     "+---+---+---+",
716    /// ];
717    /// # assert_batches_sorted_eq!(expected, &df.collect().await?);
718    /// # Ok(())
719    /// # }
720    /// ```
721    pub fn limit(self, skip: usize, fetch: Option<usize>) -> Result<DataFrame> {
722        let plan = LogicalPlanBuilder::from(self.plan)
723            .limit(skip, fetch)?
724            .build()?;
725        Ok(DataFrame {
726            session_state: self.session_state,
727            plan,
728            projection_requires_validation: self.projection_requires_validation,
729        })
730    }
731
732    /// Calculate the union of two [`DataFrame`]s, preserving duplicate rows.
733    ///
734    /// The two [`DataFrame`]s must have exactly the same schema
735    ///
736    /// # Example
737    /// ```
738    /// # use datafusion::prelude::*;
739    /// # use datafusion::error::Result;
740    /// # use datafusion_common::assert_batches_sorted_eq;
741    /// # #[tokio::main]
742    /// # async fn main() -> Result<()> {
743    /// let ctx = SessionContext::new();
744    /// let df = ctx
745    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
746    ///     .await?;
747    /// let d2 = df.clone();
748    /// let df = df.union(d2)?;
749    /// let expected = vec![
750    ///     "+---+---+---+",
751    ///     "| a | b | c |",
752    ///     "+---+---+---+",
753    ///     "| 1 | 2 | 3 |",
754    ///     "| 1 | 2 | 3 |",
755    ///     "+---+---+---+",
756    /// ];
757    /// # assert_batches_sorted_eq!(expected, &df.collect().await?);
758    /// # Ok(())
759    /// # }
760    /// ```
761    pub fn union(self, dataframe: DataFrame) -> Result<DataFrame> {
762        let plan = LogicalPlanBuilder::from(self.plan)
763            .union(dataframe.plan)?
764            .build()?;
765        Ok(DataFrame {
766            session_state: self.session_state,
767            plan,
768            projection_requires_validation: true,
769        })
770    }
771
772    /// Calculate the union of two [`DataFrame`]s using column names, preserving duplicate rows.
773    ///
774    /// The two [`DataFrame`]s are combined using column names rather than position,
775    /// filling missing columns with null.
776    ///
777    ///
778    /// # Example
779    /// ```
780    /// # use datafusion::prelude::*;
781    /// # use datafusion::error::Result;
782    /// # use datafusion_common::assert_batches_sorted_eq;
783    /// # #[tokio::main]
784    /// # async fn main() -> Result<()> {
785    /// let ctx = SessionContext::new();
786    /// let df = ctx
787    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
788    ///     .await?;
789    /// let d2 = df
790    ///     .clone()
791    ///     .select_columns(&["b", "c", "a"])?
792    ///     .with_column("d", lit("77"))?;
793    /// let df = df.union_by_name(d2)?;
794    /// let expected = vec![
795    ///     "+---+---+---+----+",
796    ///     "| a | b | c | d  |",
797    ///     "+---+---+---+----+",
798    ///     "| 1 | 2 | 3 |    |",
799    ///     "| 1 | 2 | 3 | 77 |",
800    ///     "+---+---+---+----+",
801    /// ];
802    /// # assert_batches_sorted_eq!(expected, &df.collect().await?);
803    /// # Ok(())
804    /// # }
805    /// ```
806    pub fn union_by_name(self, dataframe: DataFrame) -> Result<DataFrame> {
807        let plan = LogicalPlanBuilder::from(self.plan)
808            .union_by_name(dataframe.plan)?
809            .build()?;
810        Ok(DataFrame {
811            session_state: self.session_state,
812            plan,
813            projection_requires_validation: true,
814        })
815    }
816
817    /// Calculate the distinct union of two [`DataFrame`]s.
818    ///
819    /// The two [`DataFrame`]s must have exactly the same schema. Any duplicate
820    /// rows are discarded.
821    ///
822    /// # Example
823    /// ```
824    /// # use datafusion::prelude::*;
825    /// # use datafusion::error::Result;
826    /// # use datafusion_common::assert_batches_sorted_eq;
827    /// # #[tokio::main]
828    /// # async fn main() -> Result<()> {
829    /// let ctx = SessionContext::new();
830    /// let df = ctx
831    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
832    ///     .await?;
833    /// let d2 = df.clone();
834    /// let df = df.union_distinct(d2)?;
    /// // d2 is a duplicate of df
836    /// let expected = vec![
837    ///     "+---+---+---+",
838    ///     "| a | b | c |",
839    ///     "+---+---+---+",
840    ///     "| 1 | 2 | 3 |",
841    ///     "+---+---+---+",
842    /// ];
843    /// # assert_batches_sorted_eq!(expected, &df.collect().await?);
844    /// # Ok(())
845    /// # }
846    /// ```
847    pub fn union_distinct(self, dataframe: DataFrame) -> Result<DataFrame> {
848        let plan = LogicalPlanBuilder::from(self.plan)
849            .union_distinct(dataframe.plan)?
850            .build()?;
851        Ok(DataFrame {
852            session_state: self.session_state,
853            plan,
854            projection_requires_validation: true,
855        })
856    }
857
858    /// Calculate the union of two [`DataFrame`]s using column names with all duplicated rows removed.
859    ///
860    /// The two [`DataFrame`]s are combined using column names rather than position,
861    /// filling missing columns with null.
    ///
864    /// # Example
865    /// ```
866    /// # use datafusion::prelude::*;
867    /// # use datafusion::error::Result;
868    /// # use datafusion_common::assert_batches_sorted_eq;
869    /// # #[tokio::main]
870    /// # async fn main() -> Result<()> {
871    /// let ctx = SessionContext::new();
872    /// let df = ctx
873    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
874    ///     .await?;
875    /// let d2 = df.clone().select_columns(&["b", "c", "a"])?;
876    /// let df = df.union_by_name_distinct(d2)?;
877    /// let expected = vec![
878    ///     "+---+---+---+",
879    ///     "| a | b | c |",
880    ///     "+---+---+---+",
881    ///     "| 1 | 2 | 3 |",
882    ///     "+---+---+---+",
883    /// ];
884    /// # assert_batches_sorted_eq!(expected, &df.collect().await?);
885    /// # Ok(())
886    /// # }
887    /// ```
888    pub fn union_by_name_distinct(self, dataframe: DataFrame) -> Result<DataFrame> {
889        let plan = LogicalPlanBuilder::from(self.plan)
890            .union_by_name_distinct(dataframe.plan)?
891            .build()?;
892        Ok(DataFrame {
893            session_state: self.session_state,
894            plan,
895            projection_requires_validation: true,
896        })
897    }
898
899    /// Return a new `DataFrame` with all duplicated rows removed.
900    ///
901    /// # Example
902    /// ```
903    /// # use datafusion::prelude::*;
904    /// # use datafusion::error::Result;
905    /// # use datafusion_common::assert_batches_sorted_eq;
906    /// # #[tokio::main]
907    /// # async fn main() -> Result<()> {
908    /// let ctx = SessionContext::new();
909    /// let df = ctx
910    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
911    ///     .await?;
912    /// let df = df.distinct()?;
913    /// let expected = vec![
914    ///     "+---+---+---+",
915    ///     "| a | b | c |",
916    ///     "+---+---+---+",
917    ///     "| 1 | 2 | 3 |",
918    ///     "+---+---+---+",
919    /// ];
920    /// # assert_batches_sorted_eq!(expected, &df.collect().await?);
921    /// # Ok(())
922    /// # }
923    /// ```
924    pub fn distinct(self) -> Result<DataFrame> {
925        let plan = LogicalPlanBuilder::from(self.plan).distinct()?.build()?;
926        Ok(DataFrame {
927            session_state: self.session_state,
928            plan,
929            projection_requires_validation: true,
930        })
931    }
932
    /// Return a new `DataFrame` that keeps a single row for each distinct
    /// combination of the `on_expr` values (the `DISTINCT ON` clause expressions),
    /// projecting `select_expr` from that row. When `sort_expr` is provided, it
    /// determines which row of each group is kept.
936    ///
937    /// # Example
938    /// ```
939    /// # use datafusion::prelude::*;
940    /// # use datafusion::error::Result;
941    /// # use datafusion_common::assert_batches_sorted_eq;
942    /// # #[tokio::main]
943    /// # async fn main() -> Result<()> {
944    /// let ctx = SessionContext::new();
945    /// let df = ctx
946    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
947    ///     .await?
948    ///     // Return a single row (a, b) for each distinct value of a
949    ///     .distinct_on(vec![col("a")], vec![col("a"), col("b")], None)?;
950    /// let expected = vec![
951    ///     "+---+---+",
952    ///     "| a | b |",
953    ///     "+---+---+",
954    ///     "| 1 | 2 |",
955    ///     "+---+---+",
956    /// ];
957    /// # assert_batches_sorted_eq!(expected, &df.collect().await?);
958    /// # Ok(())
959    /// # }
960    /// ```
961    pub fn distinct_on(
962        self,
963        on_expr: Vec<Expr>,
964        select_expr: Vec<Expr>,
965        sort_expr: Option<Vec<SortExpr>>,
966    ) -> Result<DataFrame> {
967        let plan = LogicalPlanBuilder::from(self.plan)
968            .distinct_on(on_expr, select_expr, sort_expr)?
969            .build()?;
970        Ok(DataFrame {
971            session_state: self.session_state,
972            plan,
973            projection_requires_validation: true,
974        })
975    }
976
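    /// Return a new `DataFrame` containing summary statistics for this `DataFrame`.
    ///
    /// `count` and `null_count` are reported for every column, `min` and `max`
    /// for every column except binary and boolean ones, and `mean`, `std` and
    /// `median` only for numeric columns; statistics that are not computed for
    /// a column are reported as `null`. The output format is modeled after pandas.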
981    ///
982    /// # Example
983    /// ```
984    /// # use datafusion::prelude::*;
985    /// # use datafusion::error::Result;
986    /// # use arrow::util::pretty;
987    /// # use datafusion_common::assert_batches_sorted_eq;
988    /// # #[tokio::main]
989    /// # async fn main() -> Result<()> {
990    /// let ctx = SessionContext::new();
991    /// let df = ctx.read_csv("tests/tpch-csv/customer.csv", CsvReadOptions::new()).await?;
992    /// let stat = df.describe().await?;
993    /// # // some output column are ignored
994    /// let expected = vec![
995    ///     "+------------+--------------------+--------------------+------------------------------------+--------------------+-----------------+--------------------+--------------+----------------------------------------------------------------------------------------------------------+",
996    ///     "| describe   | c_custkey          | c_name             | c_address                          | c_nationkey        | c_phone         | c_acctbal          | c_mktsegment | c_comment                                                                                                |",
997    ///     "+------------+--------------------+--------------------+------------------------------------+--------------------+-----------------+--------------------+--------------+----------------------------------------------------------------------------------------------------------+",
998    ///     "| count      | 9.0                | 9                  | 9                                  | 9.0                | 9               | 9.0                | 9            | 9                                                                                                        |",
999    ///     "| max        | 10.0               | Customer#000000010 | xKiAFTjUsCuxfeleNqefumTrjS         | 20.0               | 30-114-968-4951 | 9561.95            | MACHINERY    | tions. even deposits boost according to the slyly bold packages. final accounts cajole requests. furious |",
1000    ///     "| mean       | 6.0                | null               | null                               | 9.88888888888889   | null            | 5153.2155555555555 | null         | null                                                                                                     |",
1001    ///     "| median     | 6.0                | null               | null                               | 8.0                | null            | 6819.74            | null         | null                                                                                                     |",
1002    ///     "| min        | 2.0                | Customer#000000002 | 6LrEaV6KR6PLVcgl2ArL Q3rqzLzcT1 v2 | 1.0                | 11-719-748-3364 | 121.65             | AUTOMOBILE   |  deposits eat slyly ironic, even instructions. express foxes detect slyly. blithely even accounts abov   |",
1003    ///     "| null_count | 0.0                | 0                  | 0                                  | 0.0                | 0               | 0.0                | 0            | 0                                                                                                        |",
1004    ///     "| std        | 2.7386127875258306 | null               | null                               | 7.2188026092359046 | null            | 3522.169804254585  | null         | null                                                                                                     |",
1005    ///     "+------------+--------------------+--------------------+------------------------------------+--------------------+-----------------+--------------------+--------------+----------------------------------------------------------------------------------------------------------+"];
1006    /// assert_batches_sorted_eq!(expected, &stat.collect().await?);
1007    /// # Ok(())
1008    /// # }
1009    /// ```
1010    pub async fn describe(self) -> Result<Self> {
1011        //the functions now supported
1012        let supported_describe_functions =
1013            vec!["count", "null_count", "mean", "std", "min", "max", "median"];
1014
1015        let original_schema_fields = self.schema().fields().iter();
1016
1017        //define describe column
1018        let mut describe_schemas = vec![Field::new("describe", DataType::Utf8, false)];
1019        describe_schemas.extend(original_schema_fields.clone().map(|field| {
1020            if field.data_type().is_numeric() {
1021                Field::new(field.name(), DataType::Float64, true)
1022            } else {
1023                Field::new(field.name(), DataType::Utf8, true)
1024            }
1025        }));
1026
1027        //collect recordBatch
1028        let describe_record_batch = [
1029            // count aggregation
1030            self.clone().aggregate(
1031                vec![],
1032                original_schema_fields
1033                    .clone()
1034                    .map(|f| count(ident(f.name())).alias(f.name()))
1035                    .collect::<Vec<_>>(),
1036            ),
1037            // null_count aggregation
1038            self.clone().aggregate(
1039                vec![],
1040                original_schema_fields
1041                    .clone()
1042                    .map(|f| {
1043                        sum(case(is_null(ident(f.name())))
1044                            .when(lit(true), lit(1))
1045                            .otherwise(lit(0))
1046                            .unwrap())
1047                        .alias(f.name())
1048                    })
1049                    .collect::<Vec<_>>(),
1050            ),
1051            // mean aggregation
1052            self.clone().aggregate(
1053                vec![],
1054                original_schema_fields
1055                    .clone()
1056                    .filter(|f| f.data_type().is_numeric())
1057                    .map(|f| avg(ident(f.name())).alias(f.name()))
1058                    .collect::<Vec<_>>(),
1059            ),
1060            // std aggregation
1061            self.clone().aggregate(
1062                vec![],
1063                original_schema_fields
1064                    .clone()
1065                    .filter(|f| f.data_type().is_numeric())
1066                    .map(|f| stddev(ident(f.name())).alias(f.name()))
1067                    .collect::<Vec<_>>(),
1068            ),
1069            // min aggregation
1070            self.clone().aggregate(
1071                vec![],
1072                original_schema_fields
1073                    .clone()
1074                    .filter(|f| {
1075                        !matches!(f.data_type(), DataType::Binary | DataType::Boolean)
1076                    })
1077                    .map(|f| min(ident(f.name())).alias(f.name()))
1078                    .collect::<Vec<_>>(),
1079            ),
1080            // max aggregation
1081            self.clone().aggregate(
1082                vec![],
1083                original_schema_fields
1084                    .clone()
1085                    .filter(|f| {
1086                        !matches!(f.data_type(), DataType::Binary | DataType::Boolean)
1087                    })
1088                    .map(|f| max(ident(f.name())).alias(f.name()))
1089                    .collect::<Vec<_>>(),
1090            ),
1091            // median aggregation
1092            self.clone().aggregate(
1093                vec![],
1094                original_schema_fields
1095                    .clone()
1096                    .filter(|f| f.data_type().is_numeric())
1097                    .map(|f| median(ident(f.name())).alias(f.name()))
1098                    .collect::<Vec<_>>(),
1099            ),
1100        ];
1101
1102        // first column with function names
1103        let mut array_ref_vec: Vec<ArrayRef> = vec![Arc::new(StringArray::from(
1104            supported_describe_functions.clone(),
1105        ))];
1106        for field in original_schema_fields {
1107            let mut array_datas = vec![];
1108            for result in describe_record_batch.iter() {
1109                let array_ref = match result {
1110                    Ok(df) => {
1111                        let batches = df.clone().collect().await;
1112                        match batches {
1113                            Ok(batches)
1114                                if batches.len() == 1
1115                                    && batches[0]
1116                                        .column_by_name(field.name())
1117                                        .is_some() =>
1118                            {
1119                                let column =
1120                                    batches[0].column_by_name(field.name()).unwrap();
1121
1122                                if column.data_type().is_null() {
1123                                    Arc::new(StringArray::from(vec!["null"]))
1124                                } else if field.data_type().is_numeric() {
1125                                    cast(column, &DataType::Float64)?
1126                                } else {
1127                                    cast(column, &DataType::Utf8)?
1128                                }
1129                            }
1130                            _ => Arc::new(StringArray::from(vec!["null"])),
1131                        }
1132                    }
1133                    //Handling error when only boolean/binary column, and in other cases
1134                    Err(err)
1135                        if err.to_string().contains(
1136                            "Error during planning: \
1137                                            Aggregate requires at least one grouping \
1138                                            or aggregate expression",
1139                        ) =>
1140                    {
1141                        Arc::new(StringArray::from(vec!["null"]))
1142                    }
1143                    Err(e) => return exec_err!("{}", e),
1144                };
1145                array_datas.push(array_ref);
1146            }
1147            array_ref_vec.push(concat(
1148                array_datas
1149                    .iter()
1150                    .map(|af| af.as_ref())
1151                    .collect::<Vec<_>>()
1152                    .as_slice(),
1153            )?);
1154        }
1155
1156        let describe_record_batch =
1157            RecordBatch::try_new(Arc::new(Schema::new(describe_schemas)), array_ref_vec)?;
1158
1159        let provider = MemTable::try_new(
1160            describe_record_batch.schema(),
1161            vec![vec![describe_record_batch]],
1162        )?;
1163
1164        let plan = LogicalPlanBuilder::scan(
1165            UNNAMED_TABLE,
1166            provider_as_source(Arc::new(provider)),
1167            None,
1168        )?
1169        .build()?;
1170
1171        Ok(DataFrame {
1172            session_state: self.session_state,
1173            plan,
1174            projection_requires_validation: self.projection_requires_validation,
1175        })
1176    }
1177
1178    /// Apply a sort by provided expressions with default direction
1179    pub fn sort_by(self, expr: Vec<Expr>) -> Result<DataFrame> {
1180        self.sort(
1181            expr.into_iter()
1182                .map(|e| e.sort(true, false))
1183                .collect::<Vec<SortExpr>>(),
1184        )
1185    }
1186
1187    /// Sort the DataFrame by the specified sorting expressions.
1188    ///
1189    /// Note that any expression can be turned into
1190    /// a sort expression by calling its [sort](Expr::sort) method.
1191    ///
1192    /// # Example
1193    ///
1194    /// ```
1195    /// # use datafusion::prelude::*;
1196    /// # use datafusion::error::Result;
1197    /// # use datafusion_common::assert_batches_sorted_eq;
1198    /// # #[tokio::main]
1199    /// # async fn main() -> Result<()> {
1200    /// let ctx = SessionContext::new();
1201    /// let df = ctx
1202    ///     .read_csv("tests/data/example_long.csv", CsvReadOptions::new())
1203    ///     .await?;
1204    /// let df = df.sort(vec![
1205    ///     col("a").sort(false, true), // a DESC, nulls first
1206    ///     col("b").sort(true, false), // b ASC, nulls last
1207    /// ])?;
1208    /// let expected = vec![
1209    ///     "+---+---+---+",
1210    ///     "| a | b | c |",
1211    ///     "+---+---+---+",
1212    ///     "| 1 | 2 | 3 |",
1213    ///     "| 4 | 5 | 6 |",
1214    ///     "| 7 | 8 | 9 |",
1215    ///     "+---+---+---+",
1216    /// ];
1217    /// # assert_batches_sorted_eq!(expected, &df.collect().await?);
1218    /// # Ok(())
1219    /// # }
1220    /// ```
1221    pub fn sort(self, expr: Vec<SortExpr>) -> Result<DataFrame> {
1222        let plan = LogicalPlanBuilder::from(self.plan).sort(expr)?.build()?;
1223        Ok(DataFrame {
1224            session_state: self.session_state,
1225            plan,
1226            projection_requires_validation: self.projection_requires_validation,
1227        })
1228    }
1229
1230    /// Join this `DataFrame` with another `DataFrame` using explicitly specified
1231    /// columns and an optional filter expression.
1232    ///
1233    /// See [`join_on`](Self::join_on) for a more concise way to specify the
1234    /// join condition. Since DataFusion will automatically identify and
1235    /// optimize equality predicates there is no performance difference between
    /// this function and `join_on`.
1237    ///
1238    /// `left_cols` and `right_cols` are used to form "equijoin" predicates (see
1239    /// example below), which are then combined with the optional `filter`
1240    /// expression. If `left_cols` and `right_cols` contain ambiguous column
1241    /// references, they will be disambiguated by prioritizing the left relation
1242    /// for `left_cols` and the right relation for `right_cols`.
1243    ///
1244    /// Note that in case of outer join, the `filter` is applied to only matched rows.
1245    ///
1246    /// # Example
1247    /// ```
1248    /// # use datafusion::prelude::*;
1249    /// # use datafusion::error::Result;
1250    /// # use datafusion_common::assert_batches_sorted_eq;
1251    /// # #[tokio::main]
1252    /// # async fn main() -> Result<()> {
1253    /// let ctx = SessionContext::new();
1254    /// let left = ctx
1255    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
1256    ///     .await?;
1257    /// let right = ctx
1258    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
1259    ///     .await?
1260    ///     .select(vec![
1261    ///         col("a").alias("a2"),
1262    ///         col("b").alias("b2"),
1263    ///         col("c").alias("c2"),
1264    ///     ])?;
1265    /// // Perform the equivalent of `left INNER JOIN right ON (a = a2 AND b = b2)`
1266    /// // finding all pairs of rows from `left` and `right` where `a = a2` and `b = b2`.
1267    /// let join = left.join(right, JoinType::Inner, &["a", "b"], &["a2", "b2"], None)?;
1268    /// let expected = vec![
1269    ///     "+---+---+---+----+----+----+",
1270    ///     "| a | b | c | a2 | b2 | c2 |",
1271    ///     "+---+---+---+----+----+----+",
1272    ///     "| 1 | 2 | 3 | 1  | 2  | 3  |",
1273    ///     "+---+---+---+----+----+----+",
1274    /// ];
1275    /// assert_batches_sorted_eq!(expected, &join.collect().await?);
1276    /// # Ok(())
1277    /// # }
1278    /// ```
1279    pub fn join(
1280        self,
1281        right: DataFrame,
1282        join_type: JoinType,
1283        left_cols: &[&str],
1284        right_cols: &[&str],
1285        filter: Option<Expr>,
1286    ) -> Result<DataFrame> {
1287        let plan = LogicalPlanBuilder::from(self.plan)
1288            .join(
1289                right.plan,
1290                join_type,
1291                (left_cols.to_vec(), right_cols.to_vec()),
1292                filter,
1293            )?
1294            .build()?;
1295        Ok(DataFrame {
1296            session_state: self.session_state,
1297            plan,
1298            projection_requires_validation: true,
1299        })
1300    }
1301
1302    /// Join this `DataFrame` with another `DataFrame` using the specified
1303    /// expressions.
1304    ///
1305    /// Note that DataFusion automatically optimizes joins, including
1306    /// identifying and optimizing equality predicates.
1307    ///
1308    /// # Example
1309    /// ```
1310    /// # use datafusion::prelude::*;
1311    /// # use datafusion::error::Result;
1312    /// # use datafusion_common::assert_batches_sorted_eq;
1313    /// # #[tokio::main]
1314    /// # async fn main() -> Result<()> {
1315    /// let ctx = SessionContext::new();
1316    /// let left = ctx
1317    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
1318    ///     .await?;
1319    /// let right = ctx
1320    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
1321    ///     .await?
1322    ///     .select(vec![
1323    ///         col("a").alias("a2"),
1324    ///         col("b").alias("b2"),
1325    ///         col("c").alias("c2"),
1326    ///     ])?;
1327    ///
1328    /// // Perform the equivalent of `left INNER JOIN right ON (a != a2 AND b != b2)`
    /// // finding all pairs of rows from `left` and `right`
    /// // where `a != a2` and `b != b2`.
1331    /// let join_on = left.join_on(
1332    ///     right,
1333    ///     JoinType::Inner,
1334    ///     [col("a").not_eq(col("a2")), col("b").not_eq(col("b2"))],
1335    /// )?;
1336    /// let expected = vec![
1337    ///     "+---+---+---+----+----+----+",
1338    ///     "| a | b | c | a2 | b2 | c2 |",
1339    ///     "+---+---+---+----+----+----+",
1340    ///     "+---+---+---+----+----+----+",
1341    /// ];
1342    /// # assert_batches_sorted_eq!(expected, &join_on.collect().await?);
1343    /// # Ok(())
1344    /// # }
1345    /// ```
1346    pub fn join_on(
1347        self,
1348        right: DataFrame,
1349        join_type: JoinType,
1350        on_exprs: impl IntoIterator<Item = Expr>,
1351    ) -> Result<DataFrame> {
1352        let plan = LogicalPlanBuilder::from(self.plan)
1353            .join_on(right.plan, join_type, on_exprs)?
1354            .build()?;
1355        Ok(DataFrame {
1356            session_state: self.session_state,
1357            plan,
1358            projection_requires_validation: true,
1359        })
1360    }
1361
1362    /// Repartition a DataFrame based on a logical partitioning scheme.
1363    ///
1364    /// # Example
1365    /// ```
1366    /// # use datafusion::prelude::*;
1367    /// # use datafusion::error::Result;
1368    /// # use datafusion_common::assert_batches_sorted_eq;
1369    /// # #[tokio::main]
1370    /// # async fn main() -> Result<()> {
1371    /// let ctx = SessionContext::new();
1372    /// let df = ctx
1373    ///     .read_csv("tests/data/example_long.csv", CsvReadOptions::new())
1374    ///     .await?;
1375    /// let df1 = df.repartition(Partitioning::RoundRobinBatch(4))?;
1376    /// let expected = vec![
1377    ///     "+---+---+---+",
1378    ///     "| a | b | c |",
1379    ///     "+---+---+---+",
1380    ///     "| 1 | 2 | 3 |",
1381    ///     "| 4 | 5 | 6 |",
1382    ///     "| 7 | 8 | 9 |",
1383    ///     "+---+---+---+",
1384    /// ];
1385    /// # assert_batches_sorted_eq!(expected, &df1.collect().await?);
1386    /// # Ok(())
1387    /// # }
1388    /// ```
1389    pub fn repartition(self, partitioning_scheme: Partitioning) -> Result<DataFrame> {
1390        let plan = LogicalPlanBuilder::from(self.plan)
1391            .repartition(partitioning_scheme)?
1392            .build()?;
1393        Ok(DataFrame {
1394            session_state: self.session_state,
1395            plan,
1396            projection_requires_validation: true,
1397        })
1398    }
1399
1400    /// Return the total number of rows in this `DataFrame`.
1401    ///
1402    /// Note that this method will actually run a plan to calculate the count,
1403    /// which may be slow for large or complicated DataFrames.
1404    ///
1405    /// # Example
1406    /// ```
1407    /// # use datafusion::prelude::*;
1408    /// # use datafusion::error::Result;
1409    /// # #[tokio::main]
1410    /// # async fn main() -> Result<()> {
1411    /// let ctx = SessionContext::new();
1412    /// let df = ctx
1413    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
1414    ///     .await?;
1415    /// let count = df.count().await?; // 1
1416    /// # assert_eq!(count, 1);
1417    /// # Ok(())
1418    /// # }
1419    /// ```
1420    pub async fn count(self) -> Result<usize> {
1421        let rows = self
1422            .aggregate(
1423                vec![],
1424                vec![count(Expr::Literal(COUNT_STAR_EXPANSION, None))],
1425            )?
1426            .collect()
1427            .await?;
1428        let len = *rows
1429            .first()
1430            .and_then(|r| r.columns().first())
1431            .and_then(|c| c.as_any().downcast_ref::<Int64Array>())
1432            .and_then(|a| a.values().first())
1433            .ok_or_else(|| {
1434                internal_datafusion_err!("Unexpected output when collecting for count()")
1435            })? as usize;
1436        Ok(len)
1437    }
1438
    /// Execute this `DataFrame` and buffer all resulting `RecordBatch`es into memory.
1440    ///
1441    /// Prior to calling `collect`, modifying a DataFrame simply updates a plan
1442    /// (no actual computation is performed). `collect` triggers the computation.
1443    ///
1444    /// See [`Self::execute_stream`] to execute a DataFrame without buffering.
1445    ///
1446    /// # Example
1447    /// ```
1448    /// # use datafusion::prelude::*;
1449    /// # use datafusion::error::Result;
1450    /// # #[tokio::main]
1451    /// # async fn main() -> Result<()> {
1452    /// let ctx = SessionContext::new();
1453    /// let df = ctx
1454    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
1455    ///     .await?;
1456    /// let batches = df.collect().await?;
1457    /// # Ok(())
1458    /// # }
1459    /// ```
1460    pub async fn collect(self) -> Result<Vec<RecordBatch>> {
1461        let task_ctx = Arc::new(self.task_ctx());
1462        let plan = self.create_physical_plan().await?;
1463        collect(plan, task_ctx).await
1464    }
1465
1466    /// Execute the `DataFrame` and print the results to the console.
1467    ///
1468    /// # Example
1469    /// ```
1470    /// # use datafusion::prelude::*;
1471    /// # use datafusion::error::Result;
1472    /// # #[tokio::main]
1473    /// # async fn main() -> Result<()> {
1474    /// let ctx = SessionContext::new();
1475    /// let df = ctx
1476    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
1477    ///     .await?;
1478    /// df.show().await?;
1479    /// # Ok(())
1480    /// # }
1481    /// ```
1482    pub async fn show(self) -> Result<()> {
1483        println!("{}", self.to_string().await?);
1484        Ok(())
1485    }
1486
1487    /// Execute the `DataFrame` and return a string representation of the results.
1488    ///
1489    /// # Example
1490    /// ```
1491    /// # use datafusion::prelude::*;
1492    /// # use datafusion::error::Result;
1493    /// # use datafusion::execution::SessionStateBuilder;
1494    ///
1495    /// # #[tokio::main]
1496    /// # async fn main() -> Result<()> {
1497    /// let cfg = SessionConfig::new()
1498    ///     .set_str("datafusion.format.null", "no-value");
1499    /// let session_state = SessionStateBuilder::new()
1500    ///     .with_config(cfg)
1501    ///     .with_default_features()
1502    ///     .build();
1503    /// let ctx = SessionContext::new_with_state(session_state);
1504    /// let df = ctx.sql("select null as 'null-column'").await?;
1505    /// let result = df.to_string().await?;
1506    /// assert_eq!(result,
1507    /// "+-------------+
1508    /// | null-column |
1509    /// +-------------+
1510    /// | no-value    |
1511    /// +-------------+"
1512    /// );
1513    /// # Ok(())
    /// # }
    /// ```
1515    pub async fn to_string(self) -> Result<String> {
1516        let options = self.session_state.config().options().format.clone();
1517        let arrow_options: arrow::util::display::FormatOptions = (&options).try_into()?;
1518
1519        let results = self.collect().await?;
1520        Ok(
1521            pretty::pretty_format_batches_with_options(&results, &arrow_options)?
1522                .to_string(),
1523        )
1524    }
1525
1526    /// Execute the `DataFrame` and print only the first `num` rows of the
1527    /// result to the console.
1528    ///
1529    /// # Example
1530    /// ```
1531    /// # use datafusion::prelude::*;
1532    /// # use datafusion::error::Result;
1533    /// # #[tokio::main]
1534    /// # async fn main() -> Result<()> {
1535    /// let ctx = SessionContext::new();
1536    /// let df = ctx
1537    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
1538    ///     .await?;
1539    /// df.show_limit(10).await?;
1540    /// # Ok(())
1541    /// # }
1542    /// ```
1543    pub async fn show_limit(self, num: usize) -> Result<()> {
1544        let results = self.limit(0, Some(num))?.collect().await?;
1545        Ok(pretty::print_batches(&results)?)
1546    }
1547
1548    /// Return a new [`TaskContext`] which would be used to execute this DataFrame
1549    pub fn task_ctx(&self) -> TaskContext {
1550        TaskContext::from(self.session_state.as_ref())
1551    }
1552
1553    /// Executes this DataFrame and returns a stream over a single partition
1554    ///
1555    /// See [Self::collect] to buffer the `RecordBatch`es in memory.
1556    ///
1557    /// # Example
1558    /// ```
1559    /// # use datafusion::prelude::*;
1560    /// # use datafusion::error::Result;
1561    /// # #[tokio::main]
1562    /// # async fn main() -> Result<()> {
1563    /// let ctx = SessionContext::new();
1564    /// let df = ctx
1565    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
1566    ///     .await?;
1567    /// let stream = df.execute_stream().await?;
1568    /// # Ok(())
1569    /// # }
1570    /// ```
1571    ///
1572    /// # Aborting Execution
1573    ///
1574    /// Dropping the stream will abort the execution of the query, and free up
1575    /// any allocated resources
1576    pub async fn execute_stream(self) -> Result<SendableRecordBatchStream> {
1577        let task_ctx = Arc::new(self.task_ctx());
1578        let plan = self.create_physical_plan().await?;
1579        execute_stream(plan, task_ctx)
1580    }
1581
1582    /// Executes this DataFrame and collects all results into a vector of vector of RecordBatch
1583    /// maintaining the input partitioning.
1584    ///
1585    /// # Example
1586    /// ```
1587    /// # use datafusion::prelude::*;
1588    /// # use datafusion::error::Result;
1589    /// # #[tokio::main]
1590    /// # async fn main() -> Result<()> {
1591    /// let ctx = SessionContext::new();
1592    /// let df = ctx
1593    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
1594    ///     .await?;
1595    /// let batches = df.collect_partitioned().await?;
1596    /// # Ok(())
1597    /// # }
1598    /// ```
1599    pub async fn collect_partitioned(self) -> Result<Vec<Vec<RecordBatch>>> {
1600        let task_ctx = Arc::new(self.task_ctx());
1601        let plan = self.create_physical_plan().await?;
1602        collect_partitioned(plan, task_ctx).await
1603    }
1604
1605    /// Executes this DataFrame and returns one stream per partition.
1606    ///
1607    /// # Example
1608    /// ```
1609    /// # use datafusion::prelude::*;
1610    /// # use datafusion::error::Result;
1611    /// # #[tokio::main]
1612    /// # async fn main() -> Result<()> {
1613    /// let ctx = SessionContext::new();
1614    /// let df = ctx
1615    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
1616    ///     .await?;
1617    /// let batches = df.execute_stream_partitioned().await?;
1618    /// # Ok(())
1619    /// # }
1620    /// ```
1621    /// # Aborting Execution
1622    ///
1623    /// Dropping the stream will abort the execution of the query, and free up
1624    /// any allocated resources
1625    pub async fn execute_stream_partitioned(
1626        self,
1627    ) -> Result<Vec<SendableRecordBatchStream>> {
1628        let task_ctx = Arc::new(self.task_ctx());
1629        let plan = self.create_physical_plan().await?;
1630        execute_stream_partitioned(plan, task_ctx)
1631    }
1632
1633    /// Returns the `DFSchema` describing the output of this DataFrame.
1634    ///
1635    /// The output `DFSchema` contains information on the name, data type, and
1636    /// nullability for each column.
1637    ///
1638    /// # Example
1639    /// ```
1640    /// # use datafusion::prelude::*;
1641    /// # use datafusion::error::Result;
1642    /// # #[tokio::main]
1643    /// # async fn main() -> Result<()> {
1644    /// let ctx = SessionContext::new();
1645    /// let df = ctx
1646    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
1647    ///     .await?;
1648    /// let schema = df.schema();
1649    /// # Ok(())
1650    /// # }
1651    /// ```
1652    pub fn schema(&self) -> &DFSchema {
1653        self.plan.schema()
1654    }
1655
1656    /// Return a reference to the unoptimized [`LogicalPlan`] that comprises
1657    /// this DataFrame.
1658    ///
1659    /// See [`Self::into_unoptimized_plan`] for more details.
1660    pub fn logical_plan(&self) -> &LogicalPlan {
1661        &self.plan
1662    }
1663
1664    /// Returns both the [`LogicalPlan`] and [`SessionState`] that comprise this [`DataFrame`]
1665    pub fn into_parts(self) -> (SessionState, LogicalPlan) {
1666        (*self.session_state, self.plan)
1667    }
1668
1669    /// Return the [`LogicalPlan`] represented by this DataFrame without running
1670    /// any optimizers
1671    ///
1672    /// Note: This method should not be used outside testing, as it loses the
1673    /// snapshot of the [`SessionState`] attached to this [`DataFrame`] and
1674    /// consequently subsequent operations may take place against a different
1675    /// state (e.g. a different value of `now()`)
1676    ///
1677    /// See [`Self::into_parts`] to retrieve the owned [`LogicalPlan`] and
1678    /// corresponding [`SessionState`].
1679    pub fn into_unoptimized_plan(self) -> LogicalPlan {
1680        self.plan
1681    }
1682
1683    /// Return the optimized [`LogicalPlan`] represented by this DataFrame.
1684    ///
1685    /// Note: This method should not be used outside testing -- see
1686    /// [`Self::into_unoptimized_plan`] for more details.
1687    pub fn into_optimized_plan(self) -> Result<LogicalPlan> {
1688        // Optimize the plan first for better UX
1689        self.session_state.optimize(&self.plan)
1690    }
1691
1692    /// Converts this [`DataFrame`] into a [`TableProvider`] that can be registered
1693    /// as a table view using [`SessionContext::register_table`].
1694    ///
1695    /// Note: This discards the [`SessionState`] associated with this
1696    /// [`DataFrame`] in favour of the one passed to [`TableProvider::scan`]
1697    pub fn into_view(self) -> Arc<dyn TableProvider> {
1698        Arc::new(DataFrameTableProvider {
1699            plan: self.plan,
1700            table_type: TableType::View,
1701        })
1702    }
1703
1704    /// See [`Self::into_view`]. The returned [`TableProvider`] will
1705    /// create a transient table.
1706    pub fn into_temporary_view(self) -> Arc<dyn TableProvider> {
1707        Arc::new(DataFrameTableProvider {
1708            plan: self.plan,
1709            table_type: TableType::Temporary,
1710        })
1711    }
1712
1713    /// Return a DataFrame with the explanation of its plan so far.
1714    ///
1715    /// if `analyze` is specified, runs the plan and reports metrics
1716    /// if `verbose` is true, prints out additional details.
1717    /// The default format is Indent format.
1718    ///
1719    /// ```
1720    /// # use datafusion::prelude::*;
1721    /// # use datafusion::error::Result;
1722    /// # #[tokio::main]
1723    /// # async fn main() -> Result<()> {
1724    /// let ctx = SessionContext::new();
1725    /// let df = ctx
1726    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
1727    ///     .await?;
1728    /// let batches = df
1729    ///     .limit(0, Some(100))?
1730    ///     .explain(false, false)?
1731    ///     .collect()
1732    ///     .await?;
1733    /// # Ok(())
1734    /// # }
1735    /// ```
1736    pub fn explain(self, verbose: bool, analyze: bool) -> Result<DataFrame> {
1737        // Set the default format to Indent to keep the previous behavior
1738        let opts = ExplainOption::default()
1739            .with_verbose(verbose)
1740            .with_analyze(analyze);
1741        self.explain_with_options(opts)
1742    }
1743
1744    /// Return a DataFrame with the explanation of its plan so far.
1745    ///
1746    /// `opt` is used to specify the options for the explain operation.
1747    /// Details of the options can be found in [`ExplainOption`].
1748    /// ```
1749    /// # use datafusion::prelude::*;
1750    /// # use datafusion::error::Result;
1751    /// # #[tokio::main]
1752    /// # async fn main() -> Result<()> {
1753    /// use datafusion_expr::{Explain, ExplainOption};
1754    /// let ctx = SessionContext::new();
1755    /// let df = ctx
1756    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
1757    ///     .await?;
1758    /// let batches = df
1759    ///     .limit(0, Some(100))?
1760    ///     .explain_with_options(
1761    ///         ExplainOption::default()
1762    ///             .with_verbose(false)
1763    ///             .with_analyze(false),
1764    ///     )?
1765    ///     .collect()
1766    ///     .await?;
1767    /// # Ok(())
1768    /// # }
1769    /// ```
1770    pub fn explain_with_options(
1771        self,
1772        explain_option: ExplainOption,
1773    ) -> Result<DataFrame> {
1774        if matches!(self.plan, LogicalPlan::Explain(_)) {
1775            return plan_err!("Nested EXPLAINs are not supported");
1776        }
1777        let plan = LogicalPlanBuilder::from(self.plan)
1778            .explain_option_format(explain_option)?
1779            .build()?;
1780        Ok(DataFrame {
1781            session_state: self.session_state,
1782            plan,
1783            projection_requires_validation: self.projection_requires_validation,
1784        })
1785    }
1786
1787    /// Return a `FunctionRegistry` used to plan udf's calls
1788    ///
1789    /// # Example
1790    /// ```
1791    /// # use datafusion::prelude::*;
1792    /// # use datafusion::error::Result;
1793    /// # #[tokio::main]
1794    /// # async fn main() -> Result<()> {
1795    /// let ctx = SessionContext::new();
1796    /// let df = ctx
1797    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
1798    ///     .await?;
1799    /// let f = df.registry();
1800    /// // use f.udf("name", vec![...]) to use the udf
1801    /// # Ok(())
1802    /// # }
1803    /// ```
1804    pub fn registry(&self) -> &dyn FunctionRegistry {
1805        self.session_state.as_ref()
1806    }
1807
1808    /// Calculate the intersection of two [`DataFrame`]s.  The two [`DataFrame`]s must have exactly the same schema
1809    ///
1810    /// ```
1811    /// # use datafusion::prelude::*;
1812    /// # use datafusion::error::Result;
1813    /// # use datafusion_common::assert_batches_sorted_eq;
1814    /// # #[tokio::main]
1815    /// # async fn main() -> Result<()> {
1816    /// let ctx = SessionContext::new();
1817    /// let df = ctx
1818    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
1819    ///     .await?;
1820    /// let d2 = ctx
1821    ///     .read_csv("tests/data/example_long.csv", CsvReadOptions::new())
1822    ///     .await?;
1823    /// let df = df.intersect(d2)?;
1824    /// let expected = vec![
1825    ///     "+---+---+---+",
1826    ///     "| a | b | c |",
1827    ///     "+---+---+---+",
1828    ///     "| 1 | 2 | 3 |",
1829    ///     "+---+---+---+",
1830    /// ];
1831    /// # assert_batches_sorted_eq!(expected, &df.collect().await?);
1832    /// # Ok(())
1833    /// # }
1834    /// ```
1835    pub fn intersect(self, dataframe: DataFrame) -> Result<DataFrame> {
1836        let left_plan = self.plan;
1837        let right_plan = dataframe.plan;
1838        let plan = LogicalPlanBuilder::intersect(left_plan, right_plan, true)?;
1839        Ok(DataFrame {
1840            session_state: self.session_state,
1841            plan,
1842            projection_requires_validation: true,
1843        })
1844    }
1845
1846    /// Calculate the distinct intersection of two [`DataFrame`]s.  The two [`DataFrame`]s must have exactly the same schema
1847    ///
1848    /// ```
1849    /// # use datafusion::prelude::*;
1850    /// # use datafusion::error::Result;
1851    /// # use datafusion_common::assert_batches_sorted_eq;
1852    /// # #[tokio::main]
1853    /// # async fn main() -> Result<()> {
1854    /// let ctx = SessionContext::new();
1855    /// let df = ctx
1856    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
1857    ///     .await?;
1858    /// let d2 = ctx
1859    ///     .read_csv("tests/data/example_long.csv", CsvReadOptions::new())
1860    ///     .await?;
1861    /// let df = df.intersect_distinct(d2)?;
1862    /// let expected = vec![
1863    ///     "+---+---+---+",
1864    ///     "| a | b | c |",
1865    ///     "+---+---+---+",
1866    ///     "| 1 | 2 | 3 |",
1867    ///     "+---+---+---+",
1868    /// ];
1869    /// # assert_batches_sorted_eq!(expected, &df.collect().await?);
1870    /// # Ok(())
1871    /// # }
1872    /// ```
1873    pub fn intersect_distinct(self, dataframe: DataFrame) -> Result<DataFrame> {
1874        let left_plan = self.plan;
1875        let right_plan = dataframe.plan;
1876        let plan = LogicalPlanBuilder::intersect(left_plan, right_plan, false)?;
1877        Ok(DataFrame {
1878            session_state: self.session_state,
1879            plan,
1880            projection_requires_validation: true,
1881        })
1882    }
1883
1884    /// Calculate the exception of two [`DataFrame`]s.  The two [`DataFrame`]s must have exactly the same schema
1885    ///
1886    /// ```
1887    /// # use datafusion::prelude::*;
1888    /// # use datafusion::error::Result;
1889    /// # use datafusion_common::assert_batches_sorted_eq;
1890    /// # #[tokio::main]
1891    /// # async fn main() -> Result<()> {
1892    /// let ctx = SessionContext::new();
1893    /// let df = ctx
1894    ///     .read_csv("tests/data/example_long.csv", CsvReadOptions::new())
1895    ///     .await?;
1896    /// let d2 = ctx
1897    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
1898    ///     .await?;
1899    /// let result = df.except(d2)?;
    /// // these rows are in example_long.csv but not in example.csv
1901    /// let expected = vec![
1902    ///     "+---+---+---+",
1903    ///     "| a | b | c |",
1904    ///     "+---+---+---+",
1905    ///     "| 4 | 5 | 6 |",
1906    ///     "| 7 | 8 | 9 |",
1907    ///     "+---+---+---+",
1908    /// ];
1909    /// # assert_batches_sorted_eq!(expected, &result.collect().await?);
1910    /// # Ok(())
1911    /// # }
1912    /// ```
1913    pub fn except(self, dataframe: DataFrame) -> Result<DataFrame> {
1914        let left_plan = self.plan;
1915        let right_plan = dataframe.plan;
1916        let plan = LogicalPlanBuilder::except(left_plan, right_plan, true)?;
1917        Ok(DataFrame {
1918            session_state: self.session_state,
1919            plan,
1920            projection_requires_validation: true,
1921        })
1922    }
1923
1924    /// Calculate the distinct exception of two [`DataFrame`]s.  The two [`DataFrame`]s must have exactly the same schema
1925    ///
1926    /// ```
1927    /// # use datafusion::prelude::*;
1928    /// # use datafusion::error::Result;
1929    /// # use datafusion_common::assert_batches_sorted_eq;
1930    /// # #[tokio::main]
1931    /// # async fn main() -> Result<()> {
1932    /// let ctx = SessionContext::new();
1933    /// let df = ctx
1934    ///     .read_csv("tests/data/example_long.csv", CsvReadOptions::new())
1935    ///     .await?;
1936    /// let d2 = ctx
1937    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
1938    ///     .await?;
1939    /// let result = df.except_distinct(d2)?;
    /// // these rows are in example_long.csv but not in example.csv
1941    /// let expected = vec![
1942    ///     "+---+---+---+",
1943    ///     "| a | b | c |",
1944    ///     "+---+---+---+",
1945    ///     "| 4 | 5 | 6 |",
1946    ///     "| 7 | 8 | 9 |",
1947    ///     "+---+---+---+",
1948    /// ];
1949    /// # assert_batches_sorted_eq!(expected, &result.collect().await?);
1950    /// # Ok(())
1951    /// # }
1952    /// ```
1953    pub fn except_distinct(self, dataframe: DataFrame) -> Result<DataFrame> {
1954        let left_plan = self.plan;
1955        let right_plan = dataframe.plan;
1956        let plan = LogicalPlanBuilder::except(left_plan, right_plan, false)?;
1957        Ok(DataFrame {
1958            session_state: self.session_state,
1959            plan,
1960            projection_requires_validation: true,
1961        })
1962    }
1963
1964    /// Execute this `DataFrame` and write the results to `table_name`.
1965    ///
1966    /// Returns a single [RecordBatch] containing a single column and
1967    /// row representing the count of total rows written.
1968    ///
1969    /// Unlike most other `DataFrame` methods, this method executes eagerly.
1970    /// Data is written to the table using the [`TableProvider::insert_into`]
1971    /// method. This is the same underlying implementation used by SQL `INSERT
1972    /// INTO` statements.
1973    pub async fn write_table(
1974        self,
1975        table_name: &str,
1976        write_options: DataFrameWriteOptions,
1977    ) -> Result<Vec<RecordBatch>, DataFusionError> {
1978        let plan = if write_options.sort_by.is_empty() {
1979            self.plan
1980        } else {
1981            LogicalPlanBuilder::from(self.plan)
1982                .sort(write_options.sort_by)?
1983                .build()?
1984        };
1985
1986        let table_ref: TableReference = table_name.into();
1987        let table_schema = self.session_state.schema_for_ref(table_ref.clone())?;
1988        let target = match table_schema.table(table_ref.table()).await? {
1989            Some(ref provider) => Ok(Arc::clone(provider)),
1990            _ => plan_err!("No table named '{table_name}'"),
1991        }?;
1992
1993        let target = Arc::new(DefaultTableSource::new(target));
1994
1995        let plan = LogicalPlanBuilder::insert_into(
1996            plan,
1997            table_ref,
1998            target,
1999            write_options.insert_op,
2000        )?
2001        .build()?;
2002
2003        DataFrame {
2004            session_state: self.session_state,
2005            plan,
2006            projection_requires_validation: self.projection_requires_validation,
2007        }
2008        .collect()
2009        .await
2010    }
2011
2012    /// Execute the `DataFrame` and write the results to CSV file(s).
2013    ///
2014    /// # Example
2015    /// ```
2016    /// # use datafusion::prelude::*;
2017    /// # use datafusion::error::Result;
2018    /// # use std::fs;
2019    /// # #[tokio::main]
2020    /// # async fn main() -> Result<()> {
2021    /// use datafusion::dataframe::DataFrameWriteOptions;
2022    /// let ctx = SessionContext::new();
2023    /// // Sort the data by column "b" and write it to a new location
2024    /// ctx.read_csv("tests/data/example.csv", CsvReadOptions::new())
2025    ///     .await?
2026    ///     .sort(vec![col("b").sort(true, true)])? // sort by b asc, nulls first
2027    ///     .write_csv(
2028    ///         "output.csv",
2029    ///         DataFrameWriteOptions::new(),
2030    ///         None, // can also specify CSV writing options here
2031    ///     )
2032    ///     .await?;
2033    /// # fs::remove_file("output.csv")?;
2034    /// # Ok(())
2035    /// # }
2036    /// ```
2037    pub async fn write_csv(
2038        self,
2039        path: &str,
2040        options: DataFrameWriteOptions,
2041        writer_options: Option<CsvOptions>,
2042    ) -> Result<Vec<RecordBatch>, DataFusionError> {
2043        if options.insert_op != InsertOp::Append {
2044            return not_impl_err!(
2045                "{} is not implemented for DataFrame::write_csv.",
2046                options.insert_op
2047            );
2048        }
2049
2050        let format = if let Some(csv_opts) = writer_options {
2051            Arc::new(CsvFormatFactory::new_with_options(csv_opts))
2052        } else {
2053            Arc::new(CsvFormatFactory::new())
2054        };
2055
2056        let file_type = format_as_file_type(format);
2057
2058        let copy_options = options.build_sink_options();
2059
2060        let plan = if options.sort_by.is_empty() {
2061            self.plan
2062        } else {
2063            LogicalPlanBuilder::from(self.plan)
2064                .sort(options.sort_by)?
2065                .build()?
2066        };
2067
2068        let plan = LogicalPlanBuilder::copy_to(
2069            plan,
2070            path.into(),
2071            file_type,
2072            copy_options,
2073            options.partition_by,
2074        )?
2075        .build()?;
2076
2077        DataFrame {
2078            session_state: self.session_state,
2079            plan,
2080            projection_requires_validation: self.projection_requires_validation,
2081        }
2082        .collect()
2083        .await
2084    }
2085
2086    /// Execute the `DataFrame` and write the results to JSON file(s).
2087    ///
2088    /// # Example
2089    /// ```
2090    /// # use datafusion::prelude::*;
2091    /// # use datafusion::error::Result;
2092    /// # use std::fs;
2093    /// # #[tokio::main]
2094    /// # async fn main() -> Result<()> {
2095    /// use datafusion::dataframe::DataFrameWriteOptions;
2096    /// let ctx = SessionContext::new();
2097    /// // Sort the data by column "b" and write it to a new location
2098    /// ctx.read_csv("tests/data/example.csv", CsvReadOptions::new())
2099    ///     .await?
2100    ///     .sort(vec![col("b").sort(true, true)])? // sort by b asc, nulls first
2101    ///     .write_json("output.json", DataFrameWriteOptions::new(), None)
2102    ///     .await?;
2103    /// # fs::remove_file("output.json")?;
2104    /// # Ok(())
2105    /// # }
2106    /// ```
2107    pub async fn write_json(
2108        self,
2109        path: &str,
2110        options: DataFrameWriteOptions,
2111        writer_options: Option<JsonOptions>,
2112    ) -> Result<Vec<RecordBatch>, DataFusionError> {
2113        if options.insert_op != InsertOp::Append {
2114            return not_impl_err!(
2115                "{} is not implemented for DataFrame::write_json.",
2116                options.insert_op
2117            );
2118        }
2119
2120        let format = if let Some(json_opts) = writer_options {
2121            Arc::new(JsonFormatFactory::new_with_options(json_opts))
2122        } else {
2123            Arc::new(JsonFormatFactory::new())
2124        };
2125
2126        let file_type = format_as_file_type(format);
2127
2128        let copy_options = options.build_sink_options();
2129
2130        let plan = if options.sort_by.is_empty() {
2131            self.plan
2132        } else {
2133            LogicalPlanBuilder::from(self.plan)
2134                .sort(options.sort_by)?
2135                .build()?
2136        };
2137
2138        let plan = LogicalPlanBuilder::copy_to(
2139            plan,
2140            path.into(),
2141            file_type,
2142            copy_options,
2143            options.partition_by,
2144        )?
2145        .build()?;
2146
2147        DataFrame {
2148            session_state: self.session_state,
2149            plan,
2150            projection_requires_validation: self.projection_requires_validation,
2151        }
2152        .collect()
2153        .await
2154    }
2155
2156    /// Add or replace a column in the DataFrame.
2157    ///
2158    /// # Example
2159    /// ```
2160    /// # use datafusion::prelude::*;
2161    /// # use datafusion::error::Result;
2162    /// # #[tokio::main]
2163    /// # async fn main() -> Result<()> {
2164    /// let ctx = SessionContext::new();
2165    /// let df = ctx
2166    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
2167    ///     .await?;
2168    /// let df = df.with_column("ab_sum", col("a") + col("b"))?;
2169    /// # Ok(())
2170    /// # }
2171    /// ```
2172    pub fn with_column(self, name: &str, expr: Expr) -> Result<DataFrame> {
2173        let window_func_exprs = find_window_exprs([&expr]);
2174
2175        let original_names: HashSet<String> = self
2176            .plan
2177            .schema()
2178            .iter()
2179            .map(|(_, f)| f.name().clone())
2180            .collect();
2181
2182        // Maybe build window plan
2183        let plan = if window_func_exprs.is_empty() {
2184            self.plan
2185        } else {
2186            LogicalPlanBuilder::window_plan(self.plan, window_func_exprs)?
2187        };
2188
2189        let new_column = expr.alias(name);
2190        let mut col_exists = false;
2191
2192        let mut fields: Vec<(Expr, bool)> = plan
2193            .schema()
2194            .iter()
2195            .filter_map(|(qualifier, field)| {
2196                // Skip new fields introduced by window_plan
2197                if !original_names.contains(field.name()) {
2198                    return None;
2199                }
2200
2201                if field.name() == name {
2202                    col_exists = true;
2203                    Some((new_column.clone(), true))
2204                } else {
2205                    let e = col(Column::from((qualifier, field)));
2206                    Some((e, self.projection_requires_validation))
2207                }
2208            })
2209            .collect();
2210
2211        if !col_exists {
2212            fields.push((new_column, true));
2213        }
2214
2215        let project_plan = LogicalPlanBuilder::from(plan)
2216            .project_with_validation(fields)?
2217            .build()?;
2218
2219        Ok(DataFrame {
2220            session_state: self.session_state,
2221            plan: project_plan,
2222            projection_requires_validation: false,
2223        })
2224    }
2225
2226    /// Rename one column by applying a new projection. This is a no-op if the column to be
2227    /// renamed does not exist.
2228    ///
    /// To rename case-sensitively, wrap the column name in one of the
    /// following quote characters: `"`, `'`, or `` ` ``.
    ///
    /// Alternatively, setting the DataFusion option
    /// `datafusion.sql_parser.enable_ident_normalization` to `false` enables
    /// case-sensitive renames without needing to quote the column name.
2233    ///
2234    /// # Example
2235    /// ```
2236    /// # use datafusion::prelude::*;
2237    /// # use datafusion::error::Result;
2238    /// # #[tokio::main]
2239    /// # async fn main() -> Result<()> {
2240    /// let ctx = SessionContext::new();
2241    /// let df = ctx
2242    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
2243    ///     .await?;
    /// // `example.csv` has columns `a`, `b` and `c`; rename `a` to `a_renamed`
    /// let df = df.with_column_renamed("a", "a_renamed")?;
2245    ///
2246    /// # Ok(())
2247    /// # }
2248    /// ```
2249    pub fn with_column_renamed(
2250        self,
2251        old_name: impl Into<String>,
2252        new_name: &str,
2253    ) -> Result<DataFrame> {
2254        let ident_opts = self
2255            .session_state
2256            .config_options()
2257            .sql_parser
2258            .enable_ident_normalization;
2259        let old_column: Column = if ident_opts {
2260            Column::from_qualified_name(old_name)
2261        } else {
2262            Column::from_qualified_name_ignore_case(old_name)
2263        };
2264
2265        let (qualifier_rename, field_rename) =
2266            match self.plan.schema().qualified_field_from_column(&old_column) {
2267                Ok(qualifier_and_field) => qualifier_and_field,
2268                // no-op if field not found
2269                Err(DataFusionError::SchemaError(e, _))
2270                    if matches!(*e, SchemaError::FieldNotFound { .. }) =>
2271                {
2272                    return Ok(self);
2273                }
2274                Err(err) => return Err(err),
2275            };
2276        let projection = self
2277            .plan
2278            .schema()
2279            .iter()
2280            .map(|(qualifier, field)| {
2281                if qualifier.eq(&qualifier_rename) && field == field_rename {
2282                    (
2283                        col(Column::from((qualifier, field)))
2284                            .alias_qualified(qualifier.cloned(), new_name),
2285                        false,
2286                    )
2287                } else {
2288                    (col(Column::from((qualifier, field))), false)
2289                }
2290            })
2291            .collect::<Vec<_>>();
2292        let project_plan = LogicalPlanBuilder::from(self.plan)
2293            .project_with_validation(projection)?
2294            .build()?;
2295        Ok(DataFrame {
2296            session_state: self.session_state,
2297            plan: project_plan,
2298            projection_requires_validation: false,
2299        })
2300    }
2301
2302    /// Replace all parameters in logical plan with the specified
2303    /// values, in preparation for execution.
2304    ///
2305    /// # Example
2306    ///
2307    /// ```
2308    /// use datafusion::prelude::*;
2309    /// # use datafusion::{error::Result, assert_batches_eq};
2310    /// # #[tokio::main]
2311    /// # async fn main() -> Result<()> {
2312    /// # use datafusion_common::ScalarValue;
2313    /// let ctx = SessionContext::new();
2314    /// # ctx.register_csv("example", "tests/data/example.csv", CsvReadOptions::new()).await?;
2315    /// let results = ctx
2316    ///   .sql("SELECT a FROM example WHERE b = $1")
2317    ///   .await?
2318    ///    // replace $1 with value 2
2319    ///   .with_param_values(vec![
2320    ///      // value at index 0 --> $1
2321    ///      ScalarValue::from(2i64)
2322    ///    ])?
2323    ///   .collect()
2324    ///   .await?;
2325    /// assert_batches_eq!(
2326    ///  &[
2327    ///    "+---+",
2328    ///    "| a |",
2329    ///    "+---+",
2330    ///    "| 1 |",
2331    ///    "+---+",
2332    ///  ],
2333    ///  &results
2334    /// );
2335    /// // Note you can also provide named parameters
2336    /// let results = ctx
2337    ///   .sql("SELECT a FROM example WHERE b = $my_param")
2338    ///   .await?
2339    ///    // replace $my_param with value 2
2340    ///    // Note you can also use a HashMap as well
2341    ///   .with_param_values(vec![
2342    ///       ("my_param", ScalarValue::from(2i64))
2343    ///    ])?
2344    ///   .collect()
2345    ///   .await?;
2346    /// assert_batches_eq!(
2347    ///  &[
2348    ///    "+---+",
2349    ///    "| a |",
2350    ///    "+---+",
2351    ///    "| 1 |",
2352    ///    "+---+",
2353    ///  ],
2354    ///  &results
2355    /// );
2356    /// # Ok(())
2357    /// # }
2358    /// ```
2359    pub fn with_param_values(self, query_values: impl Into<ParamValues>) -> Result<Self> {
2360        let plan = self.plan.with_param_values(query_values)?;
2361        Ok(DataFrame {
2362            session_state: self.session_state,
2363            plan,
2364            projection_requires_validation: self.projection_requires_validation,
2365        })
2366    }
2367
2368    /// Cache DataFrame as a memory table.
2369    ///
2370    /// Default behavior could be changed using
2371    /// a [`crate::execution::session_state::CacheFactory`]
2372    /// configured via [`SessionState`].
2373    ///
2374    /// ```
2375    /// # use datafusion::prelude::*;
2376    /// # use datafusion::error::Result;
2377    /// # #[tokio::main]
2378    /// # async fn main() -> Result<()> {
2379    /// let ctx = SessionContext::new();
2380    /// let df = ctx
2381    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
2382    ///     .await?;
2383    /// let df = df.cache().await?;
2384    /// # Ok(())
2385    /// # }
2386    /// ```
2387    pub async fn cache(self) -> Result<DataFrame> {
2388        if let Some(cache_factory) = self.session_state.cache_factory() {
2389            let new_plan =
2390                cache_factory.create(self.plan, self.session_state.as_ref())?;
2391            Ok(Self::new(*self.session_state, new_plan))
2392        } else {
2393            let context = SessionContext::new_with_state((*self.session_state).clone());
2394            // The schema is consistent with the output
2395            let plan = self.clone().create_physical_plan().await?;
2396            let schema = plan.schema();
2397            let task_ctx = Arc::new(self.task_ctx());
2398            let partitions = collect_partitioned(plan, task_ctx).await?;
2399            let mem_table = MemTable::try_new(schema, partitions)?;
2400            context.read_table(Arc::new(mem_table))
2401        }
2402    }
2403
2404    /// Apply an alias to the DataFrame.
2405    ///
2406    /// This method replaces the qualifiers of output columns with the given alias.
2407    pub fn alias(self, alias: &str) -> Result<DataFrame> {
2408        let plan = LogicalPlanBuilder::from(self.plan).alias(alias)?.build()?;
2409        Ok(DataFrame {
2410            session_state: self.session_state,
2411            plan,
2412            projection_requires_validation: self.projection_requires_validation,
2413        })
2414    }
2415
2416    /// Fill null values in specified columns with a given value
2417    /// If no columns are specified (empty vector), applies to all columns
2418    /// Only fills if the value can be cast to the column's type
2419    ///
2420    /// # Arguments
2421    /// * `value` - Value to fill nulls with
2422    /// * `columns` - List of column names to fill. If empty, fills all columns.
2423    ///
2424    /// # Example
2425    /// ```
2426    /// # use datafusion::prelude::*;
2427    /// # use datafusion::error::Result;
2428    /// # use datafusion_common::ScalarValue;
2429    /// # #[tokio::main]
2430    /// # async fn main() -> Result<()> {
2431    /// let ctx = SessionContext::new();
2432    /// let df = ctx
2433    ///     .read_csv("tests/data/example.csv", CsvReadOptions::new())
2434    ///     .await?;
2435    /// // Fill nulls in only columns "a" and "c":
2436    /// let df = df.fill_null(ScalarValue::from(0), vec!["a".to_owned(), "c".to_owned()])?;
2437    /// // Fill nulls across all columns:
2438    /// let df = df.fill_null(ScalarValue::from(0), vec![])?;
2439    /// # Ok(())
2440    /// # }
2441    /// ```
2442    #[expect(clippy::needless_pass_by_value)]
2443    pub fn fill_null(
2444        &self,
2445        value: ScalarValue,
2446        columns: Vec<String>,
2447    ) -> Result<DataFrame> {
2448        let cols = if columns.is_empty() {
2449            self.logical_plan()
2450                .schema()
2451                .fields()
2452                .iter()
2453                .map(Arc::clone)
2454                .collect()
2455        } else {
2456            self.find_columns(&columns)?
2457        };
2458
2459        // Create projections for each column
2460        let projections = self
2461            .logical_plan()
2462            .schema()
2463            .fields()
2464            .iter()
2465            .map(|field| {
2466                if cols.contains(field) {
2467                    // Try to cast fill value to column type. If the cast fails, fallback to the original column.
2468                    match value.clone().cast_to(field.data_type()) {
2469                        Ok(fill_value) => Expr::Alias(Alias {
2470                            expr: Box::new(Expr::ScalarFunction(ScalarFunction {
2471                                func: coalesce(),
2472                                args: vec![col(field.name()), lit(fill_value)],
2473                            })),
2474                            relation: None,
2475                            name: field.name().to_string(),
2476                            metadata: None,
2477                        }),
2478                        Err(_) => col(field.name()),
2479                    }
2480                } else {
2481                    col(field.name())
2482                }
2483            })
2484            .collect::<Vec<_>>();
2485
2486        self.clone().select(projections)
2487    }
2488
2489    // Helper to find columns from names
2490    fn find_columns(&self, names: &[String]) -> Result<Vec<FieldRef>> {
2491        let schema = self.logical_plan().schema();
2492        names
2493            .iter()
2494            .map(|name| {
2495                schema
2496                    .field_with_name(None, name)
2497                    .cloned()
2498                    .map_err(|_| plan_datafusion_err!("Column '{}' not found", name))
2499            })
2500            .collect()
2501    }
2502
2503    /// Find qualified columns for this dataframe from names
2504    ///
2505    /// # Arguments
2506    /// * `names` - Unqualified names to find.
2507    ///
2508    /// # Example
2509    /// ```
2510    /// # use datafusion::prelude::*;
2511    /// # use datafusion::error::Result;
2512    /// # use datafusion_common::ScalarValue;
2513    /// # #[tokio::main]
2514    /// # async fn main() -> Result<()> {
2515    /// let ctx = SessionContext::new();
2516    /// ctx.register_csv("first_table", "tests/data/example.csv", CsvReadOptions::new())
2517    ///     .await?;
2518    /// let df = ctx.table("first_table").await?;
2519    /// ctx.register_csv("second_table", "tests/data/example.csv", CsvReadOptions::new())
2520    ///     .await?;
2521    /// let df2 = ctx.table("second_table").await?;
2522    /// let join_expr = df.find_qualified_columns(&["a"])?.iter()
2523    ///     .zip(df2.find_qualified_columns(&["a"])?.iter())
2524    ///     .map(|(col1, col2)| col(*col1).eq(col(*col2)))
2525    ///     .collect::<Vec<Expr>>();
2526    /// let df3 = df.join_on(df2, JoinType::Inner, join_expr)?;
2527    /// # Ok(())
2528    /// # }
2529    /// ```
2530    pub fn find_qualified_columns(
2531        &self,
2532        names: &[&str],
2533    ) -> Result<Vec<(Option<&TableReference>, &FieldRef)>> {
2534        let schema = self.logical_plan().schema();
2535        names
2536            .iter()
2537            .map(|name| {
2538                schema
2539                    .qualified_field_from_column(&Column::from_name(*name))
2540                    .map_err(|_| plan_datafusion_err!("Column '{}' not found", name))
2541            })
2542            .collect()
2543    }
2544
2545    /// Helper for creating DataFrame.
2546    /// # Example
2547    /// ```
2548    /// use arrow::array::{ArrayRef, Int32Array, StringArray};
2549    /// use datafusion::prelude::DataFrame;
2550    /// use std::sync::Arc;
2551    /// let id: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));
2552    /// let name: ArrayRef = Arc::new(StringArray::from(vec!["foo", "bar", "baz"]));
2553    /// let df = DataFrame::from_columns(vec![("id", id), ("name", name)]).unwrap();
2554    /// // +----+------+,
2555    /// // | id | name |,
2556    /// // +----+------+,
2557    /// // | 1  | foo  |,
2558    /// // | 2  | bar  |,
2559    /// // | 3  | baz  |,
2560    /// // +----+------+,
2561    /// ```
2562    pub fn from_columns(columns: Vec<(&str, ArrayRef)>) -> Result<Self> {
2563        let fields = columns
2564            .iter()
2565            .map(|(name, array)| Field::new(*name, array.data_type().clone(), true))
2566            .collect::<Vec<_>>();
2567
2568        let arrays = columns
2569            .into_iter()
2570            .map(|(_, array)| array)
2571            .collect::<Vec<_>>();
2572
2573        let schema = Arc::new(Schema::new(fields));
2574        let batch = RecordBatch::try_new(schema, arrays)?;
2575        let ctx = SessionContext::new();
2576        let df = ctx.read_batch(batch)?;
2577        Ok(df)
2578    }
2579}
2580
/// Macro for creating DataFrame.
///
/// Accepts zero or more `"name" => data` pairs. With no arguments it yields
/// a DataFrame with an empty schema; otherwise each `data` expression is
/// converted with `IntoArrayRef::into_array_ref` and the columns are passed to
/// `DataFrame::from_columns`. Either form evaluates to a `Result`.
///
/// All paths are resolved through `$crate`, so the macro keeps working when
/// the crate is renamed in the caller's `Cargo.toml`.
///
/// # Example
/// ```
/// use datafusion::prelude::dataframe;
/// # use datafusion::error::Result;
/// # #[tokio::main]
/// # async fn main() -> Result<()> {
/// let df = dataframe!(
///    "id" => [1, 2, 3],
///    "name" => ["foo", "bar", "baz"]
///  )?;
/// df.show().await?;
/// // +----+------+
/// // | id | name |
/// // +----+------+
/// // | 1  | foo  |
/// // | 2  | bar  |
/// // | 3  | baz  |
/// // +----+------+
/// let df_empty = dataframe!()?; // empty DataFrame
/// assert_eq!(df_empty.schema().fields().len(), 0);
/// assert_eq!(df_empty.count().await?, 0);
/// # Ok(())
/// # }
/// ```
#[macro_export]
macro_rules! dataframe {
    // No columns: read an empty batch with an empty schema.
    () => {{
        use std::sync::Arc;

        use $crate::prelude::SessionContext;
        use $crate::arrow::array::RecordBatch;
        use $crate::arrow::datatypes::Schema;

        let ctx = SessionContext::new();
        let batch = RecordBatch::new_empty(Arc::new(Schema::empty()));
        ctx.read_batch(batch)
    }};

    // One or more `name => data` pairs, with an optional trailing comma.
    ($($name:expr => $data:expr),+ $(,)?) => {{
        use $crate::prelude::DataFrame;
        use $crate::common::test_util::IntoArrayRef;

        let columns = vec![
            $(
                ($name, $data.into_array_ref()),
            )+
        ];

        DataFrame::from_columns(columns)
    }};
}
2633
2634#[derive(Debug)]
2635struct DataFrameTableProvider {
2636    plan: LogicalPlan,
2637    table_type: TableType,
2638}
2639
2640#[async_trait]
2641impl TableProvider for DataFrameTableProvider {
2642    fn as_any(&self) -> &dyn Any {
2643        self
2644    }
2645
2646    fn get_logical_plan(&self) -> Option<Cow<'_, LogicalPlan>> {
2647        Some(Cow::Borrowed(&self.plan))
2648    }
2649
2650    fn supports_filters_pushdown(
2651        &self,
2652        filters: &[&Expr],
2653    ) -> Result<Vec<TableProviderFilterPushDown>> {
2654        // A filter is added on the DataFrame when given
2655        Ok(vec![TableProviderFilterPushDown::Exact; filters.len()])
2656    }
2657
2658    fn schema(&self) -> SchemaRef {
2659        Arc::clone(self.plan.schema().inner())
2660    }
2661
2662    fn table_type(&self) -> TableType {
2663        self.table_type
2664    }
2665
2666    async fn scan(
2667        &self,
2668        state: &dyn Session,
2669        projection: Option<&Vec<usize>>,
2670        filters: &[Expr],
2671        limit: Option<usize>,
2672    ) -> Result<Arc<dyn ExecutionPlan>> {
2673        let mut expr = LogicalPlanBuilder::from(self.plan.clone());
2674        // Add filter when given
2675        let filter = filters.iter().cloned().reduce(|acc, new| acc.and(new));
2676        if let Some(filter) = filter {
2677            expr = expr.filter(filter)?
2678        }
2679
2680        if let Some(p) = projection {
2681            expr = expr.select(p.iter().copied())?
2682        }
2683
2684        // add a limit if given
2685        if let Some(l) = limit {
2686            expr = expr.limit(0, Some(l))?
2687        }
2688        let plan = expr.build()?;
2689        state.create_physical_plan(&plan).await
2690    }
2691}
2692
2693// see tests in datafusion/core/tests/dataframe/mod.rs:2816
datafusion/dataframe/mod.rs

datafusion/dataframe/
mod.rs