A DuckDB client for Elixir, providing a powerful Relation API for analytical queries.
Status: 🚀 Core Relation API implemented and tested - Ready for evaluation
DuckdbEx brings DuckDB's analytical power to Elixir with:
- Lazy Relation API: Composable, chainable query building inspired by Python's DuckDB API
- CLI-based Architecture: Uses DuckDB CLI via erlexec for maximum portability
- Idiomatic Elixir: Functional, pipe-friendly API with pattern matching
- Comprehensive Testing: TDD approach with 71 passing tests
- Python API Compatibility: Functions mirror the official Python API
# Add to mix.exs
def deps do
[
{:duckdb_ex, "~> 0.1.1"}
]
end💡 New! Check out the examples/ directory for 8 comprehensive, runnable examples:
mix run examples/00_quickstart.exs mix run examples/01_basic_queries.exs mix run examples/02_tables_and_data.exs # ... and more!
# Connect to in-memory database
{:ok, conn} = DuckdbEx.Connection.connect(:memory)
# Or connect to a file
{:ok, conn} = DuckdbEx.Connection.connect("/path/to/database.duckdb")
# Execute SQL directly
{:ok, result} = DuckdbEx.Connection.execute(conn, "SELECT 42 as answer")
# Fetch all rows as maps
{:ok, rows} = DuckdbEx.Connection.fetch_all(conn, "SELECT * FROM users")
# => [%{"id" => 1, "name" => "Alice"}, ...]
# Fetch single row
{:ok, row} = DuckdbEx.Connection.fetch_one(conn, "SELECT * FROM users LIMIT 1")
# => %{"id" => 1, "name" => "Alice"}
# Close connection
DuckdbEx.Connection.close(conn)The Relation API allows you to build complex queries through method chaining, with execution deferred until you fetch results:
# Create a relation (no execution yet)
relation = DuckdbEx.Connection.table(conn, "orders")
# Chain operations (still no execution)
result = relation
|> DuckdbEx.Relation.filter("amount > 100")
|> DuckdbEx.Relation.project(["customer_name", "amount", "order_date"])
|> DuckdbEx.Relation.order("amount DESC")
|> DuckdbEx.Relation.limit(10)
|> DuckdbEx.Relation.fetch_all() # Executes here
# Result: Top 10 orders over $100
{:ok, rows} = result# From a table or view
relation = DuckdbEx.Connection.table(conn, "products")
# From SQL
relation = DuckdbEx.Connection.sql(conn, "SELECT * FROM generate_series(1, 100)")
# From range (using DuckDB's range function)
relation = DuckdbEx.Connection.sql(conn, "SELECT * FROM range(10)")# Simple filter
relation
|> DuckdbEx.Relation.filter("price > 50")
|> DuckdbEx.Relation.fetch_all()
# Chain multiple filters (AND logic)
relation
|> DuckdbEx.Relation.filter("price > 50")
|> DuckdbEx.Relation.filter("category = 'Electronics'")
|> DuckdbEx.Relation.fetch_all()
# Complex conditions
relation
|> DuckdbEx.Relation.filter("price > 50 AND (category = 'Electronics' OR category = 'Computers')")
|> DuckdbEx.Relation.fetch_all()# Select specific columns
relation
|> DuckdbEx.Relation.project(["name", "price"])
|> DuckdbEx.Relation.fetch_all()
# Use expressions
relation
|> DuckdbEx.Relation.project([
"name",
"price",
"price * 1.1 as price_with_tax",
"upper(category) as category_upper"
])
|> DuckdbEx.Relation.fetch_all()# Order by column
relation
|> DuckdbEx.Relation.order("price DESC")
|> DuckdbEx.Relation.fetch_all()
# Multiple columns
relation
|> DuckdbEx.Relation.order("category ASC, price DESC")
|> DuckdbEx.Relation.fetch_all()
# Limit results
relation
|> DuckdbEx.Relation.limit(100)
|> DuckdbEx.Relation.fetch_all()
# Top-N query
relation
|> DuckdbEx.Relation.order("revenue DESC")
|> DuckdbEx.Relation.limit(10)
|> DuckdbEx.Relation.fetch_all()# Count all rows
relation
|> DuckdbEx.Relation.aggregate("count(*) as total")
|> DuckdbEx.Relation.fetch_all()
# => {:ok, [%{"total" => 1000}]}
# Multiple aggregations
relation
|> DuckdbEx.Relation.aggregate([
"count(*) as count",
"sum(amount) as total",
"avg(amount) as average",
"min(amount) as minimum",
"max(amount) as maximum"
])
|> DuckdbEx.Relation.fetch_all()# Group by single column
DuckdbEx.Connection.table(conn, "sales")
|> DuckdbEx.Relation.aggregate(
"sum(amount) as total_sales",
group_by: ["region"]
)
|> DuckdbEx.Relation.fetch_all()
# Group by multiple columns
DuckdbEx.Connection.table(conn, "sales")
|> DuckdbEx.Relation.aggregate(
["sum(amount) as total", "count(*) as count"],
group_by: ["region", "year"]
)
|> DuckdbEx.Relation.fetch_all()
# With filtering and ordering
DuckdbEx.Connection.table(conn, "products")
|> DuckdbEx.Relation.filter("price > 10") # WHERE clause
|> DuckdbEx.Relation.aggregate(
["sum(price) as total", "count(*) as count"],
group_by: ["category"]
)
|> DuckdbEx.Relation.filter("total > 1000") # HAVING clause
|> DuckdbEx.Relation.order("total DESC")
|> DuckdbEx.Relation.fetch_all()# Count rows
relation |> DuckdbEx.Relation.count() |> DuckdbEx.Relation.fetch_all()
# => {:ok, [%{"count" => 100}]}
# Sum a column
relation |> DuckdbEx.Relation.sum("amount") |> DuckdbEx.Relation.fetch_all()
# => {:ok, [%{"sum" => 45000}]}
# Average
relation |> DuckdbEx.Relation.avg("price") |> DuckdbEx.Relation.fetch_all()
# => {:ok, [%{"avg" => 42.5}]}
# Min/Max
relation |> DuckdbEx.Relation.min("temperature") |> DuckdbEx.Relation.fetch_all()
relation |> DuckdbEx.Relation.max("score") |> DuckdbEx.Relation.fetch_all(){:ok, conn} = DuckdbEx.Connection.connect(:memory)
# Create and populate table
DuckdbEx.Connection.execute(conn, """
CREATE TABLE orders (
order_id INTEGER,
customer_name VARCHAR,
product_category VARCHAR,
amount DECIMAL(10,2),
order_date DATE
)
""")
DuckdbEx.Connection.execute(conn, """
INSERT INTO orders VALUES
(1, 'Alice', 'Electronics', 999.99, '2024-01-15'),
(2, 'Bob', 'Books', 29.99, '2024-01-16'),
(3, 'Alice', 'Electronics', 49.99, '2024-01-17'),
(4, 'Charlie', 'Furniture', 599.99, '2024-01-18'),
(5, 'Bob', 'Electronics', 299.99, '2024-01-19')
""")
# Analyze: Top customers by total spending in Electronics
{:ok, top_customers} =
conn
|> DuckdbEx.Connection.table("orders")
|> DuckdbEx.Relation.filter("product_category = 'Electronics'")
|> DuckdbEx.Relation.aggregate(
["sum(amount) as total_spent", "count(*) as order_count"],
group_by: ["customer_name"]
)
|> DuckdbEx.Relation.filter("total_spent > 100")
|> DuckdbEx.Relation.order("total_spent DESC")
|> DuckdbEx.Relation.fetch_all()
# Result:
# [
# %{"customer_name" => "Alice", "total_spent" => 1049.98, "order_count" => 2},
# %{"customer_name" => "Bob", "total_spent" => 299.99, "order_count" => 1}
# ]# Daily sales aggregation with statistical measures
{:ok, daily_stats} =
conn
|> DuckdbEx.Connection.table("sales")
|> DuckdbEx.Relation.aggregate(
[
"date_trunc('day', timestamp) as day",
"sum(amount) as daily_total",
"avg(amount) as daily_avg",
"stddev_pop(amount) as daily_stddev",
"count(*) as transaction_count"
],
group_by: ["date_trunc('day', timestamp)"]
)
|> DuckdbEx.Relation.order("day DESC")
|> DuckdbEx.Relation.limit(30)
|> DuckdbEx.Relation.fetch_all()defmodule DataPipeline do
def process_sales_data(conn) do
# Reusable base relation
base = DuckdbEx.Connection.table(conn, "raw_sales")
# High-value customers
high_value = base
|> DuckdbEx.Relation.filter("total_purchases > 1000")
|> DuckdbEx.Relation.project(["customer_id", "email"])
# Recent activity
recent = base
|> DuckdbEx.Relation.filter("order_date > '2024-01-01'")
|> DuckdbEx.Relation.aggregate(
"count(*) as recent_orders",
group_by: ["customer_id"]
)
# Execute both queries
{:ok, high_value_customers} = DuckdbEx.Relation.fetch_all(high_value)
{:ok, recent_activity} = DuckdbEx.Relation.fetch_all(recent)
{high_value_customers, recent_activity}
end
end# Use DuckDB's built-in functions
conn
|> DuckdbEx.Connection.sql("SELECT * FROM range(100)")
|> DuckdbEx.Relation.filter("range % 2 = 0") # Even numbers only
|> DuckdbEx.Relation.project(["range", "range * range as squared"])
|> DuckdbEx.Relation.fetch_all()
# Generate test data
conn
|> DuckdbEx.Connection.sql("SELECT * FROM generate_series(1, 1000) as id")
|> DuckdbEx.Relation.project([
"id",
"random() as random_value",
"case when id % 2 = 0 then 'even' else 'odd' end as parity"
])
|> DuckdbEx.Relation.aggregate(
["avg(random_value) as avg_random", "count(*) as count"],
group_by: ["parity"]
)
|> DuckdbEx.Relation.fetch_all()connect(database, opts \\ [])- Open database connectionexecute(conn, sql, params \\ [])- Execute SQL queryfetch_all(conn, sql)- Execute and fetch all rowsfetch_one(conn, sql)- Execute and fetch first rowclose(conn)- Close connectionsql(conn, sql)- Create relation from SQLtable(conn, table_name)- Create relation from table
Transformations (lazy, return new relation):
project(relation, columns)- Select columnsfilter(relation, condition)- Filter rowslimit(relation, n)- Limit resultsorder(relation, order_by)- Sort resultsaggregate(relation, expressions, opts \\ [])- Aggregate data
Convenience Aggregates:
count(relation)- Count rowssum(relation, column)- Sum columnavg(relation, column)- Average columnmin(relation, column)- Minimum valuemax(relation, column)- Maximum value
Execution (trigger query execution):
execute(relation)- Execute and return result structfetch_all(relation)- Execute and fetch all rowsfetch_one(relation)- Execute and fetch first rowfetch_many(relation, n)- Execute and fetch N rows
fetch_all(result)- Get all rows as list of mapsfetch_one(result)- Get first row as mapfetch_many(result, n)- Get N rows as list of mapsrow_count(result)- Get number of rowscolumns(result)- Get column namesto_tuples(result)- Convert rows to tuples
DuckdbEx uses the DuckDB CLI process via erlexec instead of native NIFs:
Advantages:
- ✅ Maximum portability (works everywhere DuckDB CLI works)
- ✅ No compilation needed
- ✅ Easy to debug and maintain
- ✅ Handles all DuckDB features automatically
Trade-offs:
- JSON serialization overhead (minimal for analytical queries)
- No zero-copy data transfer
- Cannot implement native UDFs
This architecture is ideal for analytical workloads where query execution time dominates, and the JSON overhead is negligible compared to query processing.
The examples/ directory contains 8 comprehensive, runnable examples demonstrating all features:
| Example | Description | Run With |
|---|---|---|
00_quickstart.exs |
Your first DuckDB query | mix run examples/00_quickstart.exs |
01_basic_queries.exs |
Simple queries, math, strings, dates | mix run examples/01_basic_queries.exs |
02_tables_and_data.exs |
CREATE, INSERT, UPDATE, DELETE | mix run examples/02_tables_and_data.exs |
03_transactions.exs |
Transaction management | mix run examples/03_transactions.exs |
04_relations_api.exs |
Lazy query building | mix run examples/04_relations_api.exs |
05_csv_parquet_json.exs |
Reading/writing files | mix run examples/05_csv_parquet_json.exs |
06_analytics_window_functions.exs |
Advanced analytics | mix run examples/06_analytics_window_functions.exs |
07_persistent_database.exs |
File-based databases | mix run examples/07_persistent_database.exs |
See examples/README.md for detailed descriptions and more information.
# Run all tests
mix test
# Run specific test file
mix test test/duckdb_ex/relation_test.exs
# Run with coverage
mix test --cover
# Run with specific seed
mix test --seed 123456Current Test Coverage: 114 tests, 100% pass rate (after performance optimization)
Core Connection API:
- Connection management (connect, close)
- Query execution (execute) with deterministic completion detection
- Result fetching (fetch_all, fetch_one)
- Exception hierarchy (27 types)
- Transaction management (begin, commit, rollback, transaction helper)
- Checkpoint support
- Read-only connections
Relation API - Basic Operations:
- Relation creation (sql, table)
- Projections (project)
- Filtering (filter)
- Ordering (order)
- Limiting (limit)
- Lazy evaluation
Relation API - Aggregations:
- Generic aggregation (aggregate)
- GROUP BY support
- HAVING clause (via filter after aggregate)
- Convenience methods (count, sum, avg, min, max)
- Statistical functions (stddev, variance)
Relation API - Advanced:
- Joins (inner, left, right, outer, cross)
- Set operations (union, intersect, except)
- Distinct operations
File Format Support:
- CSV reading/writing (read_csv_auto, COPY TO)
- Parquet reading/writing
- JSON reading/writing
- Direct file querying
Performance:
- Optimized query execution (100-200x faster via completion markers)
- Tests run in ~1 second (previously took minutes due to timeouts)
- Explorer DataFrame integration
- Prepared statements
- Extensions management
- Streaming results
This project follows strict Test-Driven Development (TDD):
- RED: Write failing tests first
- GREEN: Implement minimal code to pass tests
- REFACTOR: Improve code while keeping tests green
- DOCUMENT: Add comprehensive docs and examples
All contributions should:
- Include comprehensive tests
- Follow existing code style
- Reference Python API where applicable
- Maintain 100% test pass rate
# Python DuckDB
import duckdb
conn = duckdb.connect()
rel = conn.table('users')
result = (rel
.filter('age > 25')
.project(['name', 'email'])
.order('name')
.limit(10)
.fetchall())# Elixir DuckDB
{:ok, conn} = DuckdbEx.Connection.connect(:memory)
{:ok, result} = conn
|> DuckdbEx.Connection.table("users")
|> DuckdbEx.Relation.filter("age > 25")
|> DuckdbEx.Relation.project(["name", "email"])
|> DuckdbEx.Relation.order("name")
|> DuckdbEx.Relation.limit(10)
|> DuckdbEx.Relation.fetch_all()API is intentionally similar for easy migration!
DuckdbEx uses a completion marker approach for deterministic query completion detection instead of timeouts:
- 100-200x faster query execution (7-12ms vs 1000-2000ms per query)
- Full test suite runs in ~1 second (114 tests)
- No arbitrary timeouts or guessing
- Proper error handling for aborted transactions
Instead of waiting for timeouts, we append a marker query after each command:
-- Your query
SELECT * FROM users;
-- Completion marker (added automatically)
SELECT '__DUCKDB_COMPLETE__' as __status__;When we see the marker in the output, we know DuckDB is done. The marker is stripped before returning results to you.
- Deterministic: We know exactly when queries complete
- Fast: No waiting for arbitrary timeouts
- Reliable: Works for all query types (SELECT, DDL, DML)
- Error-aware: Special handling for aborted transactions
- DuckDB excels at analytical queries on large datasets
- Relation API allows DuckDB to optimize entire query tree
- JSON overhead is minimal compared to query execution time
- Best for OLAP workloads, not OLTP
- Elixir 1.14+
- Erlang/OTP 25+
- DuckDB CLI installed and in PATH
MIT License - see LICENSE file for details.
- DuckDB - The amazing analytical database
- DuckDB Python API - API design inspiration
- Community contributors
For questions and discussions:
- Open an issue on GitHub
- Check DuckDB documentation
- Review the
docs/directory for detailed guides
Made with ❤️ for the Elixir and DuckDB communities