From b68ccbe75d8632df8ac0023a1debc4a5b9a1353d Mon Sep 17 00:00:00 2001 From: Lona Date: Tue, 8 Apr 2025 13:05:28 +0200 Subject: [PATCH 01/14] add polars backend --- DESCRIPTION | 11 +- R/DataBackendPolars.R | 249 +++++++++++++++++++++ R/as_polars_backend.R | 54 +++++ tests/testthat/test_as_polars_backend.R | 31 +++ tests/testthat/test_polars.R | 83 +++++++ tests/testthat/test_train_predict_polars.R | 25 +++ 6 files changed, 451 insertions(+), 2 deletions(-) create mode 100644 R/DataBackendPolars.R create mode 100644 R/as_polars_backend.R create mode 100644 tests/testthat/test_as_polars_backend.R create mode 100644 tests/testthat/test_polars.R create mode 100644 tests/testthat/test_train_predict_polars.R diff --git a/DESCRIPTION b/DESCRIPTION index 3b59d1a..3071ae6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,11 +2,17 @@ Package: mlr3db Title: Data Base Backend for 'mlr3' Version: 0.5.1-9000 Authors@R: - person(given = "Michel", + c( + person(given = "Michel", family = "Lang", role = c("cre", "aut"), email = "michellang@gmail.com", - comment = c(ORCID = "0000-0001-9754-0393")) + comment = c(ORCID = "0000-0001-9754-0393")), + person(given = "Lona", + family = "Koers", + role = c("ctb"), + email = "lona.koers@gmail.com") + ) Description: Extends the 'mlr3' package with a backend to transparently work with databases such as 'SQLite', 'DuckDB', 'MySQL', 'MariaDB', or 'PostgreSQL'. The package provides two additional backends: @@ -36,6 +42,7 @@ Suggests: future.apply, future.callr, lgr, + polars, testthat (>= 3.0.0), tibble Encoding: UTF-8 diff --git a/R/DataBackendPolars.R b/R/DataBackendPolars.R new file mode 100644 index 0000000..6de1ea1 --- /dev/null +++ b/R/DataBackendPolars.R @@ -0,0 +1,249 @@ +#' @title DataBackend for Polars +#' +#' @description +#' A [mlr3::DataBackend] using [polars::RPolarsLazyFrame] from package \CRANpkg{polars}. +#' Can be easily constructed with [as_polars_backend()]. 
+#' [mlr3::Task]s can interface out-of-memory files if the [polars::RPolarsLazyFrame] was imported using a `polars::scan_x` function. +#' Streaming, a \CRANpkg{polars} alpha feature, is always enabled, but only used when applicable. +#' Connector is not required but can be useful e.g. for scanning larger than memory files +#' +#' @seealso +#' \url{https://pola-rs.github.io/r-polars/} +#' +#' @param rows (`integer()`)\cr +#' Row indices. +#' @param cols (`character()`)\cr +#' Column names. +#' @param na_rm (`logical(1)`)\cr +#' Whether to remove NAs or not. +#' @param primary_key (`character(1)`)\cr +#' Name of the primary key column. +#' Because `polars` does not natively support primary keys, uniqueness of the primary key column is expected but not enforced. +#' @param connector (`function()`)\cr +#' Optional function which is called to re-connect to e.g. a source file in case the connection became invalid. +#' +#' @template param_strings_as_factors +#' +#' @importFrom mlr3 DataBackend +#' @export +DataBackendPolars = R6Class("DataBackendPolars", inherit = DataBackend, cloneable = FALSE, + public = list( + #' @template field_levels + levels = NULL, + + #' @template field_connector + connector = NULL, + + #' @description + #' + #' Creates a backend for a [polars::RPolarsLazyFrame] object. + #' + #' @param data ([polars::RPolarsLazyFrame])\cr + #' The data object. + #' + #' Instead of calling the constructor itself, please call [mlr3::as_data_backend()] on + #' a [polars::RPolarsLazyFrame] or [polars::RPolarsDataFrame]. + #' Note that only [polars::RPolarsLazyFrame]s will be converted to a [DataBackendPolars]. + #' [polars::RPolarsDataFrame] objects without lazy execution will be converted to a + #' [DataBackendDataTable][mlr3::DataBackendDataTable]. 
+ initialize = function(data, primary_key, strings_as_factors = TRUE, connector = NULL) { + loadNamespace("polars") + assert_choice(class(data), "RPolarsLazyFrame") + + super$initialize(data, primary_key) + assert_choice(primary_key, colnames(data)) + self$connector = assert_function(connector, args = character(), null.ok = TRUE) + + if (isFALSE(strings_as_factors)) { + self$levels = list() + } else { + h = self$head(1L) + string_cols = setdiff(names(h)[map_lgl(h, function(x) {is.character(x) || is.factor(x)})], self$primary_key) + + if (isTRUE(strings_as_factors)) { + strings_as_factors = string_cols + } else { + assert_subset(strings_as_factors, string_cols) + } + + self$levels = self$distinct(rows = NULL, cols = strings_as_factors) + } + }, + + #' @description + #' Returns a slice of the data. + #' + #' The rows must be addressed as vector of primary key values, columns must be referred to via column names. + #' Queries for rows with no matching row id and queries for columns with no matching + #' column name are silently ignored. + data = function(rows, cols) { + private$.reconnect() + rows = assert_integerish(rows, coerce = TRUE) + assert_names(cols, type = "unique") + cols = intersect(cols, self$colnames) + + data = private$.data + res = data$filter(pl$col(self$primary_key)$is_in(rows))$select(pl$col(union(self$primary_key, cols)))$collect(streaming = TRUE) + res = as.data.table(res) + + recode(res[list(rows), cols, nomatch = NULL, on = self$primary_key, with = FALSE], + self$levels) + }, + + #' @description + #' Retrieve the first `n` rows. + #' + #' @param n (`integer(1)`)\cr + #' Number of rows. + #' + #' @return [data.table::data.table()] of the first `n` rows. + head = function(n = 6L) { + private$.reconnect() + recode(as.data.table(private$.data$head(n)$collect(streaming = TRUE)), self$levels) + }, + + #' @description + #' Returns a named list of vectors of distinct values for each column + #' specified. 
If `na_rm` is `TRUE`, missing values are removed from the + #' returned vectors of distinct values. Non-existing rows and columns are + #' silently ignored. + #' + #' @return Named `list()` of distinct values. + distinct = function(rows, cols, na_rm = TRUE) { + private$.reconnect() + assert_names(cols, type = "unique") + cols = intersect(cols, self$colnames) + + dat = private$.data + + if (!is.null(rows)) { + dat = dat$filter(pl$col(self$primary_key)$is_in(rows)) + } + + get_distinct = function(col) { + x = as.vector( + dat$select( + pl$col(col)$unique() + )$collect(streaming = TRUE)$get_column(col) + ) + + if (is.factor(x)) { + x = as.character(x) + } + if (na_rm) { + x = x[!is.na(x)] + } + x + } + setNames(lapply(cols, get_distinct), cols) + }, + + #' @description + #' Returns the number of missing values per column in the specified slice + #' of data. Non-existing rows and columns are silently ignored. + #' + #' @return Total of missing values per column (named `numeric()`). + missings = function(rows, cols) { + private$.reconnect() + rows = assert_integerish(rows, coerce = TRUE) + assert_names(cols, type = "unique") + + cols = intersect(cols, self$colnames) + if (length(cols) == 0L) { + return(setNames(integer(0L), character(0L))) + } + + res = private$.data$filter( + pl$col(self$primary_key)$is_in(rows) + ) + res = res$select( + lapply(cols, function(col) { + pl$col(col)$is_null()$sum()$alias(col) + }) + )$collect(streaming = TRUE) + + if (res$height == 0L) { + return(setNames(integer(length(cols)), cols)) + } + + setNames(mlr3misc::map_int(cols, function(col) as.integer(as.vector(res$get_column(col)))), cols) + } + ), + + active = list( + #' @field rownames (`integer()`)\cr + #' Returns vector of all distinct row identifiers, i.e. the contents of the primary key column. 
+ rownames = function() { + private$.reconnect() + + as.vector( + private$.data$ + select(pl$col(self$primary_key))$ + collect()$ + get_column(self$primary_key) + ) + }, + + #' @field colnames (`character()`)\cr + #' Returns vector of all column names, including the primary key column. + colnames = function() { + private$.reconnect() + names(private$.data$schema) + }, + + #' @field nrow (`integer(1)`)\cr + #' Number of rows (observations). + nrow = function() { + private$.reconnect() + n = private$.data$select(pl$len())$collect(streaming = TRUE)$item() + as.integer(n) + }, + + #' @field ncol (`integer(1)`)\cr + #' Number of columns (variables), including the primary key column. + ncol = function() { + private$.reconnect() + length(private$.data$schema) + } + ), + + private = list( + .calculate_hash = function() { + private$.reconnect() + calculate_hash(private$.data) + }, + + .reconnect = function() { + if (is.null(self$connector)) { + return(invisible()) + } + + con = self$connector() + + if (!all(class(private$.data) == class(con))) { + stop(sprintf("Reconnecting failed. Expected a connection of class %s, but got %s", + paste0(class(private$.data), collapse = "/"), paste0(class(con), collapse = "/")), call. = FALSE) + } + + private$.data = con + } + ) +) + +#' @importFrom mlr3 as_data_backend +#' @export +as_data_backend.RPolarsDataFrame = function(data, primary_key = NULL, ...) { # nolint + data = as.data.frame(data) + + if (!is.null(primary_key) && test_integerish(data[[primary_key]])) { + data[[primary_key]] = as.integer(data[[primary_key]]) + } + + as_data_backend(data, primary_key = primary_key) +} + +#' @importFrom mlr3 as_data_backend +#' @export +as_data_backend.RPolarsLazyFrame = function(data, primary_key, strings_as_factors = TRUE, ...) 
{ # nolint + DataBackendPolars$new(data, primary_key, strings_as_factors) +} diff --git a/R/as_polars_backend.R b/R/as_polars_backend.R new file mode 100644 index 0000000..28e575b --- /dev/null +++ b/R/as_polars_backend.R @@ -0,0 +1,54 @@ +#' @title Convert to Polars Backend +#' +#' @description +#' Converts to a [DataBackendPolars] using the \CRANpkg{polars} database, depending on the input type: +#' +#' * `data.frame`: Creates a new [DataBackendDataTable] first using [as_data_backend()], then proceeds +#' with the conversion from [DataBackendDataTable] to [DataBackendPolars]. +#' * [mlr3::DataBackend]: Creates a new [DataBackendPolars]. +#' +#' There is no automatic connection to the origin file set. +#' If the data is obtained using streaming, a `connector` can be set manually but is not required. +#' +#' @param data (`data.frame()` | [mlr3::DataBackend])\cr +#' See description. +#' @param streaming (`logical(1)`)\cr +#' Whether the data should be only scanned (recommended for large data sets) or loaded into memory completely with every [DataBackendPolars] operation. +#' +#' @param ... (`any`)\cr +#' Additional arguments, passed to [DataBackendPolars]. +#' @template param_path +#' +#' @return [DataBackendPolars] or [Task]. +#' @export +as_polars_backend = function(data, path = getOption("mlr3db.polars_dir", ":temp:"), ...) { + UseMethod("as_polars_backend") +} + + +#' @export +as_polars_backend.data.frame = function(data, primary_key = NULL, ...) { + backend = as_data_backend(data, primary_key = primary_key) + as_polars_backend.DataBackend(backend, ...) +} + + +#' @export +as_polars_backend.DataBackend = function(data, streaming = FALSE, ...) 
{ + path = get_db_path(tempfile(), data$hash, "polars") + + on.exit({ + if (file.exists(path)) file.remove(path) + }) + + primary_key = data$primary_key + + if(streaming) { + as_polars_df(data$head(Inf))$write_parquet(sprintf("%s.parquet", path)) + data = pl$scan_parquet(sprintf("%s.parquet", path)) + } else { + data = as_polars_lf(data$head(Inf)) + } + + DataBackendPolars$new(data = data, primary_key = primary_key, ...) +} diff --git a/tests/testthat/test_as_polars_backend.R b/tests/testthat/test_as_polars_backend.R new file mode 100644 index 0000000..d18504b --- /dev/null +++ b/tests/testthat/test_as_polars_backend.R @@ -0,0 +1,31 @@ +skip_if_not_installed("polars") + +test_that("data.frame", { + b = as_polars_backend(iris, path = tempfile()) + expect_r6(b, "DataBackendPolars") + expect_backend(b) + expect_iris_backend(b) +}) + +test_that("DataBackend", { + # without streaming + b = as_polars_backend(mlr3::mlr_tasks$get("iris")$backend, path = tempfile()) + expect_r6(b, "DataBackendPolars") + expect_iris_backend(b) + expect_backend(b) + + # with streaming + b = as_polars_backend(mlr3::mlr_tasks$get("iris")$backend, path = tempfile(), streaming = TRUE) + expect_r6(b, "DataBackendPolars") + expect_iris_backend(b) + expect_backend(b) +}) + +test_that("Task", { + task = mlr3::tsk("iris") + task$backend = as_polars_backend(task$backend) + expect_r6(task$backend, "DataBackendPolars") + expect_backend(task$backend) + expect_task(task) +}) + diff --git a/tests/testthat/test_polars.R b/tests/testthat/test_polars.R new file mode 100644 index 0000000..8cde99f --- /dev/null +++ b/tests/testthat/test_polars.R @@ -0,0 +1,83 @@ +skip_if_not_installed("polars") + +test_that("valid DataBackend (polars DataFrame)", { + data = iris + data$Petal.Length[91:120] = NA + data = as_polars_df(data) + b = as_data_backend(data) + expect_backend(b) + expect_iris_backend(b, n_missing = 30L) +}) + +test_that("valid DataBackend (polars LazyFrame)", { + data = iris + data$Petal.Length[91:120] = 
NA + data = as_polars_lf(data)$with_row_index("row_id", offset = 1L) + b = DataBackendPolars$new(data, "row_id", strings_as_factors = TRUE) + expect_backend(b) + expect_iris_backend(b, n_missing = 30L) +}) + +test_that("valid DataBackend with scanning", { + as_polars_df(iris)$with_row_index("row_id", offset = 1L)$write_parquet("iris.parquet") + on.exit({ + if (file.exists("iris.parquet")) { + file.remove("iris.parquet") + } + }, add = TRUE) + + data = pl$scan_parquet("iris.parquet") + + # valid scanning + b = DataBackendPolars$new(data, "row_id", strings_as_factors = TRUE) + expect_backend(b) + expect_equal(b$nrow, nrow(iris)) + + # valid with connector + b = DataBackendPolars$new(data, "row_id", strings_as_factors = TRUE, + connector = function() pl$scan_parquet("iris.parquet")) + expect_backend(b) + expect_equal(b$nrow, nrow(iris)) +}) + +test_that("strings_as_factors", { + data = iris + data$Species = as.character(data$Species) + data = as_polars_lf(data)$with_row_index("row_id", offset = 1L) + + b_str = DataBackendPolars$new(data = data, "row_id", strings_as_factors = FALSE) + expect_character(b_str$head()$Species, any.missing = FALSE) + expect_character(b_str$data(b_str$rownames[1], "Species")$Species, any.missing = FALSE) + + b_fact = DataBackendPolars$new(data = data, "row_id", strings_as_factors = TRUE) + expect_factor(b_fact$head()$Species, any.missing = FALSE) + expect_factor(b_fact$data(b_fact$rownames[1], "Species")$Species, any.missing = FALSE) + + b_species = DataBackendPolars$new(data = data, "row_id", strings_as_factors = "Species") + expect_factor(b_species$head()$Species, any.missing = FALSE) + expect_factor(b_species$data(b_species$rownames[1], "Species")$Species, any.missing = FALSE) + + expect_error(DataBackendPolars$new(data = data, "row_id", strings_as_factors = "Sepal.Length")) +}) + +test_that("as_data_backend", { + data = iris + + pl_df = as_polars_df(data)$with_row_index("row_id", offset = 1L) + b = as_data_backend(pl_df, primary_key = 
"row_id") + expect_r6(b, "DataBackendDataTable") + + pl_lf = as_polars_lf(data)$with_row_index("row_id", offset = 1L) + b = as_data_backend(pl_lf, primary_key = "row_id") + expect_r6(b, "DataBackendPolars") +}) + +test_that("distinct with NULL rows", { + data = as_polars_df(iris) + b = as_data_backend(data) + + expect_equal( + b$distinct(NULL, b$colnames), + b$distinct(b$rownames, b$colnames) + ) +}) diff --git a/tests/testthat/test_train_predict_polars.R b/tests/testthat/test_train_predict_polars.R new file mode 100644 index 0000000..ec0ff99 --- /dev/null +++ b/tests/testthat/test_train_predict_polars.R @@ -0,0 +1,25 @@ +skip_if_not_installed("polars") + +b = as_polars_backend(iris) +task = mlr3::TaskClassif$new("iris_polars", b, "Species") +learner = mlr3::mlr_learners$get("classif.featureless") + +test_that("single step train + predict", { + expect_learner(learner$train(task, 1:120)) + expect_r6(b, "DataBackendPolars") + p = learner$predict(task, 121:150) + expect_prediction(p) + expect_data_table(data.table::as.data.table(p), nrows = 30) + expect_character(learner$errors, len = 0L) +}) + +test_that("resample works", { + rr = mlr3::resample(task, learner, mlr3::rsmp("cv", folds = 3)) + expect_resample_result(rr) +}) + +test_that("predict_newdata", { + learner$train(task, 1:120) + p = learner$predict_newdata(b) + expect_prediction(p) +}) From 968e0e1ba7bc2e5208c535c746baf9276f38f401 Mon Sep 17 00:00:00 2001 From: Lona Date: Wed, 9 Apr 2025 11:40:08 +0200 Subject: [PATCH 02/14] install polars from github --- DESCRIPTION | 2 ++ 1 file changed, 2 insertions(+) diff --git a/DESCRIPTION b/DESCRIPTION index 3071ae6..e9d8626 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -45,6 +45,8 @@ Suggests: polars, testthat (>= 3.0.0), tibble +Remotes: + pola-rs/r-polars Encoding: UTF-8 Config/testthat/edition: 3 Roxygen: list(markdown = TRUE) From 1df00632d6f1e4027973b9003fd84685f67f649c Mon Sep 17 00:00:00 2001 From: Lona Date: Wed, 9 Apr 2025 15:50:44 +0200 Subject: [PATCH 03/14] 
add documentation + examples --- DESCRIPTION | 4 +- NAMESPACE | 6 + R/DataBackendPolars.R | 69 ++++++- R/as_polars_backend.R | 18 +- man/DataBackendPolars.Rd | 263 ++++++++++++++++++++++++ man/as_polars_backend.Rd | 33 +++ man/mlr3db-package.Rd | 5 + tests/testthat/test_as_polars_backend.R | 6 +- tests/testthat/test_polars.R | 18 +- 9 files changed, 391 insertions(+), 31 deletions(-) create mode 100644 man/DataBackendPolars.Rd create mode 100644 man/as_polars_backend.Rd diff --git a/DESCRIPTION b/DESCRIPTION index e9d8626..322baf3 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -10,7 +10,7 @@ Authors@R: comment = c(ORCID = "0000-0001-9754-0393")), person(given = "Lona", family = "Koers", - role = c("ctb"), + role = c("aut"), email = "lona.koers@gmail.com") ) Description: Extends the 'mlr3' package with a backend to @@ -50,4 +50,4 @@ Remotes: Encoding: UTF-8 Config/testthat/edition: 3 Roxygen: list(markdown = TRUE) -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.2 diff --git a/NAMESPACE b/NAMESPACE index ed19f2c..fe7818f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,16 +1,22 @@ # Generated by roxygen2: do not edit by hand +S3method(as_data_backend,RPolarsDataFrame) +S3method(as_data_backend,RPolarsLazyFrame) S3method(as_data_backend,tbl_SQLiteConnection) S3method(as_data_backend,tbl_duckdb_connection) S3method(as_data_backend,tbl_lazy) S3method(as_duckdb_backend,DataBackend) S3method(as_duckdb_backend,character) S3method(as_duckdb_backend,data.frame) +S3method(as_polars_backend,DataBackend) +S3method(as_polars_backend,data.frame) S3method(as_sqlite_backend,DataBackend) S3method(as_sqlite_backend,data.frame) export(DataBackendDplyr) export(DataBackendDuckDB) +export(DataBackendPolars) export(as_duckdb_backend) +export(as_polars_backend) export(as_sqlite_backend) if (getRversion() >= "3.6.0") S3method(dplyr::show_query, DataBackendDplyr) import(checkmate) diff --git a/R/DataBackendPolars.R b/R/DataBackendPolars.R index 6de1ea1..37d5957 100644 --- a/R/DataBackendPolars.R +++ 
b/R/DataBackendPolars.R @@ -5,7 +5,7 @@ #' Can be easily constructed with [as_polars_backend()]. #' [mlr3::Task]s can interface out-of-memory files if the [polars::RPolarsLazyFrame] was imported using a `polars::scan_x` function. #' Streaming, a \CRANpkg{polars} alpha feature, is always enabled, but only used when applicable. -#' Connector is not required but can be useful e.g. for scanning larger than memory files +#' A connector is not required but can be useful e.g. for scanning larger than memory files #' #' @seealso #' \url{https://pola-rs.github.io/r-polars/} @@ -26,6 +26,59 @@ #' #' @importFrom mlr3 DataBackend #' @export +#' @examples +#' if (mlr3misc::require_namespaces("polars", quietly = TRUE)) { +#' # Backend using an in-memory data set +#' data = iris +#' data$Sepal.Length[1:30] = NA +#' data$row_id = 1:150 +#' data = polars::as_polars_lf(data) +#' b = DataBackendPolars$new(data, primary_key = "row_id") +#' +#' # Object supports all accessors of DataBackend +#' print(b) +#' b$nrow +#' b$ncol +#' b$colnames +#' b$data(rows = 100:101, cols = "Species") +#' b$distinct(b$rownames, "Species") +#' +#' # Classification task using this backend +#' task = mlr3::TaskClassif$new(id = "iris_tibble", backend = b, target = "Species") +#' print(task) +#' head(task) +#' +#' # Write a parquet file to scan +#' data$collect()$write_parquet("iris.parquet") +#' data = polars::pl$scan_parquet("iris.parquet") +#' +#' # Backend that re-reads the parquet file if the connection fails +#' b = DataBackendPolars$new(data, "row_id", +#' connector = function() polars::pl$scan_parquet("iris.parquet")) +#' print(b) +#' +#' # Define a backend on a subset of the database: do not use column "Sepal.Width" +#' data = data$select( +#' polars::pl$col(setdiff(colnames(data), "Sepal.Width")) +#' )$filter( +#' polars::pl$col("row_id")$is_in(1:120) # Use only first 120 rows +#' ) +#' +#' # Backend with only scanned data +#' b = DataBackendPolars$new(data, "row_id", strings_as_factors = TRUE) +#' 
print(b) +#' +#' # Query distinct values +#' b$distinct(b$rownames, "Species") +#' +#' # Query number of missing values +#' b$missings(b$rownames, b$colnames) +#' +#' # Cleanup +#' if (file.exists("iris.parquet")) { +#' file.remove("iris.parquet") +#' } +#' } DataBackendPolars = R6Class("DataBackendPolars", inherit = DataBackend, cloneable = FALSE, public = list( #' @template field_levels @@ -83,7 +136,7 @@ DataBackendPolars = R6Class("DataBackendPolars", inherit = DataBackend, cloneabl cols = intersect(cols, self$colnames) data = private$.data - res = data$filter(pl$col(self$primary_key)$is_in(rows))$select(pl$col(union(self$primary_key, cols)))$collect(streaming = TRUE) + res = data$filter(polars::pl$col(self$primary_key)$is_in(rows))$select(polars::pl$col(union(self$primary_key, cols)))$collect(streaming = TRUE) res = as.data.table(res) recode(res[list(rows), cols, nomatch = NULL, on = self$primary_key, with = FALSE], @@ -117,13 +170,13 @@ DataBackendPolars = R6Class("DataBackendPolars", inherit = DataBackend, cloneabl dat = private$.data if (!is.null(rows)) { - dat = dat$filter(pl$col(self$primary_key)$is_in(rows)) + dat = dat$filter(polars::pl$col(self$primary_key)$is_in(rows)) } get_distinct = function(col) { x = as.vector( dat$select( - pl$col(col)$unique() + polars::pl$col(col)$unique() )$collect(streaming = TRUE)$get_column(col) ) @@ -154,11 +207,11 @@ DataBackendPolars = R6Class("DataBackendPolars", inherit = DataBackend, cloneabl } res = private$.data$filter( - pl$col(self$primary_key)$is_in(rows) + polars::pl$col(self$primary_key)$is_in(rows) ) res = res$select( lapply(cols, function(col) { - pl$col(col)$is_null()$sum()$alias(col) + polars::pl$col(col)$is_null()$sum()$alias(col) }) )$collect(streaming = TRUE) @@ -178,7 +231,7 @@ DataBackendPolars = R6Class("DataBackendPolars", inherit = DataBackend, cloneabl as.vector( private$.data$ - select(pl$col(self$primary_key))$ + select(polars::pl$col(self$primary_key))$ collect()$ get_column(self$primary_key) ) 
@@ -195,7 +248,7 @@ DataBackendPolars = R6Class("DataBackendPolars", inherit = DataBackend, cloneabl #' Number of rows (observations). nrow = function() { private$.reconnect() - n = private$.data$select(pl$len())$collect(streaming = TRUE)$item() + n = private$.data$select(polars::pl$len())$collect(streaming = TRUE)$item() as.integer(n) }, diff --git a/R/as_polars_backend.R b/R/as_polars_backend.R index 28e575b..f7a5d95 100644 --- a/R/as_polars_backend.R +++ b/R/as_polars_backend.R @@ -8,27 +8,27 @@ #' * [mlr3::DataBackend]: Creates a new [DataBackendPolars]. #' #' There is no automatic connection to the origin file set. -#' If the data is obtained using streaming, a `connector` can be set manually but is not required. +#' If the data is obtained using scanning and the data is streamed, a `connector` can be set manually but is not required. #' #' @param data (`data.frame()` | [mlr3::DataBackend])\cr #' See description. #' @param streaming (`logical(1)`)\cr -#' Whether the data should be only scanned (recommended for large data sets) or loaded into memory completely with every [DataBackendPolars] operation. +#' Whether the data should be only scanned (recommended for large data sets) and streamed with +#' every [DataBackendPolars] operation or loaded into memory completely. #' #' @param ... (`any`)\cr #' Additional arguments, passed to [DataBackendPolars]. -#' @template param_path #' #' @return [DataBackendPolars] or [Task]. #' @export -as_polars_backend = function(data, path = getOption("mlr3db.polars_dir", ":temp:"), ...) { +as_polars_backend = function(data, streaming = FALSE, ...) { UseMethod("as_polars_backend") } #' @export -as_polars_backend.data.frame = function(data, primary_key = NULL, ...) { - backend = as_data_backend(data, primary_key = primary_key) +as_polars_backend.data.frame = function(data, streaming = FALSE, primary_key = NULL, ...) 
{ + backend = as_data_backend(data, primary_key = primary_key, streaming = streaming) as_polars_backend.DataBackend(backend, ...) } @@ -44,10 +44,10 @@ as_polars_backend.DataBackend = function(data, streaming = FALSE, ...) { primary_key = data$primary_key if(streaming) { - as_polars_df(data$head(Inf))$write_parquet(sprintf("%s.parquet", path)) - data = pl$scan_parquet(sprintf("%s.parquet", path)) + polars::as_polars_df(data$head(Inf))$write_parquet(sprintf("%s.parquet", path)) + data = polars::pl$scan_parquet(sprintf("%s.parquet", path)) } else { - data = as_polars_lf(data$head(Inf)) + data = polars::as_polars_lf(data$head(Inf)) } DataBackendPolars$new(data = data, primary_key = primary_key, ...) diff --git a/man/DataBackendPolars.Rd b/man/DataBackendPolars.Rd new file mode 100644 index 0000000..87ed724 --- /dev/null +++ b/man/DataBackendPolars.Rd @@ -0,0 +1,263 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/DataBackendPolars.R +\name{DataBackendPolars} +\alias{DataBackendPolars} +\title{DataBackend for Polars} +\description{ +A \link[mlr3:DataBackend]{mlr3::DataBackend} using \link[polars:LazyFrame_class]{polars::RPolarsLazyFrame} from package \CRANpkg{polars}. +Can be easily constructed with \code{\link[=as_polars_backend]{as_polars_backend()}}. +\link[mlr3:Task]{mlr3::Task}s can interface out-of-memory files if the \link[polars:LazyFrame_class]{polars::RPolarsLazyFrame} was imported using a \code{polars::scan_x} function. +Streaming, a \CRANpkg{polars} alpha feature, is always enabled, but only used when applicable. +A connector is not required but can be useful e.g. 
for scanning larger than memory files +} +\examples{ +if (mlr3misc::require_namespaces("polars", quietly = TRUE)) { + # Backend using an in-memory data set + data = iris + data$Sepal.Length[1:30] = NA + data$row_id = 1:150 + data = polars::as_polars_lf(data) + b = DataBackendPolars$new(data, primary_key = "row_id") + + # Object supports all accessors of DataBackend + print(b) + b$nrow + b$ncol + b$colnames + b$data(rows = 100:101, cols = "Species") + b$distinct(b$rownames, "Species") + + # Classification task using this backend + task = mlr3::TaskClassif$new(id = "iris_tibble", backend = b, target = "Species") + print(task) + head(task) + + # Write a parquet file to scan + data$collect()$write_parquet("iris.parquet") + data = polars::pl$scan_parquet("iris.parquet") + + # Backend that re-reads the parquet file if the connection fails + b = DataBackendPolars$new(data, "row_id", + connector = function() polars::pl$scan_parquet("iris.parquet")) + print(b) + + # Define a backend on a subset of the database: do not use column "Sepal.Width" + data = data$select( + polars::pl$col(setdiff(colnames(data), "Sepal.Width")) + )$filter( + polars::pl$col("row_id")$is_in(1:120) # Use only first 120 rows + ) + + # Backend with only scanned data + b = DataBackendPolars$new(data, "row_id", strings_as_factors = TRUE) + print(b) + + # Query distinct values + b$distinct(b$rownames, "Species") + + # Query number of missing values + b$missings(b$rownames, b$colnames) + + # Cleanup + if (file.exists("iris.parquet")) { + file.remove("iris.parquet") + } +} +} +\seealso{ +\url{https://pola-rs.github.io/r-polars/} +} +\section{Super class}{ +\code{\link[mlr3:DataBackend]{mlr3::DataBackend}} -> \code{DataBackendPolars} +} +\section{Public fields}{ +\if{html}{\out{
}} +\describe{ +\item{\code{levels}}{(named \code{list()})\cr +List (named with column names) of factor levels as \code{character()}. +Used to auto-convert character columns to factor variables.} + +\item{\code{connector}}{(\verb{function()})\cr +Function which is called to re-connect in case the connection became invalid.} +} +\if{html}{\out{
}} +} +\section{Active bindings}{ +\if{html}{\out{
}} +\describe{ +\item{\code{rownames}}{(\code{integer()})\cr +Returns vector of all distinct row identifiers, i.e. the contents of the primary key column.} + +\item{\code{colnames}}{(\code{character()})\cr +Returns vector of all column names, including the primary key column.} + +\item{\code{nrow}}{(\code{integer(1)})\cr +Number of rows (observations).} + +\item{\code{ncol}}{(\code{integer(1)})\cr +Number of columns (variables), including the primary key column.} +} +\if{html}{\out{
}} +} +\section{Methods}{ +\subsection{Public methods}{ +\itemize{ +\item \href{#method-DataBackendPolars-new}{\code{DataBackendPolars$new()}} +\item \href{#method-DataBackendPolars-data}{\code{DataBackendPolars$data()}} +\item \href{#method-DataBackendPolars-head}{\code{DataBackendPolars$head()}} +\item \href{#method-DataBackendPolars-distinct}{\code{DataBackendPolars$distinct()}} +\item \href{#method-DataBackendPolars-missings}{\code{DataBackendPolars$missings()}} +} +} +\if{html}{\out{ +
Inherited methods + +
+}} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-DataBackendPolars-new}{}}} +\subsection{Method \code{new()}}{ +Creates a backend for a \link[polars:LazyFrame_class]{polars::RPolarsLazyFrame} object. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{DataBackendPolars$new( + data, + primary_key, + strings_as_factors = TRUE, + connector = NULL +)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{data}}{(\link[polars:LazyFrame_class]{polars::RPolarsLazyFrame})\cr +The data object. + +Instead of calling the constructor itself, please call \code{\link[mlr3:as_data_backend]{mlr3::as_data_backend()}} on +a \link[polars:LazyFrame_class]{polars::RPolarsLazyFrame} or \link[polars:DataFrame_class]{polars::RPolarsDataFrame}. +Note that only \link[polars:LazyFrame_class]{polars::RPolarsLazyFrame}s will be converted to a \link{DataBackendPolars}. +\link[polars:DataFrame_class]{polars::RPolarsDataFrame} objects without lazy execution will be converted to a +\link[mlr3:DataBackendDataTable]{DataBackendDataTable}.} + +\item{\code{primary_key}}{(\code{character(1)})\cr +Name of the primary key column. +Because \code{polars} does not natively support primary keys, uniqueness of the primary key column is expected but not enforced.} + +\item{\code{strings_as_factors}}{(\code{logical(1)} || \code{character()})\cr +Either a character vector of column names to convert to factors, or a single logical flag: +if \code{FALSE}, no column will be converted, if \code{TRUE} all string columns (except the primary key). +For conversion, the backend is queried for distinct values of the respective columns +on construction and their levels are stored in \verb{$levels}.} + +\item{\code{connector}}{(\verb{function()})\cr +Optional function which is called to re-connect to e.g. a source file in case the connection became invalid.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-DataBackendPolars-data}{}}} +\subsection{Method \code{data()}}{ +Returns a slice of the data. + +The rows must be addressed as vector of primary key values, columns must be referred to via column names. +Queries for rows with no matching row id and queries for columns with no matching +column name are silently ignored. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{DataBackendPolars$data(rows, cols)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{rows}}{(\code{integer()})\cr +Row indices.} + +\item{\code{cols}}{(\code{character()})\cr +Column names.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-DataBackendPolars-head}{}}} +\subsection{Method \code{head()}}{ +Retrieve the first \code{n} rows. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{DataBackendPolars$head(n = 6L)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{n}}{(\code{integer(1)})\cr +Number of rows.} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +\code{\link[data.table:data.table]{data.table::data.table()}} of the first \code{n} rows. +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-DataBackendPolars-distinct}{}}} +\subsection{Method \code{distinct()}}{ +Returns a named list of vectors of distinct values for each column +specified. If \code{na_rm} is \code{TRUE}, missing values are removed from the +returned vectors of distinct values. Non-existing rows and columns are +silently ignored. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{DataBackendPolars$distinct(rows, cols, na_rm = TRUE)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{rows}}{(\code{integer()})\cr +Row indices.} + +\item{\code{cols}}{(\code{character()})\cr +Column names.} + +\item{\code{na_rm}}{(\code{logical(1)})\cr +Whether to remove NAs or not.} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Named \code{list()} of distinct values. +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-DataBackendPolars-missings}{}}} +\subsection{Method \code{missings()}}{ +Returns the number of missing values per column in the specified slice +of data. Non-existing rows and columns are silently ignored. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{DataBackendPolars$missings(rows, cols)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{rows}}{(\code{integer()})\cr +Row indices.} + +\item{\code{cols}}{(\code{character()})\cr +Column names.} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Total of missing values per column (named \code{numeric()}). +} +} +} diff --git a/man/as_polars_backend.Rd b/man/as_polars_backend.Rd new file mode 100644 index 0000000..ce9409b --- /dev/null +++ b/man/as_polars_backend.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/as_polars_backend.R +\name{as_polars_backend} +\alias{as_polars_backend} +\title{Convert to Polars Backend} +\usage{ +as_polars_backend(data, streaming = FALSE, ...) +} +\arguments{ +\item{data}{(\code{data.frame()} | \link[mlr3:DataBackend]{mlr3::DataBackend})\cr +See description.} + +\item{streaming}{(\code{logical(1)})\cr +Whether the data should be only scanned (recommended for large data sets) and streamed with +every \link{DataBackendPolars} operation or loaded into memory completely.} + +\item{...}{(\code{any})\cr +Additional arguments, passed to \link{DataBackendPolars}.} +} +\value{ +\link{DataBackendPolars} or \link{Task}. +} +\description{ +Converts to a \link{DataBackendPolars} using the \CRANpkg{polars} database, depending on the input type: +\itemize{ +\item \code{data.frame}: Creates a new \link{DataBackendDataTable} first using \code{\link[=as_data_backend]{as_data_backend()}}, then proceeds +with the conversion from \link{DataBackendDataTable} to \link{DataBackendPolars}. +\item \link[mlr3:DataBackend]{mlr3::DataBackend}: Creates a new \link{DataBackendPolars}. +} + +There is no automatic connection to the origin file set. +If the data is obtained using scanning and the data is streamed, a \code{connector} can be set manually but is not required. 
+} diff --git a/man/mlr3db-package.Rd b/man/mlr3db-package.Rd index 4bac0d7..94768c5 100644 --- a/man/mlr3db-package.Rd +++ b/man/mlr3db-package.Rd @@ -30,4 +30,9 @@ Useful links: \author{ \strong{Maintainer}: Michel Lang \email{michellang@gmail.com} (\href{https://orcid.org/0000-0001-9754-0393}{ORCID}) +Authors: +\itemize{ + \item Lona Koers \email{lona.koers@gmail.com} +} + } diff --git a/tests/testthat/test_as_polars_backend.R b/tests/testthat/test_as_polars_backend.R index d18504b..501bbd0 100644 --- a/tests/testthat/test_as_polars_backend.R +++ b/tests/testthat/test_as_polars_backend.R @@ -1,7 +1,7 @@ skip_if_not_installed("polars") test_that("data.frame", { - b = as_polars_backend(iris, path = tempfile()) + b = as_polars_backend(iris) expect_r6(b, "DataBackendPolars") expect_backend(b) expect_iris_backend(b) @@ -9,13 +9,13 @@ test_that("data.frame", { test_that("DataBackend", { # without streaming - b = as_polars_backend(mlr3::mlr_tasks$get("iris")$backend, path = tempfile()) + b = as_polars_backend(mlr3::mlr_tasks$get("iris")$backend) expect_r6(b, "DataBackendPolars") expect_iris_backend(b) expect_backend(b) # with streaming - b = as_polars_backend(mlr3::mlr_tasks$get("iris")$backend, path = tempfile(), streaming = TRUE) + b = as_polars_backend(mlr3::mlr_tasks$get("iris")$backend, streaming = TRUE) expect_r6(b, "DataBackendPolars") expect_iris_backend(b) expect_backend(b) diff --git a/tests/testthat/test_polars.R b/tests/testthat/test_polars.R index 8cde99f..7d1b0c7 100644 --- a/tests/testthat/test_polars.R +++ b/tests/testthat/test_polars.R @@ -3,7 +3,7 @@ skip_if_not_installed("polars") test_that("valid DataBackend (polars DataFrame)", { data = iris data$Petal.Length[91:120] = NA - data = as_polars_df(data) + data = polars::as_polars_df(data) b = as_data_backend(data) expect_backend(b) expect_iris_backend(b, n_missing = 30L) @@ -12,21 +12,21 @@ test_that("valid DataBackend (polars DataFrame)", { test_that("valid DataBackend (polars LazyFrame)", { data = 
iris data$Petal.Length[91:120] = NA - data = as_polars_lf(data)$with_row_index("row_id", offset = 1L) + data = polars::as_polars_lf(data)$with_row_index("row_id", offset = 1L) b = DataBackendPolars$new(data, "row_id", strings_as_factors = TRUE) expect_backend(b) expect_iris_backend(b, n_missing = 30L) }) test_that("valid DataBackend with scanning", { - as_polars_df(iris)$with_row_index("row_id", offset = 1L)$write_parquet("iris.parquet") + polars::as_polars_df(iris)$with_row_index("row_id", offset = 1L)$write_parquet("iris.parquet") on.exit({ if (file.exists("iris.parquet")) { file.remove("iris.parquet") } }, add = TRUE) - data = pl$scan_parquet("iris.parquet") + data = polars::pl$scan_parquet("iris.parquet") # valid scanning b = DataBackendPolars$new(data, "row_id", strings_as_factors = TRUE) @@ -35,7 +35,7 @@ test_that("valid DataBackend with scanning", { # valid with connector b = DataBackendPolars$new(data, "row_id", strings_as_factors = TRUE, - connector = function() pl$scan_parquet("iris.parquet")) + connector = function() polars::pl$scan_parquet("iris.parquet")) expect_backend(b) expect_equal(b$nrow, nrow(iris)) }) @@ -43,7 +43,7 @@ test_that("valid DataBackend with scanning", { test_that("strings_as_factors", { data = iris data$Species = as.character(data$Species) - data = as_polars_lf(data)$with_row_index("row_id", offset = 1L) + data = polars::as_polars_lf(data)$with_row_index("row_id", offset = 1L) b_str = DataBackendPolars$new(data = data, "row_id", strings_as_factors = FALSE) expect_character(b_str$head()$Species, any.missing = FALSE) @@ -63,17 +63,17 @@ test_that("strings_as_factors", { test_that("as_data_backend", { data = iris - pl_df = as_polars_df(data)$with_row_index("row_id", offset = 1L) + pl_df = polars::as_polars_df(data)$with_row_index("row_id", offset = 1L) b = as_data_backend(pl_df, primary_key = "row_id") expect_r6(b, "DataBackendDataTable") - pl_lf = as_polars_lf(data)$with_row_index("row_id", offset = 1L) + pl_lf = 
polars::as_polars_lf(data)$with_row_index("row_id", offset = 1L) b = as_data_backend(pl_lf, primary_key = "row_id") expect_r6(b, "DataBackendPolars") }) test_that("distinct with NULL rows", { - data = as_polars_df(iris) + data = polars::as_polars_df(iris) b = as_data_backend(data) expect_equal( From a42b0b2107ea3197afa9236a6f9ab1bee981c8c3 Mon Sep 17 00:00:00 2001 From: Lona Date: Thu, 10 Apr 2025 10:55:32 +0200 Subject: [PATCH 04/14] . --- DESCRIPTION | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 322baf3..3269914 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -45,8 +45,7 @@ Suggests: polars, testthat (>= 3.0.0), tibble -Remotes: - pola-rs/r-polars +extra-repositories: https://community.r-multiverse.org Encoding: UTF-8 Config/testthat/edition: 3 Roxygen: list(markdown = TRUE) From 3c84a0267d75781fe4657eed093a83999777a3ce Mon Sep 17 00:00:00 2001 From: Lona Date: Thu, 10 Apr 2025 11:00:15 +0200 Subject: [PATCH 05/14] . --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 3269914..e8fe1b5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -45,7 +45,7 @@ Suggests: polars, testthat (>= 3.0.0), tibble -extra-repositories: https://community.r-multiverse.org +Additional_repositories: https://community.r-multiverse.org Encoding: UTF-8 Config/testthat/edition: 3 Roxygen: list(markdown = TRUE) From 0c17dfac7f7091e3e8228eb81516217229625536 Mon Sep 17 00:00:00 2001 From: Lona Date: Thu, 10 Apr 2025 11:06:23 +0200 Subject: [PATCH 06/14] . 
--- DESCRIPTION | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index e8fe1b5..8b12e80 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -43,9 +43,11 @@ Suggests: future.callr, lgr, polars, + remotes, testthat (>= 3.0.0), tibble -Additional_repositories: https://community.r-multiverse.org +Remotes: + pola-rs/r-polars Encoding: UTF-8 Config/testthat/edition: 3 Roxygen: list(markdown = TRUE) From 819d768a922555ae3a85390ff96a9bf4f3e543e1 Mon Sep 17 00:00:00 2001 From: mb706 Date: Thu, 10 Apr 2025 09:23:36 +0000 Subject: [PATCH 07/14] try this out --- DESCRIPTION | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 8b12e80..3e315c5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -46,8 +46,7 @@ Suggests: remotes, testthat (>= 3.0.0), tibble -Remotes: - pola-rs/r-polars +Additional_repositories: https://community.r-multiverse.org Encoding: UTF-8 Config/testthat/edition: 3 Roxygen: list(markdown = TRUE) From f7df60fee0e3f0f48dea37e40c162d75440a0121 Mon Sep 17 00:00:00 2001 From: mb706 Date: Thu, 10 Apr 2025 09:33:30 +0000 Subject: [PATCH 08/14] see if this works --- .github/workflows/r-cmd-check.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/r-cmd-check.yml b/.github/workflows/r-cmd-check.yml index d5e5559..f45ca57 100644 --- a/.github/workflows/r-cmd-check.yml +++ b/.github/workflows/r-cmd-check.yml @@ -33,6 +33,7 @@ jobs: - uses: r-lib/actions/setup-r@v2 with: r-version: ${{ matrix.config.r }} + extra-repositories: 'https://community.r-multiverse.org' - uses: r-lib/actions/setup-r-dependencies@v2 with: From 4b5bfc78611bfcb1367ad2e3ce3574219b31d69a Mon Sep 17 00:00:00 2001 From: mb706 Date: Thu, 10 Apr 2025 09:36:03 +0000 Subject: [PATCH 09/14] maybe not needed --- DESCRIPTION | 2 -- 1 file changed, 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 3e315c5..ffb1d07 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -43,10 +43,8 @@ Suggests: future.callr, 
lgr, polars, - remotes, testthat (>= 3.0.0), tibble -Additional_repositories: https://community.r-multiverse.org Encoding: UTF-8 Config/testthat/edition: 3 Roxygen: list(markdown = TRUE) From 78a9764386a460a88db8c138a5bff3b4f2c5c7cb Mon Sep 17 00:00:00 2001 From: mb706 Date: Thu, 10 Apr 2025 09:37:20 +0000 Subject: [PATCH 10/14] seems to work --- .github/workflows/pkgdown.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/pkgdown.yml b/.github/workflows/pkgdown.yml index 3dc3194..3727205 100644 --- a/.github/workflows/pkgdown.yml +++ b/.github/workflows/pkgdown.yml @@ -28,6 +28,8 @@ jobs: - uses: r-lib/actions/setup-pandoc@v2 - uses: r-lib/actions/setup-r@v2 + with: + extra-repositories: 'https://community.r-multiverse.org' - uses: r-lib/actions/setup-r-dependencies@v2 with: From b09a2de4121a7a076e575417c44b40c2f7476461 Mon Sep 17 00:00:00 2001 From: be-marc Date: Tue, 3 Jun 2025 08:58:10 +0200 Subject: [PATCH 11/14] ... --- man/DataBackendPolars.Rd | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/man/DataBackendPolars.Rd b/man/DataBackendPolars.Rd index 87ed724..f2e5a16 100644 --- a/man/DataBackendPolars.Rd +++ b/man/DataBackendPolars.Rd @@ -4,9 +4,9 @@ \alias{DataBackendPolars} \title{DataBackend for Polars} \description{ -A \link[mlr3:DataBackend]{mlr3::DataBackend} using \link[polars:LazyFrame_class]{polars::RPolarsLazyFrame} from package \CRANpkg{polars}. +A \link[mlr3:DataBackend]{mlr3::DataBackend} using \link[polars:RPolarsLazyFrame]{polars::RPolarsLazyFrame} from package \CRANpkg{polars}. Can be easily constructed with \code{\link[=as_polars_backend]{as_polars_backend()}}. -\link[mlr3:Task]{mlr3::Task}s can interface out-of-memory files if the \link[polars:LazyFrame_class]{polars::RPolarsLazyFrame} was imported using a \code{polars::scan_x} function. 
+\link[mlr3:Task]{mlr3::Task}s can interface out-of-memory files if the \link[polars:RPolarsLazyFrame]{polars::RPolarsLazyFrame} was imported using a \code{polars::scan_x} function. Streaming, a \CRANpkg{polars} alpha feature, is always enabled, but only used when applicable. A connector is not required but can be useful e.g. for scanning larger than memory files } @@ -121,7 +121,7 @@ Number of columns (variables), including the primary key column.} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-DataBackendPolars-new}{}}} \subsection{Method \code{new()}}{ -Creates a backend for a \link[polars:DataFrame_class]{polars::RPolarsDataFrame} object. +Creates a backend for a \link[polars:RPolarsDataFrame]{polars::RPolarsDataFrame} object. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{DataBackendPolars$new( data, @@ -134,13 +134,13 @@ Creates a backend for a \link[polars:DataFrame_class]{polars::RPolarsDataFrame} \subsection{Arguments}{ \if{html}{\out{
}} \describe{ -\item{\code{data}}{(\link[polars:LazyFrame_class]{polars::RPolarsLazyFrame})\cr +\item{\code{data}}{(\link[polars:RPolarsLazyFrame]{polars::RPolarsLazyFrame})\cr The data object. Instead of calling the constructor itself, please call \code{\link[mlr3:as_data_backend]{mlr3::as_data_backend()}} on -a \link[polars:LazyFrame_class]{polars::RPolarsLazyFrame} or \link[polars:DataFrame_class]{polars::RPolarsDataFrame}. -Note that only \link[polars:LazyFrame_class]{polars::RPolarsLazyFrame}s will be converted to a \link{DataBackendPolars}. -\link[polars:DataFrame_class]{polars::RPolarsDataFrame} objects without lazy execution will be converted to a +a \link[polars:RPolarsLazyFrame]{polars::RPolarsLazyFrame} or \link[polars:RPolarsDataFrame]{polars::RPolarsDataFrame}. +Note that only \link[polars:RPolarsLazyFrame]{polars::RPolarsLazyFrame}s will be converted to a \link{DataBackendPolars}. +\link[polars:RPolarsDataFrame]{polars::RPolarsDataFrame} objects without lazy execution will be converted to a \link[mlr3:DataBackendDataTable]{DataBackendDataTable}.} \item{\code{primary_key}}{(\code{character(1)})\cr From 187f6a3bac11abef443489d18bb4637243e0655b Mon Sep 17 00:00:00 2001 From: be-marc Date: Tue, 3 Jun 2025 08:59:43 +0200 Subject: [PATCH 12/14] ... 
--- R/DataBackendPolars.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/DataBackendPolars.R b/R/DataBackendPolars.R index 37d5957..8732824 100644 --- a/R/DataBackendPolars.R +++ b/R/DataBackendPolars.R @@ -44,7 +44,7 @@ #' b$distinct(b$rownames, "Species") #' #' # Classification task using this backend -#' task = mlr3::TaskClassif$new(id = "iris_tibble", backend = b, target = "Species") +#' task = mlr3::TaskClassif$new(id = "iris_polars", backend = b, target = "Species") #' print(task) #' head(task) #' @@ -54,7 +54,7 @@ #' #' # Backend that re-reads the parquet file if the connection fails #' b = DataBackendPolars$new(data, "row_id", -#' connector = function() polars::pl$scan_parquet("iris.parquet")) +#' connector = function() polars::pl$scan_parquet("iris.parquet")) #' print(b) #' #' # Define a backend on a subset of the database: do not use column "Sepal.Width" From 7c55ba5d6824030066684f65d4ca994ca03835b6 Mon Sep 17 00:00:00 2001 From: be-marc Date: Tue, 3 Jun 2025 08:59:52 +0200 Subject: [PATCH 13/14] ... 
--- man/DataBackendPolars.Rd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/man/DataBackendPolars.Rd b/man/DataBackendPolars.Rd index f2e5a16..855f1ac 100644 --- a/man/DataBackendPolars.Rd +++ b/man/DataBackendPolars.Rd @@ -28,7 +28,7 @@ if (mlr3misc::require_namespaces("polars", quietly = TRUE)) { b$distinct(b$rownames, "Species") # Classification task using this backend - task = mlr3::TaskClassif$new(id = "iris_tibble", backend = b, target = "Species") + task = mlr3::TaskClassif$new(id = "iris_polars", backend = b, target = "Species") print(task) head(task) @@ -38,7 +38,7 @@ if (mlr3misc::require_namespaces("polars", quietly = TRUE)) { # Backend that re-reads the parquet file if the connection fails b = DataBackendPolars$new(data, "row_id", - connector = function() polars::pl$scan_parquet("iris.parquet")) + connector = function() polars::pl$scan_parquet("iris.parquet")) print(b) # Define a backend on a subset of the database: do not use column "Sepal.Width" From 5cd286c3079d36155cd85c72a4c017e6c507e500 Mon Sep 17 00:00:00 2001 From: be-marc Date: Tue, 3 Jun 2025 09:00:28 +0200 Subject: [PATCH 14/14] ... --- R/DataBackendPolars.R | 4 ++-- man/DataBackendPolars.Rd | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R/DataBackendPolars.R b/R/DataBackendPolars.R index 8732824..b34ddd9 100644 --- a/R/DataBackendPolars.R +++ b/R/DataBackendPolars.R @@ -1,9 +1,9 @@ #' @title DataBackend for Polars #' #' @description -#' A [mlr3::DataBackend] using [polars::RPolarsLazyFrame] from package \CRANpkg{polars}. +#' A [mlr3::DataBackend] using `RPolarsLazyFrame` from package \CRANpkg{polars}. #' Can be easily constructed with [as_polars_backend()]. -#' [mlr3::Task]s can interface out-of-memory files if the [polars::RPolarsLazyFrame] was imported using a `polars::scan_x` function. +#' [mlr3::Task]s can interface out-of-memory files if the `polars::RPolarsLazyFrame` was imported using a `polars::scan_x` function. 
#' Streaming, a \CRANpkg{polars} alpha feature, is always enabled, but only used when applicable. #' A connector is not required but can be useful e.g. for scanning larger than memory files #' diff --git a/man/DataBackendPolars.Rd b/man/DataBackendPolars.Rd index 855f1ac..c11a28a 100644 --- a/man/DataBackendPolars.Rd +++ b/man/DataBackendPolars.Rd @@ -4,9 +4,9 @@ \alias{DataBackendPolars} \title{DataBackend for Polars} \description{ -A \link[mlr3:DataBackend]{mlr3::DataBackend} using \link[polars:RPolarsLazyFrame]{polars::RPolarsLazyFrame} from package \CRANpkg{polars}. +A \link[mlr3:DataBackend]{mlr3::DataBackend} using \code{RPolarsLazyFrame} from package \CRANpkg{polars}. Can be easily constructed with \code{\link[=as_polars_backend]{as_polars_backend()}}. -\link[mlr3:Task]{mlr3::Task}s can interface out-of-memory files if the \link[polars:RPolarsLazyFrame]{polars::RPolarsLazyFrame} was imported using a \code{polars::scan_x} function. +\link[mlr3:Task]{mlr3::Task}s can interface out-of-memory files if the \code{polars::RPolarsLazyFrame} was imported using a \code{polars::scan_x} function. Streaming, a \CRANpkg{polars} alpha feature, is always enabled, but only used when applicable. A connector is not required but can be useful e.g. for scanning larger than memory files }