#' @importClassesFrom Matrix Matrix #' @importClassesFrom DelayedArray DelayedMatrix setClassUnion("Matrix_OR_DelayedMatrix", c("matrix", "Matrix", "DelayedMatrix")) #' @importClassesFrom S4Vectors DataFrame setClassUnion("data.frame_OR_DataFrame", c("data.frame", "DataFrame")) #' Save an experiment to an .h5ad file. #' #' Note that NA values are not supported by HDF5, and therefore by h5ad. The behavior of this #' function if NAs are present is undefined. #' #' @param object The object to save. #' @param file Name of the file to save to. #' @param overwrite Currently unused. #' #' @returns NULL, invisibly #' #' @examples #' data(miniACC, package="MultiAssayExperiment") #' writeH5AD(miniACC[[1]], "miniacc.h5ad") #' #' @importFrom rhdf5 H5Iget_type H5Gcreate H5Gclose #' @importFrom SummarizedExperiment colData assays #' @importFrom S4Vectors metadata #' @importFrom methods hasMethod as #' @importFrom SingleCellExperiment altExps rowData colData colPairNames colPair rowPairNames rowPair reducedDims #' #' @export writeH5AD <- function(object, file, overwrite) { need_finalize <- FALSE written_object <- FALSE if (is.character(file)) { file <- open_h5(file) need_finalize <- TRUE } else if (!(H5Iget_type(file) %in% c("H5I_FILE", "H5I_GROUP"))) stop("file must be a character, file or group") cls <- class(object) if (is(object, "RangedSummarizedExperiment") && !is(object, "SingleCellExperiment")) { warning("Ranged data is currently unsupported. Coercing to SummarizedExperiment...") object <- as(object, "SummarizedExperiment") } if (is(object, "SingleCellExperiment")) { write_data_frame(file, "var", rowData(object)) obsm <- reducedDims(object) if (length(obsm) > 0) { obsmgrp <- H5Gcreate(file, "obsm") mapply(function(name, data) { if (is(data, "data.frame_OR_DataFrame")) { rownames(data) <- rownames(colData(object)) write_data_frame(obsmgrp, name, data) } else { if (length(dim(data)) == 1) data <- as.vector(data) else data <- data write_matrix(obsmgrp, name, data) } }, names(obsm), obsm) H5Gclose(obsmgrp) } lapply(list(list(name="obsp", names=colPairNames, getter=colPair), list(name="varp", names=rowPairNames, getter=rowPair)), function(cp) { names <- cp$names(object) if (length(names) > 0) { pairgrp <- H5Gcreate(file, cp$name) lapply(names, function(cname) { write_matrix(pairgrp, cname, cp$getter(object, cname, asSparse=TRUE)) }) H5Gclose(pairgrp) } }) object <- as(object, "SummarizedExperiment") written_object <- TRUE } if (is(object, "SummarizedExperiment")) { write_data_frame(file, "obs", colData(object)) rdata <- rowData(object) if (ncol(rdata) > 0 || !is.null(rownames(rdata))) write_data_frame(file, "var", rdata) assays <- assays(object) nassays <- length(assays) write_matrix(file, "X", assays[[1]], needTranspose=FALSE) if (nassays > 1) { layersgrp <- H5Gcreate(file, "layers") mapply(function(name, mat) { write_matrix(layersgrp, name, mat, needTranspose=FALSE) }, names(assays[2:nassays]), assays[2:nassays]) H5Gclose(layersgrp) } writeList(file, "uns", metadata(object)) if (hasMethod("altExps", class(object))) { naltexps <- length(altExps(object)) if (naltexps > 1) { warning("Alternative experiments are currently unsupported. Construct a MultiAssayExperiment object or write them as individual H5AD files.") } } written_object <- TRUE } if (is(object, "Matrix_OR_DelayedMatrix")) { write_matrix(file, "X", object, needTranspose=FALSE) rownames <- rownames(object) colnames <- colnames(object) if (is.null(rownames)) rownames <- as.character(seq_len(nrow(object))) if (is.null(colnames)) colnames <- as.character(seq_len(ncol(object))) var <- data.frame(row.names=rownames) obs <- data.frame(row.names=colnames) write_data_frame(file, "obs", obs) write_data_frame(file, "var", var) written_object <- TRUE } if (!written_object) { warning("Objects of class ", class(object), " are currently unsupported, skipping...") } else { write_object_class(file, cls[1]) finalize_anndata_internal(file) if (need_finalize) finalize_anndata(file) } invisible(NULL) } #' Save a \code{\linkS4class{MultiAssayExperiment}} to an .h5mu file. #' #' Note that NA values are not supported by HDF5, and therefore by h5mu. The behavior of this #' function if NAs are present is undefined. #' #' @param object A \code{\linkS4class{MultiAssayExperiment}}. #' @param file Name of the file to save to. #' @param overwrite Currently unused. #' #' @returns NULL, invisibly #' #' @examples #' data(miniACC, package="MultiAssayExperiment") #' writeH5MU(miniACC, "miniacc.h5mu") #' #' @importFrom rhdf5 H5Gcreate H5Gclose #' @importFrom MultiAssayExperiment colData experiments sampleMap metadata #' #' @export writeH5MU <- function(object, file, overwrite) { if (!is(object, "MultiAssayExperiment")) stop("Only MultiAssayExperiment objects are currently supported.") if (is.character(file)) { h5 <- open_h5(file) need_finalize <- TRUE } else stop("file must be a character") obs <- as.data.frame(colData(object), stringsAsFactors = FALSE) write_data_frame(h5, "obs", obs) mods <- H5Gcreate(h5, "mod") obsmgrp <- H5Gcreate(h5, "obsm") obsmapgrp <- H5Gcreate(h5, "obsmap") samplemap <- sampleMap(object) globalrownames <- rownames(obs) vars <- mapply(function(mname, mod) { mod_group <- H5Gcreate(mods, mname) writeH5AD(mod, mod_group) H5Gclose(mod_group) cmap <- samplemap[samplemap$assay == mname,] cmaporder <- match(globalrownames, cmap$primary) localorder <- match(cmap$colname, colnames(mod)) obsmap <- vapply(cmaporder, function(o)ifelse(is.na(o), 0L, localorder[o]), 0L) writeDataset(obsmapgrp, mname, obsmap) writeDataset(obsmgrp, mname, as.integer(!is.na(cmaporder))) data.frame(row.names = rownames(mod)) }, names(object), object) writeAttribute(mods, "order", names(object)) H5Gclose(mods) H5Gclose(obsmgrp) H5Gclose(obsmapgrp) var <- do.call(rbind, vars) write_data_frame(h5, "var", var) writeList(h5, "uns", metadata(object)) write_object_class(h5, class(object)[1]) finalize_mudata(h5) invisible(NULL) } #' @importFrom rhdf5 H5Gcreate H5Gclose #' @importFrom methods is as write_matrix <- function(parent, key, mat, needTranspose=TRUE) { if (is(mat, "dgeMatrix")) mat <- as.matrix(mat) if (is.matrix(mat) || is.vector(mat) || is.array(mat) || is.numeric(mat) || is.integer(mat) || is.logical(mat) || is.character(mat)) { # is.vector returns false for vectors with attributes isscalar <- length(mat) == 1 & !is.null(attr(mat, "encoding-scalar")) naidx <- is.na(mat) hasna <- any(naidx) if (needTranspose) { if (is.matrix(mat)) mat <- t(mat) else if (is.array(mat)) mat <- aperm(mat, length(dim(mat)):1) } if (hasna && is.double(mat)) { # FIXME: extend anndata spec to handle double NAs? mat[naidx] <- NaN hasna <- FALSE } else if (hasna && is.character(mat)) { # FIXME: extend anndata spec to handle string NAs? mat[naidx] <- "" hasna <- FALSE } if (isscalar || !hasna) { writeDataset(parent, key, mat, scalar=isscalar) dset <- h5autoclose(parent & key) if (!isscalar) writeAttribute(dset, "encoding-type", ifelse(is.character(mat), "string-array", "array")) else writeAttribute(dset, "encoding-type", ifelse(is.character(mat), "string", "numeric-scalar")) writeAttribute(dset, "encoding-version", "0.2.0") } else { grp <- H5Gcreate(parent, key) if (is.character(mat)) mat[naidx] <- "" else mat[naidx] <- as(0, type(mat)) write_matrix(grp, "values", mat) write_matrix(grp, "mask", naidx) writeAttribute(grp, "encoding-type", ifelse(is.logical(mat), "nullable-boolean", "nullable-integer")) writeAttribute(grp, "encoding-version", "0.1.0") H5Gclose(grp) } } else if (is.factor(mat)) { grp <- H5Gcreate(parent, key) codes <- as.integer(mat) codes[is.na(mat)] <- 0L write_matrix(grp, "codes", codes - 1L) write_matrix(grp, "categories", levels(mat)) writeAttribute(grp, "ordered", is.ordered(mat)) writeAttribute(grp, "encoding-type", "categorical") writeAttribute(grp, "encoding-version", "0.2.0") H5Gclose(grp) } else if (is(mat, "dgCMatrix") || is(mat, "dgRMatrix") || is(mat, "DelayedArray") && DelayedArray::is_sparse(mat)) { if (is(mat, "DelayedArray")) mat <- as(mat, "RsparseMatrix") grp <- H5Gcreate(parent, key) writeDataset(grp, "indptr", mat@p) writeDataset(grp, "data", mat@x) writeAttribute(grp, "shape", if (needTranspose) dim(mat) else rev(dim(mat)), scalar=FALSE) writeAttribute(grp, "encoding-version", "0.1.0") if (is(mat, "dgCMatrix")) { writeDataset(grp, "indices", mat@i) writeAttribute(grp, "encoding-type", ifelse(needTranspose, "csc_matrix", "csr_matrix")) } else { writeDataset(grp, "indices", mat@j) writeAttribute(grp, "encoding-type", ifelse(needTranspose, "csr_matrix", "csc_matrix")) } H5Gclose(grp) } else if (is(mat, "DelayedArray") && requireNamespace("HDF5Array", quietly=TRUE)) { writeArrayToMuData(mat, parent, key) } else { stop("Writing matrices of type ", class(mat), " is not implemented.") } } #' @importFrom rhdf5 H5Gcreate H5Gclose h5createAttribute write_data_frame <- function(parent, key, df) { group <- H5Gcreate(parent, key) columns <- colnames(df) index <- rownames(df) if (is.null(index)) index <- as.character(1:nrow(df)) df[["_index"]] <- index for (col in colnames(df)) { write_matrix(group, col, df[[col]]) } # Write attributes writeAttribute(group, "_index", "_index") writeAttribute(group, "encoding-type", "dataframe") writeAttribute(group, "encoding-version", "0.2.0") if (length(columns) > 0) { writeAttribute(group, "column-order", columns, scalar=FALSE) } else { # When there are no columns, null buffer can't be written to a file. h5createAttribute(group, "column-order", dims=0) } H5Gclose(group) invisible(NULL) } .registeredHDF5ArrayMethods <- new.env() .registeredHDF5ArrayMethods$registered <- FALSE #' @importFrom rhdf5 h5write H5Dclose #' @importFrom DelayedArray write_block start width #' @importFrom methods setClass registerHDF5ArrayMethods <- function() { if (!.registeredHDF5ArrayMethods$registered) { haveHDF5Array <- requireNamespace("HDF5Array", quietly=TRUE) if (!haveHDF5Array) return(FALSE) setClass("MuDataFileRealizationSink", contains="HDF5RealizationSink", slots=c(parent="H5IdComponent", datasetname="character"), where=.registeredHDF5ArrayMethods) setMethod(write_block, "MuDataFileRealizationSink", function(sink, viewport, block) { if (!is.array(block)) block <- as.array(block) h5write(block, sink@parent, sink@datasetname, start=start(viewport), count=width(viewport)) sink }, where=.registeredHDF5ArrayMethods) .registeredHDF5ArrayMethods$registered <- TRUE } .registeredHDF5ArrayMethods$registered } #' @importFrom rhdf5 h5createDataset H5Fget_name H5Iget_name #' @importFrom methods new MuDataFileRealizationSink <- function(dim, type, parent, key, dimnames=NULL, as.sparse=FALSE) { chunkdim <- HDF5Array::getHDF5DumpChunkDim(dim) h5createDataset(parent, key, dim, storage.mode=type, chunk=chunkdim, level=9, shuffle=TRUE) file <- H5Fget_name(parent) path <- paste(H5Iget_name(parent), key, sep="/") new("MuDataFileRealizationSink", dim=dim, dimnames=dimnames, type=type, as_sparse=as.sparse, filepath=file, name=path, chunkdim=chunkdim, parent=parent, datasetname=key) } #' @importFrom DelayedArray is_sparse type BLOCK_write_to_sink writeArrayToMuData <- function(x, parent, key, verbose=NA) { if (!registerHDF5ArrayMethods()) stop("The HDF5Array packages must be installed to save DelayedArrays.") as.sparse <- is_sparse(x) sink_dimnames <- dimnames(x) sink <- MuDataFileRealizationSink(dim(x), type(x), parent, key, sink_dimnames, as.sparse) verbose <- DelayedArray:::normarg_verbose(verbose) sink <- BLOCK_write_to_sink(sink, x, verbose=verbose) invisible(NULL) } # this is a straight port of the h5py guess_chunk function chunk_base <- 16 * 1024 chunk_min <- 8 * 1024 chunk_max <- 1024 * 1024 guess_chunk <- function(shape, storage.mode) { nbytes <- switch(storage.mode, double=8, integer=4, logical=1, character=8) ndims <- length(shape) dset_size <- prod(shape) * nbytes target_size <- chunk_base * (2^log10(dset_size / 1024^2)) if (target_size > chunk_max) target_size <- chunk_max else if (target_size < chunk_min) target_size <- chunk_min idx <- 0 while(TRUE) { chunk_bytes <- prod(shape) * nbytes if ((chunk_bytes < target_size || abs(chunk_bytes - target_size) / target_size < 0.5) && chunk_bytes < chunk_max) break if (prod(shape) == 1) break shape[idx %% ndims + 1] <- ceiling(shape[idx %% ndims + 1] / 2) idx <- idx + 1 } as.integer(shape) } #' @importFrom rhdf5 h5createDataset h5writeDataset writeDataset <- function(parent, key, data, scalar=FALSE) { shape <- dim(data) if (is.null(shape)) shape <- length(data) if (length(shape) == 1 && shape == 1 && scalar) { shape <- chunksize <- NULL level <- 0 } else { chunksize <- guess_chunk(shape, storage.mode(data)) level <- 9 } h5createDataset(parent, key, shape, storage.mode=storage.mode(data), chunk=chunksize, level=level, shuffle=TRUE, encoding="UTF-8") h5writeDataset(data, parent, key, variableLengthString=TRUE, encoding="UTF-8") } #' @importFrom rhdf5 h5writeAttribute writeAttribute <- function(obj, name, value, scalar=TRUE) { if (is.logical(value)) value <- as.integer(value) # rhdf5 hasn't implemented logical attributes yet args <- list(attr=value, h5obj=obj, name=name) if (is.character(value)) args$variableLengthString <- TRUE if (length(value) == 1 && scalar) args$asScalar <- TRUE do.call(h5writeAttribute, args) } #' @importFrom rhdf5 H5Gcreate H5Gclose #' @importFrom methods slotNames slot write_elem <- function(parent, key, data) { if (is(data, "list_OR_List")) writeList(parent, key, data) else if (is(data, "Matrix_OR_DelayedMatrix") || is(data, "vector_OR_Vector")) write_matrix(parent, key, data) else if (is(data, "data.frame_OR_DataFrame")) write_data_frame(parent, key, data) else if (isS4(data)) { grp <- H5Gcreate(parent, key) for (slotnm in slotNames(data)) { write_elem(grp, slotnm, slot(data, slotnm)) } writeAttribute(grp, "encoding-type", "dict") writeAttribute(grp, "encoding-version", "0.1.0") H5Gclose(grp) } else warning(paste("Cannot write object of class", class(data), "skipping...")) } #' @importFrom rhdf5 H5Gcreate H5Gclose writeList <- function(parent, key, data) { if (length(data) > 0) { nms <- names(data) if (is.null(nms)) nms <- as.character(seq_along(data)) grp <- H5Gcreate(parent, key) mapply(function(name, data) { write_elem(grp, name, data) }, nms, data) writeAttribute(grp, "encoding-type", "dict") writeAttribute(grp, "encoding-version", "0.1.0") H5Gclose(grp) } }