Thanks to visit codestin.com
Credit goes to github.com

Skip to content

agnesdeng/miae

Repository files navigation

miae

miae is an R package for multiple imputation through autoencoders built with Torch. It’s currently under development.

1. Installation

You can install the current development version of miae from GitHub with:

# install.packages("devtools")
#devtools::install_github("agnesdeng/miae")
# To load the nhanes3_newborn dataset from the R package
# mixgb
library(mixgb)
# To obtain visualization diagnostics plot
library(vismi)
# Multiple imputation through autoencoder
library(miae)

2. Multiple imputation with denoising autoencoder with dropout

# load the nhanes3_newborn dataset from the R package mixgb
data("nhanes3_newborn")
str(nhanes3_newborn)
#> tibble [2,107 × 16] (S3: tbl_df/tbl/data.frame)
#>  $ HSHSIZER: int [1:2107] 4 3 5 4 4 3 5 3 3 3 ...
#>  $ HSAGEIR : int [1:2107] 2 5 10 10 8 3 10 7 2 7 ...
#>  $ HSSEX   : Factor w/ 2 levels "1","2": 2 1 2 2 1 1 2 2 2 1 ...
#>  $ DMARACER: Factor w/ 3 levels "1","2","3": 1 1 2 1 1 1 2 1 2 2 ...
#>  $ DMAETHNR: Factor w/ 3 levels "1","2","3": 3 1 3 3 3 3 3 3 3 3 ...
#>  $ DMARETHN: Factor w/ 4 levels "1","2","3","4": 1 3 2 1 1 1 2 1 2 2 ...
#>  $ BMPHEAD : num [1:2107] 39.3 45.4 43.9 45.8 44.9 42.2 45.8 NA 40.2 44.5 ...
#>   ..- attr(*, "label")= chr "Head circumference (cm)"
#>  $ BMPRECUM: num [1:2107] 59.5 69.2 69.8 73.8 69 61.7 74.8 NA 64.5 70.2 ...
#>   ..- attr(*, "label")= chr "Recumbent length (cm)"
#>  $ BMPSB1  : num [1:2107] 8.2 13 6 8 8.2 9.4 5.2 NA 7 5.9 ...
#>   ..- attr(*, "label")= chr "First subscapular skinfold (mm)"
#>  $ BMPSB2  : num [1:2107] 8 13 5.6 10 7.8 8.4 5.2 NA 7 5.4 ...
#>   ..- attr(*, "label")= chr "Second subscapular skinfold (mm)"
#>  $ BMPTR1  : num [1:2107] 9 15.6 7 16.4 9.8 9.6 5.8 NA 11 6.8 ...
#>   ..- attr(*, "label")= chr "First triceps skinfold (mm)"
#>  $ BMPTR2  : num [1:2107] 9.4 14 8.2 12 8.8 8.2 6.6 NA 10.9 7.6 ...
#>   ..- attr(*, "label")= chr "Second triceps skinfold (mm)"
#>  $ BMPWT   : num [1:2107] 6.35 9.45 7.15 10.7 9.35 7.15 8.35 NA 7.35 8.65 ...
#>   ..- attr(*, "label")= chr "Weight (kg)"
#>  $ DMPPIR  : num [1:2107] 3.186 1.269 0.416 2.063 1.464 ...
#>   ..- attr(*, "label")= chr "Poverty income ratio"
#>  $ HFF1    : Factor w/ 2 levels "1","2": 2 2 1 1 1 2 2 1 2 1 ...
#>  $ HYD1    : Ord.factor w/ 5 levels "1"<"2"<"3"<"4"<..: 1 3 1 1 1 1 1 1 2 1 ...
colSums(is.na(nhanes3_newborn))
#> HSHSIZER  HSAGEIR    HSSEX DMARACER DMAETHNR DMARETHN  BMPHEAD BMPRECUM 
#>        0        0        0        0        0        0      124      114 
#>   BMPSB1   BMPSB2   BMPTR1   BMPTR2    BMPWT   DMPPIR     HFF1     HYD1 
#>      161      169      124      167      117      192        7        0
#use default setting
midae.default <- midae(data = nhanes3_newborn, m = 5,
                       save.model = TRUE, path = file.path (tempdir ( ), "midaemodel.pt"))
)
# use customized setting
params <- list(input.dropout = 0.1, hidden.dropout = 0.3, optimizer = "adamW",
    learning.rate = 1e-04, encoder.structure = c(128, 64, 32),
    decoder.structure = c(32, 64, 128), scaler = "robust", act = "elu",
    init.weight = "he.normal.elu.dropout")

midae.data <- midae(data = nhanes3_newborn, m = 5, categorical.encoding = "onehot",
    device = "cpu", epochs = 10, batch.size = 32, subsample = 1,
    early.stopping.epochs = 1, dae.params = params, pmm.type = NULL,
    save.model = FALSE, path = file.path(tempdir(), "midaemodel.pt"))
#> [1] "cpu"
#> [1] "Running midae()."
#> Loss at epoch 1: 12.742303
#> Loss at epoch 2: 11.805174
#> Loss at epoch 3: 11.027690
#> Loss at epoch 4: 10.199140
#> Loss at epoch 5: 9.571312
#> Loss at epoch 6: 9.027369
#> Loss at epoch 7: 8.628547
#> Loss at epoch 8: 8.427220
#> Loss at epoch 9: 8.088479
#> Loss at epoch 10: 7.923125
# obtain the fifth imputed dataset
midae.data[[5]]
# show the imputed values for missing entries in the
# variable 'BMPHEAD'
show_var(imputation.list = midae.data, var.name = "BMPHEAD",
    original.data = nhanes3_newborn)
#>            m1       m2       m3       m4       m5
#>   1: 42.06059 42.85000 43.89252 44.15647 41.92642
#>   2: 44.01146 42.87655 42.93002 43.94204 40.74512
#>   3: 41.87237 41.49434 44.63226 42.82038 43.88140
#>   4: 41.73535 43.62703 43.71303 44.79532 43.43610
#>   5: 42.57945 41.74102 45.64261 42.84653 43.69893
#>  ---                                             
#> 120: 44.83619 44.37527 44.66512 43.90899 43.33476
#> 121: 44.29740 43.88688 43.68660 43.83380 42.30541
#> 122: 41.99109 42.14834 43.79788 41.85656 41.97642
#> 123: 46.61103 44.46716 45.43449 44.54855 45.87122
#> 124: 40.73145 44.41811 41.89078 43.24716 44.48600
plot2D(imputation.list = midae.data, var.x = "BMPHEAD", var.y = "BMPRECUM",
    original.data = nhanes3_newborn)

3. Multiple imputation with variational autoencoder

#use default setting
mivae.default <- mivae(data = nhanes3_newborn, m = 5,
                       save.model = TRUE, path = file.path (tempdir ( ), "mivaemodel.pt"))
)
# use customized setting
params <- list(beta = 0.95, optimizer = "adamW", learning.rate = 1e-04,
    encoder.structure = c(128, 64, 32), decoder.structure = c(32,
        64, 128), scaler = "robust", act = "elu", init.weight = "he.normal.elu")

mivae.data <- mivae(data = nhanes3_newborn, m = 5, categorical.encoding = "onehot",
    device = "cpu", epochs = 10, batch.size = 32, subsample = 1,
    vae.params = params, pmm.type = NULL, save.model = FALSE,
    path = file.path(tempdir(), "mivaemodel.pt"))
#> [1] "cpu"
#> [1] "Running mivae()."
#> Loss at epoch 1: 29.801553
#> Loss at epoch 2: 18.865960
#> Loss at epoch 3: 15.507260
#> Loss at epoch 4: 13.625918
#> Loss at epoch 5: 12.683349
#> Loss at epoch 6: 12.053438
#> Loss at epoch 7: 11.616923
#> Loss at epoch 8: 11.170577
#> Loss at epoch 9: 10.975068
#> Loss at epoch 10: 10.719830
plot2D(imputation.list = mivae.data, var.x = "BMPHEAD", var.y = "BMPRECUM",
    original.data = nhanes3_newborn, shape = T)

4. Impute new data using a saved imputation model

set.seed(2023)
n <- nrow(nhanes3_newborn)
idx <- sample(1:n, size = round(0.7 * n), replace = FALSE)
train.data <- nhanes3_newborn[idx, ]
test.data <- nhanes3_newborn[-idx, ]
mivae.obj <- mivae(data = train.data, m = 5, categorical.encoding = "onehot",
    device = "cpu", epochs = 10, batch.size = 32, vae.params = params,
    pmm.type = NULL, save.model = TRUE, path = file.path(tempdir(),
        "mivaemodel.pt"))
#> [1] "cpu"
#> [1] "Running mivae()."
#> Loss at epoch 1: 25.185160
#> Loss at epoch 2: 17.317467
#> Loss at epoch 3: 14.648660
#> Loss at epoch 4: 13.337311
#> Loss at epoch 5: 12.620026
#> Loss at epoch 6: 11.949572
#> Loss at epoch 7: 11.701335
#> Loss at epoch 8: 11.458507
#> Loss at epoch 9: 11.220618
#> Loss at epoch 10: 11.013243
#> [1] "The VAE multiple imputation model is saved in  /var/folders/wz/s9x7fy7d2wsgmcf5kfcv040h0000gn/T//RtmpQDZE9E/mivaemodel.pt"
mivae.newdata <- impute_new(object = mivae.obj, newdata = test.data,
    m = 5)

About

miae: multiple imputation through autoencoders

Resources

License

GPL-3.0, GPL-3.0 licenses found

Licenses found

GPL-3.0
LICENSE
GPL-3.0
LICENSE.md

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published

Languages