scAgeClock: a single-cell transcriptome based human aging clock model using gated multi-head attention neural networks
scAgeClock --model_file ${model_file} --testing_h5ad_files_dir ${h5ad_folder} --output_file ${out_file}
from scageclock.evaluation import prediction
model_file="scAgeClock_GMA_model_state_dict.pth" ## pre-trained scAgeClock GMA model provided by scAgeClock
h5ad_folder="/path/to/h5adfiles/" ## scAgeClock formatted .h5ad files
results_df = prediction(model_file=model_file,
h5ad_dir=h5ad_folder)
% conda create -n scAgeClock
% conda activate scAgeClock
% conda install python=3.12
% pip install scageclock
% conda create -n scAgeClock
% conda activate scAgeClock
% conda install python=3.12
% pip install scageclock-0.1.3.tar.gz # download the latest release
% scAgeClock --help
usage: scAgeClock [-h] [--model_file MODEL_FILE]
[--testing_h5ad_files_dir TESTING_H5AD_FILES_DIR]
[--output_file OUTPUT_FILE]
scAgeClock CLI tools
options:
-h, --help show this help message and exit
--model_file MODEL_FILE
model file (eg: .pth file generated by scAgeClock GMA)
--testing_h5ad_files_dir TESTING_H5AD_FILES_DIR
directory path to the .h5ad files used for model
prediction (format should be matched with scAgeClock
requirements)
--output_file OUTPUT_FILE
output file with predicted results
#check the python imports
from scageclock.scAgeClock import training_pipeline
from scageclock.evaluation import prediction
- feature file: data/metadata/h5ad_var.tsv
- categorical index: data/metadata/categorical_features_index (assay, sex, tissue_general, and cell_type)
- h5ad example file: data/pytest_data/k_fold_mode/train_val/Fold1/Pytest_Fold1_200K_chunk27.h5ad (500 cells sampled)
- shape of anndata from h5ad file: N x 19238, where N is the number of cells
## 19238 features, including 4 categorical features (the first four columns, in the order of assay, cell_type, tissue_general, and sex) and 19234 selected protein coding genes
AnnData object with n_obs × n_vars = 500 × 19238
obs: 'soma_joinid', 'age'
var: 'feature_id', 'feature_name'
Click to check the data formatting example code
import scanpy as sc
import pandas as pd
import numpy as np
from scageclock.formatting import format_anndata_multiple
raw_h5ad_file = "/your/raw/inputfile/example.h5ad"
raw_adata_all = sc.read_h5ad(raw_h5ad_file,backed='r')
meta_df = pd.read_parquet("example_meta.parquet") ## metadata for example.h5ad
filtered_meta_df = meta_df ## apply any cell filtering here; this example keeps all cells
split_dfs = np.array_split(filtered_meta_df, 10) ## split the cells into 10 chunks (to reduce memory usage while formatting)
###load the matching table for the categorical features and update the .obs dataframe of the original anndata
meta_df = raw_adata_all.obs
cat_index_dict = {}
# matching table needs to be created based on your input anndata's .obs dataframe
# Example matching table files can be found in ./scageclock/data/example/data_formatting/obs_columns_matching_examples
for cat in ["assay","cell_type","tissue","sex"]:
    df = pd.read_excel(f"../{cat}_matching_table.xlsx")
    cat_index_dict[cat] = df
names_dict = {"platform":"assay",
"cellType1":"cell_type",
"tissue":"tissue",
"sex":"sex"}
for original_colname in names_dict.keys():
    model_colname = names_dict[original_colname]
    cat_df = pd.DataFrame({"raw_id": meta_df[original_colname]})
    cat_df_with_index = pd.merge(cat_df,
                                 cat_index_dict[model_colname],
                                 left_on="raw_id",
                                 right_on="original_cat_name",
                                 how="left")
    meta_df[f"{model_colname}_index"] = list(cat_df_with_index["model_cat_index"])
## update original obs dataframe with scAgeClock index added
raw_adata_all.obs = meta_df
### loading the model's feature file
model_feature_df = pd.read_csv("./scageclock/data/metadata/h5ad_var.tsv",sep="\t")
model_genes = list(model_feature_df["h5ad_var"])[4:] #get the model's gene features
### reformat each chunk
chunk_id = 0
for chunk_df in split_dfs:
    chunk_id += 1
    adata_chunk = raw_adata_all[list(chunk_df.index)].to_memory()
    print(adata_chunk.obs_names[0])
    adata_formatted = format_anndata_multiple(adata_raw=adata_chunk,
                                              model_genes=model_genes,
                                              normalize=True,
                                              cat_cols=["assay_index", "cell_type_index", "tissue_index", "sex_index"])
    print(chunk_id)
    adata_formatted.write_h5ad(f"chunk{chunk_id}.h5ad")
- example data can be found at "data/pytest_data" of this repository
- example GMA model file can be found at "data/trained_models" of this repository
- $${\color{red}GMA\space(Gated \space Multi-head \space Attention \space Neural \space Networks, default \space and \space recommended)}$$
- MLP (Multi-layer Perceptron)
- linear (Elastic Net based Linear regression model)
- xgboost
- catboost
from scageclock.evaluation import prediction
model_file="./data/trained_models/scAgeClock_GMA_model_state_dict.pth"
h5ad_folder="./data/pytest_data/train_val_test_mode/test/"
results_df = prediction(model_file=model_file,
h5ad_dir=h5ad_folder)
#!/bin/bash
model_file="./data/trained_models/scAgeClock_GMA_model_state_dict.pth"
h5ad_folder="./data/pytest_data/train_val_test_mode/test/"
scAgeClock --model_file ${model_file} --testing_h5ad_files_dir ${h5ad_folder} --output_file './tmp/test_predicted.xlsx'
from scageclock.scAgeClock import load_GMA_model, get_feature_importance
model_file = "./data/trained_models/scAgeClock_GMA_model_state_dict.pth"
gma_model = load_GMA_model(model_file)
feature_file = "data/metadata/h5ad_var.tsv"
feature_importance = get_feature_importance(gma_model,feature_file=feature_file)
#sort by feature importance score
feature_importance = feature_importance.sort_values(by="feature_importance",ascending=False)
from scageclock.scAgeClock import training_pipeline
model_name = "GMA" # Gated Multihead Attention Neural Network, default model of scAgeClock
ad_dir_root = "data/pytest_data/train_val_test_mode/"
meta_file = "data/pytest_data/pytest_dataset_metadata.parquet"
dataset_folder_dict = {"training": "train", "validation": "val", "testing": "test"}
predict_dataset = "testing"
loader_method = "scageclock"
out_root_dir = "./tmp/"
results = training_pipeline(model_name=model_name,
ad_dir_root=ad_dir_root,
meta_file_path=meta_file,
dataset_folder_dict=dataset_folder_dict,
predict_dataset=predict_dataset,
validation_during_training=True,
loader_method=loader_method,
out_root_dir=out_root_dir)
from scageclock.scAgeClock import training_pipeline
model_name = "GMA" # Gated Multihead Attention Neural Network, default model of scAgeClock
k_fold_data_dir="data/pytest_data/k_fold_mode/" # h5ad files are located at train_val/Fold1; train_val/Fold2; train_val/Fold3
meta_file = "data/pytest_data/pytest_dataset_metadata.parquet"
dataset_folder_dict = {"training_validation": "train_val"}
predict_dataset = "validation" ## prediction based on the validation dataset
loader_method = "scageclock"
out_root_dir = "./tmp/"
results = training_pipeline(model_name=model_name,
ad_dir_root=k_fold_data_dir,
meta_file_path=meta_file,
dataset_folder_dict=dataset_folder_dict,
predict_dataset=predict_dataset,
K_fold_mode=True,
K_fold_train=("Fold1", "Fold2"),
K_fold_val="Fold3",
validation_during_training=False,
loader_method=loader_method,
out_root_dir=out_root_dir)
from scageclock.scAgeClock import training_pipeline
model_name = "catboost" # CatBoost model, one of the supported alternatives to the default GMA model
ad_dir_root = "data/pytest_data/train_val_test_mode/"
meta_file = "data/pytest_data/pytest_dataset_metadata.parquet"
dataset_folder_dict = {"training": "train", "validation": "val", "testing": "test"}
predict_dataset = "testing"
loader_method = "scageclock"
out_root_dir = "./tmp/"
results = training_pipeline(model_name=model_name,
ad_dir_root=ad_dir_root,
meta_file_path=meta_file,
dataset_folder_dict=dataset_folder_dict,
predict_dataset=predict_dataset,
validation_during_training=True,
loader_method=loader_method,
train_dataset_fully_loaded=True, ## loads the whole training dataset; make sure there is enough memory
out_root_dir=out_root_dir)
- Author: Gangcai Xie (Medical School of Nantong University);
- ORCID