SPART, a Semi-automated pipeline for assembling reference sequence of telomere-to-telomere (T2T).
See tutorial for more details.
git clone https://github.com/liushoucheng/SPART.git
cd SPART
conda env create -f SPART.yaml
conda activate spartList of tools assumed loadable or accessible with no path are:
Using snakemake to run the pipeline can be assembled to the chromosome level but may contain gaps that require the rest to be done manually.(Exclude Verkko,Bionano DLS Map,Telomere determination and patch,Centromeric region analysis,Variant calls and Evaluation):
# Replace SPART_PATH with the current working directory
sed -i "s#^ SPART_PATH# ${PWD}#g" conf_ck.yaml
# HiC enzyme
HiC_enzyme=" GATC"
# Replace hic_sca_enzyme with the value stored in the HiC_enzyme variable
sed -i "s#^ hic_sca_enzyme# ${HiC_enzyme}#g" conf_ck.yaml
# Ligation site sequence used for reads trimming. Depends on the fill in strategy. Example: AAGCTAGCTT
HiC_ligation_site=" GATCGATC"
sed -i "s#^ hic_sca_ligation_site# ${HiC_ligation_site}#g" conf_ck.yaml #Replace hic_sca_ligation_site with the value stored in the HiC_ligation_site variable
# This process uses the centos 7.6 operating system, slurm job scheduling system, please modify your SPART/clust.json according to the cluster situation.
# This process requires the use of HiC-Pro, please add it to the environment before running.
snakemake -s SPART.py --cluster-config clust.json --configfile conf_ck.yaml --cluster '{cluster.account}' --jobs $threads --rerun-incomplete --restart-times 1 -np --rulegraph |dot -Tpng > rule.png #Running pipeline with snakemake
# configfile:The config file can be used to define a dictionary of configuration parameters and their values.
# cluster-config:A JSON or YAML file that defines the wildcards used in 'cluster'for specific rules.please see the complete documentation.
HiFi_reads=# file names of HiFi reads
ONT_reads=# file names of Ultra-Long reads
thread=# number of threads
memory=# Specify the upper limit on memory to use
output_prefix=# prefix of output files
mitochondrion=# mitochondrion fasta
chloroplast=# chloroplast fasta
ref=# Sequences of mitochondria and chloroplasts need to be removed
# Fastp :was used to filter adapter sequences, primers and other low quality sequence from raw sequencing reads.
SPART/00_Contig_screen/fastp.sh $HiFi_reads $ONT_reads
# Hifiasm
SPART/00_Contig_screen/hifiasm.sh $HiFi_reads $ONT_reads $output_prefix $thread
# Verkko
SPART/00_Contig_screen/verkko.sh $output_prefix $HiFi_reads $ONT_reads $threads $memory
# Flye
SPART/00_Contig_screen/flye.sh $ONT_reads $output_prefix $threads
# Remove mitochondrion && chloroplast
SPART/00_Contig_screen/rm_mt_cp.sh $mitochondrion $chloroplast $ref $threadsthreads=# Nominal threads per Node, without overloading (non-zero value will override -T -Tp -Te -TJ)
bnx=# Input molecule (.bnx) file, required
ref_cmap=# Reference file (must be .cmap), to compare resulting contigs
prefix=# Location of output files root directory, required, will be created if does not exist; if does exist, will overwrite contents
xml=# Read XML file for parameters
Bio_dir=# Location of executable files (RefAligner and Assembler, required)
cluster_xml=# Run on cluster, read XML file for submission arguments (optional--will not use cluster submission if absent)
ref=# Input NGS FASTA
bio_camp=# Input BioNano CMAP
merge_xml=# Merge configuration file
RefAligner=# RefAligner program
hicpro_data=# input data folder; Must contains a folder per sample with input files
hicpro_config=# configuration file for Hi-C processing
hicpro_outdir=# output folder
enzyme=# restriction enzyme cutting sites
#### Bionano
SPART/01_Contig_scaffolding/Bionano_DLS_map.sh $threads $bnx $ref_cmap $prefix $xml $Bio_dir $cluster_xml $ref $bio_camp $merge_xml $RefAligner
#### Hi-C
# hic-pro
SPART/01_Contig_scaffolding/HiC-Pro.sh $ref $prefix $hicpro_data $hicpro_config $hicpro_outdir
# yahs
SPART/01_Contig_scaffolding/yahs.sh $enzyme $ref $bed/bam/bin $profixquery=# query fasta file (uncompressed or bgzipped)
ref=# target fasta file (uncompressed or bgzipped)
region=# output directory
# Download _submit_telomere.sh to SPART/02_Gap_patching/ first:
# https://github.com/VGP/vgp-assembly/blob/master/pipeline/telomere/_submit_telomere.sh
SPART/02_Gap_patching/wfmash_ragtag.sh $query $ref $regioncd ragtag_output
perl SPART/02_Gap_patching/paf_filter.pl -i ragtag.patch.debug.filtered.paf -minlen 10000000 -iden 0.5Manually editing the ragtag.patch.debug.filtered.paf file.Keep the high-quality contig and preserve the location of the only high confidence match in ragtag.patch.debug.filtered.paf that matches the sequence at both ends of the gap.
perl SPART/02_Gap_patching/renameagp.pl -i ragtag.patch.ctg.agp -i1 ragtag.patch.debug.filtered.paf -start seq00000000 -end seq00000001 -o test.agpTest.agp is merged into ragtag.patch.agp and fasta is generated.
# make joins and fill gaps in target.fa using sequences from query.fa
cd SPART/example
ragtag.py patch -i 0.99 --remove-small -q 10 --debug -u --aligner minimap2 -t 128 --mm2-params "-x asm20 -I1G -t 128" reference1A.fasta query1A.fasta
# filter
cd ragtag_output
perl SPART/02_Gap_patching/paf_filter.pl -i ragtag.patch.debug.filtered.paf -minlen 10000000 -iden 0.5
# Manually editing the ragtag.patch.debug.filtered.paf_fiter.paf file.Keep the high-quality contig and preserve the location of the only high confidence match in ragtag.patch.debug.filtered.paf_fiter.paf that matches the sequence at both ends of the gap.
less ragtag.patch.debug.filtered.paf_fiter.paf
qseq00000000    600453479       27150   3999147 +       seq00000001     3972000 4       3971997 2266668 3972018 60
qseq00000000    600453479       4038251 35116708        +       seq00000002     597339226       17      31075089        17568679        31079144        60
# gain agp
perl SPART/02_Gap_patching/renameagp.pl -i ragtag.patch.ctg.agp -i1 ragtag.patch.debug.filtered.paf_fiter.paf -start seq00000001 -end seq00000002 -o test.agp
less -S ragtag.patch.agp
chr1A_RagTag_MOD_MOD	1	2046621	1	W	seq00000000	1	2046621	+
chr1A_RagTag_MOD_MOD	2046622	2046821	2	N	200	scaffold	yes	align_genus
chr1A_RagTag_MOD_MOD	2046822	6018821	3	W	seq00000001	1	3972000	+
chr1A_RagTag_MOD_MOD	6018822	6019021	4	N	200	scaffold	yes	align_genus
chr1A_RagTag_MOD_MOD	6019022	603358247	5	W	seq00000002	1	597339226	+
# Test.agp is merged into ragtag.patch.agp and fasta is generated.
less -S ragtag.patch.agp
scf00000000	1	2046621	1	W	seq00000000	1	2046621	+
scf00000000	2046622	2046821	2	N	200	scaffold	yes	align_genus
scf00000000	2046822	6018821	3	W	seq00000001	1	3972000	+
scf00000000	6018822	6057905	4	W	qseq00000000	3999151	4038234	+
scf00000000	6057906	603397131	5	W	seq00000002	1	597339226	+
ragtag_agp2fa.py ragtag.patch.agp ragtag.patch.comps.fasta > ragtag.patch.fastaWe used _submit_telomere.sh in ONT reads >100kb.ONT reads with telomere sequence mapping to this locus based on minimap2 alignments were manually identified. The longest was selected as template , all others aligned to it and polished with Medaka:
medaka -v -i ONT_tel_reads.fasta -d longest_ont_tel.fasta -o ont_tel_medaka.fastaTelomere signal in all HiFi reads was identified with the commands:
_submit_telomere.sh hifi_reads.fastaAdditional HiFi reads were recruited from a manual analysis. We looked for trimmed tips that could extend. All reads had telomere signal and were aligned to the medaka consensus and polished with Racon with the commands:
minimap2 -t16 -ax map-pb ont_tel_medaka.fasta hifi_tel.fasta > medaka.sam
racon hifi_tel.fasta medaka.sam ont_tel_medaka.fasta > racon.fastaFinally, the polished result was patched into the assembly with ragtag patch or manually patched.
https://github.com/marbl/CHM13-issues/blob/main/error_detection.md.
workdir=# work directory
FASTA=# target fasta file (uncompressed or bgzipped)
prefix=# prefix of output files
CHIP1_treatment=# Treatment (pull-down) file(s).
CHIP2_treatment=# Treatment (pull-down) file(s).
threads=# number of threads
CHIP1_control=# Control (input) file(s)
CHIP2_control=# Control (input) file(s)
SPART/02_Gap_patching/Centromeric_region_analysis.sh $workdir $FASTA $prefix $CHIP1_treatment $CHIP2_treatment $threads $CHIP1_control $CHIP2_control# Use singularity and docker to download google_deepvariant_latest-gpu.sif and kishwars/pepper_deepvariant:r0.8-gpu respectively and modify the cluster-config and configfile in snakemake
workdir=# work directory
ref=# target fasta file (uncompressed or bgzipped)
threads=# number of threads
SPART/03_Polishing/calsv_snv.sh $workdir $ref $threadsref=# target fasta file (uncompressed or bgzipped)
prefix=# prefix of output files
query=# query fasta file (uncompressed or bgzipped)
threads=# number of threads
partition=# your cluster partition
bac_reads=# bac reads
ref_chr=# target chromosome fasta file (uncompressed or bgzipped)
protein=# target protein fasta file
name=# output file name
gff3# target gff file
#### BUSCO
SPART/04_Evaluation/BUSCO.sh $ref $prefix
#### mapping rates & coverages
SPART/04_Evaluation/mapping_rates_coverages.sh hybrid_bam single_bam ont_bam
#### LTR
SPART/04_Evaluation/ltr.sh $ref $prefix
#### QV
SPART/04_Evaluation/qv.sh $query $ref
#### BACs
SPART/04_Evaluation/bac.sh $bac_reads $ref_chr
### Addition
SPART/04_Evaluation/while.sh $threads $partition $ref $query
### Analysis of synteny
SPART/04_Evaluation/synteny.sh $protein $name $gff3Detect adapter
fastp --detect_adapter_for_pe -w ${threads} -i ${RNAseq1} -I ${RNAseq2} -o ${RNAseq1_clean} -O ${RNAseq2_clean} --json ${output}.json --html ${output}.htmlBuild genome index
STAR --runThreadN ${threads} --runMode genomeGenerate --genomeDir ${Output Dir} --genomeFastaFiles ${genome} --sjdbGTFtagExonParentTranscript Parent --sjdbGTFfile ${annotations} --limitGenomeGenerateRAM 40000000000 --sjdbOverhang 149 --sjdbFileChrStartEnd ${genomic coordinates} --limitSjdbInsertNsj 1854820Mapping to genome
STAR --runThreadN ${threads} --genomeDir ${Output Dir} --readFilesIn ${RNAseq1_clean} ${RNAseq2_clean} --sjdbGTFtagExonParentTranscript Parent --sjdbGTFfile ${annotations} --outFileNamePrefix "$profix" --alignSJoverhangMin 8 --alignSJDBoverhangMin 1 --outFilterType BySJout --outSAMunmapped Within --outFilterMultimapNmax 20 --outSAMstrandField intronMotif --outFilterMismatchNoverLmax 0.02 --outFilterMismatchNmax 999 --alignIntronMin 20 --alignIntronMax 10000 --alignMatesGapMax 100000 --sjdbScore 1 --genomeLoad NoSharedMemory --outSAMtype BAM SortedByCoordinate --limitSjdbInsertNsj 1854820Assembly and merge
stringtie -j 2 -c 2 -m 150 -f 0.3 -G ${reference annotation} -l rna-seq -t -p ${threads} -l "$profix" -A "$profix"gene_abund.tab -C "$profix"cov_refs.gtf -o "$profix".gtf "$profix"Aligned.sortedByCoord.out.bam
stringtie --merge -p 96 -m 150 -c 10 -G ${reference annotation} -l rna_merge -o rna_all.gtf { gtf_list | strg1.gtf ...}TransDecoder
python snakemake -s Snakefile --cluster-config clust.json --configfile config.yaml --jobs 2000 --cluster '{cluster.account}' --rerun-incomplete --restart-times 1Build genome index
minimap2 -t 96 -I 16G -d $mmi $genomeAlign && Correct && Collapse
flair 123 --mm2_args=-I15g,-axsplice:hq,-uf,-secondary=no -g $genome -r $iso_seq --mm_index $mmi -f $gtf -o flair.output --temp_dir temp_flair --stringent --no_gtf_end_adjustment --check_splice --generate_map --trust_end -t 96 --annotation_reliant generate --junction_bed $stringtie.bedTransDecoder
python snakemake -s Snakefile --cluster-config clust.json --configfile config.yaml --jobs 2000 --cluster '{cluster.account}' --rerun-incomplete --restart-times 1miniprot
miniprot -t96 -d CS-IAAS_v1.softmask.mpi CS-IAAS_v1.softmask.fasta
miniprot -It96 --gff CS-IAAS_v1.softmask.mpi ${Homology protein} > miniprot.gff3
python miniprot_GFF_2_EVM_alignment_GFF3.py miniprot.gff3 > protein_alignments.gff3Braker3
##### RNA-seq && Homology protein
docker run -c ${threads} --user 1000:100 -v /tmp:/tmp -v /home:/home -v /data:/data -v "$PWD":"$PWD" teambraker/braker3:latest braker.pl --workingdir="$PWD" --species=CS-IAAS --softmasking --genome=CS-IAAS_v1.softmask.fasta --addUTR=on --gff3 --nocleanup --bam=rna_seq.bam --prot_seq=${Homology protein} --threads ${threads} --BAMTOOLS_PATH= --AUGUSTUS_BIN_PATH= --JAVA_PATH=
##### ISO-seq && Homology protein
docker run -c ${threads} --user 1000:100 -v /tmp:/tmp -v /home:/home -v /data:/data -v "$PWD":"$PWD" katharinahoff/playground:devel braker.pl --workingdir="$PWD" --species=CS-IAAS --softmasking --genome=CS-IAAS_v1.softmask.fasta --gff3 --nocleanup --bam=iso_seq.bam --prot_seq=${Homology protein} --threads ${threads} --BAMTOOLS_PATH= --AUGUSTUS_BIN_PATH=Shoucheng Liu ([email protected]) Xiaopeng Li ([email protected])
If you use SPART in your work, please cite:
Liu, S., Li, K., Dai, X. et al. A telomere-to-telomere genome assembly coupled with multi-omic data provides insights into the evolution of hexaploid bread wheat. Nat Genet (2025). https://doi.org/10.1038/s41588-025-02137-x