Thanks for visiting codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions mess/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@
"--frag-len",
"--frag-sd",
"--errfree",
"--art-args",
],
},
{
Expand Down
23 changes: 18 additions & 5 deletions mess/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,18 +15,27 @@
)


def get_fasta_dirs(config):
def fasta_path(config):
if os.path.isfile(config["args"]["input"]):
files = [config["args"]["input"]]
else:
files = glob.glob(os.path.join(config["args"]["input"], "*.tsv"))
df = pd.concat([pd.read_csv(file, sep="\t") for file in files])
if "path" in df.columns:
return set(os.path.abspath(os.path.dirname(p)) for p in df["path"])
return os.path.commonpath(df["path"].to_list())
else:
return False


def custom_taxonkit_dir(config):
    """Return the directory that holds the taxonkit database files.

    Returns False when the configured taxonkit location is the default
    (<cwd>/.taxonkit); otherwise resolves names.dmp inside the configured
    directory (following symlinks) and returns its parent directory.
    """
    taxonkit = config["args"]["taxonkit"]
    default_location = os.path.join(os.getcwd(), ".taxonkit")
    if taxonkit == default_location:
        return False
    names_dmp = os.path.join(taxonkit, "names.dmp")
    return os.path.dirname(os.path.realpath(names_dmp))


def snake_base(rel_path):
    """Get the filepath to a Snaketool system file (relative to __main__.py).

    Resolves *rel_path* against the directory containing this module,
    following any symlinks to the module file itself.
    """
    here = os.path.dirname(os.path.realpath(__file__))
    return os.path.join(here, rel_path)
Expand Down Expand Up @@ -124,9 +133,10 @@ def run_snakemake(
os.path.join(os.path.dirname(os.path.realpath(__file__))),
os.path.abspath(snake_config["args"]["output"]),
]
if get_fasta_dirs(snake_config):
for path in get_fasta_dirs(snake_config):
paths.append(path)
if fasta_path(snake_config):
paths.append(fasta_path(snake_config))
if custom_taxonkit_dir(snake_config):
paths.append(custom_taxonkit_dir(snake_config))

sdm_args = " ".join([f"-B {path}:{path}" for path in paths])

Expand Down Expand Up @@ -344,6 +354,9 @@ def sim_options(func):
type=str,
default=None,
),
click.option(
"--art-args", help="additional art_illumina args", type=str, default=""
),
click.option(
"--errfree",
help="Generate error free alignments with art_illumina",
Expand Down
19 changes: 14 additions & 5 deletions mess/workflow/rules/preflight/functions.smk
Original file line number Diff line number Diff line change
Expand Up @@ -183,13 +183,22 @@ def get_value(value, wildcards):


def get_asm_summary(wildcards):
if (
("seq_len" in tsv_df.columns)
and ("seq_num" in tsv_df.columns)
and ("path" in tsv_df.columns)
):
return os.path.join(dir.out.base, "replicates.tsv")
if ("taxon" in tsv_df.columns) or ("accession" in tsv_df.columns):
if PRIMERSEARCH:
return [
checkpoints.download_assemblies.get(**wildcards).output[0],
os.path.join(dir.out.processing, "seqkit_stats.tsv"),
]
else:
return checkpoints.download_assemblies.get(**wildcards).output[0]
if PRIMERSEARCH or FASTA_DIR or FASTA_PATH:
return os.path.join(dir.out.processing, "seqkit_stats.tsv")
else:
try:
return checkpoints.download_assemblies.get(**wildcards).output[0]
except AttributeError:
return os.path.join(dir.out.processing, "seqkit_stats.tsv")


tsv_cache = {}
Expand Down
2 changes: 1 addition & 1 deletion mess/workflow/rules/processing/coverages.smk
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ rule replicates_table:

checkpoint calculate_genome_coverages:
input:
df=os.path.join(dir.out.base, "replicates.tsv"),
rep=os.path.join(dir.out.base, "replicates.tsv"),
asm=get_asm_summary,
output:
os.path.join(dir.out.base, "coverages.tsv"),
Expand Down
1 change: 0 additions & 1 deletion mess/workflow/rules/processing/fastas.smk
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,6 @@ checkpoint split_contigs:
params:
circular=CIRCULAR,
rotate=ROTATE,
amplicons=PRIMERSEARCH,
read_len=MEAN_LEN,
resources:
mem_mb=config.resources.sml.mem,
Expand Down
6 changes: 5 additions & 1 deletion mess/workflow/rules/processing/reads.smk
Original file line number Diff line number Diff line change
Expand Up @@ -367,7 +367,11 @@ rule cat_fastqs:
mem=str(config.resources.sml.mem) + "MB",
time=config.resources.norm.time,
message:
"Concatenating {wildcards.sample} reads : {params.head} ... "
(
"Concatenating {wildcards.sample} R{wildcards.p} reads : {params.head} ... "
if PAIRED
else "Concatenating {wildcards.sample} reads : {params.head} ... "
)
shell:
"""
find {params.dir} -name "{params.name}" | sort | xargs cat > {output}
Expand Down
2 changes: 2 additions & 0 deletions mess/workflow/rules/simulate/short_reads.smk
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ if BAM or TAX:
if ERRFREE:
art_args += "-ef "

art_args += f"{config.args.art_args}"


fq_prefix = os.path.join(dir.out.short, "{sample}", "{fasta}", "{contig}")
if CIRCULAR:
Expand Down
142 changes: 90 additions & 52 deletions mess/workflow/scripts/calculate_cov.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,143 +48,170 @@ def strip_fasta_ext(filename):
Main
"""
# Set seed for distributions
np.random.seed(snakemake.params.seed)
np.random.seed(snakemake.params["seed"])

# Set seed for read simulators
random.seed(snakemake.params.seed)
random.seed(snakemake.params["seed"])

# Pairing
if snakemake.params.pairing:
if snakemake.params["pairing"]:
p = 2
else:
p = 1

entry_df = pd.read_csv(snakemake.input["rep"], sep="\t")

# Get table with assembly genome sizes and their taxonomy

if ("fasta" not in entry_df.columns) and ("path" in entry_df.columns):
entry_df["fasta"] = [
strip_fasta_ext(os.path.basename(path)) for path in entry_df["path"]
]

asm_df = pd.read_csv(snakemake.input.asm, sep="\t")
entry_df = pd.read_csv(snakemake.input.df, sep="\t")

if snakemake.params.fa_dir:
if snakemake.params["fa_dir"]:
entry_df["fasta"] = entry_df["fasta"].apply(strip_fasta_ext)
entry_df["path"] = [
glob.glob(os.path.join(snakemake.params.fa_dir, f"{fa}*"))[0]
glob.glob(os.path.join(snakemake.params["fa_dir"], f"{fa}*"))[0]
for fa in entry_df["fasta"]
]
if snakemake.params.fa_path:
if snakemake.params["fa_path"]:
entry_df["fasta"] = [
strip_fasta_ext(os.path.basename(path)) for path in entry_df["path"]
]

if snakemake.params.fa_dir or snakemake.params.fa_path:
asm_df = pd.read_csv(snakemake.input.asm, sep="\t")
asm_df.rename(
if isinstance(snakemake.input["asm"], list):
summary_df = pd.read_csv(snakemake.input["asm"][0], sep="\t")
summary_df["fasta"] = [
strip_fasta_ext(os.path.basename(path)) for path in summary_df["path"]
]
stats_df = pd.read_csv(snakemake.input["asm"][1], sep="\t")
stats_df.rename(
columns={
"file": "fasta",
"sum_len": "total_sequence_length",
"num_seqs": "number_of_contigs",
"sum_len": "seq_len",
"num_seqs": "seq_num",
},
inplace=True,
)
asm_df["fasta"] = asm_df["fasta"].apply(strip_fasta_ext)
stats_df["fasta"] = stats_df["fasta"].apply(strip_fasta_ext)
stats_df["fasta"] = stats_df["fasta"].str.replace(".amplicons", "")
asm_df = pd.merge(stats_df, summary_df, on="fasta")

if "fasta" not in asm_df.columns:
asm_df["fasta"] = [
strip_fasta_ext(os.path.basename(path)) for path in asm_df["path"]
]

if snakemake.params.amplicons:
asm_df["fasta"] = asm_df["fasta"].str.replace(".amplicons", "")
else:
asm_df = pd.read_csv(snakemake.input["asm"], sep="\t")
if "file" in asm_df.columns:
asm_df.rename(
columns={
"file": "fasta",
"sum_len": "seq_len",
"num_seqs": "seq_num",
},
inplace=True,
)
asm_df["fasta"] = asm_df["fasta"].apply(strip_fasta_ext)

if ("fasta" not in asm_df.columns) and ("path" in asm_df.columns):
asm_df["fasta"] = [
strip_fasta_ext(os.path.basename(path)) for path in asm_df["path"]
]
if snakemake.params["amplicons"]:
asm_df["fasta"] = asm_df["fasta"].str.replace(".amplicons", "")
if (
"total_sequence_length" in asm_df.columns
and "number_of_contigs" in asm_df.columns
):
asm_df = asm_df.rename(
columns={"total_sequence_length": "seq_len", "number_of_contigs": "seq_num"}
)


same_cols = list(np.intersect1d(entry_df.columns, asm_df.columns))
df = pd.merge(entry_df, asm_df, how="left", on=same_cols)


# Get total bases
bases = parse_size(snakemake.params.bases)
bases = parse_size(snakemake.params["bases"])


if "tax_id" in df.columns:
df["tax_id"] = df["tax_id"].astype(int)
# Calculate proportion with dist
if snakemake.params.dist == "even":
if snakemake.params["dist"] == "even":
df = get_even_dist(df)
df["tax_abundance"] = df["proportion"] / df["count"]
df["genome_bases"] = df["total_sequence_length"] * df["tax_abundance"]
df["genome_bases"] = df["seq_len"] * df["tax_abundance"]
df["sum_genome_bases"] = df.groupby("samplename")["genome_bases"].transform("sum")
df["cov_obtained"] = bases / df["sum_genome_bases"]
df["cov_sim"] = df["tax_abundance"] * df["cov_obtained"]
df["sum_cov"] = df.groupby("samplename")["cov_sim"].transform("sum")
df["bases"] = df["cov_sim"] * df["total_sequence_length"]
df["reads"] = df["bases"] / snakemake.params.read_len
df["bases"] = df["cov_sim"] * df["seq_len"]
df["reads"] = df["bases"] / snakemake.params["read_len"]
df["sum_bases"] = df.groupby("samplename")["bases"].transform("sum")
df["seq_abundance"] = df["bases"] / df["sum_bases"]


elif snakemake.params.dist == "lognormal":
df = get_lognormal_dist(df, mu=snakemake.params.mu, sigma=snakemake.params.sigma)
elif snakemake.params["dist"] == "lognormal":
df = get_lognormal_dist(
df, mu=snakemake.params["mu"], sigma=snakemake.params["sigma"]
)
df["bases"] = df["seq_abundance"] * bases
df["reads"] = df["bases"] / snakemake.params.read_len
df["cov_sim"] = df["bases"] / df["total_sequence_length"]
df["reads"] = df["bases"] / snakemake.params["read_len"]
df["cov_sim"] = df["bases"] / df["seq_len"]
df["sum_cov"] = df.groupby("samplename")["cov_sim"].transform("sum")
df["tax_abundance"] = df["cov_sim"] / df["sum_cov"]
df["sum_bases"] = df.groupby("samplename")["bases"].transform("sum")
df["seq_abundance"] = df["bases"] / df["sum_bases"]
else:
if "tax_abundance" in entry_df.columns:
df["genome_bases"] = df["total_sequence_length"] * df["tax_abundance"]
df["genome_bases"] = df["seq_len"] * df["tax_abundance"]
df["sum_genome_bases"] = df.groupby("samplename")["genome_bases"].transform(
"sum"
)
df["cov_obtained"] = bases / df["sum_genome_bases"]
df["cov_sim"] = df["tax_abundance"] * df["cov_obtained"]
df["sum_cov"] = df.groupby("samplename")["cov_sim"].transform("sum")
df["bases"] = df["cov_sim"] * df["total_sequence_length"]
df["reads"] = df["bases"] / snakemake.params.read_len
df["bases"] = df["cov_sim"] * df["seq_len"]
df["reads"] = df["bases"] / snakemake.params["read_len"]
df["sum_bases"] = df.groupby("samplename")["bases"].transform("sum")
df["seq_abundance"] = df["bases"] / df["sum_bases"]

if "seq_abundance" in entry_df.columns:
df["bases"] = df["seq_abundance"] * bases
df["reads"] = df["bases"] / snakemake.params.read_len
df["cov_sim"] = df["bases"] / df["total_sequence_length"]
df["reads"] = df["bases"] / snakemake.params["read_len"]
df["cov_sim"] = df["bases"] / df["seq_len"]
df["sum_cov"] = df.groupby("samplename")["cov_sim"].transform("sum")
df["tax_abundance"] = df["cov_sim"] / df["sum_cov"]

if "reads" in entry_df.columns:
df["bases"] = df["reads"] * snakemake.params.read_len * p
df["bases"] = df["reads"] * snakemake.params["read_len"] * p
df["sum_bases"] = df.groupby("samplename")["bases"].transform("sum")
df["seq_abundance"] = df["bases"] / df["sum_bases"]
df["cov_sim"] = df["bases"] / df["total_sequence_length"]
df["cov_sim"] = df["bases"] / df["seq_len"]
df["sum_cov"] = df.groupby("samplename")["cov_sim"].transform("sum")
df["tax_abundance"] = df["cov_sim"] / df["sum_cov"]

if "bases" in entry_df.columns:
df["reads"] = df["bases"] / snakemake.params.read_len
df["reads"] = df["bases"] / snakemake.params["read_len"]
df["sum_bases"] = df.groupby("samplename")["bases"].transform("sum")
df["seq_abundance"] = df["bases"] / df["sum_bases"]
df["cov_sim"] = df["bases"] / df["total_sequence_length"]
df["cov_sim"] = df["bases"] / df["seq_len"]
df["sum_cov"] = df.groupby("samplename")["cov_sim"].transform("sum")
df["tax_abundance"] = df["cov_sim"] / df["sum_cov"]

elif "cov_sim" in entry_df.columns:
df["sum_cov"] = df.groupby("samplename")["cov_sim"].transform("sum")
df["tax_abundance"] = df["cov_sim"] / df["sum_cov"]
df["bases"] = df["cov_sim"] * df["total_sequence_length"]
df["bases"] = df["cov_sim"] * df["seq_len"]
df["sum_bases"] = df.groupby("samplename")["bases"].transform("sum")
df["reads"] = df["bases"] / snakemake.params.read_len
df["reads"] = df["bases"] / snakemake.params["read_len"]
df["seq_abundance"] = df["bases"] / df["sum_bases"]


df["seed"] = random.sample(range(1, 1000000), len(df))
df = df.rename(
columns={"total_sequence_length": "seq_len", "number_of_contigs": "seq_num"}
)
cols = [
"samplename",
"fasta",
"path",
"seq_len",
"seq_num",
"reads",
Expand All @@ -200,10 +227,21 @@ def strip_fasta_ext(filename):
cols.append("tax_id")

# replace values with 0 for empty amplicon fastas
df.loc[
df["seq_len"] == 0,
["seq_num", "reads", "bases", "cov_sim", "tax_abundance", "seq_abundance", "seed"],
] = 0
df[cols].replace(0, np.nan).convert_dtypes().to_csv(
snakemake.output[0], sep="\t", index=False
)

if (df["seq_len"] == 0).any():
df.loc[
df["seq_len"] == 0,
[
"seq_num",
"reads",
"bases",
"cov_sim",
"tax_abundance",
"seq_abundance",
"seed",
],
] = 0
df = df[cols].replace(0, np.nan)
df[cols].sort_values(
["samplename", "fasta", "cov_sim"], ascending=[True, True, False]
).convert_dtypes().to_csv(snakemake.output[0], sep="\t", index=False)
5 changes: 2 additions & 3 deletions mess/workflow/scripts/split_contigs.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,12 @@ def split_fasta(fa, outdir, suffix):
os.mkdir(snakemake.output.dir)
id2fa = []
suffix = ".fasta"
if snakemake.params.amplicons:
suffix = ".amplicons.fasta"
for fa in snakemake.input.fa:
if ".amplicons" in fa:
suffix = ".amplicons"
id2fa.append(split_fasta(fa, snakemake.output.dir, suffix))
id2fa = list(chain.from_iterable(id2fa))
contig_df = pd.DataFrame.from_records(id2fa)

df = pd.merge(contig_df, cov_df, how="left", on="fasta")

cols = [
Expand Down