-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathpbs_runner.py
More file actions
158 lines (143 loc) · 8.07 KB
/
Copy pathpbs_runner.py
File metadata and controls
158 lines (143 loc) · 8.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
"""
This script runs the runner on a PBS cluster system by creating a pbs cmd file and submitting it with qsub.
The output directory will contain an extra directory called pbs_logs, which holds the cmd file and the
output logs of the cluster.
You can set defaults for the pbs_runner in the config.ini file in the installation directory.
The pbs defaults may also include parameters of the runner itself; those values are used only when running
through the pbs_runner.
"""
import os
from random import randint
from runner import create_runner_parser, assign_output_dir
from utils import get_config
def create_pbs_cmd_file(path, alias, output_logs_dir, cmd, queue, gmem=10, ncpus=50, nodes=1, custom_command=None,
                        jnums=None, run_after_job_id=None, job_suffix=None, default_command=None):
    """
    Write a PBS job script (cmd file) to `path`.

    The script starts with a block of `#PBS` directives, followed by the optional `default_command` and
    `custom_command` lines and finally `cmd` itself (written without a trailing newline).

    :param path: where to write the cmd file (overwritten if it exists).
    :param alias: job name, written as `#PBS -N`.
    :param output_logs_dir: written as both `#PBS -o` and `#PBS -e`.
    :param cmd: the command line the job should execute.
    :param queue: written as `#PBS -q`.
    :param gmem: memory in GB for the `select` resource request; None falls back to 10.
    :param ncpus: cpus for the `select` resource request; None falls back to 50.
    :param nodes: chunk count for the `select` resource request; None falls back to 1.
    :param custom_command: optional line written just before `cmd`.
    :param jnums: optional job array range - an int N gives `1-N`, otherwise an indexable pair (first, last).
    :param run_after_job_id: optional job id this job should wait for (`depend=afterok`).
    :param job_suffix: optional text appended to `run_after_job_id`.
    :param default_command: optional line written before `custom_command`.
    """
    # Callers such as pbs_runner pass gmem/ncpus explicitly, so a missing value arrives as None instead of
    # triggering the signature default. "mem=Nonegb" is not a usable resource request, so fall back here.
    if gmem is None:
        gmem = 10
    if ncpus is None:
        ncpus = 50
    if nodes is None:
        nodes = 1

    script = [
        "#!/bin/bash\n#PBS -S /bin/bash\n#PBS -j oe\n#PBS -r y\n",
        f"#PBS -q {queue}\n",
        "#PBS -v PBS_O_SHELL=bash,PBS_ENVIRONMENT=PBS_BATCH \n",
        "#PBS -N " + alias + "\n",
        f"#PBS -o {output_logs_dir} \n",
        f"#PBS -e {output_logs_dir} \n",
        f"#PBS -l select={nodes}:ncpus={ncpus}:mem={gmem}gb\n",
    ]
    if jnums:
        if isinstance(jnums, int):
            script.append(f"#PBS -J 1-{jnums} \n\n")
        else:
            script.append(f"#PBS -J {jnums[0]}-{jnums[1]} \n\n")
    if run_after_job_id:
        if job_suffix:
            run_after_job_id = str(run_after_job_id) + job_suffix
        script.append("#PBS -W depend=afterok:" + str(run_after_job_id) + "\n")
    if default_command:
        script.append(default_command + "\n")
    if custom_command:
        script.append(custom_command + "\n")
    script.append(cmd)

    # The with-block closes the file; no explicit close() is needed afterwards.
    with open(path, 'w') as o:
        o.writelines(script)
def submit_cmdfile_to_pbs(cmdfile, pbs_cmd_path):
    """
    Submit `cmdfile` with the submission command `pbs_cmd_path` and return the job id.

    :param cmdfile: path of the cmd file to submit; it is shell-quoted so paths with spaces survive.
    :param pbs_cmd_path: the submission command, passed to the shell as is.
    :return: the text before the first "." of the command's output, stripped of surrounding whitespace.
             An empty string means the command printed nothing usable.
    """
    import shlex  # local import: only this function needs it

    # pbs_cmd_path is deliberately left unquoted - NOTE(review): it may carry extra options, confirm in config.ini.
    cmd = f"{pbs_cmd_path} {shlex.quote(str(cmdfile))}"
    # Close the pipe deterministically instead of leaving it to the garbage collector.
    with os.popen(cmd) as pipe:
        result = pipe.read()
    # Strip so an id without a "." does not keep its trailing newline, and so blank output stays falsy.
    return result.split(".")[0].strip()
def runner_cmd(input_dir, output_dir, reference_file, max_basecall_iterations, db_path, db_comment,
               quality_threshold, task, evalue, dust, num_alignments, mode, perc_identity, soft_masking, min_coverage,
               align_to_ref, python_path, min_frequency, base_path, cleanup, cpu_count, overlapping_reads):
    """
    Build the shell command line that runs runner.py (located in `base_path`) with `python_path`.

    Input dir, output dir and reference file are always included. Every other parameter adds its flag only
    when it is not None, so falsy values such as 0 are still passed on.

    :return: the command line as a single string.
    """
    script = os.path.join(base_path, 'runner.py')
    # Flag/value pairs in the exact order they appear on the command line.
    optional_flags = (
        ("-m", max_basecall_iterations),
        ("-qt", quality_threshold),
        ("-bt", task),
        ("-be", evalue),
        ("-bd", dust),
        ("-bn", num_alignments),
        ("-bm", mode),
        ("-bp", perc_identity),
        ("-bs", soft_masking),
        ("-mc", min_coverage),
        ("-mf", min_frequency),
        ("-ar", align_to_ref),
        ("-c", cleanup),
        ("-cc", cpu_count),
        ("-or", overlapping_reads),
        ("-db", db_path),
    )
    # The trailing space after the reference file is part of the emitted string.
    pieces = [f"{python_path} {script} -i {input_dir} -o {output_dir} -r {reference_file} "]
    pieces.extend(f" {flag} {value}" for flag, value in optional_flags if value is not None)
    if db_comment is not None:
        # The comment is the only value wrapped in single quotes.
        pieces.append(f" -dbc '{db_comment}'")
    return "".join(pieces)
def pbs_runner(input_dir, output_dir, reference_file, max_basecall_iterations, db_path, db_comment, pbs_cmd_path,
               quality_threshold, task, evalue, dust, num_alignments, mode, perc_identity, overlapping_reads, gmem,
               soft_masking, min_coverage, align_to_ref, alias, queue, cleanup, cpu_count, python_path, min_frequency,
               custom_command=None, after_jobid=None, job_suffix=None, default_command=None):
    """
    Create a PBS cmd file that runs the runner with the given parameters and submit it.

    The cmd file is written to `<output_dir>/pbs_logs/<alias>.cmd`, and that pbs_logs directory is also set
    as the job's log directory. When `output_dir` is empty it is taken from assign_output_dir(db_path, alias).
    When `alias` is empty a random `AccuNGS_<number>` alias is generated.

    :param pbs_cmd_path: submission command used to submit the cmd file.
    :param gmem: memory in GB requested for the job.
    :param cpu_count: passed to the runner and also used as the job's requested cpus.
    :param queue: PBS queue to submit to.
    :param custom_command: optional line placed in the cmd file before the runner command.
    :param after_jobid: optional job id the new job should wait for.
    :param job_suffix: optional text appended to `after_jobid`.
    :param default_command: optional line placed in the cmd file before `custom_command`.
    The remaining parameters are passed through to runner_cmd unchanged.
    :return: the job id reported by the submission command; falsy when submission failed.
    """
    # TODO: reintroduce stages into the runner,
    #       do relevant aggregation (when needed) in aggregation instead of linked mutations,
    #       create a dir with files representing mutation_read_list_parts,
    #       use it in a pbs array calling linked mutations on each part
    #       where each job uses 24 cpus or less and limit to 500 jobs per array.
    if not output_dir:
        output_dir = assign_output_dir(db_path, alias)
    base_path = os.path.dirname(os.path.abspath(__file__))
    pbs_logs_dir = os.path.join(output_dir, "pbs_logs")
    os.makedirs(pbs_logs_dir, exist_ok=True)
    if not alias:
        cmd_identifier = randint(42, 777)  # so that we can easily connect cmdfile and job
        alias = f"AccuNGS_{cmd_identifier}"
    cmd_path = os.path.join(pbs_logs_dir, f'{alias}.cmd')
    cmd = runner_cmd(input_dir=input_dir, output_dir=output_dir, reference_file=reference_file,
                     max_basecall_iterations=max_basecall_iterations,
                     quality_threshold=quality_threshold, task=task, evalue=evalue, dust=dust, python_path=python_path,
                     num_alignments=num_alignments, mode=mode, perc_identity=perc_identity, db_comment=db_comment,
                     soft_masking=soft_masking, min_coverage=min_coverage, cleanup=cleanup, cpu_count=cpu_count,
                     align_to_ref=align_to_ref, db_path=db_path,
                     base_path=base_path,
                     overlapping_reads=overlapping_reads,
                     min_frequency=min_frequency)
    create_pbs_cmd_file(cmd_path, alias, output_logs_dir=pbs_logs_dir, cmd=cmd, queue=queue, gmem=gmem,
                        ncpus=cpu_count, run_after_job_id=after_jobid, job_suffix=job_suffix,
                        custom_command=custom_command, default_command=default_command)
    job_id = submit_cmdfile_to_pbs(cmd_path, pbs_cmd_path)
    if job_id:
        # Message previously read "Submitted jod".
        print(f"Submitted job '{alias}' with id {job_id}")
        print(f"Output files will be in {output_dir}")
        print(f"runner log file will be in {os.path.join(output_dir, '.log')}")
    else:
        print(f"Could not submit job {alias} to queue!")
    print(f"cmd file and pbs logs are in {pbs_logs_dir}")
    return job_id
def get_pbs_args():
    """
    Collect the arguments for pbs_runner from the config file and the command line.

    Precedence, lowest to highest: the config's 'runner_defaults' section, the config's 'pbs_defaults'
    section, then every command line option that was actually supplied (i.e. parsed to something other
    than None).

    :return: dict mapping argument name to its value.
    """
    parser = create_runner_parser()
    pbs_options = (
        (("-a", "--alias"), "job alias visible in qstat"),
        (("-q", "--queue"), "PBS queue to run on"),
        (("-j", "--after_jobid"), "Run after successfully completing this jobid"),
        (("-gm", "--gmem"), "Memory in GB to ask for in cmd file"),
    )
    for flags, help_text in pbs_options:
        parser.add_argument(*flags, help=help_text)
    cli_values = vars(parser.parse_args())

    merged = dict(get_config()['runner_defaults'])  # start from the runner defaults
    merged.update(dict(get_config()['pbs_defaults']))  # override with pbs defaults
    for name, value in cli_values.items():  # override with cli args that were given
        if value is not None:
            merged[name] = value
    return merged
if __name__ == "__main__":
    args = get_pbs_args()
    # get_pbs_args drops command line options that were not supplied, so a setting that is neither given on
    # the command line nor defined in the config is simply absent from args. pbs_runner already treats
    # output_dir, alias, custom_command, default_command, after_jobid and job_suffix as optional, so look
    # those up with .get() instead of failing with a KeyError.
    pbs_runner(input_dir=args['input_dir'], output_dir=args.get('output_dir'), reference_file=args['reference_file'],
               max_basecall_iterations=args['max_basecall_iterations'], custom_command=args.get('custom_command'),
               quality_threshold=args['quality_threshold'], task=args['blast_task'], db_comment=args['db_comment'],
               evalue=args['blast_evalue'], dust=args['blast_dust'], num_alignments=args['blast_num_alignments'],
               mode=args['blast_mode'], perc_identity=args['blast_perc_identity'],
               min_coverage=args['min_coverage'], cleanup=args['cleanup'],
               default_command=args.get('default_command'),
               align_to_ref=args['align_to_ref'], queue=args['queue'], python_path=args['python_path'],
               cpu_count=args['cpu_count'], overlapping_reads=args['overlapping_reads'], db_path=args['db_path'],
               after_jobid=args.get('after_jobid'), job_suffix=args.get('job_suffix'), alias=args.get('alias'),
               pbs_cmd_path=args['pbs_cmd_path'], gmem=args['gmem'],
               soft_masking=args['blast_soft_masking'], min_frequency=args['min_frequency'])