Thanks to visit codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 11 additions & 9 deletions reform.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,14 +128,14 @@ def index_fasta(fasta_path):
the temporary file. Finally, the indexing result is returned.
'''
if fasta_path.endswith('.gz'):
# Create a tempfile to store uncompressed content
## Create a tempfile to store uncompressed content
with tempfile.NamedTemporaryFile(delete=False, mode='w') as tmp_f:
tmp_f_path = tmp_f.name
# Use pgzip or gzip to decompress in parallel. Setting thread=None means all cores are used
## Use pgzip or gzip to decompress in parallel. Setting thread=None means all cores are used
with gzip_module.open(fasta_path, 'rt', thread=None) as f:
tmp_f.write(f.read())
chrom_seqs = SeqIO.index(tmp_f_path, 'fasta')
# remove temp file
## remove temp file
os.remove(tmp_f_path)
else:
chrom_seqs = SeqIO.index(fasta_path, 'fasta')
Expand Down Expand Up @@ -232,7 +232,6 @@ def get_position(index, positions, upstream, downstream, chrom, seq_str, prev_mo
# Ensure the upstream and downstream target sequences each exist exactly once in the selected chromosome, else die
upstream_seq_count = seq_str.count(upstream_seq)
downstream_seq_count = seq_str.count(downstream_seq)
### TODO: Update postion based on previous modifications
if upstream_seq_count == 1 and downstream_seq_count == 1:
## Obtain the starting position of the left_strand
new_index = seq_str.find(upstream_seq)
Expand All @@ -252,7 +251,7 @@ def get_position(index, positions, upstream, downstream, chrom, seq_str, prev_mo
exit()
return {'position': position, 'down_position': down_position}

def write_in_gff_lines(gff_out, in_gff_lines, position, split_features, sequence_length):
def write_in_gff_lines(gff_out, in_gff_lines, position, split_features, sequence_length, chrom):
'''
in_gff_lines: a list of lists where each nested list is a list of
columns (in gff format) associated with each new feature to insert
Expand All @@ -261,6 +260,9 @@ def write_in_gff_lines(gff_out, in_gff_lines, position, split_features, sequence
sequence_length: length of the inserted sequence, used to determine
the new end positions in the GFF file.
'''
## Replace the chromosome ID from in_gff with the correct chromosome ID
for l in in_gff_lines:
l[0] = chrom
# Handling of single-line comments
if len(in_gff_lines) == 1:
l = in_gff_lines[0]
Expand Down Expand Up @@ -334,7 +336,7 @@ def create_new_gff(new_gff_name, ref_gff, in_gff_lines, position, down_position,
ref_gff_path = ref_gff
if ref_gff.endswith('.gz'):
with gzip_module.open(ref_gff, 'rt') as f:
# Create a tempfile to store uncompressed content
## Create a tempfile to store uncompressed content
with tempfile.NamedTemporaryFile(delete=False, mode='w') as tmp_f:
tmp_f.write(f.read())
ref_gff_path = tmp_f.name
Expand Down Expand Up @@ -367,7 +369,7 @@ def create_new_gff(new_gff_name, ref_gff, in_gff_lines, position, down_position,
and gff_chrom_id != last_seen_chrom_id
and not in_gff_lines_appended):
in_gff_lines_appended = write_in_gff_lines(
gff_out, in_gff_lines, position, split_features, new_seq_length)
gff_out, in_gff_lines, position, split_features, new_seq_length, chrom_id)

last_seen_chrom_id = gff_chrom_id

Expand Down Expand Up @@ -450,7 +452,7 @@ def create_new_gff(new_gff_name, ref_gff, in_gff_lines, position, down_position,
else:
if not in_gff_lines_appended:
in_gff_lines_appended = write_in_gff_lines(
gff_out, in_gff_lines, position, split_features, new_seq_length)
gff_out, in_gff_lines, position, split_features, new_seq_length, chrom_id)

# Change start position of feature to after cutoff point if
# the feature starts within the deletion
Expand Down Expand Up @@ -497,7 +499,7 @@ def create_new_gff(new_gff_name, ref_gff, in_gff_lines, position, down_position,
and last_seen_chrom_id == chrom_id
and not in_gff_lines_appended):
in_gff_lines_appended = write_in_gff_lines(
gff_out, in_gff_lines, position, split_features, new_seq_length)
gff_out, in_gff_lines, position, split_features, new_seq_length, chrom_id)

# Checking to ensure in_gff_lines written
if not in_gff_lines_appended:
Expand Down
6 changes: 3 additions & 3 deletions test_data/14/gold.gtf
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
X new exon 5 14 . + 0 gene_id "new"; transcript_id "new.1";
X new exon 5 14 . + 0 gene_id "first"; transcript_id "new.1";
X ref exon 15 23 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1";reform_comment "5 prime side of feature cut-off by inserted sequence";reform_comment "original feature split by inserted sequence, this is the 5 prime end";
X ref CDS 15 23 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1";reform_comment "5 prime side of feature cut-off by inserted sequence";reform_comment "original feature split by inserted sequence, this is the 5 prime end";
X new exon 24 33 . + 0 gene_id "new"; transcript_id "new.1";
X new exon 24 33 . + 0 gene_id "second"; transcript_id "new.2";
X ref exon 34 38 . + 0 gene _id "ref_gene_split"; transcript_id "ref_gene.1";reform_comment "5 prime side of feature cut-off by inserted sequence";reform_comment "original feature split by inserted sequence, this is the 3 prime end";reform_comment "original feature split by inserted sequence, this is the 5 prime end";
X ref CDS 34 37 . + 0 gene _id "ref_gene_split"; transcript_id "ref_gene.1";reform_comment "5 prime side of feature cut-off by inserted sequence";reform_comment "original feature split by inserted sequence, this is the 3 prime end";
X ref stop_codon 38 38 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1";reform_comment "original feature split by inserted sequence, this is the 5 prime end";
X new exon 39 48 . + 0 gene_id "new"; transcript_id "new.1";
X new exon 39 48 . + 0 gene_id "third"; transcript_id "new.3";
X ref exon 49 49 . + 0 gene _id "ref_gene_split"; transcript_id "ref_gene.1";reform_comment "5 prime side of feature cut-off by inserted sequence";reform_comment "original feature split by inserted sequence, this is the 3 prime end";reform_comment "original feature split by inserted sequence, this is the 3 prime end";
X ref stop_codon 49 49 . + 0 gene _id "ref_gene_split"; transcript_id "ref_gene.1";reform_comment "original feature split by inserted sequence, this is the 3 prime end";
2 changes: 1 addition & 1 deletion test_data/14/in1.gtf
Original file line number Diff line number Diff line change
@@ -1 +1 @@
X new exon 1 10 . + 0 gene_id "new"; transcript_id "new.1";
X new exon 1 10 . + 0 gene_id "first"; transcript_id "new.1";
2 changes: 1 addition & 1 deletion test_data/14/in2.gtf
Original file line number Diff line number Diff line change
@@ -1 +1 @@
X new exon 1 10 . + 0 gene_id "new"; transcript_id "new.1";
X new exon 1 10 . + 0 gene_id "second"; transcript_id "new.2";
2 changes: 1 addition & 1 deletion test_data/14/in3.gtf
Original file line number Diff line number Diff line change
@@ -1 +1 @@
X new exon 1 10 . + 0 gene_id "new"; transcript_id "new.1";
X new exon 1 10 . + 0 gene_id "third"; transcript_id "new.3";
6 changes: 3 additions & 3 deletions test_data/15/gold.gtf
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
X new exon 1 10 . + 0 gene_id "new"; transcript_id "new.1";
X new exon 1 10 . + 0 gene_id "first"; transcript_id "new.1";
X ref exon 15 15 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1";reform_comment "original feature split by inserted sequence, this is the 5 prime end";
X new exon 16 25 . + 0 gene_id "new"; transcript_id "new.1";
X new exon 16 25 . + 0 gene_id "second"; transcript_id "new.2";
X ref exon 26 45 . + 0 gene _id "ref_gene_split"; transcript_id "ref_gene.1";reform_comment "original feature split by inserted sequence, this is the 3 prime end";
X ref CDS 28 42 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1";
X ref start_codon 15 15 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1";reform_comment "original feature split by inserted sequence, this is the 5 prime end";
X ref stop_codon 43 45 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1";
X new exon 51 60 . + 0 gene_id "new"; transcript_id "new.1";
X new exon 51 60 . + 0 gene_id "third"; transcript_id "new.3";
2 changes: 1 addition & 1 deletion test_data/15/in1.gtf
Original file line number Diff line number Diff line change
@@ -1 +1 @@
X new exon 1 10 . + 0 gene_id "new"; transcript_id "new.1";
X new exon 1 10 . + 0 gene_id "first"; transcript_id "new.1";
2 changes: 1 addition & 1 deletion test_data/15/in2.gtf
Original file line number Diff line number Diff line change
@@ -1 +1 @@
X new exon 1 10 . + 0 gene_id "new"; transcript_id "new.1";
X new exon 1 10 . + 0 gene_id "second"; transcript_id "new.2";
2 changes: 1 addition & 1 deletion test_data/15/in3.gtf
Original file line number Diff line number Diff line change
@@ -1 +1 @@
X new exon 1 10 . + 0 gene_id "new"; transcript_id "new.1";
X new exon 1 10 . + 0 gene_id "third"; transcript_id "new.3";
2 changes: 2 additions & 0 deletions test_data/16/gold.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>X
XZZZABBBBBDDDDDCCCCCIIIIIKKKKK----------
5 changes: 5 additions & 0 deletions test_data/16/gold.gtf
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
X ref exon 5 25 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1";
X ref CDS 8 22 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1";
X ref start_codon 5 7 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1";
X ref stop_codon 23 25 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1";
X new exon 31 40 . + 0 gene_id "new"; transcript_id "new.1";
2 changes: 2 additions & 0 deletions test_data/16/in.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>test 1
----------
1 change: 1 addition & 0 deletions test_data/16/in.gtf
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
I new exon 1 10 . + 0 gene_id "new"; transcript_id "new.1";
2 changes: 2 additions & 0 deletions test_data/16/ref.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>X
XZZZABBBBBDDDDDCCCCCIIIIIKKKKK
4 changes: 4 additions & 0 deletions test_data/16/ref.gtf
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
X ref exon 5 25 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1";
X ref CDS 8 22 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1";
X ref start_codon 5 7 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1";
X ref stop_codon 23 25 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1";
40 changes: 40 additions & 0 deletions test_reform.py
Original file line number Diff line number Diff line change
Expand Up @@ -643,6 +643,46 @@ def test_case_15(self):
print("Done")

os.chdir(wd)

def test_case_16(self):
    """
    Case 16:
    Testing Reform with an in_gff whose chromosome ID differs from --chrom.

    in.gtf annotates the new feature on chromosome "I" while the command
    targets chromosome "X"; gold.gtf expects the feature to be written
    on "X". The reformed GTF and FASTA are compared against the gold
    files.
    """

    wd = os.getcwd()
    os.chdir('test_data/16/')

    # Restore the original working directory even when an assertion fails
    # or an output file is missing; this test relies on relative paths, so
    # leaving the process inside test_data/16/ would affect whatever runs
    # afterwards.
    try:
        command = """
        python3 ../../reform.py \
         --chrom="X" \
         --in_fasta=in.fa \
         --in_gff=in.gtf \
         --ref_fasta=ref.fa \
         --ref_gff=ref.gtf \
         --position=-1
        """

        response = subprocess.getoutput(command)
        print(response)

        with open('gold.gtf', 'r') as f:
            gold_gff = f.read()
        with open('ref_reformed.gtf', 'r') as f:
            new_gff = f.read()
        print("Testing gtf")
        self.assertListEqual(list(gold_gff), list(new_gff))
        print("Done")

        with open('gold.fa', 'r') as f:
            gold_fa = f.read()
        with open('ref_reformed.fa', 'r') as f:
            new_fa = f.read()
        print("Testing Fasta")
        self.assertListEqual(list(gold_fa), list(new_fa))
        print("Done")
    finally:
        os.chdir(wd)

# Run the unittest test runner when this file is executed as a script.
if __name__ == '__main__':
unittest.main()