gencorefacility · mohammedkhalfan · Nov 18, 2024 · Nov 5, 2024 · Nov 5, 2024 · Nov 5, 2024
diff --git a/reform.py b/reform.py
@@ -128,14 +128,14 @@ def index_fasta(fasta_path):
 	the temporary file. Finally, the indexing result is returned.
 	'''
 	if fasta_path.endswith('.gz'):
-		# Create a tempfile to store uncompressde content
+		## Create a tempfile to store uncompressed content
 		with tempfile.NamedTemporaryFile(delete=False, mode='w') as tmp_f:
 			tmp_f_path = tmp_f.name
-			# Use pgzip or gzip to decompress parallely. Set thread=None means use all cores
+			## Use pgzip or gzip to decompress in parallel. Setting thread=None means use all cores
 			with gzip_module.open(fasta_path, 'rt', thread=None) as f:
 				tmp_f.write(f.read())
 		chrom_seqs = SeqIO.index(tmp_f_path, 'fasta')
-        	# remove temp file
+		## Remove the temp file
 		os.remove(tmp_f_path)
 	else:
 		chrom_seqs = SeqIO.index(fasta_path, 'fasta')
@@ -232,7 +232,6 @@ def get_position(index, positions, upstream, downstream, chrom, seq_str, prev_mo
 			# Ensure the upstream and downstream target sequences exists once in the selected chromosome, else die
 			upstream_seq_count = seq_str.count(upstream_seq)
 			downstream_seq_count = seq_str.count(downstream_seq)
-			### TODO: Update postion based on previous modifications
 			if upstream_seq_count == 1 and downstream_seq_count == 1:
 				## Obtain the starting position of the left_strand
 				new_index = seq_str.find(upstream_seq)
@@ -252,7 +251,7 @@ def get_position(index, positions, upstream, downstream, chrom, seq_str, prev_mo
 		exit()
 	return {'position': position, 'down_position': down_position}
 
-def write_in_gff_lines(gff_out, in_gff_lines, position, split_features, sequence_length):
+def write_in_gff_lines(gff_out, in_gff_lines, position, split_features, sequence_length, chrom):
 	'''
 	in_gff_lines: a list of lists where each nested list is a list of 
 		columns (in gff format) associated with each new feature to insert
@@ -261,6 +260,9 @@ def write_in_gff_lines(gff_out, in_gff_lines, position, split_features, sequence
 	sequence_length: length of the inserted sequence, used to determine 
 		the new end positions in the GFF file.
 	'''
+	## Overwrite the first column (chromosome ID) of every in_gff line with the target chromosome ID
+	for l in in_gff_lines:
+		l[0] = chrom
 	# Handling of single-line comments
 	if len(in_gff_lines) == 1:
 		l = in_gff_lines[0]
@@ -334,7 +336,7 @@ def create_new_gff(new_gff_name, ref_gff, in_gff_lines, position, down_position,
 		ref_gff_path = ref_gff
 		if ref_gff.endswith('.gz'):
 			with gzip_module.open(ref_gff, 'rt') as f:
-						# Create a tempfile to store uncompressde content
+				## Create a tempfile to store uncompressed content
 				with tempfile.NamedTemporaryFile(delete=False, mode='w') as tmp_f:
 					tmp_f.write(f.read())
 					ref_gff_path = tmp_f.name
@@ -367,7 +369,7 @@ def create_new_gff(new_gff_name, ref_gff, in_gff_lines, position, down_position,
 						and gff_chrom_id != last_seen_chrom_id 
 						and not in_gff_lines_appended):
 						in_gff_lines_appended = write_in_gff_lines(
-							gff_out, in_gff_lines, position, split_features, new_seq_length)
+							gff_out, in_gff_lines, position, split_features, new_seq_length, chrom_id)
 
 					last_seen_chrom_id = gff_chrom_id
 
@@ -450,7 +452,7 @@ def create_new_gff(new_gff_name, ref_gff, in_gff_lines, position, down_position,
 					else:
 						if not in_gff_lines_appended:
 							in_gff_lines_appended = write_in_gff_lines(
-								gff_out, in_gff_lines, position, split_features, new_seq_length)
+								gff_out, in_gff_lines, position, split_features, new_seq_length, chrom_id)
 
 						# Change start position of feature to after cutoff point if
 						# the feature starts within the deletion
@@ -497,7 +499,7 @@ def create_new_gff(new_gff_name, ref_gff, in_gff_lines, position, down_position,
 				and last_seen_chrom_id == chrom_id
 				and not in_gff_lines_appended):
 				in_gff_lines_appended = write_in_gff_lines(
-					gff_out, in_gff_lines, position, split_features, new_seq_length)
+					gff_out, in_gff_lines, position, split_features, new_seq_length, chrom_id)
 
 			# Checking to ensure in_gff_lines written
 			if not in_gff_lines_appended:

diff --git a/test_data/14/gold.gtf b/test_data/14/gold.gtf
@@ -1,10 +1,10 @@
-X	new	exon	5	14	.	+	0	gene_id "new"; transcript_id "new.1";
+X	new	exon	5	14	.	+	0	gene_id "first"; transcript_id "new.1";
 X	ref	exon	15	23	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";reform_comment "5 prime side of feature cut-off by inserted sequence";reform_comment "original feature split by inserted sequence, this is the 5 prime end";
 X	ref	CDS	15	23	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";reform_comment "5 prime side of feature cut-off by inserted sequence";reform_comment "original feature split by inserted sequence, this is the 5 prime end";
-X	new	exon	24	33	.	+	0	gene_id "new"; transcript_id "new.1";
+X	new	exon	24	33	.	+	0	gene_id "second"; transcript_id "new.2";
 X	ref	exon	34	38	.	+	0	gene _id "ref_gene_split"; transcript_id "ref_gene.1";reform_comment "5 prime side of feature cut-off by inserted sequence";reform_comment "original feature split by inserted sequence, this is the 3 prime end";reform_comment "original feature split by inserted sequence, this is the 5 prime end";
 X	ref	CDS	34	37	.	+	0	gene _id "ref_gene_split"; transcript_id "ref_gene.1";reform_comment "5 prime side of feature cut-off by inserted sequence";reform_comment "original feature split by inserted sequence, this is the 3 prime end";
 X	ref	stop_codon	38	38	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";reform_comment "original feature split by inserted sequence, this is the 5 prime end";
-X	new	exon	39	48	.	+	0	gene_id "new"; transcript_id "new.1";
+X	new	exon	39	48	.	+	0	gene_id "third"; transcript_id "new.3";
 X	ref	exon	49	49	.	+	0	gene _id "ref_gene_split"; transcript_id "ref_gene.1";reform_comment "5 prime side of feature cut-off by inserted sequence";reform_comment "original feature split by inserted sequence, this is the 3 prime end";reform_comment "original feature split by inserted sequence, this is the 3 prime end";
 X	ref	stop_codon	49	49	.	+	0	gene _id "ref_gene_split"; transcript_id "ref_gene.1";reform_comment "original feature split by inserted sequence, this is the 3 prime end";
diff --git a/test_data/14/in1.gtf b/test_data/14/in1.gtf
@@ -1 +1 @@
-X	new	exon	1	10	.	+	0	gene_id "new"; transcript_id "new.1";
+X	new	exon	1	10	.	+	0	gene_id "first"; transcript_id "new.1";
diff --git a/test_data/14/in2.gtf b/test_data/14/in2.gtf
@@ -1 +1 @@
-X	new	exon	1	10	.	+	0	gene_id "new"; transcript_id "new.1";
+X	new	exon	1	10	.	+	0	gene_id "second"; transcript_id "new.2";
diff --git a/test_data/14/in3.gtf b/test_data/14/in3.gtf
@@ -1 +1 @@
-X	new	exon	1	10	.	+	0	gene_id "new"; transcript_id "new.1";
+X	new	exon	1	10	.	+	0	gene_id "third"; transcript_id "new.3";
diff --git a/test_data/15/gold.gtf b/test_data/15/gold.gtf
@@ -1,8 +1,8 @@
-X	new	exon	1	10	.	+	0	gene_id "new"; transcript_id "new.1";
+X	new	exon	1	10	.	+	0	gene_id "first"; transcript_id "new.1";
 X	ref	exon	15	15	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";reform_comment "original feature split by inserted sequence, this is the 5 prime end";
-X	new	exon	16	25	.	+	0	gene_id "new"; transcript_id "new.1";
+X	new	exon	16	25	.	+	0	gene_id "second"; transcript_id "new.2";
 X	ref	exon	26	45	.	+	0	gene _id "ref_gene_split"; transcript_id "ref_gene.1";reform_comment "original feature split by inserted sequence, this is the 3 prime end";
 X	ref	CDS	28	42	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
 X	ref	start_codon	15	15	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";reform_comment "original feature split by inserted sequence, this is the 5 prime end";
 X	ref	stop_codon	43	45	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
-X	new	exon	51	60	.	+	0	gene_id "new"; transcript_id "new.1";
+X	new	exon	51	60	.	+	0	gene_id "third"; transcript_id "new.3";
diff --git a/test_data/15/in1.gtf b/test_data/15/in1.gtf
@@ -1 +1 @@
-X	new	exon	1	10	.	+	0	gene_id "new"; transcript_id "new.1";
+X	new	exon	1	10	.	+	0	gene_id "first"; transcript_id "new.1";
diff --git a/test_data/15/in2.gtf b/test_data/15/in2.gtf
@@ -1 +1 @@
-X	new	exon	1	10	.	+	0	gene_id "new"; transcript_id "new.1";
+X	new	exon	1	10	.	+	0	gene_id "second"; transcript_id "new.2";
diff --git a/test_data/15/in3.gtf b/test_data/15/in3.gtf
@@ -1 +1 @@
-X	new	exon	1	10	.	+	0	gene_id "new"; transcript_id "new.1";
+X	new	exon	1	10	.	+	0	gene_id "third"; transcript_id "new.3";
diff --git a/test_data/16/gold.fa b/test_data/16/gold.fa
@@ -0,0 +1,2 @@
+>X
+XZZZABBBBBDDDDDCCCCCIIIIIKKKKK----------
diff --git a/test_data/16/gold.gtf b/test_data/16/gold.gtf
@@ -0,0 +1,5 @@
+X	ref	exon	5	25	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
+X	ref	CDS	8	22	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
+X	ref	start_codon	5	7	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
+X	ref	stop_codon	23	25	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
+X	new	exon	31	40	.	+	0	gene_id "new"; transcript_id "new.1";
diff --git a/test_data/16/in.fa b/test_data/16/in.fa
@@ -0,0 +1,2 @@
+>test 1
+----------
diff --git a/test_data/16/in.gtf b/test_data/16/in.gtf
@@ -0,0 +1 @@
+I	new	exon	1	10	.	+	0	gene_id "new"; transcript_id "new.1";
diff --git a/test_data/16/ref.fa b/test_data/16/ref.fa
@@ -0,0 +1,2 @@
+>X
+XZZZABBBBBDDDDDCCCCCIIIIIKKKKK
diff --git a/test_data/16/ref.gtf b/test_data/16/ref.gtf
@@ -0,0 +1,4 @@
+X	ref	exon	5	25	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
+X	ref	CDS	8	22	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
+X	ref	start_codon	5	7	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
+X	ref	stop_codon	23	25	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
diff --git a/test_reform.py b/test_reform.py
@@ -643,6 +643,46 @@ def test_case_15(self):
 		print("Done")
 
 		os.chdir(wd)
+
+	def test_case_16(self):
+		"""
+		Case 16:
+		Testing Reform with an in_gff whose chromosome ID does not match --chrom
+		"""
+
+		wd = os.getcwd()
+		os.chdir('test_data/16/')
+
+		command = """
+		python3 ../../reform.py \
+		--chrom="X" \
+		--in_fasta=in.fa \
+		--in_gff=in.gtf \
+		--ref_fasta=ref.fa \
+		--ref_gff=ref.gtf \
+		--position=-1
+		"""
+
+		response = subprocess.getoutput(command)
+		print(response)
+
+		with open('gold.gtf', 'r') as f:
+			gold_gff = f.read()
+		with open('ref_reformed.gtf', 'r') as f:
+			new_gff = f.read()
+		print("Testing gtf")
+		self.assertListEqual(list(gold_gff), list(new_gff))
+		print("Done")
+
+		with open('gold.fa', 'r') as f:
+			gold_fa = f.read()
+		with open('ref_reformed.fa', 'r') as f:
+			new_fa = f.read()
+		print("Testing Fasta")
+		self.assertListEqual(list(gold_fa), list(new_fa))
+		print("Done")
+
+		os.chdir(wd)
 
 if __name__ == '__main__':
     unittest.main()
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		X new exon 1 10 . + 0 gene_id "new"; transcript_id "new.1";
		X new exon 1 10 . + 0 gene_id "first"; transcript_id "new.1";
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		X new exon 1 10 . + 0 gene_id "new"; transcript_id "new.1";
		X new exon 1 10 . + 0 gene_id "second"; transcript_id "new.2";
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		X new exon 1 10 . + 0 gene_id "new"; transcript_id "new.1";
		X new exon 1 10 . + 0 gene_id "third"; transcript_id "new.3";
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		I new exon 1 10 . + 0 gene_id "new"; transcript_id "new.1";