-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathsequence.h
More file actions
214 lines (188 loc) · 5.73 KB
/
sequence.h
File metadata and controls
214 lines (188 loc) · 5.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
/*
Copyright (C) 2016,2017 BGI Research
Author: Shi Quan ([email protected])
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/
// sequence - a simple collection for handling sequences
//
#ifndef SEQUENCE_HEADER
#define SEQUENCE_HEADER
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
// When building with Microsoft's compiler (and not clang in MSVC mode), map the
// `inline` keyword used by the helpers below onto the vendor spelling `__inline`.
// NOTE(review): presumably for MSVC C modes that do not accept plain `inline` - confirm.
#if defined(_MSC_VER) && !defined(__clang__)
# define inline __inline
#endif
// Nucleotide codes. They are used in this header as indices into seqarr,
// revseqarr and (for the values 0..3) codon_matrix.
#define C4_A 0
#define C4_C 1
#define C4_G 2
#define C4_T 3
#define C4_U 3 // U shares the code of T
#define C4_N 4

// seqarr[code] is the letter for a nucleotide code; revseqarr[code] is the
// letter of its complement (N stays N).
#define seqarr "ACGTN"
#define revseqarr "TGCAN"

// Non-zero when the two nucleotide codes sum to 3, i.e. A/T(U) or C/G.
// Both arguments are parenthesised so that expressions such as
// SEQ_COMP(x & 3, y) or SEQ_COMP(c ? a : b, y) are evaluated as written.
// Each argument is evaluated exactly once.
#define SEQ_COMP(a,b) (((a) + (b)) == 3)
// Pointer to a function that receives a sequence and an unsigned long and
// returns a char *. rev_seqs() below has exactly this signature.
// NOTE(review): the name suggests the result is a newly allocated copy owned
// by the caller - confirm against the implementations.
typedef char * (*func_dup_seq)(const char *, unsigned long );
// Converts a sequence character to an integer code. Defined outside this
// header; here the result is used directly as an index into codon_matrix and
// revseqarr.
// NOTE(review): presumably returns one of the C4_A..C4_N values - confirm.
extern int seq2code4(int seq);
// Defined outside this header.
// NOTE(review): presumably returns the reversed (possibly complemented) form
// of the first n characters of dna_seqs - confirm, including who frees it.
extern char *rev_seqs(const char *dna_seqs, unsigned long n);
// Amino acid identifiers. These are the values stored in codon_matrix, and
// their numeric order matches the order of the entries in codon_names and
// codon_short_names. C4_Stop (0) stands for a stop codon.
#define C4_Stop 0
#define C4_Phe 1
#define C4_Leu 2
#define C4_Ser 3
#define C4_Tyr 4
#define C4_Cys 5
#define C4_Trp 6
#define C4_Pro 7
#define C4_His 8
#define C4_Gln 9
#define C4_Arg 10
#define C4_Ile 11
#define C4_Met 12
#define C4_Thr 13
#define C4_Asn 14
#define C4_Lys 15
#define C4_Val 16
#define C4_Ala 17
#define C4_Asp 18
#define C4_Glu 19
#define C4_Gly 20
// Three-letter amino acid names. Entry i is the name for amino acid id i,
// from "Stop" at index 0 up to "Gly" at index 20.
static const char *codon_names[] = {
    "Stop", //  0
    "Phe",  //  1
    "Leu",  //  2
    "Ser",  //  3
    "Tyr",  //  4
    "Cys",  //  5
    "Trp",  //  6
    "Pro",  //  7
    "His",  //  8
    "Gln",  //  9
    "Arg",  // 10
    "Ile",  // 11
    "Met",  // 12
    "Thr",  // 13
    "Asn",  // 14
    "Lys",  // 15
    "Val",  // 16
    "Ala",  // 17
    "Asp",  // 18
    "Glu",  // 19
    "Gly",  // 20
};
// One-letter amino acid codes, in the same order as codon_names.
// The stop codon at index 0 is written as "X".
static const char *codon_short_names[] = {
    "X", //  0  Stop
    "F", //  1  Phe
    "L", //  2  Leu
    "S", //  3  Ser
    "Y", //  4  Tyr
    "C", //  5  Cys
    "W", //  6  Trp
    "P", //  7  Pro
    "H", //  8  His
    "Q", //  9  Gln
    "R", // 10  Arg
    "I", // 11  Ile
    "M", // 12  Met
    "T", // 13  Thr
    "N", // 14  Asn
    "K", // 15  Lys
    "V", // 16  Val
    "A", // 17  Ala
    "D", // 18  Asp
    "E", // 19  Glu
    "G", // 20  Gly
};
// Codon translation table: codon_matrix[b1][b2][b3] is the amino acid id of
// the codon whose bases have the nucleotide codes b1, b2, b3.
// Each index runs over 0..3 in the order A, C, G, T; within a row the four
// entries are the third base A, C, G, T.
static const int codon_matrix[4][4][4] = {
    {   // first base A
        { C4_Lys,  C4_Asn, C4_Lys,  C4_Asn }, // AAx
        { C4_Thr,  C4_Thr, C4_Thr,  C4_Thr }, // ACx
        { C4_Arg,  C4_Ser, C4_Arg,  C4_Ser }, // AGx
        { C4_Ile,  C4_Ile, C4_Met,  C4_Ile }, // ATx
    },
    {   // first base C
        { C4_Gln,  C4_His, C4_Gln,  C4_His }, // CAx
        { C4_Pro,  C4_Pro, C4_Pro,  C4_Pro }, // CCx
        { C4_Arg,  C4_Arg, C4_Arg,  C4_Arg }, // CGx
        { C4_Leu,  C4_Leu, C4_Leu,  C4_Leu }, // CTx
    },
    {   // first base G
        { C4_Glu,  C4_Asp, C4_Glu,  C4_Asp }, // GAx
        { C4_Ala,  C4_Ala, C4_Ala,  C4_Ala }, // GCx
        { C4_Gly,  C4_Gly, C4_Gly,  C4_Gly }, // GGx
        { C4_Val,  C4_Val, C4_Val,  C4_Val }, // GTx
    },
    {   // first base T
        { C4_Stop, C4_Tyr, C4_Stop, C4_Tyr }, // TAx
        { C4_Ser,  C4_Ser, C4_Ser,  C4_Ser }, // TCx
        { C4_Stop, C4_Cys, C4_Trp,  C4_Cys }, // TGx
        { C4_Leu,  C4_Phe, C4_Leu,  C4_Phe }, // TTx
    },
};
// Looks up the amino acid id for the three bases codon[0], codon[1], codon[2].
// For speed the codon length is not checked: the caller must supply at least
// three readable characters.
// NOTE(review): codon_matrix is 4x4x4, so every seq2code4() result must be in
// 0..3. If seq2code4() can return C4_N (4) for an ambiguous base this reads
// out of bounds - confirm that callers filter such codons.
static inline int codon2aminoid(char *codon)
{
    const int base1 = seq2code4(codon[0]);
    const int base2 = seq2code4(codon[1]);
    const int base3 = seq2code4(codon[2]);

    return codon_matrix[base1][base2][base3];
}
// Variant classification.
// The numeric values double as indices into the name table inside
// var_type_string(), so enumerators and table entries must stay in the same
// order.
enum var_type {
    // Negative enumerator, so the enum must be able to hold signed values.
    _var_type_promoter_to_int = -1,
    var_is_unknown,             //  0
    var_is_reference,           //  1
    var_is_intron,              //  2
    var_is_noncoding,           //  3
    var_is_utr5,                //  4
    var_is_utr3,                //  5
    var_is_synonymous,          //  6
    var_is_missense,            //  7
    var_is_nonsense,            //  8, stop gain
    var_is_inframe_insertion,   //  9
    var_is_inframe_deletion,    // 10
    var_is_inframe_delins,      // 11
    var_is_frameshift,          // 12
    var_is_stop_lost,           // 13
    var_is_stop_retained,       // 14
    // Splice classifications are not part of this enum; see enum var_type_splice.
    var_is_complex,             // 15
    var_is_no_call,             // 16
};

// Splice classification, kept separate from enum var_type.
// The values index the name table inside var_type_splice_string().
enum var_type_splice {
    var_is_not_splice = 0,
    var_is_splice_site,
    var_is_splice_donor,
    var_is_splice_acceptor,
};

// Returns the display name of a variant type, e.g. "Missense" for
// var_is_missense.
// A negative value trips the assert in debug builds. In every build, any value
// outside var_is_unknown..var_is_no_call yields NULL instead of reading past
// the table.
static inline const char *var_type_string(enum var_type type)
{
    static const char *const names[] = {
        "Unknown",
        "Reference",
        "Intron",
        "Noncoding",
        "Utr5",
        "Utr3",
        "Synonymous",
        "Missense",
        "Nonsense",
        "InframeInsertion",
        "InframeDeletion",
        "InframeDelins",
        "Frameshift",
        "StopLost",
        "StopRetained",
        "Complex",
        "NoCall",
    };
    const int count = (int)(sizeof names / sizeof names[0]);
    const int idx = (int)type;

    assert(idx >= 0);
    // The explicit range test keeps the lookup safe when NDEBUG removes the assert.
    if ( idx < 0 || idx >= count ) {
        return NULL;
    }
    return names[idx];
}

// Returns the display name of a splice type, e.g. "SpliceDonor" for
// var_is_splice_donor.
// The value is converted to int before the range test because the enum has no
// negative enumerator and may therefore be unsigned. Values outside
// var_is_not_splice..var_is_splice_acceptor yield NULL.
static inline const char *var_type_splice_string(enum var_type_splice type)
{
    static const char *const names[] = {
        "NotSplice",
        "SpliceSite",
        "SpliceDonor",
        "SpliceAcceptor",
    };
    const int count = (int)(sizeof names / sizeof names[0]);
    const int idx = (int)type;

    assert(idx >= 0);
    if ( idx < 0 || idx >= count ) {
        return NULL;
    }
    return names[idx];
}
// Reports whether the three bases codon[0..2] form a stop codon:
// 1 on yes, 0 on no. The codon length is not checked.
static inline int check_is_stop(char *codon)
{
    const int base1 = seq2code4(codon[0]);
    const int base2 = seq2code4(codon[1]);
    const int base3 = seq2code4(codon[2]);

    if ( codon_matrix[base1][base2][base3] == C4_Stop ) {
        return 1;
    }
    return 0;
}
// Reverse-complements the first l characters of seq in place: the characters
// are swapped end for end, and each one is replaced by revseqarr[] at the
// index given by seq2code4().
//   seq  buffer to rewrite; must hold at least l characters
//   l    number of characters to process; nothing is done when l <= 0
// The l <= 0 guard matters for negative odd lengths: they skip the swap loop
// but would still pass the "odd length" test and touch seq[l/2].
static inline void compl_seq(char *seq, int l)
{
    int head;
    int tail;

    if ( l <= 0 ) {
        return;
    }

    head = 0;
    tail = l - 1;
    while ( head < tail ) {
        // Complement both ends, then store them crosswise.
        char c = revseqarr[seq2code4(seq[head])];
        seq[head] = revseqarr[seq2code4(seq[tail])];
        seq[tail] = c;
        head++;
        tail--;
    }
    if ( head == tail ) {
        // Odd length: the middle character keeps its place and is only complemented.
        seq[head] = revseqarr[seq2code4(seq[head])];
    }
}
// The three functions below are defined outside this header; only their
// signatures are visible here.

// NOTE(review): presumably compares the first l characters of a and b as DNA
// sequences - confirm the return convention (match vs. mismatch value).
extern int same_DNA_seqs(const char *a, const char *b, int l );
// NOTE(review): presumably looks for a stop codon between seq and p_end -
// confirm whether p_end is inclusive and what the return value encodes.
extern int check_stop_codon(char *seq, char *p_end);
// Classifies a variant and returns one of the enum var_type values.
// NOTE(review): block/ref/alt are each passed with an explicit length, so they
// are presumably not required to be NUL-terminated; the meaning and base of
// `start` are not visible here - confirm against the implementation.
extern enum var_type check_var_type(char *block, int block_length, int start, char *ref, int ref_length, char *alt, int alt_length );
#endif