git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/SICtools@109238 bc3139a8-67e5-0310-9ffc-ced21a209358
| 1 | 1 |
new file mode 100644 |
| ... | ... |
@@ -0,0 +1,489 @@ |
| 1 |
+#include <zlib.h> |
|
| 2 |
+#include <stdio.h> |
|
| 3 |
+#include <ctype.h> |
|
| 4 |
+#include <string.h> |
|
| 5 |
+#include <stdlib.h> |
|
| 6 |
+#include <unistd.h> |
|
| 7 |
+#include <assert.h> |
|
| 8 |
+#ifdef _WIN32 |
|
| 9 |
+#include <fcntl.h> |
|
| 10 |
+#endif |
|
| 11 |
+#include "kstring.h" |
|
| 12 |
+#include "bam.h" |
|
| 13 |
+#include "sam_header.h" |
|
| 14 |
+#include "kseq.h" |
|
| 15 |
+#include "khash.h" |
|
| 16 |
+ |
|
| 17 |
+KSTREAM_INIT(gzFile, gzread, 16384) |
|
| 18 |
+KHASH_MAP_INIT_STR(ref, uint64_t) |
|
| 19 |
+ |
|
| 20 |
+void bam_init_header_hash(bam_header_t *header); |
|
| 21 |
+void bam_destroy_header_hash(bam_header_t *header); |
|
| 22 |
+int32_t bam_get_tid(const bam_header_t *header, const char *seq_name); |
|
| 23 |
+ |
|
| 24 |
/* Maps an input byte to a 4-bit base code.  Unlisted bytes map to 15; the
 * digits '0'..'3' map to 1, 2, 4 and 8; '=' maps to 0; letters are handled
 * case-insensitively (rows 0x40 and 0x60 are identical, as are 0x50 and 0x70).
 * Index with an unsigned value in 0..255. */
unsigned char bam_nt16_table[256] = {
	/* 0x00 */ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
	/* 0x10 */ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
	/* 0x20 */ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
	/* 0x30: '0'..'3', then '=' at 0x3d */
	            1,  2,  4,  8, 15, 15, 15, 15, 15, 15, 15, 15, 15,  0, 15, 15,
	/* 0x40: '@', 'A'..'O' */
	           15,  1, 14,  2, 13, 15, 15,  4, 11, 15, 15, 12, 15,  3, 15, 15,
	/* 0x50: 'P'..'Z' */
	           15, 15,  5,  6,  8, 15,  7,  9, 15, 10, 15, 15, 15, 15, 15, 15,
	/* 0x60: '`', 'a'..'o' */
	           15,  1, 14,  2, 13, 15, 15,  4, 11, 15, 15, 12, 15,  3, 15, 15,
	/* 0x70: 'p'..'z' */
	           15, 15,  5,  6,  8, 15,  7,  9, 15, 10, 15, 15, 15, 15, 15, 15,
	/* 0x80 */ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
	/* 0x90 */ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
	/* 0xa0 */ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
	/* 0xb0 */ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
	/* 0xc0 */ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
	/* 0xd0 */ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
	/* 0xe0 */ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
	/* 0xf0 */ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15
};
|
| 42 |
+ |
|
| 43 |
+unsigned short bam_char2flag_table[256] = {
|
|
| 44 |
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, |
|
| 45 |
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, |
|
| 46 |
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, |
|
| 47 |
+ 0,BAM_FREAD1,BAM_FREAD2,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, |
|
| 48 |
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, |
|
| 49 |
+ BAM_FPROPER_PAIR,0,BAM_FMREVERSE,0, 0,BAM_FMUNMAP,0,0, 0,0,0,0, 0,0,0,0, |
|
| 50 |
+ 0,0,0,0, BAM_FDUP,0,BAM_FQCFAIL,0, 0,0,0,0, 0,0,0,0, |
|
| 51 |
+ BAM_FPAIRED,0,BAM_FREVERSE,BAM_FSECONDARY, 0,BAM_FUNMAP,0,0, 0,0,0,0, 0,0,0,0, |
|
| 52 |
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, |
|
| 53 |
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, |
|
| 54 |
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, |
|
| 55 |
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, |
|
| 56 |
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, |
|
| 57 |
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, |
|
| 58 |
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, |
|
| 59 |
+ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0 |
|
| 60 |
+}; |
|
| 61 |
+ |
|
| 62 |
/* Inverse of bam_nt16_table: the character at index i is the symbol for
 * 4-bit base code i (0 is '=', 1 is 'A', 2 is 'C', 4 is 'G', 8 is 'T',
 * 15 is 'N'). */
char *bam_nt16_rev_table = "=ACMGRSVTWYHKDBN";
|
| 63 |
+ |
|
| 64 |
+struct __tamFile_t {
|
|
| 65 |
+ gzFile fp; |
|
| 66 |
+ kstream_t *ks; |
|
| 67 |
+ kstring_t *str; |
|
| 68 |
+ uint64_t n_lines; |
|
| 69 |
+ int is_first; |
|
| 70 |
+}; |
|
| 71 |
+ |
|
| 72 |
+char **__bam_get_lines(const char *fn, int *_n) // for bam_plcmd.c only |
|
| 73 |
+{
|
|
| 74 |
+ char **list = 0, *s; |
|
| 75 |
+ int n = 0, dret, m = 0; |
|
| 76 |
+ gzFile fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r"); |
|
| 77 |
+ kstream_t *ks; |
|
| 78 |
+ kstring_t *str; |
|
| 79 |
+ str = (kstring_t*)calloc(1, sizeof(kstring_t)); |
|
| 80 |
+ ks = ks_init(fp); |
|
| 81 |
+ while (ks_getuntil(ks, '\n', str, &dret) > 0) {
|
|
| 82 |
+ if (n == m) {
|
|
| 83 |
+ m = m? m << 1 : 16; |
|
| 84 |
+ list = (char**)realloc(list, m * sizeof(char*)); |
|
| 85 |
+ } |
|
| 86 |
+ if (str->s[str->l-1] == '\r') |
|
| 87 |
+ str->s[--str->l] = '\0'; |
|
| 88 |
+ s = list[n++] = (char*)calloc(str->l + 1, 1); |
|
| 89 |
+ strcpy(s, str->s); |
|
| 90 |
+ } |
|
| 91 |
+ ks_destroy(ks); |
|
| 92 |
+ gzclose(fp); |
|
| 93 |
+ free(str->s); free(str); |
|
| 94 |
+ *_n = n; |
|
| 95 |
+ return list; |
|
| 96 |
+} |
|
| 97 |
+ |
|
| 98 |
+static bam_header_t *hash2header(const kh_ref_t *hash) |
|
| 99 |
+{
|
|
| 100 |
+ bam_header_t *header; |
|
| 101 |
+ khiter_t k; |
|
| 102 |
+ header = bam_header_init(); |
|
| 103 |
+ header->n_targets = kh_size(hash); |
|
| 104 |
+ header->target_name = (char**)calloc(kh_size(hash), sizeof(char*)); |
|
| 105 |
+ header->target_len = (uint32_t*)calloc(kh_size(hash), 4); |
|
| 106 |
+ for (k = kh_begin(hash); k != kh_end(hash); ++k) {
|
|
| 107 |
+ if (kh_exist(hash, k)) {
|
|
| 108 |
+ int i = (int)kh_value(hash, k); |
|
| 109 |
+ header->target_name[i] = (char*)kh_key(hash, k); |
|
| 110 |
+ header->target_len[i] = kh_value(hash, k)>>32; |
|
| 111 |
+ } |
|
| 112 |
+ } |
|
| 113 |
+ bam_init_header_hash(header); |
|
| 114 |
+ return header; |
|
| 115 |
+} |
|
| 116 |
+bam_header_t *sam_header_read2(const char *fn) |
|
| 117 |
+{
|
|
| 118 |
+ bam_header_t *header; |
|
| 119 |
+ int c, dret, ret, error = 0; |
|
| 120 |
+ gzFile fp; |
|
| 121 |
+ kstream_t *ks; |
|
| 122 |
+ kstring_t *str; |
|
| 123 |
+ kh_ref_t *hash; |
|
| 124 |
+ khiter_t k; |
|
| 125 |
+ if (fn == 0) return 0; |
|
| 126 |
+ fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r"); |
|
| 127 |
+ if (fp == 0) return 0; |
|
| 128 |
+ hash = kh_init(ref); |
|
| 129 |
+ ks = ks_init(fp); |
|
| 130 |
+ str = (kstring_t*)calloc(1, sizeof(kstring_t)); |
|
| 131 |
+ while (ks_getuntil(ks, 0, str, &dret) > 0) {
|
|
| 132 |
+ char *s = strdup(str->s); |
|
| 133 |
+ int len, i; |
|
| 134 |
+ i = kh_size(hash); |
|
| 135 |
+ ks_getuntil(ks, 0, str, &dret); |
|
| 136 |
+ len = atoi(str->s); |
|
| 137 |
+ k = kh_put(ref, hash, s, &ret); |
|
| 138 |
+ if (ret == 0) {
|
|
| 139 |
+ fprintf(stderr, "[sam_header_read2] duplicated sequence name: %s\n", s); |
|
| 140 |
+ error = 1; |
|
| 141 |
+ } |
|
| 142 |
+ kh_value(hash, k) = (uint64_t)len<<32 | i; |
|
| 143 |
+ if (dret != '\n') |
|
| 144 |
+ while ((c = ks_getc(ks)) != '\n' && c != -1); |
|
| 145 |
+ } |
|
| 146 |
+ ks_destroy(ks); |
|
| 147 |
+ gzclose(fp); |
|
| 148 |
+ free(str->s); free(str); |
|
| 149 |
+ fprintf(stderr, "[sam_header_read2] %d sequences loaded.\n", kh_size(hash)); |
|
| 150 |
+ if (error) return 0; |
|
| 151 |
+ header = hash2header(hash); |
|
| 152 |
+ kh_destroy(ref, hash); |
|
| 153 |
+ return header; |
|
| 154 |
+} |
|
| 155 |
+static inline uint8_t *alloc_data(bam1_t *b, int size) |
|
| 156 |
+{
|
|
| 157 |
+ if (b->m_data < size) {
|
|
| 158 |
+ b->m_data = size; |
|
| 159 |
+ kroundup32(b->m_data); |
|
| 160 |
+ b->data = (uint8_t*)realloc(b->data, b->m_data); |
|
| 161 |
+ } |
|
| 162 |
+ return b->data; |
|
| 163 |
+} |
|
| 164 |
/**
 * Report an unrecoverable parse failure on stderr and abort the process.
 * This function does not return.
 *
 * @param n_lines  line number to report
 * @param msg      description of the problem
 */
static inline void parse_error(int64_t n_lines, const char * __restrict msg)
{
	const long long line_no = (long long)n_lines;

	fprintf(stderr, "Parse error at line %lld: %s\n", line_no, msg);
	abort();
}
|
| 169 |
+static inline void append_text(bam_header_t *header, kstring_t *str) |
|
| 170 |
+{
|
|
| 171 |
+ size_t x = header->l_text, y = header->l_text + str->l + 2; // 2 = 1 byte dret + 1 byte null |
|
| 172 |
+ kroundup32(x); kroundup32(y); |
|
| 173 |
+ if (x < y) |
|
| 174 |
+ {
|
|
| 175 |
+ header->n_text = y; |
|
| 176 |
+ header->text = (char*)realloc(header->text, y); |
|
| 177 |
+ if ( !header->text ) |
|
| 178 |
+ {
|
|
| 179 |
+ fprintf(stderr,"realloc failed to alloc %ld bytes\n", y); |
|
| 180 |
+ abort(); |
|
| 181 |
+ } |
|
| 182 |
+ } |
|
| 183 |
+ // Sanity check |
|
| 184 |
+ if ( header->l_text+str->l+1 >= header->n_text ) |
|
| 185 |
+ {
|
|
| 186 |
+ fprintf(stderr,"append_text FIXME: %ld>=%ld, x=%ld,y=%ld\n", header->l_text+str->l+1,(long)header->n_text,x,y); |
|
| 187 |
+ abort(); |
|
| 188 |
+ } |
|
| 189 |
+ strncpy(header->text + header->l_text, str->s, str->l+1); // we cannot use strcpy() here. |
|
| 190 |
+ header->l_text += str->l + 1; |
|
| 191 |
+ header->text[header->l_text] = 0; |
|
| 192 |
+} |
|
| 193 |
+ |
|
| 194 |
+int sam_header_parse(bam_header_t *h) |
|
| 195 |
+{
|
|
| 196 |
+ char **tmp; |
|
| 197 |
+ int i; |
|
| 198 |
+ free(h->target_len); free(h->target_name); |
|
| 199 |
+ h->n_targets = 0; h->target_len = 0; h->target_name = 0; |
|
| 200 |
+ if (h->l_text < 3) return 0; |
|
| 201 |
+ if (h->dict == 0) h->dict = sam_header_parse2(h->text); |
|
| 202 |
+ tmp = sam_header2list(h->dict, "SQ", "SN", &h->n_targets); |
|
| 203 |
+ if (h->n_targets == 0) return 0; |
|
| 204 |
+ h->target_name = calloc(h->n_targets, sizeof(void*)); |
|
| 205 |
+ for (i = 0; i < h->n_targets; ++i) |
|
| 206 |
+ h->target_name[i] = strdup(tmp[i]); |
|
| 207 |
+ free(tmp); |
|
| 208 |
+ tmp = sam_header2list(h->dict, "SQ", "LN", &h->n_targets); |
|
| 209 |
+ h->target_len = calloc(h->n_targets, 4); |
|
| 210 |
+ for (i = 0; i < h->n_targets; ++i) |
|
| 211 |
+ h->target_len[i] = atoi(tmp[i]); |
|
| 212 |
+ free(tmp); |
|
| 213 |
+ return h->n_targets; |
|
| 214 |
+} |
|
| 215 |
+ |
|
| 216 |
+bam_header_t *sam_header_read(tamFile fp) |
|
| 217 |
+{
|
|
| 218 |
+ int ret, dret; |
|
| 219 |
+ bam_header_t *header = bam_header_init(); |
|
| 220 |
+ kstring_t *str = fp->str; |
|
| 221 |
+ while ((ret = ks_getuntil(fp->ks, KS_SEP_TAB, str, &dret)) >= 0 && str->s[0] == '@') { // skip header
|
|
| 222 |
+ str->s[str->l] = dret; // note that str->s is NOT null terminated!! |
|
| 223 |
+ append_text(header, str); |
|
| 224 |
+ if (dret != '\n') {
|
|
| 225 |
+ ret = ks_getuntil(fp->ks, '\n', str, &dret); |
|
| 226 |
+ str->s[str->l] = '\n'; // NOT null terminated!! |
|
| 227 |
+ append_text(header, str); |
|
| 228 |
+ } |
|
| 229 |
+ ++fp->n_lines; |
|
| 230 |
+ } |
|
| 231 |
+ sam_header_parse(header); |
|
| 232 |
+ bam_init_header_hash(header); |
|
| 233 |
+ fp->is_first = 1; |
|
| 234 |
+ return header; |
|
| 235 |
+} |
|
| 236 |
+ |
|
| 237 |
+int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b) |
|
| 238 |
+{
|
|
| 239 |
+ int ret, doff, doff0, dret, z = 0; |
|
| 240 |
+ bam1_core_t *c = &b->core; |
|
| 241 |
+ kstring_t *str = fp->str; |
|
| 242 |
+ kstream_t *ks = fp->ks; |
|
| 243 |
+ |
|
| 244 |
+ if (fp->is_first) {
|
|
| 245 |
+ fp->is_first = 0; |
|
| 246 |
+ ret = str->l; |
|
| 247 |
+ } else {
|
|
| 248 |
+ do { // special consideration for empty lines
|
|
| 249 |
+ ret = ks_getuntil(fp->ks, KS_SEP_TAB, str, &dret); |
|
| 250 |
+ if (ret >= 0) z += str->l + 1; |
|
| 251 |
+ } while (ret == 0); |
|
| 252 |
+ } |
|
| 253 |
+ if (ret < 0) return -1; |
|
| 254 |
+ ++fp->n_lines; |
|
| 255 |
+ doff = 0; |
|
| 256 |
+ |
|
| 257 |
+ { // name
|
|
| 258 |
+ c->l_qname = strlen(str->s) + 1; |
|
| 259 |
+ memcpy(alloc_data(b, doff + c->l_qname) + doff, str->s, c->l_qname); |
|
| 260 |
+ doff += c->l_qname; |
|
| 261 |
+ } |
|
| 262 |
+ { // flag
|
|
| 263 |
+ long flag; |
|
| 264 |
+ char *s; |
|
| 265 |
+ ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; |
|
| 266 |
+ flag = strtol((char*)str->s, &s, 0); |
|
| 267 |
+ if (*s) { // not the end of the string
|
|
| 268 |
+ flag = 0; |
|
| 269 |
+ for (s = str->s; *s; ++s) |
|
| 270 |
+ flag |= bam_char2flag_table[(int)*s]; |
|
| 271 |
+ } |
|
| 272 |
+ c->flag = flag; |
|
| 273 |
+ } |
|
| 274 |
+ { // tid, pos, qual
|
|
| 275 |
+ ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->tid = bam_get_tid(header, str->s); |
|
| 276 |
+ if (c->tid < 0 && strcmp(str->s, "*")) {
|
|
| 277 |
+ if (header->n_targets == 0) {
|
|
| 278 |
+ fprintf(stderr, "[sam_read1] missing header? Abort!\n"); |
|
| 279 |
+ exit(1); |
|
| 280 |
+ } else fprintf(stderr, "[sam_read1] reference '%s' is recognized as '*'.\n", str->s); |
|
| 281 |
+ } |
|
| 282 |
+ ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->pos = isdigit(str->s[0])? atoi(str->s) - 1 : -1; |
|
| 283 |
+ ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->qual = isdigit(str->s[0])? atoi(str->s) : 0; |
|
| 284 |
+ if (ret < 0) return -2; |
|
| 285 |
+ } |
|
| 286 |
+ { // cigar
|
|
| 287 |
+ char *s, *t; |
|
| 288 |
+ int i, op; |
|
| 289 |
+ long x; |
|
| 290 |
+ c->n_cigar = 0; |
|
| 291 |
+ if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -3; |
|
| 292 |
+ z += str->l + 1; |
|
| 293 |
+ if (str->s[0] != '*') {
|
|
| 294 |
+ uint32_t *cigar; |
|
| 295 |
+ for (s = str->s; *s; ++s) {
|
|
| 296 |
+ if ((isalpha(*s)) || (*s=='=')) ++c->n_cigar; |
|
| 297 |
+ else if (!isdigit(*s)) parse_error(fp->n_lines, "invalid CIGAR character"); |
|
| 298 |
+ } |
|
| 299 |
+ b->data = alloc_data(b, doff + c->n_cigar * 4); |
|
| 300 |
+ cigar = bam1_cigar(b); |
|
| 301 |
+ for (i = 0, s = str->s; i != c->n_cigar; ++i) {
|
|
| 302 |
+ x = strtol(s, &t, 10); |
|
| 303 |
+ op = toupper(*t); |
|
| 304 |
+ if (op == 'M') op = BAM_CMATCH; |
|
| 305 |
+ else if (op == 'I') op = BAM_CINS; |
|
| 306 |
+ else if (op == 'D') op = BAM_CDEL; |
|
| 307 |
+ else if (op == 'N') op = BAM_CREF_SKIP; |
|
| 308 |
+ else if (op == 'S') op = BAM_CSOFT_CLIP; |
|
| 309 |
+ else if (op == 'H') op = BAM_CHARD_CLIP; |
|
| 310 |
+ else if (op == 'P') op = BAM_CPAD; |
|
| 311 |
+ else if (op == '=') op = BAM_CEQUAL; |
|
| 312 |
+ else if (op == 'X') op = BAM_CDIFF; |
|
| 313 |
+ else if (op == 'B') op = BAM_CBACK; |
|
| 314 |
+ else parse_error(fp->n_lines, "invalid CIGAR operation"); |
|
| 315 |
+ s = t + 1; |
|
| 316 |
+ cigar[i] = bam_cigar_gen(x, op); |
|
| 317 |
+ } |
|
| 318 |
+ if (*s) parse_error(fp->n_lines, "unmatched CIGAR operation"); |
|
| 319 |
+ c->bin = bam_reg2bin(c->pos, bam_calend(c, cigar)); |
|
| 320 |
+ doff += c->n_cigar * 4; |
|
| 321 |
+ } else {
|
|
| 322 |
+ if (!(c->flag&BAM_FUNMAP)) {
|
|
| 323 |
+ fprintf(stderr, "Parse warning at line %lld: mapped sequence without CIGAR\n", (long long)fp->n_lines); |
|
| 324 |
+ c->flag |= BAM_FUNMAP; |
|
| 325 |
+ } |
|
| 326 |
+ c->bin = bam_reg2bin(c->pos, c->pos + 1); |
|
| 327 |
+ } |
|
| 328 |
+ } |
|
| 329 |
+ { // mtid, mpos, isize
|
|
| 330 |
+ ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; |
|
| 331 |
+ c->mtid = strcmp(str->s, "=")? bam_get_tid(header, str->s) : c->tid; |
|
| 332 |
+ ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; |
|
| 333 |
+ c->mpos = isdigit(str->s[0])? atoi(str->s) - 1 : -1; |
|
| 334 |
+ ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; |
|
| 335 |
+ c->isize = (str->s[0] == '-' || isdigit(str->s[0]))? atoi(str->s) : 0; |
|
| 336 |
+ if (ret < 0) return -4; |
|
| 337 |
+ } |
|
| 338 |
+ { // seq and qual
|
|
| 339 |
+ int i; |
|
| 340 |
+ uint8_t *p = 0; |
|
| 341 |
+ if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -5; // seq |
|
| 342 |
+ z += str->l + 1; |
|
| 343 |
+ if (strcmp(str->s, "*")) {
|
|
| 344 |
+ c->l_qseq = strlen(str->s); |
|
| 345 |
+ if (c->n_cigar && c->l_qseq != (int32_t)bam_cigar2qlen(c, bam1_cigar(b))) {
|
|
| 346 |
+ fprintf(stderr, "Line %ld, sequence length %i vs %i from CIGAR\n", |
|
| 347 |
+ (long)fp->n_lines, c->l_qseq, (int32_t)bam_cigar2qlen(c, bam1_cigar(b))); |
|
| 348 |
+ parse_error(fp->n_lines, "CIGAR and sequence length are inconsistent"); |
|
| 349 |
+ } |
|
| 350 |
+ p = (uint8_t*)alloc_data(b, doff + c->l_qseq + (c->l_qseq+1)/2) + doff; |
|
| 351 |
+ memset(p, 0, (c->l_qseq+1)/2); |
|
| 352 |
+ for (i = 0; i < c->l_qseq; ++i) |
|
| 353 |
+ p[i/2] |= bam_nt16_table[(int)str->s[i]] << 4*(1-i%2); |
|
| 354 |
+ } else c->l_qseq = 0; |
|
| 355 |
+ if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -6; // qual |
|
| 356 |
+ z += str->l + 1; |
|
| 357 |
+ if (strcmp(str->s, "*") && c->l_qseq != strlen(str->s)) |
|
| 358 |
+ parse_error(fp->n_lines, "sequence and quality are inconsistent"); |
|
| 359 |
+ p += (c->l_qseq+1)/2; |
|
| 360 |
+ if (strcmp(str->s, "*") == 0) for (i = 0; i < c->l_qseq; ++i) p[i] = 0xff; |
|
| 361 |
+ else for (i = 0; i < c->l_qseq; ++i) p[i] = str->s[i] - 33; |
|
| 362 |
+ doff += c->l_qseq + (c->l_qseq+1)/2; |
|
| 363 |
+ } |
|
| 364 |
+ doff0 = doff; |
|
| 365 |
+ if (dret != '\n' && dret != '\r') { // aux
|
|
| 366 |
+ while (ks_getuntil(ks, KS_SEP_TAB, str, &dret) >= 0) {
|
|
| 367 |
+ uint8_t *s, type, key[2]; |
|
| 368 |
+ z += str->l + 1; |
|
| 369 |
+ if (str->l < 6 || str->s[2] != ':' || str->s[4] != ':') |
|
| 370 |
+ parse_error(fp->n_lines, "missing colon in auxiliary data"); |
|
| 371 |
+ key[0] = str->s[0]; key[1] = str->s[1]; |
|
| 372 |
+ type = str->s[3]; |
|
| 373 |
+ s = alloc_data(b, doff + 3) + doff; |
|
| 374 |
+ s[0] = key[0]; s[1] = key[1]; s += 2; doff += 2; |
|
| 375 |
+ if (type == 'A' || type == 'a' || type == 'c' || type == 'C') { // c and C for backward compatibility
|
|
| 376 |
+ s = alloc_data(b, doff + 2) + doff; |
|
| 377 |
+ *s++ = 'A'; *s = str->s[5]; |
|
| 378 |
+ doff += 2; |
|
| 379 |
+ } else if (type == 'I' || type == 'i') {
|
|
| 380 |
+ long long x; |
|
| 381 |
+ s = alloc_data(b, doff + 5) + doff; |
|
| 382 |
+ x = (long long)atoll(str->s + 5); |
|
| 383 |
+ if (x < 0) {
|
|
| 384 |
+ if (x >= -127) {
|
|
| 385 |
+ *s++ = 'c'; *(int8_t*)s = (int8_t)x; |
|
| 386 |
+ s += 1; doff += 2; |
|
| 387 |
+ } else if (x >= -32767) {
|
|
| 388 |
+ *s++ = 's'; *(int16_t*)s = (int16_t)x; |
|
| 389 |
+ s += 2; doff += 3; |
|
| 390 |
+ } else {
|
|
| 391 |
+ *s++ = 'i'; *(int32_t*)s = (int32_t)x; |
|
| 392 |
+ s += 4; doff += 5; |
|
| 393 |
+ if (x < -2147483648ll) |
|
| 394 |
+ fprintf(stderr, "Parse warning at line %lld: integer %lld is out of range.", |
|
| 395 |
+ (long long)fp->n_lines, x); |
|
| 396 |
+ } |
|
| 397 |
+ } else {
|
|
| 398 |
+ if (x <= 255) {
|
|
| 399 |
+ *s++ = 'C'; *s++ = (uint8_t)x; |
|
| 400 |
+ doff += 2; |
|
| 401 |
+ } else if (x <= 65535) {
|
|
| 402 |
+ *s++ = 'S'; *(uint16_t*)s = (uint16_t)x; |
|
| 403 |
+ s += 2; doff += 3; |
|
| 404 |
+ } else {
|
|
| 405 |
+ *s++ = 'I'; *(uint32_t*)s = (uint32_t)x; |
|
| 406 |
+ s += 4; doff += 5; |
|
| 407 |
+ if (x > 4294967295ll) |
|
| 408 |
+ fprintf(stderr, "Parse warning at line %lld: integer %lld is out of range.", |
|
| 409 |
+ (long long)fp->n_lines, x); |
|
| 410 |
+ } |
|
| 411 |
+ } |
|
| 412 |
+ } else if (type == 'f') {
|
|
| 413 |
+ s = alloc_data(b, doff + 5) + doff; |
|
| 414 |
+ *s++ = 'f'; |
|
| 415 |
+ *(float*)s = (float)atof(str->s + 5); |
|
| 416 |
+ s += 4; doff += 5; |
|
| 417 |
+ } else if (type == 'd') {
|
|
| 418 |
+ s = alloc_data(b, doff + 9) + doff; |
|
| 419 |
+ *s++ = 'd'; |
|
| 420 |
+ *(float*)s = (float)atof(str->s + 9); |
|
| 421 |
+ s += 8; doff += 9; |
|
| 422 |
+ } else if (type == 'Z' || type == 'H') {
|
|
| 423 |
+ int size = 1 + (str->l - 5) + 1; |
|
| 424 |
+ if (type == 'H') { // check whether the hex string is valid
|
|
| 425 |
+ int i; |
|
| 426 |
+ if ((str->l - 5) % 2 == 1) parse_error(fp->n_lines, "length of the hex string not even"); |
|
| 427 |
+ for (i = 0; i < str->l - 5; ++i) {
|
|
| 428 |
+ int c = toupper(str->s[5 + i]); |
|
| 429 |
+ if (!((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F'))) |
|
| 430 |
+ parse_error(fp->n_lines, "invalid hex character"); |
|
| 431 |
+ } |
|
| 432 |
+ } |
|
| 433 |
+ s = alloc_data(b, doff + size) + doff; |
|
| 434 |
+ *s++ = type; |
|
| 435 |
+ memcpy(s, str->s + 5, str->l - 5); |
|
| 436 |
+ s[str->l - 5] = 0; |
|
| 437 |
+ doff += size; |
|
| 438 |
+ } else if (type == 'B') {
|
|
| 439 |
+ int32_t n = 0, Bsize, k = 0, size; |
|
| 440 |
+ char *p; |
|
| 441 |
+ if (str->l < 8) parse_error(fp->n_lines, "too few values in aux type B"); |
|
| 442 |
+ Bsize = bam_aux_type2size(str->s[5]); // the size of each element |
|
| 443 |
+ for (p = (char*)str->s + 6; *p; ++p) // count the number of elements in the array |
|
| 444 |
+ if (*p == ',') ++n; |
|
| 445 |
+ p = str->s + 7; // now p points to the first number in the array |
|
| 446 |
+ size = 6 + Bsize * n; // total number of bytes allocated to this tag |
|
| 447 |
+ s = alloc_data(b, doff + 6 * Bsize * n) + doff; // allocate memory |
|
| 448 |
+ *s++ = 'B'; *s++ = str->s[5]; |
|
| 449 |
+ memcpy(s, &n, 4); s += 4; // write the number of elements |
|
| 450 |
+ if (str->s[5] == 'c') while (p < str->s + str->l) ((int8_t*)s)[k++] = (int8_t)strtol(p, &p, 0), ++p; |
|
| 451 |
+ else if (str->s[5] == 'C') while (p < str->s + str->l) ((uint8_t*)s)[k++] = (uint8_t)strtol(p, &p, 0), ++p; |
|
| 452 |
+ else if (str->s[5] == 's') while (p < str->s + str->l) ((int16_t*)s)[k++] = (int16_t)strtol(p, &p, 0), ++p; // FIXME: avoid unaligned memory |
|
| 453 |
+ else if (str->s[5] == 'S') while (p < str->s + str->l) ((uint16_t*)s)[k++] = (uint16_t)strtol(p, &p, 0), ++p; |
|
| 454 |
+ else if (str->s[5] == 'i') while (p < str->s + str->l) ((int32_t*)s)[k++] = (int32_t)strtol(p, &p, 0), ++p; |
|
| 455 |
+ else if (str->s[5] == 'I') while (p < str->s + str->l) ((uint32_t*)s)[k++] = (uint32_t)strtol(p, &p, 0), ++p; |
|
| 456 |
+ else if (str->s[5] == 'f') while (p < str->s + str->l) ((float*)s)[k++] = (float)strtod(p, &p), ++p; |
|
| 457 |
+ else parse_error(fp->n_lines, "unrecognized array type"); |
|
| 458 |
+ s += Bsize * n; doff += size; |
|
| 459 |
+ } else parse_error(fp->n_lines, "unrecognized type"); |
|
| 460 |
+ if (dret == '\n' || dret == '\r') break; |
|
| 461 |
+ } |
|
| 462 |
+ } |
|
| 463 |
+ b->l_aux = doff - doff0; |
|
| 464 |
+ b->data_len = doff; |
|
| 465 |
+ if (bam_no_B) bam_remove_B(b); |
|
| 466 |
+ return z; |
|
| 467 |
+} |
|
| 468 |
+ |
|
| 469 |
+tamFile sam_open(const char *fn) |
|
| 470 |
+{
|
|
| 471 |
+ tamFile fp; |
|
| 472 |
+ gzFile gzfp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "rb") : gzopen(fn, "rb"); |
|
| 473 |
+ if (gzfp == 0) return 0; |
|
| 474 |
+ fp = (tamFile)calloc(1, sizeof(struct __tamFile_t)); |
|
| 475 |
+ fp->str = (kstring_t*)calloc(1, sizeof(kstring_t)); |
|
| 476 |
+ fp->fp = gzfp; |
|
| 477 |
+ fp->ks = ks_init(fp->fp); |
|
| 478 |
+ return fp; |
|
| 479 |
+} |
|
| 480 |
+ |
|
| 481 |
+void sam_close(tamFile fp) |
|
| 482 |
+{
|
|
| 483 |
+ if (fp) {
|
|
| 484 |
+ ks_destroy(fp->ks); |
|
| 485 |
+ gzclose(fp->fp); |
|
| 486 |
+ free(fp->str->s); free(fp->str); |
|
| 487 |
+ free(fp); |
|
| 488 |
+ } |
|
| 489 |
+} |