git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/SICtools@109238 bc3139a8-67e5-0310-9ffc-ced21a209358
| 1 | 1 |
new file mode 100644 |
| ... | ... |
@@ -0,0 +1,197 @@ |
| 1 |
+/* The MIT License |
|
| 2 |
+ |
|
| 3 |
+ Copyright (c) 2010 Broad Institute |
|
| 4 |
+ |
|
| 5 |
+ Permission is hereby granted, free of charge, to any person obtaining |
|
| 6 |
+ a copy of this software and associated documentation files (the |
|
| 7 |
+ "Software"), to deal in the Software without restriction, including |
|
| 8 |
+ without limitation the rights to use, copy, modify, merge, publish, |
|
| 9 |
+ distribute, sublicense, and/or sell copies of the Software, and to |
|
| 10 |
+ permit persons to whom the Software is furnished to do so, subject to |
|
| 11 |
+ the following conditions: |
|
| 12 |
+ |
|
| 13 |
+ The above copyright notice and this permission notice shall be |
|
| 14 |
+ included in all copies or substantial portions of the Software. |
|
| 15 |
+ |
|
| 16 |
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
|
| 17 |
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
|
| 18 |
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
|
| 19 |
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS |
|
| 20 |
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN |
|
| 21 |
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN |
|
| 22 |
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
|
| 23 |
+ SOFTWARE. |
|
| 24 |
+*/ |
|
| 25 |
+ |
|
| 26 |
+/* Contact: Heng Li <[email protected]> */ |
|
| 27 |
+ |
|
| 28 |
+#ifndef BCF_H |
|
| 29 |
+#define BCF_H |
|
| 30 |
+ |
|
| 31 |
+#define BCF_VERSION "0.1.19-44428cd" |
|
| 32 |
+ |
|
| 33 |
+#include <stdint.h> |
|
| 34 |
+#include <zlib.h> |
|
| 35 |
+ |
|
| 36 |
+#ifndef BCF_LITE |
|
| 37 |
+#include "bgzf.h" |
|
| 38 |
+typedef BGZF *bcfFile; |
|
| 39 |
+#else |
|
| 40 |
+typedef gzFile bcfFile; |
|
| 41 |
+#define bgzf_open(fn, mode) gzopen(fn, mode) |
|
| 42 |
+#define bgzf_fdopen(fd, mode) gzdopen(fd, mode) |
|
| 43 |
+#define bgzf_close(fp) gzclose(fp) |
|
| 44 |
+#define bgzf_read(fp, buf, len) gzread(fp, buf, len) |
|
| 45 |
+#define bgzf_write(fp, buf, len) |
|
| 46 |
+#define bgzf_flush(fp) |
|
| 47 |
+#endif |
|
| 48 |
+ |
|
| 49 |
/*
  A member in the structs below is said to be "primary" if its content
  cannot be inferred from other members in any of the structs below; a
  member is said to be "derived" if its content can be derived from
  other members. For example, bcf1_t::str is primary as this comes from
  the input data, while bcf1_t::info is derived as it can always be
  correctly set if we know bcf1_t::str. Derived members are for quick
  access to the content and must be synchronized with the primary data.
 */
|
| 58 |
+ |
|
| 59 |
/*
 * One genotype ("geno") field of a record: the data of a single FORMAT tag
 * for all individuals, stored back to back.
 *
 * Derived members: fmt and len (both follow from bcf1_t::fmt).
 */
typedef struct {
    uint32_t fmt; // format of the block, set by bcf_str2int()
    int len;      // length of data for each individual
    void *data;   // concatenated data
} bcf_ginfo_t;
|
| 65 |
+ |
|
| 66 |
+typedef struct {
|
|
| 67 |
+ int32_t tid, pos; // refID and 0-based position |
|
| 68 |
+ int32_t l_str, m_str; // length and the allocated size of ->str |
|
| 69 |
+ float qual; // SNP quality |
|
| 70 |
+ char *str; // concatenated string of variable length strings in VCF (from col.2 to col.7) |
|
| 71 |
+ char *ref, *alt, *flt, *info, *fmt; // they all point to ->str; no memory allocation |
|
| 72 |
+ int n_gi, m_gi; // number and the allocated size of geno fields |
|
| 73 |
+ bcf_ginfo_t *gi; // array of geno fields |
|
| 74 |
+ int n_alleles, n_smpl; // number of alleles and samples |
|
| 75 |
+ // derived info: ref, alt, flt, info, fmt (<-str), n_gi (<-fmt), n_alleles (<-alt), n_smpl (<-bcf_hdr_t::n_smpl) |
|
| 76 |
+ uint8_t *ploidy; // ploidy of all samples; if NULL, ploidy of 2 is assumed. |
|
| 77 |
+} bcf1_t; |
|
| 78 |
+ |
|
| 79 |
/*
 * BCF header: reference sequence names, sample names and the header text.
 *
 * Derived members: n_ref (<- name), n_smpl (<- sname), ns (<- name),
 * sns (<- sname).
 */
typedef struct {
    int32_t n_ref, n_smpl;    // number of reference sequences and samples
    int32_t l_nm;             // length of concatenated sequence names; 0 padded
    int32_t l_smpl;           // length of concatenated sample names; 0 padded
    int32_t l_txt;            // length of header text (lines started with ##)
    char *name, *sname, *txt; // concatenated sequence names, sample names and header text
    char **ns, **sns;         // array of sequence and sample names; point to name and sname, respectively
} bcf_hdr_t;
|
| 88 |
+ |
|
| 89 |
+typedef struct {
|
|
| 90 |
+ int is_vcf; // if the file in operation is a VCF |
|
| 91 |
+ void *v; // auxillary data structure for VCF |
|
| 92 |
+ bcfFile fp; // file handler for BCF |
|
| 93 |
+} bcf_t; |
|
| 94 |
+ |
|
| 95 |
// Index type; only declared here, so this header can use it through
// pointers only.
struct __bcf_idx_t;
typedef struct __bcf_idx_t bcf_idx_t;
|
| 97 |
+ |
|
| 98 |
+#ifdef __cplusplus |
|
| 99 |
+extern "C" {
|
|
| 100 |
+#endif |
|
| 101 |
+ |
|
| 102 |
+ // open a BCF file; for BCF file only |
|
| 103 |
+ bcf_t *bcf_open(const char *fn, const char *mode); |
|
| 104 |
+ // close file |
|
| 105 |
+ int bcf_close(bcf_t *b); |
|
| 106 |
+ // read one record from BCF; return -1 on end-of-file, and <-1 for errors |
|
| 107 |
+ int bcf_read(bcf_t *bp, const bcf_hdr_t *h, bcf1_t *b); |
|
| 108 |
+ // call this function if b->str is changed |
|
| 109 |
+ int bcf_sync(bcf1_t *b); |
|
| 110 |
+ // write a BCF record |
|
| 111 |
+ int bcf_write(bcf_t *bp, const bcf_hdr_t *h, const bcf1_t *b); |
|
| 112 |
+ // read the BCF header; BCF only |
|
| 113 |
+ bcf_hdr_t *bcf_hdr_read(bcf_t *b); |
|
| 114 |
+ // write the BCF header |
|
| 115 |
+ int bcf_hdr_write(bcf_t *b, const bcf_hdr_t *h); |
|
| 116 |
+ // set bcf_hdr_t::ns and bcf_hdr_t::sns |
|
| 117 |
+ int bcf_hdr_sync(bcf_hdr_t *b); |
|
| 118 |
+ // destroy the header |
|
| 119 |
+ void bcf_hdr_destroy(bcf_hdr_t *h); |
|
| 120 |
+ // destroy a record |
|
| 121 |
+ int bcf_destroy(bcf1_t *b); |
|
| 122 |
+ // BCF->VCF conversion |
|
| 123 |
+ char *bcf_fmt(const bcf_hdr_t *h, bcf1_t *b); |
|
| 124 |
+ // append more info |
|
| 125 |
+ int bcf_append_info(bcf1_t *b, const char *info, int l); |
|
| 126 |
+ // remove tag |
|
| 127 |
+ int remove_tag(char *string, const char *tag, char delim); |
|
| 128 |
+ // remove info tag, string is the kstring holder of bcf1_t.str |
|
| 129 |
+ void rm_info(kstring_t *string, const char *key); |
|
| 130 |
+ // copy |
|
| 131 |
+ int bcf_cpy(bcf1_t *r, const bcf1_t *b); |
|
| 132 |
+ |
|
| 133 |
+ // open a VCF or BCF file if "b" is set in "mode" |
|
| 134 |
+ bcf_t *vcf_open(const char *fn, const char *mode); |
|
| 135 |
+ // close a VCF/BCF file |
|
| 136 |
+ int vcf_close(bcf_t *bp); |
|
| 137 |
+ // read the VCF/BCF header |
|
| 138 |
+ bcf_hdr_t *vcf_hdr_read(bcf_t *bp); |
|
| 139 |
+ // read the sequence dictionary from a separate file; required for VCF->BCF conversion |
|
| 140 |
+ int vcf_dictread(bcf_t *bp, bcf_hdr_t *h, const char *fn); |
|
| 141 |
+ // read a VCF/BCF record; return -1 on end-of-file and <-1 for errors |
|
| 142 |
+ int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b); |
|
| 143 |
+ // write the VCF header |
|
| 144 |
+ int vcf_hdr_write(bcf_t *bp, const bcf_hdr_t *h); |
|
| 145 |
+ // write a VCF record |
|
| 146 |
+ int vcf_write(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b); |
|
| 147 |
+ |
|
| 148 |
+ // keep the first n alleles and discard the rest |
|
| 149 |
+ int bcf_shrink_alt(bcf1_t *b, int n); |
|
| 150 |
+ // keep the masked alleles and discard the rest |
|
| 151 |
+ void bcf_fit_alt(bcf1_t *b, int mask); |
|
| 152 |
+ // convert GL to PL |
|
| 153 |
+ int bcf_gl2pl(bcf1_t *b); |
|
| 154 |
+ // if the site is an indel |
|
| 155 |
+ int bcf_is_indel(const bcf1_t *b); |
|
| 156 |
+ bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int *list); |
|
| 157 |
+ int bcf_subsam(int n_smpl, int *list, bcf1_t *b); |
|
| 158 |
+ // move GT to the first FORMAT field |
|
| 159 |
+ int bcf_fix_gt(bcf1_t *b); |
|
| 160 |
+ // update PL generated by old samtools |
|
| 161 |
+ int bcf_fix_pl(bcf1_t *b); |
|
| 162 |
+ // convert PL to GLF-like 10-likelihood GL |
|
| 163 |
+ int bcf_gl10(const bcf1_t *b, uint8_t *gl); |
|
| 164 |
+ // convert up to 4 INDEL alleles to GLF-like 10-likelihood GL |
|
| 165 |
+ int bcf_gl10_indel(const bcf1_t *b, uint8_t *gl); |
|
| 166 |
+ |
|
| 167 |
+ // string hash table |
|
| 168 |
+ void *bcf_build_refhash(bcf_hdr_t *h); |
|
| 169 |
+ void bcf_str2id_destroy(void *_hash); |
|
| 170 |
+ void bcf_str2id_thorough_destroy(void *_hash); |
|
| 171 |
+ int bcf_str2id_add(void *_hash, const char *str); |
|
| 172 |
+ int bcf_str2id(void *_hash, const char *str); |
|
| 173 |
+ void *bcf_str2id_init(); |
|
| 174 |
+ |
|
| 175 |
+ // indexing related functions |
|
| 176 |
+ int bcf_idx_build(const char *fn); |
|
| 177 |
+ uint64_t bcf_idx_query(const bcf_idx_t *idx, int tid, int beg); |
|
| 178 |
+ int bcf_parse_region(void *str2id, const char *str, int *tid, int *begin, int *end); |
|
| 179 |
+ bcf_idx_t *bcf_idx_load(const char *fn); |
|
| 180 |
+ void bcf_idx_destroy(bcf_idx_t *idx); |
|
| 181 |
+ |
|
| 182 |
+#ifdef __cplusplus |
|
| 183 |
+} |
|
| 184 |
+#endif |
|
| 185 |
+ |
|
| 186 |
/*
 * Pack up to the first four bytes of a tag into a 32-bit integer.
 *
 * Bytes are taken from str in order; each one shifts the accumulated value
 * left by 8 bits and lands in the low byte, so "PL" becomes 0x504C.
 * Packing stops at the first NUL byte, after l bytes, or after 4 bytes,
 * whichever comes first.
 *
 * str: tag text; at least min(l, 4) bytes must be readable unless a NUL
 *      byte occurs earlier
 * l:   maximum number of bytes to read from str; values <= 0 give 0
 *
 * Returns the packed value (0 for an empty tag).
 */
static inline uint32_t bcf_str2int(const char *str, int l)
{
    uint32_t x = 0;
    int i;

    for (i = 0; i < l && i < 4; ++i) {
        if (str[i] == 0) return x;
        // Go through unsigned char: where plain char is signed, a byte
        // >= 0x80 would otherwise be sign-extended and overwrite the
        // bytes already packed into x.
        x = x << 8 | (unsigned char)str[i];
    }
    return x;
}
|
| 196 |
+ |
|
| 197 |
+#endif |