git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/SICtools@109238 bc3139a8-67e5-0310-9ffc-ced21a209358
| 1 | 1 |
new file mode 100644 |
| ... | ... |
@@ -0,0 +1,191 @@ |
| 1 |
+#include <string.h> |
|
| 2 |
+#include <stdlib.h> |
|
| 3 |
+#include <sys/stat.h> |
|
| 4 |
+#include <unistd.h> |
|
| 5 |
+#include "knetfile.h" |
|
| 6 |
+#include "bcf.h" |
|
| 7 |
+ |
|
| 8 |
+#include "kseq.h" |
|
| 9 |
+KSTREAM_INIT(gzFile, gzread, 0x10000) |
|
| 10 |
+ |
|
| 11 |
+int bcfview(int argc, char *argv[]); |
|
| 12 |
+int bcf_main_index(int argc, char *argv[]); |
|
| 13 |
+ |
|
| 14 |
+#define BUF_SIZE 0x10000 |
|
| 15 |
+ |
|
| 16 |
+int bcf_cat(int n, char * const *fn) |
|
| 17 |
+{
|
|
| 18 |
+ int i; |
|
| 19 |
+ bcf_t *out; |
|
| 20 |
+ uint8_t *buf; |
|
| 21 |
+ buf = malloc(BUF_SIZE); |
|
| 22 |
+ out = bcf_open("-", "w");
|
|
| 23 |
+ for (i = 0; i < n; ++i) {
|
|
| 24 |
+ bcf_t *in; |
|
| 25 |
+ bcf_hdr_t *h; |
|
| 26 |
+ off_t end; |
|
| 27 |
+ struct stat s; |
|
| 28 |
+ in = bcf_open(fn[i], "r"); |
|
| 29 |
+ h = bcf_hdr_read(in); |
|
| 30 |
+ if (i == 0) bcf_hdr_write(out, h); |
|
| 31 |
+ bcf_hdr_destroy(h); |
|
| 32 |
+#ifdef _USE_KNETFILE |
|
| 33 |
+ fstat(knet_fileno((knetFile*)in->fp->fp), &s); |
|
| 34 |
+ end = s.st_size - 28; |
|
| 35 |
+ while (knet_tell((knetFile*)in->fp->fp) < end) {
|
|
| 36 |
+ int size = knet_tell((knetFile*)in->fp->fp) + BUF_SIZE < end? BUF_SIZE : end - knet_tell((knetFile*)in->fp->fp); |
|
| 37 |
+ knet_read(in->fp->fp, buf, size); |
|
| 38 |
+ fwrite(buf, 1, size, out->fp->fp); |
|
| 39 |
+ } |
|
| 40 |
+#else |
|
| 41 |
+ abort(); // FIXME: not implemented |
|
| 42 |
+#endif |
|
| 43 |
+ bcf_close(in); |
|
| 44 |
+ } |
|
| 45 |
+ bcf_close(out); |
|
| 46 |
+ free(buf); |
|
| 47 |
+ return 0; |
|
| 48 |
+} |
|
| 49 |
+ |
|
| 50 |
+extern double bcf_pair_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4]); |
|
| 51 |
+ |
|
| 52 |
+int bcf_main_ldpair(int argc, char *argv[]) |
|
| 53 |
+{
|
|
| 54 |
+ bcf_t *fp; |
|
| 55 |
+ bcf_hdr_t *h; |
|
| 56 |
+ bcf1_t *b0, *b1; |
|
| 57 |
+ bcf_idx_t *idx; |
|
| 58 |
+ kstring_t str; |
|
| 59 |
+ void *str2id; |
|
| 60 |
+ gzFile fplist; |
|
| 61 |
+ kstream_t *ks; |
|
| 62 |
+ int dret, lineno = 0; |
|
| 63 |
+ if (argc < 3) {
|
|
| 64 |
+ fprintf(stderr, "Usage: bcftools ldpair <in.bcf> <in.list>\n"); |
|
| 65 |
+ return 1; |
|
| 66 |
+ } |
|
| 67 |
+ fplist = gzopen(argv[2], "rb"); |
|
| 68 |
+ ks = ks_init(fplist); |
|
| 69 |
+ memset(&str, 0, sizeof(kstring_t)); |
|
| 70 |
+ fp = bcf_open(argv[1], "rb"); |
|
| 71 |
+ h = bcf_hdr_read(fp); |
|
| 72 |
+ str2id = bcf_build_refhash(h); |
|
| 73 |
+ idx = bcf_idx_load(argv[1]); |
|
| 74 |
+ if (idx == 0) {
|
|
| 75 |
+ fprintf(stderr, "[%s] No bcf index is found. Abort!\n", __func__); |
|
| 76 |
+ return 1; |
|
| 77 |
+ } |
|
| 78 |
+ b0 = calloc(1, sizeof(bcf1_t)); |
|
| 79 |
+ b1 = calloc(1, sizeof(bcf1_t)); |
|
| 80 |
+ while (ks_getuntil(ks, '\n', &str, &dret) >= 0) {
|
|
| 81 |
+ char *p, *q; |
|
| 82 |
+ int k; |
|
| 83 |
+ int tid0 = -1, tid1 = -1, pos0 = -1, pos1 = -1; |
|
| 84 |
+ ++lineno; |
|
| 85 |
+ for (p = q = str.s, k = 0; *p; ++p) {
|
|
| 86 |
+ if (*p == ' ' || *p == '\t') {
|
|
| 87 |
+ *p = '\0'; |
|
| 88 |
+ if (k == 0) tid0 = bcf_str2id(str2id, q); |
|
| 89 |
+ else if (k == 1) pos0 = atoi(q) - 1; |
|
| 90 |
+ else if (k == 2) tid1 = strcmp(q, "=")? bcf_str2id(str2id, q) : tid0; |
|
| 91 |
+ else if (k == 3) pos1 = atoi(q) - 1; |
|
| 92 |
+ q = p + 1; |
|
| 93 |
+ ++k; |
|
| 94 |
+ } |
|
| 95 |
+ } |
|
| 96 |
+ if (k == 3) pos1 = atoi(q) - 1; |
|
| 97 |
+ if (tid0 >= 0 && tid1 >= 0 && pos0 >= 0 && pos1 >= 0) {
|
|
| 98 |
+ uint64_t off; |
|
| 99 |
+ double r, f[4]; |
|
| 100 |
+ off = bcf_idx_query(idx, tid0, pos0); |
|
| 101 |
+ bgzf_seek(fp->fp, off, SEEK_SET); |
|
| 102 |
+ while (bcf_read(fp, h, b0) >= 0 && b0->pos != pos0); |
|
| 103 |
+ off = bcf_idx_query(idx, tid1, pos1); |
|
| 104 |
+ bgzf_seek(fp->fp, off, SEEK_SET); |
|
| 105 |
+ while (bcf_read(fp, h, b1) >= 0 && b1->pos != pos1); |
|
| 106 |
+ r = bcf_pair_freq(b0, b1, f); |
|
| 107 |
+ r *= r; |
|
| 108 |
+ printf("%s\t%d\t%s\t%d\t%.4g\t%.4g\t%.4g\t%.4g\t%.4g\n", h->ns[tid0], pos0+1, h->ns[tid1], pos1+1,
|
|
| 109 |
+ r, f[0], f[1], f[2], f[3]); |
|
| 110 |
+ } //else fprintf(stderr, "[%s] Parse error at line %d.\n", __func__, lineno); |
|
| 111 |
+ } |
|
| 112 |
+ bcf_destroy(b0); bcf_destroy(b1); |
|
| 113 |
+ bcf_idx_destroy(idx); |
|
| 114 |
+ bcf_str2id_destroy(str2id); |
|
| 115 |
+ bcf_hdr_destroy(h); |
|
| 116 |
+ bcf_close(fp); |
|
| 117 |
+ free(str.s); |
|
| 118 |
+ ks_destroy(ks); |
|
| 119 |
+ gzclose(fplist); |
|
| 120 |
+ return 0; |
|
| 121 |
+} |
|
| 122 |
+ |
|
| 123 |
+int bcf_main_ld(int argc, char *argv[]) |
|
| 124 |
+{
|
|
| 125 |
+ bcf_t *fp; |
|
| 126 |
+ bcf_hdr_t *h; |
|
| 127 |
+ bcf1_t **b, *b0; |
|
| 128 |
+ int i, j, m, n; |
|
| 129 |
+ double f[4]; |
|
| 130 |
+ if (argc == 1) {
|
|
| 131 |
+ fprintf(stderr, "Usage: bcftools ld <in.bcf>\n"); |
|
| 132 |
+ return 1; |
|
| 133 |
+ } |
|
| 134 |
+ fp = bcf_open(argv[1], "rb"); |
|
| 135 |
+ h = bcf_hdr_read(fp); |
|
| 136 |
+ // read the entire BCF |
|
| 137 |
+ m = n = 0; b = 0; |
|
| 138 |
+ b0 = calloc(1, sizeof(bcf1_t)); |
|
| 139 |
+ while (bcf_read(fp, h, b0) >= 0) {
|
|
| 140 |
+ if (m == n) {
|
|
| 141 |
+ m = m? m<<1 : 16; |
|
| 142 |
+ b = realloc(b, sizeof(void*) * m); |
|
| 143 |
+ } |
|
| 144 |
+ b[n] = calloc(1, sizeof(bcf1_t)); |
|
| 145 |
+ bcf_cpy(b[n++], b0); |
|
| 146 |
+ } |
|
| 147 |
+ bcf_destroy(b0); |
|
| 148 |
+ // compute pair-wise r^2 |
|
| 149 |
+ printf("%d\n", n); // the number of loci
|
|
| 150 |
+ for (i = 0; i < n; ++i) {
|
|
| 151 |
+ printf("%s:%d", h->ns[b[i]->tid], b[i]->pos + 1);
|
|
| 152 |
+ for (j = 0; j < i; ++j) {
|
|
| 153 |
+ double r = bcf_pair_freq(b[i], b[j], f); |
|
| 154 |
+ printf("\t%.3f", r*r);
|
|
| 155 |
+ } |
|
| 156 |
+ printf("\t1.000\n");
|
|
| 157 |
+ } |
|
| 158 |
+ // free |
|
| 159 |
+ for (i = 0; i < n; ++i) bcf_destroy(b[i]); |
|
| 160 |
+ free(b); |
|
| 161 |
+ bcf_hdr_destroy(h); |
|
| 162 |
+ bcf_close(fp); |
|
| 163 |
+ return 0; |
|
| 164 |
+} |
|
| 165 |
+ |
|
| 166 |
+int main(int argc, char *argv[]) |
|
| 167 |
+{
|
|
| 168 |
+ if (argc == 1) {
|
|
| 169 |
+ fprintf(stderr, "\n"); |
|
| 170 |
+ fprintf(stderr, "Program: bcftools (Tools for data in the VCF/BCF formats)\n"); |
|
| 171 |
+ fprintf(stderr, "Version: %s\n\n", BCF_VERSION); |
|
| 172 |
+ fprintf(stderr, "Usage: bcftools <command> <arguments>\n\n"); |
|
| 173 |
+ fprintf(stderr, "Command: view print, extract, convert and call SNPs from BCF\n"); |
|
| 174 |
+ fprintf(stderr, " index index BCF\n"); |
|
| 175 |
+ fprintf(stderr, " cat concatenate BCFs\n"); |
|
| 176 |
+ fprintf(stderr, " ld compute all-pair r^2\n"); |
|
| 177 |
+ fprintf(stderr, " ldpair compute r^2 between requested pairs\n"); |
|
| 178 |
+ fprintf(stderr, "\n"); |
|
| 179 |
+ return 1; |
|
| 180 |
+ } |
|
| 181 |
+ if (strcmp(argv[1], "view") == 0) return bcfview(argc-1, argv+1); |
|
| 182 |
+ else if (strcmp(argv[1], "index") == 0) return bcf_main_index(argc-1, argv+1); |
|
| 183 |
+ else if (strcmp(argv[1], "ld") == 0) return bcf_main_ld(argc-1, argv+1); |
|
| 184 |
+ else if (strcmp(argv[1], "ldpair") == 0) return bcf_main_ldpair(argc-1, argv+1); |
|
| 185 |
+ else if (strcmp(argv[1], "cat") == 0) return bcf_cat(argc-2, argv+2); // cat is different ... |
|
| 186 |
+ else {
|
|
| 187 |
+ fprintf(stderr, "[main] Unrecognized command.\n"); |
|
| 188 |
+ return 1; |
|
| 189 |
+ } |
|
| 190 |
+ return 0; |
|
| 191 |
+} |