Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit ce1fc8b

Browse files
committed
r120: added telo for identifying telomeres
1 parent f6ea81c commit ce1fc8b

File tree

1 file changed

+96
-3
lines changed

1 file changed

+96
-3
lines changed

seqtk.c

Lines changed: 96 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
/* The MIT License
22
3-
Copyright (c) 2008-2016 Broad Institute
3+
Copyright (c) 2018- Dana-Farber Cancer Institute
4+
2008-2018 Broad Institute
45
56
Permission is hereby granted, free of charge, to any person obtaining
67
a copy of this software and associated documentation files (the
@@ -1920,13 +1921,104 @@ int stk_size(int argc, char *argv[])
19201921
return 0;
19211922
}
19221923

1924+
int stk_telo(int argc, char *argv[])
1925+
{
1926+
gzFile fp;
1927+
kseq_t *seq;
1928+
char *telo_seq = "CCCTAA";
1929+
int c, i, j, len, absent, penalty = 1, max_drop = 2000, min_score = 300;
1930+
uint64_t x, mask, sum_input = 0, sum_telo = 0;
1931+
khash_t(64) *h;
1932+
1933+
while ((c = getopt(argc, argv, "m:p:d:s:")) >= 0) {
1934+
if (c == 'm') telo_seq = optarg;
1935+
else if (c == 'p') penalty = atoi(optarg);
1936+
else if (c == 'd') max_drop = atoi(optarg);
1937+
else if (c == 's') min_score = atoi(optarg);
1938+
}
1939+
if (penalty < 0) penalty = -penalty;
1940+
if (argc == optind && isatty(fileno(stdin))) {
1941+
fprintf(stderr, "Usage: seqtk telo [options] <in.fq>\n");
1942+
fprintf(stderr, "Options:\n");
1943+
fprintf(stderr, " -m STR motif [%s]\n", telo_seq);
1944+
fprintf(stderr, " -p INT penalty [%d]\n", penalty);
1945+
fprintf(stderr, " -d INT max drop [%d]\n", max_drop);
1946+
fprintf(stderr, " -s INT min score [%d]\n", min_score);
1947+
return 1;
1948+
}
1949+
1950+
len = strlen(telo_seq);
1951+
mask = (1ULL<<2*len) - 1;
1952+
1953+
h = kh_init(64);
1954+
kh_resize(64, h, len * 2);
1955+
for (i = 0; i < len; ++i) {
1956+
for (j = 0, x = 0; j < len; ++j) {
1957+
int c = seq_nt6_table[(uint8_t)telo_seq[(i + j) % len]];
1958+
assert(c >= 1 && c <= 4);
1959+
x = x<<2 | (c-1);
1960+
}
1961+
kh_put(64, h, x, &absent);
1962+
}
1963+
1964+
fp = argc > 1 && strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
1965+
if (fp == 0) {
1966+
fprintf(stderr, "[E::%s] failed to open the input file/stream.\n", __func__);
1967+
return 1;
1968+
}
1969+
seq = kseq_init(fp);
1970+
while (kseq_read(seq) >= 0) {
1971+
ssize_t i, l, max_i = -1;
1972+
int64_t score, max;
1973+
sum_input += seq->seq.l;
1974+
score = max = 0, max_i = -1;
1975+
for (i = 0, l = 0, x = 0; i < seq->seq.l; ++i) {
1976+
int hit = 0, c = seq_nt6_table[(uint8_t)seq->seq.s[i]];
1977+
if (c >= 1 && c <= 4) {
1978+
x = (x<<2 | (c-1)) & mask;
1979+
if (++l >= len && kh_get(64, h, x) != kh_end(h))
1980+
hit = 1;
1981+
} else l = 0, x = 0;
1982+
if (i >= len) score += hit? 1 : -penalty;
1983+
if (score > max) max = score, max_i = i;
1984+
else if (max - score > max_drop) break;
1985+
}
1986+
if (score >= min_score) {
1987+
printf("%s\t0\t%ld\t%ld\n", seq->name.s, max_i + 1, seq->seq.l);
1988+
sum_telo += max_i + 1;
1989+
}
1990+
score = max = 0, max_i = -1;
1991+
for (i = seq->seq.l - 1, l = 0, x = 0; i >= 0; --i) {
1992+
int hit = 0, c = seq_nt6_table[(uint8_t)seq->seq.s[i]];
1993+
if (c >= 1 && c <= 4) {
1994+
x = (x<<2 | (4-c)) & mask;
1995+
if (++l >= len && kh_get(64, h, x) != kh_end(h))
1996+
hit = 1;
1997+
} else l = 0, x = 0;
1998+
if (seq->seq.l - i >= len) score += hit? 1 : -penalty;
1999+
if (score > max) max = score, max_i = i;
2000+
else if (max - score > max_drop) break;
2001+
}
2002+
if (score >= min_score) {
2003+
printf("%s\t%ld\t%ld\t%ld\n", seq->name.s, max_i, seq->seq.l, seq->seq.l);
2004+
sum_telo += seq->seq.l - max_i;
2005+
}
2006+
}
2007+
kh_destroy(64, h);
2008+
kseq_destroy(seq);
2009+
gzclose(fp);
2010+
fprintf(stderr, "%ld\t%ld\n", (long)sum_telo, (long)sum_input);
2011+
return 0;
2012+
}
2013+
19232014
/* main function */
19242015
static int usage()
19252016
{
19262017
fprintf(stderr, "\n");
19272018
fprintf(stderr, "Usage: seqtk <command> <arguments>\n");
1928-
fprintf(stderr, "Version: 1.3-r119-dirty\n\n");
2019+
fprintf(stderr, "Version: 1.3-r120-dirty\n\n");
19292020
fprintf(stderr, "Command: seq common transformation of FASTA/Q\n");
2021+
fprintf(stderr, " size report the number sequences and bases\n");
19302022
fprintf(stderr, " comp get the nucleotide composition of FASTA/Q\n");
19312023
fprintf(stderr, " sample subsample sequences\n");
19322024
fprintf(stderr, " subseq extract subsequences from FASTA/Q\n");
@@ -1946,7 +2038,7 @@ static int usage()
19462038
fprintf(stderr, " gap get the gap locations\n");
19472039
fprintf(stderr, " listhet extract the position of each het\n");
19482040
fprintf(stderr, " hpc homopolyer-compressed sequence\n");
1949-
fprintf(stderr, " size report the number sequences and bases\n");
2041+
fprintf(stderr, " telo identify telomere repeats in asm or long reads\n");
19502042
fprintf(stderr, "\n");
19512043
return 1;
19522044
}
@@ -1977,6 +2069,7 @@ int main(int argc, char *argv[])
19772069
else if (strcmp(argv[1], "split") == 0) return stk_split(argc-1, argv+1);
19782070
else if (strcmp(argv[1], "hpc") == 0) return stk_hpc(argc-1, argv+1);
19792071
else if (strcmp(argv[1], "size") == 0) return stk_size(argc-1, argv+1);
2072+
else if (strcmp(argv[1], "telo") == 0) return stk_telo(argc-1, argv+1);
19802073
else {
19812074
fprintf(stderr, "[main] unrecognized command '%s'. Abort!\n", argv[1]);
19822075
return 1;

0 commit comments

Comments
 (0)