1
1
/* The MIT License
2
2
3
- Copyright (c) 2008-2016 Broad Institute
3
+ Copyright (c) 2018- Dana-Farber Cancer Institute
4
+ 2008-2018 Broad Institute
4
5
5
6
Permission is hereby granted, free of charge, to any person obtaining
6
7
a copy of this software and associated documentation files (the
@@ -1920,13 +1921,104 @@ int stk_size(int argc, char *argv[])
1920
1921
return 0 ;
1921
1922
}
1922
1923
1924
+ int stk_telo (int argc , char * argv [])
1925
+ {
1926
+ gzFile fp ;
1927
+ kseq_t * seq ;
1928
+ char * telo_seq = "CCCTAA" ;
1929
+ int c , i , j , len , absent , penalty = 1 , max_drop = 2000 , min_score = 300 ;
1930
+ uint64_t x , mask , sum_input = 0 , sum_telo = 0 ;
1931
+ khash_t (64 ) * h ;
1932
+
1933
+ while ((c = getopt (argc , argv , "m:p:d:s:" )) >= 0 ) {
1934
+ if (c == 'm' ) telo_seq = optarg ;
1935
+ else if (c == 'p' ) penalty = atoi (optarg );
1936
+ else if (c == 'd' ) max_drop = atoi (optarg );
1937
+ else if (c == 's' ) min_score = atoi (optarg );
1938
+ }
1939
+ if (penalty < 0 ) penalty = - penalty ;
1940
+ if (argc == optind && isatty (fileno (stdin ))) {
1941
+ fprintf (stderr , "Usage: seqtk telo [options] <in.fq>\n" );
1942
+ fprintf (stderr , "Options:\n" );
1943
+ fprintf (stderr , " -m STR motif [%s]\n" , telo_seq );
1944
+ fprintf (stderr , " -p INT penalty [%d]\n" , penalty );
1945
+ fprintf (stderr , " -d INT max drop [%d]\n" , max_drop );
1946
+ fprintf (stderr , " -s INT min score [%d]\n" , min_score );
1947
+ return 1 ;
1948
+ }
1949
+
1950
+ len = strlen (telo_seq );
1951
+ mask = (1ULL <<2 * len ) - 1 ;
1952
+
1953
+ h = kh_init (64 );
1954
+ kh_resize (64 , h , len * 2 );
1955
+ for (i = 0 ; i < len ; ++ i ) {
1956
+ for (j = 0 , x = 0 ; j < len ; ++ j ) {
1957
+ int c = seq_nt6_table [(uint8_t )telo_seq [(i + j ) % len ]];
1958
+ assert (c >= 1 && c <= 4 );
1959
+ x = x <<2 | (c - 1 );
1960
+ }
1961
+ kh_put (64 , h , x , & absent );
1962
+ }
1963
+
1964
+ fp = argc > 1 && strcmp (argv [optind ], "-" )? gzopen (argv [optind ], "r" ) : gzdopen (fileno (stdin ), "r" );
1965
+ if (fp == 0 ) {
1966
+ fprintf (stderr , "[E::%s] failed to open the input file/stream.\n" , __func__ );
1967
+ return 1 ;
1968
+ }
1969
+ seq = kseq_init (fp );
1970
+ while (kseq_read (seq ) >= 0 ) {
1971
+ ssize_t i , l , max_i = -1 ;
1972
+ int64_t score , max ;
1973
+ sum_input += seq -> seq .l ;
1974
+ score = max = 0 , max_i = -1 ;
1975
+ for (i = 0 , l = 0 , x = 0 ; i < seq -> seq .l ; ++ i ) {
1976
+ int hit = 0 , c = seq_nt6_table [(uint8_t )seq -> seq .s [i ]];
1977
+ if (c >= 1 && c <= 4 ) {
1978
+ x = (x <<2 | (c - 1 )) & mask ;
1979
+ if (++ l >= len && kh_get (64 , h , x ) != kh_end (h ))
1980
+ hit = 1 ;
1981
+ } else l = 0 , x = 0 ;
1982
+ if (i >= len ) score += hit ? 1 : - penalty ;
1983
+ if (score > max ) max = score , max_i = i ;
1984
+ else if (max - score > max_drop ) break ;
1985
+ }
1986
+ if (score >= min_score ) {
1987
+ printf ("%s\t0\t%ld\t%ld\n" , seq -> name .s , max_i + 1 , seq -> seq .l );
1988
+ sum_telo += max_i + 1 ;
1989
+ }
1990
+ score = max = 0 , max_i = -1 ;
1991
+ for (i = seq -> seq .l - 1 , l = 0 , x = 0 ; i >= 0 ; -- i ) {
1992
+ int hit = 0 , c = seq_nt6_table [(uint8_t )seq -> seq .s [i ]];
1993
+ if (c >= 1 && c <= 4 ) {
1994
+ x = (x <<2 | (4 - c )) & mask ;
1995
+ if (++ l >= len && kh_get (64 , h , x ) != kh_end (h ))
1996
+ hit = 1 ;
1997
+ } else l = 0 , x = 0 ;
1998
+ if (seq -> seq .l - i >= len ) score += hit ? 1 : - penalty ;
1999
+ if (score > max ) max = score , max_i = i ;
2000
+ else if (max - score > max_drop ) break ;
2001
+ }
2002
+ if (score >= min_score ) {
2003
+ printf ("%s\t%ld\t%ld\t%ld\n" , seq -> name .s , max_i , seq -> seq .l , seq -> seq .l );
2004
+ sum_telo += seq -> seq .l - max_i ;
2005
+ }
2006
+ }
2007
+ kh_destroy (64 , h );
2008
+ kseq_destroy (seq );
2009
+ gzclose (fp );
2010
+ fprintf (stderr , "%ld\t%ld\n" , (long )sum_telo , (long )sum_input );
2011
+ return 0 ;
2012
+ }
2013
+
1923
2014
/* main function */
1924
2015
static int usage ()
1925
2016
{
1926
2017
fprintf (stderr , "\n" );
1927
2018
fprintf (stderr , "Usage: seqtk <command> <arguments>\n" );
1928
- fprintf (stderr , "Version: 1.3-r119 -dirty\n\n" );
2019
+ fprintf (stderr , "Version: 1.3-r120 -dirty\n\n" );
1929
2020
fprintf (stderr , "Command: seq common transformation of FASTA/Q\n" );
2021
+ fprintf (stderr , " size report the number sequences and bases\n" );
1930
2022
fprintf (stderr , " comp get the nucleotide composition of FASTA/Q\n" );
1931
2023
fprintf (stderr , " sample subsample sequences\n" );
1932
2024
fprintf (stderr , " subseq extract subsequences from FASTA/Q\n" );
@@ -1946,7 +2038,7 @@ static int usage()
1946
2038
fprintf (stderr , " gap get the gap locations\n" );
1947
2039
fprintf (stderr , " listhet extract the position of each het\n" );
1948
2040
fprintf (stderr , " hpc homopolyer-compressed sequence\n" );
1949
- fprintf (stderr , " size report the number sequences and bases \n" );
2041
+ fprintf (stderr , " telo identify telomere repeats in asm or long reads \n" );
1950
2042
fprintf (stderr , "\n" );
1951
2043
return 1 ;
1952
2044
}
@@ -1977,6 +2069,7 @@ int main(int argc, char *argv[])
1977
2069
else if (strcmp (argv [1 ], "split" ) == 0 ) return stk_split (argc - 1 , argv + 1 );
1978
2070
else if (strcmp (argv [1 ], "hpc" ) == 0 ) return stk_hpc (argc - 1 , argv + 1 );
1979
2071
else if (strcmp (argv [1 ], "size" ) == 0 ) return stk_size (argc - 1 , argv + 1 );
2072
+ else if (strcmp (argv [1 ], "telo" ) == 0 ) return stk_telo (argc - 1 , argv + 1 );
1980
2073
else {
1981
2074
fprintf (stderr , "[main] unrecognized command '%s'. Abort!\n" , argv [1 ]);
1982
2075
return 1 ;
0 commit comments