Thanks to visit codestin.com
Credit goes to code.bioconductor.org

src/sam_header.c
c1100b9b
 #include "sam_header.h"
 #include <stdio.h>
 #include <string.h>
 #include <ctype.h>
 #include <stdlib.h>
 #include <stdarg.h>
 
 #include "khash.h"
 // Instantiate the khash map type named "str". It is used further down as
 // khash_t(str), keyed by header tag value strings and storing `const char *`.
 KHASH_MAP_INIT_STR(str, const char *)
 
 // Node of a singly linked list with an untyped payload. The same node type is
 // used for the list of header lines and for each line's list of tags.
 struct _HeaderList
 {
     struct _HeaderList *last;   // Hack: tail pointer. Used and maintained only by list_append_to_end. Maintained in the root node only.
     struct _HeaderList *next;   // Following node, NULL at the end of the list.
     void *data;                 // Payload: a HeaderLine* or a HeaderTag* in this file.
 };
 typedef struct _HeaderList list_t;
 typedef list_t HeaderDict;      // A parsed header: list whose payloads are HeaderLine*.
 
 // One field of a header line, i.e. the "XY:value" part.
 typedef struct
 {
     char key[2];    // Two-character tag name, NOT NUL-terminated. Two spaces mark free text that has no key (e.g. CO lines).
     char *value;    // Heap-allocated, NUL-terminated value string; released by sam_header_line_free.
 }
 HeaderTag;
 
 // One header line: the record type after '@' plus its tags in input order.
 typedef struct
 {
     char type[2];   // Two-character record type (e.g. "SQ"), NOT NUL-terminated.
     list_t *tags;   // List whose payloads are HeaderTag*; NULL when the line has no tags.
 }
 HeaderLine;
 
 // Per-record-type tag tables, each terminated by NULL.
 //   o_* .. optional tags, r_* .. required tags, u_* .. tags identifying a record uniquely.
 const char *o_hd_tags[] = {"SO","GO",NULL};
 const char *r_hd_tags[] = {"VN",NULL};
 
 const char *o_sq_tags[] = {"AS","M5","UR","SP",NULL};
 const char *r_sq_tags[] = {"SN","LN",NULL};
 const char *u_sq_tags[] = {"SN",NULL};
 
 const char *o_rg_tags[] = {"CN","DS","DT","FO","KS","LB","PG","PI","PL","PU","SM",NULL};
 const char *r_rg_tags[] = {"ID",NULL};
 const char *u_rg_tags[] = {"ID",NULL};
 
 const char *o_pg_tags[] = {"VN","CL",NULL};
 const char *r_pg_tags[] = {"ID",NULL};
 
 // Known record types. The three arrays below are parallel to `types`: the index
 //  returned by tag_exists(type, types) selects the tables of that record type.
 //  A NULL entry means "no such table"; CO has neither required nor optional
 //  tags and is therefore treated as free text by the parser.
 const char *types[]          = {"HD","SQ","RG","PG","CO",NULL};
 const char **optional_tags[] = {o_hd_tags,o_sq_tags,o_rg_tags,o_pg_tags,NULL,NULL};
 const char **required_tags[] = {r_hd_tags,r_sq_tags,r_rg_tags,r_pg_tags,NULL,NULL};
 const char **unique_tags[]   = {NULL,     u_sq_tags,u_rg_tags,NULL,NULL,NULL};
 
 
 // Write a printf-style diagnostic message to stderr.
 static void debug(const char *format, ...)
 {
     va_list args;

     va_start(args, format);
     vfprintf(stderr, format, args);
     va_end(args);
 }
 
 #if 0
 // Replaced by list_append_to_end
 // Disabled code, kept for reference only: puts a new node carrying `data` in
 //  front of `root` and returns the new node as the list head.
 static list_t *list_prepend(list_t *root, void *data)
 {
     list_t *l = malloc(sizeof(list_t));
     l->next = root;
     l->data = data;
     return l;
 }
 #endif
 
 // Constant-time append: adds a node carrying `data` after the node recorded in
 //  root->last and returns the list head (the new node when `root` is NULL).
 // Relies on the root->last being correct. Do not use with the other list_*
 //  routines unless they are fixed to modify root->last as well.
 static list_t *list_append_to_end(list_t *root, void *data)
 {
     list_t *node = malloc(sizeof(list_t));
     node->data = data;
     node->next = NULL;
     node->last = node;      // a single-node list is its own tail

     if ( root )
     {
         root->last->next = node;
         root->last = node;
         return root;
     }
     return node;
 }
 
 // Append a node carrying `data` by walking to the end of the list (linear in
 //  the list length). Returns the list head, which is the new node when `root`
 //  is NULL. The `last` member of the nodes is not touched here.
 static list_t *list_append(list_t *root, void *data)
 {
     list_t *node = malloc(sizeof(list_t));
     node->data = data;
     node->next = NULL;

     if ( !root )
         return node;

     list_t *tail = root;
     while ( tail->next )
         tail = tail->next;
     tail->next = node;
     return root;
 }
 
 // Release every node of the list. The payloads (node->data) are not freed.
 static void list_free(list_t *root)
 {
     while ( root )
     {
         list_t *next = root->next;
         free(root);
         root = next;
     }
 }
 
 
 
 // Find the two-character tag "XY" in a NULL-terminated array of tag names.
 //  Only the first two characters of `tag` and of each entry are compared.
 //  Returns the index of the first match, or -1 when there is no match or
 //  `tags` is NULL.
 static int tag_exists(const char *tag, const char **tags)
 {
     const char **cur;

     if ( tags==NULL ) return -1;

     for (cur = tags; *cur != NULL; cur++)
     {
         const char *known = *cur;
         if ( known[0]!=tag[0] ) continue;
         if ( known[1]==tag[1] ) return (int)(cur - tags);
     }
     return -1;
 }
 
 
 
 // Mimics the behaviour of getline, except it returns pointer to the next chunk of the text
 //  or NULL if everything has been read. The lineptr should be freed by the caller. The
 //  newline character is stripped.
 //
 //  lineptr .. in/out buffer; may point to NULL on the first call, grown on demand
 //  n       .. in/out capacity of *lineptr in bytes
 //  text    .. NUL-terminated input, positioned at the start of the line to read
 //
 // A line ends at "\n", "\r\n" or a lone "\r"; the terminator is always consumed so
 //  that successive calls make progress. NULL is also returned when memory runs out;
 //  in that case *lineptr and *n are left unchanged and remain owned by the caller.
 static const char *nextline(char **lineptr, size_t *n, const char *text)
 {
     const char *to = text;
     size_t len;

     if ( !*to ) return NULL;

     while ( *to && *to!='\n' && *to!='\r' ) to++;
     len = (size_t)(to - text) + 1;      // line content plus the terminating NUL

     // Advance the pointer for the next call
     if ( *to=='\r' ) to++;
     if ( *to=='\n' ) to++;

     if ( !*lineptr || *n<len )
     {
         // Keep the old buffer reachable if the reallocation fails
         char *grown = realloc(*lineptr, len);
         if ( !grown ) {
             debug("[nextline] Insufficient memory!\n");
             return 0;
         }
         *lineptr = grown;
         *n = len;
     }

     memcpy(*lineptr,text,len-1);
     (*lineptr)[len-1] = 0;

     return to;
 }
 
 // Allocate a tag with a private copy of its value.
 //  name points to "XY", value_from points to the first character of the value string and
 //  value_to points to the last character of the value string. An empty value is
 //  expressed by value_to==value_from-1; any range with value_to<value_from yields
 //  an empty value instead of a negative length.
 // Returns the new tag (owned by the caller), or NULL when memory runs out.
 static HeaderTag *new_tag(const char *name, const char *value_from, const char *value_to)
 {
     size_t len = value_to >= value_from ? (size_t)(value_to - value_from) + 1 : 0;
     HeaderTag *tag = malloc(sizeof(HeaderTag));

     if ( !tag ) return NULL;

     tag->value = malloc(len+1);
     if ( !tag->value )
     {
         free(tag);
         return NULL;
     }

     tag->key[0] = name[0];
     tag->key[1] = name[1];
     // Copy exactly the value; do not read the separator that follows it
     if ( len ) memcpy(tag->value,value_from,len);
     tag->value[len] = 0;
     return tag;
 }
 
 // Return the first tag of `hline` whose two-character key equals key[0..1],
 //  or NULL when the line has no such tag.
 static HeaderTag *header_line_has_tag(HeaderLine *hline, const char *key)
 {
     list_t *node;

     for (node = hline->tags; node; node = node->next)
     {
         HeaderTag *candidate = node->data;
         if ( candidate->key[0]!=key[0] ) continue;
         if ( candidate->key[1]==key[1] ) return candidate;
     }
     return NULL;
 }
 
 
 // Decide how two header lines relate to each other when headers are merged.
 // Return codes:
 //  -1 .. both lines have the same record type but it is not one of `types` (a message is printed)
 //   0 .. different types or unique tags differ or conflicting tags, cannot be merged
 //   1 .. all tags identical -> no need to merge, drop one
 //   2 .. the unique tags match and there are some conflicting tags (same tag, different value) -> error, cannot be merged nor duplicated
 //   3 .. there are some missing complementary tags and no unique conflict -> can be merged into a single line
 // Only tags listed in the required/optional tables of the record type are compared.
 static int sam_header_compare_lines(HeaderLine *hline1, HeaderLine *hline2)
 {
     HeaderTag *t1, *t2;

     if ( hline1->type[0]!=hline2->type[0] || hline1->type[1]!=hline2->type[1] )
         return 0;

     int itype = tag_exists(hline1->type,types);
     if ( itype==-1 ) {
         debug("[sam_header_compare_lines] Unknown type [%c%c]\n", hline1->type[0],hline1->type[1]);
         return -1; // FIXME (lh3): error; I do not know how this will be handled in Petr's code
     }

     if ( unique_tags[itype] )
     {
         t1 = header_line_has_tag(hline1,unique_tags[itype][0]);
         t2 = header_line_has_tag(hline2,unique_tags[itype][0]);
         if ( !t1 || !t2 ) // this should never happen, the unique tags are required
             return 2;

         if ( strcmp(t1->value,t2->value) )
             return 0;   // the unique tags differ, cannot be merged
     }
     if ( !required_tags[itype] && !optional_tags[itype] )
     {
         // Free-text record (e.g. CO). The parser produces no tag at all for an
         //  empty comment ("@CO\t"), so the tag lists may be NULL here.
         if ( !hline1->tags || !hline2->tags )
             return ( !hline1->tags && !hline2->tags ) ? 1 : 0;

         t1 = hline1->tags->data;
         t2 = hline2->tags->data;
         if ( !strcmp(t1->value,t2->value) ) return 1; // identical comments
         return 0;
     }

     int missing=0, itag=0;
     while ( required_tags[itype] && required_tags[itype][itag] )
     {
         t1 = header_line_has_tag(hline1,required_tags[itype][itag]);
         t2 = header_line_has_tag(hline2,required_tags[itype][itag]);
         if ( !t1 && !t2 )
             return 2;       // this should never happen
         else if ( !t1 || !t2 )
             missing = 1;    // there is some tag missing in one of the hlines
         else if ( strcmp(t1->value,t2->value) )
         {
             if ( unique_tags[itype] )
                 return 2;   // the lines have a matching unique tag but have a conflicting tag

             return 0;    // the lines contain conflicting tags, cannot be merged
         }
         itag++;
     }
     itag = 0;
     while ( optional_tags[itype] && optional_tags[itype][itag] )
     {
         t1 = header_line_has_tag(hline1,optional_tags[itype][itag]);
         t2 = header_line_has_tag(hline2,optional_tags[itype][itag]);
         if ( !t1 && !t2 )
         {
             itag++;
             continue;
         }
         if ( !t1 || !t2 )
             missing = 1;    // there is some tag missing in one of the hlines
         else if ( strcmp(t1->value,t2->value) )
         {
             if ( unique_tags[itype] )
                 return 2;   // the lines have a matching unique tag but have a conflicting tag

             return 0;   // the lines contain conflicting tags, cannot be merged
         }
         itag++;
     }
     if ( missing ) return 3;    // there are some missing complementary tags with no conflicts, can be merged
     return 1;
 }
 
 
 // Deep-copy a header line: the record type, and every tag with its own copy
 //  of the value string, in the original order. The caller owns the result.
 static HeaderLine *sam_header_line_clone(const HeaderLine *hline)
 {
     const list_t *node;
     HeaderLine *copy = malloc(sizeof(HeaderLine));

     copy->type[0] = hline->type[0];
     copy->type[1] = hline->type[1];
     copy->tags = NULL;

     for (node = hline->tags; node; node = node->next)
     {
         const HeaderTag *src = node->data;
         HeaderTag *dup = malloc(sizeof(HeaderTag));

         dup->key[0] = src->key[0];
         dup->key[1] = src->key[1];
         dup->value  = strdup(src->value);
         copy->tags = list_append(copy->tags, dup);
     }
     return copy;
 }
 
 // Copy into `out_hline` every tag of `tmpl_hline` that `out_hline` lacks.
 //  Tags already present in `out_hline` keep their value. Returns 0 without
 //  changing anything when the two record types differ, 1 otherwise.
 static int sam_header_line_merge_with(HeaderLine *out_hline, const HeaderLine *tmpl_hline)
 {
     const list_t *node;

     if ( out_hline->type[0]!=tmpl_hline->type[0] || out_hline->type[1]!=tmpl_hline->type[1] )
         return 0;

     for (node = tmpl_hline->tags; node; node = node->next)
     {
         const HeaderTag *src = node->data;
         HeaderTag *added;

         if ( header_line_has_tag(out_hline, src->key) )
             continue;

         added = malloc(sizeof(HeaderTag));
         added->key[0] = src->key[0];
         added->key[1] = src->key[1];
         added->value  = strdup(src->value);
         out_hline->tags = list_append(out_hline->tags,added);
     }
     return 1;
 }
 
 
 static void sam_header_line_free(HeaderLine *hline);    // defined further down; used for cleanup on errors

 // Parse one header line of the form "@XY<TAB>AB:value<TAB>CD:value...".
 //  headerLine .. NUL-terminated line without the trailing newline
 // Returns a newly allocated HeaderLine (release with sam_header_line_free), or
 //  NULL after printing a message when the line is malformed or memory runs out.
 //
 // Record types without tag tables (CO) and record types not listed in `types`
 //  are stored verbatim as one tag with the key "  ", so they can contain
 //  anything, including tabs. For all other types each tag must be at least
 //  "XY:" long; the third character is skipped as the separator.
 static HeaderLine *sam_header_line_parse(const char *headerLine)
 {
     HeaderLine *hline;
     HeaderTag *tag;
     const char *from, *to;
     int itype, free_text;

     from = headerLine;
     if ( *from != '@' ) {
         debug("[sam_header_line_parse] expected '@', got [%s]\n", headerLine);
         return 0;
     }
     to = ++from;

     while (*to && *to!='\t') to++;
     if ( to-from != 2 ) {
         debug("[sam_header_line_parse] expected '@XY', got [%s]\nHint: The header tags must be tab-separated.\n", headerLine);
         return 0;
     }

     hline = malloc(sizeof(HeaderLine));
     if ( !hline ) {
         debug("[sam_header_line_parse] Insufficient memory!\n");
         return 0;
     }
     hline->type[0] = from[0];
     hline->type[1] = from[1];
     hline->tags = NULL;

     // tag_exists returns -1 for unknown types; never index the tag tables with it
     itype = tag_exists(hline->type, types);
     free_text = itype<0 || ( !required_tags[itype] && !optional_tags[itype] );

     from = to;
     while (*to && *to=='\t') to++;
     if ( to-from != 1 ) {
         debug("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from));
         free(hline);
         return 0;
     }
     from = to;
     while (*from)
     {
         while (*to && *to!='\t') to++;

         if ( free_text )
         {
             // CO is a special case, it can contain anything, including tabs
             if ( *to ) { to++; continue; }
             tag = new_tag("  ",from,to-1);
         }
         else
         {
             // Anything shorter than "XY:" has no value part to point at
             if ( to-from < 3 ) {
                 debug("[sam_header_line_parse] malformed tag, expected 'XY:value', on line [%s]\n", headerLine);
                 sam_header_line_free(hline);
                 return 0;
             }
             tag = new_tag(from,from+3,to-1);
         }
         if ( !tag ) {
             debug("[sam_header_line_parse] Insufficient memory!\n");
             sam_header_line_free(hline);
             return 0;
         }

         if ( header_line_has_tag(hline,tag->key) )
             debug("The tag '%c%c' present (at least) twice on line [%s]\n", tag->key[0],tag->key[1], headerLine);
         hline->tags = list_append(hline->tags, tag);

         from = to;
         while (*to && *to=='\t') to++;
         if ( *to && to-from != 1 ) {
             debug("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from));
             sam_header_line_free(hline);    // releases the tags parsed so far as well
             return 0;
         }

         from = to;
     }
     return hline;
 }
 
 
 // Must be of an existing type and all required tags must be present.
 //  Returns 1 when the line is acceptable and 0 otherwise, printing the reason.
 //  Unrecognised tags are reported but do not fail the validation; tags with a
 //  lower case character are user-defined and are accepted silently.
 static int sam_header_line_validate(HeaderLine *hline)
 {
     list_t *tags;
     HeaderTag *tag;
     int itype, itag;

     // Is the type correct?
     itype = tag_exists(hline->type, types);
     if ( itype==-1 )
     {
         debug("The type [%c%c] not recognised.\n", hline->type[0],hline->type[1]);
         return 0;
     }

     // Has all required tags?
     itag = 0;
     while ( required_tags[itype] && required_tags[itype][itag] )
     {
         if ( !header_line_has_tag(hline,required_tags[itype][itag]) )
         {
             debug("The tag [%c%c] required for [%c%c] not present.\n", required_tags[itype][itag][0],required_tags[itype][itag][1],
                 hline->type[0],hline->type[1]);
             return 0;
         }
         itag++;
     }

     // Free-text records (no tag tables at all, e.g. CO) have no named tags to check
     if ( !required_tags[itype] && !optional_tags[itype] )
         return 1;

     // Are all tags recognised?
     for (tags = hline->tags; tags; tags = tags->next)
     {
         tag = tags->data;

         // tag_exists returns an index (possibly 0) when found and -1 when not found
         if ( tag_exists(tag->key,required_tags[itype])!=-1 ) continue;
         if ( tag_exists(tag->key,optional_tags[itype])!=-1 ) continue;

         // Lower case tags are user-defined values.
         if ( islower((unsigned char)tag->key[0]) || islower((unsigned char)tag->key[1]) )
             continue;

         // Neither is lower case, but tag was not recognized.
         debug("Unknown tag [%c%c] for [%c%c].\n", tag->key[0],tag->key[1], hline->type[0],hline->type[1]);
         // return 0; // Even unknown tags are allowed - for forward compatibility with new attributes
     }

     return 1;
 }
 
 
 // Write one header line to `fp` as "@XY" followed by a tab and "AB:value" for
 //  each tag, then a newline. Tags whose key is two spaces are written as the
 //  bare value without the "AB:" prefix.
 static void print_header_line(FILE *fp, HeaderLine *hline)
 {
     list_t *node;

     fprintf(fp, "@%c%c", hline->type[0],hline->type[1]);
     for (node = hline->tags; node; node = node->next)
     {
         HeaderTag *tag = node->data;
         int has_key = ( tag->key[0]!=' ' || tag->key[1]!=' ' );

         fputc('\t', fp);
         if ( has_key )
             fprintf(fp, "%c%c:", tag->key[0],tag->key[1]);
         fputs(tag->value, fp);
     }
     fputc('\n', fp);
 }
 
 
 // Release a header line: every tag with its value string, the list nodes
 //  holding the tags, and the line itself. `hline` must not be NULL.
 static void sam_header_line_free(HeaderLine *hline)
 {
     list_t *node;

     for (node = hline->tags; node; node = node->next)
     {
         HeaderTag *tag = node->data;
         free(tag->value);
         free(tag);
     }
     list_free(hline->tags);
     free(hline);
 }
 
 // Release a whole header: every line with its tags, then the list nodes.
 //  Passing NULL is allowed and does nothing.
 void sam_header_free(void *_header)
 {
     HeaderDict *header = (HeaderDict*)_header;
     list_t *node;

     for (node = header; node; node = node->next)
         sam_header_line_free(node->data);
     list_free(header);
 }
 
 // Deep-copy a header, preserving the order of its lines. Returns NULL for an
 //  empty (NULL) header. The copy is built with list_append.
 HeaderDict *sam_header_clone(const HeaderDict *dict)
 {
     HeaderDict *copy = NULL;
     const HeaderDict *node;

     for (node = dict; node; node = node->next)
     {
         HeaderLine *line = node->data;
         copy = list_append(copy, sam_header_line_clone(line));
     }
     return copy;
 }
 
 // Serialise a header to text, one "@XY\tAB:value...\n" line per header line.
 //  Tags whose key is two spaces are written as the bare value.
 // Returns a newly allocated string (an empty string for a NULL header) that
 //  the caller must free, or NULL when memory runs out.
 char *sam_header_write(const void *_header)
 {
     const HeaderDict *header = (const HeaderDict*)_header;
     const list_t *hlines, *tags;
     size_t len = 0;
     char *out, *p;

     // Calculate the length of the string to allocate
     for (hlines = header; hlines; hlines = hlines->next)
     {
         const HeaderLine *hline = hlines->data;

         len += 4;   // @XY and \n
         for (tags = hline->tags; tags; tags = tags->next)
         {
             const HeaderTag *tag = tags->data;
             len += strlen(tag->value) + 1;                  // \t and the value
             if ( tag->key[0]!=' ' || tag->key[1]!=' ' )
                 len += 3;                                   // XY:
         }
     }

     out = malloc(len+1);
     if ( !out ) return NULL;

     p = out;
     for (hlines = header; hlines; hlines = hlines->next)
     {
         const HeaderLine *hline = hlines->data;

         *p++ = '@';
         *p++ = hline->type[0];
         *p++ = hline->type[1];

         for (tags = hline->tags; tags; tags = tags->next)
         {
             const HeaderTag *tag = tags->data;
             size_t vlen = strlen(tag->value);

             *p++ = '\t';
             if ( tag->key[0]!=' ' || tag->key[1]!=' ' )
             {
                 *p++ = tag->key[0];
                 *p++ = tag->key[1];
                 *p++ = ':';
             }
             memcpy(p, tag->value, vlen);
             p += vlen;
         }
         *p++ = '\n';
     }
     *p = 0;     // terminate right after the data actually written
     return out;
 }
 
 // Parse a complete header text into a list of header lines.
 //  Returns the list (release with sam_header_free), or NULL when headerText is
 //  NULL, contains no lines, or any line fails to parse. Validation of the
 //  parsed lines is switched off (tovalidate is 0).
 void *sam_header_parse2(const char *headerText)
 {
     list_t *lines = NULL;
     char *linebuf = NULL;
     size_t linebuf_size = 0;
     int tovalidate = 0;
     const char *cursor = headerText;

     if ( !headerText )
         return 0;

     while ( (cursor = nextline(&linebuf, &linebuf_size, cursor)) != NULL )
     {
         HeaderLine *hline = sam_header_line_parse(linebuf);
         int accepted = hline && (!tovalidate || sam_header_line_validate(hline));

         if ( !accepted )
         {
             if ( hline ) sam_header_line_free(hline);
             sam_header_free(lines);
             free(linebuf);
             return NULL;
         }
         // With too many (~250,000) reference sequences the header parsing was too slow with list_append.
         lines = list_append_to_end(lines, hline);
     }
     free(linebuf);

     return lines;
 }
 
 // Build a hash table mapping the value of `key_tag` to the value of
 //  `value_tag` for every line of record type `type` that carries both tags.
 //  The table references the strings stored in the header and does not copy
 //  them. A repeated key is reported and its value overwritten. Returns an
 //  empty (not NULL) table for a NULL header.
 void *sam_header2tbl(const void *_dict, char type[2], char key_tag[2], char value_tag[2])
 {
     khash_t(str) *tbl = kh_init(str);
     const list_t *node;

     if (_dict == 0) return tbl; // return an empty (not null) hash table

     for (node = (const HeaderDict*)_dict; node; node = node->next)
     {
         HeaderLine *hline = node->data;
         HeaderTag *key, *value;
         khiter_t k;
         int ret;

         if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] )
             continue;

         key   = header_line_has_tag(hline,key_tag);
         value = header_line_has_tag(hline,value_tag);
         if ( !key || !value )
             continue;

         k = kh_get(str, tbl, key->value);
         if ( k != kh_end(tbl) )
             debug("[sam_header_lookup_table] They key %s not unique.\n", key->value);
         k = kh_put(str, tbl, key->value, &ret);
         kh_value(tbl, k) = value->value;
     }
     return tbl;
 }
 
 // Collect the value of `key_tag` from every line of record type `type`.
 //  _n .. set to the number of values returned
 // Returns a malloc'ed array of *_n pointers into the header (free the array
 //  only, not the strings), or NULL with *_n==0 when nothing matches or memory
 //  runs out.
 char **sam_header2list(const void *_dict, char type[2], char key_tag[2], int *_n)
 {
     const list_t *l;
     int max = 0, n = 0;
     char **ret = NULL;

     *_n = 0;
     for (l = (const HeaderDict*)_dict; l; l = l->next)
     {
         HeaderLine *hline = l->data;
         HeaderTag *key;

         if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] )
             continue;

         key = header_line_has_tag(hline,key_tag);
         if ( !key )
             continue;

         if (n == max) {
             // Grow through a temporary so the old array is not lost on failure
             int new_max = max? max<<1 : 4;
             char **grown = realloc(ret, new_max * sizeof *ret);
             if ( !grown ) {
                 debug("[sam_header2list] Insufficient memory!\n");
                 free(ret);
                 return NULL;
             }
             ret = grown;
             max = new_max;
         }
         ret[n++] = key->value;
     }
     *_n = n;
     return ret;
 }
 
 // Iterate over the lines of record type `type` that carry both `key_tag` and
 //  `value_tag`.
 //  iter .. the header on the first call, afterwards the value returned by the
 //          previous call
 // On a match *_key and *_value are set to the tag values (pointers into the
 //  header) and the position following the matching line is returned; this is
 //  NULL when the match was the last line. NULL is also returned, with *_key
 //  and *_value untouched, when no further line matches.
 void *sam_header2key_val(void *iter, const char type[2], const char key_tag[2], const char value_tag[2], const char **_key, const char **_value)
 {
     list_t *l;

     for (l = iter; l; l = l->next)
     {
         HeaderLine *hline = l->data;
         HeaderTag *key, *value;

         if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] )
             continue;

         key   = header_line_has_tag(hline,key_tag);
         value = header_line_has_tag(hline,value_tag);
         // Both tags are dereferenced below, so skip the line if either is missing
         if ( !key || !value )
             continue;

         *_key = key->value;
         *_value = value->value;
         return l->next;
     }
     return NULL;
 }
 
 // Look up `key` in a table created by sam_header2tbl. Returns the stored
 //  value, or 0 when the key is not in the table. `h` must not be NULL.
 const char *sam_tbl_get(void *h, const char *key)
 {
     khash_t(str) *tbl = (khash_t(str)*)h;
     khint_t it = kh_get(str, tbl, key);

     if ( it == kh_end(tbl) )
         return 0;
     return kh_val(tbl, it);
 }
 
 // Number of entries in the table as reported by kh_size; 0 for a NULL table.
 int sam_tbl_size(void *h)
 {
     khash_t(str) *tbl = (khash_t(str)*)h;

     if ( !h )
         return 0;
     return kh_size(tbl);
 }
 
 void sam_tbl_destroy(void *h)
 {
 	khash_t(str) *tbl = (khash_t(str)*)h;
 	kh_destroy(str, tbl);
 }
 
 // Merge `n` headers into a newly allocated one.
 //  _dicts .. array of n headers as returned by sam_header_parse2
 // The result starts as a copy of the first header. Each line of the other
 //  headers is compared with the lines collected so far (see
 //  sam_header_compare_lines): identical lines are dropped, complementary lines
 //  are merged into the existing one and everything else is appended.
 // Returns the merged header (release with sam_header_free), or NULL when n<2
 //  or when two lines conflict; nothing is leaked in the latter case.
 // NOTE(review): a negative status (record type not in `types`) falls through
 //  to "already present", so such a line is not copied once the output holds a
 //  line of the same unknown type - confirm this is intended.
 void *sam_header_merge(int n, const void **_dicts)
 {
     const HeaderDict **dicts = (const HeaderDict**)_dicts;
     HeaderDict *out_dict;
     int idict, status;

     if ( n<2 ) return NULL;

     out_dict = sam_header_clone(dicts[0]);

     for (idict=1; idict<n; idict++)
     {
         const list_t *tmpl_hlines = dicts[idict];

         while ( tmpl_hlines )
         {
             list_t *out_hlines = out_dict;
             int inserted = 0;
             while ( out_hlines )
             {
                 status = sam_header_compare_lines(tmpl_hlines->data, out_hlines->data);
                 if ( status==0 )
                 {
                     out_hlines = out_hlines->next;
                     continue;
                 }

                 if ( status==2 )
                 {
                     print_header_line(stderr,tmpl_hlines->data);
                     print_header_line(stderr,out_hlines->data);
                     debug("Conflicting lines, cannot merge the headers.\n");
                     sam_header_free(out_dict);  // do not leak the partially merged header
                     return 0;
                 }
                 if ( status==3 )
                     sam_header_line_merge_with(out_hlines->data, tmpl_hlines->data);

                 inserted = 1;
                 break;
             }
             if ( !inserted )
                 out_dict = list_append(out_dict, sam_header_line_clone(tmpl_hlines->data));

             tmpl_hlines = tmpl_hlines->next;
         }
     }

     return out_dict;
 }
 
 // Build a row-major table of tag values for all lines of record type `type`.
 //  tags .. NULL-terminated array of two-character tag names (the columns)
 //  n    .. set to the number of rows, i.e. matching lines
 // Row r, column i is stored at out[r*ntags+i] and is NULL when the line lacks
 //  that tag. The entries point into the header; free only the returned array.
 //  Returns NULL with *n==0 when the header is NULL, nothing matches, or memory
 //  runs out.
 char **sam_header2tbl_n(const void *dict, const char type[2], const char *tags[], int *n)
 {
     const list_t *l = dict;
     char **out = NULL;
     int nout = 0;
     int i, ntags = 0;

     *n = 0;
     if ( !l ) return NULL;

     while ( tags[ntags] ) ntags++;

     for ( ; l; l = l->next)
     {
         HeaderLine *hline = l->data;
         size_t nelem;
         char **grown;

         if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] )
             continue;

         // Never request zero bytes (possible with an empty tag list), and grow
         //  through a temporary so the old table is not lost on failure
         nelem = ((size_t)nout+1) * (size_t)ntags;
         grown = realloc(out, sizeof(char*) * (nelem ? nelem : 1));
         if ( !grown ) {
             debug("[sam_header2tbl_n] Insufficient memory!\n");
             free(out);
             return NULL;
         }
         out = grown;

         for (i=0; i<ntags; i++)
         {
             HeaderTag *key = header_line_has_tag(hline, tags[i]);
             out[nout*ntags+i] = key ? key->value : NULL;
         }
         nout++;
     }
     *n = nout;
     return out;
 }