// [[Rcpp::depends(BH)]]
#include <Rcpp.h>
using namespace Rcpp;
#include <cstdlib>
#include <sstream>
#include <iostream>
#include <fstream>
#include <map>
#include <vector>
#include <string>
#include <cmath>
#include <cstdlib>
#include <iomanip>
// #include <zlib.h>
#include <algorithm>
#include <boost/algorithm/string.hpp>
#include <boost/iostreams/device/file.hpp>
#include <boost/iostreams/filtering_stream.hpp>
#include <boost/iostreams/filtering_streambuf.hpp>
// #include <boost/iostreams/filter/gzip.hpp>
#include <boost/iostreams/copy.hpp>
#include <boost/assign.hpp>
#include <boost/algorithm/string/trim.hpp>
// #include <RcppGSL.h>
// #include <gsl/gsl_cdf.h>
// #include <gsl/gsl_sf_gamma.h>
// #include <gsl/gsl_sf_exp.h>
#include "data_types.h"
#include "matrix.h"
#include "utils.h"
using namespace std;
using namespace boost;
// Multiply every element of `vec` in place by the factor `v`.
// An empty vector is left untouched.
void multi_vec_by_number(vector<double>& vec, double v) {
  for (vector<double>::iterator element = vec.begin(); element != vec.end(); ++element) {
    *element *= v;
  }
}
// Overwrite every element of `vec` with zero.
// Returns false (and does nothing) when `vec` is empty, true otherwise.
bool assign_vector_zeros(vector<double>& vec) {
  if (vec.empty()) {
    return false;
  }
  std::fill(vec.begin(), vec.end(), 0.0);
  return true;
}
// Return true when `str` ends with `suffix` (an empty suffix always matches).
// Approach taken from
// https://stackoverflow.com/questions/874134/find-out-if-string-ends-with-another-string-in-c/42844629
bool str_ends_with(string& str, string& suffix) {
  const string::size_type suffix_len = suffix.length();
  // A suffix longer than the string itself can never match.
  if (str.length() < suffix_len) {
    return false;
  }
  // Compare only the trailing suffix_len characters of str.
  return str.compare(str.length() - suffix_len, suffix_len, suffix) == 0;
}
// Replace the contents of `all_chrs` with the 24 chromosome labels in the
// order "chr1" ... "chr22", "chrX", "chrY". Any previous content is discarded.
void make_complete_chromosomes(vector<string> & all_chrs){
  all_chrs.clear();
  // Autosomes: chr1 .. chr22
  for (int autosome = 1; autosome <= 22; ++autosome) {
    ostringstream label;
    label << "chr" << autosome;
    all_chrs.push_back(label.str());
  }
  // Sex chromosomes come last.
  const char * const sex_chromosomes[] = { "chrX", "chrY" };
  for (int k = 0; k < 2; ++k) {
    all_chrs.push_back(sex_chromosomes[k]);
  }
}
// Read a plain-text file that holds one string per line.
//   input_file: path of the file to read.
//   strs:       output; every line read is appended (existing content is kept).
// Reading stops at end of file or at the first empty line, whichever comes first.
// If the file cannot be opened, an error is written to Rcpp::Rcerr and `strs`
// is left unchanged.
void read_list_strings_from_file(string input_file, vector<string>& strs) {
  ifstream fin(input_file.c_str());
  if (fin.fail()) {
    Rcpp::Rcerr << "Error: Unable to open " << input_file << " in read_list_strings_from_file()" << endl;
    Rcpp::Rcerr << "Exit." << endl;
    return;
  }
  string line;
  // Loop on the result of getline() instead of eof(): a stream that turns bad
  // without reaching EOF would otherwise keep re-appending the previous line forever.
  while (std::getline(fin, line)) {
    if (line.empty()) {
      // an empty line marks the end of the list
      break;
    }
    strs.push_back(line);
  }
  // fin is closed by its destructor.
}
// File format (each line has two strings, TAB delimited):
// Column 1: string1
// Column 2: string2
//
// We obtain the unique values of Column 1, and build a map between unique value of Column 1 and their corresponding values in Column 2
// Read a two-column text file and collect both columns plus a grouping map.
//   input_file:      path of the file to read.
//   strs1, strs2:    outputs; column 1 / column 2 of every line, appended in file order.
//   map_str1tostrs2: output; for each distinct column-1 value, the column-2 values
//                    of all lines carrying that key (appended in file order).
//   delimit:         set of delimiter characters (default: TAB). Columns beyond
//                    the second are ignored.
// Reading stops at end of file or at the first empty line. A progress line and a
// per-key summary are printed (Rcerr / Rcout) exactly as before.
// Errors:
//   - file cannot be opened: reported on Rcpp::Rcerr; nothing is read, the summary
//     of the (unchanged) map is still printed.
//   - a line with fewer than 2 columns: raises an R error via Rcpp::stop(). The
//     original exit() call is disabled, and continuing would index past the end
//     of the token vector.
void read_two_columns_of_list_strings_from_file(string input_file, vector<string>& strs1, vector<string>& strs2,
    map<string, vector<string> > & map_str1tostrs2, string delimit="\t") {
  Rcpp::Rcerr << "Load file '" << input_file << "'" << endl;
  ifstream fin(input_file.c_str());
  if (fin.fail()) {
    Rcpp::Rcerr << "Error: Unable to open " << input_file << " in read_two_columns_of_list_strings_from_file()" << endl;
    Rcpp::Rcerr << "Exit." << endl;
    // No early return: the read loop below is a no-op on a failed stream and
    // the summary is printed as it always was.
  }
  string line;
  unsigned long i = 0;  // 1-based number of the line being processed
  // Loop on getline() itself so a stream error cannot cause an endless loop.
  while (std::getline(fin, line)) {
    if (line.empty()) {
      // an empty line marks the end of the data
      break;
    }
    i++;
    vector<string> items;
    split(items, line, is_any_of(delimit));
    if (items.size() < 2) {
      ostringstream msg;
      msg << "Error (read_two_columns_of_list_strings_from_file):\n"
          << " File: " << input_file << "\n"
          << " Line " << i << ": There are less than 2 columns!";
      Rcpp::stop(msg.str());
    }
    strs1.push_back(items[0]);
    strs2.push_back(items[1]);
    // operator[] creates an empty value list the first time a key is seen.
    map_str1tostrs2[items[0]].push_back(items[1]);
  }
  // print a summary of the map: one "key<TAB>count" line per key, then the total
  size_t total = 0;
  for (map<string, vector<string> >::iterator it = map_str1tostrs2.begin(); it != map_str1tostrs2.end(); ++it) {
    Rcpp::Rcout << it->first << "\t" << it->second.size() << endl;
    total += it->second.size();
  }
  Rcpp::Rcerr << "Total: " << total << " elements" << endl;
}
// wig File format
// http://www.ensembl.org/info/website/upload/wig.html
// http://genome.ucsc.edu/goldenpath/help/wiggle.html
// wig file format: Wiggle element data values can be integer or real, positive or negative. Chromosome positions are 1-relative, i.e. the first base is 1. Only positions specified have data; unspecified positions will be empty.
// Wiggle format is line-oriented. For wiggle custom tracks, the first line must be a track definition line (i.e., track type=wiggle_0), which designates the track as a wiggle track and adds a number of options for controlling the default display.
//
// track type=wiggle_0 name="UCSD.Adipose_Tissue.Bisulfite-Seq.STL003:methRatio" visibility=full color=20,150,20 altColor=150,20,20 windowingFunction=mean
// variableStep chrom=chrN [span=windowSize]
// chromStartA dataValueA
// chromStartB dataValueB
// ... etc ... ... etc ...
//
// For example
// track type=wiggle_0 name="UCSD.Adipose_Tissue.Bisulfite-Seq.STL003:methRatio" visibility=full color=20,150,20 altColor=150,20,20 windowingFunction=mean
// variableStep chrom=chr1
// 10469 0.75
// 10470 0.75
// 10471 0.833333333333333
// 10472 0.833333333333333
// 10484 0.928571428571429
// 10485 0.928571428571429
// We obtain the unique values of Column 1, and build a map between unique value of Column 1 and their corresponding values in Column 2
//
// output variable: map<string,map<int,double> > data, it is "chr -> map(position -> value)"
// The input line is not the first header line
// Parse one line of a wig file (any line except the track header) and update `data`.
//   line: the line to parse.
//   chr:  in/out; the current chromosome. Set by declaration lines, used by data lines.
//   data: in/out; "chr -> (position -> value)".
//
// A line whose first character is not a digit is treated as a declaration line,
// e.g. "variableStep chrom=chr2" or "fixedStep chrom=chr3 start=400601 step=100".
// Its "chrom=" token (if any) becomes the current chromosome, and an entry for
// that chromosome is created if missing. Values already stored for the chromosome
// are kept, so a second declaration line for the same chromosome no longer
// discards earlier data.
//
// A line starting with a digit is a "chromStart dataValue" line; the two fields
// may be separated by TABs and/or spaces. Lines with only one field (e.g.
// fixedStep value-only lines) carry no position and are skipped. Empty lines are ignored.
void process_one_line_of_wig_file(string & line, string & chr, map<string,map<int,double> > & data) {
  if (line.empty()) return;
  // isdigit() is only defined for values representable as unsigned char.
  if (!std::isdigit(static_cast<unsigned char>(line[0]))) {
    // declaration line: look for the whitespace-delimited token "chrom=<name>"
    std::istringstream tokens(line);
    string token;
    while (tokens >> token) {
      if (token.compare(0, 6, "chrom=") == 0) {
        chr = token.substr(6);
        break;
      }
    }
    // insert() is a no-op when the chromosome is already present.
    data.insert(std::make_pair(chr, map<int,double>()));
    return;
  }
  // data line: "<position><blanks><value>"
  const char * const blanks = " \t";
  const string::size_type position_end = line.find_first_of(blanks);
  if (position_end == string::npos) return;   // no second field
  const string::size_type value_begin = line.find_first_not_of(blanks, position_end);
  if (value_begin == string::npos) return;    // only trailing blanks after the position
  const int position = atoi(line.c_str());    // atoi stops at the first non-digit
  const double value = atof(line.c_str() + value_begin);
  data[chr][position] = value;
}
// Load a plain-text wig file into `data` ("chr -> (position -> value)").
//   wig_file: path of the wig file.
//   data:     output; filled through process_one_line_of_wig_file().
// The first line (the track definition) is skipped unconditionally. Reading
// stops at end of file or at the first empty line after the header.
// If the file cannot be opened, an error is written to Rcpp::Rcerr and `data`
// is left unchanged.
void read_wig_file(string wig_file, map<string,map<int,double> > & data) {
  ifstream fin(wig_file.c_str());
  if (fin.fail()) {
    Rcpp::Rcerr << "Error: Unable to open " << wig_file << " in read_wig_file()" << endl;
    return;
  }
  string line;
  // skip the header line of the wig file
  if (!std::getline(fin, line)) return;
  string chr = "";  // current chromosome, updated by declaration lines
  // Loop on getline() itself so a stream error cannot cause an endless loop.
  while (std::getline(fin, line)) {
    if (line.empty()) {
      // an empty line marks the end of the data
      break;
    }
    process_one_line_of_wig_file(line, chr, data);
  }
  // fin is closed by its destructor.
}
//void print_one_line_of_data(string chr, map<int,double> & position2value, stringstream & out) {
//map<int,double>::iterator it2;
//for (it2=position2value.begin(); it2!=position2value.end(); ++it2) {
//out << chr << "\t" << it2->first << "\t" << it2->second << endl;
//}
//}
// "data" is wig data
// Print wig data ("chr -> (position -> value)") to Rcpp::Rcout in another
// format, one TAB-delimited line per position:
// Column 1: chr
// Column 2: position
// Column 3: value
// Lines follow the iteration order of the two nested maps.
void print_wig_data(map<string,map<int,double> > & data) {
  typedef map<string,map<int,double> >::iterator ChrIterator;
  typedef map<int,double>::iterator SiteIterator;
  for (ChrIterator chr_entry = data.begin(); chr_entry != data.end(); ++chr_entry) {
    const string & chr_name = chr_entry->first;
    map<int,double> & sites = chr_entry->second;
    for (SiteIterator site = sites.begin(); site != sites.end(); ++site) {
      Rcpp::Rcout << chr_name << "\t" << site->first << "\t" << site->second << endl;
    }
  }
}
// Store `value` at `position` inside bin `bin_internal_index` of chromosome
// `chr` in `wig_bins_data`. The chromosome and bin entries are created on
// first use; an existing value at the same position is overwritten.
void add_a_position_value_to_wig_bins_data(Wig_bins_data & wig_bins_data,
    int bin_internal_index, string chr, int position, double value)
{
  // operator[] default-constructs the missing level on the way down.
  map<int,map<int,double> > & bins_of_chr = wig_bins_data[chr];
  map<int,double> & sites_of_bin = bins_of_chr[bin_internal_index];
  sites_of_bin[position] = value;
}
// "data" is wig data
// Bin all wig data into "wig_bins_data".
// For every (chr, position, value) in `data`, the bin containing the position is
// looked up with find_bin_of_position(); the value is recorded only when that
// bin is a marker bin (its entry in `bins_index` is non-zero).
void binning_wig_data(map<string,map<int,double> > & data,
    Bins_end_coord & bins_end_coord, Bins_index & bins_index,
    Wig_bins_data & wig_bins_data)
{
  typedef map<string,map<int,double> >::iterator ChrIterator;
  typedef map<int,double>::iterator SiteIterator;
  for (ChrIterator chr_entry = data.begin(); chr_entry != data.end(); ++chr_entry) {
    string chr = chr_entry->first;
    map<int,double> & sites = chr_entry->second;
    for (SiteIterator site = sites.begin(); site != sites.end(); ++site) {
      int position = site->first;
      double value = site->second;
      int bin = find_bin_of_position(bins_end_coord, chr, position);
      // bin == -1: the position belongs to no bin. This can happen if the chr of
      // this wig data doesn't exist in the genome bins annotation "bins_end_coord";
      // such positions are skipped without a warning.
      // A marker index of 0 denotes a complementary (non-marker) bin, also skipped.
      // bins_index is consulted only after a bin has been found.
      if (bin != -1 && bins_index[chr][bin] != 0) {
        add_a_position_value_to_wig_bins_data(wig_bins_data, bin, chr, position, value);
      }
    }
  }
}
// "wig_of_a_bin" is a map of "position -> value".
// We print this map to `out` as one line:
//   <number of pairs> TAB <position>:<value> <position>:<value> ...
// i.e. the pair count, a TAB, then the position/value pairs separated by a single
// space, each pair joined by ":". The line is terminated with endl.
// An empty map is printed as "0<TAB>-" so that the line is still terminated and
// has the same shape print_wig_bins_data() uses for bins without data
// (previously an empty map left the line unterminated).
void print_wig_of_a_bin(map<int,double> & wig_of_a_bin, ostream & out) {
  out << wig_of_a_bin.size() << "\t";
  if (wig_of_a_bin.empty()) {
    out << "-" << endl;
    return;
  }
  map<int,double>::iterator it = wig_of_a_bin.begin();
  // first pair without a leading separator, every later pair preceded by a space
  out << it->first << ":" << it->second;
  for (++it; it != wig_of_a_bin.end(); ++it) {
    out << " " << it->first << ":" << it->second;
  }
  out << endl;
}
// "data" is wig data
// print wig data to marker bins:
//
//Each line is a marker bin (or feature). All columns are delimited by TAB. There is one header line.
//Column 1: marker_index, 1-based index. Those bins that are not markers, are indexed as 0.
//Column 2: chr
//Column 3: start coordinate of bin (1-base)
//Column 4: end coordinate of bin (1-base). The range of the bin is [start, end)
//Column 5: marker_type. "I" is Marker_type_I, "II" is Marker_type_II, "-" is the complementary bin, only facilitating the searching.
//Column 6: a list of position/value pairs, separated by a space; each pair is separated by ":"
//
// The following is an example file
// Print the binned wig data to Rcpp::Rcout, one line per marker bin.
// Chromosomes are visited in the order produced by make_complete_chromosomes();
// within a chromosome, bins are visited by increasing internal index and bins
// whose marker index is 0 (complementary bins) are skipped.
// Each line is the bin's text from `bins_info`, a TAB, and then either the
// output of print_wig_of_a_bin() or "0<TAB>-" when no wig data exists for the bin.
void print_wig_bins_data(Wig_bins_data & wig_bins_data, Bins_index & bins_index,
    Bins_info & bins_info)
{
  vector<string> all_chrs;
  make_complete_chromosomes( all_chrs );
  for (size_t c = 0; c < all_chrs.size(); ++c) {
    string chr = all_chrs[c];
    vector<int> & marker_ids = bins_index[chr];
    vector<string> & bin_labels = bins_info[chr];
    // Wig data of this chromosome, or NULL when the chromosome has none.
    map<int,map<int,double> > * chr_wig = NULL;
    if (wig_bins_data.find(chr) != wig_bins_data.end()) {
      chr_wig = &wig_bins_data[chr];
    }
    const int n_bins = static_cast<int>(marker_ids.size());
    for (int bin = 0; bin < n_bins; ++bin) {
      if (marker_ids[bin] == 0) {
        // complementary bin's index is zero: not a marker, nothing to print
        continue;
      }
      Rcpp::Rcout << bin_labels[bin] << "\t"; // print the first five columns
      const bool bin_has_data = (chr_wig != NULL) && (chr_wig->find(bin) != chr_wig->end());
      if (bin_has_data) {
        // number of position/value pairs followed by the list of these pairs
        print_wig_of_a_bin((*chr_wig)[bin], Rcpp::Rcout);
      } else {
        // marker bin's info is still printed, but with no wig data
        Rcpp::Rcout << "0" << "\t" << "-" << endl;
      }
    }
  }
}
//file of a single value (of each tissue) for each marker (generated by program "build_features_bins.py")
//This file should have the same number lines (bins) as the number of markers (excluding complementary bins) in bins annotation file.
//Each line is a marker bin (or feature). All columns are delimited by TAB. There is one header line.
//The first 5 columns are the same as File 1.
//Column 1: marker_index, 1-based index. Only marker bins are included, those complementary bins do not appear in this file.
//Column 2: chr
//Column 3: start coordinate of bin (1-base)
//Column 4: end coordinate of bin (1-base). The range of the bin is [start, end)
//Column 5: marker_type. "I" is Marker_type_I, "II" is Marker_type_II, "-" is the complementary bin, only facilitating the searching.
//Column 6+: values for this marker (each column is a tissue)
//For example:
//marker.index b-cell colon liver lung monocyte neutrophils stomach t-cell median.of.tissues
//12 0.652 2.51:0.377 10.4:1.48 21.1:2.36 473:19.8 0.862 88.1:9.11 80.7:10.6 0.652,0.912,0.905,0.911,0.959,0.862,0.905,0.883
//38 0.0952 9.84:7.56 122:74.3 13.5:11.6 1.8:7.4 0.125 10:8.83 2.3:9.72 0.0952,0.5521,0.6341,0.5359,0.1653,0.1250,0.4667,0.1591
//391 0.743 28.5:2.31 13.1:0.986 40.9:2.39 0.682 0.722 6.72:0.627 0.955 0.743,0.944,0.950,0.947,0.682,0.722,0.940,0.955
//600 0.604 39.1:3.1 19.3:2.01 8.43:0.814 0.626 0.575 2.75:0.397 8.57:0.429 0.604,0.929,0.935,0.930,0.626,0.575,0.940,1.000
//
// input file is TAB-delimited plain text. The file has a header line
// value_column_start_index: the column index of the first value (index is 1-base). In this format, it should be 6.
// marker2beta: a map for bin_index -> a vector of value. bin_index is always 1-base. In detail, it is "map<unsigned int, vector<double> >"
void read_tissue_markers_txt_file(string tissue_markers_file, int value_column_start_index, int num_tissue_types, Bins2Values & marker2beta, vector<string> & value_names)
{
// char buf[102400]; // suppose a line has max 102400 char.
//cerr << "reading '" << tissue_markers_file << "'" << endl;
unsigned long i_line=0;
// input is a plain text file
ifstream fin;
fin.open(tissue_markers_file.c_str());
if (fin.fail()){
Rcpp::Rcerr << "Error: Unable to open " << tissue_markers_file << " in read_tissue_markers_txt_file()" << endl;
// exit(EXIT_FAILURE);
}
string line;
while (!fin.eof()) {
getline(fin, line);
trim(line);
//cerr << "Line: " << line << endl;
if (i_line==0) {
// skip the header line of tissue_markers_file
vector<string> items;
split(items, line, is_any_of("\t"));
for (int i=value_column_start_index-1; i<num_tissue_types+value_column_start_index-1; i++) {
value_names.push_back( items[i] );
}
i_line++;
continue;
}
if (line.empty()) {
// this is the last line of the file
break;
}
vector<string> items;
split(items, line, is_any_of("\t"));
if (value_column_start_index>items.size()) {
Rcpp::Rcerr << "Error(read_tissue_markers_txt_file): the value column starts from Column " << value_column_start_index << " that is > total number of columns (" << items.size() << ") in Line " << i_line+1 << "!" << endl;
// exit(EXIT_FAILURE);
}
int marker_index = (int)atoi(items[0].c_str());
vector<double> values;
for (int i=value_column_start_index-1; i<num_tissue_types+value_column_start_index-1; i++) {
if (items[i].find(':') != std::string::npos) {
// found ':', two values
vector<string> pair;
split(pair, items[i], is_any_of(":"));
if (pair.size()!=2) {
Rcpp::Rcerr << "Error(read_tissue_markers_txt_file): the value column " << (i+1) << " has " << pair.size() << " values (only " << pair.size() << " values) in Line " << i_line+1 << "!" << endl;
// exit(EXIT_FAILURE);
}
double a = (double)atof(pair[0].c_str());
double b = (double)atof(pair[1].c_str());
values.push_back( a/(a+b) );
} else {
// not found ':', just a single value
values.push_back( (double)atof(items[i].c_str()) );
}
}
marker2beta[marker_index] = values;
i_line++;
}
fin.close();
}
// void read_tissue_markers_gz_file(string tissue_markers_file, int value_column_start_index, int num_tissue_types, Bins2Values & marker2beta, vector<string> & value_names)
// {
// // char buf[102400]; // suppose a line has max 102400 char.
// //cerr << "reading '" << tissue_markers_file << "'" << endl;
// // input is a gzip file
// // gzFile fin = gzopen(tissue_markers_file.c_str(), "rb");
// // if (fin == NULL){
// // cerr << "Error: Unable to open " << tissue_markers_file << " in read_tissue_markers_gz_file()" << endl;
// // exit(EXIT_FAILURE);
// // }
//
// // See webpage: https://techoverflow.net/2013/11/03/c-iterating-lines-in-a-gz-file-using-boostiostreams/
// std::ifstream file(tissue_markers_file.c_str(), std::ios_base::in | std::ios_base::binary);
// iostreams::filtering_streambuf<iostreams::input> inbuf;
// inbuf.push(iostreams::gzip_decompressor());
// inbuf.push(file);
// //Convert streambuf to istream
// std::istream instream(&inbuf);
//
// std::string line;
// unsigned long i_line=0;
// while(std::getline(instream, line)) {
// trim(line);
// //cerr << "Line: " << line << endl;
// if (i_line==0) {
// // skip the header line of tissue_markers_file
// vector<string> items;
// split(items, line, is_any_of("\t"));
// for (int i=value_column_start_index-1; i<num_tissue_types+value_column_start_index-1; i++) {
// value_names.push_back( items[i] );
// }
// i_line++;
// continue;
// }
// if (line.empty()) {
// // this is the last line of the file
// break;
// }
// vector<string> items;
// split(items, line, is_any_of("\t"));
// if (value_column_start_index>items.size()) {
// Rcpp::Rcerr << "Error(read_tissue_markers_gz_file): the value column starts from Column " << value_column_start_index << " that is > total number of columns (" << items.size() << ") in Line " << i_line+1 << "!" << endl;
// // exit(EXIT_FAILURE);
// }
// int marker_index = (int)atoi(items[0].c_str());
// vector<double> values;
// for (int i=value_column_start_index-1; i<num_tissue_types+value_column_start_index-1; i++) {
// if (items[i].find(':') != std::string::npos) {
// // found ':', two values
// vector<string> pair;
// split(pair, items[i], is_any_of(":"));
// if (pair.size()!=2) {
// Rcpp::Rcerr << "Error(read_tissue_markers_gz_file): the value column " << (i+1) << " has " << pair.size() << " values (only " << pair.size() << " values) in Line " << i_line+1 << "!" << endl;
// // exit(EXIT_FAILURE);
// }
// double a = (double)atof(pair[0].c_str());
// double b = (double)atof(pair[1].c_str());
// values.push_back( a/(a+b) );
// } else {
// // not found ':', just a single value
// values.push_back( (double)atof(items[i].c_str()) );
// }
// }
// marker2beta[marker_index] = values;
// i_line++;
// }
// file.close();
// }
//file of a single value (of each tissue) for each marker (generated by program "build_features_bins.py")
//This file should have the same number lines (bins) as the number of markers (excluding complementary bins) in bins annotation file.
//Each line is a marker bin (or feature). All columns are delimited by TAB. There is one header line.
//The first 5 columns are the same as File 1.
//Column 1: marker_index, 1-based index. Only marker bins are included, those complementary bins do not appear in this file.
//Column 2: chr
//Column 3: start coordinate of bin (1-base)
//Column 4: end coordinate of bin (1-base). The range of the bin is [start, end)
//Column 5: marker_type. "-" is the complementary bin, only facilitating the searching.
//Column 6+: paired values for this marker (each column is a tissue). Each pair contains two values, delimited by "," (as in the example below) or ":"
//For example:
//marker_index chr start end marker_type normal_plsa tumor_brca
//1 chr1 855266 855766 brca 47.2,28.9 8.48,10.7
//2 chr1 969796 970296 brca 6.59,10.9 5.58,8.9
//3 chr1 1099044 1099544 brca 7.22,20.8 8.79,3.2
//4 chr1 1109315 1109815 brca 30.8,20.7 25.8,78.9
//...
//5816 chr22 50962109 50962609 I 91.2,10.7 20.7,60.7
//5817 chr22 50987071 50987571 II 1.04,10.2 15.7,1.8
//5818 chr22 51016754 51017254 II 32.1,10.7 10.8,2.9
//5819 chr22 51041995 51042495 II 1.29,1.9 5.7,8.9
//5820 chr22 51136171 51136671 II 53.1,10.7 3.0,7.1
//
// input file is TAB-delimited plain text. The file has a header line
// value_column_start_index: the column index of the first value (index is 1-base). In this format, it should be 6.
// bins2pairedvalues: a map for bin_index -> a pair of two vectors of values. bin_index is always 1-base. In detail, it is "map<int, pair<vector<double>,vector<double> > >"
void read_paired_values_file_of_bins(string value_file_of_bins,
unsigned int value_column_start_index, Bins2PairedValues & bins2pairedvalues,
vector<string> & value_names)
{
//cerr << "reading '" << value_file_of_bins << "'" << endl;
unsigned long i_line=0;
// input is a plain text file
ifstream fin;
fin.open(value_file_of_bins.c_str());
if (fin.fail()){
Rcpp::Rcerr << "Error: Unable to open " << value_file_of_bins << " in read_paired_values_file_of_bins()" << endl;
// exit(EXIT_FAILURE);
}
string line;
while (!fin.eof()) {
getline(fin, line);
//cerr << "Line: " << line << endl;
if (i_line==0) {
// skip the header line of value_file_of_bins
vector<string> items;
split(items, line, is_any_of("\t"));
for (int i=value_column_start_index-1; i<items.size(); i++) {
value_names.push_back( items[i] );
}
i_line++;
continue;
}
if (line.empty()) {
// this is the last line of the file
break;
}
vector<string> items;
split(items, line, is_any_of("\t"));
if (value_column_start_index>items.size()) {
Rcpp::Rcerr << "Error(read_paired_values_file_of_bins): the value column starts from Column " << value_column_start_index << " that is > total number of columns (" << items.size() << ") in Line " << i_line+1 << "!" << endl;
// exit(EXIT_FAILURE);
}
int marker_index = (int)atoi(items[0].c_str());
vector<double> values1, values2;
for (int i=value_column_start_index-1; i<items.size(); i++) {
vector<string> subitems;
string delimit = ",:";
//cout << i << ": " << items[i] << endl;
split(subitems, items[i], is_any_of(delimit));
if (subitems.size()!=2) {
Rcpp::Rcerr << "Error(read_paired_values_file_of_bins): the value column " << (i+1) << " has no paired values (only " << subitems.size() << " values) in Line " << i_line+1 << "!" << endl;
// exit(EXIT_FAILURE);
}
// added by Wenyuan Li, 2019/01/16, to make sure all beta distribution parameters are positives
double a = (double)atof(subitems[0].c_str());
double b = (double)atof(subitems[1].c_str());
if ( (a<0) || (b<0) ) {
Rcpp::Rcerr << "Error: alpha=" << a << " or beta=" << b << " values in Line " << i_line+1 << " < 0, wrong!\nExit." << endl;
// exit(EXIT_FAILURE);
}
values1.push_back( a );
values2.push_back( b );
//cout << subitems[0] << "\t" << subitems[1] << endl;
}
bins2pairedvalues[marker_index] = make_pair(values1, values2);
i_line++;
}
fin.close();
}
//file of position-specific value of a tissue for each marker (generated by program "wig_binning")
//This file should have the same number lines (bins) as the number of markers (excluding complementary bins) in bins annotation file.
//Each line is a marker bin (or feature). All columns are delimited by TAB. There is one header line.
//The first 5 columns are the same as File 1.
//Column 1: marker_index, 1-based index. Those bins that are not markers, are indexed as 0.
//Column 2: chr
//Column 3: start coordinate of bin (1-base)
//Column 4: end coordinate of bin (1-base). The range of the bin is [start, end)
//Column 5: marker_type. "I" is Marker_type_I, "II" is Marker_type_II, "-" is the complementary bin, only facilitating the searching.
//Column 6: number of CpG sites in this marker region
//Column 7: a list of position/value pairs, separated by a space; each pair is separated by ":"
//For example:
//1 chr1 855266 855766 II 44 855298:0.8125 855299:0.8125 855404:0.933333333333333 855405:0.933333333333333 855409:0.8 855410:0.8 855425:0.75 855426:0.75 855428:0.818181818181818 855429:0.818181818181818 855437:1 855438:1 855445:1 855446:1 855472:0.833333333333333 855473:0.833333333333333 855478:0.857142857142857 855479:0.857142857142857 855486:0.909090909090909 855487:0.909090909090909 855544:0.266666666666667 855545:0.266666666666667 855549:0.133333333333333 855550:0.133333333333333 855567:0.571428571428571 855568:0.571428571428571 855609:0.1 855610:0.1 855643:0.5 855644:0.5 855649:0.777777777777778 855650:0.777777777777778 855654:0.454545454545455 855655:0.454545454545455 855657:0.333333333333333 855658:0.333333333333333 855664:0.545454545454545 855665:0.545454545454545 855692:0.0666666666666667 855693:0.0666666666666667 855708:0.357142857142857 855709:0.357142857142857 855741:0.363636363636364 855742:0.363636363636364
//2 chr1 969796 970296 II 64 969825:0.2 969826:0.2 969828:0.333333333333333 969829:0.333333333333333 969852:0.2 969853:0.2 969949:0.5 969950:0.5 969955:0.5 969956:0.5 969966:0.6 969967:0.6 969973:0.6 969974:0.6 969980:0.6 969981:0.6 969996:0.333333333333333 969997:0.333333333333333 970019:0.5 970020:0.5 970043:0.833333333333333 970044:0.833333333333333 970048:0.5 970049:0.5 970052:0.6 970053:0.6 970058:0.8 970059:0.8 970060:0.8 970061:0.8 970093:0.4 970094:0.4 970095:0.2 970096:0.2 970099:0.6 970100:0.6 970104:0.666666666666667 970105:0.666666666666667 970107:0.833333333333333 970108:0.833333333333333 970129:0.2 970130:0.2 970158:0.4 970159:0.4 970160:0.2 970161:0.2 970163:0.4 970164:0.4 970169:0.8 970170:0.8 970171:0.5 970172:0.5 970180:0.75 970181:0.75 970191:0.8 970192:0.8 970214:0 970215:0 970223:0.285714285714286 970224:0.285714285714286 970237:0.625 970238:0.625 970294:0.6 970295:0.6
//
// input file is plain text
// Bins2PositionValuePairs: a map for bin_index -> a pair of (position vector, value vector). bin_index is always 1-base. In detail, it is "map<unsigned int, pair<vector<GENOME_POSITION>,vector<double> > >".
// Unimplemented stub: the body contains no statements, so the file named by
// `position_value_pairs_file_of_bins` is never opened and nothing is stored.
// NOTE(review): `bins2positionvaluepairs` is received by value, so a future
// implementation could not hand results back to the caller through it —
// presumably a reference was intended; confirm against the declaration before changing.
void read_position_value_pairs_file_of_bins(string position_value_pairs_file_of_bins,
Bins2PositionValuePairs bins2positionvaluepairs)
{
// (disabled debug trace)
//cerr << "reading '" << position_value_pairs_file_of_bins << "'" << endl;
}
//
//input file of indexing reads by bins (or features). (generated by program "reads_binning")
//Each line is a read. All columns are delimited by TAB. There is one header line.
//Column 1: marker1_index, 1-based index. "0" indicates the bin "marker1_index" is not a marker.
//Column 2: marker2_index, 1-based index. "-" indicates the read completely falls into "marker1"; "0" indicates the bin "marker2_index" is not a marker.
//Column 3: chr
//Column 4: start coordinate of read (1-base)
//Column 5: end coordinate of read (1-base). The range of the read is [start, end)
//Column 6: strand (+ or -)
//Column 7: number of CpG sites
//Column 8: list of CpG coordinates observed in this read (delimited by a comma). For example, 10469,10471,10484,10489,10493,10497,10525,10542
//Column 9: a binary vector of methylation status for all CpG sites in this read (no delimiter). This vector should have the same length as the list in Column 8. For example, 00111100
//
//For example:
//1 - chr1 855441 855515 + 4 855445,855472,855478,855486 1111
//1 - chr1 855442 855516 + 4 855445,855472,855478,855486 1111
//1 - chr1 855518 855577 - 3 855544,855549,855567 111
//1 - chr1 855536 855608 - 3 855544,855549,855567 110
//0 2 chr1 969782 969856 + 3 969825,969828,969852 101
//2 - chr1 969824 969897 - 4 969825,969828,969852,969883 1110
//2 - chr1 969844 969918 - 4 969852,969883,969898,969914 1000
//...
//
// cpg_sites_column_index is 1-base
// methy_status_column_index is 1-base
// Parse one TAB-delimited line of a reads-binning file.
//   line:                      the line to parse.
//   cpg_sites_column_index:    1-based column holding the comma-separated CpG coordinates.
//   methy_status_column_index: 1-based column holding the methylation string.
//   marker_index:              output; column 1, or column 2 when column 1 parses to 0.
//   cpg_sites:                 output; the parsed coordinates are appended.
//   methy_status:              output; one int per character of the methylation
//                              string (character minus '0') is appended.
// Throws std::out_of_range (from vector::at) when a required column is missing.
// Both requested columns are located before anything is appended, so the output
// vectors are untouched when the line is too short.
void process_one_line_of_reads_binning_file(string & line, int cpg_sites_column_index, int methy_status_column_index, int & marker_index, vector<GENOME_POSITION> & cpg_sites, vector<int> & methy_status)
{
  vector<string> items;
  split(items, line, is_any_of("\t"));
  marker_index = atoi(items.at(0).c_str());
  if (marker_index==0) {
    // indicates this read covers two bins: a complementary bin (marker_index=0) and a marker bin (marker_index is non-zero).
    // so we should take the second item as the marker_index
    // for example:
    // 0 2 chr1 969782 969856 + 3 969825,969828,969852 101
    marker_index = atoi(items.at(1).c_str());
  }
  // at() turns a missing column (or a column index < 1) into an exception
  // instead of an out-of-bounds read.
  const string & cpg_field = items.at(static_cast<size_t>(cpg_sites_column_index-1));
  const string & methy_field = items.at(static_cast<size_t>(methy_status_column_index-1));
  vector<string> subitems;
  split(subitems, cpg_field, is_any_of(","));
  for (size_t i=0; i<subitems.size(); i++)
    cpg_sites.push_back((GENOME_POSITION)atoi(subitems[i].c_str()));
  for (size_t i=0; i<methy_field.size(); i++)
    methy_status.push_back(methy_field[i]-'0'); // convert each char '0' or '1' to 0 or 1, respectively.
}
//
//input file of indexing reads by bins (or features). (generated by program "reads_binning")
//Each line is a read. All columns are delimited by TAB. There is one header line.
//Column 1: marker1_index, 1-based index.
//Column 2: list of CpG coordinates observed in this read (delimited by a comma). For example, 10469,10471,10484,10489,10493,10497,10525,10542
//Column 3: a binary vector of methylation status for all CpG sites in this read (no delimiter). This vector should have the same length as the list in Column 2. For example, 00111100
//Column 4: number of methylated cytosines in CpG sites
//Column 5: number of unmethylated cytosines in CpG sites
//Column 6: strand (+ or -)
//
//For example:
//marker_index cpg_locs meth_string meth_count unmeth_count strand
//2 10497,10525,10542,10563,10571,10577,10579 1011111 6 1 +
//2 10497,10525,10542,10563,10571,10577,10579 1111111 7 0 +
//27 14976,15005,15029,15046,15086,15090 111111 6 0 -
//27 14976,15005,15029,15046,15086,15090 110011 4 2 -
//29 15720,15749,15769,15789,15834 11111 5 0 +
//65 88705,88762,88767,88811 0111 3 1 +
//72 133165,133180 10 1 1 +
//...
//
// Parse one TAB-delimited line of the six-column reads format
// (marker_index, cpg_locs, meth_string, meth_count, unmeth_count, strand).
//   line:         the line to parse.
//   marker_index: output; column 1 parsed as an integer.
//   methy_status: output; one int per character of column 3 (character minus '0')
//                 is appended.
// Throws std::out_of_range (from vector::at) when the line has fewer than 3 columns.
void process_one_line_of_reads_binning_file_with_mary_format(string & line, int & marker_index, vector<int> & methy_status)
{
  vector<string> items;
  split(items, line, is_any_of("\t"));
  marker_index = atoi(items.at(0).c_str());
  // at() turns a missing column into an exception instead of an out-of-bounds read.
  const string & str_methy_status = items.at(2);
  for (size_t i=0; i<str_methy_status.size(); i++)
    methy_status.push_back(str_methy_status[i]-'0'); // convert each char '0' or '1' to 0 or 1, respectively.
}
// Parse one TAB-delimited line of a file with the simple format:
//marker_index meth_count unmeth_count
//2 5 2
//2 3 4
//27 8 1
//27 3 5
//   line:         the line to parse.
//   marker_index: output; column 1.
//   mC:           output; column 2 (meth_count).
//   uC:           output; column 3 (unmeth_count).
// Throws std::out_of_range (from vector::at) when the line has fewer than 3 columns.
void process_one_line_of_reads_binning_file_with_simple_format(string & line, int & marker_index, int & mC, int & uC)
{
  vector<string> items;
  split(items, line, is_any_of("\t"));
  // at() turns a missing column into an exception instead of an out-of-bounds read.
  marker_index = atoi(items.at(0).c_str());
  mC = atoi(items.at(1).c_str());
  uC = atoi(items.at(2).c_str());
}
// Probability of one read given a single per-site value.
//   value:        factor contributed by a site whose status is 1; every other
//                 site contributes (1 - value).
//   methy_status: per-CpG status of the read (1: methylated, otherwise unmethylated).
// Returns the product of the per-site factors, or -1 when the read has no CpG sites.
double calc_one_read_prob_by_single_value(double value, vector<int> & methy_status)
{
  if (methy_status.empty()) {
    return -1;
  }
  double prob = 1.0;
  for (vector<int>::const_iterator site = methy_status.begin(); site != methy_status.end(); ++site) {
    // methylated site -> value, unmethylated site -> 1-value
    prob *= (*site == 1) ? value : (1-value);
  }
  return prob;
}
// Number of methylated cytosines in a read: counts the entries of
// methy_status that equal 1 (1: methylated, anything else is not counted).
// Returns -1 when methy_status is empty.
int get_mC_from_methy_states(vector<int> & methy_status)
{
    if (methy_status.empty()) return -1;
    return static_cast<int>(std::count(methy_status.begin(), methy_status.end(), 1));
}
//
// Likelihood of one read given only its site counts.
//
// value: factor applied once per methylated cytosine; (1 - value) is applied
//        once per unmethylated cytosine.
// mC:    number of methylated cytosines
// uC:    number of unmethylated cytosines
//
// The product is built by repeated multiplication starting from 1.0, so a
// count of zero (or a negative count) contributes no factor.
//
double calc_one_read_prob_by_mC_and_uC_and_by_single_value(double value, int mC, int uC)
{
    double likelihood = 1.0;

    // methylated sites
    int remaining = mC;
    while (remaining-- > 0) likelihood *= value;

    // unmethylated sites
    const double complement = 1 - value;
    for (remaining = uC; remaining > 0; --remaining) likelihood *= complement;

    return likelihood;
}
//
//input file of indexing reads by bins (or features). (generated by program "reads_binning")
//Each line is a read. All columns are delimited by TAB. There is one header line.
//
// marker2beta: a map for bin_index -> a vector of value. bin_index is always 1-base. In detail, it is "map<unsigned int, vector<double> >"
//
// Function: calculate the probability of each read that belongs to a tissue
//
// Output: file of probability value for each read
//
//Each line is a read. All columns are delimited by TAB. There is one header line.
//Column 1: marker index (1-base)
//Columns 2+: each column is a probability value of a specific tissue. For example, given 15 tissues, there will be 15 additional columns.
//
// output:
// reads_likelihoods is a Matrix_Double.
// marker2rowindexes is a Bins2UnsignedIntegers.
// Rm is a vector of N X 1, each element is number of valid methylated CpG sites in a read
// Rl is a vector of N X 1, each element is number of all valid CpG sites in a read
//
// Before calling this function, declare reads_likelihoods (Matrix_Double) by Matrix_Double(ncol).
// Before calling this function, declare marker2rowindexes with data type Bins2UnsignedIntegers.
// Before calling this function, declare Rm and Rl (vector<int>).
//
// Read a reads-binning text file in "simple" format (marker_index, meth_count,
// unmeth_count) and collect the per-read likelihood data.
//
// reads_binning_file:          path of the TAB-delimited input, or "stdin" to read from cin.
//                              The first line is treated as a header and skipped.
// marker2beta:                 marker index -> vector of values; one likelihood is computed per value.
// reads_likelihoods:           [out] receives the likelihood vector of every read that is kept.
// marker2rowindexes:           [out] marker index -> row indexes returned by the append call for its kept reads.
// marker2ambiguousreadcounts:  [out] marker index -> number of its reads for which the append call returned -1.
// Rm, Rl:                      [out] for every kept read: meth_count and meth_count + unmeth_count.
// likelihood_ratio_cutoff:     -1.0 selects append_row_vector() (no filtering); any other value is
//                              forwarded to append_row_vector_with_filter().
//
// Returns the number of data lines read, including reads whose marker is not in marker2beta
// (such reads are counted but otherwise ignored).
//
// Reading stops at the first empty line, at end of input, or as soon as the stream fails.
// If the file cannot be opened an error is written to Rcpp::Rcerr and 0 is returned.
//
// NOTE(review): append_row_vector*() are declared outside this file; this code only relies on
// them returning -1 when no row was appended -- confirm against matrix.h.
unsigned long calc_read_probability_by_marker2beta_from_reads_binning_text_file(string reads_binning_file, Bins2Values & marker2beta, Matrix_Double& reads_likelihoods, Bins2UnsignedIntegers& marker2rowindexes, Bins2Value& marker2ambiguousreadcounts, vector<int>&Rm, vector<int>& Rl, double likelihood_ratio_cutoff)
{
    unsigned long totalReadCount = 0;
    const bool read_from_stdin = (reads_binning_file.compare("stdin") == 0);

    // input is a plain text file (default is stdin)
    istream * in = &cin;
    ifstream fin;
    if (!read_from_stdin) {
        fin.open(reads_binning_file.c_str());
        if (fin.fail()) {
            Rcpp::Rcerr << "Error: Unable to open " << reads_binning_file << " in calc_read_probability_by_marker2beta()" << endl;
            // fin stays in a failed state, so the loop below reads nothing.
        }
        in = &fin;
    }

    string line;
    bool is_header_line = true;
    // Driving the loop with getline() (rather than testing eof() first) guarantees that a
    // line is only processed when it was actually read; a stream that turns bad without
    // reaching eof ends the loop instead of re-processing the previous line forever.
    while (getline(*in, line)) {
        if (is_header_line) {
            // skip the header line
            is_header_line = false;
            continue;
        }
        if (line.empty()) {
            // an empty line marks the end of the data
            break;
        }
        totalReadCount++;

        int marker_index;
        int mC, uC;
        process_one_line_of_reads_binning_file_with_simple_format(line, marker_index, mC, uC);
        int totalC = mC + uC;

        if (marker2beta.find(marker_index) == marker2beta.end()) {
            // read belongs to a marker we have no values for
            continue;
        }

        // likelihood of this read under each value of its marker
        vector<double> & values = marker2beta[marker_index];
        vector<double> read_likelihood_of_tissues;
        read_likelihood_of_tissues.reserve(values.size());
        for (vector<double>::size_type t = 0; t < values.size(); t++) {
            read_likelihood_of_tissues.push_back(calc_one_read_prob_by_mC_and_uC_and_by_single_value(values[t], mC, uC));
        }

        long row_index_in_likelihood_matrix;
        if (likelihood_ratio_cutoff == -1.0) {
            // no filtering of reads by likelihood_ratio_cutoff
            row_index_in_likelihood_matrix = reads_likelihoods.append_row_vector(read_likelihood_of_tissues, marker_index);
        } else {
            // filter reads with likelihood_ratio_cutoff
            row_index_in_likelihood_matrix = reads_likelihoods.append_row_vector_with_filter(read_likelihood_of_tissues, marker_index, likelihood_ratio_cutoff);
        }

        if (row_index_in_likelihood_matrix != -1) {
            // the likelihood vector was appended: remember its row under the marker
            if (marker2rowindexes.find(marker_index) == marker2rowindexes.end()) {
                // not found 'marker_index' in 'marker2rowindexes', create a new key
                marker2rowindexes.insert(make_pair(marker_index, vector<unsigned int>()));
            }
            marker2rowindexes[marker_index].push_back(row_index_in_likelihood_matrix);
            Rm.push_back(mC);
            Rl.push_back(totalC);
        } else {
            // ambiguous read: only counted, per marker
            if (marker2ambiguousreadcounts.find(marker_index) == marker2ambiguousreadcounts.end()) {
                // not found 'marker_index' in 'marker2ambiguousreadcounts'
                marker2ambiguousreadcounts.insert(make_pair(marker_index, 0));
            }
            marker2ambiguousreadcounts[marker_index]++;
        }
    }

    if (!read_from_stdin) {
        fin.close();
    }
    Rcpp::Rcerr << endl;
    return totalReadCount;
}
//
//input file of indexing reads by bins (or features). (generated by program "reads_binning")
//Each line is a read. All columns are delimited by TAB. There is one header line.
//
// marker2beta: a map for bin_index -> a vector of value. bin_index is always 1-base. In detail, it is "map<unsigned int, vector<double> >"
//
// Function: calculate the probability of each read that belongs to a tissue
//
// Output: file of probability value for each read
//
//Each line is a read. All columns are delimited by TAB. There is one header line.
//Column 1: marker index (1-base)
//Columns 2+: each column is a probability value of a specific tissue. For example, given 15 tissues, there will be 15 additional columns.
//
// output:
// reads_likelihoods is a Matrix_Double.
// marker2rowindexes is a Bins2UnsignedIntegers.
// Rm is a vector of N X 1, each element is number of valid methylated CpG sites in a read
// Rl is a vector of N X 1, each element is number of all valid CpG sites in a read
//
// Before calling this function, declare reads_likelihoods (Matrix_Double) by Matrix_Double(ncol).
// Before calling this function, declare marker2rowindexes with data type Bins2UnsignedIntegers.
// Before calling this function, declare Rm and Rl (vector<int>).
//
// unsigned long calc_read_probability_by_marker2beta_from_reads_binning_gzip_file(string reads_binning_file, Bins2Values & marker2beta, Matrix_Double& reads_likelihoods, Bins2UnsignedIntegers& marker2rowindexes, Bins2Value& marker2ambiguousreadcounts, vector<int>&Rm, vector<int>& Rl, double likelihood_ratio_cutoff)
// {
// unsigned long totalReadCount=0;
// int print_control=1000000;
// cout.precision(15);
// // process each read (i.e., line) of the input file, then print out the calculated probabilities for each read.
// unsigned long i_line=0; // line count of the input file
// // input is a gzipped text file
// // See webpage: https://techoverflow.net/2013/11/03/c-iterating-lines-in-a-gz-file-using-boostiostreams/
// std::ifstream file(reads_binning_file.c_str(), std::ios_base::in | std::ios_base::binary);
// iostreams::filtering_streambuf<iostreams::input> inbuf;
// inbuf.push(iostreams::gzip_decompressor());
// inbuf.push(file);
// //Convert streambuf to istream
// std::istream instream(&inbuf);
//
// std::string line;
// while(std::getline(instream, line)) {
// trim(line);
// //cerr << "Line: " << line << endl;
// if (i_line==0) {
// // skip the header line
// i_line++;
// continue;
// }
// if (line.empty()) {
// // this is the last line of the file
// break;
// }
// if ( totalReadCount%print_control==0) {
// Rcpp::Rcerr << totalReadCount << ",";
// }
// totalReadCount++;
// int marker_index;
//
// //vector<int> methy_status;
// //process_one_line_of_reads_binning_file_with_mary_format(line, marker_index, methy_status);
// //int mC = get_mC_from_methy_states(methy_status);
// //int totalC = methy_status.size();
//
// int mC, uC;
// process_one_line_of_reads_binning_file_with_simple_format(line, marker_index, mC, uC);
// int totalC = mC + uC;
//
// if (!(marker2beta.find(marker_index) == marker2beta.end())) {
// vector<double> & values = marker2beta[marker_index];
// vector<double> read_likelihood_of_tissues;
// for (int t=0; t<values.size(); t++) {
// double p = calc_one_read_prob_by_mC_and_uC_and_by_single_value(values[t], mC, totalC-mC);
// //double p = calc_one_read_prob_by_single_value(values[t], methy_status);
// read_likelihood_of_tissues.push_back(p);
// //cout << "\t" << p;
// }
// long row_index_in_likelihood_matrix;
// if (likelihood_ratio_cutoff==-1.0) {
// //cerr << "\nNo filteration of reads by likelihood_ratio_cutoff\n";
// row_index_in_likelihood_matrix = reads_likelihoods.append_row_vector(read_likelihood_of_tissues, marker_index);
// } else {
// //cerr << "\nfilter reads with likelihood_ratio_cutoff: " << likelihood_ratio_cutoff << endl;
// // if row_index_in_likelihood_matrix!=-1, this read likelihood vector was already appended into the matrix of read likelihoods.
// row_index_in_likelihood_matrix = reads_likelihoods.append_row_vector_with_filter(read_likelihood_of_tissues, marker_index, likelihood_ratio_cutoff);
// }
// if (row_index_in_likelihood_matrix!=-1) {
// // this read is not ambiguous read, but a tissue-type-specific read
// /////////// begin debug /////////////
// //cerr << "line: " << line << endl;
// //cerr << "row_index (with prob): " << row_index_in_likelihood_matrix << ", ";
// //for (int jj=0; jj<read_likelihood_of_tissues.size(); jj++) {
// //cerr << "\t" << read_likelihood_of_tissues[jj];
// //}
// //cerr << endl;
// /////////// end debug /////////////
// // successfully append a vector of this read's tissue-specific likelihoods to the matrix.
// // add 'row_index_in_likelihood_matrix' to 'marker2rowindexes'
// if (marker2rowindexes.find(marker_index)==marker2rowindexes.end()) {
// // not found 'marker_index' in 'marker2rowindexes', create a new key
// marker2rowindexes.insert(make_pair(marker_index,vector<unsigned int>()));
// }
// marker2rowindexes[marker_index].push_back(row_index_in_likelihood_matrix);
// Rm.push_back(mC);
// Rl.push_back(totalC);
// //cerr << "line " << i_line << ", markerId: " << marker_index << ", mC: " << mC << ", totalC: " << totalC << ", row_index_in_likelihood_matrix: " << row_index_in_likelihood_matrix << endl;
// } else {
// // this read is an ambiguous read, which is used for read counting in unknown class
// if (marker2ambiguousreadcounts.find(marker_index)==marker2ambiguousreadcounts.end()) {
// // not found 'marker_index' in 'marker2ambiguousreadcounts'
// marker2ambiguousreadcounts.insert(make_pair(marker_index, 0));
// }
// marker2ambiguousreadcounts[marker_index]++;
// }
// }
// i_line++;
// }
// file.close();
// Rcpp::Rcerr << endl;
// return totalReadCount;
// }
//
//input file of indexing reads by bins (or features). (generated by program "reads_binning")
//Each line is a read. All columns are delimited by TAB. There is one header line.
//
// bins2values: a map for bin_index -> a vector of value. bin_index is always 1-base. In detail, it is "map<unsigned int, vector<double> >"
//
// Function: calculate the probability of each read that belongs to a tissue
//
// Output: file of probability value for each read
//
//Each line is a read. All columns are delimited by TAB. There is one header line.
//Column 1: marker index (1-base)
//Columns 2+: each column is a probability value of a specific tissue. For example, given 15 tissues, there will be 15 additional columns.
// void calc_read_probability_by_methy_level_bins(string reads_binning_file, int num_bins_of_methylation_level)
// {
// vector<double> values;
// vector<string> value_names;
// double mvalue_step = 1.0/(double)num_bins_of_methylation_level;
// char str[64];
// for (int i=0; i<num_bins_of_methylation_level; i++) {
// double v = i*mvalue_step + mvalue_step/2;
// values.push_back( v );
// sprintf(str,"%.6g", i*mvalue_step);
// string start = string( str );
// sprintf(str,"%.6g", (i+1)*mvalue_step);
// string end = string( str );
// sprintf(str,"%d", i+1 );
// string idx = string( str );
// sprintf(str,"%.6g", v );
// string v_str = string( str );
// value_names.push_back( idx + ":(" + start + "," + end + "):" + v_str );
// }
//
// int column_index_of_cpg_sites = 8; // 1-base index
// int column_index_of_methy_status = 1 + column_index_of_cpg_sites; // 0-base index
// // cout.precision(15);
// //cerr << "reading '" << reads_binning_file << "'" << endl;
// // print out the header line
// Rcpp::Rcout << "marker_index";
// for (int i=0; i<value_names.size(); i++)
// Rcpp::Rcout << "\t" << value_names[i];
// Rcpp::Rcout << "\n";
//
// // process each read (i.e., line) of the input file, then print out the calculated probabilities for each read.
// unsigned long i_line=0; // line count of the input file
//
// // input is a plain text file
// ifstream fin;
// fin.open(reads_binning_file.c_str());
// if (fin.fail()){
// Rcpp::Rcerr << "Error: Unable to open " << reads_binning_file << " in calc_read_probability_by_methy_level_bins()" << endl;
// // exit(EXIT_FAILURE);
// }
// string line;
// while (!fin.eof()) {
// getline(fin, line);
// //cerr << "Line: " << line << endl;
// if (i_line==0) {
// // skip the header line of wig file
// i_line++;
// continue;
// }
// if (line.empty()) {
// // this is the last line of the file
// break;
// }
// int marker_index;
// vector<GENOME_POSITION> cpg_sites;
// vector<int> methy_status;
// process_one_line_of_reads_binning_file(line, column_index_of_cpg_sites, column_index_of_methy_status, marker_index, cpg_sites, methy_status);
// Rcpp::Rcout << marker_index;
// for (int t=0; t<values.size(); t++) {
// double p = calc_one_read_prob_by_single_value(values[t], methy_status);
// Rcpp::Rcout << "\t" << p;
// }
// Rcpp::Rcout << "\n";
// i_line++;
// }
// fin.close();
// }
// Likelihood of one read under a Beta-distributed methylation level.
//
// Every CpG site with methylation status x (0 or 1) contributes the factor
//     exp( lgamma(x+alpha) + lgamma(1-x+beta) - lgamma(alpha+beta+1) - background_B )
// and the factors of all sites are multiplied together.
//
// alpha, beta:   parameters of the Beta distribution.
// background_B:  subtracted inside the exponent, i.e. it is used on the log scale.
//                The caller in this file passes
//                lgamma(alpha) + lgamma(beta) - lgamma(alpha+beta), which is ln B(alpha,beta).
// methy_status:  per-CpG-site methylation status of the read.
//
// Returns the product over all sites, or -1 when methy_status is empty.
//
double calc_one_read_prob_by_a_paired_value(double alpha, double beta, double background_B, vector<int> & methy_status)
{
    if (methy_status.empty()) return -1;

    // this term does not depend on the site
    const double lgamma_of_sum = lgamma(alpha + beta + 1);

    double likelihood = 1.0;
    for (vector<int>::const_iterator site = methy_status.begin(); site != methy_status.end(); ++site) {
        const double log_factor = lgamma(*site + alpha) + lgamma(1 - *site + beta) - lgamma_of_sum - background_B;
        likelihood *= exp(log_factor);
    }
    return likelihood;
}
//
//input file of indexing reads by bins (or features). (generated by program "reads_binning")
//Each line is a read. All columns are delimited by TAB. There is one header line.
//
// bins2pairedvalues: a map for bin_index -> a pair of two vectors of value. bin_index is always 1-base. In detail, it is "map<unsigned int, pair<vector<double>,vector<double> > >"
//
// Function: calculate the probability of each read that belongs to a tissue
// the formula is B(x+alpha, 1-x+beta)/B(alpha,beta)
// where B() is beta function, x is methylation status of a CpG site (0 or 1), alpha and beta are a pair of values for describing a Beta-distribution provided by the input parameter "bins2pairedvalues".
//
// Output: file of probability value for each read
//
//Each line is a read. All columns are delimited by TAB. There is one header line.
//Column 1: marker index (1-base)
//Columns 2+: each column is a probability value of a specific tissue. For example, given 15 tissues, there will be 15 additional columns.
//
// Ongoing, unfinished yet.
//
void calc_read_probability_by_bins2pairedvalues(string reads_binning_file,
Bins2PairedValues & bins2pairedvalues, vector<string> & value_names)
{
int column_index_of_cpg_sites = 8; // 1-base index
int column_index_of_methy_status = 1 + column_index_of_cpg_sites; // 0-base index
// precompute the B(alpha,beta) for each bin
map<int,vector<double> > B;
Bins2PairedValues::iterator it;
for (it=bins2pairedvalues.begin(); it!=bins2pairedvalues.end(); ++it) {
int bin_index = it->first;
vector<double> & alphas = (it->second).first;
vector<double> & betas = (it->second).second;
vector<double> values;
int n = alphas.size();
for (int t=0; t<n; t++) {
//values.push_back( gsl_sf_beta(alphas[t], betas[t]) );
// values.push_back( gsl_sf_lngamma(alphas[t]) + gsl_sf_lngamma(betas[t]) - gsl_sf_lngamma(alphas[t]+betas[t]) );
values.push_back( lgamma(alphas[t]) + lgamma(betas[t]) - lgamma(alphas[t]+betas[t]) );
//cerr << "a: " << alphas[t] << ", b: " << betas[t] << "gammaln(z)+gammaln(w)-gammaln(z+w): " << gsl_sf_lngamma(alphas[t]) + gsl_sf_lngamma(betas[t]) - gsl_sf_lngamma(alphas[t]+betas[t]) << endl;
}
B[bin_index] = values;
}
// cout.precision(15);
//cerr << "reading '" << reads_binning_file << "'" << endl;
// print out the header line
Rcpp::Rcout << "marker_index";
for (int i=0; i<value_names.size(); i++)
Rcpp::Rcout << "\t" << value_names[i];
Rcpp::Rcout << "\n";
// process each read (i.e., line) of the input file, then print out the calculated probabilities for each read.
unsigned long i_line=0; // line count of the input file
// input is a plain text file
istream * in=&cin; // default is stdin
ifstream fin;
if ( !(reads_binning_file.compare("stdin")==0)) {
fin.open(reads_binning_file.c_str());
if (fin.fail()){
Rcpp::Rcerr << "Error: Unable to open " << reads_binning_file << " in calc_read_probability_by_bins2pairedvalues()" << endl;
// exit(EXIT_FAILURE);
}
in = &fin;
}
string line;
while (!(*in).eof()) {
getline((*in), line);
//cerr << "Line: " << line << endl;
if (i_line==0) {
// skip the header line of wig file
i_line++;
continue;
}
if (line.empty()) {
// this is the last line of the file
break;
}
int marker_index;
vector<GENOME_POSITION> cpg_sites;
vector<int> methy_status;
process_one_line_of_reads_binning_file(line, column_index_of_cpg_sites, column_index_of_methy_status, marker_index, cpg_sites, methy_status);
if (!(bins2pairedvalues.find(marker_index) == bins2pairedvalues.end())) {
//cerr << "marker " << marker_index << " does not exist in markers list" << endl;
//} else {
Rcpp::Rcout << marker_index;
vector<double> & alphas = bins2pairedvalues[marker_index].first;
vector<double> & betas = bins2pairedvalues[marker_index].second;
for (int t=0; t<alphas.size(); t++) {
double p = calc_one_read_prob_by_a_paired_value(alphas[t], betas[t], B[marker_index][t], methy_status);
Rcpp::Rcout << "\t" << p;
}
Rcpp::Rcout << "\n";
}
i_line++;
}
if ( !(reads_binning_file.compare("stdin")==0) ) {
fin.close();
}
}
// Function: collect per-read CpG counts from a reads-binning file.
//
// reads_binning_file:   path of the TAB-delimited input. The first line is treated as a
//                       header and skipped.
// num_cpg_sites:        [out] appended per read: number of entries in the read's
//                       methylation-status vector.
// num_methy_cpg_sites:  [out] appended per read: number of those entries that equal 1.
//
// Side effect: the header line "marker_index<TAB>num_CpG_sites<TAB>num_methy_CpG_sites" is
// written to Rcpp::Rcout (the per-read rows themselves are not printed).
//
// Reading stops at the first empty line, at end of input, or as soon as the stream fails.
// If the file cannot be opened an error is written to Rcpp::Rcerr and nothing is appended.
//
void get_reads_methy_data_from_reads_binning_file(string reads_binning_file,
vector<int> & num_cpg_sites, vector<int> & num_methy_cpg_sites)
{
    int column_index_of_cpg_sites = 8; // 1-base index
    int column_index_of_methy_status = 1 + column_index_of_cpg_sites;

    // print out the header line
    Rcpp::Rcout << "marker_index" << "\t" << "num_CpG_sites" << "\t" << "num_methy_CpG_sites" << endl;

    // input is a plain text file
    ifstream fin;
    fin.open(reads_binning_file.c_str());
    if (fin.fail()) {
        Rcpp::Rcerr << "Error: Unable to open " << reads_binning_file << " in get_reads_methy_data_from_reads_binning_file()" << endl;
        // fin stays in a failed state, so the loop below reads nothing.
    }

    string line;
    bool is_header_line = true;
    // Driving the loop with getline() (rather than testing eof() first) guarantees that a
    // line is only processed when it was actually read; a stream that turns bad without
    // reaching eof ends the loop instead of re-processing the previous line forever.
    while (getline(fin, line)) {
        if (is_header_line) {
            // skip the header line
            is_header_line = false;
            continue;
        }
        if (line.empty()) {
            // an empty line marks the end of the data
            break;
        }

        int marker_index;
        vector<GENOME_POSITION> cpg_sites;
        vector<int> methy_status;
        process_one_line_of_reads_binning_file(line, column_index_of_cpg_sites, column_index_of_methy_status, marker_index, cpg_sites, methy_status);

        num_methy_cpg_sites.push_back(static_cast<int>(std::count(methy_status.begin(), methy_status.end(), 1)));
        num_cpg_sites.push_back(static_cast<int>(methy_status.size()));
    }
    fin.close();
}
// Write the elements of v to 'of' separated by commas,
// e.g., 10469,10471,10484,10489,10493,10497,10525,10542
// Nothing is written for an empty vector; no trailing comma or newline is added.
void print_vec_of_uint(ostream& of, vector<unsigned int> & v) {
    if (v.empty()) return;

    vector<unsigned int>::const_iterator item = v.begin();
    of << *item;
    for (++item; item != v.end(); ++item) {
        of << "," << *item;
    }
}
// Write the elements of v to 'of' separated by commas,
// e.g., 10469,10471,10484,10489,10493,10497,10525,10542
// Nothing is written for an empty vector; no trailing comma or newline is added.
void print_vec_of_ulong(ostream& of, vector<unsigned long> & v) {
    if (v.empty()) return;

    vector<unsigned long>::const_iterator item = v.begin();
    of << *item;
    for (++item; item != v.end(); ++item) {
        of << "," << *item;
    }
}
// Write one line "key<TAB>value" to 'of' for every value stored under every key
// of the map, in the map's iteration order. Keys with an empty value list
// produce no output. Each line is terminated with endl.
void print_map_of_strings(ostream& of, map<string, vector<string> > & map_str1tostrs2) {
    for (map<string, vector<string> >::const_iterator entry = map_str1tostrs2.begin();
         entry != map_str1tostrs2.end(); ++entry) {
        const string & key = entry->first;
        const vector<string> & key_values = entry->second;
        for (vector<string>::const_iterator val = key_values.begin(); val != key_values.end(); ++val) {
            of << key << "\t" << *val << endl;
        }
    }
}
// Write every string of s to the file stream o, one per line (each terminated with endl).
void print_str_vectors(ofstream& o, vector<string> & s)
{
    for (vector<string>::size_type i = 0; i < s.size(); ++i) {
        o << s[i] << endl;
    }
}
// Split str at every character contained in delimit and append each piece,
// converted with atof() and narrowed to float, to vec.
// Values already in vec are kept.
// NOTE(review): boost::split is called without token compression, so adjacent
// delimiters (and an empty str) presumably yield empty pieces, which atof()
// converts to 0 -- confirm against the Boost documentation.
void strings2floats(string str, vector<float> & vec, string delimit) {
    vector<string> tokens;
    split(tokens, str, is_any_of(delimit));
    for (vector<string>::const_iterator tok = tokens.begin(); tok != tokens.end(); ++tok) {
        vec.push_back(static_cast<float>(atof(tok->c_str())));
    }
}
// Smaller of two floats. Returns b whenever (a < b) is false, i.e. also when
// the two values compare equal.
float min(float a, float b) {
    if (a < b) return a;
    return b;
}
// Larger of two floats. Returns b whenever (a > b) is false, i.e. also when
// the two values compare equal.
float max(float a, float b) {
    if (a > b) return a;
    return b;
}
// float meta_p(vector<float> & probs) {
// if (!probs.empty()) {
// double x = 0;
// int n_effective = 0;
// for (int i=0; i<probs.size(); i++)
// if (probs[i]!=-1) {
// x += -2*log(probs[i]);
// n_effective++;
// }
// int n_degree = 2*n_effective;
// if (n_effective==0) return -1;
// else return gsl_cdf_chisq_Q(x, n_degree);
// } else
// return -1;
// }