Codestin Search App

src/matrix.cpp

53b02cb7	// [[Rcpp::depends(BH)]] #include <Rcpp.h> using namespace Rcpp; #include <cstdlib> #include <cmath> #include <sstream> #include <iostream> #include <fstream> #include <map> #include <vector> #include <string> #include <cassert> #include <algorithm> #include <boost/algorithm/string.hpp> #include <boost/assign.hpp> #include <boost/iostreams/filtering_stream.hpp>
433af17b	#include <random>
53b02cb7	#include "matrix.h" #include "utils.h" using namespace std; using namespace boost; #include "template_utils.cpp" Matrix_Double::Matrix_Double(const unsigned int nrow_, const unsigned int ncol_, const double init_value) { nrow = nrow_; ncol = ncol_; empty = false; mat.reserve(nrow_); for (int i=0; i<nrow; i++) { vector<double> temp; temp.reserve(ncol); for (int j=0; j<ncol; j++) temp.push_back(init_value); mat.push_back( temp ); } empty = mat.empty(); } // a line represents a row of the matrix. Each element in this row is delimited by a TAB. void Matrix_Double::process_one_line_of_mat_file(string & line, int num_header_column) { vector<string> items; split(items, line, is_any_of("\t")); vector<double> v; for (int i=0; i<items.size(); i++) { if (i==0) row_labels.push_back( atoi(items[0].c_str()) ); // the first column is the label of this row. if (i<num_header_column) continue; // skip the first num_header_column header columns v.push_back( atof(items[i].c_str()) ); } mat.push_back( v ); } Matrix_Double::Matrix_Double(const string & mat_file, bool header_line=false, int num_header_column=1) { unsigned long i_line=0; // input is a plain text file istream * in=&cin; // default is stdin ifstream fin; if ( !(mat_file.compare("stdin")==0)) { fin.open(mat_file.c_str()); if (fin.fail()){ Rcpp::Rcerr << "Error: Unable to open " << mat_file << " in Matrix_Double()" << endl; // exit(EXIT_FAILURE); } in = &fin; } string line; while (!(in).eof()) { getline((in), line); //cerr << "Line: " << line << endl; if (header_line) { if (i_line==0) { // skip the header line i_line++; continue; } } if (line.empty()) { // this is the last line of the file break; } process_one_line_of_mat_file(line, num_header_column); i_line++; } if ( !(mat_file.compare("stdin")==0)) { fin.close(); } empty = mat.empty(); nrow = mat.size(); if (nrow>=1) ncol = mat[0].size(); else ncol = 0; } // return the row index of this vector long Matrix_Double::append_row_vector(vector<double>& v_, int row_label_) { if (v_.size()!=ncol) { Rcpp::Rcerr << "Error of Matrix_Double::append_row_vector: the appended vector size (" << v_.size() << ") does not match the number of column in the matrix (" << ncol << ")!\nExit." << endl; // exit(EXIT_FAILURE); } vector<double> v; for (int i=0; i<v_.size(); i++) { v.push_back( v_[i] ); } mat.push_back( v ); nrow = mat.size(); row_labels.push_back(row_label_); empty=false; return(nrow-1); } // // if max(v_)/min(v_) >= min_threshold_maxv_minv, then we append this vector, otherwise ignore it. // // return row index of this appended vector, if successfully appending; return -1 otherwise. // long Matrix_Double::append_row_vector_with_filter(vector<double>& v_, int row_label_, double min_threshold_maxv_minv) { if (v_.size()!=ncol) { Rcpp::Rcerr << "Error of Matrix_Double::append_row_vector_with_filter: the appended vector size (" << v_.size() << ") does not match the number of column in the matrix (" << ncol << ")!\nExit." << endl; // exit(EXIT_FAILURE); } double min_ = min_element(v_.begin(), v_.end()); double max_ = max_element(v_.begin(), v_.end()); //cout << "max: " << max_ << ", min: " << min_ << ", ratio: " << max_/min_ << endl; if (min_==0) { if (max_==0) return(-1); } else { double ratio=max_/min_; if (ratio<min_threshold_maxv_minv) return(-1); } long row_index = append_row_vector(v_, row_label_); return(row_index); } bool Matrix_Double::get_element(const int i, const int j, double & v) { if (!empty) { if (i<nrow && j<ncol) { v = mat[i][j]; return true; } else { Rcpp::Rcerr << "Warning: i=" << i << " and j=" << j << " exceed matrix size!" << endl; return false; } } else { v = 0; return false; } } bool Matrix_Double::set_element(const int i, const int j, double v) { if (!empty) { if (i<nrow && j<ncol) { mat[i][j] = v; return true; } else { Rcpp::Rcerr << "Warning: i=" << i << " and j=" << j << " exceed matrix size!" << endl; return false; } } else { return false; } } bool Matrix_Double::get_column_sum(const int j, double & v) { if (!empty) { if (j<ncol) { v = 0; for (int k=0; k<nrow; k++) { v += mat[k][j]; } return true; } else { Rcpp::Rcerr << "Warning: j=" << j << " exceeds matrix column size (" << ncol << ")!" << endl; return false; } } else { v = 0; return false; } } bool Matrix_Double::get_row_sum(const int i, double & v) { if (!empty) { if (i<nrow) { v = 0; for (int k=0; k<ncol; k++) { v += mat[i][k]; } return true; } else { Rcpp::Rcerr << "Warning: i=" << i << " exceeds matrix row size (" << nrow << ")!" << endl; return false; } } else { v = 0; return false; } } void Matrix_Double::get_unique_row_labels(vector<int> & uniq_labels) { map<int, int> counts; for (int i=0; i<row_labels.size(); i++) if (counts.find(row_labels[i]) != counts.end()) counts[row_labels[i]] += 1; // found this label, then increment count else { counts.insert(make_pair(row_labels[i],1)); // not found this label, then count it once //counts[row_labels[i]] = 1; } map<int,int>::iterator it; for (it=counts.begin(); it!=counts.end(); ++it) uniq_labels.push_back(it->first); } void Matrix_Double::print_with_additional_column_of_Bins2Value(ostream & os, Bins2Value& additional_column_data) { // obtain unique row labels vector<int> unique_row_labels; int row_label_; for (int i=0; i<nrow; i++) { row_label_ = row_labels[i]; unique_row_labels.push_back( row_label_ ); } Bins2Value::iterator iter; for (iter=additional_column_data.begin(); iter!=additional_column_data.end(); iter++) { row_label_ = iter->first; if (!exist_row_label(row_label_)) { unique_row_labels.push_back( row_label_ ); } } sort(unique_row_labels.begin(), unique_row_labels.end()); // print both 'mat' and 'additional_column_data' as the last column for (int i=0; i<unique_row_labels.size(); i++) { row_label_ = unique_row_labels[i]; os << row_label_; int row_index = get_row_index(row_label_); if (row_index!=-1) { for (int j=0; j<ncol; j++) { os << "\t" << mat[row_index][j]; } } else { for (int j=0; j<ncol; j++) { os << "\t0"; } } if (additional_column_data.find(row_label_)!=additional_column_data.end()) { os << "\t" << additional_column_data[row_label_]; } else { os << "\t0"; } os << endl; } } ostream & operator<<(ostream & os, Matrix_Double & mat) { double v; int row_label; if (!mat.isempty()) { for (int i=0; i<mat.get_row_num(); i++) { int j; mat.get_row_label(i, row_label); os << row_label << "\t"; for (j=0; j<mat.get_column_num()-1; j++) { mat.get_element(i,j,v); os << v << "\t"; } mat.get_element(i,j,v); os << v << endl; } } return os; } double objective_em_supervise(Matrix_Double & p, vector<double> & theta) { unsigned int ncol = p.get_column_num(); unsigned int nrow = p.get_row_num(); double v; double obj = 0; for (int i=0; i<nrow; i++) { double sum = 0; for (int j=0; j<ncol; j++) { p.get_element(i,j,v); sum += theta[j]*v; } obj += log(sum); } return obj; } // // There are T known tissues and 1 unknown tissue (described by a double vector "m") // // input: // p is a matrix of N X T, where N is number of reads and T is number of known tissue // // output: // theta (model parameters), a vector with "T" elements. // q is a matrix of N X T, the tissue-specific posterior probabilty of each read
3a37cb6e	// obj is the objective function value
53b02cb7	//
3a37cb6e	double em_supervise(Matrix_Double & p, int max_iter, vector<double> & theta, Matrix_Double& q, int random_seed)
53b02cb7	{ // cout.precision(15); // cerr.precision(15); unsigned int ncol = p.get_column_num(); unsigned int nrow = p.get_row_num(); // nrow is number of tissues. theta.resize(ncol, 0); // assign (num_tissues) space and initialize to 0s. // initialize model parameters as uniform distribution // alternatively, model parameters can be random numbers by satisfying the crition // (1) \sum_{i=1}^{ncol}{theta_i}=1
433af17b	if (random_seed == 0) { for (int j=0; j<ncol; j++) { theta[j] = 1/(double)ncol; } } else{ double sum = 0.0; std::default_random_engine generator(random_seed); std::uniform_real_distribution<double> distribution(0.0, 1.0); for (int j = 0; j < ncol; j++) { theta[j] = distribution(generator); sum += theta[j]; } // Normalize the values so that they sum up to 1 for (int j = 0; j < ncol; j++) { theta[j] /= sum; }
53b02cb7	}
433af17b
53b02cb7	// create and initialize q with the same size of p and with all elements initialized as 0 //Matrix_Double q(nrow, ncol, 0); //cerr << "iter 0\t" << objective_em_supervise(p, theta) << endl; double v1, v2; for (int iter=0; iter<max_iter; iter++) { // Rcpp::Rcerr << iter+1 << "," ; // E-step: estimate q for (int i=0; i<nrow; i++) { double sum = 0; for (int j=0; j<ncol; j++) { p.get_element(i,j,v1); v2 = theta[j]v1; q.set_element( i, j, v2 ); sum += v2; } for (int j=0; j<ncol; j++) { q.get_element(i,j,v2); v2 /= sum; q.set_element( i, j, v2 ); } } // M-step: estimate theta for (int j=0; j<ncol; j++) { double sum=0; for (int i=0; i<nrow; i++) { q.get_element(i,j,v2); sum += v2; } theta[j] = sum / nrow; } // for debug //cerr << "iter " << (iter+1) << "\t" << objective_em_supervise(p, theta) << endl; } // last E-step: estimate q for (int i=0; i<nrow; i++) { double sum = 0; for (int j=0; j<ncol; j++) { p.get_element(i,j,v1); v2 = theta[j]v1; q.set_element( i, j, v2 ); sum += v2; } for (int j=0; j<ncol; j++) { q.get_element(i,j,v2); v2 /= sum; q.set_element( i, j, v2 ); } } Rcpp::Rcerr << endl;
3a37cb6e	return objective_em_supervise(p, theta);
53b02cb7	} // // There are T known tissues and 1 unknown tissue (described by a double vector "m") // // input: // p is a matrix of N X T, where N is number of reads and T is number of known tissue // p.row_label is an int vector of N X 1, each element is the marker Id (1-base) of a read. // Rm is a vector of N X 1, each element is number of valid methylated CpG sites in a read // Rl is a vector of N X 1, each element is number of all valid CpG sites in a read // // output: // theta is a vector of (T+1) X 1, where T is number of known tissue. This vector is already allocated with space of (ncol+1) units. // m is a vector of M X 1, where M is number of markers. // q is a matrix of N X T, the tissue-specific posterior probabilty of each read // q_unknown is a vector of N X 1, the posterior probabilty of each read for the unknown class. It does not need to be allocated space before this function calling. // void em_semisupervise(Matrix_Double & p, vector<int> & Rm, vector<int> & Rl, int max_iter, vector<double> & theta, Matrix_Double& q, vector<double>& q_unknown,
433af17b	vector<double> & m, int random_seed)
53b02cb7	{ // cout.precision(15); // cerr.precision(15); unsigned int ncol = p.get_column_num(); // T, number of known tissues unsigned int nrow = p.get_row_num(); // N, number of reads theta.resize(ncol+1, 0); // assign (num_tissues+1) space and initialize to 0s. vector<int> uniq_marker_Ids; p.get_unique_row_labels(uniq_marker_Ids); int nMarker = uniq_marker_Ids.size(); // M, number of markers that all N reads cover. Some markers may not be covered by these N reads. m.resize(nMarker, 0); // assign nMarker space and initialize to 0s. map<int,int> markerId2Index; // marker index is 0-based. for (int index=0; index<nMarker; index++) { markerId2Index.insert(make_pair(uniq_marker_Ids[index],index)); } // initialize model parameters theta as uniform distribution, and m as 0.5 // alternatively, theta and m can be random numbers, by satisfying: // (1) \sum_{i=1}^{ncol+1}{theta_i}=1 // (2) 0 <= m_k <=1 for all k=1,2,...,#marker_covered_by_all_reads
433af17b	if (random_seed == 0) { for (int j=0; j<ncol+1; j++) { theta[j] = 1/(double)(ncol+1); } } else{ double sum = 0.0; std::default_random_engine generator(random_seed); std::uniform_real_distribution<double> distribution(0.0, 1.0); for (int j = 0; j < ncol+1; j++) { theta[j] = distribution(generator); sum += theta[j]; } // Normalize the values so that they sum up to 1 for (int j = 0; j < ncol+1; j++) { theta[j] /= sum; }
53b02cb7	}
433af17b
53b02cb7	for (int k=0; k<nMarker; k++) { m[k] = 0.5; } for (int i=0; i<nrow; i++) q_unknown.push_back(0); // EM algorithm for (int iter=0; iter<max_iter; iter++) { Rcpp::Rcerr << iter+1 << "," ; // E-step: estimate q (for T known tissues) and q_unknown (for one unknown tissue) double v1, v2, likelihood_unknown_class; int markerId, markerIndex; for (int i=0; i<nrow; i++) { // process the first T known tissues double sum = 0; for (int j=0; j<ncol; j++) { p.get_element(i,j,v1); v2 = theta[j]v1; q.set_element( i, j, v2 ); sum += v2; } // process the last unknown tissue p.get_row_label(i, markerId); // get markerId (1-base) of read i markerIndex = markerId2Index[markerId]; likelihood_unknown_class = pow(m[markerIndex],Rm[i]) pow(1-m[markerIndex],Rl[i]-Rm[i]); q_unknown[i] = theta[ncol]likelihood_unknown_class; sum += q_unknown[i]; // update q and q_unknown for (int j=0; j<ncol; j++) { q.get_element(i,j,v2); v2 /= sum; q.set_element( i, j, v2 ); } q_unknown[i] /= sum; } // M-step 1: estimate unknown class's methylation level (m) of each marker double sum1=0, sum2=0; int curr_marker_index=-1, prev_marker_index=-1; // all marker index is 0-base for (int i=0; i<nrow; i++) { // for each row (or read) p.get_row_label(i, markerId); // get the marker_index (0-base) of read i curr_marker_index = markerId2Index[markerId]; if (curr_marker_index != prev_marker_index) { // this is the 1st read of a new marker // we need to summarize m value of previous marker if (prev_marker_index!=-1) { // current marker is not the 1st marker m[prev_marker_index] = (sum2!=0 ? sum1/sum2 : 0); sum1 = 0; sum2 = 0; } prev_marker_index = curr_marker_index; } sum1 += q_unknown[i]Rm[i]; sum2 += q_unknown[i]*Rl[i]; } m[curr_marker_index] = (sum2!=0 ? sum1/sum2 : 0); // estimate m of the last marker //cout << "iter=" << iter << ", " << "m=" << endl; // for debug //print_vec(cout, m, "\n"); // for debug //cout << endl; // for debug // M-step 2: estimate theta, which has ncol+1 values double sum; for (int j=0; j<ncol; j++) { sum=0; for (int i=0; i<nrow; i++) { q.get_element(i,j,v2); sum += v2; } theta[j] = sum / nrow; } sum=0; for (int i=0; i<nrow; i++) sum += q_unknown[i]; theta[ncol] = sum / nrow; //cout << "round " << iter+1 << ", theta="; // for debug //print_vec(cout, theta, ", "); // for debug //cout << endl << endl; // for debug } Rcpp::Rcerr << endl; //cout << "final:" << endl << std::flush; //print_vec(cout, theta, ", "); // for debug //cout << endl << endl << std::flush; // for debug } void readCounts_by_reads_posterior_probability_version_regular(Matrix_Double& q, double unit, Matrix_Double& readCounts) { unsigned int ncol = q.get_column_num(); // T, number of known tissues unsigned int nrow = q.get_row_num(); // N, number of reads vector<int> uniq_marker_Ids; q.get_unique_row_labels(uniq_marker_Ids); // int nMarker = uniq_marker_Ids.size(); // M, number of markers that all N reads cover. Some markers may not be covered by these N reads. double v; int curr_markerId=-1, prev_markerId=-1; // all marker index is 0-base vector<double> readCountsPerMarker(ncol, 0); // a vector of read counts for one marker (T elements) with default values 0 for (int i=0; i<nrow; i++) { // for each row (or read) q.get_row_label(i, curr_markerId); // get the marker ID of read i if (curr_markerId != prev_markerId) { // this is the 1st read of a new marker // we need to summarize read counts of each tissue for the previous marker and deposit them to the matrix "readCounts" // then clear readCountsPerMarker for a new read counting of the next new marker if (prev_markerId!=-1) { // current marker is not the 1st marker multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit. readCounts.append_row_vector(readCountsPerMarker, prev_markerId); assign_vector_zeros(readCountsPerMarker); } prev_markerId = curr_markerId; } for (int j=0; j<ncol; j++) { q.get_element(i,j,v); readCountsPerMarker[j] += v; } } multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit. readCounts.append_row_vector(readCountsPerMarker, curr_markerId); } void readCounts_by_reads_posterior_probability_version_unknownclass(Matrix_Double& q, vector<double>& q_unknown, double unit, Matrix_Double& readCounts) { unsigned int ncol = q.get_column_num(); // T, number of known tissues unsigned int nrow = q.get_row_num(); // N, number of reads if (nrow!=(unsigned int)q_unknown.size()) { Rcpp::Rcerr << "Error (readCounts_by_reads_posterior_probability_version_unknownclass): row number of q_unknown does not match with row number of q!" << endl; // exit(EXIT_FAILURE); } vector<int> uniq_marker_Ids; q.get_unique_row_labels(uniq_marker_Ids); // int nMarker = uniq_marker_Ids.size(); // M, number of markers that all N reads cover. Some markers may not be covered by these N reads. double v; int curr_markerId=-1, prev_markerId=-1; // all marker index is 0-base vector<double> readCountsPerMarker(ncol+1, 0); // a vector of read counts for one marker (T+1 elements) with default values 0. The last element is for the unknown class. for (int i=0; i<nrow; i++) { // for each row (or read) q.get_row_label(i, curr_markerId); // get the marker ID of read i if (curr_markerId != prev_markerId) { // this is the 1st read of a new marker // we need to summarize read counts of each tissue for the previous marker and deposit them to the matrix "readCounts" // then clear readCountsPerMarker for a new read counting of the next new marker if (prev_markerId!=-1) { // current marker is not the 1st marker multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit. readCounts.append_row_vector(readCountsPerMarker, prev_markerId); assign_vector_zeros(readCountsPerMarker); } prev_markerId = curr_markerId; } for (int j=0; j<ncol; j++) { q.get_element(i,j,v); readCountsPerMarker[j] += v; } readCountsPerMarker[ncol] += q_unknown[i]; } multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit. readCounts.append_row_vector(readCountsPerMarker, curr_markerId); }