// [[Rcpp::depends(BH)]]
#include <Rcpp.h>
using namespace Rcpp;
#include <cstdlib>
#include <cmath>
#include <sstream>
#include <iostream>
#include <fstream>
#include <map>
#include <vector>
#include <string>
#include <cassert>
#include <algorithm>
#include <boost/algorithm/string.hpp>
#include <boost/assign.hpp>
#include <boost/iostreams/filtering_stream.hpp>
#include <random>
#include "matrix.h"
#include "utils.h"
using namespace std;
using namespace boost;
#include "template_utils.cpp"
Matrix_Double::Matrix_Double(const unsigned int nrow_, const unsigned int ncol_, const double init_value)
{
nrow = nrow_;
ncol = ncol_;
empty = false;
mat.reserve(nrow_);
for (int i=0; i<nrow; i++) {
vector<double> temp;
temp.reserve(ncol);
for (int j=0; j<ncol; j++)
temp.push_back(init_value);
mat.push_back( temp );
}
empty = mat.empty();
}
// a line represents a row of the matrix. Each element in this row is delimited by a TAB.
void Matrix_Double::process_one_line_of_mat_file(string & line, int num_header_column)
{
vector<string> items;
split(items, line, is_any_of("\t"));
vector<double> v;
for (int i=0; i<items.size(); i++) {
if (i==0) row_labels.push_back( atoi(items[0].c_str()) ); // the first column is the label of this row.
if (i<num_header_column) continue; // skip the first num_header_column header columns
v.push_back( atof(items[i].c_str()) );
}
mat.push_back( v );
}
Matrix_Double::Matrix_Double(const string & mat_file,
bool header_line=false, int num_header_column=1)
{
unsigned long i_line=0;
// input is a plain text file
istream * in=&cin; // default is stdin
ifstream fin;
if ( !(mat_file.compare("stdin")==0)) {
fin.open(mat_file.c_str());
if (fin.fail()){
Rcpp::Rcerr << "Error: Unable to open " << mat_file << " in Matrix_Double()" << endl;
// exit(EXIT_FAILURE);
}
in = &fin;
}
string line;
while (!(*in).eof()) {
getline((*in), line);
//cerr << "Line: " << line << endl;
if (header_line) {
if (i_line==0) {
// skip the header line
i_line++;
continue;
}
}
if (line.empty()) {
// this is the last line of the file
break;
}
process_one_line_of_mat_file(line, num_header_column);
i_line++;
}
if ( !(mat_file.compare("stdin")==0)) {
fin.close();
}
empty = mat.empty();
nrow = mat.size();
if (nrow>=1) ncol = mat[0].size();
else ncol = 0;
}
// return the row index of this vector
long Matrix_Double::append_row_vector(vector<double>& v_, int row_label_) {
if (v_.size()!=ncol) {
Rcpp::Rcerr << "Error of Matrix_Double::append_row_vector: the appended vector size (" << v_.size() << ") does not match the number of column in the matrix (" << ncol << ")!\nExit." << endl;
// exit(EXIT_FAILURE);
}
vector<double> v;
for (int i=0; i<v_.size(); i++) {
v.push_back( v_[i] );
}
mat.push_back( v );
nrow = mat.size();
row_labels.push_back(row_label_);
empty=false;
return(nrow-1);
}
//
// if max(v_)/min(v_) >= min_threshold_maxv_minv, then we append this vector, otherwise ignore it.
//
// return row index of this appended vector, if successfully appending; return -1 otherwise.
//
long Matrix_Double::append_row_vector_with_filter(vector<double>& v_, int row_label_, double min_threshold_maxv_minv) {
if (v_.size()!=ncol) {
Rcpp::Rcerr << "Error of Matrix_Double::append_row_vector_with_filter: the appended vector size (" << v_.size() << ") does not match the number of column in the matrix (" << ncol << ")!\nExit." << endl;
// exit(EXIT_FAILURE);
}
double min_ = *min_element(v_.begin(), v_.end());
double max_ = *max_element(v_.begin(), v_.end());
//cout << "max: " << max_ << ", min: " << min_ << ", ratio: " << max_/min_ << endl;
if (min_==0) {
if (max_==0) return(-1);
} else {
double ratio=max_/min_;
if (ratio<min_threshold_maxv_minv) return(-1);
}
long row_index = append_row_vector(v_, row_label_);
return(row_index);
}
bool Matrix_Double::get_element(const int i, const int j, double & v)
{
if (!empty) {
if (i<nrow && j<ncol) {
v = mat[i][j];
return true;
} else {
Rcpp::Rcerr << "Warning: i=" << i << " and j=" << j << " exceed matrix size!" << endl;
return false;
}
} else {
v = 0;
return false;
}
}
bool Matrix_Double::set_element(const int i, const int j, double v)
{
if (!empty) {
if (i<nrow && j<ncol) {
mat[i][j] = v;
return true;
} else {
Rcpp::Rcerr << "Warning: i=" << i << " and j=" << j << " exceed matrix size!" << endl;
return false;
}
} else {
return false;
}
}
bool Matrix_Double::get_column_sum(const int j, double & v)
{
if (!empty) {
if (j<ncol) {
v = 0;
for (int k=0; k<nrow; k++) {
v += mat[k][j];
}
return true;
} else {
Rcpp::Rcerr << "Warning: j=" << j << " exceeds matrix column size (" << ncol << ")!" << endl;
return false;
}
} else {
v = 0;
return false;
}
}
bool Matrix_Double::get_row_sum(const int i, double & v)
{
if (!empty) {
if (i<nrow) {
v = 0;
for (int k=0; k<ncol; k++) {
v += mat[i][k];
}
return true;
} else {
Rcpp::Rcerr << "Warning: i=" << i << " exceeds matrix row size (" << nrow << ")!" << endl;
return false;
}
} else {
v = 0;
return false;
}
}
void Matrix_Double::get_unique_row_labels(vector<int> & uniq_labels)
{
map<int, int> counts;
for (int i=0; i<row_labels.size(); i++)
if (counts.find(row_labels[i]) != counts.end())
counts[row_labels[i]] += 1; // found this label, then increment count
else {
counts.insert(make_pair(row_labels[i],1)); // not found this label, then count it once
//counts[row_labels[i]] = 1;
}
map<int,int>::iterator it;
for (it=counts.begin(); it!=counts.end(); ++it)
uniq_labels.push_back(it->first);
}
void Matrix_Double::print_with_additional_column_of_Bins2Value(ostream & os, Bins2Value& additional_column_data)
{
// obtain unique row labels
vector<int> unique_row_labels;
int row_label_;
for (int i=0; i<nrow; i++) {
row_label_ = row_labels[i];
unique_row_labels.push_back( row_label_ );
}
Bins2Value::iterator iter;
for (iter=additional_column_data.begin(); iter!=additional_column_data.end(); iter++) {
row_label_ = iter->first;
if (!exist_row_label(row_label_)) {
unique_row_labels.push_back( row_label_ );
}
}
sort(unique_row_labels.begin(), unique_row_labels.end());
// print both 'mat' and 'additional_column_data' as the last column
for (int i=0; i<unique_row_labels.size(); i++) {
row_label_ = unique_row_labels[i];
os << row_label_;
int row_index = get_row_index(row_label_);
if (row_index!=-1) {
for (int j=0; j<ncol; j++) {
os << "\t" << mat[row_index][j];
}
} else {
for (int j=0; j<ncol; j++) {
os << "\t0";
}
}
if (additional_column_data.find(row_label_)!=additional_column_data.end()) {
os << "\t" << additional_column_data[row_label_];
} else {
os << "\t0";
}
os << endl;
}
}
ostream & operator<<(ostream & os, Matrix_Double & mat)
{
double v;
int row_label;
if (!mat.isempty()) {
for (int i=0; i<mat.get_row_num(); i++) {
int j;
mat.get_row_label(i, row_label);
os << row_label << "\t";
for (j=0; j<mat.get_column_num()-1; j++) {
mat.get_element(i,j,v);
os << v << "\t";
}
mat.get_element(i,j,v);
os << v << endl;
}
}
return os;
}
double objective_em_supervise(Matrix_Double & p, vector<double> & theta)
{
unsigned int ncol = p.get_column_num();
unsigned int nrow = p.get_row_num();
double v;
double obj = 0;
for (int i=0; i<nrow; i++) {
double sum = 0;
for (int j=0; j<ncol; j++) {
p.get_element(i,j,v);
sum += theta[j]*v;
}
obj += log(sum);
}
return obj;
}
//
// There are T known tissues and 1 unknown tissue (described by a double vector "m")
//
// input:
// p is a matrix of N X T, where N is number of reads and T is number of known tissue
//
// output:
// theta (model parameters), a vector with "T" elements.
// q is a matrix of N X T, the tissue-specific posterior probabilty of each read
// obj is the objective function value
//
double em_supervise(Matrix_Double & p, int max_iter, vector<double> & theta, Matrix_Double& q, int random_seed)
{
// cout.precision(15);
// cerr.precision(15);
unsigned int ncol = p.get_column_num();
unsigned int nrow = p.get_row_num(); // nrow is number of tissues.
theta.resize(ncol, 0); // assign (num_tissues) space and initialize to 0s.
// initialize model parameters as uniform distribution
// alternatively, model parameters can be random numbers by satisfying the crition
// (1) \sum_{i=1}^{ncol}{theta_i}=1
if (random_seed == 0) {
for (int j=0; j<ncol; j++) {
theta[j] = 1/(double)ncol;
}
} else{
double sum = 0.0;
std::default_random_engine generator(random_seed);
std::uniform_real_distribution<double> distribution(0.0, 1.0);
for (int j = 0; j < ncol; j++) {
theta[j] = distribution(generator);
sum += theta[j];
}
// Normalize the values so that they sum up to 1
for (int j = 0; j < ncol; j++) {
theta[j] /= sum;
}
}
// create and initialize q with the same size of p and with all elements initialized as 0
//Matrix_Double q(nrow, ncol, 0);
//cerr << "iter 0\t" << objective_em_supervise(p, theta) << endl;
double v1, v2;
for (int iter=0; iter<max_iter; iter++) {
// Rcpp::Rcerr << iter+1 << "," ;
// E-step: estimate q
for (int i=0; i<nrow; i++) {
double sum = 0;
for (int j=0; j<ncol; j++) {
p.get_element(i,j,v1);
v2 = theta[j]*v1;
q.set_element( i, j, v2 );
sum += v2;
}
for (int j=0; j<ncol; j++) {
q.get_element(i,j,v2);
v2 /= sum;
q.set_element( i, j, v2 );
}
}
// M-step: estimate theta
for (int j=0; j<ncol; j++) {
double sum=0;
for (int i=0; i<nrow; i++) {
q.get_element(i,j,v2);
sum += v2;
}
theta[j] = sum / nrow;
}
// for debug
//cerr << "iter " << (iter+1) << "\t" << objective_em_supervise(p, theta) << endl;
}
// last E-step: estimate q
for (int i=0; i<nrow; i++) {
double sum = 0;
for (int j=0; j<ncol; j++) {
p.get_element(i,j,v1);
v2 = theta[j]*v1;
q.set_element( i, j, v2 );
sum += v2;
}
for (int j=0; j<ncol; j++) {
q.get_element(i,j,v2);
v2 /= sum;
q.set_element( i, j, v2 );
}
}
Rcpp::Rcerr << endl;
return objective_em_supervise(p, theta);
}
//
// There are T known tissues and 1 unknown tissue (described by a double vector "m")
//
// input:
// p is a matrix of N X T, where N is number of reads and T is number of known tissue
// p.row_label is an int vector of N X 1, each element is the marker Id (1-base) of a read.
// Rm is a vector of N X 1, each element is number of valid methylated CpG sites in a read
// Rl is a vector of N X 1, each element is number of all valid CpG sites in a read
//
// output:
// theta is a vector of (T+1) X 1, where T is number of known tissue. This vector is already allocated with space of (ncol+1) units.
// m is a vector of M X 1, where M is number of markers.
// q is a matrix of N X T, the tissue-specific posterior probabilty of each read
// q_unknown is a vector of N X 1, the posterior probabilty of each read for the unknown class. It does not need to be allocated space before this function calling.
//
void em_semisupervise(Matrix_Double & p, vector<int> & Rm, vector<int> & Rl,
int max_iter, vector<double> & theta, Matrix_Double& q, vector<double>& q_unknown,
vector<double> & m, int random_seed)
{
// cout.precision(15);
// cerr.precision(15);
unsigned int ncol = p.get_column_num(); // T, number of known tissues
unsigned int nrow = p.get_row_num(); // N, number of reads
theta.resize(ncol+1, 0); // assign (num_tissues+1) space and initialize to 0s.
vector<int> uniq_marker_Ids;
p.get_unique_row_labels(uniq_marker_Ids);
int nMarker = uniq_marker_Ids.size(); // M, number of markers that all N reads cover. Some markers may not be covered by these N reads.
m.resize(nMarker, 0); // assign nMarker space and initialize to 0s.
map<int,int> markerId2Index; // marker index is 0-based.
for (int index=0; index<nMarker; index++) {
markerId2Index.insert(make_pair(uniq_marker_Ids[index],index));
}
// initialize model parameters theta as uniform distribution, and m as 0.5
// alternatively, theta and m can be random numbers, by satisfying:
// (1) \sum_{i=1}^{ncol+1}{theta_i}=1
// (2) 0 <= m_k <=1 for all k=1,2,...,#marker_covered_by_all_reads
if (random_seed == 0) {
for (int j=0; j<ncol+1; j++) {
theta[j] = 1/(double)(ncol+1);
}
} else{
double sum = 0.0;
std::default_random_engine generator(random_seed);
std::uniform_real_distribution<double> distribution(0.0, 1.0);
for (int j = 0; j < ncol+1; j++) {
theta[j] = distribution(generator);
sum += theta[j];
}
// Normalize the values so that they sum up to 1
for (int j = 0; j < ncol+1; j++) {
theta[j] /= sum;
}
}
for (int k=0; k<nMarker; k++) {
m[k] = 0.5;
}
for (int i=0; i<nrow; i++)
q_unknown.push_back(0);
// EM algorithm
for (int iter=0; iter<max_iter; iter++) {
Rcpp::Rcerr << iter+1 << "," ;
// E-step: estimate q (for T known tissues) and q_unknown (for one unknown tissue)
double v1, v2, likelihood_unknown_class;
int markerId, markerIndex;
for (int i=0; i<nrow; i++) {
// process the first T known tissues
double sum = 0;
for (int j=0; j<ncol; j++) {
p.get_element(i,j,v1);
v2 = theta[j]*v1;
q.set_element( i, j, v2 );
sum += v2;
}
// process the last unknown tissue
p.get_row_label(i, markerId); // get markerId (1-base) of read i
markerIndex = markerId2Index[markerId];
likelihood_unknown_class = pow(m[markerIndex],Rm[i]) * pow(1-m[markerIndex],Rl[i]-Rm[i]);
q_unknown[i] = theta[ncol]*likelihood_unknown_class;
sum += q_unknown[i];
// update q and q_unknown
for (int j=0; j<ncol; j++) {
q.get_element(i,j,v2);
v2 /= sum;
q.set_element( i, j, v2 );
}
q_unknown[i] /= sum;
}
// M-step 1: estimate unknown class's methylation level (m) of each marker
double sum1=0, sum2=0;
int curr_marker_index=-1, prev_marker_index=-1; // all marker index is 0-base
for (int i=0; i<nrow; i++) {
// for each row (or read)
p.get_row_label(i, markerId); // get the marker_index (0-base) of read i
curr_marker_index = markerId2Index[markerId];
if (curr_marker_index != prev_marker_index) {
// this is the 1st read of a new marker
// we need to summarize m value of previous marker
if (prev_marker_index!=-1) { // current marker is not the 1st marker
m[prev_marker_index] = (sum2!=0 ? sum1/sum2 : 0);
sum1 = 0;
sum2 = 0;
}
prev_marker_index = curr_marker_index;
}
sum1 += q_unknown[i]*Rm[i];
sum2 += q_unknown[i]*Rl[i];
}
m[curr_marker_index] = (sum2!=0 ? sum1/sum2 : 0); // estimate m of the last marker
//cout << "iter=" << iter << ", " << "m=" << endl; // for debug
//print_vec(cout, m, "\n"); // for debug
//cout << endl; // for debug
// M-step 2: estimate theta, which has ncol+1 values
double sum;
for (int j=0; j<ncol; j++) {
sum=0;
for (int i=0; i<nrow; i++) {
q.get_element(i,j,v2);
sum += v2;
}
theta[j] = sum / nrow;
}
sum=0;
for (int i=0; i<nrow; i++)
sum += q_unknown[i];
theta[ncol] = sum / nrow;
//cout << "round " << iter+1 << ", theta="; // for debug
//print_vec(cout, theta, ", "); // for debug
//cout << endl << endl; // for debug
}
Rcpp::Rcerr << endl;
//cout << "final:" << endl << std::flush;
//print_vec(cout, theta, ", "); // for debug
//cout << endl << endl << std::flush; // for debug
}
void readCounts_by_reads_posterior_probability_version_regular(Matrix_Double& q, double unit, Matrix_Double& readCounts)
{
unsigned int ncol = q.get_column_num(); // T, number of known tissues
unsigned int nrow = q.get_row_num(); // N, number of reads
vector<int> uniq_marker_Ids;
q.get_unique_row_labels(uniq_marker_Ids);
// int nMarker = uniq_marker_Ids.size(); // M, number of markers that all N reads cover. Some markers may not be covered by these N reads.
double v;
int curr_markerId=-1, prev_markerId=-1; // all marker index is 0-base
vector<double> readCountsPerMarker(ncol, 0); // a vector of read counts for one marker (T elements) with default values 0
for (int i=0; i<nrow; i++) {
// for each row (or read)
q.get_row_label(i, curr_markerId); // get the marker ID of read i
if (curr_markerId != prev_markerId) {
// this is the 1st read of a new marker
// we need to summarize read counts of each tissue for the previous marker and deposit them to the matrix "readCounts"
// then clear readCountsPerMarker for a new read counting of the next new marker
if (prev_markerId!=-1) { // current marker is not the 1st marker
multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit.
readCounts.append_row_vector(readCountsPerMarker, prev_markerId);
assign_vector_zeros(readCountsPerMarker);
}
prev_markerId = curr_markerId;
}
for (int j=0; j<ncol; j++) {
q.get_element(i,j,v);
readCountsPerMarker[j] += v;
}
}
multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit.
readCounts.append_row_vector(readCountsPerMarker, curr_markerId);
}
void readCounts_by_reads_posterior_probability_version_unknownclass(Matrix_Double& q, vector<double>& q_unknown, double unit, Matrix_Double& readCounts)
{
unsigned int ncol = q.get_column_num(); // T, number of known tissues
unsigned int nrow = q.get_row_num(); // N, number of reads
if (nrow!=(unsigned int)q_unknown.size()) {
Rcpp::Rcerr << "Error (readCounts_by_reads_posterior_probability_version_unknownclass): row number of q_unknown does not match with row number of q!" << endl;
// exit(EXIT_FAILURE);
}
vector<int> uniq_marker_Ids;
q.get_unique_row_labels(uniq_marker_Ids);
// int nMarker = uniq_marker_Ids.size(); // M, number of markers that all N reads cover. Some markers may not be covered by these N reads.
double v;
int curr_markerId=-1, prev_markerId=-1; // all marker index is 0-base
vector<double> readCountsPerMarker(ncol+1, 0); // a vector of read counts for one marker (T+1 elements) with default values 0. The last element is for the unknown class.
for (int i=0; i<nrow; i++) {
// for each row (or read)
q.get_row_label(i, curr_markerId); // get the marker ID of read i
if (curr_markerId != prev_markerId) {
// this is the 1st read of a new marker
// we need to summarize read counts of each tissue for the previous marker and deposit them to the matrix "readCounts"
// then clear readCountsPerMarker for a new read counting of the next new marker
if (prev_markerId!=-1) { // current marker is not the 1st marker
multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit.
readCounts.append_row_vector(readCountsPerMarker, prev_markerId);
assign_vector_zeros(readCountsPerMarker);
}
prev_markerId = curr_markerId;
}
for (int j=0; j<ncol; j++) {
q.get_element(i,j,v);
readCountsPerMarker[j] += v;
}
readCountsPerMarker[ncol] += q_unknown[i];
}
multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit.
readCounts.append_row_vector(readCountsPerMarker, curr_markerId);
}