| ... | ... |
@@ -15,6 +15,7 @@ using namespace Rcpp; |
| 15 | 15 |
#include <boost/algorithm/string.hpp> |
| 16 | 16 |
#include <boost/assign.hpp> |
| 17 | 17 |
#include <boost/iostreams/filtering_stream.hpp> |
| 18 |
+#include <random> |
|
| 18 | 19 |
#include "matrix.h" |
| 19 | 20 |
#include "utils.h" |
| 20 | 21 |
|
| ... | ... |
@@ -305,7 +306,7 @@ double objective_em_supervise(Matrix_Double & p, vector<double> & theta) |
| 305 | 306 |
// theta (model parameters), a vector with "T" elements. |
| 306 | 307 |
// q is a matrix of N X T, the tissue-specific posterior probabilty of each read |
| 307 | 308 |
// |
| 308 |
-void em_supervise(Matrix_Double & p, int max_iter, vector<double> & theta, Matrix_Double& q) |
|
| 309 |
+void em_supervise(Matrix_Double & p, int max_iter, vector<double> & theta, Matrix_Double& q, int random_seed) |
|
| 309 | 310 |
{
|
| 310 | 311 |
// cout.precision(15); |
| 311 | 312 |
// cerr.precision(15); |
| ... | ... |
@@ -315,9 +316,24 @@ void em_supervise(Matrix_Double & p, int max_iter, vector<double> & theta, Matri |
| 315 | 316 |
// initialize model parameters as uniform distribution |
| 316 | 317 |
// alternatively, model parameters can be random numbers by satisfying the crition |
| 317 | 318 |
// (1) \sum_{i=1}^{ncol}{theta_i}=1
|
| 318 |
- for (int j=0; j<ncol; j++) {
|
|
| 319 |
- theta[j] = 1/(double)ncol; |
|
| 319 |
+ if (random_seed == 0) {
|
|
| 320 |
+ for (int j=0; j<ncol; j++) {
|
|
| 321 |
+ theta[j] = 1/(double)ncol; |
|
| 322 |
+ } |
|
| 323 |
+ } else{
|
|
| 324 |
+ double sum = 0.0; |
|
| 325 |
+ std::default_random_engine generator(random_seed); |
|
| 326 |
+ std::uniform_real_distribution<double> distribution(0.0, 1.0); |
|
| 327 |
+ for (int j = 0; j < ncol; j++) {
|
|
| 328 |
+ theta[j] = distribution(generator); |
|
| 329 |
+ sum += theta[j]; |
|
| 330 |
+ } |
|
| 331 |
+ // Normalize the values so that they sum up to 1 |
|
| 332 |
+ for (int j = 0; j < ncol; j++) {
|
|
| 333 |
+ theta[j] /= sum; |
|
| 334 |
+ } |
|
| 320 | 335 |
} |
| 336 |
+ |
|
| 321 | 337 |
// create and initialize q with the same size of p and with all elements initialized as 0 |
| 322 | 338 |
//Matrix_Double q(nrow, ncol, 0); |
| 323 | 339 |
//cerr << "iter 0\t" << objective_em_supervise(p, theta) << endl; |
| ... | ... |
@@ -386,7 +402,7 @@ void em_supervise(Matrix_Double & p, int max_iter, vector<double> & theta, Matri |
| 386 | 402 |
// |
| 387 | 403 |
void em_semisupervise(Matrix_Double & p, vector<int> & Rm, vector<int> & Rl, |
| 388 | 404 |
int max_iter, vector<double> & theta, Matrix_Double& q, vector<double>& q_unknown, |
| 389 |
- vector<double> & m) |
|
| 405 |
+ vector<double> & m, int random_seed) |
|
| 390 | 406 |
{
|
| 391 | 407 |
// cout.precision(15); |
| 392 | 408 |
// cerr.precision(15); |
| ... | ... |
@@ -408,9 +424,27 @@ void em_semisupervise(Matrix_Double & p, vector<int> & Rm, vector<int> & Rl, |
| 408 | 424 |
// alternatively, theta and m can be random numbers, by satisfying: |
| 409 | 425 |
// (1) \sum_{i=1}^{ncol+1}{theta_i}=1
|
| 410 | 426 |
// (2) 0 <= m_k <=1 for all k=1,2,...,#marker_covered_by_all_reads |
| 411 |
- for (int j=0; j<ncol+1; j++) {
|
|
| 412 |
- theta[j] = 1/(double)(ncol+1); |
|
| 427 |
+ if (random_seed == 0) {
|
|
| 428 |
+ for (int j=0; j<ncol+1; j++) {
|
|
| 429 |
+ theta[j] = 1/(double)(ncol+1); |
|
| 430 |
+ } |
|
| 431 |
+ } else{
|
|
| 432 |
+ double sum = 0.0; |
|
| 433 |
+ std::default_random_engine generator(random_seed); |
|
| 434 |
+ std::uniform_real_distribution<double> distribution(0.0, 1.0); |
|
| 435 |
+ for (int j = 0; j < ncol+1; j++) {
|
|
| 436 |
+ theta[j] = distribution(generator); |
|
| 437 |
+ sum += theta[j]; |
|
| 438 |
+ } |
|
| 439 |
+ // Normalize the values so that they sum up to 1 |
|
| 440 |
+ for (int j = 0; j < ncol+1; j++) {
|
|
| 441 |
+ theta[j] /= sum; |
|
| 442 |
+ } |
|
| 413 | 443 |
} |
| 444 |
+ |
|
| 445 |
+ |
|
| 446 |
+ |
|
| 447 |
+ |
|
| 414 | 448 |
for (int k=0; k<nMarker; k++) {
|
| 415 | 449 |
m[k] = 0.5; |
| 416 | 450 |
} |
| ... | ... |
@@ -1,576 +1,576 @@ |
| 1 |
-// [[Rcpp::depends(BH)]] |
|
| 2 |
-#include <Rcpp.h> |
|
| 3 |
-using namespace Rcpp; |
|
| 4 |
- |
|
| 5 |
-#include <cstdlib> |
|
| 6 |
-#include <cmath> |
|
| 7 |
-#include <sstream> |
|
| 8 |
-#include <iostream> |
|
| 9 |
-#include <fstream> |
|
| 10 |
-#include <map> |
|
| 11 |
-#include <vector> |
|
| 12 |
-#include <string> |
|
| 13 |
-#include <cassert> |
|
| 14 |
-#include <algorithm> |
|
| 15 |
-#include <boost/algorithm/string.hpp> |
|
| 16 |
-#include <boost/assign.hpp> |
|
| 17 |
-#include <boost/iostreams/filtering_stream.hpp> |
|
| 18 |
-#include "matrix.h" |
|
| 19 |
-#include "utils.h" |
|
| 20 |
- |
|
| 21 |
-using namespace std; |
|
| 22 |
-using namespace boost; |
|
| 23 |
- |
|
| 24 |
-#include "template_utils.cpp" |
|
| 25 |
- |
|
| 26 |
-Matrix_Double::Matrix_Double(const unsigned int nrow_, const unsigned int ncol_, const double init_value) |
|
| 27 |
-{
|
|
| 28 |
- nrow = nrow_; |
|
| 29 |
- ncol = ncol_; |
|
| 30 |
- empty = false; |
|
| 31 |
- mat.reserve(nrow_); |
|
| 32 |
- for (int i=0; i<nrow; i++) {
|
|
| 33 |
- vector<double> temp; |
|
| 34 |
- temp.reserve(ncol); |
|
| 35 |
- for (int j=0; j<ncol; j++) |
|
| 36 |
- temp.push_back(init_value); |
|
| 37 |
- mat.push_back( temp ); |
|
| 38 |
- } |
|
| 39 |
- empty = mat.empty(); |
|
| 40 |
-} |
|
| 41 |
- |
|
| 42 |
-// a line represents a row of the matrix. Each element in this row is delimited by a TAB. |
|
| 43 |
-void Matrix_Double::process_one_line_of_mat_file(string & line, int num_header_column) |
|
| 44 |
-{
|
|
| 45 |
- vector<string> items; |
|
| 46 |
- split(items, line, is_any_of("\t"));
|
|
| 47 |
- vector<double> v; |
|
| 48 |
- for (int i=0; i<items.size(); i++) {
|
|
| 49 |
- if (i==0) row_labels.push_back( atoi(items[0].c_str()) ); // the first column is the label of this row. |
|
| 50 |
- if (i<num_header_column) continue; // skip the first num_header_column header columns |
|
| 51 |
- v.push_back( atof(items[i].c_str()) ); |
|
| 52 |
- } |
|
| 53 |
- mat.push_back( v ); |
|
| 54 |
-} |
|
| 55 |
- |
|
| 56 |
-Matrix_Double::Matrix_Double(const string & mat_file, |
|
| 57 |
- bool header_line=false, int num_header_column=1) |
|
| 58 |
-{
|
|
| 59 |
- unsigned long i_line=0; |
|
| 60 |
- // input is a plain text file |
|
| 61 |
- istream * in=&cin; // default is stdin |
|
| 62 |
- ifstream fin; |
|
| 63 |
- if ( !(mat_file.compare("stdin")==0)) {
|
|
| 64 |
- fin.open(mat_file.c_str()); |
|
| 65 |
- if (fin.fail()){
|
|
| 66 |
- Rcpp::Rcerr << "Error: Unable to open " << mat_file << " in Matrix_Double()" << endl; |
|
| 67 |
- // exit(EXIT_FAILURE); |
|
| 68 |
- } |
|
| 69 |
- in = &fin; |
|
| 70 |
- } |
|
| 71 |
- string line; |
|
| 72 |
- while (!(*in).eof()) {
|
|
| 73 |
- getline((*in), line); |
|
| 74 |
- //cerr << "Line: " << line << endl; |
|
| 75 |
- if (header_line) {
|
|
| 76 |
- if (i_line==0) {
|
|
| 77 |
- // skip the header line |
|
| 78 |
- i_line++; |
|
| 79 |
- continue; |
|
| 80 |
- } |
|
| 81 |
- } |
|
| 82 |
- if (line.empty()) {
|
|
| 83 |
- // this is the last line of the file |
|
| 84 |
- break; |
|
| 85 |
- } |
|
| 86 |
- process_one_line_of_mat_file(line, num_header_column); |
|
| 87 |
- i_line++; |
|
| 88 |
- } |
|
| 89 |
- if ( !(mat_file.compare("stdin")==0)) {
|
|
| 90 |
- fin.close(); |
|
| 91 |
- } |
|
| 92 |
- empty = mat.empty(); |
|
| 93 |
- nrow = mat.size(); |
|
| 94 |
- if (nrow>=1) ncol = mat[0].size(); |
|
| 95 |
- else ncol = 0; |
|
| 96 |
-} |
|
| 97 |
- |
|
| 98 |
-// return the row index of this vector |
|
| 99 |
-long Matrix_Double::append_row_vector(vector<double>& v_, int row_label_) {
|
|
| 100 |
- if (v_.size()!=ncol) {
|
|
| 101 |
- Rcpp::Rcerr << "Error of Matrix_Double::append_row_vector: the appended vector size (" << v_.size() << ") does not match the number of column in the matrix (" << ncol << ")!\nExit." << endl;
|
|
| 102 |
- // exit(EXIT_FAILURE); |
|
| 103 |
- } |
|
| 104 |
- vector<double> v; |
|
| 105 |
- for (int i=0; i<v_.size(); i++) {
|
|
| 106 |
- v.push_back( v_[i] ); |
|
| 107 |
- } |
|
| 108 |
- mat.push_back( v ); |
|
| 109 |
- nrow = mat.size(); |
|
| 110 |
- row_labels.push_back(row_label_); |
|
| 111 |
- empty=false; |
|
| 112 |
- return(nrow-1); |
|
| 113 |
-} |
|
| 114 |
- |
|
| 115 |
-// |
|
| 116 |
-// if max(v_)/min(v_) >= min_threshold_maxv_minv, then we append this vector, otherwise ignore it. |
|
| 117 |
-// |
|
| 118 |
-// return row index of this appended vector, if successfully appending; return -1 otherwise. |
|
| 119 |
-// |
|
| 120 |
-long Matrix_Double::append_row_vector_with_filter(vector<double>& v_, int row_label_, double min_threshold_maxv_minv) {
|
|
| 121 |
- if (v_.size()!=ncol) {
|
|
| 122 |
- Rcpp::Rcerr << "Error of Matrix_Double::append_row_vector_with_filter: the appended vector size (" << v_.size() << ") does not match the number of column in the matrix (" << ncol << ")!\nExit." << endl;
|
|
| 123 |
- // exit(EXIT_FAILURE); |
|
| 124 |
- } |
|
| 125 |
- double min_ = *min_element(v_.begin(), v_.end()); |
|
| 126 |
- double max_ = *max_element(v_.begin(), v_.end()); |
|
| 127 |
- //cout << "max: " << max_ << ", min: " << min_ << ", ratio: " << max_/min_ << endl; |
|
| 128 |
- if (min_==0) {
|
|
| 129 |
- if (max_==0) return(-1); |
|
| 130 |
- } else {
|
|
| 131 |
- double ratio=max_/min_; |
|
| 132 |
- if (ratio<min_threshold_maxv_minv) return(-1); |
|
| 133 |
- } |
|
| 134 |
- long row_index = append_row_vector(v_, row_label_); |
|
| 135 |
- return(row_index); |
|
| 136 |
-} |
|
| 137 |
- |
|
| 138 |
-bool Matrix_Double::get_element(const int i, const int j, double & v) |
|
| 139 |
-{
|
|
| 140 |
- if (!empty) {
|
|
| 141 |
- if (i<nrow && j<ncol) {
|
|
| 142 |
- v = mat[i][j]; |
|
| 143 |
- return true; |
|
| 144 |
- } else {
|
|
| 145 |
- Rcpp::Rcerr << "Warning: i=" << i << " and j=" << j << " exceed matrix size!" << endl; |
|
| 146 |
- return false; |
|
| 147 |
- } |
|
| 148 |
- } else {
|
|
| 149 |
- v = 0; |
|
| 150 |
- return false; |
|
| 151 |
- } |
|
| 152 |
-} |
|
| 153 |
- |
|
| 154 |
-bool Matrix_Double::set_element(const int i, const int j, double v) |
|
| 155 |
-{
|
|
| 156 |
- if (!empty) {
|
|
| 157 |
- if (i<nrow && j<ncol) {
|
|
| 158 |
- mat[i][j] = v; |
|
| 159 |
- return true; |
|
| 160 |
- } else {
|
|
| 161 |
- Rcpp::Rcerr << "Warning: i=" << i << " and j=" << j << " exceed matrix size!" << endl; |
|
| 162 |
- return false; |
|
| 163 |
- } |
|
| 164 |
- } else {
|
|
| 165 |
- return false; |
|
| 166 |
- } |
|
| 167 |
-} |
|
| 168 |
- |
|
| 169 |
-bool Matrix_Double::get_column_sum(const int j, double & v) |
|
| 170 |
-{
|
|
| 171 |
- if (!empty) {
|
|
| 172 |
- if (j<ncol) {
|
|
| 173 |
- v = 0; |
|
| 174 |
- for (int k=0; k<nrow; k++) {
|
|
| 175 |
- v += mat[k][j]; |
|
| 176 |
- } |
|
| 177 |
- return true; |
|
| 178 |
- } else {
|
|
| 179 |
- Rcpp::Rcerr << "Warning: j=" << j << " exceeds matrix column size (" << ncol << ")!" << endl;
|
|
| 180 |
- return false; |
|
| 181 |
- } |
|
| 182 |
- } else {
|
|
| 183 |
- v = 0; |
|
| 184 |
- return false; |
|
| 185 |
- } |
|
| 186 |
-} |
|
| 187 |
- |
|
| 188 |
-bool Matrix_Double::get_row_sum(const int i, double & v) |
|
| 189 |
-{
|
|
| 190 |
- if (!empty) {
|
|
| 191 |
- if (i<nrow) {
|
|
| 192 |
- v = 0; |
|
| 193 |
- for (int k=0; k<ncol; k++) {
|
|
| 194 |
- v += mat[i][k]; |
|
| 195 |
- } |
|
| 196 |
- return true; |
|
| 197 |
- } else {
|
|
| 198 |
- Rcpp::Rcerr << "Warning: i=" << i << " exceeds matrix row size (" << nrow << ")!" << endl;
|
|
| 199 |
- return false; |
|
| 200 |
- } |
|
| 201 |
- } else {
|
|
| 202 |
- v = 0; |
|
| 203 |
- return false; |
|
| 204 |
- } |
|
| 205 |
-} |
|
| 206 |
- |
|
| 207 |
-void Matrix_Double::get_unique_row_labels(vector<int> & uniq_labels) |
|
| 208 |
-{
|
|
| 209 |
- map<int, int> counts; |
|
| 210 |
- for (int i=0; i<row_labels.size(); i++) |
|
| 211 |
- if (counts.find(row_labels[i]) != counts.end()) |
|
| 212 |
- counts[row_labels[i]] += 1; // found this label, then increment count |
|
| 213 |
- else {
|
|
| 214 |
- counts.insert(make_pair(row_labels[i],1)); // not found this label, then count it once |
|
| 215 |
- //counts[row_labels[i]] = 1; |
|
| 216 |
- } |
|
| 217 |
- map<int,int>::iterator it; |
|
| 218 |
- for (it=counts.begin(); it!=counts.end(); ++it) |
|
| 219 |
- uniq_labels.push_back(it->first); |
|
| 220 |
-} |
|
| 221 |
- |
|
| 222 |
-void Matrix_Double::print_with_additional_column_of_Bins2Value(ostream & os, Bins2Value& additional_column_data) |
|
| 223 |
-{
|
|
| 224 |
- // obtain unique row labels |
|
| 225 |
- vector<int> unique_row_labels; |
|
| 226 |
- int row_label_; |
|
| 227 |
- for (int i=0; i<nrow; i++) {
|
|
| 228 |
- row_label_ = row_labels[i]; |
|
| 229 |
- unique_row_labels.push_back( row_label_ ); |
|
| 230 |
- } |
|
| 231 |
- Bins2Value::iterator iter; |
|
| 232 |
- for (iter=additional_column_data.begin(); iter!=additional_column_data.end(); iter++) {
|
|
| 233 |
- row_label_ = iter->first; |
|
| 234 |
- if (!exist_row_label(row_label_)) {
|
|
| 235 |
- unique_row_labels.push_back( row_label_ ); |
|
| 236 |
- } |
|
| 237 |
- } |
|
| 238 |
- sort(unique_row_labels.begin(), unique_row_labels.end()); |
|
| 239 |
- // print both 'mat' and 'additional_column_data' as the last column |
|
| 240 |
- for (int i=0; i<unique_row_labels.size(); i++) {
|
|
| 241 |
- row_label_ = unique_row_labels[i]; |
|
| 242 |
- os << row_label_; |
|
| 243 |
- int row_index = get_row_index(row_label_); |
|
| 244 |
- if (row_index!=-1) {
|
|
| 245 |
- for (int j=0; j<ncol; j++) {
|
|
| 246 |
- os << "\t" << mat[row_index][j]; |
|
| 247 |
- } |
|
| 248 |
- } else {
|
|
| 249 |
- for (int j=0; j<ncol; j++) {
|
|
| 250 |
- os << "\t0"; |
|
| 251 |
- } |
|
| 252 |
- } |
|
| 253 |
- if (additional_column_data.find(row_label_)!=additional_column_data.end()) {
|
|
| 254 |
- os << "\t" << additional_column_data[row_label_]; |
|
| 255 |
- } else {
|
|
| 256 |
- os << "\t0"; |
|
| 257 |
- } |
|
| 258 |
- os << endl; |
|
| 259 |
- } |
|
| 260 |
-} |
|
| 261 |
- |
|
| 262 |
-ostream & operator<<(ostream & os, Matrix_Double & mat) |
|
| 263 |
-{
|
|
| 264 |
- double v; |
|
| 265 |
- int row_label; |
|
| 266 |
- if (!mat.isempty()) {
|
|
| 267 |
- for (int i=0; i<mat.get_row_num(); i++) {
|
|
| 268 |
- int j; |
|
| 269 |
- mat.get_row_label(i, row_label); |
|
| 270 |
- os << row_label << "\t"; |
|
| 271 |
- for (j=0; j<mat.get_column_num()-1; j++) {
|
|
| 272 |
- mat.get_element(i,j,v); |
|
| 273 |
- os << v << "\t"; |
|
| 274 |
- } |
|
| 275 |
- mat.get_element(i,j,v); |
|
| 276 |
- os << v << endl; |
|
| 277 |
- } |
|
| 278 |
- } |
|
| 279 |
- return os; |
|
| 280 |
-} |
|
| 281 |
- |
|
| 282 |
-double objective_em_supervise(Matrix_Double & p, vector<double> & theta) |
|
| 283 |
-{
|
|
| 284 |
- unsigned int ncol = p.get_column_num(); |
|
| 285 |
- unsigned int nrow = p.get_row_num(); |
|
| 286 |
- double v; |
|
| 287 |
- double obj = 0; |
|
| 288 |
- for (int i=0; i<nrow; i++) {
|
|
| 289 |
- double sum = 0; |
|
| 290 |
- for (int j=0; j<ncol; j++) {
|
|
| 291 |
- p.get_element(i,j,v); |
|
| 292 |
- sum += theta[j]*v; |
|
| 293 |
- } |
|
| 294 |
- obj += log(sum); |
|
| 295 |
- } |
|
| 296 |
- return obj; |
|
| 297 |
-} |
|
| 298 |
-// |
|
| 299 |
-// There are T known tissues and 1 unknown tissue (described by a double vector "m") |
|
| 300 |
-// |
|
| 301 |
-// input: |
|
| 302 |
-// p is a matrix of N X T, where N is number of reads and T is number of known tissue |
|
| 303 |
-// |
|
| 304 |
-// output: |
|
| 305 |
-// theta (model parameters), a vector with "T" elements. |
|
| 306 |
-// q is a matrix of N X T, the tissue-specific posterior probabilty of each read |
|
| 307 |
-// |
|
| 308 |
-void em_supervise(Matrix_Double & p, int max_iter, vector<double> & theta, Matrix_Double& q) |
|
| 309 |
-{
|
|
| 310 |
- // cout.precision(15); |
|
| 311 |
- // cerr.precision(15); |
|
| 312 |
- unsigned int ncol = p.get_column_num(); |
|
| 313 |
- unsigned int nrow = p.get_row_num(); // nrow is number of tissues. |
|
| 314 |
- theta.resize(ncol, 0); // assign (num_tissues) space and initialize to 0s. |
|
| 315 |
- // initialize model parameters as uniform distribution |
|
| 316 |
- // alternatively, model parameters can be random numbers by satisfying the crition |
|
| 317 |
- // (1) \sum_{i=1}^{ncol}{theta_i}=1
|
|
| 318 |
- for (int j=0; j<ncol; j++) {
|
|
| 319 |
- theta[j] = 1/(double)ncol; |
|
| 320 |
- } |
|
| 321 |
- // create and initialize q with the same size of p and with all elements initialized as 0 |
|
| 322 |
- //Matrix_Double q(nrow, ncol, 0); |
|
| 323 |
- //cerr << "iter 0\t" << objective_em_supervise(p, theta) << endl; |
|
| 324 |
- double v1, v2; |
|
| 325 |
- for (int iter=0; iter<max_iter; iter++) {
|
|
| 326 |
- Rcpp::Rcerr << iter+1 << "," ; |
|
| 327 |
- // E-step: estimate q |
|
| 328 |
- for (int i=0; i<nrow; i++) {
|
|
| 329 |
- double sum = 0; |
|
| 330 |
- for (int j=0; j<ncol; j++) {
|
|
| 331 |
- p.get_element(i,j,v1); |
|
| 332 |
- v2 = theta[j]*v1; |
|
| 333 |
- q.set_element( i, j, v2 ); |
|
| 334 |
- sum += v2; |
|
| 335 |
- } |
|
| 336 |
- for (int j=0; j<ncol; j++) {
|
|
| 337 |
- q.get_element(i,j,v2); |
|
| 338 |
- v2 /= sum; |
|
| 339 |
- q.set_element( i, j, v2 ); |
|
| 340 |
- } |
|
| 341 |
- } |
|
| 342 |
- // M-step: estimate theta |
|
| 343 |
- for (int j=0; j<ncol; j++) {
|
|
| 344 |
- double sum=0; |
|
| 345 |
- for (int i=0; i<nrow; i++) {
|
|
| 346 |
- q.get_element(i,j,v2); |
|
| 347 |
- sum += v2; |
|
| 348 |
- } |
|
| 349 |
- theta[j] = sum / nrow; |
|
| 350 |
- } |
|
| 351 |
- // for debug |
|
| 352 |
- //cerr << "iter " << (iter+1) << "\t" << objective_em_supervise(p, theta) << endl; |
|
| 353 |
- } |
|
| 354 |
- // last E-step: estimate q |
|
| 355 |
- for (int i=0; i<nrow; i++) {
|
|
| 356 |
- double sum = 0; |
|
| 357 |
- for (int j=0; j<ncol; j++) {
|
|
| 358 |
- p.get_element(i,j,v1); |
|
| 359 |
- v2 = theta[j]*v1; |
|
| 360 |
- q.set_element( i, j, v2 ); |
|
| 361 |
- sum += v2; |
|
| 362 |
- } |
|
| 363 |
- for (int j=0; j<ncol; j++) {
|
|
| 364 |
- q.get_element(i,j,v2); |
|
| 365 |
- v2 /= sum; |
|
| 366 |
- q.set_element( i, j, v2 ); |
|
| 367 |
- } |
|
| 368 |
- } |
|
| 369 |
- Rcpp::Rcerr << endl; |
|
| 370 |
-} |
|
| 371 |
- |
|
| 372 |
-// |
|
| 373 |
-// There are T known tissues and 1 unknown tissue (described by a double vector "m") |
|
| 374 |
-// |
|
| 375 |
-// input: |
|
| 376 |
-// p is a matrix of N X T, where N is number of reads and T is number of known tissue |
|
| 377 |
-// p.row_label is an int vector of N X 1, each element is the marker Id (1-base) of a read. |
|
| 378 |
-// Rm is a vector of N X 1, each element is number of valid methylated CpG sites in a read |
|
| 379 |
-// Rl is a vector of N X 1, each element is number of all valid CpG sites in a read |
|
| 380 |
-// |
|
| 381 |
-// output: |
|
| 382 |
-// theta is a vector of (T+1) X 1, where T is number of known tissue. This vector is already allocated with space of (ncol+1) units. |
|
| 383 |
-// m is a vector of M X 1, where M is number of markers. |
|
| 384 |
-// q is a matrix of N X T, the tissue-specific posterior probabilty of each read |
|
| 385 |
-// q_unknown is a vector of N X 1, the posterior probabilty of each read for the unknown class. It does not need to be allocated space before this function calling. |
|
| 386 |
-// |
|
| 387 |
-void em_semisupervise(Matrix_Double & p, vector<int> & Rm, vector<int> & Rl, |
|
| 388 |
- int max_iter, vector<double> & theta, Matrix_Double& q, vector<double>& q_unknown, |
|
| 389 |
- vector<double> & m) |
|
| 390 |
-{
|
|
| 391 |
- // cout.precision(15); |
|
| 392 |
- // cerr.precision(15); |
|
| 393 |
- unsigned int ncol = p.get_column_num(); // T, number of known tissues |
|
| 394 |
- unsigned int nrow = p.get_row_num(); // N, number of reads |
|
| 395 |
- |
|
| 396 |
- theta.resize(ncol+1, 0); // assign (num_tissues+1) space and initialize to 0s. |
|
| 397 |
- |
|
| 398 |
- vector<int> uniq_marker_Ids; |
|
| 399 |
- p.get_unique_row_labels(uniq_marker_Ids); |
|
| 400 |
- int nMarker = uniq_marker_Ids.size(); // M, number of markers that all N reads cover. Some markers may not be covered by these N reads. |
|
| 401 |
- m.resize(nMarker, 0); // assign nMarker space and initialize to 0s. |
|
| 402 |
- map<int,int> markerId2Index; // marker index is 0-based. |
|
| 403 |
- for (int index=0; index<nMarker; index++) {
|
|
| 404 |
- markerId2Index.insert(make_pair(uniq_marker_Ids[index],index)); |
|
| 405 |
- } |
|
| 406 |
- |
|
| 407 |
- // initialize model parameters theta as uniform distribution, and m as 0.5 |
|
| 408 |
- // alternatively, theta and m can be random numbers, by satisfying: |
|
| 409 |
- // (1) \sum_{i=1}^{ncol+1}{theta_i}=1
|
|
| 410 |
- // (2) 0 <= m_k <=1 for all k=1,2,...,#marker_covered_by_all_reads |
|
| 411 |
- for (int j=0; j<ncol+1; j++) {
|
|
| 412 |
- theta[j] = 1/(double)(ncol+1); |
|
| 413 |
- } |
|
| 414 |
- for (int k=0; k<nMarker; k++) {
|
|
| 415 |
- m[k] = 0.5; |
|
| 416 |
- } |
|
| 417 |
- for (int i=0; i<nrow; i++) |
|
| 418 |
- q_unknown.push_back(0); |
|
| 419 |
- |
|
| 420 |
- // EM algorithm |
|
| 421 |
- for (int iter=0; iter<max_iter; iter++) {
|
|
| 422 |
- Rcpp::Rcerr << iter+1 << "," ; |
|
| 423 |
- // E-step: estimate q (for T known tissues) and q_unknown (for one unknown tissue) |
|
| 424 |
- double v1, v2, likelihood_unknown_class; |
|
| 425 |
- int markerId, markerIndex; |
|
| 426 |
- for (int i=0; i<nrow; i++) {
|
|
| 427 |
- // process the first T known tissues |
|
| 428 |
- double sum = 0; |
|
| 429 |
- for (int j=0; j<ncol; j++) {
|
|
| 430 |
- p.get_element(i,j,v1); |
|
| 431 |
- v2 = theta[j]*v1; |
|
| 432 |
- q.set_element( i, j, v2 ); |
|
| 433 |
- sum += v2; |
|
| 434 |
- } |
|
| 435 |
- // process the last unknown tissue |
|
| 436 |
- p.get_row_label(i, markerId); // get markerId (1-base) of read i |
|
| 437 |
- markerIndex = markerId2Index[markerId]; |
|
| 438 |
- likelihood_unknown_class = pow(m[markerIndex],Rm[i]) * pow(1-m[markerIndex],Rl[i]-Rm[i]); |
|
| 439 |
- q_unknown[i] = theta[ncol]*likelihood_unknown_class; |
|
| 440 |
- sum += q_unknown[i]; |
|
| 441 |
- |
|
| 442 |
- // update q and q_unknown |
|
| 443 |
- for (int j=0; j<ncol; j++) {
|
|
| 444 |
- q.get_element(i,j,v2); |
|
| 445 |
- v2 /= sum; |
|
| 446 |
- q.set_element( i, j, v2 ); |
|
| 447 |
- } |
|
| 448 |
- q_unknown[i] /= sum; |
|
| 449 |
- } |
|
| 450 |
- |
|
| 451 |
- // M-step 1: estimate unknown class's methylation level (m) of each marker |
|
| 452 |
- double sum1=0, sum2=0; |
|
| 453 |
- int curr_marker_index=-1, prev_marker_index=-1; // all marker index is 0-base |
|
| 454 |
- for (int i=0; i<nrow; i++) {
|
|
| 455 |
- // for each row (or read) |
|
| 456 |
- p.get_row_label(i, markerId); // get the marker_index (0-base) of read i |
|
| 457 |
- curr_marker_index = markerId2Index[markerId]; |
|
| 458 |
- if (curr_marker_index != prev_marker_index) {
|
|
| 459 |
- // this is the 1st read of a new marker |
|
| 460 |
- // we need to summarize m value of previous marker |
|
| 461 |
- if (prev_marker_index!=-1) { // current marker is not the 1st marker
|
|
| 462 |
- m[prev_marker_index] = (sum2!=0 ? sum1/sum2 : 0); |
|
| 463 |
- sum1 = 0; |
|
| 464 |
- sum2 = 0; |
|
| 465 |
- } |
|
| 466 |
- prev_marker_index = curr_marker_index; |
|
| 467 |
- } |
|
| 468 |
- sum1 += q_unknown[i]*Rm[i]; |
|
| 469 |
- sum2 += q_unknown[i]*Rl[i]; |
|
| 470 |
- } |
|
| 471 |
- m[curr_marker_index] = (sum2!=0 ? sum1/sum2 : 0); // estimate m of the last marker |
|
| 472 |
- |
|
| 473 |
- //cout << "iter=" << iter << ", " << "m=" << endl; // for debug |
|
| 474 |
- //print_vec(cout, m, "\n"); // for debug |
|
| 475 |
- //cout << endl; // for debug |
|
| 476 |
- |
|
| 477 |
- // M-step 2: estimate theta, which has ncol+1 values |
|
| 478 |
- double sum; |
|
| 479 |
- for (int j=0; j<ncol; j++) {
|
|
| 480 |
- sum=0; |
|
| 481 |
- for (int i=0; i<nrow; i++) {
|
|
| 482 |
- q.get_element(i,j,v2); |
|
| 483 |
- sum += v2; |
|
| 484 |
- } |
|
| 485 |
- theta[j] = sum / nrow; |
|
| 486 |
- } |
|
| 487 |
- sum=0; |
|
| 488 |
- for (int i=0; i<nrow; i++) |
|
| 489 |
- sum += q_unknown[i]; |
|
| 490 |
- theta[ncol] = sum / nrow; |
|
| 491 |
- |
|
| 492 |
- //cout << "round " << iter+1 << ", theta="; // for debug |
|
| 493 |
- //print_vec(cout, theta, ", "); // for debug |
|
| 494 |
- //cout << endl << endl; // for debug |
|
| 495 |
- } |
|
| 496 |
- Rcpp::Rcerr << endl; |
|
| 497 |
- //cout << "final:" << endl << std::flush; |
|
| 498 |
- //print_vec(cout, theta, ", "); // for debug |
|
| 499 |
- //cout << endl << endl << std::flush; // for debug |
|
| 500 |
-} |
|
| 501 |
- |
|
| 502 |
-void readCounts_by_reads_posterior_probability_version_regular(Matrix_Double& q, double unit, Matrix_Double& readCounts) |
|
| 503 |
-{
|
|
| 504 |
- unsigned int ncol = q.get_column_num(); // T, number of known tissues |
|
| 505 |
- unsigned int nrow = q.get_row_num(); // N, number of reads |
|
| 506 |
- |
|
| 507 |
- vector<int> uniq_marker_Ids; |
|
| 508 |
- q.get_unique_row_labels(uniq_marker_Ids); |
|
| 509 |
- // int nMarker = uniq_marker_Ids.size(); // M, number of markers that all N reads cover. Some markers may not be covered by these N reads. |
|
| 510 |
- |
|
| 511 |
- double v; |
|
| 512 |
- int curr_markerId=-1, prev_markerId=-1; // all marker index is 0-base |
|
| 513 |
- vector<double> readCountsPerMarker(ncol, 0); // a vector of read counts for one marker (T elements) with default values 0 |
|
| 514 |
- for (int i=0; i<nrow; i++) {
|
|
| 515 |
- // for each row (or read) |
|
| 516 |
- q.get_row_label(i, curr_markerId); // get the marker ID of read i |
|
| 517 |
- if (curr_markerId != prev_markerId) {
|
|
| 518 |
- // this is the 1st read of a new marker |
|
| 519 |
- // we need to summarize read counts of each tissue for the previous marker and deposit them to the matrix "readCounts" |
|
| 520 |
- // then clear readCountsPerMarker for a new read counting of the next new marker |
|
| 521 |
- if (prev_markerId!=-1) { // current marker is not the 1st marker
|
|
| 522 |
- multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit. |
|
| 523 |
- readCounts.append_row_vector(readCountsPerMarker, prev_markerId); |
|
| 524 |
- assign_vector_zeros(readCountsPerMarker); |
|
| 525 |
- } |
|
| 526 |
- prev_markerId = curr_markerId; |
|
| 527 |
- } |
|
| 528 |
- for (int j=0; j<ncol; j++) {
|
|
| 529 |
- q.get_element(i,j,v); |
|
| 530 |
- readCountsPerMarker[j] += v; |
|
| 531 |
- } |
|
| 532 |
- } |
|
| 533 |
- multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit. |
|
| 534 |
- readCounts.append_row_vector(readCountsPerMarker, curr_markerId); |
|
| 535 |
-} |
|
| 536 |
- |
|
| 537 |
-void readCounts_by_reads_posterior_probability_version_unknownclass(Matrix_Double& q, vector<double>& q_unknown, double unit, Matrix_Double& readCounts) |
|
| 538 |
-{
|
|
| 539 |
- unsigned int ncol = q.get_column_num(); // T, number of known tissues |
|
| 540 |
- unsigned int nrow = q.get_row_num(); // N, number of reads |
|
| 541 |
- if (nrow!=(unsigned int)q_unknown.size()) {
|
|
| 542 |
- Rcpp::Rcerr << "Error (readCounts_by_reads_posterior_probability_version_unknownclass): row number of q_unknown does not match with row number of q!" << endl; |
|
| 543 |
- // exit(EXIT_FAILURE); |
|
| 544 |
- } |
|
| 545 |
- |
|
| 546 |
- vector<int> uniq_marker_Ids; |
|
| 547 |
- q.get_unique_row_labels(uniq_marker_Ids); |
|
| 548 |
- // int nMarker = uniq_marker_Ids.size(); // M, number of markers that all N reads cover. Some markers may not be covered by these N reads. |
|
| 549 |
- |
|
| 550 |
- double v; |
|
| 551 |
- int curr_markerId=-1, prev_markerId=-1; // all marker index is 0-base |
|
| 552 |
- vector<double> readCountsPerMarker(ncol+1, 0); // a vector of read counts for one marker (T+1 elements) with default values 0. The last element is for the unknown class. |
|
| 553 |
- for (int i=0; i<nrow; i++) {
|
|
| 554 |
- // for each row (or read) |
|
| 555 |
- q.get_row_label(i, curr_markerId); // get the marker ID of read i |
|
| 556 |
- if (curr_markerId != prev_markerId) {
|
|
| 557 |
- // this is the 1st read of a new marker |
|
| 558 |
- // we need to summarize read counts of each tissue for the previous marker and deposit them to the matrix "readCounts" |
|
| 559 |
- // then clear readCountsPerMarker for a new read counting of the next new marker |
|
| 560 |
- if (prev_markerId!=-1) { // current marker is not the 1st marker
|
|
| 561 |
- multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit. |
|
| 562 |
- readCounts.append_row_vector(readCountsPerMarker, prev_markerId); |
|
| 563 |
- assign_vector_zeros(readCountsPerMarker); |
|
| 564 |
- } |
|
| 565 |
- prev_markerId = curr_markerId; |
|
| 566 |
- } |
|
| 567 |
- for (int j=0; j<ncol; j++) {
|
|
| 568 |
- q.get_element(i,j,v); |
|
| 569 |
- readCountsPerMarker[j] += v; |
|
| 570 |
- } |
|
| 571 |
- readCountsPerMarker[ncol] += q_unknown[i]; |
|
| 572 |
- } |
|
| 573 |
- multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit. |
|
| 574 |
- readCounts.append_row_vector(readCountsPerMarker, curr_markerId); |
|
| 575 |
-} |
|
| 576 |
- |
|
| 1 |
+// [[Rcpp::depends(BH)]] |
|
| 2 |
+#include <Rcpp.h> |
|
| 3 |
+using namespace Rcpp; |
|
| 4 |
+ |
|
| 5 |
+#include <cstdlib> |
|
| 6 |
+#include <cmath> |
|
| 7 |
+#include <sstream> |
|
| 8 |
+#include <iostream> |
|
| 9 |
+#include <fstream> |
|
| 10 |
+#include <map> |
|
| 11 |
+#include <vector> |
|
| 12 |
+#include <string> |
|
| 13 |
+#include <cassert> |
|
| 14 |
+#include <algorithm> |
|
| 15 |
+#include <boost/algorithm/string.hpp> |
|
| 16 |
+#include <boost/assign.hpp> |
|
| 17 |
+#include <boost/iostreams/filtering_stream.hpp> |
|
| 18 |
+#include "matrix.h" |
|
| 19 |
+#include "utils.h" |
|
| 20 |
+ |
|
| 21 |
+using namespace std; |
|
| 22 |
+using namespace boost; |
|
| 23 |
+ |
|
| 24 |
+#include "template_utils.cpp" |
|
| 25 |
+ |
|
| 26 |
+Matrix_Double::Matrix_Double(const unsigned int nrow_, const unsigned int ncol_, const double init_value) |
|
| 27 |
+{
|
|
| 28 |
+ nrow = nrow_; |
|
| 29 |
+ ncol = ncol_; |
|
| 30 |
+ empty = false; |
|
| 31 |
+ mat.reserve(nrow_); |
|
| 32 |
+ for (int i=0; i<nrow; i++) {
|
|
| 33 |
+ vector<double> temp; |
|
| 34 |
+ temp.reserve(ncol); |
|
| 35 |
+ for (int j=0; j<ncol; j++) |
|
| 36 |
+ temp.push_back(init_value); |
|
| 37 |
+ mat.push_back( temp ); |
|
| 38 |
+ } |
|
| 39 |
+ empty = mat.empty(); |
|
| 40 |
+} |
|
| 41 |
+ |
|
| 42 |
+// a line represents a row of the matrix. Each element in this row is delimited by a TAB. |
|
| 43 |
+void Matrix_Double::process_one_line_of_mat_file(string & line, int num_header_column) |
|
| 44 |
+{
|
|
| 45 |
+ vector<string> items; |
|
| 46 |
+ split(items, line, is_any_of("\t"));
|
|
| 47 |
+ vector<double> v; |
|
| 48 |
+ for (int i=0; i<items.size(); i++) {
|
|
| 49 |
+ if (i==0) row_labels.push_back( atoi(items[0].c_str()) ); // the first column is the label of this row. |
|
| 50 |
+ if (i<num_header_column) continue; // skip the first num_header_column header columns |
|
| 51 |
+ v.push_back( atof(items[i].c_str()) ); |
|
| 52 |
+ } |
|
| 53 |
+ mat.push_back( v ); |
|
| 54 |
+} |
|
| 55 |
+ |
|
| 56 |
+Matrix_Double::Matrix_Double(const string & mat_file, |
|
| 57 |
+ bool header_line=false, int num_header_column=1) |
|
| 58 |
+{
|
|
| 59 |
+ unsigned long i_line=0; |
|
| 60 |
+ // input is a plain text file |
|
| 61 |
+ istream * in=&cin; // default is stdin |
|
| 62 |
+ ifstream fin; |
|
| 63 |
+ if ( !(mat_file.compare("stdin")==0)) {
|
|
| 64 |
+ fin.open(mat_file.c_str()); |
|
| 65 |
+ if (fin.fail()){
|
|
| 66 |
+ Rcpp::Rcerr << "Error: Unable to open " << mat_file << " in Matrix_Double()" << endl; |
|
| 67 |
+ // exit(EXIT_FAILURE); |
|
| 68 |
+ } |
|
| 69 |
+ in = &fin; |
|
| 70 |
+ } |
|
| 71 |
+ string line; |
|
| 72 |
+ while (!(*in).eof()) {
|
|
| 73 |
+ getline((*in), line); |
|
| 74 |
+ //cerr << "Line: " << line << endl; |
|
| 75 |
+ if (header_line) {
|
|
| 76 |
+ if (i_line==0) {
|
|
| 77 |
+ // skip the header line |
|
| 78 |
+ i_line++; |
|
| 79 |
+ continue; |
|
| 80 |
+ } |
|
| 81 |
+ } |
|
| 82 |
+ if (line.empty()) {
|
|
| 83 |
+ // this is the last line of the file |
|
| 84 |
+ break; |
|
| 85 |
+ } |
|
| 86 |
+ process_one_line_of_mat_file(line, num_header_column); |
|
| 87 |
+ i_line++; |
|
| 88 |
+ } |
|
| 89 |
+ if ( !(mat_file.compare("stdin")==0)) {
|
|
| 90 |
+ fin.close(); |
|
| 91 |
+ } |
|
| 92 |
+ empty = mat.empty(); |
|
| 93 |
+ nrow = mat.size(); |
|
| 94 |
+ if (nrow>=1) ncol = mat[0].size(); |
|
| 95 |
+ else ncol = 0; |
|
| 96 |
+} |
|
| 97 |
+ |
|
| 98 |
+// return the row index of this vector |
|
| 99 |
+long Matrix_Double::append_row_vector(vector<double>& v_, int row_label_) {
|
|
| 100 |
+ if (v_.size()!=ncol) {
|
|
| 101 |
+ Rcpp::Rcerr << "Error of Matrix_Double::append_row_vector: the appended vector size (" << v_.size() << ") does not match the number of column in the matrix (" << ncol << ")!\nExit." << endl;
|
|
| 102 |
+ // exit(EXIT_FAILURE); |
|
| 103 |
+ } |
|
| 104 |
+ vector<double> v; |
|
| 105 |
+ for (int i=0; i<v_.size(); i++) {
|
|
| 106 |
+ v.push_back( v_[i] ); |
|
| 107 |
+ } |
|
| 108 |
+ mat.push_back( v ); |
|
| 109 |
+ nrow = mat.size(); |
|
| 110 |
+ row_labels.push_back(row_label_); |
|
| 111 |
+ empty=false; |
|
| 112 |
+ return(nrow-1); |
|
| 113 |
+} |
|
| 114 |
+ |
|
| 115 |
+// |
|
| 116 |
+// if max(v_)/min(v_) >= min_threshold_maxv_minv, then we append this vector, otherwise ignore it. |
|
| 117 |
+// |
|
| 118 |
+// return row index of this appended vector, if successfully appending; return -1 otherwise. |
|
| 119 |
+// |
|
| 120 |
+long Matrix_Double::append_row_vector_with_filter(vector<double>& v_, int row_label_, double min_threshold_maxv_minv) {
|
|
| 121 |
+ if (v_.size()!=ncol) {
|
|
| 122 |
+ Rcpp::Rcerr << "Error of Matrix_Double::append_row_vector_with_filter: the appended vector size (" << v_.size() << ") does not match the number of column in the matrix (" << ncol << ")!\nExit." << endl;
|
|
| 123 |
+ // exit(EXIT_FAILURE); |
|
| 124 |
+ } |
|
| 125 |
+ double min_ = *min_element(v_.begin(), v_.end()); |
|
| 126 |
+ double max_ = *max_element(v_.begin(), v_.end()); |
|
| 127 |
+ //cout << "max: " << max_ << ", min: " << min_ << ", ratio: " << max_/min_ << endl; |
|
| 128 |
+ if (min_==0) {
|
|
| 129 |
+ if (max_==0) return(-1); |
|
| 130 |
+ } else {
|
|
| 131 |
+ double ratio=max_/min_; |
|
| 132 |
+ if (ratio<min_threshold_maxv_minv) return(-1); |
|
| 133 |
+ } |
|
| 134 |
+ long row_index = append_row_vector(v_, row_label_); |
|
| 135 |
+ return(row_index); |
|
| 136 |
+} |
|
| 137 |
+ |
|
| 138 |
+bool Matrix_Double::get_element(const int i, const int j, double & v) |
|
| 139 |
+{
|
|
| 140 |
+ if (!empty) {
|
|
| 141 |
+ if (i<nrow && j<ncol) {
|
|
| 142 |
+ v = mat[i][j]; |
|
| 143 |
+ return true; |
|
| 144 |
+ } else {
|
|
| 145 |
+ Rcpp::Rcerr << "Warning: i=" << i << " and j=" << j << " exceed matrix size!" << endl; |
|
| 146 |
+ return false; |
|
| 147 |
+ } |
|
| 148 |
+ } else {
|
|
| 149 |
+ v = 0; |
|
| 150 |
+ return false; |
|
| 151 |
+ } |
|
| 152 |
+} |
|
| 153 |
+ |
|
| 154 |
+bool Matrix_Double::set_element(const int i, const int j, double v) |
|
| 155 |
+{
|
|
| 156 |
+ if (!empty) {
|
|
| 157 |
+ if (i<nrow && j<ncol) {
|
|
| 158 |
+ mat[i][j] = v; |
|
| 159 |
+ return true; |
|
| 160 |
+ } else {
|
|
| 161 |
+ Rcpp::Rcerr << "Warning: i=" << i << " and j=" << j << " exceed matrix size!" << endl; |
|
| 162 |
+ return false; |
|
| 163 |
+ } |
|
| 164 |
+ } else {
|
|
| 165 |
+ return false; |
|
| 166 |
+ } |
|
| 167 |
+} |
|
| 168 |
+ |
|
| 169 |
+bool Matrix_Double::get_column_sum(const int j, double & v) |
|
| 170 |
+{
|
|
| 171 |
+ if (!empty) {
|
|
| 172 |
+ if (j<ncol) {
|
|
| 173 |
+ v = 0; |
|
| 174 |
+ for (int k=0; k<nrow; k++) {
|
|
| 175 |
+ v += mat[k][j]; |
|
| 176 |
+ } |
|
| 177 |
+ return true; |
|
| 178 |
+ } else {
|
|
| 179 |
+ Rcpp::Rcerr << "Warning: j=" << j << " exceeds matrix column size (" << ncol << ")!" << endl;
|
|
| 180 |
+ return false; |
|
| 181 |
+ } |
|
| 182 |
+ } else {
|
|
| 183 |
+ v = 0; |
|
| 184 |
+ return false; |
|
| 185 |
+ } |
|
| 186 |
+} |
|
| 187 |
+ |
|
| 188 |
+bool Matrix_Double::get_row_sum(const int i, double & v) |
|
| 189 |
+{
|
|
| 190 |
+ if (!empty) {
|
|
| 191 |
+ if (i<nrow) {
|
|
| 192 |
+ v = 0; |
|
| 193 |
+ for (int k=0; k<ncol; k++) {
|
|
| 194 |
+ v += mat[i][k]; |
|
| 195 |
+ } |
|
| 196 |
+ return true; |
|
| 197 |
+ } else {
|
|
| 198 |
+ Rcpp::Rcerr << "Warning: i=" << i << " exceeds matrix row size (" << nrow << ")!" << endl;
|
|
| 199 |
+ return false; |
|
| 200 |
+ } |
|
| 201 |
+ } else {
|
|
| 202 |
+ v = 0; |
|
| 203 |
+ return false; |
|
| 204 |
+ } |
|
| 205 |
+} |
|
| 206 |
+ |
|
| 207 |
+void Matrix_Double::get_unique_row_labels(vector<int> & uniq_labels) |
|
| 208 |
+{
|
|
| 209 |
+ map<int, int> counts; |
|
| 210 |
+ for (int i=0; i<row_labels.size(); i++) |
|
| 211 |
+ if (counts.find(row_labels[i]) != counts.end()) |
|
| 212 |
+ counts[row_labels[i]] += 1; // found this label, then increment count |
|
| 213 |
+ else {
|
|
| 214 |
+ counts.insert(make_pair(row_labels[i],1)); // not found this label, then count it once |
|
| 215 |
+ //counts[row_labels[i]] = 1; |
|
| 216 |
+ } |
|
| 217 |
+ map<int,int>::iterator it; |
|
| 218 |
+ for (it=counts.begin(); it!=counts.end(); ++it) |
|
| 219 |
+ uniq_labels.push_back(it->first); |
|
| 220 |
+} |
|
| 221 |
+ |
|
| 222 |
+void Matrix_Double::print_with_additional_column_of_Bins2Value(ostream & os, Bins2Value& additional_column_data) |
|
| 223 |
+{
|
|
| 224 |
+ // obtain unique row labels |
|
| 225 |
+ vector<int> unique_row_labels; |
|
| 226 |
+ int row_label_; |
|
| 227 |
+ for (int i=0; i<nrow; i++) {
|
|
| 228 |
+ row_label_ = row_labels[i]; |
|
| 229 |
+ unique_row_labels.push_back( row_label_ ); |
|
| 230 |
+ } |
|
| 231 |
+ Bins2Value::iterator iter; |
|
| 232 |
+ for (iter=additional_column_data.begin(); iter!=additional_column_data.end(); iter++) {
|
|
| 233 |
+ row_label_ = iter->first; |
|
| 234 |
+ if (!exist_row_label(row_label_)) {
|
|
| 235 |
+ unique_row_labels.push_back( row_label_ ); |
|
| 236 |
+ } |
|
| 237 |
+ } |
|
| 238 |
+ sort(unique_row_labels.begin(), unique_row_labels.end()); |
|
| 239 |
+ // print both 'mat' and 'additional_column_data' as the last column |
|
| 240 |
+ for (int i=0; i<unique_row_labels.size(); i++) {
|
|
| 241 |
+ row_label_ = unique_row_labels[i]; |
|
| 242 |
+ os << row_label_; |
|
| 243 |
+ int row_index = get_row_index(row_label_); |
|
| 244 |
+ if (row_index!=-1) {
|
|
| 245 |
+ for (int j=0; j<ncol; j++) {
|
|
| 246 |
+ os << "\t" << mat[row_index][j]; |
|
| 247 |
+ } |
|
| 248 |
+ } else {
|
|
| 249 |
+ for (int j=0; j<ncol; j++) {
|
|
| 250 |
+ os << "\t0"; |
|
| 251 |
+ } |
|
| 252 |
+ } |
|
| 253 |
+ if (additional_column_data.find(row_label_)!=additional_column_data.end()) {
|
|
| 254 |
+ os << "\t" << additional_column_data[row_label_]; |
|
| 255 |
+ } else {
|
|
| 256 |
+ os << "\t0"; |
|
| 257 |
+ } |
|
| 258 |
+ os << endl; |
|
| 259 |
+ } |
|
| 260 |
+} |
|
| 261 |
+ |
|
| 262 |
+ostream & operator<<(ostream & os, Matrix_Double & mat) |
|
| 263 |
+{
|
|
| 264 |
+ double v; |
|
| 265 |
+ int row_label; |
|
| 266 |
+ if (!mat.isempty()) {
|
|
| 267 |
+ for (int i=0; i<mat.get_row_num(); i++) {
|
|
| 268 |
+ int j; |
|
| 269 |
+ mat.get_row_label(i, row_label); |
|
| 270 |
+ os << row_label << "\t"; |
|
| 271 |
+ for (j=0; j<mat.get_column_num()-1; j++) {
|
|
| 272 |
+ mat.get_element(i,j,v); |
|
| 273 |
+ os << v << "\t"; |
|
| 274 |
+ } |
|
| 275 |
+ mat.get_element(i,j,v); |
|
| 276 |
+ os << v << endl; |
|
| 277 |
+ } |
|
| 278 |
+ } |
|
| 279 |
+ return os; |
|
| 280 |
+} |
|
| 281 |
+ |
|
| 282 |
+double objective_em_supervise(Matrix_Double & p, vector<double> & theta) |
|
| 283 |
+{
|
|
| 284 |
+ unsigned int ncol = p.get_column_num(); |
|
| 285 |
+ unsigned int nrow = p.get_row_num(); |
|
| 286 |
+ double v; |
|
| 287 |
+ double obj = 0; |
|
| 288 |
+ for (int i=0; i<nrow; i++) {
|
|
| 289 |
+ double sum = 0; |
|
| 290 |
+ for (int j=0; j<ncol; j++) {
|
|
| 291 |
+ p.get_element(i,j,v); |
|
| 292 |
+ sum += theta[j]*v; |
|
| 293 |
+ } |
|
| 294 |
+ obj += log(sum); |
|
| 295 |
+ } |
|
| 296 |
+ return obj; |
|
| 297 |
+} |
|
| 298 |
+// |
|
| 299 |
+// There are T known tissues and 1 unknown tissue (described by a double vector "m") |
|
| 300 |
+// |
|
| 301 |
+// input: |
|
| 302 |
+// p is a matrix of N X T, where N is number of reads and T is number of known tissue |
|
| 303 |
+// |
|
| 304 |
+// output: |
|
| 305 |
+// theta (model parameters), a vector with "T" elements. |
|
| 306 |
+// q is a matrix of N X T, the tissue-specific posterior probabilty of each read |
|
| 307 |
+// |
|
| 308 |
+void em_supervise(Matrix_Double & p, int max_iter, vector<double> & theta, Matrix_Double& q) |
|
| 309 |
+{
|
|
| 310 |
+ // cout.precision(15); |
|
| 311 |
+ // cerr.precision(15); |
|
| 312 |
+ unsigned int ncol = p.get_column_num(); |
|
| 313 |
+ unsigned int nrow = p.get_row_num(); // nrow is number of tissues. |
|
| 314 |
+ theta.resize(ncol, 0); // assign (num_tissues) space and initialize to 0s. |
|
| 315 |
+ // initialize model parameters as uniform distribution |
|
| 316 |
+ // alternatively, model parameters can be random numbers by satisfying the crition |
|
| 317 |
+ // (1) \sum_{i=1}^{ncol}{theta_i}=1
|
|
| 318 |
+ for (int j=0; j<ncol; j++) {
|
|
| 319 |
+ theta[j] = 1/(double)ncol; |
|
| 320 |
+ } |
|
| 321 |
+ // create and initialize q with the same size of p and with all elements initialized as 0 |
|
| 322 |
+ //Matrix_Double q(nrow, ncol, 0); |
|
| 323 |
+ //cerr << "iter 0\t" << objective_em_supervise(p, theta) << endl; |
|
| 324 |
+ double v1, v2; |
|
| 325 |
+ for (int iter=0; iter<max_iter; iter++) {
|
|
| 326 |
+ // Rcpp::Rcerr << iter+1 << "," ; |
|
| 327 |
+ // E-step: estimate q |
|
| 328 |
+ for (int i=0; i<nrow; i++) {
|
|
| 329 |
+ double sum = 0; |
|
| 330 |
+ for (int j=0; j<ncol; j++) {
|
|
| 331 |
+ p.get_element(i,j,v1); |
|
| 332 |
+ v2 = theta[j]*v1; |
|
| 333 |
+ q.set_element( i, j, v2 ); |
|
| 334 |
+ sum += v2; |
|
| 335 |
+ } |
|
| 336 |
+ for (int j=0; j<ncol; j++) {
|
|
| 337 |
+ q.get_element(i,j,v2); |
|
| 338 |
+ v2 /= sum; |
|
| 339 |
+ q.set_element( i, j, v2 ); |
|
| 340 |
+ } |
|
| 341 |
+ } |
|
| 342 |
+ // M-step: estimate theta |
|
| 343 |
+ for (int j=0; j<ncol; j++) {
|
|
| 344 |
+ double sum=0; |
|
| 345 |
+ for (int i=0; i<nrow; i++) {
|
|
| 346 |
+ q.get_element(i,j,v2); |
|
| 347 |
+ sum += v2; |
|
| 348 |
+ } |
|
| 349 |
+ theta[j] = sum / nrow; |
|
| 350 |
+ } |
|
| 351 |
+ // for debug |
|
| 352 |
+ //cerr << "iter " << (iter+1) << "\t" << objective_em_supervise(p, theta) << endl; |
|
| 353 |
+ } |
|
| 354 |
+ // last E-step: estimate q |
|
| 355 |
+ for (int i=0; i<nrow; i++) {
|
|
| 356 |
+ double sum = 0; |
|
| 357 |
+ for (int j=0; j<ncol; j++) {
|
|
| 358 |
+ p.get_element(i,j,v1); |
|
| 359 |
+ v2 = theta[j]*v1; |
|
| 360 |
+ q.set_element( i, j, v2 ); |
|
| 361 |
+ sum += v2; |
|
| 362 |
+ } |
|
| 363 |
+ for (int j=0; j<ncol; j++) {
|
|
| 364 |
+ q.get_element(i,j,v2); |
|
| 365 |
+ v2 /= sum; |
|
| 366 |
+ q.set_element( i, j, v2 ); |
|
| 367 |
+ } |
|
| 368 |
+ } |
|
| 369 |
+ Rcpp::Rcerr << endl; |
|
| 370 |
+} |
|
| 371 |
+ |
|
| 372 |
+// |
|
| 373 |
+// There are T known tissues and 1 unknown tissue (described by a double vector "m") |
|
| 374 |
+// |
|
| 375 |
+// input: |
|
| 376 |
+// p is a matrix of N X T, where N is number of reads and T is number of known tissue |
|
| 377 |
+// p.row_label is an int vector of N X 1, each element is the marker Id (1-base) of a read. |
|
| 378 |
+// Rm is a vector of N X 1, each element is number of valid methylated CpG sites in a read |
|
| 379 |
+// Rl is a vector of N X 1, each element is number of all valid CpG sites in a read |
|
| 380 |
+// |
|
| 381 |
+// output: |
|
| 382 |
+// theta is a vector of (T+1) X 1, where T is number of known tissue. This vector is already allocated with space of (ncol+1) units. |
|
| 383 |
+// m is a vector of M X 1, where M is number of markers. |
|
| 384 |
+// q is a matrix of N X T, the tissue-specific posterior probabilty of each read |
|
| 385 |
+// q_unknown is a vector of N X 1, the posterior probabilty of each read for the unknown class. It does not need to be allocated space before this function calling. |
|
| 386 |
+// |
|
| 387 |
+void em_semisupervise(Matrix_Double & p, vector<int> & Rm, vector<int> & Rl, |
|
| 388 |
+ int max_iter, vector<double> & theta, Matrix_Double& q, vector<double>& q_unknown, |
|
| 389 |
+ vector<double> & m) |
|
| 390 |
+{
|
|
| 391 |
+ // cout.precision(15); |
|
| 392 |
+ // cerr.precision(15); |
|
| 393 |
+ unsigned int ncol = p.get_column_num(); // T, number of known tissues |
|
| 394 |
+ unsigned int nrow = p.get_row_num(); // N, number of reads |
|
| 395 |
+ |
|
| 396 |
+ theta.resize(ncol+1, 0); // assign (num_tissues+1) space and initialize to 0s. |
|
| 397 |
+ |
|
| 398 |
+ vector<int> uniq_marker_Ids; |
|
| 399 |
+ p.get_unique_row_labels(uniq_marker_Ids); |
|
| 400 |
+ int nMarker = uniq_marker_Ids.size(); // M, number of markers that all N reads cover. Some markers may not be covered by these N reads. |
|
| 401 |
+ m.resize(nMarker, 0); // assign nMarker space and initialize to 0s. |
|
| 402 |
+ map<int,int> markerId2Index; // marker index is 0-based. |
|
| 403 |
+ for (int index=0; index<nMarker; index++) {
|
|
| 404 |
+ markerId2Index.insert(make_pair(uniq_marker_Ids[index],index)); |
|
| 405 |
+ } |
|
| 406 |
+ |
|
| 407 |
+ // initialize model parameters theta as uniform distribution, and m as 0.5 |
|
| 408 |
+ // alternatively, theta and m can be random numbers, by satisfying: |
|
| 409 |
+ // (1) \sum_{i=1}^{ncol+1}{theta_i}=1
|
|
| 410 |
+ // (2) 0 <= m_k <=1 for all k=1,2,...,#marker_covered_by_all_reads |
|
| 411 |
+ for (int j=0; j<ncol+1; j++) {
|
|
| 412 |
+ theta[j] = 1/(double)(ncol+1); |
|
| 413 |
+ } |
|
| 414 |
+ for (int k=0; k<nMarker; k++) {
|
|
| 415 |
+ m[k] = 0.5; |
|
| 416 |
+ } |
|
| 417 |
+ for (int i=0; i<nrow; i++) |
|
| 418 |
+ q_unknown.push_back(0); |
|
| 419 |
+ |
|
| 420 |
+ // EM algorithm |
|
| 421 |
+ for (int iter=0; iter<max_iter; iter++) {
|
|
| 422 |
+ Rcpp::Rcerr << iter+1 << "," ; |
|
| 423 |
+ // E-step: estimate q (for T known tissues) and q_unknown (for one unknown tissue) |
|
| 424 |
+ double v1, v2, likelihood_unknown_class; |
|
| 425 |
+ int markerId, markerIndex; |
|
| 426 |
+ for (int i=0; i<nrow; i++) {
|
|
| 427 |
+ // process the first T known tissues |
|
| 428 |
+ double sum = 0; |
|
| 429 |
+ for (int j=0; j<ncol; j++) {
|
|
| 430 |
+ p.get_element(i,j,v1); |
|
| 431 |
+ v2 = theta[j]*v1; |
|
| 432 |
+ q.set_element( i, j, v2 ); |
|
| 433 |
+ sum += v2; |
|
| 434 |
+ } |
|
| 435 |
+ // process the last unknown tissue |
|
| 436 |
+ p.get_row_label(i, markerId); // get markerId (1-base) of read i |
|
| 437 |
+ markerIndex = markerId2Index[markerId]; |
|
| 438 |
+ likelihood_unknown_class = pow(m[markerIndex],Rm[i]) * pow(1-m[markerIndex],Rl[i]-Rm[i]); |
|
| 439 |
+ q_unknown[i] = theta[ncol]*likelihood_unknown_class; |
|
| 440 |
+ sum += q_unknown[i]; |
|
| 441 |
+ |
|
| 442 |
+ // update q and q_unknown |
|
| 443 |
+ for (int j=0; j<ncol; j++) {
|
|
| 444 |
+ q.get_element(i,j,v2); |
|
| 445 |
+ v2 /= sum; |
|
| 446 |
+ q.set_element( i, j, v2 ); |
|
| 447 |
+ } |
|
| 448 |
+ q_unknown[i] /= sum; |
|
| 449 |
+ } |
|
| 450 |
+ |
|
| 451 |
+ // M-step 1: estimate unknown class's methylation level (m) of each marker |
|
| 452 |
+ double sum1=0, sum2=0; |
|
| 453 |
+ int curr_marker_index=-1, prev_marker_index=-1; // all marker index is 0-base |
|
| 454 |
+ for (int i=0; i<nrow; i++) {
|
|
| 455 |
+ // for each row (or read) |
|
| 456 |
+ p.get_row_label(i, markerId); // get the marker_index (0-base) of read i |
|
| 457 |
+ curr_marker_index = markerId2Index[markerId]; |
|
| 458 |
+ if (curr_marker_index != prev_marker_index) {
|
|
| 459 |
+ // this is the 1st read of a new marker |
|
| 460 |
+ // we need to summarize m value of previous marker |
|
| 461 |
+ if (prev_marker_index!=-1) { // current marker is not the 1st marker
|
|
| 462 |
+ m[prev_marker_index] = (sum2!=0 ? sum1/sum2 : 0); |
|
| 463 |
+ sum1 = 0; |
|
| 464 |
+ sum2 = 0; |
|
| 465 |
+ } |
|
| 466 |
+ prev_marker_index = curr_marker_index; |
|
| 467 |
+ } |
|
| 468 |
+ sum1 += q_unknown[i]*Rm[i]; |
|
| 469 |
+ sum2 += q_unknown[i]*Rl[i]; |
|
| 470 |
+ } |
|
| 471 |
+ m[curr_marker_index] = (sum2!=0 ? sum1/sum2 : 0); // estimate m of the last marker |
|
| 472 |
+ |
|
| 473 |
+ //cout << "iter=" << iter << ", " << "m=" << endl; // for debug |
|
| 474 |
+ //print_vec(cout, m, "\n"); // for debug |
|
| 475 |
+ //cout << endl; // for debug |
|
| 476 |
+ |
|
| 477 |
+ // M-step 2: estimate theta, which has ncol+1 values |
|
| 478 |
+ double sum; |
|
| 479 |
+ for (int j=0; j<ncol; j++) {
|
|
| 480 |
+ sum=0; |
|
| 481 |
+ for (int i=0; i<nrow; i++) {
|
|
| 482 |
+ q.get_element(i,j,v2); |
|
| 483 |
+ sum += v2; |
|
| 484 |
+ } |
|
| 485 |
+ theta[j] = sum / nrow; |
|
| 486 |
+ } |
|
| 487 |
+ sum=0; |
|
| 488 |
+ for (int i=0; i<nrow; i++) |
|
| 489 |
+ sum += q_unknown[i]; |
|
| 490 |
+ theta[ncol] = sum / nrow; |
|
| 491 |
+ |
|
| 492 |
+ //cout << "round " << iter+1 << ", theta="; // for debug |
|
| 493 |
+ //print_vec(cout, theta, ", "); // for debug |
|
| 494 |
+ //cout << endl << endl; // for debug |
|
| 495 |
+ } |
|
| 496 |
+ Rcpp::Rcerr << endl; |
|
| 497 |
+ //cout << "final:" << endl << std::flush; |
|
| 498 |
+ //print_vec(cout, theta, ", "); // for debug |
|
| 499 |
+ //cout << endl << endl << std::flush; // for debug |
|
| 500 |
+} |
|
| 501 |
+ |
|
| 502 |
+void readCounts_by_reads_posterior_probability_version_regular(Matrix_Double& q, double unit, Matrix_Double& readCounts) |
|
| 503 |
+{
|
|
| 504 |
+ unsigned int ncol = q.get_column_num(); // T, number of known tissues |
|
| 505 |
+ unsigned int nrow = q.get_row_num(); // N, number of reads |
|
| 506 |
+ |
|
| 507 |
+ vector<int> uniq_marker_Ids; |
|
| 508 |
+ q.get_unique_row_labels(uniq_marker_Ids); |
|
| 509 |
+ // int nMarker = uniq_marker_Ids.size(); // M, number of markers that all N reads cover. Some markers may not be covered by these N reads. |
|
| 510 |
+ |
|
| 511 |
+ double v; |
|
| 512 |
+ int curr_markerId=-1, prev_markerId=-1; // all marker index is 0-base |
|
| 513 |
+ vector<double> readCountsPerMarker(ncol, 0); // a vector of read counts for one marker (T elements) with default values 0 |
|
| 514 |
+ for (int i=0; i<nrow; i++) {
|
|
| 515 |
+ // for each row (or read) |
|
| 516 |
+ q.get_row_label(i, curr_markerId); // get the marker ID of read i |
|
| 517 |
+ if (curr_markerId != prev_markerId) {
|
|
| 518 |
+ // this is the 1st read of a new marker |
|
| 519 |
+ // we need to summarize read counts of each tissue for the previous marker and deposit them to the matrix "readCounts" |
|
| 520 |
+ // then clear readCountsPerMarker for a new read counting of the next new marker |
|
| 521 |
+ if (prev_markerId!=-1) { // current marker is not the 1st marker
|
|
| 522 |
+ multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit. |
|
| 523 |
+ readCounts.append_row_vector(readCountsPerMarker, prev_markerId); |
|
| 524 |
+ assign_vector_zeros(readCountsPerMarker); |
|
| 525 |
+ } |
|
| 526 |
+ prev_markerId = curr_markerId; |
|
| 527 |
+ } |
|
| 528 |
+ for (int j=0; j<ncol; j++) {
|
|
| 529 |
+ q.get_element(i,j,v); |
|
| 530 |
+ readCountsPerMarker[j] += v; |
|
| 531 |
+ } |
|
| 532 |
+ } |
|
| 533 |
+ multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit. |
|
| 534 |
+ readCounts.append_row_vector(readCountsPerMarker, curr_markerId); |
|
| 535 |
+} |
|
| 536 |
+ |
|
| 537 |
+void readCounts_by_reads_posterior_probability_version_unknownclass(Matrix_Double& q, vector<double>& q_unknown, double unit, Matrix_Double& readCounts) |
|
| 538 |
+{
|
|
| 539 |
+ unsigned int ncol = q.get_column_num(); // T, number of known tissues |
|
| 540 |
+ unsigned int nrow = q.get_row_num(); // N, number of reads |
|
| 541 |
+ if (nrow!=(unsigned int)q_unknown.size()) {
|
|
| 542 |
+ Rcpp::Rcerr << "Error (readCounts_by_reads_posterior_probability_version_unknownclass): row number of q_unknown does not match with row number of q!" << endl; |
|
| 543 |
+ // exit(EXIT_FAILURE); |
|
| 544 |
+ } |
|
| 545 |
+ |
|
| 546 |
+ vector<int> uniq_marker_Ids; |
|
| 547 |
+ q.get_unique_row_labels(uniq_marker_Ids); |
|
| 548 |
+ // int nMarker = uniq_marker_Ids.size(); // M, number of markers that all N reads cover. Some markers may not be covered by these N reads. |
|
| 549 |
+ |
|
| 550 |
+ double v; |
|
| 551 |
+ int curr_markerId=-1, prev_markerId=-1; // all marker index is 0-base |
|
| 552 |
+ vector<double> readCountsPerMarker(ncol+1, 0); // a vector of read counts for one marker (T+1 elements) with default values 0. The last element is for the unknown class. |
|
| 553 |
+ for (int i=0; i<nrow; i++) {
|
|
| 554 |
+ // for each row (or read) |
|
| 555 |
+ q.get_row_label(i, curr_markerId); // get the marker ID of read i |
|
| 556 |
+ if (curr_markerId != prev_markerId) {
|
|
| 557 |
+ // this is the 1st read of a new marker |
|
| 558 |
+ // we need to summarize read counts of each tissue for the previous marker and deposit them to the matrix "readCounts" |
|
| 559 |
+ // then clear readCountsPerMarker for a new read counting of the next new marker |
|
| 560 |
+ if (prev_markerId!=-1) { // current marker is not the 1st marker
|
|
| 561 |
+ multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit. |
|
| 562 |
+ readCounts.append_row_vector(readCountsPerMarker, prev_markerId); |
|
| 563 |
+ assign_vector_zeros(readCountsPerMarker); |
|
| 564 |
+ } |
|
| 565 |
+ prev_markerId = curr_markerId; |
|
| 566 |
+ } |
|
| 567 |
+ for (int j=0; j<ncol; j++) {
|
|
| 568 |
+ q.get_element(i,j,v); |
|
| 569 |
+ readCountsPerMarker[j] += v; |
|
| 570 |
+ } |
|
| 571 |
+ readCountsPerMarker[ncol] += q_unknown[i]; |
|
| 572 |
+ } |
|
| 573 |
+ multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit. |
|
| 574 |
+ readCounts.append_row_vector(readCountsPerMarker, curr_markerId); |
|
| 575 |
+} |
|
| 576 |
+ |
| ... | ... |
@@ -1,575 +1,576 @@ |
| 1 |
-#include <Rcpp.h> |
|
| 2 |
-using namespace Rcpp; |
|
| 3 |
- |
|
| 4 |
-#include <cstdlib> |
|
| 5 |
-#include <cmath> |
|
| 6 |
-#include <sstream> |
|
| 7 |
-#include <iostream> |
|
| 8 |
-#include <fstream> |
|
| 9 |
-#include <map> |
|
| 10 |
-#include <vector> |
|
| 11 |
-#include <string> |
|
| 12 |
-#include <cassert> |
|
| 13 |
-#include <algorithm> |
|
| 14 |
-#include <boost/algorithm/string.hpp> |
|
| 15 |
-#include <boost/assign.hpp> |
|
| 16 |
-#include <boost/iostreams/filtering_stream.hpp> |
|
| 17 |
-#include "matrix.h" |
|
| 18 |
-#include "utils.h" |
|
| 19 |
- |
|
| 20 |
-using namespace std; |
|
| 21 |
-using namespace boost; |
|
| 22 |
- |
|
| 23 |
-#include "template_utils.cpp" |
|
| 24 |
- |
|
| 25 |
-Matrix_Double::Matrix_Double(const unsigned int nrow_, const unsigned int ncol_, const double init_value) |
|
| 26 |
-{
|
|
| 27 |
- nrow = nrow_; |
|
| 28 |
- ncol = ncol_; |
|
| 29 |
- empty = false; |
|
| 30 |
- mat.reserve(nrow_); |
|
| 31 |
- for (int i=0; i<nrow; i++) {
|
|
| 32 |
- vector<double> temp; |
|
| 33 |
- temp.reserve(ncol); |
|
| 34 |
- for (int j=0; j<ncol; j++) |
|
| 35 |
- temp.push_back(init_value); |
|
| 36 |
- mat.push_back( temp ); |
|
| 37 |
- } |
|
| 38 |
- empty = mat.empty(); |
|
| 39 |
-} |
|
| 40 |
- |
|
| 41 |
-// a line represents a row of the matrix. Each element in this row is delimited by a TAB. |
|
| 42 |
-void Matrix_Double::process_one_line_of_mat_file(string & line, int num_header_column) |
|
| 43 |
-{
|
|
| 44 |
- vector<string> items; |
|
| 45 |
- split(items, line, is_any_of("\t"));
|
|
| 46 |
- vector<double> v; |
|
| 47 |
- for (int i=0; i<items.size(); i++) {
|
|
| 48 |
- if (i==0) row_labels.push_back( atoi(items[0].c_str()) ); // the first column is the label of this row. |
|
| 49 |
- if (i<num_header_column) continue; // skip the first num_header_column header columns |
|
| 50 |
- v.push_back( atof(items[i].c_str()) ); |
|
| 51 |
- } |
|
| 52 |
- mat.push_back( v ); |
|
| 53 |
-} |
|
| 54 |
- |
|
| 55 |
-Matrix_Double::Matrix_Double(const string & mat_file, |
|
| 56 |
- bool header_line=false, int num_header_column=1) |
|
| 57 |
-{
|
|
| 58 |
- unsigned long i_line=0; |
|
| 59 |
- // input is a plain text file |
|
| 60 |
- istream * in=&cin; // default is stdin |
|
| 61 |
- ifstream fin; |
|
| 62 |
- if ( !(mat_file.compare("stdin")==0)) {
|
|
| 63 |
- fin.open(mat_file.c_str()); |
|
| 64 |
- if (fin.fail()){
|
|
| 65 |
- Rcpp::Rcerr << "Error: Unable to open " << mat_file << " in Matrix_Double()" << endl; |
|
| 66 |
- // exit(EXIT_FAILURE); |
|
| 67 |
- } |
|
| 68 |
- in = &fin; |
|
| 69 |
- } |
|
| 70 |
- string line; |
|
| 71 |
- while (!(*in).eof()) {
|
|
| 72 |
- getline((*in), line); |
|
| 73 |
- //cerr << "Line: " << line << endl; |
|
| 74 |
- if (header_line) {
|
|
| 75 |
- if (i_line==0) {
|
|
| 76 |
- // skip the header line |
|
| 77 |
- i_line++; |
|
| 78 |
- continue; |
|
| 79 |
- } |
|
| 80 |
- } |
|
| 81 |
- if (line.empty()) {
|
|
| 82 |
- // this is the last line of the file |
|
| 83 |
- break; |
|
| 84 |
- } |
|
| 85 |
- process_one_line_of_mat_file(line, num_header_column); |
|
| 86 |
- i_line++; |
|
| 87 |
- } |
|
| 88 |
- if ( !(mat_file.compare("stdin")==0)) {
|
|
| 89 |
- fin.close(); |
|
| 90 |
- } |
|
| 91 |
- empty = mat.empty(); |
|
| 92 |
- nrow = mat.size(); |
|
| 93 |
- if (nrow>=1) ncol = mat[0].size(); |
|
| 94 |
- else ncol = 0; |
|
| 95 |
-} |
|
| 96 |
- |
|
| 97 |
-// return the row index of this vector |
|
| 98 |
-long Matrix_Double::append_row_vector(vector<double>& v_, int row_label_) {
|
|
| 99 |
- if (v_.size()!=ncol) {
|
|
| 100 |
- Rcpp::Rcerr << "Error of Matrix_Double::append_row_vector: the appended vector size (" << v_.size() << ") does not match the number of column in the matrix (" << ncol << ")!\nExit." << endl;
|
|
| 101 |
- // exit(EXIT_FAILURE); |
|
| 102 |
- } |
|
| 103 |
- vector<double> v; |
|
| 104 |
- for (int i=0; i<v_.size(); i++) {
|
|
| 105 |
- v.push_back( v_[i] ); |
|
| 106 |
- } |
|
| 107 |
- mat.push_back( v ); |
|
| 108 |
- nrow = mat.size(); |
|
| 109 |
- row_labels.push_back(row_label_); |
|
| 110 |
- empty=false; |
|
| 111 |
- return(nrow-1); |
|
| 112 |
-} |
|
| 113 |
- |
|
| 114 |
-// |
|
| 115 |
-// if max(v_)/min(v_) >= min_threshold_maxv_minv, then we append this vector, otherwise ignore it. |
|
| 116 |
-// |
|
| 117 |
-// return row index of this appended vector, if successfully appending; return -1 otherwise. |
|
| 118 |
-// |
|
| 119 |
-long Matrix_Double::append_row_vector_with_filter(vector<double>& v_, int row_label_, double min_threshold_maxv_minv) {
|
|
| 120 |
- if (v_.size()!=ncol) {
|
|
| 121 |
- Rcpp::Rcerr << "Error of Matrix_Double::append_row_vector_with_filter: the appended vector size (" << v_.size() << ") does not match the number of column in the matrix (" << ncol << ")!\nExit." << endl;
|
|
| 122 |
- // exit(EXIT_FAILURE); |
|
| 123 |
- } |
|
| 124 |
- double min_ = *min_element(v_.begin(), v_.end()); |
|
| 125 |
- double max_ = *max_element(v_.begin(), v_.end()); |
|
| 126 |
- //cout << "max: " << max_ << ", min: " << min_ << ", ratio: " << max_/min_ << endl; |
|
| 127 |
- if (min_==0) {
|
|
| 128 |
- if (max_==0) return(-1); |
|
| 129 |
- } else {
|
|
| 130 |
- double ratio=max_/min_; |
|
| 131 |
- if (ratio<min_threshold_maxv_minv) return(-1); |
|
| 132 |
- } |
|
| 133 |
- long row_index = append_row_vector(v_, row_label_); |
|
| 134 |
- return(row_index); |
|
| 135 |
-} |
|
| 136 |
- |
|
| 137 |
-bool Matrix_Double::get_element(const int i, const int j, double & v) |
|
| 138 |
-{
|
|
| 139 |
- if (!empty) {
|
|
| 140 |
- if (i<nrow && j<ncol) {
|
|
| 141 |
- v = mat[i][j]; |
|
| 142 |
- return true; |
|
| 143 |
- } else {
|
|
| 144 |
- Rcpp::Rcerr << "Warning: i=" << i << " and j=" << j << " exceed matrix size!" << endl; |
|
| 145 |
- return false; |
|
| 146 |
- } |
|
| 147 |
- } else {
|
|
| 148 |
- v = 0; |
|
| 149 |
- return false; |
|
| 150 |
- } |
|
| 151 |
-} |
|
| 152 |
- |
|
| 153 |
-bool Matrix_Double::set_element(const int i, const int j, double v) |
|
| 154 |
-{
|
|
| 155 |
- if (!empty) {
|
|
| 156 |
- if (i<nrow && j<ncol) {
|
|
| 157 |
- mat[i][j] = v; |
|
| 158 |
- return true; |
|
| 159 |
- } else {
|
|
| 160 |
- Rcpp::Rcerr << "Warning: i=" << i << " and j=" << j << " exceed matrix size!" << endl; |
|
| 161 |
- return false; |
|
| 162 |
- } |
|
| 163 |
- } else {
|
|
| 164 |
- return false; |
|
| 165 |
- } |
|
| 166 |
-} |
|
| 167 |
- |
|
| 168 |
-bool Matrix_Double::get_column_sum(const int j, double & v) |
|
| 169 |
-{
|
|
| 170 |
- if (!empty) {
|
|
| 171 |
- if (j<ncol) {
|
|
| 172 |
- v = 0; |
|
| 173 |
- for (int k=0; k<nrow; k++) {
|
|
| 174 |
- v += mat[k][j]; |
|
| 175 |
- } |
|
| 176 |
- return true; |
|
| 177 |
- } else {
|
|
| 178 |
- Rcpp::Rcerr << "Warning: j=" << j << " exceeds matrix column size (" << ncol << ")!" << endl;
|
|
| 179 |
- return false; |
|
| 180 |
- } |
|
| 181 |
- } else {
|
|
| 182 |
- v = 0; |
|
| 183 |
- return false; |
|
| 184 |
- } |
|
| 185 |
-} |
|
| 186 |
- |
|
| 187 |
-bool Matrix_Double::get_row_sum(const int i, double & v) |
|
| 188 |
-{
|
|
| 189 |
- if (!empty) {
|
|
| 190 |
- if (i<nrow) {
|
|
| 191 |
- v = 0; |
|
| 192 |
- for (int k=0; k<ncol; k++) {
|
|
| 193 |
- v += mat[i][k]; |
|
| 194 |
- } |
|
| 195 |
- return true; |
|
| 196 |
- } else {
|
|
| 197 |
- Rcpp::Rcerr << "Warning: i=" << i << " exceeds matrix row size (" << nrow << ")!" << endl;
|
|
| 198 |
- return false; |
|
| 199 |
- } |
|
| 200 |
- } else {
|
|
| 201 |
- v = 0; |
|
| 202 |
- return false; |
|
| 203 |
- } |
|
| 204 |
-} |
|
| 205 |
- |
|
| 206 |
-void Matrix_Double::get_unique_row_labels(vector<int> & uniq_labels) |
|
| 207 |
-{
|
|
| 208 |
- map<int, int> counts; |
|
| 209 |
- for (int i=0; i<row_labels.size(); i++) |
|
| 210 |
- if (counts.find(row_labels[i]) != counts.end()) |
|
| 211 |
- counts[row_labels[i]] += 1; // found this label, then increment count |
|
| 212 |
- else {
|
|
| 213 |
- counts.insert(make_pair(row_labels[i],1)); // not found this label, then count it once |
|
| 214 |
- //counts[row_labels[i]] = 1; |
|
| 215 |
- } |
|
| 216 |
- map<int,int>::iterator it; |
|
| 217 |
- for (it=counts.begin(); it!=counts.end(); ++it) |
|
| 218 |
- uniq_labels.push_back(it->first); |
|
| 219 |
-} |
|
| 220 |
- |
|
| 221 |
-void Matrix_Double::print_with_additional_column_of_Bins2Value(ostream & os, Bins2Value& additional_column_data) |
|
| 222 |
-{
|
|
| 223 |
- // obtain unique row labels |
|
| 224 |
- vector<int> unique_row_labels; |
|
| 225 |
- int row_label_; |
|
| 226 |
- for (int i=0; i<nrow; i++) {
|
|
| 227 |
- row_label_ = row_labels[i]; |
|
| 228 |
- unique_row_labels.push_back( row_label_ ); |
|
| 229 |
- } |
|
| 230 |
- Bins2Value::iterator iter; |
|
| 231 |
- for (iter=additional_column_data.begin(); iter!=additional_column_data.end(); iter++) {
|
|
| 232 |
- row_label_ = iter->first; |
|
| 233 |
- if (!exist_row_label(row_label_)) {
|
|
| 234 |
- unique_row_labels.push_back( row_label_ ); |
|
| 235 |
- } |
|
| 236 |
- } |
|
| 237 |
- sort(unique_row_labels.begin(), unique_row_labels.end()); |
|
| 238 |
- // print both 'mat' and 'additional_column_data' as the last column |
|
| 239 |
- for (int i=0; i<unique_row_labels.size(); i++) {
|
|
| 240 |
- row_label_ = unique_row_labels[i]; |
|
| 241 |
- os << row_label_; |
|
| 242 |
- int row_index = get_row_index(row_label_); |
|
| 243 |
- if (row_index!=-1) {
|
|
| 244 |
- for (int j=0; j<ncol; j++) {
|
|
| 245 |
- os << "\t" << mat[row_index][j]; |
|
| 246 |
- } |
|
| 247 |
- } else {
|
|
| 248 |
- for (int j=0; j<ncol; j++) {
|
|
| 249 |
- os << "\t0"; |
|
| 250 |
- } |
|
| 251 |
- } |
|
| 252 |
- if (additional_column_data.find(row_label_)!=additional_column_data.end()) {
|
|
| 253 |
- os << "\t" << additional_column_data[row_label_]; |
|
| 254 |
- } else {
|
|
| 255 |
- os << "\t0"; |
|
| 256 |
- } |
|
| 257 |
- os << endl; |
|
| 258 |
- } |
|
| 259 |
-} |
|
| 260 |
- |
|
| 261 |
-ostream & operator<<(ostream & os, Matrix_Double & mat) |
|
| 262 |
-{
|
|
| 263 |
- double v; |
|
| 264 |
- int row_label; |
|
| 265 |
- if (!mat.isempty()) {
|
|
| 266 |
- for (int i=0; i<mat.get_row_num(); i++) {
|
|
| 267 |
- int j; |
|
| 268 |
- mat.get_row_label(i, row_label); |
|
| 269 |
- os << row_label << "\t"; |
|
| 270 |
- for (j=0; j<mat.get_column_num()-1; j++) {
|
|
| 271 |
- mat.get_element(i,j,v); |
|
| 272 |
- os << v << "\t"; |
|
| 273 |
- } |
|
| 274 |
- mat.get_element(i,j,v); |
|
| 275 |
- os << v << endl; |
|
| 276 |
- } |
|
| 277 |
- } |
|
| 278 |
- return os; |
|
| 279 |
-} |
|
| 280 |
- |
|
| 281 |
-double objective_em_supervise(Matrix_Double & p, vector<double> & theta) |
|
| 282 |
-{
|
|
| 283 |
- unsigned int ncol = p.get_column_num(); |
|
| 284 |
- unsigned int nrow = p.get_row_num(); |
|
| 285 |
- double v; |
|
| 286 |
- double obj = 0; |
|
| 287 |
- for (int i=0; i<nrow; i++) {
|
|
| 288 |
- double sum = 0; |
|
| 289 |
- for (int j=0; j<ncol; j++) {
|
|
| 290 |
- p.get_element(i,j,v); |
|
| 291 |
- sum += theta[j]*v; |
|
| 292 |
- } |
|
| 293 |
- obj += log(sum); |
|
| 294 |
- } |
|
| 295 |
- return obj; |
|
| 296 |
-} |
|
| 297 |
-// |
|
| 298 |
-// There are T known tissues and 1 unknown tissue (described by a double vector "m") |
|
| 299 |
-// |
|
| 300 |
-// input: |
|
| 301 |
-// p is a matrix of N X T, where N is number of reads and T is number of known tissue |
|
| 302 |
-// |
|
| 303 |
-// output: |
|
| 304 |
-// theta (model parameters), a vector with "T" elements. |
|
| 305 |
-// q is a matrix of N X T, the tissue-specific posterior probabilty of each read |
|
| 306 |
-// |
|
| 307 |
-void em_supervise(Matrix_Double & p, int max_iter, vector<double> & theta, Matrix_Double& q) |
|
| 308 |
-{
|
|
| 309 |
- // cout.precision(15); |
|
| 310 |
- // cerr.precision(15); |
|
| 311 |
- unsigned int ncol = p.get_column_num(); |
|
| 312 |
- unsigned int nrow = p.get_row_num(); // nrow is number of tissues. |
|
| 313 |
- theta.resize(ncol, 0); // assign (num_tissues) space and initialize to 0s. |
|
| 314 |
- // initialize model parameters as uniform distribution |
|
| 315 |
- // alternatively, model parameters can be random numbers by satisfying the crition |
|
| 316 |
- // (1) \sum_{i=1}^{ncol}{theta_i}=1
|
|
| 317 |
- for (int j=0; j<ncol; j++) {
|
|
| 318 |
- theta[j] = 1/(double)ncol; |
|
| 319 |
- } |
|
| 320 |
- // create and initialize q with the same size of p and with all elements initialized as 0 |
|
| 321 |
- //Matrix_Double q(nrow, ncol, 0); |
|
| 322 |
- //cerr << "iter 0\t" << objective_em_supervise(p, theta) << endl; |
|
| 323 |
- double v1, v2; |
|
| 324 |
- for (int iter=0; iter<max_iter; iter++) {
|
|
| 325 |
- Rcpp::Rcerr << iter+1 << "," ; |
|
| 326 |
- // E-step: estimate q |
|
| 327 |
- for (int i=0; i<nrow; i++) {
|
|
| 328 |
- double sum = 0; |
|
| 329 |
- for (int j=0; j<ncol; j++) {
|
|
| 330 |
- p.get_element(i,j,v1); |
|
| 331 |
- v2 = theta[j]*v1; |
|
| 332 |
- q.set_element( i, j, v2 ); |
|
| 333 |
- sum += v2; |
|
| 334 |
- } |
|
| 335 |
- for (int j=0; j<ncol; j++) {
|
|
| 336 |
- q.get_element(i,j,v2); |
|
| 337 |
- v2 /= sum; |
|
| 338 |
- q.set_element( i, j, v2 ); |
|
| 339 |
- } |
|
| 340 |
- } |
|
| 341 |
- // M-step: estimate theta |
|
| 342 |
- for (int j=0; j<ncol; j++) {
|
|
| 343 |
- double sum=0; |
|
| 344 |
- for (int i=0; i<nrow; i++) {
|
|
| 345 |
- q.get_element(i,j,v2); |
|
| 346 |
- sum += v2; |
|
| 347 |
- } |
|
| 348 |
- theta[j] = sum / nrow; |
|
| 349 |
- } |
|
| 350 |
- // for debug |
|
| 351 |
- //cerr << "iter " << (iter+1) << "\t" << objective_em_supervise(p, theta) << endl; |
|
| 352 |
- } |
|
| 353 |
- // last E-step: estimate q |
|
| 354 |
- for (int i=0; i<nrow; i++) {
|
|
| 355 |
- double sum = 0; |
|
| 356 |
- for (int j=0; j<ncol; j++) {
|
|
| 357 |
- p.get_element(i,j,v1); |
|
| 358 |
- v2 = theta[j]*v1; |
|
| 359 |
- q.set_element( i, j, v2 ); |
|
| 360 |
- sum += v2; |
|
| 361 |
- } |
|
| 362 |
- for (int j=0; j<ncol; j++) {
|
|
| 363 |
- q.get_element(i,j,v2); |
|
| 364 |
- v2 /= sum; |
|
| 365 |
- q.set_element( i, j, v2 ); |
|
| 366 |
- } |
|
| 367 |
- } |
|
| 368 |
- Rcpp::Rcerr << endl; |
|
| 369 |
-} |
|
| 370 |
- |
|
| 371 |
-// |
|
| 372 |
-// There are T known tissues and 1 unknown tissue (described by a double vector "m") |
|
| 373 |
-// |
|
| 374 |
-// input: |
|
| 375 |
-// p is a matrix of N X T, where N is number of reads and T is number of known tissue |
|
| 376 |
-// p.row_label is an int vector of N X 1, each element is the marker Id (1-base) of a read. |
|
| 377 |
-// Rm is a vector of N X 1, each element is number of valid methylated CpG sites in a read |
|
| 378 |
-// Rl is a vector of N X 1, each element is number of all valid CpG sites in a read |
|
| 379 |
-// |
|
| 380 |
-// output: |
|
| 381 |
-// theta is a vector of (T+1) X 1, where T is number of known tissue. This vector is already allocated with space of (ncol+1) units. |
|
| 382 |
-// m is a vector of M X 1, where M is number of markers. |
|
| 383 |
-// q is a matrix of N X T, the tissue-specific posterior probabilty of each read |
|
| 384 |
-// q_unknown is a vector of N X 1, the posterior probabilty of each read for the unknown class. It does not need to be allocated space before this function calling. |
|
| 385 |
-// |
|
| 386 |
-void em_semisupervise(Matrix_Double & p, vector<int> & Rm, vector<int> & Rl, |
|
| 387 |
- int max_iter, vector<double> & theta, Matrix_Double& q, vector<double>& q_unknown, |
|
| 388 |
- vector<double> & m) |
|
| 389 |
-{
|
|
| 390 |
- // cout.precision(15); |
|
| 391 |
- // cerr.precision(15); |
|
| 392 |
- unsigned int ncol = p.get_column_num(); // T, number of known tissues |
|
| 393 |
- unsigned int nrow = p.get_row_num(); // N, number of reads |
|
| 394 |
- |
|
| 395 |
- theta.resize(ncol+1, 0); // assign (num_tissues+1) space and initialize to 0s. |
|
| 396 |
- |
|
| 397 |
- vector<int> uniq_marker_Ids; |
|
| 398 |
- p.get_unique_row_labels(uniq_marker_Ids); |
|
| 399 |
- int nMarker = uniq_marker_Ids.size(); // M, number of markers that all N reads cover. Some markers may not be covered by these N reads. |
|
| 400 |
- m.resize(nMarker, 0); // assign nMarker space and initialize to 0s. |
|
| 401 |
- map<int,int> markerId2Index; // marker index is 0-based. |
|
| 402 |
- for (int index=0; index<nMarker; index++) {
|
|
| 403 |
- markerId2Index.insert(make_pair(uniq_marker_Ids[index],index)); |
|
| 404 |
- } |
|
| 405 |
- |
|
| 406 |
- // initialize model parameters theta as uniform distribution, and m as 0.5 |
|
| 407 |
- // alternatively, theta and m can be random numbers, by satisfying: |
|
| 408 |
- // (1) \sum_{i=1}^{ncol+1}{theta_i}=1
|
|
| 409 |
- // (2) 0 <= m_k <=1 for all k=1,2,...,#marker_covered_by_all_reads |
|
| 410 |
- for (int j=0; j<ncol+1; j++) {
|
|
| 411 |
- theta[j] = 1/(double)(ncol+1); |
|
| 412 |
- } |
|
| 413 |
- for (int k=0; k<nMarker; k++) {
|
|
| 414 |
- m[k] = 0.5; |
|
| 415 |
- } |
|
| 416 |
- for (int i=0; i<nrow; i++) |
|
| 417 |
- q_unknown.push_back(0); |
|
| 418 |
- |
|
| 419 |
- // EM algorithm |
|
| 420 |
- for (int iter=0; iter<max_iter; iter++) {
|
|
| 421 |
- Rcpp::Rcerr << iter+1 << "," ; |
|
| 422 |
- // E-step: estimate q (for T known tissues) and q_unknown (for one unknown tissue) |
|
| 423 |
- double v1, v2, likelihood_unknown_class; |
|
| 424 |
- int markerId, markerIndex; |
|
| 425 |
- for (int i=0; i<nrow; i++) {
|
|
| 426 |
- // process the first T known tissues |
|
| 427 |
- double sum = 0; |
|
| 428 |
- for (int j=0; j<ncol; j++) {
|
|
| 429 |
- p.get_element(i,j,v1); |
|
| 430 |
- v2 = theta[j]*v1; |
|
| 431 |
- q.set_element( i, j, v2 ); |
|
| 432 |
- sum += v2; |
|
| 433 |
- } |
|
| 434 |
- // process the last unknown tissue |
|
| 435 |
- p.get_row_label(i, markerId); // get markerId (1-base) of read i |
|
| 436 |
- markerIndex = markerId2Index[markerId]; |
|
| 437 |
- likelihood_unknown_class = pow(m[markerIndex],Rm[i]) * pow(1-m[markerIndex],Rl[i]-Rm[i]); |
|
| 438 |
- q_unknown[i] = theta[ncol]*likelihood_unknown_class; |
|
| 439 |
- sum += q_unknown[i]; |
|
| 440 |
- |
|
| 441 |
- // update q and q_unknown |
|
| 442 |
- for (int j=0; j<ncol; j++) {
|
|
| 443 |
- q.get_element(i,j,v2); |
|
| 444 |
- v2 /= sum; |
|
| 445 |
- q.set_element( i, j, v2 ); |
|
| 446 |
- } |
|
| 447 |
- q_unknown[i] /= sum; |
|
| 448 |
- } |
|
| 449 |
- |
|
| 450 |
- // M-step 1: estimate unknown class's methylation level (m) of each marker |
|
| 451 |
- double sum1=0, sum2=0; |
|
| 452 |
- int curr_marker_index=-1, prev_marker_index=-1; // all marker index is 0-base |
|
| 453 |
- for (int i=0; i<nrow; i++) {
|
|
| 454 |
- // for each row (or read) |
|
| 455 |
- p.get_row_label(i, markerId); // get the marker_index (0-base) of read i |
|
| 456 |
- curr_marker_index = markerId2Index[markerId]; |
|
| 457 |
- if (curr_marker_index != prev_marker_index) {
|
|
| 458 |
- // this is the 1st read of a new marker |
|
| 459 |
- // we need to summarize m value of previous marker |
|
| 460 |
- if (prev_marker_index!=-1) { // current marker is not the 1st marker
|
|
| 461 |
- m[prev_marker_index] = (sum2!=0 ? sum1/sum2 : 0); |
|
| 462 |
- sum1 = 0; |
|
| 463 |
- sum2 = 0; |
|
| 464 |
- } |
|
| 465 |
- prev_marker_index = curr_marker_index; |
|
| 466 |
- } |
|
| 467 |
- sum1 += q_unknown[i]*Rm[i]; |
|
| 468 |
- sum2 += q_unknown[i]*Rl[i]; |
|
| 469 |
- } |
|
| 470 |
- m[curr_marker_index] = (sum2!=0 ? sum1/sum2 : 0); // estimate m of the last marker |
|
| 471 |
- |
|
| 472 |
- //cout << "iter=" << iter << ", " << "m=" << endl; // for debug |
|
| 473 |
- //print_vec(cout, m, "\n"); // for debug |
|
| 474 |
- //cout << endl; // for debug |
|
| 475 |
- |
|
| 476 |
- // M-step 2: estimate theta, which has ncol+1 values |
|
| 477 |
- double sum; |
|
| 478 |
- for (int j=0; j<ncol; j++) {
|
|
| 479 |
- sum=0; |
|
| 480 |
- for (int i=0; i<nrow; i++) {
|
|
| 481 |
- q.get_element(i,j,v2); |
|
| 482 |
- sum += v2; |
|
| 483 |
- } |
|
| 484 |
- theta[j] = sum / nrow; |
|
| 485 |
- } |
|
| 486 |
- sum=0; |
|
| 487 |
- for (int i=0; i<nrow; i++) |
|
| 488 |
- sum += q_unknown[i]; |
|
| 489 |
- theta[ncol] = sum / nrow; |
|
| 490 |
- |
|
| 491 |
- //cout << "round " << iter+1 << ", theta="; // for debug |
|
| 492 |
- //print_vec(cout, theta, ", "); // for debug |
|
| 493 |
- //cout << endl << endl; // for debug |
|
| 494 |
- } |
|
| 495 |
- Rcpp::Rcerr << endl; |
|
| 496 |
- //cout << "final:" << endl << std::flush; |
|
| 497 |
- //print_vec(cout, theta, ", "); // for debug |
|
| 498 |
- //cout << endl << endl << std::flush; // for debug |
|
| 499 |
-} |
|
| 500 |
- |
|
| 501 |
-void readCounts_by_reads_posterior_probability_version_regular(Matrix_Double& q, double unit, Matrix_Double& readCounts) |
|
| 502 |
-{
|
|
| 503 |
- unsigned int ncol = q.get_column_num(); // T, number of known tissues |
|
| 504 |
- unsigned int nrow = q.get_row_num(); // N, number of reads |
|
| 505 |
- |
|
| 506 |
- vector<int> uniq_marker_Ids; |
|
| 507 |
- q.get_unique_row_labels(uniq_marker_Ids); |
|
| 508 |
- // int nMarker = uniq_marker_Ids.size(); // M, number of markers that all N reads cover. Some markers may not be covered by these N reads. |
|
| 509 |
- |
|
| 510 |
- double v; |
|
| 511 |
- int curr_markerId=-1, prev_markerId=-1; // all marker index is 0-base |
|
| 512 |
- vector<double> readCountsPerMarker(ncol, 0); // a vector of read counts for one marker (T elements) with default values 0 |
|
| 513 |
- for (int i=0; i<nrow; i++) {
|
|
| 514 |
- // for each row (or read) |
|
| 515 |
- q.get_row_label(i, curr_markerId); // get the marker ID of read i |
|
| 516 |
- if (curr_markerId != prev_markerId) {
|
|
| 517 |
- // this is the 1st read of a new marker |
|
| 518 |
- // we need to summarize read counts of each tissue for the previous marker and deposit them to the matrix "readCounts" |
|
| 519 |
- // then clear readCountsPerMarker for a new read counting of the next new marker |
|
| 520 |
- if (prev_markerId!=-1) { // current marker is not the 1st marker
|
|
| 521 |
- multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit. |
|
| 522 |
- readCounts.append_row_vector(readCountsPerMarker, prev_markerId); |
|
| 523 |
- assign_vector_zeros(readCountsPerMarker); |
|
| 524 |
- } |
|
| 525 |
- prev_markerId = curr_markerId; |
|
| 526 |
- } |
|
| 527 |
- for (int j=0; j<ncol; j++) {
|
|
| 528 |
- q.get_element(i,j,v); |
|
| 529 |
- readCountsPerMarker[j] += v; |
|
| 530 |
- } |
|
| 531 |
- } |
|
| 532 |
- multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit. |
|
| 533 |
- readCounts.append_row_vector(readCountsPerMarker, curr_markerId); |
|
| 534 |
-} |
|
| 535 |
- |
|
| 536 |
-void readCounts_by_reads_posterior_probability_version_unknownclass(Matrix_Double& q, vector<double>& q_unknown, double unit, Matrix_Double& readCounts) |
|
| 537 |
-{
|
|
| 538 |
- unsigned int ncol = q.get_column_num(); // T, number of known tissues |
|
| 539 |
- unsigned int nrow = q.get_row_num(); // N, number of reads |
|
| 540 |
- if (nrow!=(unsigned int)q_unknown.size()) {
|
|
| 541 |
- Rcpp::Rcerr << "Error (readCounts_by_reads_posterior_probability_version_unknownclass): row number of q_unknown does not match with row number of q!" << endl; |
|
| 542 |
- // exit(EXIT_FAILURE); |
|
| 543 |
- } |
|
| 544 |
- |
|
| 545 |
- vector<int> uniq_marker_Ids; |
|
| 546 |
- q.get_unique_row_labels(uniq_marker_Ids); |
|
| 547 |
- // int nMarker = uniq_marker_Ids.size(); // M, number of markers that all N reads cover. Some markers may not be covered by these N reads. |
|
| 548 |
- |
|
| 549 |
- double v; |
|
| 550 |
- int curr_markerId=-1, prev_markerId=-1; // all marker index is 0-base |
|
| 551 |
- vector<double> readCountsPerMarker(ncol+1, 0); // a vector of read counts for one marker (T+1 elements) with default values 0. The last element is for the unknown class. |
|
| 552 |
- for (int i=0; i<nrow; i++) {
|
|
| 553 |
- // for each row (or read) |
|
| 554 |
- q.get_row_label(i, curr_markerId); // get the marker ID of read i |
|
| 555 |
- if (curr_markerId != prev_markerId) {
|
|
| 556 |
- // this is the 1st read of a new marker |
|
| 557 |
- // we need to summarize read counts of each tissue for the previous marker and deposit them to the matrix "readCounts" |
|
| 558 |
- // then clear readCountsPerMarker for a new read counting of the next new marker |
|
| 559 |
- if (prev_markerId!=-1) { // current marker is not the 1st marker
|
|
| 560 |
- multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit. |
|
| 561 |
- readCounts.append_row_vector(readCountsPerMarker, prev_markerId); |
|
| 562 |
- assign_vector_zeros(readCountsPerMarker); |
|
| 563 |
- } |
|
| 564 |
- prev_markerId = curr_markerId; |
|
| 565 |
- } |
|
| 566 |
- for (int j=0; j<ncol; j++) {
|
|
| 567 |
- q.get_element(i,j,v); |
|
| 568 |
- readCountsPerMarker[j] += v; |
|
| 569 |
- } |
|
| 570 |
- readCountsPerMarker[ncol] += q_unknown[i]; |
|
| 571 |
- } |
|
| 572 |
- multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit. |
|
| 573 |
- readCounts.append_row_vector(readCountsPerMarker, curr_markerId); |
|
| 574 |
-} |
|
| 575 |
- |
|
| 1 |
+// [[Rcpp::depends(BH)]] |
|
| 2 |
+#include <Rcpp.h> |
|
| 3 |
+using namespace Rcpp; |
|
| 4 |
+ |
|
| 5 |
+#include <cstdlib> |
|
| 6 |
+#include <cmath> |
|
| 7 |
+#include <sstream> |
|
| 8 |
+#include <iostream> |
|
| 9 |
+#include <fstream> |
|
| 10 |
+#include <map> |
|
| 11 |
+#include <vector> |
|
| 12 |
+#include <string> |
|
| 13 |
+#include <cassert> |
|
| 14 |
+#include <algorithm> |
|
| 15 |
+#include <boost/algorithm/string.hpp> |
|
| 16 |
+#include <boost/assign.hpp> |
|
| 17 |
+#include <boost/iostreams/filtering_stream.hpp> |
|
| 18 |
+#include "matrix.h" |
|
| 19 |
+#include "utils.h" |
|
| 20 |
+ |
|
| 21 |
+using namespace std; |
|
| 22 |
+using namespace boost; |
|
| 23 |
+ |
|
| 24 |
+#include "template_utils.cpp" |
|
| 25 |
+ |
|
| 26 |
+Matrix_Double::Matrix_Double(const unsigned int nrow_, const unsigned int ncol_, const double init_value) |
|
| 27 |
+{
|
|
| 28 |
+ nrow = nrow_; |
|
| 29 |
+ ncol = ncol_; |
|
| 30 |
+ empty = false; |
|
| 31 |
+ mat.reserve(nrow_); |
|
| 32 |
+ for (int i=0; i<nrow; i++) {
|
|
| 33 |
+ vector<double> temp; |
|
| 34 |
+ temp.reserve(ncol); |
|
| 35 |
+ for (int j=0; j<ncol; j++) |
|
| 36 |
+ temp.push_back(init_value); |
|
| 37 |
+ mat.push_back( temp ); |
|
| 38 |
+ } |
|
| 39 |
+ empty = mat.empty(); |
|
| 40 |
+} |
|
| 41 |
+ |
|
| 42 |
+// a line represents a row of the matrix. Each element in this row is delimited by a TAB. |
|
| 43 |
+void Matrix_Double::process_one_line_of_mat_file(string & line, int num_header_column) |
|
| 44 |
+{
|
|
| 45 |
+ vector<string> items; |
|
| 46 |
+ split(items, line, is_any_of("\t"));
|
|
| 47 |
+ vector<double> v; |
|
| 48 |
+ for (int i=0; i<items.size(); i++) {
|
|
| 49 |
+ if (i==0) row_labels.push_back( atoi(items[0].c_str()) ); // the first column is the label of this row. |
|
| 50 |
+ if (i<num_header_column) continue; // skip the first num_header_column header columns |
|
| 51 |
+ v.push_back( atof(items[i].c_str()) ); |
|
| 52 |
+ } |
|
| 53 |
+ mat.push_back( v ); |
|
| 54 |
+} |
|
| 55 |
+ |
|
| 56 |
+Matrix_Double::Matrix_Double(const string & mat_file, |
|
| 57 |
+ bool header_line=false, int num_header_column=1) |
|
| 58 |
+{
|
|
| 59 |
+ unsigned long i_line=0; |
|
| 60 |
+ // input is a plain text file |
|
| 61 |
+ istream * in=&cin; // default is stdin |
|
| 62 |
+ ifstream fin; |
|
| 63 |
+ if ( !(mat_file.compare("stdin")==0)) {
|
|
| 64 |
+ fin.open(mat_file.c_str()); |
|
| 65 |
+ if (fin.fail()){
|
|
| 66 |
+ Rcpp::Rcerr << "Error: Unable to open " << mat_file << " in Matrix_Double()" << endl; |
|
| 67 |
+ // exit(EXIT_FAILURE); |
|
| 68 |
+ } |
|
| 69 |
+ in = &fin; |
|
| 70 |
+ } |
|
| 71 |
+ string line; |
|
| 72 |
+ while (!(*in).eof()) {
|
|
| 73 |
+ getline((*in), line); |
|
| 74 |
+ //cerr << "Line: " << line << endl; |
|
| 75 |
+ if (header_line) {
|
|
| 76 |
+ if (i_line==0) {
|
|
| 77 |
+ // skip the header line |
|
| 78 |
+ i_line++; |
|
| 79 |
+ continue; |
|
| 80 |
+ } |
|
| 81 |
+ } |
|
| 82 |
+ if (line.empty()) {
|
|
| 83 |
+ // this is the last line of the file |
|
| 84 |
+ break; |
|
| 85 |
+ } |
|
| 86 |
+ process_one_line_of_mat_file(line, num_header_column); |
|
| 87 |
+ i_line++; |
|
| 88 |
+ } |
|
| 89 |
+ if ( !(mat_file.compare("stdin")==0)) {
|
|
| 90 |
+ fin.close(); |
|
| 91 |
+ } |
|
| 92 |
+ empty = mat.empty(); |
|
| 93 |
+ nrow = mat.size(); |
|
| 94 |
+ if (nrow>=1) ncol = mat[0].size(); |
|
| 95 |
+ else ncol = 0; |
|
| 96 |
+} |
|
| 97 |
+ |
|
| 98 |
+// return the row index of this vector |
|
| 99 |
+long Matrix_Double::append_row_vector(vector<double>& v_, int row_label_) {
|
|
| 100 |
+ if (v_.size()!=ncol) {
|
|
| 101 |
+ Rcpp::Rcerr << "Error of Matrix_Double::append_row_vector: the appended vector size (" << v_.size() << ") does not match the number of column in the matrix (" << ncol << ")!\nExit." << endl;
|
|
| 102 |
+ // exit(EXIT_FAILURE); |
|
| 103 |
+ } |
|
| 104 |
+ vector<double> v; |
|
| 105 |
+ for (int i=0; i<v_.size(); i++) {
|
|
| 106 |
+ v.push_back( v_[i] ); |
|
| 107 |
+ } |
|
| 108 |
+ mat.push_back( v ); |
|
| 109 |
+ nrow = mat.size(); |
|
| 110 |
+ row_labels.push_back(row_label_); |
|
| 111 |
+ empty=false; |
|
| 112 |
+ return(nrow-1); |
|
| 113 |
+} |
|
| 114 |
+ |
|
| 115 |
+// |
|
| 116 |
+// if max(v_)/min(v_) >= min_threshold_maxv_minv, then we append this vector, otherwise ignore it. |
|
| 117 |
+// |
|
| 118 |
+// return row index of this appended vector, if successfully appending; return -1 otherwise. |
|
| 119 |
+// |
|
| 120 |
+long Matrix_Double::append_row_vector_with_filter(vector<double>& v_, int row_label_, double min_threshold_maxv_minv) {
|
|
| 121 |
+ if (v_.size()!=ncol) {
|
|
| 122 |
+ Rcpp::Rcerr << "Error of Matrix_Double::append_row_vector_with_filter: the appended vector size (" << v_.size() << ") does not match the number of column in the matrix (" << ncol << ")!\nExit." << endl;
|
|
| 123 |
+ // exit(EXIT_FAILURE); |
|
| 124 |
+ } |
|
| 125 |
+ double min_ = *min_element(v_.begin(), v_.end()); |
|
| 126 |
+ double max_ = *max_element(v_.begin(), v_.end()); |
|
| 127 |
+ //cout << "max: " << max_ << ", min: " << min_ << ", ratio: " << max_/min_ << endl; |
|
| 128 |
+ if (min_==0) {
|
|
| 129 |
+ if (max_==0) return(-1); |
|
| 130 |
+ } else {
|
|
| 131 |
+ double ratio=max_/min_; |
|
| 132 |
+ if (ratio<min_threshold_maxv_minv) return(-1); |
|
| 133 |
+ } |
|
| 134 |
+ long row_index = append_row_vector(v_, row_label_); |
|
| 135 |
+ return(row_index); |
|
| 136 |
+} |
|
| 137 |
+ |
|
| 138 |
+bool Matrix_Double::get_element(const int i, const int j, double & v) |
|
| 139 |
+{
|
|
| 140 |
+ if (!empty) {
|
|
| 141 |
+ if (i<nrow && j<ncol) {
|
|
| 142 |
+ v = mat[i][j]; |
|
| 143 |
+ return true; |
|
| 144 |
+ } else {
|
|
| 145 |
+ Rcpp::Rcerr << "Warning: i=" << i << " and j=" << j << " exceed matrix size!" << endl; |
|
| 146 |
+ return false; |
|
| 147 |
+ } |
|
| 148 |
+ } else {
|
|
| 149 |
+ v = 0; |
|
| 150 |
+ return false; |
|
| 151 |
+ } |
|
| 152 |
+} |
|
| 153 |
+ |
|
| 154 |
+bool Matrix_Double::set_element(const int i, const int j, double v) |
|
| 155 |
+{
|
|
| 156 |
+ if (!empty) {
|
|
| 157 |
+ if (i<nrow && j<ncol) {
|
|
| 158 |
+ mat[i][j] = v; |
|
| 159 |
+ return true; |
|
| 160 |
+ } else {
|
|
| 161 |
+ Rcpp::Rcerr << "Warning: i=" << i << " and j=" << j << " exceed matrix size!" << endl; |
|
| 162 |
+ return false; |
|
| 163 |
+ } |
|
| 164 |
+ } else {
|
|
| 165 |
+ return false; |
|
| 166 |
+ } |
|
| 167 |
+} |
|
| 168 |
+ |
|
| 169 |
+bool Matrix_Double::get_column_sum(const int j, double & v) |
|
| 170 |
+{
|
|
| 171 |
+ if (!empty) {
|
|
| 172 |
+ if (j<ncol) {
|
|
| 173 |
+ v = 0; |
|
| 174 |
+ for (int k=0; k<nrow; k++) {
|
|
| 175 |
+ v += mat[k][j]; |
|
| 176 |
+ } |
|
| 177 |
+ return true; |
|
| 178 |
+ } else {
|
|
| 179 |
+ Rcpp::Rcerr << "Warning: j=" << j << " exceeds matrix column size (" << ncol << ")!" << endl;
|
|
| 180 |
+ return false; |
|
| 181 |
+ } |
|
| 182 |
+ } else {
|
|
| 183 |
+ v = 0; |
|
| 184 |
+ return false; |
|
| 185 |
+ } |
|
| 186 |
+} |
|
| 187 |
+ |
|
| 188 |
+bool Matrix_Double::get_row_sum(const int i, double & v) |
|
| 189 |
+{
|
|
| 190 |
+ if (!empty) {
|
|
| 191 |
+ if (i<nrow) {
|
|
| 192 |
+ v = 0; |
|
| 193 |
+ for (int k=0; k<ncol; k++) {
|
|
| 194 |
+ v += mat[i][k]; |
|
| 195 |
+ } |
|
| 196 |
+ return true; |
|
| 197 |
+ } else {
|
|
| 198 |
+ Rcpp::Rcerr << "Warning: i=" << i << " exceeds matrix row size (" << nrow << ")!" << endl;
|
|
| 199 |
+ return false; |
|
| 200 |
+ } |
|
| 201 |
+ } else {
|
|
| 202 |
+ v = 0; |
|
| 203 |
+ return false; |
|
| 204 |
+ } |
|
| 205 |
+} |
|
| 206 |
+ |
|
| 207 |
+void Matrix_Double::get_unique_row_labels(vector<int> & uniq_labels) |
|
| 208 |
+{
|
|
| 209 |
+ map<int, int> counts; |
|
| 210 |
+ for (int i=0; i<row_labels.size(); i++) |
|
| 211 |
+ if (counts.find(row_labels[i]) != counts.end()) |
|
| 212 |
+ counts[row_labels[i]] += 1; // found this label, then increment count |
|
| 213 |
+ else {
|
|
| 214 |
+ counts.insert(make_pair(row_labels[i],1)); // not found this label, then count it once |
|
| 215 |
+ //counts[row_labels[i]] = 1; |
|
| 216 |
+ } |
|
| 217 |
+ map<int,int>::iterator it; |
|
| 218 |
+ for (it=counts.begin(); it!=counts.end(); ++it) |
|
| 219 |
+ uniq_labels.push_back(it->first); |
|
| 220 |
+} |
|
| 221 |
+ |
|
| 222 |
+void Matrix_Double::print_with_additional_column_of_Bins2Value(ostream & os, Bins2Value& additional_column_data) |
|
| 223 |
+{
|
|
| 224 |
+ // obtain unique row labels |
|
| 225 |
+ vector<int> unique_row_labels; |
|
| 226 |
+ int row_label_; |
|
| 227 |
+ for (int i=0; i<nrow; i++) {
|
|
| 228 |
+ row_label_ = row_labels[i]; |
|
| 229 |
+ unique_row_labels.push_back( row_label_ ); |
|
| 230 |
+ } |
|
| 231 |
+ Bins2Value::iterator iter; |
|
| 232 |
+ for (iter=additional_column_data.begin(); iter!=additional_column_data.end(); iter++) {
|
|
| 233 |
+ row_label_ = iter->first; |
|
| 234 |
+ if (!exist_row_label(row_label_)) {
|
|
| 235 |
+ unique_row_labels.push_back( row_label_ ); |
|
| 236 |
+ } |
|
| 237 |
+ } |
|
| 238 |
+ sort(unique_row_labels.begin(), unique_row_labels.end()); |
|
| 239 |
+ // print both 'mat' and 'additional_column_data' as the last column |
|
| 240 |
+ for (int i=0; i<unique_row_labels.size(); i++) {
|
|
| 241 |
+ row_label_ = unique_row_labels[i]; |
|
| 242 |
+ os << row_label_; |
|
| 243 |
+ int row_index = get_row_index(row_label_); |
|
| 244 |
+ if (row_index!=-1) {
|
|
| 245 |
+ for (int j=0; j<ncol; j++) {
|
|
| 246 |
+ os << "\t" << mat[row_index][j]; |
|
| 247 |
+ } |
|
| 248 |
+ } else {
|
|
| 249 |
+ for (int j=0; j<ncol; j++) {
|
|
| 250 |
+ os << "\t0"; |
|
| 251 |
+ } |
|
| 252 |
+ } |
|
| 253 |
+ if (additional_column_data.find(row_label_)!=additional_column_data.end()) {
|
|
| 254 |
+ os << "\t" << additional_column_data[row_label_]; |
|
| 255 |
+ } else {
|
|
| 256 |
+ os << "\t0"; |
|
| 257 |
+ } |
|
| 258 |
+ os << endl; |
|
| 259 |
+ } |
|
| 260 |
+} |
|
| 261 |
+ |
|
| 262 |
+ostream & operator<<(ostream & os, Matrix_Double & mat) |
|
| 263 |
+{
|
|
| 264 |
+ double v; |
|
| 265 |
+ int row_label; |
|
| 266 |
+ if (!mat.isempty()) {
|
|
| 267 |
+ for (int i=0; i<mat.get_row_num(); i++) {
|
|
| 268 |
+ int j; |
|
| 269 |
+ mat.get_row_label(i, row_label); |
|
| 270 |
+ os << row_label << "\t"; |
|
| 271 |
+ for (j=0; j<mat.get_column_num()-1; j++) {
|
|
| 272 |
+ mat.get_element(i,j,v); |
|
| 273 |
+ os << v << "\t"; |
|
| 274 |
+ } |
|
| 275 |
+ mat.get_element(i,j,v); |
|
| 276 |
+ os << v << endl; |
|
| 277 |
+ } |
|
| 278 |
+ } |
|
| 279 |
+ return os; |
|
| 280 |
+} |
|
| 281 |
+ |
|
| 282 |
+double objective_em_supervise(Matrix_Double & p, vector<double> & theta) |
|
| 283 |
+{
|
|
| 284 |
+ unsigned int ncol = p.get_column_num(); |
|
| 285 |
+ unsigned int nrow = p.get_row_num(); |
|
| 286 |
+ double v; |
|
| 287 |
+ double obj = 0; |
|
| 288 |
+ for (int i=0; i<nrow; i++) {
|
|
| 289 |
+ double sum = 0; |
|
| 290 |
+ for (int j=0; j<ncol; j++) {
|
|
| 291 |
+ p.get_element(i,j,v); |
|
| 292 |
+ sum += theta[j]*v; |
|
| 293 |
+ } |
|
| 294 |
+ obj += log(sum); |
|
| 295 |
+ } |
|
| 296 |
+ return obj; |
|
| 297 |
+} |
|
| 298 |
+// |
|
| 299 |
+// There are T known tissues and 1 unknown tissue (described by a double vector "m") |
|
| 300 |
+// |
|
| 301 |
+// input: |
|
| 302 |
+// p is a matrix of N X T, where N is number of reads and T is number of known tissue |
|
| 303 |
+// |
|
| 304 |
+// output: |
|
| 305 |
+// theta (model parameters), a vector with "T" elements. |
|
| 306 |
+// q is a matrix of N X T, the tissue-specific posterior probabilty of each read |
|
| 307 |
+// |
|
| 308 |
+void em_supervise(Matrix_Double & p, int max_iter, vector<double> & theta, Matrix_Double& q) |
|
| 309 |
+{
|
|
| 310 |
+ // cout.precision(15); |
|
| 311 |
+ // cerr.precision(15); |
|
| 312 |
+ unsigned int ncol = p.get_column_num(); |
|
| 313 |
+ unsigned int nrow = p.get_row_num(); // nrow is number of tissues. |
|
| 314 |
+ theta.resize(ncol, 0); // assign (num_tissues) space and initialize to 0s. |
|
| 315 |
+ // initialize model parameters as uniform distribution |
|
| 316 |
+ // alternatively, model parameters can be random numbers by satisfying the crition |
|
| 317 |
+ // (1) \sum_{i=1}^{ncol}{theta_i}=1
|
|
| 318 |
+ for (int j=0; j<ncol; j++) {
|
|
| 319 |
+ theta[j] = 1/(double)ncol; |
|
| 320 |
+ } |
|
| 321 |
+ // create and initialize q with the same size of p and with all elements initialized as 0 |
|
| 322 |
+ //Matrix_Double q(nrow, ncol, 0); |
|
| 323 |
+ //cerr << "iter 0\t" << objective_em_supervise(p, theta) << endl; |
|
| 324 |
+ double v1, v2; |
|
| 325 |
+ for (int iter=0; iter<max_iter; iter++) {
|
|
| 326 |
+ Rcpp::Rcerr << iter+1 << "," ; |
|
| 327 |
+ // E-step: estimate q |
|
| 328 |
+ for (int i=0; i<nrow; i++) {
|
|
| 329 |
+ double sum = 0; |
|
| 330 |
+ for (int j=0; j<ncol; j++) {
|
|
| 331 |
+ p.get_element(i,j,v1); |
|
| 332 |
+ v2 = theta[j]*v1; |
|
| 333 |
+ q.set_element( i, j, v2 ); |
|
| 334 |
+ sum += v2; |
|
| 335 |
+ } |
|
| 336 |
+ for (int j=0; j<ncol; j++) {
|
|
| 337 |
+ q.get_element(i,j,v2); |
|
| 338 |
+ v2 /= sum; |
|
| 339 |
+ q.set_element( i, j, v2 ); |
|
| 340 |
+ } |
|
| 341 |
+ } |
|
| 342 |
+ // M-step: estimate theta |
|
| 343 |
+ for (int j=0; j<ncol; j++) {
|
|
| 344 |
+ double sum=0; |
|
| 345 |
+ for (int i=0; i<nrow; i++) {
|
|
| 346 |
+ q.get_element(i,j,v2); |
|
| 347 |
+ sum += v2; |
|
| 348 |
+ } |
|
| 349 |
+ theta[j] = sum / nrow; |
|
| 350 |
+ } |
|
| 351 |
+ // for debug |
|
| 352 |
+ //cerr << "iter " << (iter+1) << "\t" << objective_em_supervise(p, theta) << endl; |
|
| 353 |
+ } |
|
| 354 |
+ // last E-step: estimate q |
|
| 355 |
+ for (int i=0; i<nrow; i++) {
|
|
| 356 |
+ double sum = 0; |
|
| 357 |
+ for (int j=0; j<ncol; j++) {
|
|
| 358 |
+ p.get_element(i,j,v1); |
|
| 359 |
+ v2 = theta[j]*v1; |
|
| 360 |
+ q.set_element( i, j, v2 ); |
|
| 361 |
+ sum += v2; |
|
| 362 |
+ } |
|
| 363 |
+ for (int j=0; j<ncol; j++) {
|
|
| 364 |
+ q.get_element(i,j,v2); |
|
| 365 |
+ v2 /= sum; |
|
| 366 |
+ q.set_element( i, j, v2 ); |
|
| 367 |
+ } |
|
| 368 |
+ } |
|
| 369 |
+ Rcpp::Rcerr << endl; |
|
| 370 |
+} |
|
| 371 |
+ |
|
| 372 |
+// |
|
| 373 |
+// There are T known tissues and 1 unknown tissue (described by a double vector "m") |
|
| 374 |
+// |
|
| 375 |
+// input: |
|
| 376 |
+// p is a matrix of N X T, where N is number of reads and T is number of known tissue |
|
| 377 |
+// p.row_label is an int vector of N X 1, each element is the marker Id (1-base) of a read. |
|
| 378 |
+// Rm is a vector of N X 1, each element is number of valid methylated CpG sites in a read |
|
| 379 |
+// Rl is a vector of N X 1, each element is number of all valid CpG sites in a read |
|
| 380 |
+// |
|
| 381 |
+// output: |
|
| 382 |
+// theta is a vector of (T+1) X 1, where T is number of known tissue. This vector is already allocated with space of (ncol+1) units. |
|
| 383 |
+// m is a vector of M X 1, where M is number of markers. |
|
| 384 |
+// q is a matrix of N X T, the tissue-specific posterior probabilty of each read |
|
| 385 |
+// q_unknown is a vector of N X 1, the posterior probabilty of each read for the unknown class. It does not need to be allocated space before this function calling. |
|
| 386 |
+// |
|
| 387 |
+void em_semisupervise(Matrix_Double & p, vector<int> & Rm, vector<int> & Rl, |
|
| 388 |
+ int max_iter, vector<double> & theta, Matrix_Double& q, vector<double>& q_unknown, |
|
| 389 |
+ vector<double> & m) |
|
| 390 |
+{
|
|
| 391 |
+ // cout.precision(15); |
|
| 392 |
+ // cerr.precision(15); |
|
| 393 |
+ unsigned int ncol = p.get_column_num(); // T, number of known tissues |
|
| 394 |
+ unsigned int nrow = p.get_row_num(); // N, number of reads |
|
| 395 |
+ |
|
| 396 |
+ theta.resize(ncol+1, 0); // assign (num_tissues+1) space and initialize to 0s. |
|
| 397 |
+ |
|
| 398 |
+ vector<int> uniq_marker_Ids; |
|
| 399 |
+ p.get_unique_row_labels(uniq_marker_Ids); |
|
| 400 |
+ int nMarker = uniq_marker_Ids.size(); // M, number of markers that all N reads cover. Some markers may not be covered by these N reads. |
|
| 401 |
+ m.resize(nMarker, 0); // assign nMarker space and initialize to 0s. |
|
| 402 |
+ map<int,int> markerId2Index; // marker index is 0-based. |
|
| 403 |
+ for (int index=0; index<nMarker; index++) {
|
|
| 404 |
+ markerId2Index.insert(make_pair(uniq_marker_Ids[index],index)); |
|
| 405 |
+ } |
|
| 406 |
+ |
|
| 407 |
+ // initialize model parameters theta as uniform distribution, and m as 0.5 |
|
| 408 |
+ // alternatively, theta and m can be random numbers, by satisfying: |
|
| 409 |
+ // (1) \sum_{i=1}^{ncol+1}{theta_i}=1
|
|
| 410 |
+ // (2) 0 <= m_k <=1 for all k=1,2,...,#marker_covered_by_all_reads |
|
| 411 |
+ for (int j=0; j<ncol+1; j++) {
|
|
| 412 |
+ theta[j] = 1/(double)(ncol+1); |
|
| 413 |
+ } |
|
| 414 |
+ for (int k=0; k<nMarker; k++) {
|
|
| 415 |
+ m[k] = 0.5; |
|
| 416 |
+ } |
|
| 417 |
+ for (int i=0; i<nrow; i++) |
|
| 418 |
+ q_unknown.push_back(0); |
|
| 419 |
+ |
|
| 420 |
+ // EM algorithm |
|
| 421 |
+ for (int iter=0; iter<max_iter; iter++) {
|
|
| 422 |
+ Rcpp::Rcerr << iter+1 << "," ; |
|
| 423 |
+ // E-step: estimate q (for T known tissues) and q_unknown (for one unknown tissue) |
|
| 424 |
+ double v1, v2, likelihood_unknown_class; |
|
| 425 |
+ int markerId, markerIndex; |
|
| 426 |
+ for (int i=0; i<nrow; i++) {
|
|
| 427 |
+ // process the first T known tissues |
|
| 428 |
+ double sum = 0; |
|
| 429 |
+ for (int j=0; j<ncol; j++) {
|
|
| 430 |
+ p.get_element(i,j,v1); |
|
| 431 |
+ v2 = theta[j]*v1; |
|
| 432 |
+ q.set_element( i, j, v2 ); |
|
| 433 |
+ sum += v2; |
|
| 434 |
+ } |
|
| 435 |
+ // process the last unknown tissue |
|
| 436 |
+ p.get_row_label(i, markerId); // get markerId (1-base) of read i |
|
| 437 |
+ markerIndex = markerId2Index[markerId]; |
|
| 438 |
+ likelihood_unknown_class = pow(m[markerIndex],Rm[i]) * pow(1-m[markerIndex],Rl[i]-Rm[i]); |
|
| 439 |
+ q_unknown[i] = theta[ncol]*likelihood_unknown_class; |
|
| 440 |
+ sum += q_unknown[i]; |
|
| 441 |
+ |
|
| 442 |
+ // update q and q_unknown |
|
| 443 |
+ for (int j=0; j<ncol; j++) {
|
|
| 444 |
+ q.get_element(i,j,v2); |
|
| 445 |
+ v2 /= sum; |
|
| 446 |
+ q.set_element( i, j, v2 ); |
|
| 447 |
+ } |
|
| 448 |
+ q_unknown[i] /= sum; |
|
| 449 |
+ } |
|
| 450 |
+ |
|
| 451 |
+ // M-step 1: estimate unknown class's methylation level (m) of each marker |
|
| 452 |
+ double sum1=0, sum2=0; |
|
| 453 |
+ int curr_marker_index=-1, prev_marker_index=-1; // all marker index is 0-base |
|
| 454 |
+ for (int i=0; i<nrow; i++) {
|
|
| 455 |
+ // for each row (or read) |
|
| 456 |
+ p.get_row_label(i, markerId); // get the marker_index (0-base) of read i |
|
| 457 |
+ curr_marker_index = markerId2Index[markerId]; |
|
| 458 |
+ if (curr_marker_index != prev_marker_index) {
|
|
| 459 |
+ // this is the 1st read of a new marker |
|
| 460 |
+ // we need to summarize m value of previous marker |
|
| 461 |
+ if (prev_marker_index!=-1) { // current marker is not the 1st marker
|
|
| 462 |
+ m[prev_marker_index] = (sum2!=0 ? sum1/sum2 : 0); |
|
| 463 |
+ sum1 = 0; |
|
| 464 |
+ sum2 = 0; |
|
| 465 |
+ } |
|
| 466 |
+ prev_marker_index = curr_marker_index; |
|
| 467 |
+ } |
|
| 468 |
+ sum1 += q_unknown[i]*Rm[i]; |
|
| 469 |
+ sum2 += q_unknown[i]*Rl[i]; |
|
| 470 |
+ } |
|
| 471 |
+ m[curr_marker_index] = (sum2!=0 ? sum1/sum2 : 0); // estimate m of the last marker |
|
| 472 |
+ |
|
| 473 |
+ //cout << "iter=" << iter << ", " << "m=" << endl; // for debug |
|
| 474 |
+ //print_vec(cout, m, "\n"); // for debug |
|
| 475 |
+ //cout << endl; // for debug |
|
| 476 |
+ |
|
| 477 |
+ // M-step 2: estimate theta, which has ncol+1 values |
|
| 478 |
+ double sum; |
|
| 479 |
+ for (int j=0; j<ncol; j++) {
|
|
| 480 |
+ sum=0; |
|
| 481 |
+ for (int i=0; i<nrow; i++) {
|
|
| 482 |
+ q.get_element(i,j,v2); |
|
| 483 |
+ sum += v2; |
|
| 484 |
+ } |
|
| 485 |
+ theta[j] = sum / nrow; |
|
| 486 |
+ } |
|
| 487 |
+ sum=0; |
|
| 488 |
+ for (int i=0; i<nrow; i++) |
|
| 489 |
+ sum += q_unknown[i]; |
|
| 490 |
+ theta[ncol] = sum / nrow; |
|
| 491 |
+ |
|
| 492 |
+ //cout << "round " << iter+1 << ", theta="; // for debug |
|
| 493 |
+ //print_vec(cout, theta, ", "); // for debug |
|
| 494 |
+ //cout << endl << endl; // for debug |
|
| 495 |
+ } |
|
| 496 |
+ Rcpp::Rcerr << endl; |
|
| 497 |
+ //cout << "final:" << endl << std::flush; |
|
| 498 |
+ //print_vec(cout, theta, ", "); // for debug |
|
| 499 |
+ //cout << endl << endl << std::flush; // for debug |
|
| 500 |
+} |
|
| 501 |
+ |
|
| 502 |
+void readCounts_by_reads_posterior_probability_version_regular(Matrix_Double& q, double unit, Matrix_Double& readCounts) |
|
| 503 |
+{
|
|
| 504 |
+ unsigned int ncol = q.get_column_num(); // T, number of known tissues |
|
| 505 |
+ unsigned int nrow = q.get_row_num(); // N, number of reads |
|
| 506 |
+ |
|
| 507 |
+ vector<int> uniq_marker_Ids; |
|
| 508 |
+ q.get_unique_row_labels(uniq_marker_Ids); |
|
| 509 |
+ // int nMarker = uniq_marker_Ids.size(); // M, number of markers that all N reads cover. Some markers may not be covered by these N reads. |
|
| 510 |
+ |
|
| 511 |
+ double v; |
|
| 512 |
+ int curr_markerId=-1, prev_markerId=-1; // all marker index is 0-base |
|
| 513 |
+ vector<double> readCountsPerMarker(ncol, 0); // a vector of read counts for one marker (T elements) with default values 0 |
|
| 514 |
+ for (int i=0; i<nrow; i++) {
|
|
| 515 |
+ // for each row (or read) |
|
| 516 |
+ q.get_row_label(i, curr_markerId); // get the marker ID of read i |
|
| 517 |
+ if (curr_markerId != prev_markerId) {
|
|
| 518 |
+ // this is the 1st read of a new marker |
|
| 519 |
+ // we need to summarize read counts of each tissue for the previous marker and deposit them to the matrix "readCounts" |
|
| 520 |
+ // then clear readCountsPerMarker for a new read counting of the next new marker |
|
| 521 |
+ if (prev_markerId!=-1) { // current marker is not the 1st marker
|
|
| 522 |
+ multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit. |
|
| 523 |
+ readCounts.append_row_vector(readCountsPerMarker, prev_markerId); |
|
| 524 |
+ assign_vector_zeros(readCountsPerMarker); |
|
| 525 |
+ } |
|
| 526 |
+ prev_markerId = curr_markerId; |
|
| 527 |
+ } |
|
| 528 |
+ for (int j=0; j<ncol; j++) {
|
|
| 529 |
+ q.get_element(i,j,v); |
|
| 530 |
+ readCountsPerMarker[j] += v; |
|
| 531 |
+ } |
|
| 532 |
+ } |
|
| 533 |
+ multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit. |
|
| 534 |
+ readCounts.append_row_vector(readCountsPerMarker, curr_markerId); |
|
| 535 |
+} |
|
| 536 |
+ |
|
| 537 |
+void readCounts_by_reads_posterior_probability_version_unknownclass(Matrix_Double& q, vector<double>& q_unknown, double unit, Matrix_Double& readCounts) |
|
| 538 |
+{
|
|
| 539 |
+ unsigned int ncol = q.get_column_num(); // T, number of known tissues |
|
| 540 |
+ unsigned int nrow = q.get_row_num(); // N, number of reads |
|
| 541 |
+ if (nrow!=(unsigned int)q_unknown.size()) {
|
|
| 542 |
+ Rcpp::Rcerr << "Error (readCounts_by_reads_posterior_probability_version_unknownclass): row number of q_unknown does not match with row number of q!" << endl; |
|
| 543 |
+ // exit(EXIT_FAILURE); |
|
| 544 |
+ } |
|
| 545 |
+ |
|
| 546 |
+ vector<int> uniq_marker_Ids; |
|
| 547 |
+ q.get_unique_row_labels(uniq_marker_Ids); |
|
| 548 |
+ // int nMarker = uniq_marker_Ids.size(); // M, number of markers that all N reads cover. Some markers may not be covered by these N reads. |
|
| 549 |
+ |
|
| 550 |
+ double v; |
|
| 551 |
+ int curr_markerId=-1, prev_markerId=-1; // all marker index is 0-base |
|
| 552 |
+ vector<double> readCountsPerMarker(ncol+1, 0); // a vector of read counts for one marker (T+1 elements) with default values 0. The last element is for the unknown class. |
|
| 553 |
+ for (int i=0; i<nrow; i++) {
|
|
| 554 |
+ // for each row (or read) |
|
| 555 |
+ q.get_row_label(i, curr_markerId); // get the marker ID of read i |
|
| 556 |
+ if (curr_markerId != prev_markerId) {
|
|
| 557 |
+ // this is the 1st read of a new marker |
|
| 558 |
+ // we need to summarize read counts of each tissue for the previous marker and deposit them to the matrix "readCounts" |
|
| 559 |
+ // then clear readCountsPerMarker for a new read counting of the next new marker |
|
| 560 |
+ if (prev_markerId!=-1) { // current marker is not the 1st marker
|
|
| 561 |
+ multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit. |
|
| 562 |
+ readCounts.append_row_vector(readCountsPerMarker, prev_markerId); |
|
| 563 |
+ assign_vector_zeros(readCountsPerMarker); |
|
| 564 |
+ } |
|
| 565 |
+ prev_markerId = curr_markerId; |
|
| 566 |
+ } |
|
| 567 |
+ for (int j=0; j<ncol; j++) {
|
|
| 568 |
+ q.get_element(i,j,v); |
|
| 569 |
+ readCountsPerMarker[j] += v; |
|
| 570 |
+ } |
|
| 571 |
+ readCountsPerMarker[ncol] += q_unknown[i]; |
|
| 572 |
+ } |
|
| 573 |
+ multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit. |
|
| 574 |
+ readCounts.append_row_vector(readCountsPerMarker, curr_markerId); |
|
| 575 |
+} |
|
| 576 |
+ |
| 1 | 1 |
new file mode 100644 |
| ... | ... |
@@ -0,0 +1,575 @@ |
| 1 |
+#include <Rcpp.h> |
|
| 2 |
+using namespace Rcpp; |
|
| 3 |
+ |
|
| 4 |
+#include <cstdlib> |
|
| 5 |
+#include <cmath> |
|
| 6 |
+#include <sstream> |
|
| 7 |
+#include <iostream> |
|
| 8 |
+#include <fstream> |
|
| 9 |
+#include <map> |
|
| 10 |
+#include <vector> |
|
| 11 |
+#include <string> |
|
| 12 |
+#include <cassert> |
|
| 13 |
+#include <algorithm> |
|
| 14 |
+#include <boost/algorithm/string.hpp> |
|
| 15 |
+#include <boost/assign.hpp> |
|
| 16 |
+#include <boost/iostreams/filtering_stream.hpp> |
|
| 17 |
+#include "matrix.h" |
|
| 18 |
+#include "utils.h" |
|
| 19 |
+ |
|
| 20 |
+using namespace std; |
|
| 21 |
+using namespace boost; |
|
| 22 |
+ |
|
| 23 |
+#include "template_utils.cpp" |
|
| 24 |
+ |
|
| 25 |
+Matrix_Double::Matrix_Double(const unsigned int nrow_, const unsigned int ncol_, const double init_value) |
|
| 26 |
+{
|
|
| 27 |
+ nrow = nrow_; |
|
| 28 |
+ ncol = ncol_; |
|
| 29 |
+ empty = false; |
|
| 30 |
+ mat.reserve(nrow_); |
|
| 31 |
+ for (int i=0; i<nrow; i++) {
|
|
| 32 |
+ vector<double> temp; |
|
| 33 |
+ temp.reserve(ncol); |
|
| 34 |
+ for (int j=0; j<ncol; j++) |
|
| 35 |
+ temp.push_back(init_value); |
|
| 36 |
+ mat.push_back( temp ); |
|
| 37 |
+ } |
|
| 38 |
+ empty = mat.empty(); |
|
| 39 |
+} |
|
| 40 |
+ |
|
| 41 |
+// a line represents a row of the matrix. Each element in this row is delimited by a TAB. |
|
| 42 |
+void Matrix_Double::process_one_line_of_mat_file(string & line, int num_header_column) |
|
| 43 |
+{
|
|
| 44 |
+ vector<string> items; |
|
| 45 |
+ split(items, line, is_any_of("\t"));
|
|
| 46 |
+ vector<double> v; |
|
| 47 |
+ for (int i=0; i<items.size(); i++) {
|
|
| 48 |
+ if (i==0) row_labels.push_back( atoi(items[0].c_str()) ); // the first column is the label of this row. |
|
| 49 |
+ if (i<num_header_column) continue; // skip the first num_header_column header columns |
|
| 50 |
+ v.push_back( atof(items[i].c_str()) ); |
|
| 51 |
+ } |
|
| 52 |
+ mat.push_back( v ); |
|
| 53 |
+} |
|
| 54 |
+ |
|
| 55 |
+Matrix_Double::Matrix_Double(const string & mat_file, |
|
| 56 |
+ bool header_line=false, int num_header_column=1) |
|
| 57 |
+{
|
|
| 58 |
+ unsigned long i_line=0; |
|
| 59 |
+ // input is a plain text file |
|
| 60 |
+ istream * in=&cin; // default is stdin |
|
| 61 |
+ ifstream fin; |
|
| 62 |
+ if ( !(mat_file.compare("stdin")==0)) {
|
|
| 63 |
+ fin.open(mat_file.c_str()); |
|
| 64 |
+ if (fin.fail()){
|
|
| 65 |
+ Rcpp::Rcerr << "Error: Unable to open " << mat_file << " in Matrix_Double()" << endl; |
|
| 66 |
+ // exit(EXIT_FAILURE); |
|
| 67 |
+ } |
|
| 68 |
+ in = &fin; |
|
| 69 |
+ } |
|
| 70 |
+ string line; |
|
| 71 |
+ while (!(*in).eof()) {
|
|
| 72 |
+ getline((*in), line); |
|
| 73 |
+ //cerr << "Line: " << line << endl; |
|
| 74 |
+ if (header_line) {
|
|
| 75 |
+ if (i_line==0) {
|
|
| 76 |
+ // skip the header line |
|
| 77 |
+ i_line++; |
|
| 78 |
+ continue; |
|
| 79 |
+ } |
|
| 80 |
+ } |
|
| 81 |
+ if (line.empty()) {
|
|
| 82 |
+ // this is the last line of the file |
|
| 83 |
+ break; |
|
| 84 |
+ } |
|
| 85 |
+ process_one_line_of_mat_file(line, num_header_column); |
|
| 86 |
+ i_line++; |
|
| 87 |
+ } |
|
| 88 |
+ if ( !(mat_file.compare("stdin")==0)) {
|
|
| 89 |
+ fin.close(); |
|
| 90 |
+ } |
|
| 91 |
+ empty = mat.empty(); |
|
| 92 |
+ nrow = mat.size(); |
|
| 93 |
+ if (nrow>=1) ncol = mat[0].size(); |
|
| 94 |
+ else ncol = 0; |
|
| 95 |
+} |
|
| 96 |
+ |
|
| 97 |
+// return the row index of this vector |
|
| 98 |
+long Matrix_Double::append_row_vector(vector<double>& v_, int row_label_) {
|
|
| 99 |
+ if (v_.size()!=ncol) {
|
|
| 100 |
+ Rcpp::Rcerr << "Error of Matrix_Double::append_row_vector: the appended vector size (" << v_.size() << ") does not match the number of column in the matrix (" << ncol << ")!\nExit." << endl;
|
|
| 101 |
+ // exit(EXIT_FAILURE); |
|
| 102 |
+ } |
|
| 103 |
+ vector<double> v; |
|
| 104 |
+ for (int i=0; i<v_.size(); i++) {
|
|
| 105 |
+ v.push_back( v_[i] ); |
|
| 106 |
+ } |
|
| 107 |
+ mat.push_back( v ); |
|
| 108 |
+ nrow = mat.size(); |
|
| 109 |
+ row_labels.push_back(row_label_); |
|
| 110 |
+ empty=false; |
|
| 111 |
+ return(nrow-1); |
|
| 112 |
+} |
|
| 113 |
+ |
|
| 114 |
+// |
|
| 115 |
+// if max(v_)/min(v_) >= min_threshold_maxv_minv, then we append this vector, otherwise ignore it. |
|
| 116 |
+// |
|
| 117 |
+// return row index of this appended vector, if successfully appending; return -1 otherwise. |
|
| 118 |
+// |
|
| 119 |
+long Matrix_Double::append_row_vector_with_filter(vector<double>& v_, int row_label_, double min_threshold_maxv_minv) {
|
|
| 120 |
+ if (v_.size()!=ncol) {
|
|
| 121 |
+ Rcpp::Rcerr << "Error of Matrix_Double::append_row_vector_with_filter: the appended vector size (" << v_.size() << ") does not match the number of column in the matrix (" << ncol << ")!\nExit." << endl;
|
|
| 122 |
+ // exit(EXIT_FAILURE); |
|
| 123 |
+ } |
|
| 124 |
+ double min_ = *min_element(v_.begin(), v_.end()); |
|
| 125 |
+ double max_ = *max_element(v_.begin(), v_.end()); |
|
| 126 |
+ //cout << "max: " << max_ << ", min: " << min_ << ", ratio: " << max_/min_ << endl; |
|
| 127 |
+ if (min_==0) {
|
|
| 128 |
+ if (max_==0) return(-1); |
|
| 129 |
+ } else {
|
|
| 130 |
+ double ratio=max_/min_; |
|
| 131 |
+ if (ratio<min_threshold_maxv_minv) return(-1); |
|
| 132 |
+ } |
|
| 133 |
+ long row_index = append_row_vector(v_, row_label_); |
|
| 134 |
+ return(row_index); |
|
| 135 |
+} |
|
| 136 |
+ |
|
| 137 |
+bool Matrix_Double::get_element(const int i, const int j, double & v) |
|
| 138 |
+{
|
|
| 139 |
+ if (!empty) {
|
|
| 140 |
+ if (i<nrow && j<ncol) {
|
|
| 141 |
+ v = mat[i][j]; |
|
| 142 |
+ return true; |
|
| 143 |
+ } else {
|
|
| 144 |
+ Rcpp::Rcerr << "Warning: i=" << i << " and j=" << j << " exceed matrix size!" << endl; |
|
| 145 |
+ return false; |
|
| 146 |
+ } |
|
| 147 |
+ } else {
|
|
| 148 |
+ v = 0; |
|
| 149 |
+ return false; |
|
| 150 |
+ } |
|
| 151 |
+} |
|
| 152 |
+ |
|
| 153 |
+bool Matrix_Double::set_element(const int i, const int j, double v) |
|
| 154 |
+{
|
|
| 155 |
+ if (!empty) {
|
|
| 156 |
+ if (i<nrow && j<ncol) {
|
|
| 157 |
+ mat[i][j] = v; |
|
| 158 |
+ return true; |
|
| 159 |
+ } else {
|
|
| 160 |
+ Rcpp::Rcerr << "Warning: i=" << i << " and j=" << j << " exceed matrix size!" << endl; |
|
| 161 |
+ return false; |
|
| 162 |
+ } |
|
| 163 |
+ } else {
|
|
| 164 |
+ return false; |
|
| 165 |
+ } |
|
| 166 |
+} |
|
| 167 |
+ |
|
| 168 |
+bool Matrix_Double::get_column_sum(const int j, double & v) |
|
| 169 |
+{
|
|
| 170 |
+ if (!empty) {
|
|
| 171 |
+ if (j<ncol) {
|
|
| 172 |
+ v = 0; |
|
| 173 |
+ for (int k=0; k<nrow; k++) {
|
|
| 174 |
+ v += mat[k][j]; |
|
| 175 |
+ } |
|
| 176 |
+ return true; |
|
| 177 |
+ } else {
|
|
| 178 |
+ Rcpp::Rcerr << "Warning: j=" << j << " exceeds matrix column size (" << ncol << ")!" << endl;
|
|
| 179 |
+ return false; |
|
| 180 |
+ } |
|
| 181 |
+ } else {
|
|
| 182 |
+ v = 0; |
|
| 183 |
+ return false; |
|
| 184 |
+ } |
|
| 185 |
+} |
|
| 186 |
+ |
|
| 187 |
+bool Matrix_Double::get_row_sum(const int i, double & v) |
|
| 188 |
+{
|
|
| 189 |
+ if (!empty) {
|
|
| 190 |
+ if (i<nrow) {
|
|
| 191 |
+ v = 0; |
|
| 192 |
+ for (int k=0; k<ncol; k++) {
|
|
| 193 |
+ v += mat[i][k]; |
|
| 194 |
+ } |
|
| 195 |
+ return true; |
|
| 196 |
+ } else {
|
|
| 197 |
+ Rcpp::Rcerr << "Warning: i=" << i << " exceeds matrix row size (" << nrow << ")!" << endl;
|
|
| 198 |
+ return false; |
|
| 199 |
+ } |
|
| 200 |
+ } else {
|
|
| 201 |
+ v = 0; |
|
| 202 |
+ return false; |
|
| 203 |
+ } |
|
| 204 |
+} |
|
| 205 |
+ |
|
| 206 |
+void Matrix_Double::get_unique_row_labels(vector<int> & uniq_labels) |
|
| 207 |
+{
|
|
| 208 |
+ map<int, int> counts; |
|
| 209 |
+ for (int i=0; i<row_labels.size(); i++) |
|
| 210 |
+ if (counts.find(row_labels[i]) != counts.end()) |
|
| 211 |
+ counts[row_labels[i]] += 1; // found this label, then increment count |
|
| 212 |
+ else {
|
|
| 213 |
+ counts.insert(make_pair(row_labels[i],1)); // not found this label, then count it once |
|
| 214 |
+ //counts[row_labels[i]] = 1; |
|
| 215 |
+ } |
|
| 216 |
+ map<int,int>::iterator it; |
|
| 217 |
+ for (it=counts.begin(); it!=counts.end(); ++it) |
|
| 218 |
+ uniq_labels.push_back(it->first); |
|
| 219 |
+} |
|
| 220 |
+ |
|
| 221 |
+void Matrix_Double::print_with_additional_column_of_Bins2Value(ostream & os, Bins2Value& additional_column_data) |
|
| 222 |
+{
|
|
| 223 |
+ // obtain unique row labels |
|
| 224 |
+ vector<int> unique_row_labels; |
|
| 225 |
+ int row_label_; |
|
| 226 |
+ for (int i=0; i<nrow; i++) {
|
|
| 227 |
+ row_label_ = row_labels[i]; |
|
| 228 |
+ unique_row_labels.push_back( row_label_ ); |
|
| 229 |
+ } |
|
| 230 |
+ Bins2Value::iterator iter; |
|
| 231 |
+ for (iter=additional_column_data.begin(); iter!=additional_column_data.end(); iter++) {
|
|
| 232 |
+ row_label_ = iter->first; |
|
| 233 |
+ if (!exist_row_label(row_label_)) {
|
|
| 234 |
+ unique_row_labels.push_back( row_label_ ); |
|
| 235 |
+ } |
|
| 236 |
+ } |
|
| 237 |
+ sort(unique_row_labels.begin(), unique_row_labels.end()); |
|
| 238 |
+ // print both 'mat' and 'additional_column_data' as the last column |
|
| 239 |
+ for (int i=0; i<unique_row_labels.size(); i++) {
|
|
| 240 |
+ row_label_ = unique_row_labels[i]; |
|
| 241 |
+ os << row_label_; |
|
| 242 |
+ int row_index = get_row_index(row_label_); |
|
| 243 |
+ if (row_index!=-1) {
|
|
| 244 |
+ for (int j=0; j<ncol; j++) {
|
|
| 245 |
+ os << "\t" << mat[row_index][j]; |
|
| 246 |
+ } |
|
| 247 |
+ } else {
|
|
| 248 |
+ for (int j=0; j<ncol; j++) {
|
|
| 249 |
+ os << "\t0"; |
|
| 250 |
+ } |
|
| 251 |
+ } |
|
| 252 |
+ if (additional_column_data.find(row_label_)!=additional_column_data.end()) {
|
|
| 253 |
+ os << "\t" << additional_column_data[row_label_]; |
|
| 254 |
+ } else {
|
|
| 255 |
+ os << "\t0"; |
|
| 256 |
+ } |
|
| 257 |
+ os << endl; |
|
| 258 |
+ } |
|
| 259 |
+} |
|
| 260 |
+ |
|
| 261 |
+ostream & operator<<(ostream & os, Matrix_Double & mat) |
|
| 262 |
+{
|
|
| 263 |
+ double v; |
|
| 264 |
+ int row_label; |
|
| 265 |
+ if (!mat.isempty()) {
|
|
| 266 |
+ for (int i=0; i<mat.get_row_num(); i++) {
|
|
| 267 |
+ int j; |
|
| 268 |
+ mat.get_row_label(i, row_label); |
|
| 269 |
+ os << row_label << "\t"; |
|
| 270 |
+ for (j=0; j<mat.get_column_num()-1; j++) {
|
|
| 271 |
+ mat.get_element(i,j,v); |
|
| 272 |
+ os << v << "\t"; |
|
| 273 |
+ } |
|
| 274 |
+ mat.get_element(i,j,v); |
|
| 275 |
+ os << v << endl; |
|
| 276 |
+ } |
|
| 277 |
+ } |
|
| 278 |
+ return os; |
|
| 279 |
+} |
|
| 280 |
+ |
|
| 281 |
+double objective_em_supervise(Matrix_Double & p, vector<double> & theta) |
|
| 282 |
+{
|
|
| 283 |
+ unsigned int ncol = p.get_column_num(); |
|
| 284 |
+ unsigned int nrow = p.get_row_num(); |
|
| 285 |
+ double v; |
|
| 286 |
+ double obj = 0; |
|
| 287 |
+ for (int i=0; i<nrow; i++) {
|
|
| 288 |
+ double sum = 0; |
|
| 289 |
+ for (int j=0; j<ncol; j++) {
|
|
| 290 |
+ p.get_element(i,j,v); |
|
| 291 |
+ sum += theta[j]*v; |
|
| 292 |
+ } |
|
| 293 |
+ obj += log(sum); |
|
| 294 |
+ } |
|
| 295 |
+ return obj; |
|
| 296 |
+} |
|
| 297 |
+// |
|
| 298 |
+// There are T known tissues and 1 unknown tissue (described by a double vector "m") |
|
| 299 |
+// |
|
| 300 |
+// input: |
|
| 301 |
+// p is a matrix of N X T, where N is number of reads and T is number of known tissue |
|
| 302 |
+// |
|
| 303 |
+// output: |
|
| 304 |
+// theta (model parameters), a vector with "T" elements. |
|
| 305 |
+// q is a matrix of N X T, the tissue-specific posterior probabilty of each read |
|
| 306 |
+// |
|
| 307 |
+void em_supervise(Matrix_Double & p, int max_iter, vector<double> & theta, Matrix_Double& q) |
|
| 308 |
+{
|
|
| 309 |
+ // cout.precision(15); |
|
| 310 |
+ // cerr.precision(15); |
|
| 311 |
+ unsigned int ncol = p.get_column_num(); |
|
| 312 |
+ unsigned int nrow = p.get_row_num(); // nrow is number of tissues. |
|
| 313 |
+ theta.resize(ncol, 0); // assign (num_tissues) space and initialize to 0s. |
|
| 314 |
+ // initialize model parameters as uniform distribution |
|
| 315 |
+ // alternatively, model parameters can be random numbers by satisfying the crition |
|
| 316 |
+ // (1) \sum_{i=1}^{ncol}{theta_i}=1
|
|
| 317 |
+ for (int j=0; j<ncol; j++) {
|
|
| 318 |
+ theta[j] = 1/(double)ncol; |
|
| 319 |
+ } |
|
| 320 |
+ // create and initialize q with the same size of p and with all elements initialized as 0 |
|
| 321 |
+ //Matrix_Double q(nrow, ncol, 0); |
|
| 322 |
+ //cerr << "iter 0\t" << objective_em_supervise(p, theta) << endl; |
|
| 323 |
+ double v1, v2; |
|
| 324 |
+ for (int iter=0; iter<max_iter; iter++) {
|
|
| 325 |
+ Rcpp::Rcerr << iter+1 << "," ; |
|
| 326 |
+ // E-step: estimate q |
|
| 327 |
+ for (int i=0; i<nrow; i++) {
|
|
| 328 |
+ double sum = 0; |
|
| 329 |
+ for (int j=0; j<ncol; j++) {
|
|
| 330 |
+ p.get_element(i,j,v1); |
|
| 331 |
+ v2 = theta[j]*v1; |
|
| 332 |
+ q.set_element( i, j, v2 ); |
|
| 333 |
+ sum += v2; |
|
| 334 |
+ } |
|
| 335 |
+ for (int j=0; j<ncol; j++) {
|
|
| 336 |
+ q.get_element(i,j,v2); |
|
| 337 |
+ v2 /= sum; |
|
| 338 |
+ q.set_element( i, j, v2 ); |
|
| 339 |
+ } |
|
| 340 |
+ } |
|
| 341 |
+ // M-step: estimate theta |
|
| 342 |
+ for (int j=0; j<ncol; j++) {
|
|
| 343 |
+ double sum=0; |
|
| 344 |
+ for (int i=0; i<nrow; i++) {
|
|
| 345 |
+ q.get_element(i,j,v2); |
|
| 346 |
+ sum += v2; |
|
| 347 |
+ } |
|
| 348 |
+ theta[j] = sum / nrow; |
|
| 349 |
+ } |
|
| 350 |
+ // for debug |
|
| 351 |
+ //cerr << "iter " << (iter+1) << "\t" << objective_em_supervise(p, theta) << endl; |
|
| 352 |
+ } |
|
| 353 |
+ // last E-step: estimate q |
|
| 354 |
+ for (int i=0; i<nrow; i++) {
|
|
| 355 |
+ double sum = 0; |
|
| 356 |
+ for (int j=0; j<ncol; j++) {
|
|
| 357 |
+ p.get_element(i,j,v1); |
|
| 358 |
+ v2 = theta[j]*v1; |
|
| 359 |
+ q.set_element( i, j, v2 ); |
|
| 360 |
+ sum += v2; |
|
| 361 |
+ } |
|
| 362 |
+ for (int j=0; j<ncol; j++) {
|
|
| 363 |
+ q.get_element(i,j,v2); |
|
| 364 |
+ v2 /= sum; |
|
| 365 |
+ q.set_element( i, j, v2 ); |
|
| 366 |
+ } |
|
| 367 |
+ } |
|
| 368 |
+ Rcpp::Rcerr << endl; |
|
| 369 |
+} |
|
| 370 |
+ |
|
| 371 |
+// |
|
| 372 |
+// There are T known tissues and 1 unknown tissue (described by a double vector "m") |
|
| 373 |
+// |
|
| 374 |
+// input: |
|
| 375 |
+// p is a matrix of N X T, where N is number of reads and T is number of known tissue |
|
| 376 |
+// p.row_label is an int vector of N X 1, each element is the marker Id (1-base) of a read. |
|
| 377 |
+// Rm is a vector of N X 1, each element is number of valid methylated CpG sites in a read |
|
| 378 |
+// Rl is a vector of N X 1, each element is number of all valid CpG sites in a read |
|
| 379 |
+// |
|
| 380 |
+// output: |
|
| 381 |
+// theta is a vector of (T+1) X 1, where T is number of known tissue. This vector is already allocated with space of (ncol+1) units. |
|
| 382 |
+// m is a vector of M X 1, where M is number of markers. |
|
| 383 |
+// q is a matrix of N X T, the tissue-specific posterior probabilty of each read |
|
| 384 |
+// q_unknown is a vector of N X 1, the posterior probabilty of each read for the unknown class. It does not need to be allocated space before this function calling. |
|
| 385 |
+// |
|
| 386 |
+void em_semisupervise(Matrix_Double & p, vector<int> & Rm, vector<int> & Rl, |
|
| 387 |
+ int max_iter, vector<double> & theta, Matrix_Double& q, vector<double>& q_unknown, |
|
| 388 |
+ vector<double> & m) |
|
| 389 |
+{
|
|
| 390 |
+ // cout.precision(15); |
|
| 391 |
+ // cerr.precision(15); |
|
| 392 |
+ unsigned int ncol = p.get_column_num(); // T, number of known tissues |
|
| 393 |
+ unsigned int nrow = p.get_row_num(); // N, number of reads |
|
| 394 |
+ |
|
| 395 |
+ theta.resize(ncol+1, 0); // assign (num_tissues+1) space and initialize to 0s. |
|
| 396 |
+ |
|
| 397 |
+ vector<int> uniq_marker_Ids; |
|
| 398 |
+ p.get_unique_row_labels(uniq_marker_Ids); |
|
| 399 |
+ int nMarker = uniq_marker_Ids.size(); // M, number of markers that all N reads cover. Some markers may not be covered by these N reads. |
|
| 400 |
+ m.resize(nMarker, 0); // assign nMarker space and initialize to 0s. |
|
| 401 |
+ map<int,int> markerId2Index; // marker index is 0-based. |
|
| 402 |
+ for (int index=0; index<nMarker; index++) {
|
|
| 403 |
+ markerId2Index.insert(make_pair(uniq_marker_Ids[index],index)); |
|
| 404 |
+ } |
|
| 405 |
+ |
|
| 406 |
+ // initialize model parameters theta as uniform distribution, and m as 0.5 |
|
| 407 |
+ // alternatively, theta and m can be random numbers, by satisfying: |
|
| 408 |
+ // (1) \sum_{i=1}^{ncol+1}{theta_i}=1
|
|
| 409 |
+ // (2) 0 <= m_k <=1 for all k=1,2,...,#marker_covered_by_all_reads |
|
| 410 |
+ for (int j=0; j<ncol+1; j++) {
|
|
| 411 |
+ theta[j] = 1/(double)(ncol+1); |
|
| 412 |
+ } |
|
| 413 |
+ for (int k=0; k<nMarker; k++) {
|
|
| 414 |
+ m[k] = 0.5; |
|
| 415 |
+ } |
|
| 416 |
+ for (int i=0; i<nrow; i++) |
|
| 417 |
+ q_unknown.push_back(0); |
|
| 418 |
+ |
|
| 419 |
+ // EM algorithm |
|
| 420 |
+ for (int iter=0; iter<max_iter; iter++) {
|
|
| 421 |
+ Rcpp::Rcerr << iter+1 << "," ; |
|
| 422 |
+ // E-step: estimate q (for T known tissues) and q_unknown (for one unknown tissue) |
|
| 423 |
+ double v1, v2, likelihood_unknown_class; |
|
| 424 |
+ int markerId, markerIndex; |
|
| 425 |
+ for (int i=0; i<nrow; i++) {
|
|
| 426 |
+ // process the first T known tissues |
|
| 427 |
+ double sum = 0; |
|
| 428 |
+ for (int j=0; j<ncol; j++) {
|
|
| 429 |
+ p.get_element(i,j,v1); |
|
| 430 |
+ v2 = theta[j]*v1; |
|
| 431 |
+ q.set_element( i, j, v2 ); |
|
| 432 |
+ sum += v2; |
|
| 433 |
+ } |
|
| 434 |
+ // process the last unknown tissue |
|
| 435 |
+ p.get_row_label(i, markerId); // get markerId (1-base) of read i |
|
| 436 |
+ markerIndex = markerId2Index[markerId]; |
|
| 437 |
+ likelihood_unknown_class = pow(m[markerIndex],Rm[i]) * pow(1-m[markerIndex],Rl[i]-Rm[i]); |
|
| 438 |
+ q_unknown[i] = theta[ncol]*likelihood_unknown_class; |
|
| 439 |
+ sum += q_unknown[i]; |
|
| 440 |
+ |
|
| 441 |
+ // update q and q_unknown |
|
| 442 |
+ for (int j=0; j<ncol; j++) {
|
|
| 443 |
+ q.get_element(i,j,v2); |
|
| 444 |
+ v2 /= sum; |
|
| 445 |
+ q.set_element( i, j, v2 ); |
|
| 446 |
+ } |
|
| 447 |
+ q_unknown[i] /= sum; |
|
| 448 |
+ } |
|
| 449 |
+ |
|
| 450 |
+ // M-step 1: estimate unknown class's methylation level (m) of each marker |
|
| 451 |
+ double sum1=0, sum2=0; |
|
| 452 |
+ int curr_marker_index=-1, prev_marker_index=-1; // all marker index is 0-base |
|
| 453 |
+ for (int i=0; i<nrow; i++) {
|
|
| 454 |
+ // for each row (or read) |
|
| 455 |
+ p.get_row_label(i, markerId); // get the marker_index (0-base) of read i |
|
| 456 |
+ curr_marker_index = markerId2Index[markerId]; |
|
| 457 |
+ if (curr_marker_index != prev_marker_index) {
|
|
| 458 |
+ // this is the 1st read of a new marker |
|
| 459 |
+ // we need to summarize m value of previous marker |
|
| 460 |
+ if (prev_marker_index!=-1) { // current marker is not the 1st marker
|
|
| 461 |
+ m[prev_marker_index] = (sum2!=0 ? sum1/sum2 : 0); |
|
| 462 |
+ sum1 = 0; |
|
| 463 |
+ sum2 = 0; |
|
| 464 |
+ } |
|
| 465 |
+ prev_marker_index = curr_marker_index; |
|
| 466 |
+ } |
|
| 467 |
+ sum1 += q_unknown[i]*Rm[i]; |
|
| 468 |
+ sum2 += q_unknown[i]*Rl[i]; |
|
| 469 |
+ } |
|
| 470 |
+ m[curr_marker_index] = (sum2!=0 ? sum1/sum2 : 0); // estimate m of the last marker |
|
| 471 |
+ |
|
| 472 |
+ //cout << "iter=" << iter << ", " << "m=" << endl; // for debug |
|
| 473 |
+ //print_vec(cout, m, "\n"); // for debug |
|
| 474 |
+ //cout << endl; // for debug |
|
| 475 |
+ |
|
| 476 |
+ // M-step 2: estimate theta, which has ncol+1 values |
|
| 477 |
+ double sum; |
|
| 478 |
+ for (int j=0; j<ncol; j++) {
|
|
| 479 |
+ sum=0; |
|
| 480 |
+ for (int i=0; i<nrow; i++) {
|
|
| 481 |
+ q.get_element(i,j,v2); |
|
| 482 |
+ sum += v2; |
|
| 483 |
+ } |
|
| 484 |
+ theta[j] = sum / nrow; |
|
| 485 |
+ } |
|
| 486 |
+ sum=0; |
|
| 487 |
+ for (int i=0; i<nrow; i++) |
|
| 488 |
+ sum += q_unknown[i]; |
|
| 489 |
+ theta[ncol] = sum / nrow; |
|
| 490 |
+ |
|
| 491 |
+ //cout << "round " << iter+1 << ", theta="; // for debug |
|
| 492 |
+ //print_vec(cout, theta, ", "); // for debug |
|
| 493 |
+ //cout << endl << endl; // for debug |
|
| 494 |
+ } |
|
| 495 |
+ Rcpp::Rcerr << endl; |
|
| 496 |
+ //cout << "final:" << endl << std::flush; |
|
| 497 |
+ //print_vec(cout, theta, ", "); // for debug |
|
| 498 |
+ //cout << endl << endl << std::flush; // for debug |
|
| 499 |
+} |
|
| 500 |
+ |
|
| 501 |
+void readCounts_by_reads_posterior_probability_version_regular(Matrix_Double& q, double unit, Matrix_Double& readCounts) |
|
| 502 |
+{
|
|
| 503 |
+ unsigned int ncol = q.get_column_num(); // T, number of known tissues |
|
| 504 |
+ unsigned int nrow = q.get_row_num(); // N, number of reads |
|
| 505 |
+ |
|
| 506 |
+ vector<int> uniq_marker_Ids; |
|
| 507 |
+ q.get_unique_row_labels(uniq_marker_Ids); |
|
| 508 |
+ // int nMarker = uniq_marker_Ids.size(); // M, number of markers that all N reads cover. Some markers may not be covered by these N reads. |
|
| 509 |
+ |
|
| 510 |
+ double v; |
|
| 511 |
+ int curr_markerId=-1, prev_markerId=-1; // all marker index is 0-base |
|
| 512 |
+ vector<double> readCountsPerMarker(ncol, 0); // a vector of read counts for one marker (T elements) with default values 0 |
|
| 513 |
+ for (int i=0; i<nrow; i++) {
|
|
| 514 |
+ // for each row (or read) |
|
| 515 |
+ q.get_row_label(i, curr_markerId); // get the marker ID of read i |
|
| 516 |
+ if (curr_markerId != prev_markerId) {
|
|
| 517 |
+ // this is the 1st read of a new marker |
|
| 518 |
+ // we need to summarize read counts of each tissue for the previous marker and deposit them to the matrix "readCounts" |
|
| 519 |
+ // then clear readCountsPerMarker for a new read counting of the next new marker |
|
| 520 |
+ if (prev_markerId!=-1) { // current marker is not the 1st marker
|
|
| 521 |
+ multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit. |
|
| 522 |
+ readCounts.append_row_vector(readCountsPerMarker, prev_markerId); |
|
| 523 |
+ assign_vector_zeros(readCountsPerMarker); |
|
| 524 |
+ } |
|
| 525 |
+ prev_markerId = curr_markerId; |
|
| 526 |
+ } |
|
| 527 |
+ for (int j=0; j<ncol; j++) {
|
|
| 528 |
+ q.get_element(i,j,v); |
|
| 529 |
+ readCountsPerMarker[j] += v; |
|
| 530 |
+ } |
|
| 531 |
+ } |
|
| 532 |
+ multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit. |
|
| 533 |
+ readCounts.append_row_vector(readCountsPerMarker, curr_markerId); |
|
| 534 |
+} |
|
| 535 |
+ |
|
| 536 |
+void readCounts_by_reads_posterior_probability_version_unknownclass(Matrix_Double& q, vector<double>& q_unknown, double unit, Matrix_Double& readCounts) |
|
| 537 |
+{
|
|
| 538 |
+ unsigned int ncol = q.get_column_num(); // T, number of known tissues |
|
| 539 |
+ unsigned int nrow = q.get_row_num(); // N, number of reads |
|
| 540 |
+ if (nrow!=(unsigned int)q_unknown.size()) {
|
|
| 541 |
+ Rcpp::Rcerr << "Error (readCounts_by_reads_posterior_probability_version_unknownclass): row number of q_unknown does not match with row number of q!" << endl; |
|
| 542 |
+ // exit(EXIT_FAILURE); |
|
| 543 |
+ } |
|
| 544 |
+ |
|
| 545 |
+ vector<int> uniq_marker_Ids; |
|
| 546 |
+ q.get_unique_row_labels(uniq_marker_Ids); |
|
| 547 |
+ // int nMarker = uniq_marker_Ids.size(); // M, number of markers that all N reads cover. Some markers may not be covered by these N reads. |
|
| 548 |
+ |
|
| 549 |
+ double v; |
|
| 550 |
+ int curr_markerId=-1, prev_markerId=-1; // all marker index is 0-base |
|
| 551 |
+ vector<double> readCountsPerMarker(ncol+1, 0); // a vector of read counts for one marker (T+1 elements) with default values 0. The last element is for the unknown class. |
|
| 552 |
+ for (int i=0; i<nrow; i++) {
|
|
| 553 |
+ // for each row (or read) |
|
| 554 |
+ q.get_row_label(i, curr_markerId); // get the marker ID of read i |
|
| 555 |
+ if (curr_markerId != prev_markerId) {
|
|
| 556 |
+ // this is the 1st read of a new marker |
|
| 557 |
+ // we need to summarize read counts of each tissue for the previous marker and deposit them to the matrix "readCounts" |
|
| 558 |
+ // then clear readCountsPerMarker for a new read counting of the next new marker |
|
| 559 |
+ if (prev_markerId!=-1) { // current marker is not the 1st marker
|
|
| 560 |
+ multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit. |
|
| 561 |
+ readCounts.append_row_vector(readCountsPerMarker, prev_markerId); |
|
| 562 |
+ assign_vector_zeros(readCountsPerMarker); |
|
| 563 |
+ } |
|
| 564 |
+ prev_markerId = curr_markerId; |
|
| 565 |
+ } |
|
| 566 |
+ for (int j=0; j<ncol; j++) {
|
|
| 567 |
+ q.get_element(i,j,v); |
|
| 568 |
+ readCountsPerMarker[j] += v; |
|
| 569 |
+ } |
|
| 570 |
+ readCountsPerMarker[ncol] += q_unknown[i]; |
|
| 571 |
+ } |
|
| 572 |
+ multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit. |
|
| 573 |
+ readCounts.append_row_vector(readCountsPerMarker, curr_markerId); |
|
| 574 |
+} |
|
| 575 |
+ |