Thanks to visit codestin.com
Credit goes to code.bioconductor.org

Browse code

update

RoseHuRan authored on 10/08/2024 19:18:12
Showing 0 changed files
Browse code

add random seed

RoseHuRan authored on 25/11/2023 20:01:05
Showing 1 changed files
... ...
@@ -15,6 +15,7 @@ using namespace Rcpp;
15 15
 #include <boost/algorithm/string.hpp>
16 16
 #include <boost/assign.hpp>
17 17
 #include <boost/iostreams/filtering_stream.hpp>
18
+#include <random>
18 19
 #include "matrix.h"
19 20
 #include "utils.h"
20 21
 
... ...
@@ -305,7 +306,7 @@ double objective_em_supervise(Matrix_Double & p, vector<double> & theta)
305 306
 //   theta (model parameters), a vector with "T" elements.
306 307
 //   q is a matrix of N X T, the tissue-specific posterior probabilty of each read
307 308
 //
308
-void em_supervise(Matrix_Double & p, int max_iter, vector<double> & theta, Matrix_Double& q)
309
+void em_supervise(Matrix_Double & p, int max_iter, vector<double> & theta, Matrix_Double& q, int random_seed)
309 310
 {
310 311
 	// cout.precision(15);
311 312
 	// cerr.precision(15);
... ...
@@ -315,9 +316,24 @@ void em_supervise(Matrix_Double & p, int max_iter, vector<double> & theta, Matri
315 316
 	// initialize model parameters as uniform distribution
316 317
 	// alternatively, model parameters can be random numbers by satisfying the crition
317 318
 	//    (1) \sum_{i=1}^{ncol}{theta_i}=1
318
-	for (int j=0; j<ncol; j++) {
319
-		theta[j] = 1/(double)ncol;
319
+	if (random_seed == 0) {
320
+	    for (int j=0; j<ncol; j++) {
321
+	        theta[j] = 1/(double)ncol;
322
+	    }
323
+	} else{
324
+	    double sum = 0.0;
325
+	    std::default_random_engine generator(random_seed);
326
+	    std::uniform_real_distribution<double> distribution(0.0, 1.0);
327
+	    for (int j = 0; j < ncol; j++) {
328
+	        theta[j] = distribution(generator);
329
+	        sum += theta[j];
330
+	    }
331
+	    // Normalize the values so that they sum up to 1
332
+	    for (int j = 0; j < ncol; j++) {
333
+	        theta[j] /= sum;
334
+	    }
320 335
 	}
336
+
321 337
 	// create and initialize q with the same size of p and with all elements initialized as 0
322 338
 	//Matrix_Double q(nrow, ncol, 0);
323 339
 	//cerr << "iter 0\t" << objective_em_supervise(p, theta) << endl;
... ...
@@ -386,7 +402,7 @@ void em_supervise(Matrix_Double & p, int max_iter, vector<double> & theta, Matri
386 402
 //
387 403
 void em_semisupervise(Matrix_Double & p, vector<int> & Rm, vector<int> & Rl,
388 404
 	int max_iter, vector<double> & theta, Matrix_Double& q, vector<double>& q_unknown,
389
-	vector<double> & m)
405
+	vector<double> & m, int random_seed)
390 406
 {
391 407
 	// cout.precision(15);
392 408
   // cerr.precision(15);
... ...
@@ -408,9 +424,27 @@ void em_semisupervise(Matrix_Double & p, vector<int> & Rm, vector<int> & Rl,
408 424
 	// alternatively, theta and m can be random numbers, by satisfying:
409 425
 	//    (1) \sum_{i=1}^{ncol+1}{theta_i}=1
410 426
 	//    (2) 0 <= m_k <=1 for all k=1,2,...,#marker_covered_by_all_reads
411
-	for (int j=0; j<ncol+1; j++) {
412
-		theta[j] = 1/(double)(ncol+1);
427
+	if (random_seed == 0) {
428
+	    for (int j=0; j<ncol+1; j++) {
429
+	        theta[j] = 1/(double)(ncol+1);
430
+	    }
431
+	} else{
432
+	    double sum = 0.0;
433
+	    std::default_random_engine generator(random_seed);
434
+	    std::uniform_real_distribution<double> distribution(0.0, 1.0);
435
+	    for (int j = 0; j < ncol+1; j++) {
436
+	        theta[j] = distribution(generator);
437
+	        sum += theta[j];
438
+	    }
439
+	    // Normalize the values so that they sum up to 1
440
+	    for (int j = 0; j < ncol+1; j++) {
441
+	        theta[j] /= sum;
442
+	    }
413 443
 	}
444
+	
445
+	
446
+	
447
+	
414 448
 	for (int k=0; k<nMarker; k++) {
415 449
 		m[k] = 0.5;
416 450
 	}
Browse code

update vignette

RoseHuRan authored on 16/03/2023 06:49:08
Showing 1 changed files
... ...
@@ -1,576 +1,576 @@
1
-// [[Rcpp::depends(BH)]]
2
-#include <Rcpp.h>
3
-using namespace Rcpp;
4
-
5
-#include <cstdlib>
6
-#include <cmath>
7
-#include <sstream>
8
-#include <iostream>
9
-#include <fstream>
10
-#include <map>
11
-#include <vector>
12
-#include <string>
13
-#include <cassert>
14
-#include <algorithm>
15
-#include <boost/algorithm/string.hpp>
16
-#include <boost/assign.hpp>
17
-#include <boost/iostreams/filtering_stream.hpp>
18
-#include "matrix.h"
19
-#include "utils.h"
20
-
21
-using namespace std;
22
-using namespace boost;
23
-
24
-#include "template_utils.cpp"
25
-
26
-Matrix_Double::Matrix_Double(const unsigned int nrow_, const unsigned int ncol_, const double init_value)
27
-{
28
-	nrow = nrow_;
29
-	ncol = ncol_;
30
-	empty = false;
31
-	mat.reserve(nrow_);
32
-	for (int i=0; i<nrow; i++) {
33
-		vector<double> temp;
34
-		temp.reserve(ncol);
35
-		for (int j=0; j<ncol; j++)
36
-			temp.push_back(init_value);
37
-		mat.push_back( temp );
38
-	}
39
-	empty = mat.empty();
40
-}
41
-
42
-// a line represents a row of the matrix. Each element in this row is delimited by a TAB.
43
-void Matrix_Double::process_one_line_of_mat_file(string & line, int num_header_column)
44
-{
45
-	vector<string> items;
46
-	split(items, line, is_any_of("\t"));
47
-	vector<double> v;
48
-	for (int i=0; i<items.size(); i++) {
49
-		if (i==0) row_labels.push_back( atoi(items[0].c_str()) ); // the first column is the label of this row.
50
-		if (i<num_header_column) continue; // skip the first num_header_column header columns
51
-		v.push_back( atof(items[i].c_str()) );
52
-	}
53
-	mat.push_back( v );
54
-}
55
-
56
-Matrix_Double::Matrix_Double(const string & mat_file,
57
-	bool header_line=false, int num_header_column=1)
58
-{
59
-	unsigned long i_line=0;
60
-	// input is a plain text file
61
-	istream * in=&cin; // default is stdin
62
-	ifstream fin;
63
-	if ( !(mat_file.compare("stdin")==0)) {
64
-		fin.open(mat_file.c_str());
65
-		if (fin.fail()){
66
-		  Rcpp::Rcerr << "Error: Unable to open " << mat_file << " in Matrix_Double()" << endl;
67
-			// exit(EXIT_FAILURE);
68
-		}
69
-		in = &fin;
70
-	}
71
-	string line;
72
-	while (!(*in).eof()) {
73
-		getline((*in), line);
74
-		//cerr << "Line: " << line << endl;
75
-		if (header_line) {
76
-			if (i_line==0) {
77
-				// skip the header line
78
-				i_line++;
79
-				continue;
80
-			}
81
-		}
82
-		if (line.empty()) {
83
-			// this is the last line of the file
84
-			break;
85
-		}
86
-		process_one_line_of_mat_file(line, num_header_column);
87
-		i_line++;
88
-	}
89
-	if ( !(mat_file.compare("stdin")==0)) {
90
-		fin.close();
91
-	}
92
-	empty = mat.empty();
93
-	nrow = mat.size();
94
-	if (nrow>=1) ncol = mat[0].size();
95
-	else ncol = 0;
96
-}
97
-
98
-// return the row index of this vector
99
-long Matrix_Double::append_row_vector(vector<double>& v_, int row_label_) {
100
-	if (v_.size()!=ncol) {
101
-	  Rcpp::Rcerr << "Error of Matrix_Double::append_row_vector: the appended vector size (" << v_.size() << ") does not match the number of column in the matrix (" << ncol << ")!\nExit." << endl;
102
-		// exit(EXIT_FAILURE);
103
-	}
104
-	vector<double> v;
105
-	for (int i=0; i<v_.size(); i++) {
106
-		v.push_back( v_[i] );
107
-	}
108
-	mat.push_back( v );
109
-	nrow = mat.size();
110
-	row_labels.push_back(row_label_);
111
-	empty=false;
112
-	return(nrow-1);
113
-}
114
-
115
-//
116
-// if max(v_)/min(v_) >= min_threshold_maxv_minv, then we append this vector, otherwise ignore it.
117
-//
118
-// return row index of this appended vector, if successfully appending; return -1 otherwise.
119
-//
120
-long Matrix_Double::append_row_vector_with_filter(vector<double>& v_, int row_label_, double min_threshold_maxv_minv) {
121
-	if (v_.size()!=ncol) {
122
-	  Rcpp::Rcerr << "Error of Matrix_Double::append_row_vector_with_filter: the appended vector size (" << v_.size() << ") does not match the number of column in the matrix (" << ncol << ")!\nExit." << endl;
123
-		// exit(EXIT_FAILURE);
124
-	}
125
-	double min_ = *min_element(v_.begin(), v_.end());
126
-	double max_ = *max_element(v_.begin(), v_.end());
127
-	//cout << "max: " << max_ << ", min: " << min_ << ", ratio: " << max_/min_ << endl;
128
-	if (min_==0) {
129
-		if (max_==0) return(-1);
130
-	} else {
131
-		double ratio=max_/min_;
132
-		if (ratio<min_threshold_maxv_minv) return(-1);
133
-	}
134
-	long row_index = append_row_vector(v_, row_label_);
135
-	return(row_index);
136
-}
137
-
138
-bool Matrix_Double::get_element(const int i, const int j, double & v)
139
-{
140
-	if (!empty) {
141
-		if (i<nrow && j<ncol) {
142
-			v = mat[i][j];
143
-			return true;
144
-		} else {
145
-		  Rcpp::Rcerr << "Warning: i=" << i << " and j=" << j << " exceed matrix size!" << endl;
146
-			return false;
147
-		}
148
-	} else {
149
-		v = 0;
150
-		return false;
151
-	}
152
-}
153
-
154
-bool Matrix_Double::set_element(const int i, const int j, double v)
155
-{
156
-	if (!empty) {
157
-		if (i<nrow && j<ncol) {
158
-			mat[i][j] = v;
159
-			return true;
160
-		} else {
161
-		  Rcpp::Rcerr << "Warning: i=" << i << " and j=" << j << " exceed matrix size!" << endl;
162
-			return false;
163
-		}
164
-	} else {
165
-		return false;
166
-	}
167
-}
168
-
169
-bool Matrix_Double::get_column_sum(const int j, double & v)
170
-{
171
-	if (!empty) {
172
-		if (j<ncol) {
173
-			v = 0;
174
-			for (int k=0; k<nrow; k++) {
175
-				v += mat[k][j];
176
-			}
177
-			return true;
178
-		} else {
179
-		  Rcpp::Rcerr << "Warning: j=" << j << " exceeds matrix column size (" << ncol << ")!" << endl;
180
-			return false;
181
-		}
182
-	} else {
183
-		v = 0;
184
-		return false;
185
-	}
186
-}
187
-
188
-bool Matrix_Double::get_row_sum(const int i, double & v)
189
-{
190
-	if (!empty) {
191
-		if (i<nrow) {
192
-			v = 0;
193
-			for (int k=0; k<ncol; k++) {
194
-				v += mat[i][k];
195
-			}
196
-			return true;
197
-		} else {
198
-		  Rcpp::Rcerr << "Warning: i=" << i << " exceeds matrix row size (" << nrow << ")!" << endl;
199
-			return false;
200
-		}
201
-	} else {
202
-		v = 0;
203
-		return false;
204
-	}
205
-}
206
-
207
-void Matrix_Double::get_unique_row_labels(vector<int> & uniq_labels)
208
-{
209
-	map<int, int> counts;
210
-	for (int i=0; i<row_labels.size(); i++)
211
-		if (counts.find(row_labels[i]) != counts.end())
212
-			counts[row_labels[i]] += 1; // found this label, then increment count
213
-		else {
214
-			counts.insert(make_pair(row_labels[i],1)); // not found this label, then count it once
215
-			//counts[row_labels[i]] = 1;
216
-		}
217
-	map<int,int>::iterator it;
218
-	for (it=counts.begin(); it!=counts.end(); ++it)
219
-		uniq_labels.push_back(it->first);
220
-}
221
-
222
-void Matrix_Double::print_with_additional_column_of_Bins2Value(ostream & os, Bins2Value& additional_column_data)
223
-{
224
-	// obtain unique row labels
225
-	vector<int> unique_row_labels;
226
-	int row_label_;
227
-	for (int i=0; i<nrow; i++) {
228
-		row_label_ = row_labels[i];
229
-		unique_row_labels.push_back( row_label_ );
230
-	}
231
-	Bins2Value::iterator iter;
232
-	for (iter=additional_column_data.begin(); iter!=additional_column_data.end(); iter++) {
233
-		row_label_ = iter->first;
234
-		if (!exist_row_label(row_label_)) {
235
-			unique_row_labels.push_back( row_label_ );
236
-		}
237
-	}
238
-	sort(unique_row_labels.begin(), unique_row_labels.end());
239
-	// print both 'mat' and 'additional_column_data' as the last column
240
-	for (int i=0; i<unique_row_labels.size(); i++) {
241
-		row_label_ = unique_row_labels[i];
242
-		os << row_label_;
243
-		int row_index = get_row_index(row_label_);
244
-		if (row_index!=-1) {
245
-			for (int j=0; j<ncol; j++) {
246
-				os << "\t" << mat[row_index][j];
247
-			}
248
-		} else {
249
-			for (int j=0; j<ncol; j++) {
250
-				os << "\t0";
251
-			}
252
-		}
253
-		if (additional_column_data.find(row_label_)!=additional_column_data.end()) {
254
-			os << "\t" << additional_column_data[row_label_];
255
-		} else {
256
-			os << "\t0";
257
-		}
258
-		os << endl;
259
-	}
260
-}
261
-
262
-ostream & operator<<(ostream & os, Matrix_Double & mat)
263
-{
264
-	double v;
265
-	int row_label;
266
-	if (!mat.isempty()) {
267
-		for (int i=0; i<mat.get_row_num(); i++) {
268
-			int j;
269
-			mat.get_row_label(i, row_label);
270
-			os << row_label << "\t";
271
-			for (j=0; j<mat.get_column_num()-1; j++) {
272
-				mat.get_element(i,j,v);
273
-				os << v << "\t";
274
-			}
275
-			mat.get_element(i,j,v);
276
-			os << v << endl;
277
-		}
278
-	}
279
-	return os;
280
-}
281
-
282
-double objective_em_supervise(Matrix_Double & p, vector<double> & theta)
283
-{
284
-	unsigned int ncol = p.get_column_num();
285
-	unsigned int nrow = p.get_row_num();
286
-	double v;
287
-	double obj = 0;
288
-	for (int i=0; i<nrow; i++) {
289
-		double sum = 0;
290
-		for (int j=0; j<ncol; j++) {
291
-			p.get_element(i,j,v);
292
-			sum += theta[j]*v;
293
-		}
294
-		obj += log(sum);
295
-	}
296
-	return obj;
297
-}
298
-//
299
-// There are T known tissues and 1 unknown tissue (described by a double vector "m")
300
-//
301
-// input:
302
-//   p is a matrix of N X T, where N is number of reads and T is number of known tissue
303
-//
304
-// output:
305
-//   theta (model parameters), a vector with "T" elements.
306
-//   q is a matrix of N X T, the tissue-specific posterior probabilty of each read
307
-//
308
-void em_supervise(Matrix_Double & p, int max_iter, vector<double> & theta, Matrix_Double& q)
309
-{
310
-	// cout.precision(15);
311
-	// cerr.precision(15);
312
-	unsigned int ncol = p.get_column_num();
313
-	unsigned int nrow = p.get_row_num(); // nrow is number of tissues.
314
-	theta.resize(ncol, 0); // assign (num_tissues) space and initialize to 0s.
315
-	// initialize model parameters as uniform distribution
316
-	// alternatively, model parameters can be random numbers by satisfying the crition
317
-	//    (1) \sum_{i=1}^{ncol}{theta_i}=1
318
-	for (int j=0; j<ncol; j++) {
319
-		theta[j] = 1/(double)ncol;
320
-	}
321
-	// create and initialize q with the same size of p and with all elements initialized as 0
322
-	//Matrix_Double q(nrow, ncol, 0);
323
-	//cerr << "iter 0\t" << objective_em_supervise(p, theta) << endl;
324
-	double v1, v2;
325
-	for (int iter=0; iter<max_iter; iter++) {
326
-	  Rcpp::Rcerr << iter+1 << "," ;
327
-		// E-step: estimate q
328
-		for (int i=0; i<nrow; i++) {
329
-			double sum = 0;
330
-			for (int j=0; j<ncol; j++) {
331
-				p.get_element(i,j,v1);
332
-				v2 = theta[j]*v1;
333
-				q.set_element( i, j, v2 );
334
-				sum += v2;
335
-			}
336
-			for (int j=0; j<ncol; j++) {
337
-				q.get_element(i,j,v2);
338
-				v2 /= sum;
339
-				q.set_element( i, j, v2 );
340
-			}
341
-		}
342
-		// M-step: estimate theta
343
-		for (int j=0; j<ncol; j++) {
344
-			double sum=0;
345
-			for (int i=0; i<nrow; i++) {
346
-				q.get_element(i,j,v2);
347
-				sum += v2;
348
-			}
349
-			theta[j] = sum / nrow;
350
-		}
351
-		// for debug
352
-		//cerr << "iter " << (iter+1) << "\t" << objective_em_supervise(p, theta) << endl;
353
-	}
354
-	// last E-step: estimate q
355
-	for (int i=0; i<nrow; i++) {
356
-		double sum = 0;
357
-		for (int j=0; j<ncol; j++) {
358
-			p.get_element(i,j,v1);
359
-			v2 = theta[j]*v1;
360
-			q.set_element( i, j, v2 );
361
-			sum += v2;
362
-		}
363
-		for (int j=0; j<ncol; j++) {
364
-			q.get_element(i,j,v2);
365
-			v2 /= sum;
366
-			q.set_element( i, j, v2 );
367
-		}
368
-	}
369
-	Rcpp::Rcerr << endl;
370
-}
371
-
372
-//
373
-// There are T known tissues and 1 unknown tissue (described by a double vector "m")
374
-//
375
-// input:
376
-//   p is a matrix of N X T, where N is number of reads and T is number of known tissue
377
-//     p.row_label is an int vector of N X 1, each element is the marker Id (1-base) of a read.
378
-//   Rm is a vector of N X 1, each element is number of valid methylated CpG sites in a read
379
-//   Rl is a vector of N X 1, each element is number of all valid CpG sites in a read
380
-//
381
-// output:
382
-//   theta is a vector of (T+1) X 1, where T is number of known tissue. This vector is already allocated with space of (ncol+1) units.
383
-//   m is a vector of M X 1, where M is number of markers.
384
-//   q is a matrix of N X T, the tissue-specific posterior probabilty of each read
385
-//   q_unknown is a vector of N X 1, the posterior probabilty of each read for the unknown class. It does not need to be allocated space before this function calling.
386
-//
387
-void em_semisupervise(Matrix_Double & p, vector<int> & Rm, vector<int> & Rl,
388
-	int max_iter, vector<double> & theta, Matrix_Double& q, vector<double>& q_unknown,
389
-	vector<double> & m)
390
-{
391
-	// cout.precision(15);
392
-  // cerr.precision(15);
393
-	unsigned int ncol = p.get_column_num(); // T, number of known tissues
394
-	unsigned int nrow = p.get_row_num(); // N, number of reads
395
-
396
-	theta.resize(ncol+1, 0); // assign (num_tissues+1) space and initialize to 0s.
397
-
398
-	vector<int> uniq_marker_Ids;
399
-	p.get_unique_row_labels(uniq_marker_Ids);
400
-	int nMarker = uniq_marker_Ids.size(); // M, number of markers that all N reads cover. Some markers may not be covered by these N reads.
401
-	m.resize(nMarker, 0); // assign nMarker space and initialize to 0s.
402
-	map<int,int> markerId2Index; // marker index is 0-based.
403
-	for (int index=0; index<nMarker; index++) {
404
-		markerId2Index.insert(make_pair(uniq_marker_Ids[index],index));
405
-	}
406
-
407
-	// initialize model parameters theta as uniform distribution, and m as 0.5
408
-	// alternatively, theta and m can be random numbers, by satisfying:
409
-	//    (1) \sum_{i=1}^{ncol+1}{theta_i}=1
410
-	//    (2) 0 <= m_k <=1 for all k=1,2,...,#marker_covered_by_all_reads
411
-	for (int j=0; j<ncol+1; j++) {
412
-		theta[j] = 1/(double)(ncol+1);
413
-	}
414
-	for (int k=0; k<nMarker; k++) {
415
-		m[k] = 0.5;
416
-	}
417
-	for (int i=0; i<nrow; i++)
418
-		q_unknown.push_back(0);
419
-
420
-	// EM algorithm
421
-	for (int iter=0; iter<max_iter; iter++) {
422
-	  Rcpp::Rcerr << iter+1 << "," ;
423
-		// E-step: estimate q (for T known tissues) and q_unknown (for one unknown tissue)
424
-		double v1, v2, likelihood_unknown_class;
425
-		int markerId, markerIndex;
426
-		for (int i=0; i<nrow; i++) {
427
-			// process the first T known tissues
428
-			double sum = 0;
429
-			for (int j=0; j<ncol; j++) {
430
-				p.get_element(i,j,v1);
431
-				v2 = theta[j]*v1;
432
-				q.set_element( i, j, v2 );
433
-				sum += v2;
434
-			}
435
-			// process the last unknown tissue
436
-			p.get_row_label(i, markerId); // get markerId (1-base) of read i
437
-			markerIndex = markerId2Index[markerId];
438
-			likelihood_unknown_class = pow(m[markerIndex],Rm[i]) * pow(1-m[markerIndex],Rl[i]-Rm[i]);
439
-			q_unknown[i] = theta[ncol]*likelihood_unknown_class;
440
-			sum += q_unknown[i];
441
-
442
-			// update q and q_unknown
443
-			for (int j=0; j<ncol; j++) {
444
-				q.get_element(i,j,v2);
445
-				v2 /= sum;
446
-				q.set_element( i, j, v2 );
447
-			}
448
-			q_unknown[i] /= sum;
449
-		}
450
-
451
-		// M-step 1: estimate unknown class's methylation level (m) of each marker
452
-		double sum1=0, sum2=0;
453
-		int curr_marker_index=-1, prev_marker_index=-1; // all marker index is 0-base
454
-		for (int i=0; i<nrow; i++) {
455
-			// for each row (or read)
456
-			p.get_row_label(i, markerId); // get the marker_index (0-base) of read i
457
-			curr_marker_index = markerId2Index[markerId];
458
-			if (curr_marker_index != prev_marker_index) {
459
-				// this is the 1st read of a new marker
460
-				// we need to summarize m value of previous marker
461
-				if (prev_marker_index!=-1) { // current marker is not the 1st marker
462
-					m[prev_marker_index] = (sum2!=0 ? sum1/sum2 : 0);
463
-					sum1 = 0;
464
-					sum2 = 0;
465
-				}
466
-				prev_marker_index = curr_marker_index;
467
-			}
468
-			sum1 += q_unknown[i]*Rm[i];
469
-			sum2 += q_unknown[i]*Rl[i];
470
-		}
471
-		m[curr_marker_index] = (sum2!=0 ? sum1/sum2 : 0); // estimate m of the last marker
472
-
473
-		//cout << "iter=" << iter << ", " << "m=" << endl; // for debug
474
-		//print_vec(cout, m, "\n"); // for debug
475
-		//cout << endl; // for debug
476
-
477
-		// M-step 2: estimate theta, which has ncol+1 values
478
-		double sum;
479
-		for (int j=0; j<ncol; j++) {
480
-			sum=0;
481
-			for (int i=0; i<nrow; i++) {
482
-				q.get_element(i,j,v2);
483
-				sum += v2;
484
-			}
485
-			theta[j] = sum / nrow;
486
-		}
487
-		sum=0;
488
-		for (int i=0; i<nrow; i++)
489
-			sum += q_unknown[i];
490
-		theta[ncol] = sum / nrow;
491
-
492
-		//cout << "round " << iter+1 << ", theta="; // for debug
493
-		//print_vec(cout, theta, ", "); // for debug
494
-		//cout << endl << endl; // for debug
495
-	}
496
-	Rcpp::Rcerr << endl;
497
-	//cout << "final:" << endl << std::flush;
498
-	//print_vec(cout, theta, ", "); // for debug
499
-	//cout << endl << endl << std::flush; // for debug
500
-}
501
-
502
-void readCounts_by_reads_posterior_probability_version_regular(Matrix_Double& q, double unit, Matrix_Double& readCounts)
503
-{
504
-	unsigned int ncol = q.get_column_num(); // T, number of known tissues
505
-	unsigned int nrow = q.get_row_num(); // N, number of reads
506
-
507
-	vector<int> uniq_marker_Ids;
508
-	q.get_unique_row_labels(uniq_marker_Ids);
509
-	// int nMarker = uniq_marker_Ids.size(); // M, number of markers that all N reads cover. Some markers may not be covered by these N reads.
510
-
511
-	double v;
512
-	int curr_markerId=-1, prev_markerId=-1; // all marker index is 0-base
513
-	vector<double> readCountsPerMarker(ncol, 0); // a vector of read counts for one marker (T elements) with default values 0
514
-	for (int i=0; i<nrow; i++) {
515
-		// for each row (or read)
516
-		q.get_row_label(i, curr_markerId); // get the marker ID of read i
517
-		if (curr_markerId != prev_markerId) {
518
-			// this is the 1st read of a new marker
519
-			// we need to summarize read counts of each tissue for the previous marker and deposit them to the matrix "readCounts"
520
-			// then clear readCountsPerMarker for a new read counting of the next new marker
521
-			if (prev_markerId!=-1) { // current marker is not the 1st marker
522
-				multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit.
523
-				readCounts.append_row_vector(readCountsPerMarker, prev_markerId);
524
-				assign_vector_zeros(readCountsPerMarker);
525
-			}
526
-			prev_markerId = curr_markerId;
527
-		}
528
-		for (int j=0; j<ncol; j++) {
529
-			q.get_element(i,j,v);
530
-			readCountsPerMarker[j] += v;
531
-		}
532
-	}
533
-	multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit.
534
-	readCounts.append_row_vector(readCountsPerMarker, curr_markerId);
535
-}
536
-
537
-void readCounts_by_reads_posterior_probability_version_unknownclass(Matrix_Double& q, vector<double>& q_unknown, double unit, Matrix_Double& readCounts)
538
-{
539
-	unsigned int ncol = q.get_column_num(); // T, number of known tissues
540
-	unsigned int nrow = q.get_row_num(); // N, number of reads
541
-	if (nrow!=(unsigned int)q_unknown.size()) {
542
-	  Rcpp::Rcerr << "Error (readCounts_by_reads_posterior_probability_version_unknownclass): row number of q_unknown does not match with row number of q!" << endl;
543
-		// exit(EXIT_FAILURE);
544
-	}
545
-
546
-	vector<int> uniq_marker_Ids;
547
-	q.get_unique_row_labels(uniq_marker_Ids);
548
-	// int nMarker = uniq_marker_Ids.size(); // M, number of markers that all N reads cover. Some markers may not be covered by these N reads.
549
-
550
-	double v;
551
-	int curr_markerId=-1, prev_markerId=-1; // all marker index is 0-base
552
-	vector<double> readCountsPerMarker(ncol+1, 0); // a vector of read counts for one marker (T+1 elements) with default values 0. The last element is for the unknown class.
553
-	for (int i=0; i<nrow; i++) {
554
-		// for each row (or read)
555
-		q.get_row_label(i, curr_markerId); // get the marker ID of read i
556
-		if (curr_markerId != prev_markerId) {
557
-			// this is the 1st read of a new marker
558
-			// we need to summarize read counts of each tissue for the previous marker and deposit them to the matrix "readCounts"
559
-			// then clear readCountsPerMarker for a new read counting of the next new marker
560
-			if (prev_markerId!=-1) { // current marker is not the 1st marker
561
-				multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit.
562
-				readCounts.append_row_vector(readCountsPerMarker, prev_markerId);
563
-				assign_vector_zeros(readCountsPerMarker);
564
-			}
565
-			prev_markerId = curr_markerId;
566
-		}
567
-		for (int j=0; j<ncol; j++) {
568
-			q.get_element(i,j,v);
569
-			readCountsPerMarker[j] += v;
570
-		}
571
-		readCountsPerMarker[ncol] += q_unknown[i];
572
-	}
573
-	multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit.
574
-	readCounts.append_row_vector(readCountsPerMarker, curr_markerId);
575
-}
576
-
1
+// [[Rcpp::depends(BH)]]
2
+#include <Rcpp.h>
3
+using namespace Rcpp;
4
+
5
+#include <cstdlib>
6
+#include <cmath>
7
+#include <sstream>
8
+#include <iostream>
9
+#include <fstream>
10
+#include <map>
11
+#include <vector>
12
+#include <string>
13
+#include <cassert>
14
+#include <algorithm>
15
+#include <boost/algorithm/string.hpp>
16
+#include <boost/assign.hpp>
17
+#include <boost/iostreams/filtering_stream.hpp>
18
+#include "matrix.h"
19
+#include "utils.h"
20
+
21
+using namespace std;
22
+using namespace boost;
23
+
24
+#include "template_utils.cpp"
25
+
26
+Matrix_Double::Matrix_Double(const unsigned int nrow_, const unsigned int ncol_, const double init_value)
27
+{
28
+	nrow = nrow_;
29
+	ncol = ncol_;
30
+	empty = false;
31
+	mat.reserve(nrow_);
32
+	for (int i=0; i<nrow; i++) {
33
+		vector<double> temp;
34
+		temp.reserve(ncol);
35
+		for (int j=0; j<ncol; j++)
36
+			temp.push_back(init_value);
37
+		mat.push_back( temp );
38
+	}
39
+	empty = mat.empty();
40
+}
41
+
42
+// a line represents a row of the matrix. Each element in this row is delimited by a TAB.
43
+void Matrix_Double::process_one_line_of_mat_file(string & line, int num_header_column)
44
+{
45
+	vector<string> items;
46
+	split(items, line, is_any_of("\t"));
47
+	vector<double> v;
48
+	for (int i=0; i<items.size(); i++) {
49
+		if (i==0) row_labels.push_back( atoi(items[0].c_str()) ); // the first column is the label of this row.
50
+		if (i<num_header_column) continue; // skip the first num_header_column header columns
51
+		v.push_back( atof(items[i].c_str()) );
52
+	}
53
+	mat.push_back( v );
54
+}
55
+
56
+Matrix_Double::Matrix_Double(const string & mat_file,
57
+	bool header_line=false, int num_header_column=1)
58
+{
59
+	unsigned long i_line=0;
60
+	// input is a plain text file
61
+	istream * in=&cin; // default is stdin
62
+	ifstream fin;
63
+	if ( !(mat_file.compare("stdin")==0)) {
64
+		fin.open(mat_file.c_str());
65
+		if (fin.fail()){
66
+		  Rcpp::Rcerr << "Error: Unable to open " << mat_file << " in Matrix_Double()" << endl;
67
+			// exit(EXIT_FAILURE);
68
+		}
69
+		in = &fin;
70
+	}
71
+	string line;
72
+	while (!(*in).eof()) {
73
+		getline((*in), line);
74
+		//cerr << "Line: " << line << endl;
75
+		if (header_line) {
76
+			if (i_line==0) {
77
+				// skip the header line
78
+				i_line++;
79
+				continue;
80
+			}
81
+		}
82
+		if (line.empty()) {
83
+			// this is the last line of the file
84
+			break;
85
+		}
86
+		process_one_line_of_mat_file(line, num_header_column);
87
+		i_line++;
88
+	}
89
+	if ( !(mat_file.compare("stdin")==0)) {
90
+		fin.close();
91
+	}
92
+	empty = mat.empty();
93
+	nrow = mat.size();
94
+	if (nrow>=1) ncol = mat[0].size();
95
+	else ncol = 0;
96
+}
97
+
98
+// return the row index of this vector
99
+long Matrix_Double::append_row_vector(vector<double>& v_, int row_label_) {
100
+	if (v_.size()!=ncol) {
101
+	  Rcpp::Rcerr << "Error of Matrix_Double::append_row_vector: the appended vector size (" << v_.size() << ") does not match the number of column in the matrix (" << ncol << ")!\nExit." << endl;
102
+		// exit(EXIT_FAILURE);
103
+	}
104
+	vector<double> v;
105
+	for (int i=0; i<v_.size(); i++) {
106
+		v.push_back( v_[i] );
107
+	}
108
+	mat.push_back( v );
109
+	nrow = mat.size();
110
+	row_labels.push_back(row_label_);
111
+	empty=false;
112
+	return(nrow-1);
113
+}
114
+
115
+//
116
+// if max(v_)/min(v_) >= min_threshold_maxv_minv, then we append this vector, otherwise ignore it.
117
+//
118
+// return row index of this appended vector, if successfully appending; return -1 otherwise.
119
+//
120
+long Matrix_Double::append_row_vector_with_filter(vector<double>& v_, int row_label_, double min_threshold_maxv_minv) {
121
+	if (v_.size()!=ncol) {
122
+	  Rcpp::Rcerr << "Error of Matrix_Double::append_row_vector_with_filter: the appended vector size (" << v_.size() << ") does not match the number of column in the matrix (" << ncol << ")!\nExit." << endl;
123
+		// exit(EXIT_FAILURE);
124
+	}
125
+	double min_ = *min_element(v_.begin(), v_.end());
126
+	double max_ = *max_element(v_.begin(), v_.end());
127
+	//cout << "max: " << max_ << ", min: " << min_ << ", ratio: " << max_/min_ << endl;
128
+	if (min_==0) {
129
+		if (max_==0) return(-1);
130
+	} else {
131
+		double ratio=max_/min_;
132
+		if (ratio<min_threshold_maxv_minv) return(-1);
133
+	}
134
+	long row_index = append_row_vector(v_, row_label_);
135
+	return(row_index);
136
+}
137
+
138
+bool Matrix_Double::get_element(const int i, const int j, double & v)
139
+{
140
+	if (!empty) {
141
+		if (i<nrow && j<ncol) {
142
+			v = mat[i][j];
143
+			return true;
144
+		} else {
145
+		  Rcpp::Rcerr << "Warning: i=" << i << " and j=" << j << " exceed matrix size!" << endl;
146
+			return false;
147
+		}
148
+	} else {
149
+		v = 0;
150
+		return false;
151
+	}
152
+}
153
+
154
+bool Matrix_Double::set_element(const int i, const int j, double v)
155
+{
156
+	if (!empty) {
157
+		if (i<nrow && j<ncol) {
158
+			mat[i][j] = v;
159
+			return true;
160
+		} else {
161
+		  Rcpp::Rcerr << "Warning: i=" << i << " and j=" << j << " exceed matrix size!" << endl;
162
+			return false;
163
+		}
164
+	} else {
165
+		return false;
166
+	}
167
+}
168
+
169
+bool Matrix_Double::get_column_sum(const int j, double & v)
170
+{
171
+	if (!empty) {
172
+		if (j<ncol) {
173
+			v = 0;
174
+			for (int k=0; k<nrow; k++) {
175
+				v += mat[k][j];
176
+			}
177
+			return true;
178
+		} else {
179
+		  Rcpp::Rcerr << "Warning: j=" << j << " exceeds matrix column size (" << ncol << ")!" << endl;
180
+			return false;
181
+		}
182
+	} else {
183
+		v = 0;
184
+		return false;
185
+	}
186
+}
187
+
188
+bool Matrix_Double::get_row_sum(const int i, double & v)
189
+{
190
+	if (!empty) {
191
+		if (i<nrow) {
192
+			v = 0;
193
+			for (int k=0; k<ncol; k++) {
194
+				v += mat[i][k];
195
+			}
196
+			return true;
197
+		} else {
198
+		  Rcpp::Rcerr << "Warning: i=" << i << " exceeds matrix row size (" << nrow << ")!" << endl;
199
+			return false;
200
+		}
201
+	} else {
202
+		v = 0;
203
+		return false;
204
+	}
205
+}
206
+
207
+void Matrix_Double::get_unique_row_labels(vector<int> & uniq_labels)
208
+{
209
+	map<int, int> counts;
210
+	for (int i=0; i<row_labels.size(); i++)
211
+		if (counts.find(row_labels[i]) != counts.end())
212
+			counts[row_labels[i]] += 1; // found this label, then increment count
213
+		else {
214
+			counts.insert(make_pair(row_labels[i],1)); // not found this label, then count it once
215
+			//counts[row_labels[i]] = 1;
216
+		}
217
+	map<int,int>::iterator it;
218
+	for (it=counts.begin(); it!=counts.end(); ++it)
219
+		uniq_labels.push_back(it->first);
220
+}
221
+
222
+void Matrix_Double::print_with_additional_column_of_Bins2Value(ostream & os, Bins2Value& additional_column_data)
223
+{
224
+	// obtain unique row labels
225
+	vector<int> unique_row_labels;
226
+	int row_label_;
227
+	for (int i=0; i<nrow; i++) {
228
+		row_label_ = row_labels[i];
229
+		unique_row_labels.push_back( row_label_ );
230
+	}
231
+	Bins2Value::iterator iter;
232
+	for (iter=additional_column_data.begin(); iter!=additional_column_data.end(); iter++) {
233
+		row_label_ = iter->first;
234
+		if (!exist_row_label(row_label_)) {
235
+			unique_row_labels.push_back( row_label_ );
236
+		}
237
+	}
238
+	sort(unique_row_labels.begin(), unique_row_labels.end());
239
+	// print both 'mat' and 'additional_column_data' as the last column
240
+	for (int i=0; i<unique_row_labels.size(); i++) {
241
+		row_label_ = unique_row_labels[i];
242
+		os << row_label_;
243
+		int row_index = get_row_index(row_label_);
244
+		if (row_index!=-1) {
245
+			for (int j=0; j<ncol; j++) {
246
+				os << "\t" << mat[row_index][j];
247
+			}
248
+		} else {
249
+			for (int j=0; j<ncol; j++) {
250
+				os << "\t0";
251
+			}
252
+		}
253
+		if (additional_column_data.find(row_label_)!=additional_column_data.end()) {
254
+			os << "\t" << additional_column_data[row_label_];
255
+		} else {
256
+			os << "\t0";
257
+		}
258
+		os << endl;
259
+	}
260
+}
261
+
262
+ostream & operator<<(ostream & os, Matrix_Double & mat)
263
+{
264
+	double v;
265
+	int row_label;
266
+	if (!mat.isempty()) {
267
+		for (int i=0; i<mat.get_row_num(); i++) {
268
+			int j;
269
+			mat.get_row_label(i, row_label);
270
+			os << row_label << "\t";
271
+			for (j=0; j<mat.get_column_num()-1; j++) {
272
+				mat.get_element(i,j,v);
273
+				os << v << "\t";
274
+			}
275
+			mat.get_element(i,j,v);
276
+			os << v << endl;
277
+		}
278
+	}
279
+	return os;
280
+}
281
+
282
+double objective_em_supervise(Matrix_Double & p, vector<double> & theta)
283
+{
284
+	unsigned int ncol = p.get_column_num();
285
+	unsigned int nrow = p.get_row_num();
286
+	double v;
287
+	double obj = 0;
288
+	for (int i=0; i<nrow; i++) {
289
+		double sum = 0;
290
+		for (int j=0; j<ncol; j++) {
291
+			p.get_element(i,j,v);
292
+			sum += theta[j]*v;
293
+		}
294
+		obj += log(sum);
295
+	}
296
+	return obj;
297
+}
298
+//
299
+// There are T known tissues and 1 unknown tissue (described by a double vector "m")
300
+//
301
+// input:
302
+//   p is a matrix of N X T, where N is number of reads and T is number of known tissue
303
+//
304
+// output:
305
+//   theta (model parameters), a vector with "T" elements.
306
+//   q is a matrix of N X T, the tissue-specific posterior probabilty of each read
307
+//
308
+void em_supervise(Matrix_Double & p, int max_iter, vector<double> & theta, Matrix_Double& q)
309
+{
310
+	// cout.precision(15);
311
+	// cerr.precision(15);
312
+	unsigned int ncol = p.get_column_num();
313
+	unsigned int nrow = p.get_row_num(); // nrow is number of tissues.
314
+	theta.resize(ncol, 0); // assign (num_tissues) space and initialize to 0s.
315
+	// initialize model parameters as uniform distribution
316
+	// alternatively, model parameters can be random numbers by satisfying the crition
317
+	//    (1) \sum_{i=1}^{ncol}{theta_i}=1
318
+	for (int j=0; j<ncol; j++) {
319
+		theta[j] = 1/(double)ncol;
320
+	}
321
+	// create and initialize q with the same size of p and with all elements initialized as 0
322
+	//Matrix_Double q(nrow, ncol, 0);
323
+	//cerr << "iter 0\t" << objective_em_supervise(p, theta) << endl;
324
+	double v1, v2;
325
+	for (int iter=0; iter<max_iter; iter++) {
326
+	  // Rcpp::Rcerr << iter+1 << "," ;
327
+		// E-step: estimate q
328
+		for (int i=0; i<nrow; i++) {
329
+			double sum = 0;
330
+			for (int j=0; j<ncol; j++) {
331
+				p.get_element(i,j,v1);
332
+				v2 = theta[j]*v1;
333
+				q.set_element( i, j, v2 );
334
+				sum += v2;
335
+			}
336
+			for (int j=0; j<ncol; j++) {
337
+				q.get_element(i,j,v2);
338
+				v2 /= sum;
339
+				q.set_element( i, j, v2 );
340
+			}
341
+		}
342
+		// M-step: estimate theta
343
+		for (int j=0; j<ncol; j++) {
344
+			double sum=0;
345
+			for (int i=0; i<nrow; i++) {
346
+				q.get_element(i,j,v2);
347
+				sum += v2;
348
+			}
349
+			theta[j] = sum / nrow;
350
+		}
351
+		// for debug
352
+		//cerr << "iter " << (iter+1) << "\t" << objective_em_supervise(p, theta) << endl;
353
+	}
354
+	// last E-step: estimate q
355
+	for (int i=0; i<nrow; i++) {
356
+		double sum = 0;
357
+		for (int j=0; j<ncol; j++) {
358
+			p.get_element(i,j,v1);
359
+			v2 = theta[j]*v1;
360
+			q.set_element( i, j, v2 );
361
+			sum += v2;
362
+		}
363
+		for (int j=0; j<ncol; j++) {
364
+			q.get_element(i,j,v2);
365
+			v2 /= sum;
366
+			q.set_element( i, j, v2 );
367
+		}
368
+	}
369
+	Rcpp::Rcerr << endl;
370
+}
371
+
372
+//
373
+// There are T known tissues and 1 unknown tissue (described by a double vector "m")
374
+//
375
+// input:
376
+//   p is a matrix of N X T, where N is number of reads and T is number of known tissue
377
+//     p.row_label is an int vector of N X 1, each element is the marker Id (1-base) of a read.
378
+//   Rm is a vector of N X 1, each element is number of valid methylated CpG sites in a read
379
+//   Rl is a vector of N X 1, each element is number of all valid CpG sites in a read
380
+//
381
+// output:
382
+//   theta is a vector of (T+1) X 1, where T is number of known tissue. This vector is already allocated with space of (ncol+1) units.
383
+//   m is a vector of M X 1, where M is number of markers.
384
+//   q is a matrix of N X T, the tissue-specific posterior probabilty of each read
385
+//   q_unknown is a vector of N X 1, the posterior probabilty of each read for the unknown class. It does not need to be allocated space before this function calling.
386
+//
387
+void em_semisupervise(Matrix_Double & p, vector<int> & Rm, vector<int> & Rl,
388
+	int max_iter, vector<double> & theta, Matrix_Double& q, vector<double>& q_unknown,
389
+	vector<double> & m)
390
+{
391
+	// cout.precision(15);
392
+  // cerr.precision(15);
393
+	unsigned int ncol = p.get_column_num(); // T, number of known tissues
394
+	unsigned int nrow = p.get_row_num(); // N, number of reads
395
+
396
+	theta.resize(ncol+1, 0); // assign (num_tissues+1) space and initialize to 0s.
397
+
398
+	vector<int> uniq_marker_Ids;
399
+	p.get_unique_row_labels(uniq_marker_Ids);
400
+	int nMarker = uniq_marker_Ids.size(); // M, number of markers that all N reads cover. Some markers may not be covered by these N reads.
401
+	m.resize(nMarker, 0); // assign nMarker space and initialize to 0s.
402
+	map<int,int> markerId2Index; // marker index is 0-based.
403
+	for (int index=0; index<nMarker; index++) {
404
+		markerId2Index.insert(make_pair(uniq_marker_Ids[index],index));
405
+	}
406
+
407
+	// initialize model parameters theta as uniform distribution, and m as 0.5
408
+	// alternatively, theta and m can be random numbers, by satisfying:
409
+	//    (1) \sum_{i=1}^{ncol+1}{theta_i}=1
410
+	//    (2) 0 <= m_k <=1 for all k=1,2,...,#marker_covered_by_all_reads
411
+	for (int j=0; j<ncol+1; j++) {
412
+		theta[j] = 1/(double)(ncol+1);
413
+	}
414
+	for (int k=0; k<nMarker; k++) {
415
+		m[k] = 0.5;
416
+	}
417
+	for (int i=0; i<nrow; i++)
418
+		q_unknown.push_back(0);
419
+
420
+	// EM algorithm
421
+	for (int iter=0; iter<max_iter; iter++) {
422
+	  Rcpp::Rcerr << iter+1 << "," ;
423
+		// E-step: estimate q (for T known tissues) and q_unknown (for one unknown tissue)
424
+		double v1, v2, likelihood_unknown_class;
425
+		int markerId, markerIndex;
426
+		for (int i=0; i<nrow; i++) {
427
+			// process the first T known tissues
428
+			double sum = 0;
429
+			for (int j=0; j<ncol; j++) {
430
+				p.get_element(i,j,v1);
431
+				v2 = theta[j]*v1;
432
+				q.set_element( i, j, v2 );
433
+				sum += v2;
434
+			}
435
+			// process the last unknown tissue
436
+			p.get_row_label(i, markerId); // get markerId (1-base) of read i
437
+			markerIndex = markerId2Index[markerId];
438
+			likelihood_unknown_class = pow(m[markerIndex],Rm[i]) * pow(1-m[markerIndex],Rl[i]-Rm[i]);
439
+			q_unknown[i] = theta[ncol]*likelihood_unknown_class;
440
+			sum += q_unknown[i];
441
+
442
+			// update q and q_unknown
443
+			for (int j=0; j<ncol; j++) {
444
+				q.get_element(i,j,v2);
445
+				v2 /= sum;
446
+				q.set_element( i, j, v2 );
447
+			}
448
+			q_unknown[i] /= sum;
449
+		}
450
+
451
+		// M-step 1: estimate unknown class's methylation level (m) of each marker
452
+		double sum1=0, sum2=0;
453
+		int curr_marker_index=-1, prev_marker_index=-1; // all marker index is 0-base
454
+		for (int i=0; i<nrow; i++) {
455
+			// for each row (or read)
456
+			p.get_row_label(i, markerId); // get the marker_index (0-base) of read i
457
+			curr_marker_index = markerId2Index[markerId];
458
+			if (curr_marker_index != prev_marker_index) {
459
+				// this is the 1st read of a new marker
460
+				// we need to summarize m value of previous marker
461
+				if (prev_marker_index!=-1) { // current marker is not the 1st marker
462
+					m[prev_marker_index] = (sum2!=0 ? sum1/sum2 : 0);
463
+					sum1 = 0;
464
+					sum2 = 0;
465
+				}
466
+				prev_marker_index = curr_marker_index;
467
+			}
468
+			sum1 += q_unknown[i]*Rm[i];
469
+			sum2 += q_unknown[i]*Rl[i];
470
+		}
471
+		m[curr_marker_index] = (sum2!=0 ? sum1/sum2 : 0); // estimate m of the last marker
472
+
473
+		//cout << "iter=" << iter << ", " << "m=" << endl; // for debug
474
+		//print_vec(cout, m, "\n"); // for debug
475
+		//cout << endl; // for debug
476
+
477
+		// M-step 2: estimate theta, which has ncol+1 values
478
+		double sum;
479
+		for (int j=0; j<ncol; j++) {
480
+			sum=0;
481
+			for (int i=0; i<nrow; i++) {
482
+				q.get_element(i,j,v2);
483
+				sum += v2;
484
+			}
485
+			theta[j] = sum / nrow;
486
+		}
487
+		sum=0;
488
+		for (int i=0; i<nrow; i++)
489
+			sum += q_unknown[i];
490
+		theta[ncol] = sum / nrow;
491
+
492
+		//cout << "round " << iter+1 << ", theta="; // for debug
493
+		//print_vec(cout, theta, ", "); // for debug
494
+		//cout << endl << endl; // for debug
495
+	}
496
+	Rcpp::Rcerr << endl;
497
+	//cout << "final:" << endl << std::flush;
498
+	//print_vec(cout, theta, ", "); // for debug
499
+	//cout << endl << endl << std::flush; // for debug
500
+}
501
+
502
+void readCounts_by_reads_posterior_probability_version_regular(Matrix_Double& q, double unit, Matrix_Double& readCounts)
503
+{
504
+	unsigned int ncol = q.get_column_num(); // T, number of known tissues
505
+	unsigned int nrow = q.get_row_num(); // N, number of reads
506
+
507
+	vector<int> uniq_marker_Ids;
508
+	q.get_unique_row_labels(uniq_marker_Ids);
509
+	// int nMarker = uniq_marker_Ids.size(); // M, number of markers that all N reads cover. Some markers may not be covered by these N reads.
510
+
511
+	double v;
512
+	int curr_markerId=-1, prev_markerId=-1; // all marker index is 0-base
513
+	vector<double> readCountsPerMarker(ncol, 0); // a vector of read counts for one marker (T elements) with default values 0
514
+	for (int i=0; i<nrow; i++) {
515
+		// for each row (or read)
516
+		q.get_row_label(i, curr_markerId); // get the marker ID of read i
517
+		if (curr_markerId != prev_markerId) {
518
+			// this is the 1st read of a new marker
519
+			// we need to summarize read counts of each tissue for the previous marker and deposit them to the matrix "readCounts"
520
+			// then clear readCountsPerMarker for a new read counting of the next new marker
521
+			if (prev_markerId!=-1) { // current marker is not the 1st marker
522
+				multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit.
523
+				readCounts.append_row_vector(readCountsPerMarker, prev_markerId);
524
+				assign_vector_zeros(readCountsPerMarker);
525
+			}
526
+			prev_markerId = curr_markerId;
527
+		}
528
+		for (int j=0; j<ncol; j++) {
529
+			q.get_element(i,j,v);
530
+			readCountsPerMarker[j] += v;
531
+		}
532
+	}
533
+	multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit.
534
+	readCounts.append_row_vector(readCountsPerMarker, curr_markerId);
535
+}
536
+
537
+void readCounts_by_reads_posterior_probability_version_unknownclass(Matrix_Double& q, vector<double>& q_unknown, double unit, Matrix_Double& readCounts)
538
+{
539
+	unsigned int ncol = q.get_column_num(); // T, number of known tissues
540
+	unsigned int nrow = q.get_row_num(); // N, number of reads
541
+	if (nrow!=(unsigned int)q_unknown.size()) {
542
+	  Rcpp::Rcerr << "Error (readCounts_by_reads_posterior_probability_version_unknownclass): row number of q_unknown does not match with row number of q!" << endl;
543
+		// exit(EXIT_FAILURE);
544
+	}
545
+
546
+	vector<int> uniq_marker_Ids;
547
+	q.get_unique_row_labels(uniq_marker_Ids);
548
+	// int nMarker = uniq_marker_Ids.size(); // M, number of markers that all N reads cover. Some markers may not be covered by these N reads.
549
+
550
+	double v;
551
+	int curr_markerId=-1, prev_markerId=-1; // all marker index is 0-base
552
+	vector<double> readCountsPerMarker(ncol+1, 0); // a vector of read counts for one marker (T+1 elements) with default values 0. The last element is for the unknown class.
553
+	for (int i=0; i<nrow; i++) {
554
+		// for each row (or read)
555
+		q.get_row_label(i, curr_markerId); // get the marker ID of read i
556
+		if (curr_markerId != prev_markerId) {
557
+			// this is the 1st read of a new marker
558
+			// we need to summarize read counts of each tissue for the previous marker and deposit them to the matrix "readCounts"
559
+			// then clear readCountsPerMarker for a new read counting of the next new marker
560
+			if (prev_markerId!=-1) { // current marker is not the 1st marker
561
+				multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit.
562
+				readCounts.append_row_vector(readCountsPerMarker, prev_markerId);
563
+				assign_vector_zeros(readCountsPerMarker);
564
+			}
565
+			prev_markerId = curr_markerId;
566
+		}
567
+		for (int j=0; j<ncol; j++) {
568
+			q.get_element(i,j,v);
569
+			readCountsPerMarker[j] += v;
570
+		}
571
+		readCountsPerMarker[ncol] += q_unknown[i];
572
+	}
573
+	multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit.
574
+	readCounts.append_row_vector(readCountsPerMarker, curr_markerId);
575
+}
576
+
Browse code

win version

RoseHuRan authored on 29/03/2022 16:48:36
Showing 1 changed files
... ...
@@ -1,575 +1,576 @@
1
-#include <Rcpp.h>
2
-using namespace Rcpp;
3
-
4
-#include <cstdlib>
5
-#include <cmath>
6
-#include <sstream>
7
-#include <iostream>
8
-#include <fstream>
9
-#include <map>
10
-#include <vector>
11
-#include <string>
12
-#include <cassert>
13
-#include <algorithm>
14
-#include <boost/algorithm/string.hpp>
15
-#include <boost/assign.hpp>
16
-#include <boost/iostreams/filtering_stream.hpp>
17
-#include "matrix.h"
18
-#include "utils.h"
19
-
20
-using namespace std;
21
-using namespace boost;
22
-
23
-#include "template_utils.cpp"
24
-
25
-Matrix_Double::Matrix_Double(const unsigned int nrow_, const unsigned int ncol_, const double init_value)
26
-{
27
-	nrow = nrow_;
28
-	ncol = ncol_;
29
-	empty = false;
30
-	mat.reserve(nrow_);
31
-	for (int i=0; i<nrow; i++) {
32
-		vector<double> temp;
33
-		temp.reserve(ncol);
34
-		for (int j=0; j<ncol; j++)
35
-			temp.push_back(init_value);
36
-		mat.push_back( temp );
37
-	}
38
-	empty = mat.empty();
39
-}
40
-
41
-// a line represents a row of the matrix. Each element in this row is delimited by a TAB.
42
-void Matrix_Double::process_one_line_of_mat_file(string & line, int num_header_column)
43
-{
44
-	vector<string> items;
45
-	split(items, line, is_any_of("\t"));
46
-	vector<double> v;
47
-	for (int i=0; i<items.size(); i++) {
48
-		if (i==0) row_labels.push_back( atoi(items[0].c_str()) ); // the first column is the label of this row.
49
-		if (i<num_header_column) continue; // skip the first num_header_column header columns
50
-		v.push_back( atof(items[i].c_str()) );
51
-	}
52
-	mat.push_back( v );
53
-}
54
-
55
-Matrix_Double::Matrix_Double(const string & mat_file,
56
-	bool header_line=false, int num_header_column=1)
57
-{
58
-	unsigned long i_line=0;
59
-	// input is a plain text file
60
-	istream * in=&cin; // default is stdin
61
-	ifstream fin;
62
-	if ( !(mat_file.compare("stdin")==0)) {
63
-		fin.open(mat_file.c_str());
64
-		if (fin.fail()){
65
-		  Rcpp::Rcerr << "Error: Unable to open " << mat_file << " in Matrix_Double()" << endl;
66
-			// exit(EXIT_FAILURE);
67
-		}
68
-		in = &fin;
69
-	}
70
-	string line;
71
-	while (!(*in).eof()) {
72
-		getline((*in), line);
73
-		//cerr << "Line: " << line << endl;
74
-		if (header_line) {
75
-			if (i_line==0) {
76
-				// skip the header line
77
-				i_line++;
78
-				continue;
79
-			}
80
-		}
81
-		if (line.empty()) {
82
-			// this is the last line of the file
83
-			break;
84
-		}
85
-		process_one_line_of_mat_file(line, num_header_column);
86
-		i_line++;
87
-	}
88
-	if ( !(mat_file.compare("stdin")==0)) {
89
-		fin.close();
90
-	}
91
-	empty = mat.empty();
92
-	nrow = mat.size();
93
-	if (nrow>=1) ncol = mat[0].size();
94
-	else ncol = 0;
95
-}
96
-
97
-// return the row index of this vector
98
-long Matrix_Double::append_row_vector(vector<double>& v_, int row_label_) {
99
-	if (v_.size()!=ncol) {
100
-	  Rcpp::Rcerr << "Error of Matrix_Double::append_row_vector: the appended vector size (" << v_.size() << ") does not match the number of column in the matrix (" << ncol << ")!\nExit." << endl;
101
-		// exit(EXIT_FAILURE);
102
-	}
103
-	vector<double> v;
104
-	for (int i=0; i<v_.size(); i++) {
105
-		v.push_back( v_[i] );
106
-	}
107
-	mat.push_back( v );
108
-	nrow = mat.size();
109
-	row_labels.push_back(row_label_);
110
-	empty=false;
111
-	return(nrow-1);
112
-}
113
-
114
-//
115
-// if max(v_)/min(v_) >= min_threshold_maxv_minv, then we append this vector, otherwise ignore it.
116
-//
117
-// return row index of this appended vector, if successfully appending; return -1 otherwise.
118
-//
119
-long Matrix_Double::append_row_vector_with_filter(vector<double>& v_, int row_label_, double min_threshold_maxv_minv) {
120
-	if (v_.size()!=ncol) {
121
-	  Rcpp::Rcerr << "Error of Matrix_Double::append_row_vector_with_filter: the appended vector size (" << v_.size() << ") does not match the number of column in the matrix (" << ncol << ")!\nExit." << endl;
122
-		// exit(EXIT_FAILURE);
123
-	}
124
-	double min_ = *min_element(v_.begin(), v_.end());
125
-	double max_ = *max_element(v_.begin(), v_.end());
126
-	//cout << "max: " << max_ << ", min: " << min_ << ", ratio: " << max_/min_ << endl;
127
-	if (min_==0) {
128
-		if (max_==0) return(-1);
129
-	} else {
130
-		double ratio=max_/min_;
131
-		if (ratio<min_threshold_maxv_minv) return(-1);
132
-	}
133
-	long row_index = append_row_vector(v_, row_label_);
134
-	return(row_index);
135
-}
136
-
137
-bool Matrix_Double::get_element(const int i, const int j, double & v)
138
-{
139
-	if (!empty) {
140
-		if (i<nrow && j<ncol) {
141
-			v = mat[i][j];
142
-			return true;
143
-		} else {
144
-		  Rcpp::Rcerr << "Warning: i=" << i << " and j=" << j << " exceed matrix size!" << endl;
145
-			return false;
146
-		}
147
-	} else {
148
-		v = 0;
149
-		return false;
150
-	}
151
-}
152
-
153
-bool Matrix_Double::set_element(const int i, const int j, double v)
154
-{
155
-	if (!empty) {
156
-		if (i<nrow && j<ncol) {
157
-			mat[i][j] = v;
158
-			return true;
159
-		} else {
160
-		  Rcpp::Rcerr << "Warning: i=" << i << " and j=" << j << " exceed matrix size!" << endl;
161
-			return false;
162
-		}
163
-	} else {
164
-		return false;
165
-	}
166
-}
167
-
168
-bool Matrix_Double::get_column_sum(const int j, double & v)
169
-{
170
-	if (!empty) {
171
-		if (j<ncol) {
172
-			v = 0;
173
-			for (int k=0; k<nrow; k++) {
174
-				v += mat[k][j];
175
-			}
176
-			return true;
177
-		} else {
178
-		  Rcpp::Rcerr << "Warning: j=" << j << " exceeds matrix column size (" << ncol << ")!" << endl;
179
-			return false;
180
-		}
181
-	} else {
182
-		v = 0;
183
-		return false;
184
-	}
185
-}
186
-
187
-bool Matrix_Double::get_row_sum(const int i, double & v)
188
-{
189
-	if (!empty) {
190
-		if (i<nrow) {
191
-			v = 0;
192
-			for (int k=0; k<ncol; k++) {
193
-				v += mat[i][k];
194
-			}
195
-			return true;
196
-		} else {
197
-		  Rcpp::Rcerr << "Warning: i=" << i << " exceeds matrix row size (" << nrow << ")!" << endl;
198
-			return false;
199
-		}
200
-	} else {
201
-		v = 0;
202
-		return false;
203
-	}
204
-}
205
-
206
-void Matrix_Double::get_unique_row_labels(vector<int> & uniq_labels)
207
-{
208
-	map<int, int> counts;
209
-	for (int i=0; i<row_labels.size(); i++)
210
-		if (counts.find(row_labels[i]) != counts.end())
211
-			counts[row_labels[i]] += 1; // found this label, then increment count
212
-		else {
213
-			counts.insert(make_pair(row_labels[i],1)); // not found this label, then count it once
214
-			//counts[row_labels[i]] = 1;
215
-		}
216
-	map<int,int>::iterator it;
217
-	for (it=counts.begin(); it!=counts.end(); ++it)
218
-		uniq_labels.push_back(it->first);
219
-}
220
-
221
-void Matrix_Double::print_with_additional_column_of_Bins2Value(ostream & os, Bins2Value& additional_column_data)
222
-{
223
-	// obtain unique row labels
224
-	vector<int> unique_row_labels;
225
-	int row_label_;
226
-	for (int i=0; i<nrow; i++) {
227
-		row_label_ = row_labels[i];
228
-		unique_row_labels.push_back( row_label_ );
229
-	}
230
-	Bins2Value::iterator iter;
231
-	for (iter=additional_column_data.begin(); iter!=additional_column_data.end(); iter++) {
232
-		row_label_ = iter->first;
233
-		if (!exist_row_label(row_label_)) {
234
-			unique_row_labels.push_back( row_label_ );
235
-		}
236
-	}
237
-	sort(unique_row_labels.begin(), unique_row_labels.end());
238
-	// print both 'mat' and 'additional_column_data' as the last column
239
-	for (int i=0; i<unique_row_labels.size(); i++) {
240
-		row_label_ = unique_row_labels[i];
241
-		os << row_label_;
242
-		int row_index = get_row_index(row_label_);
243
-		if (row_index!=-1) {
244
-			for (int j=0; j<ncol; j++) {
245
-				os << "\t" << mat[row_index][j];
246
-			}
247
-		} else {
248
-			for (int j=0; j<ncol; j++) {
249
-				os << "\t0";
250
-			}
251
-		}
252
-		if (additional_column_data.find(row_label_)!=additional_column_data.end()) {
253
-			os << "\t" << additional_column_data[row_label_];
254
-		} else {
255
-			os << "\t0";
256
-		}
257
-		os << endl;
258
-	}
259
-}
260
-
261
-ostream & operator<<(ostream & os, Matrix_Double & mat)
262
-{
263
-	double v;
264
-	int row_label;
265
-	if (!mat.isempty()) {
266
-		for (int i=0; i<mat.get_row_num(); i++) {
267
-			int j;
268
-			mat.get_row_label(i, row_label);
269
-			os << row_label << "\t";
270
-			for (j=0; j<mat.get_column_num()-1; j++) {
271
-				mat.get_element(i,j,v);
272
-				os << v << "\t";
273
-			}
274
-			mat.get_element(i,j,v);
275
-			os << v << endl;
276
-		}
277
-	}
278
-	return os;
279
-}
280
-
281
-double objective_em_supervise(Matrix_Double & p, vector<double> & theta)
282
-{
283
-	unsigned int ncol = p.get_column_num();
284
-	unsigned int nrow = p.get_row_num();
285
-	double v;
286
-	double obj = 0;
287
-	for (int i=0; i<nrow; i++) {
288
-		double sum = 0;
289
-		for (int j=0; j<ncol; j++) {
290
-			p.get_element(i,j,v);
291
-			sum += theta[j]*v;
292
-		}
293
-		obj += log(sum);
294
-	}
295
-	return obj;
296
-}
297
-//
298
-// There are T known tissues and 1 unknown tissue (described by a double vector "m")
299
-//
300
-// input:
301
-//   p is a matrix of N X T, where N is number of reads and T is number of known tissue
302
-//
303
-// output:
304
-//   theta (model parameters), a vector with "T" elements.
305
-//   q is a matrix of N X T, the tissue-specific posterior probabilty of each read
306
-//
307
-void em_supervise(Matrix_Double & p, int max_iter, vector<double> & theta, Matrix_Double& q)
308
-{
309
-	// cout.precision(15);
310
-	// cerr.precision(15);
311
-	unsigned int ncol = p.get_column_num();
312
-	unsigned int nrow = p.get_row_num(); // nrow is number of tissues.
313
-	theta.resize(ncol, 0); // assign (num_tissues) space and initialize to 0s.
314
-	// initialize model parameters as uniform distribution
315
-	// alternatively, model parameters can be random numbers by satisfying the crition
316
-	//    (1) \sum_{i=1}^{ncol}{theta_i}=1
317
-	for (int j=0; j<ncol; j++) {
318
-		theta[j] = 1/(double)ncol;
319
-	}
320
-	// create and initialize q with the same size of p and with all elements initialized as 0
321
-	//Matrix_Double q(nrow, ncol, 0);
322
-	//cerr << "iter 0\t" << objective_em_supervise(p, theta) << endl;
323
-	double v1, v2;
324
-	for (int iter=0; iter<max_iter; iter++) {
325
-	  Rcpp::Rcerr << iter+1 << "," ;
326
-		// E-step: estimate q
327
-		for (int i=0; i<nrow; i++) {
328
-			double sum = 0;
329
-			for (int j=0; j<ncol; j++) {
330
-				p.get_element(i,j,v1);
331
-				v2 = theta[j]*v1;
332
-				q.set_element( i, j, v2 );
333
-				sum += v2;
334
-			}
335
-			for (int j=0; j<ncol; j++) {
336
-				q.get_element(i,j,v2);
337
-				v2 /= sum;
338
-				q.set_element( i, j, v2 );
339
-			}
340
-		}
341
-		// M-step: estimate theta
342
-		for (int j=0; j<ncol; j++) {
343
-			double sum=0;
344
-			for (int i=0; i<nrow; i++) {
345
-				q.get_element(i,j,v2);
346
-				sum += v2;
347
-			}
348
-			theta[j] = sum / nrow;
349
-		}
350
-		// for debug
351
-		//cerr << "iter " << (iter+1) << "\t" << objective_em_supervise(p, theta) << endl;
352
-	}
353
-	// last E-step: estimate q
354
-	for (int i=0; i<nrow; i++) {
355
-		double sum = 0;
356
-		for (int j=0; j<ncol; j++) {
357
-			p.get_element(i,j,v1);
358
-			v2 = theta[j]*v1;
359
-			q.set_element( i, j, v2 );
360
-			sum += v2;
361
-		}
362
-		for (int j=0; j<ncol; j++) {
363
-			q.get_element(i,j,v2);
364
-			v2 /= sum;
365
-			q.set_element( i, j, v2 );
366
-		}
367
-	}
368
-	Rcpp::Rcerr << endl;
369
-}
370
-
371
-//
372
-// There are T known tissues and 1 unknown tissue (described by a double vector "m")
373
-//
374
-// input:
375
-//   p is a matrix of N X T, where N is number of reads and T is number of known tissue
376
-//     p.row_label is an int vector of N X 1, each element is the marker Id (1-base) of a read.
377
-//   Rm is a vector of N X 1, each element is number of valid methylated CpG sites in a read
378
-//   Rl is a vector of N X 1, each element is number of all valid CpG sites in a read
379
-//
380
-// output:
381
-//   theta is a vector of (T+1) X 1, where T is number of known tissue. This vector is already allocated with space of (ncol+1) units.
382
-//   m is a vector of M X 1, where M is number of markers.
383
-//   q is a matrix of N X T, the tissue-specific posterior probabilty of each read
384
-//   q_unknown is a vector of N X 1, the posterior probabilty of each read for the unknown class. It does not need to be allocated space before this function calling.
385
-//
386
-void em_semisupervise(Matrix_Double & p, vector<int> & Rm, vector<int> & Rl,
387
-	int max_iter, vector<double> & theta, Matrix_Double& q, vector<double>& q_unknown,
388
-	vector<double> & m)
389
-{
390
-	// cout.precision(15);
391
-  // cerr.precision(15);
392
-	unsigned int ncol = p.get_column_num(); // T, number of known tissues
393
-	unsigned int nrow = p.get_row_num(); // N, number of reads
394
-
395
-	theta.resize(ncol+1, 0); // assign (num_tissues+1) space and initialize to 0s.
396
-
397
-	vector<int> uniq_marker_Ids;
398
-	p.get_unique_row_labels(uniq_marker_Ids);
399
-	int nMarker = uniq_marker_Ids.size(); // M, number of markers that all N reads cover. Some markers may not be covered by these N reads.
400
-	m.resize(nMarker, 0); // assign nMarker space and initialize to 0s.
401
-	map<int,int> markerId2Index; // marker index is 0-based.
402
-	for (int index=0; index<nMarker; index++) {
403
-		markerId2Index.insert(make_pair(uniq_marker_Ids[index],index));
404
-	}
405
-
406
-	// initialize model parameters theta as uniform distribution, and m as 0.5
407
-	// alternatively, theta and m can be random numbers, by satisfying:
408
-	//    (1) \sum_{i=1}^{ncol+1}{theta_i}=1
409
-	//    (2) 0 <= m_k <=1 for all k=1,2,...,#marker_covered_by_all_reads
410
-	for (int j=0; j<ncol+1; j++) {
411
-		theta[j] = 1/(double)(ncol+1);
412
-	}
413
-	for (int k=0; k<nMarker; k++) {
414
-		m[k] = 0.5;
415
-	}
416
-	for (int i=0; i<nrow; i++)
417
-		q_unknown.push_back(0);
418
-
419
-	// EM algorithm
420
-	for (int iter=0; iter<max_iter; iter++) {
421
-	  Rcpp::Rcerr << iter+1 << "," ;
422
-		// E-step: estimate q (for T known tissues) and q_unknown (for one unknown tissue)
423
-		double v1, v2, likelihood_unknown_class;
424
-		int markerId, markerIndex;
425
-		for (int i=0; i<nrow; i++) {
426
-			// process the first T known tissues
427
-			double sum = 0;
428
-			for (int j=0; j<ncol; j++) {
429
-				p.get_element(i,j,v1);
430
-				v2 = theta[j]*v1;
431
-				q.set_element( i, j, v2 );
432
-				sum += v2;
433
-			}
434
-			// process the last unknown tissue
435
-			p.get_row_label(i, markerId); // get markerId (1-base) of read i
436
-			markerIndex = markerId2Index[markerId];
437
-			likelihood_unknown_class = pow(m[markerIndex],Rm[i]) * pow(1-m[markerIndex],Rl[i]-Rm[i]);
438
-			q_unknown[i] = theta[ncol]*likelihood_unknown_class;
439
-			sum += q_unknown[i];
440
-
441
-			// update q and q_unknown
442
-			for (int j=0; j<ncol; j++) {
443
-				q.get_element(i,j,v2);
444
-				v2 /= sum;
445
-				q.set_element( i, j, v2 );
446
-			}
447
-			q_unknown[i] /= sum;
448
-		}
449
-
450
-		// M-step 1: estimate unknown class's methylation level (m) of each marker
451
-		double sum1=0, sum2=0;
452
-		int curr_marker_index=-1, prev_marker_index=-1; // all marker index is 0-base
453
-		for (int i=0; i<nrow; i++) {
454
-			// for each row (or read)
455
-			p.get_row_label(i, markerId); // get the marker_index (0-base) of read i
456
-			curr_marker_index = markerId2Index[markerId];
457
-			if (curr_marker_index != prev_marker_index) {
458
-				// this is the 1st read of a new marker
459
-				// we need to summarize m value of previous marker
460
-				if (prev_marker_index!=-1) { // current marker is not the 1st marker
461
-					m[prev_marker_index] = (sum2!=0 ? sum1/sum2 : 0);
462
-					sum1 = 0;
463
-					sum2 = 0;
464
-				}
465
-				prev_marker_index = curr_marker_index;
466
-			}
467
-			sum1 += q_unknown[i]*Rm[i];
468
-			sum2 += q_unknown[i]*Rl[i];
469
-		}
470
-		m[curr_marker_index] = (sum2!=0 ? sum1/sum2 : 0); // estimate m of the last marker
471
-
472
-		//cout << "iter=" << iter << ", " << "m=" << endl; // for debug
473
-		//print_vec(cout, m, "\n"); // for debug
474
-		//cout << endl; // for debug
475
-
476
-		// M-step 2: estimate theta, which has ncol+1 values
477
-		double sum;
478
-		for (int j=0; j<ncol; j++) {
479
-			sum=0;
480
-			for (int i=0; i<nrow; i++) {
481
-				q.get_element(i,j,v2);
482
-				sum += v2;
483
-			}
484
-			theta[j] = sum / nrow;
485
-		}
486
-		sum=0;
487
-		for (int i=0; i<nrow; i++)
488
-			sum += q_unknown[i];
489
-		theta[ncol] = sum / nrow;
490
-
491
-		//cout << "round " << iter+1 << ", theta="; // for debug
492
-		//print_vec(cout, theta, ", "); // for debug
493
-		//cout << endl << endl; // for debug
494
-	}
495
-	Rcpp::Rcerr << endl;
496
-	//cout << "final:" << endl << std::flush;
497
-	//print_vec(cout, theta, ", "); // for debug
498
-	//cout << endl << endl << std::flush; // for debug
499
-}
500
-
501
-void readCounts_by_reads_posterior_probability_version_regular(Matrix_Double& q, double unit, Matrix_Double& readCounts)
502
-{
503
-	unsigned int ncol = q.get_column_num(); // T, number of known tissues
504
-	unsigned int nrow = q.get_row_num(); // N, number of reads
505
-
506
-	vector<int> uniq_marker_Ids;
507
-	q.get_unique_row_labels(uniq_marker_Ids);
508
-	// int nMarker = uniq_marker_Ids.size(); // M, number of markers that all N reads cover. Some markers may not be covered by these N reads.
509
-
510
-	double v;
511
-	int curr_markerId=-1, prev_markerId=-1; // all marker index is 0-base
512
-	vector<double> readCountsPerMarker(ncol, 0); // a vector of read counts for one marker (T elements) with default values 0
513
-	for (int i=0; i<nrow; i++) {
514
-		// for each row (or read)
515
-		q.get_row_label(i, curr_markerId); // get the marker ID of read i
516
-		if (curr_markerId != prev_markerId) {
517
-			// this is the 1st read of a new marker
518
-			// we need to summarize read counts of each tissue for the previous marker and deposit them to the matrix "readCounts"
519
-			// then clear readCountsPerMarker for a new read counting of the next new marker
520
-			if (prev_markerId!=-1) { // current marker is not the 1st marker
521
-				multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit.
522
-				readCounts.append_row_vector(readCountsPerMarker, prev_markerId);
523
-				assign_vector_zeros(readCountsPerMarker);
524
-			}
525
-			prev_markerId = curr_markerId;
526
-		}
527
-		for (int j=0; j<ncol; j++) {
528
-			q.get_element(i,j,v);
529
-			readCountsPerMarker[j] += v;
530
-		}
531
-	}
532
-	multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit.
533
-	readCounts.append_row_vector(readCountsPerMarker, curr_markerId);
534
-}
535
-
536
-void readCounts_by_reads_posterior_probability_version_unknownclass(Matrix_Double& q, vector<double>& q_unknown, double unit, Matrix_Double& readCounts)
537
-{
538
-	unsigned int ncol = q.get_column_num(); // T, number of known tissues
539
-	unsigned int nrow = q.get_row_num(); // N, number of reads
540
-	if (nrow!=(unsigned int)q_unknown.size()) {
541
-	  Rcpp::Rcerr << "Error (readCounts_by_reads_posterior_probability_version_unknownclass): row number of q_unknown does not match with row number of q!" << endl;
542
-		// exit(EXIT_FAILURE);
543
-	}
544
-
545
-	vector<int> uniq_marker_Ids;
546
-	q.get_unique_row_labels(uniq_marker_Ids);
547
-	// int nMarker = uniq_marker_Ids.size(); // M, number of markers that all N reads cover. Some markers may not be covered by these N reads.
548
-
549
-	double v;
550
-	int curr_markerId=-1, prev_markerId=-1; // all marker index is 0-base
551
-	vector<double> readCountsPerMarker(ncol+1, 0); // a vector of read counts for one marker (T+1 elements) with default values 0. The last element is for the unknown class.
552
-	for (int i=0; i<nrow; i++) {
553
-		// for each row (or read)
554
-		q.get_row_label(i, curr_markerId); // get the marker ID of read i
555
-		if (curr_markerId != prev_markerId) {
556
-			// this is the 1st read of a new marker
557
-			// we need to summarize read counts of each tissue for the previous marker and deposit them to the matrix "readCounts"
558
-			// then clear readCountsPerMarker for a new read counting of the next new marker
559
-			if (prev_markerId!=-1) { // current marker is not the 1st marker
560
-				multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit.
561
-				readCounts.append_row_vector(readCountsPerMarker, prev_markerId);
562
-				assign_vector_zeros(readCountsPerMarker);
563
-			}
564
-			prev_markerId = curr_markerId;
565
-		}
566
-		for (int j=0; j<ncol; j++) {
567
-			q.get_element(i,j,v);
568
-			readCountsPerMarker[j] += v;
569
-		}
570
-		readCountsPerMarker[ncol] += q_unknown[i];
571
-	}
572
-	multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit.
573
-	readCounts.append_row_vector(readCountsPerMarker, curr_markerId);
574
-}
575
-
1
+// [[Rcpp::depends(BH)]]
2
+#include <Rcpp.h>
3
+using namespace Rcpp;
4
+
5
+#include <cstdlib>
6
+#include <cmath>
7
+#include <sstream>
8
+#include <iostream>
9
+#include <fstream>
10
+#include <map>
11
+#include <vector>
12
+#include <string>
13
+#include <cassert>
14
+#include <algorithm>
15
+#include <boost/algorithm/string.hpp>
16
+#include <boost/assign.hpp>
17
+#include <boost/iostreams/filtering_stream.hpp>
18
+#include "matrix.h"
19
+#include "utils.h"
20
+
21
+using namespace std;
22
+using namespace boost;
23
+
24
+#include "template_utils.cpp"
25
+
26
+Matrix_Double::Matrix_Double(const unsigned int nrow_, const unsigned int ncol_, const double init_value)
27
+{
28
+	nrow = nrow_;
29
+	ncol = ncol_;
30
+	empty = false;
31
+	mat.reserve(nrow_);
32
+	for (int i=0; i<nrow; i++) {
33
+		vector<double> temp;
34
+		temp.reserve(ncol);
35
+		for (int j=0; j<ncol; j++)
36
+			temp.push_back(init_value);
37
+		mat.push_back( temp );
38
+	}
39
+	empty = mat.empty();
40
+}
41
+
42
+// a line represents a row of the matrix. Each element in this row is delimited by a TAB.
43
+void Matrix_Double::process_one_line_of_mat_file(string & line, int num_header_column)
44
+{
45
+	vector<string> items;
46
+	split(items, line, is_any_of("\t"));
47
+	vector<double> v;
48
+	for (int i=0; i<items.size(); i++) {
49
+		if (i==0) row_labels.push_back( atoi(items[0].c_str()) ); // the first column is the label of this row.
50
+		if (i<num_header_column) continue; // skip the first num_header_column header columns
51
+		v.push_back( atof(items[i].c_str()) );
52
+	}
53
+	mat.push_back( v );
54
+}
55
+
56
+Matrix_Double::Matrix_Double(const string & mat_file,
57
+	bool header_line=false, int num_header_column=1)
58
+{
59
+	unsigned long i_line=0;
60
+	// input is a plain text file
61
+	istream * in=&cin; // default is stdin
62
+	ifstream fin;
63
+	if ( !(mat_file.compare("stdin")==0)) {
64
+		fin.open(mat_file.c_str());
65
+		if (fin.fail()){
66
+		  Rcpp::Rcerr << "Error: Unable to open " << mat_file << " in Matrix_Double()" << endl;
67
+			// exit(EXIT_FAILURE);
68
+		}
69
+		in = &fin;
70
+	}
71
+	string line;
72
+	while (!(*in).eof()) {
73
+		getline((*in), line);
74
+		//cerr << "Line: " << line << endl;
75
+		if (header_line) {
76
+			if (i_line==0) {
77
+				// skip the header line
78
+				i_line++;
79
+				continue;
80
+			}
81
+		}
82
+		if (line.empty()) {
83
+			// this is the last line of the file
84
+			break;
85
+		}
86
+		process_one_line_of_mat_file(line, num_header_column);
87
+		i_line++;
88
+	}
89
+	if ( !(mat_file.compare("stdin")==0)) {
90
+		fin.close();
91
+	}
92
+	empty = mat.empty();
93
+	nrow = mat.size();
94
+	if (nrow>=1) ncol = mat[0].size();
95
+	else ncol = 0;
96
+}
97
+
98
+// return the row index of this vector
99
+long Matrix_Double::append_row_vector(vector<double>& v_, int row_label_) {
100
+	if (v_.size()!=ncol) {
101
+	  Rcpp::Rcerr << "Error of Matrix_Double::append_row_vector: the appended vector size (" << v_.size() << ") does not match the number of column in the matrix (" << ncol << ")!\nExit." << endl;
102
+		// exit(EXIT_FAILURE);
103
+	}
104
+	vector<double> v;
105
+	for (int i=0; i<v_.size(); i++) {
106
+		v.push_back( v_[i] );
107
+	}
108
+	mat.push_back( v );
109
+	nrow = mat.size();
110
+	row_labels.push_back(row_label_);
111
+	empty=false;
112
+	return(nrow-1);
113
+}
114
+
115
+//
116
+// if max(v_)/min(v_) >= min_threshold_maxv_minv, then we append this vector, otherwise ignore it.
117
+//
118
+// return row index of this appended vector, if successfully appending; return -1 otherwise.
119
+//
120
+long Matrix_Double::append_row_vector_with_filter(vector<double>& v_, int row_label_, double min_threshold_maxv_minv) {
121
+	if (v_.size()!=ncol) {
122
+	  Rcpp::Rcerr << "Error of Matrix_Double::append_row_vector_with_filter: the appended vector size (" << v_.size() << ") does not match the number of column in the matrix (" << ncol << ")!\nExit." << endl;
123
+		// exit(EXIT_FAILURE);
124
+	}
125
+	double min_ = *min_element(v_.begin(), v_.end());
126
+	double max_ = *max_element(v_.begin(), v_.end());
127
+	//cout << "max: " << max_ << ", min: " << min_ << ", ratio: " << max_/min_ << endl;
128
+	if (min_==0) {
129
+		if (max_==0) return(-1);
130
+	} else {
131
+		double ratio=max_/min_;
132
+		if (ratio<min_threshold_maxv_minv) return(-1);
133
+	}
134
+	long row_index = append_row_vector(v_, row_label_);
135
+	return(row_index);
136
+}
137
+
138
+bool Matrix_Double::get_element(const int i, const int j, double & v)
139
+{
140
+	if (!empty) {
141
+		if (i<nrow && j<ncol) {
142
+			v = mat[i][j];
143
+			return true;
144
+		} else {
145
+		  Rcpp::Rcerr << "Warning: i=" << i << " and j=" << j << " exceed matrix size!" << endl;
146
+			return false;
147
+		}
148
+	} else {
149
+		v = 0;
150
+		return false;
151
+	}
152
+}
153
+
154
+bool Matrix_Double::set_element(const int i, const int j, double v)
155
+{
156
+	if (!empty) {
157
+		if (i<nrow && j<ncol) {
158
+			mat[i][j] = v;
159
+			return true;
160
+		} else {
161
+		  Rcpp::Rcerr << "Warning: i=" << i << " and j=" << j << " exceed matrix size!" << endl;
162
+			return false;
163
+		}
164
+	} else {
165
+		return false;
166
+	}
167
+}
168
+
169
+bool Matrix_Double::get_column_sum(const int j, double & v)
170
+{
171
+	if (!empty) {
172
+		if (j<ncol) {
173
+			v = 0;
174
+			for (int k=0; k<nrow; k++) {
175
+				v += mat[k][j];
176
+			}
177
+			return true;
178
+		} else {
179
+		  Rcpp::Rcerr << "Warning: j=" << j << " exceeds matrix column size (" << ncol << ")!" << endl;
180
+			return false;
181
+		}
182
+	} else {
183
+		v = 0;
184
+		return false;
185
+	}
186
+}
187
+
188
+bool Matrix_Double::get_row_sum(const int i, double & v)
189
+{
190
+	if (!empty) {
191
+		if (i<nrow) {
192
+			v = 0;
193
+			for (int k=0; k<ncol; k++) {
194
+				v += mat[i][k];
195
+			}
196
+			return true;
197
+		} else {
198
+		  Rcpp::Rcerr << "Warning: i=" << i << " exceeds matrix row size (" << nrow << ")!" << endl;
199
+			return false;
200
+		}
201
+	} else {
202
+		v = 0;
203
+		return false;
204
+	}
205
+}
206
+
207
+void Matrix_Double::get_unique_row_labels(vector<int> & uniq_labels)
208
+{
209
+	map<int, int> counts;
210
+	for (int i=0; i<row_labels.size(); i++)
211
+		if (counts.find(row_labels[i]) != counts.end())
212
+			counts[row_labels[i]] += 1; // found this label, then increment count
213
+		else {
214
+			counts.insert(make_pair(row_labels[i],1)); // not found this label, then count it once
215
+			//counts[row_labels[i]] = 1;
216
+		}
217
+	map<int,int>::iterator it;
218
+	for (it=counts.begin(); it!=counts.end(); ++it)
219
+		uniq_labels.push_back(it->first);
220
+}
221
+
222
+void Matrix_Double::print_with_additional_column_of_Bins2Value(ostream & os, Bins2Value& additional_column_data)
223
+{
224
+	// obtain unique row labels
225
+	vector<int> unique_row_labels;
226
+	int row_label_;
227
+	for (int i=0; i<nrow; i++) {
228
+		row_label_ = row_labels[i];
229
+		unique_row_labels.push_back( row_label_ );
230
+	}
231
+	Bins2Value::iterator iter;
232
+	for (iter=additional_column_data.begin(); iter!=additional_column_data.end(); iter++) {
233
+		row_label_ = iter->first;
234
+		if (!exist_row_label(row_label_)) {
235
+			unique_row_labels.push_back( row_label_ );
236
+		}
237
+	}
238
+	sort(unique_row_labels.begin(), unique_row_labels.end());
239
+	// print both 'mat' and 'additional_column_data' as the last column
240
+	for (int i=0; i<unique_row_labels.size(); i++) {
241
+		row_label_ = unique_row_labels[i];
242
+		os << row_label_;
243
+		int row_index = get_row_index(row_label_);
244
+		if (row_index!=-1) {
245
+			for (int j=0; j<ncol; j++) {
246
+				os << "\t" << mat[row_index][j];
247
+			}
248
+		} else {
249
+			for (int j=0; j<ncol; j++) {
250
+				os << "\t0";
251
+			}
252
+		}
253
+		if (additional_column_data.find(row_label_)!=additional_column_data.end()) {
254
+			os << "\t" << additional_column_data[row_label_];
255
+		} else {
256
+			os << "\t0";
257
+		}
258
+		os << endl;
259
+	}
260
+}
261
+
262
+ostream & operator<<(ostream & os, Matrix_Double & mat)
263
+{
264
+	double v;
265
+	int row_label;
266
+	if (!mat.isempty()) {
267
+		for (int i=0; i<mat.get_row_num(); i++) {
268
+			int j;
269
+			mat.get_row_label(i, row_label);
270
+			os << row_label << "\t";
271
+			for (j=0; j<mat.get_column_num()-1; j++) {
272
+				mat.get_element(i,j,v);
273
+				os << v << "\t";
274
+			}
275
+			mat.get_element(i,j,v);
276
+			os << v << endl;
277
+		}
278
+	}
279
+	return os;
280
+}
281
+
282
+double objective_em_supervise(Matrix_Double & p, vector<double> & theta)
283
+{
284
+	unsigned int ncol = p.get_column_num();
285
+	unsigned int nrow = p.get_row_num();
286
+	double v;
287
+	double obj = 0;
288
+	for (int i=0; i<nrow; i++) {
289
+		double sum = 0;
290
+		for (int j=0; j<ncol; j++) {
291
+			p.get_element(i,j,v);
292
+			sum += theta[j]*v;
293
+		}
294
+		obj += log(sum);
295
+	}
296
+	return obj;
297
+}
298
+//
299
+// There are T known tissues and 1 unknown tissue (described by a double vector "m")
300
+//
301
+// input:
302
+//   p is a matrix of N X T, where N is number of reads and T is number of known tissue
303
+//
304
+// output:
305
+//   theta (model parameters), a vector with "T" elements.
306
+//   q is a matrix of N X T, the tissue-specific posterior probabilty of each read
307
+//
308
+void em_supervise(Matrix_Double & p, int max_iter, vector<double> & theta, Matrix_Double& q)
309
+{
310
+	// cout.precision(15);
311
+	// cerr.precision(15);
312
+	unsigned int ncol = p.get_column_num();
313
+	unsigned int nrow = p.get_row_num(); // nrow is number of tissues.
314
+	theta.resize(ncol, 0); // assign (num_tissues) space and initialize to 0s.
315
+	// initialize model parameters as uniform distribution
316
+	// alternatively, model parameters can be random numbers by satisfying the crition
317
+	//    (1) \sum_{i=1}^{ncol}{theta_i}=1
318
+	for (int j=0; j<ncol; j++) {
319
+		theta[j] = 1/(double)ncol;
320
+	}
321
+	// create and initialize q with the same size of p and with all elements initialized as 0
322
+	//Matrix_Double q(nrow, ncol, 0);
323
+	//cerr << "iter 0\t" << objective_em_supervise(p, theta) << endl;
324
+	double v1, v2;
325
+	for (int iter=0; iter<max_iter; iter++) {
326
+	  Rcpp::Rcerr << iter+1 << "," ;
327
+		// E-step: estimate q
328
+		for (int i=0; i<nrow; i++) {
329
+			double sum = 0;
330
+			for (int j=0; j<ncol; j++) {
331
+				p.get_element(i,j,v1);
332
+				v2 = theta[j]*v1;
333
+				q.set_element( i, j, v2 );
334
+				sum += v2;
335
+			}
336
+			for (int j=0; j<ncol; j++) {
337
+				q.get_element(i,j,v2);
338
+				v2 /= sum;
339
+				q.set_element( i, j, v2 );
340
+			}
341
+		}
342
+		// M-step: estimate theta
343
+		for (int j=0; j<ncol; j++) {
344
+			double sum=0;
345
+			for (int i=0; i<nrow; i++) {
346
+				q.get_element(i,j,v2);
347
+				sum += v2;
348
+			}
349
+			theta[j] = sum / nrow;
350
+		}
351
+		// for debug
352
+		//cerr << "iter " << (iter+1) << "\t" << objective_em_supervise(p, theta) << endl;
353
+	}
354
+	// last E-step: estimate q
355
+	for (int i=0; i<nrow; i++) {
356
+		double sum = 0;
357
+		for (int j=0; j<ncol; j++) {
358
+			p.get_element(i,j,v1);
359
+			v2 = theta[j]*v1;
360
+			q.set_element( i, j, v2 );
361
+			sum += v2;
362
+		}
363
+		for (int j=0; j<ncol; j++) {
364
+			q.get_element(i,j,v2);
365
+			v2 /= sum;
366
+			q.set_element( i, j, v2 );
367
+		}
368
+	}
369
+	Rcpp::Rcerr << endl;
370
+}
371
+
372
+//
373
+// There are T known tissues and 1 unknown tissue (described by a double vector "m")
374
+//
375
+// input:
376
+//   p is a matrix of N X T, where N is number of reads and T is number of known tissue
377
+//     p.row_label is an int vector of N X 1, each element is the marker Id (1-base) of a read.
378
+//   Rm is a vector of N X 1, each element is number of valid methylated CpG sites in a read
379
+//   Rl is a vector of N X 1, each element is number of all valid CpG sites in a read
380
+//
381
+// output:
382
+//   theta is a vector of (T+1) X 1, where T is number of known tissue. This vector is already allocated with space of (ncol+1) units.
383
+//   m is a vector of M X 1, where M is number of markers.
384
+//   q is a matrix of N X T, the tissue-specific posterior probabilty of each read
385
+//   q_unknown is a vector of N X 1, the posterior probabilty of each read for the unknown class. It does not need to be allocated space before this function calling.
386
+//
387
+void em_semisupervise(Matrix_Double & p, vector<int> & Rm, vector<int> & Rl,
388
+	int max_iter, vector<double> & theta, Matrix_Double& q, vector<double>& q_unknown,
389
+	vector<double> & m)
390
+{
391
+	// cout.precision(15);
392
+  // cerr.precision(15);
393
+	unsigned int ncol = p.get_column_num(); // T, number of known tissues
394
+	unsigned int nrow = p.get_row_num(); // N, number of reads
395
+
396
+	theta.resize(ncol+1, 0); // assign (num_tissues+1) space and initialize to 0s.
397
+
398
+	vector<int> uniq_marker_Ids;
399
+	p.get_unique_row_labels(uniq_marker_Ids);
400
+	int nMarker = uniq_marker_Ids.size(); // M, number of markers that all N reads cover. Some markers may not be covered by these N reads.
401
+	m.resize(nMarker, 0); // assign nMarker space and initialize to 0s.
402
+	map<int,int> markerId2Index; // marker index is 0-based.
403
+	for (int index=0; index<nMarker; index++) {
404
+		markerId2Index.insert(make_pair(uniq_marker_Ids[index],index));
405
+	}
406
+
407
+	// initialize model parameters theta as uniform distribution, and m as 0.5
408
+	// alternatively, theta and m can be random numbers, by satisfying:
409
+	//    (1) \sum_{i=1}^{ncol+1}{theta_i}=1
410
+	//    (2) 0 <= m_k <=1 for all k=1,2,...,#marker_covered_by_all_reads
411
+	for (int j=0; j<ncol+1; j++) {
412
+		theta[j] = 1/(double)(ncol+1);
413
+	}
414
+	for (int k=0; k<nMarker; k++) {
415
+		m[k] = 0.5;
416
+	}
417
+	for (int i=0; i<nrow; i++)
418
+		q_unknown.push_back(0);
419
+
420
+	// EM algorithm
421
+	for (int iter=0; iter<max_iter; iter++) {
422
+	  Rcpp::Rcerr << iter+1 << "," ;
423
+		// E-step: estimate q (for T known tissues) and q_unknown (for one unknown tissue)
424
+		double v1, v2, likelihood_unknown_class;
425
+		int markerId, markerIndex;
426
+		for (int i=0; i<nrow; i++) {
427
+			// process the first T known tissues
428
+			double sum = 0;
429
+			for (int j=0; j<ncol; j++) {
430
+				p.get_element(i,j,v1);
431
+				v2 = theta[j]*v1;
432
+				q.set_element( i, j, v2 );
433
+				sum += v2;
434
+			}
435
+			// process the last unknown tissue
436
+			p.get_row_label(i, markerId); // get markerId (1-base) of read i
437
+			markerIndex = markerId2Index[markerId];
438
+			likelihood_unknown_class = pow(m[markerIndex],Rm[i]) * pow(1-m[markerIndex],Rl[i]-Rm[i]);
439
+			q_unknown[i] = theta[ncol]*likelihood_unknown_class;
440
+			sum += q_unknown[i];
441
+
442
+			// update q and q_unknown
443
+			for (int j=0; j<ncol; j++) {
444
+				q.get_element(i,j,v2);
445
+				v2 /= sum;
446
+				q.set_element( i, j, v2 );
447
+			}
448
+			q_unknown[i] /= sum;
449
+		}
450
+
451
+		// M-step 1: estimate unknown class's methylation level (m) of each marker
452
+		double sum1=0, sum2=0;
453
+		int curr_marker_index=-1, prev_marker_index=-1; // all marker index is 0-base
454
+		for (int i=0; i<nrow; i++) {
455
+			// for each row (or read)
456
+			p.get_row_label(i, markerId); // get the marker_index (0-base) of read i
457
+			curr_marker_index = markerId2Index[markerId];
458
+			if (curr_marker_index != prev_marker_index) {
459
+				// this is the 1st read of a new marker
460
+				// we need to summarize m value of previous marker
461
+				if (prev_marker_index!=-1) { // current marker is not the 1st marker
462
+					m[prev_marker_index] = (sum2!=0 ? sum1/sum2 : 0);
463
+					sum1 = 0;
464
+					sum2 = 0;
465
+				}
466
+				prev_marker_index = curr_marker_index;
467
+			}
468
+			sum1 += q_unknown[i]*Rm[i];
469
+			sum2 += q_unknown[i]*Rl[i];
470
+		}
471
+		m[curr_marker_index] = (sum2!=0 ? sum1/sum2 : 0); // estimate m of the last marker
472
+
473
+		//cout << "iter=" << iter << ", " << "m=" << endl; // for debug
474
+		//print_vec(cout, m, "\n"); // for debug
475
+		//cout << endl; // for debug
476
+
477
+		// M-step 2: estimate theta, which has ncol+1 values
478
+		double sum;
479
+		for (int j=0; j<ncol; j++) {
480
+			sum=0;
481
+			for (int i=0; i<nrow; i++) {
482
+				q.get_element(i,j,v2);
483
+				sum += v2;
484
+			}
485
+			theta[j] = sum / nrow;
486
+		}
487
+		sum=0;
488
+		for (int i=0; i<nrow; i++)
489
+			sum += q_unknown[i];
490
+		theta[ncol] = sum / nrow;
491
+
492
+		//cout << "round " << iter+1 << ", theta="; // for debug
493
+		//print_vec(cout, theta, ", "); // for debug
494
+		//cout << endl << endl; // for debug
495
+	}
496
+	Rcpp::Rcerr << endl;
497
+	//cout << "final:" << endl << std::flush;
498
+	//print_vec(cout, theta, ", "); // for debug
499
+	//cout << endl << endl << std::flush; // for debug
500
+}
501
+
502
+void readCounts_by_reads_posterior_probability_version_regular(Matrix_Double& q, double unit, Matrix_Double& readCounts)
503
+{
504
+	unsigned int ncol = q.get_column_num(); // T, number of known tissues
505
+	unsigned int nrow = q.get_row_num(); // N, number of reads
506
+
507
+	vector<int> uniq_marker_Ids;
508
+	q.get_unique_row_labels(uniq_marker_Ids);
509
+	// int nMarker = uniq_marker_Ids.size(); // M, number of markers that all N reads cover. Some markers may not be covered by these N reads.
510
+
511
+	double v;
512
+	int curr_markerId=-1, prev_markerId=-1; // all marker index is 0-base
513
+	vector<double> readCountsPerMarker(ncol, 0); // a vector of read counts for one marker (T elements) with default values 0
514
+	for (int i=0; i<nrow; i++) {
515
+		// for each row (or read)
516
+		q.get_row_label(i, curr_markerId); // get the marker ID of read i
517
+		if (curr_markerId != prev_markerId) {
518
+			// this is the 1st read of a new marker
519
+			// we need to summarize read counts of each tissue for the previous marker and deposit them to the matrix "readCounts"
520
+			// then clear readCountsPerMarker for a new read counting of the next new marker
521
+			if (prev_markerId!=-1) { // current marker is not the 1st marker
522
+				multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit.
523
+				readCounts.append_row_vector(readCountsPerMarker, prev_markerId);
524
+				assign_vector_zeros(readCountsPerMarker);
525
+			}
526
+			prev_markerId = curr_markerId;
527
+		}
528
+		for (int j=0; j<ncol; j++) {
529
+			q.get_element(i,j,v);
530
+			readCountsPerMarker[j] += v;
531
+		}
532
+	}
533
+	multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit.
534
+	readCounts.append_row_vector(readCountsPerMarker, curr_markerId);
535
+}
536
+
537
+void readCounts_by_reads_posterior_probability_version_unknownclass(Matrix_Double& q, vector<double>& q_unknown, double unit, Matrix_Double& readCounts)
538
+{
539
+	unsigned int ncol = q.get_column_num(); // T, number of known tissues
540
+	unsigned int nrow = q.get_row_num(); // N, number of reads
541
+	if (nrow!=(unsigned int)q_unknown.size()) {
542
+	  Rcpp::Rcerr << "Error (readCounts_by_reads_posterior_probability_version_unknownclass): row number of q_unknown does not match with row number of q!" << endl;
543
+		// exit(EXIT_FAILURE);
544
+	}
545
+
546
+	vector<int> uniq_marker_Ids;
547
+	q.get_unique_row_labels(uniq_marker_Ids);
548
+	// int nMarker = uniq_marker_Ids.size(); // M, number of markers that all N reads cover. Some markers may not be covered by these N reads.
549
+
550
+	double v;
551
+	int curr_markerId=-1, prev_markerId=-1; // all marker index is 0-base
552
+	vector<double> readCountsPerMarker(ncol+1, 0); // a vector of read counts for one marker (T+1 elements) with default values 0. The last element is for the unknown class.
553
+	for (int i=0; i<nrow; i++) {
554
+		// for each row (or read)
555
+		q.get_row_label(i, curr_markerId); // get the marker ID of read i
556
+		if (curr_markerId != prev_markerId) {
557
+			// this is the 1st read of a new marker
558
+			// we need to summarize read counts of each tissue for the previous marker and deposit them to the matrix "readCounts"
559
+			// then clear readCountsPerMarker for a new read counting of the next new marker
560
+			if (prev_markerId!=-1) { // current marker is not the 1st marker
561
+				multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit.
562
+				readCounts.append_row_vector(readCountsPerMarker, prev_markerId);
563
+				assign_vector_zeros(readCountsPerMarker);
564
+			}
565
+			prev_markerId = curr_markerId;
566
+		}
567
+		for (int j=0; j<ncol; j++) {
568
+			q.get_element(i,j,v);
569
+			readCountsPerMarker[j] += v;
570
+		}
571
+		readCountsPerMarker[ncol] += q_unknown[i];
572
+	}
573
+	multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit.
574
+	readCounts.append_row_vector(readCountsPerMarker, curr_markerId);
575
+}
576
+
Browse code

add files

RoseHuRan authored on 27/03/2022 20:48:07
Showing 1 changed files
1 1
new file mode 100644
... ...
@@ -0,0 +1,575 @@
1
+#include <Rcpp.h>
2
+using namespace Rcpp;
3
+
4
+#include <cstdlib>
5
+#include <cmath>
6
+#include <sstream>
7
+#include <iostream>
8
+#include <fstream>
9
+#include <map>
10
+#include <vector>
11
+#include <string>
12
+#include <cassert>
13
+#include <algorithm>
14
+#include <boost/algorithm/string.hpp>
15
+#include <boost/assign.hpp>
16
+#include <boost/iostreams/filtering_stream.hpp>
17
+#include "matrix.h"
18
+#include "utils.h"
19
+
20
+using namespace std;
21
+using namespace boost;
22
+
23
+#include "template_utils.cpp"
24
+
25
+Matrix_Double::Matrix_Double(const unsigned int nrow_, const unsigned int ncol_, const double init_value)
26
+{
27
+	nrow = nrow_;
28
+	ncol = ncol_;
29
+	empty = false;
30
+	mat.reserve(nrow_);
31
+	for (int i=0; i<nrow; i++) {
32
+		vector<double> temp;
33
+		temp.reserve(ncol);
34
+		for (int j=0; j<ncol; j++)
35
+			temp.push_back(init_value);
36
+		mat.push_back( temp );
37
+	}
38
+	empty = mat.empty();
39
+}
40
+
41
+// a line represents a row of the matrix. Each element in this row is delimited by a TAB.
42
+void Matrix_Double::process_one_line_of_mat_file(string & line, int num_header_column)
43
+{
44
+	vector<string> items;
45
+	split(items, line, is_any_of("\t"));
46
+	vector<double> v;
47
+	for (int i=0; i<items.size(); i++) {
48
+		if (i==0) row_labels.push_back( atoi(items[0].c_str()) ); // the first column is the label of this row.
49
+		if (i<num_header_column) continue; // skip the first num_header_column header columns
50
+		v.push_back( atof(items[i].c_str()) );
51
+	}
52
+	mat.push_back( v );
53
+}
54
+
55
+Matrix_Double::Matrix_Double(const string & mat_file,
56
+	bool header_line=false, int num_header_column=1)
57
+{
58
+	unsigned long i_line=0;
59
+	// input is a plain text file
60
+	istream * in=&cin; // default is stdin
61
+	ifstream fin;
62
+	if ( !(mat_file.compare("stdin")==0)) {
63
+		fin.open(mat_file.c_str());
64
+		if (fin.fail()){
65
+		  Rcpp::Rcerr << "Error: Unable to open " << mat_file << " in Matrix_Double()" << endl;
66
+			// exit(EXIT_FAILURE);
67
+		}
68
+		in = &fin;
69
+	}
70
+	string line;
71
+	while (!(*in).eof()) {
72
+		getline((*in), line);
73
+		//cerr << "Line: " << line << endl;
74
+		if (header_line) {
75
+			if (i_line==0) {
76
+				// skip the header line
77
+				i_line++;
78
+				continue;
79
+			}
80
+		}
81
+		if (line.empty()) {
82
+			// this is the last line of the file
83
+			break;
84
+		}
85
+		process_one_line_of_mat_file(line, num_header_column);
86
+		i_line++;
87
+	}
88
+	if ( !(mat_file.compare("stdin")==0)) {
89
+		fin.close();
90
+	}
91
+	empty = mat.empty();
92
+	nrow = mat.size();
93
+	if (nrow>=1) ncol = mat[0].size();
94
+	else ncol = 0;
95
+}
96
+
97
+// return the row index of this vector
98
+long Matrix_Double::append_row_vector(vector<double>& v_, int row_label_) {
99
+	if (v_.size()!=ncol) {
100
+	  Rcpp::Rcerr << "Error of Matrix_Double::append_row_vector: the appended vector size (" << v_.size() << ") does not match the number of column in the matrix (" << ncol << ")!\nExit." << endl;
101
+		// exit(EXIT_FAILURE);
102
+	}
103
+	vector<double> v;
104
+	for (int i=0; i<v_.size(); i++) {
105
+		v.push_back( v_[i] );
106
+	}
107
+	mat.push_back( v );
108
+	nrow = mat.size();
109
+	row_labels.push_back(row_label_);
110
+	empty=false;
111
+	return(nrow-1);
112
+}
113
+
114
+//
115
+// if max(v_)/min(v_) >= min_threshold_maxv_minv, then we append this vector, otherwise ignore it.
116
+//
117
+// return row index of this appended vector, if successfully appending; return -1 otherwise.
118
+//
119
+long Matrix_Double::append_row_vector_with_filter(vector<double>& v_, int row_label_, double min_threshold_maxv_minv) {
120
+	if (v_.size()!=ncol) {
121
+	  Rcpp::Rcerr << "Error of Matrix_Double::append_row_vector_with_filter: the appended vector size (" << v_.size() << ") does not match the number of column in the matrix (" << ncol << ")!\nExit." << endl;
122
+		// exit(EXIT_FAILURE);
123
+	}
124
+	double min_ = *min_element(v_.begin(), v_.end());
125
+	double max_ = *max_element(v_.begin(), v_.end());
126
+	//cout << "max: " << max_ << ", min: " << min_ << ", ratio: " << max_/min_ << endl;
127
+	if (min_==0) {
128
+		if (max_==0) return(-1);
129
+	} else {
130
+		double ratio=max_/min_;
131
+		if (ratio<min_threshold_maxv_minv) return(-1);
132
+	}
133
+	long row_index = append_row_vector(v_, row_label_);
134
+	return(row_index);
135
+}
136
+
137
+bool Matrix_Double::get_element(const int i, const int j, double & v)
138
+{
139
+	if (!empty) {
140
+		if (i<nrow && j<ncol) {
141
+			v = mat[i][j];
142
+			return true;
143
+		} else {
144
+		  Rcpp::Rcerr << "Warning: i=" << i << " and j=" << j << " exceed matrix size!" << endl;
145
+			return false;
146
+		}
147
+	} else {
148
+		v = 0;
149
+		return false;
150
+	}
151
+}
152
+
153
+bool Matrix_Double::set_element(const int i, const int j, double v)
154
+{
155
+	if (!empty) {
156
+		if (i<nrow && j<ncol) {
157
+			mat[i][j] = v;
158
+			return true;
159
+		} else {
160
+		  Rcpp::Rcerr << "Warning: i=" << i << " and j=" << j << " exceed matrix size!" << endl;
161
+			return false;
162
+		}
163
+	} else {
164
+		return false;
165
+	}
166
+}
167
+
168
+bool Matrix_Double::get_column_sum(const int j, double & v)
169
+{
170
+	if (!empty) {
171
+		if (j<ncol) {
172
+			v = 0;
173
+			for (int k=0; k<nrow; k++) {
174
+				v += mat[k][j];
175
+			}
176
+			return true;
177
+		} else {
178
+		  Rcpp::Rcerr << "Warning: j=" << j << " exceeds matrix column size (" << ncol << ")!" << endl;
179
+			return false;
180
+		}
181
+	} else {
182
+		v = 0;
183
+		return false;
184
+	}
185
+}
186
+
187
+bool Matrix_Double::get_row_sum(const int i, double & v)
188
+{
189
+	if (!empty) {
190
+		if (i<nrow) {
191
+			v = 0;
192
+			for (int k=0; k<ncol; k++) {
193
+				v += mat[i][k];
194
+			}
195
+			return true;
196
+		} else {
197
+		  Rcpp::Rcerr << "Warning: i=" << i << " exceeds matrix row size (" << nrow << ")!" << endl;
198
+			return false;
199
+		}
200
+	} else {
201
+		v = 0;
202
+		return false;
203
+	}
204
+}
205
+
206
+void Matrix_Double::get_unique_row_labels(vector<int> & uniq_labels)
207
+{
208
+	map<int, int> counts;
209
+	for (int i=0; i<row_labels.size(); i++)
210
+		if (counts.find(row_labels[i]) != counts.end())
211
+			counts[row_labels[i]] += 1; // found this label, then increment count
212
+		else {
213
+			counts.insert(make_pair(row_labels[i],1)); // not found this label, then count it once
214
+			//counts[row_labels[i]] = 1;
215
+		}
216
+	map<int,int>::iterator it;
217
+	for (it=counts.begin(); it!=counts.end(); ++it)
218
+		uniq_labels.push_back(it->first);
219
+}
220
+
221
+void Matrix_Double::print_with_additional_column_of_Bins2Value(ostream & os, Bins2Value& additional_column_data)
222
+{
223
+	// obtain unique row labels
224
+	vector<int> unique_row_labels;
225
+	int row_label_;
226
+	for (int i=0; i<nrow; i++) {
227
+		row_label_ = row_labels[i];
228
+		unique_row_labels.push_back( row_label_ );
229
+	}
230
+	Bins2Value::iterator iter;
231
+	for (iter=additional_column_data.begin(); iter!=additional_column_data.end(); iter++) {
232
+		row_label_ = iter->first;
233
+		if (!exist_row_label(row_label_)) {
234
+			unique_row_labels.push_back( row_label_ );
235
+		}
236
+	}
237
+	sort(unique_row_labels.begin(), unique_row_labels.end());
238
+	// print both 'mat' and 'additional_column_data' as the last column
239
+	for (int i=0; i<unique_row_labels.size(); i++) {
240
+		row_label_ = unique_row_labels[i];
241
+		os << row_label_;
242
+		int row_index = get_row_index(row_label_);
243
+		if (row_index!=-1) {
244
+			for (int j=0; j<ncol; j++) {
245
+				os << "\t" << mat[row_index][j];
246
+			}
247
+		} else {
248
+			for (int j=0; j<ncol; j++) {
249
+				os << "\t0";
250
+			}
251
+		}
252
+		if (additional_column_data.find(row_label_)!=additional_column_data.end()) {
253
+			os << "\t" << additional_column_data[row_label_];
254
+		} else {
255
+			os << "\t0";
256
+		}
257
+		os << endl;
258
+	}
259
+}
260
+
261
+ostream & operator<<(ostream & os, Matrix_Double & mat)
262
+{
263
+	double v;
264
+	int row_label;
265
+	if (!mat.isempty()) {
266
+		for (int i=0; i<mat.get_row_num(); i++) {
267
+			int j;
268
+			mat.get_row_label(i, row_label);
269
+			os << row_label << "\t";
270
+			for (j=0; j<mat.get_column_num()-1; j++) {
271
+				mat.get_element(i,j,v);
272
+				os << v << "\t";
273
+			}
274
+			mat.get_element(i,j,v);
275
+			os << v << endl;
276
+		}
277
+	}
278
+	return os;
279
+}
280
+
281
+double objective_em_supervise(Matrix_Double & p, vector<double> & theta)
282
+{
283
+	unsigned int ncol = p.get_column_num();
284
+	unsigned int nrow = p.get_row_num();
285
+	double v;
286
+	double obj = 0;
287
+	for (int i=0; i<nrow; i++) {
288
+		double sum = 0;
289
+		for (int j=0; j<ncol; j++) {
290
+			p.get_element(i,j,v);
291
+			sum += theta[j]*v;
292
+		}
293
+		obj += log(sum);
294
+	}
295
+	return obj;
296
+}
297
+//
298
+// There are T known tissues and 1 unknown tissue (described by a double vector "m")
299
+//
300
+// input:
301
+//   p is a matrix of N X T, where N is number of reads and T is number of known tissue
302
+//
303
+// output:
304
+//   theta (model parameters), a vector with "T" elements.
305
+//   q is a matrix of N X T, the tissue-specific posterior probabilty of each read
306
+//
307
+void em_supervise(Matrix_Double & p, int max_iter, vector<double> & theta, Matrix_Double& q)
308
+{
309
+	// cout.precision(15);
310
+	// cerr.precision(15);
311
+	unsigned int ncol = p.get_column_num();
312
+	unsigned int nrow = p.get_row_num(); // nrow is number of tissues.
313
+	theta.resize(ncol, 0); // assign (num_tissues) space and initialize to 0s.
314
+	// initialize model parameters as uniform distribution
315
+	// alternatively, model parameters can be random numbers by satisfying the crition
316
+	//    (1) \sum_{i=1}^{ncol}{theta_i}=1
317
+	for (int j=0; j<ncol; j++) {
318
+		theta[j] = 1/(double)ncol;
319
+	}
320
+	// create and initialize q with the same size of p and with all elements initialized as 0
321
+	//Matrix_Double q(nrow, ncol, 0);
322
+	//cerr << "iter 0\t" << objective_em_supervise(p, theta) << endl;
323
+	double v1, v2;
324
+	for (int iter=0; iter<max_iter; iter++) {
325
+	  Rcpp::Rcerr << iter+1 << "," ;
326
+		// E-step: estimate q
327
+		for (int i=0; i<nrow; i++) {
328
+			double sum = 0;
329
+			for (int j=0; j<ncol; j++) {
330
+				p.get_element(i,j,v1);
331
+				v2 = theta[j]*v1;
332
+				q.set_element( i, j, v2 );
333
+				sum += v2;
334
+			}
335
+			for (int j=0; j<ncol; j++) {
336
+				q.get_element(i,j,v2);
337
+				v2 /= sum;
338
+				q.set_element( i, j, v2 );
339
+			}
340
+		}
341
+		// M-step: estimate theta
342
+		for (int j=0; j<ncol; j++) {
343
+			double sum=0;
344
+			for (int i=0; i<nrow; i++) {
345
+				q.get_element(i,j,v2);
346
+				sum += v2;
347
+			}
348
+			theta[j] = sum / nrow;
349
+		}
350
+		// for debug
351
+		//cerr << "iter " << (iter+1) << "\t" << objective_em_supervise(p, theta) << endl;
352
+	}
353
+	// last E-step: estimate q
354
+	for (int i=0; i<nrow; i++) {
355
+		double sum = 0;
356
+		for (int j=0; j<ncol; j++) {
357
+			p.get_element(i,j,v1);
358
+			v2 = theta[j]*v1;
359
+			q.set_element( i, j, v2 );
360
+			sum += v2;
361
+		}
362
+		for (int j=0; j<ncol; j++) {
363
+			q.get_element(i,j,v2);
364
+			v2 /= sum;
365
+			q.set_element( i, j, v2 );
366
+		}
367
+	}
368
+	Rcpp::Rcerr << endl;
369
+}
370
+
371
+//
372
+// There are T known tissues and 1 unknown tissue (described by a double vector "m")
373
+//
374
+// input:
375
+//   p is a matrix of N X T, where N is number of reads and T is number of known tissue
376
+//     p.row_label is an int vector of N X 1, each element is the marker Id (1-base) of a read.
377
+//   Rm is a vector of N X 1, each element is number of valid methylated CpG sites in a read
378
+//   Rl is a vector of N X 1, each element is number of all valid CpG sites in a read
379
+//
380
+// output:
381
+//   theta is a vector of (T+1) X 1, where T is number of known tissue. This vector is already allocated with space of (ncol+1) units.
382
+//   m is a vector of M X 1, where M is number of markers.
383
+//   q is a matrix of N X T, the tissue-specific posterior probabilty of each read
384
+//   q_unknown is a vector of N X 1, the posterior probabilty of each read for the unknown class. It does not need to be allocated space before this function calling.
385
+//
386
+void em_semisupervise(Matrix_Double & p, vector<int> & Rm, vector<int> & Rl,
387
+	int max_iter, vector<double> & theta, Matrix_Double& q, vector<double>& q_unknown,
388
+	vector<double> & m)
389
+{
390
+	// cout.precision(15);
391
+  // cerr.precision(15);
392
+	unsigned int ncol = p.get_column_num(); // T, number of known tissues
393
+	unsigned int nrow = p.get_row_num(); // N, number of reads
394
+
395
+	theta.resize(ncol+1, 0); // assign (num_tissues+1) space and initialize to 0s.
396
+
397
+	vector<int> uniq_marker_Ids;
398
+	p.get_unique_row_labels(uniq_marker_Ids);
399
+	int nMarker = uniq_marker_Ids.size(); // M, number of markers that all N reads cover. Some markers may not be covered by these N reads.
400
+	m.resize(nMarker, 0); // assign nMarker space and initialize to 0s.
401
+	map<int,int> markerId2Index; // marker index is 0-based.
402
+	for (int index=0; index<nMarker; index++) {
403
+		markerId2Index.insert(make_pair(uniq_marker_Ids[index],index));
404
+	}
405
+
406
+	// initialize model parameters theta as uniform distribution, and m as 0.5
407
+	// alternatively, theta and m can be random numbers, by satisfying:
408
+	//    (1) \sum_{i=1}^{ncol+1}{theta_i}=1
409
+	//    (2) 0 <= m_k <=1 for all k=1,2,...,#marker_covered_by_all_reads
410
+	for (int j=0; j<ncol+1; j++) {
411
+		theta[j] = 1/(double)(ncol+1);
412
+	}
413
+	for (int k=0; k<nMarker; k++) {
414
+		m[k] = 0.5;
415
+	}
416
+	for (int i=0; i<nrow; i++)
417
+		q_unknown.push_back(0);
418
+
419
+	// EM algorithm
420
+	for (int iter=0; iter<max_iter; iter++) {
421
+	  Rcpp::Rcerr << iter+1 << "," ;
422
+		// E-step: estimate q (for T known tissues) and q_unknown (for one unknown tissue)
423
+		double v1, v2, likelihood_unknown_class;
424
+		int markerId, markerIndex;
425
+		for (int i=0; i<nrow; i++) {
426
+			// process the first T known tissues
427
+			double sum = 0;
428
+			for (int j=0; j<ncol; j++) {
429
+				p.get_element(i,j,v1);
430
+				v2 = theta[j]*v1;
431
+				q.set_element( i, j, v2 );
432
+				sum += v2;
433
+			}
434
+			// process the last unknown tissue
435
+			p.get_row_label(i, markerId); // get markerId (1-base) of read i
436
+			markerIndex = markerId2Index[markerId];
437
+			likelihood_unknown_class = pow(m[markerIndex],Rm[i]) * pow(1-m[markerIndex],Rl[i]-Rm[i]);
438
+			q_unknown[i] = theta[ncol]*likelihood_unknown_class;
439
+			sum += q_unknown[i];
440
+
441
+			// update q and q_unknown
442
+			for (int j=0; j<ncol; j++) {
443
+				q.get_element(i,j,v2);
444
+				v2 /= sum;
445
+				q.set_element( i, j, v2 );
446
+			}
447
+			q_unknown[i] /= sum;
448
+		}
449
+
450
+		// M-step 1: estimate unknown class's methylation level (m) of each marker
451
+		double sum1=0, sum2=0;
452
+		int curr_marker_index=-1, prev_marker_index=-1; // all marker index is 0-base
453
+		for (int i=0; i<nrow; i++) {
454
+			// for each row (or read)
455
+			p.get_row_label(i, markerId); // get the marker_index (0-base) of read i
456
+			curr_marker_index = markerId2Index[markerId];
457
+			if (curr_marker_index != prev_marker_index) {
458
+				// this is the 1st read of a new marker
459
+				// we need to summarize m value of previous marker
460
+				if (prev_marker_index!=-1) { // current marker is not the 1st marker
461
+					m[prev_marker_index] = (sum2!=0 ? sum1/sum2 : 0);
462
+					sum1 = 0;
463
+					sum2 = 0;
464
+				}
465
+				prev_marker_index = curr_marker_index;
466
+			}
467
+			sum1 += q_unknown[i]*Rm[i];
468
+			sum2 += q_unknown[i]*Rl[i];
469
+		}
470
+		m[curr_marker_index] = (sum2!=0 ? sum1/sum2 : 0); // estimate m of the last marker
471
+
472
+		//cout << "iter=" << iter << ", " << "m=" << endl; // for debug
473
+		//print_vec(cout, m, "\n"); // for debug
474
+		//cout << endl; // for debug
475
+
476
+		// M-step 2: estimate theta, which has ncol+1 values
477
+		double sum;
478
+		for (int j=0; j<ncol; j++) {
479
+			sum=0;
480
+			for (int i=0; i<nrow; i++) {
481
+				q.get_element(i,j,v2);
482
+				sum += v2;
483
+			}
484
+			theta[j] = sum / nrow;
485
+		}
486
+		sum=0;
487
+		for (int i=0; i<nrow; i++)
488
+			sum += q_unknown[i];
489
+		theta[ncol] = sum / nrow;
490
+
491
+		//cout << "round " << iter+1 << ", theta="; // for debug
492
+		//print_vec(cout, theta, ", "); // for debug
493
+		//cout << endl << endl; // for debug
494
+	}
495
+	Rcpp::Rcerr << endl;
496
+	//cout << "final:" << endl << std::flush;
497
+	//print_vec(cout, theta, ", "); // for debug
498
+	//cout << endl << endl << std::flush; // for debug
499
+}
500
+
501
+void readCounts_by_reads_posterior_probability_version_regular(Matrix_Double& q, double unit, Matrix_Double& readCounts)
502
+{
503
+	unsigned int ncol = q.get_column_num(); // T, number of known tissues
504
+	unsigned int nrow = q.get_row_num(); // N, number of reads
505
+
506
+	vector<int> uniq_marker_Ids;
507
+	q.get_unique_row_labels(uniq_marker_Ids);
508
+	// int nMarker = uniq_marker_Ids.size(); // M, number of markers that all N reads cover. Some markers may not be covered by these N reads.
509
+
510
+	double v;
511
+	int curr_markerId=-1, prev_markerId=-1; // all marker index is 0-base
512
+	vector<double> readCountsPerMarker(ncol, 0); // a vector of read counts for one marker (T elements) with default values 0
513
+	for (int i=0; i<nrow; i++) {
514
+		// for each row (or read)
515
+		q.get_row_label(i, curr_markerId); // get the marker ID of read i
516
+		if (curr_markerId != prev_markerId) {
517
+			// this is the 1st read of a new marker
518
+			// we need to summarize read counts of each tissue for the previous marker and deposit them to the matrix "readCounts"
519
+			// then clear readCountsPerMarker for a new read counting of the next new marker
520
+			if (prev_markerId!=-1) { // current marker is not the 1st marker
521
+				multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit.
522
+				readCounts.append_row_vector(readCountsPerMarker, prev_markerId);
523
+				assign_vector_zeros(readCountsPerMarker);
524
+			}
525
+			prev_markerId = curr_markerId;
526
+		}
527
+		for (int j=0; j<ncol; j++) {
528
+			q.get_element(i,j,v);
529
+			readCountsPerMarker[j] += v;
530
+		}
531
+	}
532
+	multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit.
533
+	readCounts.append_row_vector(readCountsPerMarker, curr_markerId);
534
+}
535
+
536
+void readCounts_by_reads_posterior_probability_version_unknownclass(Matrix_Double& q, vector<double>& q_unknown, double unit, Matrix_Double& readCounts)
537
+{
538
+	unsigned int ncol = q.get_column_num(); // T, number of known tissues
539
+	unsigned int nrow = q.get_row_num(); // N, number of reads
540
+	if (nrow!=(unsigned int)q_unknown.size()) {
541
+	  Rcpp::Rcerr << "Error (readCounts_by_reads_posterior_probability_version_unknownclass): row number of q_unknown does not match with row number of q!" << endl;
542
+		// exit(EXIT_FAILURE);
543
+	}
544
+
545
+	vector<int> uniq_marker_Ids;
546
+	q.get_unique_row_labels(uniq_marker_Ids);
547
+	// int nMarker = uniq_marker_Ids.size(); // M, number of markers that all N reads cover. Some markers may not be covered by these N reads.
548
+
549
+	double v;
550
+	int curr_markerId=-1, prev_markerId=-1; // all marker index is 0-base
551
+	vector<double> readCountsPerMarker(ncol+1, 0); // a vector of read counts for one marker (T+1 elements) with default values 0. The last element is for the unknown class.
552
+	for (int i=0; i<nrow; i++) {
553
+		// for each row (or read)
554
+		q.get_row_label(i, curr_markerId); // get the marker ID of read i
555
+		if (curr_markerId != prev_markerId) {
556
+			// this is the 1st read of a new marker
557
+			// we need to summarize read counts of each tissue for the previous marker and deposit them to the matrix "readCounts"
558
+			// then clear readCountsPerMarker for a new read counting of the next new marker
559
+			if (prev_markerId!=-1) { // current marker is not the 1st marker
560
+				multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit.
561
+				readCounts.append_row_vector(readCountsPerMarker, prev_markerId);
562
+				assign_vector_zeros(readCountsPerMarker);
563
+			}
564
+			prev_markerId = curr_markerId;
565
+		}
566
+		for (int j=0; j<ncol; j++) {
567
+			q.get_element(i,j,v);
568
+			readCountsPerMarker[j] += v;
569
+		}
570
+		readCountsPerMarker[ncol] += q_unknown[i];
571
+	}
572
+	multi_vec_by_number(readCountsPerMarker, unit); // normalize the raw read counts by unit.
573
+	readCounts.append_row_vector(readCountsPerMarker, curr_markerId);
574
+}
575
+