Thanks to visit codestin.com
Credit goes to code.bioconductor.org

src/read_deconvolution.cpp
536fd2e8
 // [[Rcpp::depends(BH)]]
 #include <Rcpp.h>
 using namespace Rcpp;
 
 #include <cstdlib>
 #include <vector>
 #include <map>
 #include <string>
 #include <climits>
 #include <iostream>
 #include <fstream>
 #include <sstream>
 #include <cassert>
 #include <boost/algorithm/string.hpp>
 #include <boost/assign.hpp>
 #include <boost/iostreams/filtering_stream.hpp>
 #include "getopt.h"
 #include "utils.h"
 #include "data_types.h"
 #include "matrix.h"
 
 using namespace std;
 using namespace boost;
 
 #include "template_utils.cpp"
 
 //#define DEBUG
 
 // Global variables
 string tissue_markers_file="";
 int num_tissues=0;
 string reads_binning_file; //="stdin"; // default is "stdin"
 string output_file="stdout"; // default is "stdout"
 string output_type="tissueFraction"; // default is "tissueFraction"
 double min_likelihood_ratio_cutoff=-1.0; // default is -1.0, which means "No reads filtering with likelihood ratio cutoff"
 string em_algorithm_type="em.global.unknown"; // default is "the EM algorithm with unknown class", whose short name: "em.global.unknown". Other option is "em_known"
 int em_max_iterations=100; // default is 100 iterations
 
 /*-----------------------------------
 command line parser's data structure
 -----------------------------------*/
 struct option longopts[] =
 {
 	/* name,                      has_arg,          flag, val */ /* longind */
 	{ "help",                           no_argument,       0,   'h' }, /*       0 */
 	{ "reads_binning_file",             required_argument, 0,   'r' }, /*       2 */
 	{ "tissue_markers_file",            required_argument, 0,   't' }, /*       1 */
 	{ "num_tissues",                    required_argument, 0,   'T' }, /*       1 */
 	{ "min_likelihood_ratio_cutoff",    required_argument, 0,   'p' }, /*       1 */
 	{ "em_algorithm",                   required_argument, 0,   'a' }, /*       3 */
 	{ "em_max_iterations",              required_argument, 0,   'x' }, /*       3 */
 	{ "output_file",                    required_argument, 0,   'o' }, /*       3 */
 	{ "output_type",                    required_argument, 0,   'O' }, /*       3 */
 	/* end-of-list marker */
 	{ 0, 0, 0, 0 }
 };
 
 // char *shortopts = "hr:t:T:a:p:x:o:O:"; /* short options string */
 int longind = 0; /* long option list index */
 
 void print_usage() {
   Rcpp::Rcerr << "Calculate tissue-specific likelihood for each sequencing read and perform EM algorithm on all reads to obtain model parameters, then compute and output the tissue-specific read counts for each marker" << endl << endl;
   Rcpp::Rcerr  << "USAGE: tissue_deconv [options]" << endl << endl;
   Rcpp::Rcerr  << "   Options:" << endl;
   Rcpp::Rcerr  << "     -a [string]:  read-based tissue deconvolution EM algorithm type: em.global.unknown (default), em.global.known, em.local.unknown, em.local.known." << endl;
   Rcpp::Rcerr  << "     -r [FILE]:  reads binning file (default: stdin)" << endl;
   Rcpp::Rcerr  << "     -o [FILE]:  output file (default: stdout)" << endl;
   Rcpp::Rcerr  << "     -O [FILE]:  output type tissueFraction (default), tissueFraction+readCountRaw, tissueFraction+readCountPerMillion, tissueFraction+readCountPerBillion" << endl;
   Rcpp::Rcerr  << "     -T [positive integer]: number of tissue types." << endl;
   Rcpp::Rcerr  << "     -p [positive real value]: all reads with likelihood ratio < cutoff (default: -1.0, meanin no reads filtering is used) will not be used. Likelihood ratio is the max(all tissues' likelihoods)/min(all tissues' likelihoods)" << endl;
   Rcpp::Rcerr  << "     -t [FILE]:  tissue markers file: each marker for each tissue type has either a median beta value or a paired shape parameters of beta-distribution from the tissue population of tissue types." << endl;
   Rcpp::Rcerr  << "     -x [positive integer]:  EM algorithm maximum iteration number: 100 (default)." << endl;
   Rcpp::Rcerr  << endl;
     //cerr << "version 1.0, January 22, 2020" << endl;
     //cerr << "By Wenyuan Li" << endl;
   Rcpp::Rcerr  << endl;
 }
 
 void parse_command_line(string reads_binning_file, int num_tissues, double min_likelihood_ratio_cutoff, string tissue_markers_file,
                         string em_algorithm_type, string output_file, string output_type, int em_max_iterations)
 {
 
 	if (output_type!="tissueFraction" && output_type!="tissueFraction+readCountRaw" && output_type!="tissueFraction+readCountPerMillion" && output_type!="tissueFraction+readCountPerBillion") {
 	  Rcpp::Rcerr  << endl << "Error: output types should take one of four values: tissueFraction, tissueFraction+readCountRaw, tissueFraction+readCountPerMillion and tissueFraction+readCountPerBillion!" << endl << endl;
 		print_usage();
 		// exit(EXIT_FAILURE);
 	}
 	if (em_algorithm_type.compare("em.global.known")!=0 && em_algorithm_type.compare("em.global.unknown")!=0 && em_algorithm_type.compare("em.local.unknown")!=0 && em_algorithm_type.compare("em.local.unknown")!=0) {
 	  Rcpp::Rcerr  << endl << "Error: EM algorithm types should take one of three values: em.global.known, em.global.unknown, em.local.known, em.local.unknown!" << endl << endl;
 		print_usage();
 		// exit(EXIT_FAILURE);
 	}
 	if (reads_binning_file.empty()) {
 	  Rcpp::Rcerr  << endl << "Error: input reads binning file is required!" << endl << endl;
 		print_usage();
 		// exit(EXIT_FAILURE);
 	}
 	if (num_tissues<=0) {
 	  Rcpp::Rcerr  << endl << "Error: number of tissue types must be >1!" << endl << endl;
 		print_usage();
 	}
 	if (tissue_markers_file.empty()) {
 	  Rcpp::Rcerr  << endl << "Error: input file of tissue markers is required!" << endl << endl;
 		print_usage();
 		// exit(EXIT_FAILURE);
 	}
 }
 
 // [[Rcpp::export]]
 void read_deconvolution_cpp(std::string reads_binning_file, int num_tissues, double min_likelihood_ratio_cutoff, std::string tissue_markers_file,
433af17b
          std::string em_algorithm_type, std::string output_file, std::string output_type, int em_max_iterations=100, int random_seed=0) {
536fd2e8
 
 	parse_command_line(reads_binning_file, num_tissues, min_likelihood_ratio_cutoff, tissue_markers_file,
                     em_algorithm_type, output_file, output_type, em_max_iterations);
 
53b02cb7
 //   Rcpp::Rcerr  << "fragment-level methylation states file: " << reads_binning_file << endl;
 //   Rcpp::Rcerr  << "marker file: " << tissue_markers_file << endl;
 //   Rcpp::Rcerr  << "number of sample types: " << num_tissues << endl;
 // 	if (min_likelihood_ratio_cutoff==-1.0)
 // 	  Rcpp::Rcerr  << "No likilihood-ratio-based reads filtering" << endl;
 // 	else
 // 	  Rcpp::Rcerr  << "min likelihood ratio cutoff (for each read): " << min_likelihood_ratio_cutoff << endl;
 // 	Rcpp::Rcerr  << "read deconvolution algorithm: " << em_algorithm_type << endl;
 // 	Rcpp::Rcerr  << "EM max iterations: " << em_max_iterations << endl;
 // 	Rcpp::Rcerr  << "output type: " << output_type << endl;
 // 	Rcpp::Rcerr  << "output file: " << output_file << endl;
 // 	Rcpp::Rcerr  << endl;
536fd2e8
 
53b02cb7
 	// Rcpp::Rcerr  << "reading " << tissue_markers_file << " ..." << endl;
536fd2e8
 	string fileext_gzip = ".gz";
 	Bins2Values marker2beta;
 	vector<string> tissue_names;
 	if (str_ends_with(tissue_markers_file,fileext_gzip)) {
 		// read_tissue_markers_gz_file(tissue_markers_file, 2, num_tissues, marker2beta, tissue_names);
de1a1604
 		Rcpp::Rcout << "Input has to be a txt file" << endl;
536fd2e8
 	} else {
 		read_tissue_markers_txt_file(tissue_markers_file, 2, num_tissues, marker2beta, tissue_names);
 	}
 	//string debug_file="./debug.single_value.tissue_markers.txt";
 	//write_Bins2Values(marker2beta, tissue_names, debug_file, false);
 	//exit(EXIT_SUCCESS);
 
53b02cb7
 	// Rcpp::Rcerr  << "calculating reads likelihoods ..." << endl;
536fd2e8
 	Matrix_Double reads_likelihoods((unsigned int)num_tissues);
 	Bins2UnsignedIntegers marker2rowindexes;
 	Bins2Value marker2ambiguousreadcounts;
 	vector<int> Rm, Rl;
 	unsigned long num_total_reads;
 	if (str_ends_with(reads_binning_file,fileext_gzip)) {
 		// num_total_reads = calc_read_probability_by_marker2beta_from_reads_binning_gzip_file(reads_binning_file, marker2beta, reads_likelihoods, marker2rowindexes, marker2ambiguousreadcounts, Rm, Rl, min_likelihood_ratio_cutoff);
 		Rcpp::Rcout << "Input has to be a txt file" << endl;
 	} else {
 		num_total_reads = calc_read_probability_by_marker2beta_from_reads_binning_text_file(reads_binning_file, marker2beta, reads_likelihoods, marker2rowindexes, marker2ambiguousreadcounts, Rm, Rl, min_likelihood_ratio_cutoff);
 	}
 	int num_reads_non_ambiguous = (int)reads_likelihoods.get_row_num();
 	int num_reads_ambiguous = 0;
 	Bins2Value::iterator m2amb_iter;
 	for (m2amb_iter=marker2ambiguousreadcounts.begin(); m2amb_iter!=marker2ambiguousreadcounts.end(); m2amb_iter++) {
 		num_reads_ambiguous += (int)(m2amb_iter->second);
 	}
53b02cb7
 	// Rcpp::Rcerr  << "#reads_total: " << num_total_reads << endl;
 	// Rcpp::Rcerr  << "#reads_covering_markers (non-ambiguous): " << num_reads_non_ambiguous << endl;
 	// Rcpp::Rcerr  << "#reads_covering_markers (ambiguous): " << num_reads_ambiguous << endl;
536fd2e8
 
 	//////////////////////////////
 	////// begin for debug
 #ifdef DEBUG
 	Rcpp::Rcout << "reads likelihoods: " << endl;
 	Rcpp::Rcout << reads_likelihoods << endl;
 	Rcpp::Rcout << "marker2ambiguousreadcounts: " << endl;
 	Rcpp::Rcout << marker2ambiguousreadcounts << endl;
 	//for (int z=0; z<Rm.size(); z++) {
 		//cerr << Rm[z] << "\t" << Rl[z] << endl;
 	//}
 	//cout << endl;
 	Rcpp::Rcout << "marker2rowindexes: " << endl;
 	print_Bins2UnsignedIntegers(marker2rowindexes);
 	Rcpp::Rcout << "c++_done" << endl;
 #endif
 	//// end for debug
 	//////////////////////////////
 
53b02cb7
 	// Rcpp::Rcerr  << "perform " << em_algorithm_type << " ..." << endl;
536fd2e8
 	vector<double> theta;
 	//cout << "debug before em_type.compare" << endl << flush;
 	if (em_algorithm_type.compare("em.global.known")==0) {
 		// create and initialize q with the same size of p and with all elements initialized as 0
 		Matrix_Double q(num_reads_non_ambiguous, num_tissues, 0);
 		// estimate tissue fractions (theta) and tissue-specific posterior probability matrix (q)
3a37cb6e
 		double obj = em_supervise(reads_likelihoods, em_max_iterations, theta, q, random_seed);
536fd2e8
 
 		// output
 		if (output_type=="tissueFraction") {
 			ofstream fout(output_file.c_str());
 			if (fout.is_open()) {
53b02cb7
 				print_vec(fout, tissue_names, "\t");
536fd2e8
 				fout << endl;
 				fout.precision(6);
53b02cb7
 				print_vec(fout, theta, "\t");
433af17b
 				fout << endl; ////////RH
3a37cb6e
 				// fout << "obj: " << obj << endl;
53b02cb7
 				// fout << endl << num_reads_ambiguous << endl;
 				// fout << endl << "#reads_total: " << num_total_reads << "\t#reads_covering_markers(non_ambiguous): " << num_reads_non_ambiguous << endl << "\t#reads_covering_markers(ambiguous): " << num_reads_ambiguous << endl;
536fd2e8
 				fout.close();
 			}
 		} else if (output_type.find("readCount")!=std::string::npos) {
53b02cb7
 		  // Rcpp::Rcerr  << "Count reads of each marker by reads posterior probabilities: " << output_type << " ..." << endl;
536fd2e8
 			double unit = 1e6 / num_total_reads; // default unit is readCountPerMillion
 			if (output_type.find("readCountPerMillion")!=std::string::npos) {
 				unit = 1e6 / num_total_reads;
53b02cb7
 			  // Rcpp::Rcerr  << "   unit: 1e6 / #total_reads = " << unit << endl;
536fd2e8
 			} else if (output_type.find("readCountPerBillion")!=std::string::npos) {
 				unit = 1e9 / num_total_reads;
53b02cb7
 			  // Rcpp::Rcerr  << "   unit: 1e9 / #total_reads = " << unit << endl;
536fd2e8
 			} else if (output_type.find("readCountRaw")!=std::string::npos) {
 				unit = 1.0;
53b02cb7
 			  // Rcpp::Rcerr  << "   unit: " << unit << endl;
536fd2e8
 			}
 			q.set_row_labels(reads_likelihoods.get_row_labels());
 
 			Matrix_Double readCounts(num_tissues);
 			readCounts_by_reads_posterior_probability_version_regular(q, unit, readCounts);
 			ofstream fout(output_file.c_str());
 			if (fout.is_open()) {
53b02cb7
 				print_vec(fout, tissue_names, "\t");
536fd2e8
 				fout << endl;
 				fout.precision(6);
53b02cb7
 				print_vec(fout, theta, "\t");
 				// fout << endl << "#reads_total: " << num_total_reads << "\t#reads_covering_markers(non_ambiguous): " << num_reads_non_ambiguous << "\t#reads_covering_markers(ambiguous): " << num_reads_ambiguous << "\tunit: " << unit << endl;
536fd2e8
 				fout << readCounts;
 				fout.close();
 			}
 		}
 		//////////////////////////////
 		////// begin for debug
92af6841
 		// Rcpp::Rcerr  << "tissue fractions: ";
 		// print_vec(Rcpp::Rcerr, theta, ", ");
 		// Rcpp::Rcerr  << endl;
536fd2e8
 		////// end for debug
 		//////////////////////////////
 	}
 	if (em_algorithm_type.compare("em.global.unknown")==0) {
 		// create and initialize q with the same size of p and with all elements initialized as 0
 		Matrix_Double q(num_reads_non_ambiguous, num_tissues, 0);
 		// estimate tissue fractions (theta) and tissue-specific posterior probability matrix (q)
3a37cb6e
 		double obj = em_supervise(reads_likelihoods, em_max_iterations, theta, q, random_seed);
536fd2e8
 		// adjust 'theta' and 'q' by "num_reads_non_ambiguous/(num_reads_ambiguous + num_reads_non_ambiguous)", because we consider 'theta_unknown'
 		double theta_unknown = (double)num_reads_ambiguous/(num_reads_ambiguous + num_reads_non_ambiguous);
 		double tmp = (double)num_reads_non_ambiguous/(num_reads_ambiguous + num_reads_non_ambiguous);
 		for (int i=0; i<theta.size(); i++) {
 			theta[i] *= tmp;
 		}
 		q.multipy_single_value(tmp);
 		// output
 		if (output_type=="tissueFraction") {
 			ofstream fout(output_file.c_str());
 			if (fout.is_open()) {
53b02cb7
 				print_vec(fout, tissue_names, "\t");
536fd2e8
 				fout << "\tunknown" << endl;
 				fout.precision(6);
53b02cb7
 				print_vec(fout, theta, "\t");
 				fout << "\t" << theta_unknown << endl;
3a37cb6e
 				// fout << "obj: " << obj << endl;
53b02cb7
 				// fout << endl;
 				// fout << endl << num_reads_ambiguous << endl;
 				// fout << endl << "#total_reads: " << num_total_reads << "\t#reads_covering_markers(non_ambiguous): " << num_reads_non_ambiguous << endl << "\t#reads_covering_markers(ambiguous): " << num_reads_ambiguous << endl;
536fd2e8
 				fout.close();
 			}
 		} else if (output_type.find("readCount")!=std::string::npos) {
53b02cb7
 		  // Rcpp::Rcerr  << "Count reads of each marker by reads posterior probabilities: " << output_type << " ..." << endl;
536fd2e8
 			double unit = 1e6 / num_total_reads; // default unit is readCountPerMillion
 			if (output_type.find("readCountPerMillion")!=std::string::npos) {
 				unit = 1e6 / num_total_reads;
53b02cb7
 			  // Rcpp::Rcerr  << "   unit: 1e6 / #total_reads = " << unit << endl;
536fd2e8
 			} else if (output_type.find("readCountPerBillion")!=std::string::npos) {
 				unit = 1e9 / num_total_reads;
 				//unit = 1.0;  // for debug only
53b02cb7
 				// Rcpp::Rcerr  << "   unit: 1e9 / #total_reads = " << unit << endl;
536fd2e8
 			} else if (output_type.find("readCountRaw")!=std::string::npos) {
 				unit = 1.0;
53b02cb7
 			  // Rcpp::Rcerr  << "   unit: " << unit << endl;
536fd2e8
 			}
 			q.set_row_labels(reads_likelihoods.get_row_labels());
 
 			Matrix_Double readCounts(num_tissues); // the last column is for the unknown class.
 			readCounts_by_reads_posterior_probability_version_regular(q, unit, readCounts); // collapse from reads to markers
 			//tmp = (double)num_reads_ambiguous/(num_reads_ambiguous + num_reads_non_ambiguous);
 			for (m2amb_iter=marker2ambiguousreadcounts.begin(); m2amb_iter!=marker2ambiguousreadcounts.end(); m2amb_iter++) {
 				m2amb_iter->second *= theta_unknown * unit;
 			}
 			ofstream fout(output_file.c_str());
 			if (fout.is_open()) {
53b02cb7
 				print_vec(fout, tissue_names, "\t");
536fd2e8
 				fout << "\tunknown" << endl;
 				fout.precision(6);
53b02cb7
 				print_vec(fout, theta, "\t");
536fd2e8
 				fout << "\t" << theta_unknown << endl;
53b02cb7
 				// fout << "#reads_total: " << num_total_reads << "\t#reads_covering_markers(non_ambiguous): " << num_reads_non_ambiguous << "\t#reads_covering_markers(ambiguous): " << num_reads_ambiguous << "\tunit: " << unit << endl;
536fd2e8
 				readCounts.print_with_additional_column_of_Bins2Value(fout, marker2ambiguousreadcounts);
 				//fout << readCounts;
 				fout.close();
 			}
 		}
 		//////////////////////////////
 		////// begin for debug
53b02cb7
 		// Rcpp::Rcerr  << "tissue fractions: ";
 		// print_vec(Rcpp::Rcerr , theta, ", ");
 		// Rcpp::Rcerr << ", " << theta_unknown << endl;
536fd2e8
 		////// end for debug
 		//////////////////////////////
 	}
 }