Thanks to visit codestin.com
Credit goes to code.bioconductor.org

// [[Rcpp::depends(BH)]]
#include <Rcpp.h>
using namespace Rcpp;

#include <cstdlib>
#include <vector>
#include <map>
#include <string>
#include <climits>
#include <iostream>
#include <fstream>
#include <sstream>
#include <cassert>
#include <boost/algorithm/string.hpp>
#include <boost/assign.hpp>
#include <boost/iostreams/filtering_stream.hpp>
#include "getopt.h"
#include "utils.h"
#include "data_types.h"
#include "matrix.h"

using namespace std;
using namespace boost;

#include "template_utils.cpp"

//#define DEBUG

// Global variables
string tissue_markers_file="";
int num_tissues=0;
string reads_binning_file; //="stdin"; // default is "stdin"
string output_file="stdout"; // default is "stdout"
string output_type="tissueFraction"; // default is "tissueFraction"
double min_likelihood_ratio_cutoff=-1.0; // default is -1.0, which means "No reads filtering with likelihood ratio cutoff"
string em_algorithm_type="em.global.unknown"; // default is "the EM algorithm with unknown class", whose short name: "em.global.unknown". Other option is "em_known"
int em_max_iterations=100; // default is 100 iterations

/*-----------------------------------
command line parser's data structure
-----------------------------------*/
struct option longopts[] =
{
	/* name,                      has_arg,          flag, val */ /* longind */
	{ "help",                           no_argument,       0,   'h' }, /*       0 */
	{ "reads_binning_file",             required_argument, 0,   'r' }, /*       2 */
	{ "tissue_markers_file",            required_argument, 0,   't' }, /*       1 */
	{ "num_tissues",                    required_argument, 0,   'T' }, /*       1 */
	{ "min_likelihood_ratio_cutoff",    required_argument, 0,   'p' }, /*       1 */
	{ "em_algorithm",                   required_argument, 0,   'a' }, /*       3 */
	{ "em_max_iterations",              required_argument, 0,   'x' }, /*       3 */
	{ "output_file",                    required_argument, 0,   'o' }, /*       3 */
	{ "output_type",                    required_argument, 0,   'O' }, /*       3 */
	/* end-of-list marker */
	{ 0, 0, 0, 0 }
};

// char *shortopts = "hr:t:T:a:p:x:o:O:"; /* short options string */
int longind = 0; /* long option list index */

void print_usage() {
  Rcpp::Rcerr << "Calculate tissue-specific likelihood for each sequencing read and perform EM algorithm on all reads to obtain model parameters, then compute and output the tissue-specific read counts for each marker" << endl << endl;
  Rcpp::Rcerr  << "USAGE: tissue_deconv [options]" << endl << endl;
  Rcpp::Rcerr  << "   Options:" << endl;
  Rcpp::Rcerr  << "     -a [string]:  read-based tissue deconvolution EM algorithm type: em.global.unknown (default), em.global.known, em.local.unknown, em.local.known." << endl;
  Rcpp::Rcerr  << "     -r [FILE]:  reads binning file (default: stdin)" << endl;
  Rcpp::Rcerr  << "     -o [FILE]:  output file (default: stdout)" << endl;
  Rcpp::Rcerr  << "     -O [FILE]:  output type tissueFraction (default), tissueFraction+readCountRaw, tissueFraction+readCountPerMillion, tissueFraction+readCountPerBillion" << endl;
  Rcpp::Rcerr  << "     -T [positive integer]: number of tissue types." << endl;
  Rcpp::Rcerr  << "     -p [positive real value]: all reads with likelihood ratio < cutoff (default: -1.0, meanin no reads filtering is used) will not be used. Likelihood ratio is the max(all tissues' likelihoods)/min(all tissues' likelihoods)" << endl;
  Rcpp::Rcerr  << "     -t [FILE]:  tissue markers file: each marker for each tissue type has either a median beta value or a paired shape parameters of beta-distribution from the tissue population of tissue types." << endl;
  Rcpp::Rcerr  << "     -x [positive integer]:  EM algorithm maximum iteration number: 100 (default)." << endl;
  Rcpp::Rcerr  << endl;
    //cerr << "version 1.0, January 22, 2020" << endl;
    //cerr << "By Wenyuan Li" << endl;
  Rcpp::Rcerr  << endl;
}

void parse_command_line(string reads_binning_file, int num_tissues, double min_likelihood_ratio_cutoff, string tissue_markers_file,
                        string em_algorithm_type, string output_file, string output_type, int em_max_iterations)
{

	if (output_type!="tissueFraction" && output_type!="tissueFraction+readCountRaw" && output_type!="tissueFraction+readCountPerMillion" && output_type!="tissueFraction+readCountPerBillion") {
	  Rcpp::Rcerr  << endl << "Error: output types should take one of four values: tissueFraction, tissueFraction+readCountRaw, tissueFraction+readCountPerMillion and tissueFraction+readCountPerBillion!" << endl << endl;
		print_usage();
		// exit(EXIT_FAILURE);
	}
	if (em_algorithm_type.compare("em.global.known")!=0 && em_algorithm_type.compare("em.global.unknown")!=0 && em_algorithm_type.compare("em.local.unknown")!=0 && em_algorithm_type.compare("em.local.unknown")!=0) {
	  Rcpp::Rcerr  << endl << "Error: EM algorithm types should take one of three values: em.global.known, em.global.unknown, em.local.known, em.local.unknown!" << endl << endl;
		print_usage();
		// exit(EXIT_FAILURE);
	}
	if (reads_binning_file.empty()) {
	  Rcpp::Rcerr  << endl << "Error: input reads binning file is required!" << endl << endl;
		print_usage();
		// exit(EXIT_FAILURE);
	}
	if (num_tissues<=0) {
	  Rcpp::Rcerr  << endl << "Error: number of tissue types must be >1!" << endl << endl;
		print_usage();
	}
	if (tissue_markers_file.empty()) {
	  Rcpp::Rcerr  << endl << "Error: input file of tissue markers is required!" << endl << endl;
		print_usage();
		// exit(EXIT_FAILURE);
	}
}

// [[Rcpp::export]]
void read_deconvolution_cpp(std::string reads_binning_file, int num_tissues, double min_likelihood_ratio_cutoff, std::string tissue_markers_file,
         std::string em_algorithm_type, std::string output_file, std::string output_type, int em_max_iterations=100, int random_seed=0) {

	parse_command_line(reads_binning_file, num_tissues, min_likelihood_ratio_cutoff, tissue_markers_file,
                    em_algorithm_type, output_file, output_type, em_max_iterations);

//   Rcpp::Rcerr  << "fragment-level methylation states file: " << reads_binning_file << endl;
//   Rcpp::Rcerr  << "marker file: " << tissue_markers_file << endl;
//   Rcpp::Rcerr  << "number of sample types: " << num_tissues << endl;
// 	if (min_likelihood_ratio_cutoff==-1.0)
// 	  Rcpp::Rcerr  << "No likilihood-ratio-based reads filtering" << endl;
// 	else
// 	  Rcpp::Rcerr  << "min likelihood ratio cutoff (for each read): " << min_likelihood_ratio_cutoff << endl;
// 	Rcpp::Rcerr  << "read deconvolution algorithm: " << em_algorithm_type << endl;
// 	Rcpp::Rcerr  << "EM max iterations: " << em_max_iterations << endl;
// 	Rcpp::Rcerr  << "output type: " << output_type << endl;
// 	Rcpp::Rcerr  << "output file: " << output_file << endl;
// 	Rcpp::Rcerr  << endl;

	// Rcpp::Rcerr  << "reading " << tissue_markers_file << " ..." << endl;
	string fileext_gzip = ".gz";
	Bins2Values marker2beta;
	vector<string> tissue_names;
	if (str_ends_with(tissue_markers_file,fileext_gzip)) {
		// read_tissue_markers_gz_file(tissue_markers_file, 2, num_tissues, marker2beta, tissue_names);
		Rcpp::Rcout << "Input has to be a txt file" << endl;
	} else {
		read_tissue_markers_txt_file(tissue_markers_file, 2, num_tissues, marker2beta, tissue_names);
	}
	//string debug_file="./debug.single_value.tissue_markers.txt";
	//write_Bins2Values(marker2beta, tissue_names, debug_file, false);
	//exit(EXIT_SUCCESS);

	// Rcpp::Rcerr  << "calculating reads likelihoods ..." << endl;
	Matrix_Double reads_likelihoods((unsigned int)num_tissues);
	Bins2UnsignedIntegers marker2rowindexes;
	Bins2Value marker2ambiguousreadcounts;
	vector<int> Rm, Rl;
	unsigned long num_total_reads;
	if (str_ends_with(reads_binning_file,fileext_gzip)) {
		// num_total_reads = calc_read_probability_by_marker2beta_from_reads_binning_gzip_file(reads_binning_file, marker2beta, reads_likelihoods, marker2rowindexes, marker2ambiguousreadcounts, Rm, Rl, min_likelihood_ratio_cutoff);
		Rcpp::Rcout << "Input has to be a txt file" << endl;
	} else {
		num_total_reads = calc_read_probability_by_marker2beta_from_reads_binning_text_file(reads_binning_file, marker2beta, reads_likelihoods, marker2rowindexes, marker2ambiguousreadcounts, Rm, Rl, min_likelihood_ratio_cutoff);
	}
	int num_reads_non_ambiguous = (int)reads_likelihoods.get_row_num();
	int num_reads_ambiguous = 0;
	Bins2Value::iterator m2amb_iter;
	for (m2amb_iter=marker2ambiguousreadcounts.begin(); m2amb_iter!=marker2ambiguousreadcounts.end(); m2amb_iter++) {
		num_reads_ambiguous += (int)(m2amb_iter->second);
	}
	// Rcpp::Rcerr  << "#reads_total: " << num_total_reads << endl;
	// Rcpp::Rcerr  << "#reads_covering_markers (non-ambiguous): " << num_reads_non_ambiguous << endl;
	// Rcpp::Rcerr  << "#reads_covering_markers (ambiguous): " << num_reads_ambiguous << endl;

	//////////////////////////////
	////// begin for debug
#ifdef DEBUG
	Rcpp::Rcout << "reads likelihoods: " << endl;
	Rcpp::Rcout << reads_likelihoods << endl;
	Rcpp::Rcout << "marker2ambiguousreadcounts: " << endl;
	Rcpp::Rcout << marker2ambiguousreadcounts << endl;
	//for (int z=0; z<Rm.size(); z++) {
		//cerr << Rm[z] << "\t" << Rl[z] << endl;
	//}
	//cout << endl;
	Rcpp::Rcout << "marker2rowindexes: " << endl;
	print_Bins2UnsignedIntegers(marker2rowindexes);
	Rcpp::Rcout << "c++_done" << endl;
#endif
	//// end for debug
	//////////////////////////////

	// Rcpp::Rcerr  << "perform " << em_algorithm_type << " ..." << endl;
	vector<double> theta;
	//cout << "debug before em_type.compare" << endl << flush;
	if (em_algorithm_type.compare("em.global.known")==0) {
		// create and initialize q with the same size of p and with all elements initialized as 0
		Matrix_Double q(num_reads_non_ambiguous, num_tissues, 0);
		// estimate tissue fractions (theta) and tissue-specific posterior probability matrix (q)
		double obj = em_supervise(reads_likelihoods, em_max_iterations, theta, q, random_seed);

		// output
		if (output_type=="tissueFraction") {
			ofstream fout(output_file.c_str());
			if (fout.is_open()) {
				print_vec(fout, tissue_names, "\t");
				fout << endl;
				fout.precision(6);
				print_vec(fout, theta, "\t");
				fout << endl; ////////RH
				// fout << "obj: " << obj << endl;
				// fout << endl << num_reads_ambiguous << endl;
				// fout << endl << "#reads_total: " << num_total_reads << "\t#reads_covering_markers(non_ambiguous): " << num_reads_non_ambiguous << endl << "\t#reads_covering_markers(ambiguous): " << num_reads_ambiguous << endl;
				fout.close();
			}
		} else if (output_type.find("readCount")!=std::string::npos) {
		  // Rcpp::Rcerr  << "Count reads of each marker by reads posterior probabilities: " << output_type << " ..." << endl;
			double unit = 1e6 / num_total_reads; // default unit is readCountPerMillion
			if (output_type.find("readCountPerMillion")!=std::string::npos) {
				unit = 1e6 / num_total_reads;
			  // Rcpp::Rcerr  << "   unit: 1e6 / #total_reads = " << unit << endl;
			} else if (output_type.find("readCountPerBillion")!=std::string::npos) {
				unit = 1e9 / num_total_reads;
			  // Rcpp::Rcerr  << "   unit: 1e9 / #total_reads = " << unit << endl;
			} else if (output_type.find("readCountRaw")!=std::string::npos) {
				unit = 1.0;
			  // Rcpp::Rcerr  << "   unit: " << unit << endl;
			}
			q.set_row_labels(reads_likelihoods.get_row_labels());

			Matrix_Double readCounts(num_tissues);
			readCounts_by_reads_posterior_probability_version_regular(q, unit, readCounts);
			ofstream fout(output_file.c_str());
			if (fout.is_open()) {
				print_vec(fout, tissue_names, "\t");
				fout << endl;
				fout.precision(6);
				print_vec(fout, theta, "\t");
				// fout << endl << "#reads_total: " << num_total_reads << "\t#reads_covering_markers(non_ambiguous): " << num_reads_non_ambiguous << "\t#reads_covering_markers(ambiguous): " << num_reads_ambiguous << "\tunit: " << unit << endl;
				fout << readCounts;
				fout.close();
			}
		}
		//////////////////////////////
		////// begin for debug
		// Rcpp::Rcerr  << "tissue fractions: ";
		// print_vec(Rcpp::Rcerr, theta, ", ");
		// Rcpp::Rcerr  << endl;
		////// end for debug
		//////////////////////////////
	}
	if (em_algorithm_type.compare("em.global.unknown")==0) {
		// create and initialize q with the same size of p and with all elements initialized as 0
		Matrix_Double q(num_reads_non_ambiguous, num_tissues, 0);
		// estimate tissue fractions (theta) and tissue-specific posterior probability matrix (q)
		double obj = em_supervise(reads_likelihoods, em_max_iterations, theta, q, random_seed);
		// adjust 'theta' and 'q' by "num_reads_non_ambiguous/(num_reads_ambiguous + num_reads_non_ambiguous)", because we consider 'theta_unknown'
		double theta_unknown = (double)num_reads_ambiguous/(num_reads_ambiguous + num_reads_non_ambiguous);
		double tmp = (double)num_reads_non_ambiguous/(num_reads_ambiguous + num_reads_non_ambiguous);
		for (int i=0; i<theta.size(); i++) {
			theta[i] *= tmp;
		}
		q.multipy_single_value(tmp);
		// output
		if (output_type=="tissueFraction") {
			ofstream fout(output_file.c_str());
			if (fout.is_open()) {
				print_vec(fout, tissue_names, "\t");
				fout << "\tunknown" << endl;
				fout.precision(6);
				print_vec(fout, theta, "\t");
				fout << "\t" << theta_unknown << endl;
				// fout << "obj: " << obj << endl;
				// fout << endl;
				// fout << endl << num_reads_ambiguous << endl;
				// fout << endl << "#total_reads: " << num_total_reads << "\t#reads_covering_markers(non_ambiguous): " << num_reads_non_ambiguous << endl << "\t#reads_covering_markers(ambiguous): " << num_reads_ambiguous << endl;
				fout.close();
			}
		} else if (output_type.find("readCount")!=std::string::npos) {
		  // Rcpp::Rcerr  << "Count reads of each marker by reads posterior probabilities: " << output_type << " ..." << endl;
			double unit = 1e6 / num_total_reads; // default unit is readCountPerMillion
			if (output_type.find("readCountPerMillion")!=std::string::npos) {
				unit = 1e6 / num_total_reads;
			  // Rcpp::Rcerr  << "   unit: 1e6 / #total_reads = " << unit << endl;
			} else if (output_type.find("readCountPerBillion")!=std::string::npos) {
				unit = 1e9 / num_total_reads;
				//unit = 1.0;  // for debug only
				// Rcpp::Rcerr  << "   unit: 1e9 / #total_reads = " << unit << endl;
			} else if (output_type.find("readCountRaw")!=std::string::npos) {
				unit = 1.0;
			  // Rcpp::Rcerr  << "   unit: " << unit << endl;
			}
			q.set_row_labels(reads_likelihoods.get_row_labels());

			Matrix_Double readCounts(num_tissues); // the last column is for the unknown class.
			readCounts_by_reads_posterior_probability_version_regular(q, unit, readCounts); // collapse from reads to markers
			//tmp = (double)num_reads_ambiguous/(num_reads_ambiguous + num_reads_non_ambiguous);
			for (m2amb_iter=marker2ambiguousreadcounts.begin(); m2amb_iter!=marker2ambiguousreadcounts.end(); m2amb_iter++) {
				m2amb_iter->second *= theta_unknown * unit;
			}
			ofstream fout(output_file.c_str());
			if (fout.is_open()) {
				print_vec(fout, tissue_names, "\t");
				fout << "\tunknown" << endl;
				fout.precision(6);
				print_vec(fout, theta, "\t");
				fout << "\t" << theta_unknown << endl;
				// fout << "#reads_total: " << num_total_reads << "\t#reads_covering_markers(non_ambiguous): " << num_reads_non_ambiguous << "\t#reads_covering_markers(ambiguous): " << num_reads_ambiguous << "\tunit: " << unit << endl;
				readCounts.print_with_additional_column_of_Bins2Value(fout, marker2ambiguousreadcounts);
				//fout << readCounts;
				fout.close();
			}
		}
		//////////////////////////////
		////// begin for debug
		// Rcpp::Rcerr  << "tissue fractions: ";
		// print_vec(Rcpp::Rcerr , theta, ", ");
		// Rcpp::Rcerr << ", " << theta_unknown << endl;
		////// end for debug
		//////////////////////////////
	}
}