/*
 * readSimulator.h
 *
 *  Created on: 21.05.2013
 *      Author: AlexanderDilthey
 */

#ifndef READSIMULATOR_H_
#define READSIMULATOR_H_

#include <assert.h>
#include <algorithm>
#include <map>
#include <ostream>
#include <string>
#include <utility>
#include <vector>
#include "../Graph/Graph.h"
#include "../Utilities.h"

/*
 * This class implements a simple-minded read simulator.
 * It reads in an empirical quality matrix such as the ones generated by readRecalibrator.cpp.
 * Conditional on expected haploid coverage and read length, the number of reads starting at each position of the reference string is ~ Poisson().
 * The difference in mate pair starting positions is assumed to be ~ Normal(mean, sd), with both parameters supplied by the caller.
 *
 * We walk along the chromosome, compute how many reads start at each position and where the mates are. For each pair, specified in terms of its starting positions, we
 * then generate x bases (where x = read length). At each base, we generate the number of reference positions we want to jump over (deletions in read) or how many new
 * non-ref bases we want to generate (insertions in read; with error). Usually, these numbers are 0. If 0, we copy (with error) a base from the reference chromosome, and
 * increase the position pointer specifying where we are in the reference chromosome by 1. We repeat until we have x bases.
 *
 * "With error" for the copying means: for each base, we have a position in the read. We identify the corresponding columns in the recalibration matrix and
 * draw a quality value according to the distribution of quality values conditional on position in read. Conditional on a selected quality value at a particular
 * position, we have an empirical estimate of the read base being correct. We use this probability in a Bernoulli trial to decide whether any particular printed base
 * should be equal to the underlying base - if not, we randomly generate a new base.
 *
 * This read simulator has a couple of obvious flaws. Insertion rates are simply estimated from deletion rates, whereas the underlying empirical recalibration matrix
 * counts insertions as new alleles at a position and thus as an element of the total allelic error at a position. Also, the process independently selects a quality
 * value at each position, and conditional on that independently selects base correctness. In reality, both processes are not independent along a read, and a Markov chain
 * might do a better job at capturing the interdependencies.
 *
 */

namespace simulator {

// Declaration only: the definition lives in a translation unit not shown here.
// Presumably the delimiter between the fields encoded in simulated read names - confirm at the definition.
extern std::string readName_field_separator;

/*
 * A single simulated read: its name, base sequence and per-base quality string,
 * plus bookkeeping vectors/strings that callers fill in after construction.
 */
class oneRead {
public:
	std::string name;
	std::string sequence;
	std::string quality;

	// Presumably one entry per read base, giving its origin in the source string / edge path - confirm against readSimulator.cpp.
	std::vector<int> coordinates_string;
	std::vector<int> coordinates_edgePath;

	std::string underlyingEdgeLabels;

	// "Full alignment" counterparts of the fields above; not touched by any code in this class.
	std::vector<int> fullAlignment_coordinates_edgePath;
	std::string fullAlignment_underlyingEdgeLabels;
	std::string fullAlignment_sequence;

	/*
	 * Builds a read from its name, bases and qualities. All remaining members start out empty.
	 * Sequence and quality string must have the same length (checked by assert, i.e. only in builds without NDEBUG).
	 */
	oneRead(std::string read_name, std::string read_sequence, std::string read_qualities)
		: name(read_name),
		  sequence(read_sequence),
		  quality(read_qualities)
	{
		// The members are copies of the arguments, so this is the same check as on the arguments themselves.
		assert(sequence.length() == quality.length());
	}

	/*
	 * Flips the read to the opposite strand: the sequence is replaced by the result of
	 * Utilities::seq_reverse_complement, and the quality string and both coordinate vectors are reversed.
	 * NOTE(review): name, underlyingEdgeLabels and all fullAlignment_* members are left as they are - confirm this is intended.
	 */
	void invert()
	{
		const std::string flipped = Utilities::seq_reverse_complement(sequence);
		sequence = flipped;

		std::reverse(coordinates_edgePath.begin(), coordinates_edgePath.end());
		std::reverse(coordinates_string.begin(), coordinates_string.end());
		std::reverse(quality.begin(), quality.end());
	}
};

/*
 * Two simulated reads that belong together (a mate pair), together with the
 * difference between their starting coordinates as passed to the constructor.
 */
class oneReadPair {
public:
	std::pair<oneRead, oneRead> reads;
	unsigned int diff_starting_coordinates;

	// Strand flag for the first read. Not a constructor argument; starts out as false and is meant to be set by the caller.
	bool firstRead_minusStrand;

	/*
	 * Stores copies of r1 (as reads.first) and r2 (as reads.second) and the given coordinate difference.
	 * firstRead_minusStrand is explicitly initialized to false so that it never holds an indeterminate value.
	 */
	oneReadPair(oneRead r1, oneRead r2, unsigned int difference_starting_coordinates) : reads(r1, r2), diff_starting_coordinates(difference_starting_coordinates), firstRead_minusStrand(false)
	{

	}

	/*
	 * Exchanges the first and the second read of the pair.
	 * The individual reads are NOT inverted; the per-read invert() calls were disabled by the original author and stay disabled.
	 */
	void invert()
	{
		std::swap(reads.first, reads.second);
		// reads.first.invert();
		// reads.second.invert();
	}
};



/*
 * Read simulator interface. Only declarations are visible here; all member functions
 * are defined in a separate translation unit, so the notes below describe the
 * signatures and hedge anything that depends on the implementation.
 */
class readSimulator {
private:
	// Error model tables. Presumably indexed by position in read, mapping a quality character to
	// its frequency / its empirical probability of being correct - confirm against the constructor.
	std::vector<std::map<char, double> > read_quality_frequencies;
	std::vector<std::map<char, double> > read_quality_correctness;
	std::vector<double> read_INDEL_freq;

	// Same three tables with a "_2nd" suffix; presumably the model for the second read of a pair - confirm.
	std::vector<std::map<char, double> > read_quality_frequencies_2nd;
	std::vector<std::map<char, double> > read_quality_correctness_2nd;
	std::vector<double> read_INDEL_freq_2nd;

	// Presumably the number of bases per simulated read (cf. constructor argument readLength) - confirm.
	unsigned int read_length;

	// Behaviour flags; none of them is assigned in this header, their effect is defined by the implementation file.
	bool paranoid;

	bool interpolateLength;

	bool quiet;

	// Static helper taking a quality-frequency table and a conditional error table.
	// NOTE(review): both tables are passed by value, so each call copies them; switching to const references
	// would require changing the out-of-line definition as well.
	static double averageErrorRate(std::vector<std::map<char, double> > q_freq, std::vector<std::map<char, double> > error_freq_conditional_q);

public:

	// Constructs a simulator from a quality matrix file. Defaults visible here: readLength = 100,
	// interpolateLength_ = false, removeUpperBaseQualityIndices = 1, additional_2ndRead_removeUpperBaseQualityIndices = 0.
	// The exact meaning of the two "removeUpperBaseQualityIndices" arguments is not visible in this header.
	readSimulator(std::string qualityMatrixFile, unsigned int readLength = 100, bool interpolateLength_ = false, char removeUpperBaseQualityIndices = 1, char additional_2ndRead_removeUpperBaseQualityIndices = 0);

	// Simulates read pairs from the string S and returns them. The mate distance is parameterized by
	// starting_coordinates_diff_mean / starting_coordinates_diff_sd; "perfectly" presumably disables sequencing errors - confirm.
	std::vector<oneReadPair> simulate_paired_reads_from_string(std::string S, double expected_haploid_coverage, double starting_coordinates_diff_mean, double starting_coordinates_diff_sd, bool perfectly, std::string readIDPrefix = "");
	// Variant that returns nothing and takes an output file name prefix and a thread count;
	// presumably writes the reads to files derived from fn_output_prefix as they are generated - confirm.
	void simulate_paired_reads_from_string_mt_immediateOutput(std::string S, std::string fn_output_prefix, double expected_haploid_coverage, double starting_coordinates_diff_mean, double starting_coordinates_diff_sd, bool perfectly, unsigned int threads, std::string readIDPrefix = "");

	// Simulates single (unpaired) reads from the string S; "perfectly" defaults to false.
	std::vector<oneRead> simulate_unpaired_reads_from_string(std::string S, double expected_haploid_coverage, bool perfectly = false);
	// Simulates read pairs from a path of graph edges instead of a plain string; includeDeletions defaults to false.
	std::vector<oneReadPair> simulate_paired_reads_from_edgePath(std::vector<Edge*> edgePath, double expected_haploid_coverage, double starting_coordinates_diff_mean, double starting_coordinates_diff_sd, bool perfectly, std::string readIDPrefix = "", bool includeDeletions = false);

	// Returns a pair of doubles; judging by the name, the average error rates of read 1 and read 2 - confirm.
	std::pair<double, double> averageErrorRate_R1_R2();

};

}
#endif /* READSIMULATOR_H_ */
