Thanks to visit codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
73268c0
compute the exact eff len based on the conditional distribution
pimentel Jun 30, 2015
d60eeec
wip. helper function to give 'mean frag len' for each trans
pimentel Jun 30, 2015
9714c38
calc eff lens based off of FL distribution instead of constant mean. …
pimentel Jun 30, 2015
8978e2f
lots of little refactoring to get eff len stuff to compile
pimentel Jun 30, 2015
15adeab
stupid bugfix inserts frag len means into vector
pimentel Jun 30, 2015
d59d4a5
use truncated normal for single-end reads
pimentel Jul 3, 2015
2214a69
updated bias to incorporate conditional mean
pimentel Jul 5, 2015
0a656c8
incorporate palls suggestions. technically works, but not benchmarked…
pimentel Jul 9, 2015
f88be4d
update_eff_lens has been refactored to remove const mean. remove some…
pimentel Jul 9, 2015
cfa8704
refactor mean_fl out of EM and Bootstrap
pimentel Jul 9, 2015
df3d568
general cleanup of comments and output
pimentel Jul 9, 2015
caa7233
Added ability to dump index to GFA file
pmelsted Jul 21, 2015
0783077
Fixes bug with paired ends and specified fragment length
pmelsted Aug 14, 2015
f74de71
Adds num_processed to hdf5 output, json for plaintext and converter u…
pmelsted Aug 17, 2015
99ebb25
Merge pull request #62 from pachterlab/eff_len
pimentel Aug 22, 2015
8a952c3
Merge pull request #69 from pachterlab/numprocessed
pimentel Aug 22, 2015
0dea25f
Fixes issues with repeated target names
pmelsted Aug 24, 2015
4c82812
Merge pull request #73 from pachterlab/unique-fasta-names
pimentel Aug 24, 2015
772aa74
fix off-by-one error in trunc gaussian
pimentel Aug 25, 2015
2f9c3cc
add option for standard deviation in fld. add some more msgs for fld
pimentel Aug 25, 2015
d875cf1
update to version 0.42.3
pimentel Aug 25, 2015
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 7 additions & 9 deletions src/Bootstrap.cpp
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
#include "Bootstrap.h"
// #include "weights.h"
// #include "EMAlgorithm.h"

EMAlgorithm Bootstrap::run_em() {
auto counts = mult_.sample();
EMAlgorithm em(counts, index_, tc_, mean_fl);
EMAlgorithm em(counts, index_, tc_, mean_fls_);

//em.set_start(em_start);
em.run(10000, 50, false, false);
Expand All @@ -20,9 +18,9 @@ BootstrapThreadPool::BootstrapThreadPool(
const KmerIndex& index,
const MinCollector& tc,
const std::vector<double>& eff_lens,
double mean,
const ProgramOptions& p_opts,
H5Writer& h5writer
H5Writer& h5writer,
const std::vector<double>& mean_fls
) :
n_threads_(n_threads),
seeds_(seeds),
Expand All @@ -31,9 +29,9 @@ BootstrapThreadPool::BootstrapThreadPool(
index_(index),
tc_(tc),
eff_lens_(eff_lens),
mean_fl_(mean),
opt_(p_opts),
writer_(h5writer)
writer_(h5writer),
mean_fls_(mean_fls)
{
for (size_t i = 0; i < n_threads_; ++i) {
threads_.push_back( std::thread(BootstrapWorker(*this, i)) );
Expand Down Expand Up @@ -71,8 +69,8 @@ void BootstrapWorker::operator() (){
pool_.index_,
pool_.tc_,
pool_.eff_lens_,
pool_.mean_fl_,
cur_seed);
cur_seed,
pool_.mean_fls_);

auto res = bs.run_em();

Expand Down
16 changes: 8 additions & 8 deletions src/Bootstrap.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,14 @@ class Bootstrap {
const KmerIndex& index,
const MinCollector& tc,
const std::vector<double>& eff_lens,
double mean,
size_t seed) :
size_t seed,
const std::vector<double>& mean_fls) :
index_(index),
tc_(tc),
eff_lens_(eff_lens),
mean_fl(mean),
seed_(seed),
mult_(true_counts, seed_)
mult_(true_counts, seed_),
mean_fls_(mean_fls)
{}

// EM Algorithm generates a sample from the Multinomial, then returns
Expand All @@ -41,9 +41,9 @@ class Bootstrap {
const KmerIndex& index_;
const MinCollector& tc_;
const std::vector<double>& eff_lens_;
double mean_fl;
size_t seed_;
Multinomial mult_;
const std::vector<double>& mean_fls_;
};

class BootstrapThreadPool {
Expand All @@ -57,9 +57,9 @@ class BootstrapThreadPool {
const KmerIndex& index,
const MinCollector& tc,
const std::vector<double>& eff_lens,
double mean,
const ProgramOptions& p_opts,
H5Writer& h5writer
H5Writer& h5writer,
const std::vector<double>& mean_fls
);

size_t num_threads() {return n_threads_;}
Expand All @@ -80,9 +80,9 @@ class BootstrapThreadPool {
const KmerIndex& index_;
const MinCollector& tc_;
const std::vector<double>& eff_lens_;
double mean_fl_;
const ProgramOptions& opt_;
H5Writer& writer_;
const std::vector<double>& mean_fls_;
};

class BootstrapWorker {
Expand Down
44 changes: 15 additions & 29 deletions src/EMAlgorithm.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,7 @@ struct EMAlgorithm {
EMAlgorithm(const std::vector<int>& counts,
const KmerIndex& index,
const MinCollector& tc,
double mean
/* const EcMap& ecmap,
const std::vector<int>& counts,
const std::vector<std::string>& target_names,

const WeightMap& wm*/) :
//idx_(idx),
const std::vector<double>& all_means) :
index_(index),
tc_(tc),
num_trans_(index.target_names_.size()),
Expand All @@ -42,23 +36,24 @@ struct EMAlgorithm {
alpha_(num_trans_, 1.0/num_trans_), // uniform distribution over targets
rho_(num_trans_, 0.0),
rho_set_(false),
mean_fl(mean)
all_fl_means(all_means)
{
eff_lens_ = calc_eff_lens(index_.target_lens_, mean_fl);
assert(all_fl_means.size() == index_.target_lens_.size());
eff_lens_ = calc_eff_lens(index_.target_lens_, all_fl_means);
weight_map_ = calc_weights (tc_.counts, ecmap_, eff_lens_);
for (auto i = 0; i < alpha_.size(); i++) {
for (size_t i = 0; i < alpha_.size(); i++) {
if (counts_[i] > 0) {
alpha_[i] = counts_[i];
} else {
alpha_[i] = eff_lens_[i] / 1000.0;
}

}
assert(target_names_.size() == eff_lens_.size());
}

~EMAlgorithm() {}

void run(size_t n_iter = 10000, size_t min_rounds=50, bool verbose = true, bool recomputeEffLen = true) {
std::vector<double> next_alpha(alpha_.size(), 0.0);

Expand All @@ -76,34 +71,25 @@ struct EMAlgorithm {

int i;
for (i = 0; i < n_iter; ++i) {
/*if (i % 50 == 0) {
std::cerr << ".";
std::cerr.flush();
if (i % 500 == 0 && i > 0) {
std::cerr << std::endl;
}
}*/


if (recomputeEffLen && (i == min_rounds || i == min_rounds + 500)) {
eff_lens_ = update_eff_lens(mean_fl, tc_, index_, alpha_, eff_lens_);
eff_lens_ = update_eff_lens(all_fl_means, tc_, index_, alpha_, eff_lens_);
weight_map_ = calc_weights (tc_.counts, ecmap_, eff_lens_);
}


//for (auto& ec_kv : ecmap_ ) {
for (int ec = 0; ec < num_trans_; ec++) {
next_alpha[ec] = counts_[ec];
}


for (int ec = num_trans_; ec < ecmap_.size(); ec++) {
denom = 0.0;

if (counts_[ec] == 0) {
continue;
}

// first, compute the denominator: a normalizer
// iterate over targets in EC map
auto& wv = weight_map_[ec];
Expand Down Expand Up @@ -143,7 +129,7 @@ struct EMAlgorithm {
if (next_alpha[ec] > alpha_change_limit && (std::fabs(next_alpha[ec] - alpha_[ec]) / next_alpha[ec]) > alpha_change) {
chcount++;
}

//if (stopEM && next_alpha[ec] >= alpha_limit) {

/* double reldiff = abs(next_alpha[ec]-alpha_[ec]) / next_alpha[ec];
Expand Down Expand Up @@ -289,7 +275,7 @@ struct EMAlgorithm {
}

std::cout << sum_big << " " << count_big << " " << n << std::endl;

std::copy(em_start.alpha_before_zeroes_.begin(), em_start.alpha_before_zeroes_.end(),
alpha_.begin());
}
Expand All @@ -301,13 +287,13 @@ struct EMAlgorithm {
const EcMap& ecmap_;
const std::vector<int>& counts_;
const std::vector<std::string>& target_names_;
const std::vector<double>& all_fl_means;
std::vector<double> eff_lens_;
WeightMap weight_map_;
std::vector<double> alpha_;
std::vector<double> alpha_before_zeroes_;
std::vector<double> rho_;
bool rho_set_;
double mean_fl;
};


Expand Down
13 changes: 12 additions & 1 deletion src/H5Writer.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#include "H5Writer.h"

void H5Writer::init(const std::string& fname, int num_bootstrap, uint compression,
void H5Writer::init(const std::string& fname, int num_bootstrap, int num_processed, uint compression,
size_t index_version, const std::string& shell_call,
const std::string& start_time)
{
Expand All @@ -16,6 +16,10 @@ void H5Writer::init(const std::string& fname, int num_bootstrap, uint compressio
std::vector<int> n_bs {num_bootstrap};
vector_to_h5(n_bs, aux_, "num_bootstrap", false, compression_);

std::vector<int> n_proc {num_processed};
vector_to_h5(n_proc, aux_, "num_processed", false, compression_);


// info about run
std::vector<std::string> kal_version{ KALLISTO_VERSION };
vector_to_h5(kal_version, aux_, "kallisto_version", true, compression_);
Expand Down Expand Up @@ -85,6 +89,12 @@ H5Converter::H5Converter(const std::string& h5_fname, const std::string& out_dir
read_dataset(aux_, "num_bootstrap", n_bs_vec);
n_bs_ = n_bs_vec[0];

// read number of processed reads
std::vector<int> n_proc_vec;
read_dataset(aux_, "num_processed", n_proc_vec);
n_proc_ = n_proc_vec[0];


std::cerr << "[h5dump] number of bootstraps: " << n_bs_ << std::endl;
// </aux info>
if (n_bs_ > 0) {
Expand Down Expand Up @@ -135,6 +145,7 @@ void H5Converter::write_aux() {
out_name,
std::string(std::to_string(n_targs_)),
std::string(std::to_string(n_bs_)),
std::string(std::to_string(n_proc_)),
kal_version_,
std::string(std::to_string(idx_version_)),
start_time_,
Expand Down
3 changes: 2 additions & 1 deletion src/H5Writer.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ class H5Writer {
H5Writer() : primed_(false) {}
~H5Writer();

void init(const std::string& fname, int num_bootstrap, uint compression,
void init(const std::string& fname, int num_bootstrap, int num_processed, uint compression,
size_t index_version, const std::string& shell_call,
const std::string& start_time);

Expand Down Expand Up @@ -70,6 +70,7 @@ class H5Converter {
hid_t bs_;

int n_bs_;
int n_proc_;
size_t n_targs_;
};

Expand Down
66 changes: 65 additions & 1 deletion src/Inspect.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

using namespace std;


void printVector(const vector<int>& v) {
cout << "[";
int i = 0;
Expand Down Expand Up @@ -40,7 +41,11 @@ void printHisto(const unordered_map<int,int>& m, const string& header) {
}
}

void InspectIndex(const KmerIndex& index) {
void InspectIndex(const KmerIndex& index, const std::string& gfa) {

static const char *dna = "ACGT";
auto Dna = [](int i) {return dna[i & 0x03];};

int k = index.k;
cout << "#[inspect] Index version number = " << index.INDEX_VERSION << endl;
cout << "#[inspect] k = " << index.k << endl;;
Expand Down Expand Up @@ -192,6 +197,65 @@ void InspectIndex(const KmerIndex& index) {
printHisto(kmhisto, "#EC.size\tNum.kmers");


if (!gfa.empty()) {
std::ofstream out;
out.open(gfa);
out << "H\tVN:Z:1.0\n";
int i = 0;
for (auto& c : index.dbGraph.contigs) {
out << "S\t" << i << "\t" << c.seq << "\tXT:S:";
for (int j = 0; j < c.transcripts.size(); j++) {
auto &ct = c.transcripts[j];
if (j > 0) {
out << ",";
}
out << index.target_names_[ct.trid];
}
out << "\n";
i++;
}

const auto& kmap = index.kmap;
i = 0;
for (auto& c : index.dbGraph.contigs) {
auto& seq = c.seq;

Kmer last(seq.c_str() + seq.size()-k);
for (int j = 0; j < 4; j++) {
Kmer after = last.forwardBase(Dna(j));
auto search = kmap.find(after.rep());
if (search != kmap.end()) {
KmerEntry val = search->second;
// check if + or -
bool strand = val.isFw() == (after == after.rep());
out << "L\t" << i << "\t+\t" << val.contig
<< "\t" << (strand ? '+' : '-') << "\t" << (k-1) << "M\n";
}
}

// enumerate bw links
Kmer first(seq.c_str());
for (int j = 0; j < 4; j++) {
Kmer before = first.backwardBase(Dna(j));
auto search = kmap.find(before.rep());
if (search != kmap.end()) {
KmerEntry val = search->second;
// check if + or -
bool strand = val.isFw() == (before == before.rep());
out << "L\t" << i << "\t-\t"
<< val.contig << "\t" << (strand ? '-' : '+')
<< "\t" << (k-1) << "M\n";
}
}

i++;
}

out.flush();

out.close();
}


}

Expand Down
Loading