| ... | ... |
@@ -1,322 +1,323 @@ |
| 1 |
-#include <Rcpp.h> |
|
| 2 |
-using namespace Rcpp; |
|
| 3 |
- |
|
| 4 |
-#include <cstdlib> |
|
| 5 |
-#include <sstream> |
|
| 6 |
-#include <iostream> |
|
| 7 |
-#include <fstream> |
|
| 8 |
-#include <map> |
|
| 9 |
-#include <vector> |
|
| 10 |
-#include <string> |
|
| 11 |
-#include <cmath> |
|
| 12 |
-#include <cstdlib> |
|
| 13 |
-#include <boost/algorithm/string.hpp> |
|
| 14 |
-#include <boost/assign.hpp> |
|
| 15 |
-#include "data_types.h" |
|
| 16 |
- |
|
| 17 |
-using namespace std; |
|
| 18 |
-using namespace boost; |
|
| 19 |
- |
|
| 20 |
-// |
|
| 21 |
-// bins (or features) annotation format |
|
| 22 |
-// |
|
| 23 |
-//1. file of bins (features) annotation: "biomarkers.all_bins" |
|
| 24 |
-//Each line is a bin (or feature). All columns are delimited by TAB. There is one header line. |
|
| 25 |
-//Column 1: marker_index, 1-based index. Those bins that are not markers, are indexed as 0. |
|
| 26 |
-//Column 2: chr |
|
| 27 |
-//Column 3: start coordinate of read (1-base) |
|
| 28 |
-//Column 4: end coordinate of read (1-base). The range of the bin is [start, end) |
|
| 29 |
-//Column 5: marker_type. "I" is Marker_type_I, "II" is Marker_type_II, "-" is the complementary bin, only facilitating the searching. |
|
| 30 |
-// |
|
| 31 |
-// The following is an example file |
|
| 32 |
-// |
|
| 33 |
-//marker_index chr start end marker_type |
|
| 34 |
-//0 chr1 1 855266 - |
|
| 35 |
-//1 chr1 855266 855766 II |
|
| 36 |
-//0 chr1 855766 969796 - |
|
| 37 |
-//2 chr1 969796 970296 II |
|
| 38 |
-//0 chr1 970296 1099044 - |
|
| 39 |
-//3 chr1 1099044 1099544 II |
|
| 40 |
-//0 chr1 1099544 1109315 - |
|
| 41 |
-//4 chr1 1109315 1109815 II |
|
| 42 |
-// |
|
| 43 |
-void read_bins_annot_file(string input_bins_annot_file, Bins_end_coord & bins_end_coord, |
|
| 44 |
- Bins_index & bins_index, Bins_info & bins_info, bool has_header_line=true) |
|
| 45 |
-{
|
|
| 46 |
- ifstream fin; |
|
| 47 |
- fin.open(input_bins_annot_file.c_str()); |
|
| 48 |
- if (fin.fail()){
|
|
| 49 |
- Rcpp::Rcerr << "Error: Unable to open " << input_bins_annot_file << " in read_bins_annot_file()" << endl; |
|
| 50 |
- // exit(EXIT_FAILURE); |
|
| 51 |
- } |
|
| 52 |
- string line; |
|
| 53 |
- if (has_header_line) |
|
| 54 |
- // skip the first header line |
|
| 55 |
- getline(fin, line); |
|
| 56 |
- unsigned long i=0; |
|
| 57 |
- string old_chr; |
|
| 58 |
- while (!fin.eof()) {
|
|
| 59 |
- getline(fin, line); |
|
| 60 |
- if (line.empty()) {
|
|
| 61 |
- // this is the last line of the file |
|
| 62 |
- break; |
|
| 63 |
- } |
|
| 64 |
- //cout << line << endl; |
|
| 65 |
- |
|
| 66 |
- vector<string> strs1; |
|
| 67 |
- split(strs1, line, is_any_of("\t"));
|
|
| 68 |
- int bin_index = atoi(strs1[0].c_str()); |
|
| 69 |
- string chr = strs1[1]; |
|
| 70 |
- //int start_coord = atoi(strs1[2].c_str()); // start coordinate |
|
| 71 |
- int end_coord = atoi(strs1[3].c_str()); // end coordinate |
|
| 72 |
- if (i==0) {
|
|
| 73 |
- // this is the first bin of all the genome, so we initialize old_chr |
|
| 74 |
- old_chr = chr; |
|
| 75 |
- } |
|
| 76 |
- if (chr.compare(old_chr)!=0) {
|
|
| 77 |
- // This is the 1st bin of the new chromosome |
|
| 78 |
- old_chr = chr; |
|
| 79 |
- bins_end_coord.insert(make_pair(chr, vector<unsigned int>())); |
|
| 80 |
- bins_index.insert(make_pair(chr, vector<int >())); |
|
| 81 |
- bins_info.insert(make_pair(chr, vector<string >())); |
|
| 82 |
- } |
|
| 83 |
- vector<unsigned int> & coords = bins_end_coord[chr]; |
|
| 84 |
- coords.push_back( end_coord ); |
|
| 85 |
- vector<int> & indexes = bins_index[chr]; |
|
| 86 |
- indexes.push_back( bin_index ); |
|
| 87 |
- vector<string> & infos = bins_info[chr]; |
|
| 88 |
- infos.push_back( line ); |
|
| 89 |
- i++; |
|
| 90 |
- } |
|
| 91 |
- Rcpp::Rcerr << "#bins=" << i << endl; |
|
| 92 |
-} |
|
| 93 |
- |
|
| 94 |
-int get_num_of_non_void_bins(Bins_index & bins_index, vector<int> & returned_markers_index) |
|
| 95 |
-{
|
|
| 96 |
- int num_of_non_void_bins = 0; |
|
| 97 |
- Bins_index::iterator it; |
|
| 98 |
- for (it=bins_index.begin(); it!=bins_index.end(); ++it) {
|
|
| 99 |
- vector<int> & bins_of_chr = it->second; |
|
| 100 |
- for (int i=0; i<bins_of_chr.size(); i++) |
|
| 101 |
- if (bins_of_chr[i] > 0) {
|
|
| 102 |
- num_of_non_void_bins++; |
|
| 103 |
- returned_markers_index.push_back( bins_of_chr[i] ); |
|
| 104 |
- } |
|
| 105 |
- } |
|
| 106 |
- return num_of_non_void_bins; |
|
| 107 |
-} |
|
| 108 |
- |
|
| 109 |
-// We find out the index of the bin with the range [bin_start_coord, bin_end_coord), where both "bin_start_coord" and "bin_end_coord" are 0-base. |
|
| 110 |
-// Returned bin_internal_index is 0-base. If not found, return -1 |
|
| 111 |
-int find_exact_bin(Bins_end_coord & bins_end_coord, string chr, unsigned int bin_start_coord, unsigned int bin_end_coord) {
|
|
| 112 |
- int bin_internal_index=-1; |
|
| 113 |
- vector<unsigned int> & coords_bins_of_chr = bins_end_coord[chr]; // a vector of end coordinates (1-base) of this chr. |
|
| 114 |
- vector<unsigned int>::iterator bin_it = find( coords_bins_of_chr.begin(), coords_bins_of_chr.end(), bin_end_coord); |
|
| 115 |
- if (bin_it!=coords_bins_of_chr.end()) {
|
|
| 116 |
- // found |
|
| 117 |
- bin_internal_index = bin_it-coords_bins_of_chr.begin(); |
|
| 118 |
- } |
|
| 119 |
- return bin_internal_index; |
|
| 120 |
-} |
|
| 121 |
- |
|
| 122 |
-// We find out the index of the bin with the range [bin_start_coord, bin_end_coord), where the input paramter "position" (1-base) falls into this bin. |
|
| 123 |
-// Returned bin_internal_index is 0-base. If not found, return -1 |
|
| 124 |
-int find_bin_of_position(Bins_end_coord & bins_end_coord, string chr, unsigned int position) {
|
|
| 125 |
- int bin_internal_index=-1; |
|
| 126 |
- if ( bins_end_coord.find(chr) == bins_end_coord.end() ) {
|
|
| 127 |
- // chr name is not found in binning system |
|
| 128 |
- bin_internal_index = -1; |
|
| 129 |
- } else {
|
|
| 130 |
- vector<unsigned int> & coords_bins_of_chr = bins_end_coord[chr]; // a vector of end coordinates (1-base) of this chr. |
|
| 131 |
- vector<unsigned int>::iterator bin_it = lower_bound( coords_bins_of_chr.begin(), coords_bins_of_chr.end(), position); |
|
| 132 |
- if (position==*bin_it) bin_it++; |
|
| 133 |
- bin_internal_index = bin_it - coords_bins_of_chr.begin(); // bin_internal_index is 0-base |
|
| 134 |
- if (bin_internal_index==coords_bins_of_chr.size()) {
|
|
| 135 |
- //cerr << "position(1-base): " << position << " doesn't exist in " << chr << endl; |
|
| 136 |
- bin_internal_index=-1; |
|
| 137 |
- } |
|
| 138 |
- } |
|
| 139 |
- return bin_internal_index; |
|
| 140 |
-} |
|
| 141 |
- |
|
| 142 |
-// Each bin is in the range [ bins_end_coord[i-1], bins_end_coord[i] ) |
|
| 143 |
-// Given a query region, we want to know which bin has overlap with this query region. If there is overlap, |
|
| 144 |
-// return (1) bin index, and (2) the overlap length |
|
| 145 |
-// ongoing devevloping |
|
| 146 |
-int find_overlap_bin(Bins_end_coord & bins_end_coord, string query_region_chr, unsigned int query_region_start_coord, |
|
| 147 |
- unsigned int query_region_end_coord, int & overlap_length) |
|
| 148 |
-{
|
|
| 149 |
- int bin_internal_index = -1; |
|
| 150 |
- overlap_length = -1; |
|
| 151 |
- if ( bins_end_coord.find(query_region_chr) != bins_end_coord.end() ) {
|
|
| 152 |
- // chr name is found in binning system |
|
| 153 |
- bin_internal_index = find_bin_of_position(bins_end_coord, query_region_chr, query_region_start_coord); |
|
| 154 |
- if (bin_internal_index != -1) {
|
|
| 155 |
- unsigned int bin_end_coord = bins_end_coord[query_region_chr][bin_internal_index]; |
|
| 156 |
- if ( query_region_end_coord > bin_end_coord ) |
|
| 157 |
- overlap_length = bin_end_coord - query_region_start_coord + 1; |
|
| 158 |
- else |
|
| 159 |
- overlap_length = query_region_end_coord - query_region_start_coord + 1; |
|
| 160 |
- } |
|
| 161 |
- } |
|
| 162 |
- return bin_internal_index; |
|
| 163 |
-} |
|
| 164 |
- |
|
| 165 |
-void print_uint_vec( ostream& os, vector<unsigned int>& v, int len ) |
|
| 166 |
-{
|
|
| 167 |
- int i; |
|
| 168 |
- if (len>v.size() || len==0) len=v.size(); |
|
| 169 |
- if (len==0 || v.size()==0) {
|
|
| 170 |
- os << "[" << "]"; |
|
| 171 |
- } else {
|
|
| 172 |
- os << "["; |
|
| 173 |
- for (i=0; i<len-1; i++) |
|
| 174 |
- os << v[i] << ","; |
|
| 175 |
- os << v[i] << "]"; |
|
| 176 |
- } |
|
| 177 |
-} |
|
| 178 |
- |
|
| 179 |
-void print_bins( ostream& os, Bins_end_coord & bins_end_coord, Bins_index & bins_index, Bins_info & bins_info) {
|
|
| 180 |
- Bins_end_coord::iterator it; |
|
| 181 |
- for (it=bins_end_coord.begin(); it!=bins_end_coord.end(); ++it) {
|
|
| 182 |
- string chr = it->first; |
|
| 183 |
- vector<unsigned int> coords=it->second; |
|
| 184 |
- vector<int> indexes=bins_index[chr]; |
|
| 185 |
- vector<string> infos=bins_info[chr]; |
|
| 186 |
- int n_bins = coords.size(); |
|
| 187 |
- for (int i=0; i<n_bins; i++) {
|
|
| 188 |
- os << indexes[i] << "\t" << chr << "\t" << coords[i] << "\t'" << infos[i] << "'" << endl; |
|
| 189 |
- } |
|
| 190 |
- } |
|
| 191 |
-} |
|
| 192 |
- |
|
| 193 |
-/* |
|
| 194 |
-void print_bins_fullinfo( Bins_FullInfo & bins_fullinfo ) {
|
|
| 195 |
- Bins_FullInfo::iterator: it; |
|
| 196 |
- for (it=bins.begin(); it!=bins.end(); ++it) {
|
|
| 197 |
- cout << it.first << "\t"; |
|
| 198 |
- print_int_vec(cout, it.second, it.second.size()); |
|
| 199 |
- cout << endl; |
|
| 200 |
- } |
|
| 201 |
-} |
|
| 202 |
-*/ |
|
| 203 |
- |
|
| 204 |
-// Bins2Values: a map of bin_index -> a vector of values. bin_index is always 1-base |
|
| 205 |
-void create_Bins2Values(int num_bins, int num_of_values, double init_value, Bins2Values & bins2values) |
|
| 206 |
-{
|
|
| 207 |
- for (int bin_index=1; bin_index<=num_bins; bin_index++) {
|
|
| 208 |
- vector<double> values; |
|
| 209 |
- for (int i=0; i<num_of_values; i++) |
|
| 210 |
- values.push_back( init_value ); |
|
| 211 |
- bins2values[bin_index] = values; |
|
| 212 |
- } |
|
| 213 |
-} |
|
| 214 |
- |
|
| 215 |
-// Bins2Values: a map of marker_index -> a vector of values. |
|
| 216 |
-void create_Bins2Values(vector<int> markers_index, int num_of_values, double init_value, Bins2Values & bins2values) |
|
| 217 |
-{
|
|
| 218 |
- int num_bins = markers_index.size(); |
|
| 219 |
- for (int ibin=0; ibin<num_bins; ibin++) {
|
|
| 220 |
- vector<double> values; |
|
| 221 |
- for (int i=0; i<num_of_values; i++) |
|
| 222 |
- values.push_back( init_value ); |
|
| 223 |
- bins2values[markers_index[ibin]] = values; |
|
| 224 |
- } |
|
| 225 |
-} |
|
| 226 |
- |
|
| 227 |
-void print_Bins2Values(Bins2Values & bins2values) |
|
| 228 |
-{
|
|
| 229 |
- // cout.precision(15); |
|
| 230 |
- Bins2Values::iterator it; |
|
| 231 |
- for (it=bins2values.begin(); it!=bins2values.end(); ++it) {
|
|
| 232 |
- vector<double> & values = it->second; |
|
| 233 |
- Rcpp::Rcout << it->first; |
|
| 234 |
- for (int i=0; i<values.size(); i++) |
|
| 235 |
- Rcpp::Rcout << "\t" << values[i]; |
|
| 236 |
- Rcpp::Rcout << endl; |
|
| 237 |
- } |
|
| 238 |
-} |
|
| 239 |
- |
|
| 240 |
-void print_Bins2UnsignedIntegers(Bins2UnsignedIntegers& bins2values) |
|
| 241 |
-{
|
|
| 242 |
- Bins2UnsignedIntegers::iterator it; |
|
| 243 |
- for (it=bins2values.begin(); it!=bins2values.end(); ++it) {
|
|
| 244 |
- vector<unsigned int> & values = it->second; |
|
| 245 |
- Rcpp::Rcout << it->first; |
|
| 246 |
- for (int i=0; i<(int)values.size(); i++) |
|
| 247 |
- Rcpp::Rcout << "\t" << values[i]; |
|
| 248 |
- Rcpp::Rcout << endl; |
|
| 249 |
- } |
|
| 250 |
-} |
|
| 251 |
- |
|
| 252 |
-// when optional_write==TRUE, we assume there are two values associated with each bin |
|
| 253 |
-void write_Bins2Values(Bins2Values & bins2values, vector<string> & columns_names, |
|
| 254 |
- string output_file, bool optional_write) |
|
| 255 |
-{
|
|
| 256 |
- ofstream out; |
|
| 257 |
- out.open(output_file.c_str()); |
|
| 258 |
- if (out.fail()){
|
|
| 259 |
- Rcpp::Rcerr << "Error: Unable to write " << output_file << " in write_Bins2Value()" << endl; |
|
| 260 |
- // exit(EXIT_FAILURE); |
|
| 261 |
- } |
|
| 262 |
- int i; |
|
| 263 |
- for (i=0; i<columns_names.size()-1; i++) |
|
| 264 |
- out << columns_names[i] << "\t"; |
|
| 265 |
- out << columns_names[i] << endl; |
|
| 266 |
- |
|
| 267 |
- out.precision(15); |
|
| 268 |
- Bins2Values::iterator it; |
|
| 269 |
- for (it=bins2values.begin(); it!=bins2values.end(); ++it) {
|
|
| 270 |
- vector<double> & values = it->second; |
|
| 271 |
- out << it->first; |
|
| 272 |
- if (optional_write) {
|
|
| 273 |
- // assume there are at least two values associated with each bin |
|
| 274 |
- // for example, when we have three associated values, they can be |
|
| 275 |
- // (1) methylation_count |
|
| 276 |
- // (2) unmethylation_count |
|
| 277 |
- // (3) number of reads |
|
| 278 |
- double n = values[0] + values[1]; |
|
| 279 |
- double v; |
|
| 280 |
- if (n==0) v=0; |
|
| 281 |
- else v=values[0]/n; |
|
| 282 |
- out << "\t" << v << "\t" << n; |
|
| 283 |
- } |
|
| 284 |
- for (i=0; i<values.size(); i++) |
|
| 285 |
- out << "\t" << values[i]; |
|
| 286 |
- out << endl; |
|
| 287 |
- } |
|
| 288 |
- out.close(); |
|
| 289 |
-} |
|
| 290 |
- |
|
| 291 |
-// Bins2Value: a map of bin_index -> a value. bin_index is always 1-base |
|
| 292 |
-void create_Bins2Value(int num_bins, double init_value, Bins2Value & bins2value) |
|
| 293 |
-{
|
|
| 294 |
- for (int bin_index=1; bin_index<=num_bins; bin_index++) |
|
| 295 |
- bins2value[bin_index] = init_value; |
|
| 296 |
-} |
|
| 297 |
- |
|
| 298 |
-ostream& operator<<(ostream& out, Bins2Value& bins2value) {
|
|
| 299 |
- out << "bin_index" << "\t" << "value" << endl; |
|
| 300 |
- out.precision(15); |
|
| 301 |
- Bins2Value::iterator it; |
|
| 302 |
- for (it=bins2value.begin(); it!=bins2value.end(); ++it) |
|
| 303 |
- out << it->first << "\t" << it->second << endl; |
|
| 304 |
- return(out); |
|
| 305 |
-} |
|
| 306 |
- |
|
| 307 |
-void write_Bins2Value(Bins2Value & bins2value, string output_file) |
|
| 308 |
-{
|
|
| 309 |
- ofstream out; |
|
| 310 |
- out.open(output_file.c_str()); |
|
| 311 |
- if (out.fail()){
|
|
| 312 |
- Rcpp::Rcerr << "Error: Unable to write " << output_file << " in write_Bins2Value()" << endl; |
|
| 313 |
- // exit(EXIT_FAILURE); |
|
| 314 |
- } |
|
| 315 |
- out << "bin_index" << "\t" << "value" << endl; |
|
| 316 |
- out.precision(15); |
|
| 317 |
- Bins2Value::iterator it; |
|
| 318 |
- for (it=bins2value.begin(); it!=bins2value.end(); ++it) |
|
| 319 |
- out << it->first << "\t" << it->second << endl; |
|
| 320 |
- out.close(); |
|
| 321 |
-} |
|
| 322 |
- |
|
| 1 |
+// [[Rcpp::depends(BH)]] |
|
| 2 |
+#include <Rcpp.h> |
|
| 3 |
+using namespace Rcpp; |
|
| 4 |
+ |
|
| 5 |
+#include <cstdlib> |
|
| 6 |
+#include <sstream> |
|
| 7 |
+#include <iostream> |
|
| 8 |
+#include <fstream> |
|
| 9 |
+#include <map> |
|
| 10 |
+#include <vector> |
|
| 11 |
+#include <string> |
|
| 12 |
+#include <cmath> |
|
| 13 |
+#include <cstdlib> |
|
| 14 |
+#include <boost/algorithm/string.hpp> |
|
| 15 |
+#include <boost/assign.hpp> |
|
| 16 |
+#include "data_types.h" |
|
| 17 |
+ |
|
| 18 |
+using namespace std; |
|
| 19 |
+using namespace boost; |
|
| 20 |
+ |
|
| 21 |
+// |
|
| 22 |
+// bins (or features) annotation format |
|
| 23 |
+// |
|
| 24 |
+//1. file of bins (features) annotation: "biomarkers.all_bins" |
|
| 25 |
+//Each line is a bin (or feature). All columns are delimited by TAB. There is one header line. |
|
| 26 |
+//Column 1: marker_index, 1-based index. Those bins that are not markers, are indexed as 0. |
|
| 27 |
+//Column 2: chr |
|
| 28 |
+//Column 3: start coordinate of read (1-base) |
|
| 29 |
+//Column 4: end coordinate of read (1-base). The range of the bin is [start, end) |
|
| 30 |
+//Column 5: marker_type. "I" is Marker_type_I, "II" is Marker_type_II, "-" is the complementary bin, only facilitating the searching. |
|
| 31 |
+// |
|
| 32 |
+// The following is an example file |
|
| 33 |
+// |
|
| 34 |
+//marker_index chr start end marker_type |
|
| 35 |
+//0 chr1 1 855266 - |
|
| 36 |
+//1 chr1 855266 855766 II |
|
| 37 |
+//0 chr1 855766 969796 - |
|
| 38 |
+//2 chr1 969796 970296 II |
|
| 39 |
+//0 chr1 970296 1099044 - |
|
| 40 |
+//3 chr1 1099044 1099544 II |
|
| 41 |
+//0 chr1 1099544 1109315 - |
|
| 42 |
+//4 chr1 1109315 1109815 II |
|
| 43 |
+// |
|
| 44 |
+void read_bins_annot_file(string input_bins_annot_file, Bins_end_coord & bins_end_coord, |
|
| 45 |
+ Bins_index & bins_index, Bins_info & bins_info, bool has_header_line=true) |
|
| 46 |
+{
|
|
| 47 |
+ ifstream fin; |
|
| 48 |
+ fin.open(input_bins_annot_file.c_str()); |
|
| 49 |
+ if (fin.fail()){
|
|
| 50 |
+ Rcpp::Rcerr << "Error: Unable to open " << input_bins_annot_file << " in read_bins_annot_file()" << endl; |
|
| 51 |
+ // exit(EXIT_FAILURE); |
|
| 52 |
+ } |
|
| 53 |
+ string line; |
|
| 54 |
+ if (has_header_line) |
|
| 55 |
+ // skip the first header line |
|
| 56 |
+ getline(fin, line); |
|
| 57 |
+ unsigned long i=0; |
|
| 58 |
+ string old_chr; |
|
| 59 |
+ while (!fin.eof()) {
|
|
| 60 |
+ getline(fin, line); |
|
| 61 |
+ if (line.empty()) {
|
|
| 62 |
+ // this is the last line of the file |
|
| 63 |
+ break; |
|
| 64 |
+ } |
|
| 65 |
+ //cout << line << endl; |
|
| 66 |
+ |
|
| 67 |
+ vector<string> strs1; |
|
| 68 |
+ split(strs1, line, is_any_of("\t"));
|
|
| 69 |
+ int bin_index = atoi(strs1[0].c_str()); |
|
| 70 |
+ string chr = strs1[1]; |
|
| 71 |
+ //int start_coord = atoi(strs1[2].c_str()); // start coordinate |
|
| 72 |
+ int end_coord = atoi(strs1[3].c_str()); // end coordinate |
|
| 73 |
+ if (i==0) {
|
|
| 74 |
+ // this is the first bin of all the genome, so we initialize old_chr |
|
| 75 |
+ old_chr = chr; |
|
| 76 |
+ } |
|
| 77 |
+ if (chr.compare(old_chr)!=0) {
|
|
| 78 |
+ // This is the 1st bin of the new chromosome |
|
| 79 |
+ old_chr = chr; |
|
| 80 |
+ bins_end_coord.insert(make_pair(chr, vector<unsigned int>())); |
|
| 81 |
+ bins_index.insert(make_pair(chr, vector<int >())); |
|
| 82 |
+ bins_info.insert(make_pair(chr, vector<string >())); |
|
| 83 |
+ } |
|
| 84 |
+ vector<unsigned int> & coords = bins_end_coord[chr]; |
|
| 85 |
+ coords.push_back( end_coord ); |
|
| 86 |
+ vector<int> & indexes = bins_index[chr]; |
|
| 87 |
+ indexes.push_back( bin_index ); |
|
| 88 |
+ vector<string> & infos = bins_info[chr]; |
|
| 89 |
+ infos.push_back( line ); |
|
| 90 |
+ i++; |
|
| 91 |
+ } |
|
| 92 |
+ Rcpp::Rcerr << "#bins=" << i << endl; |
|
| 93 |
+} |
|
| 94 |
+ |
|
| 95 |
+int get_num_of_non_void_bins(Bins_index & bins_index, vector<int> & returned_markers_index) |
|
| 96 |
+{
|
|
| 97 |
+ int num_of_non_void_bins = 0; |
|
| 98 |
+ Bins_index::iterator it; |
|
| 99 |
+ for (it=bins_index.begin(); it!=bins_index.end(); ++it) {
|
|
| 100 |
+ vector<int> & bins_of_chr = it->second; |
|
| 101 |
+ for (int i=0; i<bins_of_chr.size(); i++) |
|
| 102 |
+ if (bins_of_chr[i] > 0) {
|
|
| 103 |
+ num_of_non_void_bins++; |
|
| 104 |
+ returned_markers_index.push_back( bins_of_chr[i] ); |
|
| 105 |
+ } |
|
| 106 |
+ } |
|
| 107 |
+ return num_of_non_void_bins; |
|
| 108 |
+} |
|
| 109 |
+ |
|
| 110 |
+// We find out the index of the bin with the range [bin_start_coord, bin_end_coord), where both "bin_start_coord" and "bin_end_coord" are 0-base. |
|
| 111 |
+// Returned bin_internal_index is 0-base. If not found, return -1 |
|
| 112 |
+int find_exact_bin(Bins_end_coord & bins_end_coord, string chr, unsigned int bin_start_coord, unsigned int bin_end_coord) {
|
|
| 113 |
+ int bin_internal_index=-1; |
|
| 114 |
+ vector<unsigned int> & coords_bins_of_chr = bins_end_coord[chr]; // a vector of end coordinates (1-base) of this chr. |
|
| 115 |
+ vector<unsigned int>::iterator bin_it = find( coords_bins_of_chr.begin(), coords_bins_of_chr.end(), bin_end_coord); |
|
| 116 |
+ if (bin_it!=coords_bins_of_chr.end()) {
|
|
| 117 |
+ // found |
|
| 118 |
+ bin_internal_index = bin_it-coords_bins_of_chr.begin(); |
|
| 119 |
+ } |
|
| 120 |
+ return bin_internal_index; |
|
| 121 |
+} |
|
| 122 |
+ |
|
| 123 |
+// Find the index of the bin [bin_start_coord, bin_end_coord) that contains the input parameter "position" (1-base). |
|
| 124 |
+// Returned bin_internal_index is 0-base. If not found, return -1 |
|
| 125 |
+int find_bin_of_position(Bins_end_coord & bins_end_coord, string chr, unsigned int position) {
|
|
| 126 |
+ int bin_internal_index=-1; |
|
| 127 |
+ if ( bins_end_coord.find(chr) == bins_end_coord.end() ) {
|
|
| 128 |
+ // chr name is not found in binning system |
|
| 129 |
+ bin_internal_index = -1; |
|
| 130 |
+ } else {
|
|
| 131 |
+ vector<unsigned int> & coords_bins_of_chr = bins_end_coord[chr]; // a vector of end coordinates (1-base) of this chr. |
|
| 132 |
+ vector<unsigned int>::iterator bin_it = lower_bound( coords_bins_of_chr.begin(), coords_bins_of_chr.end(), position); |
|
| 133 |
+ if (position==*bin_it) bin_it++; |
|
| 134 |
+ bin_internal_index = bin_it - coords_bins_of_chr.begin(); // bin_internal_index is 0-base |
|
| 135 |
+ if (bin_internal_index==coords_bins_of_chr.size()) {
|
|
| 136 |
+ //cerr << "position(1-base): " << position << " doesn't exist in " << chr << endl; |
|
| 137 |
+ bin_internal_index=-1; |
|
| 138 |
+ } |
|
| 139 |
+ } |
|
| 140 |
+ return bin_internal_index; |
|
| 141 |
+} |
|
| 142 |
+ |
|
| 143 |
+// Each bin is in the range [ bins_end_coord[i-1], bins_end_coord[i] ) |
|
| 144 |
+// Given a query region, we want to know which bin has overlap with this query region. If there is overlap, |
|
| 145 |
+// return (1) bin index, and (2) the overlap length |
|
| 146 |
+// NOTE: this function is still under development. |
|
| 147 |
+int find_overlap_bin(Bins_end_coord & bins_end_coord, string query_region_chr, unsigned int query_region_start_coord, |
|
| 148 |
+ unsigned int query_region_end_coord, int & overlap_length) |
|
| 149 |
+{
|
|
| 150 |
+ int bin_internal_index = -1; |
|
| 151 |
+ overlap_length = -1; |
|
| 152 |
+ if ( bins_end_coord.find(query_region_chr) != bins_end_coord.end() ) {
|
|
| 153 |
+ // chr name is found in binning system |
|
| 154 |
+ bin_internal_index = find_bin_of_position(bins_end_coord, query_region_chr, query_region_start_coord); |
|
| 155 |
+ if (bin_internal_index != -1) {
|
|
| 156 |
+ unsigned int bin_end_coord = bins_end_coord[query_region_chr][bin_internal_index]; |
|
| 157 |
+ if ( query_region_end_coord > bin_end_coord ) |
|
| 158 |
+ overlap_length = bin_end_coord - query_region_start_coord + 1; |
|
| 159 |
+ else |
|
| 160 |
+ overlap_length = query_region_end_coord - query_region_start_coord + 1; |
|
| 161 |
+ } |
|
| 162 |
+ } |
|
| 163 |
+ return bin_internal_index; |
|
| 164 |
+} |
|
| 165 |
+ |
|
| 166 |
// Write the first `len` elements of `v` to `os` as "[a,b,c]".
//
// len: number of leading elements to print. A value that is zero, negative or
//      larger than v.size() means "print the whole vector".
// An empty vector is written as "[]". No trailing newline is emitted.
void print_uint_vec( std::ostream& os, std::vector<unsigned int>& v, int len )
{
    typedef std::vector<unsigned int>::size_type size_type;

    // Clamp explicitly instead of relying on the implicit signed->unsigned
    // conversion of a negative len in a comparison against v.size().
    size_type count = v.size();
    if (len > 0 && static_cast<size_type>(len) < count)
        count = static_cast<size_type>(len);

    os << "[";
    for (size_type i = 0; i < count; ++i) {
        if (i != 0)
            os << ",";
        os << v[i];
    }
    os << "]";
}
|
| 179 |
+ |
|
| 180 |
+void print_bins( ostream& os, Bins_end_coord & bins_end_coord, Bins_index & bins_index, Bins_info & bins_info) {
|
|
| 181 |
+ Bins_end_coord::iterator it; |
|
| 182 |
+ for (it=bins_end_coord.begin(); it!=bins_end_coord.end(); ++it) {
|
|
| 183 |
+ string chr = it->first; |
|
| 184 |
+ vector<unsigned int> coords=it->second; |
|
| 185 |
+ vector<int> indexes=bins_index[chr]; |
|
| 186 |
+ vector<string> infos=bins_info[chr]; |
|
| 187 |
+ int n_bins = coords.size(); |
|
| 188 |
+ for (int i=0; i<n_bins; i++) {
|
|
| 189 |
+ os << indexes[i] << "\t" << chr << "\t" << coords[i] << "\t'" << infos[i] << "'" << endl; |
|
| 190 |
+ } |
|
| 191 |
+ } |
|
| 192 |
+} |
|
| 193 |
+ |
|
| 194 |
+/* |
|
| 195 |
+void print_bins_fullinfo( Bins_FullInfo & bins_fullinfo ) {
|
|
| 196 |
+ Bins_FullInfo::iterator: it; |
|
| 197 |
+ for (it=bins.begin(); it!=bins.end(); ++it) {
|
|
| 198 |
+ cout << it.first << "\t"; |
|
| 199 |
+ print_int_vec(cout, it.second, it.second.size()); |
|
| 200 |
+ cout << endl; |
|
| 201 |
+ } |
|
| 202 |
+} |
|
| 203 |
+*/ |
|
| 204 |
+ |
|
| 205 |
+// Bins2Values: a map of bin_index -> a vector of values. bin_index is always 1-base |
|
| 206 |
+void create_Bins2Values(int num_bins, int num_of_values, double init_value, Bins2Values & bins2values) |
|
| 207 |
+{
|
|
| 208 |
+ for (int bin_index=1; bin_index<=num_bins; bin_index++) {
|
|
| 209 |
+ vector<double> values; |
|
| 210 |
+ for (int i=0; i<num_of_values; i++) |
|
| 211 |
+ values.push_back( init_value ); |
|
| 212 |
+ bins2values[bin_index] = values; |
|
| 213 |
+ } |
|
| 214 |
+} |
|
| 215 |
+ |
|
| 216 |
+// Bins2Values: a map of marker_index -> a vector of values. |
|
| 217 |
+void create_Bins2Values(vector<int> markers_index, int num_of_values, double init_value, Bins2Values & bins2values) |
|
| 218 |
+{
|
|
| 219 |
+ int num_bins = markers_index.size(); |
|
| 220 |
+ for (int ibin=0; ibin<num_bins; ibin++) {
|
|
| 221 |
+ vector<double> values; |
|
| 222 |
+ for (int i=0; i<num_of_values; i++) |
|
| 223 |
+ values.push_back( init_value ); |
|
| 224 |
+ bins2values[markers_index[ibin]] = values; |
|
| 225 |
+ } |
|
| 226 |
+} |
|
| 227 |
+ |
|
| 228 |
+void print_Bins2Values(Bins2Values & bins2values) |
|
| 229 |
+{
|
|
| 230 |
+ // cout.precision(15); |
|
| 231 |
+ Bins2Values::iterator it; |
|
| 232 |
+ for (it=bins2values.begin(); it!=bins2values.end(); ++it) {
|
|
| 233 |
+ vector<double> & values = it->second; |
|
| 234 |
+ Rcpp::Rcout << it->first; |
|
| 235 |
+ for (int i=0; i<values.size(); i++) |
|
| 236 |
+ Rcpp::Rcout << "\t" << values[i]; |
|
| 237 |
+ Rcpp::Rcout << endl; |
|
| 238 |
+ } |
|
| 239 |
+} |
|
| 240 |
+ |
|
| 241 |
+void print_Bins2UnsignedIntegers(Bins2UnsignedIntegers& bins2values) |
|
| 242 |
+{
|
|
| 243 |
+ Bins2UnsignedIntegers::iterator it; |
|
| 244 |
+ for (it=bins2values.begin(); it!=bins2values.end(); ++it) {
|
|
| 245 |
+ vector<unsigned int> & values = it->second; |
|
| 246 |
+ Rcpp::Rcout << it->first; |
|
| 247 |
+ for (int i=0; i<(int)values.size(); i++) |
|
| 248 |
+ Rcpp::Rcout << "\t" << values[i]; |
|
| 249 |
+ Rcpp::Rcout << endl; |
|
| 250 |
+ } |
|
| 251 |
+} |
|
| 252 |
+ |
|
| 253 |
+// when optional_write==TRUE, we assume there are two values associated with each bin |
|
| 254 |
+void write_Bins2Values(Bins2Values & bins2values, vector<string> & columns_names, |
|
| 255 |
+ string output_file, bool optional_write) |
|
| 256 |
+{
|
|
| 257 |
+ ofstream out; |
|
| 258 |
+ out.open(output_file.c_str()); |
|
| 259 |
+ if (out.fail()){
|
|
| 260 |
+ Rcpp::Rcerr << "Error: Unable to write " << output_file << " in write_Bins2Value()" << endl; |
|
| 261 |
+ // exit(EXIT_FAILURE); |
|
| 262 |
+ } |
|
| 263 |
+ int i; |
|
| 264 |
+ for (i=0; i<columns_names.size()-1; i++) |
|
| 265 |
+ out << columns_names[i] << "\t"; |
|
| 266 |
+ out << columns_names[i] << endl; |
|
| 267 |
+ |
|
| 268 |
+ out.precision(15); |
|
| 269 |
+ Bins2Values::iterator it; |
|
| 270 |
+ for (it=bins2values.begin(); it!=bins2values.end(); ++it) {
|
|
| 271 |
+ vector<double> & values = it->second; |
|
| 272 |
+ out << it->first; |
|
| 273 |
+ if (optional_write) {
|
|
| 274 |
+ // assume there are at least two values associated with each bin |
|
| 275 |
+ // for example, when we have three associated values, they can be |
|
| 276 |
+ // (1) methylation_count |
|
| 277 |
+ // (2) unmethylation_count |
|
| 278 |
+ // (3) number of reads |
|
| 279 |
+ double n = values[0] + values[1]; |
|
| 280 |
+ double v; |
|
| 281 |
+ if (n==0) v=0; |
|
| 282 |
+ else v=values[0]/n; |
|
| 283 |
+ out << "\t" << v << "\t" << n; |
|
| 284 |
+ } |
|
| 285 |
+ for (i=0; i<values.size(); i++) |
|
| 286 |
+ out << "\t" << values[i]; |
|
| 287 |
+ out << endl; |
|
| 288 |
+ } |
|
| 289 |
+ out.close(); |
|
| 290 |
+} |
|
| 291 |
+ |
|
| 292 |
+// Bins2Value: a map of bin_index -> a value. bin_index is always 1-base |
|
| 293 |
+void create_Bins2Value(int num_bins, double init_value, Bins2Value & bins2value) |
|
| 294 |
+{
|
|
| 295 |
+ for (int bin_index=1; bin_index<=num_bins; bin_index++) |
|
| 296 |
+ bins2value[bin_index] = init_value; |
|
| 297 |
+} |
|
| 298 |
+ |
|
| 299 |
+ostream& operator<<(ostream& out, Bins2Value& bins2value) {
|
|
| 300 |
+ out << "bin_index" << "\t" << "value" << endl; |
|
| 301 |
+ out.precision(15); |
|
| 302 |
+ Bins2Value::iterator it; |
|
| 303 |
+ for (it=bins2value.begin(); it!=bins2value.end(); ++it) |
|
| 304 |
+ out << it->first << "\t" << it->second << endl; |
|
| 305 |
+ return(out); |
|
| 306 |
+} |
|
| 307 |
+ |
|
| 308 |
+void write_Bins2Value(Bins2Value & bins2value, string output_file) |
|
| 309 |
+{
|
|
| 310 |
+ ofstream out; |
|
| 311 |
+ out.open(output_file.c_str()); |
|
| 312 |
+ if (out.fail()){
|
|
| 313 |
+ Rcpp::Rcerr << "Error: Unable to write " << output_file << " in write_Bins2Value()" << endl; |
|
| 314 |
+ // exit(EXIT_FAILURE); |
|
| 315 |
+ } |
|
| 316 |
+ out << "bin_index" << "\t" << "value" << endl; |
|
| 317 |
+ out.precision(15); |
|
| 318 |
+ Bins2Value::iterator it; |
|
| 319 |
+ for (it=bins2value.begin(); it!=bins2value.end(); ++it) |
|
| 320 |
+ out << it->first << "\t" << it->second << endl; |
|
| 321 |
+ out.close(); |
|
| 322 |
+} |
|
| 323 |
+ |
| 1 | 1 |
new file mode 100644 |
| ... | ... |
@@ -0,0 +1,322 @@ |
| 1 |
+#include <Rcpp.h> |
|
| 2 |
+using namespace Rcpp; |
|
| 3 |
+ |
|
| 4 |
+#include <cstdlib> |
|
| 5 |
+#include <sstream> |
|
| 6 |
+#include <iostream> |
|
| 7 |
+#include <fstream> |
|
| 8 |
+#include <map> |
|
| 9 |
+#include <vector> |
|
| 10 |
+#include <string> |
|
| 11 |
+#include <cmath> |
|
| 12 |
+#include <cstdlib> |
|
| 13 |
+#include <boost/algorithm/string.hpp> |
|
| 14 |
+#include <boost/assign.hpp> |
|
| 15 |
+#include "data_types.h" |
|
| 16 |
+ |
|
| 17 |
+using namespace std; |
|
| 18 |
+using namespace boost; |
|
| 19 |
+ |
|
| 20 |
+// |
|
| 21 |
+// bins (or features) annotation format |
|
| 22 |
+// |
|
| 23 |
+//1. file of bins (features) annotation: "biomarkers.all_bins" |
|
| 24 |
+//Each line is a bin (or feature). All columns are delimited by TAB. There is one header line. |
|
| 25 |
+//Column 1: marker_index, 1-based index. Those bins that are not markers, are indexed as 0. |
|
| 26 |
+//Column 2: chr |
|
| 27 |
+//Column 3: start coordinate of read (1-base) |
|
| 28 |
+//Column 4: end coordinate of read (1-base). The range of the bin is [start, end) |
|
| 29 |
+//Column 5: marker_type. "I" is Marker_type_I, "II" is Marker_type_II, "-" is the complementary bin, only facilitating the searching. |
|
| 30 |
+// |
|
| 31 |
+// The following is an example file |
|
| 32 |
+// |
|
| 33 |
+//marker_index chr start end marker_type |
|
| 34 |
+//0 chr1 1 855266 - |
|
| 35 |
+//1 chr1 855266 855766 II |
|
| 36 |
+//0 chr1 855766 969796 - |
|
| 37 |
+//2 chr1 969796 970296 II |
|
| 38 |
+//0 chr1 970296 1099044 - |
|
| 39 |
+//3 chr1 1099044 1099544 II |
|
| 40 |
+//0 chr1 1099544 1109315 - |
|
| 41 |
+//4 chr1 1109315 1109815 II |
|
| 42 |
+// |
|
| 43 |
+void read_bins_annot_file(string input_bins_annot_file, Bins_end_coord & bins_end_coord, |
|
| 44 |
+ Bins_index & bins_index, Bins_info & bins_info, bool has_header_line=true) |
|
| 45 |
+{
|
|
| 46 |
+ ifstream fin; |
|
| 47 |
+ fin.open(input_bins_annot_file.c_str()); |
|
| 48 |
+ if (fin.fail()){
|
|
| 49 |
+ Rcpp::Rcerr << "Error: Unable to open " << input_bins_annot_file << " in read_bins_annot_file()" << endl; |
|
| 50 |
+ // exit(EXIT_FAILURE); |
|
| 51 |
+ } |
|
| 52 |
+ string line; |
|
| 53 |
+ if (has_header_line) |
|
| 54 |
+ // skip the first header line |
|
| 55 |
+ getline(fin, line); |
|
| 56 |
+ unsigned long i=0; |
|
| 57 |
+ string old_chr; |
|
| 58 |
+ while (!fin.eof()) {
|
|
| 59 |
+ getline(fin, line); |
|
| 60 |
+ if (line.empty()) {
|
|
| 61 |
+ // this is the last line of the file |
|
| 62 |
+ break; |
|
| 63 |
+ } |
|
| 64 |
+ //cout << line << endl; |
|
| 65 |
+ |
|
| 66 |
+ vector<string> strs1; |
|
| 67 |
+ split(strs1, line, is_any_of("\t"));
|
|
| 68 |
+ int bin_index = atoi(strs1[0].c_str()); |
|
| 69 |
+ string chr = strs1[1]; |
|
| 70 |
+ //int start_coord = atoi(strs1[2].c_str()); // start coordinate |
|
| 71 |
+ int end_coord = atoi(strs1[3].c_str()); // end coordinate |
|
| 72 |
+ if (i==0) {
|
|
| 73 |
+ // this is the first bin of all the genome, so we initialize old_chr |
|
| 74 |
+ old_chr = chr; |
|
| 75 |
+ } |
|
| 76 |
+ if (chr.compare(old_chr)!=0) {
|
|
| 77 |
+ // This is the 1st bin of the new chromosome |
|
| 78 |
+ old_chr = chr; |
|
| 79 |
+ bins_end_coord.insert(make_pair(chr, vector<unsigned int>())); |
|
| 80 |
+ bins_index.insert(make_pair(chr, vector<int >())); |
|
| 81 |
+ bins_info.insert(make_pair(chr, vector<string >())); |
|
| 82 |
+ } |
|
| 83 |
+ vector<unsigned int> & coords = bins_end_coord[chr]; |
|
| 84 |
+ coords.push_back( end_coord ); |
|
| 85 |
+ vector<int> & indexes = bins_index[chr]; |
|
| 86 |
+ indexes.push_back( bin_index ); |
|
| 87 |
+ vector<string> & infos = bins_info[chr]; |
|
| 88 |
+ infos.push_back( line ); |
|
| 89 |
+ i++; |
|
| 90 |
+ } |
|
| 91 |
+ Rcpp::Rcerr << "#bins=" << i << endl; |
|
| 92 |
+} |
|
| 93 |
+ |
|
| 94 |
+int get_num_of_non_void_bins(Bins_index & bins_index, vector<int> & returned_markers_index) |
|
| 95 |
+{
|
|
| 96 |
+ int num_of_non_void_bins = 0; |
|
| 97 |
+ Bins_index::iterator it; |
|
| 98 |
+ for (it=bins_index.begin(); it!=bins_index.end(); ++it) {
|
|
| 99 |
+ vector<int> & bins_of_chr = it->second; |
|
| 100 |
+ for (int i=0; i<bins_of_chr.size(); i++) |
|
| 101 |
+ if (bins_of_chr[i] > 0) {
|
|
| 102 |
+ num_of_non_void_bins++; |
|
| 103 |
+ returned_markers_index.push_back( bins_of_chr[i] ); |
|
| 104 |
+ } |
|
| 105 |
+ } |
|
| 106 |
+ return num_of_non_void_bins; |
|
| 107 |
+} |
|
| 108 |
+ |
|
| 109 |
+// We find out the index of the bin with the range [bin_start_coord, bin_end_coord), where both "bin_start_coord" and "bin_end_coord" are 0-base. |
|
| 110 |
+// Returned bin_internal_index is 0-base. If not found, return -1 |
|
| 111 |
+int find_exact_bin(Bins_end_coord & bins_end_coord, string chr, unsigned int bin_start_coord, unsigned int bin_end_coord) {
|
|
| 112 |
+ int bin_internal_index=-1; |
|
| 113 |
+ vector<unsigned int> & coords_bins_of_chr = bins_end_coord[chr]; // a vector of end coordinates (1-base) of this chr. |
|
| 114 |
+ vector<unsigned int>::iterator bin_it = find( coords_bins_of_chr.begin(), coords_bins_of_chr.end(), bin_end_coord); |
|
| 115 |
+ if (bin_it!=coords_bins_of_chr.end()) {
|
|
| 116 |
+ // found |
|
| 117 |
+ bin_internal_index = bin_it-coords_bins_of_chr.begin(); |
|
| 118 |
+ } |
|
| 119 |
+ return bin_internal_index; |
|
| 120 |
+} |
|
| 121 |
+ |
|
| 122 |
+// We find out the index of the bin with the range [bin_start_coord, bin_end_coord), where the input paramter "position" (1-base) falls into this bin. |
|
| 123 |
+// Returned bin_internal_index is 0-base. If not found, return -1 |
|
| 124 |
+int find_bin_of_position(Bins_end_coord & bins_end_coord, string chr, unsigned int position) {
|
|
| 125 |
+ int bin_internal_index=-1; |
|
| 126 |
+ if ( bins_end_coord.find(chr) == bins_end_coord.end() ) {
|
|
| 127 |
+ // chr name is not found in binning system |
|
| 128 |
+ bin_internal_index = -1; |
|
| 129 |
+ } else {
|
|
| 130 |
+ vector<unsigned int> & coords_bins_of_chr = bins_end_coord[chr]; // a vector of end coordinates (1-base) of this chr. |
|
| 131 |
+ vector<unsigned int>::iterator bin_it = lower_bound( coords_bins_of_chr.begin(), coords_bins_of_chr.end(), position); |
|
| 132 |
+ if (position==*bin_it) bin_it++; |
|
| 133 |
+ bin_internal_index = bin_it - coords_bins_of_chr.begin(); // bin_internal_index is 0-base |
|
| 134 |
+ if (bin_internal_index==coords_bins_of_chr.size()) {
|
|
| 135 |
+ //cerr << "position(1-base): " << position << " doesn't exist in " << chr << endl; |
|
| 136 |
+ bin_internal_index=-1; |
|
| 137 |
+ } |
|
| 138 |
+ } |
|
| 139 |
+ return bin_internal_index; |
|
| 140 |
+} |
|
| 141 |
+ |
|
| 142 |
+// Each bin is in the range [ bins_end_coord[i-1], bins_end_coord[i] ) |
|
| 143 |
+// Given a query region, we want to know which bin has overlap with this query region. If there is overlap, |
|
| 144 |
+// return (1) bin index, and (2) the overlap length |
|
| 145 |
+// ongoing devevloping |
|
| 146 |
+int find_overlap_bin(Bins_end_coord & bins_end_coord, string query_region_chr, unsigned int query_region_start_coord, |
|
| 147 |
+ unsigned int query_region_end_coord, int & overlap_length) |
|
| 148 |
+{
|
|
| 149 |
+ int bin_internal_index = -1; |
|
| 150 |
+ overlap_length = -1; |
|
| 151 |
+ if ( bins_end_coord.find(query_region_chr) != bins_end_coord.end() ) {
|
|
| 152 |
+ // chr name is found in binning system |
|
| 153 |
+ bin_internal_index = find_bin_of_position(bins_end_coord, query_region_chr, query_region_start_coord); |
|
| 154 |
+ if (bin_internal_index != -1) {
|
|
| 155 |
+ unsigned int bin_end_coord = bins_end_coord[query_region_chr][bin_internal_index]; |
|
| 156 |
+ if ( query_region_end_coord > bin_end_coord ) |
|
| 157 |
+ overlap_length = bin_end_coord - query_region_start_coord + 1; |
|
| 158 |
+ else |
|
| 159 |
+ overlap_length = query_region_end_coord - query_region_start_coord + 1; |
|
| 160 |
+ } |
|
| 161 |
+ } |
|
| 162 |
+ return bin_internal_index; |
|
| 163 |
+} |
|
| 164 |
+ |
|
| 165 |
// Print the first `len` elements of v to os as "[a,b,c]".
//
// A len of 0, a negative len, or a len larger than v.size() prints the whole
// vector. An empty vector prints "[]". No newline is written.
void print_uint_vec( std::ostream& os, std::vector<unsigned int>& v, int len )
{
	typedef std::vector<unsigned int>::size_type size_type;

	// Resolve the element count in unsigned space explicitly instead of
	// relying on the implicit int -> size_t conversion in a comparison.
	size_type count = v.size();
	if (len > 0 && static_cast<size_type>(len) < count)
		count = static_cast<size_type>(len);

	os << "[";
	for (size_type k = 0; k < count; ++k) {
		if (k > 0)
			os << ",";
		os << v[k];
	}
	os << "]";
}
|
| 178 |
+ |
|
| 179 |
+void print_bins( ostream& os, Bins_end_coord & bins_end_coord, Bins_index & bins_index, Bins_info & bins_info) {
|
|
| 180 |
+ Bins_end_coord::iterator it; |
|
| 181 |
+ for (it=bins_end_coord.begin(); it!=bins_end_coord.end(); ++it) {
|
|
| 182 |
+ string chr = it->first; |
|
| 183 |
+ vector<unsigned int> coords=it->second; |
|
| 184 |
+ vector<int> indexes=bins_index[chr]; |
|
| 185 |
+ vector<string> infos=bins_info[chr]; |
|
| 186 |
+ int n_bins = coords.size(); |
|
| 187 |
+ for (int i=0; i<n_bins; i++) {
|
|
| 188 |
+ os << indexes[i] << "\t" << chr << "\t" << coords[i] << "\t'" << infos[i] << "'" << endl; |
|
| 189 |
+ } |
|
| 190 |
+ } |
|
| 191 |
+} |
|
| 192 |
+ |
|
| 193 |
+/* |
|
| 194 |
+void print_bins_fullinfo( Bins_FullInfo & bins_fullinfo ) {
|
|
| 195 |
+ Bins_FullInfo::iterator: it; |
|
| 196 |
+ for (it=bins.begin(); it!=bins.end(); ++it) {
|
|
| 197 |
+ cout << it.first << "\t"; |
|
| 198 |
+ print_int_vec(cout, it.second, it.second.size()); |
|
| 199 |
+ cout << endl; |
|
| 200 |
+ } |
|
| 201 |
+} |
|
| 202 |
+*/ |
|
| 203 |
+ |
|
| 204 |
+// Bins2Values: a map of bin_index -> a vector of values. bin_index is always 1-base |
|
| 205 |
+void create_Bins2Values(int num_bins, int num_of_values, double init_value, Bins2Values & bins2values) |
|
| 206 |
+{
|
|
| 207 |
+ for (int bin_index=1; bin_index<=num_bins; bin_index++) {
|
|
| 208 |
+ vector<double> values; |
|
| 209 |
+ for (int i=0; i<num_of_values; i++) |
|
| 210 |
+ values.push_back( init_value ); |
|
| 211 |
+ bins2values[bin_index] = values; |
|
| 212 |
+ } |
|
| 213 |
+} |
|
| 214 |
+ |
|
| 215 |
+// Bins2Values: a map of marker_index -> a vector of values. |
|
| 216 |
+void create_Bins2Values(vector<int> markers_index, int num_of_values, double init_value, Bins2Values & bins2values) |
|
| 217 |
+{
|
|
| 218 |
+ int num_bins = markers_index.size(); |
|
| 219 |
+ for (int ibin=0; ibin<num_bins; ibin++) {
|
|
| 220 |
+ vector<double> values; |
|
| 221 |
+ for (int i=0; i<num_of_values; i++) |
|
| 222 |
+ values.push_back( init_value ); |
|
| 223 |
+ bins2values[markers_index[ibin]] = values; |
|
| 224 |
+ } |
|
| 225 |
+} |
|
| 226 |
+ |
|
| 227 |
+void print_Bins2Values(Bins2Values & bins2values) |
|
| 228 |
+{
|
|
| 229 |
+ // cout.precision(15); |
|
| 230 |
+ Bins2Values::iterator it; |
|
| 231 |
+ for (it=bins2values.begin(); it!=bins2values.end(); ++it) {
|
|
| 232 |
+ vector<double> & values = it->second; |
|
| 233 |
+ Rcpp::Rcout << it->first; |
|
| 234 |
+ for (int i=0; i<values.size(); i++) |
|
| 235 |
+ Rcpp::Rcout << "\t" << values[i]; |
|
| 236 |
+ Rcpp::Rcout << endl; |
|
| 237 |
+ } |
|
| 238 |
+} |
|
| 239 |
+ |
|
| 240 |
+void print_Bins2UnsignedIntegers(Bins2UnsignedIntegers& bins2values) |
|
| 241 |
+{
|
|
| 242 |
+ Bins2UnsignedIntegers::iterator it; |
|
| 243 |
+ for (it=bins2values.begin(); it!=bins2values.end(); ++it) {
|
|
| 244 |
+ vector<unsigned int> & values = it->second; |
|
| 245 |
+ Rcpp::Rcout << it->first; |
|
| 246 |
+ for (int i=0; i<(int)values.size(); i++) |
|
| 247 |
+ Rcpp::Rcout << "\t" << values[i]; |
|
| 248 |
+ Rcpp::Rcout << endl; |
|
| 249 |
+ } |
|
| 250 |
+} |
|
| 251 |
+ |
|
| 252 |
+// when optional_write==TRUE, we assume there are two values associated with each bin |
|
| 253 |
+void write_Bins2Values(Bins2Values & bins2values, vector<string> & columns_names, |
|
| 254 |
+ string output_file, bool optional_write) |
|
| 255 |
+{
|
|
| 256 |
+ ofstream out; |
|
| 257 |
+ out.open(output_file.c_str()); |
|
| 258 |
+ if (out.fail()){
|
|
| 259 |
+ Rcpp::Rcerr << "Error: Unable to write " << output_file << " in write_Bins2Value()" << endl; |
|
| 260 |
+ // exit(EXIT_FAILURE); |
|
| 261 |
+ } |
|
| 262 |
+ int i; |
|
| 263 |
+ for (i=0; i<columns_names.size()-1; i++) |
|
| 264 |
+ out << columns_names[i] << "\t"; |
|
| 265 |
+ out << columns_names[i] << endl; |
|
| 266 |
+ |
|
| 267 |
+ out.precision(15); |
|
| 268 |
+ Bins2Values::iterator it; |
|
| 269 |
+ for (it=bins2values.begin(); it!=bins2values.end(); ++it) {
|
|
| 270 |
+ vector<double> & values = it->second; |
|
| 271 |
+ out << it->first; |
|
| 272 |
+ if (optional_write) {
|
|
| 273 |
+ // assume there are at least two values associated with each bin |
|
| 274 |
+ // for example, when we have three associated values, they can be |
|
| 275 |
+ // (1) methylation_count |
|
| 276 |
+ // (2) unmethylation_count |
|
| 277 |
+ // (3) number of reads |
|
| 278 |
+ double n = values[0] + values[1]; |
|
| 279 |
+ double v; |
|
| 280 |
+ if (n==0) v=0; |
|
| 281 |
+ else v=values[0]/n; |
|
| 282 |
+ out << "\t" << v << "\t" << n; |
|
| 283 |
+ } |
|
| 284 |
+ for (i=0; i<values.size(); i++) |
|
| 285 |
+ out << "\t" << values[i]; |
|
| 286 |
+ out << endl; |
|
| 287 |
+ } |
|
| 288 |
+ out.close(); |
|
| 289 |
+} |
|
| 290 |
+ |
|
| 291 |
+// Bins2Value: a map of bin_index -> a value. bin_index is always 1-base |
|
| 292 |
+void create_Bins2Value(int num_bins, double init_value, Bins2Value & bins2value) |
|
| 293 |
+{
|
|
| 294 |
+ for (int bin_index=1; bin_index<=num_bins; bin_index++) |
|
| 295 |
+ bins2value[bin_index] = init_value; |
|
| 296 |
+} |
|
| 297 |
+ |
|
| 298 |
+ostream& operator<<(ostream& out, Bins2Value& bins2value) {
|
|
| 299 |
+ out << "bin_index" << "\t" << "value" << endl; |
|
| 300 |
+ out.precision(15); |
|
| 301 |
+ Bins2Value::iterator it; |
|
| 302 |
+ for (it=bins2value.begin(); it!=bins2value.end(); ++it) |
|
| 303 |
+ out << it->first << "\t" << it->second << endl; |
|
| 304 |
+ return(out); |
|
| 305 |
+} |
|
| 306 |
+ |
|
| 307 |
+void write_Bins2Value(Bins2Value & bins2value, string output_file) |
|
| 308 |
+{
|
|
| 309 |
+ ofstream out; |
|
| 310 |
+ out.open(output_file.c_str()); |
|
| 311 |
+ if (out.fail()){
|
|
| 312 |
+ Rcpp::Rcerr << "Error: Unable to write " << output_file << " in write_Bins2Value()" << endl; |
|
| 313 |
+ // exit(EXIT_FAILURE); |
|
| 314 |
+ } |
|
| 315 |
+ out << "bin_index" << "\t" << "value" << endl; |
|
| 316 |
+ out.precision(15); |
|
| 317 |
+ Bins2Value::iterator it; |
|
| 318 |
+ for (it=bins2value.begin(); it!=bins2value.end(); ++it) |
|
| 319 |
+ out << it->first << "\t" << it->second << endl; |
|
| 320 |
+ out.close(); |
|
| 321 |
+} |
|
| 322 |
+ |