Thanks to visit codestin.com
Credit goes to code.bioconductor.org

Browse code

win version

RoseHuRan authored on 29/03/2022 16:48:36
Showing 1 changed files
... ...
@@ -1,322 +1,323 @@
1
-#include <Rcpp.h>
2
-using namespace Rcpp;
3
-
4
-#include <cstdlib>
5
-#include <sstream>
6
-#include <iostream>
7
-#include <fstream>
8
-#include <map>
9
-#include <vector>
10
-#include <string>
11
-#include <cmath>
12
-#include <cstdlib>
13
-#include <boost/algorithm/string.hpp>
14
-#include <boost/assign.hpp>
15
-#include "data_types.h"
16
-
17
-using namespace std;
18
-using namespace boost;
19
-
20
-//
21
-// bins (or features) annotation format
22
-//
23
-//1. file of bins (features) annotation: "biomarkers.all_bins"
24
-//Each line is a bin (or feature). All columns are delimited by TAB. There is one header line.
25
-//Column 1: marker_index, 1-based index. Those bins that are not markers, are indexed as 0.
26
-//Column 2: chr
27
-//Column 3: start coordinate of read (1-base)
28
-//Column 4: end coordinate of read (1-base). The range of the bin is [start, end)
29
-//Column 5: marker_type. "I" is Marker_type_I, "II" is Marker_type_II, "-" is the complementary bin, only facilitating the searching.
30
-//
31
-// The following is an example file
32
-//
33
-//marker_index    chr     start   end     marker_type
34
-//0       chr1    1       855266  -
35
-//1       chr1    855266  855766  II
36
-//0       chr1    855766  969796  -
37
-//2       chr1    969796  970296  II
38
-//0       chr1    970296  1099044 -
39
-//3       chr1    1099044 1099544 II
40
-//0       chr1    1099544 1109315 -
41
-//4       chr1    1109315 1109815 II
42
-//
43
-void read_bins_annot_file(string input_bins_annot_file, Bins_end_coord & bins_end_coord,
44
-	Bins_index & bins_index, Bins_info & bins_info, bool has_header_line=true)
45
-{
46
-	ifstream fin;
47
-	fin.open(input_bins_annot_file.c_str());
48
-	if (fin.fail()){
49
-	  Rcpp::Rcerr << "Error: Unable to open " << input_bins_annot_file << " in read_bins_annot_file()" << endl;
50
-	  // exit(EXIT_FAILURE);
51
-  }
52
-	string line;
53
-	if (has_header_line)
54
-		// skip the first header line
55
-		getline(fin, line);
56
-	unsigned long i=0;
57
-	string old_chr;
58
-	while (!fin.eof()) {
59
-		getline(fin, line);
60
-		if (line.empty()) {
61
-			// this is the last line of the file
62
-			break;
63
-		}
64
-		//cout << line << endl;
65
-
66
-		vector<string> strs1;
67
-		split(strs1, line, is_any_of("\t"));
68
-		int bin_index = atoi(strs1[0].c_str());
69
-		string chr = strs1[1];
70
-		//int start_coord = atoi(strs1[2].c_str()); // start coordinate
71
-		int end_coord = atoi(strs1[3].c_str()); // end coordinate
72
-		if (i==0) {
73
-			// this is the first bin of all the genome, so we initialize old_chr
74
-			old_chr = chr;
75
-		}
76
-		if (chr.compare(old_chr)!=0) {
77
-			// This is the 1st bin of the new chromosome
78
-			old_chr = chr;
79
-			bins_end_coord.insert(make_pair(chr, vector<unsigned int>()));
80
-			bins_index.insert(make_pair(chr, vector<int >()));
81
-			bins_info.insert(make_pair(chr, vector<string >()));
82
-		}
83
-		vector<unsigned int> & coords = bins_end_coord[chr];
84
-		coords.push_back( end_coord );
85
-		vector<int> & indexes = bins_index[chr];
86
-		indexes.push_back( bin_index );
87
-		vector<string> & infos = bins_info[chr];
88
-		infos.push_back( line );
89
-		i++;
90
-	}
91
-	Rcpp::Rcerr << "#bins=" << i << endl;
92
-}
93
-
94
-int get_num_of_non_void_bins(Bins_index & bins_index, vector<int> & returned_markers_index)
95
-{
96
-	int num_of_non_void_bins = 0;
97
-	Bins_index::iterator it;
98
-	for (it=bins_index.begin(); it!=bins_index.end(); ++it) {
99
-		vector<int> & bins_of_chr = it->second;
100
-		for (int i=0; i<bins_of_chr.size(); i++)
101
-			if (bins_of_chr[i] > 0) {
102
-				num_of_non_void_bins++;
103
-				returned_markers_index.push_back( bins_of_chr[i] );
104
-			}
105
-	}
106
-	return num_of_non_void_bins;
107
-}
108
-
109
-// We find out the index of the bin with the range [bin_start_coord, bin_end_coord), where both "bin_start_coord" and "bin_end_coord" are 0-base.
110
-// Returned bin_internal_index is 0-base. If not found, return -1
111
-int find_exact_bin(Bins_end_coord & bins_end_coord, string chr, unsigned int bin_start_coord, unsigned int bin_end_coord) {
112
-	int bin_internal_index=-1;
113
-	vector<unsigned int> & coords_bins_of_chr = bins_end_coord[chr]; // a vector of end coordinates (1-base) of this chr.
114
-	vector<unsigned int>::iterator bin_it = find( coords_bins_of_chr.begin(), coords_bins_of_chr.end(), bin_end_coord);
115
-	if (bin_it!=coords_bins_of_chr.end()) {
116
-		// found
117
-		bin_internal_index = bin_it-coords_bins_of_chr.begin();
118
-	}
119
-	return bin_internal_index;
120
-}
121
-
122
-// We find out the index of the bin with the range [bin_start_coord, bin_end_coord), where the input paramter "position" (1-base) falls into this bin.
123
-// Returned bin_internal_index is 0-base. If not found, return -1
124
-int find_bin_of_position(Bins_end_coord & bins_end_coord, string chr, unsigned int position) {
125
-	int bin_internal_index=-1;
126
-	if ( bins_end_coord.find(chr) == bins_end_coord.end() ) {
127
-		// chr name is not found in binning system
128
-		bin_internal_index = -1;
129
-	} else {
130
-		vector<unsigned int> & coords_bins_of_chr = bins_end_coord[chr]; // a vector of end coordinates (1-base) of this chr.
131
-		vector<unsigned int>::iterator bin_it = lower_bound( coords_bins_of_chr.begin(), coords_bins_of_chr.end(), position);
132
-		if (position==*bin_it) bin_it++;
133
-		bin_internal_index = bin_it - coords_bins_of_chr.begin(); // bin_internal_index is 0-base
134
-		if (bin_internal_index==coords_bins_of_chr.size()) {
135
-			//cerr << "position(1-base): " << position << " doesn't exist in " << chr << endl;
136
-			bin_internal_index=-1;
137
-		}
138
-	}
139
-	return bin_internal_index;
140
-}
141
-
142
-// Each bin is in the range [ bins_end_coord[i-1], bins_end_coord[i] )
143
-// Given a query region, we want to know which bin has overlap with this query region. If there is overlap,
144
-// return (1) bin index, and (2) the overlap length
145
-// ongoing devevloping
146
-int find_overlap_bin(Bins_end_coord & bins_end_coord, string query_region_chr, unsigned int query_region_start_coord,
147
-	unsigned int query_region_end_coord, int & overlap_length)
148
-{
149
-	int bin_internal_index = -1;
150
-	overlap_length = -1;
151
-	if ( bins_end_coord.find(query_region_chr) != bins_end_coord.end() ) {
152
-		// chr name is found in binning system
153
-		bin_internal_index = find_bin_of_position(bins_end_coord, query_region_chr, query_region_start_coord);
154
-		if (bin_internal_index != -1) {
155
-			unsigned int bin_end_coord = bins_end_coord[query_region_chr][bin_internal_index];
156
-			if ( query_region_end_coord > bin_end_coord )
157
-				overlap_length = bin_end_coord - query_region_start_coord + 1;
158
-			else
159
-				overlap_length = query_region_end_coord - query_region_start_coord + 1;
160
-		}
161
-	}
162
-	return bin_internal_index;
163
-}
164
-
165
// Write the first `len` elements of v to os as "[a,b,c]".
// A `len` that is zero, negative, or larger than v.size() means "print every element";
// an empty vector prints "[]". The rule is spelled out explicitly rather than relying on
// an implicit signed-to-unsigned conversion when comparing len with v.size().
void print_uint_vec( ostream& os, vector<unsigned int>& v, int len )
{
	const size_t total = v.size();
	const size_t count = (len <= 0 || static_cast<size_t>(len) > total) ? total : static_cast<size_t>(len);
	os << "[";
	for (size_t k = 0; k < count; ++k) {
		if (k > 0)
			os << ",";
		os << v[k];
	}
	os << "]";
}
178
-
179
-void print_bins( ostream& os, Bins_end_coord & bins_end_coord, Bins_index & bins_index, Bins_info & bins_info) {
180
-	Bins_end_coord::iterator it;
181
-	for (it=bins_end_coord.begin(); it!=bins_end_coord.end(); ++it) {
182
-		string chr = it->first;
183
-		vector<unsigned int> coords=it->second;
184
-		vector<int> indexes=bins_index[chr];
185
-		vector<string> infos=bins_info[chr];
186
-		int n_bins = coords.size();
187
-		for (int i=0; i<n_bins; i++) {
188
-			os << indexes[i] << "\t" << chr << "\t" << coords[i] << "\t'" << infos[i] << "'"  << endl;
189
-		}
190
-	}
191
-}
192
-
193
-/*
194
-void print_bins_fullinfo( Bins_FullInfo & bins_fullinfo ) {
195
-	Bins_FullInfo::iterator: it;
196
-	for (it=bins.begin(); it!=bins.end(); ++it) {
197
-		cout << it.first << "\t";
198
-		print_int_vec(cout, it.second, it.second.size());
199
-		cout << endl;
200
-	}
201
-}
202
-*/
203
-
204
-// Bins2Values: a map of bin_index -> a vector of values. bin_index is always 1-base
205
-void create_Bins2Values(int num_bins, int num_of_values, double init_value, Bins2Values & bins2values)
206
-{
207
-	for (int bin_index=1; bin_index<=num_bins; bin_index++) {
208
-		vector<double> values;
209
-		for (int i=0; i<num_of_values; i++)
210
-			values.push_back( init_value );
211
-		bins2values[bin_index] = values;
212
-	}
213
-}
214
-
215
-// Bins2Values: a map of marker_index -> a vector of values.
216
-void create_Bins2Values(vector<int> markers_index, int num_of_values, double init_value, Bins2Values & bins2values)
217
-{
218
-	int num_bins = markers_index.size();
219
-	for (int ibin=0; ibin<num_bins; ibin++) {
220
-		vector<double> values;
221
-		for (int i=0; i<num_of_values; i++)
222
-			values.push_back( init_value );
223
-		bins2values[markers_index[ibin]] = values;
224
-	}
225
-}
226
-
227
-void print_Bins2Values(Bins2Values & bins2values)
228
-{
229
-	// cout.precision(15);
230
-	Bins2Values::iterator it;
231
-	for (it=bins2values.begin(); it!=bins2values.end(); ++it) {
232
-		vector<double> & values = it->second;
233
-	  Rcpp::Rcout << it->first;
234
-		for (int i=0; i<values.size(); i++)
235
-		  Rcpp::Rcout << "\t" << values[i];
236
-		Rcpp::Rcout << endl;
237
-	}
238
-}
239
-
240
-void print_Bins2UnsignedIntegers(Bins2UnsignedIntegers& bins2values)
241
-{
242
-	Bins2UnsignedIntegers::iterator it;
243
-	for (it=bins2values.begin(); it!=bins2values.end(); ++it) {
244
-		vector<unsigned int> & values = it->second;
245
-	  Rcpp::Rcout << it->first;
246
-		for (int i=0; i<(int)values.size(); i++)
247
-		  Rcpp::Rcout << "\t" << values[i];
248
-		Rcpp::Rcout << endl;
249
-	}
250
-}
251
-
252
-// when optional_write==TRUE, we assume there are two values associated with each bin
253
-void write_Bins2Values(Bins2Values & bins2values, vector<string> & columns_names,
254
-	string output_file, bool optional_write)
255
-{
256
-	ofstream out;
257
-	out.open(output_file.c_str());
258
-	if (out.fail()){
259
-	  Rcpp::Rcerr << "Error: Unable to write " << output_file << " in write_Bins2Value()" << endl;
260
-	  // exit(EXIT_FAILURE);
261
-	}
262
-	int i;
263
-	for (i=0; i<columns_names.size()-1; i++)
264
-		out << columns_names[i] << "\t";
265
-	out << columns_names[i] << endl;
266
-
267
-	out.precision(15);
268
-	Bins2Values::iterator it;
269
-	for (it=bins2values.begin(); it!=bins2values.end(); ++it) {
270
-		vector<double> & values = it->second;
271
-		out << it->first;
272
-		if (optional_write) {
273
-			// assume there are at least two values associated with each bin
274
-			// for example, when we have three associated values, they can be
275
-			// (1) methylation_count
276
-			// (2) unmethylation_count
277
-			// (3) number of reads
278
-			double n = values[0] + values[1];
279
-			double v;
280
-			if (n==0) v=0;
281
-			else v=values[0]/n;
282
-			out << "\t" << v << "\t" << n;
283
-		}
284
-		for (i=0; i<values.size(); i++)
285
-			out << "\t" << values[i];
286
-		out << endl;
287
-	}
288
-	out.close();
289
-}
290
-
291
-// Bins2Value: a map of bin_index -> a value. bin_index is always 1-base
292
-void create_Bins2Value(int num_bins, double init_value, Bins2Value & bins2value)
293
-{
294
-	for (int bin_index=1; bin_index<=num_bins; bin_index++)
295
-		bins2value[bin_index] = init_value;
296
-}
297
-
298
-ostream& operator<<(ostream& out, Bins2Value& bins2value) {
299
-	out << "bin_index" << "\t" << "value" << endl;
300
-	out.precision(15);
301
-	Bins2Value::iterator it;
302
-	for (it=bins2value.begin(); it!=bins2value.end(); ++it)
303
-		out << it->first << "\t" << it->second << endl;
304
-	return(out);
305
-}
306
-
307
-void write_Bins2Value(Bins2Value & bins2value, string output_file)
308
-{
309
-	ofstream out;
310
-	out.open(output_file.c_str());
311
-	if (out.fail()){
312
-	  Rcpp::Rcerr << "Error: Unable to write " << output_file << " in write_Bins2Value()" << endl;
313
-	  // exit(EXIT_FAILURE);
314
-	}
315
-	out << "bin_index" << "\t" << "value" << endl;
316
-	out.precision(15);
317
-	Bins2Value::iterator it;
318
-	for (it=bins2value.begin(); it!=bins2value.end(); ++it)
319
-		out << it->first << "\t" << it->second << endl;
320
-	out.close();
321
-}
322
-
1
+// [[Rcpp::depends(BH)]]
2
+#include <Rcpp.h>
3
+using namespace Rcpp;
4
+
5
+#include <cstdlib>
6
+#include <sstream>
7
+#include <iostream>
8
+#include <fstream>
9
+#include <map>
10
+#include <vector>
11
+#include <string>
12
+#include <cmath>
13
+#include <cstdlib>
14
+#include <boost/algorithm/string.hpp>
15
+#include <boost/assign.hpp>
16
+#include "data_types.h"
17
+
18
+using namespace std;
19
+using namespace boost;
20
+
21
+//
22
+// bins (or features) annotation format
23
+//
24
+//1. file of bins (features) annotation: "biomarkers.all_bins"
25
+//Each line is a bin (or feature). All columns are delimited by TAB. There is one header line.
26
+//Column 1: marker_index, 1-based index. Those bins that are not markers, are indexed as 0.
27
+//Column 2: chr
28
+//Column 3: start coordinate of read (1-base)
29
+//Column 4: end coordinate of read (1-base). The range of the bin is [start, end)
30
+//Column 5: marker_type. "I" is Marker_type_I, "II" is Marker_type_II, "-" is the complementary bin, only facilitating the searching.
31
+//
32
+// The following is an example file
33
+//
34
+//marker_index    chr     start   end     marker_type
35
+//0       chr1    1       855266  -
36
+//1       chr1    855266  855766  II
37
+//0       chr1    855766  969796  -
38
+//2       chr1    969796  970296  II
39
+//0       chr1    970296  1099044 -
40
+//3       chr1    1099044 1099544 II
41
+//0       chr1    1099544 1109315 -
42
+//4       chr1    1109315 1109815 II
43
+//
44
+void read_bins_annot_file(string input_bins_annot_file, Bins_end_coord & bins_end_coord,
45
+	Bins_index & bins_index, Bins_info & bins_info, bool has_header_line=true)
46
+{
47
+	ifstream fin;
48
+	fin.open(input_bins_annot_file.c_str());
49
+	if (fin.fail()){
50
+	  Rcpp::Rcerr << "Error: Unable to open " << input_bins_annot_file << " in read_bins_annot_file()" << endl;
51
+	  // exit(EXIT_FAILURE);
52
+  }
53
+	string line;
54
+	if (has_header_line)
55
+		// skip the first header line
56
+		getline(fin, line);
57
+	unsigned long i=0;
58
+	string old_chr;
59
+	while (!fin.eof()) {
60
+		getline(fin, line);
61
+		if (line.empty()) {
62
+			// this is the last line of the file
63
+			break;
64
+		}
65
+		//cout << line << endl;
66
+
67
+		vector<string> strs1;
68
+		split(strs1, line, is_any_of("\t"));
69
+		int bin_index = atoi(strs1[0].c_str());
70
+		string chr = strs1[1];
71
+		//int start_coord = atoi(strs1[2].c_str()); // start coordinate
72
+		int end_coord = atoi(strs1[3].c_str()); // end coordinate
73
+		if (i==0) {
74
+			// this is the first bin of all the genome, so we initialize old_chr
75
+			old_chr = chr;
76
+		}
77
+		if (chr.compare(old_chr)!=0) {
78
+			// This is the 1st bin of the new chromosome
79
+			old_chr = chr;
80
+			bins_end_coord.insert(make_pair(chr, vector<unsigned int>()));
81
+			bins_index.insert(make_pair(chr, vector<int >()));
82
+			bins_info.insert(make_pair(chr, vector<string >()));
83
+		}
84
+		vector<unsigned int> & coords = bins_end_coord[chr];
85
+		coords.push_back( end_coord );
86
+		vector<int> & indexes = bins_index[chr];
87
+		indexes.push_back( bin_index );
88
+		vector<string> & infos = bins_info[chr];
89
+		infos.push_back( line );
90
+		i++;
91
+	}
92
+	Rcpp::Rcerr << "#bins=" << i << endl;
93
+}
94
+
95
+int get_num_of_non_void_bins(Bins_index & bins_index, vector<int> & returned_markers_index)
96
+{
97
+	int num_of_non_void_bins = 0;
98
+	Bins_index::iterator it;
99
+	for (it=bins_index.begin(); it!=bins_index.end(); ++it) {
100
+		vector<int> & bins_of_chr = it->second;
101
+		for (int i=0; i<bins_of_chr.size(); i++)
102
+			if (bins_of_chr[i] > 0) {
103
+				num_of_non_void_bins++;
104
+				returned_markers_index.push_back( bins_of_chr[i] );
105
+			}
106
+	}
107
+	return num_of_non_void_bins;
108
+}
109
+
110
+// We find out the index of the bin with the range [bin_start_coord, bin_end_coord), where both "bin_start_coord" and "bin_end_coord" are 0-base.
111
+// Returned bin_internal_index is 0-base. If not found, return -1
112
+int find_exact_bin(Bins_end_coord & bins_end_coord, string chr, unsigned int bin_start_coord, unsigned int bin_end_coord) {
113
+	int bin_internal_index=-1;
114
+	vector<unsigned int> & coords_bins_of_chr = bins_end_coord[chr]; // a vector of end coordinates (1-base) of this chr.
115
+	vector<unsigned int>::iterator bin_it = find( coords_bins_of_chr.begin(), coords_bins_of_chr.end(), bin_end_coord);
116
+	if (bin_it!=coords_bins_of_chr.end()) {
117
+		// found
118
+		bin_internal_index = bin_it-coords_bins_of_chr.begin();
119
+	}
120
+	return bin_internal_index;
121
+}
122
+
123
+// We find out the index of the bin with the range [bin_start_coord, bin_end_coord), where the input paramter "position" (1-base) falls into this bin.
124
+// Returned bin_internal_index is 0-base. If not found, return -1
125
+int find_bin_of_position(Bins_end_coord & bins_end_coord, string chr, unsigned int position) {
126
+	int bin_internal_index=-1;
127
+	if ( bins_end_coord.find(chr) == bins_end_coord.end() ) {
128
+		// chr name is not found in binning system
129
+		bin_internal_index = -1;
130
+	} else {
131
+		vector<unsigned int> & coords_bins_of_chr = bins_end_coord[chr]; // a vector of end coordinates (1-base) of this chr.
132
+		vector<unsigned int>::iterator bin_it = lower_bound( coords_bins_of_chr.begin(), coords_bins_of_chr.end(), position);
133
+		if (position==*bin_it) bin_it++;
134
+		bin_internal_index = bin_it - coords_bins_of_chr.begin(); // bin_internal_index is 0-base
135
+		if (bin_internal_index==coords_bins_of_chr.size()) {
136
+			//cerr << "position(1-base): " << position << " doesn't exist in " << chr << endl;
137
+			bin_internal_index=-1;
138
+		}
139
+	}
140
+	return bin_internal_index;
141
+}
142
+
143
+// Each bin is in the range [ bins_end_coord[i-1], bins_end_coord[i] )
144
+// Given a query region, we want to know which bin has overlap with this query region. If there is overlap,
145
+// return (1) bin index, and (2) the overlap length
146
+// ongoing devevloping
147
+int find_overlap_bin(Bins_end_coord & bins_end_coord, string query_region_chr, unsigned int query_region_start_coord,
148
+	unsigned int query_region_end_coord, int & overlap_length)
149
+{
150
+	int bin_internal_index = -1;
151
+	overlap_length = -1;
152
+	if ( bins_end_coord.find(query_region_chr) != bins_end_coord.end() ) {
153
+		// chr name is found in binning system
154
+		bin_internal_index = find_bin_of_position(bins_end_coord, query_region_chr, query_region_start_coord);
155
+		if (bin_internal_index != -1) {
156
+			unsigned int bin_end_coord = bins_end_coord[query_region_chr][bin_internal_index];
157
+			if ( query_region_end_coord > bin_end_coord )
158
+				overlap_length = bin_end_coord - query_region_start_coord + 1;
159
+			else
160
+				overlap_length = query_region_end_coord - query_region_start_coord + 1;
161
+		}
162
+	}
163
+	return bin_internal_index;
164
+}
165
+
166
// Write the first `len` elements of v to os as "[a,b,c]".
// A `len` that is zero, negative, or larger than v.size() means "print every element";
// an empty vector prints "[]". The rule is spelled out explicitly rather than relying on
// an implicit signed-to-unsigned conversion when comparing len with v.size().
void print_uint_vec( ostream& os, vector<unsigned int>& v, int len )
{
	const size_t total = v.size();
	const size_t count = (len <= 0 || static_cast<size_t>(len) > total) ? total : static_cast<size_t>(len);
	os << "[";
	for (size_t k = 0; k < count; ++k) {
		if (k > 0)
			os << ",";
		os << v[k];
	}
	os << "]";
}
179
+
180
+void print_bins( ostream& os, Bins_end_coord & bins_end_coord, Bins_index & bins_index, Bins_info & bins_info) {
181
+	Bins_end_coord::iterator it;
182
+	for (it=bins_end_coord.begin(); it!=bins_end_coord.end(); ++it) {
183
+		string chr = it->first;
184
+		vector<unsigned int> coords=it->second;
185
+		vector<int> indexes=bins_index[chr];
186
+		vector<string> infos=bins_info[chr];
187
+		int n_bins = coords.size();
188
+		for (int i=0; i<n_bins; i++) {
189
+			os << indexes[i] << "\t" << chr << "\t" << coords[i] << "\t'" << infos[i] << "'"  << endl;
190
+		}
191
+	}
192
+}
193
+
194
+/*
195
+void print_bins_fullinfo( Bins_FullInfo & bins_fullinfo ) {
196
+	Bins_FullInfo::iterator: it;
197
+	for (it=bins.begin(); it!=bins.end(); ++it) {
198
+		cout << it.first << "\t";
199
+		print_int_vec(cout, it.second, it.second.size());
200
+		cout << endl;
201
+	}
202
+}
203
+*/
204
+
205
+// Bins2Values: a map of bin_index -> a vector of values. bin_index is always 1-base
206
+void create_Bins2Values(int num_bins, int num_of_values, double init_value, Bins2Values & bins2values)
207
+{
208
+	for (int bin_index=1; bin_index<=num_bins; bin_index++) {
209
+		vector<double> values;
210
+		for (int i=0; i<num_of_values; i++)
211
+			values.push_back( init_value );
212
+		bins2values[bin_index] = values;
213
+	}
214
+}
215
+
216
+// Bins2Values: a map of marker_index -> a vector of values.
217
+void create_Bins2Values(vector<int> markers_index, int num_of_values, double init_value, Bins2Values & bins2values)
218
+{
219
+	int num_bins = markers_index.size();
220
+	for (int ibin=0; ibin<num_bins; ibin++) {
221
+		vector<double> values;
222
+		for (int i=0; i<num_of_values; i++)
223
+			values.push_back( init_value );
224
+		bins2values[markers_index[ibin]] = values;
225
+	}
226
+}
227
+
228
+void print_Bins2Values(Bins2Values & bins2values)
229
+{
230
+	// cout.precision(15);
231
+	Bins2Values::iterator it;
232
+	for (it=bins2values.begin(); it!=bins2values.end(); ++it) {
233
+		vector<double> & values = it->second;
234
+	  Rcpp::Rcout << it->first;
235
+		for (int i=0; i<values.size(); i++)
236
+		  Rcpp::Rcout << "\t" << values[i];
237
+		Rcpp::Rcout << endl;
238
+	}
239
+}
240
+
241
+void print_Bins2UnsignedIntegers(Bins2UnsignedIntegers& bins2values)
242
+{
243
+	Bins2UnsignedIntegers::iterator it;
244
+	for (it=bins2values.begin(); it!=bins2values.end(); ++it) {
245
+		vector<unsigned int> & values = it->second;
246
+	  Rcpp::Rcout << it->first;
247
+		for (int i=0; i<(int)values.size(); i++)
248
+		  Rcpp::Rcout << "\t" << values[i];
249
+		Rcpp::Rcout << endl;
250
+	}
251
+}
252
+
253
+// when optional_write==TRUE, we assume there are two values associated with each bin
254
+void write_Bins2Values(Bins2Values & bins2values, vector<string> & columns_names,
255
+	string output_file, bool optional_write)
256
+{
257
+	ofstream out;
258
+	out.open(output_file.c_str());
259
+	if (out.fail()){
260
+	  Rcpp::Rcerr << "Error: Unable to write " << output_file << " in write_Bins2Value()" << endl;
261
+	  // exit(EXIT_FAILURE);
262
+	}
263
+	int i;
264
+	for (i=0; i<columns_names.size()-1; i++)
265
+		out << columns_names[i] << "\t";
266
+	out << columns_names[i] << endl;
267
+
268
+	out.precision(15);
269
+	Bins2Values::iterator it;
270
+	for (it=bins2values.begin(); it!=bins2values.end(); ++it) {
271
+		vector<double> & values = it->second;
272
+		out << it->first;
273
+		if (optional_write) {
274
+			// assume there are at least two values associated with each bin
275
+			// for example, when we have three associated values, they can be
276
+			// (1) methylation_count
277
+			// (2) unmethylation_count
278
+			// (3) number of reads
279
+			double n = values[0] + values[1];
280
+			double v;
281
+			if (n==0) v=0;
282
+			else v=values[0]/n;
283
+			out << "\t" << v << "\t" << n;
284
+		}
285
+		for (i=0; i<values.size(); i++)
286
+			out << "\t" << values[i];
287
+		out << endl;
288
+	}
289
+	out.close();
290
+}
291
+
292
+// Bins2Value: a map of bin_index -> a value. bin_index is always 1-base
293
+void create_Bins2Value(int num_bins, double init_value, Bins2Value & bins2value)
294
+{
295
+	for (int bin_index=1; bin_index<=num_bins; bin_index++)
296
+		bins2value[bin_index] = init_value;
297
+}
298
+
299
+ostream& operator<<(ostream& out, Bins2Value& bins2value) {
300
+	out << "bin_index" << "\t" << "value" << endl;
301
+	out.precision(15);
302
+	Bins2Value::iterator it;
303
+	for (it=bins2value.begin(); it!=bins2value.end(); ++it)
304
+		out << it->first << "\t" << it->second << endl;
305
+	return(out);
306
+}
307
+
308
+void write_Bins2Value(Bins2Value & bins2value, string output_file)
309
+{
310
+	ofstream out;
311
+	out.open(output_file.c_str());
312
+	if (out.fail()){
313
+	  Rcpp::Rcerr << "Error: Unable to write " << output_file << " in write_Bins2Value()" << endl;
314
+	  // exit(EXIT_FAILURE);
315
+	}
316
+	out << "bin_index" << "\t" << "value" << endl;
317
+	out.precision(15);
318
+	Bins2Value::iterator it;
319
+	for (it=bins2value.begin(); it!=bins2value.end(); ++it)
320
+		out << it->first << "\t" << it->second << endl;
321
+	out.close();
322
+}
323
+
Browse code

add files

RoseHuRan authored on 27/03/2022 20:48:07
Showing 1 changed files
1 1
new file mode 100644
... ...
@@ -0,0 +1,322 @@
1
+#include <Rcpp.h>
2
+using namespace Rcpp;
3
+
4
+#include <cstdlib>
5
+#include <sstream>
6
+#include <iostream>
7
+#include <fstream>
8
+#include <map>
9
+#include <vector>
10
+#include <string>
11
+#include <cmath>
12
+#include <cstdlib>
13
+#include <boost/algorithm/string.hpp>
14
+#include <boost/assign.hpp>
15
+#include "data_types.h"
16
+
17
+using namespace std;
18
+using namespace boost;
19
+
20
+//
21
+// bins (or features) annotation format
22
+//
23
+//1. file of bins (features) annotation: "biomarkers.all_bins"
24
+//Each line is a bin (or feature). All columns are delimited by TAB. There is one header line.
25
+//Column 1: marker_index, 1-based index. Those bins that are not markers, are indexed as 0.
26
+//Column 2: chr
27
+//Column 3: start coordinate of read (1-base)
28
+//Column 4: end coordinate of read (1-base). The range of the bin is [start, end)
29
+//Column 5: marker_type. "I" is Marker_type_I, "II" is Marker_type_II, "-" is the complementary bin, only facilitating the searching.
30
+//
31
+// The following is an example file
32
+//
33
+//marker_index    chr     start   end     marker_type
34
+//0       chr1    1       855266  -
35
+//1       chr1    855266  855766  II
36
+//0       chr1    855766  969796  -
37
+//2       chr1    969796  970296  II
38
+//0       chr1    970296  1099044 -
39
+//3       chr1    1099044 1099544 II
40
+//0       chr1    1099544 1109315 -
41
+//4       chr1    1109315 1109815 II
42
+//
43
+void read_bins_annot_file(string input_bins_annot_file, Bins_end_coord & bins_end_coord,
44
+	Bins_index & bins_index, Bins_info & bins_info, bool has_header_line=true)
45
+{
46
+	ifstream fin;
47
+	fin.open(input_bins_annot_file.c_str());
48
+	if (fin.fail()){
49
+	  Rcpp::Rcerr << "Error: Unable to open " << input_bins_annot_file << " in read_bins_annot_file()" << endl;
50
+	  // exit(EXIT_FAILURE);
51
+  }
52
+	string line;
53
+	if (has_header_line)
54
+		// skip the first header line
55
+		getline(fin, line);
56
+	unsigned long i=0;
57
+	string old_chr;
58
+	while (!fin.eof()) {
59
+		getline(fin, line);
60
+		if (line.empty()) {
61
+			// this is the last line of the file
62
+			break;
63
+		}
64
+		//cout << line << endl;
65
+
66
+		vector<string> strs1;
67
+		split(strs1, line, is_any_of("\t"));
68
+		int bin_index = atoi(strs1[0].c_str());
69
+		string chr = strs1[1];
70
+		//int start_coord = atoi(strs1[2].c_str()); // start coordinate
71
+		int end_coord = atoi(strs1[3].c_str()); // end coordinate
72
+		if (i==0) {
73
+			// this is the first bin of all the genome, so we initialize old_chr
74
+			old_chr = chr;
75
+		}
76
+		if (chr.compare(old_chr)!=0) {
77
+			// This is the 1st bin of the new chromosome
78
+			old_chr = chr;
79
+			bins_end_coord.insert(make_pair(chr, vector<unsigned int>()));
80
+			bins_index.insert(make_pair(chr, vector<int >()));
81
+			bins_info.insert(make_pair(chr, vector<string >()));
82
+		}
83
+		vector<unsigned int> & coords = bins_end_coord[chr];
84
+		coords.push_back( end_coord );
85
+		vector<int> & indexes = bins_index[chr];
86
+		indexes.push_back( bin_index );
87
+		vector<string> & infos = bins_info[chr];
88
+		infos.push_back( line );
89
+		i++;
90
+	}
91
+	Rcpp::Rcerr << "#bins=" << i << endl;
92
+}
93
+
94
+int get_num_of_non_void_bins(Bins_index & bins_index, vector<int> & returned_markers_index)
95
+{
96
+	int num_of_non_void_bins = 0;
97
+	Bins_index::iterator it;
98
+	for (it=bins_index.begin(); it!=bins_index.end(); ++it) {
99
+		vector<int> & bins_of_chr = it->second;
100
+		for (int i=0; i<bins_of_chr.size(); i++)
101
+			if (bins_of_chr[i] > 0) {
102
+				num_of_non_void_bins++;
103
+				returned_markers_index.push_back( bins_of_chr[i] );
104
+			}
105
+	}
106
+	return num_of_non_void_bins;
107
+}
108
+
109
+// We find out the index of the bin with the range [bin_start_coord, bin_end_coord), where both "bin_start_coord" and "bin_end_coord" are 0-base.
110
+// Returned bin_internal_index is 0-base. If not found, return -1
111
+int find_exact_bin(Bins_end_coord & bins_end_coord, string chr, unsigned int bin_start_coord, unsigned int bin_end_coord) {
112
+	int bin_internal_index=-1;
113
+	vector<unsigned int> & coords_bins_of_chr = bins_end_coord[chr]; // a vector of end coordinates (1-base) of this chr.
114
+	vector<unsigned int>::iterator bin_it = find( coords_bins_of_chr.begin(), coords_bins_of_chr.end(), bin_end_coord);
115
+	if (bin_it!=coords_bins_of_chr.end()) {
116
+		// found
117
+		bin_internal_index = bin_it-coords_bins_of_chr.begin();
118
+	}
119
+	return bin_internal_index;
120
+}
121
+
122
+// We find out the index of the bin with the range [bin_start_coord, bin_end_coord), where the input paramter "position" (1-base) falls into this bin.
123
+// Returned bin_internal_index is 0-base. If not found, return -1
124
+int find_bin_of_position(Bins_end_coord & bins_end_coord, string chr, unsigned int position) {
125
+	int bin_internal_index=-1;
126
+	if ( bins_end_coord.find(chr) == bins_end_coord.end() ) {
127
+		// chr name is not found in binning system
128
+		bin_internal_index = -1;
129
+	} else {
130
+		vector<unsigned int> & coords_bins_of_chr = bins_end_coord[chr]; // a vector of end coordinates (1-base) of this chr.
131
+		vector<unsigned int>::iterator bin_it = lower_bound( coords_bins_of_chr.begin(), coords_bins_of_chr.end(), position);
132
+		if (position==*bin_it) bin_it++;
133
+		bin_internal_index = bin_it - coords_bins_of_chr.begin(); // bin_internal_index is 0-base
134
+		if (bin_internal_index==coords_bins_of_chr.size()) {
135
+			//cerr << "position(1-base): " << position << " doesn't exist in " << chr << endl;
136
+			bin_internal_index=-1;
137
+		}
138
+	}
139
+	return bin_internal_index;
140
+}
141
+
142
+// Each bin is in the range [ bins_end_coord[i-1], bins_end_coord[i] )
143
+// Given a query region, we want to know which bin has overlap with this query region. If there is overlap,
144
+// return (1) bin index, and (2) the overlap length
145
+// ongoing devevloping
146
+int find_overlap_bin(Bins_end_coord & bins_end_coord, string query_region_chr, unsigned int query_region_start_coord,
147
+	unsigned int query_region_end_coord, int & overlap_length)
148
+{
149
+	int bin_internal_index = -1;
150
+	overlap_length = -1;
151
+	if ( bins_end_coord.find(query_region_chr) != bins_end_coord.end() ) {
152
+		// chr name is found in binning system
153
+		bin_internal_index = find_bin_of_position(bins_end_coord, query_region_chr, query_region_start_coord);
154
+		if (bin_internal_index != -1) {
155
+			unsigned int bin_end_coord = bins_end_coord[query_region_chr][bin_internal_index];
156
+			if ( query_region_end_coord > bin_end_coord )
157
+				overlap_length = bin_end_coord - query_region_start_coord + 1;
158
+			else
159
+				overlap_length = query_region_end_coord - query_region_start_coord + 1;
160
+		}
161
+	}
162
+	return bin_internal_index;
163
+}
164
+
165
// Prints the leading elements of v to os as a bracketed, comma-separated
// list, e.g. "[1,2,3]". At most len elements are printed; when len is zero,
// negative, or larger than v.size(), the whole vector is printed. An empty
// vector prints as "[]".
void print_uint_vec( std::ostream& os, std::vector<unsigned int>& v, int len )
{
	const std::size_t total = v.size();
	std::size_t count = total;
	if (len > 0 && static_cast<std::size_t>(len) < total)
		count = static_cast<std::size_t>(len);

	os << "[";
	for (std::size_t k = 0; k < count; ++k) {
		if (k != 0)
			os << ",";
		os << v[k];
	}
	os << "]";
}
178
+
179
+void print_bins( ostream& os, Bins_end_coord & bins_end_coord, Bins_index & bins_index, Bins_info & bins_info) {
180
+	Bins_end_coord::iterator it;
181
+	for (it=bins_end_coord.begin(); it!=bins_end_coord.end(); ++it) {
182
+		string chr = it->first;
183
+		vector<unsigned int> coords=it->second;
184
+		vector<int> indexes=bins_index[chr];
185
+		vector<string> infos=bins_info[chr];
186
+		int n_bins = coords.size();
187
+		for (int i=0; i<n_bins; i++) {
188
+			os << indexes[i] << "\t" << chr << "\t" << coords[i] << "\t'" << infos[i] << "'"  << endl;
189
+		}
190
+	}
191
+}
192
+
193
+/*
194
+void print_bins_fullinfo( Bins_FullInfo & bins_fullinfo ) {
195
+	Bins_FullInfo::iterator: it;
196
+	for (it=bins.begin(); it!=bins.end(); ++it) {
197
+		cout << it.first << "\t";
198
+		print_int_vec(cout, it.second, it.second.size());
199
+		cout << endl;
200
+	}
201
+}
202
+*/
203
+
204
+// Bins2Values: a map of bin_index -> a vector of values. bin_index is always 1-base
205
+void create_Bins2Values(int num_bins, int num_of_values, double init_value, Bins2Values & bins2values)
206
+{
207
+	for (int bin_index=1; bin_index<=num_bins; bin_index++) {
208
+		vector<double> values;
209
+		for (int i=0; i<num_of_values; i++)
210
+			values.push_back( init_value );
211
+		bins2values[bin_index] = values;
212
+	}
213
+}
214
+
215
+// Bins2Values: a map of marker_index -> a vector of values.
216
+void create_Bins2Values(vector<int> markers_index, int num_of_values, double init_value, Bins2Values & bins2values)
217
+{
218
+	int num_bins = markers_index.size();
219
+	for (int ibin=0; ibin<num_bins; ibin++) {
220
+		vector<double> values;
221
+		for (int i=0; i<num_of_values; i++)
222
+			values.push_back( init_value );
223
+		bins2values[markers_index[ibin]] = values;
224
+	}
225
+}
226
+
227
+void print_Bins2Values(Bins2Values & bins2values)
228
+{
229
+	// cout.precision(15);
230
+	Bins2Values::iterator it;
231
+	for (it=bins2values.begin(); it!=bins2values.end(); ++it) {
232
+		vector<double> & values = it->second;
233
+	  Rcpp::Rcout << it->first;
234
+		for (int i=0; i<values.size(); i++)
235
+		  Rcpp::Rcout << "\t" << values[i];
236
+		Rcpp::Rcout << endl;
237
+	}
238
+}
239
+
240
+void print_Bins2UnsignedIntegers(Bins2UnsignedIntegers& bins2values)
241
+{
242
+	Bins2UnsignedIntegers::iterator it;
243
+	for (it=bins2values.begin(); it!=bins2values.end(); ++it) {
244
+		vector<unsigned int> & values = it->second;
245
+	  Rcpp::Rcout << it->first;
246
+		for (int i=0; i<(int)values.size(); i++)
247
+		  Rcpp::Rcout << "\t" << values[i];
248
+		Rcpp::Rcout << endl;
249
+	}
250
+}
251
+
252
+// when optional_write==TRUE, we assume there are two values associated with each bin
253
+void write_Bins2Values(Bins2Values & bins2values, vector<string> & columns_names,
254
+	string output_file, bool optional_write)
255
+{
256
+	ofstream out;
257
+	out.open(output_file.c_str());
258
+	if (out.fail()){
259
+	  Rcpp::Rcerr << "Error: Unable to write " << output_file << " in write_Bins2Value()" << endl;
260
+	  // exit(EXIT_FAILURE);
261
+	}
262
+	int i;
263
+	for (i=0; i<columns_names.size()-1; i++)
264
+		out << columns_names[i] << "\t";
265
+	out << columns_names[i] << endl;
266
+
267
+	out.precision(15);
268
+	Bins2Values::iterator it;
269
+	for (it=bins2values.begin(); it!=bins2values.end(); ++it) {
270
+		vector<double> & values = it->second;
271
+		out << it->first;
272
+		if (optional_write) {
273
+			// assume there are at least two values associated with each bin
274
+			// for example, when we have three associated values, they can be
275
+			// (1) methylation_count
276
+			// (2) unmethylation_count
277
+			// (3) number of reads
278
+			double n = values[0] + values[1];
279
+			double v;
280
+			if (n==0) v=0;
281
+			else v=values[0]/n;
282
+			out << "\t" << v << "\t" << n;
283
+		}
284
+		for (i=0; i<values.size(); i++)
285
+			out << "\t" << values[i];
286
+		out << endl;
287
+	}
288
+	out.close();
289
+}
290
+
291
+// Bins2Value: a map of bin_index -> a value. bin_index is always 1-base
292
+void create_Bins2Value(int num_bins, double init_value, Bins2Value & bins2value)
293
+{
294
+	for (int bin_index=1; bin_index<=num_bins; bin_index++)
295
+		bins2value[bin_index] = init_value;
296
+}
297
+
298
+ostream& operator<<(ostream& out, Bins2Value& bins2value) {
299
+	out << "bin_index" << "\t" << "value" << endl;
300
+	out.precision(15);
301
+	Bins2Value::iterator it;
302
+	for (it=bins2value.begin(); it!=bins2value.end(); ++it)
303
+		out << it->first << "\t" << it->second << endl;
304
+	return(out);
305
+}
306
+
307
+void write_Bins2Value(Bins2Value & bins2value, string output_file)
308
+{
309
+	ofstream out;
310
+	out.open(output_file.c_str());
311
+	if (out.fail()){
312
+	  Rcpp::Rcerr << "Error: Unable to write " << output_file << " in write_Bins2Value()" << endl;
313
+	  // exit(EXIT_FAILURE);
314
+	}
315
+	out << "bin_index" << "\t" << "value" << endl;
316
+	out.precision(15);
317
+	Bins2Value::iterator it;
318
+	for (it=bins2value.begin(); it!=bins2value.end(); ++it)
319
+		out << it->first << "\t" << it->second << endl;
320
+	out.close();
321
+}
322
+