// orx_concurrent_vec/grow.rs
1use crate::{ConcurrentVec, elem::ConcurrentElement};
2use core::sync::atomic::Ordering;
3use orx_pinned_vec::IntoConcurrentPinnedVec;
4
impl<T, P> ConcurrentVec<T, P>
where
    P: IntoConcurrentPinnedVec<ConcurrentElement<T>>,
{
    /// Concurrent, thread-safe method to push the given `value` to the back of the bag, and returns the position or index of the pushed value.
    ///
    /// It preserves the order of elements with respect to the order the `push` method is called.
    ///
    /// # Panics
    ///
    /// Panics if the concurrent bag is already at its maximum capacity; i.e., if `self.len() == self.maximum_capacity()`.
    ///
    /// Note that this is an important safety assertion in the concurrent context; however, not a practical limitation.
    /// Please see the [`orx_pinned_concurrent_col::PinnedConcurrentCol::maximum_capacity`] for details.
    ///
    /// # Examples
    ///
    /// We can directly take a shared reference of the bag, share it among threads and collect results concurrently.
    ///
    /// ```rust
    /// use orx_concurrent_vec::*;
    ///
    /// let (num_threads, num_items_per_thread) = (4, 1_024);
    ///
    /// let vec = ConcurrentVec::new();
    ///
    /// std::thread::scope(|s| {
    ///     let vec = &vec;
    ///     for i in 0..num_threads {
    ///         s.spawn(move || {
    ///             for j in 0..num_items_per_thread {
    ///                 // concurrently collect results simply by calling `push`
    ///                 vec.push(i * 1000 + j);
    ///             }
    ///         });
    ///     }
    /// });
    ///
    /// let mut vec = vec.to_vec();
    /// vec.sort();
    /// let mut expected: Vec<_> = (0..num_threads).flat_map(|i| (0..num_items_per_thread).map(move |j| i * 1000 + j)).collect();
    /// expected.sort();
    /// assert_eq!(vec, expected);
    /// ```
    ///
    /// # Performance Notes - False Sharing
    ///
    /// [`ConcurrentVec::push`] implementation is lock-free and focuses on efficiency.
    /// However, we need to be aware of the potential [false sharing](https://en.wikipedia.org/wiki/False_sharing) risk.
    /// False sharing might lead to significant performance degradation.
    /// However, it is possible to avoid in many cases.
    ///
    /// ## When?
    ///
    /// Performance degradation due to false sharing might be observed when both of the following conditions hold:
    /// * **small data**: data to be pushed is small, the more elements fitting in a cache line the bigger the risk,
    /// * **little work**: multiple threads/cores are pushing to the concurrent bag with high frequency; i.e.,
    ///   * very little or negligible work / time is required in between `push` calls.
    ///
    /// The example above fits this situation.
    /// Each thread only performs one multiplication and addition in between pushing elements, and the elements to be pushed are very small, just one `usize`.
    ///
    /// ## Why?
    ///
    /// * `ConcurrentVec` assigns unique positions to each value to be pushed. There is no *true* sharing among threads in the position level.
    /// * However, cache lines contain more than one position.
    /// * One thread updating a particular position invalidates the entire cache line on another thread.
    /// * Threads end up frequently reloading cache lines instead of doing the actual work of writing elements to the bag.
    /// * This might lead to a significant performance degradation.
    ///
    /// ### Solution: `extend` rather than `push`
    ///
    /// One very simple, effective and memory efficient solution to this problem is to use [`ConcurrentVec::extend`] rather than `push` in *small data & little work* situations.
    ///
    /// Assume that we will have 4 threads and each will push 1_024 elements.
    /// Instead of making 1_024 `push` calls from each thread, we can make one `extend` call from each.
    /// This would give the best performance.
    /// Further, it has zero buffer or memory cost:
    /// * it is important to note that the batch of 1_024 elements are not stored temporarily in another buffer,
    /// * there is no additional allocation,
    /// * `extend` does nothing more than reserving the position range for the thread by incrementing the atomic counter accordingly.
    ///
    /// However, we do not need to have such perfect information about the number of elements to be pushed.
    /// Performance gains after reaching the cache line size are much smaller.
    ///
    /// For instance, consider the challenging super small element size case, where we are collecting `i32`s.
    /// We can already achieve a very high performance by simply `extend`ing the bag by batches of 16 elements.
    ///
    /// As the element size gets larger, required batch size to achieve a high performance gets smaller and smaller.
    ///
    /// Required change in the code from `push` to `extend` is not significant.
    /// The example above could be revised as follows to avoid the performance degradation of false sharing.
    ///
    /// ```rust
    /// use orx_concurrent_vec::*;
    ///
    /// let (num_threads, num_items_per_thread) = (4, 1_024);
    ///
    /// let vec = ConcurrentVec::new();
    /// let batch_size = 16;
    ///
    /// std::thread::scope(|s| {
    ///     let vec = &vec;
    ///     for i in 0..num_threads {
    ///         s.spawn(move || {
    ///             for j in (0..num_items_per_thread).step_by(batch_size) {
    ///                 let iter = (j..(j + batch_size)).map(|j| i * 1000 + j);
    ///                 // concurrently collect results simply by calling `extend`
    ///                 vec.extend(iter);
    ///             }
    ///         });
    ///     }
    /// });
    ///
    /// let mut vec = vec.to_vec();
    /// vec.sort();
    /// let mut expected: Vec<_> = (0..num_threads).flat_map(|i| (0..num_items_per_thread).map(move |j| i * 1000 + j)).collect();
    /// expected.sort();
    /// assert_eq!(vec, expected);
    /// ```
    pub fn push(&self, value: T) -> usize {
        // atomically reserve the next position; this index is unique to this call
        let idx = self.len_reserved().fetch_add(1, Ordering::Relaxed);

        // # SAFETY: ConcurrentVec ensures that each `idx` will be written only and exactly once.
        let maybe = unsafe { self.core.single_item_as_ref(idx) };
        unsafe { maybe.0.initialize_unchecked(value) };

        idx
    }

    /// Pushes the value which will be computed as a function of the index where it will be written.
    ///
    /// Note that we cannot guarantee the index of the element by `push`ing since there might be many
    /// pushes happening concurrently. In cases where we absolutely need to know the index, in other
    /// words, when the value depends on the index, we can use `push_for_idx`.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use orx_concurrent_vec::*;
    ///
    /// let vec = ConcurrentVec::new();
    /// vec.push(0);
    /// vec.push_for_idx(|i| i * 2);
    /// vec.push_for_idx(|i| i + 10);
    /// vec.push(42);
    ///
    /// assert_eq!(&vec, &[0, 2, 12, 42]);
    /// ```
    pub fn push_for_idx<F>(&self, f: F) -> usize
    where
        F: FnOnce(usize) -> T,
    {
        // reserve the position first, so that the value can be computed from the known index
        let idx = self.len_reserved().fetch_add(1, Ordering::Relaxed);
        let value = f(idx);

        // # SAFETY: ConcurrentVec ensures that each `idx` will be written only and exactly once.
        let maybe = unsafe { self.core.single_item_as_ref(idx) };
        unsafe { maybe.0.initialize_unchecked(value) };

        idx
    }

    /// Concurrent, thread-safe method to push all `values` that the given iterator will yield to the back of the bag.
    /// The method returns the position or index of the first pushed value (returns the length of the concurrent bag if the iterator is empty).
    ///
    /// All `values` in the iterator will be added to the bag consecutively:
    /// * the first yielded value will be written to the position which is equal to the current length of the bag, say `begin_idx`, which is the returned value,
    /// * the second yielded value will be written to the `begin_idx + 1`-th position,
    /// * ...
    /// * and the last value will be written to the `begin_idx + values.count() - 1`-th position of the bag.
    ///
    /// Important notes:
    /// * This method does not allocate to buffer.
    /// * All it does is to increment the atomic counter by the length of the iterator (`push` would increment by 1) and reserve the range of positions for this operation.
    /// * If there is not sufficient space, the vector grows first; iterating over and writing elements to the vec happens afterwards.
    /// * Therefore, other threads do not wait for the `extend` method to complete, they can concurrently write.
    /// * This is a simple and effective approach to deal with the false sharing problem.
    ///
    /// For this reason, the method requires an `ExactSizeIterator`.
    /// There exists the variant [`ConcurrentVec::extend_n_items`] method which accepts any iterator together with the correct length to be passed by the caller.
    /// That variant panics if the iterator yields fewer than the number of elements explicitly passed in as an argument.
    ///
    /// # Panics
    ///
    /// Panics if not all of the `values` fit in the concurrent bag's maximum capacity.
    ///
    /// Note that this is an important safety assertion in the concurrent context; however, not a practical limitation.
    /// Please see the [`orx_pinned_concurrent_col::PinnedConcurrentCol::maximum_capacity`] for details.
    ///
    /// # Examples
    ///
    /// We can directly take a shared reference of the bag and share it among threads.
    ///
    /// ```rust
    /// use orx_concurrent_vec::*;
    ///
    /// let (num_threads, num_items_per_thread) = (4, 1_024);
    ///
    /// let vec = ConcurrentVec::new();
    /// let batch_size = 16;
    ///
    /// std::thread::scope(|s| {
    ///     let vec = &vec;
    ///     for i in 0..num_threads {
    ///         s.spawn(move || {
    ///             for j in (0..num_items_per_thread).step_by(batch_size) {
    ///                 let iter = (j..(j + batch_size)).map(|j| i * 1000 + j);
    ///                 // concurrently collect results simply by calling `extend`
    ///                 vec.extend(iter);
    ///             }
    ///         });
    ///     }
    /// });
    ///
    /// let mut vec: Vec<_> = vec.to_vec();
    /// vec.sort();
    /// let mut expected: Vec<_> = (0..num_threads).flat_map(|i| (0..num_items_per_thread).map(move |j| i * 1000 + j)).collect();
    /// expected.sort();
    /// assert_eq!(vec, expected);
    /// ```
    ///
    /// # Performance Notes - False Sharing
    ///
    /// [`ConcurrentVec::push`] method's implementation is simple, lock-free and efficient.
    /// However, we need to be aware of the potential [false sharing](https://en.wikipedia.org/wiki/False_sharing) risk.
    /// False sharing might lead to significant performance degradation; fortunately, it is possible to avoid in many cases.
    ///
    /// ## When?
    ///
    /// Performance degradation due to false sharing might be observed when both of the following conditions hold:
    /// * **small data**: data to be pushed is small, the more elements fitting in a cache line the bigger the risk,
    /// * **little work**: multiple threads/cores are pushing to the concurrent bag with high frequency; i.e.,
    ///   * very little or negligible work / time is required in between `push` calls.
    ///
    /// The example above fits this situation.
    /// Each thread only performs one multiplication and addition for computing elements, and the elements to be pushed are very small, just one `usize`.
    ///
    /// ## Why?
    ///
    /// * `ConcurrentVec` assigns unique positions to each value to be pushed. There is no *true* sharing among threads in the position level.
    /// * However, cache lines contain more than one position.
    /// * One thread updating a particular position invalidates the entire cache line on another thread.
    /// * Threads end up frequently reloading cache lines instead of doing the actual work of writing elements to the bag.
    /// * This might lead to a significant performance degradation.
    ///
    /// ### Solution: `extend` rather than `push`
    ///
    /// One very simple, effective and memory efficient solution to the false sharing problem is to use [`ConcurrentVec::extend`] rather than `push` in *small data & little work* situations.
    ///
    /// Assume that we will have 4 threads and each will push 1_024 elements.
    /// Instead of making 1_024 `push` calls from each thread, we can make one `extend` call from each.
    /// This would give the best performance.
    /// Further, it has zero buffer or memory cost:
    /// * it is important to note that the batch of 1_024 elements are not stored temporarily in another buffer,
    /// * there is no additional allocation,
    /// * `extend` does nothing more than reserving the position range for the thread by incrementing the atomic counter accordingly.
    ///
    /// However, we do not need to have such perfect information about the number of elements to be pushed.
    /// Performance gains after reaching the cache line size are much smaller.
    ///
    /// For instance, consider the challenging super small element size case, where we are collecting `i32`s.
    /// We can already achieve a very high performance by simply `extend`ing the bag by batches of 16 elements.
    ///
    /// As the element size gets larger, required batch size to achieve a high performance gets smaller and smaller.
    ///
    /// The example code above already demonstrates the solution to a potentially problematic case in the [`ConcurrentVec::push`] example.
    pub fn extend<IntoIter, Iter>(&self, values: IntoIter) -> usize
    where
        IntoIter: IntoIterator<Item = T, IntoIter = Iter>,
        Iter: Iterator<Item = T> + ExactSizeIterator,
    {
        let values = values.into_iter();
        // the exact count is known up front thanks to the ExactSizeIterator bound;
        // delegate to the length-explicit variant
        let num_items = values.len();
        self.extend_n_items::<_>(values, num_items)
    }

    /// Extends the vector with the values of the iterator which is created as a function of the
    /// index that the first element of the iterator will be written to.
    ///
    /// Note that we cannot guarantee the index of the element by `extend`ing since there might be many
    /// pushes or extends happening concurrently. In cases where we absolutely need to know the index, in other
    /// words, when the values depend on the indices, we can use `extend_for_idx`.
    ///
    /// # Panics
    ///
    /// Panics if the iterator created by `f` does not yield `num_items` elements.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use orx_concurrent_vec::*;
    ///
    /// let vec = ConcurrentVec::new();
    ///
    /// vec.push(0);
    ///
    /// let iter = |begin_idx: usize| ((begin_idx..(begin_idx + 3)).map(|i| i * 5));
    /// vec.extend_for_idx(|begin_idx| iter(begin_idx), 3);
    /// vec.push(42);
    ///
    /// assert_eq!(&vec, &[0, 5, 10, 15, 42]);
    /// ```
    pub fn extend_for_idx<IntoIter, Iter, F>(&self, f: F, num_items: usize) -> usize
    where
        IntoIter: IntoIterator<Item = T, IntoIter = Iter>,
        Iter: Iterator<Item = T> + ExactSizeIterator,
        F: FnOnce(usize) -> IntoIter,
    {
        // atomically reserve the range `begin_idx..begin_idx + num_items` for this call
        let begin_idx = self.len_reserved().fetch_add(num_items, Ordering::Relaxed);
        // # SAFETY: ConcurrentVec ensures that each position in the reserved range will be written only and exactly once.
        let slices = unsafe { self.core.n_items_buffer_as_slices(begin_idx, num_items) };
        // the iterator is created only after the first index is known
        let mut values = f(begin_idx).into_iter();

        assert_eq!(values.len(), num_items);

        for slice in slices {
            for maybe in slice {
                let value = values
                    .next()
                    .expect("provided iterator is shorter than expected num_items");
                // # SAFETY: this position belongs to the range reserved above; it is written exactly once.
                unsafe { maybe.0.initialize_unchecked(value) };
            }
        }

        begin_idx
    }

    /// Concurrent, thread-safe method to push `num_items` elements yielded by the `values` iterator to the back of the bag.
    /// The method returns the position or index of the first pushed value (returns the length of the concurrent bag if the iterator is empty).
    ///
    /// All `values` in the iterator will be added to the bag consecutively:
    /// * the first yielded value will be written to the position which is equal to the current length of the bag, say `begin_idx`, which is the returned value,
    /// * the second yielded value will be written to the `begin_idx + 1`-th position,
    /// * ...
    /// * and the last value will be written to the `begin_idx + num_items - 1`-th position of the bag.
    ///
    /// Important notes:
    /// * This method does not allocate at all to buffer elements to be pushed.
    /// * All it does is to increment the atomic counter by the length of the iterator (`push` would increment by 1) and reserve the range of positions for this operation.
    /// * Iterating over and writing elements to the vec happens afterwards.
    /// * This is a simple, effective and memory efficient solution to the false sharing problem.
    ///
    /// For this reason, the method requires the additional `num_items` argument.
    /// There exists the variant [`ConcurrentVec::extend`] method which accepts only an `ExactSizeIterator`.
    ///
    /// # Panics
    ///
    /// Panics if the `values` iterator yields fewer than `num_items` elements.
    ///
    /// # Examples
    ///
    /// We can directly take a shared reference of the bag and share it among threads.
    ///
    /// ```rust
    /// use orx_concurrent_vec::*;
    ///
    /// let (num_threads, num_items_per_thread) = (4, 1_024);
    ///
    /// let vec = ConcurrentVec::new();
    /// let batch_size = 16;
    ///
    /// std::thread::scope(|s| {
    ///     let vec = &vec;
    ///     for i in 0..num_threads {
    ///         s.spawn(move || {
    ///             for j in (0..num_items_per_thread).step_by(batch_size) {
    ///                 let iter = (j..(j + batch_size)).map(|j| i * 1000 + j);
    ///                 // concurrently collect results simply by calling `extend_n_items`
    ///                 vec.extend_n_items(iter, batch_size);
    ///             }
    ///         });
    ///     }
    /// });
    ///
    /// let mut vec: Vec<_> = vec.to_vec();
    /// vec.sort();
    /// let mut expected: Vec<_> = (0..num_threads).flat_map(|i| (0..num_items_per_thread).map(move |j| i * 1000 + j)).collect();
    /// expected.sort();
    /// assert_eq!(vec, expected);
    /// ```
    ///
    /// # Performance Notes - False Sharing
    ///
    /// [`ConcurrentVec::push`] method's implementation is simple, lock-free and efficient.
    /// However, we need to be aware of the potential [false sharing](https://en.wikipedia.org/wiki/False_sharing) risk.
    /// False sharing might lead to significant performance degradation; fortunately, it is possible to avoid in many cases.
    ///
    /// ## When?
    ///
    /// Performance degradation due to false sharing might be observed when both of the following conditions hold:
    /// * **small data**: data to be pushed is small, the more elements fitting in a cache line the bigger the risk,
    /// * **little work**: multiple threads/cores are pushing to the concurrent bag with high frequency; i.e.,
    ///   * very little or negligible work / time is required in between `push` calls.
    ///
    /// The example above fits this situation.
    /// Each thread only performs one multiplication and addition for computing elements, and the elements to be pushed are very small, just one `usize`.
    ///
    /// ## Why?
    ///
    /// * `ConcurrentVec` assigns unique positions to each value to be pushed. There is no *true* sharing among threads in the position level.
    /// * However, cache lines contain more than one position.
    /// * One thread updating a particular position invalidates the entire cache line on another thread.
    /// * Threads end up frequently reloading cache lines instead of doing the actual work of writing elements to the bag.
    /// * This might lead to a significant performance degradation.
    ///
    /// ### Solution: `extend` rather than `push`
    ///
    /// One very simple, effective and memory efficient solution to the false sharing problem is to use [`ConcurrentVec::extend`] rather than `push` in *small data & little work* situations.
    ///
    /// Assume that we will have 4 threads and each will push 1_024 elements.
    /// Instead of making 1_024 `push` calls from each thread, we can make one `extend` call from each.
    /// This would give the best performance.
    /// Further, it has zero buffer or memory cost:
    /// * it is important to note that the batch of 1_024 elements are not stored temporarily in another buffer,
    /// * there is no additional allocation,
    /// * `extend` does nothing more than reserving the position range for the thread by incrementing the atomic counter accordingly.
    ///
    /// However, we do not need to have such perfect information about the number of elements to be pushed.
    /// Performance gains after reaching the cache line size are much smaller.
    ///
    /// For instance, consider the challenging super small element size case, where we are collecting `i32`s.
    /// We can already achieve a very high performance by simply `extend`ing the bag by batches of 16 elements.
    ///
    /// As the element size gets larger, required batch size to achieve a high performance gets smaller and smaller.
    ///
    /// The example code above already demonstrates the solution to a potentially problematic case in the [`ConcurrentVec::push`] example.
    pub fn extend_n_items<IntoIter>(&self, values: IntoIter, num_items: usize) -> usize
    where
        IntoIter: IntoIterator<Item = T>,
    {
        // atomically reserve the range `begin_idx..begin_idx + num_items` for this call
        let begin_idx = self.len_reserved().fetch_add(num_items, Ordering::Relaxed);
        // # SAFETY: ConcurrentVec ensures that each position in the reserved range will be written only and exactly once.
        let slices = unsafe { self.core.n_items_buffer_as_slices(begin_idx, num_items) };
        let mut values = values.into_iter();

        for slice in slices {
            for maybe in slice {
                let value = values
                    .next()
                    .expect("provided iterator is shorter than expected num_items");
                // # SAFETY: this position belongs to the range reserved above; it is written exactly once.
                unsafe { maybe.0.initialize_unchecked(value) };
            }
        }

        begin_idx
    }
}