// orx_concurrent_vec/grow.rs
1use crate::{ConcurrentVec, elem::ConcurrentElement};
2use core::sync::atomic::Ordering;
3use orx_pinned_vec::IntoConcurrentPinnedVec;
4
impl<T, P> ConcurrentVec<T, P>
where
    P: IntoConcurrentPinnedVec<ConcurrentElement<T>>,
{
    /// Concurrent, thread-safe method to push the given `value` to the back of the bag, and returns the position or index of the pushed value.
    ///
    /// It preserves the order of elements with respect to the order the `push` method is called.
    ///
    /// # Panics
    ///
    /// Panics if the concurrent bag is already at its maximum capacity; i.e., if `self.len() == self.maximum_capacity()`.
    ///
    /// Note that this is an important safety assertion in the concurrent context; however, not a practical limitation.
    /// Please see the [`orx_pinned_concurrent_col::PinnedConcurrentCol::maximum_capacity`] for details.
    ///
    /// # Examples
    ///
    /// We can directly take a shared reference of the bag, share it among threads and collect results concurrently.
    ///
    /// ```rust
    /// use orx_concurrent_vec::*;
    ///
    /// let (num_threads, num_items_per_thread) = (4, 1_024);
    ///
    /// let vec = ConcurrentVec::new();
    ///
    /// std::thread::scope(|s| {
    ///     let vec = &vec;
    ///     for i in 0..num_threads {
    ///         s.spawn(move || {
    ///             for j in 0..num_items_per_thread {
    ///                 // concurrently collect results simply by calling `push`
    ///                 vec.push(i * 1000 + j);
    ///             }
    ///         });
    ///     }
    /// });
    ///
    /// let mut vec = vec.to_vec();
    /// vec.sort();
    /// let mut expected: Vec<_> = (0..num_threads).flat_map(|i| (0..num_items_per_thread).map(move |j| i * 1000 + j)).collect();
    /// expected.sort();
    /// assert_eq!(vec, expected);
    /// ```
    ///
    /// # Performance Notes - False Sharing
    ///
    /// [`ConcurrentVec::push`] implementation is lock-free and focuses on efficiency.
    /// However, we need to be aware of the potential [false sharing](https://en.wikipedia.org/wiki/False_sharing) risk.
    /// False sharing might lead to significant performance degradation.
    /// However, it is possible to avoid in many cases.
    ///
    /// ## When?
    ///
    /// Performance degradation due to false sharing might be observed when both of the following conditions hold:
    /// * **small data**: data to be pushed is small, the more elements fitting in a cache line the bigger the risk,
    /// * **little work**: multiple threads/cores are pushing to the concurrent bag with high frequency; i.e.,
    ///   * very little or negligible work / time is required in between `push` calls.
    ///
    /// The example above fits this situation.
    /// Each thread only performs one multiplication and addition in between pushing elements, and the elements to be pushed are very small, just one `usize`.
    ///
    /// ## Why?
    ///
    /// * `ConcurrentVec` assigns unique positions to each value to be pushed. There is no *true* sharing among threads in the position level.
    /// * However, cache lines contain more than one position.
    /// * One thread updating a particular position invalidates the entire cache line on another thread.
    /// * Threads end up frequently reloading cache lines instead of doing the actual work of writing elements to the bag.
    /// * This might lead to a significant performance degradation.
    ///
    /// ### Solution: `extend` rather than `push`
    ///
    /// One very simple, effective and memory efficient solution to this problem is to use [`ConcurrentVec::extend`] rather than `push` in *small data & little work* situations.
    ///
    /// Assume that we will have 4 threads and each will push 1_024 elements.
    /// Instead of making 1_024 `push` calls from each thread, we can make one `extend` call from each.
    /// This would give the best performance.
    /// Further, it has zero buffer or memory cost:
    /// * it is important to note that the batch of 1_024 elements are not stored temporarily in another buffer,
    /// * there is no additional allocation,
    /// * `extend` does nothing more than reserving the position range for the thread by incrementing the atomic counter accordingly.
    ///
    /// However, we do not need to have such perfect information about the number of elements to be pushed.
    /// Performance gains after reaching the cache line size are much smaller.
    ///
    /// For instance, consider the challenging super small element size case, where we are collecting `i32`s.
    /// We can already achieve a very high performance by simply `extend`ing the bag by batches of 16 elements.
    ///
    /// As the element size gets larger, required batch size to achieve a high performance gets smaller and smaller.
    ///
    /// Required change in the code from `push` to `extend` is not significant.
    /// The example above could be revised as follows to avoid the performance degradation of false sharing.
    ///
    /// ```rust
    /// use orx_concurrent_vec::*;
    ///
    /// let (num_threads, num_items_per_thread) = (4, 1_024);
    ///
    /// let vec = ConcurrentVec::new();
    /// let batch_size = 16;
    ///
    /// std::thread::scope(|s| {
    ///     let vec = &vec;
    ///     for i in 0..num_threads {
    ///         s.spawn(move || {
    ///             for j in (0..num_items_per_thread).step_by(batch_size) {
    ///                 let iter = (j..(j + batch_size)).map(|j| i * 1000 + j);
    ///                 // concurrently collect results simply by calling `extend`
    ///                 vec.extend(iter);
    ///             }
    ///         });
    ///     }
    /// });
    ///
    /// let mut vec = vec.to_vec();
    /// vec.sort();
    /// let mut expected: Vec<_> = (0..num_threads).flat_map(|i| (0..num_items_per_thread).map(move |j| i * 1000 + j)).collect();
    /// expected.sort();
    /// assert_eq!(vec, expected);
    /// ```
    pub fn push(&self, value: T) -> usize {
        // atomically reserve the next position; this index is unique to this call
        let idx = self.len_reserved().fetch_add(1, Ordering::Relaxed);

        // # SAFETY: ConcurrentVec ensures that each `idx` will be written only and exactly once.
        let maybe = unsafe { self.core.single_item_as_ref(idx) };
        unsafe { maybe.0.initialize_unchecked(value) };

        idx
    }

    /// Pushes the value which will be computed as a function of the index where it will be written.
    ///
    /// Note that we cannot guarantee the index of the element by `push`ing since there might be many
    /// pushes happening concurrently. In cases where we absolutely need to know the index, in other
    /// words, when the value depends on the index, we can use `push_for_idx`.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use orx_concurrent_vec::*;
    ///
    /// let vec = ConcurrentVec::new();
    /// vec.push(0);
    /// vec.push_for_idx(|i| i * 2);
    /// vec.push_for_idx(|i| i + 10);
    /// vec.push(42);
    ///
    /// assert_eq!(&vec, &[0, 2, 12, 42]);
    /// ```
    pub fn push_for_idx<F>(&self, f: F) -> usize
    where
        F: FnOnce(usize) -> T,
    {
        // reserve the position first, so that the value can be computed from the known index
        let idx = self.len_reserved().fetch_add(1, Ordering::Relaxed);
        let value = f(idx);

        // # SAFETY: ConcurrentVec ensures that each `idx` will be written only and exactly once.
        let maybe = unsafe { self.core.single_item_as_ref(idx) };
        unsafe { maybe.0.initialize_unchecked(value) };

        idx
    }

    /// Concurrent, thread-safe method to push all `values` that the given iterator will yield to the back of the bag.
    /// The method returns the position or index of the first pushed value (returns the length of the concurrent bag if the iterator is empty).
    ///
    /// All `values` in the iterator will be added to the bag consecutively:
    /// * the first yielded value will be written to the position which is equal to the current length of the bag, say `begin_idx`, which is the returned value,
    /// * the second yielded value will be written to the `begin_idx + 1`-th position,
    /// * ...
    /// * and the last value will be written to the `begin_idx + values.count() - 1`-th position of the bag.
    ///
    /// Important notes:
    /// * This method does not allocate to buffer.
    /// * All it does is to increment the atomic counter by the length of the iterator (`push` would increment by 1) and reserve the range of positions for this operation.
    /// * If there is not sufficient space, the vector grows first; iterating over and writing elements to the vec happens afterwards.
    /// * Therefore, other threads do not wait for the `extend` method to complete, they can concurrently write.
    /// * This is a simple and effective approach to deal with the false sharing problem.
    ///
    /// For this reason, the method requires an `ExactSizeIterator`.
    /// There exists the variant [`ConcurrentVec::extend_n_items`] method which accepts any iterator together with the correct length to be passed by the caller.
    /// That variant panics if the iterator yields fewer than the number of elements explicitly passed in as an argument.
    ///
    /// # Panics
    ///
    /// Panics if not all of the `values` fit in the concurrent bag's maximum capacity.
    ///
    /// Note that this is an important safety assertion in the concurrent context; however, not a practical limitation.
    /// Please see the [`orx_pinned_concurrent_col::PinnedConcurrentCol::maximum_capacity`] for details.
    ///
    /// # Examples
    ///
    /// We can directly take a shared reference of the bag and share it among threads.
    ///
    /// ```rust
    /// use orx_concurrent_vec::*;
    ///
    /// let (num_threads, num_items_per_thread) = (4, 1_024);
    ///
    /// let vec = ConcurrentVec::new();
    /// let batch_size = 16;
    ///
    /// std::thread::scope(|s| {
    ///     let vec = &vec;
    ///     for i in 0..num_threads {
    ///         s.spawn(move || {
    ///             for j in (0..num_items_per_thread).step_by(batch_size) {
    ///                 let iter = (j..(j + batch_size)).map(|j| i * 1000 + j);
    ///                 // concurrently collect results simply by calling `extend`
    ///                 vec.extend(iter);
    ///             }
    ///         });
    ///     }
    /// });
    ///
    /// let mut vec: Vec<_> = vec.to_vec();
    /// vec.sort();
    /// let mut expected: Vec<_> = (0..num_threads).flat_map(|i| (0..num_items_per_thread).map(move |j| i * 1000 + j)).collect();
    /// expected.sort();
    /// assert_eq!(vec, expected);
    /// ```
    ///
    /// # Performance Notes - False Sharing
    ///
    /// [`ConcurrentVec::push`] method's implementation is simple, lock-free and efficient.
    /// However, we need to be aware of the potential [false sharing](https://en.wikipedia.org/wiki/False_sharing) risk.
    /// False sharing might lead to significant performance degradation; fortunately, it is possible to avoid in many cases.
    ///
    /// ## When?
    ///
    /// Performance degradation due to false sharing might be observed when both of the following conditions hold:
    /// * **small data**: data to be pushed is small, the more elements fitting in a cache line the bigger the risk,
    /// * **little work**: multiple threads/cores are pushing to the concurrent bag with high frequency; i.e.,
    ///   * very little or negligible work / time is required in between `push` calls.
    ///
    /// The example above fits this situation.
    /// Each thread only performs one multiplication and addition for computing elements, and the elements to be pushed are very small, just one `usize`.
    ///
    /// ## Why?
    ///
    /// * `ConcurrentVec` assigns unique positions to each value to be pushed. There is no *true* sharing among threads in the position level.
    /// * However, cache lines contain more than one position.
    /// * One thread updating a particular position invalidates the entire cache line on another thread.
    /// * Threads end up frequently reloading cache lines instead of doing the actual work of writing elements to the bag.
    /// * This might lead to a significant performance degradation.
    ///
    /// ### Solution: `extend` rather than `push`
    ///
    /// One very simple, effective and memory efficient solution to the false sharing problem is to use [`ConcurrentVec::extend`] rather than `push` in *small data & little work* situations.
    ///
    /// Assume that we will have 4 threads and each will push 1_024 elements.
    /// Instead of making 1_024 `push` calls from each thread, we can make one `extend` call from each.
    /// This would give the best performance.
    /// Further, it has zero buffer or memory cost:
    /// * it is important to note that the batch of 1_024 elements are not stored temporarily in another buffer,
    /// * there is no additional allocation,
    /// * `extend` does nothing more than reserving the position range for the thread by incrementing the atomic counter accordingly.
    ///
    /// However, we do not need to have such perfect information about the number of elements to be pushed.
    /// Performance gains after reaching the cache line size are much smaller.
    ///
    /// For instance, consider the challenging super small element size case, where we are collecting `i32`s.
    /// We can already achieve a very high performance by simply `extend`ing the bag by batches of 16 elements.
    ///
    /// As the element size gets larger, required batch size to achieve a high performance gets smaller and smaller.
    ///
    /// The example code above already demonstrates the solution to a potentially problematic case in the [`ConcurrentVec::push`] example.
    pub fn extend<IntoIter, Iter>(&self, values: IntoIter) -> usize
    where
        IntoIter: IntoIterator<Item = T, IntoIter = Iter>,
        Iter: Iterator<Item = T> + ExactSizeIterator,
    {
        let values = values.into_iter();
        // the exact count is known up front thanks to the ExactSizeIterator bound;
        // delegate to the length-explicit variant
        let num_items = values.len();
        self.extend_n_items::<_>(values, num_items)
    }

    /// Extends the vector with the values of the iterator which is created as a function of the
    /// index that the first element of the iterator will be written to.
    ///
    /// Note that we cannot guarantee the index of the element by `extend`ing since there might be many
    /// pushes or extends happening concurrently. In cases where we absolutely need to know the index, in other
    /// words, when the values depend on the indices, we can use `extend_for_idx`.
    ///
    /// # Panics
    ///
    /// Panics if the iterator created by `f` does not yield `num_items` elements.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use orx_concurrent_vec::*;
    ///
    /// let vec = ConcurrentVec::new();
    ///
    /// vec.push(0);
    ///
    /// let iter = |begin_idx: usize| ((begin_idx..(begin_idx + 3)).map(|i| i * 5));
    /// vec.extend_for_idx(|begin_idx| iter(begin_idx), 3);
    /// vec.push(42);
    ///
    /// assert_eq!(&vec, &[0, 5, 10, 15, 42]);
    /// ```
    pub fn extend_for_idx<IntoIter, Iter, F>(&self, f: F, num_items: usize) -> usize
    where
        IntoIter: IntoIterator<Item = T, IntoIter = Iter>,
        Iter: Iterator<Item = T> + ExactSizeIterator,
        F: FnOnce(usize) -> IntoIter,
    {
        // atomically reserve the range `begin_idx..begin_idx + num_items` for this call
        let begin_idx = self.len_reserved().fetch_add(num_items, Ordering::Relaxed);
        // # SAFETY: ConcurrentVec ensures that each position in the reserved range will be written only and exactly once.
        let slices = unsafe { self.core.n_items_buffer_as_slices(begin_idx, num_items) };
        // the iterator is created only after the first index is known
        let mut values = f(begin_idx).into_iter();

        assert_eq!(values.len(), num_items);

        for slice in slices {
            for maybe in slice {
                let value = values
                    .next()
                    .expect("provided iterator is shorter than expected num_items");
                // # SAFETY: this position belongs to the range reserved above; it is written exactly once.
                unsafe { maybe.0.initialize_unchecked(value) };
            }
        }

        begin_idx
    }

    /// Concurrent, thread-safe method to push `num_items` elements yielded by the `values` iterator to the back of the bag.
    /// The method returns the position or index of the first pushed value (returns the length of the concurrent bag if the iterator is empty).
    ///
    /// All `values` in the iterator will be added to the bag consecutively:
    /// * the first yielded value will be written to the position which is equal to the current length of the bag, say `begin_idx`, which is the returned value,
    /// * the second yielded value will be written to the `begin_idx + 1`-th position,
    /// * ...
    /// * and the last value will be written to the `begin_idx + num_items - 1`-th position of the bag.
    ///
    /// Important notes:
    /// * This method does not allocate at all to buffer elements to be pushed.
    /// * All it does is to increment the atomic counter by the length of the iterator (`push` would increment by 1) and reserve the range of positions for this operation.
    /// * Iterating over and writing elements to the vec happens afterwards.
    /// * This is a simple, effective and memory efficient solution to the false sharing problem.
    ///
    /// For this reason, the method requires the additional `num_items` argument.
    /// There exists the variant [`ConcurrentVec::extend`] method which accepts only an `ExactSizeIterator`.
    ///
    /// # Panics
    ///
    /// Panics if the `values` iterator yields fewer than `num_items` elements.
    ///
    /// # Examples
    ///
    /// We can directly take a shared reference of the bag and share it among threads.
    ///
    /// ```rust
    /// use orx_concurrent_vec::*;
    ///
    /// let (num_threads, num_items_per_thread) = (4, 1_024);
    ///
    /// let vec = ConcurrentVec::new();
    /// let batch_size = 16;
    ///
    /// std::thread::scope(|s| {
    ///     let vec = &vec;
    ///     for i in 0..num_threads {
    ///         s.spawn(move || {
    ///             for j in (0..num_items_per_thread).step_by(batch_size) {
    ///                 let iter = (j..(j + batch_size)).map(|j| i * 1000 + j);
    ///                 // concurrently collect results simply by calling `extend_n_items`
    ///                 vec.extend_n_items(iter, batch_size);
    ///             }
    ///         });
    ///     }
    /// });
    ///
    /// let mut vec: Vec<_> = vec.to_vec();
    /// vec.sort();
    /// let mut expected: Vec<_> = (0..num_threads).flat_map(|i| (0..num_items_per_thread).map(move |j| i * 1000 + j)).collect();
    /// expected.sort();
    /// assert_eq!(vec, expected);
    /// ```
    ///
    /// # Performance Notes - False Sharing
    ///
    /// [`ConcurrentVec::push`] method's implementation is simple, lock-free and efficient.
    /// However, we need to be aware of the potential [false sharing](https://en.wikipedia.org/wiki/False_sharing) risk.
    /// False sharing might lead to significant performance degradation; fortunately, it is possible to avoid in many cases.
    ///
    /// ## When?
    ///
    /// Performance degradation due to false sharing might be observed when both of the following conditions hold:
    /// * **small data**: data to be pushed is small, the more elements fitting in a cache line the bigger the risk,
    /// * **little work**: multiple threads/cores are pushing to the concurrent bag with high frequency; i.e.,
    ///   * very little or negligible work / time is required in between `push` calls.
    ///
    /// The example above fits this situation.
    /// Each thread only performs one multiplication and addition for computing elements, and the elements to be pushed are very small, just one `usize`.
    ///
    /// ## Why?
    ///
    /// * `ConcurrentVec` assigns unique positions to each value to be pushed. There is no *true* sharing among threads in the position level.
    /// * However, cache lines contain more than one position.
    /// * One thread updating a particular position invalidates the entire cache line on another thread.
    /// * Threads end up frequently reloading cache lines instead of doing the actual work of writing elements to the bag.
    /// * This might lead to a significant performance degradation.
    ///
    /// ### Solution: `extend` rather than `push`
    ///
    /// One very simple, effective and memory efficient solution to the false sharing problem is to use [`ConcurrentVec::extend`] rather than `push` in *small data & little work* situations.
    ///
    /// Assume that we will have 4 threads and each will push 1_024 elements.
    /// Instead of making 1_024 `push` calls from each thread, we can make one `extend` call from each.
    /// This would give the best performance.
    /// Further, it has zero buffer or memory cost:
    /// * it is important to note that the batch of 1_024 elements are not stored temporarily in another buffer,
    /// * there is no additional allocation,
    /// * `extend` does nothing more than reserving the position range for the thread by incrementing the atomic counter accordingly.
    ///
    /// However, we do not need to have such perfect information about the number of elements to be pushed.
    /// Performance gains after reaching the cache line size are much smaller.
    ///
    /// For instance, consider the challenging super small element size case, where we are collecting `i32`s.
    /// We can already achieve a very high performance by simply `extend`ing the bag by batches of 16 elements.
    ///
    /// As the element size gets larger, required batch size to achieve a high performance gets smaller and smaller.
    ///
    /// The example code above already demonstrates the solution to a potentially problematic case in the [`ConcurrentVec::push`] example.
    pub fn extend_n_items<IntoIter>(&self, values: IntoIter, num_items: usize) -> usize
    where
        IntoIter: IntoIterator<Item = T>,
    {
        // atomically reserve the range `begin_idx..begin_idx + num_items` for this call
        let begin_idx = self.len_reserved().fetch_add(num_items, Ordering::Relaxed);
        // # SAFETY: ConcurrentVec ensures that each position in the reserved range will be written only and exactly once.
        let slices = unsafe { self.core.n_items_buffer_as_slices(begin_idx, num_items) };
        let mut values = values.into_iter();

        for slice in slices {
            for maybe in slice {
                let value = values
                    .next()
                    .expect("provided iterator is shorter than expected num_items");
                // # SAFETY: this position belongs to the range reserved above; it is written exactly once.
                unsafe { maybe.0.initialize_unchecked(value) };
            }
        }

        begin_idx
    }
}