//! Parallel merge sort.
//!
//! This implementation is copied verbatim from `std::slice::sort` and then parallelized.
//! The only difference from the original is that the sequential `mergesort` returns
//! `MergesortResult` and leaves descending arrays intact.

use crate::iter::*;
use crate::slice::ParallelSliceMut;
use crate::SendPtr;
use std::mem;
use std::mem::size_of;
use std::ptr;
use std::slice;

unsafe fn get_and_increment<T>(ptr: &mut *mut T) -> *mut T {
    let old = *ptr;
    *ptr = ptr.offset(1);
    old
}

unsafe fn decrement_and_get<T>(ptr: &mut *mut T) -> *mut T {
    *ptr = ptr.offset(-1);
    *ptr
}

/// When dropped, copies from `src` into `dest` a sequence of length `len`.
struct CopyOnDrop<T> {
    src: *const T,
    dest: *mut T,
    len: usize,
}

impl<T> Drop for CopyOnDrop<T> {
    fn drop(&mut self) {
        unsafe {
            ptr::copy_nonoverlapping(self.src, self.dest, self.len);
        }
    }
}

/// Inserts `v[0]` into pre-sorted sequence `v[1..]` so that whole `v[..]` becomes sorted.
///
/// This is the integral subroutine of insertion sort.
fn insert_head<T, F>(v: &mut [T], is_less: &F)
where
    F: Fn(&T, &T) -> bool,
{
    if v.len() >= 2 && is_less(&v[1], &v[0]) {
        unsafe {
            // There are three ways to implement insertion here:
            //
            // 1. Swap adjacent elements until the first one gets to its final destination.
            //    However, this way we copy data around more than is necessary. If elements are big
            //    structures (costly to copy), this method will be slow.
            //
            // 2. Iterate until the right place for the first element is found. Then shift the
            //    elements succeeding it to make room for it and finally place it into the
            //    remaining hole. This is a good method.
            //
            // 3. Copy the first element into a temporary variable. Iterate until the right place
            //    for it is found. As we go along, copy every traversed element into the slot
            //    preceding it. Finally, copy data from the temporary variable into the remaining
            //    hole. This method is very good. Benchmarks demonstrated slightly better
            //    performance than with the 2nd method.
            //
            // All methods were benchmarked, and the 3rd showed best results. So we chose that one.
            let tmp = mem::ManuallyDrop::new(ptr::read(&v[0]));

            // Intermediate state of the insertion process is always tracked by `hole`, which
            // serves two purposes:
            // 1. Protects integrity of `v` from panics in `is_less`.
            // 2. Fills the remaining hole in `v` in the end.
            //
            // Panic safety:
            //
            // If `is_less` panics at any point during the process, `hole` will get dropped and
            // fill the hole in `v` with `tmp`, thus ensuring that `v` still holds every object it
            // initially held exactly once.
            let mut hole = InsertionHole {
                src: &*tmp,
                dest: &mut v[1],
            };
            ptr::copy_nonoverlapping(&v[1], &mut v[0], 1);

            for i in 2..v.len() {
                if !is_less(&v[i], &*tmp) {
                    break;
                }
                ptr::copy_nonoverlapping(&v[i], &mut v[i - 1], 1);
                hole.dest = &mut v[i];
            }
            // `hole` gets dropped and thus copies `tmp` into the remaining hole in `v`.
        }
    }

    // When dropped, copies from `src` into `dest`.
    struct InsertionHole<T> {
        src: *const T,
        dest: *mut T,
    }

    impl<T> Drop for InsertionHole<T> {
        fn drop(&mut self) {
            unsafe {
                ptr::copy_nonoverlapping(self.src, self.dest, 1);
            }
        }
    }
}
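
// Illustrative check of `insert_head` (a minimal sketch added for exposition; it is not part of
// the original code). `v[1..]` is already sorted, so inserting `v[0]` sorts the whole slice.
#[cfg(test)]
#[test]
fn demo_insert_head() {
    let mut v = [5, 1, 2, 3, 4];
    insert_head(&mut v, &|a, b| a < b);
    assert_eq!(v, [1, 2, 3, 4, 5]);
}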

/// Merges non-decreasing runs `v[..mid]` and `v[mid..]` using `buf` as temporary storage, and
/// stores the result into `v[..]`.
///
/// # Safety
///
/// The two slices must be non-empty and `mid` must be in bounds. Buffer `buf` must be long enough
/// to hold a copy of the shorter slice. Also, `T` must not be a zero-sized type.
unsafe fn merge<T, F>(v: &mut [T], mid: usize, buf: *mut T, is_less: &F)
where
    F: Fn(&T, &T) -> bool,
{
    let len = v.len();
    let v = v.as_mut_ptr();
    let v_mid = v.add(mid);
    let v_end = v.add(len);

    // The merge process first copies the shorter run into `buf`. Then it traces the newly copied
    // run and the longer run forwards (or backwards), comparing their next unconsumed elements and
    // copying the lesser (or greater) one into `v`.
    //
    // As soon as the shorter run is fully consumed, the process is done. If the longer run gets
    // consumed first, then we must copy whatever is left of the shorter run into the remaining
    // hole in `v`.
    //
    // Intermediate state of the process is always tracked by `hole`, which serves two purposes:
    // 1. Protects integrity of `v` from panics in `is_less`.
    // 2. Fills the remaining hole in `v` if the longer run gets consumed first.
    //
    // Panic safety:
    //
    // If `is_less` panics at any point during the process, `hole` will get dropped and fill the
    // hole in `v` with the unconsumed range in `buf`, thus ensuring that `v` still holds every
    // object it initially held exactly once.
    let mut hole;

    if mid <= len - mid {
        // The left run is shorter.
        ptr::copy_nonoverlapping(v, buf, mid);
        hole = MergeHole {
            start: buf,
            end: buf.add(mid),
            dest: v,
        };

        // Initially, these pointers point to the beginnings of their arrays.
        let left = &mut hole.start;
        let mut right = v_mid;
        let out = &mut hole.dest;

        while *left < hole.end && right < v_end {
            // Consume the lesser side.
            // If equal, prefer the left run to maintain stability.
            let to_copy = if is_less(&*right, &**left) {
                get_and_increment(&mut right)
            } else {
                get_and_increment(left)
            };
            ptr::copy_nonoverlapping(to_copy, get_and_increment(out), 1);
        }
    } else {
        // The right run is shorter.
        ptr::copy_nonoverlapping(v_mid, buf, len - mid);
        hole = MergeHole {
            start: buf,
            end: buf.add(len - mid),
            dest: v_mid,
        };

        // Initially, these pointers point past the ends of their arrays.
        let left = &mut hole.dest;
        let right = &mut hole.end;
        let mut out = v_end;

        while v < *left && buf < *right {
            // Consume the greater side.
            // If equal, prefer the right run to maintain stability.
            let to_copy = if is_less(&*right.offset(-1), &*left.offset(-1)) {
                decrement_and_get(left)
            } else {
                decrement_and_get(right)
            };
            ptr::copy_nonoverlapping(to_copy, decrement_and_get(&mut out), 1);
        }
    }
    // Finally, `hole` gets dropped. If the shorter run was not fully consumed, whatever remains of
    // it will now be copied into the hole in `v`.

    // When dropped, copies the range `start..end` into `dest..`.
    struct MergeHole<T> {
        start: *mut T,
        end: *mut T,
        dest: *mut T,
    }

    impl<T> Drop for MergeHole<T> {
        fn drop(&mut self) {
            // `T` is not a zero-sized type, so it's okay to divide by its size.
            unsafe {
                let len = self.end.offset_from(self.start) as usize;
                ptr::copy_nonoverlapping(self.start, self.dest, len);
            }
        }
    }
}
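
// Illustrative check of `merge` (a minimal sketch added for exposition; it is not part of the
// original code). The pre-sorted runs `v[..3]` and `v[3..]` are merged in place, using a scratch
// buffer that is long enough to hold the shorter run.
#[cfg(test)]
#[test]
fn demo_merge() {
    let mut v = [1, 4, 6, 2, 3, 5];
    let mut buf = Vec::<i32>::with_capacity(v.len());
    unsafe {
        merge(&mut v, 3, buf.as_mut_ptr(), &|a, b| a < b);
    }
    assert_eq!(v, [1, 2, 3, 4, 5, 6]);
}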

/// The result of merge sort.
#[must_use]
#[derive(Clone, Copy, PartialEq, Eq)]
enum MergesortResult {
    /// The slice has already been sorted.
    NonDescending,
    /// The slice has been descending and therefore it was left intact.
    Descending,
    /// The slice was sorted.
    Sorted,
}

/// A sorted run that starts at index `start` and is of length `len`.
#[derive(Clone, Copy)]
struct Run {
    start: usize,
    len: usize,
}

/// Examines the stack of runs and identifies the next pair of runs to merge. More specifically,
/// if `Some(r)` is returned, that means `runs[r]` and `runs[r + 1]` must be merged next. If the
/// algorithm should continue building a new run instead, `None` is returned.
///
/// TimSort is infamous for its buggy implementations, as described here:
/// http://envisage-project.eu/timsort-specification-and-verification/
///
/// The gist of the story is: we must enforce the invariants on the top four runs on the stack.
/// Enforcing them on just top three is not sufficient to ensure that the invariants will still
/// hold for *all* runs in the stack.
///
/// This function correctly checks invariants for the top four runs. Additionally, if the top
/// run starts at index 0, it will always demand a merge operation until the stack is fully
/// collapsed, in order to complete the sort.
#[inline]
fn collapse(runs: &[Run]) -> Option<usize> {
    let n = runs.len();

    if n >= 2
        && (runs[n - 1].start == 0
            || runs[n - 2].len <= runs[n - 1].len
            || (n >= 3 && runs[n - 3].len <= runs[n - 2].len + runs[n - 1].len)
            || (n >= 4 && runs[n - 4].len <= runs[n - 3].len + runs[n - 2].len))
    {
        if n >= 3 && runs[n - 3].len < runs[n - 1].len {
            Some(n - 3)
        } else {
            Some(n - 2)
        }
    } else {
        None
    }
}
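
// Illustrative check of `collapse` (a minimal sketch added for exposition; it is not part of the
// original code). Runs are discovered back to front, so later entries have smaller `start`. The
// first stack satisfies the invariants, so no merge is demanded; the second violates invariant 1
// (the run below the top is not longer than the top run), so the top two runs must be merged.
#[cfg(test)]
#[test]
fn demo_collapse() {
    let balanced = [Run { start: 10, len: 8 }, Run { start: 4, len: 6 }];
    assert_eq!(collapse(&balanced), None);

    let violated = [Run { start: 10, len: 4 }, Run { start: 4, len: 6 }];
    assert_eq!(collapse(&violated), Some(0));
}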

/// Sorts a slice using merge sort, unless it is already in descending order.
///
/// This function doesn't modify the slice if it is already non-descending or descending.
/// Otherwise, it sorts the slice into non-descending order.
///
/// This merge sort borrows some (but not all) ideas from TimSort, which is described in detail
/// [here](https://github.com/python/cpython/blob/main/Objects/listsort.txt).
///
/// The algorithm identifies strictly descending and non-descending subsequences, which are called
/// natural runs. There is a stack of pending runs yet to be merged. Each newly found run is pushed
/// onto the stack, and then some pairs of adjacent runs are merged until these two invariants are
/// satisfied:
///
/// 1. for every `i` in `1..runs.len()`: `runs[i - 1].len > runs[i].len`
/// 2. for every `i` in `2..runs.len()`: `runs[i - 2].len > runs[i - 1].len + runs[i].len`
///
/// The invariants ensure that the total running time is *O*(*n* \* log(*n*)) worst-case.
///
/// # Safety
///
/// The argument `buf` is used as a temporary buffer and must be at least as long as `v`.
unsafe fn mergesort<T, F>(v: &mut [T], buf: *mut T, is_less: &F) -> MergesortResult
where
    T: Send,
    F: Fn(&T, &T) -> bool + Sync,
{
    // Very short runs are extended using insertion sort to span at least this many elements.
    const MIN_RUN: usize = 10;

    let len = v.len();

    // In order to identify natural runs in `v`, we traverse it backwards. That might seem like a
    // strange decision, but consider the fact that merges more often go in the opposite direction
    // (forwards). According to benchmarks, merging forwards is slightly faster than merging
    // backwards. To conclude, identifying runs by traversing backwards improves performance.
    let mut runs = vec![];
    let mut end = len;
    while end > 0 {
        // Find the next natural run, and reverse it if it's strictly descending.
        let mut start = end - 1;

        if start > 0 {
            start -= 1;

            if is_less(v.get_unchecked(start + 1), v.get_unchecked(start)) {
                while start > 0 && is_less(v.get_unchecked(start), v.get_unchecked(start - 1)) {
                    start -= 1;
                }

                // If this descending run covers the whole slice, return immediately.
                if start == 0 && end == len {
                    return MergesortResult::Descending;
                } else {
                    v[start..end].reverse();
                }
            } else {
                while start > 0 && !is_less(v.get_unchecked(start), v.get_unchecked(start - 1)) {
                    start -= 1;
                }

                // If this non-descending run covers the whole slice, return immediately.
                if end - start == len {
                    return MergesortResult::NonDescending;
                }
            }
        }

        // Insert some more elements into the run if it's too short. Insertion sort is faster than
        // merge sort on short sequences, so this significantly improves performance.
        while start > 0 && end - start < MIN_RUN {
            start -= 1;
            insert_head(&mut v[start..end], &is_less);
        }

        // Push this run onto the stack.
        runs.push(Run {
            start,
            len: end - start,
        });
        end = start;

        // Merge some pairs of adjacent runs to satisfy the invariants.
        while let Some(r) = collapse(&runs) {
            let left = runs[r + 1];
            let right = runs[r];
            merge(
                &mut v[left.start..right.start + right.len],
                left.len,
                buf,
                &is_less,
            );

            runs[r] = Run {
                start: left.start,
                len: left.len + right.len,
            };
            runs.remove(r + 1);
        }
    }

    // Finally, exactly one run must remain in the stack.
    debug_assert!(runs.len() == 1 && runs[0].start == 0 && runs[0].len == len);

    // The original order of the slice was neither non-descending nor descending.
    MergesortResult::Sorted
}
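
// Illustrative check of `mergesort` (a minimal sketch added for exposition; it is not part of the
// original code). A strictly descending slice is detected and left intact, while a shuffled slice
// is sorted; the scratch buffer must be at least as long as the slice.
#[cfg(test)]
#[test]
fn demo_mergesort() {
    let mut descending = [5, 4, 3, 2, 1];
    let mut shuffled = [3, 1, 4, 1, 5, 9, 2, 6];
    let mut buf = Vec::<i32>::with_capacity(shuffled.len());
    unsafe {
        let res = mergesort(&mut descending, buf.as_mut_ptr(), &|a, b| a < b);
        assert!(res == MergesortResult::Descending);
        assert_eq!(descending, [5, 4, 3, 2, 1]);

        let res = mergesort(&mut shuffled, buf.as_mut_ptr(), &|a, b| a < b);
        assert!(res == MergesortResult::Sorted);
        assert_eq!(shuffled, [1, 1, 2, 3, 4, 5, 6, 9]);
    }
}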

////////////////////////////////////////////////////////////////////////////
// Everything above this line is copied from `std::slice::sort` (with very minor tweaks).
// Everything below this line is parallelization.
////////////////////////////////////////////////////////////////////////////

/// Splits two sorted slices so that they can be merged in parallel.
///
/// Returns two indices `(a, b)` so that slices `left[..a]` and `right[..b]` come before
/// `left[a..]` and `right[b..]`.
fn split_for_merge<T, F>(left: &[T], right: &[T], is_less: &F) -> (usize, usize)
where
    F: Fn(&T, &T) -> bool,
{
    let left_len = left.len();
    let right_len = right.len();

    if left_len >= right_len {
        let left_mid = left_len / 2;

        // Find the first element in `right` that is greater than or equal to `left[left_mid]`.
        let mut a = 0;
        let mut b = right_len;
        while a < b {
            let m = a + (b - a) / 2;
            if is_less(&right[m], &left[left_mid]) {
                a = m + 1;
            } else {
                b = m;
            }
        }

        (left_mid, a)
    } else {
        let right_mid = right_len / 2;

        // Find the first element in `left` that is greater than `right[right_mid]`.
        let mut a = 0;
        let mut b = left_len;
        while a < b {
            let m = a + (b - a) / 2;
            if is_less(&right[right_mid], &left[m]) {
                b = m;
            } else {
                a = m + 1;
            }
        }

        (a, right_mid)
    }
}

/// Merges slices `left` and `right` in parallel and stores the result into `dest`.
///
/// # Safety
///
/// The `dest` pointer must have enough space to store the result.
///
/// Even if `is_less` panics at any point during the merge process, this function will fully copy
/// all elements from `left` and `right` into `dest` (not necessarily in sorted order).
unsafe fn par_merge<T, F>(left: &mut [T], right: &mut [T], dest: *mut T, is_less: &F)
where
    T: Send,
    F: Fn(&T, &T) -> bool + Sync,
{
    // Slices whose lengths sum up to this value are merged sequentially. This number is slightly
    // larger than `CHUNK_LENGTH`, and the reason is that merging is faster than merge sorting, so
    // merging needs a bit coarser granularity in order to hide the overhead of Rayon's task
    // scheduling.
    const MAX_SEQUENTIAL: usize = 5000;

    let left_len = left.len();
    let right_len = right.len();

    // Intermediate state of the merge process, which serves two purposes:
    // 1. Protects integrity of `dest` from panics in `is_less`.
    // 2. Copies the remaining elements as soon as one of the two sides is exhausted.
    //
    // Panic safety:
    //
    // If `is_less` panics at any point during the merge process, `s` will get dropped and copy the
    // remaining parts of `left` and `right` into `dest`.
    let mut s = State {
        left_start: left.as_mut_ptr(),
        left_end: left.as_mut_ptr().add(left_len),
        right_start: right.as_mut_ptr(),
        right_end: right.as_mut_ptr().add(right_len),
        dest,
    };

    if left_len == 0 || right_len == 0 || left_len + right_len < MAX_SEQUENTIAL {
        while s.left_start < s.left_end && s.right_start < s.right_end {
            // Consume the lesser side.
            // If equal, prefer the left run to maintain stability.
            let to_copy = if is_less(&*s.right_start, &*s.left_start) {
                get_and_increment(&mut s.right_start)
            } else {
                get_and_increment(&mut s.left_start)
            };
            ptr::copy_nonoverlapping(to_copy, get_and_increment(&mut s.dest), 1);
        }
    } else {
        // Function `split_for_merge` might panic. If that happens, `s` will get destructed and copy
        // the whole `left` and `right` into `dest`.
        let (left_mid, right_mid) = split_for_merge(left, right, is_less);
        let (left_l, left_r) = left.split_at_mut(left_mid);
        let (right_l, right_r) = right.split_at_mut(right_mid);

        // Prevent the destructor of `s` from running. Rayon will ensure that both calls to
        // `par_merge` happen. If one of the two calls panics, they will ensure that elements still
        // get copied into `dest_l` and `dest_r`.
        mem::forget(s);

        // Wrap pointers in SendPtr so that they can be sent to another thread
        // See the documentation of SendPtr for a full explanation
        let dest_l = SendPtr(dest);
        let dest_r = SendPtr(dest.add(left_l.len() + right_l.len()));
        rayon_core::join(
            move || par_merge(left_l, right_l, dest_l.get(), is_less),
            move || par_merge(left_r, right_r, dest_r.get(), is_less),
        );
    }
    // Finally, `s` gets dropped if we used sequential merge, thus copying the remaining elements
    // all at once.

    // When dropped, copies arrays `left_start..left_end` and `right_start..right_end` into `dest`,
    // in that order.
    struct State<T> {
        left_start: *mut T,
        left_end: *mut T,
        right_start: *mut T,
        right_end: *mut T,
        dest: *mut T,
    }

    impl<T> Drop for State<T> {
        fn drop(&mut self) {
            let size = size_of::<T>();
            let left_len = (self.left_end as usize - self.left_start as usize) / size;
            let right_len = (self.right_end as usize - self.right_start as usize) / size;

            // Copy array `left`, followed by `right`.
            unsafe {
                ptr::copy_nonoverlapping(self.left_start, self.dest, left_len);
                self.dest = self.dest.add(left_len);
                ptr::copy_nonoverlapping(self.right_start, self.dest, right_len);
            }
        }
    }
}
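
// Illustrative check of `par_merge` (a minimal sketch added for exposition; it is not part of the
// original code). Two short sorted slices take the sequential path and are merged into a separate
// destination buffer, which is then marked as initialized.
#[cfg(test)]
#[test]
fn demo_par_merge() {
    let mut left = [1, 3, 5, 7];
    let mut right = [2, 4, 6, 8];
    let mut dest = Vec::<i32>::with_capacity(left.len() + right.len());
    unsafe {
        par_merge(&mut left, &mut right, dest.as_mut_ptr(), &|a, b| a < b);
        // All eight elements have been written by `par_merge`, so this length is valid.
        dest.set_len(8);
    }
    assert_eq!(dest, [1, 2, 3, 4, 5, 6, 7, 8]);
}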

/// Recursively merges pre-sorted chunks inside `v`.
///
/// Chunks of `v` are stored in `chunks` as intervals (inclusive left and exclusive right bound).
/// Argument `buf` is an auxiliary buffer that will be used during the procedure.
/// If `into_buf` is true, the result will be stored into `buf`, otherwise it will be in `v`.
///
/// # Safety
///
/// The number of chunks must be positive and they must be adjacent: the right bound of each chunk
/// must equal the left bound of the following chunk.
///
/// The buffer must be at least as long as `v`.
unsafe fn recurse<T, F>(
    v: *mut T,
    buf: *mut T,
    chunks: &[(usize, usize)],
    into_buf: bool,
    is_less: &F,
) where
    T: Send,
    F: Fn(&T, &T) -> bool + Sync,
{
    let len = chunks.len();
    debug_assert!(len > 0);

    // Base case of the algorithm.
    // If only one chunk is remaining, there's no more work to split and merge.
    if len == 1 {
        if into_buf {
            // Copy the chunk from `v` into `buf`.
            let (start, end) = chunks[0];
            let src = v.add(start);
            let dest = buf.add(start);
            ptr::copy_nonoverlapping(src, dest, end - start);
        }
        return;
    }

    // Split the chunks into two halves.
    let (start, _) = chunks[0];
    let (mid, _) = chunks[len / 2];
    let (_, end) = chunks[len - 1];
    let (left, right) = chunks.split_at(len / 2);

    // After recursive calls finish we'll have to merge chunks `(start, mid)` and `(mid, end)` from
    // `src` into `dest`. If the current invocation has to store the result into `buf`, we'll
    // merge chunks from `v` into `buf`, and vice versa.
    //
    // Recursive calls flip `into_buf` at each level of recursion. More concretely, `par_merge`
    // merges chunks from `buf` into `v` at the first level, from `v` into `buf` at the second
    // level etc.
    let (src, dest) = if into_buf { (v, buf) } else { (buf, v) };

    // Panic safety:
    //
    // If `is_less` panics at any point during the recursive calls, the destructor of `guard` will
    // be executed, thus copying everything from `src` into `dest`. This way we ensure that all
    // chunks are in fact copied into `dest`, even if the merge process doesn't finish.
    let guard = CopyOnDrop {
        src: src.add(start),
        dest: dest.add(start),
        len: end - start,
    };

    // Wrap pointers in SendPtr so that they can be sent to another thread
    // See the documentation of SendPtr for a full explanation
    let v = SendPtr(v);
    let buf = SendPtr(buf);
    rayon_core::join(
        move || recurse(v.get(), buf.get(), left, !into_buf, is_less),
        move || recurse(v.get(), buf.get(), right, !into_buf, is_less),
    );

    // Everything went all right - recursive calls didn't panic.
    // Forget the guard in order to prevent its destructor from running.
    mem::forget(guard);

    // Merge chunks `(start, mid)` and `(mid, end)` from `src` into `dest`.
    let src_left = slice::from_raw_parts_mut(src.add(start), mid - start);
    let src_right = slice::from_raw_parts_mut(src.add(mid), end - mid);
    par_merge(src_left, src_right, dest.add(start), is_less);
}
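
// Illustrative check of `recurse` (a minimal sketch added for exposition; it is not part of the
// original code). Three adjacent pre-sorted chunks are merged; with `into_buf == false` the final
// result ends up back in `v`.
#[cfg(test)]
#[test]
fn demo_recurse() {
    let mut v = [2, 4, 1, 3, 0, 5];
    let mut buf = Vec::<i32>::with_capacity(v.len());
    let chunks = [(0, 2), (2, 4), (4, 6)];
    unsafe {
        recurse(v.as_mut_ptr(), buf.as_mut_ptr(), &chunks, false, &|a, b| a < b);
    }
    assert_eq!(v, [0, 1, 2, 3, 4, 5]);
}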

/// Sorts `v` using merge sort in parallel.
///
/// The algorithm is stable, allocates memory, and runs in `O(n log n)` time in the worst case.
/// The allocated temporary buffer has the same length as `v`.
pub(super) fn par_mergesort<T, F>(v: &mut [T], is_less: F)
where
    T: Send,
    F: Fn(&T, &T) -> bool + Sync,
{
    // Slices of up to this length get sorted using insertion sort in order to avoid the cost of
    // buffer allocation.
    const MAX_INSERTION: usize = 20;
    // The length of initial chunks. This number is as small as possible while still keeping the
    // overhead of Rayon's task scheduling negligible.
    const CHUNK_LENGTH: usize = 2000;

    // Sorting has no meaningful behavior on zero-sized types.
    if size_of::<T>() == 0 {
        return;
    }

    let len = v.len();

    // Short slices get sorted in-place via insertion sort to avoid allocations.
    if len <= MAX_INSERTION {
        if len >= 2 {
            for i in (0..len - 1).rev() {
                insert_head(&mut v[i..], &is_less);
            }
        }
        return;
    }

    // Allocate a buffer to use as scratch memory. We keep its length at 0 so that it can hold
    // shallow copies of the contents of `v` without the risk of their destructors running on
    // those copies if `is_less` panics.
    let mut buf = Vec::<T>::with_capacity(len);
    let buf = buf.as_mut_ptr();

    // If the slice is not longer than one chunk would be, do sequential merge sort and return.
    if len <= CHUNK_LENGTH {
        let res = unsafe { mergesort(v, buf, &is_less) };
        if res == MergesortResult::Descending {
            v.reverse();
        }
        return;
    }

    // Split the slice into chunks and merge sort them in parallel.
    // However, descending chunks will not be sorted - they will be simply left intact.
    let mut iter = {
        // Wrap pointer in SendPtr so that it can be sent to another thread
        // See the documentation of SendPtr for a full explanation
        let buf = SendPtr(buf);
        let is_less = &is_less;

        v.par_chunks_mut(CHUNK_LENGTH)
            .with_max_len(1)
            .enumerate()
            .map(move |(i, chunk)| {
                let l = CHUNK_LENGTH * i;
                let r = l + chunk.len();
                unsafe {
                    let buf = buf.get().add(l);
                    (l, r, mergesort(chunk, buf, is_less))
                }
            })
            .collect::<Vec<_>>()
            .into_iter()
            .peekable()
    };

    // Now attempt to concatenate adjacent chunks that were left intact.
    let mut chunks = Vec::with_capacity(iter.len());

    while let Some((a, mut b, res)) = iter.next() {
        // If this chunk was not modified by the sort procedure...
        if res != MergesortResult::Sorted {
            while let Some(&(x, y, r)) = iter.peek() {
                // If the following chunk is of the same type and can be concatenated...
                if r == res && (r == MergesortResult::Descending) == is_less(&v[x], &v[x - 1]) {
                    // Concatenate them.
                    b = y;
                    iter.next();
                } else {
                    break;
                }
            }
        }

        // Descending chunks must be reversed.
        if res == MergesortResult::Descending {
            v[a..b].reverse();
        }

        chunks.push((a, b));
    }

    // All chunks are properly sorted.
    // Now we just have to merge them together.
    unsafe {
        recurse(v.as_mut_ptr(), buf, &chunks, false, &is_less);
    }
}

#[cfg(test)]
mod tests {
    use super::split_for_merge;
    use rand::distributions::Uniform;
    use rand::{thread_rng, Rng};

    #[test]
    fn test_split_for_merge() {
        fn check(left: &[u32], right: &[u32]) {
            let (l, r) = split_for_merge(left, right, &|&a, &b| a < b);
            assert!(left[..l]
                .iter()
                .all(|&x| right[r..].iter().all(|&y| x <= y)));
            assert!(right[..r].iter().all(|&x| left[l..].iter().all(|&y| x < y)));
        }

        check(&[1, 2, 2, 2, 2, 3], &[1, 2, 2, 2, 2, 3]);
        check(&[1, 2, 2, 2, 2, 3], &[]);
        check(&[], &[1, 2, 2, 2, 2, 3]);

        let rng = &mut thread_rng();

        for _ in 0..100 {
            let limit: u32 = rng.gen_range(1..21);
            let left_len: usize = rng.gen_range(0..20);
            let right_len: usize = rng.gen_range(0..20);

            let mut left = rng
                .sample_iter(&Uniform::new(0, limit))
                .take(left_len)
                .collect::<Vec<_>>();
            let mut right = rng
                .sample_iter(&Uniform::new(0, limit))
                .take(right_len)
                .collect::<Vec<_>>();

            left.sort();
            right.sort();
            check(&left, &right);
        }
    }
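
    // End-to-end check of `par_mergesort` (a minimal sketch added for exposition; it is not part
    // of the original test suite): the result must agree with the standard library's stable sort.
    #[test]
    fn test_par_mergesort_matches_std_sort() {
        let rng = &mut thread_rng();

        for _ in 0..10 {
            let len: usize = rng.gen_range(0..1000);
            let mut v = rng
                .sample_iter(&Uniform::new(0u32, 100))
                .take(len)
                .collect::<Vec<_>>();

            let mut expected = v.clone();
            expected.sort();

            super::par_mergesort(v.as_mut_slice(), |a, b| a < b);
            assert_eq!(v, expected);
        }
    }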
}