IOSS 2.0
Loading...
Searching...
No Matches
pdqsort.h
Go to the documentation of this file.
1/*
2 pdqsort.h - Pattern-defeating quicksort.
3
4 Copyright (c) 2021, 2022, 2023 Orson Peters
5
6 This software is provided 'as-is', without any express or implied warranty. In no event will the
7 authors be held liable for any damages arising from the use of this software.
8
9 Permission is granted to anyone to use this software for any purpose, including commercial
10 applications, and to alter it and redistribute it freely, subject to the following restrictions:
11
12 1. The origin of this software must not be misrepresented; you must not claim that you wrote the
13 original software. If you use this software in a product, an acknowledgment in the product
14 documentation would be appreciated but is not required.
15
16 2. Altered source versions must be plainly marked as such, and must not be misrepresented as
17 being the original software.
18
19 3. This notice may not be removed or altered from any source distribution.
20*/
21
22#ifndef PDQSORT_H
23#define PDQSORT_H
24
25#include <algorithm>
26#include <cstddef>
27#include <functional>
28#include <iterator>
29#include <utility>
30
31#if __cplusplus >= 201103L
32#include <cstdint>
33#include <type_traits>
34#define PDQSORT_PREFER_MOVE(x) std::move(x)
35#else
36#define PDQSORT_PREFER_MOVE(x) (x)
37#endif
38
39// NOLINTBEGIN
40namespace pdqsort_detail {
41 enum {
42 // Partitions below this size are sorted using insertion sort.
44
45 // Partitions above this size use Tukey's ninther to select the pivot.
47
48 // When we detect an already sorted partition, attempt an insertion sort that allows this
49 // amount of element moves before giving up.
51
52 // Must be multiple of 8 due to loop unrolling, and < 256 to fit in unsigned char.
54
55 // Cacheline size, assumes power of two.
57
58 };
59
60#if __cplusplus >= 201103L
61 template <class T> struct is_default_compare : std::false_type
62 {
63 };
64 template <class T> struct is_default_compare<std::less<T>> : std::true_type
65 {
66 };
67 template <class T> struct is_default_compare<std::greater<T>> : std::true_type
68 {
69 };
70#endif
71
72 // Returns floor(log2(n)), assumes n > 0.
73 template <class T> inline int pdq_log2(T n)
74 {
75 int log = 0;
76 while (n >>= 1)
77 ++log;
78 return log;
79 }
80
81 // Sorts [begin, end) using insertion sort with the given comparison function.
82 template <class Iter, class Compare>
83 inline void insertion_sort(Iter begin, Iter end, Compare comp)
84 {
85 typedef typename std::iterator_traits<Iter>::value_type T;
86 if (begin == end)
87 return;
88
89 for (Iter cur = begin + 1; cur != end; ++cur) {
90 Iter sift = cur;
91 Iter sift_1 = cur - 1;
92
93 // Compare first so we can avoid 2 moves for an element already positioned correctly.
94 if (comp(*sift, *sift_1)) {
95 T tmp = PDQSORT_PREFER_MOVE(*sift);
96
97 do {
98 *sift-- = PDQSORT_PREFER_MOVE(*sift_1);
99 } while (sift != begin && comp(tmp, *--sift_1));
100
101 *sift = PDQSORT_PREFER_MOVE(tmp);
102 }
103 }
104 }
105
106 // Sorts [begin, end) using insertion sort with the given comparison function. Assumes
107 // *(begin - 1) is an element smaller than or equal to any element in [begin, end).
108 template <class Iter, class Compare>
109 inline void unguarded_insertion_sort(Iter begin, Iter end, Compare comp)
110 {
111 typedef typename std::iterator_traits<Iter>::value_type T;
112 if (begin == end)
113 return;
114
115 for (Iter cur = begin + 1; cur != end; ++cur) {
116 Iter sift = cur;
117 Iter sift_1 = cur - 1;
118
119 // Compare first so we can avoid 2 moves for an element already positioned correctly.
120 if (comp(*sift, *sift_1)) {
121 T tmp = PDQSORT_PREFER_MOVE(*sift);
122
123 do {
124 *sift-- = PDQSORT_PREFER_MOVE(*sift_1);
125 } while (comp(tmp, *--sift_1));
126
127 *sift = PDQSORT_PREFER_MOVE(tmp);
128 }
129 }
130 }
131
132 // Attempts to use insertion sort on [begin, end). Will return false if more than
133 // partial_insertion_sort_limit elements were moved, and abort sorting. Otherwise it will
134 // successfully sort and return true.
135 template <class Iter, class Compare>
136 inline bool partial_insertion_sort(Iter begin, Iter end, Compare comp)
137 {
138 typedef typename std::iterator_traits<Iter>::value_type T;
139 if (begin == end)
140 return true;
141
142 std::size_t limit = 0;
143 for (Iter cur = begin + 1; cur != end; ++cur) {
144 Iter sift = cur;
145 Iter sift_1 = cur - 1;
146
147 // Compare first so we can avoid 2 moves for an element already positioned correctly.
148 if (comp(*sift, *sift_1)) {
149 T tmp = PDQSORT_PREFER_MOVE(*sift);
150
151 do {
152 *sift-- = PDQSORT_PREFER_MOVE(*sift_1);
153 } while (sift != begin && comp(tmp, *--sift_1));
154
155 *sift = PDQSORT_PREFER_MOVE(tmp);
156 limit += cur - sift;
157 }
158
160 return false;
161 }
162
163 return true;
164 }
165
166 template <class Iter, class Compare> inline void sort2(Iter a, Iter b, Compare comp)
167 {
168 if (comp(*b, *a))
169 std::iter_swap(a, b);
170 }
171
172 // Sorts the elements *a, *b and *c using comparison function comp.
173 template <class Iter, class Compare> inline void sort3(Iter a, Iter b, Iter c, Compare comp)
174 {
175 sort2(a, b, comp);
176 sort2(b, c, comp);
177 sort2(a, b, comp);
178 }
179
180 template <class T> inline T *align_cacheline(T *p)
181 {
182#if defined(UINTPTR_MAX) && __cplusplus >= 201103L
183 auto ip = reinterpret_cast<std::uintptr_t>(p);
184#else
185 auto ip = reinterpret_cast<std::size_t>(p);
186#endif
187 int icacheline_size = int(cacheline_size);
188 ip = (ip + icacheline_size - 1) & -icacheline_size;
189 return reinterpret_cast<T *>(ip);
190 }
191
192 template <class Iter>
193 inline void swap_offsets(Iter first, Iter last, unsigned char *offsets_l,
194 unsigned char *offsets_r, size_t num, bool use_swaps)
195 {
196 typedef typename std::iterator_traits<Iter>::value_type T;
197 if (use_swaps) {
198 // This case is needed for the descending distribution, where we need
199 // to have proper swapping for pdqsort to remain O(n).
200 for (size_t i = 0; i < num; ++i) {
201 std::iter_swap(first + offsets_l[i], last - offsets_r[i]);
202 }
203 }
204 else if (num > 0) {
205 Iter l = first + offsets_l[0];
206 Iter r = last - offsets_r[0];
207 T tmp(PDQSORT_PREFER_MOVE(*l));
208 *l = PDQSORT_PREFER_MOVE(*r);
209 for (size_t i = 1; i < num; ++i) {
210 l = first + offsets_l[i];
211 *r = PDQSORT_PREFER_MOVE(*l);
212 r = last - offsets_r[i];
213 *l = PDQSORT_PREFER_MOVE(*r);
214 }
215 *r = PDQSORT_PREFER_MOVE(tmp);
216 }
217 }
218
219 // Partitions [begin, end) around pivot *begin using comparison function comp. Elements equal
220 // to the pivot are put in the right-hand partition. Returns the position of the pivot after
221 // partitioning and whether the passed sequence already was correctly partitioned. Assumes the
222 // pivot is a median of at least 3 elements and that [begin, end) is at least
223 // insertion_sort_threshold long. Uses branchless partitioning.
224 template <class Iter, class Compare>
225 inline std::pair<Iter, bool> partition_right_branchless(Iter begin, Iter end, Compare comp)
226 {
227 typedef typename std::iterator_traits<Iter>::value_type T;
228
229 // Move pivot into local for speed.
230 T pivot(PDQSORT_PREFER_MOVE(*begin));
231 Iter first = begin;
232 Iter last = end;
233
234 // Find the first element greater than or equal than the pivot (the median of 3 guarantees
235 // this exists).
236 while (comp(*++first, pivot))
237 ;
238
239 // Find the first element strictly smaller than the pivot. We have to guard this search if
240 // there was no element before *first.
241 if (first - 1 == begin)
242 while (first < last && !comp(*--last, pivot))
243 ;
244 else
245 while (!comp(*--last, pivot))
246 ;
247
248 // If the first pair of elements that should be swapped to partition are the same element,
249 // the passed in sequence already was correctly partitioned.
250 bool already_partitioned = first >= last;
251 if (!already_partitioned) {
252 std::iter_swap(first, last);
253 ++first;
254
255 // The following branchless partitioning is derived from "BlockQuicksort: How Branch
256 // Mispredictions don’t affect Quicksort" by Stefan Edelkamp and Armin Weiss, but
257 // heavily micro-optimized.
258 unsigned char offsets_l_storage[block_size + cacheline_size];
259 unsigned char offsets_r_storage[block_size + cacheline_size];
260 unsigned char *offsets_l = align_cacheline(offsets_l_storage);
261 unsigned char *offsets_r = align_cacheline(offsets_r_storage);
262
263 Iter offsets_l_base = first;
264 Iter offsets_r_base = last;
265 size_t num_l, num_r, start_l, start_r;
266 num_l = num_r = start_l = start_r = 0;
267
268 while (first < last) {
269 // Fill up offset blocks with elements that are on the wrong side.
270 // First we determine how much elements are considered for each offset block.
271 size_t num_unknown = last - first;
272 size_t left_split = num_l == 0 ? (num_r == 0 ? num_unknown / 2 : num_unknown) : 0;
273 size_t right_split = num_r == 0 ? (num_unknown - left_split) : 0;
274
275 // Fill the offset blocks.
276 if (left_split >= block_size) {
277 for (size_t i = 0; i < block_size;) {
278 offsets_l[num_l] = (unsigned char)i++;
279 num_l += !comp(*first, pivot);
280 ++first;
281 offsets_l[num_l] = (unsigned char)i++;
282 num_l += !comp(*first, pivot);
283 ++first;
284 offsets_l[num_l] = (unsigned char)i++;
285 num_l += !comp(*first, pivot);
286 ++first;
287 offsets_l[num_l] = (unsigned char)i++;
288 num_l += !comp(*first, pivot);
289 ++first;
290 offsets_l[num_l] = (unsigned char)i++;
291 num_l += !comp(*first, pivot);
292 ++first;
293 offsets_l[num_l] = (unsigned char)i++;
294 num_l += !comp(*first, pivot);
295 ++first;
296 offsets_l[num_l] = (unsigned char)i++;
297 num_l += !comp(*first, pivot);
298 ++first;
299 offsets_l[num_l] = (unsigned char)i++;
300 num_l += !comp(*first, pivot);
301 ++first;
302 }
303 }
304 else {
305 for (size_t i = 0; i < left_split;) {
306 offsets_l[num_l] = (unsigned char)i++;
307 num_l += !comp(*first, pivot);
308 ++first;
309 }
310 }
311
312 if (right_split >= block_size) {
313 for (size_t i = 0; i < block_size;) {
314 offsets_r[num_r] = (unsigned char)++i;
315 num_r += comp(*--last, pivot);
316 offsets_r[num_r] = (unsigned char)++i;
317 num_r += comp(*--last, pivot);
318 offsets_r[num_r] = (unsigned char)++i;
319 num_r += comp(*--last, pivot);
320 offsets_r[num_r] = (unsigned char)++i;
321 num_r += comp(*--last, pivot);
322 offsets_r[num_r] = (unsigned char)++i;
323 num_r += comp(*--last, pivot);
324 offsets_r[num_r] = (unsigned char)++i;
325 num_r += comp(*--last, pivot);
326 offsets_r[num_r] = (unsigned char)++i;
327 num_r += comp(*--last, pivot);
328 offsets_r[num_r] = (unsigned char)++i;
329 num_r += comp(*--last, pivot);
330 }
331 }
332 else {
333 for (size_t i = 0; i < right_split;) {
334 offsets_r[num_r] = (unsigned char)++i;
335 num_r += comp(*--last, pivot);
336 }
337 }
338
339 // Swap elements and update block sizes and first/last boundaries.
340 size_t num = std::min(num_l, num_r);
341 swap_offsets(offsets_l_base, offsets_r_base, offsets_l + start_l, offsets_r + start_r, num,
342 num_l == num_r);
343 num_l -= num;
344 num_r -= num;
345 start_l += num;
346 start_r += num;
347
348 if (num_l == 0) {
349 start_l = 0;
350 offsets_l_base = first;
351 }
352
353 if (num_r == 0) {
354 start_r = 0;
355 offsets_r_base = last;
356 }
357 }
358
359 // We have now fully identified [first, last)'s proper position. Swap the last elements.
360 if (num_l) {
361 offsets_l += start_l;
362 while (num_l--)
363 std::iter_swap(offsets_l_base + offsets_l[num_l], --last);
364 first = last;
365 }
366 if (num_r) {
367 offsets_r += start_r;
368 while (num_r--)
369 std::iter_swap(offsets_r_base - offsets_r[num_r], first), ++first;
370 last = first;
371 }
372 }
373
374 // Put the pivot in the right place.
375 Iter pivot_pos = first - 1;
376 *begin = PDQSORT_PREFER_MOVE(*pivot_pos);
377 *pivot_pos = PDQSORT_PREFER_MOVE(pivot);
378
379 return std::make_pair(pivot_pos, already_partitioned);
380 }
381
382 // Partitions [begin, end) around pivot *begin using comparison function comp. Elements equal
383 // to the pivot are put in the right-hand partition. Returns the position of the pivot after
384 // partitioning and whether the passed sequence already was correctly partitioned. Assumes the
385 // pivot is a median of at least 3 elements and that [begin, end) is at least
386 // insertion_sort_threshold long.
387 template <class Iter, class Compare>
388 inline std::pair<Iter, bool> partition_right(Iter begin, Iter end, Compare comp)
389 {
390 typedef typename std::iterator_traits<Iter>::value_type T;
391
392 // Move pivot into local for speed.
393 T pivot(PDQSORT_PREFER_MOVE(*begin));
394
395 Iter first = begin;
396 Iter last = end;
397
398 // Find the first element greater than or equal than the pivot (the median of 3 guarantees
399 // this exists).
400 while (comp(*++first, pivot))
401 ;
402
403 // Find the first element strictly smaller than the pivot. We have to guard this search if
404 // there was no element before *first.
405 if (first - 1 == begin)
406 while (first < last && !comp(*--last, pivot))
407 ;
408 else
409 while (!comp(*--last, pivot))
410 ;
411
412 // If the first pair of elements that should be swapped to partition are the same element,
413 // the passed in sequence already was correctly partitioned.
414 bool already_partitioned = first >= last;
415
416 // Keep swapping pairs of elements that are on the wrong side of the pivot. Previously
417 // swapped pairs guard the searches, which is why the first iteration is special-cased
418 // above.
419 while (first < last) {
420 std::iter_swap(first, last);
421 while (comp(*++first, pivot))
422 ;
423 while (!comp(*--last, pivot))
424 ;
425 }
426
427 // Put the pivot in the right place.
428 Iter pivot_pos = first - 1;
429 *begin = PDQSORT_PREFER_MOVE(*pivot_pos);
430 *pivot_pos = PDQSORT_PREFER_MOVE(pivot);
431
432 return std::make_pair(pivot_pos, already_partitioned);
433 }
434
435 // Similar function to the one above, except elements equal to the pivot are put to the left of
436 // the pivot and it doesn't check or return if the passed sequence already was partitioned.
437 // Since this is rarely used (the many equal case), and in that case pdqsort already has O(n)
438 // performance, no block quicksort is applied here for simplicity.
439 template <class Iter, class Compare>
440 inline Iter partition_left(Iter begin, Iter end, Compare comp)
441 {
442 typedef typename std::iterator_traits<Iter>::value_type T;
443
444 T pivot(PDQSORT_PREFER_MOVE(*begin));
445 Iter first = begin;
446 Iter last = end;
447
448 while (comp(pivot, *--last))
449 ;
450
451 if (last + 1 == end)
452 while (first < last && !comp(pivot, *++first))
453 ;
454 else
455 while (!comp(pivot, *++first))
456 ;
457
458 while (first < last) {
459 std::iter_swap(first, last);
460 while (comp(pivot, *--last))
461 ;
462 while (!comp(pivot, *++first))
463 ;
464 }
465
466 Iter pivot_pos = last;
467 *begin = PDQSORT_PREFER_MOVE(*pivot_pos);
468 *pivot_pos = PDQSORT_PREFER_MOVE(pivot);
469
470 return pivot_pos;
471 }
472
473 template <class Iter, class Compare, bool Branchless>
474 inline void pdqsort_loop(Iter begin, Iter end, Compare comp, int bad_allowed,
475 bool leftmost = true)
476 {
477 typedef typename std::iterator_traits<Iter>::difference_type diff_t;
478
479 // Use a while loop for tail recursion elimination.
480 while (true) {
481 diff_t size = end - begin;
482
483 // Insertion sort is faster for small arrays.
484 if (size < insertion_sort_threshold) {
485 if (leftmost)
486 insertion_sort(begin, end, comp);
487 else
488 unguarded_insertion_sort(begin, end, comp);
489 return;
490 }
491
492 // Choose pivot as median of 3 or pseudomedian of 9.
493 diff_t s2 = size / 2;
494 if (size > ninther_threshold) {
495 sort3(begin, begin + s2, end - 1, comp);
496 sort3(begin + 1, begin + (s2 - 1), end - 2, comp);
497 sort3(begin + 2, begin + (s2 + 1), end - 3, comp);
498 sort3(begin + (s2 - 1), begin + s2, begin + (s2 + 1), comp);
499 std::iter_swap(begin, begin + s2);
500 }
501 else
502 sort3(begin + s2, begin, end - 1, comp);
503
504 // If *(begin - 1) is the end of the right partition of a previous partition operation
505 // there is no element in [begin, end) that is smaller than *(begin - 1). Then if our
506 // pivot compares equal to *(begin - 1) we change strategy, putting equal elements in
507 // the left partition, greater elements in the right partition. We do not have to
508 // recurse on the left partition, since it's sorted (all equal).
509 if (!leftmost && !comp(*(begin - 1), *begin)) {
510 begin = partition_left(begin, end, comp) + 1;
511 continue;
512 }
513
514 // Partition and get results.
515 std::pair<Iter, bool> part_result = Branchless ? partition_right_branchless(begin, end, comp)
516 : partition_right(begin, end, comp);
517 Iter pivot_pos = part_result.first;
518 bool already_partitioned = part_result.second;
519
520 // Check for a highly unbalanced partition.
521 diff_t l_size = pivot_pos - begin;
522 diff_t r_size = end - (pivot_pos + 1);
523 bool highly_unbalanced = l_size < size / 8 || r_size < size / 8;
524
525 // If we got a highly unbalanced partition we shuffle elements to break many patterns.
526 if (highly_unbalanced) {
527 // If we had too many bad partitions, switch to heapsort to guarantee O(n log n).
528 if (--bad_allowed == 0) {
529 std::make_heap(begin, end, comp);
530 std::sort_heap(begin, end, comp);
531 return;
532 }
533
534 if (l_size >= insertion_sort_threshold) {
535 std::iter_swap(begin, begin + l_size / 4);
536 std::iter_swap(pivot_pos - 1, pivot_pos - l_size / 4);
537
538 if (l_size > ninther_threshold) {
539 std::iter_swap(begin + 1, begin + (l_size / 4 + 1));
540 std::iter_swap(begin + 2, begin + (l_size / 4 + 2));
541 std::iter_swap(pivot_pos - 2, pivot_pos - (l_size / 4 + 1));
542 std::iter_swap(pivot_pos - 3, pivot_pos - (l_size / 4 + 2));
543 }
544 }
545
546 if (r_size >= insertion_sort_threshold) {
547 std::iter_swap(pivot_pos + 1, pivot_pos + (1 + r_size / 4));
548 std::iter_swap(end - 1, end - r_size / 4);
549
550 if (r_size > ninther_threshold) {
551 std::iter_swap(pivot_pos + 2, pivot_pos + (2 + r_size / 4));
552 std::iter_swap(pivot_pos + 3, pivot_pos + (3 + r_size / 4));
553 std::iter_swap(end - 2, end - (1 + r_size / 4));
554 std::iter_swap(end - 3, end - (2 + r_size / 4));
555 }
556 }
557 }
558 else {
559 // If we were decently balanced and we tried to sort an already partitioned
560 // sequence try to use insertion sort.
561 if (already_partitioned && partial_insertion_sort(begin, pivot_pos, comp) &&
562 partial_insertion_sort(pivot_pos + 1, end, comp))
563 return;
564 }
565
566 // Sort the left partition first using recursion and do tail recursion elimination for
567 // the right-hand partition.
568 pdqsort_loop<Iter, Compare, Branchless>(begin, pivot_pos, comp, bad_allowed, leftmost);
569 begin = pivot_pos + 1;
570 leftmost = false;
571 }
572 }
573} // namespace pdqsort_detail
574
575template <class Iter, class Compare> inline void pdqsort(Iter begin, Iter end, Compare comp)
576{
577 if (begin == end)
578 return;
579
580#if __cplusplus >= 201103L
582 Iter, Compare,
583 pdqsort_detail::is_default_compare<typename std::decay<Compare>::type>::value &&
584 std::is_arithmetic<typename std::iterator_traits<Iter>::value_type>::value>(
585 begin, end, comp, pdqsort_detail::pdq_log2(end - begin));
586#else
588 pdqsort_detail::pdq_log2(end - begin));
589#endif
590}
591
592template <class Iter> inline void pdqsort(Iter begin, Iter end)
593{
594 typedef typename std::iterator_traits<Iter>::value_type T;
595 pdqsort(begin, end, std::less<T>());
596}
597
598template <class Iter, class Compare>
599inline void pdqsort_branchless(Iter begin, Iter end, Compare comp)
600{
601 if (begin == end)
602 return;
604 pdqsort_detail::pdq_log2(end - begin));
605}
606
607template <class Iter> inline void pdqsort_branchless(Iter begin, Iter end)
608{
609 typedef typename std::iterator_traits<Iter>::value_type T;
610 pdqsort_branchless(begin, end, std::less<T>());
611}
612
613#undef PDQSORT_PREFER_MOVE
614
615#endif
616// NOLINTEND
Definition pdqsort.h:40
bool partial_insertion_sort(Iter begin, Iter end, Compare comp)
Definition pdqsort.h:136
std::pair< Iter, bool > partition_right(Iter begin, Iter end, Compare comp)
Definition pdqsort.h:388
void pdqsort_loop(Iter begin, Iter end, Compare comp, int bad_allowed, bool leftmost=true)
Definition pdqsort.h:474
void sort2(Iter a, Iter b, Compare comp)
Definition pdqsort.h:166
void unguarded_insertion_sort(Iter begin, Iter end, Compare comp)
Definition pdqsort.h:109
int pdq_log2(T n)
Definition pdqsort.h:73
@ ninther_threshold
Definition pdqsort.h:46
@ block_size
Definition pdqsort.h:53
@ insertion_sort_threshold
Definition pdqsort.h:43
@ partial_insertion_sort_limit
Definition pdqsort.h:50
@ cacheline_size
Definition pdqsort.h:56
T * align_cacheline(T *p)
Definition pdqsort.h:180
void swap_offsets(Iter first, Iter last, unsigned char *offsets_l, unsigned char *offsets_r, size_t num, bool use_swaps)
Definition pdqsort.h:193
void insertion_sort(Iter begin, Iter end, Compare comp)
Definition pdqsort.h:83
std::pair< Iter, bool > partition_right_branchless(Iter begin, Iter end, Compare comp)
Definition pdqsort.h:225
void sort3(Iter a, Iter b, Iter c, Compare comp)
Definition pdqsort.h:173
Iter partition_left(Iter begin, Iter end, Compare comp)
Definition pdqsort.h:440
void pdqsort_branchless(Iter begin, Iter end, Compare comp)
Definition pdqsort.h:599
#define PDQSORT_PREFER_MOVE(x)
Definition pdqsort.h:36
void pdqsort(Iter begin, Iter end, Compare comp)
Definition pdqsort.h:575