Moar sorting

This commit is contained in:
Justine Tunney 2023-04-27 20:36:25 -07:00
parent dbb8d66f22
commit 7f2bf3895e
No known key found for this signature in database
GPG key ID: BE714B4575D6E328
11 changed files with 3627 additions and 9 deletions

View file

@ -16,6 +16,7 @@
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/macros.internal.h"
#include "libc/mem/alg.h"
#include "libc/mem/gc.internal.h"
#include "libc/mem/mem.h"
@ -25,8 +26,24 @@
#include "libc/str/str.h"
#include "libc/testlib/ezbench.h"
#include "libc/testlib/testlib.h"
#include "third_party/libcxx/learned_sort.h"
#include "third_party/libcxx/pdqsort.h"
#include "third_party/libcxx/ska_sort.h"
#include "third_party/vqsort/vqsort.h"
// Sorts the first n elements of A in ascending order, in place, using
// classic insertion sort. O(n²) worst case but excellent on inputs that
// are already nearly sorted, which is why it's benchmarked below.
void InsertionSort(long *A, long n) {
  for (long i = 1; i < n; ++i) {
    long key = A[i];
    long j = i;
    // Shift larger elements one slot right until key's position opens up.
    while (j > 0 && A[j - 1] > key) {
      A[j] = A[j - 1];
      --j;
    }
    A[j] = key;
  }
}
int CompareLong(const void *a, const void *b) {
const long *x = a;
const long *y = b;
@ -110,25 +127,87 @@ BENCH(_longsort, bench) {
long *p1 = gc(malloc(n * sizeof(long)));
long *p2 = gc(malloc(n * sizeof(long)));
rngset(p1, n * sizeof(long), 0, 0);
EZBENCH2("_longsort", memcpy(p2, p1, n * sizeof(long)), _longsort(p2, n));
EZBENCH2("_longsort rand", memcpy(p2, p1, n * sizeof(long)),
_longsort(p2, n));
if (X86_HAVE(AVX2)) {
EZBENCH2("vqsort_int64_avx2", memcpy(p2, p1, n * sizeof(long)),
EZBENCH2("vq_int64_avx2 rand", memcpy(p2, p1, n * sizeof(long)),
vqsort_int64_avx2(p2, n));
}
if (X86_HAVE(SSE4_2)) {
EZBENCH2("vqsort_int64_sse4", memcpy(p2, p1, n * sizeof(long)),
EZBENCH2("vq_int64_sse4 rand", memcpy(p2, p1, n * sizeof(long)),
vqsort_int64_sse4(p2, n));
}
if (X86_HAVE(SSSE3)) {
EZBENCH2("vqsort_int64_ssse3", memcpy(p2, p1, n * sizeof(long)),
EZBENCH2("vq_int64_ssse3 rand", memcpy(p2, p1, n * sizeof(long)),
vqsort_int64_ssse3(p2, n));
}
EZBENCH2("vqsort_int64_sse2", memcpy(p2, p1, n * sizeof(long)),
EZBENCH2("vq_int64_sse2 rand", memcpy(p2, p1, n * sizeof(long)),
vqsort_int64_sse2(p2, n));
EZBENCH2("radix_sort_int64", memcpy(p2, p1, n * sizeof(long)),
EZBENCH2("radix_int64 rand", memcpy(p2, p1, n * sizeof(long)),
radix_sort_int64(p2, n));
EZBENCH2("qsort(long)", memcpy(p2, p1, n * sizeof(long)),
EZBENCH2("ska_sort_int64 rand", memcpy(p2, p1, n * sizeof(long)),
ska_sort_int64(p2, n));
EZBENCH2("pdq_int64 rand", memcpy(p2, p1, n * sizeof(long)),
pdqsort_int64(p2, n));
EZBENCH2("pdq_branchless rand", memcpy(p2, p1, n * sizeof(long)),
pdqsort_branchless_int64(p2, n));
EZBENCH2("learned_int64 rand", memcpy(p2, p1, n * sizeof(long)),
learned_sort_int64(p2, n));
EZBENCH2("qsort(long) rand", memcpy(p2, p1, n * sizeof(long)),
qsort(p2, n, sizeof(long), CompareLong));
EZBENCH2("InsertionSort rand", memcpy(p2, p1, n * sizeof(long)),
InsertionSort(p2, n));
}
// Benchmarks the int64 sorters on *nearly sorted* input: an ascending
// sequence where m random elements have been swapped with a partner at
// most d slots away. Adaptive algorithms (pdqsort, insertion sort) are
// expected to do disproportionately well on this distribution.
BENCH(_longsort, benchNearlySorted) {
  printf("\n");
  long i, j, k, t;
  long n = 5000;  // total items
  long m = 500;   // misplaced items
  long d = 5;     // maximum drift
  long *p1 = gc(malloc(n * sizeof(long)));
  long *p2 = gc(malloc(n * sizeof(long)));
  // Start from the identity permutation (perfectly sorted).
  for (i = 0; i < n; ++i) {
    p1[i] = i;
  }
  // Perturb it: m swaps, each between index j and a partner within ±d.
  // rand() is never seeded here, so the permutation is the same each run.
  for (i = 0; i < m; ++i) {
    j = rand() % n;
    k = rand() % (d * 2) - d + j;
    k = MIN(MAX(0, k), n - 1);  // clamp partner index into [0, n-1]
    t = p1[j];
    p1[j] = p1[k];
    p1[k] = t;
  }
  // EZBENCH2's second argument re-copies the pristine input before every
  // timed run so each sorter sees the identical nearly-sorted array.
  EZBENCH2("_longsort near", memcpy(p2, p1, n * sizeof(long)),
           _longsort(p2, n));
  if (X86_HAVE(AVX2)) {
    EZBENCH2("vq_int64_avx2 near", memcpy(p2, p1, n * sizeof(long)),
             vqsort_int64_avx2(p2, n));
  }
  if (X86_HAVE(SSE4_2)) {
    EZBENCH2("vq_int64_sse4 near", memcpy(p2, p1, n * sizeof(long)),
             vqsort_int64_sse4(p2, n));
  }
  if (X86_HAVE(SSSE3)) {
    EZBENCH2("vq_int64_ssse3 near", memcpy(p2, p1, n * sizeof(long)),
             vqsort_int64_ssse3(p2, n));
  }
  EZBENCH2("vq_int64_sse2 near", memcpy(p2, p1, n * sizeof(long)),
           vqsort_int64_sse2(p2, n));
  EZBENCH2("radix_int64 near", memcpy(p2, p1, n * sizeof(long)),
           radix_sort_int64(p2, n));
  EZBENCH2("ska_sort_int64 near", memcpy(p2, p1, n * sizeof(long)),
           ska_sort_int64(p2, n));
  EZBENCH2("pdq_int64 near", memcpy(p2, p1, n * sizeof(long)),
           pdqsort_int64(p2, n));
  EZBENCH2("pdq_branchless near", memcpy(p2, p1, n * sizeof(long)),
           pdqsort_branchless_int64(p2, n));
  EZBENCH2("learned_int64 near", memcpy(p2, p1, n * sizeof(long)),
           learned_sort_int64(p2, n));
  EZBENCH2("qsort(long) near", memcpy(p2, p1, n * sizeof(long)),
           qsort(p2, n, sizeof(long), CompareLong));
  EZBENCH2("InsertionSort near", memcpy(p2, p1, n * sizeof(long)),
           InsertionSort(p2, n));
}
int CompareInt(const void *a, const void *b) {
@ -215,6 +294,7 @@ BENCH(_intsort, bench) {
int *p2 = gc(malloc(n * sizeof(int)));
rngset(p1, n * sizeof(int), 0, 0);
EZBENCH2("_intsort", memcpy(p2, p1, n * sizeof(int)), _intsort(p2, n));
EZBENCH2("djbsort", memcpy(p2, p1, n * sizeof(int)), djbsort(p2, n));
if (X86_HAVE(AVX2)) {
EZBENCH2("vqsort_int32_avx2", memcpy(p2, p1, n * sizeof(int)),
vqsort_int32_avx2(p2, n));
@ -229,7 +309,6 @@ BENCH(_intsort, bench) {
}
EZBENCH2("vqsort_int32_sse2", memcpy(p2, p1, n * sizeof(int)),
vqsort_int32_sse2(p2, n));
EZBENCH2("djbsort", memcpy(p2, p1, n * sizeof(int)), djbsort(p2, n));
EZBENCH2("radix_sort_int32", memcpy(p2, p1, n * sizeof(int)),
radix_sort_int32(p2, n));
EZBENCH2("qsort(int)", memcpy(p2, p1, n * sizeof(int)),

View file

@ -1,6 +1,6 @@
// -*- c++ -*-
// Internal header to be included only by llama.cpp.
// Contains wrappers around OS interfaces.
#ifndef LLAMA_UTIL_H
#define LLAMA_UTIL_H
#include "libc/calls/struct/rlimit.h"

25
third_party/libcxx/learned_sort.cc vendored Normal file
View file

@ -0,0 +1,25 @@
/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8-*-│
vi: set net ft=c++ ts=2 sts=2 sw=2 fenc=utf-8 :vi
Copyright 2023 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "third_party/libcxx/learned_sort.h"
// Sorts the n-element array A in place using learned sort. The algorithm
// operates on a std::vector, so we sort a copy and write the result back.
void learned_sort_int64(int64_t* A, size_t n) {
  std::vector<int64_t> buf;
  buf.assign(A, A + n);
  learned_sort::sort(buf.begin(), buf.end());
  for (size_t i = 0; i < n; ++i) {
    A[i] = buf[i];
  }
}

1100
third_party/libcxx/learned_sort.h vendored Normal file

File diff suppressed because it is too large Load diff

View file

@ -8,6 +8,11 @@ THIRD_PARTY_LIBCXX = $(THIRD_PARTY_LIBCXX_A_DEPS) $(THIRD_PARTY_LIBCXX_A)
THIRD_PARTY_LIBCXX_A = o/$(MODE)/third_party/libcxx/libcxx.a
THIRD_PARTY_LIBCXX_A_HDRS = \
third_party/libcxx/rmi.h \
third_party/libcxx/utils.h \
third_party/libcxx/learned_sort.h \
third_party/libcxx/pdqsort.h \
third_party/libcxx/ska_sort.h \
third_party/libcxx/__bit_reference \
third_party/libcxx/__bsd_locale_fallbacks.h \
third_party/libcxx/__config \
@ -138,6 +143,9 @@ THIRD_PARTY_LIBCXX_A_HDRS = \
third_party/libcxx/wctype.h
THIRD_PARTY_LIBCXX_A_SRCS_CC = \
third_party/libcxx/learned_sort.cc \
third_party/libcxx/pdqsort.cc \
third_party/libcxx/ska_sort.cc \
third_party/libcxx/algorithm.cc \
third_party/libcxx/charconv.cc \
third_party/libcxx/chrono.cc \

25
third_party/libcxx/pdqsort.cc vendored Normal file
View file

@ -0,0 +1,25 @@
/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8-*-│
vi: set net ft=c++ ts=2 sts=2 sw=2 fenc=utf-8 :vi
Copyright 2023 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "third_party/libcxx/pdqsort.h"
// Sorts the n int64s starting at A, in place, ascending, using
// pattern-defeating quicksort with its default partitioning strategy.
void pdqsort_int64(int64_t* A, size_t n) {
  int64_t* const first = A;
  int64_t* const last = A + n;
  pdqsort(first, last);
}
// Sorts the n int64s starting at A, in place, ascending, using the
// branchless-partitioning variant of pattern-defeating quicksort.
void pdqsort_branchless_int64(int64_t* A, size_t n) {
  int64_t* const stop = A + n;
  pdqsort_branchless(A, stop);
}

552
third_party/libcxx/pdqsort.h vendored Normal file
View file

@ -0,0 +1,552 @@
/* -*- c++ -*- */
/* clang-format off */
/*
pdqsort.h - Pattern-defeating quicksort.
Copyright (c) 2015 Orson Peters
This software is provided 'as-is', without any express or implied warranty. In no event will the
authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose, including commercial
applications, and to alter it and redistribute it freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the
original software. If you use this software in a product, an acknowledgment in the product
documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as
being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef PDQSORT_H
#define PDQSORT_H
#ifdef __cplusplus
#include "third_party/libcxx/algorithm"
#include "third_party/libcxx/cstddef"
#include "third_party/libcxx/functional"
#include "third_party/libcxx/utility"
#include "third_party/libcxx/iterator"
#if __cplusplus >= 201103L
#include "third_party/libcxx/cstdint"
#include "third_party/libcxx/type_traits"
#define PDQSORT_PREFER_MOVE(x) std::move(x)
#else
#define PDQSORT_PREFER_MOVE(x) (x)
#endif
namespace pdqsort_detail {
// Tuning constants for the algorithm. All are trade-offs measured by the
// upstream author; change with benchmarks in hand.
enum {
    // Partitions below this size are sorted using insertion sort.
    insertion_sort_threshold = 24,

    // Partitions above this size use Tukey's ninther to select the pivot.
    ninther_threshold = 128,

    // When we detect an already sorted partition, attempt an insertion sort that allows this
    // amount of element moves before giving up.
    partial_insertion_sort_limit = 8,

    // Must be multiple of 8 due to loop unrolling, and < 256 to fit in unsigned char.
    block_size = 64,

    // Cacheline size, assumes power of two.
    cacheline_size = 64
};

#if __cplusplus >= 201103L
// Trait that detects std::less/std::greater, so that the public pdqsort()
// entry point can opt arithmetic types with default comparators into the
// branchless partitioning path automatically.
template<class T> struct is_default_compare : std::false_type { };
template<class T> struct is_default_compare<std::less<T> > : std::true_type { };
template<class T> struct is_default_compare<std::greater<T> > : std::true_type { };
#endif
// Returns floor(log2(n)), assumes n > 0. Used to size the "bad partition"
// budget before the heapsort fallback kicks in.
template<class T>
inline int log2(T n) {
    int result = 0;
    for (n >>= 1; n; n >>= 1) {
        ++result;
    }
    return result;
}
// Sorts [begin, end) using insertion sort with the given comparison function.
template<class Iter, class Compare>
inline void insertion_sort(Iter begin, Iter end, Compare comp) {
    typedef typename std::iterator_traits<Iter>::value_type T;
    if (begin == end) return;

    for (Iter cur = begin + 1; cur != end; ++cur) {
        Iter sift = cur;
        Iter sift_1 = cur - 1;

        // Compare first so we can avoid 2 moves for an element already positioned correctly.
        if (comp(*sift, *sift_1)) {
            T tmp = PDQSORT_PREFER_MOVE(*sift);

            // Shift the "hole" left one slot at a time; the sift != begin
            // guard stops us from reading before the range.
            do { *sift-- = PDQSORT_PREFER_MOVE(*sift_1); }
            while (sift != begin && comp(tmp, *--sift_1));

            *sift = PDQSORT_PREFER_MOVE(tmp);
        }
    }
}

// Sorts [begin, end) using insertion sort with the given comparison function. Assumes
// *(begin - 1) is an element smaller than or equal to any element in [begin, end).
template<class Iter, class Compare>
inline void unguarded_insertion_sort(Iter begin, Iter end, Compare comp) {
    typedef typename std::iterator_traits<Iter>::value_type T;
    if (begin == end) return;

    for (Iter cur = begin + 1; cur != end; ++cur) {
        Iter sift = cur;
        Iter sift_1 = cur - 1;

        // Compare first so we can avoid 2 moves for an element already positioned correctly.
        if (comp(*sift, *sift_1)) {
            T tmp = PDQSORT_PREFER_MOVE(*sift);

            // No `sift != begin` bounds check here: the sentinel at
            // *(begin - 1) guarantees the comparison loop terminates.
            do { *sift-- = PDQSORT_PREFER_MOVE(*sift_1); }
            while (comp(tmp, *--sift_1));

            *sift = PDQSORT_PREFER_MOVE(tmp);
        }
    }
}

// Attempts to use insertion sort on [begin, end). Will return false if more than
// partial_insertion_sort_limit elements were moved, and abort sorting. Otherwise it will
// successfully sort and return true.
template<class Iter, class Compare>
inline bool partial_insertion_sort(Iter begin, Iter end, Compare comp) {
    typedef typename std::iterator_traits<Iter>::value_type T;
    if (begin == end) return true;

    // Running count of how far elements have moved; the cheap "is it
    // almost sorted already?" heuristic used after a balanced partition.
    std::size_t limit = 0;
    for (Iter cur = begin + 1; cur != end; ++cur) {
        Iter sift = cur;
        Iter sift_1 = cur - 1;

        // Compare first so we can avoid 2 moves for an element already positioned correctly.
        if (comp(*sift, *sift_1)) {
            T tmp = PDQSORT_PREFER_MOVE(*sift);

            do { *sift-- = PDQSORT_PREFER_MOVE(*sift_1); }
            while (sift != begin && comp(tmp, *--sift_1));

            *sift = PDQSORT_PREFER_MOVE(tmp);
            limit += cur - sift;  // distance this element travelled
        }

        if (limit > partial_insertion_sort_limit) return false;
    }

    return true;
}
// Orders the two pointed-to values so that *a precedes *b under comp.
template<class Iter, class Compare>
inline void sort2(Iter a, Iter b, Compare comp) {
    if (!comp(*b, *a)) return;
    std::iter_swap(a, b);
}
// Sorts the elements *a, *b and *c using comparison function comp.
// Three-element sorting network with the tiny sort2 helper inlined:
// each step swaps a pair only when it is out of order.
template<class Iter, class Compare>
inline void sort3(Iter a, Iter b, Iter c, Compare comp) {
    if (comp(*b, *a)) std::iter_swap(a, b);
    if (comp(*c, *b)) std::iter_swap(b, c);
    if (comp(*b, *a)) std::iter_swap(a, b);
}
// Rounds pointer p up to the next cacheline_size boundary. Used so the
// offset buffers in the branchless partitioner start on a cache line.
template<class T>
inline T* align_cacheline(T* p) {
#if defined(UINTPTR_MAX) && __cplusplus >= 201103L
    std::uintptr_t ip = reinterpret_cast<std::uintptr_t>(p);
#else
    std::size_t ip = reinterpret_cast<std::size_t>(p);
#endif
    // -cacheline_size is the two's-complement mask ~(cacheline_size - 1),
    // valid because cacheline_size is assumed to be a power of two.
    ip = (ip + cacheline_size - 1) & -cacheline_size;
    return reinterpret_cast<T*>(ip);
}
// Moves the elements flagged in the offset buffers across the partition:
// element first + offsets_l[i] trades places with last - offsets_r[i].
template<class Iter>
inline void swap_offsets(Iter first, Iter last,
                         unsigned char* offsets_l, unsigned char* offsets_r,
                         int num, bool use_swaps) {
    typedef typename std::iterator_traits<Iter>::value_type T;
    if (use_swaps) {
        // This case is needed for the descending distribution, where we need
        // to have proper swapping for pdqsort to remain O(n).
        for (int i = 0; i < num; ++i) {
            std::iter_swap(first + offsets_l[i], last - offsets_r[i]);
        }
    } else if (num > 0) {
        // Cyclic permutation: instead of num full swaps (3 moves each),
        // hold one element in tmp and chain the rest (num*2 + 1 moves).
        Iter l = first + offsets_l[0]; Iter r = last - offsets_r[0];
        T tmp(PDQSORT_PREFER_MOVE(*l)); *l = PDQSORT_PREFER_MOVE(*r);
        for (int i = 1; i < num; ++i) {
            l = first + offsets_l[i]; *r = PDQSORT_PREFER_MOVE(*l);
            r = last - offsets_r[i]; *l = PDQSORT_PREFER_MOVE(*r);
        }
        *r = PDQSORT_PREFER_MOVE(tmp);
    }
}
// Partitions [begin, end) around pivot *begin using comparison function comp. Elements equal
// to the pivot are put in the right-hand partition. Returns the position of the pivot after
// partitioning and whether the passed sequence already was correctly partitioned. Assumes the
// pivot is a median of at least 3 elements and that [begin, end) is at least
// insertion_sort_threshold long. Uses branchless partitioning.
template<class Iter, class Compare>
inline std::pair<Iter, bool> partition_right_branchless(Iter begin, Iter end, Compare comp) {
    typedef typename std::iterator_traits<Iter>::value_type T;

    // Move pivot into local for speed.
    T pivot(PDQSORT_PREFER_MOVE(*begin));
    Iter first = begin;
    Iter last = end;

    // Find the first element greater than or equal than the pivot (the median of 3 guarantees
    // this exists).
    while (comp(*++first, pivot));

    // Find the first element strictly smaller than the pivot. We have to guard this search if
    // there was no element before *first.
    if (first - 1 == begin) while (first < last && !comp(*--last, pivot));
    else                    while (                !comp(*--last, pivot));

    // If the first pair of elements that should be swapped to partition are the same element,
    // the passed in sequence already was correctly partitioned.
    bool already_partitioned = first >= last;
    if (!already_partitioned) {
        std::iter_swap(first, last);
        ++first;
    }

    // The following branchless partitioning is derived from "BlockQuicksort: How Branch
    // Mispredictions dont affect Quicksort" by Stefan Edelkamp and Armin Weiss.
    // Instead of swapping misplaced elements one at a time (one branch per
    // element), record their offsets into fixed-size buffers with branch-free
    // increments, then swap them in bulk with swap_offsets().
    unsigned char offsets_l_storage[block_size + cacheline_size];
    unsigned char offsets_r_storage[block_size + cacheline_size];
    unsigned char* offsets_l = align_cacheline(offsets_l_storage);
    unsigned char* offsets_r = align_cacheline(offsets_r_storage);
    int num_l, num_r, start_l, start_r;
    num_l = num_r = start_l = start_r = 0;

    while (last - first > 2 * block_size) {
        // Fill up offset blocks with elements that are on the wrong side.
        // The loops are unrolled 8x by hand; `num_x += !comp(...)` advances
        // the write cursor only when the element is misplaced (no branch).
        if (num_l == 0) {
            start_l = 0;
            Iter it = first;
            for (unsigned char i = 0; i < block_size;) {
                offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
                offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
                offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
                offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
                offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
                offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
                offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
                offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
            }
        }
        if (num_r == 0) {
            start_r = 0;
            Iter it = last;
            for (unsigned char i = 0; i < block_size;) {
                offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
                offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
                offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
                offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
                offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
                offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
                offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
                offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
            }
        }

        // Swap elements and update block sizes and first/last boundaries.
        int num = std::min(num_l, num_r);
        swap_offsets(first, last, offsets_l + start_l, offsets_r + start_r,
                     num, num_l == num_r);
        num_l -= num; num_r -= num;
        start_l += num; start_r += num;
        if (num_l == 0) first += block_size;
        if (num_r == 0) last -= block_size;
    }

    // Fewer than 2*block_size elements remain; process the leftover partial
    // blocks. At most one of num_l/num_r can still be nonzero here.
    int l_size = 0, r_size = 0;
    int unknown_left = (int)(last - first) - ((num_r || num_l) ? block_size : 0);
    if (num_r) {
        // Handle leftover block by assigning the unknown elements to the other block.
        l_size = unknown_left;
        r_size = block_size;
    } else if (num_l) {
        l_size = block_size;
        r_size = unknown_left;
    } else {
        // No leftover block, split the unknown elements in two blocks.
        l_size = unknown_left/2;
        r_size = unknown_left - l_size;
    }

    // Fill offset buffers if needed.
    if (unknown_left && !num_l) {
        start_l = 0;
        Iter it = first;
        for (unsigned char i = 0; i < l_size;) {
            offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
        }
    }
    if (unknown_left && !num_r) {
        start_r = 0;
        Iter it = last;
        for (unsigned char i = 0; i < r_size;) {
            offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
        }
    }

    int num = std::min(num_l, num_r);
    swap_offsets(first, last, offsets_l + start_l, offsets_r + start_r, num, num_l == num_r);
    num_l -= num; num_r -= num;
    start_l += num; start_r += num;
    if (num_l == 0) first += l_size;
    if (num_r == 0) last -= r_size;

    // We have now fully identified [first, last)'s proper position. Swap the last elements.
    if (num_l) {
        offsets_l += start_l;
        while (num_l--) std::iter_swap(first + offsets_l[num_l], --last);
        first = last;
    }
    if (num_r) {
        offsets_r += start_r;
        while (num_r--) std::iter_swap(last - offsets_r[num_r], first), ++first;
        last = first;
    }

    // Put the pivot in the right place.
    Iter pivot_pos = first - 1;
    *begin = PDQSORT_PREFER_MOVE(*pivot_pos);
    *pivot_pos = PDQSORT_PREFER_MOVE(pivot);

    return std::make_pair(pivot_pos, already_partitioned);
}
// Partitions [begin, end) around pivot *begin using comparison function comp. Elements equal
// to the pivot are put in the right-hand partition. Returns the position of the pivot after
// partitioning and whether the passed sequence already was correctly partitioned. Assumes the
// pivot is a median of at least 3 elements and that [begin, end) is at least
// insertion_sort_threshold long.
template<class Iter, class Compare>
inline std::pair<Iter, bool> partition_right(Iter begin, Iter end, Compare comp) {
    typedef typename std::iterator_traits<Iter>::value_type T;

    // Move pivot into local for speed.
    T pivot(PDQSORT_PREFER_MOVE(*begin));

    Iter first = begin;
    Iter last = end;

    // Find the first element greater than or equal than the pivot (the median of 3 guarantees
    // this exists).
    while (comp(*++first, pivot));

    // Find the first element strictly smaller than the pivot. We have to guard this search if
    // there was no element before *first.
    if (first - 1 == begin) while (first < last && !comp(*--last, pivot));
    else                    while (                !comp(*--last, pivot));

    // If the first pair of elements that should be swapped to partition are the same element,
    // the passed in sequence already was correctly partitioned.
    bool already_partitioned = first >= last;

    // Keep swapping pairs of elements that are on the wrong side of the pivot. Previously
    // swapped pairs guard the searches, which is why the first iteration is special-cased
    // above.
    while (first < last) {
        std::iter_swap(first, last);
        while (comp(*++first, pivot));
        while (!comp(*--last, pivot));
    }

    // Put the pivot in the right place.
    Iter pivot_pos = first - 1;
    *begin = PDQSORT_PREFER_MOVE(*pivot_pos);
    *pivot_pos = PDQSORT_PREFER_MOVE(pivot);

    return std::make_pair(pivot_pos, already_partitioned);
}
// Similar function to the one above, except elements equal to the pivot are put to the left of
// the pivot and it doesn't check or return if the passed sequence already was partitioned.
// Since this is rarely used (the many equal case), and in that case pdqsort already has O(n)
// performance, no block quicksort is applied here for simplicity.
template<class Iter, class Compare>
inline Iter partition_left(Iter begin, Iter end, Compare comp) {
    typedef typename std::iterator_traits<Iter>::value_type T;

    T pivot(PDQSORT_PREFER_MOVE(*begin));
    Iter first = begin;
    Iter last = end;

    // Mirror image of partition_right's scans: search from the right for
    // an element <= pivot, from the left for one strictly greater.
    while (comp(pivot, *--last));

    if (last + 1 == end) while (first < last && !comp(pivot, *++first));
    else                 while (                !comp(pivot, *++first));

    while (first < last) {
        std::iter_swap(first, last);
        while (comp(pivot, *--last));
        while (!comp(pivot, *++first));
    }

    // Put the pivot in the right place.
    Iter pivot_pos = last;
    *begin = PDQSORT_PREFER_MOVE(*pivot_pos);
    *pivot_pos = PDQSORT_PREFER_MOVE(pivot);

    return pivot_pos;
}
// Core recursive driver. Sorts [begin, end); bad_allowed is the remaining
// budget of highly-unbalanced partitions before falling back to heapsort,
// and leftmost tells us whether *(begin - 1) exists as a sentinel (it does
// for every partition except the very first).
template<class Iter, class Compare, bool Branchless>
inline void pdqsort_loop(Iter begin, Iter end, Compare comp, int bad_allowed, bool leftmost = true) {
    typedef typename std::iterator_traits<Iter>::difference_type diff_t;

    // Use a while loop for tail recursion elimination.
    while (true) {
        diff_t size = end - begin;

        // Insertion sort is faster for small arrays.
        if (size < insertion_sort_threshold) {
            if (leftmost) insertion_sort(begin, end, comp);
            else unguarded_insertion_sort(begin, end, comp);
            return;
        }

        // Choose pivot as median of 3 or pseudomedian of 9.
        diff_t s2 = size / 2;
        if (size > ninther_threshold) {
            sort3(begin, begin + s2, end - 1, comp);
            sort3(begin + 1, begin + (s2 - 1), end - 2, comp);
            sort3(begin + 2, begin + (s2 + 1), end - 3, comp);
            sort3(begin + (s2 - 1), begin + s2, begin + (s2 + 1), comp);
            std::iter_swap(begin, begin + s2);
        } else sort3(begin + s2, begin, end - 1, comp);

        // If *(begin - 1) is the end of the right partition of a previous partition operation
        // there is no element in [begin, end) that is smaller than *(begin - 1). Then if our
        // pivot compares equal to *(begin - 1) we change strategy, putting equal elements in
        // the left partition, greater elements in the right partition. We do not have to
        // recurse on the left partition, since it's sorted (all equal).
        if (!leftmost && !comp(*(begin - 1), *begin)) {
            begin = partition_left(begin, end, comp) + 1;
            continue;
        }

        // Partition and get results.
        std::pair<Iter, bool> part_result =
            Branchless ? partition_right_branchless(begin, end, comp)
                       : partition_right(begin, end, comp);
        Iter pivot_pos = part_result.first;
        bool already_partitioned = part_result.second;

        // Check for a highly unbalanced partition.
        diff_t l_size = pivot_pos - begin;
        diff_t r_size = end - (pivot_pos + 1);
        bool highly_unbalanced = l_size < size / 8 || r_size < size / 8;

        // If we got a highly unbalanced partition we shuffle elements to break many patterns.
        if (highly_unbalanced) {
            // If we had too many bad partitions, switch to heapsort to guarantee O(n log n).
            if (--bad_allowed == 0) {
                std::make_heap(begin, end, comp);
                std::sort_heap(begin, end, comp);
                return;
            }

            // Deterministically swap a few elements near the partition
            // boundaries to defeat adversarial / patterned inputs.
            if (l_size >= insertion_sort_threshold) {
                std::iter_swap(begin, begin + l_size / 4);
                std::iter_swap(pivot_pos - 1, pivot_pos - l_size / 4);

                if (l_size > ninther_threshold) {
                    std::iter_swap(begin + 1, begin + (l_size / 4 + 1));
                    std::iter_swap(begin + 2, begin + (l_size / 4 + 2));
                    std::iter_swap(pivot_pos - 2, pivot_pos - (l_size / 4 + 1));
                    std::iter_swap(pivot_pos - 3, pivot_pos - (l_size / 4 + 2));
                }
            }
            if (r_size >= insertion_sort_threshold) {
                std::iter_swap(pivot_pos + 1, pivot_pos + (1 + r_size / 4));
                std::iter_swap(end - 1, end - r_size / 4);

                if (r_size > ninther_threshold) {
                    std::iter_swap(pivot_pos + 2, pivot_pos + (2 + r_size / 4));
                    std::iter_swap(pivot_pos + 3, pivot_pos + (3 + r_size / 4));
                    std::iter_swap(end - 2, end - (1 + r_size / 4));
                    std::iter_swap(end - 3, end - (2 + r_size / 4));
                }
            }
        } else {
            // If we were decently balanced and we tried to sort an already partitioned
            // sequence try to use insertion sort.
            if (already_partitioned && partial_insertion_sort(begin, pivot_pos, comp)
                                    && partial_insertion_sort(pivot_pos + 1, end, comp)) return;
        }

        // Sort the left partition first using recursion and do tail recursion elimination for
        // the right-hand partition.
        pdqsort_loop<Iter, Compare, Branchless>(begin, pivot_pos, comp, bad_allowed, leftmost);
        begin = pivot_pos + 1;
        leftmost = false;
    }
}
}
// Public entry point: sorts [begin, end) with comparator comp. In C++11+
// the branchless partitioning path is selected automatically when comp is
// std::less/std::greater over an arithmetic value type; the log2 of the
// size is the budget of bad partitions before the heapsort fallback.
template<class Iter, class Compare>
inline void pdqsort(Iter begin, Iter end, Compare comp) {
    if (begin == end) return;

#if __cplusplus >= 201103L
    pdqsort_detail::pdqsort_loop<Iter, Compare,
        pdqsort_detail::is_default_compare<typename std::decay<Compare>::type>::value &&
        std::is_arithmetic<typename std::iterator_traits<Iter>::value_type>::value>(
        begin, end, comp, pdqsort_detail::log2(end - begin));
#else
    pdqsort_detail::pdqsort_loop<Iter, Compare, false>(
        begin, end, comp, pdqsort_detail::log2(end - begin));
#endif
}
// Convenience overload: sorts [begin, end) in ascending order by
// delegating to the comparator overload with the element type's std::less.
template<class Iter>
inline void pdqsort(Iter begin, Iter end) {
    pdqsort(begin, end,
            std::less<typename std::iterator_traits<Iter>::value_type>());
}
// Like pdqsort(), but forces the branchless partitioning path. The caller
// is responsible for ensuring comp is cheap/branch-free enough for this
// to be a win (e.g. a plain comparison of scalars).
template<class Iter, class Compare>
inline void pdqsort_branchless(Iter begin, Iter end, Compare comp) {
    if (begin == end) return;
    pdqsort_detail::pdqsort_loop<Iter, Compare, true>(
        begin, end, comp, pdqsort_detail::log2(end - begin));
}
// Convenience overload: branchless pdqsort in ascending order, delegating
// to the comparator overload with the element type's std::less.
template<class Iter>
inline void pdqsort_branchless(Iter begin, Iter end) {
    pdqsort_branchless(begin, end,
                       std::less<typename std::iterator_traits<Iter>::value_type>());
}
#undef PDQSORT_PREFER_MOVE
#endif /* __cplusplus */
COSMOPOLITAN_C_START_
void pdqsort_int64(int64_t *, size_t);
void pdqsort_branchless_int64(int64_t *, size_t);
COSMOPOLITAN_C_END_
#endif /* PDQSORT_H */

308
third_party/libcxx/rmi.h vendored Normal file
View file

@ -0,0 +1,308 @@
// -*- c++ -*-
// clang-format off
#ifndef COSMOPOLITAN_THIRD_PARTY_LEARNED_SORT_RMI_H_
#define COSMOPOLITAN_THIRD_PARTY_LEARNED_SORT_RMI_H_
#include "third_party/libcxx/iostream"
#include "third_party/libcxx/vector"
using namespace std;
namespace learned_sort {
// Packs the key and its respective scaled CDF value
template <typename T>
struct training_point {
  // Sampled key from the input.
  T x;
  // Empirical CDF value assigned to x during training (set from i/SAMPLE_SZ
  // by the trainer, so it starts in [0, 1) before extrapolation).
  double y;
};

// Represents linear models
// A one-dimensional affine predictor: predicted rank = slope * x + intercept.
struct linear_model {
  double slope = 0;
  double intercept = 0;
};
// An implementation of a 2-layer RMI model
template <class T>
class TwoLayerRMI {
public:
// CDF model hyperparameters
struct Params {
// Member fields
long fanout;
float sampling_rate;
long threshold;
long num_leaf_models;
// Default hyperparameters
static constexpr long DEFAULT_FANOUT = 1e3;
static constexpr float DEFAULT_SAMPLING_RATE = .01;
static constexpr long DEFAULT_THRESHOLD = 100;
static constexpr long DEFAULT_NUM_LEAF_MODELS = 1000;
static constexpr long MIN_SORTING_SIZE = 1e4;
// Default constructor
Params() {
this->fanout = DEFAULT_FANOUT;
this->sampling_rate = DEFAULT_SAMPLING_RATE;
this->threshold = DEFAULT_THRESHOLD;
this->num_leaf_models = DEFAULT_NUM_LEAF_MODELS;
}
// Constructor with custom hyperparameter values
Params(float sampling_rate, float overallocation, long fanout,
long batch_sz, long threshold) {
this->fanout = fanout;
this->sampling_rate = sampling_rate;
this->threshold = threshold;
this->num_leaf_models = DEFAULT_NUM_LEAF_MODELS;
}
};
// Member variables of the CDF model
bool trained;
linear_model root_model;
vector<linear_model> leaf_models;
vector<T> training_sample;
Params hp;
bool enable_dups_detection;
// CDF model constructor
TwoLayerRMI(Params p) {
this->trained = false;
this->hp = p;
this->leaf_models.resize(p.num_leaf_models);
this->enable_dups_detection = true;
}
// Pretty-printing function
void print() {
printf("[0][0]: slope=%0.5f; intercept=%0.5f;\n", root_model.slope,
root_model.intercept);
for (int model_idx = 0; model_idx < hp.num_leaf_models; ++model_idx) {
printf("[%i][1]: slope=%0.5f; intercept=%0.5f;\n", model_idx,
leaf_models[model_idx].slope, leaf_models[model_idx].intercept);
}
cout << "-----------------------------" << endl;
}
/**
* @brief Train a CDF function with an RMI architecture, using linear spline
* interpolation.
*
* @param begin Random-access iterators to the initial position of the
* sequence to be used for sorting. The range used is [begin,end), which
* contains all the elements between first and last, including the element
* pointed by first but not the element pointed by last.
* @param end Random-access iterators to the last position of the sequence to
* be used for sorting. The range used is [begin,end), which contains all the
* elements between first and last, including the element pointed by first but
* not the element pointed by last.
* @return true if the model was trained successfully, false otherwise.
*/
bool train(typename vector<T>::iterator begin,
           typename vector<T>::iterator end) {
  // Fits the two-layer RMI (one root linear model feeding
  // hp.num_leaf_models leaf linear models) on a sorted sample of
  // [begin, end). Returns false when the input has too few unique keys
  // (fewer than 2 per leaf model) for training to make sense.

  // Determine input size
  const long INPUT_SZ = std::distance(begin, end);

  // Validate hyperparameters, falling back to the documented defaults
  // (with a warning on stderr) when a value is out of range for this
  // input size.
  if (this->hp.fanout >= INPUT_SZ) {
    this->hp.fanout = TwoLayerRMI<T>::Params::DEFAULT_FANOUT;
    cerr << "\33[93;1mWARNING\33[0m: Invalid fanout. Using default ("
         << TwoLayerRMI<T>::Params::DEFAULT_FANOUT << ")." << endl;
  }
  if (this->hp.sampling_rate <= 0 or this->hp.sampling_rate > 1) {
    this->hp.sampling_rate = TwoLayerRMI<T>::Params::DEFAULT_SAMPLING_RATE;
    cerr << "\33[93;1mWARNING\33[0m: Invalid sampling rate. Using default ("
         << TwoLayerRMI<T>::Params::DEFAULT_SAMPLING_RATE << ")." << endl;
  }
  if (this->hp.threshold <= 0 or this->hp.threshold >= INPUT_SZ or
      this->hp.threshold >= INPUT_SZ / this->hp.fanout) {
    this->hp.threshold = TwoLayerRMI<T>::Params::DEFAULT_THRESHOLD;
    cerr << "\33[93;1mWARNING\33[0m: Invalid threshold. Using default ("
         << TwoLayerRMI<T>::Params::DEFAULT_THRESHOLD << ")." << endl;
  }

  // Initialize the CDF model: training_data[layer][model] holds the
  // (key, cdf) points used to fit each linear model.
  static const long NUM_LAYERS = 2;
  vector<vector<vector<training_point<T> > > > training_data(NUM_LAYERS);
  for (long layer_idx = 0; layer_idx < NUM_LAYERS; ++layer_idx) {
    training_data[layer_idx].resize(hp.num_leaf_models);
  }

  //----------------------------------------------------------//
  //                           SAMPLE                          //
  //----------------------------------------------------------//

  // Determine sample size. SAMPLE_SZ never exceeds INPUT_SZ, so the
  // stride computed below is always >= 1.
  const long SAMPLE_SZ = std::min<long>(
      INPUT_SZ, std::max<long>(this->hp.sampling_rate * INPUT_SZ,
                               TwoLayerRMI<T>::Params::MIN_SORTING_SIZE));
  // Create a sample array
  this->training_sample.reserve(SAMPLE_SZ);
  // Start sampling: take every offset-th element of the input.
  long offset = static_cast<long>(1. * INPUT_SZ / SAMPLE_SZ);
  for (auto i = begin; i < end; i += offset) {
    // NOTE: We don't directly assign SAMPLE_SZ to this->training_sample_sz
    // to avoid issues with divisibility
    this->training_sample.push_back(*i);
  }
  // Sort the sample (keys are ordered with operator<)
  std::sort(this->training_sample.begin(), this->training_sample.end());
  // Count the number of unique keys
  auto sample_cpy = this->training_sample;
  long num_unique_elms = std::distance(
      sample_cpy.begin(), std::unique(sample_cpy.begin(), sample_cpy.end()));
  // Stop early if the array has very few unique values. We need at least 2
  // unique training examples per leaf model.
  if (num_unique_elms < 2 * this->hp.num_leaf_models) {
    return false;
  } else if (num_unique_elms > .9 * training_sample.size()) {
    // Mostly-unique sample: duplicate detection isn't worth its overhead.
    this->enable_dups_detection = false;
  }

  //----------------------------------------------------------//
  //                    TRAIN THE MODELS                       //
  //----------------------------------------------------------//

  // Populate the training data for the root model: the i-th smallest
  // sampled key maps to its empirical CDF value i / SAMPLE_SZ.
  // NOTE(review): assumes training_sample holds at least SAMPLE_SZ
  // elements; this follows from offset = floor(INPUT_SZ / SAMPLE_SZ),
  // but confirm if the sampling loop changes.
  for (long i = 0; i < SAMPLE_SZ; ++i) {
    training_data[0][0].push_back(
        {this->training_sample[i], 1. * i / SAMPLE_SZ});
  }

  // Train the root model using linear interpolation
  auto *current_training_data = &training_data[0][0];
  linear_model *current_model = &(this->root_model);
  // Find the min and max values in the training set
  training_point<T> min = current_training_data->front();
  training_point<T> max = current_training_data->back();
  // Calculate the slope and intercept terms, assuming min.y = 0 and max.y
  current_model->slope = 1. / (max.x - min.x);
  current_model->intercept = -current_model->slope * min.x;
  // Extrapolate for the number of models in the next layer, so the root's
  // output directly indexes a leaf model.
  current_model->slope *= this->hp.num_leaf_models - 1;
  current_model->intercept *= this->hp.num_leaf_models - 1;

  // Populate the training data for the next layer
  for (const auto &d : *current_training_data) {
    // Predict the model index in next layer
    long rank = current_model->slope * d.x + current_model->intercept;
    // Normalize the rank between 0 and the number of models in the next layer
    rank = std::max(static_cast<long>(0),
                    std::min(this->hp.num_leaf_models - 1, rank));
    // Place the data in the predicted training bucket
    training_data[1][rank].push_back(d);
  }

  // Train the leaf models. Empty buckets are patched with fictive points
  // taken from the previous bucket so later buckets always have a valid
  // left endpoint to interpolate from.
  for (long model_idx = 0; model_idx < this->hp.num_leaf_models;
       ++model_idx) {
    // Update iterator variables
    current_training_data = &training_data[1][model_idx];
    current_model = &(this->leaf_models[model_idx]);
    // Interpolate the min points in the training buckets
    if (model_idx == 0) {
      // The current model is the first model in the current layer
      if (current_training_data->size() < 2) {
        // Case 1: The first model in this layer is empty
        current_model->slope = 0;
        current_model->intercept = 0;
        // Insert a fictive training point to avoid propagating more than one
        // empty initial models.
        training_point<T> tp;
        tp.x = 0;
        tp.y = 0;
        current_training_data->push_back(tp);
      } else {
        // Case 2: The first model in this layer is not empty
        min = current_training_data->front();
        max = current_training_data->back();
        // Hallucinating as if min.y = 0
        current_model->slope = (1. * max.y) / (max.x - min.x);
        current_model->intercept = min.y - current_model->slope * min.x;
      }
    } else if (model_idx == this->hp.num_leaf_models - 1) {
      if (current_training_data->empty()) {
        // Case 3: The final model in this layer is empty, so predict the
        // top of the CDF range (1) as a constant.
        current_model->slope = 0;
        current_model->intercept = 1;
      } else {
        // Case 4: The last model in this layer is not empty
        min = training_data[1][model_idx - 1].back();
        max = current_training_data->back();
        // Hallucinating as if max.y = 1
        current_model->slope = (1. - min.y) / (max.x - min.x);
        current_model->intercept = min.y - current_model->slope * min.x;
      }
    } else {
      // The current model is not the first model in the current layer
      if (current_training_data->empty()) {
        // Case 5: The intermediate model in this layer is empty. Predict
        // the previous bucket's final y as a constant; if the previous
        // model was empty too, this reads the fictive training point
        // inserted there.
        current_model->slope = 0;
        current_model->intercept = training_data[1][model_idx - 1]
                                       .back()
                                       .y;
        // Insert a fictive training point to avoid propagating more than one
        // empty initial models.
        // NOTE: This will _NOT_ lead to DIV/0 from identical x's and y's,
        // because the chain of fictive points is built working backwards.
        training_point<T> tp;
        tp.x = training_data[1][model_idx - 1].back().x;
        tp.y = training_data[1][model_idx - 1].back().y;
        current_training_data->push_back(tp);
      } else {
        // Case 6: The intermediate leaf model is not empty
        min = training_data[1][model_idx - 1].back();
        max = current_training_data->back();
        current_model->slope = (max.y - min.y) / (max.x - min.x);
        current_model->intercept = min.y - current_model->slope * min.x;
      }
    }
  }

  // NOTE:
  // The last stage (layer) of this model contains weights that predict the
  // CDF of the keys (i.e. Range is [0-1]) When using this model to predict
  // the position of the keys in the sorted order, you MUST scale the weights
  // of the last layer to whatever range you are predicting for. The inner
  // layers of the model have already been extrapolated to the length of the
  // stage.
  //
  // This is a design choice to help with the portability of the model.
  //
  this->trained = true;

  return true;
}
};
} // namespace learned_sort
#endif /* COSMOPOLITAN_THIRD_PARTY_LEARNED_SORT_RMI_H_ */

22
third_party/libcxx/ska_sort.cc vendored Normal file
View file

@@ -0,0 +1,22 @@
/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8-*-│
vi: set net ft=c++ ts=2 sts=2 sw=2 fenc=utf-8 :vi
Copyright 2023 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "third_party/libcxx/ska_sort.h"
#include "third_party/libcxx/ska_sort.h"
// Sorts the first n elements of A in ascending order by delegating to
// the generic ska_sort over the half-open range [A, A + n).
void ska_sort_int64(int64_t* A, size_t n) {
  int64_t* first = A;
  int64_t* last = A + n;
  ska_sort(first, last);
}

1457
third_party/libcxx/ska_sort.h vendored Normal file

File diff suppressed because it is too large Load diff

42
third_party/libcxx/utils.h vendored Normal file
View file

@@ -0,0 +1,42 @@
// -*- c++ -*-
// clang-format off
#ifndef COSMOPOLITAN_THIRD_PARTY_LEARNED_SORT_UTILS_H_
#define COSMOPOLITAN_THIRD_PARTY_LEARNED_SORT_UTILS_H_
#include "third_party/libcxx/iterator"
namespace learned_sort {
namespace utils {
// Helper functions for working with unsigned types
// Identity key extractor: yields its argument unchanged. Serves as the
// default "key" for values that are their own sort key.
template <typename T>
inline T _get_key(T value) {
  return value;
}
// Sorts [begin, end) in place in ascending order using insertion sort.
// Stable; O(n^2) comparisons in the worst case, O(n) when nearly sorted.
//
// Fixes vs. the previous version:
//  - the shift loop tests `j != begin` BEFORE reading j[-1], so the
//    iterator is never moved one-before-begin (which is undefined
//    behavior for pointers/contiguous iterators at the start of their
//    storage);
//  - the element count is no longer funneled through `size_t` (where a
//    `<= 0` check silently degenerates to `== 0`);
//  - `key` is constructed per iteration, so T need not be
//    default-constructible.
template <class RandomIt>
void insertion_sort(RandomIt begin, RandomIt end) {
  // Determine the data type
  typedef typename std::iterator_traits<RandomIt>::value_type T;
  // Empty and single-element ranges are already sorted.
  if (std::distance(begin, end) < 2) return;
  for (auto i = begin + 1; i != end; ++i) {
    T key = i[0];
    auto j = i;
    // Shift elements greater than `key` one slot to the right.
    while (j != begin && j[-1] > key) {
      j[0] = j[-1];
      --j;
    }
    j[0] = key;
  }
}
} // namespace utils
} // namespace learned_sort
#endif /* COSMOPOLITAN_THIRD_PARTY_LEARNED_SORT_UTILS_H_ */