Moar sorting

This commit is contained in:
Justine Tunney 2023-04-27 20:36:25 -07:00
parent dbb8d66f22
commit 7f2bf3895e
No known key found for this signature in database
GPG key ID: BE714B4575D6E328
11 changed files with 3627 additions and 9 deletions

View file

@ -16,6 +16,7 @@
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/macros.internal.h"
#include "libc/mem/alg.h"
#include "libc/mem/gc.internal.h"
#include "libc/mem/mem.h"
@ -25,8 +26,24 @@
#include "libc/str/str.h"
#include "libc/testlib/ezbench.h"
#include "libc/testlib/testlib.h"
#include "third_party/libcxx/learned_sort.h"
#include "third_party/libcxx/pdqsort.h"
#include "third_party/libcxx/ska_sort.h"
#include "third_party/vqsort/vqsort.h"
// Sorts the first n elements of A in ascending order, in place, using
// classic insertion sort. O(n²) worst case but excellent on inputs that
// are already nearly sorted, which is why it's benchmarked below.
void InsertionSort(long *A, long n) {
  for (long i = 1; i < n; ++i) {
    long key = A[i];
    long j = i;
    // Shift larger elements one slot right until key's position opens up.
    while (j > 0 && A[j - 1] > key) {
      A[j] = A[j - 1];
      --j;
    }
    A[j] = key;
  }
}
int CompareLong(const void *a, const void *b) {
const long *x = a;
const long *y = b;
@ -110,25 +127,87 @@ BENCH(_longsort, bench) {
long *p1 = gc(malloc(n * sizeof(long)));
long *p2 = gc(malloc(n * sizeof(long)));
rngset(p1, n * sizeof(long), 0, 0);
EZBENCH2("_longsort", memcpy(p2, p1, n * sizeof(long)), _longsort(p2, n));
EZBENCH2("_longsort rand", memcpy(p2, p1, n * sizeof(long)),
_longsort(p2, n));
if (X86_HAVE(AVX2)) {
EZBENCH2("vqsort_int64_avx2", memcpy(p2, p1, n * sizeof(long)),
EZBENCH2("vq_int64_avx2 rand", memcpy(p2, p1, n * sizeof(long)),
vqsort_int64_avx2(p2, n));
}
if (X86_HAVE(SSE4_2)) {
EZBENCH2("vqsort_int64_sse4", memcpy(p2, p1, n * sizeof(long)),
EZBENCH2("vq_int64_sse4 rand", memcpy(p2, p1, n * sizeof(long)),
vqsort_int64_sse4(p2, n));
}
if (X86_HAVE(SSSE3)) {
EZBENCH2("vqsort_int64_ssse3", memcpy(p2, p1, n * sizeof(long)),
EZBENCH2("vq_int64_ssse3 rand", memcpy(p2, p1, n * sizeof(long)),
vqsort_int64_ssse3(p2, n));
}
EZBENCH2("vqsort_int64_sse2", memcpy(p2, p1, n * sizeof(long)),
EZBENCH2("vq_int64_sse2 rand", memcpy(p2, p1, n * sizeof(long)),
vqsort_int64_sse2(p2, n));
EZBENCH2("radix_sort_int64", memcpy(p2, p1, n * sizeof(long)),
EZBENCH2("radix_int64 rand", memcpy(p2, p1, n * sizeof(long)),
radix_sort_int64(p2, n));
EZBENCH2("qsort(long)", memcpy(p2, p1, n * sizeof(long)),
EZBENCH2("ska_sort_int64 rand", memcpy(p2, p1, n * sizeof(long)),
ska_sort_int64(p2, n));
EZBENCH2("pdq_int64 rand", memcpy(p2, p1, n * sizeof(long)),
pdqsort_int64(p2, n));
EZBENCH2("pdq_branchless rand", memcpy(p2, p1, n * sizeof(long)),
pdqsort_branchless_int64(p2, n));
EZBENCH2("learned_int64 rand", memcpy(p2, p1, n * sizeof(long)),
learned_sort_int64(p2, n));
EZBENCH2("qsort(long) rand", memcpy(p2, p1, n * sizeof(long)),
qsort(p2, n, sizeof(long), CompareLong));
EZBENCH2("InsertionSort rand", memcpy(p2, p1, n * sizeof(long)),
InsertionSort(p2, n));
}
// Benchmarks the int64 sorters on *nearly sorted* input: an ascending
// sequence where m random elements have been swapped with a partner at
// most d slots away. Adaptive algorithms (pdqsort, insertion sort) are
// expected to do disproportionately well on this distribution.
BENCH(_longsort, benchNearlySorted) {
  printf("\n");
  long i, j, k, t;
  long n = 5000;  // total items
  long m = 500;   // misplaced items
  long d = 5;     // maximum drift
  long *p1 = gc(malloc(n * sizeof(long)));
  long *p2 = gc(malloc(n * sizeof(long)));
  // Start from the identity permutation (perfectly sorted).
  for (i = 0; i < n; ++i) {
    p1[i] = i;
  }
  // Perturb it: m swaps, each between index j and a partner within ±d.
  // rand() is never seeded here, so the permutation is the same each run.
  for (i = 0; i < m; ++i) {
    j = rand() % n;
    k = rand() % (d * 2) - d + j;
    k = MIN(MAX(0, k), n - 1);  // clamp partner index into [0, n-1]
    t = p1[j];
    p1[j] = p1[k];
    p1[k] = t;
  }
  // EZBENCH2's second argument re-copies the pristine input before every
  // timed run so each sorter sees the identical nearly-sorted array.
  EZBENCH2("_longsort near", memcpy(p2, p1, n * sizeof(long)),
           _longsort(p2, n));
  if (X86_HAVE(AVX2)) {
    EZBENCH2("vq_int64_avx2 near", memcpy(p2, p1, n * sizeof(long)),
             vqsort_int64_avx2(p2, n));
  }
  if (X86_HAVE(SSE4_2)) {
    EZBENCH2("vq_int64_sse4 near", memcpy(p2, p1, n * sizeof(long)),
             vqsort_int64_sse4(p2, n));
  }
  if (X86_HAVE(SSSE3)) {
    EZBENCH2("vq_int64_ssse3 near", memcpy(p2, p1, n * sizeof(long)),
             vqsort_int64_ssse3(p2, n));
  }
  EZBENCH2("vq_int64_sse2 near", memcpy(p2, p1, n * sizeof(long)),
           vqsort_int64_sse2(p2, n));
  EZBENCH2("radix_int64 near", memcpy(p2, p1, n * sizeof(long)),
           radix_sort_int64(p2, n));
  EZBENCH2("ska_sort_int64 near", memcpy(p2, p1, n * sizeof(long)),
           ska_sort_int64(p2, n));
  EZBENCH2("pdq_int64 near", memcpy(p2, p1, n * sizeof(long)),
           pdqsort_int64(p2, n));
  EZBENCH2("pdq_branchless near", memcpy(p2, p1, n * sizeof(long)),
           pdqsort_branchless_int64(p2, n));
  EZBENCH2("learned_int64 near", memcpy(p2, p1, n * sizeof(long)),
           learned_sort_int64(p2, n));
  EZBENCH2("qsort(long) near", memcpy(p2, p1, n * sizeof(long)),
           qsort(p2, n, sizeof(long), CompareLong));
  EZBENCH2("InsertionSort near", memcpy(p2, p1, n * sizeof(long)),
           InsertionSort(p2, n));
}
int CompareInt(const void *a, const void *b) {
@ -215,6 +294,7 @@ BENCH(_intsort, bench) {
int *p2 = gc(malloc(n * sizeof(int)));
rngset(p1, n * sizeof(int), 0, 0);
EZBENCH2("_intsort", memcpy(p2, p1, n * sizeof(int)), _intsort(p2, n));
EZBENCH2("djbsort", memcpy(p2, p1, n * sizeof(int)), djbsort(p2, n));
if (X86_HAVE(AVX2)) {
EZBENCH2("vqsort_int32_avx2", memcpy(p2, p1, n * sizeof(int)),
vqsort_int32_avx2(p2, n));
@ -229,7 +309,6 @@ BENCH(_intsort, bench) {
}
EZBENCH2("vqsort_int32_sse2", memcpy(p2, p1, n * sizeof(int)),
vqsort_int32_sse2(p2, n));
EZBENCH2("djbsort", memcpy(p2, p1, n * sizeof(int)), djbsort(p2, n));
EZBENCH2("radix_sort_int32", memcpy(p2, p1, n * sizeof(int)),
radix_sort_int32(p2, n));
EZBENCH2("qsort(int)", memcpy(p2, p1, n * sizeof(int)),

View file

@ -1,6 +1,6 @@
// -*- c++ -*-
// Internal header to be included only by llama.cpp.
// Contains wrappers around OS interfaces.
#ifndef LLAMA_UTIL_H
#define LLAMA_UTIL_H
#include "libc/calls/struct/rlimit.h"

25
third_party/libcxx/learned_sort.cc vendored Normal file
View file

@ -0,0 +1,25 @@
/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8-*-│
vi: set net ft=c++ ts=2 sts=2 sw=2 fenc=utf-8 :vi
Copyright 2023 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "third_party/libcxx/learned_sort.h"
// Sorts the n-element array A in place using learned sort. The algorithm
// operates on a std::vector, so we sort a copy and write the result back.
void learned_sort_int64(int64_t* A, size_t n) {
  std::vector<int64_t> buf;
  buf.assign(A, A + n);
  learned_sort::sort(buf.begin(), buf.end());
  for (size_t i = 0; i < n; ++i) {
    A[i] = buf[i];
  }
}

1100
third_party/libcxx/learned_sort.h vendored Normal file

File diff suppressed because it is too large Load diff

View file

@ -8,6 +8,11 @@ THIRD_PARTY_LIBCXX = $(THIRD_PARTY_LIBCXX_A_DEPS) $(THIRD_PARTY_LIBCXX_A)
THIRD_PARTY_LIBCXX_A = o/$(MODE)/third_party/libcxx/libcxx.a
THIRD_PARTY_LIBCXX_A_HDRS = \
third_party/libcxx/rmi.h \
third_party/libcxx/utils.h \
third_party/libcxx/learned_sort.h \
third_party/libcxx/pdqsort.h \
third_party/libcxx/ska_sort.h \
third_party/libcxx/__bit_reference \
third_party/libcxx/__bsd_locale_fallbacks.h \
third_party/libcxx/__config \
@ -138,6 +143,9 @@ THIRD_PARTY_LIBCXX_A_HDRS = \
third_party/libcxx/wctype.h
THIRD_PARTY_LIBCXX_A_SRCS_CC = \
third_party/libcxx/learned_sort.cc \
third_party/libcxx/pdqsort.cc \
third_party/libcxx/ska_sort.cc \
third_party/libcxx/algorithm.cc \
third_party/libcxx/charconv.cc \
third_party/libcxx/chrono.cc \

25
third_party/libcxx/pdqsort.cc vendored Normal file
View file

@ -0,0 +1,25 @@
/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8-*-│
vi: set net ft=c++ ts=2 sts=2 sw=2 fenc=utf-8 :vi
Copyright 2023 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "third_party/libcxx/pdqsort.h"
// Sorts the n int64s starting at A, in place, ascending, using
// pattern-defeating quicksort with its default partitioning strategy.
void pdqsort_int64(int64_t* A, size_t n) {
  int64_t* const first = A;
  int64_t* const last = A + n;
  pdqsort(first, last);
}
// Sorts the n int64s starting at A, in place, ascending, using the
// branchless-partitioning variant of pattern-defeating quicksort.
void pdqsort_branchless_int64(int64_t* A, size_t n) {
  int64_t* const stop = A + n;
  pdqsort_branchless(A, stop);
}

552
third_party/libcxx/pdqsort.h vendored Normal file
View file

@ -0,0 +1,552 @@
/* -*- c++ -*- */
/* clang-format off */
/*
pdqsort.h - Pattern-defeating quicksort.
Copyright (c) 2015 Orson Peters
This software is provided 'as-is', without any express or implied warranty. In no event will the
authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose, including commercial
applications, and to alter it and redistribute it freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the
original software. If you use this software in a product, an acknowledgment in the product
documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as
being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef PDQSORT_H
#define PDQSORT_H
#ifdef __cplusplus
#include "third_party/libcxx/algorithm"
#include "third_party/libcxx/cstddef"
#include "third_party/libcxx/functional"
#include "third_party/libcxx/utility"
#include "third_party/libcxx/iterator"
#if __cplusplus >= 201103L
#include "third_party/libcxx/cstdint"
#include "third_party/libcxx/type_traits"
#define PDQSORT_PREFER_MOVE(x) std::move(x)
#else
#define PDQSORT_PREFER_MOVE(x) (x)
#endif
namespace pdqsort_detail {
// Tuning constants for the algorithm. All are trade-offs measured by the
// upstream author; change with benchmarks in hand.
enum {
    // Partitions below this size are sorted using insertion sort.
    insertion_sort_threshold = 24,

    // Partitions above this size use Tukey's ninther to select the pivot.
    ninther_threshold = 128,

    // When we detect an already sorted partition, attempt an insertion sort that allows this
    // amount of element moves before giving up.
    partial_insertion_sort_limit = 8,

    // Must be multiple of 8 due to loop unrolling, and < 256 to fit in unsigned char.
    block_size = 64,

    // Cacheline size, assumes power of two.
    cacheline_size = 64
};

#if __cplusplus >= 201103L
// Trait that detects std::less/std::greater, so that the public pdqsort()
// entry point can opt arithmetic types with default comparators into the
// branchless partitioning path automatically.
template<class T> struct is_default_compare : std::false_type { };
template<class T> struct is_default_compare<std::less<T> > : std::true_type { };
template<class T> struct is_default_compare<std::greater<T> > : std::true_type { };
#endif
// Returns floor(log2(n)), assumes n > 0. Used to size the "bad partition"
// budget before the heapsort fallback kicks in.
template<class T>
inline int log2(T n) {
    int result = 0;
    for (n >>= 1; n; n >>= 1) {
        ++result;
    }
    return result;
}
// Sorts [begin, end) using insertion sort with the given comparison function.
template<class Iter, class Compare>
inline void insertion_sort(Iter begin, Iter end, Compare comp) {
    typedef typename std::iterator_traits<Iter>::value_type T;
    if (begin == end) return;

    for (Iter cur = begin + 1; cur != end; ++cur) {
        Iter sift = cur;
        Iter sift_1 = cur - 1;

        // Compare first so we can avoid 2 moves for an element already positioned correctly.
        if (comp(*sift, *sift_1)) {
            T tmp = PDQSORT_PREFER_MOVE(*sift);

            // Shift the "hole" left one slot at a time; the sift != begin
            // guard stops us from reading before the range.
            do { *sift-- = PDQSORT_PREFER_MOVE(*sift_1); }
            while (sift != begin && comp(tmp, *--sift_1));

            *sift = PDQSORT_PREFER_MOVE(tmp);
        }
    }
}

// Sorts [begin, end) using insertion sort with the given comparison function. Assumes
// *(begin - 1) is an element smaller than or equal to any element in [begin, end).
template<class Iter, class Compare>
inline void unguarded_insertion_sort(Iter begin, Iter end, Compare comp) {
    typedef typename std::iterator_traits<Iter>::value_type T;
    if (begin == end) return;

    for (Iter cur = begin + 1; cur != end; ++cur) {
        Iter sift = cur;
        Iter sift_1 = cur - 1;

        // Compare first so we can avoid 2 moves for an element already positioned correctly.
        if (comp(*sift, *sift_1)) {
            T tmp = PDQSORT_PREFER_MOVE(*sift);

            // No `sift != begin` bounds check here: the sentinel at
            // *(begin - 1) guarantees the comparison loop terminates.
            do { *sift-- = PDQSORT_PREFER_MOVE(*sift_1); }
            while (comp(tmp, *--sift_1));

            *sift = PDQSORT_PREFER_MOVE(tmp);
        }
    }
}

// Attempts to use insertion sort on [begin, end). Will return false if more than
// partial_insertion_sort_limit elements were moved, and abort sorting. Otherwise it will
// successfully sort and return true.
template<class Iter, class Compare>
inline bool partial_insertion_sort(Iter begin, Iter end, Compare comp) {
    typedef typename std::iterator_traits<Iter>::value_type T;
    if (begin == end) return true;

    // Running count of how far elements have moved; the cheap "is it
    // almost sorted already?" heuristic used after a balanced partition.
    std::size_t limit = 0;
    for (Iter cur = begin + 1; cur != end; ++cur) {
        Iter sift = cur;
        Iter sift_1 = cur - 1;

        // Compare first so we can avoid 2 moves for an element already positioned correctly.
        if (comp(*sift, *sift_1)) {
            T tmp = PDQSORT_PREFER_MOVE(*sift);

            do { *sift-- = PDQSORT_PREFER_MOVE(*sift_1); }
            while (sift != begin && comp(tmp, *--sift_1));

            *sift = PDQSORT_PREFER_MOVE(tmp);
            limit += cur - sift;  // distance this element travelled
        }

        if (limit > partial_insertion_sort_limit) return false;
    }

    return true;
}
// Orders the two pointed-to values so that *a precedes *b under comp.
template<class Iter, class Compare>
inline void sort2(Iter a, Iter b, Compare comp) {
    if (!comp(*b, *a)) return;
    std::iter_swap(a, b);
}
// Sorts the elements *a, *b and *c using comparison function comp.
// Three-element sorting network with the tiny sort2 helper inlined:
// each step swaps a pair only when it is out of order.
template<class Iter, class Compare>
inline void sort3(Iter a, Iter b, Iter c, Compare comp) {
    if (comp(*b, *a)) std::iter_swap(a, b);
    if (comp(*c, *b)) std::iter_swap(b, c);
    if (comp(*b, *a)) std::iter_swap(a, b);
}
// Rounds pointer p up to the next cacheline_size boundary. Used so the
// offset buffers in the branchless partitioner start on a cache line.
template<class T>
inline T* align_cacheline(T* p) {
#if defined(UINTPTR_MAX) && __cplusplus >= 201103L
    std::uintptr_t ip = reinterpret_cast<std::uintptr_t>(p);
#else
    std::size_t ip = reinterpret_cast<std::size_t>(p);
#endif
    // -cacheline_size is the two's-complement mask ~(cacheline_size - 1),
    // valid because cacheline_size is assumed to be a power of two.
    ip = (ip + cacheline_size - 1) & -cacheline_size;
    return reinterpret_cast<T*>(ip);
}
// Moves the elements flagged in the offset buffers across the partition:
// element first + offsets_l[i] trades places with last - offsets_r[i].
template<class Iter>
inline void swap_offsets(Iter first, Iter last,
                         unsigned char* offsets_l, unsigned char* offsets_r,
                         int num, bool use_swaps) {
    typedef typename std::iterator_traits<Iter>::value_type T;
    if (use_swaps) {
        // This case is needed for the descending distribution, where we need
        // to have proper swapping for pdqsort to remain O(n).
        for (int i = 0; i < num; ++i) {
            std::iter_swap(first + offsets_l[i], last - offsets_r[i]);
        }
    } else if (num > 0) {
        // Cyclic permutation: instead of num full swaps (3 moves each),
        // hold one element in tmp and chain the rest (num*2 + 1 moves).
        Iter l = first + offsets_l[0]; Iter r = last - offsets_r[0];
        T tmp(PDQSORT_PREFER_MOVE(*l)); *l = PDQSORT_PREFER_MOVE(*r);
        for (int i = 1; i < num; ++i) {
            l = first + offsets_l[i]; *r = PDQSORT_PREFER_MOVE(*l);
            r = last - offsets_r[i]; *l = PDQSORT_PREFER_MOVE(*r);
        }
        *r = PDQSORT_PREFER_MOVE(tmp);
    }
}
// Partitions [begin, end) around pivot *begin using comparison function comp. Elements equal
// to the pivot are put in the right-hand partition. Returns the position of the pivot after
// partitioning and whether the passed sequence already was correctly partitioned. Assumes the
// pivot is a median of at least 3 elements and that [begin, end) is at least
// insertion_sort_threshold long. Uses branchless partitioning.
template<class Iter, class Compare>
inline std::pair<Iter, bool> partition_right_branchless(Iter begin, Iter end, Compare comp) {
    typedef typename std::iterator_traits<Iter>::value_type T;

    // Move pivot into local for speed.
    T pivot(PDQSORT_PREFER_MOVE(*begin));
    Iter first = begin;
    Iter last = end;

    // Find the first element greater than or equal than the pivot (the median of 3 guarantees
    // this exists).
    while (comp(*++first, pivot));

    // Find the first element strictly smaller than the pivot. We have to guard this search if
    // there was no element before *first.
    if (first - 1 == begin) while (first < last && !comp(*--last, pivot));
    else                    while (                !comp(*--last, pivot));

    // If the first pair of elements that should be swapped to partition are the same element,
    // the passed in sequence already was correctly partitioned.
    bool already_partitioned = first >= last;
    if (!already_partitioned) {
        std::iter_swap(first, last);
        ++first;
    }

    // The following branchless partitioning is derived from "BlockQuicksort: How Branch
    // Mispredictions dont affect Quicksort" by Stefan Edelkamp and Armin Weiss.
    // Instead of swapping misplaced elements one at a time (one branch per
    // element), record their offsets into fixed-size buffers with branch-free
    // increments, then swap them in bulk with swap_offsets().
    unsigned char offsets_l_storage[block_size + cacheline_size];
    unsigned char offsets_r_storage[block_size + cacheline_size];
    unsigned char* offsets_l = align_cacheline(offsets_l_storage);
    unsigned char* offsets_r = align_cacheline(offsets_r_storage);
    int num_l, num_r, start_l, start_r;
    num_l = num_r = start_l = start_r = 0;

    while (last - first > 2 * block_size) {
        // Fill up offset blocks with elements that are on the wrong side.
        // The loops are unrolled 8x by hand; `num_x += !comp(...)` advances
        // the write cursor only when the element is misplaced (no branch).
        if (num_l == 0) {
            start_l = 0;
            Iter it = first;
            for (unsigned char i = 0; i < block_size;) {
                offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
                offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
                offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
                offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
                offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
                offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
                offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
                offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
            }
        }
        if (num_r == 0) {
            start_r = 0;
            Iter it = last;
            for (unsigned char i = 0; i < block_size;) {
                offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
                offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
                offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
                offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
                offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
                offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
                offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
                offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
            }
        }

        // Swap elements and update block sizes and first/last boundaries.
        int num = std::min(num_l, num_r);
        swap_offsets(first, last, offsets_l + start_l, offsets_r + start_r,
                     num, num_l == num_r);
        num_l -= num; num_r -= num;
        start_l += num; start_r += num;
        if (num_l == 0) first += block_size;
        if (num_r == 0) last -= block_size;
    }

    // Fewer than 2*block_size elements remain; process the leftover partial
    // blocks. At most one of num_l/num_r can still be nonzero here.
    int l_size = 0, r_size = 0;
    int unknown_left = (int)(last - first) - ((num_r || num_l) ? block_size : 0);
    if (num_r) {
        // Handle leftover block by assigning the unknown elements to the other block.
        l_size = unknown_left;
        r_size = block_size;
    } else if (num_l) {
        l_size = block_size;
        r_size = unknown_left;
    } else {
        // No leftover block, split the unknown elements in two blocks.
        l_size = unknown_left/2;
        r_size = unknown_left - l_size;
    }

    // Fill offset buffers if needed.
    if (unknown_left && !num_l) {
        start_l = 0;
        Iter it = first;
        for (unsigned char i = 0; i < l_size;) {
            offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
        }
    }
    if (unknown_left && !num_r) {
        start_r = 0;
        Iter it = last;
        for (unsigned char i = 0; i < r_size;) {
            offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
        }
    }

    int num = std::min(num_l, num_r);
    swap_offsets(first, last, offsets_l + start_l, offsets_r + start_r, num, num_l == num_r);
    num_l -= num; num_r -= num;
    start_l += num; start_r += num;
    if (num_l == 0) first += l_size;
    if (num_r == 0) last -= r_size;

    // We have now fully identified [first, last)'s proper position. Swap the last elements.
    if (num_l) {
        offsets_l += start_l;
        while (num_l--) std::iter_swap(first + offsets_l[num_l], --last);
        first = last;
    }
    if (num_r) {
        offsets_r += start_r;
        while (num_r--) std::iter_swap(last - offsets_r[num_r], first), ++first;
        last = first;
    }

    // Put the pivot in the right place.
    Iter pivot_pos = first - 1;
    *begin = PDQSORT_PREFER_MOVE(*pivot_pos);
    *pivot_pos = PDQSORT_PREFER_MOVE(pivot);

    return std::make_pair(pivot_pos, already_partitioned);
}
// Partitions [begin, end) around pivot *begin using comparison function comp. Elements equal
// to the pivot are put in the right-hand partition. Returns the position of the pivot after
// partitioning and whether the passed sequence already was correctly partitioned. Assumes the
// pivot is a median of at least 3 elements and that [begin, end) is at least
// insertion_sort_threshold long.
template<class Iter, class Compare>
inline std::pair<Iter, bool> partition_right(Iter begin, Iter end, Compare comp) {
    typedef typename std::iterator_traits<Iter>::value_type T;

    // Move pivot into local for speed.
    T pivot(PDQSORT_PREFER_MOVE(*begin));

    Iter first = begin;
    Iter last = end;

    // Find the first element greater than or equal than the pivot (the median of 3 guarantees
    // this exists).
    while (comp(*++first, pivot));

    // Find the first element strictly smaller than the pivot. We have to guard this search if
    // there was no element before *first.
    if (first - 1 == begin) while (first < last && !comp(*--last, pivot));
    else                    while (                !comp(*--last, pivot));

    // If the first pair of elements that should be swapped to partition are the same element,
    // the passed in sequence already was correctly partitioned.
    bool already_partitioned = first >= last;

    // Keep swapping pairs of elements that are on the wrong side of the pivot. Previously
    // swapped pairs guard the searches, which is why the first iteration is special-cased
    // above.
    while (first < last) {
        std::iter_swap(first, last);
        while (comp(*++first, pivot));
        while (!comp(*--last, pivot));
    }

    // Put the pivot in the right place.
    Iter pivot_pos = first - 1;
    *begin = PDQSORT_PREFER_MOVE(*pivot_pos);
    *pivot_pos = PDQSORT_PREFER_MOVE(pivot);

    return std::make_pair(pivot_pos, already_partitioned);
}
// Similar function to the one above, except elements equal to the pivot are put to the left of
// the pivot and it doesn't check or return if the passed sequence already was partitioned.
// Since this is rarely used (the many equal case), and in that case pdqsort already has O(n)
// performance, no block quicksort is applied here for simplicity.
template<class Iter, class Compare>
inline Iter partition_left(Iter begin, Iter end, Compare comp) {
    typedef typename std::iterator_traits<Iter>::value_type T;

    T pivot(PDQSORT_PREFER_MOVE(*begin));
    Iter first = begin;
    Iter last = end;

    // Mirror image of partition_right's scans: search from the right for
    // an element <= pivot, from the left for one strictly greater.
    while (comp(pivot, *--last));

    if (last + 1 == end) while (first < last && !comp(pivot, *++first));
    else                 while (                !comp(pivot, *++first));

    while (first < last) {
        std::iter_swap(first, last);
        while (comp(pivot, *--last));
        while (!comp(pivot, *++first));
    }

    // Put the pivot in the right place.
    Iter pivot_pos = last;
    *begin = PDQSORT_PREFER_MOVE(*pivot_pos);
    *pivot_pos = PDQSORT_PREFER_MOVE(pivot);

    return pivot_pos;
}
// Core recursive driver. Sorts [begin, end); bad_allowed is the remaining
// budget of highly-unbalanced partitions before falling back to heapsort,
// and leftmost tells us whether *(begin - 1) exists as a sentinel (it does
// for every partition except the very first).
template<class Iter, class Compare, bool Branchless>
inline void pdqsort_loop(Iter begin, Iter end, Compare comp, int bad_allowed, bool leftmost = true) {
    typedef typename std::iterator_traits<Iter>::difference_type diff_t;

    // Use a while loop for tail recursion elimination.
    while (true) {
        diff_t size = end - begin;

        // Insertion sort is faster for small arrays.
        if (size < insertion_sort_threshold) {
            if (leftmost) insertion_sort(begin, end, comp);
            else unguarded_insertion_sort(begin, end, comp);
            return;
        }

        // Choose pivot as median of 3 or pseudomedian of 9.
        diff_t s2 = size / 2;
        if (size > ninther_threshold) {
            sort3(begin, begin + s2, end - 1, comp);
            sort3(begin + 1, begin + (s2 - 1), end - 2, comp);
            sort3(begin + 2, begin + (s2 + 1), end - 3, comp);
            sort3(begin + (s2 - 1), begin + s2, begin + (s2 + 1), comp);
            std::iter_swap(begin, begin + s2);
        } else sort3(begin + s2, begin, end - 1, comp);

        // If *(begin - 1) is the end of the right partition of a previous partition operation
        // there is no element in [begin, end) that is smaller than *(begin - 1). Then if our
        // pivot compares equal to *(begin - 1) we change strategy, putting equal elements in
        // the left partition, greater elements in the right partition. We do not have to
        // recurse on the left partition, since it's sorted (all equal).
        if (!leftmost && !comp(*(begin - 1), *begin)) {
            begin = partition_left(begin, end, comp) + 1;
            continue;
        }

        // Partition and get results.
        std::pair<Iter, bool> part_result =
            Branchless ? partition_right_branchless(begin, end, comp)
                       : partition_right(begin, end, comp);
        Iter pivot_pos = part_result.first;
        bool already_partitioned = part_result.second;

        // Check for a highly unbalanced partition.
        diff_t l_size = pivot_pos - begin;
        diff_t r_size = end - (pivot_pos + 1);
        bool highly_unbalanced = l_size < size / 8 || r_size < size / 8;

        // If we got a highly unbalanced partition we shuffle elements to break many patterns.
        if (highly_unbalanced) {
            // If we had too many bad partitions, switch to heapsort to guarantee O(n log n).
            if (--bad_allowed == 0) {
                std::make_heap(begin, end, comp);
                std::sort_heap(begin, end, comp);
                return;
            }

            // Deterministically swap a few elements near the partition
            // boundaries to defeat adversarial / patterned inputs.
            if (l_size >= insertion_sort_threshold) {
                std::iter_swap(begin, begin + l_size / 4);
                std::iter_swap(pivot_pos - 1, pivot_pos - l_size / 4);

                if (l_size > ninther_threshold) {
                    std::iter_swap(begin + 1, begin + (l_size / 4 + 1));
                    std::iter_swap(begin + 2, begin + (l_size / 4 + 2));
                    std::iter_swap(pivot_pos - 2, pivot_pos - (l_size / 4 + 1));
                    std::iter_swap(pivot_pos - 3, pivot_pos - (l_size / 4 + 2));
                }
            }
            if (r_size >= insertion_sort_threshold) {
                std::iter_swap(pivot_pos + 1, pivot_pos + (1 + r_size / 4));
                std::iter_swap(end - 1, end - r_size / 4);

                if (r_size > ninther_threshold) {
                    std::iter_swap(pivot_pos + 2, pivot_pos + (2 + r_size / 4));
                    std::iter_swap(pivot_pos + 3, pivot_pos + (3 + r_size / 4));
                    std::iter_swap(end - 2, end - (1 + r_size / 4));
                    std::iter_swap(end - 3, end - (2 + r_size / 4));
                }
            }
        } else {
            // If we were decently balanced and we tried to sort an already partitioned
            // sequence try to use insertion sort.
            if (already_partitioned && partial_insertion_sort(begin, pivot_pos, comp)
                                    && partial_insertion_sort(pivot_pos + 1, end, comp)) return;
        }

        // Sort the left partition first using recursion and do tail recursion elimination for
        // the right-hand partition.
        pdqsort_loop<Iter, Compare, Branchless>(begin, pivot_pos, comp, bad_allowed, leftmost);
        begin = pivot_pos + 1;
        leftmost = false;
    }
}
}
// Public entry point: sorts [begin, end) with comparator comp. In C++11+
// the branchless partitioning path is selected automatically when comp is
// std::less/std::greater over an arithmetic value type; the log2 of the
// size is the budget of bad partitions before the heapsort fallback.
template<class Iter, class Compare>
inline void pdqsort(Iter begin, Iter end, Compare comp) {
    if (begin == end) return;

#if __cplusplus >= 201103L
    pdqsort_detail::pdqsort_loop<Iter, Compare,
        pdqsort_detail::is_default_compare<typename std::decay<Compare>::type>::value &&
        std::is_arithmetic<typename std::iterator_traits<Iter>::value_type>::value>(
        begin, end, comp, pdqsort_detail::log2(end - begin));
#else
    pdqsort_detail::pdqsort_loop<Iter, Compare, false>(
        begin, end, comp, pdqsort_detail::log2(end - begin));
#endif
}
// Convenience overload: sorts [begin, end) in ascending order by
// delegating to the comparator overload with the element type's std::less.
template<class Iter>
inline void pdqsort(Iter begin, Iter end) {
    pdqsort(begin, end,
            std::less<typename std::iterator_traits<Iter>::value_type>());
}
// Like pdqsort(), but forces the branchless partitioning path. The caller
// is responsible for ensuring comp is cheap/branch-free enough for this
// to be a win (e.g. a plain comparison of scalars).
template<class Iter, class Compare>
inline void pdqsort_branchless(Iter begin, Iter end, Compare comp) {
    if (begin == end) return;
    pdqsort_detail::pdqsort_loop<Iter, Compare, true>(
        begin, end, comp, pdqsort_detail::log2(end - begin));
}
// Convenience overload: branchless pdqsort in ascending order, delegating
// to the comparator overload with the element type's std::less.
template<class Iter>
inline void pdqsort_branchless(Iter begin, Iter end) {
    pdqsort_branchless(begin, end,
                       std::less<typename std::iterator_traits<Iter>::value_type>());
}
#undef PDQSORT_PREFER_MOVE
#endif /* __cplusplus */
COSMOPOLITAN_C_START_
void pdqsort_int64(int64_t *, size_t);
void pdqsort_branchless_int64(int64_t *, size_t);
COSMOPOLITAN_C_END_
#endif /* PDQSORT_H */

308
third_party/libcxx/rmi.h vendored Normal file
View file

@ -0,0 +1,308 @@
// -*- c++ -*-
// clang-format off
#ifndef COSMOPOLITAN_THIRD_PARTY_LEARNED_SORT_RMI_H_
#define COSMOPOLITAN_THIRD_PARTY_LEARNED_SORT_RMI_H_
#include "third_party/libcxx/iostream"
#include "third_party/libcxx/vector"
using namespace std;
namespace learned_sort {
// Packs the key and its respective scaled CDF value
template <typename T>
struct training_point {
  // Sampled key from the input.
  T x;
  // Empirical CDF value assigned to x during training (set from i/SAMPLE_SZ
  // by the trainer, so it starts in [0, 1) before extrapolation).
  double y;
};

// Represents linear models
// A one-dimensional affine predictor: predicted rank = slope * x + intercept.
struct linear_model {
  double slope = 0;
  double intercept = 0;
};
// An implementation of a 2-layer RMI model
template <class T>
class TwoLayerRMI {
public:
// CDF model hyperparameters
struct Params {
// Member fields
long fanout;
float sampling_rate;
long threshold;
long num_leaf_models;
// Default hyperparameters
static constexpr long DEFAULT_FANOUT = 1e3;
static constexpr float DEFAULT_SAMPLING_RATE = .01;
static constexpr long DEFAULT_THRESHOLD = 100;
static constexpr long DEFAULT_NUM_LEAF_MODELS = 1000;
static constexpr long MIN_SORTING_SIZE = 1e4;
// Default constructor
Params() {
this->fanout = DEFAULT_FANOUT;
this->sampling_rate = DEFAULT_SAMPLING_RATE;
this->threshold = DEFAULT_THRESHOLD;
this->num_leaf_models = DEFAULT_NUM_LEAF_MODELS;
}
// Constructor with custom hyperparameter values
Params(float sampling_rate, float overallocation, long fanout,
long batch_sz, long threshold) {
this->fanout = fanout;
this->sampling_rate = sampling_rate;
this->threshold = threshold;
this->num_leaf_models = DEFAULT_NUM_LEAF_MODELS;
}
};
// Member variables of the CDF model
bool trained;
linear_model root_model;
vector<linear_model> leaf_models;
vector<T> training_sample;
Params hp;
bool enable_dups_detection;
// CDF model constructor
TwoLayerRMI(Params p) {
this->trained = false;
this->hp = p;
this->leaf_models.resize(p.num_leaf_models);
this->enable_dups_detection = true;
}
// Pretty-printing function
void print() {
printf("[0][0]: slope=%0.5f; intercept=%0.5f;\n", root_model.slope,
root_model.intercept);
for (int model_idx = 0; model_idx < hp.num_leaf_models; ++model_idx) {
printf("[%i][1]: slope=%0.5f; intercept=%0.5f;\n", model_idx,
leaf_models[model_idx].slope, leaf_models[model_idx].intercept);
}
cout << "-----------------------------" << endl;
}
/**
* @brief Train a CDF function with an RMI architecture, using linear spline
* interpolation.
*
* @param begin Random-access iterators to the initial position of the
* sequence to be used for sorting. The range used is [begin,end), which
* contains all the elements between first and last, including the element
* pointed by first but not the element pointed by last.
* @param end Random-access iterators to the last position of the sequence to
* be used for sorting. The range used is [begin,end), which contains all the
* elements between first and last, including the element pointed by first but
* not the element pointed by last.
* @return true if the model was trained successfully, false otherwise.
*/
bool train(typename vector<T>::iterator begin,
           typename vector<T>::iterator end) {
  // Fits the two-layer RMI (one root linear model feeding
  // hp.num_leaf_models leaf linear models) on a sorted sample of
  // [begin, end). Returns false when the input has too few unique keys
  // (fewer than 2 per leaf model) for training to make sense.

  // Determine input size
  const long INPUT_SZ = std::distance(begin, end);

  // Validate hyperparameters, falling back to the documented defaults
  // (with a warning on stderr) when a value is out of range for this
  // input size.
  if (this->hp.fanout >= INPUT_SZ) {
    this->hp.fanout = TwoLayerRMI<T>::Params::DEFAULT_FANOUT;
    cerr << "\33[93;1mWARNING\33[0m: Invalid fanout. Using default ("
         << TwoLayerRMI<T>::Params::DEFAULT_FANOUT << ")." << endl;
  }
  if (this->hp.sampling_rate <= 0 or this->hp.sampling_rate > 1) {
    this->hp.sampling_rate = TwoLayerRMI<T>::Params::DEFAULT_SAMPLING_RATE;
    cerr << "\33[93;1mWARNING\33[0m: Invalid sampling rate. Using default ("
         << TwoLayerRMI<T>::Params::DEFAULT_SAMPLING_RATE << ")." << endl;
  }
  if (this->hp.threshold <= 0 or this->hp.threshold >= INPUT_SZ or
      this->hp.threshold >= INPUT_SZ / this->hp.fanout) {
    this->hp.threshold = TwoLayerRMI<T>::Params::DEFAULT_THRESHOLD;
    cerr << "\33[93;1mWARNING\33[0m: Invalid threshold. Using default ("
         << TwoLayerRMI<T>::Params::DEFAULT_THRESHOLD << ")." << endl;
  }

  // Initialize the CDF model: training_data[layer][model] holds the
  // (key, cdf) points used to fit each linear model.
  static const long NUM_LAYERS = 2;
  vector<vector<vector<training_point<T> > > > training_data(NUM_LAYERS);
  for (long layer_idx = 0; layer_idx < NUM_LAYERS; ++layer_idx) {
    training_data[layer_idx].resize(hp.num_leaf_models);
  }

  //----------------------------------------------------------//
  //                           SAMPLE                          //
  //----------------------------------------------------------//

  // Determine sample size. SAMPLE_SZ never exceeds INPUT_SZ, so the
  // stride computed below is always >= 1.
  const long SAMPLE_SZ = std::min<long>(
      INPUT_SZ, std::max<long>(this->hp.sampling_rate * INPUT_SZ,
                               TwoLayerRMI<T>::Params::MIN_SORTING_SIZE));
  // Create a sample array
  this->training_sample.reserve(SAMPLE_SZ);
  // Start sampling: take every offset-th element of the input.
  long offset = static_cast<long>(1. * INPUT_SZ / SAMPLE_SZ);
  for (auto i = begin; i < end; i += offset) {
    // NOTE: We don't directly assign SAMPLE_SZ to this->training_sample_sz
    // to avoid issues with divisibility
    this->training_sample.push_back(*i);
  }
  // Sort the sample (keys are ordered with operator<)
  std::sort(this->training_sample.begin(), this->training_sample.end());
  // Count the number of unique keys
  auto sample_cpy = this->training_sample;
  long num_unique_elms = std::distance(
      sample_cpy.begin(), std::unique(sample_cpy.begin(), sample_cpy.end()));
  // Stop early if the array has very few unique values. We need at least 2
  // unique training examples per leaf model.
  if (num_unique_elms < 2 * this->hp.num_leaf_models) {
    return false;
  } else if (num_unique_elms > .9 * training_sample.size()) {
    // Mostly-unique sample: duplicate detection isn't worth its overhead.
    this->enable_dups_detection = false;
  }

  //----------------------------------------------------------//
  //                    TRAIN THE MODELS                       //
  //----------------------------------------------------------//

  // Populate the training data for the root model: the i-th smallest
  // sampled key maps to its empirical CDF value i / SAMPLE_SZ.
  // NOTE(review): assumes training_sample holds at least SAMPLE_SZ
  // elements; this follows from offset = floor(INPUT_SZ / SAMPLE_SZ),
  // but confirm if the sampling loop changes.
  for (long i = 0; i < SAMPLE_SZ; ++i) {
    training_data[0][0].push_back(
        {this->training_sample[i], 1. * i / SAMPLE_SZ});
  }

  // Train the root model using linear interpolation
  auto *current_training_data = &training_data[0][0];
  linear_model *current_model = &(this->root_model);
  // Find the min and max values in the training set
  training_point<T> min = current_training_data->front();
  training_point<T> max = current_training_data->back();
  // Calculate the slope and intercept terms, assuming min.y = 0 and max.y
  current_model->slope = 1. / (max.x - min.x);
  current_model->intercept = -current_model->slope * min.x;
  // Extrapolate for the number of models in the next layer, so the root's
  // output directly indexes a leaf model.
  current_model->slope *= this->hp.num_leaf_models - 1;
  current_model->intercept *= this->hp.num_leaf_models - 1;

  // Populate the training data for the next layer
  for (const auto &d : *current_training_data) {
    // Predict the model index in next layer
    long rank = current_model->slope * d.x + current_model->intercept;
    // Normalize the rank between 0 and the number of models in the next layer
    rank = std::max(static_cast<long>(0),
                    std::min(this->hp.num_leaf_models - 1, rank));
    // Place the data in the predicted training bucket
    training_data[1][rank].push_back(d);
  }

  // Train the leaf models. Empty buckets are patched with fictive points
  // taken from the previous bucket so later buckets always have a valid
  // left endpoint to interpolate from.
  for (long model_idx = 0; model_idx < this->hp.num_leaf_models;
       ++model_idx) {
    // Update iterator variables
    current_training_data = &training_data[1][model_idx];
    current_model = &(this->leaf_models[model_idx]);
    // Interpolate the min points in the training buckets
    if (model_idx == 0) {
      // The current model is the first model in the current layer
      if (current_training_data->size() < 2) {
        // Case 1: The first model in this layer is empty
        current_model->slope = 0;
        current_model->intercept = 0;
        // Insert a fictive training point to avoid propagating more than one
        // empty initial models.
        training_point<T> tp;
        tp.x = 0;
        tp.y = 0;
        current_training_data->push_back(tp);
      } else {
        // Case 2: The first model in this layer is not empty
        min = current_training_data->front();
        max = current_training_data->back();
        // Hallucinating as if min.y = 0
        current_model->slope = (1. * max.y) / (max.x - min.x);
        current_model->intercept = min.y - current_model->slope * min.x;
      }
    } else if (model_idx == this->hp.num_leaf_models - 1) {
      if (current_training_data->empty()) {
        // Case 3: The final model in this layer is empty, so predict the
        // top of the CDF range (1) as a constant.
        current_model->slope = 0;
        current_model->intercept = 1;
      } else {
        // Case 4: The last model in this layer is not empty
        min = training_data[1][model_idx - 1].back();
        max = current_training_data->back();
        // Hallucinating as if max.y = 1
        current_model->slope = (1. - min.y) / (max.x - min.x);
        current_model->intercept = min.y - current_model->slope * min.x;
      }
    } else {
      // The current model is not the first model in the current layer
      if (current_training_data->empty()) {
        // Case 5: The intermediate model in this layer is empty. Predict
        // the previous bucket's final y as a constant; if the previous
        // model was empty too, this reads the fictive training point
        // inserted there.
        current_model->slope = 0;
        current_model->intercept = training_data[1][model_idx - 1]
                                       .back()
                                       .y;
        // Insert a fictive training point to avoid propagating more than one
        // empty initial models.
        // NOTE: This will _NOT_ lead to DIV/0 from identical x's and y's,
        // because the chain of fictive points is built working backwards.
        training_point<T> tp;
        tp.x = training_data[1][model_idx - 1].back().x;
        tp.y = training_data[1][model_idx - 1].back().y;
        current_training_data->push_back(tp);
      } else {
        // Case 6: The intermediate leaf model is not empty
        min = training_data[1][model_idx - 1].back();
        max = current_training_data->back();
        current_model->slope = (max.y - min.y) / (max.x - min.x);
        current_model->intercept = min.y - current_model->slope * min.x;
      }
    }
  }

  // NOTE:
  // The last stage (layer) of this model contains weights that predict the
  // CDF of the keys (i.e. Range is [0-1]) When using this model to predict
  // the position of the keys in the sorted order, you MUST scale the weights
  // of the last layer to whatever range you are predicting for. The inner
  // layers of the model have already been extrapolated to the length of the
  // stage.
  //
  // This is a design choice to help with the portability of the model.
  //
  this->trained = true;

  return true;
}
};
} // namespace learned_sort
#endif /* COSMOPOLITAN_THIRD_PARTY_LEARNED_SORT_RMI_H_ */

22
third_party/libcxx/ska_sort.cc vendored Normal file
View file

@@ -0,0 +1,22 @@
/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8-*-│
vi: set net ft=c++ ts=2 sts=2 sw=2 fenc=utf-8 :vi
Copyright 2023 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "third_party/libcxx/ska_sort.h"
#include "third_party/libcxx/ska_sort.h"
// Sorts the first n elements of A in ascending order by delegating to
// the generic ska_sort over the half-open range [A, A + n).
void ska_sort_int64(int64_t* A, size_t n) {
  int64_t* first = A;
  int64_t* last = A + n;
  ska_sort(first, last);
}

1457
third_party/libcxx/ska_sort.h vendored Normal file

File diff suppressed because it is too large Load diff

42
third_party/libcxx/utils.h vendored Normal file
View file

@@ -0,0 +1,42 @@
// -*- c++ -*-
// clang-format off
#ifndef COSMOPOLITAN_THIRD_PARTY_LEARNED_SORT_UTILS_H_
#define COSMOPOLITAN_THIRD_PARTY_LEARNED_SORT_UTILS_H_
#include "third_party/libcxx/iterator"
namespace learned_sort {
namespace utils {
// Helper functions for working with unsigned types
// Identity key extractor: yields its argument unchanged. Serves as the
// default "key" for values that are their own sort key.
template <typename T>
inline T _get_key(T value) {
  return value;
}
// Sorts [begin, end) in place in ascending order using insertion sort.
// Stable; O(n^2) comparisons in the worst case, O(n) when nearly sorted.
//
// Fixes vs. the previous version:
//  - the shift loop tests `j != begin` BEFORE reading j[-1], so the
//    iterator is never moved one-before-begin (which is undefined
//    behavior for pointers/contiguous iterators at the start of their
//    storage);
//  - the element count is no longer funneled through `size_t` (where a
//    `<= 0` check silently degenerates to `== 0`);
//  - `key` is constructed per iteration, so T need not be
//    default-constructible.
template <class RandomIt>
void insertion_sort(RandomIt begin, RandomIt end) {
  // Determine the data type
  typedef typename std::iterator_traits<RandomIt>::value_type T;
  // Empty and single-element ranges are already sorted.
  if (std::distance(begin, end) < 2) return;
  for (auto i = begin + 1; i != end; ++i) {
    T key = i[0];
    auto j = i;
    // Shift elements greater than `key` one slot to the right.
    while (j != begin && j[-1] > key) {
      j[0] = j[-1];
      --j;
    }
    j[0] = key;
  }
}
} // namespace utils
} // namespace learned_sort
#endif /* COSMOPOLITAN_THIRD_PARTY_LEARNED_SORT_UTILS_H_ */