diff --git a/CMakeLists.txt b/CMakeLists.txt
index 538072798..f49a052e3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -231,7 +231,6 @@ set(qjs_sources
     libregexp.c
     libunicode.c
     quickjs.c
-    xsum.c
 )
 
 if(QJS_BUILD_LIBC)
diff --git a/amalgam.js b/amalgam.js
index 8e2f9da31..028d77985 100644
--- a/amalgam.js
+++ b/amalgam.js
@@ -18,8 +18,6 @@ const quickjs_h = loadFile("quickjs.h")
 const quickjs_libc_c = loadFile("quickjs-libc.c")
 const quickjs_libc_h = loadFile("quickjs-libc.h")
 const quickjs_opcode_h = loadFile("quickjs-opcode.h")
-const xsum_c = loadFile("xsum.c")
-const xsum_h = loadFile("xsum.h")
 const gen_builtin_array_fromasync_h = loadFile("builtin-array-fromasync.h")
 
 let source = "#if defined(QJS_BUILD_LIBC) && defined(__linux__) && !defined(_GNU_SOURCE)\n"
@@ -32,14 +30,12 @@ let source = "#if defined(QJS_BUILD_LIBC) && defined(__linux__) && !defined(_GNU
            + libunicode_h // exports lre_is_id_start, used by libregexp.h
            + libregexp_h
            + libunicode_table_h
-           + xsum_h
            + quickjs_h
            + quickjs_c
            + cutils_c
            + dtoa_c
            + libregexp_c
            + libunicode_c
-           + xsum_c
            + "#ifdef QJS_BUILD_LIBC\n"
            + quickjs_libc_h
            + quickjs_libc_c
diff --git a/fuzz.c b/fuzz.c
index ee43f3ff0..fca8fa413 100644
--- a/fuzz.c
+++ b/fuzz.c
@@ -4,7 +4,6 @@
 #include "cutils.c"
 #include "libregexp.c"
 #include "libunicode.c"
-#include "xsum.c"
 #include <stdlib.h>
 
 int LLVMFuzzerTestOneInput(const uint8_t *buf, size_t len)
diff --git a/meson.build b/meson.build
index 6c1d9f396..533267791 100644
--- a/meson.build
+++ b/meson.build
@@ -142,7 +142,6 @@ qjs_srcs = files(
   'libregexp.c',
   'libunicode.c',
   'quickjs.c',
-  'xsum.c'
 )
 qjs_hdrs = files(
   'quickjs.h',
diff --git a/quickjs.c b/quickjs.c
index 8c00d951f..884375c15 100644
--- a/quickjs.c
+++ b/quickjs.c
@@ -47,7 +47,6 @@
 #include "list.h"
 #include "quickjs.h"
 #include "libregexp.h"
-#include "xsum.h"
 #include "dtoa.h"
 
 #if defined(EMSCRIPTEN) || defined(_MSC_VER)
@@ -11998,7 +11997,7 @@ static JSValue js_atof(JSContext *ctx, const char *str, const char **pp,
     bool buf_allocated = false;
     JSValue val;
     JSATODTempMem atod_mem;
-    
+
     /* optional separator between digits */
     sep = (flags & ATOD_ACCEPT_UNDERSCORES) ? '_' : 256;
     has_legacy_octal = false;
@@ -12799,7 +12798,7 @@ static JSValue js_dtoa2(JSContext *ctx,
     JSValue res;
     JSDTOATempMem dtoa_mem;
     len_max = js_dtoa_max_len(d, radix, n_digits, flags);
-    
+
     /* longer buffer may be used if radix != 10 */
     if (len_max > sizeof(static_buf) - 1) {
         tmp_buf = js_malloc(ctx, len_max + 1);
@@ -44667,93 +44666,249 @@ static JSValue js_math_clz32(JSContext *ctx, JSValueConst this_val,
     return js_int32(r);
 }
 
-typedef enum SumPreciseStateEnum {
+/* we add one extra limb to avoid having to test for overflows during the sum */
+#define SUM_PRECISE_ACC_LEN 34
+
+typedef enum {
     SUM_PRECISE_STATE_MINUS_ZERO,
-    SUM_PRECISE_STATE_NOT_A_NUMBER,
-    SUM_PRECISE_STATE_MINUS_INFINITY,
-    SUM_PRECISE_STATE_PLUS_INFINITY,
     SUM_PRECISE_STATE_FINITE,
+    SUM_PRECISE_STATE_INFINITY,
+    SUM_PRECISE_STATE_MINUS_INFINITY, /* must be after SUM_PRECISE_STATE_INFINITY */
+    SUM_PRECISE_STATE_NAN, /* must be after SUM_PRECISE_STATE_MINUS_INFINITY */
 } SumPreciseStateEnum;
 
+typedef struct {
+    uint64_t acc[SUM_PRECISE_ACC_LEN];
+    int n_limbs; /* acc is not necessarily normalized */
+    SumPreciseStateEnum state;
+} SumPreciseState;
+
+static void sum_precise_init(SumPreciseState *s)
+{
+    s->state = SUM_PRECISE_STATE_MINUS_ZERO;
+    s->acc[0] = 0;
+    s->n_limbs = 1;
+}
+
+#define ADDC64(res, carry_out, op1, op2, carry_in)      \
+do {                                                    \
+    uint64_t __v, __a, __k, __k1;                       \
+    __v = (op1);                                        \
+    __a = __v + (op2);                                  \
+    __k1 = __a < __v;                                   \
+    __k = (carry_in);                                   \
+    __a = __a + __k;                                    \
+    carry_out = (__a < __k) | __k1;                     \
+    res = __a;                                          \
+} while (0)
+
+static void sum_precise_add(SumPreciseState *s, double d)
+{
+    uint64_t a, m, a0, carry, acc_sign, a_sign;
+    int sgn, e, p, n, i;
+    unsigned shift;
+
+    a = float64_as_uint64(d);
+    sgn = a >> 63;
+    e = (a >> 52) & ((1 << 11) - 1);
+    m = a & (((uint64_t)1 << 52) - 1);
+    if (unlikely(e == 2047)) {
+        if (m == 0) {
+            /* +/- infinity */
+            if (s->state == SUM_PRECISE_STATE_NAN ||
+                (s->state == SUM_PRECISE_STATE_MINUS_INFINITY && !sgn) ||
+                (s->state == SUM_PRECISE_STATE_INFINITY && sgn)) {
+                s->state = SUM_PRECISE_STATE_NAN;
+            } else {
+                s->state = SUM_PRECISE_STATE_INFINITY + sgn;
+            }
+        } else {
+            /* NaN */
+            s->state = SUM_PRECISE_STATE_NAN;
+        }
+    } else if (e == 0) {
+        if (likely(m == 0)) {
+            /* zero */
+            if (s->state == SUM_PRECISE_STATE_MINUS_ZERO && !sgn)
+                s->state = SUM_PRECISE_STATE_FINITE;
+        } else {
+            /* subnormal */
+            p = 0;
+            shift = 0;
+            goto add;
+        }
+    } else {
+        m |= (uint64_t)1 << 52;
+        shift = e - 1;
+        p = shift / 64;
+        /* 'p' is the position of a0 in acc */
+        shift %= 64;
+    add:
+        if (s->state >= SUM_PRECISE_STATE_INFINITY)
+            return;
+        s->state = SUM_PRECISE_STATE_FINITE;
+        n = s->n_limbs;
+
+        acc_sign = (int64_t)s->acc[n - 1] >> 63;
+
+        /* sign extend acc */
+        for(i = n; i <= p; i++)
+            s->acc[i] = acc_sign;
+
+        carry = sgn;
+        a_sign = -sgn;
+        a0 = m << shift;
+        ADDC64(s->acc[p], carry, s->acc[p], a0 ^ a_sign, carry);
+        if (shift >= 12) {
+            p++;
+            if (p >= n)
+                s->acc[p] = acc_sign;
+            a0 = m >> (64 - shift);
+            ADDC64(s->acc[p], carry, s->acc[p], a0 ^ a_sign, carry);
+        }
+        p++;
+        if (p >= n) {
+            n = p;
+        } else {
+            /* carry */
+            for(i = p; i < n; i++) {
+                /* if 'a' positive: stop condition: carry = 0.
+                   if 'a' negative: stop condition: carry = 1. */
+                if (carry == sgn)
+                    goto done;
+                ADDC64(s->acc[i], carry, s->acc[i], a_sign, carry);
+            }
+        }
+
+        /* extend the accumulator if needed */
+        a0 = carry + acc_sign + a_sign;
+        /* -1 <= a0 <= 1 (if both acc and a are negative, carry is set) */
+        if (a0 != ((int64_t)s->acc[n - 1] >> 63)) {
+            s->acc[n++] = a0;
+        }
+    done:
+        s->n_limbs = n;
+    }
+}
+
+static double sum_precise_get_result(SumPreciseState *s)
+{
+    int n, shift, e, p, is_neg, i;
+    uint64_t m, addend, carry;
+
+    if (s->state != SUM_PRECISE_STATE_FINITE) {
+        switch(s->state) {
+        default:
+        case SUM_PRECISE_STATE_MINUS_ZERO:
+            return -0.0;
+        case SUM_PRECISE_STATE_INFINITY:
+            return INFINITY;
+        case SUM_PRECISE_STATE_MINUS_INFINITY:
+            return -INFINITY;
+        case SUM_PRECISE_STATE_NAN:
+            return NAN;
+        }
+    }
+
+    /* extract the sign and absolute value */
+    n = s->n_limbs;
+    is_neg = s->acc[n - 1] >> 63;
+    if (is_neg) {
+        /* acc = -acc */
+        carry = 1;
+        for(i = 0; i < n; i++) {
+            ADDC64(s->acc[i], carry, ~s->acc[i], 0, carry);
+        }
+    }
+    /* normalize */
+    while (n > 0 && s->acc[n - 1] == 0)
+        n--;
+    /* zero result. The spec tells it is always positive in the finite case */
+    if (n == 0)
+        return 0.0;
+    /* subnormal case */
+    if (n == 1 && s->acc[0] < ((uint64_t)1 << 52))
+        return uint64_as_float64(((uint64_t)is_neg << 63) | s->acc[0]);
+    /* normal case */
+    e = n * 64;
+    p = n - 1;
+    m = s->acc[p];
+    shift = clz64(m);
+    e = e - shift - 52;
+    if (shift != 0) {
+        m <<= shift;
+        if (p > 0) {
+            int shift1;
+            uint64_t nz;
+            p--;
+            shift1 = 64 - shift;
+            nz = s->acc[p] & (((uint64_t)1 << shift1) - 1);
+            m = m | (s->acc[p] >> shift1) | (nz != 0);
+        }
+    }
+    if ((m & ((1 << 10) - 1)) == 0) {
+        /* see if the LSB part is non zero for the final rounding  */
+        while (p > 0) {
+            p--;
+            if (s->acc[p] != 0) {
+                m |= 1;
+                break;
+            }
+        }
+    }
+    /* rounding to nearest with ties to even */
+    addend = (1 << 10) - 1 + ((m >> 11) & 1);
+    m = (m + addend) >> 11;
+    /* handle overflow in the rounding */
+    if (m == 0)
+        e++;
+    if (unlikely(e >= 2047)) {
+        /* infinity */
+        return uint64_as_float64(((uint64_t)is_neg << 63) | ((uint64_t)2047 << 52));
+    } else {
+        m &= (((uint64_t)1 << 52) - 1);
+        return uint64_as_float64(((uint64_t)is_neg << 63) | ((uint64_t)e << 52) | m);
+    }
+}
+
 static JSValue js_math_sumPrecise(JSContext *ctx, JSValueConst this_val,
                                   int argc, JSValueConst *argv)
 {
     JSValue iter, next, item, ret;
+    uint32_t tag;
     int done;
     double d;
-    xsum_small_accumulator acc;
-    SumPreciseStateEnum state;
+    SumPreciseState s_s, *s = &s_s;
 
-    iter = JS_GetIterator(ctx, argv[0], /*async*/false);
+    iter = JS_GetIterator(ctx, argv[0], /*is_async*/false);
     if (JS_IsException(iter))
         return JS_EXCEPTION;
     ret = JS_EXCEPTION;
     next = JS_GetProperty(ctx, iter, JS_ATOM_next);
     if (JS_IsException(next))
         goto fail;
-    xsum_small_init(&acc);
-    state = SUM_PRECISE_STATE_MINUS_ZERO;
+    sum_precise_init(s);
     for (;;) {
         item = JS_IteratorNext(ctx, iter, next, 0, NULL, &done);
         if (JS_IsException(item))
             goto fail;
         if (done)
-            break; // item == JS_UNDEFINED
-        switch (JS_VALUE_GET_TAG(item)) {
-        default:
+            break;
+        tag = JS_VALUE_GET_TAG(item);
+        if (JS_TAG_IS_FLOAT64(tag)) {
+            d = JS_VALUE_GET_FLOAT64(item);
+        } else if (tag == JS_TAG_INT) {
+            d = JS_VALUE_GET_INT(item);
+        } else {
             JS_FreeValue(ctx, item);
             JS_ThrowTypeError(ctx, "not a number");
+            JS_IteratorClose(ctx, iter, /*is_exception_pending*/true);
             goto fail;
-        case JS_TAG_INT:
-            d = JS_VALUE_GET_INT(item);
-            break;
-        case JS_TAG_FLOAT64:
-            d = JS_VALUE_GET_FLOAT64(item);
-            break;
-        }
-
-        if (state != SUM_PRECISE_STATE_NOT_A_NUMBER) {
-            if (isnan(d))
-                state = SUM_PRECISE_STATE_NOT_A_NUMBER;
-            else if (!isfinite(d) && d > 0.0)
-                if (state == SUM_PRECISE_STATE_MINUS_INFINITY)
-                    state = SUM_PRECISE_STATE_NOT_A_NUMBER;
-                else
-                    state = SUM_PRECISE_STATE_PLUS_INFINITY;
-            else if (!isfinite(d) && d < 0.0)
-                if (state == SUM_PRECISE_STATE_PLUS_INFINITY)
-                    state = SUM_PRECISE_STATE_NOT_A_NUMBER;
-                else
-                    state = SUM_PRECISE_STATE_MINUS_INFINITY;
-            else if (!(d == 0.0 && signbit(d)) && (state == SUM_PRECISE_STATE_MINUS_ZERO || state == SUM_PRECISE_STATE_FINITE)) {
-                state = SUM_PRECISE_STATE_FINITE;
-                xsum_small_add1(&acc, d);
-            }
         }
+        sum_precise_add(s, d);
     }
-
-    switch (state) {
-    case SUM_PRECISE_STATE_NOT_A_NUMBER:
-        d = NAN;
-        break;
-    case SUM_PRECISE_STATE_MINUS_INFINITY:
-        d = -INFINITY;
-        break;
-    case SUM_PRECISE_STATE_PLUS_INFINITY:
-        d = INFINITY;
-        break;
-    case SUM_PRECISE_STATE_MINUS_ZERO:
-        d = -0.0;
-        break;
-    case SUM_PRECISE_STATE_FINITE:
-        d = xsum_small_round(&acc);
-        break;
-    default:
-        abort();
-    }
-    ret = js_float64(d);
+    ret = js_float64(sum_precise_get_result(s));
 fail:
-    JS_IteratorClose(ctx, iter, JS_IsException(ret));
     JS_FreeValue(ctx, iter);
     JS_FreeValue(ctx, next);
     return ret;
diff --git a/tests/test_builtin.js b/tests/test_builtin.js
index 039328c68..74c9ec347 100644
--- a/tests/test_builtin.js
+++ b/tests/test_builtin.js
@@ -392,10 +392,11 @@ function test_math()
     assert(Math.imul((-2)**31, (-2)**31), 0);
     assert(Math.imul(2**31-1, 2**31-1), 1);
     assert(Math.fround(0.1), 0.10000000149011612);
-    assert(Math.hypot() == 0);
-    assert(Math.hypot(-2) == 2);
-    assert(Math.hypot(3, 4) == 5);
+    assert(Math.hypot(), 0);
+    assert(Math.hypot(-2), 2);
+    assert(Math.hypot(3, 4), 5);
     assert(Math.abs(Math.hypot(3, 4, 5) - 7.0710678118654755) <= 1e-15);
+    assert(Math.sumPrecise([1,Number.EPSILON/2,Number.MIN_VALUE]), 1.0000000000000002);
 }
 
 function test_number()
diff --git a/xsum.c b/xsum.c
deleted file mode 100644
index 10be91cd9..000000000
--- a/xsum.c
+++ /dev/null
@@ -1,1122 +0,0 @@
-/* FUNCTIONS FOR EXACT SUMMATION. */
-
-/* Copyright 2015, 2018, 2021, 2024 Radford M. Neal
-
-   Permission is hereby granted, free of charge, to any person obtaining
-   a copy of this software and associated documentation files (the
-   "Software"), to deal in the Software without restriction, including
-   without limitation the rights to use, copy, modify, merge, publish,
-   distribute, sublicense, and/or sell copies of the Software, and to
-   permit persons to whom the Software is furnished to do so, subject to
-   the following conditions:
-
-   The above copyright notice and this permission notice shall be
-   included in all copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
-   LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-   OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-   WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-
-#include <string.h>
-#include <math.h>
-#include "xsum.h"
-
-
-/* ---------------------- IMPLEMENTATION ASSUMPTIONS ----------------------- */
-
-/* This code makes the following assumptions:
-
-     o The 'double' type is a IEEE-754 standard 64-bit floating-point value.
-
-     o The 'int64_t' and 'uint64_t' types exist, for 64-bit signed and
-       unsigned integers.
-
-     o The 'endianness' of 'double' and 64-bit integers is consistent
-       between these types - that is, looking at the bits of a 'double'
-       value as an 64-bit integer will have the expected result.
-
-     o Right shifts of a signed operand produce the results expected for
-       a two's complement representation.
-
-     o Rounding should be done in the "round to nearest, ties to even" mode.
-*/
-
-
-/* --------------------------- CONFIGURATION ------------------------------- */
-
-
-/* IMPLEMENTATION OPTIONS.  Can be set to either 0 or 1, whichever seems
-   to be fastest. */
-
-#define USE_SIMD 1          /* Use SIMD intrinsics (SSE2/AVX) if available?   */
-
-#define USE_MEMSET_SMALL 1  /* Use memset rather than a loop (for small mem)? */
-
-#define OPT_SMALL 0         /* Class of manual optimization for operations on */
-                            /*   small accumulator: 0 (none), 1, 2, 3 (SIMD)  */
-#define OPT_CARRY 1         /* Use manually optimized carry propagation?      */
-
-#define INLINE_SMALL 1      /* Inline more of the small accumulator routines? */
-                            /*   (Not currently used)                         */
-
-
-/* INCLUDE INTEL INTRINSICS IF USED AND AVAILABLE. */
-
-#if USE_SIMD && (__SSE2__ || __AVX__)
-# include <immintrin.h>
-#endif
-
-
-/* COPY A 64-BIT QUANTITY - DOUBLE TO 64-BIT INT OR VICE VERSA.  The
-   arguments are destination and source variables (not values). */
-
-#define COPY64(dst,src) memcpy(&(dst),&(src),sizeof(double))
-
-
-/* SET UP DEBUG FLAG.  It's a variable if debuging is enabled, and a
-   constant if disabled (so that no code will be generated then). */
-
-int xsum_debug = 0;
-
-#ifndef DEBUG
-# define xsum_debug 0
-#endif
-
-
-/* SET UP INLINE / NOINLINE MACROS. */
-
-#if __GNUC__
-# define INLINE inline __attribute__ ((always_inline))
-# define NOINLINE __attribute__ ((noinline))
-#else
-# define INLINE inline
-# define NOINLINE
-#endif
-
-
-/* ------------------------ INTERNAL ROUTINES ------------------------------- */
-
-
-/* ADD AN INF OR NAN TO A SMALL ACCUMULATOR.  This only changes the flags,
-   not the chunks in the accumulator, which retains the sum of the finite
-   terms (which is perhaps sometimes useful to access, though no function
-   to do so is defined at present).  A NaN with larger payload (seen as a
-   52-bit unsigned integer) takes precedence, with the sign of the NaN always
-   being positive.  This ensures that the order of summing NaN values doesn't
-   matter. */
-
-static NOINLINE void xsum_small_add_inf_nan
-                       (xsum_small_accumulator *restrict sacc, xsum_int ivalue)
-{
-  xsum_int mantissa;
-  double fltv;
-
-  mantissa = ivalue & XSUM_MANTISSA_MASK;
-
-  if (mantissa == 0) /* Inf */
-  { if (sacc->Inf == 0)
-    { /* no previous Inf */
-      sacc->Inf = ivalue;
-    }
-    else if (sacc->Inf != ivalue)
-    { /* previous Inf was opposite sign */
-      COPY64 (fltv, ivalue);
-      fltv = fltv - fltv;  /* result will be a NaN */
-      COPY64 (sacc->Inf, fltv);
-    }
-  }
-  else /* NaN */
-  { /* Choose the NaN with the bigger payload and clear its sign.  Using <=
-       ensures that we will choose the first NaN over the previous zero. */
-    if ((sacc->NaN & XSUM_MANTISSA_MASK) <= mantissa)
-    { sacc->NaN = ivalue & ~XSUM_SIGN_MASK;
-    }
-  }
-}
-
-
-/* PROPAGATE CARRIES TO NEXT CHUNK IN A SMALL ACCUMULATOR.  Needs to
-   be called often enough that accumulated carries don't overflow out
-   the top, as indicated by sacc->adds_until_propagate.  Returns the
-   index of the uppermost non-zero chunk (0 if number is zero).
-
-   After carry propagation, the uppermost non-zero chunk will indicate
-   the sign of the number, and will not be -1 (all 1s).  It will be in
-   the range -2^XSUM_LOW_MANTISSA_BITS to 2^XSUM_LOW_MANTISSA_BITS - 1.
-   Lower chunks will be non-negative, and in the range from 0 up to
-   2^XSUM_LOW_MANTISSA_BITS - 1. */
-
-static NOINLINE int xsum_carry_propagate (xsum_small_accumulator *restrict sacc)
-{
-  int i, u, uix;
-
-  /* Set u to the index of the uppermost non-zero (for now) chunk, or
-     return with value 0 if there is none. */
-
-# if OPT_CARRY
-
-  { u = XSUM_SCHUNKS-1;
-    switch (XSUM_SCHUNKS & 0x3)   /* get u to be a multiple of 4 minus one  */
-    {
-      case 3: if (sacc->chunk[u] != 0)
-              { goto found2;
-              }
-              u -= 1;                            /* XSUM_SCHUNKS is a */
-      case 2: if (sacc->chunk[u] != 0)           /* constant, so the  */
-              { goto found2;                     /* compiler will do  */
-              }                                  /* simple code here  */
-              u -= 1;
-      case 1: if (sacc->chunk[u] != 0)
-              { goto found2;
-              }
-              u -= 1;
-      case 0: ;
-    }
-
-    do  /* here, u should be a multiple of 4 minus one, and at least 3 */
-    {
-#     if USE_SIMD && __AVX__
-      { __m256i ch;
-        ch = _mm256_loadu_si256 ((__m256i *)(sacc->chunk+u-3));
-        if (!_mm256_testz_si256(ch,ch))
-        { goto found;
-        }
-        u -= 4;
-        if (u < 0)  /* never actually happens, because value of XSUM_SCHUNKS */
-        { break;    /*   is such that u < 0 occurs at end of do loop instead */
-        }
-        ch = _mm256_loadu_si256 ((__m256i *)(sacc->chunk+u-3));
-        if (!_mm256_testz_si256(ch,ch))
-        { goto found;
-        }
-        u -= 4;
-      }
-#     else
-      { if (sacc->chunk[u] | sacc->chunk[u-1]
-          | sacc->chunk[u-2] | sacc->chunk[u-3])
-        { goto found;
-        }
-        u -= 4;
-      }
-#     endif
-
-    } while (u >= 0);
-
-    uix = 0;
-    goto done;
-
-  found:
-    if (sacc->chunk[u] != 0)
-    { goto found2;
-    }
-    u -= 1;
-    if (sacc->chunk[u] != 0)
-    { goto found2;
-    }
-    u -= 1;
-    if (sacc->chunk[u] != 0)
-    { goto found2;
-    }
-    u -= 1;
-
-   found2: ;
-  }
-
-# else  /* Non-optimized search for uppermost non-zero chunk */
-
-  { for (u = XSUM_SCHUNKS-1; sacc->chunk[u] == 0; u--)
-    { if (u == 0)
-      {
-        uix = 0;
-        goto done;
-      }
-    }
-  }
-
-# endif
-
-  /* At this point, sacc->chunk[u] must be non-zero */
-
-  /* Carry propagate, starting at the low-order chunks.  Note that the
-     loop limit of u may be increased inside the loop. */
-
-  i = 0;     /* set to the index of the next non-zero chunck, from bottom */
-
-# if OPT_CARRY
-  {
-    /* Quickly skip over unused low-order chunks.  Done here at the start
-       on the theory that there are often many unused low-order chunks,
-       justifying some overhead to begin, but later stretches of unused
-       chunks may not be as large. */
-
-    int e = u-3;  /* go only to 3 before so won't access beyond chunk array */
-
-    do
-    {
-#     if USE_SIMD && __AVX__
-      { __m256i ch;
-        ch = _mm256_loadu_si256 ((__m256i *)(sacc->chunk+i));
-        if (!_mm256_testz_si256(ch,ch))
-        { break;
-        }
-        i += 4;
-        if (i >= e)
-        { break;
-        }
-        ch = _mm256_loadu_si256 ((__m256i *)(sacc->chunk+i));
-        if (!_mm256_testz_si256(ch,ch))
-        { break;
-        }
-      }
-#     else
-      { if (sacc->chunk[i] | sacc->chunk[i+1]
-          | sacc->chunk[i+2] | sacc->chunk[i+3])
-        { break;
-        }
-      }
-#     endif
-
-      i += 4;
-
-    } while (i <= e);
-  }
-# endif
-
-  uix = -1;  /* indicates that a non-zero chunk has not been found yet */
-
-  do
-  { xsum_schunk c;       /* Set to the chunk at index i (next non-zero one) */
-    xsum_schunk clow;    /* Low-order bits of c */
-    xsum_schunk chigh;   /* High-order bits of c */
-
-    /* Find the next non-zero chunk, setting i to its index, or break out
-       of loop if there is none.  Note that the chunk at index u is not
-       necessarily non-zero - it was initially, but u or the chunk at u
-       may have changed. */
-
-#   if OPT_CARRY
-    {
-      c = sacc->chunk[i];
-      if (c != 0)
-      { goto nonzero;
-      }
-      i += 1;
-      if (i > u)
-      { break;  /* reaching here is only possible when u == i initially, */
-      }         /*   with the last add to a chunk having changed it to 0 */
-
-      for (;;)
-      { c = sacc->chunk[i];
-        if (c != 0)
-        { goto nonzero;
-        }
-        i += 1;
-        c = sacc->chunk[i];
-        if (c != 0)
-        { goto nonzero;
-        }
-        i += 1;
-        c = sacc->chunk[i];
-        if (c != 0)
-        { goto nonzero;
-        }
-        i += 1;
-        c = sacc->chunk[i];
-        if (c != 0)
-        { goto nonzero;
-        }
-        i += 1;
-      }
-    }
-#   else
-    {
-      do
-      { c = sacc->chunk[i];
-        if (c != 0)
-        { goto nonzero;
-        }
-        i += 1;
-      } while (i <= u);
-
-      break;
-    }
-#   endif
-
-    /* Propagate possible carry from this chunk to next chunk up. */
-
-  nonzero:
-    chigh = c >> XSUM_LOW_MANTISSA_BITS;
-    if (chigh == 0)
-    { uix = i;
-      i += 1;
-      continue;  /* no need to change this chunk */
-    }
-
-    if (u == i)
-    { if (chigh == -1)
-      { uix = i;
-        break;   /* don't propagate -1 into the region of all zeros above */
-      }
-      u = i+1;   /* we will change chunk[u+1], so we'll need to look at it */
-    }
-
-    clow = c & XSUM_LOW_MANTISSA_MASK;
-    if (clow != 0)
-    { uix = i;
-    }
-
-    /* We now change chunk[i] and add to chunk[i+1]. Note that i+1 should be
-       in range (no bigger than XSUM_CHUNKS-1) if summing memory, since
-       the number of chunks is big enough to hold any sum, and we do not
-       store redundant chunks with values 0 or -1 above previously non-zero
-       chunks.  But other add operations might cause overflow, in which
-       case we produce a NaN with all 1s as payload.  (We can't reliably produce
-       an Inf of the right sign.) */
-
-    sacc->chunk[i] = clow;
-    if (i+1 >= XSUM_SCHUNKS)
-    { xsum_small_add_inf_nan (sacc,
-        ((xsum_int)XSUM_EXP_MASK << XSUM_MANTISSA_BITS) | XSUM_MANTISSA_MASK);
-      u = i;
-    }
-    else
-    { sacc->chunk[i+1] += chigh;  /* note: this could make this chunk be zero */
-    }
-
-    i += 1;
-
-  } while (i <= u);
-
-  /* Check again for the number being zero, since carry propagation might
-     have created zero from something that initially looked non-zero. */
-
-  if (uix < 0)
-  {
-    uix = 0;
-    goto done;
-  }
-
-  /* While the uppermost chunk is negative, with value -1, combine it with
-     the chunk below (if there is one) to produce the same number but with
-     one fewer non-zero chunks. */
-
-  while (sacc->chunk[uix] == -1 && uix > 0)
-  { /* Left shift of a negative number is undefined according to the standard,
-       so do a multiply - it's all presumably constant-folded by the compiler.*/
-    sacc->chunk[uix-1] += ((xsum_schunk) -1)
-                             * (((xsum_schunk) 1) << XSUM_LOW_MANTISSA_BITS);
-    sacc->chunk[uix] = 0;
-    uix -= 1;
-  }
-
-  /* We can now add one less than the total allowed terms before the
-     next carry propagate. */
-
-done:
-  sacc->adds_until_propagate = XSUM_SMALL_CARRY_TERMS-1;
-
-  /* Return index of uppermost non-zero chunk. */
-
-  return uix;
-}
-
-
-/* ------------------------ EXTERNAL ROUTINES ------------------------------- */
-
-
-/* INITIALIZE A SMALL ACCUMULATOR TO ZERO. */
-
-void xsum_small_init (xsum_small_accumulator *restrict sacc)
-{
-  sacc->adds_until_propagate = XSUM_SMALL_CARRY_TERMS;
-  sacc->Inf = sacc->NaN = 0;
-# if USE_MEMSET_SMALL
-  { memset (sacc->chunk, 0, XSUM_SCHUNKS * sizeof(xsum_schunk));
-  }
-# elif USE_SIMD && __AVX__ && XSUM_SCHUNKS==67
-  { xsum_schunk *ch = sacc->chunk;
-    __m256i z = _mm256_setzero_si256();
-    _mm256_storeu_si256 ((__m256i *)(ch+0), z);
-    _mm256_storeu_si256 ((__m256i *)(ch+4), z);
-    _mm256_storeu_si256 ((__m256i *)(ch+8), z);
-    _mm256_storeu_si256 ((__m256i *)(ch+12), z);
-    _mm256_storeu_si256 ((__m256i *)(ch+16), z);
-    _mm256_storeu_si256 ((__m256i *)(ch+20), z);
-    _mm256_storeu_si256 ((__m256i *)(ch+24), z);
-    _mm256_storeu_si256 ((__m256i *)(ch+28), z);
-    _mm256_storeu_si256 ((__m256i *)(ch+32), z);
-    _mm256_storeu_si256 ((__m256i *)(ch+36), z);
-    _mm256_storeu_si256 ((__m256i *)(ch+40), z);
-    _mm256_storeu_si256 ((__m256i *)(ch+44), z);
-    _mm256_storeu_si256 ((__m256i *)(ch+48), z);
-    _mm256_storeu_si256 ((__m256i *)(ch+52), z);
-    _mm256_storeu_si256 ((__m256i *)(ch+56), z);
-    _mm256_storeu_si256 ((__m256i *)(ch+60), z);
-    _mm_storeu_si128    ((__m128i *)(ch+64), _mm256_castsi256_si128(z));
-    _mm_storeu_si64     (ch+66, _mm256_castsi256_si128(z));
-  }
-# else
-  { xsum_schunk *p;
-    int n;
-    p = sacc->chunk;
-    n = XSUM_SCHUNKS;
-    do { *p++ = 0; n -= 1; } while (n > 0);
-  }
-# endif
-}
-
-
-/* ADD ONE NUMBER TO A SMALL ACCUMULATOR ASSUMING NO CARRY PROPAGATION REQ'D.
-   This function is declared INLINE regardless of the setting of INLINE_SMALL
-   and for good performance it must be inlined by the compiler (otherwise the
-   procedure call overhead will result in substantial inefficiency). */
-
-static INLINE void xsum_add1_no_carry (xsum_small_accumulator *restrict sacc,
-                                       xsum_flt value)
-{
-  xsum_int ivalue;
-  xsum_int mantissa;
-  xsum_expint exp, low_exp, high_exp;
-  xsum_schunk *chunk_ptr;
-
-  /* Extract exponent and mantissa.  Split exponent into high and low parts. */
-
-  COPY64 (ivalue, value);
-
-  exp = (ivalue >> XSUM_MANTISSA_BITS) & XSUM_EXP_MASK;
-  mantissa = ivalue & XSUM_MANTISSA_MASK;
-  high_exp = exp >> XSUM_LOW_EXP_BITS;
-  low_exp = exp & XSUM_LOW_EXP_MASK;
-
-  /* Categorize number as normal, denormalized, or Inf/NaN according to
-     the value of the exponent field. */
-
-  if (exp == 0) /* zero or denormalized */
-  { /* If it's a zero (positive or negative), we do nothing. */
-    if (mantissa == 0)
-    { return;
-    }
-    /* Denormalized mantissa has no implicit 1, but exponent is 1 not 0. */
-    exp = low_exp = 1;
-  }
-  else if (exp == XSUM_EXP_MASK)  /* Inf or NaN */
-  { /* Just update flags in accumulator structure. */
-    xsum_small_add_inf_nan (sacc, ivalue);
-    return;
-  }
-  else /* normalized */
-  { /* OR in implicit 1 bit at top of mantissa */
-    mantissa |= (xsum_int)1 << XSUM_MANTISSA_BITS;
-  }
-
-  /* Use high part of exponent as index of chunk, and low part of
-     exponent to give position within chunk.  Fetch the two chunks
-     that will be modified. */
-
-  chunk_ptr = sacc->chunk + high_exp;
-
-  /* Separate mantissa into two parts, after shifting, and add to (or
-     subtract from) this chunk and the next higher chunk (which always
-     exists since there are three extra ones at the top).
-
-     Note that low_mantissa will have at most XSUM_LOW_MANTISSA_BITS bits,
-     while high_mantissa will have at most XSUM_MANTISSA_BITS bits, since
-     even though the high mantissa includes the extra implicit 1 bit, it will
-     also be shifted right by at least one bit. */
-
-  xsum_int split_mantissa[2];
-  split_mantissa[0] = ((xsum_uint)mantissa << low_exp) & XSUM_LOW_MANTISSA_MASK;
-  split_mantissa[1] = mantissa >> (XSUM_LOW_MANTISSA_BITS - low_exp);
-
-  /* Add to, or subtract from, the two affected chunks. */
-
-# if OPT_SMALL==1
-  { xsum_int ivalue_sign = ivalue<0 ? -1 : 1;
-    chunk_ptr[0] += ivalue_sign * split_mantissa[0];
-    chunk_ptr[1] += ivalue_sign * split_mantissa[1];
-  }
-# elif OPT_SMALL==2
-  { xsum_int ivalue_neg
-              = ivalue>>(XSUM_SCHUNK_BITS-1); /* all 0s if +ve, all 1s if -ve */
-    chunk_ptr[0] += (split_mantissa[0] ^ ivalue_neg) + (ivalue_neg & 1);
-    chunk_ptr[1] += (split_mantissa[1] ^ ivalue_neg) + (ivalue_neg & 1);
-  }
-# elif OPT_SMALL==3 && USE_SIMD && __SSE2__
-  { xsum_int ivalue_neg
-              = ivalue>>(XSUM_SCHUNK_BITS-1); /* all 0s if +ve, all 1s if -ve */
-    _mm_storeu_si128 ((__m128i *)chunk_ptr,
-                      _mm_add_epi64 (_mm_loadu_si128 ((__m128i *)chunk_ptr),
-                       _mm_add_epi64 (_mm_set1_epi64((__m64)(ivalue_neg&1)),
-                        _mm_xor_si128 (_mm_set1_epi64((__m64)ivalue_neg),
-                         _mm_loadu_si128 ((__m128i *)split_mantissa)))));
-  }
-# else
-  { if (ivalue < 0)
-    { chunk_ptr[0] -= split_mantissa[0];
-      chunk_ptr[1] -= split_mantissa[1];
-    }
-    else
-    { chunk_ptr[0] += split_mantissa[0];
-      chunk_ptr[1] += split_mantissa[1];
-    }
-  }
-# endif
-}
-
-
-/* ADD ONE DOUBLE TO A SMALL ACCUMULATOR.  This is equivalent to, but
-   somewhat faster than, calling xsum_small_addv with a vector of one
-   value. */
-
-void xsum_small_add1 (xsum_small_accumulator *restrict sacc, xsum_flt value)
-{
-  if (sacc->adds_until_propagate == 0)
-  { (void) xsum_carry_propagate(sacc);
-  }
-
-  xsum_add1_no_carry (sacc, value);
-
-  sacc->adds_until_propagate -= 1;
-}
-
-
-/* ADD A VECTOR OF FLOATING-POINT NUMBERS TO A SMALL ACCUMULATOR.  Mixes
-   calls of xsum_carry_propagate with calls of xsum_add1_no_carry. */
-
-void xsum_small_addv (xsum_small_accumulator *restrict sacc,
-                      const xsum_flt *restrict vec,
-                      xsum_length n)
-{ xsum_length m, i;
-
-  while (n > 0)
-  { if (sacc->adds_until_propagate == 0)
-    { (void) xsum_carry_propagate(sacc);
-    }
-    m = n <= sacc->adds_until_propagate ? n : sacc->adds_until_propagate;
-    for (i = 0; i < m; i++)
-    { xsum_add1_no_carry (sacc, vec[i]);
-    }
-    sacc->adds_until_propagate -= m;
-    vec += m;
-    n -= m;
-  }
-}
-
-
-/* ADD SQUARED NORM OF VECTOR OF FLOATING-POINT NUMBERS TO SMALL ACCUMULATOR.
-   Mixes calls of xsum_carry_propagate with calls of xsum_add1_no_carry. */
-
-void xsum_small_add_sqnorm (xsum_small_accumulator *restrict sacc,
-                            const xsum_flt *restrict vec,
-                            xsum_length n)
-{ xsum_length m, i;
-
-  while (n > 0)
-  { if (sacc->adds_until_propagate == 0)
-    { (void) xsum_carry_propagate(sacc);
-    }
-    m = n <= sacc->adds_until_propagate ? n : sacc->adds_until_propagate;
-    for (i = 0; i < m; i++)
-    { xsum_add1_no_carry (sacc, vec[i] * vec[i]);
-    }
-    sacc->adds_until_propagate -= m;
-    vec += m;
-    n -= m;
-  }
-}
-
-
-/* ADD DOT PRODUCT OF VECTORS OF FLOATING-POINT NUMBERS TO SMALL ACCUMULATOR.
-   Mixes calls of xsum_carry_propagate with calls of xsum_add1_no_carry. */
-
-void xsum_small_add_dot (xsum_small_accumulator *restrict sacc,
-                         const xsum_flt *vec1, const xsum_flt *vec2,
-                         xsum_length n)
-{ xsum_length m, i;
-
-  while (n > 0)
-  { if (sacc->adds_until_propagate == 0)
-    { (void) xsum_carry_propagate(sacc);
-    }
-    m = n <= sacc->adds_until_propagate ? n : sacc->adds_until_propagate;
-    for (i = 0; i < m; i++)
-    { xsum_add1_no_carry (sacc, vec1[i] * vec2[i]);
-    }
-    sacc->adds_until_propagate -= m;
-    vec1 += m;
-    vec2 += m;
-    n -= m;
-  }
-}
-
-
-/* ADD A SMALL ACCUMULATOR TO ANOTHER SMALL ACCUMULATOR.  The first argument
-   is the destination, which is modified.  The second is the accumulator to
-   add, which may also be modified, but should still represent the same
-   number.  Source and destination may be the same. */
-
-void xsum_small_add_accumulator (xsum_small_accumulator *dst_sacc,
-                                 xsum_small_accumulator *src_sacc)
-{
-  int i;
-
-  xsum_carry_propagate (dst_sacc);
-
-  if (dst_sacc == src_sacc)
-  { for (i = 0; i < XSUM_SCHUNKS; i++)
-    { dst_sacc->chunk[i] += dst_sacc->chunk[i];
-    }
-  }
-  else
-  {
-    xsum_carry_propagate (src_sacc);
-
-    if (src_sacc->Inf) xsum_small_add_inf_nan (dst_sacc, src_sacc->Inf);
-    if (src_sacc->NaN) xsum_small_add_inf_nan (dst_sacc, src_sacc->NaN);
-
-    for (i = 0; i < XSUM_SCHUNKS; i++)
-    { dst_sacc->chunk[i] += src_sacc->chunk[i];
-    }
-  }
-
-  dst_sacc->adds_until_propagate = XSUM_SMALL_CARRY_TERMS-2;
-}
-
-
-/* NEGATE THE VALUE IN A SMALL ACCUMULATOR. */
-
-void xsum_small_negate (xsum_small_accumulator *restrict sacc)
-{
-  int i;
-
-  for (i = 0; i < XSUM_SCHUNKS; i++)
-  { sacc->chunk[i] = -sacc->chunk[i];
-  }
-
-  if (sacc->Inf != 0)
-  { sacc->Inf ^= XSUM_SIGN_MASK;
-  }
-}
-
-
-/* RETURN THE RESULT OF ROUNDING A SMALL ACCUMULATOR.  The rounding mode
-   is to nearest, with ties to even.  The small accumulator may be modified
-   by this operation (by carry propagation being done), but the value it
-   represents should not change. */
-
-xsum_flt xsum_small_round (xsum_small_accumulator *restrict sacc)
-{
-  xsum_int ivalue;
-  xsum_schunk lower;
-  int i, j, e, more;
-  xsum_int intv;
-  double fltv;
-
-  /* See if we have a NaN from one of the numbers being a NaN, in
-     which case we return the NaN with largest payload, or an infinite
-     result (+Inf, -Inf, or a NaN if both +Inf and -Inf occurred).
-     Note that we do NOT return NaN if we have both an infinite number
-     and a sum of other numbers that overflows with opposite sign,
-     since there is no real ambiguity regarding the sign in such a case. */
-
-  if (sacc->NaN != 0)
-  { COPY64(fltv, sacc->NaN);
-    return fltv;
-  }
-
-  if (sacc->Inf != 0)
-  { COPY64 (fltv, sacc->Inf);
-    return fltv;
-  }
-
-  /* If none of the numbers summed were infinite or NaN, we proceed to
-     propagate carries, as a preliminary to finding the magnitude of
-     the sum.  This also ensures that the sign of the result can be
-     determined from the uppermost non-zero chunk.
-
-     We also find the index, i, of this uppermost non-zero chunk, as
-     the value returned by xsum_carry_propagate, and set ivalue to
-     sacc->chunk[i].  Note that ivalue will not be 0 or -1, unless
-     i is 0 (the lowest chunk), in which case it will be handled by
-     the code for denormalized numbers. */
-
-  i = xsum_carry_propagate(sacc);
-
-  ivalue = sacc->chunk[i];
-
-  /* Handle a possible denormalized number, including zero. */
-
-  if (i <= 1)
-  {
-    /* Check for zero value, in which case we can return immediately. */
-
-    if (ivalue == 0)
-    { return 0.0;
-    }
-
-    /* Check if it is actually a denormalized number.  It always is if only
-       the lowest chunk is non-zero.  If the highest non-zero chunk is the
-       next-to-lowest, we check the magnitude of the absolute value.
-       Note that the real exponent is 1 (not 0), so we need to shift right
-       by 1 here. */
-
-    if (i == 0)
-    { intv = ivalue >= 0 ? ivalue : -ivalue;
-      intv >>= 1;
-      if (ivalue < 0)
-      { intv |= XSUM_SIGN_MASK;
-      }
-      COPY64 (fltv, intv);
-      return fltv;
-    }
-    else
-    { /* Note: Left shift of -ve number is undefined, so do a multiply instead,
-               which is probably optimized to a shift. */
-      intv = ivalue * ((xsum_int)1 << (XSUM_LOW_MANTISSA_BITS-1))
-               + (sacc->chunk[0] >> 1);
-      if (intv < 0)
-      { if (intv > - ((xsum_int)1 << XSUM_MANTISSA_BITS))
-        { intv = (-intv) | XSUM_SIGN_MASK;
-          COPY64 (fltv, intv);
-          return fltv;
-        }
-      }
-      else /* non-negative */
-      { if ((xsum_uint)intv < (xsum_uint)1 << XSUM_MANTISSA_BITS)
-        {
-          COPY64 (fltv, intv);
-          return fltv;
-        }
-      }
-      /* otherwise, it's not actually denormalized, so fall through to below */
-    }
-  }
-
-  /* Find the location of the uppermost 1 bit in the absolute value of
-     the upper chunk by converting it (as a signed integer) to a
-     floating point value, and looking at the exponent.  Then set
-     'more' to the number of bits from the lower chunk (and maybe the
-     next lower) that are needed to fill out the mantissa of the
-     result (including the top implicit 1 bit), plus two extra bits to
-     help decide on rounding.  For negative numbers, it may turn out
-     later that we need another bit, because negating a negative value
-     may carry out of the top here, but not carry out of the top once
-     more bits are shifted into the bottom later on. */
-
-  fltv = (xsum_flt) ivalue;  /* finds position of topmost 1 bit of |ivalue| */
-  COPY64 (intv, fltv);
-  e = (intv >> XSUM_MANTISSA_BITS) & XSUM_EXP_MASK; /* e-bias is in 0..32 */
-  more = 2 + XSUM_MANTISSA_BITS + XSUM_EXP_BIAS - e;
-
-  /* Change 'ivalue' to put in 'more' bits from lower chunks into the bottom.
-     Also set 'j' to the index of the lowest chunk from which these bits came,
-     and 'lower' to the remaining bits of that chunk not now in 'ivalue'.
-     Note that 'lower' initially has at least one bit in it, which we can
-     later move into 'ivalue' if it turns out that one more bit is needed. */
-
-  ivalue *= (xsum_int)1 << more;  /* multiply, since << of negative undefined */
-
-  j = i-1;
-  lower = sacc->chunk[j];  /* must exist, since denormalized if i==0 */
-  if (more >= XSUM_LOW_MANTISSA_BITS)
-  { more -= XSUM_LOW_MANTISSA_BITS;
-    ivalue += lower << more;
-    j -= 1;
-    lower = j < 0 ? 0 : sacc->chunk[j];
-  }
-  ivalue += lower >> (XSUM_LOW_MANTISSA_BITS - more);
-  lower &= ((xsum_schunk)1 << (XSUM_LOW_MANTISSA_BITS - more)) - 1;
-
-  /* Decide on rounding, with separate code for positive and negative values.
-
-     At this point, 'ivalue' has the signed mantissa bits, plus two extra
-     bits, with 'e' recording the exponent position for these within their
-     top chunk.  For positive 'ivalue', the bits in 'lower' and chunks
-     below 'j' add to the absolute value; for negative 'ivalue' they
-     subtract.
-
-     After setting 'ivalue' to the tentative unsigned mantissa
-     (shifted left 2), and 'intv' to have the correct sign, this
-     code goes to done_rounding if it finds that just discarding lower
-     order bits is correct, and to round_away_from_zero if instead the
-     magnitude should be increased by one in the lowest mantissa bit. */
-
-  if (ivalue >= 0)  /* number is positive, lower bits are added to magnitude */
-  {
-    intv = 0;  /* positive sign */
-
-    if ((ivalue & 2) == 0)  /* extra bits are 0x */
-    {
-      goto done_rounding;
-    }
-
-    if ((ivalue & 1) != 0)  /* extra bits are 11 */
-    {
-      goto round_away_from_zero;
-    }
-
-    if ((ivalue & 4) != 0)  /* low bit is 1 (odd), extra bits are 10 */
-    {
-      goto round_away_from_zero;
-    }
-
-    if (lower == 0)  /* see if any lower bits are non-zero */
-    { while (j > 0)
-      { j -= 1;
-        if (sacc->chunk[j] != 0)
-        { lower = 1;
-          break;
-        }
-      }
-    }
-
-    if (lower != 0)  /* low bit 0 (even), extra bits 10, non-zero lower bits */
-    {
-      goto round_away_from_zero;
-    }
-    else  /* low bit 0 (even), extra bits 10, all lower bits 0 */
-    {
-      goto done_rounding;
-    }
-  }
-
-  else  /* number is negative, lower bits are subtracted from magnitude */
-  {
-    /* Check for a negative 'ivalue' that when negated doesn't contain a full
-       mantissa's worth of bits, plus one to help rounding.  If so, move one
-       more bit into 'ivalue' from 'lower' (and remove it from 'lower').
-       This happens when the negation of the upper part of 'ivalue' has the
-       form 10000... but the negation of the full 'ivalue' is not 10000... */
-
-    if (((-ivalue) & ((xsum_int)1 << (XSUM_MANTISSA_BITS+2))) == 0)
-    { int pos = (xsum_schunk)1 << (XSUM_LOW_MANTISSA_BITS - 1 - more);
-      ivalue *= 2;  /* note that left shift undefined if ivalue is negative */
-      if (lower & pos)
-      { ivalue += 1;
-        lower &= ~pos;
-      }
-      e -= 1;
-    }
-
-    intv = XSUM_SIGN_MASK;    /* negative sign */
-    ivalue = -ivalue;         /* ivalue now contains the absolute value */
-
-    if ((ivalue & 3) == 3)  /* extra bits are 11 */
-    {
-      goto round_away_from_zero;
-    }
-
-    if ((ivalue & 3) <= 1)  /* extra bits are 00 or 01 */
-    {
-      goto done_rounding;
-    }
-
-    if ((ivalue & 4) == 0)  /* low bit is 0 (even), extra bits are 10 */
-    {
-      goto done_rounding;
-    }
-
-    if (lower == 0)  /* see if any lower bits are non-zero */
-    { while (j > 0)
-      { j -= 1;
-        if (sacc->chunk[j] != 0)
-        { lower = 1;
-          break;
-        }
-      }
-    }
-
-    if (lower != 0)  /* low bit 1 (odd), extra bits 10, non-zero lower bits */
-    {
-      goto done_rounding;
-    }
-    else  /* low bit 1 (odd), extra bits are 10, lower bits are all 0 */
-    {
-      goto round_away_from_zero;
-    }
-
-  }
-
-round_away_from_zero:
-
-  /* Round away from zero, then check for carry having propagated out the
-     top, and shift if so. */
-
-  ivalue += 4;  /* add 1 to low-order mantissa bit */
-  if (ivalue & ((xsum_int)1 << (XSUM_MANTISSA_BITS+3)))
-  { ivalue >>= 1;
-    e += 1;
-  }
-
-done_rounding: ;
-
-  /* Get rid of the bottom 2 bits that were used to decide on rounding. */
-
-  ivalue >>= 2;
-
-  /* Adjust to the true exponent, accounting for where this chunk is. */
-
-  e += (i<<XSUM_LOW_EXP_BITS) - XSUM_EXP_BIAS - XSUM_MANTISSA_BITS;
-
-  /* If exponent has overflowed, change to plus or minus Inf and return. */
-
-  if (e >= XSUM_EXP_MASK)
-  { intv |= (xsum_int) XSUM_EXP_MASK << XSUM_MANTISSA_BITS;
-    COPY64 (fltv, intv);
-
-    return fltv;
-  }
-
-  /* Put exponent and mantissa into intv, which already has the sign,
-     then copy into fltv. */
-
-  intv += (xsum_int)e << XSUM_MANTISSA_BITS;
-  intv += ivalue & XSUM_MANTISSA_MASK;  /* mask out the implicit 1 bit */
-  COPY64 (fltv, intv);
-
-  if (xsum_debug)
-  {
-    if ((ivalue >> XSUM_MANTISSA_BITS) != 1) abort();
-  }
-
-  return fltv;
-}
-
-
-/* FIND RESULT OF DIVIDING SMALL ACCUMULATOR BY UNSIGNED INTEGER. */
-
-xsum_flt xsum_small_div_unsigned
-           (xsum_small_accumulator *restrict sacc, unsigned div)
-{
-  xsum_flt result;
-  unsigned rem;
-  double fltv;
-  int sign;
-  int i, j;
-
-  /* Return NaN or an Inf if that's what's in the superaccumulator. */
-
-  if (sacc->NaN != 0)
-  { COPY64(fltv, sacc->NaN);
-    return fltv;
-  }
-
-  if (sacc->Inf != 0)
-  { COPY64 (fltv, sacc->Inf);
-    return fltv;
-  }
-
-  /* Make a copy of the superaccumulator, so we can change it here without
-     changing *sacc. */
-
-  xsum_small_accumulator tacc = *sacc;
-
-  /* Carry propagate in the temporary copy of the superaccumulator.
-     Sets 'i' to the index of the topmost nonzero chunk. */
-
-  i = xsum_carry_propagate(&tacc);
-
-  /* Check for division by zero, and if so, return +Inf, -Inf, or NaN,
-     depending on whether the superaccumulator is positive, negative,
-     or zero. */
-
-  if (div == 0)
-  {
-    return tacc.chunk[i] > 0 ? INFINITY : tacc.chunk[i] < 0 ? -INFINITY : NAN;
-  }
-
-  /* Record sign of accumulator, and if it's negative, negate and
-     re-propagate so that it will be positive. */
-
-  sign = +1;
-
-  if (tacc.chunk[i] < 0)
-  { xsum_small_negate(&tacc);
-    i = xsum_carry_propagate(&tacc);
-    if (xsum_debug)
-    {
-      if (tacc.chunk[i] < 0) abort();
-    }
-    sign = -1;
-  }
-
-  /* Do the division in the small accumulator, putting the remainder after
-     dividing the bottom chunk in 'rem'. */
-
-  rem = 0;
-  for (j = i; j>=0; j--)
-  { xsum_uint num = ((xsum_uint) rem << XSUM_LOW_MANTISSA_BITS) + tacc.chunk[j];
-    xsum_uint quo = num / div;
-    rem = num - quo*div;
-    tacc.chunk[j] = quo;
-  }
-
-  /* Find new top chunk. */
-
-  while (i > 0 && tacc.chunk[i] == 0)
-  { i -= 1;
-  }
-
-  /* Do rounding, with separate approachs for a normal number with biased
-     exponent greater than 1, and for a normal number with exponent of 1
-     or a denormalized number (also having true biased exponent of 1). */
-
-  if (i > 1 || tacc.chunk[1] >= (1 << (XSUM_HIGH_MANTISSA_BITS+2)))
-  {
-    /* Normalized number with at least two bits at bottom of chunk 0
-       below the mantissa.  Just need to 'or' in a 1 at the bottom if
-       remainder is non-zero to break a tie if bits below bottom of
-       mantissa are exactly 1/2. */
-
-    if (rem > 0)
-    { tacc.chunk[0] |= 1;
-    }
-  }
-  else
-  {
-    /* Denormalized number or normal number with biased exponent of 1.
-       Lowest bit of bottom chunk is just below lowest bit of
-       mantissa.  Need to explicitly round here using the bottom bit
-       and the remainder - round up if lower > 1/2 or >= 1/2 and
-       odd. */
-
-    if (tacc.chunk[0] & 1)  /* lower part is >= 1/2 */
-    {
-      if (tacc.chunk[0] & 2)  /* lowest bit of mantissa is 1 (odd) */
-      { tacc.chunk[0] += 2;     /* round up */
-      }
-      else                    /* lowest bit of mantissa is 0 (even) */
-      { if (rem > 0)            /* lower part is > 1/2 */
-        { tacc.chunk[0] += 2;     /* round up */
-        }
-      }
-
-      tacc.chunk[0] &= ~1;  /* clear low bit (but should anyway be ignored) */
-    }
-  }
-
-  /* Do the final rounding, with the lowest bit set as above. */
-
-  result = xsum_small_round (&tacc);
-
-  return sign*result;
-}
-
-
-/* FIND RESULT OF DIVIDING SMALL ACCUMULATOR BY SIGNED INTEGER. */
-
-xsum_flt xsum_small_div_int
-           (xsum_small_accumulator *restrict sacc, int div)
-{
-  if (div < 0)
-  { return -xsum_small_div_unsigned (sacc, (unsigned) -div);
-  }
-  else
-  { return xsum_small_div_unsigned (sacc, (unsigned) div);
-  }
-}
diff --git a/xsum.h b/xsum.h
deleted file mode 100644
index 2372cac61..000000000
--- a/xsum.h
+++ /dev/null
@@ -1,133 +0,0 @@
-/* INTERFACE TO FUNCTIONS FOR EXACT SUMMATION. */
-
-/* Copyright 2015, 2018, 2021 Radford M. Neal
-
-   Permission is hereby granted, free of charge, to any person obtaining
-   a copy of this software and associated documentation files (the
-   "Software"), to deal in the Software without restriction, including
-   without limitation the rights to use, copy, modify, merge, publish,
-   distribute, sublicense, and/or sell copies of the Software, and to
-   permit persons to whom the Software is furnished to do so, subject to
-   the following conditions:
-
-   The above copyright notice and this permission notice shall be
-   included in all copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
-   LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-   OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-   WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-
-#ifndef XSUM_H
-#define XSUM_H
-
-#include <stdlib.h>
-#include <stdint.h>
-#include <stddef.h>
-
-
-/* CONSTANTS DEFINING THE FLOATING POINT FORMAT. */
-
-typedef double xsum_flt;           /* C floating point type sums are done for */
-
-typedef int64_t xsum_int;          /* Signed integer type for a fp value */
-typedef uint64_t xsum_uint;        /* Unsigned integer type for a fp value */
-typedef int_fast16_t xsum_expint;  /* Integer type for holding an exponent */
-
-#define XSUM_MANTISSA_BITS 52      /* Bits in fp mantissa, excludes implict 1 */
-#define XSUM_EXP_BITS 11           /* Bits in fp exponent */
-
-#define XSUM_MANTISSA_MASK \
-  (((xsum_int)1 << XSUM_MANTISSA_BITS) - 1)  /* Mask for mantissa bits */
-
-#define XSUM_EXP_MASK \
-  ((1 << XSUM_EXP_BITS) - 1)                 /* Mask for exponent */
-
-#define XSUM_EXP_BIAS \
-  ((1 << (XSUM_EXP_BITS-1)) - 1)             /* Bias added to signed exponent */
-
-#define XSUM_SIGN_BIT \
-  (XSUM_MANTISSA_BITS + XSUM_EXP_BITS)       /* Position of sign bit */
-
-#define XSUM_SIGN_MASK \
-  ((xsum_uint)1 << XSUM_SIGN_BIT)            /* Mask for sign bit */
-
-
-/* CONSTANTS DEFINING THE SMALL ACCUMULATOR FORMAT. */
-
-#define XSUM_SCHUNK_BITS 64        /* Bits in chunk of the small accumulator */
-typedef int64_t xsum_schunk;       /* Integer type of small accumulator chunk */
-
-#define XSUM_LOW_EXP_BITS 5        /* # of low bits of exponent, in one chunk */
-
-#define XSUM_LOW_EXP_MASK \
-  ((1 << XSUM_LOW_EXP_BITS) - 1)       /* Mask for low-order exponent bits */
-
-#define XSUM_HIGH_EXP_BITS \
-  (XSUM_EXP_BITS - XSUM_LOW_EXP_BITS)  /* # of high exponent bits for index */
-
-#define XSUM_HIGH_EXP_MASK \
-  ((1 << HIGH_EXP_BITS) - 1)           /* Mask for high-order exponent bits */
-
-#define XSUM_SCHUNKS \
-  ((1 << XSUM_HIGH_EXP_BITS) + 3)      /* # of chunks in small accumulator */
-
-#define XSUM_LOW_MANTISSA_BITS \
-  (1 << XSUM_LOW_EXP_BITS)             /* Bits in low part of mantissa */
-
-#define XSUM_HIGH_MANTISSA_BITS \
-  (XSUM_MANTISSA_BITS - XSUM_LOW_MANTISSA_BITS)  /* Bits in high part */
-
-#define XSUM_LOW_MANTISSA_MASK \
-  (((xsum_int)1 << XSUM_LOW_MANTISSA_BITS) - 1)  /* Mask for low bits */
-
-#define XSUM_SMALL_CARRY_BITS \
- ((XSUM_SCHUNK_BITS-1) - XSUM_MANTISSA_BITS)     /* Bits sums can carry into */
-
-#define XSUM_SMALL_CARRY_TERMS \
-  ((1 << XSUM_SMALL_CARRY_BITS) - 1)   /* # terms can add before need prop. */
-
-typedef struct
-{ xsum_schunk chunk[XSUM_SCHUNKS]; /* Chunks making up small accumulator */
-  xsum_int Inf;                    /* If non-zero, +Inf, -Inf, or NaN */
-  xsum_int NaN;                    /* If non-zero, a NaN value with payload */
-  int adds_until_propagate;        /* Number of remaining adds before carry */
-} xsum_small_accumulator;          /*     propagation must be done again    */
-
-
-/* TYPE FOR LENGTHS OF ARRAYS.  Must be a signed integer type.  Set to
-   ptrdiff_t here on the assumption that this will be big enough, but
-   not unnecessarily big, which seems to be true. */
-
-typedef ptrdiff_t xsum_length;
-
-
-/* FUNCTIONS FOR EXACT SUMMATION, WITH POSSIBLE DIVISION BY AN INTEGER. */
-
-void xsum_small_init (xsum_small_accumulator *restrict);
-void xsum_small_add1 (xsum_small_accumulator *restrict, xsum_flt);
-void xsum_small_addv (xsum_small_accumulator *restrict,
-                      const xsum_flt *restrict, xsum_length);
-void xsum_small_add_sqnorm (xsum_small_accumulator *restrict,
-                            const xsum_flt *restrict, xsum_length);
-void xsum_small_add_dot (xsum_small_accumulator *restrict,
-                         const xsum_flt *, const xsum_flt *, xsum_length);
-void xsum_small_add_accumulator (xsum_small_accumulator *,
-                                 xsum_small_accumulator *);
-void xsum_small_negate (xsum_small_accumulator *restrict);
-xsum_flt xsum_small_round (xsum_small_accumulator *restrict);
-
-xsum_flt xsum_small_div_unsigned (xsum_small_accumulator *restrict, unsigned);
-xsum_flt xsum_small_div_int (xsum_small_accumulator *restrict, int);
-
-
-/* DEBUG FLAG.  Set to non-zero for debug ouptut.  Ignored unless xsum.c
-   is compiled with -DDEBUG. */
-
-extern int xsum_debug;
-
-#endif