Here's my version with a key spline improvement. I should really write this up... #include &l...

aamargulies • yesterday at 5:33 PM • 0 replies • view on HN

Here's my version with a key spline improvement. I should really write this up...

#include <stdbool.h> #include <stdint.h> #include <arm_neon.h>

/* Author: [email protected] * * Apple M4 Max (P-core) variant of simd_quad which uses a key spline * to great effect (blog post summary incoming!) / bool simd_quad_m4(const uint16_t carr, int32_t cardinality, uint16_t pos) { enum { gap = 64 };

    if (cardinality < gap) {
        if (cardinality >= 32) {
            // 32 <= n < 64: NEON-compare the first 32 as a single x4 load,
            // sweep the remainder.
            uint16x8_t needle = vdupq_n_u16(pos);
            uint16x8x4_t v = vld1q_u16_x4(carr);
            uint16x8_t hit = vorrq_u16(
                vorrq_u16(vceqq_u16(v.val[0], needle), vceqq_u16(v.val[1], needle)),
                vorrq_u16(vceqq_u16(v.val[2], needle), vceqq_u16(v.val[3], needle)));
            if (vmaxvq_u16(hit) != 0) return true;
            for (int32_t j = 32; j < cardinality; j++) {
                uint16_t x = carr[j];
                if (x >= pos) return x == pos;
            }
            return false;
        }
        if (cardinality >= 16) {
            // 16 <= n < 32: paired x2 load + sweep tail.
            uint16x8_t needle = vdupq_n_u16(pos);
            uint16x8x2_t v = vld1q_u16_x2(carr);
            uint16x8_t hit = vorrq_u16(vceqq_u16(v.val[0], needle),
                                       vceqq_u16(v.val[1], needle));
            if (vmaxvq_u16(hit) != 0) return true;
            for (int32_t j = 16; j < cardinality; j++) {
                uint16_t x = carr[j];
                if (x >= pos) return x == pos;
            }
            return false;
        }
        if (cardinality >= 8) {
            // 8 <= n < 16: single 128-bit compare + sweep tail.
            uint16x8_t needle = vdupq_n_u16(pos);
            uint16x8_t v = vld1q_u16(carr);
            if (vmaxvq_u16(vceqq_u16(v, needle)) != 0) return true;
            for (int32_t j = 8; j < cardinality; j++) {
                uint16_t x = carr[j];
                if (x >= pos) return x == pos;
            }
            return false;
        }
        for (int32_t j = 0; j < cardinality; j++) {
            uint16_t v = carr[j];
            if (v >= pos) return v == pos;
        }
        return false;
    }

    int32_t num_blocks = cardinality / gap;
    int32_t base = 0;
    int32_t n = num_blocks;

    while (n > 3) {
        int32_t quarter = n >> 2;
        int32_t k1 = carr[(base + quarter + 1) * gap - 1];
        int32_t k2 = carr[(base + 2 * quarter + 1) * gap - 1];
        int32_t k3 = carr[(base + 3 * quarter + 1) * gap - 1];
        int32_t c1 = (k1 < pos);
        int32_t c2 = (k2 < pos);
        int32_t c3 = (k3 < pos);
        base += (c1 + c2 + c3) * quarter;
        n -= 3 * quarter;
    }
    while (n > 1) {
        int32_t half = n >> 1;
        base = (carr[(base + half + 1) * gap - 1] < pos) ? base + half : base;
        n -= half;
    }
    int32_t lo = (carr[(base + 1) * gap - 1] < pos) ? base + 1 : base;

    if (lo < num_blocks) {
        const uint16_t *blk = carr + lo * gap;
        uint16x8_t needle = vdupq_n_u16(pos);
        uint16x8x4_t a = vld1q_u16_x4(blk);
        uint16x8x4_t b = vld1q_u16_x4(blk + 32);
        uint16x8_t h0 = vorrq_u16(
            vorrq_u16(vceqq_u16(a.val[0], needle), vceqq_u16(a.val[1], needle)),
            vorrq_u16(vceqq_u16(a.val[2], needle), vceqq_u16(a.val[3], needle)));
        uint16x8_t h1 = vorrq_u16(
            vorrq_u16(vceqq_u16(b.val[0], needle), vceqq_u16(b.val[1], needle)),
            vorrq_u16(vceqq_u16(b.val[2], needle), vceqq_u16(b.val[3], needle)));
        return vmaxvq_u16(vorrq_u16(h0, h1)) != 0;
    }

    for (int32_t j = num_blocks * gap; j < cardinality; j++) {
        uint16_t v = carr[j];
        if (v >= pos) return v == pos;
    }
    return false;

}

/* * Spine variant, M4 edition. * * pack the interpolation probe keys into a dense contiguous region so the * cold-cache pointer chase streams through consecutive cache lines: * * n=4096 -> 64 spine keys -> 128 B = 1 M4 cache line * n=2048 -> 32 spine keys -> 64 B = half a line * n=1024 -> 16 spine keys -> 32 B * * The entire interpolation phase for a max-sized Roaring container now * lives in one cache line. The final SIMD block check still loads from * carr. * * The num_blocks <= 3 fallback: * with very few blocks the carr-based probes accidentally prime the final * block's lines, which the spine path disrupts. / bool simd_quad_m4_spine(const uint16_t carr, const uint16_t spine, int32_t cardinality, uint16_t pos) { enum { gap = 64 };

    if (cardinality < gap) {
        // Same fast paths as simd_quad_m4 -- spine is irrelevant here.
        if (cardinality >= 32) {
            uint16x8_t needle = vdupq_n_u16(pos);
            uint16x8x4_t v = vld1q_u16_x4(carr);
            uint16x8_t hit = vorrq_u16(
                vorrq_u16(vceqq_u16(v.val[0], needle), vceqq_u16(v.val[1], needle)),
                vorrq_u16(vceqq_u16(v.val[2], needle), vceqq_u16(v.val[3], needle)));
            if (vmaxvq_u16(hit) != 0) return true;
            for (int32_t j = 32; j < cardinality; j++) {
                uint16_t x = carr[j];
                if (x >= pos) return x == pos;
            }
            return false;
        }
        if (cardinality >= 16) {
            uint16x8_t needle = vdupq_n_u16(pos);
            uint16x8x2_t v = vld1q_u16_x2(carr);
            uint16x8_t hit = vorrq_u16(vceqq_u16(v.val[0], needle),
                                       vceqq_u16(v.val[1], needle));
            if (vmaxvq_u16(hit) != 0) return true;
            for (int32_t j = 16; j < cardinality; j++) {
                uint16_t x = carr[j];
                if (x >= pos) return x == pos;
            }
            return false;
        }
        if (cardinality >= 8) {
            uint16x8_t needle = vdupq_n_u16(pos);
            uint16x8_t v = vld1q_u16(carr);
            if (vmaxvq_u16(vceqq_u16(v, needle)) != 0) return true;
            for (int32_t j = 8; j < cardinality; j++) {
                uint16_t x = carr[j];
                if (x >= pos) return x == pos;
            }
            return false;
        }
        for (int32_t j = 0; j < cardinality; j++) {
            uint16_t v = carr[j];
            if (v >= pos) return v == pos;
        }
        return false;
    }

    int32_t num_blocks = cardinality / gap;

    if (num_blocks <= 3) {
        return simd_quad_m4(carr, cardinality, pos);
    }

    int32_t base = 0;
    int32_t n = num_blocks;

    // Pull the whole spine into L1 up front. For n in [256, 4096] this is
    // 1 line (128 B); for smaller n it is a partial line. Cheap on cold.
    __builtin_prefetch(spine);

    while (n > 3) {
        int32_t quarter = n >> 2;
        int32_t k1 = spine[base + quarter];
        int32_t k2 = spine[base + 2 * quarter];
        int32_t k3 = spine[base + 3 * quarter];
        int32_t c1 = (k1 < pos);
        int32_t c2 = (k2 < pos);
        int32_t c3 = (k3 < pos);
        base += (c1 + c2 + c3) * quarter;
        n -= 3 * quarter;
    }
    while (n > 1) {
        int32_t half = n >> 1;
        base = (spine[base + half] < pos) ? base + half : base;
        n -= half;
    }
    int32_t lo = (spine[base] < pos) ? base + 1 : base;

    if (lo < num_blocks) {
        const uint16_t *blk = carr + lo * gap;
        uint16x8_t needle = vdupq_n_u16(pos);
        uint16x8x4_t a = vld1q_u16_x4(blk);
        uint16x8x4_t b = vld1q_u16_x4(blk + 32);
        uint16x8_t h0 = vorrq_u16(
            vorrq_u16(vceqq_u16(a.val[0], needle), vceqq_u16(a.val[1], needle)),
            vorrq_u16(vceqq_u16(a.val[2], needle), vceqq_u16(a.val[3], needle)));
        uint16x8_t h1 = vorrq_u16(
            vorrq_u16(vceqq_u16(b.val[0], needle), vceqq_u16(b.val[1], needle)),
            vorrq_u16(vceqq_u16(b.val[2], needle), vceqq_u16(b.val[3], needle)));
        return vmaxvq_u16(vorrq_u16(h0, h1)) != 0;
    }

    for (int32_t j = num_blocks * gap; j < cardinality; j++) {
        uint16_t v = carr[j];
        if (v >= pos) return v == pos;
    }
    return false;

}

// Build the spine for a given carr. Caller allocates cardinality/64 u16s. void simd_quad_m4_build_spine(const uint16_t carr, int32_t cardinality, uint16_t spine) { enum { gap = 64 }; int32_t num_blocks = cardinality / gap; for (int32_t i = 0; i < num_blocks; i++) { spine[i] = carr[(i + 1) gap - 1]; } }

alt Hacker News