Here's my version with a key spline improvement. I should really write this up...
#include <stdbool.h> #include <stdint.h> #include <arm_neon.h>
/* Author: [email protected] * * Apple M4 Max (P-core) variant of simd_quad which uses a key spline * to great effect (blog post summary incoming!) / bool simd_quad_m4(const uint16_t carr, int32_t cardinality, uint16_t pos) { enum { gap = 64 };
if (cardinality < gap) {
if (cardinality >= 32) {
// 32 <= n < 64: NEON-compare the first 32 as a single x4 load,
// sweep the remainder.
uint16x8_t needle = vdupq_n_u16(pos);
uint16x8x4_t v = vld1q_u16_x4(carr);
uint16x8_t hit = vorrq_u16(
vorrq_u16(vceqq_u16(v.val[0], needle), vceqq_u16(v.val[1], needle)),
vorrq_u16(vceqq_u16(v.val[2], needle), vceqq_u16(v.val[3], needle)));
if (vmaxvq_u16(hit) != 0) return true;
for (int32_t j = 32; j < cardinality; j++) {
uint16_t x = carr[j];
if (x >= pos) return x == pos;
}
return false;
}
if (cardinality >= 16) {
// 16 <= n < 32: paired x2 load + sweep tail.
uint16x8_t needle = vdupq_n_u16(pos);
uint16x8x2_t v = vld1q_u16_x2(carr);
uint16x8_t hit = vorrq_u16(vceqq_u16(v.val[0], needle),
vceqq_u16(v.val[1], needle));
if (vmaxvq_u16(hit) != 0) return true;
for (int32_t j = 16; j < cardinality; j++) {
uint16_t x = carr[j];
if (x >= pos) return x == pos;
}
return false;
}
if (cardinality >= 8) {
// 8 <= n < 16: single 128-bit compare + sweep tail.
uint16x8_t needle = vdupq_n_u16(pos);
uint16x8_t v = vld1q_u16(carr);
if (vmaxvq_u16(vceqq_u16(v, needle)) != 0) return true;
for (int32_t j = 8; j < cardinality; j++) {
uint16_t x = carr[j];
if (x >= pos) return x == pos;
}
return false;
}
for (int32_t j = 0; j < cardinality; j++) {
uint16_t v = carr[j];
if (v >= pos) return v == pos;
}
return false;
}
int32_t num_blocks = cardinality / gap;
int32_t base = 0;
int32_t n = num_blocks;
while (n > 3) {
int32_t quarter = n >> 2;
int32_t k1 = carr[(base + quarter + 1) * gap - 1];
int32_t k2 = carr[(base + 2 * quarter + 1) * gap - 1];
int32_t k3 = carr[(base + 3 * quarter + 1) * gap - 1];
int32_t c1 = (k1 < pos);
int32_t c2 = (k2 < pos);
int32_t c3 = (k3 < pos);
base += (c1 + c2 + c3) * quarter;
n -= 3 * quarter;
}
while (n > 1) {
int32_t half = n >> 1;
base = (carr[(base + half + 1) * gap - 1] < pos) ? base + half : base;
n -= half;
}
int32_t lo = (carr[(base + 1) * gap - 1] < pos) ? base + 1 : base;
if (lo < num_blocks) {
const uint16_t *blk = carr + lo * gap;
uint16x8_t needle = vdupq_n_u16(pos);
uint16x8x4_t a = vld1q_u16_x4(blk);
uint16x8x4_t b = vld1q_u16_x4(blk + 32);
uint16x8_t h0 = vorrq_u16(
vorrq_u16(vceqq_u16(a.val[0], needle), vceqq_u16(a.val[1], needle)),
vorrq_u16(vceqq_u16(a.val[2], needle), vceqq_u16(a.val[3], needle)));
uint16x8_t h1 = vorrq_u16(
vorrq_u16(vceqq_u16(b.val[0], needle), vceqq_u16(b.val[1], needle)),
vorrq_u16(vceqq_u16(b.val[2], needle), vceqq_u16(b.val[3], needle)));
return vmaxvq_u16(vorrq_u16(h0, h1)) != 0;
}
for (int32_t j = num_blocks * gap; j < cardinality; j++) {
uint16_t v = carr[j];
if (v >= pos) return v == pos;
}
return false;
}/* * Spine variant, M4 edition. * * pack the interpolation probe keys into a dense contiguous region so the * cold-cache pointer chase streams through consecutive cache lines: * * n=4096 -> 64 spine keys -> 128 B = 1 M4 cache line * n=2048 -> 32 spine keys -> 64 B = half a line * n=1024 -> 16 spine keys -> 32 B * * The entire interpolation phase for a max-sized Roaring container now * lives in one cache line. The final SIMD block check still loads from * carr. * * The num_blocks <= 3 fallback: * with very few blocks the carr-based probes accidentally prime the final * block's lines, which the spine path disrupts. / bool simd_quad_m4_spine(const uint16_t carr, const uint16_t spine, int32_t cardinality, uint16_t pos) { enum { gap = 64 };
if (cardinality < gap) {
// Same fast paths as simd_quad_m4 -- spine is irrelevant here.
if (cardinality >= 32) {
uint16x8_t needle = vdupq_n_u16(pos);
uint16x8x4_t v = vld1q_u16_x4(carr);
uint16x8_t hit = vorrq_u16(
vorrq_u16(vceqq_u16(v.val[0], needle), vceqq_u16(v.val[1], needle)),
vorrq_u16(vceqq_u16(v.val[2], needle), vceqq_u16(v.val[3], needle)));
if (vmaxvq_u16(hit) != 0) return true;
for (int32_t j = 32; j < cardinality; j++) {
uint16_t x = carr[j];
if (x >= pos) return x == pos;
}
return false;
}
if (cardinality >= 16) {
uint16x8_t needle = vdupq_n_u16(pos);
uint16x8x2_t v = vld1q_u16_x2(carr);
uint16x8_t hit = vorrq_u16(vceqq_u16(v.val[0], needle),
vceqq_u16(v.val[1], needle));
if (vmaxvq_u16(hit) != 0) return true;
for (int32_t j = 16; j < cardinality; j++) {
uint16_t x = carr[j];
if (x >= pos) return x == pos;
}
return false;
}
if (cardinality >= 8) {
uint16x8_t needle = vdupq_n_u16(pos);
uint16x8_t v = vld1q_u16(carr);
if (vmaxvq_u16(vceqq_u16(v, needle)) != 0) return true;
for (int32_t j = 8; j < cardinality; j++) {
uint16_t x = carr[j];
if (x >= pos) return x == pos;
}
return false;
}
for (int32_t j = 0; j < cardinality; j++) {
uint16_t v = carr[j];
if (v >= pos) return v == pos;
}
return false;
}
int32_t num_blocks = cardinality / gap;
if (num_blocks <= 3) {
return simd_quad_m4(carr, cardinality, pos);
}
int32_t base = 0;
int32_t n = num_blocks;
// Pull the whole spine into L1 up front. For n in [256, 4096] this is
// 1 line (128 B); for smaller n it is a partial line. Cheap on cold.
__builtin_prefetch(spine);
while (n > 3) {
int32_t quarter = n >> 2;
int32_t k1 = spine[base + quarter];
int32_t k2 = spine[base + 2 * quarter];
int32_t k3 = spine[base + 3 * quarter];
int32_t c1 = (k1 < pos);
int32_t c2 = (k2 < pos);
int32_t c3 = (k3 < pos);
base += (c1 + c2 + c3) * quarter;
n -= 3 * quarter;
}
while (n > 1) {
int32_t half = n >> 1;
base = (spine[base + half] < pos) ? base + half : base;
n -= half;
}
int32_t lo = (spine[base] < pos) ? base + 1 : base;
if (lo < num_blocks) {
const uint16_t *blk = carr + lo * gap;
uint16x8_t needle = vdupq_n_u16(pos);
uint16x8x4_t a = vld1q_u16_x4(blk);
uint16x8x4_t b = vld1q_u16_x4(blk + 32);
uint16x8_t h0 = vorrq_u16(
vorrq_u16(vceqq_u16(a.val[0], needle), vceqq_u16(a.val[1], needle)),
vorrq_u16(vceqq_u16(a.val[2], needle), vceqq_u16(a.val[3], needle)));
uint16x8_t h1 = vorrq_u16(
vorrq_u16(vceqq_u16(b.val[0], needle), vceqq_u16(b.val[1], needle)),
vorrq_u16(vceqq_u16(b.val[2], needle), vceqq_u16(b.val[3], needle)));
return vmaxvq_u16(vorrq_u16(h0, h1)) != 0;
}
for (int32_t j = num_blocks * gap; j < cardinality; j++) {
uint16_t v = carr[j];
if (v >= pos) return v == pos;
}
return false;
}// Build the spine for a given carr. Caller allocates cardinality/64 u16s. void simd_quad_m4_build_spine(const uint16_t carr, int32_t cardinality, uint16_t spine) { enum { gap = 64 }; int32_t num_blocks = cardinality / gap; for (int32_t i = 0; i < num_blocks; i++) { spine[i] = carr[(i + 1) gap - 1]; } }