diff --git a/deps/simdjson/simdjson.cpp b/deps/simdjson/simdjson.cpp index a36c044941f..aaeca3fadde 100644 --- a/deps/simdjson/simdjson.cpp +++ b/deps/simdjson/simdjson.cpp @@ -1,4 +1,4 @@ -/* auto-generated on 2025-01-27 20:34:35 -0500. Do not edit! */ +/* auto-generated on 2025-02-14 16:11:36 -0500. Do not edit! */ /* including simdjson.cpp: */ /* begin file simdjson.cpp */ #define SIMDJSON_SRC_SIMDJSON_CPP @@ -20813,14 +20813,18 @@ namespace simd { // Copies to 'output" all bytes corresponding to a 0 in the mask (interpreted as a bitset). // Passing a 0 value for mask would be equivalent to writing out every byte to output. - // Only the first 32 - count_ones(mask) bytes of the result are significant but 32 bytes + // Only the first 64 - count_ones(mask) bytes of the result are significant but 64 bytes // get written. // Design consideration: it seems like a function with the // signature simd8 compress(uint32_t mask) would be // sensible, but the AVX ISA makes this kind of approach difficult. template simdjson_inline void compress(uint64_t mask, L * output) const { - _mm512_mask_compressstoreu_epi8 (output,~mask,*this); + // we deliberately avoid _mm512_mask_compressstoreu_epi8 for portability + // (AMD Zen4 has terrible performance with it, it is effectively broken) + // _mm512_mask_compressstoreu_epi8 (output,~mask,*this); + __m512i compressed = _mm512_maskz_compress_epi8(~mask, *this); + _mm512_storeu_si512(output, compressed); // could use a mask } template @@ -23443,14 +23447,18 @@ namespace simd { // Copies to 'output" all bytes corresponding to a 0 in the mask (interpreted as a bitset). // Passing a 0 value for mask would be equivalent to writing out every byte to output. - // Only the first 32 - count_ones(mask) bytes of the result are significant but 32 bytes + // Only the first 64 - count_ones(mask) bytes of the result are significant but 64 bytes // get written. // Design consideration: it seems like a function with the // signature simd8 compress(uint32_t mask) would be // sensible, but the AVX ISA makes this kind of approach difficult. template simdjson_inline void compress(uint64_t mask, L * output) const { - _mm512_mask_compressstoreu_epi8 (output,~mask,*this); + // we deliberately avoid _mm512_mask_compressstoreu_epi8 for portability + // (AMD Zen4 has terrible performance with it, it is effectively broken) + // _mm512_mask_compressstoreu_epi8 (output,~mask,*this); + __m512i compressed = _mm512_maskz_compress_epi8(~mask, *this); + _mm512_storeu_si512(output, compressed); // could use a mask } template diff --git a/deps/simdjson/simdjson.h b/deps/simdjson/simdjson.h index fb82b032bd0..c1535ee8130 100644 --- a/deps/simdjson/simdjson.h +++ b/deps/simdjson/simdjson.h @@ -1,4 +1,4 @@ -/* auto-generated on 2025-01-27 20:34:35 -0500. Do not edit! */ +/* auto-generated on 2025-02-14 16:11:36 -0500. Do not edit! */ /* including simdjson.h: */ /* begin file simdjson.h */ #ifndef SIMDJSON_H @@ -2437,7 +2437,7 @@ namespace std { #define SIMDJSON_SIMDJSON_VERSION_H /** The version of simdjson being used (major.minor.revision) */ -#define SIMDJSON_VERSION "3.12.0" +#define SIMDJSON_VERSION "3.12.2" namespace simdjson { enum { @@ -2452,7 +2452,7 @@ enum { /** * The revision (major.minor.REVISION) of simdjson being used. */ - SIMDJSON_VERSION_REVISION = 0 + SIMDJSON_VERSION_REVISION = 2 }; } // namespace simdjson @@ -17948,14 +17948,18 @@ namespace simd { // Copies to 'output" all bytes corresponding to a 0 in the mask (interpreted as a bitset). // Passing a 0 value for mask would be equivalent to writing out every byte to output. - // Only the first 32 - count_ones(mask) bytes of the result are significant but 32 bytes + // Only the first 64 - count_ones(mask) bytes of the result are significant but 64 bytes // get written. // Design consideration: it seems like a function with the // signature simd8 compress(uint32_t mask) would be // sensible, but the AVX ISA makes this kind of approach difficult. template simdjson_inline void compress(uint64_t mask, L * output) const { - _mm512_mask_compressstoreu_epi8 (output,~mask,*this); + // we deliberately avoid _mm512_mask_compressstoreu_epi8 for portability + // (AMD Zen4 has terrible performance with it, it is effectively broken) + // _mm512_mask_compressstoreu_epi8 (output,~mask,*this); + __m512i compressed = _mm512_maskz_compress_epi8(~mask, *this); + _mm512_storeu_si512(output, compressed); // could use a mask } template @@ -65401,14 +65405,18 @@ namespace simd { // Copies to 'output" all bytes corresponding to a 0 in the mask (interpreted as a bitset). // Passing a 0 value for mask would be equivalent to writing out every byte to output. - // Only the first 32 - count_ones(mask) bytes of the result are significant but 32 bytes + // Only the first 64 - count_ones(mask) bytes of the result are significant but 64 bytes // get written. // Design consideration: it seems like a function with the // signature simd8 compress(uint32_t mask) would be // sensible, but the AVX ISA makes this kind of approach difficult. template simdjson_inline void compress(uint64_t mask, L * output) const { - _mm512_mask_compressstoreu_epi8 (output,~mask,*this); + // we deliberately avoid _mm512_mask_compressstoreu_epi8 for portability + // (AMD Zen4 has terrible performance with it, it is effectively broken) + // _mm512_mask_compressstoreu_epi8 (output,~mask,*this); + __m512i compressed = _mm512_maskz_compress_epi8(~mask, *this); + _mm512_storeu_si512(output, compressed); // could use a mask } template