diff --git a/include/aws/checksums/private/intel/crc32c_compiler_shims.h b/include/aws/checksums/private/intel/crc32c_compiler_shims.h
index 21002de..b321757 100644
--- a/include/aws/checksums/private/intel/crc32c_compiler_shims.h
+++ b/include/aws/checksums/private/intel/crc32c_compiler_shims.h
@@ -20,6 +20,7 @@ typedef uint32_t slice_ptr_int_type;
 
 #ifdef AWS_HAVE_AVX512_INTRINSICS
 uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t crc);
+uint32_t aws_checksums_crc32_avx512(const uint8_t *input, int length, uint32_t crc);
 #endif
 
 uint32_t aws_checksums_crc32c_sse42(const uint8_t *input, int length, uint32_t crc);
diff --git a/source/crc.c b/source/crc.c
index f5d3e80..de86084 100644
--- a/source/crc.c
+++ b/source/crc.c
@@ -12,7 +12,7 @@ static uint32_t (*s_crc32_fn_ptr)(const uint8_t *input, int length, uint32_t pre
 
 uint32_t aws_checksums_crc32(const uint8_t *input, int length, uint32_t previousCrc32) {
     if (AWS_UNLIKELY(!s_crc32_fn_ptr)) {
-        if (aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_CRC)) {
+        if (aws_cpu_has_feature(AWS_CPU_FEATURE_AVX512) || aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_CRC)) {
             s_crc32_fn_ptr = aws_checksums_crc32_hw;
         } else {
             s_crc32_fn_ptr = aws_checksums_crc32_sw;
diff --git a/source/intel/crc_hw.c b/source/intel/crc_hw.c
index d571cc0..ba50816 100644
--- a/source/intel/crc_hw.c
+++ b/source/intel/crc_hw.c
@@ -97,5 +97,37 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev
 }
 
 uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
-    return aws_checksums_crc32_sw(input, length, previousCrc32);
+    uint32_t crc = previousCrc32;
+
+    if (AWS_UNLIKELY(!detection_performed)) {
+        detected_avx512 = aws_cpu_has_feature(AWS_CPU_FEATURE_AVX512);
+        detected_vpclmulqdq = aws_cpu_has_feature(AWS_CPU_FEATURE_VPCLMULQDQ);
+        detection_performed = true;
+    }
+
+#if defined(AWS_HAVE_AVX512_INTRINSICS) && (INTPTR_MAX == INT64_MAX)
+    int chunk_size = length & ~63;
+
+    if (detected_avx512 && detected_vpclmulqdq) {
+        if (length >= 256) {
+            crc = aws_checksums_crc32_avx512(input, length, ~crc);
+            /*
+             * aws_checksums_crc32_avx512 works on the raw (bit-flipped) crc state, while the
+             * fallback below expects a non-inverted crc and does its own flipping, so flip back here.
+             */
+            crc = ~crc;
+
+            /* check remaining data */
+            length -= chunk_size;
+            if (!length) {
+                return crc;
+            }
+
+            /* Fall into the default crc32 for the remaining data. */
+            input += chunk_size;
+        }
+    }
+#endif
+    crc = aws_checksums_crc32_sw(input, length, crc);
+    return crc;
 }
@@ -150,3 +150,147 @@ uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t
     val = _mm_crc32_u64(0, _mm_extract_epi64(a1, 0));
     return (uint32_t) _mm_crc32_u64(val, _mm_extract_epi64(a1, 1));
 }
+
+
+uint32_t aws_checksums_crc32_avx512(const uint8_t *input, int length, uint32_t previous_crc) {
+    AWS_ASSERT(
+        length >= 256 && "invariant violated. length must be greater than 255 bytes to use avx512 to compute crc.");
+
+    uint32_t crc = previous_crc;
+
+    static const uint64_t k1k2[] = {
+        0x011542778a, 0x01322d1430, 0x011542778a, 0x01322d1430, 0x011542778a, 0x01322d1430, 0x011542778a, 0x01322d1430};
+    static const uint64_t k3k4[] = {
+        0x0154442bd4, 0x01c6e41596, 0x0154442bd4, 0x01c6e41596, 0x0154442bd4, 0x01c6e41596, 0x0154442bd4, 0x01c6e41596};
+    static const uint64_t k9k10[8] = {
+        0x33fff533, 0x910eeec1, 0x33fff533, 0x910eeec1, 0x33fff533, 0x910eeec1, 0x33fff533, 0x910eeec1};
+    static const uint64_t k1k4[8] = {
+        0x3db1ecdc, 0xaf449247, 0xf1da05aa, 0x81256527, 0xae689191, 0xccaa009e, 0x00000000, 0x00000000};
+    static const uint64_t k5k6[] = {0x01751997d0, 0x00ccaa009e};
+    static const uint64_t k7k8[] = {0x0163cd6124, 0x0000000000};
+    static const uint64_t poly[] = {0x01db710641, 0x01f7011641};
+
+    __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
+    __m128i a0, a1, a2, a3;
+
+    /*
+     * There's at least one block of 256.
+     */
+    x1 = _mm512_loadu_si512((__m512i *)(input + 0x00));
+    x2 = _mm512_loadu_si512((__m512i *)(input + 0x40));
+    x3 = _mm512_loadu_si512((__m512i *)(input + 0x80));
+    x4 = _mm512_loadu_si512((__m512i *)(input + 0xC0));
+
+    x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc)));
+
+    x0 = _mm512_load_si512((__m512i *)k1k2);
+
+    input += 256;
+    length -= 256;
+
+    /*
+     * Parallel fold blocks of 256, if any.
+     */
+    while (length >= 256) {
+        x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
+        x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00);
+        x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00);
+        x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00);
+
+        x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
+        x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11);
+        x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11);
+        x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11);
+
+        y5 = _mm512_loadu_si512((__m512i *)(input + 0x00));
+        y6 = _mm512_loadu_si512((__m512i *)(input + 0x40));
+        y7 = _mm512_loadu_si512((__m512i *)(input + 0x80));
+        y8 = _mm512_loadu_si512((__m512i *)(input + 0xC0));
+
+        x1 = _mm512_ternarylogic_epi64(x1, x5, y5, 0x96);
+        x2 = _mm512_ternarylogic_epi64(x2, x6, y6, 0x96);
+        x3 = _mm512_ternarylogic_epi64(x3, x7, y7, 0x96);
+        x4 = _mm512_ternarylogic_epi64(x4, x8, y8, 0x96);
+
+        input += 256;
+        length -= 256;
+    }
+
+    /*
+     * Fold 256 bytes into 64 bytes.
+     */
+    x0 = _mm512_load_si512((__m512i *)k9k10);
+    x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
+    x6 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
+    x3 = _mm512_ternarylogic_epi64(x3, x5, x6, 0x96);
+
+    x7 = _mm512_clmulepi64_epi128(x2, x0, 0x00);
+    x8 = _mm512_clmulepi64_epi128(x2, x0, 0x11);
+    x4 = _mm512_ternarylogic_epi64(x4, x7, x8, 0x96);
+
+    x0 = _mm512_load_si512((__m512i *)k3k4);
+    y5 = _mm512_clmulepi64_epi128(x3, x0, 0x00);
+    y6 = _mm512_clmulepi64_epi128(x3, x0, 0x11);
+    x1 = _mm512_ternarylogic_epi64(x4, y5, y6, 0x96);
+
+    /*
+     * Single fold blocks of 64, if any.
+     */
+    while (length >= 64) {
+        x2 = _mm512_loadu_si512((__m512i *)input);
+
+        x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
+        x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
+        x1 = _mm512_ternarylogic_epi64(x1, x2, x5, 0x96);
+
+        input += 64;
+        length -= 64;
+    }
+
+    /*
+     * Fold 512-bits to 128-bits.
+     */
+    x0 = _mm512_loadu_si512((__m512i *)k1k4);
+
+    a2 = _mm512_extracti32x4_epi32(x1, 3);
+    x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
+    x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
+    x1 = _mm512_ternarylogic_epi64(x1, x5, _mm512_castsi128_si512(a2), 0x96);
+
+    x0 = _mm512_shuffle_i64x2(x1, x1, 0x4E);
+    x0 = _mm512_xor_epi64(x1, x0);
+    a1 = _mm512_extracti32x4_epi32(x0, 1);
+    a1 = _mm_xor_epi64(a1, _mm512_castsi512_si128(x0));
+
+    /*
+     * Fold 128-bits to 64-bits.
+     */
+    a0 = _mm_load_si128((__m128i *)k5k6);
+    a2 = _mm_clmulepi64_si128(a1, a0, 0x10);
+    a3 = _mm_setr_epi32(~0, 0, ~0, 0);
+    a1 = _mm_srli_si128(a1, 8);
+    a1 = _mm_xor_si128(a1, a2);
+
+    a0 = _mm_loadl_epi64((__m128i *)k7k8);
+    a2 = _mm_srli_si128(a1, 4);
+    a1 = _mm_and_si128(a1, a3);
+    a1 = _mm_clmulepi64_si128(a1, a0, 0x00);
+    a1 = _mm_xor_si128(a1, a2);
+
+    /*
+     * Barrett reduce to 32-bits.
+     */
+    a0 = _mm_load_si128((__m128i *)poly);
+
+    a2 = _mm_and_si128(a1, a3);
+    a2 = _mm_clmulepi64_si128(a2, a0, 0x10);
+    a2 = _mm_and_si128(a2, a3);
+    a2 = _mm_clmulepi64_si128(a2, a0, 0x00);
+    a1 = _mm_xor_si128(a1, a2);
+
+    /*
+     * Return the crc32.
+     */
+    crc = (uint32_t)_mm_extract_epi32(a1, 1);
+    return crc;
+}
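
Note on the ~crc handoff in aws_checksums_crc32_hw: as the diff shows, aws_checksums_crc32_avx512 XORs the value it is given straight into the first data block and returns the Barrett-reduced value without a final complement, i.e. it operates on the raw CRC register state, while the public API and aws_checksums_crc32_sw exchange the conventional non-inverted CRC-32 value. The reference routine below is only an illustration of that convention (it is not part of the change) and makes the two complement operations explicit; the function name crc32_reference is hypothetical.

#include <stddef.h>
#include <stdint.h>

/*
 * Illustrative bit-at-a-time CRC-32 (reflected polynomial 0xEDB88320). Callers pass and
 * receive the non-inverted value, while the internal register state is its bitwise
 * complement. This mirrors why aws_checksums_crc32_hw passes ~crc into the avx512
 * routine (raw state in) and complements its result (reported value out) before handing
 * the remaining bytes to aws_checksums_crc32_sw.
 */
static uint32_t crc32_reference(const uint8_t *input, size_t length, uint32_t previous_crc) {
    uint32_t state = ~previous_crc; /* reported value -> raw register state */
    for (size_t i = 0; i < length; ++i) {
        state ^= input[i];
        for (int bit = 0; bit < 8; ++bit) {
            state = (state & 1u) ? (state >> 1) ^ 0xEDB88320u : (state >> 1);
        }
    }
    return ~state; /* raw register state -> reported value */
}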
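
A minimal consistency check that could accompany this change, assuming the public header aws/checksums/crc.h declares aws_checksums_crc32 with the signature used in source/crc.c above. It relies only on behavior visible in the diff: inputs of at least 256 bytes take the AVX512 body for whole 64-byte chunks and hand the remainder, together with the running CRC, to the software fallback, so splitting the input at arbitrary points must not change the result.

#include <aws/checksums/crc.h> /* assumed location of the aws_checksums_crc32() declaration */

#include <assert.h>
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    /* Known-answer test: CRC-32 of "123456789" is 0xCBF43926. */
    const uint8_t check[] = "123456789";
    assert(aws_checksums_crc32(check, 9, 0) == 0xCBF43926);

    /* Large enough to take the avx512 path, with a tail that is not a multiple of 64
     * so the software fallback handles the last bytes. */
    uint8_t buf[1024 + 37];
    for (size_t i = 0; i < sizeof(buf); ++i) {
        buf[i] = (uint8_t)(i * 131u + 7u);
    }

    uint32_t one_shot = aws_checksums_crc32(buf, (int)sizeof(buf), 0);

    /* Chaining through previousCrc32 across arbitrary split points must match the
     * one-shot result, regardless of which implementation handles each piece. */
    const int splits[] = {1, 63, 64, 255, 256, 257, 512, 1000};
    for (size_t s = 0; s < sizeof(splits) / sizeof(splits[0]); ++s) {
        uint32_t crc = aws_checksums_crc32(buf, splits[s], 0);
        crc = aws_checksums_crc32(buf + splits[s], (int)sizeof(buf) - splits[s], crc);
        assert(crc == one_shot);
    }

    printf("crc32 consistency checks passed: 0x%08" PRIX32 "\n", one_shot);
    return 0;
}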