diff --git a/include/aws/checksums/private/intel/crc32c_compiler_shims.h b/include/aws/checksums/private/intel/crc32c_compiler_shims.h
index 21002de..b321757 100644
--- a/include/aws/checksums/private/intel/crc32c_compiler_shims.h
+++ b/include/aws/checksums/private/intel/crc32c_compiler_shims.h
@@ -20,6 +20,7 @@ typedef uint32_t slice_ptr_int_type;
 
 #ifdef AWS_HAVE_AVX512_INTRINSICS
 uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t crc);
+uint32_t aws_checksums_crc32_avx512(const uint8_t *input, int length, uint32_t crc);
 #endif
 
 uint32_t aws_checksums_crc32c_sse42(const uint8_t *input, int length, uint32_t crc);
diff --git a/source/crc.c b/source/crc.c
index f5d3e80..de86084 100644
--- a/source/crc.c
+++ b/source/crc.c
@@ -12,7 +12,7 @@ static uint32_t (*s_crc32_fn_ptr)(const uint8_t *input, int length, uint32_t pre
 
 uint32_t aws_checksums_crc32(const uint8_t *input, int length, uint32_t previousCrc32) {
     if (AWS_UNLIKELY(!s_crc32_fn_ptr)) {
-        if (aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_CRC)) {
+        if (aws_cpu_has_feature(AWS_CPU_FEATURE_AVX512) || aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_CRC)) {
             s_crc32_fn_ptr = aws_checksums_crc32_hw;
         } else {
             s_crc32_fn_ptr = aws_checksums_crc32_sw;
diff --git a/source/intel/crc_hw.c b/source/intel/crc_hw.c
index d571cc0..ba50816 100644
--- a/source/intel/crc_hw.c
+++ b/source/intel/crc_hw.c
@@ -97,5 +97,37 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev
 }
 
 uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
-    return aws_checksums_crc32_sw(input, length, previousCrc32);
+    uint32_t crc = previousCrc32;
+
+    if (AWS_UNLIKELY(!detection_performed)) {
+        detected_avx512 = aws_cpu_has_feature(AWS_CPU_FEATURE_AVX512);
+        detected_vpclmulqdq = aws_cpu_has_feature(AWS_CPU_FEATURE_VPCLMULQDQ);
+        detection_performed = true;
+    }
+
+#if defined(AWS_HAVE_AVX512_INTRINSICS) && (INTPTR_MAX == INT64_MAX)
+    int chunk_size = length & ~63;
+
+    if (detected_avx512 && detected_vpclmulqdq) {
+        if (length >= 256) {
+            crc = aws_checksums_crc32_avx512(input, length, ~crc);
+            /*
+             * aws_checksums_crc32_avx512 works on the raw (bit-flipped) crc state, while the
+             * fallback below expects a non-inverted crc and does its own flipping, so flip back here.
+             */
+            crc = ~crc;
+
+            /* check remaining data */
+            length -= chunk_size;
+            if (!length) {
+                return crc;
+            }
+
+            /* Fall into the default crc32 for the remaining data. */
+            input += chunk_size;
+        }
+    }
+#endif
+    crc = aws_checksums_crc32_sw(input, length, crc);
+    return crc;
 }
@@ -150,3 +150,147 @@ uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t
     val = _mm_crc32_u64(0, _mm_extract_epi64(a1, 0));
     return (uint32_t) _mm_crc32_u64(val, _mm_extract_epi64(a1, 1));
 }
+
+
+uint32_t aws_checksums_crc32_avx512(const uint8_t *input, int length, uint32_t previous_crc) {
+    AWS_ASSERT(
+        length >= 256 && "invariant violated. length must be greater than 255 bytes to use avx512 to compute crc.");
+
+    uint32_t crc = previous_crc;
+
+    static const uint64_t k1k2[] = {
+        0x011542778a, 0x01322d1430, 0x011542778a, 0x01322d1430, 0x011542778a, 0x01322d1430, 0x011542778a, 0x01322d1430};
+    static const uint64_t k3k4[] = {
+        0x0154442bd4, 0x01c6e41596, 0x0154442bd4, 0x01c6e41596, 0x0154442bd4, 0x01c6e41596, 0x0154442bd4, 0x01c6e41596};
+    static const uint64_t k9k10[8] = {
+        0x33fff533, 0x910eeec1, 0x33fff533, 0x910eeec1, 0x33fff533, 0x910eeec1, 0x33fff533, 0x910eeec1};
+    static const uint64_t k1k4[8] = {
+        0x3db1ecdc, 0xaf449247, 0xf1da05aa, 0x81256527, 0xae689191, 0xccaa009e, 0x00000000, 0x00000000};
+    static const uint64_t k5k6[] = {0x01751997d0, 0x00ccaa009e};
+    static const uint64_t k7k8[] = {0x0163cd6124, 0x0000000000};
+    static const uint64_t poly[] = {0x01db710641, 0x01f7011641};
+
+    __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
+    __m128i a0, a1, a2, a3;
+
+    /*
+     * There's at least one block of 256.
+     */
+    x1 = _mm512_loadu_si512((__m512i *)(input + 0x00));
+    x2 = _mm512_loadu_si512((__m512i *)(input + 0x40));
+    x3 = _mm512_loadu_si512((__m512i *)(input + 0x80));
+    x4 = _mm512_loadu_si512((__m512i *)(input + 0xC0));
+
+    x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc)));
+
+    x0 = _mm512_load_si512((__m512i *)k1k2);
+
+    input += 256;
+    length -= 256;
+
+    /*
+     * Parallel fold blocks of 256, if any.
+     */
+    while (length >= 256) {
+        x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
+        x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00);
+        x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00);
+        x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00);
+
+        x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
+        x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11);
+        x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11);
+        x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11);
+
+        y5 = _mm512_loadu_si512((__m512i *)(input + 0x00));
+        y6 = _mm512_loadu_si512((__m512i *)(input + 0x40));
+        y7 = _mm512_loadu_si512((__m512i *)(input + 0x80));
+        y8 = _mm512_loadu_si512((__m512i *)(input + 0xC0));
+
+        x1 = _mm512_ternarylogic_epi64(x1, x5, y5, 0x96);
+        x2 = _mm512_ternarylogic_epi64(x2, x6, y6, 0x96);
+        x3 = _mm512_ternarylogic_epi64(x3, x7, y7, 0x96);
+        x4 = _mm512_ternarylogic_epi64(x4, x8, y8, 0x96);
+
+        input += 256;
+        length -= 256;
+    }
+
+    /*
+     * Fold 256 bytes into 64 bytes.
+     */
+    x0 = _mm512_load_si512((__m512i *)k9k10);
+    x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
+    x6 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
+    x3 = _mm512_ternarylogic_epi64(x3, x5, x6, 0x96);
+
+    x7 = _mm512_clmulepi64_epi128(x2, x0, 0x00);
+    x8 = _mm512_clmulepi64_epi128(x2, x0, 0x11);
+    x4 = _mm512_ternarylogic_epi64(x4, x7, x8, 0x96);
+
+    x0 = _mm512_load_si512((__m512i *)k3k4);
+    y5 = _mm512_clmulepi64_epi128(x3, x0, 0x00);
+    y6 = _mm512_clmulepi64_epi128(x3, x0, 0x11);
+    x1 = _mm512_ternarylogic_epi64(x4, y5, y6, 0x96);
+
+    /*
+     * Single fold blocks of 64, if any.
+     */
+    while (length >= 64) {
+        x2 = _mm512_loadu_si512((__m512i *)input);
+
+        x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
+        x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
+        x1 = _mm512_ternarylogic_epi64(x1, x2, x5, 0x96);
+
+        input += 64;
+        length -= 64;
+    }
+
+    /*
+     * Fold 512-bits to 128-bits.
+     */
+    x0 = _mm512_loadu_si512((__m512i *)k1k4);
+
+    a2 = _mm512_extracti32x4_epi32(x1, 3);
+    x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
+    x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
+    x1 = _mm512_ternarylogic_epi64(x1, x5, _mm512_castsi128_si512(a2), 0x96);
+
+    x0 = _mm512_shuffle_i64x2(x1, x1, 0x4E);
+    x0 = _mm512_xor_epi64(x1, x0);
+    a1 = _mm512_extracti32x4_epi32(x0, 1);
+    a1 = _mm_xor_epi64(a1, _mm512_castsi512_si128(x0));
+
+    /*
+     * Fold 128-bits to 64-bits.
+     */
+    a0 = _mm_load_si128((__m128i *)k5k6);
+    a2 = _mm_clmulepi64_si128(a1, a0, 0x10);
+    a3 = _mm_setr_epi32(~0, 0, ~0, 0);
+    a1 = _mm_srli_si128(a1, 8);
+    a1 = _mm_xor_si128(a1, a2);
+
+    a0 = _mm_loadl_epi64((__m128i *)k7k8);
+    a2 = _mm_srli_si128(a1, 4);
+    a1 = _mm_and_si128(a1, a3);
+    a1 = _mm_clmulepi64_si128(a1, a0, 0x00);
+    a1 = _mm_xor_si128(a1, a2);
+
+    /*
+     * Barrett reduce to 32-bits.
+     */
+    a0 = _mm_load_si128((__m128i *)poly);
+
+    a2 = _mm_and_si128(a1, a3);
+    a2 = _mm_clmulepi64_si128(a2, a0, 0x10);
+    a2 = _mm_and_si128(a2, a3);
+    a2 = _mm_clmulepi64_si128(a2, a0, 0x00);
+    a1 = _mm_xor_si128(a1, a2);
+
+    /*
+     * Return the crc32.
+     */
+    crc = (uint32_t)_mm_extract_epi32(a1, 1);
+    return crc;
+}
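
Note on the ~crc handoff in aws_checksums_crc32_hw: as the diff shows, aws_checksums_crc32_avx512 XORs the value it is given straight into the first data block and returns the Barrett-reduced value without a final complement, i.e. it operates on the raw CRC register state, while the public API and aws_checksums_crc32_sw exchange the conventional non-inverted CRC-32 value. The reference routine below is only an illustration of that convention (it is not part of the change) and makes the two complement operations explicit; the function name crc32_reference is hypothetical.

#include <stddef.h>
#include <stdint.h>

/*
 * Illustrative bit-at-a-time CRC-32 (reflected polynomial 0xEDB88320). Callers pass and
 * receive the non-inverted value, while the internal register state is its bitwise
 * complement. This mirrors why aws_checksums_crc32_hw passes ~crc into the avx512
 * routine (raw state in) and complements its result (reported value out) before handing
 * the remaining bytes to aws_checksums_crc32_sw.
 */
static uint32_t crc32_reference(const uint8_t *input, size_t length, uint32_t previous_crc) {
    uint32_t state = ~previous_crc; /* reported value -> raw register state */
    for (size_t i = 0; i < length; ++i) {
        state ^= input[i];
        for (int bit = 0; bit < 8; ++bit) {
            state = (state & 1u) ? (state >> 1) ^ 0xEDB88320u : (state >> 1);
        }
    }
    return ~state; /* raw register state -> reported value */
}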
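
A minimal consistency check that could accompany this change, assuming the public header aws/checksums/crc.h declares aws_checksums_crc32 with the signature used in source/crc.c above. It relies only on behavior visible in the diff: inputs of at least 256 bytes take the AVX512 body for whole 64-byte chunks and hand the remainder, together with the running CRC, to the software fallback, so splitting the input at arbitrary points must not change the result.

#include <aws/checksums/crc.h> /* assumed location of the aws_checksums_crc32() declaration */

#include <assert.h>
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    /* Known-answer test: CRC-32 of "123456789" is 0xCBF43926. */
    const uint8_t check[] = "123456789";
    assert(aws_checksums_crc32(check, 9, 0) == 0xCBF43926);

    /* Large enough to take the avx512 path, with a tail that is not a multiple of 64
     * so the software fallback handles the last bytes. */
    uint8_t buf[1024 + 37];
    for (size_t i = 0; i < sizeof(buf); ++i) {
        buf[i] = (uint8_t)(i * 131u + 7u);
    }

    uint32_t one_shot = aws_checksums_crc32(buf, (int)sizeof(buf), 0);

    /* Chaining through previousCrc32 across arbitrary split points must match the
     * one-shot result, regardless of which implementation handles each piece. */
    const int splits[] = {1, 63, 64, 255, 256, 257, 512, 1000};
    for (size_t s = 0; s < sizeof(splits) / sizeof(splits[0]); ++s) {
        uint32_t crc = aws_checksums_crc32(buf, splits[s], 0);
        crc = aws_checksums_crc32(buf + splits[s], (int)sizeof(buf) - splits[s], crc);
        assert(crc == one_shot);
    }

    printf("crc32 consistency checks passed: 0x%08" PRIX32 "\n", one_shot);
    return 0;
}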