/* crc32_simd.c * * Copyright 2017 The Chromium Authors. All rights reserved. * Use of this source code is governed by a BSD-style license that can be * found in the Chromium source repository LICENSE file. */ #include "crc32_simd.h" #if defined(CRC32_SIMD_SSE42_PCLMUL) /* * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer * length must be at least 64, and a multiple of 16. Based on: * * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" * V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0 */ #include #include #include uint32_t ZLIB_INTERNAL crc32_sse42_simd_( /* SSE4.2+PCLMUL */ const unsigned char *buf, z_size_t len, uint32_t crc) { /* * Definitions of the bit-reflected domain constants k1,k2,k3, etc and * the CRC32+Barrett polynomials given at the end of the paper. */ static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 }; static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e }; static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 }; static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 }; __m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8; /* * There's at least one block of 64. */ x1 = _mm_loadu_si128((__m128i *)(buf + 0x00)); x2 = _mm_loadu_si128((__m128i *)(buf + 0x10)); x3 = _mm_loadu_si128((__m128i *)(buf + 0x20)); x4 = _mm_loadu_si128((__m128i *)(buf + 0x30)); x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc)); x0 = _mm_load_si128((__m128i *)k1k2); buf += 64; len -= 64; /* * Parallel fold blocks of 64, if any. */ while (len >= 64) { x5 = _mm_clmulepi64_si128(x1, x0, 0x00); x6 = _mm_clmulepi64_si128(x2, x0, 0x00); x7 = _mm_clmulepi64_si128(x3, x0, 0x00); x8 = _mm_clmulepi64_si128(x4, x0, 0x00); x1 = _mm_clmulepi64_si128(x1, x0, 0x11); x2 = _mm_clmulepi64_si128(x2, x0, 0x11); x3 = _mm_clmulepi64_si128(x3, x0, 0x11); x4 = _mm_clmulepi64_si128(x4, x0, 0x11); y5 = _mm_loadu_si128((__m128i *)(buf + 0x00)); y6 = _mm_loadu_si128((__m128i *)(buf + 0x10)); y7 = _mm_loadu_si128((__m128i *)(buf + 0x20)); y8 = _mm_loadu_si128((__m128i *)(buf + 0x30)); x1 = _mm_xor_si128(x1, x5); x2 = _mm_xor_si128(x2, x6); x3 = _mm_xor_si128(x3, x7); x4 = _mm_xor_si128(x4, x8); x1 = _mm_xor_si128(x1, y5); x2 = _mm_xor_si128(x2, y6); x3 = _mm_xor_si128(x3, y7); x4 = _mm_xor_si128(x4, y8); buf += 64; len -= 64; } /* * Fold into 128-bits. */ x0 = _mm_load_si128((__m128i *)k3k4); x5 = _mm_clmulepi64_si128(x1, x0, 0x00); x1 = _mm_clmulepi64_si128(x1, x0, 0x11); x1 = _mm_xor_si128(x1, x2); x1 = _mm_xor_si128(x1, x5); x5 = _mm_clmulepi64_si128(x1, x0, 0x00); x1 = _mm_clmulepi64_si128(x1, x0, 0x11); x1 = _mm_xor_si128(x1, x3); x1 = _mm_xor_si128(x1, x5); x5 = _mm_clmulepi64_si128(x1, x0, 0x00); x1 = _mm_clmulepi64_si128(x1, x0, 0x11); x1 = _mm_xor_si128(x1, x4); x1 = _mm_xor_si128(x1, x5); /* * Single fold blocks of 16, if any. */ while (len >= 16) { x2 = _mm_loadu_si128((__m128i *)buf); x5 = _mm_clmulepi64_si128(x1, x0, 0x00); x1 = _mm_clmulepi64_si128(x1, x0, 0x11); x1 = _mm_xor_si128(x1, x2); x1 = _mm_xor_si128(x1, x5); buf += 16; len -= 16; } /* * Fold 128-bits to 64-bits. */ x2 = _mm_clmulepi64_si128(x1, x0, 0x10); x3 = _mm_setr_epi32(~0, 0, ~0, 0); x1 = _mm_srli_si128(x1, 8); x1 = _mm_xor_si128(x1, x2); x0 = _mm_loadl_epi64((__m128i*)k5k0); x2 = _mm_srli_si128(x1, 4); x1 = _mm_and_si128(x1, x3); x1 = _mm_clmulepi64_si128(x1, x0, 0x00); x1 = _mm_xor_si128(x1, x2); /* * Barret reduce to 32-bits. */ x0 = _mm_load_si128((__m128i*)poly); x2 = _mm_and_si128(x1, x3); x2 = _mm_clmulepi64_si128(x2, x0, 0x10); x2 = _mm_and_si128(x2, x3); x2 = _mm_clmulepi64_si128(x2, x0, 0x00); x1 = _mm_xor_si128(x1, x2); /* * Return the crc32. */ return _mm_extract_epi32(x1, 1); } #elif defined(CRC32_ARMV8_CRC32) /* CRC32 checksums using ARMv8-a crypto instructions. * * TODO: implement a version using the PMULL instruction. */ /* CRC32 intrinsics are #ifdef'ed out of arm_acle.h unless we build with an * armv8 target, which is incompatible with ThinLTO optimizations on Android. * (Namely, mixing and matching different module-level targets makes ThinLTO * warn, and Android defaults to armv7-a. This restriction does not apply to * function-level `target`s, however.) * * Since we only need three crc intrinsics, and since clang's implementation of * those are just wrappers around compiler builtins, it's simplest to #define * those builtins directly. If this #define list grows too much (or we depend on * an intrinsic that isn't a trivial wrapper), we may have to find a better way * to go about this. * * NOTE: clang currently complains that "'+soft-float-abi' is not a recognized * feature for this target (ignoring feature)." This appears to be a harmless * bug in clang. */ #define __crc32b __builtin_arm_crc32b #define __crc32d __builtin_arm_crc32d #define __crc32w __builtin_arm_crc32w #if defined(__aarch64__) #define TARGET_ARMV8_WITH_CRC __attribute__((target("crc"))) #else #define TARGET_ARMV8_WITH_CRC __attribute__((target("armv8-a,crc"))) #endif TARGET_ARMV8_WITH_CRC uint32_t ZLIB_INTERNAL armv8_crc32_little(unsigned long crc, const unsigned char *buf, z_size_t len) { uint32_t c = (uint32_t) ~crc; while (len && ((uintptr_t)buf & 7)) { c = __crc32b(c, *buf++); --len; } const uint64_t *buf8 = (const uint64_t *)buf; while (len >= 64) { c = __crc32d(c, *buf8++); c = __crc32d(c, *buf8++); c = __crc32d(c, *buf8++); c = __crc32d(c, *buf8++); c = __crc32d(c, *buf8++); c = __crc32d(c, *buf8++); c = __crc32d(c, *buf8++); c = __crc32d(c, *buf8++); len -= 64; } while (len >= 8) { c = __crc32d(c, *buf8++); len -= 8; } buf = (const unsigned char *)buf8; while (len--) { c = __crc32b(c, *buf++); } return ~c; } TARGET_ARMV8_WITH_CRC Pos ZLIB_INTERNAL insert_string_arm(deflate_state *const s, const Pos str) { Pos ret; unsigned *ip, val, h = 0; ip = (unsigned *)&s->window[str]; val = *ip; if (s->level >= 6) val &= 0xFFFFFF; h = __crc32w(h, val); ret = s->head[h & s->hash_mask]; s->head[h & s->hash_mask] = str; s->prev[str & s->w_mask] = ret; return ret; } #endif