naiveproxy/third_party/zlib/crc32_simd.c

/* crc32_simd.c
 *
 * Copyright 2017 The Chromium Authors. All rights reserved.
 * Use of this source code is governed by a BSD-style license that can be
 * found in the Chromium source repository LICENSE file.
 */

#include "crc32_simd.h"

#include <string.h>

#if defined(CRC32_SIMD_SSE42_PCLMUL)

/*
 * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer
 * length must be at least 64, and a multiple of 16. Based on:
 *
 * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
 *  V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
 */

#include <emmintrin.h>
#include <smmintrin.h>
#include <wmmintrin.h>

/*
 * Fold `len` bytes at `buf` into a 32-bit CRC using carry-less multiplies.
 *
 *   buf  - input bytes; read with unaligned loads, so no alignment is needed.
 *   len  - byte count. The first 64 bytes are loaded unconditionally and the
 *          remainder is consumed only in whole 16-byte steps, so callers must
 *          pass len >= 64 and a multiple of 16 (any tail < 16 is never read).
 *   crc  - starting value; XORed into the low 32 bits of the first 16 bytes.
 *
 * Returns the 32-bit lane at index 1 of the final reduced register.
 *
 * NOTE(review): no bit inversion of `crc` or of the result happens in this
 * function; presumably the caller applies it -- confirm at the call site.
 */
uint32_t ZLIB_INTERNAL crc32_sse42_simd_(  /* SSE4.2+PCLMUL */
    const unsigned char *buf,
    z_size_t len,
    uint32_t crc)
{
    /*
     * Definitions of the bit-reflected domain constants k1,k2,k3, etc and
     * the CRC32+Barrett polynomials given at the end of the paper.
     *
     * Each table is fetched with an aligned 128-bit load below, which is
     * why the arrays carry zalign(16) (presumably a 16-byte alignment
     * attribute -- confirm in crc32_simd.h).
     */
    static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 };
    static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e };
    static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 };
    static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };

    /* x0 holds the current constant pair; x1..x4 are the four accumulators. */
    __m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;

    /*
     * There's at least one block of 64.
     */
    x1 = _mm_loadu_si128((__m128i *)(buf + 0x00));
    x2 = _mm_loadu_si128((__m128i *)(buf + 0x10));
    x3 = _mm_loadu_si128((__m128i *)(buf + 0x20));
    x4 = _mm_loadu_si128((__m128i *)(buf + 0x30));

    /* Seed: mix the incoming crc into the low 32 bits of the first block. */
    x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc));

    x0 = _mm_load_si128((__m128i *)k1k2);

    buf += 64;
    len -= 64;

    /*
     * Parallel fold blocks of 64, if any.
     */
    while (len >= 64)
    {
        /* imm 0x00: low half of each accumulator times the low constant. */
        x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
        x6 = _mm_clmulepi64_si128(x2, x0, 0x00);
        x7 = _mm_clmulepi64_si128(x3, x0, 0x00);
        x8 = _mm_clmulepi64_si128(x4, x0, 0x00);

        /* imm 0x11: high half of each accumulator times the high constant. */
        x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
        x2 = _mm_clmulepi64_si128(x2, x0, 0x11);
        x3 = _mm_clmulepi64_si128(x3, x0, 0x11);
        x4 = _mm_clmulepi64_si128(x4, x0, 0x11);

        /* Next 64 input bytes, one 16-byte lane per accumulator. */
        y5 = _mm_loadu_si128((__m128i *)(buf + 0x00));
        y6 = _mm_loadu_si128((__m128i *)(buf + 0x10));
        y7 = _mm_loadu_si128((__m128i *)(buf + 0x20));
        y8 = _mm_loadu_si128((__m128i *)(buf + 0x30));

        /* Combine both partial products ... */
        x1 = _mm_xor_si128(x1, x5);
        x2 = _mm_xor_si128(x2, x6);
        x3 = _mm_xor_si128(x3, x7);
        x4 = _mm_xor_si128(x4, x8);

        /* ... and fold in the freshly loaded data. */
        x1 = _mm_xor_si128(x1, y5);
        x2 = _mm_xor_si128(x2, y6);
        x3 = _mm_xor_si128(x3, y7);
        x4 = _mm_xor_si128(x4, y8);

        buf += 64;
        len -= 64;
    }

    /*
     * Fold into 128-bits.
     *
     * x1 is folded forward with the k3k4 pair and merged with x2, then x3,
     * then x4, leaving a single 128-bit accumulator in x1.
     */
    x0 = _mm_load_si128((__m128i *)k3k4);

    x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
    x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
    x1 = _mm_xor_si128(x1, x2);
    x1 = _mm_xor_si128(x1, x5);

    x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
    x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
    x1 = _mm_xor_si128(x1, x3);
    x1 = _mm_xor_si128(x1, x5);

    x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
    x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
    x1 = _mm_xor_si128(x1, x4);
    x1 = _mm_xor_si128(x1, x5);

    /*
     * Single fold blocks of 16, if any.
     *
     * Same fold step as above (x0 still holds k3k4), applied to each
     * remaining whole 16-byte block.
     */
    while (len >= 16)
    {
        x2 = _mm_loadu_si128((__m128i *)buf);

        x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
        x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
        x1 = _mm_xor_si128(x1, x2);
        x1 = _mm_xor_si128(x1, x5);

        buf += 16;
        len -= 16;
    }

    /*
     * Fold 128-bits to 64-bits.
     */
    /* imm 0x10: low half of x1 times the high constant of k3k4. */
    x2 = _mm_clmulepi64_si128(x1, x0, 0x10);
    /* Mask that keeps the low 32 bits of each 64-bit half. */
    x3 = _mm_setr_epi32(~0, 0, ~0, 0);
    /* Byte shift: move the high 64 bits of x1 down into the low half. */
    x1 = _mm_srli_si128(x1, 8);
    x1 = _mm_xor_si128(x1, x2);

    /* Only the low 64 bits (k5) are loaded; the upper half of x0 is zero. */
    x0 = _mm_loadl_epi64((__m128i*)k5k0);

    x2 = _mm_srli_si128(x1, 4);
    x1 = _mm_and_si128(x1, x3);
    x1 = _mm_clmulepi64_si128(x1, x0, 0x00);
    x1 = _mm_xor_si128(x1, x2);

    /*
     * Barrett reduce to 32-bits.
     *
     * Two masked multiplies: first by the high constant of poly[], then by
     * the low constant, with the product XORed back into x1.
     */
    x0 = _mm_load_si128((__m128i*)poly);

    x2 = _mm_and_si128(x1, x3);
    x2 = _mm_clmulepi64_si128(x2, x0, 0x10);
    x2 = _mm_and_si128(x2, x3);
    x2 = _mm_clmulepi64_si128(x2, x0, 0x00);
    x1 = _mm_xor_si128(x1, x2);

    /*
     * Return the crc32.
     */
    return _mm_extract_epi32(x1, 1);
}

#elif defined(CRC32_ARMV8_CRC32)

/* CRC32 checksums using ARMv8-a crypto instructions.
 *
 * TODO: implement a version using the PMULL instruction.
 */

/* CRC32 intrinsics are #ifdef'ed out of arm_acle.h unless we build with an
 * armv8 target, which is incompatible with ThinLTO optimizations on Android.
 * (Namely, mixing and matching different module-level targets makes ThinLTO
 * warn, and Android defaults to armv7-a. This restriction does not apply to
 * function-level `target`s, however.)
 *
 * Since we only need three crc intrinsics, and since clang's implementation of
 * those are just wrappers around compiler builtins, it's simplest to #define
 * those builtins directly. If this #define list grows too much (or we depend on
 * an intrinsic that isn't a trivial wrapper), we may have to find a better way
 * to go about this.
 *
 * NOTE: clang currently complains that "'+soft-float-abi' is not a recognized
 * feature for this target (ignoring feature)." This appears to be a harmless
 * bug in clang.
 */
/* Spell the three CRC intrinsics used in this file as the compiler builtins
 * directly, instead of pulling them in from arm_acle.h. */
#define __crc32b __builtin_arm_crc32b
#define __crc32d __builtin_arm_crc32d
#define __crc32w __builtin_arm_crc32w

/* Function-level target attribute applied to every routine below that uses
 * the CRC builtins. AArch64 builds only add the "crc" feature; other ARM
 * builds also select the armv8-a architecture for the annotated function. */
#if defined(__aarch64__)
#define TARGET_ARMV8_WITH_CRC __attribute__((target("crc")))
#else
#define TARGET_ARMV8_WITH_CRC __attribute__((target("armv8-a,crc")))
#endif

/*
 * armv8_crc32_little(): CRC-32 of `len` bytes at `buf` using the ARMv8 CRC
 * instructions. The incoming `crc` is bit-inverted (and truncated to 32 bits)
 * before processing and the accumulator is inverted again on return.
 *
 * The input is consumed in four phases: single bytes until `buf` is 8-byte
 * aligned, 64-byte chunks (eight 64-bit words), remaining 64-bit words, and
 * finally any trailing bytes.
 */
TARGET_ARMV8_WITH_CRC
uint32_t ZLIB_INTERNAL armv8_crc32_little(unsigned long crc,
                                          const unsigned char *buf,
                                          z_size_t len)
{
    uint32_t acc = (uint32_t) ~crc;
    const uint64_t *words;

    /* Byte-at-a-time until the pointer is 8-byte aligned or input runs out. */
    for (; len != 0 && ((uintptr_t)buf & 7) != 0; --len) {
        acc = __crc32b(acc, *buf++);
    }

    words = (const uint64_t *)buf;

    /* Unrolled main loop: eight 64-bit words (64 bytes) per iteration. */
    for (; len >= 64; len -= 64, words += 8) {
        acc = __crc32d(acc, words[0]);
        acc = __crc32d(acc, words[1]);
        acc = __crc32d(acc, words[2]);
        acc = __crc32d(acc, words[3]);

        acc = __crc32d(acc, words[4]);
        acc = __crc32d(acc, words[5]);
        acc = __crc32d(acc, words[6]);
        acc = __crc32d(acc, words[7]);
    }

    /* Leftover whole 64-bit words. */
    for (; len >= 8; len -= 8) {
        acc = __crc32d(acc, *words++);
    }

    /* Trailing bytes (fewer than 8). */
    buf = (const unsigned char *)words;
    for (; len != 0; --len) {
        acc = __crc32b(acc, *buf++);
    }

    return ~acc;
}

/*
 * insert_string_arm(): hash the 32-bit word found at s->window[str] with the
 * CRC32 word instruction and link position `str` into the hash chains.
 *
 *   s    - deflate state; reads window, level, hash_mask and w_mask, and
 *          writes head[] and prev[].
 *   str  - position in s->window to insert.
 *
 * Returns the previous head of the hash bucket, which is also stored in
 * s->prev[str & s->w_mask].
 */
TARGET_ARMV8_WITH_CRC
Pos ZLIB_INTERNAL insert_string_arm(deflate_state *const s, const Pos str)
{
    Pos ret;
    unsigned val, h = 0, slot;

    /*
     * Nothing guarantees that &s->window[str] is suitably aligned for an
     * `unsigned`, and reading it through a casted pointer also breaks the
     * aliasing rules. Copy the bytes out instead; compilers turn this
     * fixed-size memcpy into a single load where that is legal.
     */
    memcpy(&val, &s->window[str], sizeof(val));

    /* From level 6 upward only the low 24 bits of the word are hashed. */
    if (s->level >= 6)
        val &= 0xFFFFFF;

    h = __crc32w(h, val);
    slot = h & s->hash_mask;

    ret = s->head[slot];
    s->head[slot] = str;
    s->prev[str & s->w_mask] = ret;
    return ret;
}


#endif
Import chromium-70.0.3538.110 2018-12-10 05:59:24 +03:00
			`/* crc32_simd.c`
			`*`
			`* Copyright 2017 The Chromium Authors. All rights reserved.`
			`* Use of this source code is governed by a BSD-style license that can be`
			`* found in the Chromium source repository LICENSE file.`
			`*/`

			`#include "crc32_simd.h"`

			`#if defined(CRC32_SIMD_SSE42_PCLMUL)`

			`/*`
			`* crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer`
			`* length must be at least 64, and a multiple of 16. Based on:`
			`*`
			`* "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"`
			`* V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0`
			`*/`

			`#include <emmintrin.h>`
			`#include <smmintrin.h>`
			`#include <wmmintrin.h>`

			`uint32_t ZLIB_INTERNAL crc32_sse42_simd_( /* SSE4.2+PCLMUL */`
			`const unsigned char *buf,`
			`z_size_t len,`
			`uint32_t crc)`
			`{`
			`/*`
			`* Definitions of the bit-reflected domain constants k1,k2,k3, etc and`
			`* the CRC32+Barrett polynomials given at the end of the paper.`
			`*/`
			`static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 };`
			`static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e };`
			`static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 };`
			`static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };`

			`__m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;`

			`/*`
			`* There's at least one block of 64.`
			`*/`
			`x1 = _mm_loadu_si128((__m128i *)(buf + 0x00));`
			`x2 = _mm_loadu_si128((__m128i *)(buf + 0x10));`
			`x3 = _mm_loadu_si128((__m128i *)(buf + 0x20));`
			`x4 = _mm_loadu_si128((__m128i *)(buf + 0x30));`

			`x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc));`

			`x0 = _mm_load_si128((__m128i *)k1k2);`

			`buf += 64;`
			`len -= 64;`

			`/*`
			`* Parallel fold blocks of 64, if any.`
			`*/`
			`while (len >= 64)`
			`{`
			`x5 = _mm_clmulepi64_si128(x1, x0, 0x00);`
			`x6 = _mm_clmulepi64_si128(x2, x0, 0x00);`
			`x7 = _mm_clmulepi64_si128(x3, x0, 0x00);`
			`x8 = _mm_clmulepi64_si128(x4, x0, 0x00);`

			`x1 = _mm_clmulepi64_si128(x1, x0, 0x11);`
			`x2 = _mm_clmulepi64_si128(x2, x0, 0x11);`
			`x3 = _mm_clmulepi64_si128(x3, x0, 0x11);`
			`x4 = _mm_clmulepi64_si128(x4, x0, 0x11);`

			`y5 = _mm_loadu_si128((__m128i *)(buf + 0x00));`
			`y6 = _mm_loadu_si128((__m128i *)(buf + 0x10));`
			`y7 = _mm_loadu_si128((__m128i *)(buf + 0x20));`
			`y8 = _mm_loadu_si128((__m128i *)(buf + 0x30));`

			`x1 = _mm_xor_si128(x1, x5);`
			`x2 = _mm_xor_si128(x2, x6);`
			`x3 = _mm_xor_si128(x3, x7);`
			`x4 = _mm_xor_si128(x4, x8);`

			`x1 = _mm_xor_si128(x1, y5);`
			`x2 = _mm_xor_si128(x2, y6);`
			`x3 = _mm_xor_si128(x3, y7);`
			`x4 = _mm_xor_si128(x4, y8);`

			`buf += 64;`
			`len -= 64;`
			`}`

			`/*`
			`* Fold into 128-bits.`
			`*/`
			`x0 = _mm_load_si128((__m128i *)k3k4);`

			`x5 = _mm_clmulepi64_si128(x1, x0, 0x00);`
			`x1 = _mm_clmulepi64_si128(x1, x0, 0x11);`
			`x1 = _mm_xor_si128(x1, x2);`
			`x1 = _mm_xor_si128(x1, x5);`

			`x5 = _mm_clmulepi64_si128(x1, x0, 0x00);`
			`x1 = _mm_clmulepi64_si128(x1, x0, 0x11);`
			`x1 = _mm_xor_si128(x1, x3);`
			`x1 = _mm_xor_si128(x1, x5);`

			`x5 = _mm_clmulepi64_si128(x1, x0, 0x00);`
			`x1 = _mm_clmulepi64_si128(x1, x0, 0x11);`
			`x1 = _mm_xor_si128(x1, x4);`
			`x1 = _mm_xor_si128(x1, x5);`

			`/*`
			`* Single fold blocks of 16, if any.`
			`*/`
			`while (len >= 16)`
			`{`
			`x2 = _mm_loadu_si128((__m128i *)buf);`

			`x5 = _mm_clmulepi64_si128(x1, x0, 0x00);`
			`x1 = _mm_clmulepi64_si128(x1, x0, 0x11);`
			`x1 = _mm_xor_si128(x1, x2);`
			`x1 = _mm_xor_si128(x1, x5);`

			`buf += 16;`
			`len -= 16;`
			`}`

			`/*`
			`* Fold 128-bits to 64-bits.`
			`*/`
			`x2 = _mm_clmulepi64_si128(x1, x0, 0x10);`
			`x3 = _mm_setr_epi32(~0, 0, ~0, 0);`
			`x1 = _mm_srli_si128(x1, 8);`
			`x1 = _mm_xor_si128(x1, x2);`

			`x0 = _mm_loadl_epi64((__m128i*)k5k0);`

			`x2 = _mm_srli_si128(x1, 4);`
			`x1 = _mm_and_si128(x1, x3);`
			`x1 = _mm_clmulepi64_si128(x1, x0, 0x00);`
			`x1 = _mm_xor_si128(x1, x2);`

			`/*`
			`* Barret reduce to 32-bits.`
			`*/`
			`x0 = _mm_load_si128((__m128i*)poly);`

			`x2 = _mm_and_si128(x1, x3);`
			`x2 = _mm_clmulepi64_si128(x2, x0, 0x10);`
			`x2 = _mm_and_si128(x2, x3);`
			`x2 = _mm_clmulepi64_si128(x2, x0, 0x00);`
			`x1 = _mm_xor_si128(x1, x2);`

			`/*`
			`* Return the crc32.`
			`*/`
			`return _mm_extract_epi32(x1, 1);`
			`}`

			`#elif defined(CRC32_ARMV8_CRC32)`

			`/* CRC32 checksums using ARMv8-a crypto instructions.`
			`*`
			`* TODO: implement a version using the PMULL instruction.`
			`*/`

			`/* CRC32 intrinsics are #ifdef'ed out of arm_acle.h unless we build with an`
			`* armv8 target, which is incompatible with ThinLTO optimizations on Android.`
			`* (Namely, mixing and matching different module-level targets makes ThinLTO`
			`* warn, and Android defaults to armv7-a. This restriction does not apply to`
			* function-level `target`s, however.)
			`*`
			`* Since we only need three crc intrinsics, and since clang's implementation of`
			`* those are just wrappers around compiler builtins, it's simplest to #define`
			`* those builtins directly. If this #define list grows too much (or we depend on`
			`* an intrinsic that isn't a trivial wrapper), we may have to find a better way`
			`* to go about this.`
			`*`
			`* NOTE: clang currently complains that "'+soft-float-abi' is not a recognized`
			`* feature for this target (ignoring feature)." This appears to be a harmless`
			`* bug in clang.`
			`*/`
			`#define __crc32b __builtin_arm_crc32b`
			`#define __crc32d __builtin_arm_crc32d`
			`#define __crc32w __builtin_arm_crc32w`

			`#if defined(__aarch64__)`
			`#define TARGET_ARMV8_WITH_CRC __attribute__((target("crc")))`
			`#else`
			`#define TARGET_ARMV8_WITH_CRC __attribute__((target("armv8-a,crc")))`
			`#endif`

			`TARGET_ARMV8_WITH_CRC`
			`uint32_t ZLIB_INTERNAL armv8_crc32_little(unsigned long crc,`
			`const unsigned char *buf,`
			`z_size_t len)`
			`{`
			`uint32_t c = (uint32_t) ~crc;`

			`while (len && ((uintptr_t)buf & 7)) {`
			`c = __crc32b(c, *buf++);`
			`--len;`
			`}`

			`const uint64_t *buf8 = (const uint64_t *)buf;`

			`while (len >= 64) {`
			`c = __crc32d(c, *buf8++);`
			`c = __crc32d(c, *buf8++);`
			`c = __crc32d(c, *buf8++);`
			`c = __crc32d(c, *buf8++);`

			`c = __crc32d(c, *buf8++);`
			`c = __crc32d(c, *buf8++);`
			`c = __crc32d(c, *buf8++);`
			`c = __crc32d(c, *buf8++);`
			`len -= 64;`
			`}`

			`while (len >= 8) {`
			`c = __crc32d(c, *buf8++);`
			`len -= 8;`
			`}`

			`buf = (const unsigned char *)buf8;`

			`while (len--) {`
			`c = __crc32b(c, *buf++);`
			`}`

			`return ~c;`
			`}`

			`TARGET_ARMV8_WITH_CRC`
			`Pos ZLIB_INTERNAL insert_string_arm(deflate_state *const s, const Pos str)`
			`{`
			`Pos ret;`
			`unsigned *ip, val, h = 0;`

			`ip = (unsigned *)&s->window[str];`
			`val = *ip;`

			`if (s->level >= 6)`
			`val &= 0xFFFFFF;`

			`h = __crc32w(h, val);`

			`ret = s->head[h & s->hash_mask];`
			`s->head[h & s->hash_mask] = str;`
			`s->prev[str & s->w_mask] = ret;`
			`return ret;`
			`}`


			`#endif`