mirror of
https://github.com/klzgrad/naiveproxy.git
synced 2024-11-24 22:36:09 +03:00
696 lines
21 KiB
Diff
696 lines
21 KiB
Diff
From 7b27bbdc07ed76de114bf9a82db854d2f2c8e85c Mon Sep 17 00:00:00 2001
|
|
From: Noel Gordon <noel@chromium.org>
|
|
Date: Fri, 29 Sep 2017 22:32:47 +1000
|
|
Subject: [PATCH] zlib adler32_simd.c
|
|
|
|
Add SSSE3 implementation of the adler32 checksum, suitable for
|
|
both large workloads, and small workloads commonly seen during
|
|
PNG image decoding. Add a NEON implementation.
|
|
|
|
Speed is comparable to the serial adler32 computation but near
|
|
64 bytes of input data, the SIMD code paths begin to be faster
|
|
than the serial path: 3x faster at 256 bytes of input data, to
|
|
~8x faster for 1M of input data (~4x on ARMv8 NEON).
|
|
|
|
For the PNG 140 image corpus, PNG decoding speed is ~8% faster
|
|
on average on the desktop machines tested, and ~2% on an ARMv8
|
|
Pixel C Android (N) tablet, https://crbug.com/762564#c41
|
|
|
|
Update x86.{c,h} to runtime detect SSSE3 support and use it to
|
|
enable the adler32_simd code path and update inflate.c to call
|
|
x86_check_features(). Update the name mangler file names.h for
|
|
the new symbols added, add FIXME about simd.patch.
|
|
|
|
Ignore data alignment in the SSSE3 case since unaligned access
|
|
is no longer penalized on current-generation Intel CPUs. Use it
|
|
in the NEON case however to avoid the extra costs of unaligned
|
|
memory access on ARMv8/v7.
|
|
|
|
NEON credits: the v_s1/s2 vector component accumulate code was
|
|
provided by Adenilson Cavalcanti. The uint16 column vector sum
|
|
code is from libdeflate with corrections to process NMAX input
|
|
bytes which improves performance by 3% for large buffers.
|
|
|
|
Update BUILD.gn to put the code in its own source set, and add
|
|
it conditionally to the zlib library build rule. On ARM, build
|
|
the SIMD code with the optimize_speed (-O3) config, which produces the smallest code.
|
|
|
|
No change in behavior, covered by many existing tests.
|
|
|
|
Bug: 762564
|
|
Change-Id: I14a39940ae113b5a67ba70a99c3741e289b1796b
|
|
---
|
|
third_party/zlib/BUILD.gn | 64 ++++++-
|
|
third_party/zlib/adler32.c | 15 ++
|
|
third_party/zlib/adler32_simd.c | 366 ++++++++++++++++++++++++++++++++++++++++
|
|
third_party/zlib/adler32_simd.h | 16 ++
|
|
third_party/zlib/inflate.c | 3 +
|
|
third_party/zlib/names.h | 7 +
|
|
third_party/zlib/x86.c | 13 +-
|
|
third_party/zlib/x86.h | 1 +
|
|
8 files changed, 481 insertions(+), 4 deletions(-)
|
|
create mode 100644 third_party/zlib/adler32_simd.c
|
|
create mode 100644 third_party/zlib/adler32_simd.h
|
|
|
|
diff --git a/third_party/zlib/BUILD.gn b/third_party/zlib/BUILD.gn
|
|
index 027a38cbcac3..11f57effb5ad 100644
|
|
--- a/third_party/zlib/BUILD.gn
|
|
+++ b/third_party/zlib/BUILD.gn
|
|
@@ -10,6 +10,53 @@ config("zlib_config") {
|
|
include_dirs = [ "." ]
|
|
}
|
|
|
|
+config("zlib_adler32_simd_config") {
|
|
+ if (!is_ios && (current_cpu == "x86" || current_cpu == "x64")) {
|
|
+ defines = [ "ADLER32_SIMD_SSSE3" ]
|
|
+ }
|
|
+
|
|
+ if (current_cpu == "arm" || current_cpu == "arm64") {
|
|
+ if (arm_use_neon) {
|
|
+ defines = [ "ADLER32_SIMD_NEON" ]
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+source_set("zlib_adler32_simd") {
|
|
+ visibility = [ ":*" ]
|
|
+
|
|
+ if (!is_ios && (current_cpu == "x86" || current_cpu == "x64")) {
|
|
+ sources = [
|
|
+ "adler32_simd.c",
|
|
+ "adler32_simd.h",
|
|
+ ]
|
|
+
|
|
+ if (!is_win || is_clang) {
|
|
+ cflags = [ "-mssse3" ]
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (current_cpu == "arm" || current_cpu == "arm64") {
|
|
+ if (arm_use_neon) {
|
|
+ sources = [
|
|
+ "adler32_simd.c",
|
|
+ "adler32_simd.h",
|
|
+ ]
|
|
+
|
|
+ if (!is_debug) {
|
|
+ # Use optimize_speed (-O3) to output the _smallest_ code.
|
|
+ configs -= [ "//build/config/compiler:default_optimization" ]
|
|
+ configs += [ "//build/config/compiler:optimize_speed" ]
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+
|
|
+ configs -= [ "//build/config/compiler:chromium_code" ]
|
|
+ configs += [ "//build/config/compiler:no_chromium_code" ]
|
|
+
|
|
+ public_configs = [ ":zlib_adler32_simd_config" ]
|
|
+}
|
|
+
|
|
static_library("zlib_x86_simd") {
|
|
if (!is_ios && (current_cpu == "x86" || current_cpu == "x64")) {
|
|
sources = [
|
|
@@ -90,8 +137,19 @@ static_library("zlib") {
|
|
}
|
|
}
|
|
|
|
+ defines = []
|
|
+ deps = []
|
|
+
|
|
if (!is_ios && (current_cpu == "x86" || current_cpu == "x64")) {
|
|
sources += [ "x86.c" ]
|
|
+
|
|
+ deps += [ ":zlib_adler32_simd" ]
|
|
+ }
|
|
+
|
|
+ if (current_cpu == "arm" || current_cpu == "arm64") {
|
|
+ if (arm_use_neon) {
|
|
+ deps += [ ":zlib_adler32_simd" ]
|
|
+ }
|
|
}
|
|
|
|
configs -= [ "//build/config/compiler:chromium_code" ]
|
|
@@ -103,8 +161,9 @@ static_library("zlib") {
|
|
]
|
|
|
|
public_configs = [ ":zlib_config" ]
|
|
- deps = [
|
|
- ":zlib_x86_simd",
|
|
+
|
|
+ deps += [
|
|
+ ":zlib_x86_simd"
|
|
]
|
|
}
|
|
|
|
@@ -151,5 +210,6 @@ static_library("minizip") {
|
|
# Must be after no_chromium_code for warning flags to be ordered correctly.
|
|
":minizip_warnings",
|
|
]
|
|
+
|
|
public_configs = [ ":zlib_config" ]
|
|
}
|
|
diff --git a/third_party/zlib/adler32.c b/third_party/zlib/adler32.c
|
|
index d0be4380a39c..03491b920d3e 100644
|
|
--- a/third_party/zlib/adler32.c
|
|
+++ b/third_party/zlib/adler32.c
|
|
@@ -59,6 +59,13 @@ local uLong adler32_combine_ OF((uLong adler1, uLong adler2, z_off64_t len2));
|
|
# define MOD63(a) a %= BASE
|
|
#endif
|
|
|
|
+#if defined(ADLER32_SIMD_SSSE3)
|
|
+#include "adler32_simd.h"
|
|
+#include "x86.h"
|
|
+#elif defined(ADLER32_SIMD_NEON)
|
|
+#include "adler32_simd.h"
|
|
+#endif
|
|
+
|
|
/* ========================================================================= */
|
|
uLong ZEXPORT adler32_z(adler, buf, len)
|
|
uLong adler;
|
|
@@ -68,6 +75,14 @@ uLong ZEXPORT adler32_z(adler, buf, len)
|
|
unsigned long sum2;
|
|
unsigned n;
|
|
|
|
+#if defined(ADLER32_SIMD_SSSE3)
|
|
+ if (x86_cpu_enable_ssse3 && buf && len >= 64)
|
|
+ return adler32_simd_(adler, buf, len);
|
|
+#elif defined(ADLER32_SIMD_NEON)
|
|
+ if (buf && len >= 64)
|
|
+ return adler32_simd_(adler, buf, len);
|
|
+#endif
|
|
+
|
|
/* split Adler-32 into component sums */
|
|
sum2 = (adler >> 16) & 0xffff;
|
|
adler &= 0xffff;
|
|
diff --git a/third_party/zlib/adler32_simd.c b/third_party/zlib/adler32_simd.c
|
|
new file mode 100644
|
|
index 000000000000..d73f97e52cff
|
|
--- /dev/null
|
|
+++ b/third_party/zlib/adler32_simd.c
|
|
@@ -0,0 +1,366 @@
|
|
+/* adler32_simd.c
|
|
+ *
|
|
+ * Copyright 2017 The Chromium Authors. All rights reserved.
|
|
+ * Use of this source code is governed by a BSD-style license that can be
|
|
+ * found in the Chromium source repository LICENSE file.
|
|
+ *
|
|
+ * Per http://en.wikipedia.org/wiki/Adler-32 the adler32 A value (aka s1) is
|
|
+ * the sum of N input data bytes D1 ... DN,
|
|
+ *
|
|
+ * A = A0 + D1 + D2 + ... + DN
|
|
+ *
|
|
+ * where A0 is the initial value.
|
|
+ *
|
|
+ * SSE2 _mm_sad_epu8() can be used for byte sums (see http://bit.ly/2wpUOeD,
|
|
+ * for example) and accumulating the byte sums can use SSE shuffle-adds (see
|
|
+ * the "Integer" section of http://bit.ly/2erPT8t for details). Arm NEON has
|
|
+ * similar instructions.
|
|
+ *
|
|
+ * The adler32 B value (aka s2) sums the A values from each step:
|
|
+ *
|
|
+ * B0 + (A0 + D1) + (A0 + D1 + D2) + ... + (A0 + D1 + D2 + ... + DN) or
|
|
+ *
|
|
+ * B0 + N.A0 + N.D1 + (N-1).D2 + (N-2).D3 + ... + (N-(N-1)).DN
|
|
+ *
|
|
+ * B0 being the initial value. For 32 bytes (ideal for garden-variety SIMD):
|
|
+ *
|
|
+ * B = B0 + 32.A0 + [D1 D2 D3 ... D32] x [32 31 30 ... 1].
|
|
+ *
|
|
+ * Adjacent blocks of 32 input bytes can be iterated with the expressions to
|
|
+ * compute the adler32 s1 s2 of M >> 32 input bytes [1].
|
|
+ *
|
|
+ * As M grows, the s1 s2 sums grow. If left unchecked, they would eventually
|
|
+ * overflow the precision of their integer representation (bad). However, s1
|
|
+ * and s2 also need to be computed modulo the adler BASE value (reduced). If
|
|
+ * at most NMAX bytes are processed before a reduce, s1 s2 _cannot_ overflow
|
|
+ * a uint32_t type (the NMAX constraint) [2].
|
|
+ *
|
|
+ * [1] the iterative equations for s2 contain constant factors; these can be
|
|
+ * hoisted from the n-blocks do loop of the SIMD code.
|
|
+ *
|
|
+ * [2] zlib adler32_z() uses this fact to implement NMAX-block-based updates
|
|
+ * of the adler s1 s2 of uint32_t type (see adler32.c).
|
|
+ */
|
|
+
|
|
+#include "adler32_simd.h"
|
|
+
|
|
+/* Definitions from adler32.c: largest prime smaller than 65536 */
|
|
+#define BASE 65521U
|
|
+/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
|
|
+#define NMAX 5552
|
|
+
|
|
+#if defined(ADLER32_SIMD_SSSE3)
|
|
+
|
|
+#include <tmmintrin.h>
|
|
+
|
|
+uint32_t ZLIB_INTERNAL adler32_simd_( /* SSSE3 */
|
|
+ uint32_t adler,
|
|
+ const unsigned char *buf,
|
|
+ z_size_t len)
|
|
+{
|
|
+ /*
|
|
+ * Split Adler-32 into component sums.
|
|
+ */
|
|
+ uint32_t s1 = adler & 0xffff;
|
|
+ uint32_t s2 = adler >> 16;
|
|
+
|
|
+ /*
|
|
+ * Process the data in blocks.
|
|
+ */
|
|
+ const unsigned BLOCK_SIZE = 1 << 5;
|
|
+
|
|
+ z_size_t blocks = len / BLOCK_SIZE;
|
|
+ len -= blocks * BLOCK_SIZE;
|
|
+
|
|
+ while (blocks)
|
|
+ {
|
|
+ unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */
|
|
+ if (n > blocks)
|
|
+ n = blocks;
|
|
+ blocks -= n;
|
|
+
|
|
+ const __m128i tap1 =
|
|
+ _mm_setr_epi8(32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17);
|
|
+ const __m128i tap2 =
|
|
+ _mm_setr_epi8(16,15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
|
|
+ const __m128i zero =
|
|
+ _mm_setr_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
|
|
+ const __m128i ones =
|
|
+ _mm_set_epi16( 1, 1, 1, 1, 1, 1, 1, 1);
|
|
+
|
|
+ /*
|
|
+ * Process n blocks of data. At most NMAX data bytes can be
|
|
+ * processed before s2 must be reduced modulo BASE.
|
|
+ */
|
|
+ __m128i v_ps = _mm_set_epi32(0, 0, 0, s1 * n);
|
|
+ __m128i v_s2 = _mm_set_epi32(0, 0, 0, s2);
|
|
+ __m128i v_s1 = _mm_set_epi32(0, 0, 0, 0);
|
|
+
|
|
+ do {
|
|
+ /*
|
|
+ * Load 32 input bytes.
|
|
+ */
|
|
+ const __m128i bytes1 = _mm_loadu_si128((__m128i*)(buf));
|
|
+ const __m128i bytes2 = _mm_loadu_si128((__m128i*)(buf + 16));
|
|
+
|
|
+ /*
|
|
+ * Add previous block byte sum to v_ps.
|
|
+ */
|
|
+ v_ps = _mm_add_epi32(v_ps, v_s1);
|
|
+
|
|
+ /*
|
|
+ * Horizontally add the bytes for s1, multiply-adds the
|
|
+ * bytes by [ 32, 31, 30, ... ] for s2.
|
|
+ */
|
|
+ v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes1, zero));
|
|
+ const __m128i mad1 = _mm_maddubs_epi16(bytes1, tap1);
|
|
+ v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(mad1, ones));
|
|
+
|
|
+ v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes2, zero));
|
|
+ const __m128i mad2 = _mm_maddubs_epi16(bytes2, tap2);
|
|
+ v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(mad2, ones));
|
|
+
|
|
+ buf += BLOCK_SIZE;
|
|
+
|
|
+ } while (--n);
|
|
+
|
|
+ v_s2 = _mm_add_epi32(v_s2, _mm_slli_epi32(v_ps, 5));
|
|
+
|
|
+ /*
|
|
+ * Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
|
|
+ */
|
|
+
|
|
+#define S23O1 _MM_SHUFFLE(2,3,0,1) /* A B C D -> B A D C */
|
|
+#define S1O32 _MM_SHUFFLE(1,0,3,2) /* A B C D -> C D A B */
|
|
+
|
|
+ v_s1 = _mm_add_epi32(v_s1, _mm_shuffle_epi32(v_s1, S23O1));
|
|
+ v_s1 = _mm_add_epi32(v_s1, _mm_shuffle_epi32(v_s1, S1O32));
|
|
+
|
|
+ s1 += _mm_cvtsi128_si32(v_s1);
|
|
+
|
|
+ v_s2 = _mm_add_epi32(v_s2, _mm_shuffle_epi32(v_s2, S23O1));
|
|
+ v_s2 = _mm_add_epi32(v_s2, _mm_shuffle_epi32(v_s2, S1O32));
|
|
+
|
|
+ s2 = _mm_cvtsi128_si32(v_s2);
|
|
+
|
|
+#undef S23O1
|
|
+#undef S1O32
|
|
+
|
|
+ /*
|
|
+ * Reduce.
|
|
+ */
|
|
+ s1 %= BASE;
|
|
+ s2 %= BASE;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * Handle leftover data.
|
|
+ */
|
|
+ if (len) {
|
|
+ if (len >= 16) {
|
|
+ s2 += (s1 += *buf++);
|
|
+ s2 += (s1 += *buf++);
|
|
+ s2 += (s1 += *buf++);
|
|
+ s2 += (s1 += *buf++);
|
|
+
|
|
+ s2 += (s1 += *buf++);
|
|
+ s2 += (s1 += *buf++);
|
|
+ s2 += (s1 += *buf++);
|
|
+ s2 += (s1 += *buf++);
|
|
+
|
|
+ s2 += (s1 += *buf++);
|
|
+ s2 += (s1 += *buf++);
|
|
+ s2 += (s1 += *buf++);
|
|
+ s2 += (s1 += *buf++);
|
|
+
|
|
+ s2 += (s1 += *buf++);
|
|
+ s2 += (s1 += *buf++);
|
|
+ s2 += (s1 += *buf++);
|
|
+ s2 += (s1 += *buf++);
|
|
+
|
|
+ len -= 16;
|
|
+ }
|
|
+
|
|
+ while (len--) {
|
|
+ s2 += (s1 += *buf++);
|
|
+ }
|
|
+
|
|
+ if (s1 >= BASE)
|
|
+ s1 -= BASE;
|
|
+ s2 %= BASE;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * Return the recombined sums.
|
|
+ */
|
|
+ return s1 | (s2 << 16);
|
|
+}
|
|
+
|
|
+#elif defined(ADLER32_SIMD_NEON)
|
|
+
|
|
+#include <arm_neon.h>
|
|
+
|
|
+uint32_t ZLIB_INTERNAL adler32_simd_( /* NEON */
|
|
+ uint32_t adler,
|
|
+ const unsigned char *buf,
|
|
+ z_size_t len)
|
|
+{
|
|
+ /*
|
|
+ * Split Adler-32 into component sums.
|
|
+ */
|
|
+ uint32_t s1 = adler & 0xffff;
|
|
+ uint32_t s2 = adler >> 16;
|
|
+
|
|
+ /*
|
|
+ * Serially compute s1 & s2, until the data is 16-byte aligned.
|
|
+ */
|
|
+ if ((uintptr_t)buf & 15) {
|
|
+ while ((uintptr_t)buf & 15) {
|
|
+ s2 += (s1 += *buf++);
|
|
+ --len;
|
|
+ }
|
|
+
|
|
+ if (s1 >= BASE)
|
|
+ s1 -= BASE;
|
|
+ s2 %= BASE;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * Process the data in blocks.
|
|
+ */
|
|
+ const unsigned BLOCK_SIZE = 1 << 5;
|
|
+
|
|
+ z_size_t blocks = len / BLOCK_SIZE;
|
|
+ len -= blocks * BLOCK_SIZE;
|
|
+
|
|
+ while (blocks)
|
|
+ {
|
|
+ unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */
|
|
+ if (n > blocks)
|
|
+ n = blocks;
|
|
+ blocks -= n;
|
|
+
|
|
+ /*
|
|
+ * Process n blocks of data. At most NMAX data bytes can be
|
|
+ * processed before s2 must be reduced modulo BASE.
|
|
+ */
|
|
+ uint32x4_t v_s2 = (uint32x4_t) { 0, 0, 0, s1 * n };
|
|
+ uint32x4_t v_s1 = (uint32x4_t) { 0, 0, 0, 0 };
|
|
+
|
|
+ uint16x8_t v_column_sum_1 = vdupq_n_u16(0);
|
|
+ uint16x8_t v_column_sum_2 = vdupq_n_u16(0);
|
|
+ uint16x8_t v_column_sum_3 = vdupq_n_u16(0);
|
|
+ uint16x8_t v_column_sum_4 = vdupq_n_u16(0);
|
|
+
|
|
+ do {
|
|
+ /*
|
|
+ * Load 32 input bytes.
|
|
+ */
|
|
+ const uint8x16_t bytes1 = vld1q_u8((uint8_t*)(buf));
|
|
+ const uint8x16_t bytes2 = vld1q_u8((uint8_t*)(buf + 16));
|
|
+
|
|
+ /*
|
|
+ * Add previous block byte sum to v_s2.
|
|
+ */
|
|
+ v_s2 = vaddq_u32(v_s2, v_s1);
|
|
+
|
|
+ /*
|
|
+ * Horizontally add the bytes for s1.
|
|
+ */
|
|
+ v_s1 = vpadalq_u16(v_s1, vpadalq_u8(vpaddlq_u8(bytes1), bytes2));
|
|
+
|
|
+ /*
|
|
+ * Vertically add the bytes for s2.
|
|
+ */
|
|
+ v_column_sum_1 = vaddw_u8(v_column_sum_1, vget_low_u8 (bytes1));
|
|
+ v_column_sum_2 = vaddw_u8(v_column_sum_2, vget_high_u8(bytes1));
|
|
+ v_column_sum_3 = vaddw_u8(v_column_sum_3, vget_low_u8 (bytes2));
|
|
+ v_column_sum_4 = vaddw_u8(v_column_sum_4, vget_high_u8(bytes2));
|
|
+
|
|
+ buf += BLOCK_SIZE;
|
|
+
|
|
+ } while (--n);
|
|
+
|
|
+ v_s2 = vshlq_n_u32(v_s2, 5);
|
|
+
|
|
+ /*
|
|
+ * Multiply-add bytes by [ 32, 31, 30, ... ] for s2.
|
|
+ */
|
|
+ v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_1),
|
|
+ (uint16x4_t) { 32, 31, 30, 29 });
|
|
+ v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_1),
|
|
+ (uint16x4_t) { 28, 27, 26, 25 });
|
|
+ v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_2),
|
|
+ (uint16x4_t) { 24, 23, 22, 21 });
|
|
+ v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_2),
|
|
+ (uint16x4_t) { 20, 19, 18, 17 });
|
|
+ v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_3),
|
|
+ (uint16x4_t) { 16, 15, 14, 13 });
|
|
+ v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_3),
|
|
+ (uint16x4_t) { 12, 11, 10, 9 });
|
|
+ v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_4),
|
|
+ (uint16x4_t) { 8, 7, 6, 5 });
|
|
+ v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_4),
|
|
+ (uint16x4_t) { 4, 3, 2, 1 });
|
|
+
|
|
+ /*
|
|
+ * Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
|
|
+ */
|
|
+ uint32x2_t sum1 = vpadd_u32(vget_low_u32(v_s1), vget_high_u32(v_s1));
|
|
+ uint32x2_t sum2 = vpadd_u32(vget_low_u32(v_s2), vget_high_u32(v_s2));
|
|
+ uint32x2_t s1s2 = vpadd_u32(sum1, sum2);
|
|
+
|
|
+ s1 += vget_lane_u32(s1s2, 0);
|
|
+ s2 += vget_lane_u32(s1s2, 1);
|
|
+
|
|
+ /*
|
|
+ * Reduce.
|
|
+ */
|
|
+ s1 %= BASE;
|
|
+ s2 %= BASE;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * Handle leftover data.
|
|
+ */
|
|
+ if (len) {
|
|
+ if (len >= 16) {
|
|
+ s2 += (s1 += *buf++);
|
|
+ s2 += (s1 += *buf++);
|
|
+ s2 += (s1 += *buf++);
|
|
+ s2 += (s1 += *buf++);
|
|
+
|
|
+ s2 += (s1 += *buf++);
|
|
+ s2 += (s1 += *buf++);
|
|
+ s2 += (s1 += *buf++);
|
|
+ s2 += (s1 += *buf++);
|
|
+
|
|
+ s2 += (s1 += *buf++);
|
|
+ s2 += (s1 += *buf++);
|
|
+ s2 += (s1 += *buf++);
|
|
+ s2 += (s1 += *buf++);
|
|
+
|
|
+ s2 += (s1 += *buf++);
|
|
+ s2 += (s1 += *buf++);
|
|
+ s2 += (s1 += *buf++);
|
|
+ s2 += (s1 += *buf++);
|
|
+
|
|
+ len -= 16;
|
|
+ }
|
|
+
|
|
+ while (len--) {
|
|
+ s2 += (s1 += *buf++);
|
|
+ }
|
|
+
|
|
+ if (s1 >= BASE)
|
|
+ s1 -= BASE;
|
|
+ s2 %= BASE;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * Return the recombined sums.
|
|
+ */
|
|
+ return s1 | (s2 << 16);
|
|
+}
|
|
+
|
|
+#endif /* ADLER32_SIMD_SSSE3 */
|
|
diff --git a/third_party/zlib/adler32_simd.h b/third_party/zlib/adler32_simd.h
|
|
new file mode 100644
|
|
index 000000000000..52bb14d16751
|
|
--- /dev/null
|
|
+++ b/third_party/zlib/adler32_simd.h
|
|
@@ -0,0 +1,16 @@
|
|
+/* adler32_simd.h
|
|
+ *
|
|
+ * Copyright 2017 The Chromium Authors. All rights reserved.
|
|
+ * Use of this source code is governed by a BSD-style license that can be
|
|
+ * found in the Chromium source repository LICENSE file.
|
|
+ */
|
|
+
|
|
+#include <stdint.h>
|
|
+
|
|
+#include "zconf.h"
|
|
+#include "zutil.h"
|
|
+
|
|
+uint32_t ZLIB_INTERNAL adler32_simd_(
|
|
+ uint32_t adler,
|
|
+ const unsigned char *buf,
|
|
+ z_size_t len);
|
|
diff --git a/third_party/zlib/inflate.c b/third_party/zlib/inflate.c
|
|
index 5c40cf1ee0ff..e84be468b8de 100644
|
|
--- a/third_party/zlib/inflate.c
|
|
+++ b/third_party/zlib/inflate.c
|
|
@@ -84,6 +84,7 @@
|
|
#include "inftrees.h"
|
|
#include "inflate.h"
|
|
#include "inffast.h"
|
|
+#include "x86.h"
|
|
|
|
#ifdef MAKEFIXED
|
|
# ifndef BUILDFIXED
|
|
@@ -201,6 +202,8 @@ int stream_size;
|
|
int ret;
|
|
struct inflate_state FAR *state;
|
|
|
|
+ x86_check_features();
|
|
+
|
|
if (version == Z_NULL || version[0] != ZLIB_VERSION[0] ||
|
|
stream_size != (int)(sizeof(z_stream)))
|
|
return Z_VERSION_ERROR;
|
|
diff --git a/third_party/zlib/names.h b/third_party/zlib/names.h
|
|
index 3436baa4eb57..cd98ec9940b6 100644
|
|
--- a/third_party/zlib/names.h
|
|
+++ b/third_party/zlib/names.h
|
|
@@ -162,6 +162,13 @@
|
|
#define fill_window_sse Cr_z_fill_window_sse
|
|
#define read_buf Cr_z_read_buf
|
|
#define x86_check_features Cr_z_x86_check_features
|
|
+/* FIXME: x86_cpu_enable_ssse3 wasn't part of the simd.patch */
|
|
+#define x86_cpu_enable_ssse3 Cr_z_x86_cpu_enable_ssse3
|
|
#define x86_cpu_enable_simd Cr_z_x86_cpu_enable_simd
|
|
|
|
+#if defined(ADLER32_SIMD_SSSE3) || defined(ADLER32_SIMD_NEON)
|
|
+/* Symbols added by adler32_simd.c, see also the FIXME above */
|
|
+#define adler32_simd_ Cr_z_adler32_simd_
|
|
+#endif
|
|
+
|
|
#endif /* THIRD_PARTY_ZLIB_NAMES_H_ */
|
|
diff --git a/third_party/zlib/x86.c b/third_party/zlib/x86.c
|
|
index e56fe8b85a39..7488ad08b976 100644
|
|
--- a/third_party/zlib/x86.c
|
|
+++ b/third_party/zlib/x86.c
|
|
@@ -4,13 +4,14 @@
|
|
* Copyright (C) 2013 Intel Corporation. All rights reserved.
|
|
* Author:
|
|
* Jim Kukunas
|
|
- *
|
|
+ *
|
|
* For conditions of distribution and use, see copyright notice in zlib.h
|
|
*/
|
|
|
|
#include "x86.h"
|
|
#include "zutil.h"
|
|
|
|
+int ZLIB_INTERNAL x86_cpu_enable_ssse3 = 0;
|
|
int ZLIB_INTERNAL x86_cpu_enable_simd = 0;
|
|
|
|
#ifndef _MSC_VER
|
|
@@ -27,6 +28,7 @@ void x86_check_features(void)
|
|
static void _x86_check_features(void)
|
|
{
|
|
int x86_cpu_has_sse2;
|
|
+ int x86_cpu_has_ssse3;
|
|
int x86_cpu_has_sse42;
|
|
int x86_cpu_has_pclmulqdq;
|
|
unsigned eax, ebx, ecx, edx;
|
|
@@ -47,9 +49,12 @@ static void _x86_check_features(void)
|
|
#endif /* (__i386__) */
|
|
|
|
x86_cpu_has_sse2 = edx & 0x4000000;
|
|
+ x86_cpu_has_ssse3 = ecx & 0x000200;
|
|
x86_cpu_has_sse42 = ecx & 0x100000;
|
|
x86_cpu_has_pclmulqdq = ecx & 0x2;
|
|
|
|
+ x86_cpu_enable_ssse3 = x86_cpu_has_ssse3;
|
|
+
|
|
x86_cpu_enable_simd = x86_cpu_has_sse2 &&
|
|
x86_cpu_has_sse42 &&
|
|
x86_cpu_has_pclmulqdq;
|
|
@@ -74,6 +79,7 @@ static BOOL CALLBACK _x86_check_features(PINIT_ONCE once,
|
|
PVOID *context)
|
|
{
|
|
int x86_cpu_has_sse2;
|
|
+ int x86_cpu_has_ssse3;
|
|
int x86_cpu_has_sse42;
|
|
int x86_cpu_has_pclmulqdq;
|
|
int regs[4];
|
|
@@ -81,9 +87,12 @@ static BOOL CALLBACK _x86_check_features(PINIT_ONCE once,
|
|
__cpuid(regs, 1);
|
|
|
|
x86_cpu_has_sse2 = regs[3] & 0x4000000;
|
|
- x86_cpu_has_sse42= regs[2] & 0x100000;
|
|
+ x86_cpu_has_ssse3 = regs[2] & 0x000200;
|
|
+ x86_cpu_has_sse42 = regs[2] & 0x100000;
|
|
x86_cpu_has_pclmulqdq = regs[2] & 0x2;
|
|
|
|
+ x86_cpu_enable_ssse3 = x86_cpu_has_ssse3;
|
|
+
|
|
x86_cpu_enable_simd = x86_cpu_has_sse2 &&
|
|
x86_cpu_has_sse42 &&
|
|
x86_cpu_has_pclmulqdq;
|
|
diff --git a/third_party/zlib/x86.h b/third_party/zlib/x86.h
|
|
index ebcf10ab09d2..7205d50265c3 100644
|
|
--- a/third_party/zlib/x86.h
|
|
+++ b/third_party/zlib/x86.h
|
|
@@ -8,6 +8,7 @@
|
|
|
|
#include "zlib.h"
|
|
|
|
+extern int x86_cpu_enable_ssse3;
|
|
extern int x86_cpu_enable_simd;
|
|
|
|
void x86_check_features(void);
|
|
--
|
|
2.14.0.rc1.383.gd1ce394fe2-goog
|
|
|