From 7b27bbdc07ed76de114bf9a82db854d2f2c8e85c Mon Sep 17 00:00:00 2001
From: Noel Gordon
Date: Fri, 29 Sep 2017 22:32:47 +1000
Subject: [PATCH] zlib adler_simd.c

Add an SSSE3 implementation of the adler32 checksum, suitable for both large workloads and the small workloads commonly seen during PNG image decoding. Add a NEON implementation.

For small inputs, speed is comparable to the serial adler32 computation, but from around 64 bytes of input data the SIMD code paths begin to be faster than the serial path: 3x faster at 256 bytes of input data, up to ~8x faster for 1M of input data (~4x on ARMv8 NEON).

For the PNG 140 image corpus, PNG decoding speed is ~8% faster on average on the desktop machines tested, and ~2% faster on an ARMv8 Pixel C Android (N) tablet; see https://crbug.com/762564#c41

Update x86.{c,h} to detect SSSE3 support at runtime and use it to enable the adler32_simd code path, and update inflate.c to call x86_check_features().

Update the name mangler file names.h for the new symbols added, and add a FIXME about simd.patch.

Ignore data alignment in the SSSE3 case, since unaligned access is no longer penalized on current-generation Intel CPUs. Do align the data in the NEON case, however, to avoid the extra cost of unaligned memory access on ARMv8/v7.

NEON credits: the v_s1/s2 vector component accumulate code was provided by Adenilson Cavalcanti. The uint16 column vector sum code is from libdeflate, with corrections to process NMAX input bytes, which improves performance by 3% for large buffers.

Update BUILD.gn to put the code in its own source set, and add it conditionally to the zlib library build rule. On ARM, build the SIMD code with the max-speed config, which produces the smallest code.

No change in behavior; covered by many existing tests.
Bug:762564 Change-Id: I14a39940ae113b5a67ba70a99c3741e289b1796b --- third_party/zlib/BUILD.gn | 64 ++++++- third_party/zlib/adler32.c | 15 ++ third_party/zlib/adler32_simd.c | 366 ++++++++++++++++++++++++++++++++++++++++ third_party/zlib/adler32_simd.h | 16 ++ third_party/zlib/inflate.c | 3 + third_party/zlib/names.h | 7 + third_party/zlib/x86.c | 13 +- third_party/zlib/x86.h | 1 + 8 files changed, 481 insertions(+), 4 deletions(-) create mode 100644 third_party/zlib/adler32_simd.c create mode 100644 third_party/zlib/adler32_simd.h diff --git a/third_party/zlib/BUILD.gn b/third_party/zlib/BUILD.gn index 027a38cbcac3..11f57effb5ad 100644 --- a/third_party/zlib/BUILD.gn +++ b/third_party/zlib/BUILD.gn @@ -10,6 +10,53 @@ config("zlib_config") { include_dirs = [ "." ] } +config("zlib_adler32_simd_config") { + if (!is_ios && (current_cpu == "x86" || current_cpu == "x64")) { + defines = [ "ADLER32_SIMD_SSSE3" ] + } + + if (current_cpu == "arm" || current_cpu == "arm64") { + if (arm_use_neon) { + defines = [ "ADLER32_SIMD_NEON" ] + } + } +} + +source_set("zlib_adler32_simd") { + visibility = [ ":*" ] + + if (!is_ios && (current_cpu == "x86" || current_cpu == "x64")) { + sources = [ + "adler32_simd.c", + "adler32_simd.h", + ] + + if (!is_win || is_clang) { + cflags = [ "-mssse3" ] + } + } + + if (current_cpu == "arm" || current_cpu == "arm64") { + if (arm_use_neon) { + sources = [ + "adler32_simd.c", + "adler32_simd.h", + ] + + if (!is_debug) { + # Use optimize_speed (-O3) to output the _smallest_ code. 
+ configs -= [ "//build/config/compiler:default_optimization" ] + configs += [ "//build/config/compiler:optimize_speed" ] + } + } + } + + configs -= [ "//build/config/compiler:chromium_code" ] + configs += [ "//build/config/compiler:no_chromium_code" ] + + public_configs = [ ":zlib_adler32_simd_config" ] +} + static_library("zlib_x86_simd") { if (!is_ios && (current_cpu == "x86" || current_cpu == "x64")) { sources = [ @@ -90,8 +137,19 @@ static_library("zlib") { } } + defines = [] + deps = [] + if (!is_ios && (current_cpu == "x86" || current_cpu == "x64")) { sources += [ "x86.c" ] + + deps += [ ":zlib_adler32_simd" ] + } + + if (current_cpu == "arm" || current_cpu == "arm64") { + if (arm_use_neon) { + deps += [ ":zlib_adler32_simd" ] + } } configs -= [ "//build/config/compiler:chromium_code" ] @@ -103,8 +161,9 @@ static_library("zlib") { ] public_configs = [ ":zlib_config" ] - deps = [ - ":zlib_x86_simd", + + deps += [ + ":zlib_x86_simd" ] } @@ -151,5 +210,6 @@ static_library("minizip") { # Must be after no_chromium_code for warning flags to be ordered correctly. 
":minizip_warnings", ] + public_configs = [ ":zlib_config" ] } diff --git a/third_party/zlib/adler32.c b/third_party/zlib/adler32.c index d0be4380a39c..03491b920d3e 100644 --- a/third_party/zlib/adler32.c +++ b/third_party/zlib/adler32.c @@ -59,6 +59,13 @@ local uLong adler32_combine_ OF((uLong adler1, uLong adler2, z_off64_t len2)); # define MOD63(a) a %= BASE #endif +#if defined(ADLER32_SIMD_SSSE3) +#include "adler32_simd.h" +#include "x86.h" +#elif defined(ADLER32_SIMD_NEON) +#include "adler32_simd.h" +#endif + /* ========================================================================= */ uLong ZEXPORT adler32_z(adler, buf, len) uLong adler; @@ -68,6 +75,14 @@ uLong ZEXPORT adler32_z(adler, buf, len) unsigned long sum2; unsigned n; +#if defined(ADLER32_SIMD_SSSE3) + if (x86_cpu_enable_ssse3 && buf && len >= 64) + return adler32_simd_(adler, buf, len); +#elif defined(ADLER32_SIMD_NEON) + if (buf && len >= 64) + return adler32_simd_(adler, buf, len); +#endif + /* split Adler-32 into component sums */ sum2 = (adler >> 16) & 0xffff; adler &= 0xffff; diff --git a/third_party/zlib/adler32_simd.c b/third_party/zlib/adler32_simd.c new file mode 100644 index 000000000000..d73f97e52cff --- /dev/null +++ b/third_party/zlib/adler32_simd.c @@ -0,0 +1,366 @@ +/* adler32_simd.c + * + * Copyright 2017 The Chromium Authors. All rights reserved. + * Use of this source code is governed by a BSD-style license that can be + * found in the Chromium source repository LICENSE file. + * + * Per http://en.wikipedia.org/wiki/Adler-32 the adler32 A value (aka s1) is + * the sum of N input data bytes D1 ... DN, + * + * A = A0 + D1 + D2 + ... + DN + * + * where A0 is the initial value. + * + * SSE2 _mm_sad_epu8() can be used for byte sums (see http://bit.ly/2wpUOeD, + * for example) and accumulating the byte sums can use SSE shuffle-adds (see + * the "Integer" section of http://bit.ly/2erPT8t for details). Arm NEON has + * similar instructions. 
+ * + * The adler32 B value (aka s2) sums the A values from each step: + * + * B0 + (A0 + D1) + (A0 + D1 + D2) + ... + (A0 + D1 + D2 + ... + DN) or + * + * B0 + N.A0 + N.D1 + (N-1).D2 + (N-2).D3 + ... + (N-(N-1)).DN + * + * B0 being the initial value. For 32 bytes (ideal for garden-variety SIMD): + * + * B = B0 + 32.A0 + [D1 D2 D3 ... D32] x [32 31 30 ... 1]. + * + * Adjacent blocks of 32 input bytes can be iterated with the expressions to + * compute the adler32 s1 s2 of M >> 32 input bytes [1]. + * + * As M grows, the s1 s2 sums grow. If left unchecked, they would eventually + * overflow the precision of their integer representation (bad). However, s1 + * and s2 also need to be computed modulo the adler BASE value (reduced). If + * at most NMAX bytes are processed before a reduce, s1 s2 _cannot_ overflow + * a uint32_t type (the NMAX constraint) [2]. + * + * [1] the iterative equations for s2 contain constant factors; these can be + * hoisted from the n-blocks do loop of the SIMD code. + * + * [2] zlib adler32_z() uses this fact to implement NMAX-block-based updates + * of the adler s1 s2 of uint32_t type (see adler32.c). + */ + +#include "adler32_simd.h" + +/* Definitions from adler32.c: largest prime smaller than 65536 */ +#define BASE 65521U +/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */ +#define NMAX 5552 + +#if defined(ADLER32_SIMD_SSSE3) + +#include <tmmintrin.h> + +uint32_t ZLIB_INTERNAL adler32_simd_( /* SSSE3 */ + uint32_t adler, + const unsigned char *buf, + z_size_t len) +{ + /* + * Split Adler-32 into component sums. + */ + uint32_t s1 = adler & 0xffff; + uint32_t s2 = adler >> 16; + + /* + * Process the data in blocks. + */ + const unsigned BLOCK_SIZE = 1 << 5; + + z_size_t blocks = len / BLOCK_SIZE; + len -= blocks * BLOCK_SIZE; + + while (blocks) + { + unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. 
*/ + if (n > blocks) + n = blocks; + blocks -= n; + + const __m128i tap1 = + _mm_setr_epi8(32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17); + const __m128i tap2 = + _mm_setr_epi8(16,15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1); + const __m128i zero = + _mm_setr_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + const __m128i ones = + _mm_set_epi16( 1, 1, 1, 1, 1, 1, 1, 1); + + /* + * Process n blocks of data. At most NMAX data bytes can be + * processed before s2 must be reduced modulo BASE. + */ + __m128i v_ps = _mm_set_epi32(0, 0, 0, s1 * n); + __m128i v_s2 = _mm_set_epi32(0, 0, 0, s2); + __m128i v_s1 = _mm_set_epi32(0, 0, 0, 0); + + do { + /* + * Load 32 input bytes. + */ + const __m128i bytes1 = _mm_loadu_si128((__m128i*)(buf)); + const __m128i bytes2 = _mm_loadu_si128((__m128i*)(buf + 16)); + + /* + * Add previous block byte sum to v_ps. + */ + v_ps = _mm_add_epi32(v_ps, v_s1); + + /* + * Horizontally add the bytes for s1, multiply-adds the + * bytes by [ 32, 31, 30, ... ] for s2. + */ + v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes1, zero)); + const __m128i mad1 = _mm_maddubs_epi16(bytes1, tap1); + v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(mad1, ones)); + + v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes2, zero)); + const __m128i mad2 = _mm_maddubs_epi16(bytes2, tap2); + v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(mad2, ones)); + + buf += BLOCK_SIZE; + + } while (--n); + + v_s2 = _mm_add_epi32(v_s2, _mm_slli_epi32(v_ps, 5)); + + /* + * Sum epi32 ints v_s1(s2) and accumulate in s1(s2). 
+ */ + +#define S23O1 _MM_SHUFFLE(2,3,0,1) /* A B C D -> B A D C */ +#define S1O32 _MM_SHUFFLE(1,0,3,2) /* A B C D -> C D A B */ + + v_s1 = _mm_add_epi32(v_s1, _mm_shuffle_epi32(v_s1, S23O1)); + v_s1 = _mm_add_epi32(v_s1, _mm_shuffle_epi32(v_s1, S1O32)); + + s1 += _mm_cvtsi128_si32(v_s1); + + v_s2 = _mm_add_epi32(v_s2, _mm_shuffle_epi32(v_s2, S23O1)); + v_s2 = _mm_add_epi32(v_s2, _mm_shuffle_epi32(v_s2, S1O32)); + + s2 = _mm_cvtsi128_si32(v_s2); + +#undef S23O1 +#undef S1O32 + + /* + * Reduce. + */ + s1 %= BASE; + s2 %= BASE; + } + + /* + * Handle leftover data. + */ + if (len) { + if (len >= 16) { + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + + len -= 16; + } + + while (len--) { + s2 += (s1 += *buf++); + } + + if (s1 >= BASE) + s1 -= BASE; + s2 %= BASE; + } + + /* + * Return the recombined sums. + */ + return s1 | (s2 << 16); +} + +#elif defined(ADLER32_SIMD_NEON) + +#include <arm_neon.h> + +uint32_t ZLIB_INTERNAL adler32_simd_( /* NEON */ + uint32_t adler, + const unsigned char *buf, + z_size_t len) +{ + /* + * Split Adler-32 into component sums. + */ + uint32_t s1 = adler & 0xffff; + uint32_t s2 = adler >> 16; + + /* + * Serially compute s1 & s2, until the data is 16-byte aligned. + */ + if ((uintptr_t)buf & 15) { + while ((uintptr_t)buf & 15) { + s2 += (s1 += *buf++); + --len; + } + + if (s1 >= BASE) + s1 -= BASE; + s2 %= BASE; + } + + /* + * Process the data in blocks. + */ + const unsigned BLOCK_SIZE = 1 << 5; + + z_size_t blocks = len / BLOCK_SIZE; + len -= blocks * BLOCK_SIZE; + + while (blocks) + { + unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. 
*/ + if (n > blocks) + n = blocks; + blocks -= n; + + /* + * Process n blocks of data. At most NMAX data bytes can be + * processed before s2 must be reduced modulo BASE. + */ + uint32x4_t v_s2 = (uint32x4_t) { 0, 0, 0, s1 * n }; + uint32x4_t v_s1 = (uint32x4_t) { 0, 0, 0, 0 }; + + uint16x8_t v_column_sum_1 = vdupq_n_u16(0); + uint16x8_t v_column_sum_2 = vdupq_n_u16(0); + uint16x8_t v_column_sum_3 = vdupq_n_u16(0); + uint16x8_t v_column_sum_4 = vdupq_n_u16(0); + + do { + /* + * Load 32 input bytes. + */ + const uint8x16_t bytes1 = vld1q_u8((uint8_t*)(buf)); + const uint8x16_t bytes2 = vld1q_u8((uint8_t*)(buf + 16)); + + /* + * Add previous block byte sum to v_s2. + */ + v_s2 = vaddq_u32(v_s2, v_s1); + + /* + * Horizontally add the bytes for s1. + */ + v_s1 = vpadalq_u16(v_s1, vpadalq_u8(vpaddlq_u8(bytes1), bytes2)); + + /* + * Vertically add the bytes for s2. + */ + v_column_sum_1 = vaddw_u8(v_column_sum_1, vget_low_u8 (bytes1)); + v_column_sum_2 = vaddw_u8(v_column_sum_2, vget_high_u8(bytes1)); + v_column_sum_3 = vaddw_u8(v_column_sum_3, vget_low_u8 (bytes2)); + v_column_sum_4 = vaddw_u8(v_column_sum_4, vget_high_u8(bytes2)); + + buf += BLOCK_SIZE; + + } while (--n); + + v_s2 = vshlq_n_u32(v_s2, 5); + + /* + * Multiply-add bytes by [ 32, 31, 30, ... ] for s2. 
+ */ + v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_1), + (uint16x4_t) { 32, 31, 30, 29 }); + v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_1), + (uint16x4_t) { 28, 27, 26, 25 }); + v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_2), + (uint16x4_t) { 24, 23, 22, 21 }); + v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_2), + (uint16x4_t) { 20, 19, 18, 17 }); + v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_3), + (uint16x4_t) { 16, 15, 14, 13 }); + v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_3), + (uint16x4_t) { 12, 11, 10, 9 }); + v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_4), + (uint16x4_t) { 8, 7, 6, 5 }); + v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_4), + (uint16x4_t) { 4, 3, 2, 1 }); + + /* + * Sum epi32 ints v_s1(s2) and accumulate in s1(s2). + */ + uint32x2_t sum1 = vpadd_u32(vget_low_u32(v_s1), vget_high_u32(v_s1)); + uint32x2_t sum2 = vpadd_u32(vget_low_u32(v_s2), vget_high_u32(v_s2)); + uint32x2_t s1s2 = vpadd_u32(sum1, sum2); + + s1 += vget_lane_u32(s1s2, 0); + s2 += vget_lane_u32(s1s2, 1); + + /* + * Reduce. + */ + s1 %= BASE; + s2 %= BASE; + } + + /* + * Handle leftover data. + */ + if (len) { + if (len >= 16) { + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + s2 += (s1 += *buf++); + + len -= 16; + } + + while (len--) { + s2 += (s1 += *buf++); + } + + if (s1 >= BASE) + s1 -= BASE; + s2 %= BASE; + } + + /* + * Return the recombined sums. 
+ */ + return s1 | (s2 << 16); +} + +#endif /* ADLER32_SIMD_SSSE3 */ diff --git a/third_party/zlib/adler32_simd.h b/third_party/zlib/adler32_simd.h new file mode 100644 index 000000000000..52bb14d16751 --- /dev/null +++ b/third_party/zlib/adler32_simd.h @@ -0,0 +1,16 @@ +/* adler32_simd.h + * + * Copyright 2017 The Chromium Authors. All rights reserved. + * Use of this source code is governed by a BSD-style license that can be + * found in the Chromium source repository LICENSE file. + */ + +#include <stdint.h> + +#include "zconf.h" +#include "zutil.h" + +uint32_t ZLIB_INTERNAL adler32_simd_( + uint32_t adler, + const unsigned char *buf, + z_size_t len); diff --git a/third_party/zlib/inflate.c b/third_party/zlib/inflate.c index 5c40cf1ee0ff..e84be468b8de 100644 --- a/third_party/zlib/inflate.c +++ b/third_party/zlib/inflate.c @@ -84,6 +84,7 @@ #include "inftrees.h" #include "inflate.h" #include "inffast.h" +#include "x86.h" #ifdef MAKEFIXED # ifndef BUILDFIXED @@ -201,6 +202,8 @@ int stream_size; int ret; struct inflate_state FAR *state; + x86_check_features(); + if (version == Z_NULL || version[0] != ZLIB_VERSION[0] || stream_size != (int)(sizeof(z_stream))) return Z_VERSION_ERROR; diff --git a/third_party/zlib/names.h b/third_party/zlib/names.h index 3436baa4eb57..cd98ec9940b6 100644 --- a/third_party/zlib/names.h +++ b/third_party/zlib/names.h @@ -162,6 +162,13 @@ #define fill_window_sse Cr_z_fill_window_sse #define read_buf Cr_z_read_buf #define x86_check_features Cr_z_x86_check_features +/* FIXME: x86_cpu_enable_ssse3 wasn't part of the simd.patch */ +#define x86_cpu_enable_ssse3 Cr_z_x86_cpu_enable_ssse3 #define x86_cpu_enable_simd Cr_z_x86_cpu_enable_simd +#if defined(ADLER32_SIMD_SSSE3) || defined(ADLER32_SIMD_NEON) +/* Symbols added by adler_simd.c, see also the FIXME above */ +#define adler32_simd_ Cr_z_adler32_simd_ +#endif + #endif /* THIRD_PARTY_ZLIB_NAMES_H_ */ diff --git a/third_party/zlib/x86.c b/third_party/zlib/x86.c index e56fe8b85a39..7488ad08b976 100644 
--- a/third_party/zlib/x86.c +++ b/third_party/zlib/x86.c @@ -4,13 +4,14 @@ * Copyright (C) 2013 Intel Corporation. All rights reserved. * Author: * Jim Kukunas - * + * * For conditions of distribution and use, see copyright notice in zlib.h */ #include "x86.h" #include "zutil.h" +int ZLIB_INTERNAL x86_cpu_enable_ssse3 = 0; int ZLIB_INTERNAL x86_cpu_enable_simd = 0; #ifndef _MSC_VER @@ -27,6 +28,7 @@ void x86_check_features(void) static void _x86_check_features(void) { int x86_cpu_has_sse2; + int x86_cpu_has_ssse3; int x86_cpu_has_sse42; int x86_cpu_has_pclmulqdq; unsigned eax, ebx, ecx, edx; @@ -47,9 +49,12 @@ static void _x86_check_features(void) #endif /* (__i386__) */ x86_cpu_has_sse2 = edx & 0x4000000; + x86_cpu_has_ssse3 = ecx & 0x000200; x86_cpu_has_sse42 = ecx & 0x100000; x86_cpu_has_pclmulqdq = ecx & 0x2; + x86_cpu_enable_ssse3 = x86_cpu_has_ssse3; + x86_cpu_enable_simd = x86_cpu_has_sse2 && x86_cpu_has_sse42 && x86_cpu_has_pclmulqdq; @@ -74,6 +79,7 @@ static BOOL CALLBACK _x86_check_features(PINIT_ONCE once, PVOID *context) { int x86_cpu_has_sse2; + int x86_cpu_has_ssse3; int x86_cpu_has_sse42; int x86_cpu_has_pclmulqdq; int regs[4]; @@ -81,9 +87,12 @@ static BOOL CALLBACK _x86_check_features(PINIT_ONCE once, __cpuid(regs, 1); x86_cpu_has_sse2 = regs[3] & 0x4000000; - x86_cpu_has_sse42= regs[2] & 0x100000; + x86_cpu_has_ssse3 = regs[2] & 0x000200; + x86_cpu_has_sse42 = regs[2] & 0x100000; x86_cpu_has_pclmulqdq = regs[2] & 0x2; + x86_cpu_enable_ssse3 = x86_cpu_has_ssse3; + x86_cpu_enable_simd = x86_cpu_has_sse2 && x86_cpu_has_sse42 && x86_cpu_has_pclmulqdq; diff --git a/third_party/zlib/x86.h b/third_party/zlib/x86.h index ebcf10ab09d2..7205d50265c3 100644 --- a/third_party/zlib/x86.h +++ b/third_party/zlib/x86.h @@ -8,6 +8,7 @@ #include "zlib.h" +extern int x86_cpu_enable_ssse3; extern int x86_cpu_enable_simd; void x86_check_features(void); -- 2.14.0.rc1.383.gd1ce394fe2-goog