From 4e02d43514db0b384bb7ab19612478e3c884deef Mon Sep 17 00:00:00 2001 From: Kevin Trogant Date: Fri, 19 Jul 2024 10:30:50 +0200 Subject: [PATCH] Add early config file parsing Also cleaned up contrib/ and made lz4 and xxhash subprojects installed via wrap. --- cfg/engine_early.cfg | 11 + contrib/lz4/LICENSE | 24 - contrib/lz4/lz4.c | 2766 ------- contrib/lz4/lz4.h | 862 --- .../simple_vulkan_synchronization/LICENSE.md | 19 - .../simple_vulkan_synchronization/README.md | 153 - .../test/README.md | 26 - .../test/tests.c | 357 - .../thsvs_simpler_vulkan_synchronization.h | 1397 ---- contrib/xxhash/LICENSE | 26 - contrib/xxhash/xxhash.c | 43 - contrib/xxhash/xxhash.h | 6773 ----------------- meson.build | 2 +- src/runtime/compression.c | 2 +- src/runtime/config.c | 291 +- src/runtime/config.h | 17 + src/runtime/fsutils.c | 60 +- src/runtime/fsutils.h | 6 + src/runtime/hashing.c | 7 +- src/runtime/init.c | 15 +- src/runtime/meson.build | 16 +- src/runtime/resource_manager.c | 34 +- src/runtime/string_storage.c | 120 + src/runtime/string_storage.h | 23 + subprojects/inih.wrap | 11 + subprojects/lz4.wrap | 12 + subprojects/xxhash.wrap | 13 + 27 files changed, 592 insertions(+), 12494 deletions(-) create mode 100644 cfg/engine_early.cfg delete mode 100644 contrib/lz4/LICENSE delete mode 100644 contrib/lz4/lz4.c delete mode 100644 contrib/lz4/lz4.h delete mode 100644 contrib/simple_vulkan_synchronization/LICENSE.md delete mode 100644 contrib/simple_vulkan_synchronization/README.md delete mode 100644 contrib/simple_vulkan_synchronization/test/README.md delete mode 100644 contrib/simple_vulkan_synchronization/test/tests.c delete mode 100644 contrib/simple_vulkan_synchronization/thsvs_simpler_vulkan_synchronization.h delete mode 100644 contrib/xxhash/LICENSE delete mode 100644 contrib/xxhash/xxhash.c delete mode 100644 contrib/xxhash/xxhash.h create mode 100644 src/runtime/string_storage.c create mode 100644 src/runtime/string_storage.h create mode 100644 subprojects/inih.wrap create mode 100644 subprojects/lz4.wrap create mode 100644 subprojects/xxhash.wrap diff --git a/cfg/engine_early.cfg b/cfg/engine_early.cfg new file mode 100644 index 0000000..12e27d2 --- /dev/null +++ b/cfg/engine_early.cfg @@ -0,0 +1,11 @@ +; This file contains configuration options that are used during runtime init. +rt_AssertEnabled = 1 +rt_MaxConcurrentAsyncIO = 1024 +rt_BufferMemoryBudget = 1073741824 ; 1GB +rt_FileTabCapacity = 1024 +rt_TemporaryArenaSize = 33554432 ; 32 MB +rt_ResourceDirectory = res +rt_ResourceCacheSize = 536870912 ; 512 MB +rt_MaxCachedResources = 1024 +rt_ResourceNamespaceSize = 1048576 +rt_DisableResourceNamespaceLoad = 0 diff --git a/contrib/lz4/LICENSE b/contrib/lz4/LICENSE deleted file mode 100644 index 4884916..0000000 --- a/contrib/lz4/LICENSE +++ /dev/null @@ -1,24 +0,0 @@ -LZ4 Library -Copyright (c) 2011-2020, Yann Collet -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, -are permitted provided that the following conditions are met: - -* Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - -* Redistributions in binary form must reproduce the above copyright notice, this - list of conditions and the following disclaimer in the documentation and/or - other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/contrib/lz4/lz4.c b/contrib/lz4/lz4.c deleted file mode 100644 index 4653b34..0000000 --- a/contrib/lz4/lz4.c +++ /dev/null @@ -1,2766 +0,0 @@ -/* - LZ4 - Fast LZ compression algorithm - Copyright (C) 2011-2020, Yann Collet. - - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following disclaimer - in the documentation and/or other materials provided with the - distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - You can contact the author at : - - LZ4 homepage : http://www.lz4.org - - LZ4 source repository : https://github.com/lz4/lz4 -*/ - -/*-************************************ -* Tuning parameters -**************************************/ -/* - * LZ4_HEAPMODE : - * Select how stateless compression functions like `LZ4_compress_default()` - * allocate memory for their hash table, - * in memory stack (0:default, fastest), or in memory heap (1:requires malloc()). - */ -#ifndef LZ4_HEAPMODE -# define LZ4_HEAPMODE 0 -#endif - -/* - * LZ4_ACCELERATION_DEFAULT : - * Select "acceleration" for LZ4_compress_fast() when parameter value <= 0 - */ -#define LZ4_ACCELERATION_DEFAULT 1 -/* - * LZ4_ACCELERATION_MAX : - * Any "acceleration" value higher than this threshold - * get treated as LZ4_ACCELERATION_MAX instead (fix #876) - */ -#define LZ4_ACCELERATION_MAX 65537 - - -/*-************************************ -* CPU Feature Detection -**************************************/ -/* LZ4_FORCE_MEMORY_ACCESS - * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable. - * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal. - * The below switch allow to select different access method for improved performance. - * Method 0 (default) : use `memcpy()`. Safe and portable. - * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable). - * This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`. - * Method 2 : direct access. This method is portable but violate C standard. - * It can generate buggy code on targets which assembly generation depends on alignment. - * But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6) - * See https://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details. - * Prefer these methods in priority order (0 > 1 > 2) - */ -#ifndef LZ4_FORCE_MEMORY_ACCESS /* can be defined externally */ -# if defined(__GNUC__) && \ - ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) \ - || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) ) -# define LZ4_FORCE_MEMORY_ACCESS 2 -# elif (defined(__INTEL_COMPILER) && !defined(_WIN32)) || defined(__GNUC__) || defined(_MSC_VER) -# define LZ4_FORCE_MEMORY_ACCESS 1 -# endif -#endif - -/* - * LZ4_FORCE_SW_BITCOUNT - * Define this parameter if your target system or compiler does not support hardware bit count - */ -#if defined(_MSC_VER) && defined(_WIN32_WCE) /* Visual Studio for WinCE doesn't support Hardware bit count */ -# undef LZ4_FORCE_SW_BITCOUNT /* avoid double def */ -# define LZ4_FORCE_SW_BITCOUNT -#endif - - - -/*-************************************ -* Dependency -**************************************/ -/* - * LZ4_SRC_INCLUDED: - * Amalgamation flag, whether lz4.c is included - */ -#ifndef LZ4_SRC_INCLUDED -# define LZ4_SRC_INCLUDED 1 -#endif - -#ifndef LZ4_STATIC_LINKING_ONLY -#define LZ4_STATIC_LINKING_ONLY -#endif - -#ifndef LZ4_DISABLE_DEPRECATE_WARNINGS -#define LZ4_DISABLE_DEPRECATE_WARNINGS /* due to LZ4_decompress_safe_withPrefix64k */ -#endif - -#define LZ4_STATIC_LINKING_ONLY /* LZ4_DISTANCE_MAX */ -#include "lz4.h" -/* see also "memory routines" below */ - - -/*-************************************ -* Compiler Options -**************************************/ -#if defined(_MSC_VER) && (_MSC_VER >= 1400) /* Visual Studio 2005+ */ -# include /* only present in VS2005+ */ -# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ -# pragma warning(disable : 6237) /* disable: C6237: conditional expression is always 0 */ -#endif /* _MSC_VER */ - -#ifndef LZ4_FORCE_INLINE -# ifdef _MSC_VER /* Visual Studio */ -# define LZ4_FORCE_INLINE static __forceinline -# else -# if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ -# ifdef __GNUC__ -# define LZ4_FORCE_INLINE static inline __attribute__((always_inline)) -# else -# define LZ4_FORCE_INLINE static inline -# endif -# else -# define LZ4_FORCE_INLINE static -# endif /* __STDC_VERSION__ */ -# endif /* _MSC_VER */ -#endif /* LZ4_FORCE_INLINE */ - -/* LZ4_FORCE_O2 and LZ4_FORCE_INLINE - * gcc on ppc64le generates an unrolled SIMDized loop for LZ4_wildCopy8, - * together with a simple 8-byte copy loop as a fall-back path. - * However, this optimization hurts the decompression speed by >30%, - * because the execution does not go to the optimized loop - * for typical compressible data, and all of the preamble checks - * before going to the fall-back path become useless overhead. - * This optimization happens only with the -O3 flag, and -O2 generates - * a simple 8-byte copy loop. - * With gcc on ppc64le, all of the LZ4_decompress_* and LZ4_wildCopy8 - * functions are annotated with __attribute__((optimize("O2"))), - * and also LZ4_wildCopy8 is forcibly inlined, so that the O2 attribute - * of LZ4_wildCopy8 does not affect the compression speed. - */ -#if defined(__PPC64__) && defined(__LITTLE_ENDIAN__) && defined(__GNUC__) && !defined(__clang__) -# define LZ4_FORCE_O2 __attribute__((optimize("O2"))) -# undef LZ4_FORCE_INLINE -# define LZ4_FORCE_INLINE static __inline __attribute__((optimize("O2"),always_inline)) -#else -# define LZ4_FORCE_O2 -#endif - -#if (defined(__GNUC__) && (__GNUC__ >= 3)) || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) || defined(__clang__) -# define expect(expr,value) (__builtin_expect ((expr),(value)) ) -#else -# define expect(expr,value) (expr) -#endif - -#ifndef likely -#define likely(expr) expect((expr) != 0, 1) -#endif -#ifndef unlikely -#define unlikely(expr) expect((expr) != 0, 0) -#endif - -/* Should the alignment test prove unreliable, for some reason, - * it can be disabled by setting LZ4_ALIGN_TEST to 0 */ -#ifndef LZ4_ALIGN_TEST /* can be externally provided */ -# define LZ4_ALIGN_TEST 1 -#endif - - -/*-************************************ -* Memory routines -**************************************/ - -/*! LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION : - * Disable relatively high-level LZ4/HC functions that use dynamic memory - * allocation functions (malloc(), calloc(), free()). - * - * Note that this is a compile-time switch. And since it disables - * public/stable LZ4 v1 API functions, we don't recommend using this - * symbol to generate a library for distribution. - * - * The following public functions are removed when this symbol is defined. - * - lz4 : LZ4_createStream, LZ4_freeStream, - * LZ4_createStreamDecode, LZ4_freeStreamDecode, LZ4_create (deprecated) - * - lz4hc : LZ4_createStreamHC, LZ4_freeStreamHC, - * LZ4_createHC (deprecated), LZ4_freeHC (deprecated) - * - lz4frame, lz4file : All LZ4F_* functions - */ -#if defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) -# define ALLOC(s) lz4_error_memory_allocation_is_disabled -# define ALLOC_AND_ZERO(s) lz4_error_memory_allocation_is_disabled -# define FREEMEM(p) lz4_error_memory_allocation_is_disabled -#elif defined(LZ4_USER_MEMORY_FUNCTIONS) -/* memory management functions can be customized by user project. - * Below functions must exist somewhere in the Project - * and be available at link time */ -void* LZ4_malloc(size_t s); -void* LZ4_calloc(size_t n, size_t s); -void LZ4_free(void* p); -# define ALLOC(s) LZ4_malloc(s) -# define ALLOC_AND_ZERO(s) LZ4_calloc(1,s) -# define FREEMEM(p) LZ4_free(p) -#else -# include /* malloc, calloc, free */ -# define ALLOC(s) malloc(s) -# define ALLOC_AND_ZERO(s) calloc(1,s) -# define FREEMEM(p) free(p) -#endif - -#if ! LZ4_FREESTANDING -# include /* memset, memcpy */ -#endif -#if !defined(LZ4_memset) -# define LZ4_memset(p,v,s) memset((p),(v),(s)) -#endif -#define MEM_INIT(p,v,s) LZ4_memset((p),(v),(s)) - - -/*-************************************ -* Common Constants -**************************************/ -#define MINMATCH 4 - -#define WILDCOPYLENGTH 8 -#define LASTLITERALS 5 /* see ../doc/lz4_Block_format.md#parsing-restrictions */ -#define MFLIMIT 12 /* see ../doc/lz4_Block_format.md#parsing-restrictions */ -#define MATCH_SAFEGUARD_DISTANCE ((2*WILDCOPYLENGTH) - MINMATCH) /* ensure it's possible to write 2 x wildcopyLength without overflowing output buffer */ -#define FASTLOOP_SAFE_DISTANCE 64 -static const int LZ4_minLength = (MFLIMIT+1); - -#define KB *(1 <<10) -#define MB *(1 <<20) -#define GB *(1U<<30) - -#define LZ4_DISTANCE_ABSOLUTE_MAX 65535 -#if (LZ4_DISTANCE_MAX > LZ4_DISTANCE_ABSOLUTE_MAX) /* max supported by LZ4 format */ -# error "LZ4_DISTANCE_MAX is too big : must be <= 65535" -#endif - -#define ML_BITS 4 -#define ML_MASK ((1U<=1) -# include -#else -# ifndef assert -# define assert(condition) ((void)0) -# endif -#endif - -#define LZ4_STATIC_ASSERT(c) { enum { LZ4_static_assert = 1/(int)(!!(c)) }; } /* use after variable declarations */ - -#if defined(LZ4_DEBUG) && (LZ4_DEBUG>=2) -# include - static int g_debuglog_enable = 1; -# define DEBUGLOG(l, ...) { \ - if ((g_debuglog_enable) && (l<=LZ4_DEBUG)) { \ - fprintf(stderr, __FILE__ " %i: ", __LINE__); \ - fprintf(stderr, __VA_ARGS__); \ - fprintf(stderr, " \n"); \ - } } -#else -# define DEBUGLOG(l, ...) {} /* disabled */ -#endif - -static int LZ4_isAligned(const void* ptr, size_t alignment) -{ - return ((size_t)ptr & (alignment -1)) == 0; -} - - -/*-************************************ -* Types -**************************************/ -#include -#if defined(__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) -# include - typedef uint8_t BYTE; - typedef uint16_t U16; - typedef uint32_t U32; - typedef int32_t S32; - typedef uint64_t U64; - typedef uintptr_t uptrval; -#else -# if UINT_MAX != 4294967295UL -# error "LZ4 code (when not C++ or C99) assumes that sizeof(int) == 4" -# endif - typedef unsigned char BYTE; - typedef unsigned short U16; - typedef unsigned int U32; - typedef signed int S32; - typedef unsigned long long U64; - typedef size_t uptrval; /* generally true, except OpenVMS-64 */ -#endif - -#if defined(__x86_64__) - typedef U64 reg_t; /* 64-bits in x32 mode */ -#else - typedef size_t reg_t; /* 32-bits in x32 mode */ -#endif - -typedef enum { - notLimited = 0, - limitedOutput = 1, - fillOutput = 2 -} limitedOutput_directive; - - -/*-************************************ -* Reading and writing into memory -**************************************/ - -/** - * LZ4 relies on memcpy with a constant size being inlined. In freestanding - * environments, the compiler can't assume the implementation of memcpy() is - * standard compliant, so it can't apply its specialized memcpy() inlining - * logic. When possible, use __builtin_memcpy() to tell the compiler to analyze - * memcpy() as if it were standard compliant, so it can inline it in freestanding - * environments. This is needed when decompressing the Linux Kernel, for example. - */ -#if !defined(LZ4_memcpy) -# if defined(__GNUC__) && (__GNUC__ >= 4) -# define LZ4_memcpy(dst, src, size) __builtin_memcpy(dst, src, size) -# else -# define LZ4_memcpy(dst, src, size) memcpy(dst, src, size) -# endif -#endif - -#if !defined(LZ4_memmove) -# if defined(__GNUC__) && (__GNUC__ >= 4) -# define LZ4_memmove __builtin_memmove -# else -# define LZ4_memmove memmove -# endif -#endif - -static unsigned LZ4_isLittleEndian(void) -{ - const union { U32 u; BYTE c[4]; } one = { 1 }; /* don't use static : performance detrimental */ - return one.c[0]; -} - -#if defined(__GNUC__) || defined(__INTEL_COMPILER) -#define LZ4_PACK( __Declaration__ ) __Declaration__ __attribute__((__packed__)) -#elif defined(_MSC_VER) -#define LZ4_PACK( __Declaration__ ) __pragma( pack(push, 1) ) __Declaration__ __pragma( pack(pop)) -#endif - -#if defined(LZ4_FORCE_MEMORY_ACCESS) && (LZ4_FORCE_MEMORY_ACCESS==2) -/* lie to the compiler about data alignment; use with caution */ - -static U16 LZ4_read16(const void* memPtr) { return *(const U16*) memPtr; } -static U32 LZ4_read32(const void* memPtr) { return *(const U32*) memPtr; } -static reg_t LZ4_read_ARCH(const void* memPtr) { return *(const reg_t*) memPtr; } - -static void LZ4_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; } -static void LZ4_write32(void* memPtr, U32 value) { *(U32*)memPtr = value; } - -#elif defined(LZ4_FORCE_MEMORY_ACCESS) && (LZ4_FORCE_MEMORY_ACCESS==1) - -/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ -/* currently only defined for gcc and icc */ -LZ4_PACK(typedef struct { U16 u16; }) LZ4_unalign16; -LZ4_PACK(typedef struct { U32 u32; }) LZ4_unalign32; -LZ4_PACK(typedef struct { reg_t uArch; }) LZ4_unalignST; - -static U16 LZ4_read16(const void* ptr) { return ((const LZ4_unalign16*)ptr)->u16; } -static U32 LZ4_read32(const void* ptr) { return ((const LZ4_unalign32*)ptr)->u32; } -static reg_t LZ4_read_ARCH(const void* ptr) { return ((const LZ4_unalignST*)ptr)->uArch; } - -static void LZ4_write16(void* memPtr, U16 value) { ((LZ4_unalign16*)memPtr)->u16 = value; } -static void LZ4_write32(void* memPtr, U32 value) { ((LZ4_unalign32*)memPtr)->u32 = value; } - -#else /* safe and portable access using memcpy() */ - -static U16 LZ4_read16(const void* memPtr) -{ - U16 val; LZ4_memcpy(&val, memPtr, sizeof(val)); return val; -} - -static U32 LZ4_read32(const void* memPtr) -{ - U32 val; LZ4_memcpy(&val, memPtr, sizeof(val)); return val; -} - -static reg_t LZ4_read_ARCH(const void* memPtr) -{ - reg_t val; LZ4_memcpy(&val, memPtr, sizeof(val)); return val; -} - -static void LZ4_write16(void* memPtr, U16 value) -{ - LZ4_memcpy(memPtr, &value, sizeof(value)); -} - -static void LZ4_write32(void* memPtr, U32 value) -{ - LZ4_memcpy(memPtr, &value, sizeof(value)); -} - -#endif /* LZ4_FORCE_MEMORY_ACCESS */ - - -static U16 LZ4_readLE16(const void* memPtr) -{ - if (LZ4_isLittleEndian()) { - return LZ4_read16(memPtr); - } else { - const BYTE* p = (const BYTE*)memPtr; - return (U16)((U16)p[0] + (p[1]<<8)); - } -} - -static void LZ4_writeLE16(void* memPtr, U16 value) -{ - if (LZ4_isLittleEndian()) { - LZ4_write16(memPtr, value); - } else { - BYTE* p = (BYTE*)memPtr; - p[0] = (BYTE) value; - p[1] = (BYTE)(value>>8); - } -} - -/* customized variant of memcpy, which can overwrite up to 8 bytes beyond dstEnd */ -LZ4_FORCE_INLINE -void LZ4_wildCopy8(void* dstPtr, const void* srcPtr, void* dstEnd) -{ - BYTE* d = (BYTE*)dstPtr; - const BYTE* s = (const BYTE*)srcPtr; - BYTE* const e = (BYTE*)dstEnd; - - do { LZ4_memcpy(d,s,8); d+=8; s+=8; } while (d= 16. */ -LZ4_FORCE_INLINE void -LZ4_wildCopy32(void* dstPtr, const void* srcPtr, void* dstEnd) -{ - BYTE* d = (BYTE*)dstPtr; - const BYTE* s = (const BYTE*)srcPtr; - BYTE* const e = (BYTE*)dstEnd; - - do { LZ4_memcpy(d,s,16); LZ4_memcpy(d+16,s+16,16); d+=32; s+=32; } while (d= dstPtr + MINMATCH - * - there is at least 8 bytes available to write after dstEnd */ -LZ4_FORCE_INLINE void -LZ4_memcpy_using_offset(BYTE* dstPtr, const BYTE* srcPtr, BYTE* dstEnd, const size_t offset) -{ - BYTE v[8]; - - assert(dstEnd >= dstPtr + MINMATCH); - - switch(offset) { - case 1: - MEM_INIT(v, *srcPtr, 8); - break; - case 2: - LZ4_memcpy(v, srcPtr, 2); - LZ4_memcpy(&v[2], srcPtr, 2); -#if defined(_MSC_VER) && (_MSC_VER <= 1937) /* MSVC 2022 ver 17.7 or earlier */ -# pragma warning(push) -# pragma warning(disable : 6385) /* warning C6385: Reading invalid data from 'v'. */ -#endif - LZ4_memcpy(&v[4], v, 4); -#if defined(_MSC_VER) && (_MSC_VER <= 1937) /* MSVC 2022 ver 17.7 or earlier */ -# pragma warning(pop) -#endif - break; - case 4: - LZ4_memcpy(v, srcPtr, 4); - LZ4_memcpy(&v[4], srcPtr, 4); - break; - default: - LZ4_memcpy_using_offset_base(dstPtr, srcPtr, dstEnd, offset); - return; - } - - LZ4_memcpy(dstPtr, v, 8); - dstPtr += 8; - while (dstPtr < dstEnd) { - LZ4_memcpy(dstPtr, v, 8); - dstPtr += 8; - } -} -#endif - - -/*-************************************ -* Common functions -**************************************/ -static unsigned LZ4_NbCommonBytes (reg_t val) -{ - assert(val != 0); - if (LZ4_isLittleEndian()) { - if (sizeof(val) == 8) { -# if defined(_MSC_VER) && (_MSC_VER >= 1800) && (defined(_M_AMD64) && !defined(_M_ARM64EC)) && !defined(LZ4_FORCE_SW_BITCOUNT) -/*-************************************************************************************************* -* ARM64EC is a Microsoft-designed ARM64 ABI compatible with AMD64 applications on ARM64 Windows 11. -* The ARM64EC ABI does not support AVX/AVX2/AVX512 instructions, nor their relevant intrinsics -* including _tzcnt_u64. Therefore, we need to neuter the _tzcnt_u64 code path for ARM64EC. -****************************************************************************************************/ -# if defined(__clang__) && (__clang_major__ < 10) - /* Avoid undefined clang-cl intrinsics issue. - * See https://github.com/lz4/lz4/pull/1017 for details. */ - return (unsigned)__builtin_ia32_tzcnt_u64(val) >> 3; -# else - /* x64 CPUS without BMI support interpret `TZCNT` as `REP BSF` */ - return (unsigned)_tzcnt_u64(val) >> 3; -# endif -# elif defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT) - unsigned long r = 0; - _BitScanForward64(&r, (U64)val); - return (unsigned)r >> 3; -# elif (defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 3) || \ - ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))) && \ - !defined(LZ4_FORCE_SW_BITCOUNT) - return (unsigned)__builtin_ctzll((U64)val) >> 3; -# else - const U64 m = 0x0101010101010101ULL; - val ^= val - 1; - return (unsigned)(((U64)((val & (m - 1)) * m)) >> 56); -# endif - } else /* 32 bits */ { -# if defined(_MSC_VER) && (_MSC_VER >= 1400) && !defined(LZ4_FORCE_SW_BITCOUNT) - unsigned long r; - _BitScanForward(&r, (U32)val); - return (unsigned)r >> 3; -# elif (defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 3) || \ - ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))) && \ - !defined(__TINYC__) && !defined(LZ4_FORCE_SW_BITCOUNT) - return (unsigned)__builtin_ctz((U32)val) >> 3; -# else - const U32 m = 0x01010101; - return (unsigned)((((val - 1) ^ val) & (m - 1)) * m) >> 24; -# endif - } - } else /* Big Endian CPU */ { - if (sizeof(val)==8) { -# if (defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 3) || \ - ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))) && \ - !defined(__TINYC__) && !defined(LZ4_FORCE_SW_BITCOUNT) - return (unsigned)__builtin_clzll((U64)val) >> 3; -# else -#if 1 - /* this method is probably faster, - * but adds a 128 bytes lookup table */ - static const unsigned char ctz7_tab[128] = { - 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, - 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, - 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, - 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, - 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, - 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, - 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, - 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, - }; - U64 const mask = 0x0101010101010101ULL; - U64 const t = (((val >> 8) - mask) | val) & mask; - return ctz7_tab[(t * 0x0080402010080402ULL) >> 57]; -#else - /* this method doesn't consume memory space like the previous one, - * but it contains several branches, - * that may end up slowing execution */ - static const U32 by32 = sizeof(val)*4; /* 32 on 64 bits (goal), 16 on 32 bits. - Just to avoid some static analyzer complaining about shift by 32 on 32-bits target. - Note that this code path is never triggered in 32-bits mode. */ - unsigned r; - if (!(val>>by32)) { r=4; } else { r=0; val>>=by32; } - if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } - r += (!val); - return r; -#endif -# endif - } else /* 32 bits */ { -# if (defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 3) || \ - ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))) && \ - !defined(LZ4_FORCE_SW_BITCOUNT) - return (unsigned)__builtin_clz((U32)val) >> 3; -# else - val >>= 8; - val = ((((val + 0x00FFFF00) | 0x00FFFFFF) + val) | - (val + 0x00FF0000)) >> 24; - return (unsigned)val ^ 3; -# endif - } - } -} - - -#define STEPSIZE sizeof(reg_t) -LZ4_FORCE_INLINE -unsigned LZ4_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* pInLimit) -{ - const BYTE* const pStart = pIn; - - if (likely(pIn < pInLimit-(STEPSIZE-1))) { - reg_t const diff = LZ4_read_ARCH(pMatch) ^ LZ4_read_ARCH(pIn); - if (!diff) { - pIn+=STEPSIZE; pMatch+=STEPSIZE; - } else { - return LZ4_NbCommonBytes(diff); - } } - - while (likely(pIn < pInLimit-(STEPSIZE-1))) { - reg_t const diff = LZ4_read_ARCH(pMatch) ^ LZ4_read_ARCH(pIn); - if (!diff) { pIn+=STEPSIZE; pMatch+=STEPSIZE; continue; } - pIn += LZ4_NbCommonBytes(diff); - return (unsigned)(pIn - pStart); - } - - if ((STEPSIZE==8) && (pIn<(pInLimit-3)) && (LZ4_read32(pMatch) == LZ4_read32(pIn))) { pIn+=4; pMatch+=4; } - if ((pIn<(pInLimit-1)) && (LZ4_read16(pMatch) == LZ4_read16(pIn))) { pIn+=2; pMatch+=2; } - if ((pIn compression run slower on incompressible data */ - - -/*-************************************ -* Local Structures and types -**************************************/ -typedef enum { clearedTable = 0, byPtr, byU32, byU16 } tableType_t; - -/** - * This enum distinguishes several different modes of accessing previous - * content in the stream. - * - * - noDict : There is no preceding content. - * - withPrefix64k : Table entries up to ctx->dictSize before the current blob - * blob being compressed are valid and refer to the preceding - * content (of length ctx->dictSize), which is available - * contiguously preceding in memory the content currently - * being compressed. - * - usingExtDict : Like withPrefix64k, but the preceding content is somewhere - * else in memory, starting at ctx->dictionary with length - * ctx->dictSize. - * - usingDictCtx : Everything concerning the preceding content is - * in a separate context, pointed to by ctx->dictCtx. - * ctx->dictionary, ctx->dictSize, and table entries - * in the current context that refer to positions - * preceding the beginning of the current compression are - * ignored. Instead, ctx->dictCtx->dictionary and ctx->dictCtx - * ->dictSize describe the location and size of the preceding - * content, and matches are found by looking in the ctx - * ->dictCtx->hashTable. - */ -typedef enum { noDict = 0, withPrefix64k, usingExtDict, usingDictCtx } dict_directive; -typedef enum { noDictIssue = 0, dictSmall } dictIssue_directive; - - -/*-************************************ -* Local Utils -**************************************/ -int LZ4_versionNumber (void) { return LZ4_VERSION_NUMBER; } -const char* LZ4_versionString(void) { return LZ4_VERSION_STRING; } -int LZ4_compressBound(int isize) { return LZ4_COMPRESSBOUND(isize); } -int LZ4_sizeofState(void) { return sizeof(LZ4_stream_t); } - - -/*-**************************************** -* Internal Definitions, used only in Tests -*******************************************/ -#if defined (__cplusplus) -extern "C" { -#endif - -int LZ4_compress_forceExtDict (LZ4_stream_t* LZ4_dict, const char* source, char* dest, int srcSize); - -int LZ4_decompress_safe_forceExtDict(const char* source, char* dest, - int compressedSize, int maxOutputSize, - const void* dictStart, size_t dictSize); -int LZ4_decompress_safe_partial_forceExtDict(const char* source, char* dest, - int compressedSize, int targetOutputSize, int dstCapacity, - const void* dictStart, size_t dictSize); -#if defined (__cplusplus) -} -#endif - -/*-****************************** -* Compression functions -********************************/ -LZ4_FORCE_INLINE U32 LZ4_hash4(U32 sequence, tableType_t const tableType) -{ - if (tableType == byU16) - return ((sequence * 2654435761U) >> ((MINMATCH*8)-(LZ4_HASHLOG+1))); - else - return ((sequence * 2654435761U) >> ((MINMATCH*8)-LZ4_HASHLOG)); -} - -LZ4_FORCE_INLINE U32 LZ4_hash5(U64 sequence, tableType_t const tableType) -{ - const U32 hashLog = (tableType == byU16) ? LZ4_HASHLOG+1 : LZ4_HASHLOG; - if (LZ4_isLittleEndian()) { - const U64 prime5bytes = 889523592379ULL; - return (U32)(((sequence << 24) * prime5bytes) >> (64 - hashLog)); - } else { - const U64 prime8bytes = 11400714785074694791ULL; - return (U32)(((sequence >> 24) * prime8bytes) >> (64 - hashLog)); - } -} - -LZ4_FORCE_INLINE U32 LZ4_hashPosition(const void* const p, tableType_t const tableType) -{ - if ((sizeof(reg_t)==8) && (tableType != byU16)) return LZ4_hash5(LZ4_read_ARCH(p), tableType); - return LZ4_hash4(LZ4_read32(p), tableType); -} - -LZ4_FORCE_INLINE void LZ4_clearHash(U32 h, void* tableBase, tableType_t const tableType) -{ - switch (tableType) - { - default: /* fallthrough */ - case clearedTable: { /* illegal! */ assert(0); return; } - case byPtr: { const BYTE** hashTable = (const BYTE**)tableBase; hashTable[h] = NULL; return; } - case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = 0; return; } - case byU16: { U16* hashTable = (U16*) tableBase; hashTable[h] = 0; return; } - } -} - -LZ4_FORCE_INLINE void LZ4_putIndexOnHash(U32 idx, U32 h, void* tableBase, tableType_t const tableType) -{ - switch (tableType) - { - default: /* fallthrough */ - case clearedTable: /* fallthrough */ - case byPtr: { /* illegal! */ assert(0); return; } - case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = idx; return; } - case byU16: { U16* hashTable = (U16*) tableBase; assert(idx < 65536); hashTable[h] = (U16)idx; return; } - } -} - -/* LZ4_putPosition*() : only used in byPtr mode */ -LZ4_FORCE_INLINE void LZ4_putPositionOnHash(const BYTE* p, U32 h, - void* tableBase, tableType_t const tableType) -{ - const BYTE** const hashTable = (const BYTE**)tableBase; - assert(tableType == byPtr); (void)tableType; - hashTable[h] = p; -} - -LZ4_FORCE_INLINE void LZ4_putPosition(const BYTE* p, void* tableBase, tableType_t tableType) -{ - U32 const h = LZ4_hashPosition(p, tableType); - LZ4_putPositionOnHash(p, h, tableBase, tableType); -} - -/* LZ4_getIndexOnHash() : - * Index of match position registered in hash table. - * hash position must be calculated by using base+index, or dictBase+index. - * Assumption 1 : only valid if tableType == byU32 or byU16. - * Assumption 2 : h is presumed valid (within limits of hash table) - */ -LZ4_FORCE_INLINE U32 LZ4_getIndexOnHash(U32 h, const void* tableBase, tableType_t tableType) -{ - LZ4_STATIC_ASSERT(LZ4_MEMORY_USAGE > 2); - if (tableType == byU32) { - const U32* const hashTable = (const U32*) tableBase; - assert(h < (1U << (LZ4_MEMORY_USAGE-2))); - return hashTable[h]; - } - if (tableType == byU16) { - const U16* const hashTable = (const U16*) tableBase; - assert(h < (1U << (LZ4_MEMORY_USAGE-1))); - return hashTable[h]; - } - assert(0); return 0; /* forbidden case */ -} - -static const BYTE* LZ4_getPositionOnHash(U32 h, const void* tableBase, tableType_t tableType) -{ - assert(tableType == byPtr); (void)tableType; - { const BYTE* const* hashTable = (const BYTE* const*) tableBase; return hashTable[h]; } -} - -LZ4_FORCE_INLINE const BYTE* -LZ4_getPosition(const BYTE* p, - const void* tableBase, tableType_t tableType) -{ - U32 const h = LZ4_hashPosition(p, tableType); - return LZ4_getPositionOnHash(h, tableBase, tableType); -} - -LZ4_FORCE_INLINE void -LZ4_prepareTable(LZ4_stream_t_internal* const cctx, - const int inputSize, - const tableType_t tableType) { - /* If the table hasn't been used, it's guaranteed to be zeroed out, and is - * therefore safe to use no matter what mode we're in. Otherwise, we figure - * out if it's safe to leave as is or whether it needs to be reset. - */ - if ((tableType_t)cctx->tableType != clearedTable) { - assert(inputSize >= 0); - if ((tableType_t)cctx->tableType != tableType - || ((tableType == byU16) && cctx->currentOffset + (unsigned)inputSize >= 0xFFFFU) - || ((tableType == byU32) && cctx->currentOffset > 1 GB) - || tableType == byPtr - || inputSize >= 4 KB) - { - DEBUGLOG(4, "LZ4_prepareTable: Resetting table in %p", cctx); - MEM_INIT(cctx->hashTable, 0, LZ4_HASHTABLESIZE); - cctx->currentOffset = 0; - cctx->tableType = (U32)clearedTable; - } else { - DEBUGLOG(4, "LZ4_prepareTable: Re-use hash table (no reset)"); - } - } - - /* Adding a gap, so all previous entries are > LZ4_DISTANCE_MAX back, - * is faster than compressing without a gap. - * However, compressing with currentOffset == 0 is faster still, - * so we preserve that case. - */ - if (cctx->currentOffset != 0 && tableType == byU32) { - DEBUGLOG(5, "LZ4_prepareTable: adding 64KB to currentOffset"); - cctx->currentOffset += 64 KB; - } - - /* Finally, clear history */ - cctx->dictCtx = NULL; - cctx->dictionary = NULL; - cctx->dictSize = 0; -} - -/** LZ4_compress_generic_validated() : - * inlined, to ensure branches are decided at compilation time. - * The following conditions are presumed already validated: - * - source != NULL - * - inputSize > 0 - */ -LZ4_FORCE_INLINE int LZ4_compress_generic_validated( - LZ4_stream_t_internal* const cctx, - const char* const source, - char* const dest, - const int inputSize, - int* inputConsumed, /* only written when outputDirective == fillOutput */ - const int maxOutputSize, - const limitedOutput_directive outputDirective, - const tableType_t tableType, - const dict_directive dictDirective, - const dictIssue_directive dictIssue, - const int acceleration) -{ - int result; - const BYTE* ip = (const BYTE*)source; - - U32 const startIndex = cctx->currentOffset; - const BYTE* base = (const BYTE*)source - startIndex; - const BYTE* lowLimit; - - const LZ4_stream_t_internal* dictCtx = (const LZ4_stream_t_internal*) cctx->dictCtx; - const BYTE* const dictionary = - dictDirective == usingDictCtx ? dictCtx->dictionary : cctx->dictionary; - const U32 dictSize = - dictDirective == usingDictCtx ? dictCtx->dictSize : cctx->dictSize; - const U32 dictDelta = - (dictDirective == usingDictCtx) ? startIndex - dictCtx->currentOffset : 0; /* make indexes in dictCtx comparable with indexes in current context */ - - int const maybe_extMem = (dictDirective == usingExtDict) || (dictDirective == usingDictCtx); - U32 const prefixIdxLimit = startIndex - dictSize; /* used when dictDirective == dictSmall */ - const BYTE* const dictEnd = dictionary ? dictionary + dictSize : dictionary; - const BYTE* anchor = (const BYTE*) source; - const BYTE* const iend = ip + inputSize; - const BYTE* const mflimitPlusOne = iend - MFLIMIT + 1; - const BYTE* const matchlimit = iend - LASTLITERALS; - - /* the dictCtx currentOffset is indexed on the start of the dictionary, - * while a dictionary in the current context precedes the currentOffset */ - const BYTE* dictBase = (dictionary == NULL) ? NULL : - (dictDirective == usingDictCtx) ? - dictionary + dictSize - dictCtx->currentOffset : - dictionary + dictSize - startIndex; - - BYTE* op = (BYTE*) dest; - BYTE* const olimit = op + maxOutputSize; - - U32 offset = 0; - U32 forwardH; - - DEBUGLOG(5, "LZ4_compress_generic_validated: srcSize=%i, tableType=%u", inputSize, tableType); - assert(ip != NULL); - if (tableType == byU16) assert(inputSize= 1); - - lowLimit = (const BYTE*)source - (dictDirective == withPrefix64k ? dictSize : 0); - - /* Update context state */ - if (dictDirective == usingDictCtx) { - /* Subsequent linked blocks can't use the dictionary. */ - /* Instead, they use the block we just compressed. */ - cctx->dictCtx = NULL; - cctx->dictSize = (U32)inputSize; - } else { - cctx->dictSize += (U32)inputSize; - } - cctx->currentOffset += (U32)inputSize; - cctx->tableType = (U32)tableType; - - if (inputSizehashTable, byPtr); - } else { - LZ4_putIndexOnHash(startIndex, h, cctx->hashTable, tableType); - } } - ip++; forwardH = LZ4_hashPosition(ip, tableType); - - /* Main Loop */ - for ( ; ; ) { - const BYTE* match; - BYTE* token; - const BYTE* filledIp; - - /* Find a match */ - if (tableType == byPtr) { - const BYTE* forwardIp = ip; - int step = 1; - int searchMatchNb = acceleration << LZ4_skipTrigger; - do { - U32 const h = forwardH; - ip = forwardIp; - forwardIp += step; - step = (searchMatchNb++ >> LZ4_skipTrigger); - - if (unlikely(forwardIp > mflimitPlusOne)) goto _last_literals; - assert(ip < mflimitPlusOne); - - match = LZ4_getPositionOnHash(h, cctx->hashTable, tableType); - forwardH = LZ4_hashPosition(forwardIp, tableType); - LZ4_putPositionOnHash(ip, h, cctx->hashTable, tableType); - - } while ( (match+LZ4_DISTANCE_MAX < ip) - || (LZ4_read32(match) != LZ4_read32(ip)) ); - - } else { /* byU32, byU16 */ - - const BYTE* forwardIp = ip; - int step = 1; - int searchMatchNb = acceleration << LZ4_skipTrigger; - do { - U32 const h = forwardH; - U32 const current = (U32)(forwardIp - base); - U32 matchIndex = LZ4_getIndexOnHash(h, cctx->hashTable, tableType); - assert(matchIndex <= current); - assert(forwardIp - base < (ptrdiff_t)(2 GB - 1)); - ip = forwardIp; - forwardIp += step; - step = (searchMatchNb++ >> LZ4_skipTrigger); - - if (unlikely(forwardIp > mflimitPlusOne)) goto _last_literals; - assert(ip < mflimitPlusOne); - - if (dictDirective == usingDictCtx) { - if (matchIndex < startIndex) { - /* there was no match, try the dictionary */ - assert(tableType == byU32); - matchIndex = LZ4_getIndexOnHash(h, dictCtx->hashTable, byU32); - match = dictBase + matchIndex; - matchIndex += dictDelta; /* make dictCtx index comparable with current context */ - lowLimit = dictionary; - } else { - match = base + matchIndex; - lowLimit = (const BYTE*)source; - } - } else if (dictDirective == usingExtDict) { - if (matchIndex < startIndex) { - DEBUGLOG(7, "extDict candidate: matchIndex=%5u < startIndex=%5u", matchIndex, startIndex); - assert(startIndex - matchIndex >= MINMATCH); - assert(dictBase); - match = dictBase + matchIndex; - lowLimit = dictionary; - } else { - match = base + matchIndex; - lowLimit = (const BYTE*)source; - } - } else { /* single continuous memory segment */ - match = base + matchIndex; - } - forwardH = LZ4_hashPosition(forwardIp, tableType); - LZ4_putIndexOnHash(current, h, cctx->hashTable, tableType); - - DEBUGLOG(7, "candidate at pos=%u (offset=%u \n", matchIndex, current - matchIndex); - if ((dictIssue == dictSmall) && (matchIndex < prefixIdxLimit)) { continue; } /* match outside of valid area */ - assert(matchIndex < current); - if ( ((tableType != byU16) || (LZ4_DISTANCE_MAX < LZ4_DISTANCE_ABSOLUTE_MAX)) - && (matchIndex+LZ4_DISTANCE_MAX < current)) { - continue; - } /* too far */ - assert((current - matchIndex) <= LZ4_DISTANCE_MAX); /* match now expected within distance */ - - if (LZ4_read32(match) == LZ4_read32(ip)) { - if (maybe_extMem) offset = current - matchIndex; - break; /* match found */ - } - - } while(1); - } - - /* Catch up */ - filledIp = ip; - assert(ip > anchor); /* this is always true as ip has been advanced before entering the main loop */ - if ((match > lowLimit) && unlikely(ip[-1] == match[-1])) { - do { ip--; match--; } while (((ip > anchor) & (match > lowLimit)) && (unlikely(ip[-1] == match[-1]))); - } - - /* Encode Literals */ - { unsigned const litLength = (unsigned)(ip - anchor); - token = op++; - if ((outputDirective == limitedOutput) && /* Check output buffer overflow */ - (unlikely(op + litLength + (2 + 1 + LASTLITERALS) + (litLength/255) > olimit)) ) { - return 0; /* cannot compress within `dst` budget. Stored indexes in hash table are nonetheless fine */ - } - if ((outputDirective == fillOutput) && - (unlikely(op + (litLength+240)/255 /* litlen */ + litLength /* literals */ + 2 /* offset */ + 1 /* token */ + MFLIMIT - MINMATCH /* min last literals so last match is <= end - MFLIMIT */ > olimit))) { - op--; - goto _last_literals; - } - if (litLength >= RUN_MASK) { - int len = (int)(litLength - RUN_MASK); - *token = (RUN_MASK<= 255 ; len-=255) *op++ = 255; - *op++ = (BYTE)len; - } - else *token = (BYTE)(litLength< olimit)) { - /* the match was too close to the end, rewind and go to last literals */ - op = token; - goto _last_literals; - } - - /* Encode Offset */ - if (maybe_extMem) { /* static test */ - DEBUGLOG(6, " with offset=%u (ext if > %i)", offset, (int)(ip - (const BYTE*)source)); - assert(offset <= LZ4_DISTANCE_MAX && offset > 0); - LZ4_writeLE16(op, (U16)offset); op+=2; - } else { - DEBUGLOG(6, " with offset=%u (same segment)", (U32)(ip - match)); - assert(ip-match <= LZ4_DISTANCE_MAX); - LZ4_writeLE16(op, (U16)(ip - match)); op+=2; - } - - /* Encode MatchLength */ - { unsigned matchCode; - - if ( (dictDirective==usingExtDict || dictDirective==usingDictCtx) - && (lowLimit==dictionary) /* match within extDict */ ) { - const BYTE* limit = ip + (dictEnd-match); - assert(dictEnd > match); - if (limit > matchlimit) limit = matchlimit; - matchCode = LZ4_count(ip+MINMATCH, match+MINMATCH, limit); - ip += (size_t)matchCode + MINMATCH; - if (ip==limit) { - unsigned const more = LZ4_count(limit, (const BYTE*)source, matchlimit); - matchCode += more; - ip += more; - } - DEBUGLOG(6, " with matchLength=%u starting in extDict", matchCode+MINMATCH); - } else { - matchCode = LZ4_count(ip+MINMATCH, match+MINMATCH, matchlimit); - ip += (size_t)matchCode + MINMATCH; - DEBUGLOG(6, " with matchLength=%u", matchCode+MINMATCH); - } - - if ((outputDirective) && /* Check output buffer overflow */ - (unlikely(op + (1 + LASTLITERALS) + (matchCode+240)/255 > olimit)) ) { - if (outputDirective == fillOutput) { - /* Match description too long : reduce it */ - U32 newMatchCode = 15 /* in token */ - 1 /* to avoid needing a zero byte */ + ((U32)(olimit - op) - 1 - LASTLITERALS) * 255; - ip -= matchCode - newMatchCode; - assert(newMatchCode < matchCode); - matchCode = newMatchCode; - if (unlikely(ip <= filledIp)) { - /* We have already filled up to filledIp so if ip ends up less than filledIp - * we have positions in the hash table beyond the current position. This is - * a problem if we reuse the hash table. So we have to remove these positions - * from the hash table. - */ - const BYTE* ptr; - DEBUGLOG(5, "Clearing %u positions", (U32)(filledIp - ip)); - for (ptr = ip; ptr <= filledIp; ++ptr) { - U32 const h = LZ4_hashPosition(ptr, tableType); - LZ4_clearHash(h, cctx->hashTable, tableType); - } - } - } else { - assert(outputDirective == limitedOutput); - return 0; /* cannot compress within `dst` budget. Stored indexes in hash table are nonetheless fine */ - } - } - if (matchCode >= ML_MASK) { - *token += ML_MASK; - matchCode -= ML_MASK; - LZ4_write32(op, 0xFFFFFFFF); - while (matchCode >= 4*255) { - op+=4; - LZ4_write32(op, 0xFFFFFFFF); - matchCode -= 4*255; - } - op += matchCode / 255; - *op++ = (BYTE)(matchCode % 255); - } else - *token += (BYTE)(matchCode); - } - /* Ensure we have enough space for the last literals. */ - assert(!(outputDirective == fillOutput && op + 1 + LASTLITERALS > olimit)); - - anchor = ip; - - /* Test end of chunk */ - if (ip >= mflimitPlusOne) break; - - /* Fill table */ - { U32 const h = LZ4_hashPosition(ip-2, tableType); - if (tableType == byPtr) { - LZ4_putPositionOnHash(ip-2, h, cctx->hashTable, byPtr); - } else { - U32 const idx = (U32)((ip-2) - base); - LZ4_putIndexOnHash(idx, h, cctx->hashTable, tableType); - } } - - /* Test next position */ - if (tableType == byPtr) { - - match = LZ4_getPosition(ip, cctx->hashTable, tableType); - LZ4_putPosition(ip, cctx->hashTable, tableType); - if ( (match+LZ4_DISTANCE_MAX >= ip) - && (LZ4_read32(match) == LZ4_read32(ip)) ) - { token=op++; *token=0; goto _next_match; } - - } else { /* byU32, byU16 */ - - U32 const h = LZ4_hashPosition(ip, tableType); - U32 const current = (U32)(ip-base); - U32 matchIndex = LZ4_getIndexOnHash(h, cctx->hashTable, tableType); - assert(matchIndex < current); - if (dictDirective == usingDictCtx) { - if (matchIndex < startIndex) { - /* there was no match, try the dictionary */ - assert(tableType == byU32); - matchIndex = LZ4_getIndexOnHash(h, dictCtx->hashTable, byU32); - match = dictBase + matchIndex; - lowLimit = dictionary; /* required for match length counter */ - matchIndex += dictDelta; - } else { - match = base + matchIndex; - lowLimit = (const BYTE*)source; /* required for match length counter */ - } - } else if (dictDirective==usingExtDict) { - if (matchIndex < startIndex) { - assert(dictBase); - match = dictBase + matchIndex; - lowLimit = dictionary; /* required for match length counter */ - } else { - match = base + matchIndex; - lowLimit = (const BYTE*)source; /* required for match length counter */ - } - } else { /* single memory segment */ - match = base + matchIndex; - } - LZ4_putIndexOnHash(current, h, cctx->hashTable, tableType); - assert(matchIndex < current); - if ( ((dictIssue==dictSmall) ? (matchIndex >= prefixIdxLimit) : 1) - && (((tableType==byU16) && (LZ4_DISTANCE_MAX == LZ4_DISTANCE_ABSOLUTE_MAX)) ? 1 : (matchIndex+LZ4_DISTANCE_MAX >= current)) - && (LZ4_read32(match) == LZ4_read32(ip)) ) { - token=op++; - *token=0; - if (maybe_extMem) offset = current - matchIndex; - DEBUGLOG(6, "seq.start:%i, literals=%u, match.start:%i", - (int)(anchor-(const BYTE*)source), 0, (int)(ip-(const BYTE*)source)); - goto _next_match; - } - } - - /* Prepare next loop */ - forwardH = LZ4_hashPosition(++ip, tableType); - - } - -_last_literals: - /* Encode Last Literals */ - { size_t lastRun = (size_t)(iend - anchor); - if ( (outputDirective) && /* Check output buffer overflow */ - (op + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > olimit)) { - if (outputDirective == fillOutput) { - /* adapt lastRun to fill 'dst' */ - assert(olimit >= op); - lastRun = (size_t)(olimit-op) - 1/*token*/; - lastRun -= (lastRun + 256 - RUN_MASK) / 256; /*additional length tokens*/ - } else { - assert(outputDirective == limitedOutput); - return 0; /* cannot compress within `dst` budget. Stored indexes in hash table are nonetheless fine */ - } - } - DEBUGLOG(6, "Final literal run : %i literals", (int)lastRun); - if (lastRun >= RUN_MASK) { - size_t accumulator = lastRun - RUN_MASK; - *op++ = RUN_MASK << ML_BITS; - for(; accumulator >= 255 ; accumulator-=255) *op++ = 255; - *op++ = (BYTE) accumulator; - } else { - *op++ = (BYTE)(lastRun< 0); - DEBUGLOG(5, "LZ4_compress_generic: compressed %i bytes into %i bytes", inputSize, result); - return result; -} - -/** LZ4_compress_generic() : - * inlined, to ensure branches are decided at compilation time; - * takes care of src == (NULL, 0) - * and forward the rest to LZ4_compress_generic_validated */ -LZ4_FORCE_INLINE int LZ4_compress_generic( - LZ4_stream_t_internal* const cctx, - const char* const src, - char* const dst, - const int srcSize, - int *inputConsumed, /* only written when outputDirective == fillOutput */ - const int dstCapacity, - const limitedOutput_directive outputDirective, - const tableType_t tableType, - const dict_directive dictDirective, - const dictIssue_directive dictIssue, - const int acceleration) -{ - DEBUGLOG(5, "LZ4_compress_generic: srcSize=%i, dstCapacity=%i", - srcSize, dstCapacity); - - if ((U32)srcSize > (U32)LZ4_MAX_INPUT_SIZE) { return 0; } /* Unsupported srcSize, too large (or negative) */ - if (srcSize == 0) { /* src == NULL supported if srcSize == 0 */ - if (outputDirective != notLimited && dstCapacity <= 0) return 0; /* no output, can't write anything */ - DEBUGLOG(5, "Generating an empty block"); - assert(outputDirective == notLimited || dstCapacity >= 1); - assert(dst != NULL); - dst[0] = 0; - if (outputDirective == fillOutput) { - assert (inputConsumed != NULL); - *inputConsumed = 0; - } - return 1; - } - assert(src != NULL); - - return LZ4_compress_generic_validated(cctx, src, dst, srcSize, - inputConsumed, /* only written into if outputDirective == fillOutput */ - dstCapacity, outputDirective, - tableType, dictDirective, dictIssue, acceleration); -} - - -int LZ4_compress_fast_extState(void* state, const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration) -{ - LZ4_stream_t_internal* const ctx = & LZ4_initStream(state, sizeof(LZ4_stream_t)) -> internal_donotuse; - assert(ctx != NULL); - if (acceleration < 1) acceleration = LZ4_ACCELERATION_DEFAULT; - if (acceleration > LZ4_ACCELERATION_MAX) acceleration = LZ4_ACCELERATION_MAX; - if (maxOutputSize >= LZ4_compressBound(inputSize)) { - if (inputSize < LZ4_64Klimit) { - return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, 0, notLimited, byU16, noDict, noDictIssue, acceleration); - } else { - const tableType_t tableType = ((sizeof(void*)==4) && ((uptrval)source > LZ4_DISTANCE_MAX)) ? byPtr : byU32; - return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, 0, notLimited, tableType, noDict, noDictIssue, acceleration); - } - } else { - if (inputSize < LZ4_64Klimit) { - return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, byU16, noDict, noDictIssue, acceleration); - } else { - const tableType_t tableType = ((sizeof(void*)==4) && ((uptrval)source > LZ4_DISTANCE_MAX)) ? byPtr : byU32; - return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, noDict, noDictIssue, acceleration); - } - } -} - -/** - * LZ4_compress_fast_extState_fastReset() : - * A variant of LZ4_compress_fast_extState(). - * - * Using this variant avoids an expensive initialization step. It is only safe - * to call if the state buffer is known to be correctly initialized already - * (see comment in lz4.h on LZ4_resetStream_fast() for a definition of - * "correctly initialized"). - */ -int LZ4_compress_fast_extState_fastReset(void* state, const char* src, char* dst, int srcSize, int dstCapacity, int acceleration) -{ - LZ4_stream_t_internal* const ctx = &((LZ4_stream_t*)state)->internal_donotuse; - if (acceleration < 1) acceleration = LZ4_ACCELERATION_DEFAULT; - if (acceleration > LZ4_ACCELERATION_MAX) acceleration = LZ4_ACCELERATION_MAX; - assert(ctx != NULL); - - if (dstCapacity >= LZ4_compressBound(srcSize)) { - if (srcSize < LZ4_64Klimit) { - const tableType_t tableType = byU16; - LZ4_prepareTable(ctx, srcSize, tableType); - if (ctx->currentOffset) { - return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, 0, notLimited, tableType, noDict, dictSmall, acceleration); - } else { - return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, 0, notLimited, tableType, noDict, noDictIssue, acceleration); - } - } else { - const tableType_t tableType = ((sizeof(void*)==4) && ((uptrval)src > LZ4_DISTANCE_MAX)) ? byPtr : byU32; - LZ4_prepareTable(ctx, srcSize, tableType); - return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, 0, notLimited, tableType, noDict, noDictIssue, acceleration); - } - } else { - if (srcSize < LZ4_64Klimit) { - const tableType_t tableType = byU16; - LZ4_prepareTable(ctx, srcSize, tableType); - if (ctx->currentOffset) { - return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, dstCapacity, limitedOutput, tableType, noDict, dictSmall, acceleration); - } else { - return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, dstCapacity, limitedOutput, tableType, noDict, noDictIssue, acceleration); - } - } else { - const tableType_t tableType = ((sizeof(void*)==4) && ((uptrval)src > LZ4_DISTANCE_MAX)) ? byPtr : byU32; - LZ4_prepareTable(ctx, srcSize, tableType); - return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, dstCapacity, limitedOutput, tableType, noDict, noDictIssue, acceleration); - } - } -} - - -int LZ4_compress_fast(const char* src, char* dest, int srcSize, int dstCapacity, int acceleration) -{ - int result; -#if (LZ4_HEAPMODE) - LZ4_stream_t* const ctxPtr = (LZ4_stream_t*)ALLOC(sizeof(LZ4_stream_t)); /* malloc-calloc always properly aligned */ - if (ctxPtr == NULL) return 0; -#else - LZ4_stream_t ctx; - LZ4_stream_t* const ctxPtr = &ctx; -#endif - result = LZ4_compress_fast_extState(ctxPtr, src, dest, srcSize, dstCapacity, acceleration); - -#if (LZ4_HEAPMODE) - FREEMEM(ctxPtr); -#endif - return result; -} - - -int LZ4_compress_default(const char* src, char* dst, int srcSize, int dstCapacity) -{ - return LZ4_compress_fast(src, dst, srcSize, dstCapacity, 1); -} - - -/* Note!: This function leaves the stream in an unclean/broken state! - * It is not safe to subsequently use the same state with a _fastReset() or - * _continue() call without resetting it. */ -static int LZ4_compress_destSize_extState (LZ4_stream_t* state, const char* src, char* dst, int* srcSizePtr, int targetDstSize) -{ - void* const s = LZ4_initStream(state, sizeof (*state)); - assert(s != NULL); (void)s; - - if (targetDstSize >= LZ4_compressBound(*srcSizePtr)) { /* compression success is guaranteed */ - return LZ4_compress_fast_extState(state, src, dst, *srcSizePtr, targetDstSize, 1); - } else { - if (*srcSizePtr < LZ4_64Klimit) { - return LZ4_compress_generic(&state->internal_donotuse, src, dst, *srcSizePtr, srcSizePtr, targetDstSize, fillOutput, byU16, noDict, noDictIssue, 1); - } else { - tableType_t const addrMode = ((sizeof(void*)==4) && ((uptrval)src > LZ4_DISTANCE_MAX)) ? byPtr : byU32; - return LZ4_compress_generic(&state->internal_donotuse, src, dst, *srcSizePtr, srcSizePtr, targetDstSize, fillOutput, addrMode, noDict, noDictIssue, 1); - } } -} - - -int LZ4_compress_destSize(const char* src, char* dst, int* srcSizePtr, int targetDstSize) -{ -#if (LZ4_HEAPMODE) - LZ4_stream_t* const ctx = (LZ4_stream_t*)ALLOC(sizeof(LZ4_stream_t)); /* malloc-calloc always properly aligned */ - if (ctx == NULL) return 0; -#else - LZ4_stream_t ctxBody; - LZ4_stream_t* const ctx = &ctxBody; -#endif - - int result = LZ4_compress_destSize_extState(ctx, src, dst, srcSizePtr, targetDstSize); - -#if (LZ4_HEAPMODE) - FREEMEM(ctx); -#endif - return result; -} - - - -/*-****************************** -* Streaming functions -********************************/ - -#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) -LZ4_stream_t* LZ4_createStream(void) -{ - LZ4_stream_t* const lz4s = (LZ4_stream_t*)ALLOC(sizeof(LZ4_stream_t)); - LZ4_STATIC_ASSERT(sizeof(LZ4_stream_t) >= sizeof(LZ4_stream_t_internal)); - DEBUGLOG(4, "LZ4_createStream %p", lz4s); - if (lz4s == NULL) return NULL; - LZ4_initStream(lz4s, sizeof(*lz4s)); - return lz4s; -} -#endif - -static size_t LZ4_stream_t_alignment(void) -{ -#if LZ4_ALIGN_TEST - typedef struct { char c; LZ4_stream_t t; } t_a; - return sizeof(t_a) - sizeof(LZ4_stream_t); -#else - return 1; /* effectively disabled */ -#endif -} - -LZ4_stream_t* LZ4_initStream (void* buffer, size_t size) -{ - DEBUGLOG(5, "LZ4_initStream"); - if (buffer == NULL) { return NULL; } - if (size < sizeof(LZ4_stream_t)) { return NULL; } - if (!LZ4_isAligned(buffer, LZ4_stream_t_alignment())) return NULL; - MEM_INIT(buffer, 0, sizeof(LZ4_stream_t_internal)); - return (LZ4_stream_t*)buffer; -} - -/* resetStream is now deprecated, - * prefer initStream() which is more general */ -void LZ4_resetStream (LZ4_stream_t* LZ4_stream) -{ - DEBUGLOG(5, "LZ4_resetStream (ctx:%p)", LZ4_stream); - MEM_INIT(LZ4_stream, 0, sizeof(LZ4_stream_t_internal)); -} - -void LZ4_resetStream_fast(LZ4_stream_t* ctx) { - LZ4_prepareTable(&(ctx->internal_donotuse), 0, byU32); -} - -#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) -int LZ4_freeStream (LZ4_stream_t* LZ4_stream) -{ - if (!LZ4_stream) return 0; /* support free on NULL */ - DEBUGLOG(5, "LZ4_freeStream %p", LZ4_stream); - FREEMEM(LZ4_stream); - return (0); -} -#endif - - -#define HASH_UNIT sizeof(reg_t) -int LZ4_loadDict (LZ4_stream_t* LZ4_dict, const char* dictionary, int dictSize) -{ - LZ4_stream_t_internal* const dict = &LZ4_dict->internal_donotuse; - const tableType_t tableType = byU32; - const BYTE* p = (const BYTE*)dictionary; - const BYTE* const dictEnd = p + dictSize; - U32 idx32; - - DEBUGLOG(4, "LZ4_loadDict (%i bytes from %p into %p)", dictSize, dictionary, LZ4_dict); - - /* It's necessary to reset the context, - * and not just continue it with prepareTable() - * to avoid any risk of generating overflowing matchIndex - * when compressing using this dictionary */ - LZ4_resetStream(LZ4_dict); - - /* We always increment the offset by 64 KB, since, if the dict is longer, - * we truncate it to the last 64k, and if it's shorter, we still want to - * advance by a whole window length so we can provide the guarantee that - * there are only valid offsets in the window, which allows an optimization - * in LZ4_compress_fast_continue() where it uses noDictIssue even when the - * dictionary isn't a full 64k. */ - dict->currentOffset += 64 KB; - - if (dictSize < (int)HASH_UNIT) { - return 0; - } - - if ((dictEnd - p) > 64 KB) p = dictEnd - 64 KB; - dict->dictionary = p; - dict->dictSize = (U32)(dictEnd - p); - dict->tableType = (U32)tableType; - idx32 = dict->currentOffset - dict->dictSize; - - while (p <= dictEnd-HASH_UNIT) { - U32 const h = LZ4_hashPosition(p, tableType); - LZ4_putIndexOnHash(idx32, h, dict->hashTable, tableType); - p+=3; idx32+=3; - } - - return (int)dict->dictSize; -} - -void LZ4_attach_dictionary(LZ4_stream_t* workingStream, const LZ4_stream_t* dictionaryStream) -{ - const LZ4_stream_t_internal* dictCtx = (dictionaryStream == NULL) ? NULL : - &(dictionaryStream->internal_donotuse); - - DEBUGLOG(4, "LZ4_attach_dictionary (%p, %p, size %u)", - workingStream, dictionaryStream, - dictCtx != NULL ? dictCtx->dictSize : 0); - - if (dictCtx != NULL) { - /* If the current offset is zero, we will never look in the - * external dictionary context, since there is no value a table - * entry can take that indicate a miss. In that case, we need - * to bump the offset to something non-zero. - */ - if (workingStream->internal_donotuse.currentOffset == 0) { - workingStream->internal_donotuse.currentOffset = 64 KB; - } - - /* Don't actually attach an empty dictionary. - */ - if (dictCtx->dictSize == 0) { - dictCtx = NULL; - } - } - workingStream->internal_donotuse.dictCtx = dictCtx; -} - - -static void LZ4_renormDictT(LZ4_stream_t_internal* LZ4_dict, int nextSize) -{ - assert(nextSize >= 0); - if (LZ4_dict->currentOffset + (unsigned)nextSize > 0x80000000) { /* potential ptrdiff_t overflow (32-bits mode) */ - /* rescale hash table */ - U32 const delta = LZ4_dict->currentOffset - 64 KB; - const BYTE* dictEnd = LZ4_dict->dictionary + LZ4_dict->dictSize; - int i; - DEBUGLOG(4, "LZ4_renormDictT"); - for (i=0; ihashTable[i] < delta) LZ4_dict->hashTable[i]=0; - else LZ4_dict->hashTable[i] -= delta; - } - LZ4_dict->currentOffset = 64 KB; - if (LZ4_dict->dictSize > 64 KB) LZ4_dict->dictSize = 64 KB; - LZ4_dict->dictionary = dictEnd - LZ4_dict->dictSize; - } -} - - -int LZ4_compress_fast_continue (LZ4_stream_t* LZ4_stream, - const char* source, char* dest, - int inputSize, int maxOutputSize, - int acceleration) -{ - const tableType_t tableType = byU32; - LZ4_stream_t_internal* const streamPtr = &LZ4_stream->internal_donotuse; - const char* dictEnd = streamPtr->dictSize ? (const char*)streamPtr->dictionary + streamPtr->dictSize : NULL; - - DEBUGLOG(5, "LZ4_compress_fast_continue (inputSize=%i, dictSize=%u)", inputSize, streamPtr->dictSize); - - LZ4_renormDictT(streamPtr, inputSize); /* fix index overflow */ - if (acceleration < 1) acceleration = LZ4_ACCELERATION_DEFAULT; - if (acceleration > LZ4_ACCELERATION_MAX) acceleration = LZ4_ACCELERATION_MAX; - - /* invalidate tiny dictionaries */ - if ( (streamPtr->dictSize < 4) /* tiny dictionary : not enough for a hash */ - && (dictEnd != source) /* prefix mode */ - && (inputSize > 0) /* tolerance : don't lose history, in case next invocation would use prefix mode */ - && (streamPtr->dictCtx == NULL) /* usingDictCtx */ - ) { - DEBUGLOG(5, "LZ4_compress_fast_continue: dictSize(%u) at addr:%p is too small", streamPtr->dictSize, streamPtr->dictionary); - /* remove dictionary existence from history, to employ faster prefix mode */ - streamPtr->dictSize = 0; - streamPtr->dictionary = (const BYTE*)source; - dictEnd = source; - } - - /* Check overlapping input/dictionary space */ - { const char* const sourceEnd = source + inputSize; - if ((sourceEnd > (const char*)streamPtr->dictionary) && (sourceEnd < dictEnd)) { - streamPtr->dictSize = (U32)(dictEnd - sourceEnd); - if (streamPtr->dictSize > 64 KB) streamPtr->dictSize = 64 KB; - if (streamPtr->dictSize < 4) streamPtr->dictSize = 0; - streamPtr->dictionary = (const BYTE*)dictEnd - streamPtr->dictSize; - } - } - - /* prefix mode : source data follows dictionary */ - if (dictEnd == source) { - if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) - return LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, withPrefix64k, dictSmall, acceleration); - else - return LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, withPrefix64k, noDictIssue, acceleration); - } - - /* external dictionary mode */ - { int result; - if (streamPtr->dictCtx) { - /* We depend here on the fact that dictCtx'es (produced by - * LZ4_loadDict) guarantee that their tables contain no references - * to offsets between dictCtx->currentOffset - 64 KB and - * dictCtx->currentOffset - dictCtx->dictSize. This makes it safe - * to use noDictIssue even when the dict isn't a full 64 KB. - */ - if (inputSize > 4 KB) { - /* For compressing large blobs, it is faster to pay the setup - * cost to copy the dictionary's tables into the active context, - * so that the compression loop is only looking into one table. - */ - LZ4_memcpy(streamPtr, streamPtr->dictCtx, sizeof(*streamPtr)); - result = LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, usingExtDict, noDictIssue, acceleration); - } else { - result = LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, usingDictCtx, noDictIssue, acceleration); - } - } else { /* small data <= 4 KB */ - if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) { - result = LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, usingExtDict, dictSmall, acceleration); - } else { - result = LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, usingExtDict, noDictIssue, acceleration); - } - } - streamPtr->dictionary = (const BYTE*)source; - streamPtr->dictSize = (U32)inputSize; - return result; - } -} - - -/* Hidden debug function, to force-test external dictionary mode */ -int LZ4_compress_forceExtDict (LZ4_stream_t* LZ4_dict, const char* source, char* dest, int srcSize) -{ - LZ4_stream_t_internal* const streamPtr = &LZ4_dict->internal_donotuse; - int result; - - LZ4_renormDictT(streamPtr, srcSize); - - if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) { - result = LZ4_compress_generic(streamPtr, source, dest, srcSize, NULL, 0, notLimited, byU32, usingExtDict, dictSmall, 1); - } else { - result = LZ4_compress_generic(streamPtr, source, dest, srcSize, NULL, 0, notLimited, byU32, usingExtDict, noDictIssue, 1); - } - - streamPtr->dictionary = (const BYTE*)source; - streamPtr->dictSize = (U32)srcSize; - - return result; -} - - -/*! LZ4_saveDict() : - * If previously compressed data block is not guaranteed to remain available at its memory location, - * save it into a safer place (char* safeBuffer). - * Note : no need to call LZ4_loadDict() afterwards, dictionary is immediately usable, - * one can therefore call LZ4_compress_fast_continue() right after. - * @return : saved dictionary size in bytes (necessarily <= dictSize), or 0 if error. - */ -int LZ4_saveDict (LZ4_stream_t* LZ4_dict, char* safeBuffer, int dictSize) -{ - LZ4_stream_t_internal* const dict = &LZ4_dict->internal_donotuse; - - DEBUGLOG(5, "LZ4_saveDict : dictSize=%i, safeBuffer=%p", dictSize, safeBuffer); - - if ((U32)dictSize > 64 KB) { dictSize = 64 KB; } /* useless to define a dictionary > 64 KB */ - if ((U32)dictSize > dict->dictSize) { dictSize = (int)dict->dictSize; } - - if (safeBuffer == NULL) assert(dictSize == 0); - if (dictSize > 0) { - const BYTE* const previousDictEnd = dict->dictionary + dict->dictSize; - assert(dict->dictionary); - LZ4_memmove(safeBuffer, previousDictEnd - dictSize, (size_t)dictSize); - } - - dict->dictionary = (const BYTE*)safeBuffer; - dict->dictSize = (U32)dictSize; - - return dictSize; -} - - - -/*-******************************* - * Decompression functions - ********************************/ - -typedef enum { decode_full_block = 0, partial_decode = 1 } earlyEnd_directive; - -#undef MIN -#define MIN(a,b) ( (a) < (b) ? (a) : (b) ) - - -/* variant for decompress_unsafe() - * does not know end of input - * presumes input is well formed - * note : will consume at least one byte */ -static size_t read_long_length_no_check(const BYTE** pp) -{ - size_t b, l = 0; - do { b = **pp; (*pp)++; l += b; } while (b==255); - DEBUGLOG(6, "read_long_length_no_check: +length=%zu using %zu input bytes", l, l/255 + 1) - return l; -} - -/* core decoder variant for LZ4_decompress_fast*() - * for legacy support only : these entry points are deprecated. - * - Presumes input is correctly formed (no defense vs malformed inputs) - * - Does not know input size (presume input buffer is "large enough") - * - Decompress a full block (only) - * @return : nb of bytes read from input. - * Note : this variant is not optimized for speed, just for maintenance. - * the goal is to remove support of decompress_fast*() variants by v2.0 -**/ -LZ4_FORCE_INLINE int -LZ4_decompress_unsafe_generic( - const BYTE* const istart, - BYTE* const ostart, - int decompressedSize, - - size_t prefixSize, - const BYTE* const dictStart, /* only if dict==usingExtDict */ - const size_t dictSize /* note: =0 if dictStart==NULL */ - ) -{ - const BYTE* ip = istart; - BYTE* op = (BYTE*)ostart; - BYTE* const oend = ostart + decompressedSize; - const BYTE* const prefixStart = ostart - prefixSize; - - DEBUGLOG(5, "LZ4_decompress_unsafe_generic"); - if (dictStart == NULL) assert(dictSize == 0); - - while (1) { - /* start new sequence */ - unsigned token = *ip++; - - /* literals */ - { size_t ll = token >> ML_BITS; - if (ll==15) { - /* long literal length */ - ll += read_long_length_no_check(&ip); - } - if ((size_t)(oend-op) < ll) return -1; /* output buffer overflow */ - LZ4_memmove(op, ip, ll); /* support in-place decompression */ - op += ll; - ip += ll; - if ((size_t)(oend-op) < MFLIMIT) { - if (op==oend) break; /* end of block */ - DEBUGLOG(5, "invalid: literals end at distance %zi from end of block", oend-op); - /* incorrect end of block : - * last match must start at least MFLIMIT==12 bytes before end of output block */ - return -1; - } } - - /* match */ - { size_t ml = token & 15; - size_t const offset = LZ4_readLE16(ip); - ip+=2; - - if (ml==15) { - /* long literal length */ - ml += read_long_length_no_check(&ip); - } - ml += MINMATCH; - - if ((size_t)(oend-op) < ml) return -1; /* output buffer overflow */ - - { const BYTE* match = op - offset; - - /* out of range */ - if (offset > (size_t)(op - prefixStart) + dictSize) { - DEBUGLOG(6, "offset out of range"); - return -1; - } - - /* check special case : extDict */ - if (offset > (size_t)(op - prefixStart)) { - /* extDict scenario */ - const BYTE* const dictEnd = dictStart + dictSize; - const BYTE* extMatch = dictEnd - (offset - (size_t)(op-prefixStart)); - size_t const extml = (size_t)(dictEnd - extMatch); - if (extml > ml) { - /* match entirely within extDict */ - LZ4_memmove(op, extMatch, ml); - op += ml; - ml = 0; - } else { - /* match split between extDict & prefix */ - LZ4_memmove(op, extMatch, extml); - op += extml; - ml -= extml; - } - match = prefixStart; - } - - /* match copy - slow variant, supporting overlap copy */ - { size_t u; - for (u=0; u= ipmax before start of loop. Returns initial_error if so. - * @error (output) - error code. Must be set to 0 before call. -**/ -typedef size_t Rvl_t; -static const Rvl_t rvl_error = (Rvl_t)(-1); -LZ4_FORCE_INLINE Rvl_t -read_variable_length(const BYTE** ip, const BYTE* ilimit, - int initial_check) -{ - Rvl_t s, length = 0; - assert(ip != NULL); - assert(*ip != NULL); - assert(ilimit != NULL); - if (initial_check && unlikely((*ip) >= ilimit)) { /* read limit reached */ - return rvl_error; - } - s = **ip; - (*ip)++; - length += s; - if (unlikely((*ip) > ilimit)) { /* read limit reached */ - return rvl_error; - } - /* accumulator overflow detection (32-bit mode only) */ - if ((sizeof(length) < 8) && unlikely(length > ((Rvl_t)(-1)/2)) ) { - return rvl_error; - } - if (likely(s != 255)) return length; - do { - s = **ip; - (*ip)++; - length += s; - if (unlikely((*ip) > ilimit)) { /* read limit reached */ - return rvl_error; - } - /* accumulator overflow detection (32-bit mode only) */ - if ((sizeof(length) < 8) && unlikely(length > ((Rvl_t)(-1)/2)) ) { - return rvl_error; - } - } while (s == 255); - - return length; -} - -/*! LZ4_decompress_generic() : - * This generic decompression function covers all use cases. - * It shall be instantiated several times, using different sets of directives. - * Note that it is important for performance that this function really get inlined, - * in order to remove useless branches during compilation optimization. - */ -LZ4_FORCE_INLINE int -LZ4_decompress_generic( - const char* const src, - char* const dst, - int srcSize, - int outputSize, /* If endOnInput==endOnInputSize, this value is `dstCapacity` */ - - earlyEnd_directive partialDecoding, /* full, partial */ - dict_directive dict, /* noDict, withPrefix64k, usingExtDict */ - const BYTE* const lowPrefix, /* always <= dst, == dst when no prefix */ - const BYTE* const dictStart, /* only if dict==usingExtDict */ - const size_t dictSize /* note : = 0 if noDict */ - ) -{ - if ((src == NULL) || (outputSize < 0)) { return -1; } - - { const BYTE* ip = (const BYTE*) src; - const BYTE* const iend = ip + srcSize; - - BYTE* op = (BYTE*) dst; - BYTE* const oend = op + outputSize; - BYTE* cpy; - - const BYTE* const dictEnd = (dictStart == NULL) ? NULL : dictStart + dictSize; - - const int checkOffset = (dictSize < (int)(64 KB)); - - - /* Set up the "end" pointers for the shortcut. */ - const BYTE* const shortiend = iend - 14 /*maxLL*/ - 2 /*offset*/; - const BYTE* const shortoend = oend - 14 /*maxLL*/ - 18 /*maxML*/; - - const BYTE* match; - size_t offset; - unsigned token; - size_t length; - - - DEBUGLOG(5, "LZ4_decompress_generic (srcSize:%i, dstSize:%i)", srcSize, outputSize); - - /* Special cases */ - assert(lowPrefix <= op); - if (unlikely(outputSize==0)) { - /* Empty output buffer */ - if (partialDecoding) return 0; - return ((srcSize==1) && (*ip==0)) ? 0 : -1; - } - if (unlikely(srcSize==0)) { return -1; } - - /* LZ4_FAST_DEC_LOOP: - * designed for modern OoO performance cpus, - * where copying reliably 32-bytes is preferable to an unpredictable branch. - * note : fast loop may show a regression for some client arm chips. */ -#if LZ4_FAST_DEC_LOOP - if ((oend - op) < FASTLOOP_SAFE_DISTANCE) { - DEBUGLOG(6, "skip fast decode loop"); - goto safe_decode; - } - - /* Fast loop : decode sequences as long as output < oend-FASTLOOP_SAFE_DISTANCE */ - DEBUGLOG(6, "using fast decode loop"); - while (1) { - /* Main fastloop assertion: We can always wildcopy FASTLOOP_SAFE_DISTANCE */ - assert(oend - op >= FASTLOOP_SAFE_DISTANCE); - assert(ip < iend); - token = *ip++; - length = token >> ML_BITS; /* literal length */ - - /* decode literal length */ - if (length == RUN_MASK) { - size_t const addl = read_variable_length(&ip, iend-RUN_MASK, 1); - if (addl == rvl_error) { - DEBUGLOG(6, "error reading long literal length"); - goto _output_error; - } - length += addl; - if (unlikely((uptrval)(op)+length<(uptrval)(op))) { goto _output_error; } /* overflow detection */ - if (unlikely((uptrval)(ip)+length<(uptrval)(ip))) { goto _output_error; } /* overflow detection */ - - /* copy literals */ - LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH); - if ((op+length>oend-32) || (ip+length>iend-32)) { goto safe_literal_copy; } - LZ4_wildCopy32(op, ip, op+length); - ip += length; op += length; - } else if (ip <= iend-(16 + 1/*max lit + offset + nextToken*/)) { - /* We don't need to check oend, since we check it once for each loop below */ - DEBUGLOG(7, "copy %u bytes in a 16-bytes stripe", (unsigned)length); - /* Literals can only be <= 14, but hope compilers optimize better when copy by a register size */ - LZ4_memcpy(op, ip, 16); - ip += length; op += length; - } else { - goto safe_literal_copy; - } - - /* get offset */ - offset = LZ4_readLE16(ip); ip+=2; - DEBUGLOG(6, " offset = %zu", offset); - match = op - offset; - assert(match <= op); /* overflow check */ - - /* get matchlength */ - length = token & ML_MASK; - - if (length == ML_MASK) { - size_t const addl = read_variable_length(&ip, iend - LASTLITERALS + 1, 0); - if (addl == rvl_error) { - DEBUGLOG(6, "error reading long match length"); - goto _output_error; - } - length += addl; - length += MINMATCH; - if (unlikely((uptrval)(op)+length<(uptrval)op)) { goto _output_error; } /* overflow detection */ - if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) { - goto safe_match_copy; - } - } else { - length += MINMATCH; - if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) { - goto safe_match_copy; - } - - /* Fastpath check: skip LZ4_wildCopy32 when true */ - if ((dict == withPrefix64k) || (match >= lowPrefix)) { - if (offset >= 8) { - assert(match >= lowPrefix); - assert(match <= op); - assert(op + 18 <= oend); - - LZ4_memcpy(op, match, 8); - LZ4_memcpy(op+8, match+8, 8); - LZ4_memcpy(op+16, match+16, 2); - op += length; - continue; - } } } - - if ( checkOffset && (unlikely(match + dictSize < lowPrefix)) ) { - DEBUGLOG(6, "Error : pos=%zi, offset=%zi => outside buffers", op-lowPrefix, op-match); - goto _output_error; - } - /* match starting within external dictionary */ - if ((dict==usingExtDict) && (match < lowPrefix)) { - assert(dictEnd != NULL); - if (unlikely(op+length > oend-LASTLITERALS)) { - if (partialDecoding) { - DEBUGLOG(7, "partialDecoding: dictionary match, close to dstEnd"); - length = MIN(length, (size_t)(oend-op)); - } else { - DEBUGLOG(6, "end-of-block condition violated") - goto _output_error; - } } - - if (length <= (size_t)(lowPrefix-match)) { - /* match fits entirely within external dictionary : just copy */ - LZ4_memmove(op, dictEnd - (lowPrefix-match), length); - op += length; - } else { - /* match stretches into both external dictionary and current block */ - size_t const copySize = (size_t)(lowPrefix - match); - size_t const restSize = length - copySize; - LZ4_memcpy(op, dictEnd - copySize, copySize); - op += copySize; - if (restSize > (size_t)(op - lowPrefix)) { /* overlap copy */ - BYTE* const endOfMatch = op + restSize; - const BYTE* copyFrom = lowPrefix; - while (op < endOfMatch) { *op++ = *copyFrom++; } - } else { - LZ4_memcpy(op, lowPrefix, restSize); - op += restSize; - } } - continue; - } - - /* copy match within block */ - cpy = op + length; - - assert((op <= oend) && (oend-op >= 32)); - if (unlikely(offset<16)) { - LZ4_memcpy_using_offset(op, match, cpy, offset); - } else { - LZ4_wildCopy32(op, match, cpy); - } - - op = cpy; /* wildcopy correction */ - } - safe_decode: -#endif - - /* Main Loop : decode remaining sequences where output < FASTLOOP_SAFE_DISTANCE */ - DEBUGLOG(6, "using safe decode loop"); - while (1) { - assert(ip < iend); - token = *ip++; - length = token >> ML_BITS; /* literal length */ - - /* A two-stage shortcut for the most common case: - * 1) If the literal length is 0..14, and there is enough space, - * enter the shortcut and copy 16 bytes on behalf of the literals - * (in the fast mode, only 8 bytes can be safely copied this way). - * 2) Further if the match length is 4..18, copy 18 bytes in a similar - * manner; but we ensure that there's enough space in the output for - * those 18 bytes earlier, upon entering the shortcut (in other words, - * there is a combined check for both stages). - */ - if ( (length != RUN_MASK) - /* strictly "less than" on input, to re-enter the loop with at least one byte */ - && likely((ip < shortiend) & (op <= shortoend)) ) { - /* Copy the literals */ - LZ4_memcpy(op, ip, 16); - op += length; ip += length; - - /* The second stage: prepare for match copying, decode full info. - * If it doesn't work out, the info won't be wasted. */ - length = token & ML_MASK; /* match length */ - offset = LZ4_readLE16(ip); ip += 2; - match = op - offset; - assert(match <= op); /* check overflow */ - - /* Do not deal with overlapping matches. */ - if ( (length != ML_MASK) - && (offset >= 8) - && (dict==withPrefix64k || match >= lowPrefix) ) { - /* Copy the match. */ - LZ4_memcpy(op + 0, match + 0, 8); - LZ4_memcpy(op + 8, match + 8, 8); - LZ4_memcpy(op +16, match +16, 2); - op += length + MINMATCH; - /* Both stages worked, load the next token. */ - continue; - } - - /* The second stage didn't work out, but the info is ready. - * Propel it right to the point of match copying. */ - goto _copy_match; - } - - /* decode literal length */ - if (length == RUN_MASK) { - size_t const addl = read_variable_length(&ip, iend-RUN_MASK, 1); - if (addl == rvl_error) { goto _output_error; } - length += addl; - if (unlikely((uptrval)(op)+length<(uptrval)(op))) { goto _output_error; } /* overflow detection */ - if (unlikely((uptrval)(ip)+length<(uptrval)(ip))) { goto _output_error; } /* overflow detection */ - } - -#if LZ4_FAST_DEC_LOOP - safe_literal_copy: -#endif - /* copy literals */ - cpy = op+length; - - LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH); - if ((cpy>oend-MFLIMIT) || (ip+length>iend-(2+1+LASTLITERALS))) { - /* We've either hit the input parsing restriction or the output parsing restriction. - * In the normal scenario, decoding a full block, it must be the last sequence, - * otherwise it's an error (invalid input or dimensions). - * In partialDecoding scenario, it's necessary to ensure there is no buffer overflow. - */ - if (partialDecoding) { - /* Since we are partial decoding we may be in this block because of the output parsing - * restriction, which is not valid since the output buffer is allowed to be undersized. - */ - DEBUGLOG(7, "partialDecoding: copying literals, close to input or output end") - DEBUGLOG(7, "partialDecoding: literal length = %u", (unsigned)length); - DEBUGLOG(7, "partialDecoding: remaining space in dstBuffer : %i", (int)(oend - op)); - DEBUGLOG(7, "partialDecoding: remaining space in srcBuffer : %i", (int)(iend - ip)); - /* Finishing in the middle of a literals segment, - * due to lack of input. - */ - if (ip+length > iend) { - length = (size_t)(iend-ip); - cpy = op + length; - } - /* Finishing in the middle of a literals segment, - * due to lack of output space. - */ - if (cpy > oend) { - cpy = oend; - assert(op<=oend); - length = (size_t)(oend-op); - } - } else { - /* We must be on the last sequence (or invalid) because of the parsing limitations - * so check that we exactly consume the input and don't overrun the output buffer. - */ - if ((ip+length != iend) || (cpy > oend)) { - DEBUGLOG(6, "should have been last run of literals") - DEBUGLOG(6, "ip(%p) + length(%i) = %p != iend (%p)", ip, (int)length, ip+length, iend); - DEBUGLOG(6, "or cpy(%p) > oend(%p)", cpy, oend); - goto _output_error; - } - } - LZ4_memmove(op, ip, length); /* supports overlapping memory regions, for in-place decompression scenarios */ - ip += length; - op += length; - /* Necessarily EOF when !partialDecoding. - * When partialDecoding, it is EOF if we've either - * filled the output buffer or - * can't proceed with reading an offset for following match. - */ - if (!partialDecoding || (cpy == oend) || (ip >= (iend-2))) { - break; - } - } else { - LZ4_wildCopy8(op, ip, cpy); /* can overwrite up to 8 bytes beyond cpy */ - ip += length; op = cpy; - } - - /* get offset */ - offset = LZ4_readLE16(ip); ip+=2; - match = op - offset; - - /* get matchlength */ - length = token & ML_MASK; - - _copy_match: - if (length == ML_MASK) { - size_t const addl = read_variable_length(&ip, iend - LASTLITERALS + 1, 0); - if (addl == rvl_error) { goto _output_error; } - length += addl; - if (unlikely((uptrval)(op)+length<(uptrval)op)) goto _output_error; /* overflow detection */ - } - length += MINMATCH; - -#if LZ4_FAST_DEC_LOOP - safe_match_copy: -#endif - if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) goto _output_error; /* Error : offset outside buffers */ - /* match starting within external dictionary */ - if ((dict==usingExtDict) && (match < lowPrefix)) { - assert(dictEnd != NULL); - if (unlikely(op+length > oend-LASTLITERALS)) { - if (partialDecoding) length = MIN(length, (size_t)(oend-op)); - else goto _output_error; /* doesn't respect parsing restriction */ - } - - if (length <= (size_t)(lowPrefix-match)) { - /* match fits entirely within external dictionary : just copy */ - LZ4_memmove(op, dictEnd - (lowPrefix-match), length); - op += length; - } else { - /* match stretches into both external dictionary and current block */ - size_t const copySize = (size_t)(lowPrefix - match); - size_t const restSize = length - copySize; - LZ4_memcpy(op, dictEnd - copySize, copySize); - op += copySize; - if (restSize > (size_t)(op - lowPrefix)) { /* overlap copy */ - BYTE* const endOfMatch = op + restSize; - const BYTE* copyFrom = lowPrefix; - while (op < endOfMatch) *op++ = *copyFrom++; - } else { - LZ4_memcpy(op, lowPrefix, restSize); - op += restSize; - } } - continue; - } - assert(match >= lowPrefix); - - /* copy match within block */ - cpy = op + length; - - /* partialDecoding : may end anywhere within the block */ - assert(op<=oend); - if (partialDecoding && (cpy > oend-MATCH_SAFEGUARD_DISTANCE)) { - size_t const mlen = MIN(length, (size_t)(oend-op)); - const BYTE* const matchEnd = match + mlen; - BYTE* const copyEnd = op + mlen; - if (matchEnd > op) { /* overlap copy */ - while (op < copyEnd) { *op++ = *match++; } - } else { - LZ4_memcpy(op, match, mlen); - } - op = copyEnd; - if (op == oend) { break; } - continue; - } - - if (unlikely(offset<8)) { - LZ4_write32(op, 0); /* silence msan warning when offset==0 */ - op[0] = match[0]; - op[1] = match[1]; - op[2] = match[2]; - op[3] = match[3]; - match += inc32table[offset]; - LZ4_memcpy(op+4, match, 4); - match -= dec64table[offset]; - } else { - LZ4_memcpy(op, match, 8); - match += 8; - } - op += 8; - - if (unlikely(cpy > oend-MATCH_SAFEGUARD_DISTANCE)) { - BYTE* const oCopyLimit = oend - (WILDCOPYLENGTH-1); - if (cpy > oend-LASTLITERALS) { goto _output_error; } /* Error : last LASTLITERALS bytes must be literals (uncompressed) */ - if (op < oCopyLimit) { - LZ4_wildCopy8(op, match, oCopyLimit); - match += oCopyLimit - op; - op = oCopyLimit; - } - while (op < cpy) { *op++ = *match++; } - } else { - LZ4_memcpy(op, match, 8); - if (length > 16) { LZ4_wildCopy8(op+8, match+8, cpy); } - } - op = cpy; /* wildcopy correction */ - } - - /* end of decoding */ - DEBUGLOG(5, "decoded %i bytes", (int) (((char*)op)-dst)); - return (int) (((char*)op)-dst); /* Nb of output bytes decoded */ - - /* Overflow error detected */ - _output_error: - return (int) (-(((const char*)ip)-src))-1; - } -} - - -/*===== Instantiate the API decoding functions. =====*/ - -LZ4_FORCE_O2 -int LZ4_decompress_safe(const char* source, char* dest, int compressedSize, int maxDecompressedSize) -{ - return LZ4_decompress_generic(source, dest, compressedSize, maxDecompressedSize, - decode_full_block, noDict, - (BYTE*)dest, NULL, 0); -} - -LZ4_FORCE_O2 -int LZ4_decompress_safe_partial(const char* src, char* dst, int compressedSize, int targetOutputSize, int dstCapacity) -{ - dstCapacity = MIN(targetOutputSize, dstCapacity); - return LZ4_decompress_generic(src, dst, compressedSize, dstCapacity, - partial_decode, - noDict, (BYTE*)dst, NULL, 0); -} - -LZ4_FORCE_O2 -int LZ4_decompress_fast(const char* source, char* dest, int originalSize) -{ - DEBUGLOG(5, "LZ4_decompress_fast"); - return LZ4_decompress_unsafe_generic( - (const BYTE*)source, (BYTE*)dest, originalSize, - 0, NULL, 0); -} - -/*===== Instantiate a few more decoding cases, used more than once. =====*/ - -LZ4_FORCE_O2 /* Exported, an obsolete API function. */ -int LZ4_decompress_safe_withPrefix64k(const char* source, char* dest, int compressedSize, int maxOutputSize) -{ - return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, - decode_full_block, withPrefix64k, - (BYTE*)dest - 64 KB, NULL, 0); -} - -LZ4_FORCE_O2 -static int LZ4_decompress_safe_partial_withPrefix64k(const char* source, char* dest, int compressedSize, int targetOutputSize, int dstCapacity) -{ - dstCapacity = MIN(targetOutputSize, dstCapacity); - return LZ4_decompress_generic(source, dest, compressedSize, dstCapacity, - partial_decode, withPrefix64k, - (BYTE*)dest - 64 KB, NULL, 0); -} - -/* Another obsolete API function, paired with the previous one. */ -int LZ4_decompress_fast_withPrefix64k(const char* source, char* dest, int originalSize) -{ - return LZ4_decompress_unsafe_generic( - (const BYTE*)source, (BYTE*)dest, originalSize, - 64 KB, NULL, 0); -} - -LZ4_FORCE_O2 -static int LZ4_decompress_safe_withSmallPrefix(const char* source, char* dest, int compressedSize, int maxOutputSize, - size_t prefixSize) -{ - return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, - decode_full_block, noDict, - (BYTE*)dest-prefixSize, NULL, 0); -} - -LZ4_FORCE_O2 -static int LZ4_decompress_safe_partial_withSmallPrefix(const char* source, char* dest, int compressedSize, int targetOutputSize, int dstCapacity, - size_t prefixSize) -{ - dstCapacity = MIN(targetOutputSize, dstCapacity); - return LZ4_decompress_generic(source, dest, compressedSize, dstCapacity, - partial_decode, noDict, - (BYTE*)dest-prefixSize, NULL, 0); -} - -LZ4_FORCE_O2 -int LZ4_decompress_safe_forceExtDict(const char* source, char* dest, - int compressedSize, int maxOutputSize, - const void* dictStart, size_t dictSize) -{ - DEBUGLOG(5, "LZ4_decompress_safe_forceExtDict"); - return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, - decode_full_block, usingExtDict, - (BYTE*)dest, (const BYTE*)dictStart, dictSize); -} - -LZ4_FORCE_O2 -int LZ4_decompress_safe_partial_forceExtDict(const char* source, char* dest, - int compressedSize, int targetOutputSize, int dstCapacity, - const void* dictStart, size_t dictSize) -{ - dstCapacity = MIN(targetOutputSize, dstCapacity); - return LZ4_decompress_generic(source, dest, compressedSize, dstCapacity, - partial_decode, usingExtDict, - (BYTE*)dest, (const BYTE*)dictStart, dictSize); -} - -LZ4_FORCE_O2 -static int LZ4_decompress_fast_extDict(const char* source, char* dest, int originalSize, - const void* dictStart, size_t dictSize) -{ - return LZ4_decompress_unsafe_generic( - (const BYTE*)source, (BYTE*)dest, originalSize, - 0, (const BYTE*)dictStart, dictSize); -} - -/* The "double dictionary" mode, for use with e.g. ring buffers: the first part - * of the dictionary is passed as prefix, and the second via dictStart + dictSize. - * These routines are used only once, in LZ4_decompress_*_continue(). - */ -LZ4_FORCE_INLINE -int LZ4_decompress_safe_doubleDict(const char* source, char* dest, int compressedSize, int maxOutputSize, - size_t prefixSize, const void* dictStart, size_t dictSize) -{ - return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, - decode_full_block, usingExtDict, - (BYTE*)dest-prefixSize, (const BYTE*)dictStart, dictSize); -} - -/*===== streaming decompression functions =====*/ - -#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) -LZ4_streamDecode_t* LZ4_createStreamDecode(void) -{ - LZ4_STATIC_ASSERT(sizeof(LZ4_streamDecode_t) >= sizeof(LZ4_streamDecode_t_internal)); - return (LZ4_streamDecode_t*) ALLOC_AND_ZERO(sizeof(LZ4_streamDecode_t)); -} - -int LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream) -{ - if (LZ4_stream == NULL) { return 0; } /* support free on NULL */ - FREEMEM(LZ4_stream); - return 0; -} -#endif - -/*! LZ4_setStreamDecode() : - * Use this function to instruct where to find the dictionary. - * This function is not necessary if previous data is still available where it was decoded. - * Loading a size of 0 is allowed (same effect as no dictionary). - * @return : 1 if OK, 0 if error - */ -int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize) -{ - LZ4_streamDecode_t_internal* lz4sd = &LZ4_streamDecode->internal_donotuse; - lz4sd->prefixSize = (size_t)dictSize; - if (dictSize) { - assert(dictionary != NULL); - lz4sd->prefixEnd = (const BYTE*) dictionary + dictSize; - } else { - lz4sd->prefixEnd = (const BYTE*) dictionary; - } - lz4sd->externalDict = NULL; - lz4sd->extDictSize = 0; - return 1; -} - -/*! LZ4_decoderRingBufferSize() : - * when setting a ring buffer for streaming decompression (optional scenario), - * provides the minimum size of this ring buffer - * to be compatible with any source respecting maxBlockSize condition. - * Note : in a ring buffer scenario, - * blocks are presumed decompressed next to each other. - * When not enough space remains for next block (remainingSize < maxBlockSize), - * decoding resumes from beginning of ring buffer. - * @return : minimum ring buffer size, - * or 0 if there is an error (invalid maxBlockSize). - */ -int LZ4_decoderRingBufferSize(int maxBlockSize) -{ - if (maxBlockSize < 0) return 0; - if (maxBlockSize > LZ4_MAX_INPUT_SIZE) return 0; - if (maxBlockSize < 16) maxBlockSize = 16; - return LZ4_DECODER_RING_BUFFER_SIZE(maxBlockSize); -} - -/* -*_continue() : - These decoding functions allow decompression of multiple blocks in "streaming" mode. - Previously decoded blocks must still be available at the memory position where they were decoded. - If it's not possible, save the relevant part of decoded data into a safe buffer, - and indicate where it stands using LZ4_setStreamDecode() -*/ -LZ4_FORCE_O2 -int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxOutputSize) -{ - LZ4_streamDecode_t_internal* lz4sd = &LZ4_streamDecode->internal_donotuse; - int result; - - if (lz4sd->prefixSize == 0) { - /* The first call, no dictionary yet. */ - assert(lz4sd->extDictSize == 0); - result = LZ4_decompress_safe(source, dest, compressedSize, maxOutputSize); - if (result <= 0) return result; - lz4sd->prefixSize = (size_t)result; - lz4sd->prefixEnd = (BYTE*)dest + result; - } else if (lz4sd->prefixEnd == (BYTE*)dest) { - /* They're rolling the current segment. */ - if (lz4sd->prefixSize >= 64 KB - 1) - result = LZ4_decompress_safe_withPrefix64k(source, dest, compressedSize, maxOutputSize); - else if (lz4sd->extDictSize == 0) - result = LZ4_decompress_safe_withSmallPrefix(source, dest, compressedSize, maxOutputSize, - lz4sd->prefixSize); - else - result = LZ4_decompress_safe_doubleDict(source, dest, compressedSize, maxOutputSize, - lz4sd->prefixSize, lz4sd->externalDict, lz4sd->extDictSize); - if (result <= 0) return result; - lz4sd->prefixSize += (size_t)result; - lz4sd->prefixEnd += result; - } else { - /* The buffer wraps around, or they're switching to another buffer. */ - lz4sd->extDictSize = lz4sd->prefixSize; - lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize; - result = LZ4_decompress_safe_forceExtDict(source, dest, compressedSize, maxOutputSize, - lz4sd->externalDict, lz4sd->extDictSize); - if (result <= 0) return result; - lz4sd->prefixSize = (size_t)result; - lz4sd->prefixEnd = (BYTE*)dest + result; - } - - return result; -} - -LZ4_FORCE_O2 int -LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, - const char* source, char* dest, int originalSize) -{ - LZ4_streamDecode_t_internal* const lz4sd = - (assert(LZ4_streamDecode!=NULL), &LZ4_streamDecode->internal_donotuse); - int result; - - DEBUGLOG(5, "LZ4_decompress_fast_continue (toDecodeSize=%i)", originalSize); - assert(originalSize >= 0); - - if (lz4sd->prefixSize == 0) { - DEBUGLOG(5, "first invocation : no prefix nor extDict"); - assert(lz4sd->extDictSize == 0); - result = LZ4_decompress_fast(source, dest, originalSize); - if (result <= 0) return result; - lz4sd->prefixSize = (size_t)originalSize; - lz4sd->prefixEnd = (BYTE*)dest + originalSize; - } else if (lz4sd->prefixEnd == (BYTE*)dest) { - DEBUGLOG(5, "continue using existing prefix"); - result = LZ4_decompress_unsafe_generic( - (const BYTE*)source, (BYTE*)dest, originalSize, - lz4sd->prefixSize, - lz4sd->externalDict, lz4sd->extDictSize); - if (result <= 0) return result; - lz4sd->prefixSize += (size_t)originalSize; - lz4sd->prefixEnd += originalSize; - } else { - DEBUGLOG(5, "prefix becomes extDict"); - lz4sd->extDictSize = lz4sd->prefixSize; - lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize; - result = LZ4_decompress_fast_extDict(source, dest, originalSize, - lz4sd->externalDict, lz4sd->extDictSize); - if (result <= 0) return result; - lz4sd->prefixSize = (size_t)originalSize; - lz4sd->prefixEnd = (BYTE*)dest + originalSize; - } - - return result; -} - - -/* -Advanced decoding functions : -*_usingDict() : - These decoding functions work the same as "_continue" ones, - the dictionary must be explicitly provided within parameters -*/ - -int LZ4_decompress_safe_usingDict(const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize) -{ - if (dictSize==0) - return LZ4_decompress_safe(source, dest, compressedSize, maxOutputSize); - if (dictStart+dictSize == dest) { - if (dictSize >= 64 KB - 1) { - return LZ4_decompress_safe_withPrefix64k(source, dest, compressedSize, maxOutputSize); - } - assert(dictSize >= 0); - return LZ4_decompress_safe_withSmallPrefix(source, dest, compressedSize, maxOutputSize, (size_t)dictSize); - } - assert(dictSize >= 0); - return LZ4_decompress_safe_forceExtDict(source, dest, compressedSize, maxOutputSize, dictStart, (size_t)dictSize); -} - -int LZ4_decompress_safe_partial_usingDict(const char* source, char* dest, int compressedSize, int targetOutputSize, int dstCapacity, const char* dictStart, int dictSize) -{ - if (dictSize==0) - return LZ4_decompress_safe_partial(source, dest, compressedSize, targetOutputSize, dstCapacity); - if (dictStart+dictSize == dest) { - if (dictSize >= 64 KB - 1) { - return LZ4_decompress_safe_partial_withPrefix64k(source, dest, compressedSize, targetOutputSize, dstCapacity); - } - assert(dictSize >= 0); - return LZ4_decompress_safe_partial_withSmallPrefix(source, dest, compressedSize, targetOutputSize, dstCapacity, (size_t)dictSize); - } - assert(dictSize >= 0); - return LZ4_decompress_safe_partial_forceExtDict(source, dest, compressedSize, targetOutputSize, dstCapacity, dictStart, (size_t)dictSize); -} - -int LZ4_decompress_fast_usingDict(const char* source, char* dest, int originalSize, const char* dictStart, int dictSize) -{ - if (dictSize==0 || dictStart+dictSize == dest) - return LZ4_decompress_unsafe_generic( - (const BYTE*)source, (BYTE*)dest, originalSize, - (size_t)dictSize, NULL, 0); - assert(dictSize >= 0); - return LZ4_decompress_fast_extDict(source, dest, originalSize, dictStart, (size_t)dictSize); -} - - -/*=************************************************* -* Obsolete Functions -***************************************************/ -/* obsolete compression functions */ -int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) -{ - return LZ4_compress_default(source, dest, inputSize, maxOutputSize); -} -int LZ4_compress(const char* src, char* dest, int srcSize) -{ - return LZ4_compress_default(src, dest, srcSize, LZ4_compressBound(srcSize)); -} -int LZ4_compress_limitedOutput_withState (void* state, const char* src, char* dst, int srcSize, int dstSize) -{ - return LZ4_compress_fast_extState(state, src, dst, srcSize, dstSize, 1); -} -int LZ4_compress_withState (void* state, const char* src, char* dst, int srcSize) -{ - return LZ4_compress_fast_extState(state, src, dst, srcSize, LZ4_compressBound(srcSize), 1); -} -int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_stream, const char* src, char* dst, int srcSize, int dstCapacity) -{ - return LZ4_compress_fast_continue(LZ4_stream, src, dst, srcSize, dstCapacity, 1); -} -int LZ4_compress_continue (LZ4_stream_t* LZ4_stream, const char* source, char* dest, int inputSize) -{ - return LZ4_compress_fast_continue(LZ4_stream, source, dest, inputSize, LZ4_compressBound(inputSize), 1); -} - -/* -These decompression functions are deprecated and should no longer be used. -They are only provided here for compatibility with older user programs. -- LZ4_uncompress is totally equivalent to LZ4_decompress_fast -- LZ4_uncompress_unknownOutputSize is totally equivalent to LZ4_decompress_safe -*/ -int LZ4_uncompress (const char* source, char* dest, int outputSize) -{ - return LZ4_decompress_fast(source, dest, outputSize); -} -int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize) -{ - return LZ4_decompress_safe(source, dest, isize, maxOutputSize); -} - -/* Obsolete Streaming functions */ - -int LZ4_sizeofStreamState(void) { return sizeof(LZ4_stream_t); } - -int LZ4_resetStreamState(void* state, char* inputBuffer) -{ - (void)inputBuffer; - LZ4_resetStream((LZ4_stream_t*)state); - return 0; -} - -#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) -void* LZ4_create (char* inputBuffer) -{ - (void)inputBuffer; - return LZ4_createStream(); -} -#endif - -char* LZ4_slideInputBuffer (void* state) -{ - /* avoid const char * -> char * conversion warning */ - return (char *)(uptrval)((LZ4_stream_t*)state)->internal_donotuse.dictionary; -} - -#endif /* LZ4_COMMONDEFS_ONLY */ diff --git a/contrib/lz4/lz4.h b/contrib/lz4/lz4.h deleted file mode 100644 index cdcaa94..0000000 --- a/contrib/lz4/lz4.h +++ /dev/null @@ -1,862 +0,0 @@ -/* - * LZ4 - Fast LZ compression algorithm - * Header File - * Copyright (C) 2011-2020, Yann Collet. - - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following disclaimer - in the documentation and/or other materials provided with the - distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - You can contact the author at : - - LZ4 homepage : http://www.lz4.org - - LZ4 source repository : https://github.com/lz4/lz4 -*/ -#if defined (__cplusplus) -extern "C" { -#endif - -#ifndef LZ4_H_2983827168210 -#define LZ4_H_2983827168210 - -/* --- Dependency --- */ -#include /* size_t */ - - -/** - Introduction - - LZ4 is lossless compression algorithm, providing compression speed >500 MB/s per core, - scalable with multi-cores CPU. It features an extremely fast decoder, with speed in - multiple GB/s per core, typically reaching RAM speed limits on multi-core systems. - - The LZ4 compression library provides in-memory compression and decompression functions. - It gives full buffer control to user. - Compression can be done in: - - a single step (described as Simple Functions) - - a single step, reusing a context (described in Advanced Functions) - - unbounded multiple steps (described as Streaming compression) - - lz4.h generates and decodes LZ4-compressed blocks (doc/lz4_Block_format.md). - Decompressing such a compressed block requires additional metadata. - Exact metadata depends on exact decompression function. - For the typical case of LZ4_decompress_safe(), - metadata includes block's compressed size, and maximum bound of decompressed size. - Each application is free to encode and pass such metadata in whichever way it wants. - - lz4.h only handle blocks, it can not generate Frames. - - Blocks are different from Frames (doc/lz4_Frame_format.md). - Frames bundle both blocks and metadata in a specified manner. - Embedding metadata is required for compressed data to be self-contained and portable. - Frame format is delivered through a companion API, declared in lz4frame.h. - The `lz4` CLI can only manage frames. -*/ - -/*^*************************************************************** -* Export parameters -*****************************************************************/ -/* -* LZ4_DLL_EXPORT : -* Enable exporting of functions when building a Windows DLL -* LZ4LIB_VISIBILITY : -* Control library symbols visibility. -*/ -#ifndef LZ4LIB_VISIBILITY -# if defined(__GNUC__) && (__GNUC__ >= 4) -# define LZ4LIB_VISIBILITY __attribute__ ((visibility ("default"))) -# else -# define LZ4LIB_VISIBILITY -# endif -#endif -#if defined(LZ4_DLL_EXPORT) && (LZ4_DLL_EXPORT==1) -# define LZ4LIB_API __declspec(dllexport) LZ4LIB_VISIBILITY -#elif defined(LZ4_DLL_IMPORT) && (LZ4_DLL_IMPORT==1) -# define LZ4LIB_API __declspec(dllimport) LZ4LIB_VISIBILITY /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/ -#else -# define LZ4LIB_API LZ4LIB_VISIBILITY -#endif - -/*! LZ4_FREESTANDING : - * When this macro is set to 1, it enables "freestanding mode" that is - * suitable for typical freestanding environment which doesn't support - * standard C library. - * - * - LZ4_FREESTANDING is a compile-time switch. - * - It requires the following macros to be defined: - * LZ4_memcpy, LZ4_memmove, LZ4_memset. - * - It only enables LZ4/HC functions which don't use heap. - * All LZ4F_* functions are not supported. - * - See tests/freestanding.c to check its basic setup. - */ -#if defined(LZ4_FREESTANDING) && (LZ4_FREESTANDING == 1) -# define LZ4_HEAPMODE 0 -# define LZ4HC_HEAPMODE 0 -# define LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION 1 -# if !defined(LZ4_memcpy) -# error "LZ4_FREESTANDING requires macro 'LZ4_memcpy'." -# endif -# if !defined(LZ4_memset) -# error "LZ4_FREESTANDING requires macro 'LZ4_memset'." -# endif -# if !defined(LZ4_memmove) -# error "LZ4_FREESTANDING requires macro 'LZ4_memmove'." -# endif -#elif ! defined(LZ4_FREESTANDING) -# define LZ4_FREESTANDING 0 -#endif - - -/*------ Version ------*/ -#define LZ4_VERSION_MAJOR 1 /* for breaking interface changes */ -#define LZ4_VERSION_MINOR 9 /* for new (non-breaking) interface capabilities */ -#define LZ4_VERSION_RELEASE 5 /* for tweaks, bug-fixes, or development */ - -#define LZ4_VERSION_NUMBER (LZ4_VERSION_MAJOR *100*100 + LZ4_VERSION_MINOR *100 + LZ4_VERSION_RELEASE) - -#define LZ4_LIB_VERSION LZ4_VERSION_MAJOR.LZ4_VERSION_MINOR.LZ4_VERSION_RELEASE -#define LZ4_QUOTE(str) #str -#define LZ4_EXPAND_AND_QUOTE(str) LZ4_QUOTE(str) -#define LZ4_VERSION_STRING LZ4_EXPAND_AND_QUOTE(LZ4_LIB_VERSION) /* requires v1.7.3+ */ - -LZ4LIB_API int LZ4_versionNumber (void); /**< library version number; useful to check dll version; requires v1.3.0+ */ -LZ4LIB_API const char* LZ4_versionString (void); /**< library version string; useful to check dll version; requires v1.7.5+ */ - - -/*-************************************ -* Tuning parameter -**************************************/ -#define LZ4_MEMORY_USAGE_MIN 10 -#define LZ4_MEMORY_USAGE_DEFAULT 14 -#define LZ4_MEMORY_USAGE_MAX 20 - -/*! - * LZ4_MEMORY_USAGE : - * Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; ) - * Increasing memory usage improves compression ratio, at the cost of speed. - * Reduced memory usage may improve speed at the cost of ratio, thanks to better cache locality. - * Default value is 14, for 16KB, which nicely fits into Intel x86 L1 cache - */ -#ifndef LZ4_MEMORY_USAGE -# define LZ4_MEMORY_USAGE LZ4_MEMORY_USAGE_DEFAULT -#endif - -#if (LZ4_MEMORY_USAGE < LZ4_MEMORY_USAGE_MIN) -# error "LZ4_MEMORY_USAGE is too small !" -#endif - -#if (LZ4_MEMORY_USAGE > LZ4_MEMORY_USAGE_MAX) -# error "LZ4_MEMORY_USAGE is too large !" -#endif - -/*-************************************ -* Simple Functions -**************************************/ -/*! LZ4_compress_default() : - * Compresses 'srcSize' bytes from buffer 'src' - * into already allocated 'dst' buffer of size 'dstCapacity'. - * Compression is guaranteed to succeed if 'dstCapacity' >= LZ4_compressBound(srcSize). - * It also runs faster, so it's a recommended setting. - * If the function cannot compress 'src' into a more limited 'dst' budget, - * compression stops *immediately*, and the function result is zero. - * In which case, 'dst' content is undefined (invalid). - * srcSize : max supported value is LZ4_MAX_INPUT_SIZE. - * dstCapacity : size of buffer 'dst' (which must be already allocated) - * @return : the number of bytes written into buffer 'dst' (necessarily <= dstCapacity) - * or 0 if compression fails - * Note : This function is protected against buffer overflow scenarios (never writes outside 'dst' buffer, nor read outside 'source' buffer). - */ -LZ4LIB_API int LZ4_compress_default(const char* src, char* dst, int srcSize, int dstCapacity); - -/*! LZ4_decompress_safe() : - * @compressedSize : is the exact complete size of the compressed block. - * @dstCapacity : is the size of destination buffer (which must be already allocated), - * is an upper bound of decompressed size. - * @return : the number of bytes decompressed into destination buffer (necessarily <= dstCapacity) - * If destination buffer is not large enough, decoding will stop and output an error code (negative value). - * If the source stream is detected malformed, the function will stop decoding and return a negative result. - * Note 1 : This function is protected against malicious data packets : - * it will never writes outside 'dst' buffer, nor read outside 'source' buffer, - * even if the compressed block is maliciously modified to order the decoder to do these actions. - * In such case, the decoder stops immediately, and considers the compressed block malformed. - * Note 2 : compressedSize and dstCapacity must be provided to the function, the compressed block does not contain them. - * The implementation is free to send / store / derive this information in whichever way is most beneficial. - * If there is a need for a different format which bundles together both compressed data and its metadata, consider looking at lz4frame.h instead. - */ -LZ4LIB_API int LZ4_decompress_safe (const char* src, char* dst, int compressedSize, int dstCapacity); - - -/*-************************************ -* Advanced Functions -**************************************/ -#define LZ4_MAX_INPUT_SIZE 0x7E000000 /* 2 113 929 216 bytes */ -#define LZ4_COMPRESSBOUND(isize) ((unsigned)(isize) > (unsigned)LZ4_MAX_INPUT_SIZE ? 0 : (isize) + ((isize)/255) + 16) - -/*! LZ4_compressBound() : - Provides the maximum size that LZ4 compression may output in a "worst case" scenario (input data not compressible) - This function is primarily useful for memory allocation purposes (destination buffer size). - Macro LZ4_COMPRESSBOUND() is also provided for compilation-time evaluation (stack memory allocation for example). - Note that LZ4_compress_default() compresses faster when dstCapacity is >= LZ4_compressBound(srcSize) - inputSize : max supported value is LZ4_MAX_INPUT_SIZE - return : maximum output size in a "worst case" scenario - or 0, if input size is incorrect (too large or negative) -*/ -LZ4LIB_API int LZ4_compressBound(int inputSize); - -/*! LZ4_compress_fast() : - Same as LZ4_compress_default(), but allows selection of "acceleration" factor. - The larger the acceleration value, the faster the algorithm, but also the lesser the compression. - It's a trade-off. It can be fine tuned, with each successive value providing roughly +~3% to speed. - An acceleration value of "1" is the same as regular LZ4_compress_default() - Values <= 0 will be replaced by LZ4_ACCELERATION_DEFAULT (currently == 1, see lz4.c). - Values > LZ4_ACCELERATION_MAX will be replaced by LZ4_ACCELERATION_MAX (currently == 65537, see lz4.c). -*/ -LZ4LIB_API int LZ4_compress_fast (const char* src, char* dst, int srcSize, int dstCapacity, int acceleration); - - -/*! LZ4_compress_fast_extState() : - * Same as LZ4_compress_fast(), using an externally allocated memory space for its state. - * Use LZ4_sizeofState() to know how much memory must be allocated, - * and allocate it on 8-bytes boundaries (using `malloc()` typically). - * Then, provide this buffer as `void* state` to compression function. - */ -LZ4LIB_API int LZ4_sizeofState(void); -LZ4LIB_API int LZ4_compress_fast_extState (void* state, const char* src, char* dst, int srcSize, int dstCapacity, int acceleration); - - -/*! LZ4_compress_destSize() : - * Reverse the logic : compresses as much data as possible from 'src' buffer - * into already allocated buffer 'dst', of size >= 'targetDestSize'. - * This function either compresses the entire 'src' content into 'dst' if it's large enough, - * or fill 'dst' buffer completely with as much data as possible from 'src'. - * note: acceleration parameter is fixed to "default". - * - * *srcSizePtr : will be modified to indicate how many bytes where read from 'src' to fill 'dst'. - * New value is necessarily <= input value. - * @return : Nb bytes written into 'dst' (necessarily <= targetDestSize) - * or 0 if compression fails. - * - * Note : from v1.8.2 to v1.9.1, this function had a bug (fixed in v1.9.2+): - * the produced compressed content could, in specific circumstances, - * require to be decompressed into a destination buffer larger - * by at least 1 byte than the content to decompress. - * If an application uses `LZ4_compress_destSize()`, - * it's highly recommended to update liblz4 to v1.9.2 or better. - * If this can't be done or ensured, - * the receiving decompression function should provide - * a dstCapacity which is > decompressedSize, by at least 1 byte. - * See https://github.com/lz4/lz4/issues/859 for details - */ -LZ4LIB_API int LZ4_compress_destSize (const char* src, char* dst, int* srcSizePtr, int targetDstSize); - - -/*! LZ4_decompress_safe_partial() : - * Decompress an LZ4 compressed block, of size 'srcSize' at position 'src', - * into destination buffer 'dst' of size 'dstCapacity'. - * Up to 'targetOutputSize' bytes will be decoded. - * The function stops decoding on reaching this objective. - * This can be useful to boost performance - * whenever only the beginning of a block is required. - * - * @return : the number of bytes decoded in `dst` (necessarily <= targetOutputSize) - * If source stream is detected malformed, function returns a negative result. - * - * Note 1 : @return can be < targetOutputSize, if compressed block contains less data. - * - * Note 2 : targetOutputSize must be <= dstCapacity - * - * Note 3 : this function effectively stops decoding on reaching targetOutputSize, - * so dstCapacity is kind of redundant. - * This is because in older versions of this function, - * decoding operation would still write complete sequences. - * Therefore, there was no guarantee that it would stop writing at exactly targetOutputSize, - * it could write more bytes, though only up to dstCapacity. - * Some "margin" used to be required for this operation to work properly. - * Thankfully, this is no longer necessary. - * The function nonetheless keeps the same signature, in an effort to preserve API compatibility. - * - * Note 4 : If srcSize is the exact size of the block, - * then targetOutputSize can be any value, - * including larger than the block's decompressed size. - * The function will, at most, generate block's decompressed size. - * - * Note 5 : If srcSize is _larger_ than block's compressed size, - * then targetOutputSize **MUST** be <= block's decompressed size. - * Otherwise, *silent corruption will occur*. - */ -LZ4LIB_API int LZ4_decompress_safe_partial (const char* src, char* dst, int srcSize, int targetOutputSize, int dstCapacity); - - -/*-********************************************* -* Streaming Compression Functions -***********************************************/ -typedef union LZ4_stream_u LZ4_stream_t; /* incomplete type (defined later) */ - -/** - Note about RC_INVOKED - - - RC_INVOKED is predefined symbol of rc.exe (the resource compiler which is part of MSVC/Visual Studio). - https://docs.microsoft.com/en-us/windows/win32/menurc/predefined-macros - - - Since rc.exe is a legacy compiler, it truncates long symbol (> 30 chars) - and reports warning "RC4011: identifier truncated". - - - To eliminate the warning, we surround long preprocessor symbol with - "#if !defined(RC_INVOKED) ... #endif" block that means - "skip this block when rc.exe is trying to read it". -*/ -#if !defined(RC_INVOKED) /* https://docs.microsoft.com/en-us/windows/win32/menurc/predefined-macros */ -#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) -LZ4LIB_API LZ4_stream_t* LZ4_createStream(void); -LZ4LIB_API int LZ4_freeStream (LZ4_stream_t* streamPtr); -#endif /* !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) */ -#endif - -/*! LZ4_resetStream_fast() : v1.9.0+ - * Use this to prepare an LZ4_stream_t for a new chain of dependent blocks - * (e.g., LZ4_compress_fast_continue()). - * - * An LZ4_stream_t must be initialized once before usage. - * This is automatically done when created by LZ4_createStream(). - * However, should the LZ4_stream_t be simply declared on stack (for example), - * it's necessary to initialize it first, using LZ4_initStream(). - * - * After init, start any new stream with LZ4_resetStream_fast(). - * A same LZ4_stream_t can be re-used multiple times consecutively - * and compress multiple streams, - * provided that it starts each new stream with LZ4_resetStream_fast(). - * - * LZ4_resetStream_fast() is much faster than LZ4_initStream(), - * but is not compatible with memory regions containing garbage data. - * - * Note: it's only useful to call LZ4_resetStream_fast() - * in the context of streaming compression. - * The *extState* functions perform their own resets. - * Invoking LZ4_resetStream_fast() before is redundant, and even counterproductive. - */ -LZ4LIB_API void LZ4_resetStream_fast (LZ4_stream_t* streamPtr); - -/*! LZ4_loadDict() : - * Use this function to reference a static dictionary into LZ4_stream_t. - * The dictionary must remain available during compression. - * LZ4_loadDict() triggers a reset, so any previous data will be forgotten. - * The same dictionary will have to be loaded on decompression side for successful decoding. - * Dictionary are useful for better compression of small data (KB range). - * While LZ4 accept any input as dictionary, - * results are generally better when using Zstandard's Dictionary Builder. - * Loading a size of 0 is allowed, and is the same as reset. - * @return : loaded dictionary size, in bytes (necessarily <= 64 KB) - */ -LZ4LIB_API int LZ4_loadDict (LZ4_stream_t* streamPtr, const char* dictionary, int dictSize); - -/*! LZ4_compress_fast_continue() : - * Compress 'src' content using data from previously compressed blocks, for better compression ratio. - * 'dst' buffer must be already allocated. - * If dstCapacity >= LZ4_compressBound(srcSize), compression is guaranteed to succeed, and runs faster. - * - * @return : size of compressed block - * or 0 if there is an error (typically, cannot fit into 'dst'). - * - * Note 1 : Each invocation to LZ4_compress_fast_continue() generates a new block. - * Each block has precise boundaries. - * Each block must be decompressed separately, calling LZ4_decompress_*() with relevant metadata. - * It's not possible to append blocks together and expect a single invocation of LZ4_decompress_*() to decompress them together. - * - * Note 2 : The previous 64KB of source data is __assumed__ to remain present, unmodified, at same address in memory ! - * - * Note 3 : When input is structured as a double-buffer, each buffer can have any size, including < 64 KB. - * Make sure that buffers are separated, by at least one byte. - * This construction ensures that each block only depends on previous block. - * - * Note 4 : If input buffer is a ring-buffer, it can have any size, including < 64 KB. - * - * Note 5 : After an error, the stream status is undefined (invalid), it can only be reset or freed. - */ -LZ4LIB_API int LZ4_compress_fast_continue (LZ4_stream_t* streamPtr, const char* src, char* dst, int srcSize, int dstCapacity, int acceleration); - -/*! LZ4_saveDict() : - * If last 64KB data cannot be guaranteed to remain available at its current memory location, - * save it into a safer place (char* safeBuffer). - * This is schematically equivalent to a memcpy() followed by LZ4_loadDict(), - * but is much faster, because LZ4_saveDict() doesn't need to rebuild tables. - * @return : saved dictionary size in bytes (necessarily <= maxDictSize), or 0 if error. - */ -LZ4LIB_API int LZ4_saveDict (LZ4_stream_t* streamPtr, char* safeBuffer, int maxDictSize); - - -/*-********************************************** -* Streaming Decompression Functions -* Bufferless synchronous API -************************************************/ -typedef union LZ4_streamDecode_u LZ4_streamDecode_t; /* tracking context */ - -/*! LZ4_createStreamDecode() and LZ4_freeStreamDecode() : - * creation / destruction of streaming decompression tracking context. - * A tracking context can be re-used multiple times. - */ -#if !defined(RC_INVOKED) /* https://docs.microsoft.com/en-us/windows/win32/menurc/predefined-macros */ -#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) -LZ4LIB_API LZ4_streamDecode_t* LZ4_createStreamDecode(void); -LZ4LIB_API int LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream); -#endif /* !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) */ -#endif - -/*! LZ4_setStreamDecode() : - * An LZ4_streamDecode_t context can be allocated once and re-used multiple times. - * Use this function to start decompression of a new stream of blocks. - * A dictionary can optionally be set. Use NULL or size 0 for a reset order. - * Dictionary is presumed stable : it must remain accessible and unmodified during next decompression. - * @return : 1 if OK, 0 if error - */ -LZ4LIB_API int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize); - -/*! LZ4_decoderRingBufferSize() : v1.8.2+ - * Note : in a ring buffer scenario (optional), - * blocks are presumed decompressed next to each other - * up to the moment there is not enough remaining space for next block (remainingSize < maxBlockSize), - * at which stage it resumes from beginning of ring buffer. - * When setting such a ring buffer for streaming decompression, - * provides the minimum size of this ring buffer - * to be compatible with any source respecting maxBlockSize condition. - * @return : minimum ring buffer size, - * or 0 if there is an error (invalid maxBlockSize). - */ -LZ4LIB_API int LZ4_decoderRingBufferSize(int maxBlockSize); -#define LZ4_DECODER_RING_BUFFER_SIZE(maxBlockSize) (65536 + 14 + (maxBlockSize)) /* for static allocation; maxBlockSize presumed valid */ - -/*! LZ4_decompress_safe_continue() : - * This decoding function allows decompression of consecutive blocks in "streaming" mode. - * The difference with the usual independent blocks is that - * new blocks are allowed to find references into former blocks. - * A block is an unsplittable entity, and must be presented entirely to the decompression function. - * LZ4_decompress_safe_continue() only accepts one block at a time. - * It's modeled after `LZ4_decompress_safe()` and behaves similarly. - * - * @LZ4_streamDecode : decompression state, tracking the position in memory of past data - * @compressedSize : exact complete size of one compressed block. - * @dstCapacity : size of destination buffer (which must be already allocated), - * must be an upper bound of decompressed size. - * @return : number of bytes decompressed into destination buffer (necessarily <= dstCapacity) - * If destination buffer is not large enough, decoding will stop and output an error code (negative value). - * If the source stream is detected malformed, the function will stop decoding and return a negative result. - * - * The last 64KB of previously decoded data *must* remain available and unmodified - * at the memory position where they were previously decoded. - * If less than 64KB of data has been decoded, all the data must be present. - * - * Special : if decompression side sets a ring buffer, it must respect one of the following conditions : - * - Decompression buffer size is _at least_ LZ4_decoderRingBufferSize(maxBlockSize). - * maxBlockSize is the maximum size of any single block. It can have any value > 16 bytes. - * In which case, encoding and decoding buffers do not need to be synchronized. - * Actually, data can be produced by any source compliant with LZ4 format specification, and respecting maxBlockSize. - * - Synchronized mode : - * Decompression buffer size is _exactly_ the same as compression buffer size, - * and follows exactly same update rule (block boundaries at same positions), - * and decoding function is provided with exact decompressed size of each block (exception for last block of the stream), - * _then_ decoding & encoding ring buffer can have any size, including small ones ( < 64 KB). - * - Decompression buffer is larger than encoding buffer, by a minimum of maxBlockSize more bytes. - * In which case, encoding and decoding buffers do not need to be synchronized, - * and encoding ring buffer can have any size, including small ones ( < 64 KB). - * - * Whenever these conditions are not possible, - * save the last 64KB of decoded data into a safe buffer where it can't be modified during decompression, - * then indicate where this data is saved using LZ4_setStreamDecode(), before decompressing next block. -*/ -LZ4LIB_API int -LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, - const char* src, char* dst, - int srcSize, int dstCapacity); - - -/*! LZ4_decompress_safe_usingDict() : - * Works the same as - * a combination of LZ4_setStreamDecode() followed by LZ4_decompress_safe_continue() - * However, it's stateless: it doesn't need any LZ4_streamDecode_t state. - * Dictionary is presumed stable : it must remain accessible and unmodified during decompression. - * Performance tip : Decompression speed can be substantially increased - * when dst == dictStart + dictSize. - */ -LZ4LIB_API int -LZ4_decompress_safe_usingDict(const char* src, char* dst, - int srcSize, int dstCapacity, - const char* dictStart, int dictSize); - -/*! LZ4_decompress_safe_partial_usingDict() : - * Behaves the same as LZ4_decompress_safe_partial() - * with the added ability to specify a memory segment for past data. - * Performance tip : Decompression speed can be substantially increased - * when dst == dictStart + dictSize. - */ -LZ4LIB_API int -LZ4_decompress_safe_partial_usingDict(const char* src, char* dst, - int compressedSize, - int targetOutputSize, int maxOutputSize, - const char* dictStart, int dictSize); - -#endif /* LZ4_H_2983827168210 */ - - -/*^************************************* - * !!!!!! STATIC LINKING ONLY !!!!!! - ***************************************/ - -/*-**************************************************************************** - * Experimental section - * - * Symbols declared in this section must be considered unstable. Their - * signatures or semantics may change, or they may be removed altogether in the - * future. They are therefore only safe to depend on when the caller is - * statically linked against the library. - * - * To protect against unsafe usage, not only are the declarations guarded, - * the definitions are hidden by default - * when building LZ4 as a shared/dynamic library. - * - * In order to access these declarations, - * define LZ4_STATIC_LINKING_ONLY in your application - * before including LZ4's headers. - * - * In order to make their implementations accessible dynamically, you must - * define LZ4_PUBLISH_STATIC_FUNCTIONS when building the LZ4 library. - ******************************************************************************/ - -#ifdef LZ4_STATIC_LINKING_ONLY - -#ifndef LZ4_STATIC_3504398509 -#define LZ4_STATIC_3504398509 - -#ifdef LZ4_PUBLISH_STATIC_FUNCTIONS -#define LZ4LIB_STATIC_API LZ4LIB_API -#else -#define LZ4LIB_STATIC_API -#endif - - -/*! LZ4_compress_fast_extState_fastReset() : - * A variant of LZ4_compress_fast_extState(). - * - * Using this variant avoids an expensive initialization step. - * It is only safe to call if the state buffer is known to be correctly initialized already - * (see above comment on LZ4_resetStream_fast() for a definition of "correctly initialized"). - * From a high level, the difference is that - * this function initializes the provided state with a call to something like LZ4_resetStream_fast() - * while LZ4_compress_fast_extState() starts with a call to LZ4_resetStream(). - */ -LZ4LIB_STATIC_API int LZ4_compress_fast_extState_fastReset (void* state, const char* src, char* dst, int srcSize, int dstCapacity, int acceleration); - -/*! LZ4_attach_dictionary() : - * This is an experimental API that allows - * efficient use of a static dictionary many times. - * - * Rather than re-loading the dictionary buffer into a working context before - * each compression, or copying a pre-loaded dictionary's LZ4_stream_t into a - * working LZ4_stream_t, this function introduces a no-copy setup mechanism, - * in which the working stream references the dictionary stream in-place. - * - * Several assumptions are made about the state of the dictionary stream. - * Currently, only streams which have been prepared by LZ4_loadDict() should - * be expected to work. - * - * Alternatively, the provided dictionaryStream may be NULL, - * in which case any existing dictionary stream is unset. - * - * If a dictionary is provided, it replaces any pre-existing stream history. - * The dictionary contents are the only history that can be referenced and - * logically immediately precede the data compressed in the first subsequent - * compression call. - * - * The dictionary will only remain attached to the working stream through the - * first compression call, at the end of which it is cleared. The dictionary - * stream (and source buffer) must remain in-place / accessible / unchanged - * through the completion of the first compression call on the stream. - */ -LZ4LIB_STATIC_API void -LZ4_attach_dictionary(LZ4_stream_t* workingStream, - const LZ4_stream_t* dictionaryStream); - - -/*! In-place compression and decompression - * - * It's possible to have input and output sharing the same buffer, - * for highly constrained memory environments. - * In both cases, it requires input to lay at the end of the buffer, - * and decompression to start at beginning of the buffer. - * Buffer size must feature some margin, hence be larger than final size. - * - * |<------------------------buffer--------------------------------->| - * |<-----------compressed data--------->| - * |<-----------decompressed size------------------>| - * |<----margin---->| - * - * This technique is more useful for decompression, - * since decompressed size is typically larger, - * and margin is short. - * - * In-place decompression will work inside any buffer - * which size is >= LZ4_DECOMPRESS_INPLACE_BUFFER_SIZE(decompressedSize). - * This presumes that decompressedSize > compressedSize. - * Otherwise, it means compression actually expanded data, - * and it would be more efficient to store such data with a flag indicating it's not compressed. - * This can happen when data is not compressible (already compressed, or encrypted). - * - * For in-place compression, margin is larger, as it must be able to cope with both - * history preservation, requiring input data to remain unmodified up to LZ4_DISTANCE_MAX, - * and data expansion, which can happen when input is not compressible. - * As a consequence, buffer size requirements are much higher, - * and memory savings offered by in-place compression are more limited. - * - * There are ways to limit this cost for compression : - * - Reduce history size, by modifying LZ4_DISTANCE_MAX. - * Note that it is a compile-time constant, so all compressions will apply this limit. - * Lower values will reduce compression ratio, except when input_size < LZ4_DISTANCE_MAX, - * so it's a reasonable trick when inputs are known to be small. - * - Require the compressor to deliver a "maximum compressed size". - * This is the `dstCapacity` parameter in `LZ4_compress*()`. - * When this size is < LZ4_COMPRESSBOUND(inputSize), then compression can fail, - * in which case, the return code will be 0 (zero). - * The caller must be ready for these cases to happen, - * and typically design a backup scheme to send data uncompressed. - * The combination of both techniques can significantly reduce - * the amount of margin required for in-place compression. - * - * In-place compression can work in any buffer - * which size is >= (maxCompressedSize) - * with maxCompressedSize == LZ4_COMPRESSBOUND(srcSize) for guaranteed compression success. - * LZ4_COMPRESS_INPLACE_BUFFER_SIZE() depends on both maxCompressedSize and LZ4_DISTANCE_MAX, - * so it's possible to reduce memory requirements by playing with them. - */ - -#define LZ4_DECOMPRESS_INPLACE_MARGIN(compressedSize) (((compressedSize) >> 8) + 32) -#define LZ4_DECOMPRESS_INPLACE_BUFFER_SIZE(decompressedSize) ((decompressedSize) + LZ4_DECOMPRESS_INPLACE_MARGIN(decompressedSize)) /**< note: presumes that compressedSize < decompressedSize. note2: margin is overestimated a bit, since it could use compressedSize instead */ - -#ifndef LZ4_DISTANCE_MAX /* history window size; can be user-defined at compile time */ -# define LZ4_DISTANCE_MAX 65535 /* set to maximum value by default */ -#endif - -#define LZ4_COMPRESS_INPLACE_MARGIN (LZ4_DISTANCE_MAX + 32) /* LZ4_DISTANCE_MAX can be safely replaced by srcSize when it's smaller */ -#define LZ4_COMPRESS_INPLACE_BUFFER_SIZE(maxCompressedSize) ((maxCompressedSize) + LZ4_COMPRESS_INPLACE_MARGIN) /**< maxCompressedSize is generally LZ4_COMPRESSBOUND(inputSize), but can be set to any lower value, with the risk that compression can fail (return code 0(zero)) */ - -#endif /* LZ4_STATIC_3504398509 */ -#endif /* LZ4_STATIC_LINKING_ONLY */ - - - -#ifndef LZ4_H_98237428734687 -#define LZ4_H_98237428734687 - -/*-************************************************************ - * Private Definitions - ************************************************************** - * Do not use these definitions directly. - * They are only exposed to allow static allocation of `LZ4_stream_t` and `LZ4_streamDecode_t`. - * Accessing members will expose user code to API and/or ABI break in future versions of the library. - **************************************************************/ -#define LZ4_HASHLOG (LZ4_MEMORY_USAGE-2) -#define LZ4_HASHTABLESIZE (1 << LZ4_MEMORY_USAGE) -#define LZ4_HASH_SIZE_U32 (1 << LZ4_HASHLOG) /* required as macro for static allocation */ - -#if defined(__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) -# include - typedef int8_t LZ4_i8; - typedef uint8_t LZ4_byte; - typedef uint16_t LZ4_u16; - typedef uint32_t LZ4_u32; -#else - typedef signed char LZ4_i8; - typedef unsigned char LZ4_byte; - typedef unsigned short LZ4_u16; - typedef unsigned int LZ4_u32; -#endif - -/*! LZ4_stream_t : - * Never ever use below internal definitions directly ! - * These definitions are not API/ABI safe, and may change in future versions. - * If you need static allocation, declare or allocate an LZ4_stream_t object. -**/ - -typedef struct LZ4_stream_t_internal LZ4_stream_t_internal; -struct LZ4_stream_t_internal { - LZ4_u32 hashTable[LZ4_HASH_SIZE_U32]; - const LZ4_byte* dictionary; - const LZ4_stream_t_internal* dictCtx; - LZ4_u32 currentOffset; - LZ4_u32 tableType; - LZ4_u32 dictSize; - /* Implicit padding to ensure structure is aligned */ -}; - -#define LZ4_STREAM_MINSIZE ((1UL << LZ4_MEMORY_USAGE) + 32) /* static size, for inter-version compatibility */ -union LZ4_stream_u { - char minStateSize[LZ4_STREAM_MINSIZE]; - LZ4_stream_t_internal internal_donotuse; -}; /* previously typedef'd to LZ4_stream_t */ - - -/*! LZ4_initStream() : v1.9.0+ - * An LZ4_stream_t structure must be initialized at least once. - * This is automatically done when invoking LZ4_createStream(), - * but it's not when the structure is simply declared on stack (for example). - * - * Use LZ4_initStream() to properly initialize a newly declared LZ4_stream_t. - * It can also initialize any arbitrary buffer of sufficient size, - * and will @return a pointer of proper type upon initialization. - * - * Note : initialization fails if size and alignment conditions are not respected. - * In which case, the function will @return NULL. - * Note2: An LZ4_stream_t structure guarantees correct alignment and size. - * Note3: Before v1.9.0, use LZ4_resetStream() instead -**/ -LZ4LIB_API LZ4_stream_t* LZ4_initStream (void* buffer, size_t size); - - -/*! LZ4_streamDecode_t : - * Never ever use below internal definitions directly ! - * These definitions are not API/ABI safe, and may change in future versions. - * If you need static allocation, declare or allocate an LZ4_streamDecode_t object. -**/ -typedef struct { - const LZ4_byte* externalDict; - const LZ4_byte* prefixEnd; - size_t extDictSize; - size_t prefixSize; -} LZ4_streamDecode_t_internal; - -#define LZ4_STREAMDECODE_MINSIZE 32 -union LZ4_streamDecode_u { - char minStateSize[LZ4_STREAMDECODE_MINSIZE]; - LZ4_streamDecode_t_internal internal_donotuse; -} ; /* previously typedef'd to LZ4_streamDecode_t */ - - - -/*-************************************ -* Obsolete Functions -**************************************/ - -/*! Deprecation warnings - * - * Deprecated functions make the compiler generate a warning when invoked. - * This is meant to invite users to update their source code. - * Should deprecation warnings be a problem, it is generally possible to disable them, - * typically with -Wno-deprecated-declarations for gcc - * or _CRT_SECURE_NO_WARNINGS in Visual. - * - * Another method is to define LZ4_DISABLE_DEPRECATE_WARNINGS - * before including the header file. - */ -#ifdef LZ4_DISABLE_DEPRECATE_WARNINGS -# define LZ4_DEPRECATED(message) /* disable deprecation warnings */ -#else -# if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */ -# define LZ4_DEPRECATED(message) [[deprecated(message)]] -# elif defined(_MSC_VER) -# define LZ4_DEPRECATED(message) __declspec(deprecated(message)) -# elif defined(__clang__) || (defined(__GNUC__) && (__GNUC__ * 10 + __GNUC_MINOR__ >= 45)) -# define LZ4_DEPRECATED(message) __attribute__((deprecated(message))) -# elif defined(__GNUC__) && (__GNUC__ * 10 + __GNUC_MINOR__ >= 31) -# define LZ4_DEPRECATED(message) __attribute__((deprecated)) -# else -# pragma message("WARNING: LZ4_DEPRECATED needs custom implementation for this compiler") -# define LZ4_DEPRECATED(message) /* disabled */ -# endif -#endif /* LZ4_DISABLE_DEPRECATE_WARNINGS */ - -/*! Obsolete compression functions (since v1.7.3) */ -LZ4_DEPRECATED("use LZ4_compress_default() instead") LZ4LIB_API int LZ4_compress (const char* src, char* dest, int srcSize); -LZ4_DEPRECATED("use LZ4_compress_default() instead") LZ4LIB_API int LZ4_compress_limitedOutput (const char* src, char* dest, int srcSize, int maxOutputSize); -LZ4_DEPRECATED("use LZ4_compress_fast_extState() instead") LZ4LIB_API int LZ4_compress_withState (void* state, const char* source, char* dest, int inputSize); -LZ4_DEPRECATED("use LZ4_compress_fast_extState() instead") LZ4LIB_API int LZ4_compress_limitedOutput_withState (void* state, const char* source, char* dest, int inputSize, int maxOutputSize); -LZ4_DEPRECATED("use LZ4_compress_fast_continue() instead") LZ4LIB_API int LZ4_compress_continue (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize); -LZ4_DEPRECATED("use LZ4_compress_fast_continue() instead") LZ4LIB_API int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize, int maxOutputSize); - -/*! Obsolete decompression functions (since v1.8.0) */ -LZ4_DEPRECATED("use LZ4_decompress_fast() instead") LZ4LIB_API int LZ4_uncompress (const char* source, char* dest, int outputSize); -LZ4_DEPRECATED("use LZ4_decompress_safe() instead") LZ4LIB_API int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize); - -/* Obsolete streaming functions (since v1.7.0) - * degraded functionality; do not use! - * - * In order to perform streaming compression, these functions depended on data - * that is no longer tracked in the state. They have been preserved as well as - * possible: using them will still produce a correct output. However, they don't - * actually retain any history between compression calls. The compression ratio - * achieved will therefore be no better than compressing each chunk - * independently. - */ -LZ4_DEPRECATED("Use LZ4_createStream() instead") LZ4LIB_API void* LZ4_create (char* inputBuffer); -LZ4_DEPRECATED("Use LZ4_createStream() instead") LZ4LIB_API int LZ4_sizeofStreamState(void); -LZ4_DEPRECATED("Use LZ4_resetStream() instead") LZ4LIB_API int LZ4_resetStreamState(void* state, char* inputBuffer); -LZ4_DEPRECATED("Use LZ4_saveDict() instead") LZ4LIB_API char* LZ4_slideInputBuffer (void* state); - -/*! Obsolete streaming decoding functions (since v1.7.0) */ -LZ4_DEPRECATED("use LZ4_decompress_safe_usingDict() instead") LZ4LIB_API int LZ4_decompress_safe_withPrefix64k (const char* src, char* dst, int compressedSize, int maxDstSize); -LZ4_DEPRECATED("use LZ4_decompress_fast_usingDict() instead") LZ4LIB_API int LZ4_decompress_fast_withPrefix64k (const char* src, char* dst, int originalSize); - -/*! Obsolete LZ4_decompress_fast variants (since v1.9.0) : - * These functions used to be faster than LZ4_decompress_safe(), - * but this is no longer the case. They are now slower. - * This is because LZ4_decompress_fast() doesn't know the input size, - * and therefore must progress more cautiously into the input buffer to not read beyond the end of block. - * On top of that `LZ4_decompress_fast()` is not protected vs malformed or malicious inputs, making it a security liability. - * As a consequence, LZ4_decompress_fast() is strongly discouraged, and deprecated. - * - * The last remaining LZ4_decompress_fast() specificity is that - * it can decompress a block without knowing its compressed size. - * Such functionality can be achieved in a more secure manner - * by employing LZ4_decompress_safe_partial(). - * - * Parameters: - * originalSize : is the uncompressed size to regenerate. - * `dst` must be already allocated, its size must be >= 'originalSize' bytes. - * @return : number of bytes read from source buffer (== compressed size). - * The function expects to finish at block's end exactly. - * If the source stream is detected malformed, the function stops decoding and returns a negative result. - * note : LZ4_decompress_fast*() requires originalSize. Thanks to this information, it never writes past the output buffer. - * However, since it doesn't know its 'src' size, it may read an unknown amount of input, past input buffer bounds. - * Also, since match offsets are not validated, match reads from 'src' may underflow too. - * These issues never happen if input (compressed) data is correct. - * But they may happen if input data is invalid (error or intentional tampering). - * As a consequence, use these functions in trusted environments with trusted data **only**. - */ -LZ4_DEPRECATED("This function is deprecated and unsafe. Consider using LZ4_decompress_safe() instead") -LZ4LIB_API int LZ4_decompress_fast (const char* src, char* dst, int originalSize); -LZ4_DEPRECATED("This function is deprecated and unsafe. Consider using LZ4_decompress_safe_continue() instead") -LZ4LIB_API int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* src, char* dst, int originalSize); -LZ4_DEPRECATED("This function is deprecated and unsafe. Consider using LZ4_decompress_safe_usingDict() instead") -LZ4LIB_API int LZ4_decompress_fast_usingDict (const char* src, char* dst, int originalSize, const char* dictStart, int dictSize); - -/*! LZ4_resetStream() : - * An LZ4_stream_t structure must be initialized at least once. - * This is done with LZ4_initStream(), or LZ4_resetStream(). - * Consider switching to LZ4_initStream(), - * invoking LZ4_resetStream() will trigger deprecation warnings in the future. - */ -LZ4LIB_API void LZ4_resetStream (LZ4_stream_t* streamPtr); - - -#endif /* LZ4_H_98237428734687 */ - - -#if defined (__cplusplus) -} -#endif diff --git a/contrib/simple_vulkan_synchronization/LICENSE.md b/contrib/simple_vulkan_synchronization/LICENSE.md deleted file mode 100644 index 0f321c2..0000000 --- a/contrib/simple_vulkan_synchronization/LICENSE.md +++ /dev/null @@ -1,19 +0,0 @@ -Copyright (c) 2017 Tobias Hector - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/contrib/simple_vulkan_synchronization/README.md b/contrib/simple_vulkan_synchronization/README.md deleted file mode 100644 index 2a26c2e..0000000 --- a/contrib/simple_vulkan_synchronization/README.md +++ /dev/null @@ -1,153 +0,0 @@ -# Simplified Vulkan Synchronization - -In an effort to make Vulkan synchronization more accessible, I created this -stb-inspired single-header library in order to somewhat simplify the core -synchronization mechanisms in Vulkan - pipeline barriers and events. - -Rather than the complex maze of enums and bitflags in Vulkan - many -combinations of which are invalid or nonsensical - this library collapses -this to a much shorter list of 40 distinct usage types, and a couple of -options for handling image layouts. - -Use of other synchonization mechanisms such as semaphores, fences and render -passes are not addressed in this API at present. - -## Usage - -#define the symbol THSVS_SIMPLER_VULKAN_SYNCHRONIZATION_IMPLEMENTATION in -*one* C/C++ file before the #include of the header; the implementation -will be generated in that file. - -## Version - -alpha.9 - -Alpha.9 adds the thsvsGetAccessInfo function to translate access types into a thsvsVkAccessInfo. - -## Version History - -alpha.8 - -Alpha.8 adds a host preinitialization state for linear images, as well as a number of new access sets for extensions released since the last update. - -alpha.7 - -Alpha.7 incorporates a number of fixes from @gwihlidal, and fixes -handling of pipeline stages in the presence of multiple access types or -barriers in light of other recent changes. - -alpha.6 - -Alpha.6 fixes a typo (VK_ACCESS_TYPE_MEMORY_READ|WRITE_BIT should have been VK_ACCESS_MEMORY_READ|WRITE_BIT), and sets the pipeline stage src and dst flag bits to VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT and VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT during initialization, not 0 as per alpha.5 - -alpha.5 - -Alpha.5 now correctly zeroes out the pipeline stage flags before trying to incrementally set bits on them... common theme here, whoops. - -alpha.4 - -Alpha.4 now correctly zeroes out the access types before trying to incrementally set bits on them (!) - -alpha.3 - -Alpha.3 changes the following: - -Uniform and vertex buffer access in one enum, matching D3D12_RESOURCE_STATE_VERTEX_AND_CONSTANT_BUFFER: - - THSVS_ACCESS_ANY_SHADER_READ_UNIFORM_BUFFER_OR_VERTEX_BUFFER - -Color read *and* write access, matching D3D12_RESOURCE_STATE_RENDER_TARGET: - - THSVS_ACCESS_COLOR_ATTACHMENT_READ_WRITE - -Also the "THSVS_ACCESS_\*\_SHADER_READ_SAMPLED_IMAGE" enums have been renamed to the form "THSVS_ACCESS_\*\_SHADER_READ_SAMPLED_IMAGE_OR_UNIFORM_TEXEL_BUFFER" - -alpha.2 - -Alpha.2 adds four new resource states for "ANY SHADER ACCESS": - - THSVS_ACCESS_ANY_SHADER_READ_UNIFORM_BUFFER - - THSVS_ACCESS_ANY_SHADER_READ_SAMPLED_IMAGE - - THSVS_ACCESS_ANY_SHADER_READ_OTHER - - THSVS_ACCESS_ANY_SHADER_WRITE - -alpha.1 - -Alpha.1 adds three new resource states: - - THSVS_ACCESS_GENERAL (Any access on the device) - - THSVS_ACCESS_DEPTH_ATTACHMENT_WRITE_STENCIL_READ_ONLY (Write access to only the depth aspect of a depth/stencil attachment) - - THSVS_ACCESS_STENCIL_ATTACHMENT_WRITE_DEPTH_READ_ONLY (Write access to only the stencil aspect of a depth/stencil attachment) - -It also fixes a couple of typos, and adds clarification as to when extensions need to be enabled to use a feature. - -alpha.0 - -This is the very first public release of this library; future revisions -of this API may change the API in an incompatible manner as feedback is -received. -Once the version becomes stable, incompatible changes will only be made -to major revisions of the API - minor revisions will only contain -bugfixes or minor additions. - -## Memory Allocation - -The thsvsCmdPipelineBarrier and thsvsCmdWaitEvents commands allocate -temporary storage for the Vulkan barrier equivalents in order to pass them -to the respective Vulkan commands. - -These use the `THSVS_TEMP_ALLOC(size)` and `THSVS_TEMP_FREE(x)` macros, -which are by default set to alloca(size) and ((void)(x)), respectively. -If you don't want to use stack space or would rather use your own -allocation strategy, these can be overridden by defining these macros -in before #include-ing the header file with -THSVS_SIMPLER_VULKAN_SYNCHRONIZATION_IMPLEMENTATION defined. - -I'd rather avoid the need for these allocations in what are likely to be -high-traffic commands, but currently just want to ship something - may -revisit this at a future date based on feedback. - -## Expressiveness Compared to Raw Vulkan - -Despite the fact that this API is fairly simple, it expresses 99% of -what you'd actually ever want to do in practice. -Adding the missing expressiveness would result in increased complexity -which didn't seem worth the tradeoff - however I would consider adding -something for them in future if it becomes an issue. - -Here's a list of known things you can't express: - -* Execution only dependencies cannot be expressed. - These are occasionally useful in conjunction with semaphores, or when - trying to be clever with scheduling - but their usage is both limited - and fairly tricky to get right anyway. -* Depth/Stencil Input Attachments can be read in a shader using either - VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL or - VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL - this library - *always* uses VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL. - It's possible (though highly unlikely) when aliasing images that this - results in unnecessary transitions. - -## Error Checks - -By default, as with the Vulkan API, this library does NOT check for -errors. -However, a number of optional error checks (`THSVS_ERROR_CHECK_*`) can be -enabled by uncommenting the relevant #defines. -Currently, error checks simply assert at the point a failure is detected -and do not output an error message. -I certainly do not claim they capture *all* possible errors, but they -capture what should be some of the more common ones. -Use of the Vulkan Validation Layers in tandem with this library is -strongly recommended: - https://github.com/KhronosGroup/Vulkan-LoaderAndValidationLayers - -## Issues - -This header was clean of warnings using -Wall as of time of publishing -on both gcc 4.8.4 and clang 3.5, using the c99 standard. - -There's a potential pitfall in thsvsCmdPipelineBarrier and thsvsCmdWaitEvents -where alloca is used for temporary allocations. See -[Memory Allocation](#memory-allocation) for more information. - -Testing of this library is so far extremely limited with no immediate -plans to add to that - so there's bound to be some amount of bugs. -Please raise these issues on the repo issue tracker, or provide a fix -via a pull request yourself if you're so inclined. diff --git a/contrib/simple_vulkan_synchronization/test/README.md b/contrib/simple_vulkan_synchronization/test/README.md deleted file mode 100644 index 6962237..0000000 --- a/contrib/simple_vulkan_synchronization/test/README.md +++ /dev/null @@ -1,26 +0,0 @@ -# Tests - -`tests.c` defines a number of unit tests to test that various scenarios -produce the desired output. -Tests are based on the common synchronization examples on the Vulkan-Docs -wiki: https://github.com/KhronosGroup/Vulkan-Docs/wiki/Synchronization-Examples. - -## Building - -On a unix based system these tests can be built using: - -`gcc -o tests tests.c -lvulkan` - -## Running - -Running is straightforward: - -`./tests` - -The executable will write out the tests that are run, whether they pass or -fail, and what caused them to fail if they did. - -## Adding tests - -If you'd like to add a test, just define a new test in main() as per those -that already exist. diff --git a/contrib/simple_vulkan_synchronization/test/tests.c b/contrib/simple_vulkan_synchronization/test/tests.c deleted file mode 100644 index f897244..0000000 --- a/contrib/simple_vulkan_synchronization/test/tests.c +++ /dev/null @@ -1,357 +0,0 @@ -// Copyright (c) 2017-2019 Tobias Hector - -// Permission is hereby granted, free of charge, to any person obtaining a copy of -// this software and associated documentation files (the "Software"), to deal in -// the Software without restriction, including without limitation the rights to -// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -// of the Software, and to permit persons to whom the Software is furnished to do -// so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. - -#include -#include - -#define THSVS_SIMPLER_VULKAN_SYNCHRONIZATION_IMPLEMENTATION -#include "../thsvs_simpler_vulkan_synchronization.h" - -void global_barrier_test_array(const char* testName, - unsigned int numPrevAccesses, - ThsvsAccessType* prevAccesses, - unsigned int numNextAccesses, - ThsvsAccessType* nextAccesses, - VkPipelineStageFlags expectedSrcStageMask, - VkPipelineStageFlags expectedDstStageMask, - VkAccessFlags expectedSrcAccessMask, - VkAccessFlags expectedDstAccessMask) -{ - ThsvsGlobalBarrier barrier = {numPrevAccesses, prevAccesses, numNextAccesses, nextAccesses}; - - VkMemoryBarrier vkBarrier = { 0 }; - VkPipelineStageFlags srcStages = 0; - VkPipelineStageFlags dstStages = 0; - unsigned int testPassed = 1; - - thsvsGetVulkanMemoryBarrier(barrier, &srcStages, &dstStages, &vkBarrier); - - printf("Test: %s\n", testName); - - if (srcStages != expectedSrcStageMask) - { - printf("\tUnexpected source stage %0#10X\n", srcStages); - testPassed = 0; - } - - if (dstStages != expectedDstStageMask) - { - printf("\tUnexpected destination stage %0#10X\n", dstStages); - testPassed = 0; - } - - if (vkBarrier.srcAccessMask != expectedSrcAccessMask) - { - printf("\tUnexpected source access mask %0#10X\n", vkBarrier.srcAccessMask); - testPassed = 0; - } - - if (vkBarrier.dstAccessMask != expectedDstAccessMask) - { - printf("\tUnexpected destination access mask %0#10X\n", vkBarrier.dstAccessMask); - testPassed = 0; - } - - if (testPassed == 1) - printf("\tPASSED\n"); - else - printf("\tFAILED\n"); -} - -void global_barrier_test(const char* testName, - ThsvsAccessType prevAccess, - ThsvsAccessType nextAccess, - VkPipelineStageFlags expectedSrcStageMask, - VkPipelineStageFlags expectedDstStageMask, - VkAccessFlags expectedSrcAccessMask, - VkAccessFlags expectedDstAccessMask) -{ - global_barrier_test_array(testName, 1, &prevAccess, 1, &nextAccess, expectedSrcStageMask, expectedDstStageMask, expectedSrcAccessMask, expectedDstAccessMask); -} - -void image_barrier_test_array(const char* testName, - unsigned int numPrevAccesses, - ThsvsAccessType* prevAccesses, - unsigned int numNextAccesses, - ThsvsAccessType* nextAccesses, - VkPipelineStageFlags expectedSrcStageMask, - VkPipelineStageFlags expectedDstStageMask, - VkAccessFlags expectedSrcAccessMask, - VkAccessFlags expectedDstAccessMask, - VkImageLayout expectedOldLayout, - VkImageLayout expectedNewLayout) -{ - ThsvsImageBarrier barrier = {numPrevAccesses, prevAccesses, numNextAccesses, nextAccesses}; - - VkImageMemoryBarrier vkBarrier = { 0 }; - VkPipelineStageFlags srcStages = 0; - VkPipelineStageFlags dstStages = 0; - unsigned int testPassed = 1; - - thsvsGetVulkanImageMemoryBarrier(barrier, &srcStages, &dstStages, &vkBarrier); - - printf("Test: %s\n", testName); - - if (srcStages != expectedSrcStageMask) - { - printf("\tUnexpected source stage %0#10X\n", srcStages); - testPassed = 0; - } - - if (dstStages != expectedDstStageMask) - { - printf("\tUnexpected destination stage %0#10X\n", dstStages); - testPassed = 0; - } - - if (vkBarrier.srcAccessMask != expectedSrcAccessMask) - { - printf("\tUnexpected source access mask %0#10X\n", vkBarrier.srcAccessMask); - testPassed = 0; - } - - if (vkBarrier.dstAccessMask != expectedDstAccessMask) - { - printf("\tUnexpected destination access mask %0#10X\n", vkBarrier.dstAccessMask); - testPassed = 0; - } - - if (vkBarrier.oldLayout != expectedOldLayout) - { - printf("\tUnexpected old layout %d\n", vkBarrier.oldLayout); - testPassed = 0; - } - - if (vkBarrier.newLayout != expectedNewLayout) - { - printf("\tUnexpected new layout %d\n", vkBarrier.newLayout); - testPassed = 0; - } - - if (testPassed == 1) - printf("\tPASSED\n"); - else - printf("\tFAILED\n"); -} - -void image_barrier_test(const char* testName, - ThsvsAccessType prevAccess, - ThsvsAccessType nextAccess, - VkPipelineStageFlags expectedSrcStageMask, - VkPipelineStageFlags expectedDstStageMask, - VkAccessFlags expectedSrcAccessMask, - VkAccessFlags expectedDstAccessMask, - VkImageLayout expectedOldLayout, - VkImageLayout expectedNewLayout) -{ - image_barrier_test_array(testName, 1, &prevAccess, 1, &nextAccess, expectedSrcStageMask, expectedDstStageMask, expectedSrcAccessMask, expectedDstAccessMask, expectedOldLayout, expectedNewLayout); -} - -int main(int argc, char* argv[]) -{ - global_barrier_test("Compute write to storage buffer/image, Compute read from storage buffer/image", - THSVS_ACCESS_COMPUTE_SHADER_WRITE, - THSVS_ACCESS_COMPUTE_SHADER_READ_OTHER, - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_ACCESS_SHADER_WRITE_BIT, - VK_ACCESS_SHADER_READ_BIT); - - global_barrier_test("Compute read from storage buffer, Compute write from storage buffer", - THSVS_ACCESS_COMPUTE_SHADER_READ_OTHER, - THSVS_ACCESS_COMPUTE_SHADER_WRITE, - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - 0, - 0); - - global_barrier_test("Compute write to storage buffer, Graphics read as index buffer", - THSVS_ACCESS_COMPUTE_SHADER_WRITE, - THSVS_ACCESS_INDEX_BUFFER, - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, - VK_ACCESS_SHADER_WRITE_BIT, - VK_ACCESS_INDEX_READ_BIT); - - { - ThsvsAccessType prevAccesses[] = {THSVS_ACCESS_COMPUTE_SHADER_WRITE}; - ThsvsAccessType nextAccesses[] = {THSVS_ACCESS_INDEX_BUFFER, THSVS_ACCESS_COMPUTE_SHADER_READ_UNIFORM_BUFFER}; - global_barrier_test_array("Compute write to storage buffer, Graphics read as index buffer & Compute read as uniform buffer", - 1, prevAccesses, - 2, nextAccesses, - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_PIPELINE_STAGE_VERTEX_INPUT_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_ACCESS_SHADER_WRITE_BIT, - VK_ACCESS_INDEX_READ_BIT | VK_ACCESS_UNIFORM_READ_BIT); - } - - global_barrier_test("Compute write to storage buffer, Graphics read as indirect buffer", - THSVS_ACCESS_COMPUTE_SHADER_WRITE, - THSVS_ACCESS_INDIRECT_BUFFER, - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT, - VK_ACCESS_SHADER_WRITE_BIT, - VK_ACCESS_INDIRECT_COMMAND_READ_BIT); - - image_barrier_test("Compute write to storage image, Graphics fragment read as sampled image", - THSVS_ACCESS_COMPUTE_SHADER_WRITE, - THSVS_ACCESS_FRAGMENT_SHADER_READ_SAMPLED_IMAGE_OR_UNIFORM_TEXEL_BUFFER, - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, - VK_ACCESS_SHADER_WRITE_BIT, - VK_ACCESS_SHADER_READ_BIT, - VK_IMAGE_LAYOUT_GENERAL, - VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); - { - ThsvsAccessType prevAccesses[] = {THSVS_ACCESS_COMPUTE_SHADER_WRITE}; - ThsvsAccessType nextAccesses[] = {THSVS_ACCESS_INDIRECT_BUFFER, THSVS_ACCESS_FRAGMENT_SHADER_READ_UNIFORM_BUFFER}; - global_barrier_test_array("Compute write to storage texel buffer, Graphics read as indirect buffer & fragment read as uniform buffer", - 1, prevAccesses, - 2, nextAccesses, - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, - VK_ACCESS_SHADER_WRITE_BIT, - VK_ACCESS_INDIRECT_COMMAND_READ_BIT | VK_ACCESS_UNIFORM_READ_BIT); - } - - image_barrier_test("Graphics write to color attachment, Compute read from sampled image", - THSVS_ACCESS_COLOR_ATTACHMENT_WRITE, - THSVS_ACCESS_COMPUTE_SHADER_READ_SAMPLED_IMAGE_OR_UNIFORM_TEXEL_BUFFER, - VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT, - VK_ACCESS_SHADER_READ_BIT, - VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, - VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); - - image_barrier_test("Graphics write to depth attachment, Compute read from sampled image", - THSVS_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE, - THSVS_ACCESS_COMPUTE_SHADER_READ_SAMPLED_IMAGE_OR_UNIFORM_TEXEL_BUFFER, - VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT, - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, - VK_ACCESS_SHADER_READ_BIT, - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL, - VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); - - image_barrier_test("Graphics write to depth attachment, Graphics fragment read from input attachment", - THSVS_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE, - THSVS_ACCESS_FRAGMENT_SHADER_READ_DEPTH_STENCIL_INPUT_ATTACHMENT, - VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT, - VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, - VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, - VK_ACCESS_INPUT_ATTACHMENT_READ_BIT, - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL, - VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL); - - image_barrier_test("Graphics write to depth attachment, Graphics fragment read from sampled image", - THSVS_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE, - THSVS_ACCESS_FRAGMENT_SHADER_READ_SAMPLED_IMAGE_OR_UNIFORM_TEXEL_BUFFER, - VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT, - VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, - VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, - VK_ACCESS_SHADER_READ_BIT, - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL, - VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); - - image_barrier_test("Graphics write to color attachment, Graphics fragment read from input attachment", - THSVS_ACCESS_COLOR_ATTACHMENT_WRITE, - THSVS_ACCESS_FRAGMENT_SHADER_READ_COLOR_INPUT_ATTACHMENT, - VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, - VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, - VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT, - VK_ACCESS_INPUT_ATTACHMENT_READ_BIT, - VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, - VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); - - image_barrier_test("Graphics write to color attachment, Graphics fragment read from sampled image", - THSVS_ACCESS_COLOR_ATTACHMENT_WRITE, - THSVS_ACCESS_FRAGMENT_SHADER_READ_SAMPLED_IMAGE_OR_UNIFORM_TEXEL_BUFFER, - VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, - VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, - VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT, - VK_ACCESS_SHADER_READ_BIT, - VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, - VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); - - image_barrier_test("Graphics write to color attachment, Graphics vertex read from sampled image", - THSVS_ACCESS_COLOR_ATTACHMENT_WRITE, - THSVS_ACCESS_VERTEX_SHADER_READ_SAMPLED_IMAGE_OR_UNIFORM_TEXEL_BUFFER, - VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, - VK_PIPELINE_STAGE_VERTEX_SHADER_BIT, - VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT, - VK_ACCESS_SHADER_READ_BIT, - VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, - VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); - - image_barrier_test("Graphics fragment read from sampled image, Graphics write to color attachment", - THSVS_ACCESS_FRAGMENT_SHADER_READ_SAMPLED_IMAGE_OR_UNIFORM_TEXEL_BUFFER, - THSVS_ACCESS_COLOR_ATTACHMENT_WRITE, - VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, - VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, - 0, - 0, - VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, - VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL); - - global_barrier_test("None, Transfer read from buffer", - THSVS_ACCESS_NONE, - THSVS_ACCESS_TRANSFER_READ, - VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, - VK_PIPELINE_STAGE_TRANSFER_BIT, - 0, - 0); - - global_barrier_test("Transfer write to buffer, Graphics read from vertex buffer", - THSVS_ACCESS_TRANSFER_WRITE, - THSVS_ACCESS_VERTEX_BUFFER, - VK_PIPELINE_STAGE_TRANSFER_BIT, - VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, - VK_ACCESS_TRANSFER_WRITE_BIT, - VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT); - - image_barrier_test("Transfer write to image, Graphics fragment read from sampled image", - THSVS_ACCESS_TRANSFER_WRITE, - THSVS_ACCESS_FRAGMENT_SHADER_READ_SAMPLED_IMAGE_OR_UNIFORM_TEXEL_BUFFER, - VK_PIPELINE_STAGE_TRANSFER_BIT, - VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, - VK_ACCESS_TRANSFER_WRITE_BIT, - VK_ACCESS_SHADER_READ_BIT, - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, - VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); - - image_barrier_test("Graphics color attachment write, Presentation", - THSVS_ACCESS_COLOR_ATTACHMENT_WRITE, - THSVS_ACCESS_PRESENT, - VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, - VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, - VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT, - 0, - VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, - VK_IMAGE_LAYOUT_PRESENT_SRC_KHR); - - global_barrier_test("Full pipeline barrier", - THSVS_ACCESS_GENERAL, - THSVS_ACCESS_GENERAL, - VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, - VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, - VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT, - VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT); -} diff --git a/contrib/simple_vulkan_synchronization/thsvs_simpler_vulkan_synchronization.h b/contrib/simple_vulkan_synchronization/thsvs_simpler_vulkan_synchronization.h deleted file mode 100644 index cc55a72..0000000 --- a/contrib/simple_vulkan_synchronization/thsvs_simpler_vulkan_synchronization.h +++ /dev/null @@ -1,1397 +0,0 @@ -// Copyright (c) 2017-2019 Tobias Hector - -// Permission is hereby granted, free of charge, to any person obtaining a copy of -// this software and associated documentation files (the "Software"), to deal in -// the Software without restriction, including without limitation the rights to -// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -// of the Software, and to permit persons to whom the Software is furnished to do -// so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. - -//// Simpler Vulkan Synchronization //// -/* -In an effort to make Vulkan synchronization more accessible, I created this -stb-inspired single-header library in order to somewhat simplify the core -synchronization mechanisms in Vulkan - pipeline barriers and events. - -Rather than the complex maze of enums and bit flags in Vulkan - many -combinations of which are invalid or nonsensical - this library collapses -this to a much shorter list of 40 distinct usage types, and a couple of -options for handling image layouts. - -Use of other synchronization mechanisms such as semaphores, fences and render -passes are not addressed in this API at present. - -USAGE - - #define the symbol THSVS_SIMPLER_VULKAN_SYNCHRONIZATION_IMPLEMENTATION in - *one* C/C++ file before the #include of this file; the implementation - will be generated in that file. - -VERSION - - alpha.9 - - Alpha.9 adds the thsvsGetAccessInfo function to translate access types into a thsvsVkAccessInfo. - - -VERSION HISTORY - - alpha.8 - - Alpha.8 adds a host preinitialization state for linear images, as well as a number of new access sets for extensions released since the last update. - - alpha.7 - - Alpha.7 incorporates a number of fixes from @gwihlidal, and fixes - handling of pipeline stages in the presence of multiple access types or - barriers in light of other recent changes. - - alpha.6 - - Alpha.6 fixes a typo (VK_ACCESS_TYPE_MEMORY_READ|WRITE_BIT should have been VK_ACCESS_MEMORY_READ|WRITE_BIT), and sets the pipeline stage src and dst flag bits to VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT and VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT during initialization, not 0 as per alpha.5 - - alpha.5 - - Alpha.5 now correctly zeroes out the pipeline stage flags before trying to incrementally set bits on them... common theme here, whoops. - - alpha.4 - - Alpha.4 now correctly zeroes out the access types before trying to incrementally set bits on them (!) - - alpha.3 - - Alpha.3 changes the following: - - Uniform and vertex buffer access in one enum, matching D3D12_RESOURCE_STATE_VERTEX_AND_CONSTANT_BUFFER: - - THSVS_ACCESS_ANY_SHADER_READ_UNIFORM_BUFFER_OR_VERTEX_BUFFER - - Color read *and* write access, matching D3D12_RESOURCE_STATE_RENDER_TARGET: - - THSVS_ACCESS_COLOR_ATTACHMENT_READ_WRITE - - Also the "THSVS_ACCESS_*_SHADER_READ_SAMPLED_IMAGE" enums have been renamed to the form "THSVS_ACCESS_*_SHADER_READ_SAMPLED_IMAGE_OR_UNIFORM_TEXEL_BUFFER" - - alpha.2 - - Alpha.2 adds four new resource states for "ANY SHADER ACCESS": - - THSVS_ACCESS_ANY_SHADER_READ_UNIFORM_BUFFER - - THSVS_ACCESS_ANY_SHADER_READ_SAMPLED_IMAGE - - THSVS_ACCESS_ANY_SHADER_READ_OTHER - - THSVS_ACCESS_ANY_SHADER_WRITE - - alpha.1 - - Alpha.1 adds three new resource states: - - THSVS_ACCESS_GENERAL (Any access on the device) - - THSVS_ACCESS_DEPTH_ATTACHMENT_WRITE_STENCIL_READ_ONLY (Write access to only the depth aspect of a depth/stencil attachment) - - THSVS_ACCESS_STENCIL_ATTACHMENT_WRITE_DEPTH_READ_ONLY (Write access to only the stencil aspect of a depth/stencil attachment) - - It also fixes a couple of typos, and adds clarification as to when extensions need to be enabled to use a feature. - - alpha.0 - - This is the very first public release of this library; future revisions - of this API may change the API in an incompatible manner as feedback is - received. - Once the version becomes stable, incompatible changes will only be made - to major revisions of the API - minor revisions will only contain - bug fixes or minor additions. - -MEMORY ALLOCATION - - The thsvsCmdPipelineBarrier and thWaitEvents commands allocate temporary - storage for the Vulkan barrier equivalents in order to pass them to the - respective Vulkan commands. - - These use the `THSVS_TEMP_ALLOC(size)` and `THSVS_TEMP_FREE(x)` macros, - which are by default set to alloca(size) and ((void)(x)), respectively. - If you don't want to use stack space or would rather use your own - allocation strategy, these can be overridden by defining these macros - in before #include-ing the header file with - THSVS_SIMPLER_VULKAN_SYNCHRONIZATION_IMPLEMENTATION defined. - - I'd rather avoid the need for these allocations in what are likely to be - high-traffic commands, but currently just want to ship something - may - revisit this at a future date based on feedback. - -EXPRESSIVENESS COMPARED TO RAW VULKAN - - Despite the fact that this API is fairly simple, it expresses 99% of - what you'd actually ever want to do in practice. - Adding the missing expressiveness would result in increased complexity - which didn't seem worth the trade off - however I would consider adding - something for them in future if it becomes an issue. - - Here's a list of known things you can't express: - - * Execution only dependencies cannot be expressed. - These are occasionally useful in conjunction with semaphores, or when - trying to be clever with scheduling - but their usage is both limited - and fairly tricky to get right anyway. - * Depth/Stencil Input Attachments can be read in a shader using either - VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL or - VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL - this library - *always* uses VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL. - It's possible (though highly unlikely) when aliasing images that this - results in unnecessary transitions. - -ERROR CHECKS - - By default, as with the Vulkan API, this library does NOT check for - errors. - However, a number of optional error checks (THSVS_ERROR_CHECK_*) can be - enabled by uncommenting the relevant #defines. - Currently, error checks simply assert at the point a failure is detected - and do not output an error message. - I certainly do not claim they capture *all* possible errors, but they - capture what should be some of the more common ones. - Use of the Vulkan Validation Layers in tandem with this library is - strongly recommended: - https://github.com/KhronosGroup/Vulkan-LoaderAndValidationLayers - -ISSUES - - This header was clean of warnings using -Wall as of time of publishing - on both gcc 4.8.4 and clang 3.5, using the c99 standard. - - There's a potential pitfall in thsvsCmdPipelineBarrier and thsvsCmdWaitEvents - where alloca is used for temporary allocations. See MEMORY ALLOCATION - for more information. - - Testing of this library is so far extremely limited with no immediate - plans to add to that - so there's bound to be some amount of bugs. - Please raise these issues on the repo issue tracker, or provide a fix - via a pull request yourself if you're so inclined. -*/ - -#ifndef THSVS_SIMPLER_VULKAN_SYNCHRONIZATION_H -#define THSVS_SIMPLER_VULKAN_SYNCHRONIZATION_H 1 - -#include - -/* -ThsvsAccessType defines all potential resource usages in the Vulkan API. -*/ -typedef enum ThsvsAccessType { - THSVS_ACCESS_NONE, // No access. Useful primarily for initialization - -// Read access - // Requires VK_NV_device_generated_commands to be enabled - THSVS_ACCESS_COMMAND_BUFFER_READ_NV, // Command buffer read operation as defined by NV_device_generated_commands - THSVS_ACCESS_INDIRECT_BUFFER, // Read as an indirect buffer for drawing or dispatch - THSVS_ACCESS_INDEX_BUFFER, // Read as an index buffer for drawing - THSVS_ACCESS_VERTEX_BUFFER, // Read as a vertex buffer for drawing - THSVS_ACCESS_VERTEX_SHADER_READ_UNIFORM_BUFFER, // Read as a uniform buffer in a vertex shader - THSVS_ACCESS_VERTEX_SHADER_READ_SAMPLED_IMAGE_OR_UNIFORM_TEXEL_BUFFER, // Read as a sampled image/uniform texel buffer in a vertex shader - THSVS_ACCESS_VERTEX_SHADER_READ_OTHER, // Read as any other resource in a vertex shader - THSVS_ACCESS_TESSELLATION_CONTROL_SHADER_READ_UNIFORM_BUFFER, // Read as a uniform buffer in a tessellation control shader - THSVS_ACCESS_TESSELLATION_CONTROL_SHADER_READ_SAMPLED_IMAGE_OR_UNIFORM_TEXEL_BUFFER, // Read as a sampled image/uniform texel buffer in a tessellation control shader - THSVS_ACCESS_TESSELLATION_CONTROL_SHADER_READ_OTHER, // Read as any other resource in a tessellation control shader - THSVS_ACCESS_TESSELLATION_EVALUATION_SHADER_READ_UNIFORM_BUFFER, // Read as a uniform buffer in a tessellation evaluation shader - THSVS_ACCESS_TESSELLATION_EVALUATION_SHADER_READ_SAMPLED_IMAGE_OR_UNIFORM_TEXEL_BUFFER, // Read as a sampled image/uniform texel buffer in a tessellation evaluation shader - THSVS_ACCESS_TESSELLATION_EVALUATION_SHADER_READ_OTHER, // Read as any other resource in a tessellation evaluation shader - THSVS_ACCESS_GEOMETRY_SHADER_READ_UNIFORM_BUFFER, // Read as a uniform buffer in a geometry shader - THSVS_ACCESS_GEOMETRY_SHADER_READ_SAMPLED_IMAGE_OR_UNIFORM_TEXEL_BUFFER,// Read as a sampled image/uniform texel buffer in a geometry shader - THSVS_ACCESS_GEOMETRY_SHADER_READ_OTHER, // Read as any other resource in a geometry shader - THSVS_ACCESS_TASK_SHADER_READ_UNIFORM_BUFFER_NV, // Read as a uniform buffer in a task shader - THSVS_ACCESS_TASK_SHADER_READ_SAMPLED_IMAGE_OR_UNIFORM_TEXEL_BUFFER_NV, // Read as a sampled image/uniform texel buffer in a task shader - THSVS_ACCESS_TASK_SHADER_READ_OTHER_NV, // Read as any other resource in a task shader - THSVS_ACCESS_MESH_SHADER_READ_UNIFORM_BUFFER_NV, // Read as a uniform buffer in a mesh shader - THSVS_ACCESS_MESH_SHADER_READ_SAMPLED_IMAGE_OR_UNIFORM_TEXEL_BUFFER_NV, // Read as a sampled image/uniform texel buffer in a mesh shader - THSVS_ACCESS_MESH_SHADER_READ_OTHER_NV, // Read as any other resource in a mesh shader - THSVS_ACCESS_TRANSFORM_FEEDBACK_COUNTER_READ_EXT, // Read as a transform feedback counter buffer - THSVS_ACCESS_FRAGMENT_DENSITY_MAP_READ_EXT, // Read as a fragment density map image - THSVS_ACCESS_SHADING_RATE_READ_NV, // Read as a shading rate image - THSVS_ACCESS_FRAGMENT_SHADER_READ_UNIFORM_BUFFER, // Read as a uniform buffer in a fragment shader - THSVS_ACCESS_FRAGMENT_SHADER_READ_SAMPLED_IMAGE_OR_UNIFORM_TEXEL_BUFFER,// Read as a sampled image/uniform texel buffer in a fragment shader - THSVS_ACCESS_FRAGMENT_SHADER_READ_COLOR_INPUT_ATTACHMENT, // Read as an input attachment with a color format in a fragment shader - THSVS_ACCESS_FRAGMENT_SHADER_READ_DEPTH_STENCIL_INPUT_ATTACHMENT, // Read as an input attachment with a depth/stencil format in a fragment shader - THSVS_ACCESS_FRAGMENT_SHADER_READ_OTHER, // Read as any other resource in a fragment shader - THSVS_ACCESS_COLOR_ATTACHMENT_READ, // Read by standard blending/logic operations or subpass load operations - THSVS_ACCESS_COLOR_ATTACHMENT_ADVANCED_BLENDING_EXT, // Read by advanced blending, standard blending, logic operations, or subpass load operations - THSVS_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ, // Read by depth/stencil tests or subpass load operations - THSVS_ACCESS_COMPUTE_SHADER_READ_UNIFORM_BUFFER, // Read as a uniform buffer in a compute shader - THSVS_ACCESS_COMPUTE_SHADER_READ_SAMPLED_IMAGE_OR_UNIFORM_TEXEL_BUFFER, // Read as a sampled image/uniform texel buffer in a compute shader - THSVS_ACCESS_COMPUTE_SHADER_READ_OTHER, // Read as any other resource in a compute shader - THSVS_ACCESS_ANY_SHADER_READ_UNIFORM_BUFFER, // Read as a uniform buffer in any shader - THSVS_ACCESS_ANY_SHADER_READ_UNIFORM_BUFFER_OR_VERTEX_BUFFER, // Read as a uniform buffer in any shader, or a vertex buffer - THSVS_ACCESS_ANY_SHADER_READ_SAMPLED_IMAGE_OR_UNIFORM_TEXEL_BUFFER, // Read as a sampled image in any shader - THSVS_ACCESS_ANY_SHADER_READ_OTHER, // Read as any other resource (excluding attachments) in any shader - THSVS_ACCESS_TRANSFER_READ, // Read as the source of a transfer operation - THSVS_ACCESS_HOST_READ, // Read on the host - - // Requires VK_KHR_swapchain to be enabled - THSVS_ACCESS_PRESENT, // Read by the presentation engine (i.e. vkQueuePresentKHR) - - // Requires VK_EXT_conditional_rendering to be enabled - THSVS_ACCESS_CONDITIONAL_RENDERING_READ_EXT, // Read by conditional rendering - - // Requires VK_NV_ray_tracing to be enabled - THSVS_ACCESS_RAY_TRACING_SHADER_ACCELERATION_STRUCTURE_READ_NV, // Read by a ray tracing shader as an acceleration structure - THSVS_ACCESS_ACCELERATION_STRUCTURE_BUILD_READ_NV, // Read as an acceleration structure during a build - - // Read accesses end - THSVS_END_OF_READ_ACCESS, - -// Write access - // Requires VK_NV_device_generated_commands to be enabled - THSVS_ACCESS_COMMAND_BUFFER_WRITE_NV, // Command buffer write operation - THSVS_ACCESS_VERTEX_SHADER_WRITE, // Written as any resource in a vertex shader - THSVS_ACCESS_TESSELLATION_CONTROL_SHADER_WRITE, // Written as any resource in a tessellation control shader - THSVS_ACCESS_TESSELLATION_EVALUATION_SHADER_WRITE, // Written as any resource in a tessellation evaluation shader - THSVS_ACCESS_GEOMETRY_SHADER_WRITE, // Written as any resource in a geometry shader - - // Requires VK_NV_mesh_shading to be enabled - THSVS_ACCESS_TASK_SHADER_WRITE_NV, // Written as any resource in a task shader - THSVS_ACCESS_MESH_SHADER_WRITE_NV, // Written as any resource in a mesh shader - - // Requires VK_EXT_transform_feedback to be enabled - THSVS_ACCESS_TRANSFORM_FEEDBACK_WRITE_EXT, // Written as a transform feedback buffer - THSVS_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_EXT, // Written as a transform feedback counter buffer - - THSVS_ACCESS_FRAGMENT_SHADER_WRITE, // Written as any resource in a fragment shader - THSVS_ACCESS_COLOR_ATTACHMENT_WRITE, // Written as a color attachment during rendering, or via a subpass store op - THSVS_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE, // Written as a depth/stencil attachment during rendering, or via a subpass store op - - // Requires VK_KHR_maintenance2 to be enabled - THSVS_ACCESS_DEPTH_ATTACHMENT_WRITE_STENCIL_READ_ONLY, // Written as a depth aspect of a depth/stencil attachment during rendering, whilst the stencil aspect is read-only - THSVS_ACCESS_STENCIL_ATTACHMENT_WRITE_DEPTH_READ_ONLY, // Written as a stencil aspect of a depth/stencil attachment during rendering, whilst the depth aspect is read-only - - THSVS_ACCESS_COMPUTE_SHADER_WRITE, // Written as any resource in a compute shader - THSVS_ACCESS_ANY_SHADER_WRITE, // Written as any resource in any shader - THSVS_ACCESS_TRANSFER_WRITE, // Written as the destination of a transfer operation - THSVS_ACCESS_HOST_PREINITIALIZED, // Data pre-filled by host before device access starts - THSVS_ACCESS_HOST_WRITE, // Written on the host - - // Requires VK_NV_ray_tracing to be enabled - THSVS_ACCESS_ACCELERATION_STRUCTURE_BUILD_WRITE_NV, // Written as an acceleration structure during a build - - THSVS_ACCESS_COLOR_ATTACHMENT_READ_WRITE, // Read or written as a color attachment during rendering -// General access - THSVS_ACCESS_GENERAL, // Covers any access - useful for debug, generally avoid for performance reasons - -// Number of access types - THSVS_NUM_ACCESS_TYPES -} ThsvsAccessType; - -/* -ThsvsImageLayout defines a handful of layout options for images. -Rather than a list of all possible image layouts, this reduced list is -correlated with the access types to map to the correct Vulkan layouts. -THSVS_IMAGE_LAYOUT_OPTIMAL is usually preferred. -*/ -typedef enum ThsvsImageLayout { - THSVS_IMAGE_LAYOUT_OPTIMAL, // Choose the most optimal layout for each usage. Performs layout transitions as appropriate for the access. - THSVS_IMAGE_LAYOUT_GENERAL, // Layout accessible by all Vulkan access types on a device - no layout transitions except for presentation - - // Requires VK_KHR_shared_presentable_image to be enabled. Can only be used for shared presentable images (i.e. single-buffered swap chains). - THSVS_IMAGE_LAYOUT_GENERAL_AND_PRESENTATION // As GENERAL, but also allows presentation engines to access it - no layout transitions -} ThsvsImageLayout; - -/* -Global barriers define a set of accesses on multiple resources at once. -If a buffer or image doesn't require a queue ownership transfer, or an image -doesn't require a layout transition (e.g. you're using one of the GENERAL -layouts) then a global barrier should be preferred. -Simply define the previous and next access types of resources affected. -*/ -typedef struct ThsvsGlobalBarrier { - uint32_t prevAccessCount; - const ThsvsAccessType* pPrevAccesses; - uint32_t nextAccessCount; - const ThsvsAccessType* pNextAccesses; -} ThsvsGlobalBarrier; - -/* -Buffer barriers should only be used when a queue family ownership transfer -is required - prefer global barriers at all other times. - -Access types are defined in the same way as for a global memory barrier, but -they only affect the buffer range identified by buffer, offset and size, -rather than all resources. -srcQueueFamilyIndex and dstQueueFamilyIndex will be passed unmodified into a -VkBufferMemoryBarrier. - -A buffer barrier defining a queue ownership transfer needs to be executed -twice - once by a queue in the source queue family, and then once again by a -queue in the destination queue family, with a semaphore guaranteeing -execution order between them. -*/ -typedef struct ThsvsBufferBarrier { - uint32_t prevAccessCount; - const ThsvsAccessType* pPrevAccesses; - uint32_t nextAccessCount; - const ThsvsAccessType* pNextAccesses; - uint32_t srcQueueFamilyIndex; - uint32_t dstQueueFamilyIndex; - VkBuffer buffer; - VkDeviceSize offset; - VkDeviceSize size; -} ThsvsBufferBarrier; - -/* -Image barriers should only be used when a queue family ownership transfer -or an image layout transition is required - prefer global barriers at all -other times. -In general it is better to use image barriers with THSVS_IMAGE_LAYOUT_OPTIMAL -than it is to use global barriers with images using either of the -THSVS_IMAGE_LAYOUT_GENERAL* layouts. - -Access types are defined in the same way as for a global memory barrier, but -they only affect the image subresource range identified by image and -subresourceRange, rather than all resources. -srcQueueFamilyIndex, dstQueueFamilyIndex, image, and subresourceRange will -be passed unmodified into a VkImageMemoryBarrier. - -An image barrier defining a queue ownership transfer needs to be executed -twice - once by a queue in the source queue family, and then once again by a -queue in the destination queue family, with a semaphore guaranteeing -execution order between them. - -If discardContents is set to true, the contents of the image become -undefined after the barrier is executed, which can result in a performance -boost over attempting to preserve the contents. -This is particularly useful for transient images where the contents are -going to be immediately overwritten. A good example of when to use this is -when an application re-uses a presented image after vkAcquireNextImageKHR. -*/ -typedef struct ThsvsImageBarrier { - uint32_t prevAccessCount; - const ThsvsAccessType* pPrevAccesses; - uint32_t nextAccessCount; - const ThsvsAccessType* pNextAccesses; - ThsvsImageLayout prevLayout; - ThsvsImageLayout nextLayout; - VkBool32 discardContents; - uint32_t srcQueueFamilyIndex; - uint32_t dstQueueFamilyIndex; - VkImage image; - VkImageSubresourceRange subresourceRange; -} ThsvsImageBarrier; - -/* -Mapping function that translates a set of accesses into the corresponding -pipeline stages, VkAccessFlags, and image layout. -*/ -void thsvsGetAccessInfo( - uint32_t accessCount, - const ThsvsAccessType* pAccesses, - VkPipelineStageFlags* pStageMask, - VkAccessFlags* pAccessMask, - VkImageLayout* pImageLayout, - bool* pHasWriteAccess); - -#ifdef __cplusplus -/* -Mapping function that translates a global barrier into a set of source and -destination pipeline stages, and a VkMemoryBarrier, that can be used with -Vulkan's synchronization methods. -*/ -void thsvsGetVulkanMemoryBarrier( - const ThsvsGlobalBarrier& thBarrier, - VkPipelineStageFlags* pSrcStages, - VkPipelineStageFlags* pDstStages, - VkMemoryBarrier* pVkBarrier); - -/* -Mapping function that translates a buffer barrier into a set of source and -destination pipeline stages, and a VkBufferMemoryBarrier, that can be used -with Vulkan's synchronization methods. -*/ -void thsvsGetVulkanBufferMemoryBarrier( - const ThsvsBufferBarrier& thBarrier, - VkPipelineStageFlags* pSrcStages, - VkPipelineStageFlags* pDstStages, - VkBufferMemoryBarrier* pVkBarrier); - -/* -Mapping function that translates an image barrier into a set of source and -destination pipeline stages, and a VkBufferMemoryBarrier, that can be used -with Vulkan's synchronization methods. -*/ -void thsvsGetVulkanImageMemoryBarrier( - const ThsvsImageBarrier& thBarrier, - VkPipelineStageFlags* pSrcStages, - VkPipelineStageFlags* pDstStages, - VkImageMemoryBarrier* pVkBarrier); -#endif - -/* -Simplified wrapper around vkCmdPipelineBarrier. - -The mapping functions defined above are used to translate the passed in -barrier definitions into a set of pipeline stages and native Vulkan memory -barriers to be passed to vkCmdPipelineBarrier. - -commandBuffer is passed unmodified to vkCmdPipelineBarrier. -*/ -void thsvsCmdPipelineBarrier( - VkCommandBuffer commandBuffer, - const ThsvsGlobalBarrier* pGlobalBarrier, - uint32_t bufferBarrierCount, - const ThsvsBufferBarrier* pBufferBarriers, - uint32_t imageBarrierCount, - const ThsvsImageBarrier* pImageBarriers); - -/* -Wrapper around vkCmdSetEvent. - -Sets an event when the accesses defined by pPrevAccesses are completed. - -commandBuffer and event are passed unmodified to vkCmdSetEvent. -*/ -void thsvsCmdSetEvent( - VkCommandBuffer commandBuffer, - VkEvent event, - uint32_t prevAccessCount, - const ThsvsAccessType* pPrevAccesses); - -/* -Wrapper around vkCmdResetEvent. - -Resets an event when the accesses defined by pPrevAccesses are completed. - -commandBuffer and event are passed unmodified to vkCmdResetEvent. -*/ -void thsvsCmdResetEvent( - VkCommandBuffer commandBuffer, - VkEvent event, - uint32_t prevAccessCount, - const ThsvsAccessType* pPrevAccesses); - -/* -Simplified wrapper around vkCmdWaitEvents. - -The mapping functions defined above are used to translate the passed in -barrier definitions into a set of pipeline stages and native Vulkan memory -barriers to be passed to vkCmdPipelineBarrier. - -commandBuffer, eventCount, and pEvents are passed unmodified to -vkCmdWaitEvents. -*/ -void thsvsCmdWaitEvents( - VkCommandBuffer commandBuffer, - uint32_t eventCount, - const VkEvent* pEvents, - const ThsvsGlobalBarrier* pGlobalBarrier, - uint32_t bufferBarrierCount, - const ThsvsBufferBarrier* pBufferBarriers, - uint32_t imageBarrierCount, - const ThsvsImageBarrier* pImageBarriers); - -#endif // THSVS_SIMPLER_VULKAN_SYNCHRONIZATION_H - -#ifdef THSVS_SIMPLER_VULKAN_SYNCHRONIZATION_IMPLEMENTATION - -#include - -//// Optional Error Checking //// -/* -Checks for barriers defining multiple usages that have different layouts -*/ -// #define THSVS_ERROR_CHECK_MIXED_IMAGE_LAYOUT - -/* -Checks if an image/buffer barrier is used when a global barrier would suffice -*/ -// #define THSVS_ERROR_CHECK_COULD_USE_GLOBAL_BARRIER - -/* -Checks if a write access is listed alongside any other access - if so it -points to a potential data hazard that you need to synchronize separately. -In some cases it may simply be over-synchronization however, but it's usually -worth checking. -*/ -// #define THSVS_ERROR_CHECK_POTENTIAL_HAZARD - -/* -Checks if a variety of table lookups (like the access map) are within -a valid range. -*/ -// #define THSVS_ERROR_CHECK_ACCESS_TYPE_IN_RANGE - -//// Temporary Memory Allocation //// -/* -Override these if you can't afford the stack space or just want to use a -custom temporary allocator. -These are currently used exclusively to allocate Vulkan memory barriers in -the API, one for each Buffer or Image barrier passed into the pipeline and -event functions. -May consider other allocation strategies in future. -*/ - -// Alloca inclusion code below copied from -// https://github.com/nothings/stb/blob/master/stb_vorbis.c - -// find definition of alloca if it's not in stdlib.h: -#if defined(_MSC_VER) || defined(__MINGW32__) - #include -#endif -#if defined(__linux__) || defined(__linux) || defined(__EMSCRIPTEN__) - #include -#endif - -#if defined(THSVS_ERROR_CHECK_ACCESS_TYPE_IN_RANGE) || \ - defined(THSVS_ERROR_CHECK_COULD_USE_GLOBAL_BARRIER) || \ - defined(THSVS_ERROR_CHECK_MIXED_IMAGE_LAYOUT) || \ - defined(THSVS_ERROR_CHECK_POTENTIAL_HAZARD) - #include -#endif - -#if !defined(THSVS_TEMP_ALLOC) -#define THSVS_TEMP_ALLOC(size) (alloca(size)) -#endif - -#if !defined(THSVS_TEMP_FREE) -#define THSVS_TEMP_FREE(x) ((void)(x)) -#endif - -typedef struct ThsvsVkAccessInfo { - VkPipelineStageFlags stageMask; - VkAccessFlags accessMask; - VkImageLayout imageLayout; -} ThsvsVkAccessInfo; - -const ThsvsVkAccessInfo ThsvsAccessMap[THSVS_NUM_ACCESS_TYPES] = { - // THSVS_ACCESS_NONE - { 0, - 0, - VK_IMAGE_LAYOUT_UNDEFINED}, - -// Read Access - // THSVS_ACCESS_COMMAND_BUFFER_READ_NV - { VK_PIPELINE_STAGE_COMMAND_PREPROCESS_BIT_NV, - VK_ACCESS_COMMAND_PREPROCESS_READ_BIT_NV, - VK_IMAGE_LAYOUT_UNDEFINED}, - // THSVS_ACCESS_INDIRECT_BUFFER - { VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT, - VK_ACCESS_INDIRECT_COMMAND_READ_BIT, - VK_IMAGE_LAYOUT_UNDEFINED}, - - // THSVS_ACCESS_INDEX_BUFFER - { VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, - VK_ACCESS_INDEX_READ_BIT, - VK_IMAGE_LAYOUT_UNDEFINED}, - // THSVS_ACCESS_VERTEX_BUFFER - { VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, - VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT, - VK_IMAGE_LAYOUT_UNDEFINED}, - // THSVS_ACCESS_VERTEX_SHADER_READ_UNIFORM_BUFFER - { VK_PIPELINE_STAGE_VERTEX_SHADER_BIT, - VK_ACCESS_UNIFORM_READ_BIT, - VK_IMAGE_LAYOUT_UNDEFINED}, - // THSVS_ACCESS_VERTEX_SHADER_READ_SAMPLED_IMAGE_OR_UNIFORM_TEXEL_BUFFER - { VK_PIPELINE_STAGE_VERTEX_SHADER_BIT, - VK_ACCESS_SHADER_READ_BIT, - VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL}, - // THSVS_ACCESS_VERTEX_SHADER_READ_OTHER - { VK_PIPELINE_STAGE_VERTEX_SHADER_BIT, - VK_ACCESS_SHADER_READ_BIT, - VK_IMAGE_LAYOUT_GENERAL}, - - // THSVS_ACCESS_TESSELLATION_CONTROL_SHADER_READ_UNIFORM_BUFFER - { VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT, - VK_ACCESS_UNIFORM_READ_BIT, - VK_IMAGE_LAYOUT_UNDEFINED}, - // THSVS_ACCESS_TESSELLATION_CONTROL_SHADER_READ_SAMPLED_IMAGE_OR_UNIFORM_TEXEL_BUFFER - { VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT, - VK_ACCESS_SHADER_READ_BIT, - VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL}, - // THSVS_ACCESS_TESSELLATION_CONTROL_SHADER_READ_OTHER - { VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT, - VK_ACCESS_SHADER_READ_BIT, - VK_IMAGE_LAYOUT_GENERAL}, - - // THSVS_ACCESS_TESSELLATION_EVALUATION_SHADER_READ_UNIFORM_BUFFER - { VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT, - VK_ACCESS_UNIFORM_READ_BIT, - VK_IMAGE_LAYOUT_UNDEFINED}, - // THSVS_ACCESS_TESSELLATION_EVALUATION_SHADER_READ_SAMPLED_IMAGE_OR_UNIFORM_TEXEL_BUFFER - { VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT, - VK_ACCESS_SHADER_READ_BIT, - VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL}, - // THSVS_ACCESS_TESSELLATION_EVALUATION_SHADER_READ_OTHER - { VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT, - VK_ACCESS_SHADER_READ_BIT, - VK_IMAGE_LAYOUT_GENERAL}, - - // THSVS_ACCESS_GEOMETRY_SHADER_READ_UNIFORM_BUFFER - { VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT, - VK_ACCESS_UNIFORM_READ_BIT, - VK_IMAGE_LAYOUT_UNDEFINED}, - // THSVS_ACCESS_GEOMETRY_SHADER_READ_SAMPLED_IMAGE_OR_UNIFORM_TEXEL_BUFFER - { VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT, - VK_ACCESS_SHADER_READ_BIT, - VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL}, - // THSVS_ACCESS_GEOMETRY_SHADER_READ_OTHER - { VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT, - VK_ACCESS_SHADER_READ_BIT, - VK_IMAGE_LAYOUT_GENERAL}, - - // THSVS_ACCESS_TASK_SHADER_READ_UNIFORM_BUFFER_NV - { VK_PIPELINE_STAGE_TASK_SHADER_BIT_NV, - VK_ACCESS_UNIFORM_READ_BIT, - VK_IMAGE_LAYOUT_UNDEFINED}, - // THSVS_ACCESS_TASK_SHADER_READ_SAMPLED_IMAGE_OR_UNIFORM_TEXEL_BUFFER_NV - { VK_PIPELINE_STAGE_TASK_SHADER_BIT_NV, - VK_ACCESS_SHADER_READ_BIT, - VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL}, - // THSVS_ACCESS_TASK_SHADER_READ_OTHER_NV - { VK_PIPELINE_STAGE_TASK_SHADER_BIT_NV, - VK_ACCESS_SHADER_READ_BIT, - VK_IMAGE_LAYOUT_GENERAL}, - - // THSVS_ACCESS_MESH_SHADER_READ_UNIFORM_BUFFER_NV - { VK_PIPELINE_STAGE_MESH_SHADER_BIT_NV, - VK_ACCESS_UNIFORM_READ_BIT, - VK_IMAGE_LAYOUT_UNDEFINED}, - // THSVS_ACCESS_MESH_SHADER_READ_SAMPLED_IMAGE_OR_UNIFORM_TEXEL_BUFFER_NV - { VK_PIPELINE_STAGE_MESH_SHADER_BIT_NV, - VK_ACCESS_SHADER_READ_BIT, - VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL}, - // THSVS_ACCESS_MESH_SHADER_READ_OTHER_NV - { VK_PIPELINE_STAGE_MESH_SHADER_BIT_NV, - VK_ACCESS_SHADER_READ_BIT, - VK_IMAGE_LAYOUT_GENERAL}, - - // THSVS_ACCESS_TRANSFORM_FEEDBACK_COUNTER_READ_EXT - { VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT, - VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT, - VK_IMAGE_LAYOUT_UNDEFINED}, - // THSVS_ACCESS_FRAGMENT_DENSITY_MAP_READ_EXT - { VK_PIPELINE_STAGE_FRAGMENT_DENSITY_PROCESS_BIT_EXT, - VK_ACCESS_FRAGMENT_DENSITY_MAP_READ_BIT_EXT, - VK_IMAGE_LAYOUT_FRAGMENT_DENSITY_MAP_OPTIMAL_EXT}, - // THSVS_ACCESS_SHADING_RATE_READ_NV - { VK_PIPELINE_STAGE_SHADING_RATE_IMAGE_BIT_NV, - VK_ACCESS_SHADING_RATE_IMAGE_READ_BIT_NV, - VK_IMAGE_LAYOUT_SHADING_RATE_OPTIMAL_NV}, - - // THSVS_ACCESS_FRAGMENT_SHADER_READ_UNIFORM_BUFFER - { VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, - VK_ACCESS_UNIFORM_READ_BIT, - VK_IMAGE_LAYOUT_UNDEFINED}, - // THSVS_ACCESS_FRAGMENT_SHADER_READ_SAMPLED_IMAGE_OR_UNIFORM_TEXEL_BUFFER - { VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, - VK_ACCESS_SHADER_READ_BIT, - VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL}, - // THSVS_ACCESS_FRAGMENT_SHADER_READ_COLOR_INPUT_ATTACHMENT - { VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, - VK_ACCESS_INPUT_ATTACHMENT_READ_BIT, - VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL}, - // THSVS_ACCESS_FRAGMENT_SHADER_READ_DEPTH_STENCIL_INPUT_ATTACHMENT - { VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, - VK_ACCESS_INPUT_ATTACHMENT_READ_BIT, - VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL}, - // THSVS_ACCESS_FRAGMENT_SHADER_READ_OTHER - { VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, - VK_ACCESS_SHADER_READ_BIT, - VK_IMAGE_LAYOUT_GENERAL}, - // THSVS_ACCESS_COLOR_ATTACHMENT_READ - { VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, - VK_ACCESS_COLOR_ATTACHMENT_READ_BIT, - VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL}, - // THSVS_ACCESS_COLOR_ATTACHMENT_ADVANCED_BLENDING_EXT - { VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, - VK_ACCESS_COLOR_ATTACHMENT_READ_NONCOHERENT_BIT_EXT, - VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL}, - // THSVS_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ - { VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT, - VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT, - VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL}, - - // THSVS_ACCESS_COMPUTE_SHADER_READ_UNIFORM_BUFFER - { VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_ACCESS_UNIFORM_READ_BIT, - VK_IMAGE_LAYOUT_UNDEFINED}, - // THSVS_ACCESS_COMPUTE_SHADER_READ_SAMPLED_IMAGE_OR_UNIFORM_TEXEL_BUFFER - { VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_ACCESS_SHADER_READ_BIT, - VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL}, - // THSVS_ACCESS_COMPUTE_SHADER_READ_OTHER - { VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_ACCESS_SHADER_READ_BIT, - VK_IMAGE_LAYOUT_GENERAL}, - - // THSVS_ACCESS_ANY_SHADER_READ_UNIFORM_BUFFER - { VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, - VK_ACCESS_UNIFORM_READ_BIT, - VK_IMAGE_LAYOUT_UNDEFINED}, - // THSVS_ACCESS_ANY_SHADER_READ_UNIFORM_BUFFER_OR_VERTEX_BUFFER - { VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, - VK_ACCESS_UNIFORM_READ_BIT | VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT, - VK_IMAGE_LAYOUT_UNDEFINED}, - // THSVS_ACCESS_ANY_SHADER_READ_SAMPLED_IMAGE - { VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, - VK_ACCESS_SHADER_READ_BIT, - VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL}, - // THSVS_ACCESS_ANY_SHADER_READ_OTHER - { VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, - VK_ACCESS_SHADER_READ_BIT, - VK_IMAGE_LAYOUT_GENERAL}, - - // THSVS_ACCESS_TRANSFER_READ - { VK_PIPELINE_STAGE_TRANSFER_BIT, - VK_ACCESS_TRANSFER_READ_BIT, - VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL}, - // THSVS_ACCESS_HOST_READ - { VK_PIPELINE_STAGE_HOST_BIT, - VK_ACCESS_HOST_READ_BIT, - VK_IMAGE_LAYOUT_GENERAL}, - // THSVS_ACCESS_PRESENT - { 0, - 0, - VK_IMAGE_LAYOUT_PRESENT_SRC_KHR}, - // THSVS_ACCESS_CONDITIONAL_RENDERING_READ_EXT - { VK_PIPELINE_STAGE_CONDITIONAL_RENDERING_BIT_EXT, - VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT, - VK_IMAGE_LAYOUT_UNDEFINED}, - - // THSVS_ACCESS_RAY_TRACING_SHADER_ACCELERATION_STRUCTURE_READ_NV - { VK_PIPELINE_STAGE_RAY_TRACING_SHADER_BIT_NV, - VK_ACCESS_ACCELERATION_STRUCTURE_READ_BIT_NV, - VK_IMAGE_LAYOUT_UNDEFINED}, - // THSVS_ACCESS_ACCELERATION_STRUCTURE_BUILD_READ_NV - { VK_PIPELINE_STAGE_ACCELERATION_STRUCTURE_BUILD_BIT_NV, - VK_ACCESS_ACCELERATION_STRUCTURE_READ_BIT_NV, - VK_IMAGE_LAYOUT_UNDEFINED}, - // THSVS_END_OF_READ_ACCESS - { 0, - 0, - VK_IMAGE_LAYOUT_UNDEFINED}, - -// Write access - // THSVS_ACCESS_COMMAND_BUFFER_WRITE_NV - { VK_PIPELINE_STAGE_COMMAND_PREPROCESS_BIT_NV, - VK_ACCESS_COMMAND_PREPROCESS_WRITE_BIT_NV, - VK_IMAGE_LAYOUT_UNDEFINED}, - // THSVS_ACCESS_VERTEX_SHADER_WRITE - { VK_PIPELINE_STAGE_VERTEX_SHADER_BIT, - VK_ACCESS_SHADER_WRITE_BIT, - VK_IMAGE_LAYOUT_GENERAL}, - // THSVS_ACCESS_TESSELLATION_CONTROL_SHADER_WRITE - { VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT, - VK_ACCESS_SHADER_WRITE_BIT, - VK_IMAGE_LAYOUT_GENERAL}, - // THSVS_ACCESS_TESSELLATION_EVALUATION_SHADER_WRITE - { VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT, - VK_ACCESS_SHADER_WRITE_BIT, - VK_IMAGE_LAYOUT_GENERAL}, - // THSVS_ACCESS_GEOMETRY_SHADER_WRITE - { VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT, - VK_ACCESS_SHADER_WRITE_BIT, - VK_IMAGE_LAYOUT_GENERAL}, - // THSVS_ACCESS_TASK_SHADER_WRITE_NV - { VK_PIPELINE_STAGE_TASK_SHADER_BIT_NV, - VK_ACCESS_SHADER_WRITE_BIT, - VK_IMAGE_LAYOUT_GENERAL}, - // THSVS_ACCESS_MESH_SHADER_WRITE_NV - { VK_PIPELINE_STAGE_MESH_SHADER_BIT_NV, - VK_ACCESS_SHADER_WRITE_BIT, - VK_IMAGE_LAYOUT_GENERAL}, - // THSVS_ACCESS_TRANSFORM_FEEDBACK_WRITE_EXT - { VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT, - VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT, - VK_IMAGE_LAYOUT_UNDEFINED}, - // THSVS_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_EXT - { VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT, - VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT, - VK_IMAGE_LAYOUT_UNDEFINED}, - // THSVS_ACCESS_FRAGMENT_SHADER_WRITE - { VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, - VK_ACCESS_SHADER_WRITE_BIT, - VK_IMAGE_LAYOUT_GENERAL}, - // THSVS_ACCESS_COLOR_ATTACHMENT_WRITE - { VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, - VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT, - VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL}, - // THSVS_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE - { VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT, - VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL}, - // THSVS_ACCESS_DEPTH_ATTACHMENT_WRITE_STENCIL_READ_ONLY - { VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT, - VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT, - VK_IMAGE_LAYOUT_DEPTH_ATTACHMENT_STENCIL_READ_ONLY_OPTIMAL_KHR}, - // THSVS_ACCESS_STENCIL_ATTACHMENT_WRITE_DEPTH_READ_ONLY - { VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT, - VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT, - VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL_KHR}, - - // THSVS_ACCESS_COMPUTE_SHADER_WRITE - { VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_ACCESS_SHADER_WRITE_BIT, - VK_IMAGE_LAYOUT_GENERAL}, - - // THSVS_ACCESS_ANY_SHADER_WRITE - { VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, - VK_ACCESS_SHADER_WRITE_BIT, - VK_IMAGE_LAYOUT_GENERAL}, - - // THSVS_ACCESS_TRANSFER_WRITE - { VK_PIPELINE_STAGE_TRANSFER_BIT, - VK_ACCESS_TRANSFER_WRITE_BIT, - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL}, - // THSVS_ACCESS_HOST_PREINITIALIZED - { VK_PIPELINE_STAGE_HOST_BIT, - VK_ACCESS_HOST_WRITE_BIT, - VK_IMAGE_LAYOUT_PREINITIALIZED}, - // THSVS_ACCESS_HOST_WRITE - { VK_PIPELINE_STAGE_HOST_BIT, - VK_ACCESS_HOST_WRITE_BIT, - VK_IMAGE_LAYOUT_GENERAL}, - // THSVS_ACCESS_ACCELERATION_STRUCTURE_BUILD_WRITE_NV - { VK_PIPELINE_STAGE_ACCELERATION_STRUCTURE_BUILD_BIT_NV, - VK_ACCESS_ACCELERATION_STRUCTURE_WRITE_BIT_NV, - VK_IMAGE_LAYOUT_UNDEFINED}, - - // THSVS_ACCESS_COLOR_ATTACHMENT_READ_WRITE - { VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, - VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT, - VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL}, - // THSVS_ACCESS_GENERAL - { VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, - VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT, - VK_IMAGE_LAYOUT_GENERAL} -}; - -void thsvsGetAccessInfo( - uint32_t accessCount, - const ThsvsAccessType* pAccesses, - VkPipelineStageFlags* pStageMask, - VkAccessFlags* pAccessMask, - VkImageLayout* pImageLayout, - bool* pHasWriteAccess) -{ - *pStageMask = 0; - *pAccessMask = 0; - *pImageLayout = VK_IMAGE_LAYOUT_UNDEFINED; - *pHasWriteAccess = false; - - for (uint32_t i = 0; i < accessCount; ++i) - { - ThsvsAccessType access = pAccesses[i]; - const ThsvsVkAccessInfo* pAccessInfo = &ThsvsAccessMap[access]; - -#ifdef THSVS_ERROR_CHECK_ACCESS_TYPE_IN_RANGE - // Asserts that the previous access index is a valid range for the lookup - assert(access < THSVS_NUM_ACCESS_TYPES); -#endif - -#ifdef THSVS_ERROR_CHECK_POTENTIAL_HAZARD - // Asserts that the access is a read, else it's a write and it should appear on its own. - assert(access < THSVS_END_OF_READ_ACCESS || accessCount == 1); -#endif - - *pStageMask |= pAccessInfo->stageMask; - - if (access > THSVS_END_OF_READ_ACCESS) - *pHasWriteAccess = true; - - *pAccessMask |= pAccessInfo->accessMask; - - VkImageLayout layout = pAccessInfo->imageLayout; - -#ifdef THSVS_ERROR_CHECK_MIXED_IMAGE_LAYOUT - assert(*pImageLayout == VK_IMAGE_LAYOUT_UNDEFINED || - *pImageLayout == layout); -#endif - - *pImageLayout = layout; - } -} - -void thsvsGetVulkanMemoryBarrier( - const ThsvsGlobalBarrier& thBarrier, - VkPipelineStageFlags* pSrcStages, - VkPipelineStageFlags* pDstStages, - VkMemoryBarrier* pVkBarrier) -{ - *pSrcStages = 0; - *pDstStages = 0; - pVkBarrier->sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER; - pVkBarrier->pNext = NULL; - pVkBarrier->srcAccessMask = 0; - pVkBarrier->dstAccessMask = 0; - - for (uint32_t i = 0; i < thBarrier.prevAccessCount; ++i) - { - ThsvsAccessType prevAccess = thBarrier.pPrevAccesses[i]; - const ThsvsVkAccessInfo* pPrevAccessInfo = &ThsvsAccessMap[prevAccess]; - -#ifdef THSVS_ERROR_CHECK_ACCESS_TYPE_IN_RANGE - // Asserts that the previous access index is a valid range for the lookup - assert(prevAccess < THSVS_NUM_ACCESS_TYPES); -#endif - -#ifdef THSVS_ERROR_CHECK_POTENTIAL_HAZARD - // Asserts that the access is a read, else it's a write and it should appear on its own. - assert(prevAccess < THSVS_END_OF_READ_ACCESS || thBarrier.prevAccessCount == 1); -#endif - - *pSrcStages |= pPrevAccessInfo->stageMask; - - // Add appropriate availability operations - for writes only. - if (prevAccess > THSVS_END_OF_READ_ACCESS) - pVkBarrier->srcAccessMask |= pPrevAccessInfo->accessMask; - } - - for (uint32_t i = 0; i < thBarrier.nextAccessCount; ++i) - { - ThsvsAccessType nextAccess = thBarrier.pNextAccesses[i]; - const ThsvsVkAccessInfo* pNextAccessInfo = &ThsvsAccessMap[nextAccess]; - -#ifdef THSVS_ERROR_CHECK_ACCESS_TYPE_IN_RANGE - // Asserts that the next access index is a valid range for the lookup - assert(nextAccess < THSVS_NUM_ACCESS_TYPES); -#endif - -#ifdef THSVS_ERROR_CHECK_POTENTIAL_HAZARD - // Asserts that the access is a read, else it's a write and it should appear on its own. - assert(nextAccess < THSVS_END_OF_READ_ACCESS || thBarrier.nextAccessCount == 1); -#endif - *pDstStages |= pNextAccessInfo->stageMask; - - // Add visibility operations as necessary. - // If the src access mask is zero, this is a WAR hazard (or for some reason a "RAR"), - // so the dst access mask can be safely zeroed as these don't need visibility. - if (pVkBarrier->srcAccessMask != 0) - pVkBarrier->dstAccessMask |= pNextAccessInfo->accessMask; - } - - // Ensure that the stage masks are valid if no stages were determined - if (*pSrcStages == 0) - *pSrcStages = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; - if (*pDstStages == 0) - *pDstStages = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT; -} - -void thsvsGetVulkanBufferMemoryBarrier( - const ThsvsBufferBarrier& thBarrier, - VkPipelineStageFlags* pSrcStages, - VkPipelineStageFlags* pDstStages, - VkBufferMemoryBarrier* pVkBarrier) -{ - *pSrcStages = 0; - *pDstStages = 0; - pVkBarrier->sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; - pVkBarrier->pNext = NULL; - pVkBarrier->srcAccessMask = 0; - pVkBarrier->dstAccessMask = 0; - pVkBarrier->srcQueueFamilyIndex = thBarrier.srcQueueFamilyIndex; - pVkBarrier->dstQueueFamilyIndex = thBarrier.dstQueueFamilyIndex; - pVkBarrier->buffer = thBarrier.buffer; - pVkBarrier->offset = thBarrier.offset; - pVkBarrier->size = thBarrier.size; - -#ifdef THSVS_ERROR_CHECK_COULD_USE_GLOBAL_BARRIER - assert(pVkBarrier->srcQueueFamilyIndex != pVkBarrier->dstQueueFamilyIndex); -#endif - - for (uint32_t i = 0; i < thBarrier.prevAccessCount; ++i) - { - ThsvsAccessType prevAccess = thBarrier.pPrevAccesses[i]; - const ThsvsVkAccessInfo* pPrevAccessInfo = &ThsvsAccessMap[prevAccess]; - -#ifdef THSVS_ERROR_CHECK_ACCESS_TYPE_IN_RANGE - // Asserts that the previous access index is a valid range for the lookup - assert(prevAccess < THSVS_NUM_ACCESS_TYPES); -#endif - -#ifdef THSVS_ERROR_CHECK_POTENTIAL_HAZARD - // Asserts that the access is a read, else it's a write and it should appear on its own. - assert(prevAccess < THSVS_END_OF_READ_ACCESS || thBarrier.prevAccessCount == 1); -#endif - - *pSrcStages |= pPrevAccessInfo->stageMask; - - // Add appropriate availability operations - for writes only. - if (prevAccess > THSVS_END_OF_READ_ACCESS) - pVkBarrier->srcAccessMask |= pPrevAccessInfo->accessMask; - } - - for (uint32_t i = 0; i < thBarrier.nextAccessCount; ++i) - { - ThsvsAccessType nextAccess = thBarrier.pNextAccesses[i]; - const ThsvsVkAccessInfo* pNextAccessInfo = &ThsvsAccessMap[nextAccess]; - -#ifdef THSVS_ERROR_CHECK_ACCESS_TYPE_IN_RANGE - // Asserts that the next access index is a valid range for the lookup - assert(nextAccess < THSVS_NUM_ACCESS_TYPES); -#endif - -#ifdef THSVS_ERROR_CHECK_POTENTIAL_HAZARD - // Asserts that the access is a read, else it's a write and it should appear on its own. - assert(nextAccess < THSVS_END_OF_READ_ACCESS || thBarrier.nextAccessCount == 1); -#endif - - *pDstStages |= pNextAccessInfo->stageMask; - - // Add visibility operations as necessary. - // If the src access mask is zero, this is a WAR hazard (or for some reason a "RAR"), - // so the dst access mask can be safely zeroed as these don't need visibility. - if (pVkBarrier->srcAccessMask != 0) - pVkBarrier->dstAccessMask |= pNextAccessInfo->accessMask; - } - - // Ensure that the stage masks are valid if no stages were determined - if (*pSrcStages == 0) - *pSrcStages = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; - if (*pDstStages == 0) - *pDstStages = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT; -} - -void thsvsGetVulkanImageMemoryBarrier( - const ThsvsImageBarrier& thBarrier, - VkPipelineStageFlags* pSrcStages, - VkPipelineStageFlags* pDstStages, - VkImageMemoryBarrier* pVkBarrier) -{ - *pSrcStages = 0; - *pDstStages = 0; - pVkBarrier->sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; - pVkBarrier->pNext = NULL; - pVkBarrier->srcAccessMask = 0; - pVkBarrier->dstAccessMask = 0; - pVkBarrier->srcQueueFamilyIndex = thBarrier.srcQueueFamilyIndex; - pVkBarrier->dstQueueFamilyIndex = thBarrier.dstQueueFamilyIndex; - pVkBarrier->image = thBarrier.image; - pVkBarrier->subresourceRange = thBarrier.subresourceRange; - pVkBarrier->oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; - pVkBarrier->newLayout = VK_IMAGE_LAYOUT_UNDEFINED; - - for (uint32_t i = 0; i < thBarrier.prevAccessCount; ++i) - { - ThsvsAccessType prevAccess = thBarrier.pPrevAccesses[i]; - const ThsvsVkAccessInfo* pPrevAccessInfo = &ThsvsAccessMap[prevAccess]; - -#ifdef THSVS_ERROR_CHECK_ACCESS_TYPE_IN_RANGE - // Asserts that the previous access index is a valid range for the lookup - assert(prevAccess < THSVS_NUM_ACCESS_TYPES); -#endif - -#ifdef THSVS_ERROR_CHECK_POTENTIAL_HAZARD - // Asserts that the access is a read, else it's a write and it should appear on its own. - assert(prevAccess < THSVS_END_OF_READ_ACCESS || thBarrier.prevAccessCount == 1); -#endif - - *pSrcStages |= pPrevAccessInfo->stageMask; - - // Add appropriate availability operations - for writes only. - if (prevAccess > THSVS_END_OF_READ_ACCESS) - pVkBarrier->srcAccessMask |= pPrevAccessInfo->accessMask; - - if (thBarrier.discardContents == VK_TRUE) - { - pVkBarrier->oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; - } - else - { - VkImageLayout layout = VK_IMAGE_LAYOUT_UNDEFINED; - - switch(thBarrier.prevLayout) - { - case THSVS_IMAGE_LAYOUT_GENERAL: - if (prevAccess == THSVS_ACCESS_PRESENT) - layout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR; - else - layout = VK_IMAGE_LAYOUT_GENERAL; - break; - case THSVS_IMAGE_LAYOUT_OPTIMAL: - layout = pPrevAccessInfo->imageLayout; - break; - case THSVS_IMAGE_LAYOUT_GENERAL_AND_PRESENTATION: - layout = VK_IMAGE_LAYOUT_SHARED_PRESENT_KHR; - break; - } - - -#ifdef THSVS_ERROR_CHECK_MIXED_IMAGE_LAYOUT - assert(pVkBarrier->oldLayout == VK_IMAGE_LAYOUT_UNDEFINED || - pVkBarrier->oldLayout == layout); -#endif - pVkBarrier->oldLayout = layout; - } - } - - for (uint32_t i = 0; i < thBarrier.nextAccessCount; ++i) - { - ThsvsAccessType nextAccess = thBarrier.pNextAccesses[i]; - const ThsvsVkAccessInfo* pNextAccessInfo = &ThsvsAccessMap[nextAccess]; - -#ifdef THSVS_ERROR_CHECK_ACCESS_TYPE_IN_RANGE - // Asserts that the next access index is a valid range for the lookup - assert(nextAccess < THSVS_NUM_ACCESS_TYPES); -#endif - -#ifdef THSVS_ERROR_CHECK_POTENTIAL_HAZARD - // Asserts that the access is a read, else it's a write and it should appear on its own. - assert(nextAccess < THSVS_END_OF_READ_ACCESS || thBarrier.nextAccessCount == 1); -#endif - - *pDstStages |= pNextAccessInfo->stageMask; - - // Add visibility operations as necessary. - // If the src access mask is zero, this is a WAR hazard (or for some reason a "RAR"), - // so the dst access mask can be safely zeroed as these don't need visibility. - if (pVkBarrier->srcAccessMask != 0) - pVkBarrier->dstAccessMask |= pNextAccessInfo->accessMask; - - VkImageLayout layout = VK_IMAGE_LAYOUT_UNDEFINED; - switch(thBarrier.nextLayout) - { - case THSVS_IMAGE_LAYOUT_GENERAL: - if (nextAccess == THSVS_ACCESS_PRESENT) - layout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR; - else - layout = VK_IMAGE_LAYOUT_GENERAL; - break; - case THSVS_IMAGE_LAYOUT_OPTIMAL: - layout = pNextAccessInfo->imageLayout; - break; - case THSVS_IMAGE_LAYOUT_GENERAL_AND_PRESENTATION: - layout = VK_IMAGE_LAYOUT_SHARED_PRESENT_KHR; - break; - } - -#ifdef THSVS_ERROR_CHECK_MIXED_IMAGE_LAYOUT - assert(pVkBarrier->newLayout == VK_IMAGE_LAYOUT_UNDEFINED || - pVkBarrier->newLayout == layout); -#endif - pVkBarrier->newLayout = layout; - } - -#ifdef THSVS_ERROR_CHECK_COULD_USE_GLOBAL_BARRIER - assert(pVkBarrier->newLayout != pVkBarrier->oldLayout || - pVkBarrier->srcQueueFamilyIndex != pVkBarrier->dstQueueFamilyIndex); -#endif - - // Ensure that the stage masks are valid if no stages were determined - if (*pSrcStages == 0) - *pSrcStages = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; - if (*pDstStages == 0) - *pDstStages = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT; -} - -void thsvsCmdPipelineBarrier( - VkCommandBuffer commandBuffer, - const ThsvsGlobalBarrier* pGlobalBarrier, - uint32_t bufferBarrierCount, - const ThsvsBufferBarrier* pBufferBarriers, - uint32_t imageBarrierCount, - const ThsvsImageBarrier* pImageBarriers) -{ - VkMemoryBarrier memoryBarrier; - // Vulkan pipeline barrier command parameters - // commandBuffer; - VkPipelineStageFlags srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; - VkPipelineStageFlags dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT; - uint32_t memoryBarrierCount = (pGlobalBarrier != NULL) ? 1 : 0; - VkMemoryBarrier* pMemoryBarriers = (pGlobalBarrier != NULL) ? &memoryBarrier : NULL; - uint32_t bufferMemoryBarrierCount = bufferBarrierCount; - VkBufferMemoryBarrier* pBufferMemoryBarriers = NULL; - uint32_t imageMemoryBarrierCount = imageBarrierCount; - VkImageMemoryBarrier* pImageMemoryBarriers = NULL; - - // Global memory barrier - if (pGlobalBarrier != NULL) - { - VkPipelineStageFlags tempSrcStageMask = 0; - VkPipelineStageFlags tempDstStageMask = 0; - thsvsGetVulkanMemoryBarrier(*pGlobalBarrier, &tempSrcStageMask, &tempDstStageMask, pMemoryBarriers); - srcStageMask |= tempSrcStageMask; - dstStageMask |= tempDstStageMask; - } - - // Buffer memory barriers - if (bufferBarrierCount > 0) - { - pBufferMemoryBarriers = (VkBufferMemoryBarrier*)THSVS_TEMP_ALLOC(sizeof(VkBufferMemoryBarrier) * bufferMemoryBarrierCount); - - VkPipelineStageFlags tempSrcStageMask = 0; - VkPipelineStageFlags tempDstStageMask = 0; - for (uint32_t i = 0; i < bufferBarrierCount; ++i) - { - thsvsGetVulkanBufferMemoryBarrier(pBufferBarriers[i], &tempSrcStageMask, &tempDstStageMask, &pBufferMemoryBarriers[i]); - srcStageMask |= tempSrcStageMask; - dstStageMask |= tempDstStageMask; - } - } - - // Image memory barriers - if (imageBarrierCount > 0) - { - pImageMemoryBarriers = (VkImageMemoryBarrier*)THSVS_TEMP_ALLOC(sizeof(VkImageMemoryBarrier) * imageMemoryBarrierCount); - - VkPipelineStageFlags tempSrcStageMask = 0; - VkPipelineStageFlags tempDstStageMask = 0; - for (uint32_t i = 0; i < imageBarrierCount; ++i) - { - thsvsGetVulkanImageMemoryBarrier(pImageBarriers[i], &tempSrcStageMask, &tempDstStageMask, &pImageMemoryBarriers[i]); - srcStageMask |= tempSrcStageMask; - dstStageMask |= tempDstStageMask; - } - } - - vkCmdPipelineBarrier( - commandBuffer, - srcStageMask, - dstStageMask, - 0, - memoryBarrierCount, - pMemoryBarriers, - bufferMemoryBarrierCount, - pBufferMemoryBarriers, - imageMemoryBarrierCount, - pImageMemoryBarriers); - - THSVS_TEMP_FREE(pBufferMemoryBarriers); - THSVS_TEMP_FREE(pImageMemoryBarriers); -} - -void thsvsCmdSetEvent( - VkCommandBuffer commandBuffer, - VkEvent event, - uint32_t prevAccessCount, - const ThsvsAccessType* pPrevAccesses) -{ - VkPipelineStageFlags stageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; - - for (uint32_t i = 0; i < prevAccessCount; ++i) - { - ThsvsAccessType prevAccess = pPrevAccesses[i]; - const ThsvsVkAccessInfo* pPrevAccessInfo = &ThsvsAccessMap[prevAccess]; - -#ifdef THSVS_ERROR_CHECK_ACCESS_TYPE_IN_RANGE - // Asserts that the previous access index is a valid range for the lookup - assert(prevAccess < THSVS_NUM_ACCESS_TYPES); -#endif - - stageMask |= pPrevAccessInfo->stageMask; - } - - vkCmdSetEvent( - commandBuffer, - event, - stageMask); -} - -void thsvsCmdResetEvent( - VkCommandBuffer commandBuffer, - VkEvent event, - uint32_t prevAccessCount, - const ThsvsAccessType* pPrevAccesses) -{ - VkPipelineStageFlags stageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; - - for (uint32_t i = 0; i < prevAccessCount; ++i) - { - ThsvsAccessType prevAccess = pPrevAccesses[i]; - const ThsvsVkAccessInfo* pPrevAccessInfo = &ThsvsAccessMap[prevAccess]; - -#ifdef THSVS_ERROR_CHECK_ACCESS_TYPE_IN_RANGE - // Asserts that the previous access index is a valid range for the lookup - assert(prevAccess < THSVS_NUM_ACCESS_TYPES); -#endif - - stageMask |= pPrevAccessInfo->stageMask; - } - - vkCmdResetEvent( - commandBuffer, - event, - stageMask); -} - -void thsvsCmdWaitEvents( - VkCommandBuffer commandBuffer, - uint32_t eventCount, - const VkEvent* pEvents, - const ThsvsGlobalBarrier* pGlobalBarrier, - uint32_t bufferBarrierCount, - const ThsvsBufferBarrier* pBufferBarriers, - uint32_t imageBarrierCount, - const ThsvsImageBarrier* pImageBarriers) -{ - VkMemoryBarrier memoryBarrier; - // Vulkan pipeline barrier command parameters - // commandBuffer; - // eventCount; - // pEvents; - VkPipelineStageFlags srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; - VkPipelineStageFlags dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT; - uint32_t memoryBarrierCount = (pGlobalBarrier != NULL) ? 1 : 0; - VkMemoryBarrier* pMemoryBarriers = (pGlobalBarrier != NULL) ? &memoryBarrier : NULL; - uint32_t bufferMemoryBarrierCount = bufferBarrierCount; - VkBufferMemoryBarrier* pBufferMemoryBarriers = NULL; - uint32_t imageMemoryBarrierCount = imageBarrierCount; - VkImageMemoryBarrier* pImageMemoryBarriers = NULL; - - // Global memory barrier - if (pGlobalBarrier != NULL) - { - VkPipelineStageFlags tempSrcStageMask = 0; - VkPipelineStageFlags tempDstStageMask = 0; - thsvsGetVulkanMemoryBarrier(*pGlobalBarrier, &tempSrcStageMask, &tempDstStageMask, pMemoryBarriers); - srcStageMask |= tempSrcStageMask; - dstStageMask |= tempDstStageMask; - } - - // Buffer memory barriers - if (bufferBarrierCount > 0) - { - pBufferMemoryBarriers = (VkBufferMemoryBarrier*)THSVS_TEMP_ALLOC(sizeof(VkBufferMemoryBarrier) * bufferMemoryBarrierCount); - - VkPipelineStageFlags tempSrcStageMask = 0; - VkPipelineStageFlags tempDstStageMask = 0; - for (uint32_t i = 0; i < bufferBarrierCount; ++i) - { - thsvsGetVulkanBufferMemoryBarrier(pBufferBarriers[i], &tempSrcStageMask, &tempDstStageMask, &pBufferMemoryBarriers[i]); - srcStageMask |= tempSrcStageMask; - dstStageMask |= tempDstStageMask; - } - } - - // Image memory barriers - if (imageBarrierCount > 0) - { - pImageMemoryBarriers = (VkImageMemoryBarrier*)THSVS_TEMP_ALLOC(sizeof(VkImageMemoryBarrier) * imageMemoryBarrierCount); - - VkPipelineStageFlags tempSrcStageMask = 0; - VkPipelineStageFlags tempDstStageMask = 0; - for (uint32_t i = 0; i < imageBarrierCount; ++i) - { - thsvsGetVulkanImageMemoryBarrier(pImageBarriers[i], &tempSrcStageMask, &tempDstStageMask, &pImageMemoryBarriers[i]); - srcStageMask |= tempSrcStageMask; - dstStageMask |= tempDstStageMask; - } - } - - vkCmdWaitEvents( - commandBuffer, - eventCount, - pEvents, - srcStageMask, - dstStageMask, - memoryBarrierCount, - pMemoryBarriers, - bufferMemoryBarrierCount, - pBufferMemoryBarriers, - imageMemoryBarrierCount, - pImageMemoryBarriers); - - THSVS_TEMP_FREE(pBufferMemoryBarriers); - THSVS_TEMP_FREE(pImageMemoryBarriers); -} - -#endif // THSVS_SIMPLER_VULKAN_SYNCHRONIZATION_IMPLEMENTATION diff --git a/contrib/xxhash/LICENSE b/contrib/xxhash/LICENSE deleted file mode 100644 index e4c5da7..0000000 --- a/contrib/xxhash/LICENSE +++ /dev/null @@ -1,26 +0,0 @@ -xxHash Library -Copyright (c) 2012-2021 Yann Collet -All rights reserved. - -BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) - -Redistribution and use in source and binary forms, with or without modification, -are permitted provided that the following conditions are met: - -* Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - -* Redistributions in binary form must reproduce the above copyright notice, this - list of conditions and the following disclaimer in the documentation and/or - other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/contrib/xxhash/xxhash.c b/contrib/xxhash/xxhash.c deleted file mode 100644 index 083b039..0000000 --- a/contrib/xxhash/xxhash.c +++ /dev/null @@ -1,43 +0,0 @@ -/* - * xxHash - Extremely Fast Hash algorithm - * Copyright (C) 2012-2021 Yann Collet - * - * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following disclaimer - * in the documentation and/or other materials provided with the - * distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * You can contact the author at: - * - xxHash homepage: https://www.xxhash.com - * - xxHash source repository: https://github.com/Cyan4973/xxHash - */ - - -/* - * xxhash.c instantiates functions defined in xxhash.h - */ - -#define XXH_STATIC_LINKING_ONLY /* access advanced declarations */ -#define XXH_IMPLEMENTATION /* access definitions */ - -#include "xxhash.h" diff --git a/contrib/xxhash/xxhash.h b/contrib/xxhash/xxhash.h deleted file mode 100644 index a18e8c7..0000000 --- a/contrib/xxhash/xxhash.h +++ /dev/null @@ -1,6773 +0,0 @@ -/* - * xxHash - Extremely Fast Hash algorithm - * Header File - * Copyright (C) 2012-2021 Yann Collet - * - * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following disclaimer - * in the documentation and/or other materials provided with the - * distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * You can contact the author at: - * - xxHash homepage: https://www.xxhash.com - * - xxHash source repository: https://github.com/Cyan4973/xxHash - */ - -/*! - * @mainpage xxHash - * - * xxHash is an extremely fast non-cryptographic hash algorithm, working at RAM speed - * limits. - * - * It is proposed in four flavors, in three families: - * 1. @ref XXH32_family - * - Classic 32-bit hash function. Simple, compact, and runs on almost all - * 32-bit and 64-bit systems. - * 2. @ref XXH64_family - * - Classic 64-bit adaptation of XXH32. Just as simple, and runs well on most - * 64-bit systems (but _not_ 32-bit systems). - * 3. @ref XXH3_family - * - Modern 64-bit and 128-bit hash function family which features improved - * strength and performance across the board, especially on smaller data. - * It benefits greatly from SIMD and 64-bit without requiring it. - * - * Benchmarks - * --- - * The reference system uses an Intel i7-9700K CPU, and runs Ubuntu x64 20.04. - * The open source benchmark program is compiled with clang v10.0 using -O3 flag. - * - * | Hash Name | ISA ext | Width | Large Data Speed | Small Data Velocity | - * | -------------------- | ------- | ----: | ---------------: | ------------------: | - * | XXH3_64bits() | @b AVX2 | 64 | 59.4 GB/s | 133.1 | - * | MeowHash | AES-NI | 128 | 58.2 GB/s | 52.5 | - * | XXH3_128bits() | @b AVX2 | 128 | 57.9 GB/s | 118.1 | - * | CLHash | PCLMUL | 64 | 37.1 GB/s | 58.1 | - * | XXH3_64bits() | @b SSE2 | 64 | 31.5 GB/s | 133.1 | - * | XXH3_128bits() | @b SSE2 | 128 | 29.6 GB/s | 118.1 | - * | RAM sequential read | | N/A | 28.0 GB/s | N/A | - * | ahash | AES-NI | 64 | 22.5 GB/s | 107.2 | - * | City64 | | 64 | 22.0 GB/s | 76.6 | - * | T1ha2 | | 64 | 22.0 GB/s | 99.0 | - * | City128 | | 128 | 21.7 GB/s | 57.7 | - * | FarmHash | AES-NI | 64 | 21.3 GB/s | 71.9 | - * | XXH64() | | 64 | 19.4 GB/s | 71.0 | - * | SpookyHash | | 64 | 19.3 GB/s | 53.2 | - * | Mum | | 64 | 18.0 GB/s | 67.0 | - * | CRC32C | SSE4.2 | 32 | 13.0 GB/s | 57.9 | - * | XXH32() | | 32 | 9.7 GB/s | 71.9 | - * | City32 | | 32 | 9.1 GB/s | 66.0 | - * | Blake3* | @b AVX2 | 256 | 4.4 GB/s | 8.1 | - * | Murmur3 | | 32 | 3.9 GB/s | 56.1 | - * | SipHash* | | 64 | 3.0 GB/s | 43.2 | - * | Blake3* | @b SSE2 | 256 | 2.4 GB/s | 8.1 | - * | HighwayHash | | 64 | 1.4 GB/s | 6.0 | - * | FNV64 | | 64 | 1.2 GB/s | 62.7 | - * | Blake2* | | 256 | 1.1 GB/s | 5.1 | - * | SHA1* | | 160 | 0.8 GB/s | 5.6 | - * | MD5* | | 128 | 0.6 GB/s | 7.8 | - * @note - * - Hashes which require a specific ISA extension are noted. SSE2 is also noted, - * even though it is mandatory on x64. - * - Hashes with an asterisk are cryptographic. Note that MD5 is non-cryptographic - * by modern standards. - * - Small data velocity is a rough average of algorithm's efficiency for small - * data. For more accurate information, see the wiki. - * - More benchmarks and strength tests are found on the wiki: - * https://github.com/Cyan4973/xxHash/wiki - * - * Usage - * ------ - * All xxHash variants use a similar API. Changing the algorithm is a trivial - * substitution. - * - * @pre - * For functions which take an input and length parameter, the following - * requirements are assumed: - * - The range from [`input`, `input + length`) is valid, readable memory. - * - The only exception is if the `length` is `0`, `input` may be `NULL`. - * - For C++, the objects must have the *TriviallyCopyable* property, as the - * functions access bytes directly as if it was an array of `unsigned char`. - * - * @anchor single_shot_example - * **Single Shot** - * - * These functions are stateless functions which hash a contiguous block of memory, - * immediately returning the result. They are the easiest and usually the fastest - * option. - * - * XXH32(), XXH64(), XXH3_64bits(), XXH3_128bits() - * - * @code{.c} - * #include - * #include "xxhash.h" - * - * // Example for a function which hashes a null terminated string with XXH32(). - * XXH32_hash_t hash_string(const char* string, XXH32_hash_t seed) - * { - * // NULL pointers are only valid if the length is zero - * size_t length = (string == NULL) ? 0 : strlen(string); - * return XXH32(string, length, seed); - * } - * @endcode - * - * @anchor streaming_example - * **Streaming** - * - * These groups of functions allow incremental hashing of unknown size, even - * more than what would fit in a size_t. - * - * XXH32_reset(), XXH64_reset(), XXH3_64bits_reset(), XXH3_128bits_reset() - * - * @code{.c} - * #include - * #include - * #include "xxhash.h" - * // Example for a function which hashes a FILE incrementally with XXH3_64bits(). - * XXH64_hash_t hashFile(FILE* f) - * { - * // Allocate a state struct. Do not just use malloc() or new. - * XXH3_state_t* state = XXH3_createState(); - * assert(state != NULL && "Out of memory!"); - * // Reset the state to start a new hashing session. - * XXH3_64bits_reset(state); - * char buffer[4096]; - * size_t count; - * // Read the file in chunks - * while ((count = fread(buffer, 1, sizeof(buffer), f)) != 0) { - * // Run update() as many times as necessary to process the data - * XXH3_64bits_update(state, buffer, count); - * } - * // Retrieve the finalized hash. This will not change the state. - * XXH64_hash_t result = XXH3_64bits_digest(state); - * // Free the state. Do not use free(). - * XXH3_freeState(state); - * return result; - * } - * @endcode - * - * @file xxhash.h - * xxHash prototypes and implementation - */ - -#if defined (__cplusplus) -extern "C" { -#endif - -/* **************************** - * INLINE mode - ******************************/ -/*! - * @defgroup public Public API - * Contains details on the public xxHash functions. - * @{ - */ -#ifdef XXH_DOXYGEN -/*! - * @brief Gives access to internal state declaration, required for static allocation. - * - * Incompatible with dynamic linking, due to risks of ABI changes. - * - * Usage: - * @code{.c} - * #define XXH_STATIC_LINKING_ONLY - * #include "xxhash.h" - * @endcode - */ -# define XXH_STATIC_LINKING_ONLY -/* Do not undef XXH_STATIC_LINKING_ONLY for Doxygen */ - -/*! - * @brief Gives access to internal definitions. - * - * Usage: - * @code{.c} - * #define XXH_STATIC_LINKING_ONLY - * #define XXH_IMPLEMENTATION - * #include "xxhash.h" - * @endcode - */ -# define XXH_IMPLEMENTATION -/* Do not undef XXH_IMPLEMENTATION for Doxygen */ - -/*! - * @brief Exposes the implementation and marks all functions as `inline`. - * - * Use these build macros to inline xxhash into the target unit. - * Inlining improves performance on small inputs, especially when the length is - * expressed as a compile-time constant: - * - * https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html - * - * It also keeps xxHash symbols private to the unit, so they are not exported. - * - * Usage: - * @code{.c} - * #define XXH_INLINE_ALL - * #include "xxhash.h" - * @endcode - * Do not compile and link xxhash.o as a separate object, as it is not useful. - */ -# define XXH_INLINE_ALL -# undef XXH_INLINE_ALL -/*! - * @brief Exposes the implementation without marking functions as inline. - */ -# define XXH_PRIVATE_API -# undef XXH_PRIVATE_API -/*! - * @brief Emulate a namespace by transparently prefixing all symbols. - * - * If you want to include _and expose_ xxHash functions from within your own - * library, but also want to avoid symbol collisions with other libraries which - * may also include xxHash, you can use @ref XXH_NAMESPACE to automatically prefix - * any public symbol from xxhash library with the value of @ref XXH_NAMESPACE - * (therefore, avoid empty or numeric values). - * - * Note that no change is required within the calling program as long as it - * includes `xxhash.h`: Regular symbol names will be automatically translated - * by this header. - */ -# define XXH_NAMESPACE /* YOUR NAME HERE */ -# undef XXH_NAMESPACE -#endif - -#if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) \ - && !defined(XXH_INLINE_ALL_31684351384) - /* this section should be traversed only once */ -# define XXH_INLINE_ALL_31684351384 - /* give access to the advanced API, required to compile implementations */ -# undef XXH_STATIC_LINKING_ONLY /* avoid macro redef */ -# define XXH_STATIC_LINKING_ONLY - /* make all functions private */ -# undef XXH_PUBLIC_API -# if defined(__GNUC__) -# define XXH_PUBLIC_API static __inline __attribute__((unused)) -# elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) -# define XXH_PUBLIC_API static inline -# elif defined(_MSC_VER) -# define XXH_PUBLIC_API static __inline -# else - /* note: this version may generate warnings for unused static functions */ -# define XXH_PUBLIC_API static -# endif - - /* - * This part deals with the special case where a unit wants to inline xxHash, - * but "xxhash.h" has previously been included without XXH_INLINE_ALL, - * such as part of some previously included *.h header file. - * Without further action, the new include would just be ignored, - * and functions would effectively _not_ be inlined (silent failure). - * The following macros solve this situation by prefixing all inlined names, - * avoiding naming collision with previous inclusions. - */ - /* Before that, we unconditionally #undef all symbols, - * in case they were already defined with XXH_NAMESPACE. - * They will then be redefined for XXH_INLINE_ALL - */ -# undef XXH_versionNumber - /* XXH32 */ -# undef XXH32 -# undef XXH32_createState -# undef XXH32_freeState -# undef XXH32_reset -# undef XXH32_update -# undef XXH32_digest -# undef XXH32_copyState -# undef XXH32_canonicalFromHash -# undef XXH32_hashFromCanonical - /* XXH64 */ -# undef XXH64 -# undef XXH64_createState -# undef XXH64_freeState -# undef XXH64_reset -# undef XXH64_update -# undef XXH64_digest -# undef XXH64_copyState -# undef XXH64_canonicalFromHash -# undef XXH64_hashFromCanonical - /* XXH3_64bits */ -# undef XXH3_64bits -# undef XXH3_64bits_withSecret -# undef XXH3_64bits_withSeed -# undef XXH3_64bits_withSecretandSeed -# undef XXH3_createState -# undef XXH3_freeState -# undef XXH3_copyState -# undef XXH3_64bits_reset -# undef XXH3_64bits_reset_withSeed -# undef XXH3_64bits_reset_withSecret -# undef XXH3_64bits_update -# undef XXH3_64bits_digest -# undef XXH3_generateSecret - /* XXH3_128bits */ -# undef XXH128 -# undef XXH3_128bits -# undef XXH3_128bits_withSeed -# undef XXH3_128bits_withSecret -# undef XXH3_128bits_reset -# undef XXH3_128bits_reset_withSeed -# undef XXH3_128bits_reset_withSecret -# undef XXH3_128bits_reset_withSecretandSeed -# undef XXH3_128bits_update -# undef XXH3_128bits_digest -# undef XXH128_isEqual -# undef XXH128_cmp -# undef XXH128_canonicalFromHash -# undef XXH128_hashFromCanonical - /* Finally, free the namespace itself */ -# undef XXH_NAMESPACE - - /* employ the namespace for XXH_INLINE_ALL */ -# define XXH_NAMESPACE XXH_INLINE_ - /* - * Some identifiers (enums, type names) are not symbols, - * but they must nonetheless be renamed to avoid redeclaration. - * Alternative solution: do not redeclare them. - * However, this requires some #ifdefs, and has a more dispersed impact. - * Meanwhile, renaming can be achieved in a single place. - */ -# define XXH_IPREF(Id) XXH_NAMESPACE ## Id -# define XXH_OK XXH_IPREF(XXH_OK) -# define XXH_ERROR XXH_IPREF(XXH_ERROR) -# define XXH_errorcode XXH_IPREF(XXH_errorcode) -# define XXH32_canonical_t XXH_IPREF(XXH32_canonical_t) -# define XXH64_canonical_t XXH_IPREF(XXH64_canonical_t) -# define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t) -# define XXH32_state_s XXH_IPREF(XXH32_state_s) -# define XXH32_state_t XXH_IPREF(XXH32_state_t) -# define XXH64_state_s XXH_IPREF(XXH64_state_s) -# define XXH64_state_t XXH_IPREF(XXH64_state_t) -# define XXH3_state_s XXH_IPREF(XXH3_state_s) -# define XXH3_state_t XXH_IPREF(XXH3_state_t) -# define XXH128_hash_t XXH_IPREF(XXH128_hash_t) - /* Ensure the header is parsed again, even if it was previously included */ -# undef XXHASH_H_5627135585666179 -# undef XXHASH_H_STATIC_13879238742 -#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */ - -/* **************************************************************** - * Stable API - *****************************************************************/ -#ifndef XXHASH_H_5627135585666179 -#define XXHASH_H_5627135585666179 1 - -/*! @brief Marks a global symbol. */ -#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API) -# if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT)) -# ifdef XXH_EXPORT -# define XXH_PUBLIC_API __declspec(dllexport) -# elif XXH_IMPORT -# define XXH_PUBLIC_API __declspec(dllimport) -# endif -# else -# define XXH_PUBLIC_API /* do nothing */ -# endif -#endif - -#ifdef XXH_NAMESPACE -# define XXH_CAT(A,B) A##B -# define XXH_NAME2(A,B) XXH_CAT(A,B) -# define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber) -/* XXH32 */ -# define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32) -# define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState) -# define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState) -# define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset) -# define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update) -# define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest) -# define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState) -# define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash) -# define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical) -/* XXH64 */ -# define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64) -# define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState) -# define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState) -# define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset) -# define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update) -# define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest) -# define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState) -# define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash) -# define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical) -/* XXH3_64bits */ -# define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits) -# define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret) -# define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed) -# define XXH3_64bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecretandSeed) -# define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState) -# define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState) -# define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState) -# define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset) -# define XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed) -# define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret) -# define XXH3_64bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecretandSeed) -# define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update) -# define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest) -# define XXH3_generateSecret XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret) -# define XXH3_generateSecret_fromSeed XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret_fromSeed) -/* XXH3_128bits */ -# define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128) -# define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits) -# define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed) -# define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret) -# define XXH3_128bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecretandSeed) -# define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset) -# define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed) -# define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret) -# define XXH3_128bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecretandSeed) -# define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update) -# define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest) -# define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual) -# define XXH128_cmp XXH_NAME2(XXH_NAMESPACE, XXH128_cmp) -# define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash) -# define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical) -#endif - - -/* ************************************* -* Compiler specifics -***************************************/ - -/* specific declaration modes for Windows */ -#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API) -# if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT)) -# ifdef XXH_EXPORT -# define XXH_PUBLIC_API __declspec(dllexport) -# elif XXH_IMPORT -# define XXH_PUBLIC_API __declspec(dllimport) -# endif -# else -# define XXH_PUBLIC_API /* do nothing */ -# endif -#endif - -#if defined (__GNUC__) -# define XXH_CONSTF __attribute__((const)) -# define XXH_PUREF __attribute__((pure)) -# define XXH_MALLOCF __attribute__((malloc)) -#else -# define XXH_CONSTF /* disable */ -# define XXH_PUREF -# define XXH_MALLOCF -#endif - -/* ************************************* -* Version -***************************************/ -#define XXH_VERSION_MAJOR 0 -#define XXH_VERSION_MINOR 8 -#define XXH_VERSION_RELEASE 2 -/*! @brief Version number, encoded as two digits each */ -#define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE) - -/*! - * @brief Obtains the xxHash version. - * - * This is mostly useful when xxHash is compiled as a shared library, - * since the returned value comes from the library, as opposed to header file. - * - * @return @ref XXH_VERSION_NUMBER of the invoked library. - */ -XXH_PUBLIC_API XXH_CONSTF unsigned XXH_versionNumber (void); - - -/* **************************** -* Common basic types -******************************/ -#include /* size_t */ -/*! - * @brief Exit code for the streaming API. - */ -typedef enum { - XXH_OK = 0, /*!< OK */ - XXH_ERROR /*!< Error */ -} XXH_errorcode; - - -/*-********************************************************************** -* 32-bit hash -************************************************************************/ -#if defined(XXH_DOXYGEN) /* Don't show include */ -/*! - * @brief An unsigned 32-bit integer. - * - * Not necessarily defined to `uint32_t` but functionally equivalent. - */ -typedef uint32_t XXH32_hash_t; - -#elif !defined (__VMS) \ - && (defined (__cplusplus) \ - || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) -# include - typedef uint32_t XXH32_hash_t; - -#else -# include -# if UINT_MAX == 0xFFFFFFFFUL - typedef unsigned int XXH32_hash_t; -# elif ULONG_MAX == 0xFFFFFFFFUL - typedef unsigned long XXH32_hash_t; -# else -# error "unsupported platform: need a 32-bit type" -# endif -#endif - -/*! - * @} - * - * @defgroup XXH32_family XXH32 family - * @ingroup public - * Contains functions used in the classic 32-bit xxHash algorithm. - * - * @note - * XXH32 is useful for older platforms, with no or poor 64-bit performance. - * Note that the @ref XXH3_family provides competitive speed for both 32-bit - * and 64-bit systems, and offers true 64/128 bit hash results. - * - * @see @ref XXH64_family, @ref XXH3_family : Other xxHash families - * @see @ref XXH32_impl for implementation details - * @{ - */ - -/*! - * @brief Calculates the 32-bit hash of @p input using xxHash32. - * - * Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s - * - * See @ref single_shot_example "Single Shot Example" for an example. - * - * @param input The block of data to be hashed, at least @p length bytes in size. - * @param length The length of @p input, in bytes. - * @param seed The 32-bit seed to alter the hash's output predictably. - * - * @pre - * The memory between @p input and @p input + @p length must be valid, - * readable, contiguous memory. However, if @p length is `0`, @p input may be - * `NULL`. In C++, this also must be *TriviallyCopyable*. - * - * @return The calculated 32-bit hash value. - * - * @see - * XXH64(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128(): - * Direct equivalents for the other variants of xxHash. - * @see - * XXH32_createState(), XXH32_update(), XXH32_digest(): Streaming version. - */ -XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed); - -#ifndef XXH_NO_STREAM -/*! - * Streaming functions generate the xxHash value from an incremental input. - * This method is slower than single-call functions, due to state management. - * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized. - * - * An XXH state must first be allocated using `XXH*_createState()`. - * - * Start a new hash by initializing the state with a seed using `XXH*_reset()`. - * - * Then, feed the hash state by calling `XXH*_update()` as many times as necessary. - * - * The function returns an error code, with 0 meaning OK, and any other value - * meaning there is an error. - * - * Finally, a hash value can be produced anytime, by using `XXH*_digest()`. - * This function returns the nn-bits hash as an int or long long. - * - * It's still possible to continue inserting input into the hash state after a - * digest, and generate new hash values later on by invoking `XXH*_digest()`. - * - * When done, release the state using `XXH*_freeState()`. - * - * @see streaming_example at the top of @ref xxhash.h for an example. - */ - -/*! - * @typedef struct XXH32_state_s XXH32_state_t - * @brief The opaque state struct for the XXH32 streaming API. - * - * @see XXH32_state_s for details. - */ -typedef struct XXH32_state_s XXH32_state_t; - -/*! - * @brief Allocates an @ref XXH32_state_t. - * - * Must be freed with XXH32_freeState(). - * @return An allocated XXH32_state_t on success, `NULL` on failure. - */ -XXH_PUBLIC_API XXH_MALLOCF XXH32_state_t* XXH32_createState(void); -/*! - * @brief Frees an @ref XXH32_state_t. - * - * Must be allocated with XXH32_createState(). - * @param statePtr A pointer to an @ref XXH32_state_t allocated with @ref XXH32_createState(). - * @return XXH_OK. - */ -XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr); -/*! - * @brief Copies one @ref XXH32_state_t to another. - * - * @param dst_state The state to copy to. - * @param src_state The state to copy from. - * @pre - * @p dst_state and @p src_state must not be `NULL` and must not overlap. - */ -XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state); - -/*! - * @brief Resets an @ref XXH32_state_t to begin a new hash. - * - * This function resets and seeds a state. Call it before @ref XXH32_update(). - * - * @param statePtr The state struct to reset. - * @param seed The 32-bit seed to alter the hash result predictably. - * - * @pre - * @p statePtr must not be `NULL`. - * - * @return @ref XXH_OK on success, @ref XXH_ERROR on failure. - */ -XXH_PUBLIC_API XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, XXH32_hash_t seed); - -/*! - * @brief Consumes a block of @p input to an @ref XXH32_state_t. - * - * Call this to incrementally consume blocks of data. - * - * @param statePtr The state struct to update. - * @param input The block of data to be hashed, at least @p length bytes in size. - * @param length The length of @p input, in bytes. - * - * @pre - * @p statePtr must not be `NULL`. - * @pre - * The memory between @p input and @p input + @p length must be valid, - * readable, contiguous memory. However, if @p length is `0`, @p input may be - * `NULL`. In C++, this also must be *TriviallyCopyable*. - * - * @return @ref XXH_OK on success, @ref XXH_ERROR on failure. - */ -XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length); - -/*! - * @brief Returns the calculated hash value from an @ref XXH32_state_t. - * - * @note - * Calling XXH32_digest() will not affect @p statePtr, so you can update, - * digest, and update again. - * - * @param statePtr The state struct to calculate the hash from. - * - * @pre - * @p statePtr must not be `NULL`. - * - * @return The calculated xxHash32 value from that state. - */ -XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr); -#endif /* !XXH_NO_STREAM */ - -/******* Canonical representation *******/ - -/* - * The default return values from XXH functions are unsigned 32 and 64 bit - * integers. - * This the simplest and fastest format for further post-processing. - * - * However, this leaves open the question of what is the order on the byte level, - * since little and big endian conventions will store the same number differently. - * - * The canonical representation settles this issue by mandating big-endian - * convention, the same convention as human-readable numbers (large digits first). - * - * When writing hash values to storage, sending them over a network, or printing - * them, it's highly recommended to use the canonical representation to ensure - * portability across a wider range of systems, present and future. - * - * The following functions allow transformation of hash values to and from - * canonical format. - */ - -/*! - * @brief Canonical (big endian) representation of @ref XXH32_hash_t. - */ -typedef struct { - unsigned char digest[4]; /*!< Hash bytes, big endian */ -} XXH32_canonical_t; - -/*! - * @brief Converts an @ref XXH32_hash_t to a big endian @ref XXH32_canonical_t. - * - * @param dst The @ref XXH32_canonical_t pointer to be stored to. - * @param hash The @ref XXH32_hash_t to be converted. - * - * @pre - * @p dst must not be `NULL`. - */ -XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash); - -/*! - * @brief Converts an @ref XXH32_canonical_t to a native @ref XXH32_hash_t. - * - * @param src The @ref XXH32_canonical_t to convert. - * - * @pre - * @p src must not be `NULL`. - * - * @return The converted hash. - */ -XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src); - - -/*! @cond Doxygen ignores this part */ -#ifdef __has_attribute -# define XXH_HAS_ATTRIBUTE(x) __has_attribute(x) -#else -# define XXH_HAS_ATTRIBUTE(x) 0 -#endif -/*! @endcond */ - -/*! @cond Doxygen ignores this part */ -/* - * C23 __STDC_VERSION__ number hasn't been specified yet. For now - * leave as `201711L` (C17 + 1). - * TODO: Update to correct value when its been specified. - */ -#define XXH_C23_VN 201711L -/*! @endcond */ - -/*! @cond Doxygen ignores this part */ -/* C-language Attributes are added in C23. */ -#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= XXH_C23_VN) && defined(__has_c_attribute) -# define XXH_HAS_C_ATTRIBUTE(x) __has_c_attribute(x) -#else -# define XXH_HAS_C_ATTRIBUTE(x) 0 -#endif -/*! @endcond */ - -/*! @cond Doxygen ignores this part */ -#if defined(__cplusplus) && defined(__has_cpp_attribute) -# define XXH_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x) -#else -# define XXH_HAS_CPP_ATTRIBUTE(x) 0 -#endif -/*! @endcond */ - -/*! @cond Doxygen ignores this part */ -/* - * Define XXH_FALLTHROUGH macro for annotating switch case with the 'fallthrough' attribute - * introduced in CPP17 and C23. - * CPP17 : https://en.cppreference.com/w/cpp/language/attributes/fallthrough - * C23 : https://en.cppreference.com/w/c/language/attributes/fallthrough - */ -#if XXH_HAS_C_ATTRIBUTE(fallthrough) || XXH_HAS_CPP_ATTRIBUTE(fallthrough) -# define XXH_FALLTHROUGH [[fallthrough]] -#elif XXH_HAS_ATTRIBUTE(__fallthrough__) -# define XXH_FALLTHROUGH __attribute__ ((__fallthrough__)) -#else -# define XXH_FALLTHROUGH /* fallthrough */ -#endif -/*! @endcond */ - -/*! @cond Doxygen ignores this part */ -/* - * Define XXH_NOESCAPE for annotated pointers in public API. - * https://clang.llvm.org/docs/AttributeReference.html#noescape - * As of writing this, only supported by clang. - */ -#if XXH_HAS_ATTRIBUTE(noescape) -# define XXH_NOESCAPE __attribute__((noescape)) -#else -# define XXH_NOESCAPE -#endif -/*! @endcond */ - - -/*! - * @} - * @ingroup public - * @{ - */ - -#ifndef XXH_NO_LONG_LONG -/*-********************************************************************** -* 64-bit hash -************************************************************************/ -#if defined(XXH_DOXYGEN) /* don't include */ -/*! - * @brief An unsigned 64-bit integer. - * - * Not necessarily defined to `uint64_t` but functionally equivalent. - */ -typedef uint64_t XXH64_hash_t; -#elif !defined (__VMS) \ - && (defined (__cplusplus) \ - || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) -# include - typedef uint64_t XXH64_hash_t; -#else -# include -# if defined(__LP64__) && ULONG_MAX == 0xFFFFFFFFFFFFFFFFULL - /* LP64 ABI says uint64_t is unsigned long */ - typedef unsigned long XXH64_hash_t; -# else - /* the following type must have a width of 64-bit */ - typedef unsigned long long XXH64_hash_t; -# endif -#endif - -/*! - * @} - * - * @defgroup XXH64_family XXH64 family - * @ingroup public - * @{ - * Contains functions used in the classic 64-bit xxHash algorithm. - * - * @note - * XXH3 provides competitive speed for both 32-bit and 64-bit systems, - * and offers true 64/128 bit hash results. - * It provides better speed for systems with vector processing capabilities. - */ - -/*! - * @brief Calculates the 64-bit hash of @p input using xxHash64. - * - * This function usually runs faster on 64-bit systems, but slower on 32-bit - * systems (see benchmark). - * - * @param input The block of data to be hashed, at least @p length bytes in size. - * @param length The length of @p input, in bytes. - * @param seed The 64-bit seed to alter the hash's output predictably. - * - * @pre - * The memory between @p input and @p input + @p length must be valid, - * readable, contiguous memory. However, if @p length is `0`, @p input may be - * `NULL`. In C++, this also must be *TriviallyCopyable*. - * - * @return The calculated 64-bit hash. - * - * @see - * XXH32(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128(): - * Direct equivalents for the other variants of xxHash. - * @see - * XXH64_createState(), XXH64_update(), XXH64_digest(): Streaming version. - */ -XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed); - -/******* Streaming *******/ -#ifndef XXH_NO_STREAM -/*! - * @brief The opaque state struct for the XXH64 streaming API. - * - * @see XXH64_state_s for details. - */ -typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */ - -/*! - * @brief Allocates an @ref XXH64_state_t. - * - * Must be freed with XXH64_freeState(). - * @return An allocated XXH64_state_t on success, `NULL` on failure. - */ -XXH_PUBLIC_API XXH_MALLOCF XXH64_state_t* XXH64_createState(void); - -/*! - * @brief Frees an @ref XXH64_state_t. - * - * Must be allocated with XXH64_createState(). - * @param statePtr A pointer to an @ref XXH64_state_t allocated with @ref XXH64_createState(). - * @return XXH_OK. - */ -XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr); - -/*! - * @brief Copies one @ref XXH64_state_t to another. - * - * @param dst_state The state to copy to. - * @param src_state The state to copy from. - * @pre - * @p dst_state and @p src_state must not be `NULL` and must not overlap. - */ -XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dst_state, const XXH64_state_t* src_state); - -/*! - * @brief Resets an @ref XXH64_state_t to begin a new hash. - * - * This function resets and seeds a state. Call it before @ref XXH64_update(). - * - * @param statePtr The state struct to reset. - * @param seed The 64-bit seed to alter the hash result predictably. - * - * @pre - * @p statePtr must not be `NULL`. - * - * @return @ref XXH_OK on success, @ref XXH_ERROR on failure. - */ -XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed); - -/*! - * @brief Consumes a block of @p input to an @ref XXH64_state_t. - * - * Call this to incrementally consume blocks of data. - * - * @param statePtr The state struct to update. - * @param input The block of data to be hashed, at least @p length bytes in size. - * @param length The length of @p input, in bytes. - * - * @pre - * @p statePtr must not be `NULL`. - * @pre - * The memory between @p input and @p input + @p length must be valid, - * readable, contiguous memory. However, if @p length is `0`, @p input may be - * `NULL`. In C++, this also must be *TriviallyCopyable*. - * - * @return @ref XXH_OK on success, @ref XXH_ERROR on failure. - */ -XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH_NOESCAPE XXH64_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length); - -/*! - * @brief Returns the calculated hash value from an @ref XXH64_state_t. - * - * @note - * Calling XXH64_digest() will not affect @p statePtr, so you can update, - * digest, and update again. - * - * @param statePtr The state struct to calculate the hash from. - * - * @pre - * @p statePtr must not be `NULL`. - * - * @return The calculated xxHash64 value from that state. - */ -XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_digest (XXH_NOESCAPE const XXH64_state_t* statePtr); -#endif /* !XXH_NO_STREAM */ -/******* Canonical representation *******/ - -/*! - * @brief Canonical (big endian) representation of @ref XXH64_hash_t. - */ -typedef struct { unsigned char digest[sizeof(XXH64_hash_t)]; } XXH64_canonical_t; - -/*! - * @brief Converts an @ref XXH64_hash_t to a big endian @ref XXH64_canonical_t. - * - * @param dst The @ref XXH64_canonical_t pointer to be stored to. - * @param hash The @ref XXH64_hash_t to be converted. - * - * @pre - * @p dst must not be `NULL`. - */ -XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash); - -/*! - * @brief Converts an @ref XXH64_canonical_t to a native @ref XXH64_hash_t. - * - * @param src The @ref XXH64_canonical_t to convert. - * - * @pre - * @p src must not be `NULL`. - * - * @return The converted hash. - */ -XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src); - -#ifndef XXH_NO_XXH3 - -/*! - * @} - * ************************************************************************ - * @defgroup XXH3_family XXH3 family - * @ingroup public - * @{ - * - * XXH3 is a more recent hash algorithm featuring: - * - Improved speed for both small and large inputs - * - True 64-bit and 128-bit outputs - * - SIMD acceleration - * - Improved 32-bit viability - * - * Speed analysis methodology is explained here: - * - * https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html - * - * Compared to XXH64, expect XXH3 to run approximately - * ~2x faster on large inputs and >3x faster on small ones, - * exact differences vary depending on platform. - * - * XXH3's speed benefits greatly from SIMD and 64-bit arithmetic, - * but does not require it. - * Most 32-bit and 64-bit targets that can run XXH32 smoothly can run XXH3 - * at competitive speeds, even without vector support. Further details are - * explained in the implementation. - * - * XXH3 has a fast scalar implementation, but it also includes accelerated SIMD - * implementations for many common platforms: - * - AVX512 - * - AVX2 - * - SSE2 - * - ARM NEON - * - WebAssembly SIMD128 - * - POWER8 VSX - * - s390x ZVector - * This can be controlled via the @ref XXH_VECTOR macro, but it automatically - * selects the best version according to predefined macros. For the x86 family, an - * automatic runtime dispatcher is included separately in @ref xxh_x86dispatch.c. - * - * XXH3 implementation is portable: - * it has a generic C90 formulation that can be compiled on any platform, - * all implementations generate exactly the same hash value on all platforms. - * Starting from v0.8.0, it's also labelled "stable", meaning that - * any future version will also generate the same hash value. - * - * XXH3 offers 2 variants, _64bits and _128bits. - * - * When only 64 bits are needed, prefer invoking the _64bits variant, as it - * reduces the amount of mixing, resulting in faster speed on small inputs. - * It's also generally simpler to manipulate a scalar return type than a struct. - * - * The API supports one-shot hashing, streaming mode, and custom secrets. - */ -/*-********************************************************************** -* XXH3 64-bit variant -************************************************************************/ - -/*! - * @brief 64-bit unseeded variant of XXH3. - * - * This is equivalent to @ref XXH3_64bits_withSeed() with a seed of 0, however - * it may have slightly better performance due to constant propagation of the - * defaults. - * - * @see - * XXH32(), XXH64(), XXH3_128bits(): equivalent for the other xxHash algorithms - * @see - * XXH3_64bits_withSeed(), XXH3_64bits_withSecret(): other seeding variants - * @see - * XXH3_64bits_reset(), XXH3_64bits_update(), XXH3_64bits_digest(): Streaming version. - */ -XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length); - -/*! - * @brief 64-bit seeded variant of XXH3 - * - * This variant generates a custom secret on the fly based on default secret - * altered using the `seed` value. - * - * While this operation is decently fast, note that it's not completely free. - * - * @note - * seed == 0 produces the same results as @ref XXH3_64bits(). - * - * @param input The data to hash - * @param length The length - * @param seed The 64-bit seed to alter the state. - */ -XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed); - -/*! - * The bare minimum size for a custom secret. - * - * @see - * XXH3_64bits_withSecret(), XXH3_64bits_reset_withSecret(), - * XXH3_128bits_withSecret(), XXH3_128bits_reset_withSecret(). - */ -#define XXH3_SECRET_SIZE_MIN 136 - -/*! - * @brief 64-bit variant of XXH3 with a custom "secret". - * - * It's possible to provide any blob of bytes as a "secret" to generate the hash. - * This makes it more difficult for an external actor to prepare an intentional collision. - * The main condition is that secretSize *must* be large enough (>= XXH3_SECRET_SIZE_MIN). - * However, the quality of the secret impacts the dispersion of the hash algorithm. - * Therefore, the secret _must_ look like a bunch of random bytes. - * Avoid "trivial" or structured data such as repeated sequences or a text document. - * Whenever in doubt about the "randomness" of the blob of bytes, - * consider employing "XXH3_generateSecret()" instead (see below). - * It will generate a proper high entropy secret derived from the blob of bytes. - * Another advantage of using XXH3_generateSecret() is that - * it guarantees that all bits within the initial blob of bytes - * will impact every bit of the output. - * This is not necessarily the case when using the blob of bytes directly - * because, when hashing _small_ inputs, only a portion of the secret is employed. - */ -XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize); - - -/******* Streaming *******/ -#ifndef XXH_NO_STREAM -/* - * Streaming requires state maintenance. - * This operation costs memory and CPU. - * As a consequence, streaming is slower than one-shot hashing. - * For better performance, prefer one-shot functions whenever applicable. - */ - -/*! - * @brief The state struct for the XXH3 streaming API. - * - * @see XXH3_state_s for details. - */ -typedef struct XXH3_state_s XXH3_state_t; -XXH_PUBLIC_API XXH_MALLOCF XXH3_state_t* XXH3_createState(void); -XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr); - -/*! - * @brief Copies one @ref XXH3_state_t to another. - * - * @param dst_state The state to copy to. - * @param src_state The state to copy from. - * @pre - * @p dst_state and @p src_state must not be `NULL` and must not overlap. - */ -XXH_PUBLIC_API void XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state); - -/*! - * @brief Resets an @ref XXH3_state_t to begin a new hash. - * - * This function resets `statePtr` and generate a secret with default parameters. Call it before @ref XXH3_64bits_update(). - * Digest will be equivalent to `XXH3_64bits()`. - * - * @param statePtr The state struct to reset. - * - * @pre - * @p statePtr must not be `NULL`. - * - * @return @ref XXH_OK on success, @ref XXH_ERROR on failure. - * - */ -XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr); - -/*! - * @brief Resets an @ref XXH3_state_t with 64-bit seed to begin a new hash. - * - * This function resets `statePtr` and generate a secret from `seed`. Call it before @ref XXH3_64bits_update(). - * Digest will be equivalent to `XXH3_64bits_withSeed()`. - * - * @param statePtr The state struct to reset. - * @param seed The 64-bit seed to alter the state. - * - * @pre - * @p statePtr must not be `NULL`. - * - * @return @ref XXH_OK on success, @ref XXH_ERROR on failure. - * - */ -XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed); - -/*! - * XXH3_64bits_reset_withSecret(): - * `secret` is referenced, it _must outlive_ the hash streaming session. - * Similar to one-shot API, `secretSize` must be >= `XXH3_SECRET_SIZE_MIN`, - * and the quality of produced hash values depends on secret's entropy - * (secret's content should look like a bunch of random bytes). - * When in doubt about the randomness of a candidate `secret`, - * consider employing `XXH3_generateSecret()` instead (see below). - */ -XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize); - -/*! - * @brief Consumes a block of @p input to an @ref XXH3_state_t. - * - * Call this to incrementally consume blocks of data. - * - * @param statePtr The state struct to update. - * @param input The block of data to be hashed, at least @p length bytes in size. - * @param length The length of @p input, in bytes. - * - * @pre - * @p statePtr must not be `NULL`. - * @pre - * The memory between @p input and @p input + @p length must be valid, - * readable, contiguous memory. However, if @p length is `0`, @p input may be - * `NULL`. In C++, this also must be *TriviallyCopyable*. - * - * @return @ref XXH_OK on success, @ref XXH_ERROR on failure. - */ -XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length); - -/*! - * @brief Returns the calculated XXH3 64-bit hash value from an @ref XXH3_state_t. - * - * @note - * Calling XXH3_64bits_digest() will not affect @p statePtr, so you can update, - * digest, and update again. - * - * @param statePtr The state struct to calculate the hash from. - * - * @pre - * @p statePtr must not be `NULL`. - * - * @return The calculated XXH3 64-bit hash value from that state. - */ -XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr); -#endif /* !XXH_NO_STREAM */ - -/* note : canonical representation of XXH3 is the same as XXH64 - * since they both produce XXH64_hash_t values */ - - -/*-********************************************************************** -* XXH3 128-bit variant -************************************************************************/ - -/*! - * @brief The return value from 128-bit hashes. - * - * Stored in little endian order, although the fields themselves are in native - * endianness. - */ -typedef struct { - XXH64_hash_t low64; /*!< `value & 0xFFFFFFFFFFFFFFFF` */ - XXH64_hash_t high64; /*!< `value >> 64` */ -} XXH128_hash_t; - -/*! - * @brief Unseeded 128-bit variant of XXH3 - * - * The 128-bit variant of XXH3 has more strength, but it has a bit of overhead - * for shorter inputs. - * - * This is equivalent to @ref XXH3_128bits_withSeed() with a seed of 0, however - * it may have slightly better performance due to constant propagation of the - * defaults. - * - * @see - * XXH32(), XXH64(), XXH3_64bits(): equivalent for the other xxHash algorithms - * @see - * XXH3_128bits_withSeed(), XXH3_128bits_withSecret(): other seeding variants - * @see - * XXH3_128bits_reset(), XXH3_128bits_update(), XXH3_128bits_digest(): Streaming version. - */ -XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* data, size_t len); -/*! @brief Seeded 128-bit variant of XXH3. @see XXH3_64bits_withSeed(). */ -XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSeed(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed); -/*! @brief Custom secret 128-bit variant of XXH3. @see XXH3_64bits_withSecret(). */ -XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize); - -/******* Streaming *******/ -#ifndef XXH_NO_STREAM -/* - * Streaming requires state maintenance. - * This operation costs memory and CPU. - * As a consequence, streaming is slower than one-shot hashing. - * For better performance, prefer one-shot functions whenever applicable. - * - * XXH3_128bits uses the same XXH3_state_t as XXH3_64bits(). - * Use already declared XXH3_createState() and XXH3_freeState(). - * - * All reset and streaming functions have same meaning as their 64-bit counterpart. - */ - -/*! - * @brief Resets an @ref XXH3_state_t to begin a new hash. - * - * This function resets `statePtr` and generate a secret with default parameters. Call it before @ref XXH3_128bits_update(). - * Digest will be equivalent to `XXH3_128bits()`. - * - * @param statePtr The state struct to reset. - * - * @pre - * @p statePtr must not be `NULL`. - * - * @return @ref XXH_OK on success, @ref XXH_ERROR on failure. - * - */ -XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr); - -/*! - * @brief Resets an @ref XXH3_state_t with 64-bit seed to begin a new hash. - * - * This function resets `statePtr` and generate a secret from `seed`. Call it before @ref XXH3_128bits_update(). - * Digest will be equivalent to `XXH3_128bits_withSeed()`. - * - * @param statePtr The state struct to reset. - * @param seed The 64-bit seed to alter the state. - * - * @pre - * @p statePtr must not be `NULL`. - * - * @return @ref XXH_OK on success, @ref XXH_ERROR on failure. - * - */ -XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed); -/*! @brief Custom secret 128-bit variant of XXH3. @see XXH_64bits_reset_withSecret(). */ -XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize); - -/*! - * @brief Consumes a block of @p input to an @ref XXH3_state_t. - * - * Call this to incrementally consume blocks of data. - * - * @param statePtr The state struct to update. - * @param input The block of data to be hashed, at least @p length bytes in size. - * @param length The length of @p input, in bytes. - * - * @pre - * @p statePtr must not be `NULL`. - * @pre - * The memory between @p input and @p input + @p length must be valid, - * readable, contiguous memory. However, if @p length is `0`, @p input may be - * `NULL`. In C++, this also must be *TriviallyCopyable*. - * - * @return @ref XXH_OK on success, @ref XXH_ERROR on failure. - */ -XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length); - -/*! - * @brief Returns the calculated XXH3 128-bit hash value from an @ref XXH3_state_t. - * - * @note - * Calling XXH3_128bits_digest() will not affect @p statePtr, so you can update, - * digest, and update again. - * - * @param statePtr The state struct to calculate the hash from. - * - * @pre - * @p statePtr must not be `NULL`. - * - * @return The calculated XXH3 128-bit hash value from that state. - */ -XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr); -#endif /* !XXH_NO_STREAM */ - -/* Following helper functions make it possible to compare XXH128_hast_t values. - * Since XXH128_hash_t is a structure, this capability is not offered by the language. - * Note: For better performance, these functions can be inlined using XXH_INLINE_ALL */ - -/*! - * XXH128_isEqual(): - * Return: 1 if `h1` and `h2` are equal, 0 if they are not. - */ -XXH_PUBLIC_API XXH_PUREF int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2); - -/*! - * @brief Compares two @ref XXH128_hash_t - * This comparator is compatible with stdlib's `qsort()`/`bsearch()`. - * - * @return: >0 if *h128_1 > *h128_2 - * =0 if *h128_1 == *h128_2 - * <0 if *h128_1 < *h128_2 - */ -XXH_PUBLIC_API XXH_PUREF int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2); - - -/******* Canonical representation *******/ -typedef struct { unsigned char digest[sizeof(XXH128_hash_t)]; } XXH128_canonical_t; - - -/*! - * @brief Converts an @ref XXH128_hash_t to a big endian @ref XXH128_canonical_t. - * - * @param dst The @ref XXH128_canonical_t pointer to be stored to. - * @param hash The @ref XXH128_hash_t to be converted. - * - * @pre - * @p dst must not be `NULL`. - */ -XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash); - -/*! - * @brief Converts an @ref XXH128_canonical_t to a native @ref XXH128_hash_t. - * - * @param src The @ref XXH128_canonical_t to convert. - * - * @pre - * @p src must not be `NULL`. - * - * @return The converted hash. - */ -XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src); - - -#endif /* !XXH_NO_XXH3 */ -#endif /* XXH_NO_LONG_LONG */ - -/*! - * @} - */ -#endif /* XXHASH_H_5627135585666179 */ - - - -#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) -#define XXHASH_H_STATIC_13879238742 -/* **************************************************************************** - * This section contains declarations which are not guaranteed to remain stable. - * They may change in future versions, becoming incompatible with a different - * version of the library. - * These declarations should only be used with static linking. - * Never use them in association with dynamic linking! - ***************************************************************************** */ - -/* - * These definitions are only present to allow static allocation - * of XXH states, on stack or in a struct, for example. - * Never **ever** access their members directly. - */ - -/*! - * @internal - * @brief Structure for XXH32 streaming API. - * - * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY, - * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is - * an opaque type. This allows fields to safely be changed. - * - * Typedef'd to @ref XXH32_state_t. - * Do not access the members of this struct directly. - * @see XXH64_state_s, XXH3_state_s - */ -struct XXH32_state_s { - XXH32_hash_t total_len_32; /*!< Total length hashed, modulo 2^32 */ - XXH32_hash_t large_len; /*!< Whether the hash is >= 16 (handles @ref total_len_32 overflow) */ - XXH32_hash_t v[4]; /*!< Accumulator lanes */ - XXH32_hash_t mem32[4]; /*!< Internal buffer for partial reads. Treated as unsigned char[16]. */ - XXH32_hash_t memsize; /*!< Amount of data in @ref mem32 */ - XXH32_hash_t reserved; /*!< Reserved field. Do not read nor write to it. */ -}; /* typedef'd to XXH32_state_t */ - - -#ifndef XXH_NO_LONG_LONG /* defined when there is no 64-bit support */ - -/*! - * @internal - * @brief Structure for XXH64 streaming API. - * - * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY, - * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is - * an opaque type. This allows fields to safely be changed. - * - * Typedef'd to @ref XXH64_state_t. - * Do not access the members of this struct directly. - * @see XXH32_state_s, XXH3_state_s - */ -struct XXH64_state_s { - XXH64_hash_t total_len; /*!< Total length hashed. This is always 64-bit. */ - XXH64_hash_t v[4]; /*!< Accumulator lanes */ - XXH64_hash_t mem64[4]; /*!< Internal buffer for partial reads. Treated as unsigned char[32]. */ - XXH32_hash_t memsize; /*!< Amount of data in @ref mem64 */ - XXH32_hash_t reserved32; /*!< Reserved field, needed for padding anyways*/ - XXH64_hash_t reserved64; /*!< Reserved field. Do not read or write to it. */ -}; /* typedef'd to XXH64_state_t */ - -#ifndef XXH_NO_XXH3 - -#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* >= C11 */ -# include -# define XXH_ALIGN(n) alignas(n) -#elif defined(__cplusplus) && (__cplusplus >= 201103L) /* >= C++11 */ -/* In C++ alignas() is a keyword */ -# define XXH_ALIGN(n) alignas(n) -#elif defined(__GNUC__) -# define XXH_ALIGN(n) __attribute__ ((aligned(n))) -#elif defined(_MSC_VER) -# define XXH_ALIGN(n) __declspec(align(n)) -#else -# define XXH_ALIGN(n) /* disabled */ -#endif - -/* Old GCC versions only accept the attribute after the type in structures. */ -#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) /* C11+ */ \ - && ! (defined(__cplusplus) && (__cplusplus >= 201103L)) /* >= C++11 */ \ - && defined(__GNUC__) -# define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align) -#else -# define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type -#endif - -/*! - * @brief The size of the internal XXH3 buffer. - * - * This is the optimal update size for incremental hashing. - * - * @see XXH3_64b_update(), XXH3_128b_update(). - */ -#define XXH3_INTERNALBUFFER_SIZE 256 - -/*! - * @internal - * @brief Default size of the secret buffer (and @ref XXH3_kSecret). - * - * This is the size used in @ref XXH3_kSecret and the seeded functions. - * - * Not to be confused with @ref XXH3_SECRET_SIZE_MIN. - */ -#define XXH3_SECRET_DEFAULT_SIZE 192 - -/*! - * @internal - * @brief Structure for XXH3 streaming API. - * - * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY, - * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. - * Otherwise it is an opaque type. - * Never use this definition in combination with dynamic library. - * This allows fields to safely be changed in the future. - * - * @note ** This structure has a strict alignment requirement of 64 bytes!! ** - * Do not allocate this with `malloc()` or `new`, - * it will not be sufficiently aligned. - * Use @ref XXH3_createState() and @ref XXH3_freeState(), or stack allocation. - * - * Typedef'd to @ref XXH3_state_t. - * Do never access the members of this struct directly. - * - * @see XXH3_INITSTATE() for stack initialization. - * @see XXH3_createState(), XXH3_freeState(). - * @see XXH32_state_s, XXH64_state_s - */ -struct XXH3_state_s { - XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]); - /*!< The 8 accumulators. See @ref XXH32_state_s::v and @ref XXH64_state_s::v */ - XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]); - /*!< Used to store a custom secret generated from a seed. */ - XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]); - /*!< The internal buffer. @see XXH32_state_s::mem32 */ - XXH32_hash_t bufferedSize; - /*!< The amount of memory in @ref buffer, @see XXH32_state_s::memsize */ - XXH32_hash_t useSeed; - /*!< Reserved field. Needed for padding on 64-bit. */ - size_t nbStripesSoFar; - /*!< Number or stripes processed. */ - XXH64_hash_t totalLen; - /*!< Total length hashed. 64-bit even on 32-bit targets. */ - size_t nbStripesPerBlock; - /*!< Number of stripes per block. */ - size_t secretLimit; - /*!< Size of @ref customSecret or @ref extSecret */ - XXH64_hash_t seed; - /*!< Seed for _withSeed variants. Must be zero otherwise, @see XXH3_INITSTATE() */ - XXH64_hash_t reserved64; - /*!< Reserved field. */ - const unsigned char* extSecret; - /*!< Reference to an external secret for the _withSecret variants, NULL - * for other variants. */ - /* note: there may be some padding at the end due to alignment on 64 bytes */ -}; /* typedef'd to XXH3_state_t */ - -#undef XXH_ALIGN_MEMBER - -/*! - * @brief Initializes a stack-allocated `XXH3_state_s`. - * - * When the @ref XXH3_state_t structure is merely emplaced on stack, - * it should be initialized with XXH3_INITSTATE() or a memset() - * in case its first reset uses XXH3_NNbits_reset_withSeed(). - * This init can be omitted if the first reset uses default or _withSecret mode. - * This operation isn't necessary when the state is created with XXH3_createState(). - * Note that this doesn't prepare the state for a streaming operation, - * it's still necessary to use XXH3_NNbits_reset*() afterwards. - */ -#define XXH3_INITSTATE(XXH3_state_ptr) \ - do { \ - XXH3_state_t* tmp_xxh3_state_ptr = (XXH3_state_ptr); \ - tmp_xxh3_state_ptr->seed = 0; \ - tmp_xxh3_state_ptr->extSecret = NULL; \ - } while(0) - - -/*! - * simple alias to pre-selected XXH3_128bits variant - */ -XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed); - - -/* === Experimental API === */ -/* Symbols defined below must be considered tied to a specific library version. */ - -/*! - * XXH3_generateSecret(): - * - * Derive a high-entropy secret from any user-defined content, named customSeed. - * The generated secret can be used in combination with `*_withSecret()` functions. - * The `_withSecret()` variants are useful to provide a higher level of protection - * than 64-bit seed, as it becomes much more difficult for an external actor to - * guess how to impact the calculation logic. - * - * The function accepts as input a custom seed of any length and any content, - * and derives from it a high-entropy secret of length @p secretSize into an - * already allocated buffer @p secretBuffer. - * - * The generated secret can then be used with any `*_withSecret()` variant. - * The functions @ref XXH3_128bits_withSecret(), @ref XXH3_64bits_withSecret(), - * @ref XXH3_128bits_reset_withSecret() and @ref XXH3_64bits_reset_withSecret() - * are part of this list. They all accept a `secret` parameter - * which must be large enough for implementation reasons (>= @ref XXH3_SECRET_SIZE_MIN) - * _and_ feature very high entropy (consist of random-looking bytes). - * These conditions can be a high bar to meet, so @ref XXH3_generateSecret() can - * be employed to ensure proper quality. - * - * @p customSeed can be anything. It can have any size, even small ones, - * and its content can be anything, even "poor entropy" sources such as a bunch - * of zeroes. The resulting `secret` will nonetheless provide all required qualities. - * - * @pre - * - @p secretSize must be >= @ref XXH3_SECRET_SIZE_MIN - * - When @p customSeedSize > 0, supplying NULL as customSeed is undefined behavior. - * - * Example code: - * @code{.c} - * #include - * #include - * #include - * #define XXH_STATIC_LINKING_ONLY // expose unstable API - * #include "xxhash.h" - * // Hashes argv[2] using the entropy from argv[1]. - * int main(int argc, char* argv[]) - * { - * char secret[XXH3_SECRET_SIZE_MIN]; - * if (argv != 3) { return 1; } - * XXH3_generateSecret(secret, sizeof(secret), argv[1], strlen(argv[1])); - * XXH64_hash_t h = XXH3_64bits_withSecret( - * argv[2], strlen(argv[2]), - * secret, sizeof(secret) - * ); - * printf("%016llx\n", (unsigned long long) h); - * } - * @endcode - */ -XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize); - -/*! - * @brief Generate the same secret as the _withSeed() variants. - * - * The generated secret can be used in combination with - *`*_withSecret()` and `_withSecretandSeed()` variants. - * - * Example C++ `std::string` hash class: - * @code{.cpp} - * #include - * #define XXH_STATIC_LINKING_ONLY // expose unstable API - * #include "xxhash.h" - * // Slow, seeds each time - * class HashSlow { - * XXH64_hash_t seed; - * public: - * HashSlow(XXH64_hash_t s) : seed{s} {} - * size_t operator()(const std::string& x) const { - * return size_t{XXH3_64bits_withSeed(x.c_str(), x.length(), seed)}; - * } - * }; - * // Fast, caches the seeded secret for future uses. - * class HashFast { - * unsigned char secret[XXH3_SECRET_SIZE_MIN]; - * public: - * HashFast(XXH64_hash_t s) { - * XXH3_generateSecret_fromSeed(secret, seed); - * } - * size_t operator()(const std::string& x) const { - * return size_t{ - * XXH3_64bits_withSecret(x.c_str(), x.length(), secret, sizeof(secret)) - * }; - * } - * }; - * @endcode - * @param secretBuffer A writable buffer of @ref XXH3_SECRET_SIZE_MIN bytes - * @param seed The seed to seed the state. - */ -XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed); - -/*! - * These variants generate hash values using either - * @p seed for "short" keys (< XXH3_MIDSIZE_MAX = 240 bytes) - * or @p secret for "large" keys (>= XXH3_MIDSIZE_MAX). - * - * This generally benefits speed, compared to `_withSeed()` or `_withSecret()`. - * `_withSeed()` has to generate the secret on the fly for "large" keys. - * It's fast, but can be perceptible for "not so large" keys (< 1 KB). - * `_withSecret()` has to generate the masks on the fly for "small" keys, - * which requires more instructions than _withSeed() variants. - * Therefore, _withSecretandSeed variant combines the best of both worlds. - * - * When @p secret has been generated by XXH3_generateSecret_fromSeed(), - * this variant produces *exactly* the same results as `_withSeed()` variant, - * hence offering only a pure speed benefit on "large" input, - * by skipping the need to regenerate the secret for every large input. - * - * Another usage scenario is to hash the secret to a 64-bit hash value, - * for example with XXH3_64bits(), which then becomes the seed, - * and then employ both the seed and the secret in _withSecretandSeed(). - * On top of speed, an added benefit is that each bit in the secret - * has a 50% chance to swap each bit in the output, via its impact to the seed. - * - * This is not guaranteed when using the secret directly in "small data" scenarios, - * because only portions of the secret are employed for small data. - */ -XXH_PUBLIC_API XXH_PUREF XXH64_hash_t -XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* data, size_t len, - XXH_NOESCAPE const void* secret, size_t secretSize, - XXH64_hash_t seed); -/*! @copydoc XXH3_64bits_withSecretandSeed() */ -XXH_PUBLIC_API XXH_PUREF XXH128_hash_t -XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length, - XXH_NOESCAPE const void* secret, size_t secretSize, - XXH64_hash_t seed64); -#ifndef XXH_NO_STREAM -/*! @copydoc XXH3_64bits_withSecretandSeed() */ -XXH_PUBLIC_API XXH_errorcode -XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, - XXH_NOESCAPE const void* secret, size_t secretSize, - XXH64_hash_t seed64); -/*! @copydoc XXH3_64bits_withSecretandSeed() */ -XXH_PUBLIC_API XXH_errorcode -XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, - XXH_NOESCAPE const void* secret, size_t secretSize, - XXH64_hash_t seed64); -#endif /* !XXH_NO_STREAM */ - -#endif /* !XXH_NO_XXH3 */ -#endif /* XXH_NO_LONG_LONG */ -#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) -# define XXH_IMPLEMENTATION -#endif - -#endif /* defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) */ - - -/* ======================================================================== */ -/* ======================================================================== */ -/* ======================================================================== */ - - -/*-********************************************************************** - * xxHash implementation - *-********************************************************************** - * xxHash's implementation used to be hosted inside xxhash.c. - * - * However, inlining requires implementation to be visible to the compiler, - * hence be included alongside the header. - * Previously, implementation was hosted inside xxhash.c, - * which was then #included when inlining was activated. - * This construction created issues with a few build and install systems, - * as it required xxhash.c to be stored in /include directory. - * - * xxHash implementation is now directly integrated within xxhash.h. - * As a consequence, xxhash.c is no longer needed in /include. - * - * xxhash.c is still available and is still useful. - * In a "normal" setup, when xxhash is not inlined, - * xxhash.h only exposes the prototypes and public symbols, - * while xxhash.c can be built into an object file xxhash.o - * which can then be linked into the final binary. - ************************************************************************/ - -#if ( defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) \ - || defined(XXH_IMPLEMENTATION) ) && !defined(XXH_IMPLEM_13a8737387) -# define XXH_IMPLEM_13a8737387 - -/* ************************************* -* Tuning parameters -***************************************/ - -/*! - * @defgroup tuning Tuning parameters - * @{ - * - * Various macros to control xxHash's behavior. - */ -#ifdef XXH_DOXYGEN -/*! - * @brief Define this to disable 64-bit code. - * - * Useful if only using the @ref XXH32_family and you have a strict C90 compiler. - */ -# define XXH_NO_LONG_LONG -# undef XXH_NO_LONG_LONG /* don't actually */ -/*! - * @brief Controls how unaligned memory is accessed. - * - * By default, access to unaligned memory is controlled by `memcpy()`, which is - * safe and portable. - * - * Unfortunately, on some target/compiler combinations, the generated assembly - * is sub-optimal. - * - * The below switch allow selection of a different access method - * in the search for improved performance. - * - * @par Possible options: - * - * - `XXH_FORCE_MEMORY_ACCESS=0` (default): `memcpy` - * @par - * Use `memcpy()`. Safe and portable. Note that most modern compilers will - * eliminate the function call and treat it as an unaligned access. - * - * - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((aligned(1)))` - * @par - * Depends on compiler extensions and is therefore not portable. - * This method is safe _if_ your compiler supports it, - * and *generally* as fast or faster than `memcpy`. - * - * - `XXH_FORCE_MEMORY_ACCESS=2`: Direct cast - * @par - * Casts directly and dereferences. This method doesn't depend on the - * compiler, but it violates the C standard as it directly dereferences an - * unaligned pointer. It can generate buggy code on targets which do not - * support unaligned memory accesses, but in some circumstances, it's the - * only known way to get the most performance. - * - * - `XXH_FORCE_MEMORY_ACCESS=3`: Byteshift - * @par - * Also portable. This can generate the best code on old compilers which don't - * inline small `memcpy()` calls, and it might also be faster on big-endian - * systems which lack a native byteswap instruction. However, some compilers - * will emit literal byteshifts even if the target supports unaligned access. - * - * - * @warning - * Methods 1 and 2 rely on implementation-defined behavior. Use these with - * care, as what works on one compiler/platform/optimization level may cause - * another to read garbage data or even crash. - * - * See https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html for details. - * - * Prefer these methods in priority order (0 > 3 > 1 > 2) - */ -# define XXH_FORCE_MEMORY_ACCESS 0 - -/*! - * @def XXH_SIZE_OPT - * @brief Controls how much xxHash optimizes for size. - * - * xxHash, when compiled, tends to result in a rather large binary size. This - * is mostly due to heavy usage to forced inlining and constant folding of the - * @ref XXH3_family to increase performance. - * - * However, some developers prefer size over speed. This option can - * significantly reduce the size of the generated code. When using the `-Os` - * or `-Oz` options on GCC or Clang, this is defined to 1 by default, - * otherwise it is defined to 0. - * - * Most of these size optimizations can be controlled manually. - * - * This is a number from 0-2. - * - `XXH_SIZE_OPT` == 0: Default. xxHash makes no size optimizations. Speed - * comes first. - * - `XXH_SIZE_OPT` == 1: Default for `-Os` and `-Oz`. xxHash is more - * conservative and disables hacks that increase code size. It implies the - * options @ref XXH_NO_INLINE_HINTS == 1, @ref XXH_FORCE_ALIGN_CHECK == 0, - * and @ref XXH3_NEON_LANES == 8 if they are not already defined. - * - `XXH_SIZE_OPT` == 2: xxHash tries to make itself as small as possible. - * Performance may cry. For example, the single shot functions just use the - * streaming API. - */ -# define XXH_SIZE_OPT 0 - -/*! - * @def XXH_FORCE_ALIGN_CHECK - * @brief If defined to non-zero, adds a special path for aligned inputs (XXH32() - * and XXH64() only). - * - * This is an important performance trick for architectures without decent - * unaligned memory access performance. - * - * It checks for input alignment, and when conditions are met, uses a "fast - * path" employing direct 32-bit/64-bit reads, resulting in _dramatically - * faster_ read speed. - * - * The check costs one initial branch per hash, which is generally negligible, - * but not zero. - * - * Moreover, it's not useful to generate an additional code path if memory - * access uses the same instruction for both aligned and unaligned - * addresses (e.g. x86 and aarch64). - * - * In these cases, the alignment check can be removed by setting this macro to 0. - * Then the code will always use unaligned memory access. - * Align check is automatically disabled on x86, x64, ARM64, and some ARM chips - * which are platforms known to offer good unaligned memory accesses performance. - * - * It is also disabled by default when @ref XXH_SIZE_OPT >= 1. - * - * This option does not affect XXH3 (only XXH32 and XXH64). - */ -# define XXH_FORCE_ALIGN_CHECK 0 - -/*! - * @def XXH_NO_INLINE_HINTS - * @brief When non-zero, sets all functions to `static`. - * - * By default, xxHash tries to force the compiler to inline almost all internal - * functions. - * - * This can usually improve performance due to reduced jumping and improved - * constant folding, but significantly increases the size of the binary which - * might not be favorable. - * - * Additionally, sometimes the forced inlining can be detrimental to performance, - * depending on the architecture. - * - * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the - * compiler full control on whether to inline or not. - * - * When not optimizing (-O0), using `-fno-inline` with GCC or Clang, or if - * @ref XXH_SIZE_OPT >= 1, this will automatically be defined. - */ -# define XXH_NO_INLINE_HINTS 0 - -/*! - * @def XXH3_INLINE_SECRET - * @brief Determines whether to inline the XXH3 withSecret code. - * - * When the secret size is known, the compiler can improve the performance - * of XXH3_64bits_withSecret() and XXH3_128bits_withSecret(). - * - * However, if the secret size is not known, it doesn't have any benefit. This - * happens when xxHash is compiled into a global symbol. Therefore, if - * @ref XXH_INLINE_ALL is *not* defined, this will be defined to 0. - * - * Additionally, this defaults to 0 on GCC 12+, which has an issue with function pointers - * that are *sometimes* force inline on -Og, and it is impossible to automatically - * detect this optimization level. - */ -# define XXH3_INLINE_SECRET 0 - -/*! - * @def XXH32_ENDJMP - * @brief Whether to use a jump for `XXH32_finalize`. - * - * For performance, `XXH32_finalize` uses multiple branches in the finalizer. - * This is generally preferable for performance, - * but depending on exact architecture, a jmp may be preferable. - * - * This setting is only possibly making a difference for very small inputs. - */ -# define XXH32_ENDJMP 0 - -/*! - * @internal - * @brief Redefines old internal names. - * - * For compatibility with code that uses xxHash's internals before the names - * were changed to improve namespacing. There is no other reason to use this. - */ -# define XXH_OLD_NAMES -# undef XXH_OLD_NAMES /* don't actually use, it is ugly. */ - -/*! - * @def XXH_NO_STREAM - * @brief Disables the streaming API. - * - * When xxHash is not inlined and the streaming functions are not used, disabling - * the streaming functions can improve code size significantly, especially with - * the @ref XXH3_family which tends to make constant folded copies of itself. - */ -# define XXH_NO_STREAM -# undef XXH_NO_STREAM /* don't actually */ -#endif /* XXH_DOXYGEN */ -/*! - * @} - */ - -#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ - /* prefer __packed__ structures (method 1) for GCC - * < ARMv7 with unaligned access (e.g. Raspbian armhf) still uses byte shifting, so we use memcpy - * which for some reason does unaligned loads. */ -# if defined(__GNUC__) && !(defined(__ARM_ARCH) && __ARM_ARCH < 7 && defined(__ARM_FEATURE_UNALIGNED)) -# define XXH_FORCE_MEMORY_ACCESS 1 -# endif -#endif - -#ifndef XXH_SIZE_OPT - /* default to 1 for -Os or -Oz */ -# if (defined(__GNUC__) || defined(__clang__)) && defined(__OPTIMIZE_SIZE__) -# define XXH_SIZE_OPT 1 -# else -# define XXH_SIZE_OPT 0 -# endif -#endif - -#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */ - /* don't check on sizeopt, x86, aarch64, or arm when unaligned access is available */ -# if XXH_SIZE_OPT >= 1 || \ - defined(__i386) || defined(__x86_64__) || defined(__aarch64__) || defined(__ARM_FEATURE_UNALIGNED) \ - || defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64) || defined(_M_ARM) /* visual */ -# define XXH_FORCE_ALIGN_CHECK 0 -# else -# define XXH_FORCE_ALIGN_CHECK 1 -# endif -#endif - -#ifndef XXH_NO_INLINE_HINTS -# if XXH_SIZE_OPT >= 1 || defined(__NO_INLINE__) /* -O0, -fno-inline */ -# define XXH_NO_INLINE_HINTS 1 -# else -# define XXH_NO_INLINE_HINTS 0 -# endif -#endif - -#ifndef XXH3_INLINE_SECRET -# if (defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 12) \ - || !defined(XXH_INLINE_ALL) -# define XXH3_INLINE_SECRET 0 -# else -# define XXH3_INLINE_SECRET 1 -# endif -#endif - -#ifndef XXH32_ENDJMP -/* generally preferable for performance */ -# define XXH32_ENDJMP 0 -#endif - -/*! - * @defgroup impl Implementation - * @{ - */ - - -/* ************************************* -* Includes & Memory related functions -***************************************/ -#if defined(XXH_NO_STREAM) -/* nothing */ -#elif defined(XXH_NO_STDLIB) - -/* When requesting to disable any mention of stdlib, - * the library loses the ability to invoked malloc / free. - * In practice, it means that functions like `XXH*_createState()` - * will always fail, and return NULL. - * This flag is useful in situations where - * xxhash.h is integrated into some kernel, embedded or limited environment - * without access to dynamic allocation. - */ - -static XXH_CONSTF void* XXH_malloc(size_t s) { (void)s; return NULL; } -static void XXH_free(void* p) { (void)p; } - -#else - -/* - * Modify the local functions below should you wish to use - * different memory routines for malloc() and free() - */ -#include - -/*! - * @internal - * @brief Modify this function to use a different routine than malloc(). - */ -static XXH_MALLOCF void* XXH_malloc(size_t s) { return malloc(s); } - -/*! - * @internal - * @brief Modify this function to use a different routine than free(). - */ -static void XXH_free(void* p) { free(p); } - -#endif /* XXH_NO_STDLIB */ - -#include - -/*! - * @internal - * @brief Modify this function to use a different routine than memcpy(). - */ -static void* XXH_memcpy(void* dest, const void* src, size_t size) -{ - return memcpy(dest,src,size); -} - -#include /* ULLONG_MAX */ - - -/* ************************************* -* Compiler Specific Options -***************************************/ -#ifdef _MSC_VER /* Visual Studio warning fix */ -# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ -#endif - -#if XXH_NO_INLINE_HINTS /* disable inlining hints */ -# if defined(__GNUC__) || defined(__clang__) -# define XXH_FORCE_INLINE static __attribute__((unused)) -# else -# define XXH_FORCE_INLINE static -# endif -# define XXH_NO_INLINE static -/* enable inlining hints */ -#elif defined(__GNUC__) || defined(__clang__) -# define XXH_FORCE_INLINE static __inline__ __attribute__((always_inline, unused)) -# define XXH_NO_INLINE static __attribute__((noinline)) -#elif defined(_MSC_VER) /* Visual Studio */ -# define XXH_FORCE_INLINE static __forceinline -# define XXH_NO_INLINE static __declspec(noinline) -#elif defined (__cplusplus) \ - || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) /* C99 */ -# define XXH_FORCE_INLINE static inline -# define XXH_NO_INLINE static -#else -# define XXH_FORCE_INLINE static -# define XXH_NO_INLINE static -#endif - -#if XXH3_INLINE_SECRET -# define XXH3_WITH_SECRET_INLINE XXH_FORCE_INLINE -#else -# define XXH3_WITH_SECRET_INLINE XXH_NO_INLINE -#endif - - -/* ************************************* -* Debug -***************************************/ -/*! - * @ingroup tuning - * @def XXH_DEBUGLEVEL - * @brief Sets the debugging level. - * - * XXH_DEBUGLEVEL is expected to be defined externally, typically via the - * compiler's command line options. The value must be a number. - */ -#ifndef XXH_DEBUGLEVEL -# ifdef DEBUGLEVEL /* backwards compat */ -# define XXH_DEBUGLEVEL DEBUGLEVEL -# else -# define XXH_DEBUGLEVEL 0 -# endif -#endif - -#if (XXH_DEBUGLEVEL>=1) -# include /* note: can still be disabled with NDEBUG */ -# define XXH_ASSERT(c) assert(c) -#else -# if defined(__INTEL_COMPILER) -# define XXH_ASSERT(c) XXH_ASSUME((unsigned char) (c)) -# else -# define XXH_ASSERT(c) XXH_ASSUME(c) -# endif -#endif - -/* note: use after variable declarations */ -#ifndef XXH_STATIC_ASSERT -# if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11 */ -# define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { _Static_assert((c),m); } while(0) -# elif defined(__cplusplus) && (__cplusplus >= 201103L) /* C++11 */ -# define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { static_assert((c),m); } while(0) -# else -# define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { struct xxh_sa { char x[(c) ? 1 : -1]; }; } while(0) -# endif -# define XXH_STATIC_ASSERT(c) XXH_STATIC_ASSERT_WITH_MESSAGE((c),#c) -#endif - -/*! - * @internal - * @def XXH_COMPILER_GUARD(var) - * @brief Used to prevent unwanted optimizations for @p var. - * - * It uses an empty GCC inline assembly statement with a register constraint - * which forces @p var into a general purpose register (eg eax, ebx, ecx - * on x86) and marks it as modified. - * - * This is used in a few places to avoid unwanted autovectorization (e.g. - * XXH32_round()). All vectorization we want is explicit via intrinsics, - * and _usually_ isn't wanted elsewhere. - * - * We also use it to prevent unwanted constant folding for AArch64 in - * XXH3_initCustomSecret_scalar(). - */ -#if defined(__GNUC__) || defined(__clang__) -# define XXH_COMPILER_GUARD(var) __asm__("" : "+r" (var)) -#else -# define XXH_COMPILER_GUARD(var) ((void)0) -#endif - -/* Specifically for NEON vectors which use the "w" constraint, on - * Clang. */ -#if defined(__clang__) && defined(__ARM_ARCH) && !defined(__wasm__) -# define XXH_COMPILER_GUARD_CLANG_NEON(var) __asm__("" : "+w" (var)) -#else -# define XXH_COMPILER_GUARD_CLANG_NEON(var) ((void)0) -#endif - -/* ************************************* -* Basic Types -***************************************/ -#if !defined (__VMS) \ - && (defined (__cplusplus) \ - || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) -# include - typedef uint8_t xxh_u8; -#else - typedef unsigned char xxh_u8; -#endif -typedef XXH32_hash_t xxh_u32; - -#ifdef XXH_OLD_NAMES -# warning "XXH_OLD_NAMES is planned to be removed starting v0.9. If the program depends on it, consider moving away from it by employing newer type names directly" -# define BYTE xxh_u8 -# define U8 xxh_u8 -# define U32 xxh_u32 -#endif - -/* *** Memory access *** */ - -/*! - * @internal - * @fn xxh_u32 XXH_read32(const void* ptr) - * @brief Reads an unaligned 32-bit integer from @p ptr in native endianness. - * - * Affected by @ref XXH_FORCE_MEMORY_ACCESS. - * - * @param ptr The pointer to read from. - * @return The 32-bit native endian integer from the bytes at @p ptr. - */ - -/*! - * @internal - * @fn xxh_u32 XXH_readLE32(const void* ptr) - * @brief Reads an unaligned 32-bit little endian integer from @p ptr. - * - * Affected by @ref XXH_FORCE_MEMORY_ACCESS. - * - * @param ptr The pointer to read from. - * @return The 32-bit little endian integer from the bytes at @p ptr. - */ - -/*! - * @internal - * @fn xxh_u32 XXH_readBE32(const void* ptr) - * @brief Reads an unaligned 32-bit big endian integer from @p ptr. - * - * Affected by @ref XXH_FORCE_MEMORY_ACCESS. - * - * @param ptr The pointer to read from. - * @return The 32-bit big endian integer from the bytes at @p ptr. - */ - -/*! - * @internal - * @fn xxh_u32 XXH_readLE32_align(const void* ptr, XXH_alignment align) - * @brief Like @ref XXH_readLE32(), but has an option for aligned reads. - * - * Affected by @ref XXH_FORCE_MEMORY_ACCESS. - * Note that when @ref XXH_FORCE_ALIGN_CHECK == 0, the @p align parameter is - * always @ref XXH_alignment::XXH_unaligned. - * - * @param ptr The pointer to read from. - * @param align Whether @p ptr is aligned. - * @pre - * If @p align == @ref XXH_alignment::XXH_aligned, @p ptr must be 4 byte - * aligned. - * @return The 32-bit little endian integer from the bytes at @p ptr. - */ - -#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) -/* - * Manual byteshift. Best for old compilers which don't inline memcpy. - * We actually directly use XXH_readLE32 and XXH_readBE32. - */ -#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) - -/* - * Force direct memory access. Only works on CPU which support unaligned memory - * access in hardware. - */ -static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; } - -#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) - -/* - * __attribute__((aligned(1))) is supported by gcc and clang. Originally the - * documentation claimed that it only increased the alignment, but actually it - * can decrease it on gcc, clang, and icc: - * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502, - * https://gcc.godbolt.org/z/xYez1j67Y. - */ -#ifdef XXH_OLD_NAMES -typedef union { xxh_u32 u32; } __attribute__((packed)) unalign; -#endif -static xxh_u32 XXH_read32(const void* ptr) -{ - typedef __attribute__((aligned(1))) xxh_u32 xxh_unalign32; - return *((const xxh_unalign32*)ptr); -} - -#else - -/* - * Portable and safe solution. Generally efficient. - * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html - */ -static xxh_u32 XXH_read32(const void* memPtr) -{ - xxh_u32 val; - XXH_memcpy(&val, memPtr, sizeof(val)); - return val; -} - -#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ - - -/* *** Endianness *** */ - -/*! - * @ingroup tuning - * @def XXH_CPU_LITTLE_ENDIAN - * @brief Whether the target is little endian. - * - * Defined to 1 if the target is little endian, or 0 if it is big endian. - * It can be defined externally, for example on the compiler command line. - * - * If it is not defined, - * a runtime check (which is usually constant folded) is used instead. - * - * @note - * This is not necessarily defined to an integer constant. - * - * @see XXH_isLittleEndian() for the runtime check. - */ -#ifndef XXH_CPU_LITTLE_ENDIAN -/* - * Try to detect endianness automatically, to avoid the nonstandard behavior - * in `XXH_isLittleEndian()` - */ -# if defined(_WIN32) /* Windows is always little endian */ \ - || defined(__LITTLE_ENDIAN__) \ - || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) -# define XXH_CPU_LITTLE_ENDIAN 1 -# elif defined(__BIG_ENDIAN__) \ - || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) -# define XXH_CPU_LITTLE_ENDIAN 0 -# else -/*! - * @internal - * @brief Runtime check for @ref XXH_CPU_LITTLE_ENDIAN. - * - * Most compilers will constant fold this. - */ -static int XXH_isLittleEndian(void) -{ - /* - * Portable and well-defined behavior. - * Don't use static: it is detrimental to performance. - */ - const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 }; - return one.c[0]; -} -# define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian() -# endif -#endif - - - - -/* **************************************** -* Compiler-specific Functions and Macros -******************************************/ -#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) - -#ifdef __has_builtin -# define XXH_HAS_BUILTIN(x) __has_builtin(x) -#else -# define XXH_HAS_BUILTIN(x) 0 -#endif - - - -/* - * C23 and future versions have standard "unreachable()". - * Once it has been implemented reliably we can add it as an - * additional case: - * - * ``` - * #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= XXH_C23_VN) - * # include - * # ifdef unreachable - * # define XXH_UNREACHABLE() unreachable() - * # endif - * #endif - * ``` - * - * Note C++23 also has std::unreachable() which can be detected - * as follows: - * ``` - * #if defined(__cpp_lib_unreachable) && (__cpp_lib_unreachable >= 202202L) - * # include - * # define XXH_UNREACHABLE() std::unreachable() - * #endif - * ``` - * NB: `__cpp_lib_unreachable` is defined in the `` header. - * We don't use that as including `` in `extern "C"` blocks - * doesn't work on GCC12 - */ - -#if XXH_HAS_BUILTIN(__builtin_unreachable) -# define XXH_UNREACHABLE() __builtin_unreachable() - -#elif defined(_MSC_VER) -# define XXH_UNREACHABLE() __assume(0) - -#else -# define XXH_UNREACHABLE() -#endif - -#if XXH_HAS_BUILTIN(__builtin_assume) -# define XXH_ASSUME(c) __builtin_assume(c) -#else -# define XXH_ASSUME(c) if (!(c)) { XXH_UNREACHABLE(); } -#endif - -/*! - * @internal - * @def XXH_rotl32(x,r) - * @brief 32-bit rotate left. - * - * @param x The 32-bit integer to be rotated. - * @param r The number of bits to rotate. - * @pre - * @p r > 0 && @p r < 32 - * @note - * @p x and @p r may be evaluated multiple times. - * @return The rotated result. - */ -#if !defined(NO_CLANG_BUILTIN) && XXH_HAS_BUILTIN(__builtin_rotateleft32) \ - && XXH_HAS_BUILTIN(__builtin_rotateleft64) -# define XXH_rotl32 __builtin_rotateleft32 -# define XXH_rotl64 __builtin_rotateleft64 -/* Note: although _rotl exists for minGW (GCC under windows), performance seems poor */ -#elif defined(_MSC_VER) -# define XXH_rotl32(x,r) _rotl(x,r) -# define XXH_rotl64(x,r) _rotl64(x,r) -#else -# define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r)))) -# define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r)))) -#endif - -/*! - * @internal - * @fn xxh_u32 XXH_swap32(xxh_u32 x) - * @brief A 32-bit byteswap. - * - * @param x The 32-bit integer to byteswap. - * @return @p x, byteswapped. - */ -#if defined(_MSC_VER) /* Visual Studio */ -# define XXH_swap32 _byteswap_ulong -#elif XXH_GCC_VERSION >= 403 -# define XXH_swap32 __builtin_bswap32 -#else -static xxh_u32 XXH_swap32 (xxh_u32 x) -{ - return ((x << 24) & 0xff000000 ) | - ((x << 8) & 0x00ff0000 ) | - ((x >> 8) & 0x0000ff00 ) | - ((x >> 24) & 0x000000ff ); -} -#endif - - -/* *************************** -* Memory reads -*****************************/ - -/*! - * @internal - * @brief Enum to indicate whether a pointer is aligned. - */ -typedef enum { - XXH_aligned, /*!< Aligned */ - XXH_unaligned /*!< Possibly unaligned */ -} XXH_alignment; - -/* - * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. - * - * This is ideal for older compilers which don't inline memcpy. - */ -#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) - -XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* memPtr) -{ - const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; - return bytePtr[0] - | ((xxh_u32)bytePtr[1] << 8) - | ((xxh_u32)bytePtr[2] << 16) - | ((xxh_u32)bytePtr[3] << 24); -} - -XXH_FORCE_INLINE xxh_u32 XXH_readBE32(const void* memPtr) -{ - const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; - return bytePtr[3] - | ((xxh_u32)bytePtr[2] << 8) - | ((xxh_u32)bytePtr[1] << 16) - | ((xxh_u32)bytePtr[0] << 24); -} - -#else -XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* ptr) -{ - return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr)); -} - -static xxh_u32 XXH_readBE32(const void* ptr) -{ - return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr); -} -#endif - -XXH_FORCE_INLINE xxh_u32 -XXH_readLE32_align(const void* ptr, XXH_alignment align) -{ - if (align==XXH_unaligned) { - return XXH_readLE32(ptr); - } else { - return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u32*)ptr : XXH_swap32(*(const xxh_u32*)ptr); - } -} - - -/* ************************************* -* Misc -***************************************/ -/*! @ingroup public */ -XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; } - - -/* ******************************************************************* -* 32-bit hash functions -*********************************************************************/ -/*! - * @} - * @defgroup XXH32_impl XXH32 implementation - * @ingroup impl - * - * Details on the XXH32 implementation. - * @{ - */ - /* #define instead of static const, to be used as initializers */ -#define XXH_PRIME32_1 0x9E3779B1U /*!< 0b10011110001101110111100110110001 */ -#define XXH_PRIME32_2 0x85EBCA77U /*!< 0b10000101111010111100101001110111 */ -#define XXH_PRIME32_3 0xC2B2AE3DU /*!< 0b11000010101100101010111000111101 */ -#define XXH_PRIME32_4 0x27D4EB2FU /*!< 0b00100111110101001110101100101111 */ -#define XXH_PRIME32_5 0x165667B1U /*!< 0b00010110010101100110011110110001 */ - -#ifdef XXH_OLD_NAMES -# define PRIME32_1 XXH_PRIME32_1 -# define PRIME32_2 XXH_PRIME32_2 -# define PRIME32_3 XXH_PRIME32_3 -# define PRIME32_4 XXH_PRIME32_4 -# define PRIME32_5 XXH_PRIME32_5 -#endif - -/*! - * @internal - * @brief Normal stripe processing routine. - * - * This shuffles the bits so that any bit from @p input impacts several bits in - * @p acc. - * - * @param acc The accumulator lane. - * @param input The stripe of input to mix. - * @return The mixed accumulator lane. - */ -static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input) -{ - acc += input * XXH_PRIME32_2; - acc = XXH_rotl32(acc, 13); - acc *= XXH_PRIME32_1; -#if (defined(__SSE4_1__) || defined(__aarch64__) || defined(__wasm_simd128__)) && !defined(XXH_ENABLE_AUTOVECTORIZE) - /* - * UGLY HACK: - * A compiler fence is the only thing that prevents GCC and Clang from - * autovectorizing the XXH32 loop (pragmas and attributes don't work for some - * reason) without globally disabling SSE4.1. - * - * The reason we want to avoid vectorization is because despite working on - * 4 integers at a time, there are multiple factors slowing XXH32 down on - * SSE4: - * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on - * newer chips!) making it slightly slower to multiply four integers at - * once compared to four integers independently. Even when pmulld was - * fastest, Sandy/Ivy Bridge, it is still not worth it to go into SSE - * just to multiply unless doing a long operation. - * - * - Four instructions are required to rotate, - * movqda tmp, v // not required with VEX encoding - * pslld tmp, 13 // tmp <<= 13 - * psrld v, 19 // x >>= 19 - * por v, tmp // x |= tmp - * compared to one for scalar: - * roll v, 13 // reliably fast across the board - * shldl v, v, 13 // Sandy Bridge and later prefer this for some reason - * - * - Instruction level parallelism is actually more beneficial here because - * the SIMD actually serializes this operation: While v1 is rotating, v2 - * can load data, while v3 can multiply. SSE forces them to operate - * together. - * - * This is also enabled on AArch64, as Clang is *very aggressive* in vectorizing - * the loop. NEON is only faster on the A53, and with the newer cores, it is less - * than half the speed. - * - * Additionally, this is used on WASM SIMD128 because it JITs to the same - * SIMD instructions and has the same issue. - */ - XXH_COMPILER_GUARD(acc); -#endif - return acc; -} - -/*! - * @internal - * @brief Mixes all bits to finalize the hash. - * - * The final mix ensures that all input bits have a chance to impact any bit in - * the output digest, resulting in an unbiased distribution. - * - * @param hash The hash to avalanche. - * @return The avalanched hash. - */ -static xxh_u32 XXH32_avalanche(xxh_u32 hash) -{ - hash ^= hash >> 15; - hash *= XXH_PRIME32_2; - hash ^= hash >> 13; - hash *= XXH_PRIME32_3; - hash ^= hash >> 16; - return hash; -} - -#define XXH_get32bits(p) XXH_readLE32_align(p, align) - -/*! - * @internal - * @brief Processes the last 0-15 bytes of @p ptr. - * - * There may be up to 15 bytes remaining to consume from the input. - * This final stage will digest them to ensure that all input bytes are present - * in the final mix. - * - * @param hash The hash to finalize. - * @param ptr The pointer to the remaining input. - * @param len The remaining length, modulo 16. - * @param align Whether @p ptr is aligned. - * @return The finalized hash. - * @see XXH64_finalize(). - */ -static XXH_PUREF xxh_u32 -XXH32_finalize(xxh_u32 hash, const xxh_u8* ptr, size_t len, XXH_alignment align) -{ -#define XXH_PROCESS1 do { \ - hash += (*ptr++) * XXH_PRIME32_5; \ - hash = XXH_rotl32(hash, 11) * XXH_PRIME32_1; \ -} while (0) - -#define XXH_PROCESS4 do { \ - hash += XXH_get32bits(ptr) * XXH_PRIME32_3; \ - ptr += 4; \ - hash = XXH_rotl32(hash, 17) * XXH_PRIME32_4; \ -} while (0) - - if (ptr==NULL) XXH_ASSERT(len == 0); - - /* Compact rerolled version; generally faster */ - if (!XXH32_ENDJMP) { - len &= 15; - while (len >= 4) { - XXH_PROCESS4; - len -= 4; - } - while (len > 0) { - XXH_PROCESS1; - --len; - } - return XXH32_avalanche(hash); - } else { - switch(len&15) /* or switch(bEnd - p) */ { - case 12: XXH_PROCESS4; - XXH_FALLTHROUGH; /* fallthrough */ - case 8: XXH_PROCESS4; - XXH_FALLTHROUGH; /* fallthrough */ - case 4: XXH_PROCESS4; - return XXH32_avalanche(hash); - - case 13: XXH_PROCESS4; - XXH_FALLTHROUGH; /* fallthrough */ - case 9: XXH_PROCESS4; - XXH_FALLTHROUGH; /* fallthrough */ - case 5: XXH_PROCESS4; - XXH_PROCESS1; - return XXH32_avalanche(hash); - - case 14: XXH_PROCESS4; - XXH_FALLTHROUGH; /* fallthrough */ - case 10: XXH_PROCESS4; - XXH_FALLTHROUGH; /* fallthrough */ - case 6: XXH_PROCESS4; - XXH_PROCESS1; - XXH_PROCESS1; - return XXH32_avalanche(hash); - - case 15: XXH_PROCESS4; - XXH_FALLTHROUGH; /* fallthrough */ - case 11: XXH_PROCESS4; - XXH_FALLTHROUGH; /* fallthrough */ - case 7: XXH_PROCESS4; - XXH_FALLTHROUGH; /* fallthrough */ - case 3: XXH_PROCESS1; - XXH_FALLTHROUGH; /* fallthrough */ - case 2: XXH_PROCESS1; - XXH_FALLTHROUGH; /* fallthrough */ - case 1: XXH_PROCESS1; - XXH_FALLTHROUGH; /* fallthrough */ - case 0: return XXH32_avalanche(hash); - } - XXH_ASSERT(0); - return hash; /* reaching this point is deemed impossible */ - } -} - -#ifdef XXH_OLD_NAMES -# define PROCESS1 XXH_PROCESS1 -# define PROCESS4 XXH_PROCESS4 -#else -# undef XXH_PROCESS1 -# undef XXH_PROCESS4 -#endif - -/*! - * @internal - * @brief The implementation for @ref XXH32(). - * - * @param input , len , seed Directly passed from @ref XXH32(). - * @param align Whether @p input is aligned. - * @return The calculated hash. - */ -XXH_FORCE_INLINE XXH_PUREF xxh_u32 -XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align) -{ - xxh_u32 h32; - - if (input==NULL) XXH_ASSERT(len == 0); - - if (len>=16) { - const xxh_u8* const bEnd = input + len; - const xxh_u8* const limit = bEnd - 15; - xxh_u32 v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2; - xxh_u32 v2 = seed + XXH_PRIME32_2; - xxh_u32 v3 = seed + 0; - xxh_u32 v4 = seed - XXH_PRIME32_1; - - do { - v1 = XXH32_round(v1, XXH_get32bits(input)); input += 4; - v2 = XXH32_round(v2, XXH_get32bits(input)); input += 4; - v3 = XXH32_round(v3, XXH_get32bits(input)); input += 4; - v4 = XXH32_round(v4, XXH_get32bits(input)); input += 4; - } while (input < limit); - - h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) - + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18); - } else { - h32 = seed + XXH_PRIME32_5; - } - - h32 += (xxh_u32)len; - - return XXH32_finalize(h32, input, len&15, align); -} - -/*! @ingroup XXH32_family */ -XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed) -{ -#if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 2 - /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ - XXH32_state_t state; - XXH32_reset(&state, seed); - XXH32_update(&state, (const xxh_u8*)input, len); - return XXH32_digest(&state); -#else - if (XXH_FORCE_ALIGN_CHECK) { - if ((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */ - return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); - } } - - return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); -#endif -} - - - -/******* Hash streaming *******/ -#ifndef XXH_NO_STREAM -/*! @ingroup XXH32_family */ -XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void) -{ - return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t)); -} -/*! @ingroup XXH32_family */ -XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr) -{ - XXH_free(statePtr); - return XXH_OK; -} - -/*! @ingroup XXH32_family */ -XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState) -{ - XXH_memcpy(dstState, srcState, sizeof(*dstState)); -} - -/*! @ingroup XXH32_family */ -XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed) -{ - XXH_ASSERT(statePtr != NULL); - memset(statePtr, 0, sizeof(*statePtr)); - statePtr->v[0] = seed + XXH_PRIME32_1 + XXH_PRIME32_2; - statePtr->v[1] = seed + XXH_PRIME32_2; - statePtr->v[2] = seed + 0; - statePtr->v[3] = seed - XXH_PRIME32_1; - return XXH_OK; -} - - -/*! @ingroup XXH32_family */ -XXH_PUBLIC_API XXH_errorcode -XXH32_update(XXH32_state_t* state, const void* input, size_t len) -{ - if (input==NULL) { - XXH_ASSERT(len == 0); - return XXH_OK; - } - - { const xxh_u8* p = (const xxh_u8*)input; - const xxh_u8* const bEnd = p + len; - - state->total_len_32 += (XXH32_hash_t)len; - state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16)); - - if (state->memsize + len < 16) { /* fill in tmp buffer */ - XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, len); - state->memsize += (XXH32_hash_t)len; - return XXH_OK; - } - - if (state->memsize) { /* some data left from previous update */ - XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, 16-state->memsize); - { const xxh_u32* p32 = state->mem32; - state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p32)); p32++; - state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p32)); p32++; - state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p32)); p32++; - state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p32)); - } - p += 16-state->memsize; - state->memsize = 0; - } - - if (p <= bEnd-16) { - const xxh_u8* const limit = bEnd - 16; - - do { - state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p)); p+=4; - state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p)); p+=4; - state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p)); p+=4; - state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p)); p+=4; - } while (p<=limit); - - } - - if (p < bEnd) { - XXH_memcpy(state->mem32, p, (size_t)(bEnd-p)); - state->memsize = (unsigned)(bEnd-p); - } - } - - return XXH_OK; -} - - -/*! @ingroup XXH32_family */ -XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t* state) -{ - xxh_u32 h32; - - if (state->large_len) { - h32 = XXH_rotl32(state->v[0], 1) - + XXH_rotl32(state->v[1], 7) - + XXH_rotl32(state->v[2], 12) - + XXH_rotl32(state->v[3], 18); - } else { - h32 = state->v[2] /* == seed */ + XXH_PRIME32_5; - } - - h32 += state->total_len_32; - - return XXH32_finalize(h32, (const xxh_u8*)state->mem32, state->memsize, XXH_aligned); -} -#endif /* !XXH_NO_STREAM */ - -/******* Canonical representation *******/ - -/*! - * @ingroup XXH32_family - * The default return values from XXH functions are unsigned 32 and 64 bit - * integers. - * - * The canonical representation uses big endian convention, the same convention - * as human-readable numbers (large digits first). - * - * This way, hash values can be written into a file or buffer, remaining - * comparable across different systems. - * - * The following functions allow transformation of hash values to and from their - * canonical format. - */ -XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash) -{ - XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t)); - if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash); - XXH_memcpy(dst, &hash, sizeof(*dst)); -} -/*! @ingroup XXH32_family */ -XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src) -{ - return XXH_readBE32(src); -} - - -#ifndef XXH_NO_LONG_LONG - -/* ******************************************************************* -* 64-bit hash functions -*********************************************************************/ -/*! - * @} - * @ingroup impl - * @{ - */ -/******* Memory access *******/ - -typedef XXH64_hash_t xxh_u64; - -#ifdef XXH_OLD_NAMES -# define U64 xxh_u64 -#endif - -#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) -/* - * Manual byteshift. Best for old compilers which don't inline memcpy. - * We actually directly use XXH_readLE64 and XXH_readBE64. - */ -#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) - -/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */ -static xxh_u64 XXH_read64(const void* memPtr) -{ - return *(const xxh_u64*) memPtr; -} - -#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) - -/* - * __attribute__((aligned(1))) is supported by gcc and clang. Originally the - * documentation claimed that it only increased the alignment, but actually it - * can decrease it on gcc, clang, and icc: - * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502, - * https://gcc.godbolt.org/z/xYez1j67Y. - */ -#ifdef XXH_OLD_NAMES -typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64; -#endif -static xxh_u64 XXH_read64(const void* ptr) -{ - typedef __attribute__((aligned(1))) xxh_u64 xxh_unalign64; - return *((const xxh_unalign64*)ptr); -} - -#else - -/* - * Portable and safe solution. Generally efficient. - * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html - */ -static xxh_u64 XXH_read64(const void* memPtr) -{ - xxh_u64 val; - XXH_memcpy(&val, memPtr, sizeof(val)); - return val; -} - -#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ - -#if defined(_MSC_VER) /* Visual Studio */ -# define XXH_swap64 _byteswap_uint64 -#elif XXH_GCC_VERSION >= 403 -# define XXH_swap64 __builtin_bswap64 -#else -static xxh_u64 XXH_swap64(xxh_u64 x) -{ - return ((x << 56) & 0xff00000000000000ULL) | - ((x << 40) & 0x00ff000000000000ULL) | - ((x << 24) & 0x0000ff0000000000ULL) | - ((x << 8) & 0x000000ff00000000ULL) | - ((x >> 8) & 0x00000000ff000000ULL) | - ((x >> 24) & 0x0000000000ff0000ULL) | - ((x >> 40) & 0x000000000000ff00ULL) | - ((x >> 56) & 0x00000000000000ffULL); -} -#endif - - -/* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. */ -#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) - -XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* memPtr) -{ - const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; - return bytePtr[0] - | ((xxh_u64)bytePtr[1] << 8) - | ((xxh_u64)bytePtr[2] << 16) - | ((xxh_u64)bytePtr[3] << 24) - | ((xxh_u64)bytePtr[4] << 32) - | ((xxh_u64)bytePtr[5] << 40) - | ((xxh_u64)bytePtr[6] << 48) - | ((xxh_u64)bytePtr[7] << 56); -} - -XXH_FORCE_INLINE xxh_u64 XXH_readBE64(const void* memPtr) -{ - const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; - return bytePtr[7] - | ((xxh_u64)bytePtr[6] << 8) - | ((xxh_u64)bytePtr[5] << 16) - | ((xxh_u64)bytePtr[4] << 24) - | ((xxh_u64)bytePtr[3] << 32) - | ((xxh_u64)bytePtr[2] << 40) - | ((xxh_u64)bytePtr[1] << 48) - | ((xxh_u64)bytePtr[0] << 56); -} - -#else -XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* ptr) -{ - return XXH_CPU_LITTLE_ENDIAN ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr)); -} - -static xxh_u64 XXH_readBE64(const void* ptr) -{ - return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr); -} -#endif - -XXH_FORCE_INLINE xxh_u64 -XXH_readLE64_align(const void* ptr, XXH_alignment align) -{ - if (align==XXH_unaligned) - return XXH_readLE64(ptr); - else - return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u64*)ptr : XXH_swap64(*(const xxh_u64*)ptr); -} - - -/******* xxh64 *******/ -/*! - * @} - * @defgroup XXH64_impl XXH64 implementation - * @ingroup impl - * - * Details on the XXH64 implementation. - * @{ - */ -/* #define rather that static const, to be used as initializers */ -#define XXH_PRIME64_1 0x9E3779B185EBCA87ULL /*!< 0b1001111000110111011110011011000110000101111010111100101010000111 */ -#define XXH_PRIME64_2 0xC2B2AE3D27D4EB4FULL /*!< 0b1100001010110010101011100011110100100111110101001110101101001111 */ -#define XXH_PRIME64_3 0x165667B19E3779F9ULL /*!< 0b0001011001010110011001111011000110011110001101110111100111111001 */ -#define XXH_PRIME64_4 0x85EBCA77C2B2AE63ULL /*!< 0b1000010111101011110010100111011111000010101100101010111001100011 */ -#define XXH_PRIME64_5 0x27D4EB2F165667C5ULL /*!< 0b0010011111010100111010110010111100010110010101100110011111000101 */ - -#ifdef XXH_OLD_NAMES -# define PRIME64_1 XXH_PRIME64_1 -# define PRIME64_2 XXH_PRIME64_2 -# define PRIME64_3 XXH_PRIME64_3 -# define PRIME64_4 XXH_PRIME64_4 -# define PRIME64_5 XXH_PRIME64_5 -#endif - -/*! @copydoc XXH32_round */ -static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input) -{ - acc += input * XXH_PRIME64_2; - acc = XXH_rotl64(acc, 31); - acc *= XXH_PRIME64_1; - return acc; -} - -static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val) -{ - val = XXH64_round(0, val); - acc ^= val; - acc = acc * XXH_PRIME64_1 + XXH_PRIME64_4; - return acc; -} - -/*! @copydoc XXH32_avalanche */ -static xxh_u64 XXH64_avalanche(xxh_u64 hash) -{ - hash ^= hash >> 33; - hash *= XXH_PRIME64_2; - hash ^= hash >> 29; - hash *= XXH_PRIME64_3; - hash ^= hash >> 32; - return hash; -} - - -#define XXH_get64bits(p) XXH_readLE64_align(p, align) - -/*! - * @internal - * @brief Processes the last 0-31 bytes of @p ptr. - * - * There may be up to 31 bytes remaining to consume from the input. - * This final stage will digest them to ensure that all input bytes are present - * in the final mix. - * - * @param hash The hash to finalize. - * @param ptr The pointer to the remaining input. - * @param len The remaining length, modulo 32. - * @param align Whether @p ptr is aligned. - * @return The finalized hash - * @see XXH32_finalize(). - */ -static XXH_PUREF xxh_u64 -XXH64_finalize(xxh_u64 hash, const xxh_u8* ptr, size_t len, XXH_alignment align) -{ - if (ptr==NULL) XXH_ASSERT(len == 0); - len &= 31; - while (len >= 8) { - xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr)); - ptr += 8; - hash ^= k1; - hash = XXH_rotl64(hash,27) * XXH_PRIME64_1 + XXH_PRIME64_4; - len -= 8; - } - if (len >= 4) { - hash ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1; - ptr += 4; - hash = XXH_rotl64(hash, 23) * XXH_PRIME64_2 + XXH_PRIME64_3; - len -= 4; - } - while (len > 0) { - hash ^= (*ptr++) * XXH_PRIME64_5; - hash = XXH_rotl64(hash, 11) * XXH_PRIME64_1; - --len; - } - return XXH64_avalanche(hash); -} - -#ifdef XXH_OLD_NAMES -# define PROCESS1_64 XXH_PROCESS1_64 -# define PROCESS4_64 XXH_PROCESS4_64 -# define PROCESS8_64 XXH_PROCESS8_64 -#else -# undef XXH_PROCESS1_64 -# undef XXH_PROCESS4_64 -# undef XXH_PROCESS8_64 -#endif - -/*! - * @internal - * @brief The implementation for @ref XXH64(). - * - * @param input , len , seed Directly passed from @ref XXH64(). - * @param align Whether @p input is aligned. - * @return The calculated hash. - */ -XXH_FORCE_INLINE XXH_PUREF xxh_u64 -XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align) -{ - xxh_u64 h64; - if (input==NULL) XXH_ASSERT(len == 0); - - if (len>=32) { - const xxh_u8* const bEnd = input + len; - const xxh_u8* const limit = bEnd - 31; - xxh_u64 v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2; - xxh_u64 v2 = seed + XXH_PRIME64_2; - xxh_u64 v3 = seed + 0; - xxh_u64 v4 = seed - XXH_PRIME64_1; - - do { - v1 = XXH64_round(v1, XXH_get64bits(input)); input+=8; - v2 = XXH64_round(v2, XXH_get64bits(input)); input+=8; - v3 = XXH64_round(v3, XXH_get64bits(input)); input+=8; - v4 = XXH64_round(v4, XXH_get64bits(input)); input+=8; - } while (input= 2 - /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ - XXH64_state_t state; - XXH64_reset(&state, seed); - XXH64_update(&state, (const xxh_u8*)input, len); - return XXH64_digest(&state); -#else - if (XXH_FORCE_ALIGN_CHECK) { - if ((((size_t)input) & 7)==0) { /* Input is aligned, let's leverage the speed advantage */ - return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); - } } - - return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); - -#endif -} - -/******* Hash Streaming *******/ -#ifndef XXH_NO_STREAM -/*! @ingroup XXH64_family*/ -XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void) -{ - return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t)); -} -/*! @ingroup XXH64_family */ -XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) -{ - XXH_free(statePtr); - return XXH_OK; -} - -/*! @ingroup XXH64_family */ -XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dstState, const XXH64_state_t* srcState) -{ - XXH_memcpy(dstState, srcState, sizeof(*dstState)); -} - -/*! @ingroup XXH64_family */ -XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed) -{ - XXH_ASSERT(statePtr != NULL); - memset(statePtr, 0, sizeof(*statePtr)); - statePtr->v[0] = seed + XXH_PRIME64_1 + XXH_PRIME64_2; - statePtr->v[1] = seed + XXH_PRIME64_2; - statePtr->v[2] = seed + 0; - statePtr->v[3] = seed - XXH_PRIME64_1; - return XXH_OK; -} - -/*! @ingroup XXH64_family */ -XXH_PUBLIC_API XXH_errorcode -XXH64_update (XXH_NOESCAPE XXH64_state_t* state, XXH_NOESCAPE const void* input, size_t len) -{ - if (input==NULL) { - XXH_ASSERT(len == 0); - return XXH_OK; - } - - { const xxh_u8* p = (const xxh_u8*)input; - const xxh_u8* const bEnd = p + len; - - state->total_len += len; - - if (state->memsize + len < 32) { /* fill in tmp buffer */ - XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, len); - state->memsize += (xxh_u32)len; - return XXH_OK; - } - - if (state->memsize) { /* tmp buffer is full */ - XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, 32-state->memsize); - state->v[0] = XXH64_round(state->v[0], XXH_readLE64(state->mem64+0)); - state->v[1] = XXH64_round(state->v[1], XXH_readLE64(state->mem64+1)); - state->v[2] = XXH64_round(state->v[2], XXH_readLE64(state->mem64+2)); - state->v[3] = XXH64_round(state->v[3], XXH_readLE64(state->mem64+3)); - p += 32 - state->memsize; - state->memsize = 0; - } - - if (p+32 <= bEnd) { - const xxh_u8* const limit = bEnd - 32; - - do { - state->v[0] = XXH64_round(state->v[0], XXH_readLE64(p)); p+=8; - state->v[1] = XXH64_round(state->v[1], XXH_readLE64(p)); p+=8; - state->v[2] = XXH64_round(state->v[2], XXH_readLE64(p)); p+=8; - state->v[3] = XXH64_round(state->v[3], XXH_readLE64(p)); p+=8; - } while (p<=limit); - - } - - if (p < bEnd) { - XXH_memcpy(state->mem64, p, (size_t)(bEnd-p)); - state->memsize = (unsigned)(bEnd-p); - } - } - - return XXH_OK; -} - - -/*! @ingroup XXH64_family */ -XXH_PUBLIC_API XXH64_hash_t XXH64_digest(XXH_NOESCAPE const XXH64_state_t* state) -{ - xxh_u64 h64; - - if (state->total_len >= 32) { - h64 = XXH_rotl64(state->v[0], 1) + XXH_rotl64(state->v[1], 7) + XXH_rotl64(state->v[2], 12) + XXH_rotl64(state->v[3], 18); - h64 = XXH64_mergeRound(h64, state->v[0]); - h64 = XXH64_mergeRound(h64, state->v[1]); - h64 = XXH64_mergeRound(h64, state->v[2]); - h64 = XXH64_mergeRound(h64, state->v[3]); - } else { - h64 = state->v[2] /*seed*/ + XXH_PRIME64_5; - } - - h64 += (xxh_u64) state->total_len; - - return XXH64_finalize(h64, (const xxh_u8*)state->mem64, (size_t)state->total_len, XXH_aligned); -} -#endif /* !XXH_NO_STREAM */ - -/******* Canonical representation *******/ - -/*! @ingroup XXH64_family */ -XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash) -{ - XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t)); - if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash); - XXH_memcpy(dst, &hash, sizeof(*dst)); -} - -/*! @ingroup XXH64_family */ -XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src) -{ - return XXH_readBE64(src); -} - -#ifndef XXH_NO_XXH3 - -/* ********************************************************************* -* XXH3 -* New generation hash designed for speed on small keys and vectorization -************************************************************************ */ -/*! - * @} - * @defgroup XXH3_impl XXH3 implementation - * @ingroup impl - * @{ - */ - -/* === Compiler specifics === */ - -#if ((defined(sun) || defined(__sun)) && __cplusplus) /* Solaris includes __STDC_VERSION__ with C++. Tested with GCC 5.5 */ -# define XXH_RESTRICT /* disable */ -#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */ -# define XXH_RESTRICT restrict -#elif (defined (__GNUC__) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))) \ - || (defined (__clang__)) \ - || (defined (_MSC_VER) && (_MSC_VER >= 1400)) \ - || (defined (__INTEL_COMPILER) && (__INTEL_COMPILER >= 1300)) -/* - * There are a LOT more compilers that recognize __restrict but this - * covers the major ones. - */ -# define XXH_RESTRICT __restrict -#else -# define XXH_RESTRICT /* disable */ -#endif - -#if (defined(__GNUC__) && (__GNUC__ >= 3)) \ - || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \ - || defined(__clang__) -# define XXH_likely(x) __builtin_expect(x, 1) -# define XXH_unlikely(x) __builtin_expect(x, 0) -#else -# define XXH_likely(x) (x) -# define XXH_unlikely(x) (x) -#endif - -#ifndef XXH_HAS_INCLUDE -# ifdef __has_include -# define XXH_HAS_INCLUDE(x) __has_include(x) -# else -# define XXH_HAS_INCLUDE(x) 0 -# endif -#endif - -#if defined(__GNUC__) || defined(__clang__) -# if defined(__ARM_FEATURE_SVE) -# include -# endif -# if defined(__ARM_NEON__) || defined(__ARM_NEON) \ - || (defined(_M_ARM) && _M_ARM >= 7) \ - || defined(_M_ARM64) || defined(_M_ARM64EC) \ - || (defined(__wasm_simd128__) && XXH_HAS_INCLUDE()) /* WASM SIMD128 via SIMDe */ -# define inline __inline__ /* circumvent a clang bug */ -# include -# undef inline -# elif defined(__AVX2__) -# include -# elif defined(__SSE2__) -# include -# endif -#endif - -#if defined(_MSC_VER) -# include -#endif - -/* - * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while - * remaining a true 64-bit/128-bit hash function. - * - * This is done by prioritizing a subset of 64-bit operations that can be - * emulated without too many steps on the average 32-bit machine. - * - * For example, these two lines seem similar, and run equally fast on 64-bit: - * - * xxh_u64 x; - * x ^= (x >> 47); // good - * x ^= (x >> 13); // bad - * - * However, to a 32-bit machine, there is a major difference. - * - * x ^= (x >> 47) looks like this: - * - * x.lo ^= (x.hi >> (47 - 32)); - * - * while x ^= (x >> 13) looks like this: - * - * // note: funnel shifts are not usually cheap. - * x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13)); - * x.hi ^= (x.hi >> 13); - * - * The first one is significantly faster than the second, simply because the - * shift is larger than 32. This means: - * - All the bits we need are in the upper 32 bits, so we can ignore the lower - * 32 bits in the shift. - * - The shift result will always fit in the lower 32 bits, and therefore, - * we can ignore the upper 32 bits in the xor. - * - * Thanks to this optimization, XXH3 only requires these features to be efficient: - * - * - Usable unaligned access - * - A 32-bit or 64-bit ALU - * - If 32-bit, a decent ADC instruction - * - A 32 or 64-bit multiply with a 64-bit result - * - For the 128-bit variant, a decent byteswap helps short inputs. - * - * The first two are already required by XXH32, and almost all 32-bit and 64-bit - * platforms which can run XXH32 can run XXH3 efficiently. - * - * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one - * notable exception. - * - * First of all, Thumb-1 lacks support for the UMULL instruction which - * performs the important long multiply. This means numerous __aeabi_lmul - * calls. - * - * Second of all, the 8 functional registers are just not enough. - * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need - * Lo registers, and this shuffling results in thousands more MOVs than A32. - * - * A32 and T32 don't have this limitation. They can access all 14 registers, - * do a 32->64 multiply with UMULL, and the flexible operand allowing free - * shifts is helpful, too. - * - * Therefore, we do a quick sanity check. - * - * If compiling Thumb-1 for a target which supports ARM instructions, we will - * emit a warning, as it is not a "sane" platform to compile for. - * - * Usually, if this happens, it is because of an accident and you probably need - * to specify -march, as you likely meant to compile for a newer architecture. - * - * Credit: large sections of the vectorial and asm source code paths - * have been contributed by @easyaspi314 - */ -#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM) -# warning "XXH3 is highly inefficient without ARM or Thumb-2." -#endif - -/* ========================================== - * Vectorization detection - * ========================================== */ - -#ifdef XXH_DOXYGEN -/*! - * @ingroup tuning - * @brief Overrides the vectorization implementation chosen for XXH3. - * - * Can be defined to 0 to disable SIMD or any of the values mentioned in - * @ref XXH_VECTOR_TYPE. - * - * If this is not defined, it uses predefined macros to determine the best - * implementation. - */ -# define XXH_VECTOR XXH_SCALAR -/*! - * @ingroup tuning - * @brief Possible values for @ref XXH_VECTOR. - * - * Note that these are actually implemented as macros. - * - * If this is not defined, it is detected automatically. - * internal macro XXH_X86DISPATCH overrides this. - */ -enum XXH_VECTOR_TYPE /* fake enum */ { - XXH_SCALAR = 0, /*!< Portable scalar version */ - XXH_SSE2 = 1, /*!< - * SSE2 for Pentium 4, Opteron, all x86_64. - * - * @note SSE2 is also guaranteed on Windows 10, macOS, and - * Android x86. - */ - XXH_AVX2 = 2, /*!< AVX2 for Haswell and Bulldozer */ - XXH_AVX512 = 3, /*!< AVX512 for Skylake and Icelake */ - XXH_NEON = 4, /*!< - * NEON for most ARMv7-A, all AArch64, and WASM SIMD128 - * via the SIMDeverywhere polyfill provided with the - * Emscripten SDK. - */ - XXH_VSX = 5, /*!< VSX and ZVector for POWER8/z13 (64-bit) */ - XXH_SVE = 6, /*!< SVE for some ARMv8-A and ARMv9-A */ -}; -/*! - * @ingroup tuning - * @brief Selects the minimum alignment for XXH3's accumulators. - * - * When using SIMD, this should match the alignment required for said vector - * type, so, for example, 32 for AVX2. - * - * Default: Auto detected. - */ -# define XXH_ACC_ALIGN 8 -#endif - -/* Actual definition */ -#ifndef XXH_DOXYGEN -# define XXH_SCALAR 0 -# define XXH_SSE2 1 -# define XXH_AVX2 2 -# define XXH_AVX512 3 -# define XXH_NEON 4 -# define XXH_VSX 5 -# define XXH_SVE 6 -#endif - -#ifndef XXH_VECTOR /* can be defined on command line */ -# if defined(__ARM_FEATURE_SVE) -# define XXH_VECTOR XXH_SVE -# elif ( \ - defined(__ARM_NEON__) || defined(__ARM_NEON) /* gcc */ \ - || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) /* msvc */ \ - || (defined(__wasm_simd128__) && XXH_HAS_INCLUDE()) /* wasm simd128 via SIMDe */ \ - ) && ( \ - defined(_WIN32) || defined(__LITTLE_ENDIAN__) /* little endian only */ \ - || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \ - ) -# define XXH_VECTOR XXH_NEON -# elif defined(__AVX512F__) -# define XXH_VECTOR XXH_AVX512 -# elif defined(__AVX2__) -# define XXH_VECTOR XXH_AVX2 -# elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2)) -# define XXH_VECTOR XXH_SSE2 -# elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \ - || (defined(__s390x__) && defined(__VEC__)) \ - && defined(__GNUC__) /* TODO: IBM XL */ -# define XXH_VECTOR XXH_VSX -# else -# define XXH_VECTOR XXH_SCALAR -# endif -#endif - -/* __ARM_FEATURE_SVE is only supported by GCC & Clang. */ -#if (XXH_VECTOR == XXH_SVE) && !defined(__ARM_FEATURE_SVE) -# ifdef _MSC_VER -# pragma warning(once : 4606) -# else -# warning "__ARM_FEATURE_SVE isn't supported. Use SCALAR instead." -# endif -# undef XXH_VECTOR -# define XXH_VECTOR XXH_SCALAR -#endif - -/* - * Controls the alignment of the accumulator, - * for compatibility with aligned vector loads, which are usually faster. - */ -#ifndef XXH_ACC_ALIGN -# if defined(XXH_X86DISPATCH) -# define XXH_ACC_ALIGN 64 /* for compatibility with avx512 */ -# elif XXH_VECTOR == XXH_SCALAR /* scalar */ -# define XXH_ACC_ALIGN 8 -# elif XXH_VECTOR == XXH_SSE2 /* sse2 */ -# define XXH_ACC_ALIGN 16 -# elif XXH_VECTOR == XXH_AVX2 /* avx2 */ -# define XXH_ACC_ALIGN 32 -# elif XXH_VECTOR == XXH_NEON /* neon */ -# define XXH_ACC_ALIGN 16 -# elif XXH_VECTOR == XXH_VSX /* vsx */ -# define XXH_ACC_ALIGN 16 -# elif XXH_VECTOR == XXH_AVX512 /* avx512 */ -# define XXH_ACC_ALIGN 64 -# elif XXH_VECTOR == XXH_SVE /* sve */ -# define XXH_ACC_ALIGN 64 -# endif -#endif - -#if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 \ - || XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512 -# define XXH_SEC_ALIGN XXH_ACC_ALIGN -#elif XXH_VECTOR == XXH_SVE -# define XXH_SEC_ALIGN XXH_ACC_ALIGN -#else -# define XXH_SEC_ALIGN 8 -#endif - -#if defined(__GNUC__) || defined(__clang__) -# define XXH_ALIASING __attribute__((may_alias)) -#else -# define XXH_ALIASING /* nothing */ -#endif - -/* - * UGLY HACK: - * GCC usually generates the best code with -O3 for xxHash. - * - * However, when targeting AVX2, it is overzealous in its unrolling resulting - * in code roughly 3/4 the speed of Clang. - * - * There are other issues, such as GCC splitting _mm256_loadu_si256 into - * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which - * only applies to Sandy and Ivy Bridge... which don't even support AVX2. - * - * That is why when compiling the AVX2 version, it is recommended to use either - * -O2 -mavx2 -march=haswell - * or - * -O2 -mavx2 -mno-avx256-split-unaligned-load - * for decent performance, or to use Clang instead. - * - * Fortunately, we can control the first one with a pragma that forces GCC into - * -O2, but the other one we can't control without "failed to inline always - * inline function due to target mismatch" warnings. - */ -#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ - && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ - && defined(__OPTIMIZE__) && XXH_SIZE_OPT <= 0 /* respect -O0 and -Os */ -# pragma GCC push_options -# pragma GCC optimize("-O2") -#endif - -#if XXH_VECTOR == XXH_NEON - -/* - * UGLY HACK: While AArch64 GCC on Linux does not seem to care, on macOS, GCC -O3 - * optimizes out the entire hashLong loop because of the aliasing violation. - * - * However, GCC is also inefficient at load-store optimization with vld1q/vst1q, - * so the only option is to mark it as aliasing. - */ -typedef uint64x2_t xxh_aliasing_uint64x2_t XXH_ALIASING; - -/*! - * @internal - * @brief `vld1q_u64` but faster and alignment-safe. - * - * On AArch64, unaligned access is always safe, but on ARMv7-a, it is only - * *conditionally* safe (`vld1` has an alignment bit like `movdq[ua]` in x86). - * - * GCC for AArch64 sees `vld1q_u8` as an intrinsic instead of a load, so it - * prohibits load-store optimizations. Therefore, a direct dereference is used. - * - * Otherwise, `vld1q_u8` is used with `vreinterpretq_u8_u64` to do a safe - * unaligned load. - */ -#if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__) -XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) /* silence -Wcast-align */ -{ - return *(xxh_aliasing_uint64x2_t const *)ptr; -} -#else -XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) -{ - return vreinterpretq_u64_u8(vld1q_u8((uint8_t const*)ptr)); -} -#endif - -/*! - * @internal - * @brief `vmlal_u32` on low and high halves of a vector. - * - * This is a workaround for AArch64 GCC < 11 which implemented arm_neon.h with - * inline assembly and were therefore incapable of merging the `vget_{low, high}_u32` - * with `vmlal_u32`. - */ -#if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 11 -XXH_FORCE_INLINE uint64x2_t -XXH_vmlal_low_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs) -{ - /* Inline assembly is the only way */ - __asm__("umlal %0.2d, %1.2s, %2.2s" : "+w" (acc) : "w" (lhs), "w" (rhs)); - return acc; -} -XXH_FORCE_INLINE uint64x2_t -XXH_vmlal_high_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs) -{ - /* This intrinsic works as expected */ - return vmlal_high_u32(acc, lhs, rhs); -} -#else -/* Portable intrinsic versions */ -XXH_FORCE_INLINE uint64x2_t -XXH_vmlal_low_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs) -{ - return vmlal_u32(acc, vget_low_u32(lhs), vget_low_u32(rhs)); -} -/*! @copydoc XXH_vmlal_low_u32 - * Assume the compiler converts this to vmlal_high_u32 on aarch64 */ -XXH_FORCE_INLINE uint64x2_t -XXH_vmlal_high_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs) -{ - return vmlal_u32(acc, vget_high_u32(lhs), vget_high_u32(rhs)); -} -#endif - -/*! - * @ingroup tuning - * @brief Controls the NEON to scalar ratio for XXH3 - * - * This can be set to 2, 4, 6, or 8. - * - * ARM Cortex CPUs are _very_ sensitive to how their pipelines are used. - * - * For example, the Cortex-A73 can dispatch 3 micro-ops per cycle, but only 2 of those - * can be NEON. If you are only using NEON instructions, you are only using 2/3 of the CPU - * bandwidth. - * - * This is even more noticeable on the more advanced cores like the Cortex-A76 which - * can dispatch 8 micro-ops per cycle, but still only 2 NEON micro-ops at once. - * - * Therefore, to make the most out of the pipeline, it is beneficial to run 6 NEON lanes - * and 2 scalar lanes, which is chosen by default. - * - * This does not apply to Apple processors or 32-bit processors, which run better with - * full NEON. These will default to 8. Additionally, size-optimized builds run 8 lanes. - * - * This change benefits CPUs with large micro-op buffers without negatively affecting - * most other CPUs: - * - * | Chipset | Dispatch type | NEON only | 6:2 hybrid | Diff. | - * |:----------------------|:--------------------|----------:|-----------:|------:| - * | Snapdragon 730 (A76) | 2 NEON/8 micro-ops | 8.8 GB/s | 10.1 GB/s | ~16% | - * | Snapdragon 835 (A73) | 2 NEON/3 micro-ops | 5.1 GB/s | 5.3 GB/s | ~5% | - * | Marvell PXA1928 (A53) | In-order dual-issue | 1.9 GB/s | 1.9 GB/s | 0% | - * | Apple M1 | 4 NEON/8 micro-ops | 37.3 GB/s | 36.1 GB/s | ~-3% | - * - * It also seems to fix some bad codegen on GCC, making it almost as fast as clang. - * - * When using WASM SIMD128, if this is 2 or 6, SIMDe will scalarize 2 of the lanes meaning - * it effectively becomes worse 4. - * - * @see XXH3_accumulate_512_neon() - */ -# ifndef XXH3_NEON_LANES -# if (defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) \ - && !defined(__APPLE__) && XXH_SIZE_OPT <= 0 -# define XXH3_NEON_LANES 6 -# else -# define XXH3_NEON_LANES XXH_ACC_NB -# endif -# endif -#endif /* XXH_VECTOR == XXH_NEON */ - -/* - * VSX and Z Vector helpers. - * - * This is very messy, and any pull requests to clean this up are welcome. - * - * There are a lot of problems with supporting VSX and s390x, due to - * inconsistent intrinsics, spotty coverage, and multiple endiannesses. - */ -#if XXH_VECTOR == XXH_VSX -/* Annoyingly, these headers _may_ define three macros: `bool`, `vector`, - * and `pixel`. This is a problem for obvious reasons. - * - * These keywords are unnecessary; the spec literally says they are - * equivalent to `__bool`, `__vector`, and `__pixel` and may be undef'd - * after including the header. - * - * We use pragma push_macro/pop_macro to keep the namespace clean. */ -# pragma push_macro("bool") -# pragma push_macro("vector") -# pragma push_macro("pixel") -/* silence potential macro redefined warnings */ -# undef bool -# undef vector -# undef pixel - -# if defined(__s390x__) -# include -# else -# include -# endif - -/* Restore the original macro values, if applicable. */ -# pragma pop_macro("pixel") -# pragma pop_macro("vector") -# pragma pop_macro("bool") - -typedef __vector unsigned long long xxh_u64x2; -typedef __vector unsigned char xxh_u8x16; -typedef __vector unsigned xxh_u32x4; - -/* - * UGLY HACK: Similar to aarch64 macOS GCC, s390x GCC has the same aliasing issue. - */ -typedef xxh_u64x2 xxh_aliasing_u64x2 XXH_ALIASING; - -# ifndef XXH_VSX_BE -# if defined(__BIG_ENDIAN__) \ - || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) -# define XXH_VSX_BE 1 -# elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__ -# warning "-maltivec=be is not recommended. Please use native endianness." -# define XXH_VSX_BE 1 -# else -# define XXH_VSX_BE 0 -# endif -# endif /* !defined(XXH_VSX_BE) */ - -# if XXH_VSX_BE -# if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__)) -# define XXH_vec_revb vec_revb -# else -/*! - * A polyfill for POWER9's vec_revb(). - */ -XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val) -{ - xxh_u8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, - 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 }; - return vec_perm(val, val, vByteSwap); -} -# endif -# endif /* XXH_VSX_BE */ - -/*! - * Performs an unaligned vector load and byte swaps it on big endian. - */ -XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr) -{ - xxh_u64x2 ret; - XXH_memcpy(&ret, ptr, sizeof(xxh_u64x2)); -# if XXH_VSX_BE - ret = XXH_vec_revb(ret); -# endif - return ret; -} - -/* - * vec_mulo and vec_mule are very problematic intrinsics on PowerPC - * - * These intrinsics weren't added until GCC 8, despite existing for a while, - * and they are endian dependent. Also, their meaning swap depending on version. - * */ -# if defined(__s390x__) - /* s390x is always big endian, no issue on this platform */ -# define XXH_vec_mulo vec_mulo -# define XXH_vec_mule vec_mule -# elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw) && !defined(__ibmxl__) -/* Clang has a better way to control this, we can just use the builtin which doesn't swap. */ - /* The IBM XL Compiler (which defined __clang__) only implements the vec_* operations */ -# define XXH_vec_mulo __builtin_altivec_vmulouw -# define XXH_vec_mule __builtin_altivec_vmuleuw -# else -/* gcc needs inline assembly */ -/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */ -XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b) -{ - xxh_u64x2 result; - __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); - return result; -} -XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b) -{ - xxh_u64x2 result; - __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); - return result; -} -# endif /* XXH_vec_mulo, XXH_vec_mule */ -#endif /* XXH_VECTOR == XXH_VSX */ - -#if XXH_VECTOR == XXH_SVE -#define ACCRND(acc, offset) \ -do { \ - svuint64_t input_vec = svld1_u64(mask, xinput + offset); \ - svuint64_t secret_vec = svld1_u64(mask, xsecret + offset); \ - svuint64_t mixed = sveor_u64_x(mask, secret_vec, input_vec); \ - svuint64_t swapped = svtbl_u64(input_vec, kSwap); \ - svuint64_t mixed_lo = svextw_u64_x(mask, mixed); \ - svuint64_t mixed_hi = svlsr_n_u64_x(mask, mixed, 32); \ - svuint64_t mul = svmad_u64_x(mask, mixed_lo, mixed_hi, swapped); \ - acc = svadd_u64_x(mask, acc, mul); \ -} while (0) -#endif /* XXH_VECTOR == XXH_SVE */ - -/* prefetch - * can be disabled, by declaring XXH_NO_PREFETCH build macro */ -#if defined(XXH_NO_PREFETCH) -# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ -#else -# if XXH_SIZE_OPT >= 1 -# define XXH_PREFETCH(ptr) (void)(ptr) -# elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) /* _mm_prefetch() not defined outside of x86/x64 */ -# include /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ -# define XXH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0) -# elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) ) -# define XXH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) -# else -# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ -# endif -#endif /* XXH_NO_PREFETCH */ - - -/* ========================================== - * XXH3 default settings - * ========================================== */ - -#define XXH_SECRET_DEFAULT_SIZE 192 /* minimum XXH3_SECRET_SIZE_MIN */ - -#if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN) -# error "default keyset is not large enough" -#endif - -/*! Pseudorandom secret taken directly from FARSH. */ -XXH_ALIGN(64) static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = { - 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c, - 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, - 0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21, - 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c, - 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3, - 0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8, - 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d, - 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64, - 0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb, - 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e, - 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce, - 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e, -}; - -static const xxh_u64 PRIME_MX1 = 0x165667919E3779F9ULL; /*!< 0b0001011001010110011001111001000110011110001101110111100111111001 */ -static const xxh_u64 PRIME_MX2 = 0x9FB21C651E98DF25ULL; /*!< 0b1001111110110010000111000110010100011110100110001101111100100101 */ - -#ifdef XXH_OLD_NAMES -# define kSecret XXH3_kSecret -#endif - -#ifdef XXH_DOXYGEN -/*! - * @brief Calculates a 32-bit to 64-bit long multiply. - * - * Implemented as a macro. - * - * Wraps `__emulu` on MSVC x86 because it tends to call `__allmul` when it doesn't - * need to (but it shouldn't need to anyways, it is about 7 instructions to do - * a 64x64 multiply...). Since we know that this will _always_ emit `MULL`, we - * use that instead of the normal method. - * - * If you are compiling for platforms like Thumb-1 and don't have a better option, - * you may also want to write your own long multiply routine here. - * - * @param x, y Numbers to be multiplied - * @return 64-bit product of the low 32 bits of @p x and @p y. - */ -XXH_FORCE_INLINE xxh_u64 -XXH_mult32to64(xxh_u64 x, xxh_u64 y) -{ - return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF); -} -#elif defined(_MSC_VER) && defined(_M_IX86) -# define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y)) -#else -/* - * Downcast + upcast is usually better than masking on older compilers like - * GCC 4.2 (especially 32-bit ones), all without affecting newer compilers. - * - * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands - * and perform a full 64x64 multiply -- entirely redundant on 32-bit. - */ -# define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y)) -#endif - -/*! - * @brief Calculates a 64->128-bit long multiply. - * - * Uses `__uint128_t` and `_umul128` if available, otherwise uses a scalar - * version. - * - * @param lhs , rhs The 64-bit integers to be multiplied - * @return The 128-bit result represented in an @ref XXH128_hash_t. - */ -static XXH128_hash_t -XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) -{ - /* - * GCC/Clang __uint128_t method. - * - * On most 64-bit targets, GCC and Clang define a __uint128_t type. - * This is usually the best way as it usually uses a native long 64-bit - * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64. - * - * Usually. - * - * Despite being a 32-bit platform, Clang (and emscripten) define this type - * despite not having the arithmetic for it. This results in a laggy - * compiler builtin call which calculates a full 128-bit multiply. - * In that case it is best to use the portable one. - * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677 - */ -#if (defined(__GNUC__) || defined(__clang__)) && !defined(__wasm__) \ - && defined(__SIZEOF_INT128__) \ - || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) - - __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs; - XXH128_hash_t r128; - r128.low64 = (xxh_u64)(product); - r128.high64 = (xxh_u64)(product >> 64); - return r128; - - /* - * MSVC for x64's _umul128 method. - * - * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct); - * - * This compiles to single operand MUL on x64. - */ -#elif (defined(_M_X64) || defined(_M_IA64)) && !defined(_M_ARM64EC) - -#ifndef _MSC_VER -# pragma intrinsic(_umul128) -#endif - xxh_u64 product_high; - xxh_u64 const product_low = _umul128(lhs, rhs, &product_high); - XXH128_hash_t r128; - r128.low64 = product_low; - r128.high64 = product_high; - return r128; - - /* - * MSVC for ARM64's __umulh method. - * - * This compiles to the same MUL + UMULH as GCC/Clang's __uint128_t method. - */ -#elif defined(_M_ARM64) || defined(_M_ARM64EC) - -#ifndef _MSC_VER -# pragma intrinsic(__umulh) -#endif - XXH128_hash_t r128; - r128.low64 = lhs * rhs; - r128.high64 = __umulh(lhs, rhs); - return r128; - -#else - /* - * Portable scalar method. Optimized for 32-bit and 64-bit ALUs. - * - * This is a fast and simple grade school multiply, which is shown below - * with base 10 arithmetic instead of base 0x100000000. - * - * 9 3 // D2 lhs = 93 - * x 7 5 // D2 rhs = 75 - * ---------- - * 1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15 - * 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45 - * 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21 - * + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63 - * --------- - * 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27 - * + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67 - * --------- - * 6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975 - * - * The reasons for adding the products like this are: - * 1. It avoids manual carry tracking. Just like how - * (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX. - * This avoids a lot of complexity. - * - * 2. It hints for, and on Clang, compiles to, the powerful UMAAL - * instruction available in ARM's Digital Signal Processing extension - * in 32-bit ARMv6 and later, which is shown below: - * - * void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm) - * { - * xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm; - * *RdLo = (xxh_u32)(product & 0xFFFFFFFF); - * *RdHi = (xxh_u32)(product >> 32); - * } - * - * This instruction was designed for efficient long multiplication, and - * allows this to be calculated in only 4 instructions at speeds - * comparable to some 64-bit ALUs. - * - * 3. It isn't terrible on other platforms. Usually this will be a couple - * of 32-bit ADD/ADCs. - */ - - /* First calculate all of the cross products. */ - xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF); - xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF); - xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32); - xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32); - - /* Now add the products together. These will never overflow. */ - xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; - xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; - xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); - - XXH128_hash_t r128; - r128.low64 = lower; - r128.high64 = upper; - return r128; -#endif -} - -/*! - * @brief Calculates a 64-bit to 128-bit multiply, then XOR folds it. - * - * The reason for the separate function is to prevent passing too many structs - * around by value. This will hopefully inline the multiply, but we don't force it. - * - * @param lhs , rhs The 64-bit integers to multiply - * @return The low 64 bits of the product XOR'd by the high 64 bits. - * @see XXH_mult64to128() - */ -static xxh_u64 -XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs) -{ - XXH128_hash_t product = XXH_mult64to128(lhs, rhs); - return product.low64 ^ product.high64; -} - -/*! Seems to produce slightly better code on GCC for some reason. */ -XXH_FORCE_INLINE XXH_CONSTF xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift) -{ - XXH_ASSERT(0 <= shift && shift < 64); - return v64 ^ (v64 >> shift); -} - -/* - * This is a fast avalanche stage, - * suitable when input bits are already partially mixed - */ -static XXH64_hash_t XXH3_avalanche(xxh_u64 h64) -{ - h64 = XXH_xorshift64(h64, 37); - h64 *= PRIME_MX1; - h64 = XXH_xorshift64(h64, 32); - return h64; -} - -/* - * This is a stronger avalanche, - * inspired by Pelle Evensen's rrmxmx - * preferable when input has not been previously mixed - */ -static XXH64_hash_t XXH3_rrmxmx(xxh_u64 h64, xxh_u64 len) -{ - /* this mix is inspired by Pelle Evensen's rrmxmx */ - h64 ^= XXH_rotl64(h64, 49) ^ XXH_rotl64(h64, 24); - h64 *= PRIME_MX2; - h64 ^= (h64 >> 35) + len ; - h64 *= PRIME_MX2; - return XXH_xorshift64(h64, 28); -} - - -/* ========================================== - * Short keys - * ========================================== - * One of the shortcomings of XXH32 and XXH64 was that their performance was - * sub-optimal on short lengths. It used an iterative algorithm which strongly - * favored lengths that were a multiple of 4 or 8. - * - * Instead of iterating over individual inputs, we use a set of single shot - * functions which piece together a range of lengths and operate in constant time. - * - * Additionally, the number of multiplies has been significantly reduced. This - * reduces latency, especially when emulating 64-bit multiplies on 32-bit. - * - * Depending on the platform, this may or may not be faster than XXH32, but it - * is almost guaranteed to be faster than XXH64. - */ - -/* - * At very short lengths, there isn't enough input to fully hide secrets, or use - * the entire secret. - * - * There is also only a limited amount of mixing we can do before significantly - * impacting performance. - * - * Therefore, we use different sections of the secret and always mix two secret - * samples with an XOR. This should have no effect on performance on the - * seedless or withSeed variants because everything _should_ be constant folded - * by modern compilers. - * - * The XOR mixing hides individual parts of the secret and increases entropy. - * - * This adds an extra layer of strength for custom secrets. - */ -XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t -XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) -{ - XXH_ASSERT(input != NULL); - XXH_ASSERT(1 <= len && len <= 3); - XXH_ASSERT(secret != NULL); - /* - * len = 1: combined = { input[0], 0x01, input[0], input[0] } - * len = 2: combined = { input[1], 0x02, input[0], input[1] } - * len = 3: combined = { input[2], 0x03, input[0], input[1] } - */ - { xxh_u8 const c1 = input[0]; - xxh_u8 const c2 = input[len >> 1]; - xxh_u8 const c3 = input[len - 1]; - xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24) - | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); - xxh_u64 const bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; - xxh_u64 const keyed = (xxh_u64)combined ^ bitflip; - return XXH64_avalanche(keyed); - } -} - -XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t -XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) -{ - XXH_ASSERT(input != NULL); - XXH_ASSERT(secret != NULL); - XXH_ASSERT(4 <= len && len <= 8); - seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; - { xxh_u32 const input1 = XXH_readLE32(input); - xxh_u32 const input2 = XXH_readLE32(input + len - 4); - xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed; - xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32); - xxh_u64 const keyed = input64 ^ bitflip; - return XXH3_rrmxmx(keyed, len); - } -} - -XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t -XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) -{ - XXH_ASSERT(input != NULL); - XXH_ASSERT(secret != NULL); - XXH_ASSERT(9 <= len && len <= 16); - { xxh_u64 const bitflip1 = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed; - xxh_u64 const bitflip2 = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed; - xxh_u64 const input_lo = XXH_readLE64(input) ^ bitflip1; - xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2; - xxh_u64 const acc = len - + XXH_swap64(input_lo) + input_hi - + XXH3_mul128_fold64(input_lo, input_hi); - return XXH3_avalanche(acc); - } -} - -XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t -XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) -{ - XXH_ASSERT(len <= 16); - { if (XXH_likely(len > 8)) return XXH3_len_9to16_64b(input, len, secret, seed); - if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed); - if (len) return XXH3_len_1to3_64b(input, len, secret, seed); - return XXH64_avalanche(seed ^ (XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64))); - } -} - -/* - * DISCLAIMER: There are known *seed-dependent* multicollisions here due to - * multiplication by zero, affecting hashes of lengths 17 to 240. - * - * However, they are very unlikely. - * - * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all - * unseeded non-cryptographic hashes, it does not attempt to defend itself - * against specially crafted inputs, only random inputs. - * - * Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes - * cancelling out the secret is taken an arbitrary number of times (addressed - * in XXH3_accumulate_512), this collision is very unlikely with random inputs - * and/or proper seeding: - * - * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a - * function that is only called up to 16 times per hash with up to 240 bytes of - * input. - * - * This is not too bad for a non-cryptographic hash function, especially with - * only 64 bit outputs. - * - * The 128-bit variant (which trades some speed for strength) is NOT affected - * by this, although it is always a good idea to use a proper seed if you care - * about strength. - */ -XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input, - const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64) -{ -#if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ - && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */ \ - && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable like XXH32 hack */ - /* - * UGLY HACK: - * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in - * slower code. - * - * By forcing seed64 into a register, we disrupt the cost model and - * cause it to scalarize. See `XXH32_round()` - * - * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600, - * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on - * GCC 9.2, despite both emitting scalar code. - * - * GCC generates much better scalar code than Clang for the rest of XXH3, - * which is why finding a more optimal codepath is an interest. - */ - XXH_COMPILER_GUARD(seed64); -#endif - { xxh_u64 const input_lo = XXH_readLE64(input); - xxh_u64 const input_hi = XXH_readLE64(input+8); - return XXH3_mul128_fold64( - input_lo ^ (XXH_readLE64(secret) + seed64), - input_hi ^ (XXH_readLE64(secret+8) - seed64) - ); - } -} - -/* For mid range keys, XXH3 uses a Mum-hash variant. */ -XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t -XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len, - const xxh_u8* XXH_RESTRICT secret, size_t secretSize, - XXH64_hash_t seed) -{ - XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; - XXH_ASSERT(16 < len && len <= 128); - - { xxh_u64 acc = len * XXH_PRIME64_1; -#if XXH_SIZE_OPT >= 1 - /* Smaller and cleaner, but slightly slower. */ - unsigned int i = (unsigned int)(len - 1) / 32; - do { - acc += XXH3_mix16B(input+16 * i, secret+32*i, seed); - acc += XXH3_mix16B(input+len-16*(i+1), secret+32*i+16, seed); - } while (i-- != 0); -#else - if (len > 32) { - if (len > 64) { - if (len > 96) { - acc += XXH3_mix16B(input+48, secret+96, seed); - acc += XXH3_mix16B(input+len-64, secret+112, seed); - } - acc += XXH3_mix16B(input+32, secret+64, seed); - acc += XXH3_mix16B(input+len-48, secret+80, seed); - } - acc += XXH3_mix16B(input+16, secret+32, seed); - acc += XXH3_mix16B(input+len-32, secret+48, seed); - } - acc += XXH3_mix16B(input+0, secret+0, seed); - acc += XXH3_mix16B(input+len-16, secret+16, seed); -#endif - return XXH3_avalanche(acc); - } -} - -#define XXH3_MIDSIZE_MAX 240 - -XXH_NO_INLINE XXH_PUREF XXH64_hash_t -XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len, - const xxh_u8* XXH_RESTRICT secret, size_t secretSize, - XXH64_hash_t seed) -{ - XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; - XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); - - #define XXH3_MIDSIZE_STARTOFFSET 3 - #define XXH3_MIDSIZE_LASTOFFSET 17 - - { xxh_u64 acc = len * XXH_PRIME64_1; - xxh_u64 acc_end; - unsigned int const nbRounds = (unsigned int)len / 16; - unsigned int i; - XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); - for (i=0; i<8; i++) { - acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed); - } - /* last bytes */ - acc_end = XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed); - XXH_ASSERT(nbRounds >= 8); - acc = XXH3_avalanche(acc); -#if defined(__clang__) /* Clang */ \ - && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ - && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ - /* - * UGLY HACK: - * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86. - * In everywhere else, it uses scalar code. - * - * For 64->128-bit multiplies, even if the NEON was 100% optimal, it - * would still be slower than UMAAL (see XXH_mult64to128). - * - * Unfortunately, Clang doesn't handle the long multiplies properly and - * converts them to the nonexistent "vmulq_u64" intrinsic, which is then - * scalarized into an ugly mess of VMOV.32 instructions. - * - * This mess is difficult to avoid without turning autovectorization - * off completely, but they are usually relatively minor and/or not - * worth it to fix. - * - * This loop is the easiest to fix, as unlike XXH32, this pragma - * _actually works_ because it is a loop vectorization instead of an - * SLP vectorization. - */ - #pragma clang loop vectorize(disable) -#endif - for (i=8 ; i < nbRounds; i++) { - /* - * Prevents clang for unrolling the acc loop and interleaving with this one. - */ - XXH_COMPILER_GUARD(acc); - acc_end += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed); - } - return XXH3_avalanche(acc + acc_end); - } -} - - -/* ======= Long Keys ======= */ - -#define XXH_STRIPE_LEN 64 -#define XXH_SECRET_CONSUME_RATE 8 /* nb of secret bytes consumed at each accumulation */ -#define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64)) - -#ifdef XXH_OLD_NAMES -# define STRIPE_LEN XXH_STRIPE_LEN -# define ACC_NB XXH_ACC_NB -#endif - -#ifndef XXH_PREFETCH_DIST -# ifdef __clang__ -# define XXH_PREFETCH_DIST 320 -# else -# if (XXH_VECTOR == XXH_AVX512) -# define XXH_PREFETCH_DIST 512 -# else -# define XXH_PREFETCH_DIST 384 -# endif -# endif /* __clang__ */ -#endif /* XXH_PREFETCH_DIST */ - -/* - * These macros are to generate an XXH3_accumulate() function. - * The two arguments select the name suffix and target attribute. - * - * The name of this symbol is XXH3_accumulate_() and it calls - * XXH3_accumulate_512_(). - * - * It may be useful to hand implement this function if the compiler fails to - * optimize the inline function. - */ -#define XXH3_ACCUMULATE_TEMPLATE(name) \ -void \ -XXH3_accumulate_##name(xxh_u64* XXH_RESTRICT acc, \ - const xxh_u8* XXH_RESTRICT input, \ - const xxh_u8* XXH_RESTRICT secret, \ - size_t nbStripes) \ -{ \ - size_t n; \ - for (n = 0; n < nbStripes; n++ ) { \ - const xxh_u8* const in = input + n*XXH_STRIPE_LEN; \ - XXH_PREFETCH(in + XXH_PREFETCH_DIST); \ - XXH3_accumulate_512_##name( \ - acc, \ - in, \ - secret + n*XXH_SECRET_CONSUME_RATE); \ - } \ -} - - -XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64) -{ - if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64); - XXH_memcpy(dst, &v64, sizeof(v64)); -} - -/* Several intrinsic functions below are supposed to accept __int64 as argument, - * as documented in https://software.intel.com/sites/landingpage/IntrinsicsGuide/ . - * However, several environments do not define __int64 type, - * requiring a workaround. - */ -#if !defined (__VMS) \ - && (defined (__cplusplus) \ - || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) - typedef int64_t xxh_i64; -#else - /* the following type must have a width of 64-bit */ - typedef long long xxh_i64; -#endif - - -/* - * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized. - * - * It is a hardened version of UMAC, based off of FARSH's implementation. - * - * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD - * implementations, and it is ridiculously fast. - * - * We harden it by mixing the original input to the accumulators as well as the product. - * - * This means that in the (relatively likely) case of a multiply by zero, the - * original input is preserved. - * - * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve - * cross-pollination, as otherwise the upper and lower halves would be - * essentially independent. - * - * This doesn't matter on 64-bit hashes since they all get merged together in - * the end, so we skip the extra step. - * - * Both XXH3_64bits and XXH3_128bits use this subroutine. - */ - -#if (XXH_VECTOR == XXH_AVX512) \ - || (defined(XXH_DISPATCH_AVX512) && XXH_DISPATCH_AVX512 != 0) - -#ifndef XXH_TARGET_AVX512 -# define XXH_TARGET_AVX512 /* disable attribute target */ -#endif - -XXH_FORCE_INLINE XXH_TARGET_AVX512 void -XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc, - const void* XXH_RESTRICT input, - const void* XXH_RESTRICT secret) -{ - __m512i* const xacc = (__m512i *) acc; - XXH_ASSERT((((size_t)acc) & 63) == 0); - XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); - - { - /* data_vec = input[0]; */ - __m512i const data_vec = _mm512_loadu_si512 (input); - /* key_vec = secret[0]; */ - __m512i const key_vec = _mm512_loadu_si512 (secret); - /* data_key = data_vec ^ key_vec; */ - __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); - /* data_key_lo = data_key >> 32; */ - __m512i const data_key_lo = _mm512_srli_epi64 (data_key, 32); - /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ - __m512i const product = _mm512_mul_epu32 (data_key, data_key_lo); - /* xacc[0] += swap(data_vec); */ - __m512i const data_swap = _mm512_shuffle_epi32(data_vec, (_MM_PERM_ENUM)_MM_SHUFFLE(1, 0, 3, 2)); - __m512i const sum = _mm512_add_epi64(*xacc, data_swap); - /* xacc[0] += product; */ - *xacc = _mm512_add_epi64(product, sum); - } -} -XXH_FORCE_INLINE XXH_TARGET_AVX512 XXH3_ACCUMULATE_TEMPLATE(avx512) - -/* - * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing. - * - * Multiplication isn't perfect, as explained by Google in HighwayHash: - * - * // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to - * // varying degrees. In descending order of goodness, bytes - * // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32. - * // As expected, the upper and lower bytes are much worse. - * - * Source: https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291 - * - * Since our algorithm uses a pseudorandom secret to add some variance into the - * mix, we don't need to (or want to) mix as often or as much as HighwayHash does. - * - * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid - * extraction. - * - * Both XXH3_64bits and XXH3_128bits use this subroutine. - */ - -XXH_FORCE_INLINE XXH_TARGET_AVX512 void -XXH3_scrambleAcc_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) -{ - XXH_ASSERT((((size_t)acc) & 63) == 0); - XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); - { __m512i* const xacc = (__m512i*) acc; - const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1); - - /* xacc[0] ^= (xacc[0] >> 47) */ - __m512i const acc_vec = *xacc; - __m512i const shifted = _mm512_srli_epi64 (acc_vec, 47); - /* xacc[0] ^= secret; */ - __m512i const key_vec = _mm512_loadu_si512 (secret); - __m512i const data_key = _mm512_ternarylogic_epi32(key_vec, acc_vec, shifted, 0x96 /* key_vec ^ acc_vec ^ shifted */); - - /* xacc[0] *= XXH_PRIME32_1; */ - __m512i const data_key_hi = _mm512_srli_epi64 (data_key, 32); - __m512i const prod_lo = _mm512_mul_epu32 (data_key, prime32); - __m512i const prod_hi = _mm512_mul_epu32 (data_key_hi, prime32); - *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32)); - } -} - -XXH_FORCE_INLINE XXH_TARGET_AVX512 void -XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64) -{ - XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 63) == 0); - XXH_STATIC_ASSERT(XXH_SEC_ALIGN == 64); - XXH_ASSERT(((size_t)customSecret & 63) == 0); - (void)(&XXH_writeLE64); - { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i); - __m512i const seed_pos = _mm512_set1_epi64((xxh_i64)seed64); - __m512i const seed = _mm512_mask_sub_epi64(seed_pos, 0xAA, _mm512_set1_epi8(0), seed_pos); - - const __m512i* const src = (const __m512i*) ((const void*) XXH3_kSecret); - __m512i* const dest = ( __m512i*) customSecret; - int i; - XXH_ASSERT(((size_t)src & 63) == 0); /* control alignment */ - XXH_ASSERT(((size_t)dest & 63) == 0); - for (i=0; i < nbRounds; ++i) { - dest[i] = _mm512_add_epi64(_mm512_load_si512(src + i), seed); - } } -} - -#endif - -#if (XXH_VECTOR == XXH_AVX2) \ - || (defined(XXH_DISPATCH_AVX2) && XXH_DISPATCH_AVX2 != 0) - -#ifndef XXH_TARGET_AVX2 -# define XXH_TARGET_AVX2 /* disable attribute target */ -#endif - -XXH_FORCE_INLINE XXH_TARGET_AVX2 void -XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc, - const void* XXH_RESTRICT input, - const void* XXH_RESTRICT secret) -{ - XXH_ASSERT((((size_t)acc) & 31) == 0); - { __m256i* const xacc = (__m256i *) acc; - /* Unaligned. This is mainly for pointer arithmetic, and because - * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ - const __m256i* const xinput = (const __m256i *) input; - /* Unaligned. This is mainly for pointer arithmetic, and because - * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ - const __m256i* const xsecret = (const __m256i *) secret; - - size_t i; - for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) { - /* data_vec = xinput[i]; */ - __m256i const data_vec = _mm256_loadu_si256 (xinput+i); - /* key_vec = xsecret[i]; */ - __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); - /* data_key = data_vec ^ key_vec; */ - __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); - /* data_key_lo = data_key >> 32; */ - __m256i const data_key_lo = _mm256_srli_epi64 (data_key, 32); - /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ - __m256i const product = _mm256_mul_epu32 (data_key, data_key_lo); - /* xacc[i] += swap(data_vec); */ - __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2)); - __m256i const sum = _mm256_add_epi64(xacc[i], data_swap); - /* xacc[i] += product; */ - xacc[i] = _mm256_add_epi64(product, sum); - } } -} -XXH_FORCE_INLINE XXH_TARGET_AVX2 XXH3_ACCUMULATE_TEMPLATE(avx2) - -XXH_FORCE_INLINE XXH_TARGET_AVX2 void -XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) -{ - XXH_ASSERT((((size_t)acc) & 31) == 0); - { __m256i* const xacc = (__m256i*) acc; - /* Unaligned. This is mainly for pointer arithmetic, and because - * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ - const __m256i* const xsecret = (const __m256i *) secret; - const __m256i prime32 = _mm256_set1_epi32((int)XXH_PRIME32_1); - - size_t i; - for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) { - /* xacc[i] ^= (xacc[i] >> 47) */ - __m256i const acc_vec = xacc[i]; - __m256i const shifted = _mm256_srli_epi64 (acc_vec, 47); - __m256i const data_vec = _mm256_xor_si256 (acc_vec, shifted); - /* xacc[i] ^= xsecret; */ - __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); - __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); - - /* xacc[i] *= XXH_PRIME32_1; */ - __m256i const data_key_hi = _mm256_srli_epi64 (data_key, 32); - __m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32); - __m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32); - xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32)); - } - } -} - -XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTRICT customSecret, xxh_u64 seed64) -{ - XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 31) == 0); - XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE / sizeof(__m256i)) == 6); - XXH_STATIC_ASSERT(XXH_SEC_ALIGN <= 64); - (void)(&XXH_writeLE64); - XXH_PREFETCH(customSecret); - { __m256i const seed = _mm256_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64, (xxh_i64)(0U - seed64), (xxh_i64)seed64); - - const __m256i* const src = (const __m256i*) ((const void*) XXH3_kSecret); - __m256i* dest = ( __m256i*) customSecret; - -# if defined(__GNUC__) || defined(__clang__) - /* - * On GCC & Clang, marking 'dest' as modified will cause the compiler: - * - do not extract the secret from sse registers in the internal loop - * - use less common registers, and avoid pushing these reg into stack - */ - XXH_COMPILER_GUARD(dest); -# endif - XXH_ASSERT(((size_t)src & 31) == 0); /* control alignment */ - XXH_ASSERT(((size_t)dest & 31) == 0); - - /* GCC -O2 need unroll loop manually */ - dest[0] = _mm256_add_epi64(_mm256_load_si256(src+0), seed); - dest[1] = _mm256_add_epi64(_mm256_load_si256(src+1), seed); - dest[2] = _mm256_add_epi64(_mm256_load_si256(src+2), seed); - dest[3] = _mm256_add_epi64(_mm256_load_si256(src+3), seed); - dest[4] = _mm256_add_epi64(_mm256_load_si256(src+4), seed); - dest[5] = _mm256_add_epi64(_mm256_load_si256(src+5), seed); - } -} - -#endif - -/* x86dispatch always generates SSE2 */ -#if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH) - -#ifndef XXH_TARGET_SSE2 -# define XXH_TARGET_SSE2 /* disable attribute target */ -#endif - -XXH_FORCE_INLINE XXH_TARGET_SSE2 void -XXH3_accumulate_512_sse2( void* XXH_RESTRICT acc, - const void* XXH_RESTRICT input, - const void* XXH_RESTRICT secret) -{ - /* SSE2 is just a half-scale version of the AVX2 version. */ - XXH_ASSERT((((size_t)acc) & 15) == 0); - { __m128i* const xacc = (__m128i *) acc; - /* Unaligned. This is mainly for pointer arithmetic, and because - * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ - const __m128i* const xinput = (const __m128i *) input; - /* Unaligned. This is mainly for pointer arithmetic, and because - * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ - const __m128i* const xsecret = (const __m128i *) secret; - - size_t i; - for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) { - /* data_vec = xinput[i]; */ - __m128i const data_vec = _mm_loadu_si128 (xinput+i); - /* key_vec = xsecret[i]; */ - __m128i const key_vec = _mm_loadu_si128 (xsecret+i); - /* data_key = data_vec ^ key_vec; */ - __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); - /* data_key_lo = data_key >> 32; */ - __m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); - /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ - __m128i const product = _mm_mul_epu32 (data_key, data_key_lo); - /* xacc[i] += swap(data_vec); */ - __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2)); - __m128i const sum = _mm_add_epi64(xacc[i], data_swap); - /* xacc[i] += product; */ - xacc[i] = _mm_add_epi64(product, sum); - } } -} -XXH_FORCE_INLINE XXH_TARGET_SSE2 XXH3_ACCUMULATE_TEMPLATE(sse2) - -XXH_FORCE_INLINE XXH_TARGET_SSE2 void -XXH3_scrambleAcc_sse2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) -{ - XXH_ASSERT((((size_t)acc) & 15) == 0); - { __m128i* const xacc = (__m128i*) acc; - /* Unaligned. This is mainly for pointer arithmetic, and because - * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ - const __m128i* const xsecret = (const __m128i *) secret; - const __m128i prime32 = _mm_set1_epi32((int)XXH_PRIME32_1); - - size_t i; - for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) { - /* xacc[i] ^= (xacc[i] >> 47) */ - __m128i const acc_vec = xacc[i]; - __m128i const shifted = _mm_srli_epi64 (acc_vec, 47); - __m128i const data_vec = _mm_xor_si128 (acc_vec, shifted); - /* xacc[i] ^= xsecret[i]; */ - __m128i const key_vec = _mm_loadu_si128 (xsecret+i); - __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); - - /* xacc[i] *= XXH_PRIME32_1; */ - __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); - __m128i const prod_lo = _mm_mul_epu32 (data_key, prime32); - __m128i const prod_hi = _mm_mul_epu32 (data_key_hi, prime32); - xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32)); - } - } -} - -XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(void* XXH_RESTRICT customSecret, xxh_u64 seed64) -{ - XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0); - (void)(&XXH_writeLE64); - { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i); - -# if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900 - /* MSVC 32bit mode does not support _mm_set_epi64x before 2015 */ - XXH_ALIGN(16) const xxh_i64 seed64x2[2] = { (xxh_i64)seed64, (xxh_i64)(0U - seed64) }; - __m128i const seed = _mm_load_si128((__m128i const*)seed64x2); -# else - __m128i const seed = _mm_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64); -# endif - int i; - - const void* const src16 = XXH3_kSecret; - __m128i* dst16 = (__m128i*) customSecret; -# if defined(__GNUC__) || defined(__clang__) - /* - * On GCC & Clang, marking 'dest' as modified will cause the compiler: - * - do not extract the secret from sse registers in the internal loop - * - use less common registers, and avoid pushing these reg into stack - */ - XXH_COMPILER_GUARD(dst16); -# endif - XXH_ASSERT(((size_t)src16 & 15) == 0); /* control alignment */ - XXH_ASSERT(((size_t)dst16 & 15) == 0); - - for (i=0; i < nbRounds; ++i) { - dst16[i] = _mm_add_epi64(_mm_load_si128((const __m128i *)src16+i), seed); - } } -} - -#endif - -#if (XXH_VECTOR == XXH_NEON) - -/* forward declarations for the scalar routines */ -XXH_FORCE_INLINE void -XXH3_scalarRound(void* XXH_RESTRICT acc, void const* XXH_RESTRICT input, - void const* XXH_RESTRICT secret, size_t lane); - -XXH_FORCE_INLINE void -XXH3_scalarScrambleRound(void* XXH_RESTRICT acc, - void const* XXH_RESTRICT secret, size_t lane); - -/*! - * @internal - * @brief The bulk processing loop for NEON and WASM SIMD128. - * - * The NEON code path is actually partially scalar when running on AArch64. This - * is to optimize the pipelining and can have up to 15% speedup depending on the - * CPU, and it also mitigates some GCC codegen issues. - * - * @see XXH3_NEON_LANES for configuring this and details about this optimization. - * - * NEON's 32-bit to 64-bit long multiply takes a half vector of 32-bit - * integers instead of the other platforms which mask full 64-bit vectors, - * so the setup is more complicated than just shifting right. - * - * Additionally, there is an optimization for 4 lanes at once noted below. - * - * Since, as stated, the most optimal amount of lanes for Cortexes is 6, - * there needs to be *three* versions of the accumulate operation used - * for the remaining 2 lanes. - * - * WASM's SIMD128 uses SIMDe's arm_neon.h polyfill because the intrinsics overlap - * nearly perfectly. - */ - -XXH_FORCE_INLINE void -XXH3_accumulate_512_neon( void* XXH_RESTRICT acc, - const void* XXH_RESTRICT input, - const void* XXH_RESTRICT secret) -{ - XXH_ASSERT((((size_t)acc) & 15) == 0); - XXH_STATIC_ASSERT(XXH3_NEON_LANES > 0 && XXH3_NEON_LANES <= XXH_ACC_NB && XXH3_NEON_LANES % 2 == 0); - { /* GCC for darwin arm64 does not like aliasing here */ - xxh_aliasing_uint64x2_t* const xacc = (xxh_aliasing_uint64x2_t*) acc; - /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */ - uint8_t const* xinput = (const uint8_t *) input; - uint8_t const* xsecret = (const uint8_t *) secret; - - size_t i; -#ifdef __wasm_simd128__ - /* - * On WASM SIMD128, Clang emits direct address loads when XXH3_kSecret - * is constant propagated, which results in it converting it to this - * inside the loop: - * - * a = v128.load(XXH3_kSecret + 0 + $secret_offset, offset = 0) - * b = v128.load(XXH3_kSecret + 16 + $secret_offset, offset = 0) - * ... - * - * This requires a full 32-bit address immediate (and therefore a 6 byte - * instruction) as well as an add for each offset. - * - * Putting an asm guard prevents it from folding (at the cost of losing - * the alignment hint), and uses the free offset in `v128.load` instead - * of adding secret_offset each time which overall reduces code size by - * about a kilobyte and improves performance. - */ - XXH_COMPILER_GUARD(xsecret); -#endif - /* Scalar lanes use the normal scalarRound routine */ - for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) { - XXH3_scalarRound(acc, input, secret, i); - } - i = 0; - /* 4 NEON lanes at a time. */ - for (; i+1 < XXH3_NEON_LANES / 2; i+=2) { - /* data_vec = xinput[i]; */ - uint64x2_t data_vec_1 = XXH_vld1q_u64(xinput + (i * 16)); - uint64x2_t data_vec_2 = XXH_vld1q_u64(xinput + ((i+1) * 16)); - /* key_vec = xsecret[i]; */ - uint64x2_t key_vec_1 = XXH_vld1q_u64(xsecret + (i * 16)); - uint64x2_t key_vec_2 = XXH_vld1q_u64(xsecret + ((i+1) * 16)); - /* data_swap = swap(data_vec) */ - uint64x2_t data_swap_1 = vextq_u64(data_vec_1, data_vec_1, 1); - uint64x2_t data_swap_2 = vextq_u64(data_vec_2, data_vec_2, 1); - /* data_key = data_vec ^ key_vec; */ - uint64x2_t data_key_1 = veorq_u64(data_vec_1, key_vec_1); - uint64x2_t data_key_2 = veorq_u64(data_vec_2, key_vec_2); - - /* - * If we reinterpret the 64x2 vectors as 32x4 vectors, we can use a - * de-interleave operation for 4 lanes in 1 step with `vuzpq_u32` to - * get one vector with the low 32 bits of each lane, and one vector - * with the high 32 bits of each lane. - * - * The intrinsic returns a double vector because the original ARMv7-a - * instruction modified both arguments in place. AArch64 and SIMD128 emit - * two instructions from this intrinsic. - * - * [ dk11L | dk11H | dk12L | dk12H ] -> [ dk11L | dk12L | dk21L | dk22L ] - * [ dk21L | dk21H | dk22L | dk22H ] -> [ dk11H | dk12H | dk21H | dk22H ] - */ - uint32x4x2_t unzipped = vuzpq_u32( - vreinterpretq_u32_u64(data_key_1), - vreinterpretq_u32_u64(data_key_2) - ); - /* data_key_lo = data_key & 0xFFFFFFFF */ - uint32x4_t data_key_lo = unzipped.val[0]; - /* data_key_hi = data_key >> 32 */ - uint32x4_t data_key_hi = unzipped.val[1]; - /* - * Then, we can split the vectors horizontally and multiply which, as for most - * widening intrinsics, have a variant that works on both high half vectors - * for free on AArch64. A similar instruction is available on SIMD128. - * - * sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi - */ - uint64x2_t sum_1 = XXH_vmlal_low_u32(data_swap_1, data_key_lo, data_key_hi); - uint64x2_t sum_2 = XXH_vmlal_high_u32(data_swap_2, data_key_lo, data_key_hi); - /* - * Clang reorders - * a += b * c; // umlal swap.2d, dkl.2s, dkh.2s - * c += a; // add acc.2d, acc.2d, swap.2d - * to - * c += a; // add acc.2d, acc.2d, swap.2d - * c += b * c; // umlal acc.2d, dkl.2s, dkh.2s - * - * While it would make sense in theory since the addition is faster, - * for reasons likely related to umlal being limited to certain NEON - * pipelines, this is worse. A compiler guard fixes this. - */ - XXH_COMPILER_GUARD_CLANG_NEON(sum_1); - XXH_COMPILER_GUARD_CLANG_NEON(sum_2); - /* xacc[i] = acc_vec + sum; */ - xacc[i] = vaddq_u64(xacc[i], sum_1); - xacc[i+1] = vaddq_u64(xacc[i+1], sum_2); - } - /* Operate on the remaining NEON lanes 2 at a time. */ - for (; i < XXH3_NEON_LANES / 2; i++) { - /* data_vec = xinput[i]; */ - uint64x2_t data_vec = XXH_vld1q_u64(xinput + (i * 16)); - /* key_vec = xsecret[i]; */ - uint64x2_t key_vec = XXH_vld1q_u64(xsecret + (i * 16)); - /* acc_vec_2 = swap(data_vec) */ - uint64x2_t data_swap = vextq_u64(data_vec, data_vec, 1); - /* data_key = data_vec ^ key_vec; */ - uint64x2_t data_key = veorq_u64(data_vec, key_vec); - /* For two lanes, just use VMOVN and VSHRN. */ - /* data_key_lo = data_key & 0xFFFFFFFF; */ - uint32x2_t data_key_lo = vmovn_u64(data_key); - /* data_key_hi = data_key >> 32; */ - uint32x2_t data_key_hi = vshrn_n_u64(data_key, 32); - /* sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi; */ - uint64x2_t sum = vmlal_u32(data_swap, data_key_lo, data_key_hi); - /* Same Clang workaround as before */ - XXH_COMPILER_GUARD_CLANG_NEON(sum); - /* xacc[i] = acc_vec + sum; */ - xacc[i] = vaddq_u64 (xacc[i], sum); - } - } -} -XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(neon) - -XXH_FORCE_INLINE void -XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) -{ - XXH_ASSERT((((size_t)acc) & 15) == 0); - - { xxh_aliasing_uint64x2_t* xacc = (xxh_aliasing_uint64x2_t*) acc; - uint8_t const* xsecret = (uint8_t const*) secret; - - size_t i; - /* WASM uses operator overloads and doesn't need these. */ -#ifndef __wasm_simd128__ - /* { prime32_1, prime32_1 } */ - uint32x2_t const kPrimeLo = vdup_n_u32(XXH_PRIME32_1); - /* { 0, prime32_1, 0, prime32_1 } */ - uint32x4_t const kPrimeHi = vreinterpretq_u32_u64(vdupq_n_u64((xxh_u64)XXH_PRIME32_1 << 32)); -#endif - - /* AArch64 uses both scalar and neon at the same time */ - for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) { - XXH3_scalarScrambleRound(acc, secret, i); - } - for (i=0; i < XXH3_NEON_LANES / 2; i++) { - /* xacc[i] ^= (xacc[i] >> 47); */ - uint64x2_t acc_vec = xacc[i]; - uint64x2_t shifted = vshrq_n_u64(acc_vec, 47); - uint64x2_t data_vec = veorq_u64(acc_vec, shifted); - - /* xacc[i] ^= xsecret[i]; */ - uint64x2_t key_vec = XXH_vld1q_u64(xsecret + (i * 16)); - uint64x2_t data_key = veorq_u64(data_vec, key_vec); - /* xacc[i] *= XXH_PRIME32_1 */ -#ifdef __wasm_simd128__ - /* SIMD128 has multiply by u64x2, use it instead of expanding and scalarizing */ - xacc[i] = data_key * XXH_PRIME32_1; -#else - /* - * Expanded version with portable NEON intrinsics - * - * lo(x) * lo(y) + (hi(x) * lo(y) << 32) - * - * prod_hi = hi(data_key) * lo(prime) << 32 - * - * Since we only need 32 bits of this multiply a trick can be used, reinterpreting the vector - * as a uint32x4_t and multiplying by { 0, prime, 0, prime } to cancel out the unwanted bits - * and avoid the shift. - */ - uint32x4_t prod_hi = vmulq_u32 (vreinterpretq_u32_u64(data_key), kPrimeHi); - /* Extract low bits for vmlal_u32 */ - uint32x2_t data_key_lo = vmovn_u64(data_key); - /* xacc[i] = prod_hi + lo(data_key) * XXH_PRIME32_1; */ - xacc[i] = vmlal_u32(vreinterpretq_u64_u32(prod_hi), data_key_lo, kPrimeLo); -#endif - } - } -} -#endif - -#if (XXH_VECTOR == XXH_VSX) - -XXH_FORCE_INLINE void -XXH3_accumulate_512_vsx( void* XXH_RESTRICT acc, - const void* XXH_RESTRICT input, - const void* XXH_RESTRICT secret) -{ - /* presumed aligned */ - xxh_aliasing_u64x2* const xacc = (xxh_aliasing_u64x2*) acc; - xxh_u8 const* const xinput = (xxh_u8 const*) input; /* no alignment restriction */ - xxh_u8 const* const xsecret = (xxh_u8 const*) secret; /* no alignment restriction */ - xxh_u64x2 const v32 = { 32, 32 }; - size_t i; - for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) { - /* data_vec = xinput[i]; */ - xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + 16*i); - /* key_vec = xsecret[i]; */ - xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + 16*i); - xxh_u64x2 const data_key = data_vec ^ key_vec; - /* shuffled = (data_key << 32) | (data_key >> 32); */ - xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32); - /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */ - xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled); - /* acc_vec = xacc[i]; */ - xxh_u64x2 acc_vec = xacc[i]; - acc_vec += product; - - /* swap high and low halves */ -#ifdef __s390x__ - acc_vec += vec_permi(data_vec, data_vec, 2); -#else - acc_vec += vec_xxpermdi(data_vec, data_vec, 2); -#endif - xacc[i] = acc_vec; - } -} -XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(vsx) - -XXH_FORCE_INLINE void -XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) -{ - XXH_ASSERT((((size_t)acc) & 15) == 0); - - { xxh_aliasing_u64x2* const xacc = (xxh_aliasing_u64x2*) acc; - const xxh_u8* const xsecret = (const xxh_u8*) secret; - /* constants */ - xxh_u64x2 const v32 = { 32, 32 }; - xxh_u64x2 const v47 = { 47, 47 }; - xxh_u32x4 const prime = { XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1 }; - size_t i; - for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) { - /* xacc[i] ^= (xacc[i] >> 47); */ - xxh_u64x2 const acc_vec = xacc[i]; - xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47); - - /* xacc[i] ^= xsecret[i]; */ - xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + 16*i); - xxh_u64x2 const data_key = data_vec ^ key_vec; - - /* xacc[i] *= XXH_PRIME32_1 */ - /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF); */ - xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime); - /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32); */ - xxh_u64x2 const prod_odd = XXH_vec_mulo((xxh_u32x4)data_key, prime); - xacc[i] = prod_odd + (prod_even << v32); - } } -} - -#endif - -#if (XXH_VECTOR == XXH_SVE) - -XXH_FORCE_INLINE void -XXH3_accumulate_512_sve( void* XXH_RESTRICT acc, - const void* XXH_RESTRICT input, - const void* XXH_RESTRICT secret) -{ - uint64_t *xacc = (uint64_t *)acc; - const uint64_t *xinput = (const uint64_t *)(const void *)input; - const uint64_t *xsecret = (const uint64_t *)(const void *)secret; - svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1); - uint64_t element_count = svcntd(); - if (element_count >= 8) { - svbool_t mask = svptrue_pat_b64(SV_VL8); - svuint64_t vacc = svld1_u64(mask, xacc); - ACCRND(vacc, 0); - svst1_u64(mask, xacc, vacc); - } else if (element_count == 2) { /* sve128 */ - svbool_t mask = svptrue_pat_b64(SV_VL2); - svuint64_t acc0 = svld1_u64(mask, xacc + 0); - svuint64_t acc1 = svld1_u64(mask, xacc + 2); - svuint64_t acc2 = svld1_u64(mask, xacc + 4); - svuint64_t acc3 = svld1_u64(mask, xacc + 6); - ACCRND(acc0, 0); - ACCRND(acc1, 2); - ACCRND(acc2, 4); - ACCRND(acc3, 6); - svst1_u64(mask, xacc + 0, acc0); - svst1_u64(mask, xacc + 2, acc1); - svst1_u64(mask, xacc + 4, acc2); - svst1_u64(mask, xacc + 6, acc3); - } else { - svbool_t mask = svptrue_pat_b64(SV_VL4); - svuint64_t acc0 = svld1_u64(mask, xacc + 0); - svuint64_t acc1 = svld1_u64(mask, xacc + 4); - ACCRND(acc0, 0); - ACCRND(acc1, 4); - svst1_u64(mask, xacc + 0, acc0); - svst1_u64(mask, xacc + 4, acc1); - } -} - -XXH_FORCE_INLINE void -XXH3_accumulate_sve(xxh_u64* XXH_RESTRICT acc, - const xxh_u8* XXH_RESTRICT input, - const xxh_u8* XXH_RESTRICT secret, - size_t nbStripes) -{ - if (nbStripes != 0) { - uint64_t *xacc = (uint64_t *)acc; - const uint64_t *xinput = (const uint64_t *)(const void *)input; - const uint64_t *xsecret = (const uint64_t *)(const void *)secret; - svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1); - uint64_t element_count = svcntd(); - if (element_count >= 8) { - svbool_t mask = svptrue_pat_b64(SV_VL8); - svuint64_t vacc = svld1_u64(mask, xacc + 0); - do { - /* svprfd(svbool_t, void *, enum svfprop); */ - svprfd(mask, xinput + 128, SV_PLDL1STRM); - ACCRND(vacc, 0); - xinput += 8; - xsecret += 1; - nbStripes--; - } while (nbStripes != 0); - - svst1_u64(mask, xacc + 0, vacc); - } else if (element_count == 2) { /* sve128 */ - svbool_t mask = svptrue_pat_b64(SV_VL2); - svuint64_t acc0 = svld1_u64(mask, xacc + 0); - svuint64_t acc1 = svld1_u64(mask, xacc + 2); - svuint64_t acc2 = svld1_u64(mask, xacc + 4); - svuint64_t acc3 = svld1_u64(mask, xacc + 6); - do { - svprfd(mask, xinput + 128, SV_PLDL1STRM); - ACCRND(acc0, 0); - ACCRND(acc1, 2); - ACCRND(acc2, 4); - ACCRND(acc3, 6); - xinput += 8; - xsecret += 1; - nbStripes--; - } while (nbStripes != 0); - - svst1_u64(mask, xacc + 0, acc0); - svst1_u64(mask, xacc + 2, acc1); - svst1_u64(mask, xacc + 4, acc2); - svst1_u64(mask, xacc + 6, acc3); - } else { - svbool_t mask = svptrue_pat_b64(SV_VL4); - svuint64_t acc0 = svld1_u64(mask, xacc + 0); - svuint64_t acc1 = svld1_u64(mask, xacc + 4); - do { - svprfd(mask, xinput + 128, SV_PLDL1STRM); - ACCRND(acc0, 0); - ACCRND(acc1, 4); - xinput += 8; - xsecret += 1; - nbStripes--; - } while (nbStripes != 0); - - svst1_u64(mask, xacc + 0, acc0); - svst1_u64(mask, xacc + 4, acc1); - } - } -} - -#endif - -/* scalar variants - universal */ - -#if defined(__aarch64__) && (defined(__GNUC__) || defined(__clang__)) -/* - * In XXH3_scalarRound(), GCC and Clang have a similar codegen issue, where they - * emit an excess mask and a full 64-bit multiply-add (MADD X-form). - * - * While this might not seem like much, as AArch64 is a 64-bit architecture, only - * big Cortex designs have a full 64-bit multiplier. - * - * On the little cores, the smaller 32-bit multiplier is used, and full 64-bit - * multiplies expand to 2-3 multiplies in microcode. This has a major penalty - * of up to 4 latency cycles and 2 stall cycles in the multiply pipeline. - * - * Thankfully, AArch64 still provides the 32-bit long multiply-add (UMADDL) which does - * not have this penalty and does the mask automatically. - */ -XXH_FORCE_INLINE xxh_u64 -XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc) -{ - xxh_u64 ret; - /* note: %x = 64-bit register, %w = 32-bit register */ - __asm__("umaddl %x0, %w1, %w2, %x3" : "=r" (ret) : "r" (lhs), "r" (rhs), "r" (acc)); - return ret; -} -#else -XXH_FORCE_INLINE xxh_u64 -XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc) -{ - return XXH_mult32to64((xxh_u32)lhs, (xxh_u32)rhs) + acc; -} -#endif - -/*! - * @internal - * @brief Scalar round for @ref XXH3_accumulate_512_scalar(). - * - * This is extracted to its own function because the NEON path uses a combination - * of NEON and scalar. - */ -XXH_FORCE_INLINE void -XXH3_scalarRound(void* XXH_RESTRICT acc, - void const* XXH_RESTRICT input, - void const* XXH_RESTRICT secret, - size_t lane) -{ - xxh_u64* xacc = (xxh_u64*) acc; - xxh_u8 const* xinput = (xxh_u8 const*) input; - xxh_u8 const* xsecret = (xxh_u8 const*) secret; - XXH_ASSERT(lane < XXH_ACC_NB); - XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0); - { - xxh_u64 const data_val = XXH_readLE64(xinput + lane * 8); - xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + lane * 8); - xacc[lane ^ 1] += data_val; /* swap adjacent lanes */ - xacc[lane] = XXH_mult32to64_add64(data_key /* & 0xFFFFFFFF */, data_key >> 32, xacc[lane]); - } -} - -/*! - * @internal - * @brief Processes a 64 byte block of data using the scalar path. - */ -XXH_FORCE_INLINE void -XXH3_accumulate_512_scalar(void* XXH_RESTRICT acc, - const void* XXH_RESTRICT input, - const void* XXH_RESTRICT secret) -{ - size_t i; - /* ARM GCC refuses to unroll this loop, resulting in a 24% slowdown on ARMv6. */ -#if defined(__GNUC__) && !defined(__clang__) \ - && (defined(__arm__) || defined(__thumb2__)) \ - && defined(__ARM_FEATURE_UNALIGNED) /* no unaligned access just wastes bytes */ \ - && XXH_SIZE_OPT <= 0 -# pragma GCC unroll 8 -#endif - for (i=0; i < XXH_ACC_NB; i++) { - XXH3_scalarRound(acc, input, secret, i); - } -} -XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(scalar) - -/*! - * @internal - * @brief Scalar scramble step for @ref XXH3_scrambleAcc_scalar(). - * - * This is extracted to its own function because the NEON path uses a combination - * of NEON and scalar. - */ -XXH_FORCE_INLINE void -XXH3_scalarScrambleRound(void* XXH_RESTRICT acc, - void const* XXH_RESTRICT secret, - size_t lane) -{ - xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */ - const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ - XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0); - XXH_ASSERT(lane < XXH_ACC_NB); - { - xxh_u64 const key64 = XXH_readLE64(xsecret + lane * 8); - xxh_u64 acc64 = xacc[lane]; - acc64 = XXH_xorshift64(acc64, 47); - acc64 ^= key64; - acc64 *= XXH_PRIME32_1; - xacc[lane] = acc64; - } -} - -/*! - * @internal - * @brief Scrambles the accumulators after a large chunk has been read - */ -XXH_FORCE_INLINE void -XXH3_scrambleAcc_scalar(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) -{ - size_t i; - for (i=0; i < XXH_ACC_NB; i++) { - XXH3_scalarScrambleRound(acc, secret, i); - } -} - -XXH_FORCE_INLINE void -XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64) -{ - /* - * We need a separate pointer for the hack below, - * which requires a non-const pointer. - * Any decent compiler will optimize this out otherwise. - */ - const xxh_u8* kSecretPtr = XXH3_kSecret; - XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0); - -#if defined(__GNUC__) && defined(__aarch64__) - /* - * UGLY HACK: - * GCC and Clang generate a bunch of MOV/MOVK pairs for aarch64, and they are - * placed sequentially, in order, at the top of the unrolled loop. - * - * While MOVK is great for generating constants (2 cycles for a 64-bit - * constant compared to 4 cycles for LDR), it fights for bandwidth with - * the arithmetic instructions. - * - * I L S - * MOVK - * MOVK - * MOVK - * MOVK - * ADD - * SUB STR - * STR - * By forcing loads from memory (as the asm line causes the compiler to assume - * that XXH3_kSecretPtr has been changed), the pipelines are used more - * efficiently: - * I L S - * LDR - * ADD LDR - * SUB STR - * STR - * - * See XXH3_NEON_LANES for details on the pipsline. - * - * XXH3_64bits_withSeed, len == 256, Snapdragon 835 - * without hack: 2654.4 MB/s - * with hack: 3202.9 MB/s - */ - XXH_COMPILER_GUARD(kSecretPtr); -#endif - { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16; - int i; - for (i=0; i < nbRounds; i++) { - /* - * The asm hack causes the compiler to assume that kSecretPtr aliases with - * customSecret, and on aarch64, this prevented LDP from merging two - * loads together for free. Putting the loads together before the stores - * properly generates LDP. - */ - xxh_u64 lo = XXH_readLE64(kSecretPtr + 16*i) + seed64; - xxh_u64 hi = XXH_readLE64(kSecretPtr + 16*i + 8) - seed64; - XXH_writeLE64((xxh_u8*)customSecret + 16*i, lo); - XXH_writeLE64((xxh_u8*)customSecret + 16*i + 8, hi); - } } -} - - -typedef void (*XXH3_f_accumulate)(xxh_u64* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, size_t); -typedef void (*XXH3_f_scrambleAcc)(void* XXH_RESTRICT, const void*); -typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64); - - -#if (XXH_VECTOR == XXH_AVX512) - -#define XXH3_accumulate_512 XXH3_accumulate_512_avx512 -#define XXH3_accumulate XXH3_accumulate_avx512 -#define XXH3_scrambleAcc XXH3_scrambleAcc_avx512 -#define XXH3_initCustomSecret XXH3_initCustomSecret_avx512 - -#elif (XXH_VECTOR == XXH_AVX2) - -#define XXH3_accumulate_512 XXH3_accumulate_512_avx2 -#define XXH3_accumulate XXH3_accumulate_avx2 -#define XXH3_scrambleAcc XXH3_scrambleAcc_avx2 -#define XXH3_initCustomSecret XXH3_initCustomSecret_avx2 - -#elif (XXH_VECTOR == XXH_SSE2) - -#define XXH3_accumulate_512 XXH3_accumulate_512_sse2 -#define XXH3_accumulate XXH3_accumulate_sse2 -#define XXH3_scrambleAcc XXH3_scrambleAcc_sse2 -#define XXH3_initCustomSecret XXH3_initCustomSecret_sse2 - -#elif (XXH_VECTOR == XXH_NEON) - -#define XXH3_accumulate_512 XXH3_accumulate_512_neon -#define XXH3_accumulate XXH3_accumulate_neon -#define XXH3_scrambleAcc XXH3_scrambleAcc_neon -#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar - -#elif (XXH_VECTOR == XXH_VSX) - -#define XXH3_accumulate_512 XXH3_accumulate_512_vsx -#define XXH3_accumulate XXH3_accumulate_vsx -#define XXH3_scrambleAcc XXH3_scrambleAcc_vsx -#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar - -#elif (XXH_VECTOR == XXH_SVE) -#define XXH3_accumulate_512 XXH3_accumulate_512_sve -#define XXH3_accumulate XXH3_accumulate_sve -#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar -#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar - -#else /* scalar */ - -#define XXH3_accumulate_512 XXH3_accumulate_512_scalar -#define XXH3_accumulate XXH3_accumulate_scalar -#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar -#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar - -#endif - -#if XXH_SIZE_OPT >= 1 /* don't do SIMD for initialization */ -# undef XXH3_initCustomSecret -# define XXH3_initCustomSecret XXH3_initCustomSecret_scalar -#endif - -XXH_FORCE_INLINE void -XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc, - const xxh_u8* XXH_RESTRICT input, size_t len, - const xxh_u8* XXH_RESTRICT secret, size_t secretSize, - XXH3_f_accumulate f_acc, - XXH3_f_scrambleAcc f_scramble) -{ - size_t const nbStripesPerBlock = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE; - size_t const block_len = XXH_STRIPE_LEN * nbStripesPerBlock; - size_t const nb_blocks = (len - 1) / block_len; - - size_t n; - - XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); - - for (n = 0; n < nb_blocks; n++) { - f_acc(acc, input + n*block_len, secret, nbStripesPerBlock); - f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN); - } - - /* last partial block */ - XXH_ASSERT(len > XXH_STRIPE_LEN); - { size_t const nbStripes = ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN; - XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE)); - f_acc(acc, input + nb_blocks*block_len, secret, nbStripes); - - /* last stripe */ - { const xxh_u8* const p = input + len - XXH_STRIPE_LEN; -#define XXH_SECRET_LASTACC_START 7 /* not aligned on 8, last secret is different from acc & scrambler */ - XXH3_accumulate_512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START); - } } -} - -XXH_FORCE_INLINE xxh_u64 -XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret) -{ - return XXH3_mul128_fold64( - acc[0] ^ XXH_readLE64(secret), - acc[1] ^ XXH_readLE64(secret+8) ); -} - -static XXH64_hash_t -XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start) -{ - xxh_u64 result64 = start; - size_t i = 0; - - for (i = 0; i < 4; i++) { - result64 += XXH3_mix2Accs(acc+2*i, secret + 16*i); -#if defined(__clang__) /* Clang */ \ - && (defined(__arm__) || defined(__thumb__)) /* ARMv7 */ \ - && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ - && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ - /* - * UGLY HACK: - * Prevent autovectorization on Clang ARMv7-a. Exact same problem as - * the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b. - * XXH3_64bits, len == 256, Snapdragon 835: - * without hack: 2063.7 MB/s - * with hack: 2560.7 MB/s - */ - XXH_COMPILER_GUARD(result64); -#endif - } - - return XXH3_avalanche(result64); -} - -#define XXH3_INIT_ACC { XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, \ - XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 } - -XXH_FORCE_INLINE XXH64_hash_t -XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len, - const void* XXH_RESTRICT secret, size_t secretSize, - XXH3_f_accumulate f_acc, - XXH3_f_scrambleAcc f_scramble) -{ - XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC; - - XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc, f_scramble); - - /* converge into final hash */ - XXH_STATIC_ASSERT(sizeof(acc) == 64); - /* do not align on 8, so that the secret is different from the accumulator */ -#define XXH_SECRET_MERGEACCS_START 11 - XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); - return XXH3_mergeAccs(acc, (const xxh_u8*)secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * XXH_PRIME64_1); -} - -/* - * It's important for performance to transmit secret's size (when it's static) - * so that the compiler can properly optimize the vectorized loop. - * This makes a big performance difference for "medium" keys (<1 KB) when using AVX instruction set. - * When the secret size is unknown, or on GCC 12 where the mix of NO_INLINE and FORCE_INLINE - * breaks -Og, this is XXH_NO_INLINE. - */ -XXH3_WITH_SECRET_INLINE XXH64_hash_t -XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len, - XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) -{ - (void)seed64; - return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate, XXH3_scrambleAcc); -} - -/* - * It's preferable for performance that XXH3_hashLong is not inlined, - * as it results in a smaller function for small data, easier to the instruction cache. - * Note that inside this no_inline function, we do inline the internal loop, - * and provide a statically defined secret size to allow optimization of vector loop. - */ -XXH_NO_INLINE XXH_PUREF XXH64_hash_t -XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len, - XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) -{ - (void)seed64; (void)secret; (void)secretLen; - return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate, XXH3_scrambleAcc); -} - -/* - * XXH3_hashLong_64b_withSeed(): - * Generate a custom key based on alteration of default XXH3_kSecret with the seed, - * and then use this key for long mode hashing. - * - * This operation is decently fast but nonetheless costs a little bit of time. - * Try to avoid it whenever possible (typically when seed==0). - * - * It's important for performance that XXH3_hashLong is not inlined. Not sure - * why (uop cache maybe?), but the difference is large and easily measurable. - */ -XXH_FORCE_INLINE XXH64_hash_t -XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len, - XXH64_hash_t seed, - XXH3_f_accumulate f_acc, - XXH3_f_scrambleAcc f_scramble, - XXH3_f_initCustomSecret f_initSec) -{ -#if XXH_SIZE_OPT <= 0 - if (seed == 0) - return XXH3_hashLong_64b_internal(input, len, - XXH3_kSecret, sizeof(XXH3_kSecret), - f_acc, f_scramble); -#endif - { XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; - f_initSec(secret, seed); - return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret), - f_acc, f_scramble); - } -} - -/* - * It's important for performance that XXH3_hashLong is not inlined. - */ -XXH_NO_INLINE XXH64_hash_t -XXH3_hashLong_64b_withSeed(const void* XXH_RESTRICT input, size_t len, - XXH64_hash_t seed, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) -{ - (void)secret; (void)secretLen; - return XXH3_hashLong_64b_withSeed_internal(input, len, seed, - XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret); -} - - -typedef XXH64_hash_t (*XXH3_hashLong64_f)(const void* XXH_RESTRICT, size_t, - XXH64_hash_t, const xxh_u8* XXH_RESTRICT, size_t); - -XXH_FORCE_INLINE XXH64_hash_t -XXH3_64bits_internal(const void* XXH_RESTRICT input, size_t len, - XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen, - XXH3_hashLong64_f f_hashLong) -{ - XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN); - /* - * If an action is to be taken if `secretLen` condition is not respected, - * it should be done here. - * For now, it's a contract pre-condition. - * Adding a check and a branch here would cost performance at every hash. - * Also, note that function signature doesn't offer room to return an error. - */ - if (len <= 16) - return XXH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64); - if (len <= 128) - return XXH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); - if (len <= XXH3_MIDSIZE_MAX) - return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); - return f_hashLong(input, len, seed64, (const xxh_u8*)secret, secretLen); -} - - -/* === Public entry point === */ - -/*! @ingroup XXH3_family */ -XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length) -{ - return XXH3_64bits_internal(input, length, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default); -} - -/*! @ingroup XXH3_family */ -XXH_PUBLIC_API XXH64_hash_t -XXH3_64bits_withSecret(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize) -{ - return XXH3_64bits_internal(input, length, 0, secret, secretSize, XXH3_hashLong_64b_withSecret); -} - -/*! @ingroup XXH3_family */ -XXH_PUBLIC_API XXH64_hash_t -XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed) -{ - return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed); -} - -XXH_PUBLIC_API XXH64_hash_t -XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed) -{ - if (length <= XXH3_MIDSIZE_MAX) - return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL); - return XXH3_hashLong_64b_withSecret(input, length, seed, (const xxh_u8*)secret, secretSize); -} - - -/* === XXH3 streaming === */ -#ifndef XXH_NO_STREAM -/* - * Malloc's a pointer that is always aligned to align. - * - * This must be freed with `XXH_alignedFree()`. - * - * malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte - * alignment on 32-bit. This isn't enough for the 32 byte aligned loads in AVX2 - * or on 32-bit, the 16 byte aligned loads in SSE2 and NEON. - * - * This underalignment previously caused a rather obvious crash which went - * completely unnoticed due to XXH3_createState() not actually being tested. - * Credit to RedSpah for noticing this bug. - * - * The alignment is done manually: Functions like posix_memalign or _mm_malloc - * are avoided: To maintain portability, we would have to write a fallback - * like this anyways, and besides, testing for the existence of library - * functions without relying on external build tools is impossible. - * - * The method is simple: Overallocate, manually align, and store the offset - * to the original behind the returned pointer. - * - * Align must be a power of 2 and 8 <= align <= 128. - */ -static XXH_MALLOCF void* XXH_alignedMalloc(size_t s, size_t align) -{ - XXH_ASSERT(align <= 128 && align >= 8); /* range check */ - XXH_ASSERT((align & (align-1)) == 0); /* power of 2 */ - XXH_ASSERT(s != 0 && s < (s + align)); /* empty/overflow */ - { /* Overallocate to make room for manual realignment and an offset byte */ - xxh_u8* base = (xxh_u8*)XXH_malloc(s + align); - if (base != NULL) { - /* - * Get the offset needed to align this pointer. - * - * Even if the returned pointer is aligned, there will always be - * at least one byte to store the offset to the original pointer. - */ - size_t offset = align - ((size_t)base & (align - 1)); /* base % align */ - /* Add the offset for the now-aligned pointer */ - xxh_u8* ptr = base + offset; - - XXH_ASSERT((size_t)ptr % align == 0); - - /* Store the offset immediately before the returned pointer. */ - ptr[-1] = (xxh_u8)offset; - return ptr; - } - return NULL; - } -} -/* - * Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass - * normal malloc'd pointers, XXH_alignedMalloc has a specific data layout. - */ -static void XXH_alignedFree(void* p) -{ - if (p != NULL) { - xxh_u8* ptr = (xxh_u8*)p; - /* Get the offset byte we added in XXH_malloc. */ - xxh_u8 offset = ptr[-1]; - /* Free the original malloc'd pointer */ - xxh_u8* base = ptr - offset; - XXH_free(base); - } -} -/*! @ingroup XXH3_family */ -/*! - * @brief Allocate an @ref XXH3_state_t. - * - * Must be freed with XXH3_freeState(). - * @return An allocated XXH3_state_t on success, `NULL` on failure. - */ -XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void) -{ - XXH3_state_t* const state = (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64); - if (state==NULL) return NULL; - XXH3_INITSTATE(state); - return state; -} - -/*! @ingroup XXH3_family */ -/*! - * @brief Frees an @ref XXH3_state_t. - * - * Must be allocated with XXH3_createState(). - * @param statePtr A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState(). - * @return XXH_OK. - */ -XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr) -{ - XXH_alignedFree(statePtr); - return XXH_OK; -} - -/*! @ingroup XXH3_family */ -XXH_PUBLIC_API void -XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state) -{ - XXH_memcpy(dst_state, src_state, sizeof(*dst_state)); -} - -static void -XXH3_reset_internal(XXH3_state_t* statePtr, - XXH64_hash_t seed, - const void* secret, size_t secretSize) -{ - size_t const initStart = offsetof(XXH3_state_t, bufferedSize); - size_t const initLength = offsetof(XXH3_state_t, nbStripesPerBlock) - initStart; - XXH_ASSERT(offsetof(XXH3_state_t, nbStripesPerBlock) > initStart); - XXH_ASSERT(statePtr != NULL); - /* set members from bufferedSize to nbStripesPerBlock (excluded) to 0 */ - memset((char*)statePtr + initStart, 0, initLength); - statePtr->acc[0] = XXH_PRIME32_3; - statePtr->acc[1] = XXH_PRIME64_1; - statePtr->acc[2] = XXH_PRIME64_2; - statePtr->acc[3] = XXH_PRIME64_3; - statePtr->acc[4] = XXH_PRIME64_4; - statePtr->acc[5] = XXH_PRIME32_2; - statePtr->acc[6] = XXH_PRIME64_5; - statePtr->acc[7] = XXH_PRIME32_1; - statePtr->seed = seed; - statePtr->useSeed = (seed != 0); - statePtr->extSecret = (const unsigned char*)secret; - XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); - statePtr->secretLimit = secretSize - XXH_STRIPE_LEN; - statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE; -} - -/*! @ingroup XXH3_family */ -XXH_PUBLIC_API XXH_errorcode -XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr) -{ - if (statePtr == NULL) return XXH_ERROR; - XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE); - return XXH_OK; -} - -/*! @ingroup XXH3_family */ -XXH_PUBLIC_API XXH_errorcode -XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize) -{ - if (statePtr == NULL) return XXH_ERROR; - XXH3_reset_internal(statePtr, 0, secret, secretSize); - if (secret == NULL) return XXH_ERROR; - if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; - return XXH_OK; -} - -/*! @ingroup XXH3_family */ -XXH_PUBLIC_API XXH_errorcode -XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed) -{ - if (statePtr == NULL) return XXH_ERROR; - if (seed==0) return XXH3_64bits_reset(statePtr); - if ((seed != statePtr->seed) || (statePtr->extSecret != NULL)) - XXH3_initCustomSecret(statePtr->customSecret, seed); - XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE); - return XXH_OK; -} - -/*! @ingroup XXH3_family */ -XXH_PUBLIC_API XXH_errorcode -XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed64) -{ - if (statePtr == NULL) return XXH_ERROR; - if (secret == NULL) return XXH_ERROR; - if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; - XXH3_reset_internal(statePtr, seed64, secret, secretSize); - statePtr->useSeed = 1; /* always, even if seed64==0 */ - return XXH_OK; -} - -/*! - * @internal - * @brief Processes a large input for XXH3_update() and XXH3_digest_long(). - * - * Unlike XXH3_hashLong_internal_loop(), this can process data that overlaps a block. - * - * @param acc Pointer to the 8 accumulator lanes - * @param nbStripesSoFarPtr In/out pointer to the number of leftover stripes in the block* - * @param nbStripesPerBlock Number of stripes in a block - * @param input Input pointer - * @param nbStripes Number of stripes to process - * @param secret Secret pointer - * @param secretLimit Offset of the last block in @p secret - * @param f_acc Pointer to an XXH3_accumulate implementation - * @param f_scramble Pointer to an XXH3_scrambleAcc implementation - * @return Pointer past the end of @p input after processing - */ -XXH_FORCE_INLINE const xxh_u8 * -XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc, - size_t* XXH_RESTRICT nbStripesSoFarPtr, size_t nbStripesPerBlock, - const xxh_u8* XXH_RESTRICT input, size_t nbStripes, - const xxh_u8* XXH_RESTRICT secret, size_t secretLimit, - XXH3_f_accumulate f_acc, - XXH3_f_scrambleAcc f_scramble) -{ - const xxh_u8* initialSecret = secret + *nbStripesSoFarPtr * XXH_SECRET_CONSUME_RATE; - /* Process full blocks */ - if (nbStripes >= (nbStripesPerBlock - *nbStripesSoFarPtr)) { - /* Process the initial partial block... */ - size_t nbStripesThisIter = nbStripesPerBlock - *nbStripesSoFarPtr; - - do { - /* Accumulate and scramble */ - f_acc(acc, input, initialSecret, nbStripesThisIter); - f_scramble(acc, secret + secretLimit); - input += nbStripesThisIter * XXH_STRIPE_LEN; - nbStripes -= nbStripesThisIter; - /* Then continue the loop with the full block size */ - nbStripesThisIter = nbStripesPerBlock; - initialSecret = secret; - } while (nbStripes >= nbStripesPerBlock); - *nbStripesSoFarPtr = 0; - } - /* Process a partial block */ - if (nbStripes > 0) { - f_acc(acc, input, initialSecret, nbStripes); - input += nbStripes * XXH_STRIPE_LEN; - *nbStripesSoFarPtr += nbStripes; - } - /* Return end pointer */ - return input; -} - -#ifndef XXH3_STREAM_USE_STACK -# if XXH_SIZE_OPT <= 0 && !defined(__clang__) /* clang doesn't need additional stack space */ -# define XXH3_STREAM_USE_STACK 1 -# endif -#endif -/* - * Both XXH3_64bits_update and XXH3_128bits_update use this routine. - */ -XXH_FORCE_INLINE XXH_errorcode -XXH3_update(XXH3_state_t* XXH_RESTRICT const state, - const xxh_u8* XXH_RESTRICT input, size_t len, - XXH3_f_accumulate f_acc, - XXH3_f_scrambleAcc f_scramble) -{ - if (input==NULL) { - XXH_ASSERT(len == 0); - return XXH_OK; - } - - XXH_ASSERT(state != NULL); - { const xxh_u8* const bEnd = input + len; - const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret; -#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1 - /* For some reason, gcc and MSVC seem to suffer greatly - * when operating accumulators directly into state. - * Operating into stack space seems to enable proper optimization. - * clang, on the other hand, doesn't seem to need this trick */ - XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[8]; - XXH_memcpy(acc, state->acc, sizeof(acc)); -#else - xxh_u64* XXH_RESTRICT const acc = state->acc; -#endif - state->totalLen += len; - XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE); - - /* small input : just fill in tmp buffer */ - if (len <= XXH3_INTERNALBUFFER_SIZE - state->bufferedSize) { - XXH_memcpy(state->buffer + state->bufferedSize, input, len); - state->bufferedSize += (XXH32_hash_t)len; - return XXH_OK; - } - - /* total input is now > XXH3_INTERNALBUFFER_SIZE */ - #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN) - XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN == 0); /* clean multiple */ - - /* - * Internal buffer is partially filled (always, except at beginning) - * Complete it, then consume it. - */ - if (state->bufferedSize) { - size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize; - XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize); - input += loadSize; - XXH3_consumeStripes(acc, - &state->nbStripesSoFar, state->nbStripesPerBlock, - state->buffer, XXH3_INTERNALBUFFER_STRIPES, - secret, state->secretLimit, - f_acc, f_scramble); - state->bufferedSize = 0; - } - XXH_ASSERT(input < bEnd); - if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) { - size_t nbStripes = (size_t)(bEnd - 1 - input) / XXH_STRIPE_LEN; - input = XXH3_consumeStripes(acc, - &state->nbStripesSoFar, state->nbStripesPerBlock, - input, nbStripes, - secret, state->secretLimit, - f_acc, f_scramble); - XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN); - - } - /* Some remaining input (always) : buffer it */ - XXH_ASSERT(input < bEnd); - XXH_ASSERT(bEnd - input <= XXH3_INTERNALBUFFER_SIZE); - XXH_ASSERT(state->bufferedSize == 0); - XXH_memcpy(state->buffer, input, (size_t)(bEnd-input)); - state->bufferedSize = (XXH32_hash_t)(bEnd-input); -#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1 - /* save stack accumulators into state */ - XXH_memcpy(state->acc, acc, sizeof(acc)); -#endif - } - - return XXH_OK; -} - -/*! @ingroup XXH3_family */ -XXH_PUBLIC_API XXH_errorcode -XXH3_64bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len) -{ - return XXH3_update(state, (const xxh_u8*)input, len, - XXH3_accumulate, XXH3_scrambleAcc); -} - - -XXH_FORCE_INLINE void -XXH3_digest_long (XXH64_hash_t* acc, - const XXH3_state_t* state, - const unsigned char* secret) -{ - xxh_u8 lastStripe[XXH_STRIPE_LEN]; - const xxh_u8* lastStripePtr; - - /* - * Digest on a local copy. This way, the state remains unaltered, and it can - * continue ingesting more input afterwards. - */ - XXH_memcpy(acc, state->acc, sizeof(state->acc)); - if (state->bufferedSize >= XXH_STRIPE_LEN) { - /* Consume remaining stripes then point to remaining data in buffer */ - size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN; - size_t nbStripesSoFar = state->nbStripesSoFar; - XXH3_consumeStripes(acc, - &nbStripesSoFar, state->nbStripesPerBlock, - state->buffer, nbStripes, - secret, state->secretLimit, - XXH3_accumulate, XXH3_scrambleAcc); - lastStripePtr = state->buffer + state->bufferedSize - XXH_STRIPE_LEN; - } else { /* bufferedSize < XXH_STRIPE_LEN */ - /* Copy to temp buffer */ - size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize; - XXH_ASSERT(state->bufferedSize > 0); /* there is always some input buffered */ - XXH_memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize); - XXH_memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize); - lastStripePtr = lastStripe; - } - /* Last stripe */ - XXH3_accumulate_512(acc, - lastStripePtr, - secret + state->secretLimit - XXH_SECRET_LASTACC_START); -} - -/*! @ingroup XXH3_family */ -XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* state) -{ - const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret; - if (state->totalLen > XXH3_MIDSIZE_MAX) { - XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB]; - XXH3_digest_long(acc, state, secret); - return XXH3_mergeAccs(acc, - secret + XXH_SECRET_MERGEACCS_START, - (xxh_u64)state->totalLen * XXH_PRIME64_1); - } - /* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */ - if (state->useSeed) - return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed); - return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen), - secret, state->secretLimit + XXH_STRIPE_LEN); -} -#endif /* !XXH_NO_STREAM */ - - -/* ========================================== - * XXH3 128 bits (a.k.a XXH128) - * ========================================== - * XXH3's 128-bit variant has better mixing and strength than the 64-bit variant, - * even without counting the significantly larger output size. - * - * For example, extra steps are taken to avoid the seed-dependent collisions - * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B). - * - * This strength naturally comes at the cost of some speed, especially on short - * lengths. Note that longer hashes are about as fast as the 64-bit version - * due to it using only a slight modification of the 64-bit loop. - * - * XXH128 is also more oriented towards 64-bit machines. It is still extremely - * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64). - */ - -XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t -XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) -{ - /* A doubled version of 1to3_64b with different constants. */ - XXH_ASSERT(input != NULL); - XXH_ASSERT(1 <= len && len <= 3); - XXH_ASSERT(secret != NULL); - /* - * len = 1: combinedl = { input[0], 0x01, input[0], input[0] } - * len = 2: combinedl = { input[1], 0x02, input[0], input[1] } - * len = 3: combinedl = { input[2], 0x03, input[0], input[1] } - */ - { xxh_u8 const c1 = input[0]; - xxh_u8 const c2 = input[len >> 1]; - xxh_u8 const c3 = input[len - 1]; - xxh_u32 const combinedl = ((xxh_u32)c1 <<16) | ((xxh_u32)c2 << 24) - | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); - xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13); - xxh_u64 const bitflipl = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; - xxh_u64 const bitfliph = (XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12)) - seed; - xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl; - xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph; - XXH128_hash_t h128; - h128.low64 = XXH64_avalanche(keyed_lo); - h128.high64 = XXH64_avalanche(keyed_hi); - return h128; - } -} - -XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t -XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) -{ - XXH_ASSERT(input != NULL); - XXH_ASSERT(secret != NULL); - XXH_ASSERT(4 <= len && len <= 8); - seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; - { xxh_u32 const input_lo = XXH_readLE32(input); - xxh_u32 const input_hi = XXH_readLE32(input + len - 4); - xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32); - xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ XXH_readLE64(secret+24)) + seed; - xxh_u64 const keyed = input_64 ^ bitflip; - - /* Shift len to the left to ensure it is even, this avoids even multiplies. */ - XXH128_hash_t m128 = XXH_mult64to128(keyed, XXH_PRIME64_1 + (len << 2)); - - m128.high64 += (m128.low64 << 1); - m128.low64 ^= (m128.high64 >> 3); - - m128.low64 = XXH_xorshift64(m128.low64, 35); - m128.low64 *= PRIME_MX2; - m128.low64 = XXH_xorshift64(m128.low64, 28); - m128.high64 = XXH3_avalanche(m128.high64); - return m128; - } -} - -XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t -XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) -{ - XXH_ASSERT(input != NULL); - XXH_ASSERT(secret != NULL); - XXH_ASSERT(9 <= len && len <= 16); - { xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed; - xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed; - xxh_u64 const input_lo = XXH_readLE64(input); - xxh_u64 input_hi = XXH_readLE64(input + len - 8); - XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, XXH_PRIME64_1); - /* - * Put len in the middle of m128 to ensure that the length gets mixed to - * both the low and high bits in the 128x64 multiply below. - */ - m128.low64 += (xxh_u64)(len - 1) << 54; - input_hi ^= bitfliph; - /* - * Add the high 32 bits of input_hi to the high 32 bits of m128, then - * add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to - * the high 64 bits of m128. - * - * The best approach to this operation is different on 32-bit and 64-bit. - */ - if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */ - /* - * 32-bit optimized version, which is more readable. - * - * On 32-bit, it removes an ADC and delays a dependency between the two - * halves of m128.high64, but it generates an extra mask on 64-bit. - */ - m128.high64 += (input_hi & 0xFFFFFFFF00000000ULL) + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2); - } else { - /* - * 64-bit optimized (albeit more confusing) version. - * - * Uses some properties of addition and multiplication to remove the mask: - * - * Let: - * a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF) - * b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000) - * c = XXH_PRIME32_2 - * - * a + (b * c) - * Inverse Property: x + y - x == y - * a + (b * (1 + c - 1)) - * Distributive Property: x * (y + z) == (x * y) + (x * z) - * a + (b * 1) + (b * (c - 1)) - * Identity Property: x * 1 == x - * a + b + (b * (c - 1)) - * - * Substitute a, b, and c: - * input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1)) - * - * Since input_hi.hi + input_hi.lo == input_hi, we get this: - * input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1)) - */ - m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2 - 1); - } - /* m128 ^= XXH_swap64(m128 >> 64); */ - m128.low64 ^= XXH_swap64(m128.high64); - - { /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */ - XXH128_hash_t h128 = XXH_mult64to128(m128.low64, XXH_PRIME64_2); - h128.high64 += m128.high64 * XXH_PRIME64_2; - - h128.low64 = XXH3_avalanche(h128.low64); - h128.high64 = XXH3_avalanche(h128.high64); - return h128; - } } -} - -/* - * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN - */ -XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t -XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) -{ - XXH_ASSERT(len <= 16); - { if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed); - if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed); - if (len) return XXH3_len_1to3_128b(input, len, secret, seed); - { XXH128_hash_t h128; - xxh_u64 const bitflipl = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72); - xxh_u64 const bitfliph = XXH_readLE64(secret+80) ^ XXH_readLE64(secret+88); - h128.low64 = XXH64_avalanche(seed ^ bitflipl); - h128.high64 = XXH64_avalanche( seed ^ bitfliph); - return h128; - } } -} - -/* - * A bit slower than XXH3_mix16B, but handles multiply by zero better. - */ -XXH_FORCE_INLINE XXH128_hash_t -XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2, - const xxh_u8* secret, XXH64_hash_t seed) -{ - acc.low64 += XXH3_mix16B (input_1, secret+0, seed); - acc.low64 ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8); - acc.high64 += XXH3_mix16B (input_2, secret+16, seed); - acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8); - return acc; -} - - -XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t -XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len, - const xxh_u8* XXH_RESTRICT secret, size_t secretSize, - XXH64_hash_t seed) -{ - XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; - XXH_ASSERT(16 < len && len <= 128); - - { XXH128_hash_t acc; - acc.low64 = len * XXH_PRIME64_1; - acc.high64 = 0; - -#if XXH_SIZE_OPT >= 1 - { - /* Smaller, but slightly slower. */ - unsigned int i = (unsigned int)(len - 1) / 32; - do { - acc = XXH128_mix32B(acc, input+16*i, input+len-16*(i+1), secret+32*i, seed); - } while (i-- != 0); - } -#else - if (len > 32) { - if (len > 64) { - if (len > 96) { - acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed); - } - acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed); - } - acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed); - } - acc = XXH128_mix32B(acc, input, input+len-16, secret, seed); -#endif - { XXH128_hash_t h128; - h128.low64 = acc.low64 + acc.high64; - h128.high64 = (acc.low64 * XXH_PRIME64_1) - + (acc.high64 * XXH_PRIME64_4) - + ((len - seed) * XXH_PRIME64_2); - h128.low64 = XXH3_avalanche(h128.low64); - h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); - return h128; - } - } -} - -XXH_NO_INLINE XXH_PUREF XXH128_hash_t -XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len, - const xxh_u8* XXH_RESTRICT secret, size_t secretSize, - XXH64_hash_t seed) -{ - XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; - XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); - - { XXH128_hash_t acc; - unsigned i; - acc.low64 = len * XXH_PRIME64_1; - acc.high64 = 0; - /* - * We set as `i` as offset + 32. We do this so that unchanged - * `len` can be used as upper bound. This reaches a sweet spot - * where both x86 and aarch64 get simple agen and good codegen - * for the loop. - */ - for (i = 32; i < 160; i += 32) { - acc = XXH128_mix32B(acc, - input + i - 32, - input + i - 16, - secret + i - 32, - seed); - } - acc.low64 = XXH3_avalanche(acc.low64); - acc.high64 = XXH3_avalanche(acc.high64); - /* - * NB: `i <= len` will duplicate the last 32-bytes if - * len % 32 was zero. This is an unfortunate necessity to keep - * the hash result stable. - */ - for (i=160; i <= len; i += 32) { - acc = XXH128_mix32B(acc, - input + i - 32, - input + i - 16, - secret + XXH3_MIDSIZE_STARTOFFSET + i - 160, - seed); - } - /* last bytes */ - acc = XXH128_mix32B(acc, - input + len - 16, - input + len - 32, - secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16, - (XXH64_hash_t)0 - seed); - - { XXH128_hash_t h128; - h128.low64 = acc.low64 + acc.high64; - h128.high64 = (acc.low64 * XXH_PRIME64_1) - + (acc.high64 * XXH_PRIME64_4) - + ((len - seed) * XXH_PRIME64_2); - h128.low64 = XXH3_avalanche(h128.low64); - h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); - return h128; - } - } -} - -XXH_FORCE_INLINE XXH128_hash_t -XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len, - const xxh_u8* XXH_RESTRICT secret, size_t secretSize, - XXH3_f_accumulate f_acc, - XXH3_f_scrambleAcc f_scramble) -{ - XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC; - - XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc, f_scramble); - - /* converge into final hash */ - XXH_STATIC_ASSERT(sizeof(acc) == 64); - XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); - { XXH128_hash_t h128; - h128.low64 = XXH3_mergeAccs(acc, - secret + XXH_SECRET_MERGEACCS_START, - (xxh_u64)len * XXH_PRIME64_1); - h128.high64 = XXH3_mergeAccs(acc, - secret + secretSize - - sizeof(acc) - XXH_SECRET_MERGEACCS_START, - ~((xxh_u64)len * XXH_PRIME64_2)); - return h128; - } -} - -/* - * It's important for performance that XXH3_hashLong() is not inlined. - */ -XXH_NO_INLINE XXH_PUREF XXH128_hash_t -XXH3_hashLong_128b_default(const void* XXH_RESTRICT input, size_t len, - XXH64_hash_t seed64, - const void* XXH_RESTRICT secret, size_t secretLen) -{ - (void)seed64; (void)secret; (void)secretLen; - return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), - XXH3_accumulate, XXH3_scrambleAcc); -} - -/* - * It's important for performance to pass @p secretLen (when it's static) - * to the compiler, so that it can properly optimize the vectorized loop. - * - * When the secret size is unknown, or on GCC 12 where the mix of NO_INLINE and FORCE_INLINE - * breaks -Og, this is XXH_NO_INLINE. - */ -XXH3_WITH_SECRET_INLINE XXH128_hash_t -XXH3_hashLong_128b_withSecret(const void* XXH_RESTRICT input, size_t len, - XXH64_hash_t seed64, - const void* XXH_RESTRICT secret, size_t secretLen) -{ - (void)seed64; - return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, secretLen, - XXH3_accumulate, XXH3_scrambleAcc); -} - -XXH_FORCE_INLINE XXH128_hash_t -XXH3_hashLong_128b_withSeed_internal(const void* XXH_RESTRICT input, size_t len, - XXH64_hash_t seed64, - XXH3_f_accumulate f_acc, - XXH3_f_scrambleAcc f_scramble, - XXH3_f_initCustomSecret f_initSec) -{ - if (seed64 == 0) - return XXH3_hashLong_128b_internal(input, len, - XXH3_kSecret, sizeof(XXH3_kSecret), - f_acc, f_scramble); - { XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; - f_initSec(secret, seed64); - return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, sizeof(secret), - f_acc, f_scramble); - } -} - -/* - * It's important for performance that XXH3_hashLong is not inlined. - */ -XXH_NO_INLINE XXH128_hash_t -XXH3_hashLong_128b_withSeed(const void* input, size_t len, - XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen) -{ - (void)secret; (void)secretLen; - return XXH3_hashLong_128b_withSeed_internal(input, len, seed64, - XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret); -} - -typedef XXH128_hash_t (*XXH3_hashLong128_f)(const void* XXH_RESTRICT, size_t, - XXH64_hash_t, const void* XXH_RESTRICT, size_t); - -XXH_FORCE_INLINE XXH128_hash_t -XXH3_128bits_internal(const void* input, size_t len, - XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen, - XXH3_hashLong128_f f_hl128) -{ - XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN); - /* - * If an action is to be taken if `secret` conditions are not respected, - * it should be done here. - * For now, it's a contract pre-condition. - * Adding a check and a branch here would cost performance at every hash. - */ - if (len <= 16) - return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64); - if (len <= 128) - return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); - if (len <= XXH3_MIDSIZE_MAX) - return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); - return f_hl128(input, len, seed64, secret, secretLen); -} - - -/* === Public XXH128 API === */ - -/*! @ingroup XXH3_family */ -XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* input, size_t len) -{ - return XXH3_128bits_internal(input, len, 0, - XXH3_kSecret, sizeof(XXH3_kSecret), - XXH3_hashLong_128b_default); -} - -/*! @ingroup XXH3_family */ -XXH_PUBLIC_API XXH128_hash_t -XXH3_128bits_withSecret(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize) -{ - return XXH3_128bits_internal(input, len, 0, - (const xxh_u8*)secret, secretSize, - XXH3_hashLong_128b_withSecret); -} - -/*! @ingroup XXH3_family */ -XXH_PUBLIC_API XXH128_hash_t -XXH3_128bits_withSeed(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed) -{ - return XXH3_128bits_internal(input, len, seed, - XXH3_kSecret, sizeof(XXH3_kSecret), - XXH3_hashLong_128b_withSeed); -} - -/*! @ingroup XXH3_family */ -XXH_PUBLIC_API XXH128_hash_t -XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed) -{ - if (len <= XXH3_MIDSIZE_MAX) - return XXH3_128bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL); - return XXH3_hashLong_128b_withSecret(input, len, seed, secret, secretSize); -} - -/*! @ingroup XXH3_family */ -XXH_PUBLIC_API XXH128_hash_t -XXH128(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed) -{ - return XXH3_128bits_withSeed(input, len, seed); -} - - -/* === XXH3 128-bit streaming === */ -#ifndef XXH_NO_STREAM -/* - * All initialization and update functions are identical to 64-bit streaming variant. - * The only difference is the finalization routine. - */ - -/*! @ingroup XXH3_family */ -XXH_PUBLIC_API XXH_errorcode -XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr) -{ - return XXH3_64bits_reset(statePtr); -} - -/*! @ingroup XXH3_family */ -XXH_PUBLIC_API XXH_errorcode -XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize) -{ - return XXH3_64bits_reset_withSecret(statePtr, secret, secretSize); -} - -/*! @ingroup XXH3_family */ -XXH_PUBLIC_API XXH_errorcode -XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed) -{ - return XXH3_64bits_reset_withSeed(statePtr, seed); -} - -/*! @ingroup XXH3_family */ -XXH_PUBLIC_API XXH_errorcode -XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed) -{ - return XXH3_64bits_reset_withSecretandSeed(statePtr, secret, secretSize, seed); -} - -/*! @ingroup XXH3_family */ -XXH_PUBLIC_API XXH_errorcode -XXH3_128bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len) -{ - return XXH3_64bits_update(state, input, len); -} - -/*! @ingroup XXH3_family */ -XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* state) -{ - const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret; - if (state->totalLen > XXH3_MIDSIZE_MAX) { - XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB]; - XXH3_digest_long(acc, state, secret); - XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); - { XXH128_hash_t h128; - h128.low64 = XXH3_mergeAccs(acc, - secret + XXH_SECRET_MERGEACCS_START, - (xxh_u64)state->totalLen * XXH_PRIME64_1); - h128.high64 = XXH3_mergeAccs(acc, - secret + state->secretLimit + XXH_STRIPE_LEN - - sizeof(acc) - XXH_SECRET_MERGEACCS_START, - ~((xxh_u64)state->totalLen * XXH_PRIME64_2)); - return h128; - } - } - /* len <= XXH3_MIDSIZE_MAX : short code */ - if (state->seed) - return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed); - return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen), - secret, state->secretLimit + XXH_STRIPE_LEN); -} -#endif /* !XXH_NO_STREAM */ -/* 128-bit utility functions */ - -#include /* memcmp, memcpy */ - -/* return : 1 is equal, 0 if different */ -/*! @ingroup XXH3_family */ -XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2) -{ - /* note : XXH128_hash_t is compact, it has no padding byte */ - return !(memcmp(&h1, &h2, sizeof(h1))); -} - -/* This prototype is compatible with stdlib's qsort(). - * @return : >0 if *h128_1 > *h128_2 - * <0 if *h128_1 < *h128_2 - * =0 if *h128_1 == *h128_2 */ -/*! @ingroup XXH3_family */ -XXH_PUBLIC_API int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2) -{ - XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1; - XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2; - int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64); - /* note : bets that, in most cases, hash values are different */ - if (hcmp) return hcmp; - return (h1.low64 > h2.low64) - (h2.low64 > h1.low64); -} - - -/*====== Canonical representation ======*/ -/*! @ingroup XXH3_family */ -XXH_PUBLIC_API void -XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash) -{ - XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t)); - if (XXH_CPU_LITTLE_ENDIAN) { - hash.high64 = XXH_swap64(hash.high64); - hash.low64 = XXH_swap64(hash.low64); - } - XXH_memcpy(dst, &hash.high64, sizeof(hash.high64)); - XXH_memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64)); -} - -/*! @ingroup XXH3_family */ -XXH_PUBLIC_API XXH128_hash_t -XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src) -{ - XXH128_hash_t h; - h.high64 = XXH_readBE64(src); - h.low64 = XXH_readBE64(src->digest + 8); - return h; -} - - - -/* ========================================== - * Secret generators - * ========================================== - */ -#define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x)) - -XXH_FORCE_INLINE void XXH3_combine16(void* dst, XXH128_hash_t h128) -{ - XXH_writeLE64( dst, XXH_readLE64(dst) ^ h128.low64 ); - XXH_writeLE64( (char*)dst+8, XXH_readLE64((char*)dst+8) ^ h128.high64 ); -} - -/*! @ingroup XXH3_family */ -XXH_PUBLIC_API XXH_errorcode -XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize) -{ -#if (XXH_DEBUGLEVEL >= 1) - XXH_ASSERT(secretBuffer != NULL); - XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); -#else - /* production mode, assert() are disabled */ - if (secretBuffer == NULL) return XXH_ERROR; - if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; -#endif - - if (customSeedSize == 0) { - customSeed = XXH3_kSecret; - customSeedSize = XXH_SECRET_DEFAULT_SIZE; - } -#if (XXH_DEBUGLEVEL >= 1) - XXH_ASSERT(customSeed != NULL); -#else - if (customSeed == NULL) return XXH_ERROR; -#endif - - /* Fill secretBuffer with a copy of customSeed - repeat as needed */ - { size_t pos = 0; - while (pos < secretSize) { - size_t const toCopy = XXH_MIN((secretSize - pos), customSeedSize); - memcpy((char*)secretBuffer + pos, customSeed, toCopy); - pos += toCopy; - } } - - { size_t const nbSeg16 = secretSize / 16; - size_t n; - XXH128_canonical_t scrambler; - XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0)); - for (n=0; n +#include RT_DLLEXPORT size_t rtGetCompressionBound(size_t uncompressed_size) { return (size_t)LZ4_compressBound((int)uncompressed_size); diff --git a/src/runtime/config.c b/src/runtime/config.c index a2cbd4f..900f711 100644 --- a/src/runtime/config.c +++ b/src/runtime/config.c @@ -1,41 +1,304 @@ #include "config.h" +#include "fsutils.h" #include "runtime.h" #include "threading.h" +#include "aio.h" +#include "buffer_manager.h" +#include "file_tab.h" +#include "mem_arena.h" +#include "string_storage.h" + +#include #include +#include #include -#define RT_MAX_CVARS 1024 +#define RT_MAX_CVARS 1024 +#define RT_MAX_EVENT_HANDLERS 1024 -static rt_cvar *_vars[RT_MAX_CVARS]; -static unsigned int _next = 0; -static rt_mutex *_mutex = NULL; +typedef struct rt_cvar_change_event_list { + rt_cvar_change_event_handler_fn *Handler; + void *userdata; + struct rt_cvar_change_event_list *next; +} rt_cvar_change_event_list; -void rtRegisterCVAR(rt_cvar *cvar) { - /* TODO(Kevin): Check if we have a loaded value for that cvar. - * If yes, override the provided default value. */ +typedef struct { + rt_cvar *cvar; + rt_cvar_change_event_list *change_event_handlers; +} rt_cvar_container; + +static rt_cvar_container _vars[RT_MAX_CVARS]; +static unsigned int _next_cvar = 0; +static rt_cvar_change_event_list _change_event_handlers[RT_MAX_EVENT_HANDLERS]; +static unsigned int _next_event_handler = 0; +static rt_mutex *_mutex = NULL; + +RT_DLLEXPORT void rtRegisterCVAR(rt_cvar *cvar) { if (!_mutex) _mutex = rtCreateMutex(); rtLockMutex(_mutex); - if (_next < RT_MAX_CVARS) { - _vars[_next++] = cvar; + if (_next_cvar < RT_MAX_CVARS) { + rtLog("CVAR", "Registered cvar %s", cvar->name); + _vars[_next_cvar].cvar = cvar; + _vars[_next_cvar].change_event_handlers = NULL; + ++_next_cvar; } else { - rtReportError("cvar", "Ran out of space for CVars"); + rtReportError("CVAR", "Ran out of space for CVars"); } rtUnlockMutex(_mutex); } -rt_cvar *rtGetCVAR(const char *name) { +RT_DLLEXPORT rt_cvar *rtGetCVAR(const char *name) { if (!_mutex) _mutex = rtCreateMutex(); rt_cvar *var = NULL; rtLockMutex(_mutex); - for (unsigned int i = 0; i < _next; ++i) { - if (strcmp(name, _vars[i]->name) == 0) { - var = _vars[i]; + for (unsigned int i = 0; i < _next_cvar; ++i) { + if (strcmp(name, _vars[i].cvar->name) == 0) { + var = _vars[i].cvar; break; } } rtUnlockMutex(_mutex); return var; +} + +RT_DLLEXPORT void rtRegisterCVARChangeEventHandler(rt_cvar *cvar, + rt_cvar_change_event_handler_fn *event_handler, + void *userdata) { + RT_VERIFY(_mutex); + rtLockMutex(_mutex); + if (_next_event_handler == RT_MAX_EVENT_HANDLERS) { + rtReportError("CVAR", "Ran out of space for CVar change event handlers"); + rtUnlockMutex(_mutex); + return; + } + + for (unsigned int i = 0; i < _next_cvar; ++i) { + if (_vars[i].cvar == cvar) { + rt_cvar_change_event_list *entry = &_change_event_handlers[_next_event_handler++]; + entry->Handler = event_handler; + entry->next = _vars[i].change_event_handlers; + entry->userdata = userdata; + _vars[i].change_event_handlers = entry; + } + } + rtUnlockMutex(_mutex); +} + +RT_DLLEXPORT void rtNotifyCVARChange(const rt_cvar *cvar) { +#ifndef RT_DONT_LOG_CVAR_CHANGES + switch (cvar->type) { + case RT_CVAR_TYPE_INT: + rtLog("CVAR", "Changed %s to %d.", cvar->name, cvar->i); + break; + case RT_CVAR_TYPE_FLOAT: + rtLog("CVAR", "Changed %s to %f.", cvar->name, cvar->f); + break; + case RT_CVAR_TYPE_STRING: + rtLog("CVAR", "Changed %s to '%s'.", cvar->name, cvar->s); + break; + case RT_CVAR_TYPE_SIZE: + rtLog("CVAR", "Changed %s to %zu.", cvar->name, cvar->sz); + break; + default: + rtLog("CVAR", "Changed %s, but the cvar has an invalid type.", cvar->name); + break; + } +#endif + + RT_VERIFY(_mutex); + rtLockMutex(_mutex); + for (unsigned int i = 0; i < _next_cvar; ++i) { + if (_vars[i].cvar == cvar) { + rt_cvar_change_event_list *entry = _vars[i].change_event_handlers; + rtUnlockMutex(_mutex); + + while (entry) { + /* We don't want to hold the mutex while the event handler runs, + * because that would prevent any calls to CVar functions. */ + entry->Handler(_vars[i].cvar, entry->userdata); + + rtLockMutex(_mutex); + entry = entry->next; + rtUnlockMutex(_mutex); + } + return; + } + } + /* Not found... */ + rtUnlockMutex(_mutex); +} + +static int Handler(void *user, const char *section, const char *name, const char *value) { + /* We currently ignore sections. They could be used to categorize variables by system? */ + RT_UNUSED(section); + const char *file_path = user; + + rt_cvar *cvar = rtGetCVAR(name); + if (!cvar) { + /* Not a critical error. */ + rtLog("CVAR", "Unknown CVar %s in config file %s.", name, file_path); + return 1; + } + + int num_read = 0; + switch (cvar->type) { + case RT_CVAR_TYPE_INT: + num_read = sscanf(value, "%d", &cvar->i); + break; + case RT_CVAR_TYPE_FLOAT: + num_read = sscanf(value, "%f", &cvar->f); + break; + case RT_CVAR_TYPE_STRING: { + num_read = 1; + char *copy = rtStoreString(value); + if (!copy) { + rtReportError("CVAR", + "Failed to store string value of cvar %s in config file %s.", + name, + file_path); + return 0; + } + cvar->s = copy; + break; + } + case RT_CVAR_TYPE_SIZE: + num_read = sscanf(value, "%zu", &cvar->sz); + break; + default: + rtReportError("CVAR", "CVar %s has an invalid type.", cvar->name); + return 0; + } + + if (num_read == 1) { + rtNotifyCVARChange(cvar); + } else { + rtLog("CVAR", "Failed to read value of CVar %s in config file %s.", cvar->name, file_path); + } + + return 1; +} + +/* Config files are ini files */ +static rt_result ProcessConfigFile(const void *buffer, size_t fsz, const char *file_path) { + rt_result result = RT_SUCCESS; + if (ini_parse_string(buffer, Handler, (void *)file_path) < 0) { + rtLog("CVAR", "Failed to parse config file '%s'.", file_path); + return RT_UNKNOWN_ERROR; + } + return result; +} + +RT_DLLEXPORT rt_result rtProcessConfigFiles(unsigned int count, const rt_file_id *fids) { + if (count > RT_LOAD_BATCH_MAX_SIZE) { + rtReportError("CVAR", + "Got %u config files, but the maximum number of config files is %u.", + count, + RT_LOAD_BATCH_MAX_SIZE); + return RT_INVALID_VALUE; + } + + typedef struct { + void *buffer; + size_t fsz; + const char *path; + } config_file_buf; + + rt_temp_arena temp = rtGetTemporaryArena(NULL, 0); + if (!temp.arena) + return RT_OUT_OF_MEMORY; + config_file_buf *configs = RT_ARENA_PUSH_ARRAY_ZERO(temp.arena, config_file_buf, count); + if (!configs) { + rtReturnTemporaryArena(temp); + return RT_OUT_OF_MEMORY; + } + rt_aio_handle *aios = RT_ARENA_PUSH_ARRAY_ZERO(temp.arena, rt_aio_handle, count); + if (!configs) { + rtReturnTemporaryArena(temp); + return RT_OUT_OF_MEMORY; + } + rt_result res = RT_SUCCESS; + + rt_load_batch load_batch; + load_batch.num_loads = count; + for (unsigned int i = 0; i < count; ++i) { + const char *path = rtGetFilePath(fids[i]); + if (!path) { + rtReportError("CVAR", "Invalid config file path!"); + res = RT_INVALID_VALUE; + goto out; + } + size_t fsz = rtGetFileSize(path); + load_batch.loads[i].dest = rtAllocBuffer(fsz); + if (!load_batch.loads[i].dest) { + rtReportError("CVAR", "Failed to allocate memory for config file."); + res = RT_OUT_OF_MEMORY; + goto out; + } + load_batch.loads[i].num_bytes = fsz; + load_batch.loads[i].offset = 0; + load_batch.loads[i].file = fids[i]; + + configs[i].buffer = load_batch.loads[i].dest; + configs[i].fsz = fsz; + configs[i].path = path; + } + + if ((res = rtSubmitLoadBatch(&load_batch, aios)) != RT_SUCCESS) { + goto out; + } + + for (unsigned int i = 0; i < count; ++i) { + rt_aio_state state = rtWaitForAIOCompletion(aios[i]); + if (state == RT_AIO_STATE_FINISHED) { + res = ProcessConfigFile(configs[i].buffer, configs[i].fsz, configs[i].path); + if (res != RT_SUCCESS) + goto out; + } else { + RT_ASSERT(state == RT_AIO_STATE_FAILED, "Unexpected aio state."); + res = RT_UNKNOWN_ERROR; + goto out; + } + } + +out: + for (unsigned int i = 0; i < count; ++i) { + if (configs[i].buffer) + rtReleaseBuffer(configs[i].buffer, configs[i].fsz); + if (aios[i] != RT_AIO_INVALID_HANDLE) + rtReleaseAIO(aios[i]); + } + rtReturnTemporaryArena(temp); + return res; +} + +/* This gets called before _any_ engine systems are available (especially AIO, ftab and + * Buffermanager) */ +void ProcessEarlyEngineConfigs(void) { + const char *engine_cfgs[] = {"cfg/engine_early.cfg"}; + void *buf = NULL; + size_t buf_sz = 0; + + for (unsigned int i = 0; i < RT_ARRAY_COUNT(engine_cfgs); ++i) { + if (!rtDoesFileExist(engine_cfgs[i])) { + rtLog("CVAR", "Could not find early engine config file %s", engine_cfgs[i]); + continue; + } + size_t fsz = rtGetFileSize(engine_cfgs[i]); + if ((fsz + 1) > buf_sz) { + buf = realloc(buf, fsz + 1); + buf_sz = fsz + 1; + } + if (!rtSyncReadWholeFile(engine_cfgs[i], buf, buf_sz)) { + rtLog("CVAR", "Failed to read early engine config file %s", engine_cfgs[i]); + continue; + } + ((char *)buf)[fsz] = '\0'; + if (ProcessConfigFile(buf, fsz, engine_cfgs[i]) != RT_SUCCESS) { + rtLog("CVAR", "Failed to process early engine config file %s", engine_cfgs[i]); + } + } + free(buf); } \ No newline at end of file diff --git a/src/runtime/config.h b/src/runtime/config.h index bc0f4c3..42fc03f 100644 --- a/src/runtime/config.h +++ b/src/runtime/config.h @@ -1,6 +1,7 @@ #ifndef RT_CONFIG_H #define RT_CONFIG_H +#include "file_tab.h" #include "runtime.h" #ifdef __cplusplus @@ -26,6 +27,8 @@ typedef struct { rt_cvar_type type; } rt_cvar; +typedef void(rt_cvar_change_event_handler_fn)(rt_cvar *cvar, void *userdata); + /* n variable name, d description string, v default value*/ #define RT_CVAR_I(n, d, v) \ rt_cvar n = {.name = #n, .description = d, .i = (v), .type = RT_CVAR_TYPE_INT} @@ -40,6 +43,20 @@ RT_DLLEXPORT void rtRegisterCVAR(rt_cvar *cvar); RT_DLLEXPORT rt_cvar *rtGetCVAR(const char *name); +/* Change event handlers are called when rtNotifyCVARChange is called. + * The system gives no guarantees about the order in which these are called. + * + * NOTE: A internal mutex is held while the event handlers run, which means that */ +RT_DLLEXPORT void rtRegisterCVARChangeEventHandler(rt_cvar *cvar, + rt_cvar_change_event_handler_fn *event_handler, + void *userdata); + +RT_DLLEXPORT void rtNotifyCVARChange(const rt_cvar *cvar); + +/* Load and parse configuration files. + * They are processed in-order, meaning later files can overwrite earlier files. */ +RT_DLLEXPORT rt_result rtProcessConfigFiles(unsigned int count, const rt_file_id *fids); + #ifdef __cplusplus } #endif diff --git a/src/runtime/fsutils.c b/src/runtime/fsutils.c index 8aeab88..454fcfe 100644 --- a/src/runtime/fsutils.c +++ b/src/runtime/fsutils.c @@ -98,7 +98,14 @@ RT_DLLEXPORT size_t rtGetFileSize(const char *path) { return (size_t)attribs.nFileSizeHigh << 32 | (size_t)attribs.nFileSizeLow; } -uint64_t rtGetCurrentTimestamp(void) { +RT_DLLEXPORT bool rtDoesFileExist(const char *path) { + WCHAR wpath[MAX_PATH]; + MultiByteToWideChar(CP_UTF8, MB_PRECOMPOSED, path, -1, wpath, MAX_PATH); + DWORD dwAttrib = GetFileAttributesW(wpath); + return (dwAttrib != INVALID_FILE_ATTRIBUTES && !(dwAttrib & FILE_ATTRIBUTE_DIRECTORY)); +} + +RT_DLLEXPORT uint64_t rtGetCurrentTimestamp(void) { FILETIME ft; GetSystemTimeAsFileTime(&ft); uint64_t ts = ft.dwLowDateTime; @@ -106,7 +113,7 @@ uint64_t rtGetCurrentTimestamp(void) { return ts; } -uint64_t rtGetFileModificationTimestamp(const char *path) { +RT_DLLEXPORT uint64_t rtGetFileModificationTimestamp(const char *path) { WCHAR wpath[MAX_PATH]; MultiByteToWideChar(CP_UTF8, MB_PRECOMPOSED, path, -1, wpath, MAX_PATH); WIN32_FILE_ATTRIBUTE_DATA attribs; @@ -117,12 +124,34 @@ uint64_t rtGetFileModificationTimestamp(const char *path) { return ts; } +RT_DLLEXPORT bool rtSyncReadWholeFile(const char *path, void *dest, size_t dest_size) { + size_t fsz = rtGetFileSize(path); + if (fsz > dest_size) + return false; + + WCHAR wpath[MAX_PATH]; + MultiByteToWideChar(CP_UTF8, MB_PRECOMPOSED, path, -1, wpath, MAX_PATH); + HANDLE hFile = CreateFileW(wpath, + GENERIC_READ, + FILE_SHARE_READ, + NULL, + OPEN_EXISTING, + FILE_ATTRIBUTE_NORMAL, + NULL); + if (!hFile) + return false; + BOOL bResult = ReadFile(hFile, dest, (DWORD)fsz, NULL, NULL); + CloseHandle(hFile); + return (bool)bResult; +} + #elif defined(__linux__) #include #include #include #include +#include struct rt_scandir_handle_s { DIR *handle; @@ -208,17 +237,40 @@ RT_DLLEXPORT size_t rtGetFileSize(const char *path) { return (size_t)st.st_size; } -uint64_t rtGetCurrentTimestamp(void) { +RT_DLLEXPORT bool rtDoesFileExist(const char *path) { + struct stat st; + if (stat(path, &st) != 0) + return 0; + return 1; +} + +RT_DLLEXPORT uint64_t rtGetCurrentTimestamp(void) { struct timespec ts; clock_gettime(CLOCK_REALTIME, &ts); return (uint64_t)ts.tv_sec; } -uint64_t rtGetFileModificationTimestamp(const char *path) { +RT_DLLEXPORT uint64_t rtGetFileModificationTimestamp(const char *path) { struct stat st; if (stat(path, &st) != 0) return 0; return (uint64_t)st.st_mtim.tv_sec; } +RT_DLLEXPORT bool rtSyncReadWholeFile(const char *path, void *dest, size_t dest_size) { + FILE *f = fopen(path, "rb"); + if (!f) + return false; + fseek(f, SEEK_END, 0); + size_t fsz = (size_t)ftell(f); + fseek(f, SEEK_SET, 0); + if (fsz > dest_size) { + fclose(f); + return false; + } + size_t n = fread(dest, 1 fsz, f); + fclose(f); + return n == fsz; +} + #endif diff --git a/src/runtime/fsutils.h b/src/runtime/fsutils.h index 1dfd9ba..0fb17b5 100644 --- a/src/runtime/fsutils.h +++ b/src/runtime/fsutils.h @@ -35,9 +35,15 @@ RT_DLLEXPORT size_t rtGetFileSize(const char *path); RT_DLLEXPORT uint64_t rtGetFileModificationTimestamp(const char *path); +RT_DLLEXPORT bool rtDoesFileExist(const char *path); + /* Does not really fit here, but it is mostly useful for comparison with file timestamps. */ RT_DLLEXPORT uint64_t rtGetCurrentTimestamp(void); +/* CAUTION: This does a synchronous file read!!! ONLY use this IF YOU KNOW WHAT YOU ARE DOING. + * 99.9% of file reads should probably use the AIO system instead. */ +RT_DLLEXPORT bool rtSyncReadWholeFile(const char *path, void *dest, size_t dest_size); + #ifdef __cplusplus } #endif diff --git a/src/runtime/hashing.c b/src/runtime/hashing.c index afed5dd..f7a0d0c 100644 --- a/src/runtime/hashing.c +++ b/src/runtime/hashing.c @@ -1,17 +1,18 @@ #include "hashing.h" -#include +#include #include -/* XXH32 of "recreational.tech" */ +/* XXH32/64 of "recreational.tech" with seed 0 */ #define HASH32_SEED 0xd9035e35 +#define HASH64_SEED 0xa14e16c3ae8b282dULL static_assert(sizeof(rt_hash64) == sizeof(XXH64_hash_t), "Size mismatch between rt_hash64 and XXH64_hash_t!"); static_assert(sizeof(rt_hash32) == sizeof(XXH32_hash_t), "Size mismatch between rt_hash32 and XXH32_hash_t!"); RT_DLLEXPORT rt_hash64 rtHashBytes(const void *begin, size_t count) { - return XXH3_64bits(begin, count); + return XXH64(begin, count, HASH64_SEED); } diff --git a/src/runtime/init.c b/src/runtime/init.c index 2132bdd..e7725f9 100644 --- a/src/runtime/init.c +++ b/src/runtime/init.c @@ -4,19 +4,23 @@ #include "file_tab.h" #include "buffer_manager.h" +extern rt_cvar rt_MaxConcurrentAsyncIO; +extern rt_cvar rt_AssertEnabled; extern rt_cvar rt_BufferMemoryBudget; extern rt_cvar rt_FileTabCapacity; -extern rt_cvar rt_MaxConcurrentAsyncIO; +extern rt_cvar rt_TemporaryArenaSize; extern rt_cvar rt_ResourceDirectory; extern rt_cvar rt_ResourceCacheSize; extern rt_cvar rt_MaxCachedResources; extern rt_cvar rt_ResourceNamespaceSize; extern rt_cvar rt_DisableResourceNamespaceLoad; -void RegisterRuntimeCVars(void) { +static void RegisterRuntimeCVars(void) { + rtRegisterCVAR(&rt_MaxConcurrentAsyncIO); + rtRegisterCVAR(&rt_AssertEnabled); rtRegisterCVAR(&rt_BufferMemoryBudget); rtRegisterCVAR(&rt_FileTabCapacity); - rtRegisterCVAR(&rt_MaxConcurrentAsyncIO); + rtRegisterCVAR(&rt_TemporaryArenaSize); rtRegisterCVAR(&rt_ResourceDirectory); rtRegisterCVAR(&rt_ResourceCacheSize); rtRegisterCVAR(&rt_MaxCachedResources); @@ -25,6 +29,8 @@ void RegisterRuntimeCVars(void) { } extern void SetMainThreadId(void); +extern void ProcessEarlyEngineConfigs(void); +extern void InitStringStorage(void); extern rt_result InitBufferManager(void); extern void ShutdownBufferManager(void); @@ -39,6 +45,9 @@ extern rt_result InitTiming(void); RT_DLLEXPORT rt_result rtInitRuntime(void) { SetMainThreadId(); RegisterRuntimeCVars(); + InitStringStorage(); + + ProcessEarlyEngineConfigs(); rt_result res; if ((res = InitTiming()) != RT_SUCCESS) { diff --git a/src/runtime/meson.build b/src/runtime/meson.build index 9c15ecf..7920da5 100644 --- a/src/runtime/meson.build +++ b/src/runtime/meson.build @@ -1,5 +1,12 @@ -# Runtime -runtime_deps = [thread_dep, m_dep, windowing_dep] +# Dependencies installed via wrap +inih_proj = subproject('inih', default_options: ['default_library=static', 'b_sanitize=none', 'utf-8_bom=true', 'multi-line_entries=true', 'report_line_numbers=true']) +inih_dep = inih_proj.get_variable('inih_dep') +lz4_proj = subproject('lz4', default_options: ['default_library=static', 'b_sanitize=none']) +lz4_dep = lz4_proj.get_variable('liblz4_dep') +xxhash_proj = subproject('xxhash', default_options: ['default_library=static', 'b_sanitize=none']) +xxhash_dep = xxhash_proj.get_variable('xxhash_dep') + +runtime_deps = [thread_dep, m_dep, windowing_dep, inih_dep, lz4_dep, xxhash_dep] runtime_incdirs = contrib_incdir runtime_lib = library('rt', # Project Sources @@ -18,6 +25,7 @@ runtime_lib = library('rt', 'mem_arena.h', 'resources.h', 'runtime.h', + 'string_storage.h', 'threading.h', 'threading_helpers.hpp', 'timing.h', @@ -40,6 +48,7 @@ runtime_lib = library('rt', 'mem_arena.c', 'resource_manager.c', 'sprint.c', + 'string_storage.c', 'text.c', 'threading_cond.c', 'threading_mutex.c', @@ -48,9 +57,6 @@ runtime_lib = library('rt', 'threading_thread.c', 'timing.c', - # Contrib Sources - contrib_dir / 'xxhash/xxhash.c', - contrib_dir / 'lz4/lz4.c', dependencies : runtime_deps, include_directories : [engine_incdir, runtime_incdirs], c_pch : 'pch/rt_pch.h', diff --git a/src/runtime/resource_manager.c b/src/runtime/resource_manager.c index a839f8b..5b4cab7 100644 --- a/src/runtime/resource_manager.c +++ b/src/runtime/resource_manager.c @@ -17,13 +17,13 @@ #include RT_CVAR_S(rt_ResourceDirectory, "The directory used for storing resources. Default: res", "res"); -RT_CVAR_I(rt_ResourceCacheSize, +RT_CVAR_SZ(rt_ResourceCacheSize, "The maximum amount of memory used for caching resources. Default: 512MB", RT_MB(512)); -RT_CVAR_I(rt_MaxCachedResources, +RT_CVAR_SZ(rt_MaxCachedResources, "The maximum number of simultaneously cached resources. Default: 1024", 1024); -RT_CVAR_I(rt_ResourceNamespaceSize, +RT_CVAR_SZ(rt_ResourceNamespaceSize, "The maximum number of resources that can exist. Default: 1.048.576", 1048576); RT_CVAR_I(rt_DisableResourceNamespaceLoad, @@ -312,7 +312,7 @@ static void ShutdownResourceCache(void) { /* NOTE(Kevin): Only call this while holding a write-lock on the cache. * The function locks the reclaim heap lock itself. */ static bool FreeCacheSpace(size_t space) { - size_t free_space = (size_t)rt_ResourceCacheSize.i - _cache.current_size; + size_t free_space = rt_ResourceCacheSize.sz - _cache.current_size; rtLockMutex(_cache.heap_lock); while (free_space < space && !rtMinheapIsEmpty(&_cache.reclaim_heap)) { rt_cached_resource_ref ref; @@ -336,7 +336,7 @@ static bool FreeCacheSpace(size_t space) { res->size = 0; /* Remove from lookup table */ - size_t ht_size = (size_t)rt_MaxCachedResources.i * 2; + size_t ht_size = rt_MaxCachedResources.sz * 2; for (size_t off = 0; off < ht_size; ++off) { size_t slot = (ref.id + off) % ht_size; if (_cache.resource_ids[slot] == ref.id) { @@ -352,7 +352,7 @@ static bool FreeCacheSpace(size_t space) { } static unsigned int FindCachedResource(rt_resource_id id) { - size_t ht_size = (size_t)rt_MaxCachedResources.i * 2; + size_t ht_size = rt_MaxCachedResources.sz * 2; for (size_t off = 0; off < ht_size; ++off) { size_t slot = (id + off) % ht_size; if (_cache.resource_ids[slot] == id) @@ -378,7 +378,7 @@ static rt_resource *CacheResource(rt_resource_id id, const rt_resource *res) { } else { /* Insert into cache */ size_t total_size = sizeof(rt_resource) + GetResourceDataSize(res); - if (_cache.current_size + total_size >= (size_t)rt_ResourceCacheSize.i) { + if (_cache.current_size + total_size >= rt_ResourceCacheSize.sz) { if (!FreeCacheSpace(total_size)) { rtLog("RESMGR", "Unable to reclaim %zu kB from the resource cache.", @@ -421,7 +421,7 @@ static rt_resource *CacheResource(rt_resource_id id, const rt_resource *res) { /* Insert into lookup table */ bool inserted = false; - size_t ht_size = (size_t)rt_MaxCachedResources.i * 2; + size_t ht_size = rt_MaxCachedResources.sz * 2; for (size_t off = 0; off < ht_size; ++off) { size_t slot = (id + off) % ht_size; if (_cache.resource_ids[slot] == RT_INVALID_RESOURCE_ID || @@ -452,7 +452,7 @@ static void InsertPrefetchResourceIntoCache(rt_resource_id id, return; } - if (_cache.current_size + load_buffer_size >= (size_t)rt_ResourceCacheSize.i) { + if (_cache.current_size + load_buffer_size >= rt_ResourceCacheSize.sz) { if (!FreeCacheSpace(load_buffer_size)) { rtLog("RESMGR", "Unable to reclaim %zu kB from the resource cache.", @@ -484,7 +484,7 @@ static void InsertPrefetchResourceIntoCache(rt_resource_id id, /* Insert into lookup table */ bool inserted = false; - size_t ht_size = (size_t)rt_MaxCachedResources.i * 2; + size_t ht_size = rt_MaxCachedResources.sz * 2; for (size_t off = 0; off < ht_size; ++off) { size_t slot = (id + off) % ht_size; if (_cache.resource_ids[slot] == RT_INVALID_RESOURCE_ID || @@ -508,7 +508,7 @@ static void InsertPrefetchResourceIntoCache(rt_resource_id id, static rt_resource_namespace _namespace; static rt_result InitResourceNamespace(void) { - size_t size = (size_t)rt_ResourceNamespaceSize.i; + size_t size = rt_ResourceNamespaceSize.sz; if (size == 0) { rtReportError("RESMGR", "rt_ResourceNamespaceSize must be greater than 0."); return RT_INVALID_VALUE; @@ -567,7 +567,7 @@ static void LoadNamespace(void) { rt_hash64 entries_hash = rtHashBytes(entries, sizeof(rt_namespace_file_entry) * header->num_entries); if (entries_hash == header->checksum) { - size_t ns_size = (size_t)rt_ResourceNamespaceSize.i; + size_t ns_size = rt_ResourceNamespaceSize.sz; for (uint32_t i = 0; i < header->num_entries; ++i) { bool inserted = false; for (size_t j = 0; j < ns_size; ++j) { @@ -610,7 +610,7 @@ static void LoadNamespace(void) { static rt_resource_ref GetResourceRef(rt_resource_id id) { rt_resource_ref ref = {.file = RT_INVALID_FILE_ID}; rtLockRead(&_namespace.lock); - size_t ns_size = (size_t)rt_ResourceNamespaceSize.i; + size_t ns_size = rt_ResourceNamespaceSize.sz; for (size_t off = 0; off < ns_size; ++off) { size_t at = (id + off) % ns_size; if (_namespace.ids[at] == id) { @@ -772,7 +772,7 @@ RT_DLLEXPORT rt_result rtGetResource(rt_resource_id id, void *dest) { RT_DLLEXPORT size_t rtGetResourceSize(rt_resource_id id) { size_t size = 0; rtLockRead(&_namespace.lock); - size_t ns_size = (size_t)rt_ResourceNamespaceSize.i; + size_t ns_size = rt_ResourceNamespaceSize.sz; for (size_t off = 0; off < ns_size; ++off) { size_t at = (id + off) % ns_size; if (_namespace.ids[at] == id) { @@ -859,7 +859,7 @@ RT_DLLEXPORT rt_result rtCreateResources(uint32_t count, const rt_resource *resources, rt_resource_id *ids) { rt_result result = RT_SUCCESS; - size_t ns_size = (size_t)rt_ResourceNamespaceSize.i; + size_t ns_size = rt_ResourceNamespaceSize.sz; rt_write_batch writes = {.num_writes = 0}; rt_aio_handle write_handles[RT_WRITE_BATCH_MAX_SIZE]; @@ -1090,7 +1090,7 @@ RT_DLLEXPORT void rtSaveResourceNamespace(void) { rt_temp_arena temp = rtGetTemporaryArena(NULL, 0); rtLockRead(&_namespace.lock); uint32_t entry_count = 0; - for (size_t i = 0; i < (size_t)rt_ResourceNamespaceSize.i; ++i) { + for (size_t i = 0; i < rt_ResourceNamespaceSize.sz; ++i) { if (_namespace.ids[i] != RT_INVALID_RESOURCE_ID) ++entry_count; } @@ -1108,7 +1108,7 @@ RT_DLLEXPORT void rtSaveResourceNamespace(void) { rt_namespace_file_header *header = buffer; rt_namespace_file_entry *entries = (rt_namespace_file_entry *)(header + 1); size_t at = 0; - for (size_t i = 0; i < (size_t)rt_ResourceNamespaceSize.i; ++i) { + for (size_t i = 0; i < rt_ResourceNamespaceSize.sz; ++i) { if (_namespace.ids[i] != RT_INVALID_RESOURCE_ID) { entries[at].id = _namespace.ids[i]; entries[at].ref = _namespace.refs[i]; diff --git a/src/runtime/string_storage.c b/src/runtime/string_storage.c new file mode 100644 index 0000000..40e2485 --- /dev/null +++ b/src/runtime/string_storage.c @@ -0,0 +1,120 @@ +#include "string_storage.h" + +#include +#include + +#define STRING_STORAGE_BUDGET RT_MB(1) +/* Must be greater than sizeof(rt_free_range) */ +#define MIN_SPLIT_LENGTH 64 + +#define HEADER_MAGIC 0x1234567812345678uLL + +typedef struct rt_free_range { + /* Begin is address of this struct */ + size_t length; + struct rt_free_range *next; +} rt_free_range; + +typedef struct { + size_t alloc_len; + uint64_t magic; +} rt_string_header; + +static char _mem[STRING_STORAGE_BUDGET]; +static rt_free_range *_first_free; + +_Static_assert(sizeof(rt_free_range) < MIN_SPLIT_LENGTH, "Minimal split length must be greater than the size of a free list node."); + +void InitStringStorage(void) { + _first_free = (rt_free_range *)& _mem[0]; + _first_free->length = STRING_STORAGE_BUDGET; + _first_free->next = NULL; +} + +RT_DLLEXPORT char *rtStoreString(const char *sz) { + size_t len = strlen(sz) + 1; + size_t alloc_len = len + sizeof(rt_string_header); + + /* Look for a free range that is large enough */ + rt_free_range *free_range = _first_free; + rt_free_range *prev = NULL; + while (free_range && free_range->length < alloc_len) { + prev = free_range; + free_range = free_range->next; + } + if (!free_range) + return NULL; + + /* Should we split? */ + if ((free_range->length - alloc_len) >= MIN_SPLIT_LENGTH) { + rt_free_range *split = (rt_free_range *)((char *)free_range + alloc_len); + split->length = free_range->length - alloc_len; + + /* Enter into the linked list and remove the allocated range */ + split->next = free_range->next; + if (prev) + prev->next = split; + else + _first_free = split; + } else { + /* Just remove us */ + alloc_len = free_range->length; + if (prev) + prev->next = free_range->next; + else + _first_free = free_range->next; + } + + char *ptr = (char *)free_range; + memcpy(ptr + sizeof(rt_string_header), sz, len); + rt_string_header *header = (rt_string_header*)ptr; + header->alloc_len = alloc_len; + header->magic = HEADER_MAGIC; + return ptr + sizeof(rt_string_header); +} + +RT_DLLEXPORT void rtFreeString(char *storeStringRet) { + if (!storeStringRet) + return; + + size_t len = strlen(storeStringRet) + 1; + if (((uintptr_t)storeStringRet < (uintptr_t)&_mem[sizeof(rt_string_header)]) || + (((uintptr_t)storeStringRet + len) > (uintptr_t)&_mem[STRING_STORAGE_BUDGET - 1])) { + rtReportError("STRSTORAGE", + "Tried to free a string that is not stored in the string storage."); + return; + } + rt_string_header *header = (rt_string_header *)(storeStringRet - sizeof(rt_string_header)); + if (header->magic != HEADER_MAGIC) { + rtReportError("STRSTORAGE", "Invalid header in string storage!"); + return; + } + + size_t alloc_len = header->alloc_len; + rt_free_range *free_range = (rt_free_range *)header; + free_range->length = alloc_len; + + /* Search the place for inserting it */ + rt_free_range *prev = _first_free; + if ((uintptr_t)prev < (uintptr_t)free_range) { + while (prev->next) { + if ((uintptr_t)prev->next > (uintptr_t)free_range) + break; + } + free_range->next = prev->next; + prev->next = free_range; + } else { + /* Swap */ + rt_free_range *t = _first_free; + _first_free = free_range; + free_range = t; + prev = _first_free; + } + + /* Check if we can merge */ + while (((uintptr_t)prev + prev->length) == (uintptr_t)free_range) { + prev->length += alloc_len; + prev->next = free_range->next; + free_range = prev->next; + } +} \ No newline at end of file diff --git a/src/runtime/string_storage.h b/src/runtime/string_storage.h new file mode 100644 index 0000000..8109411 --- /dev/null +++ b/src/runtime/string_storage.h @@ -0,0 +1,23 @@ +#ifndef RT_STRING_STORAGE_H +#define RT_STRING_STORAGE_H + +/* Memory management for strings. + * This should only be used for relatively short strings (<256 bytes) + * For large amounts of data, use the block allocator instead. + */ + +#include "runtime.h" + +#ifdef __cplusplus +extern "C" { +#endif + +RT_DLLEXPORT char *rtStoreString(const char *sz); + +RT_DLLEXPORT void rtFreeString(char *storeStringRet); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/subprojects/inih.wrap b/subprojects/inih.wrap new file mode 100644 index 0000000..a4079d2 --- /dev/null +++ b/subprojects/inih.wrap @@ -0,0 +1,11 @@ +[wrap-file] +directory = inih-r57 +source_url = https://github.com/benhoyt/inih/archive/r57.tar.gz +source_filename = inih-r57.tar.gz +source_hash = f03f98ca35c3adb56b2358573c8d3eda319ccd5287243d691e724b7eafa970b3 +source_fallback_url = https://github.com/mesonbuild/wrapdb/releases/download/inih_r57-1/inih-r57.tar.gz +wrapdb_version = r57-1 + +[provide] +inih = inih_dep +inireader = INIReader_dep diff --git a/subprojects/lz4.wrap b/subprojects/lz4.wrap new file mode 100644 index 0000000..7708a4a --- /dev/null +++ b/subprojects/lz4.wrap @@ -0,0 +1,12 @@ +[wrap-file] +directory = lz4-1.9.4 +source_url = https://github.com/lz4/lz4/archive/v1.9.4.tar.gz +source_filename = lz4-1.9.4.tgz +source_hash = 0b0e3aa07c8c063ddf40b082bdf7e37a1562bda40a0ff5272957f3e987e0e54b +patch_filename = lz4_1.9.4-2_patch.zip +patch_url = https://wrapdb.mesonbuild.com/v2/lz4_1.9.4-2/get_patch +patch_hash = 4f33456cce986167d23faf5d28a128e773746c10789950475d2155a7914630fb +wrapdb_version = 1.9.4-2 + +[provide] +liblz4 = liblz4_dep diff --git a/subprojects/xxhash.wrap b/subprojects/xxhash.wrap new file mode 100644 index 0000000..e4d6125 --- /dev/null +++ b/subprojects/xxhash.wrap @@ -0,0 +1,13 @@ +[wrap-file] +directory = xxHash-0.8.2 +source_url = https://github.com/Cyan4973/xxHash/archive/v0.8.2.tar.gz +source_filename = xxHash-0.8.2.tar.gz +source_hash = baee0c6afd4f03165de7a4e67988d16f0f2b257b51d0e3cb91909302a26a79c4 +patch_filename = xxhash_0.8.2-1_patch.zip +patch_url = https://wrapdb.mesonbuild.com/v2/xxhash_0.8.2-1/get_patch +patch_hash = e721ef7a4c4ee0ade8b8440f6f7cb9f935b68e825249d74cb1c2503c53e68d25 +source_fallback_url = https://github.com/mesonbuild/wrapdb/releases/download/xxhash_0.8.2-1/xxHash-0.8.2.tar.gz +wrapdb_version = 0.8.2-1 + +[provide] +libxxhash = xxhash_dep