16 #if !defined(MEOW_HASH_INTRINSICS_H)
23 #if _M_AMD64 || _M_IX86
26 #include <arm64_neon.h>
31 #if __x86_64__ || __i386__
32 #include <x86intrin.h>
// Architecture selection: chooses MEOW_HASH_INTEL or MEOW_HASH_ARMV8 from the
// compiler's predefined target macros and sets MEOW_PAGESIZE next to it.
// NOTE(review): the guard uses ||, so detection is skipped only when BOTH
// macros are already defined; pre-defining just one still runs the tests
// below. Confirm that this is the intended behaviour before changing it,
// because MEOW_PAGESIZE is only defined inside this section.
43 #if !defined(MEOW_HASH_INTEL) || !defined(MEOW_HASH_ARMV8)
// 64-bit x86.
44 #if __x86_64__ || _M_AMD64
45 #define MEOW_HASH_INTEL 1
47 #define MEOW_PAGESIZE 4096
// 32-bit x86.
48 #elif __i386__ || _M_IX86
49 #define MEOW_HASH_INTEL 1
51 #define MEOW_PAGESIZE 4096
// 64-bit ARM.
52 #elif __aarch64__ || _M_ARM64
53 #define MEOW_HASH_ARMV8 1
55 #define MEOW_PAGESIZE 4096
// Build stops here with a diagnostic. NOTE(review): the branch that leads to
// this line is not visible in this excerpt - presumably the final #else.
57 #error Cannot determine architecture to use!
// Unsigned integer aliases, spelled as macros over the built-in types.
65 #define meow_u8 char unsigned
66 #define meow_u16 short unsigned
67 #define meow_u32 int unsigned
68 #define meow_u64 long long unsigned
// meow_umm is given two alternative definitions of different width.
// NOTE(review): the conditional that selects between them is not visible
// here - presumably keyed on 64-bit vs 32-bit targets; confirm.
71 #define meow_umm long long unsigned
73 #define meow_umm int unsigned
// x86 vector types. At each width the plain name and the AES-state name are
// aliases for the same register type.
82 #define meow_u128 __m128i
83 #define meow_aes_128 __m128i
84 #define meow_u256 __m256i
85 #define meow_aes_256 __m256i
86 #define meow_u512 __m512i
87 #define meow_aes_512 __m512i
// Read the I-th 32-bit / 64-bit lane out of the 128-bit value A.
89 #define MeowU32From(A, I) (_mm_extract_epi32((A), (I)))
90 #define MeowU64From(A, I) (_mm_extract_epi64((A), (I)))
// Byte-wise equality of two 128-bit values: the compare yields 0xFF per
// matching byte, movemask collects one bit per byte, so 0xFFFF means that all
// 16 bytes matched.
91 #define MeowHashesAreEqual(A, B) (_mm_movemask_epi8(_mm_cmpeq_epi8((A), (B))) == 0xFFFF)
// 128-bit operations on x86.
// One AES decryption round on Prior, using Xor as the round key.
93 #define Meow128_AESDEC(Prior, Xor) _mm_aesdec_si128((Prior), (Xor))
// Same round, with the round key fetched from memory by an unaligned load.
94 #define Meow128_AESDEC_Mem(Prior, Xor) _mm_aesdec_si128((Prior), _mm_loadu_si128((meow_u128 *)(Xor)))
// On this path the AES state is already the final value, so this is the identity.
95 #define Meow128_AESDEC_Finalize(A) (A)
// Builds a 128-bit value from two 64-bit halves. _mm_set_epi64x takes the
// high half first, hence the swapped argument order.
96 #define Meow128_Set64x2(Low64, High64) _mm_set_epi64x((High64), (Low64))
// The state form is the same as the plain form here.
97 #define Meow128_Set64x2_State(Low64, High64) Meow128_Set64x2(Low64, High64)
// Reads a 128-bit constant by dereferencing Ptr directly as a vector.
// NOTE(review): a direct dereference assumes Ptr is suitably aligned for
// meow_u128 - confirm against the definition of the constants it is used on.
98 #define Meow128_GetAESConstant(Ptr) (*(meow_u128 *)(Ptr))
// Bitwise AND of A with 16 bytes loaded (unaligned) from B.
100 #define Meow128_And_Mem(A,B) _mm_and_si128((A),_mm_loadu_si128((meow_u128 *)(B)))
// Byte shuffle of 16 bytes loaded from Mem, using 16 control bytes loaded from Control.
101 #define Meow128_Shuffle_Mem(Mem,Control) _mm_shuffle_epi8(_mm_loadu_si128((meow_u128 *)(Mem)),_mm_loadu_si128((meow_u128 *)(Control)))
// All-zero 128-bit value.
104 #define Meow128_Zero() _mm_setzero_si128()
// 256-bit operations on x86 (two 128-bit AES blocks per register).
// One AES decryption round on each 128-bit half of Prior, with XOr as the round keys.
106 #define Meow256_AESDEC(Prior, XOr) _mm256_aesdec_epi128((Prior), (XOr))
// Same round, with the round keys read from memory. The read is an explicit
// unaligned load, matching Meow128_AESDEC_Mem, so XOr does not have to be
// 32-byte aligned.
107 #define Meow256_AESDEC_Mem(Prior, XOr) _mm256_aesdec_epi128((Prior), _mm256_loadu_si256((meow_u256 *)(XOr)))
// All-zero 256-bit value.
108 #define Meow256_Zero() _mm256_setzero_si256()
// Loads the first B bytes starting at A and zero-fills the remaining lanes.
// The mask is computed in 64 bits so that B == 32 is well defined even on
// targets where unsigned long is only 32 bits wide; it is then narrowed to
// the 32-bit mask the intrinsic expects. B must be in the range 0..32.
109 #define Meow256_PartialLoad(A, B) _mm256_mask_loadu_epi8(_mm256_setzero_si256(), _cvtu32_mask32((unsigned int)((1ULL<<(B)) - 1)), (A))
// Low / high 128-bit half of the 256-bit value A.
110 #define Meow128_FromLow(A) _mm256_extracti128_si256((A), 0)
111 #define Meow128_FromHigh(A) _mm256_extracti128_si256((A), 1)
// 512-bit operations on x86 (four 128-bit AES blocks per register).
// One AES decryption round on each 128-bit quarter of Prior, with XOr as the round keys.
113 #define Meow512_AESDEC(Prior, XOr) _mm512_aesdec_epi128((Prior), (XOr))
// Same round, with the round keys read from memory. The read is an explicit
// unaligned load, matching Meow128_AESDEC_Mem, so XOr does not have to be
// 64-byte aligned.
114 #define Meow512_AESDEC_Mem(Prior, XOr) _mm512_aesdec_epi128((Prior), _mm512_loadu_si512((XOr)))
// All-zero 512-bit value.
115 #define Meow512_Zero() _mm512_setzero_si512()
// Loads the first B bytes starting at A and zero-fills the remaining lanes.
// B must be in the range 0..63: shifting a 64-bit value by 64 is undefined.
116 #define Meow512_PartialLoad(A, B) _mm512_mask_loadu_epi8(_mm512_setzero_si512(), _cvtu64_mask64((1ULL<<(B)) - 1), (A))
// Low / high 256-bit half of the 512-bit value A.
117 #define Meow256_FromLow(A) _mm512_extracti64x4_epi64((A), 0)
118 #define Meow256_FromHigh(A) _mm512_extracti64x4_epi64((A), 1)
// ARMv8 (NEON) implementations of the same names.
124 #elif MEOW_HASH_ARMV8
// The 128-bit value is held as sixteen unsigned bytes.
126 #define meow_u128 uint8x16_t
// Read the I-th 32-bit / 64-bit lane of A by reinterpreting the byte vector
// at the wider lane size first.
143 #define MeowU32From(A, I) (vgetq_lane_u32(vreinterpretq_u32_u8((A)), (I)))
144 #define MeowU64From(A, I) (vgetq_lane_u64(vreinterpretq_u64_u8((A)), (I)))
// Compares two 128-bit values byte by byte and reports whether all 16 bytes match.
// NOTE(review): the return type, the braces and the declaration of Output
// are not visible in this excerpt.
147 MeowHashesAreEqualImpl(meow_u128 A, meow_u128 B)
// One distinct bit weight per byte, repeated for each 8-byte half.
149 uint8x16_t Powers = {
150 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128,
// 0xFF in every byte position where A and B agree, 0x00 elsewhere.
153 uint8x16_t Input = vceqq_u8(A, B);
// Keep one bit per matching byte, then widen-and-add pairwise three times
// (8 -> 16 -> 32 -> 64 bits) so that each 64-bit lane holds the sum of the
// bits for its eight bytes.
154 uint64x2_t Mask = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8(Input, Powers))));
// Copy byte lanes 0 and 8 (the lowest byte of each 64-bit sum) into the first
// two bytes of Output.
157 vst1q_lane_u8((meow_u8*)&Output + 0, vreinterpretq_u8_u64(Mask), 0);
158 vst1q_lane_u8((meow_u8*)&Output + 1, vreinterpretq_u8_u64(Mask), 8);
// Both bytes are 0xFF only when every one of the 16 byte comparisons matched.
159 return Output == 0xFFFF;
// Name used by callers; forwards to the function above.
162 #define MeowHashesAreEqual(A, B) MeowHashesAreEqualImpl((A), (B))
// ARM version of the AES decryption step on a two-part state (members A and B).
// NOTE(review): only one statement of the body is visible in this excerpt;
// how Xor is consumed is not shown here.
165 Meow128_AESDEC(meow_aes_128 Prior, meow_u128 Xor)
// vaesdq_u8 on Prior.A with Prior.B, followed by vaesimcq_u8; the result becomes R.A.
168 R.A = vaesimcq_u8(vaesdq_u8(Prior.A, Prior.B));
// Same step as Meow128_AESDEC, taking the second operand from memory.
// NOTE(review): the return type, braces and return statement are not visible
// in this excerpt.
174 Meow128_AESDEC_Mem(meow_aes_128 Prior,
void *Xor)
// vaesdq_u8 on Prior.A with Prior.B, followed by vaesimcq_u8; the result becomes R.A.
177 R.A = vaesimcq_u8(vaesdq_u8(Prior.A, Prior.B));
// The 16 bytes at Xor are loaded into R.B.
178 R.B = vld1q_u8((meow_u8*)Xor);
// Collapses the two-part AES state into a single 128-bit value.
183 Meow128_AESDEC_Finalize(meow_aes_128 Value)
// XOR of the A and B parts of the state.
185 meow_u128 R = veorq_u8(Value.A, Value.B);
// 128-bit vector with every byte set to zero.
// NOTE(review): the enclosing function header is not visible in this excerpt
// - presumably Meow128_Zero, which is called further down; confirm.
192 meow_u128 R = vdupq_n_u8(0);
// ARM version of Meow128_GetAESConstant; takes a read-only byte pointer.
// NOTE(review): the return type and body are not visible in this excerpt.
197 Meow128_GetAESConstant(
const meow_u8 *Ptr)
// Builds a 128-bit value from two 64-bit halves.
206 Meow128_Set64x2(meow_u64 Low64, meow_u64 High64)
// Low64 becomes 64-bit lane 0 and High64 lane 1; the pair is then viewed as sixteen bytes.
208 meow_u128 R = vreinterpretq_u8_u64(vcombine_u64(vcreate_u64(Low64), vcreate_u64(High64)));
// Builds a two-part AES state from two 64-bit halves.
213 Meow128_Set64x2_State(meow_u64 Low64, meow_u64 High64)
// Part A carries the packed 128-bit value; part B starts out as zero.
216 R.A = Meow128_Set64x2(Low64, High64);
217 R.B = Meow128_Zero();
// Bitwise AND of A with 16 bytes loaded from B. B is parenthesized before the
// cast so that an expression argument (for example base + offset) is
// evaluated first and only then converted to a byte pointer.
221 #define Meow128_And_Mem(A,B) vandq_u8((A), vld1q_u8((meow_u8 *)(B)))
// Byte table lookup: selects bytes from the 16 bytes at Mem using the 16
// index bytes at Control.
222 #define Meow128_Shuffle_Mem(Mem,Control) vqtbl1q_u8(vld1q_u8((meow_u8 *)(Mem)),vld1q_u8((meow_u8 *)(Control)))
// Version of the hash as a number and as a display string.
226 #define MEOW_HASH_VERSION 4
227 #define MEOW_HASH_VERSION_NAME "0.4/himalayan"
// Variant in which the hash value is accessed through members named u128, u32
// and u64. NOTE(review): the declaration of that type and the condition
// selecting this variant are not visible in this excerpt.
// Stores the 128-bit vector A into the u128 member of B.
240 #define Meow128_CopyToHash(A, B) ((B).u128 = (A))
// Drop the intrinsic-based comparison so it can be replaced below.
244 #undef MeowHashesAreEqual
// Lane access goes through the array members instead of intrinsics.
245 #define MeowU32From(A, I) ((A).u32[I])
246 #define MeowU64From(A, I) ((A).u64[I])
// Two hashes are equal when all four 32-bit lanes match.
247 #define MeowHashesAreEqual(A, B) (((A).u32[0] == (B).u32[0]) && ((A).u32[1] == (B).u32[1]) && ((A).u32[2] == (B).u32[2]) && ((A).u32[3] == (B).u32[3]))
// Variant in which the hash value is the 128-bit vector type itself, so
// copying a vector into a hash is a plain assignment.
251 typedef meow_u128 meow_hash;
252 #define Meow128_CopyToHash(A, B) ((B) = (A))
// Forward declaration; the struct body is not defined in this excerpt.
256 typedef struct meow_hash_state meow_hash_state;
// Function type (not a pointer type) for a one-shot hash routine: takes a
// seed, a length and a source pointer and returns a meow_hash.
257 typedef meow_hash meow_hash_implementation(meow_u64 Seed, meow_u64 Len,
void *Source);
// Function type (not a pointer type) for a routine that takes a hash state, a
// length and a source pointer and returns nothing.
258 typedef void meow_absorb_implementation(
struct meow_hash_state *State, meow_u64 Len,
void *Source);
// Include-guard marker: once defined, the !defined(MEOW_HASH_INTRINSICS_H)
// test at the top of the header is false on later inclusions.
260 #define MEOW_HASH_INTRINSICS_H