Kigs Framework  Doc version 0.8
Open source multi purpose Rapid Application Development framework
meow_intrinsics.h
1 /* ========================================================================
2 
3  meow_intrinsics.h
4  (C) Copyright 2018 by Molly Rocket, Inc. (https://mollyrocket.com)
5 
6  See https://mollyrocket.com/meowhash for details.
7 
8  This is the default way to define all of the types and operations that
9  meow_hash.h needs. However, if you've got your _own_ equivalent type
10  definitions and intrinsics, you can _omit_ this header file and just
11  #define/typedef all the Meow ops to map to your own ops, keeping things
12  nice and uniform in your codebase.
13 
14  ======================================================================== */
15 
16 #if !defined(MEOW_HASH_INTRINSICS_H)
17 
18 //
19 // NOTE(casey): Try to guess the source file for compiler intrinsics
20 //
21 #if _MSC_VER
22 
23 #if _M_AMD64 || _M_IX86
24 #include <intrin.h>
25 #elif _M_ARM64
26 #include <arm64_neon.h>
27 #endif
28 
29 #else
30 
31 #if __x86_64__ || __i386__
32 #include <x86intrin.h>
33 #elif __aarch64__
34 #include <arm_neon.h>
35 #endif
36 
37 #endif
38 
39 //
40 // NOTE(casey): Set #define's to their defaults
41 //
42 
// NOTE: The backend, the word size and the page size are each defaulted
// independently, so a value that was #define'd before this header is included
// is kept instead of being redefined.

// Pick a backend only when the includer has not already chosen one.
#if !defined(MEOW_HASH_INTEL) && !defined(MEOW_HASH_ARMV8)
#if __x86_64__ || _M_AMD64 || __i386__ || _M_IX86
#define MEOW_HASH_INTEL 1
#elif __aarch64__ || _M_ARM64
#define MEOW_HASH_ARMV8 1
#else
#error Cannot determine architecture to use!
#endif
#endif

// MEOW_64BIT selects the width of meow_umm further down.
#if !defined(MEOW_64BIT)
#if __x86_64__ || _M_AMD64 || __aarch64__ || _M_ARM64
#define MEOW_64BIT 1
#elif __i386__ || _M_IX86
#define MEOW_64BIT 0
#else
#error Cannot determine architecture to use!
#endif
#endif

#if !defined(MEOW_PAGESIZE)
#define MEOW_PAGESIZE 4096
#endif
60 
61 //
62 // NOTE(casey): Define basic types
63 //
64 
// Unsigned integer aliases of increasing width.
#define meow_u8 unsigned char
#define meow_u16 unsigned short
#define meow_u32 unsigned int
#define meow_u64 unsigned long long

// meow_umm follows MEOW_64BIT: long long unsigned when it is set, int unsigned otherwise.
#if !MEOW_64BIT
#define meow_umm unsigned int
#else
#define meow_umm unsigned long long
#endif
75 
76 //
77 // NOTE(casey): Operations for x64 processors
78 //
79 
80 #if MEOW_HASH_INTEL
81 
// Vector types for the Intel path. Each AES state type is the same register
// type as the plain vector of the same width.
#define meow_u128 __m128i
#define meow_aes_128 __m128i
#define meow_u256 __m256i
#define meow_aes_256 __m256i
#define meow_u512 __m512i
#define meow_aes_512 __m512i

// Lane extraction, and a whole-register equality test: the byte-wise compare
// mask has all 16 bits set only when every byte matches.
#define MeowU32From(Value, Lane) (_mm_extract_epi32((Value), (Lane)))
#define MeowU64From(Value, Lane) (_mm_extract_epi64((Value), (Lane)))
#define MeowHashesAreEqual(Left, Right) (0xFFFF == _mm_movemask_epi8(_mm_cmpeq_epi8((Left), (Right))))
92 
// 128-bit AES round; the _Mem form reads its second operand from memory with an unaligned load.
#define Meow128_AESDEC(State, Key) _mm_aesdec_si128((State), (Key))
#define Meow128_AESDEC_Mem(State, KeyPtr) _mm_aesdec_si128((State), _mm_loadu_si128((meow_u128 *)(KeyPtr)))
// On Intel the AES state is already the final value, so finalizing is the identity.
#define Meow128_AESDEC_Finalize(State) (State)
// _mm_set_epi64x takes the high half first, hence the swapped argument order.
#define Meow128_Set64x2(Lo, Hi) _mm_set_epi64x((Hi), (Lo))
#define Meow128_Set64x2_State(Lo, Hi) Meow128_Set64x2(Lo, Hi)
// Reads the constant through a meow_u128 pointer (a direct dereference, not an unaligned load).
#define Meow128_GetAESConstant(Source) (*(meow_u128 *)(Source))

// Operations whose operands are loaded (unaligned) from memory.
#define Meow128_And_Mem(Value, MaskPtr) _mm_and_si128((Value),_mm_loadu_si128((meow_u128 *)(MaskPtr)))
#define Meow128_Shuffle_Mem(Source,ControlPtr) _mm_shuffle_epi8(_mm_loadu_si128((meow_u128 *)(Source)),_mm_loadu_si128((meow_u128 *)(ControlPtr)))

// TODO(casey): Not sure if this should actually be Meow128_Zero(A) ((A) = _mm_setzero_si128()), maybe
#define Meow128_Zero() _mm_setzero_si128()
105 
// 256-bit AES round on two 128-bit lanes.
#define Meow256_AESDEC(Prior, XOr) _mm256_aesdec_epi128((Prior), (XOr))
// The memory operand is read with an explicit unaligned load, matching
// Meow128_AESDEC_Mem, instead of dereferencing a meow_u256 pointer (which is an
// aligned access and may fault on unaligned input).
#define Meow256_AESDEC_Mem(Prior, XOr) _mm256_aesdec_epi128((Prior), _mm256_loadu_si256((meow_u256 const *)(XOr)))
#define Meow256_Zero() _mm256_setzero_si256()
// Loads the first B bytes from A and zeroes the rest. The mask is built in
// 64 bits so that B == 32 is well defined even where long is 32 bits wide.
#define Meow256_PartialLoad(A, B) _mm256_mask_loadu_epi8(_mm256_setzero_si256(), _cvtu32_mask32((unsigned int)((1ULL<<(B)) - 1)), (A))
// Low / high 128-bit halves of a 256-bit value.
#define Meow128_FromLow(A) _mm256_extracti128_si256((A), 0)
#define Meow128_FromHigh(A) _mm256_extracti128_si256((A), 1)
112 
// 512-bit AES round on four 128-bit lanes.
#define Meow512_AESDEC(Prior, XOr) _mm512_aesdec_epi128((Prior), (XOr))
// The memory operand is read with an explicit unaligned load, matching
// Meow128_AESDEC_Mem, instead of dereferencing a meow_u512 pointer (which is an
// aligned access and may fault on unaligned input).
#define Meow512_AESDEC_Mem(Prior, XOr) _mm512_aesdec_epi128((Prior), _mm512_loadu_si512((void const *)(XOr)))
#define Meow512_Zero() _mm512_setzero_si512()
// Loads the first B bytes from A and zeroes the rest. B must be below 64:
// shifting a 64-bit value by 64 is undefined.
#define Meow512_PartialLoad(A, B) _mm512_mask_loadu_epi8(_mm512_setzero_si512(), _cvtu64_mask64((1ULL<<(B)) - 1), (A))
// Low / high 256-bit halves of a 512-bit value.
#define Meow256_FromLow(A) _mm512_extracti64x4_epi64((A), 0)
#define Meow256_FromHigh(A) _mm512_extracti64x4_epi64((A), 1)
119 
120 //
121 // NOTE(casey): Operations for ARM processors
122 //
123 
124 #elif MEOW_HASH_ARMV8
125 
126 #define meow_u128 uint8x16_t
127 
128 // NOTE(mmozeiko): AES opcodes on ARMv8 work a bit differently than on Intel.
129 // On Intel, "x = AESDEC(x, m)" does the following:
130 // x = InvMixColumns(SubBytes(ShiftRows(x))) ^ m
131 // But on ARMv8, "x = AESDEC(x, m)" does the following:
132 // x = SubBytes(ShiftRows(x ^ m))
133 // Thus ARMv8 requires an extra InvMixColumns call and a delayed XOR operation.
134 // On iteration N it needs to use m[N-1] as input, and remember m[N] for the next iteration.
135 // This structure stores the memory operand in member B, which will be used in
136 // the next AESDEC opcode. Remember to do one more XOR(A,B) when finishing AES
137 // operations in a loop.
138 typedef struct {
139  meow_u128 A;
140  meow_u128 B;
141 } meow_aes_128;
142 
// Extract 32-bit / 64-bit lane number Lane from a 16-byte vector.
#define MeowU32From(Value, Lane) (vgetq_lane_u32(vreinterpretq_u32_u8((Value)), (Lane)))
#define MeowU64From(Value, Lane) (vgetq_lane_u64(vreinterpretq_u64_u8((Value)), (Lane)))
145 
146 static int
147 MeowHashesAreEqualImpl(meow_u128 A, meow_u128 B)
148 {
149  uint8x16_t Powers = {
150  1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128,
151  };
152 
153  uint8x16_t Input = vceqq_u8(A, B);
154  uint64x2_t Mask = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8(Input, Powers))));
155 
156  meow_u16 Output;
157  vst1q_lane_u8((meow_u8*)&Output + 0, vreinterpretq_u8_u64(Mask), 0);
158  vst1q_lane_u8((meow_u8*)&Output + 1, vreinterpretq_u8_u64(Mask), 8);
159  return Output == 0xFFFF;
160 }
161 
162 #define MeowHashesAreEqual(A, B) MeowHashesAreEqualImpl((A), (B))
163 
164 static meow_aes_128
165 Meow128_AESDEC(meow_aes_128 Prior, meow_u128 Xor)
166 {
167  meow_aes_128 R;
168  R.A = vaesimcq_u8(vaesdq_u8(Prior.A, Prior.B));
169  R.B = Xor;
170  return(R);
171 }
172 
173 static meow_aes_128
174 Meow128_AESDEC_Mem(meow_aes_128 Prior, void *Xor)
175 {
176  meow_aes_128 R;
177  R.A = vaesimcq_u8(vaesdq_u8(Prior.A, Prior.B));
178  R.B = vld1q_u8((meow_u8*)Xor);
179  return(R);
180 }
181 
182 static meow_u128
183 Meow128_AESDEC_Finalize(meow_aes_128 Value)
184 {
185  meow_u128 R = veorq_u8(Value.A, Value.B);
186  return(R);
187 }
188 
189 static meow_u128
190 Meow128_Zero()
191 {
192  meow_u128 R = vdupq_n_u8(0);
193  return(R);
194 }
195 
196 static meow_aes_128
197 Meow128_GetAESConstant(const meow_u8 *Ptr)
198 {
199  meow_aes_128 R;
200  R.A = vld1q_u8(Ptr);
201  R.B = vdupq_n_u8(0);
202  return(R);
203 }
204 
205 static meow_u128
206 Meow128_Set64x2(meow_u64 Low64, meow_u64 High64)
207 {
208  meow_u128 R = vreinterpretq_u8_u64(vcombine_u64(vcreate_u64(Low64), vcreate_u64(High64)));
209  return(R);
210 }
211 
212 static meow_aes_128
213 Meow128_Set64x2_State(meow_u64 Low64, meow_u64 High64)
214 {
215  meow_aes_128 R;
216  R.A = Meow128_Set64x2(Low64, High64);
217  R.B = Meow128_Zero();
218  return(R);
219 }
220 
// Operations whose operands are loaded from memory. Every macro argument is
// parenthesized so that an expression argument (e.g. Ptr + 16) is cast as a whole.
#define Meow128_And_Mem(A,B) vandq_u8((A), vld1q_u8((meow_u8 *)(B)))
#define Meow128_Shuffle_Mem(Mem,Control) vqtbl1q_u8(vld1q_u8((meow_u8 *)(Mem)),vld1q_u8((meow_u8 *)(Control)))
223 
224 #endif
225 
// Hash version: human-readable name and numeric value.
#define MEOW_HASH_VERSION_NAME "0.4/himalayan"
#define MEOW_HASH_VERSION 4
228 
229 #if MEOW_INCLUDE_C
230 
231 // NOTE(casey): Unfortunately, if you want an ANSI-C version, we have to slow everyone
232 // else down because you can't return 128-bit values by register anymore (in case the
233 // CPU doesn't support that)
234 union meow_hash
235 {
236  meow_u128 u128;
237  meow_u64 u64[2];
238  meow_u32 u32[4];
239 };
240 #define Meow128_CopyToHash(A, B) ((B).u128 = (A))
241 
242 #undef MeowU64From
243 #undef MeowU32From
244 #undef MeowHashesAreEqual
245 #define MeowU32From(A, I) ((A).u32[I])
246 #define MeowU64From(A, I) ((A).u64[I])
247 #define MeowHashesAreEqual(A, B) (((A).u32[0] == (B).u32[0]) && ((A).u32[1] == (B).u32[1]) && ((A).u32[2] == (B).u32[2]) && ((A).u32[3] == (B).u32[3]))
248 
249 #else
250 
251 typedef meow_u128 meow_hash;
252 #define Meow128_CopyToHash(A, B) ((B) = (A))
253 
254 #endif
255 
256 typedef struct meow_hash_state meow_hash_state;
257 typedef meow_hash meow_hash_implementation(meow_u64 Seed, meow_u64 Len, void *Source);
258 typedef void meow_absorb_implementation(struct meow_hash_state *State, meow_u64 Len, void *Source);
259 
260 #define MEOW_HASH_INTRINSICS_H
261 #endif