#ifndef PRIVACY_PROOFS_ZK_LIB_GF2K_SYSDEP_H_
#define PRIVACY_PROOFS_ZK_LIB_GF2K_SYSDEP_H_

#include <array>
#include <cstdint>

// System-dependent implementations of the basic GF(2^128) operations.
// Field elements are polynomials over GF(2) reduced modulo
// x^128 + x^7 + x^2 + x + 1; the constant 0x87 used in the reductions
// below encodes the low terms x^7 + x^2 + x + 1.

#if defined(__x86_64__) || defined(__i386__)
#include <immintrin.h>  // SSE2 and PCLMULQDQ intrinsics

using gf2_128_elt_t = __m128i;

static inline std::array<uint64_t, 2> uint64x2_of_gf2_128(gf2_128_elt_t x) {
  return std::array<uint64_t, 2>{static_cast<uint64_t>(x[0]),
                                 static_cast<uint64_t>(x[1])};
}

static inline gf2_128_elt_t gf2_128_of_uint64x2(
    const std::array<uint64_t, 2> &x) {
  return gf2_128_elt_t{static_cast<long long>(x[0]),
                       static_cast<long long>(x[1])};
}

static inline gf2_128_elt_t gf2_128_add(gf2_128_elt_t x, gf2_128_elt_t y) {
  return _mm_xor_si128(x, y);  // addition in GF(2^k) is bitwise XOR
}
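
// Reduces t0 + t1 * x^64, where t0 and t1 are 128-bit carry-less products,
// modulo the field polynomial. The low half of t1 still fits below x^128 and
// is XORed in directly; the high half overflows past x^128 and re-enters as
// (t1 >> 64) * 0x87, using x^128 = x^7 + x^2 + x + 1 in the field.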
static inline gf2_128_elt_t gf2_128_reduce(gf2_128_elt_t t0,
                                           gf2_128_elt_t t1) {
  const gf2_128_elt_t poly = {0x87};
  t0 = _mm_xor_si128(t0, _mm_slli_si128(t1, 64 / 8));            // t1.lo * x^64
  t0 = _mm_xor_si128(t0, _mm_clmulepi64_si128(t1, poly, 0x01));  // t1.hi * 0x87
  return t0;
}

static inline gf2_128_elt_t gf2_128_mul(gf2_128_elt_t x, gf2_128_elt_t y) {
  gf2_128_elt_t t1a = _mm_clmulepi64_si128(x, y, 0x01);  // x.hi * y.lo
  gf2_128_elt_t t1b = _mm_clmulepi64_si128(x, y, 0x10);  // x.lo * y.hi
  gf2_128_elt_t t1 = gf2_128_add(t1a, t1b);
  gf2_128_elt_t t2 = _mm_clmulepi64_si128(x, y, 0x11);  // x.hi * y.hi
  t1 = gf2_128_reduce(t1, t2);
  gf2_128_elt_t t0 = _mm_clmulepi64_si128(x, y, 0x00);  // x.lo * y.lo
  t0 = gf2_128_reduce(t0, t1);
  return t0;
}
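
// Illustrative sanity check, not part of the original header: x (the pair
// {2, 0}) times x^127 (the pair {0, 1 << 63}) should wrap around to
// x^128 = x^7 + x^2 + x + 1, i.e. the pair {0x87, 0}. The helper name is
// hypothetical.
static inline bool gf2_128_mul_smoke_test() {
  gf2_128_elt_t a = gf2_128_of_uint64x2({2, 0});                      // x
  gf2_128_elt_t b = gf2_128_of_uint64x2({0, 0x8000000000000000ull});  // x^127
  std::array<uint64_t, 2> r = uint64x2_of_gf2_128(gf2_128_mul(a, b));
  return r[0] == 0x87 && r[1] == 0;
}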

#elif defined(__aarch64__) && defined(__ARM_FEATURE_AES)
// AArch64 with the crypto extension, which provides the 64-bit polynomial
// multiplies PMULL/PMULL2 (vmull_p64 / vmull_high_p64). Without that
// extension the fallback branch below is taken instead.
#include <arm_neon.h>

using gf2_128_elt_t = poly64x2_t;

static inline std::array<uint64_t, 2> uint64x2_of_gf2_128(gf2_128_elt_t x) {
  return std::array<uint64_t, 2>{static_cast<uint64_t>(x[0]),
                                 static_cast<uint64_t>(x[1])};
}

static inline gf2_128_elt_t gf2_128_of_uint64x2(
    const std::array<uint64_t, 2> &x) {
  return gf2_128_elt_t{static_cast<poly64_t>(x[0]),
                       static_cast<poly64_t>(x[1])};
}
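
// Carry-less 64x64 -> 128-bit multiplies of the low and high lanes,
// wrapping PMULL (vmull_p64) and PMULL2 (vmull_high_p64) so that
// gf2_128_mul below reads symmetrically.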
static inline gf2_128_elt_t vmull_low(gf2_128_elt_t t0, gf2_128_elt_t t1) {
  poly64_t tt0 = vgetq_lane_p64(t0, 0);
  poly64_t tt1 = vgetq_lane_p64(t1, 0);
  return vreinterpretq_p64_p128(vmull_p64(tt0, tt1));
}

static inline gf2_128_elt_t vmull_high(gf2_128_elt_t t0, gf2_128_elt_t t1) {
  return vreinterpretq_p64_p128(vmull_high_p64(t0, t1));
}
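
// Same reduction as the x86 path: fold t0 + t1 * x^64 modulo the field
// polynomial, multiplying the overflow (t1 >> 64) back in by 0x87.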
static inline gf2_128_elt_t gf2_128_reduce(gf2_128_elt_t t0,
                                           gf2_128_elt_t t1) {
  const gf2_128_elt_t poly = {0x0, 0x87};
  const gf2_128_elt_t zero = {0x0, 0x0};
  t0 = vaddq_p64(t0, vextq_p64(zero, t1, 1));  // t1.lo * x^64
  t0 = vaddq_p64(t0, vmull_high(t1, poly));    // t1.hi * 0x87
  return t0;
}

static inline gf2_128_elt_t gf2_128_add(gf2_128_elt_t x, gf2_128_elt_t y) {
  return vaddq_p64(x, y);  // polynomial addition is bitwise XOR
}
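
// Schoolbook 128x128 multiply. The two middle products x.hi * y.lo and
// x.lo * y.hi come from multiplying y against a half-swapped copy of x, so
// both fall out of the same vmull_low/vmull_high pair.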
static inline gf2_128_elt_t gf2_128_mul(gf2_128_elt_t x, gf2_128_elt_t y) {
  gf2_128_elt_t swx = vextq_p64(x, x, 1);  // {x.hi, x.lo}
  gf2_128_elt_t t1a = vmull_high(swx, y);  // x.lo * y.hi
  gf2_128_elt_t t1b = vmull_low(swx, y);   // x.hi * y.lo
  gf2_128_elt_t t1 = vaddq_p64(t1a, t1b);
  gf2_128_elt_t t2 = vmull_high(x, y);     // x.hi * y.hi
  t1 = gf2_128_reduce(t1, t2);
  gf2_128_elt_t t0 = vmull_low(x, y);      // x.lo * y.lo
  t0 = gf2_128_reduce(t0, t1);
  return t0;
}

#elif defined(__arm__) || defined(__aarch64__)
// ARM with NEON but without the PMULL crypto extension: emulate the 64-bit
// carry-less multiplies out of byte-wide vmull_p8 products.
#include <arm_neon.h>

using gf2_128_elt_t = poly64x2_t;

static inline std::array<uint64_t, 2> uint64x2_of_gf2_128(gf2_128_elt_t x) {
  return std::array<uint64_t, 2>{static_cast<uint64_t>(x[0]),
                                 static_cast<uint64_t>(x[1])};
}

static inline gf2_128_elt_t gf2_128_of_uint64x2(
    const std::array<uint64_t, 2> &x) {
  return gf2_128_elt_t{static_cast<poly64_t>(x[0]),
                       static_cast<poly64_t>(x[1])};
}

static inline gf2_128_elt_t gf2_128_add(gf2_128_elt_t x, gf2_128_elt_t y) {
  return vaddq_p64(x, y);  // polynomial addition is bitwise XOR
}
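
// Carry-less multiply of a 64-bit polynomial (eight poly8 lanes) by an
// 8-bit polynomial. vmull_p8 forms eight 16-bit partial products; vuzpq_p8
// separates their low bytes (val[0]) from their high bytes (val[1]), and
// the high bytes are shifted up one byte lane before being XORed back in.
// The result has degree <= 70 and fits in one 128-bit register.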
static inline poly8x16_t pmul64x8(poly8x8_t x, poly8_t y) {
  const poly8x16_t zero{};
  // vmull_p8 returns poly16x8_t; reinterpret it as bytes for vuzpq_p8.
  poly8x16_t prod = vreinterpretq_p8_p16(vmull_p8(x, vdup_n_p8(y)));
  poly8x16x2_t uzp = vuzpq_p8(prod, zero);
  return vaddq_p8(uzp.val[0], vextq_p8(uzp.val[1], uzp.val[1], 15));
}
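
// Multiply-accumulate variant of pmul64x8: XORs a carry-in into the low
// bytes and returns both byte planes, leaving the caller to shift val[1]
// (the high bytes) into position for the next step.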
static inline poly8x16x2_t pmac64x8(poly8x16_t cin, poly8x8_t x, poly8_t y) {
  const poly8x16_t zero{};
  poly8x16_t prod = vreinterpretq_p8_p16(vmull_p8(x, vdup_n_p8(y)));
  poly8x16x2_t uzp = vuzpq_p8(prod, zero);
  uzp.val[0] = vaddq_p8(uzp.val[0], cin);
  return uzp;
}

// 64x64 -> 128-bit carry-less multiply, one byte of y at a time. Step k
// accumulates x * y[k] at byte offset k; the rotating vextq_p8 shifts are
// safe because the top lanes of each val[0] are zero.
static inline poly8x16_t pmul64x64(poly8x8_t x, poly8x8_t y) {
  poly8x16_t r{};
  poly8x16x2_t prod = pmac64x8(r, x, y[0]);
  r = vaddq_p8(r, prod.val[0]);
  prod = pmac64x8(prod.val[1], x, y[1]);
  r = vaddq_p8(r, vextq_p8(prod.val[0], prod.val[0], 15));
  prod = pmac64x8(prod.val[1], x, y[2]);
  r = vaddq_p8(r, vextq_p8(prod.val[0], prod.val[0], 14));
  prod = pmac64x8(prod.val[1], x, y[3]);
  r = vaddq_p8(r, vextq_p8(prod.val[0], prod.val[0], 13));
  prod = pmac64x8(prod.val[1], x, y[4]);
  r = vaddq_p8(r, vextq_p8(prod.val[0], prod.val[0], 12));
  prod = pmac64x8(prod.val[1], x, y[5]);
  r = vaddq_p8(r, vextq_p8(prod.val[0], prod.val[0], 11));
  prod = pmac64x8(prod.val[1], x, y[6]);
  r = vaddq_p8(r, vextq_p8(prod.val[0], prod.val[0], 10));
  prod = pmac64x8(prod.val[1], x, y[7]);
  r = vaddq_p8(r, vextq_p8(prod.val[0], prod.val[0], 9));
  r = vaddq_p8(r, vextq_p8(prod.val[1], prod.val[1], 8));
  return r;
}
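
// With pmul64x64 in place, the PMULL-style helpers used by gf2_128_mul can
// be emulated on the low and high halves of the operands. The vreinterpret
// calls only rename the register between poly64x2_t and poly8x16_t views.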
static inline gf2_128_elt_t vmull_low(gf2_128_elt_t t0, gf2_128_elt_t t1) {
  return vreinterpretq_p64_p8(
      pmul64x64(vget_low_p8(vreinterpretq_p8_p64(t0)),
                vget_low_p8(vreinterpretq_p8_p64(t1))));
}

static inline gf2_128_elt_t vmull_high(gf2_128_elt_t t0, gf2_128_elt_t t1) {
  return vreinterpretq_p64_p8(
      pmul64x64(vget_high_p8(vreinterpretq_p8_p64(t0)),
                vget_high_p8(vreinterpretq_p8_p64(t1))));
}

// Emulation of vextq_p64(t0, t1, 1) via the byte-wide vextq_p8.
static inline gf2_128_elt_t vextq_p64_1_emul(gf2_128_elt_t t0,
                                             gf2_128_elt_t t1) {
  return vreinterpretq_p64_p8(vextq_p8(vreinterpretq_p8_p64(t0),
                                       vreinterpretq_p8_p64(t1), 8));
}

static inline gf2_128_elt_t gf2_128_reduce(gf2_128_elt_t t0,
                                           gf2_128_elt_t t1) {
  const poly8_t poly = static_cast<poly8_t>(0x87);
  const gf2_128_elt_t zero = {0x0, 0x0};
  t0 = vaddq_p64(t0, vextq_p64_1_emul(zero, t1));  // t1.lo * x^64
  t0 = vaddq_p64(t0, vreinterpretq_p64_p8(pmul64x8(
                         vget_high_p8(vreinterpretq_p8_p64(t1)), poly)));
  return t0;
}

static inline gf2_128_elt_t gf2_128_mul(gf2_128_elt_t x, gf2_128_elt_t y) {
  gf2_128_elt_t swx = vextq_p64_1_emul(x, x);  // {x.hi, x.lo}
  gf2_128_elt_t t1a = vmull_high(swx, y);      // x.lo * y.hi
  gf2_128_elt_t t1b = vmull_low(swx, y);       // x.hi * y.lo
  gf2_128_elt_t t1 = vaddq_p64(t1a, t1b);
  gf2_128_elt_t t2 = vmull_high(x, y);         // x.hi * y.hi
  t1 = gf2_128_reduce(t1, t2);
  gf2_128_elt_t t0 = vmull_low(x, y);          // x.lo * y.lo
  t0 = gf2_128_reduce(t0, t1);
  return t0;
}
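
// Illustrative portable cross-check, not part of the original header: a
// bit-at-a-time schoolbook multiply over uint64 pairs, reducing by
// x^128 = x^7 + x^2 + x + 1 as it shifts. The function name is
// hypothetical; it is only useful for testing a backend against a
// known-good answer.
static inline std::array<uint64_t, 2> gf2_128_mul_reference(
    std::array<uint64_t, 2> a, const std::array<uint64_t, 2> &b) {
  std::array<uint64_t, 2> r = {0, 0};
  for (int i = 0; i < 128; ++i) {
    // If bit i of b is set, XOR in the current shifted copy of a.
    if ((b[i >> 6] >> (i & 63)) & 1) {
      r[0] ^= a[0];
      r[1] ^= a[1];
    }
    // a *= x, folding the carry out of bit 127 back in as 0x87.
    uint64_t carry = a[1] >> 63;
    a[1] = (a[1] << 1) | (a[0] >> 63);
    a[0] = (a[0] << 1) ^ (carry ? 0x87 : 0);
  }
  return r;
}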

#else
#error "unimplemented gf2k/sysdep.h"
#endif

#endif  // PRIVACY_PROOFS_ZK_LIB_GF2K_SYSDEP_H_