Longfellow ZK 0290cb32
sysdep.h
// Copyright 2025 Google LLC.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef PRIVACY_PROOFS_ZK_LIB_GF2K_SYSDEP_H_
#define PRIVACY_PROOFS_ZK_LIB_GF2K_SYSDEP_H_

#include <stddef.h>
#include <stdint.h>

#include <array>

// Hardcoded GF(2^128) SIMD arithmetic where
// GF(2^128) = GF(2)[x] / (x^128 + x^7 + x^2 + x + 1)
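//
// Reduction relies on the identity x^128 = x^7 + x^2 + x + 1 (mod the
// modulus above); the bit pattern of x^7 + x^2 + x + 1 is 0x87, which is
// the `poly` constant used by the gf2_128_reduce() routines below.
//
// Each branch below defines the same interface: the element type
// gf2_128_elt_t, the conversions gf2_128_of_uint64x2() and
// uint64x2_of_gf2_128(), and the field operations gf2_128_add() and
// gf2_128_mul().  Illustrative usage sketch (hypothetical caller, not part
// of this header):
//
//   std::array<uint64_t, 2> a = {5, 0}, b = {7, 0};
//   gf2_128_elt_t p = gf2_128_mul(gf2_128_of_uint64x2(a),
//                                 gf2_128_of_uint64x2(b));
//   std::array<uint64_t, 2> limbs = uint64x2_of_gf2_128(p);  // {27, 0}
//
// since the carry-less product of 5 and 7 is 27 and no reduction occurs.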

#if defined(__x86_64__) || defined(__i386__)
#include <immintrin.h>  // IWYU pragma: keep

namespace proofs {

using gf2_128_elt_t = __m128i;

static inline std::array<uint64_t, 2> uint64x2_of_gf2_128(gf2_128_elt_t x) {
  return std::array<uint64_t, 2>{static_cast<uint64_t>(x[0]),
                                 static_cast<uint64_t>(x[1])};
}

static inline gf2_128_elt_t gf2_128_of_uint64x2(
    const std::array<uint64_t, 2> &x) {
  // Cast to long long (as opposed to int64_t) is necessary because __m128i is
  // defined in terms of long long.
  return gf2_128_elt_t{static_cast<long long>(x[0]),
                       static_cast<long long>(x[1])};
}

static inline gf2_128_elt_t gf2_128_add(gf2_128_elt_t x, gf2_128_elt_t y) {
  return _mm_xor_si128(x, y);
}

// return t0 + x^64 * t1
static inline gf2_128_elt_t gf2_128_reduce(gf2_128_elt_t t0, gf2_128_elt_t t1) {
  const gf2_128_elt_t poly = {0x87};
  t0 = _mm_xor_si128(t0, _mm_slli_si128(t1, 64 /*bits*/ / 8 /*bits/byte*/));
  t0 = _mm_xor_si128(t0, _mm_clmulepi64_si128(t1, poly, 0x01));
  return t0;
}
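// Schoolbook multiplication on 64-bit halves: writing x = x0 + x^64*x1 and
// y = y0 + x^64*y1, the product is
//   x0*y0 + x^64*(x0*y1 + x1*y0) + x^128*(x1*y1).
// The three carry-less partial products are folded into a single 128-bit
// result by two applications of gf2_128_reduce().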
static inline gf2_128_elt_t gf2_128_mul(gf2_128_elt_t x, gf2_128_elt_t y) {
  gf2_128_elt_t t1a = _mm_clmulepi64_si128(x, y, 0x01);
  gf2_128_elt_t t1b = _mm_clmulepi64_si128(x, y, 0x10);
  gf2_128_elt_t t1 = gf2_128_add(t1a, t1b);
  gf2_128_elt_t t2 = _mm_clmulepi64_si128(x, y, 0x11);
  t1 = gf2_128_reduce(t1, t2);
  gf2_128_elt_t t0 = _mm_clmulepi64_si128(x, y, 0x00);
  t0 = gf2_128_reduce(t0, t1);
  return t0;
}
}  // namespace proofs
#elif defined(__aarch64__)
//
// Implementation for arm/neon with AES instructions.
// We assume that __aarch64__ implies AES, which isn't necessarily
// the case. If this is a problem, change the defined(__aarch64__)
// above and the code will fall back to the non-AES implementation
// below.
//
#include <arm_neon.h>  // IWYU pragma: keep

namespace proofs {
using gf2_128_elt_t = poly64x2_t;

static inline std::array<uint64_t, 2> uint64x2_of_gf2_128(gf2_128_elt_t x) {
  return std::array<uint64_t, 2>{static_cast<uint64_t>(x[0]),
                                 static_cast<uint64_t>(x[1])};
}

static inline gf2_128_elt_t gf2_128_of_uint64x2(
    const std::array<uint64_t, 2> &x) {
  return gf2_128_elt_t{static_cast<poly64_t>(x[0]),
                       static_cast<poly64_t>(x[1])};
}

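// Carry-less 64x64 -> 128-bit multiplication of the low (vmull_low) and
// high (vmull_high) 64-bit lanes of the two operands.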
static inline gf2_128_elt_t vmull_low(gf2_128_elt_t t0, gf2_128_elt_t t1) {
  poly64_t tt0 = vgetq_lane_p64(t0, 0);
  poly64_t tt1 = vgetq_lane_p64(t1, 0);
  return vreinterpretq_p64_p128(vmull_p64(tt0, tt1));
}
static inline gf2_128_elt_t vmull_high(gf2_128_elt_t t0, gf2_128_elt_t t1) {
  return vreinterpretq_p64_p128(vmull_high_p64(t0, t1));
}

// return t0 + x^64 * t1
static inline gf2_128_elt_t gf2_128_reduce(gf2_128_elt_t t0, gf2_128_elt_t t1) {
  const gf2_128_elt_t poly = {0x0, 0x87};
  const gf2_128_elt_t zero = {0x0, 0x0};
  t0 = vaddq_p64(t0, vextq_p64(zero, t1, 1));
  t0 = vaddq_p64(t0, vmull_high(t1, poly));
  return t0;
}
static inline gf2_128_elt_t gf2_128_add(gf2_128_elt_t x, gf2_128_elt_t y) {
  return vaddq_p64(x, y);
}
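// Same schoolbook structure as the x86 gf2_128_mul() above:
// vextq_p64(x, x, 1) swaps the two 64-bit halves of x, so the
// vmull_high/vmull_low calls on the swapped operand yield the cross
// products x0*y1 and x1*y0.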
static inline gf2_128_elt_t gf2_128_mul(gf2_128_elt_t x, gf2_128_elt_t y) {
  gf2_128_elt_t swx = vextq_p64(x, x, 1);
  gf2_128_elt_t t1a = vmull_high(swx, y);
  gf2_128_elt_t t1b = vmull_low(swx, y);
  gf2_128_elt_t t1 = vaddq_p64(t1a, t1b);
  gf2_128_elt_t t2 = vmull_high(x, y);
  t1 = gf2_128_reduce(t1, t2);
  gf2_128_elt_t t0 = vmull_low(x, y);
  t0 = gf2_128_reduce(t0, t1);
  return t0;
}
}  // namespace proofs

#elif defined(__arm__) || defined(__aarch64__)
//
// Implementation for arm/neon without AES instructions
//
#include <arm_neon.h>  // IWYU pragma: keep

namespace proofs {
using gf2_128_elt_t = poly64x2_t;

static inline std::array<uint64_t, 2> uint64x2_of_gf2_128(gf2_128_elt_t x) {
  return std::array<uint64_t, 2>{static_cast<uint64_t>(x[0]),
                                 static_cast<uint64_t>(x[1])};
}

static inline gf2_128_elt_t gf2_128_of_uint64x2(
    const std::array<uint64_t, 2> &x) {
  return gf2_128_elt_t{static_cast<poly64_t>(x[0]),
                       static_cast<poly64_t>(x[1])};
}

static inline gf2_128_elt_t gf2_128_add(gf2_128_elt_t x, gf2_128_elt_t y) {
  return vaddq_p64(x, y);
}

// Emulate vmull_p64() with vmull_p8().
//
// This emulation is pretty naive and it performs a lot of permutations.
//
// A possibly better alternative appears in Danilo Câmara, Conrado
// Gouvêa, Julio López, Ricardo Dahab, "Fast Software Polynomial
// Multiplication on ARM Processors Using the NEON Engine", 1st
// Cross-Domain Conference and Workshop on Availability, Reliability,
// and Security in Information Systems (CD-ARES), Sep 2013,
// Regensburg, Germany. pp. 137-154. ⟨hal-01506572⟩
//
// However, the code from that paper makes heavy use of type
// punning of 128-bit registers as two 64-bit registers, which
// I don't know how to express in C.
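//
// pmul64x8() computes the carry-less product of a 64-bit polynomial x
// (eight bytes) by an 8-bit polynomial y.  vmull_p8() yields eight 16-bit
// partial products; vuzpq_p8() separates their low and high bytes, and the
// high bytes are shifted up one byte position before being added (xor-ed)
// in, so the (at most 71-bit) result occupies bytes 0..8 of the returned
// register.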
static inline poly8x16_t pmul64x8(poly8x8_t x, poly8_t y) {
  const poly8x16_t zero{};
  poly8x16_t prod = vmull_p8(x, vdup_n_p8(y));
  poly8x16x2_t uzp = vuzpq_p8(prod, zero);
  return vaddq_p8(uzp.val[0], vextq_p8(uzp.val[1], uzp.val[1], 15));
}

// multiply/add. Return (cout, s) = cin + x * y where the final sum
// would be (cout << 8) + s.
static inline poly8x16x2_t pmac64x8(poly8x16_t cin, poly8x8_t x, poly8_t y) {
  const poly8x16_t zero{};
  poly8x16_t prod = vmull_p8(x, vdup_n_p8(y));
  poly8x16x2_t uzp = vuzpq_p8(prod, zero);
  uzp.val[0] = vaddq_p8(uzp.val[0], cin);
  return uzp;
}

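// pmul64x64() computes the full 64x64-bit carry-less product (at most 127
// bits), one byte of y at a time: step j multiplies x by y[j] with
// pmac64x8(), folding in the high-byte carries from step j-1, and adds the
// result into r shifted up by j byte positions; the carries left over from
// the last step are added in shifted up by 8 bytes.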
static inline poly8x16_t pmul64x64(poly8x8_t x, poly8x8_t y) {
  poly8x16_t r{};

  poly8x16x2_t prod = pmac64x8(r, x, y[0]);
  r = prod.val[0];

  prod = pmac64x8(prod.val[1], x, y[1]);
  r = vaddq_p8(r, vextq_p8(prod.val[0], prod.val[0], 15));

  prod = pmac64x8(prod.val[1], x, y[2]);
  r = vaddq_p8(r, vextq_p8(prod.val[0], prod.val[0], 14));

  prod = pmac64x8(prod.val[1], x, y[3]);
  r = vaddq_p8(r, vextq_p8(prod.val[0], prod.val[0], 13));

  prod = pmac64x8(prod.val[1], x, y[4]);
  r = vaddq_p8(r, vextq_p8(prod.val[0], prod.val[0], 12));

  prod = pmac64x8(prod.val[1], x, y[5]);
  r = vaddq_p8(r, vextq_p8(prod.val[0], prod.val[0], 11));

  prod = pmac64x8(prod.val[1], x, y[6]);
  r = vaddq_p8(r, vextq_p8(prod.val[0], prod.val[0], 10));

  prod = pmac64x8(prod.val[1], x, y[7]);
  r = vaddq_p8(r, vextq_p8(prod.val[0], prod.val[0], 9));
  r = vaddq_p8(r, vextq_p8(prod.val[1], prod.val[1], 8));

  return r;
}

static inline gf2_128_elt_t vmull_low(gf2_128_elt_t t0, gf2_128_elt_t t1) {
  // vreinterpretq_p64_p8() seems not to be defined, use
  // static_cast<poly64x2_t>
  return static_cast<poly64x2_t>(pmul64x64(vget_low_p8(t0), vget_low_p8(t1)));
}
static inline gf2_128_elt_t vmull_high(gf2_128_elt_t t0, gf2_128_elt_t t1) {
  return static_cast<poly64x2_t>(pmul64x64(vget_high_p8(t0), vget_high_p8(t1)));
}

// vextq_p64() seems not to be defined.
static inline gf2_128_elt_t vextq_p64_1_emul(gf2_128_elt_t t0,
                                             gf2_128_elt_t t1) {
  return static_cast<poly64x2_t>(
      vextq_p8(static_cast<poly8x16_t>(t0), static_cast<poly8x16_t>(t1), 8));
}

// return t0 + x^64 * t1
static inline gf2_128_elt_t gf2_128_reduce(gf2_128_elt_t t0, gf2_128_elt_t t1) {
  const poly8_t poly = static_cast<poly8_t>(0x87);
  const gf2_128_elt_t zero = {0x0, 0x0};
  t0 = vaddq_p64(t0, vextq_p64_1_emul(zero, t1));
  t0 = vaddq_p64(t0, pmul64x8(vget_high_p8(t1), poly));
  return t0;
}

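// Identical structure to the AES-instruction gf2_128_mul() above, with
// vmull_p64 and vextq_p64 replaced by their byte-wise emulations.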
static inline gf2_128_elt_t gf2_128_mul(gf2_128_elt_t x, gf2_128_elt_t y) {
  gf2_128_elt_t swx = vextq_p64_1_emul(x, x);
  gf2_128_elt_t t1a = vmull_high(swx, y);
  gf2_128_elt_t t1b = vmull_low(swx, y);
  gf2_128_elt_t t1 = vaddq_p64(t1a, t1b);
  gf2_128_elt_t t2 = vmull_high(x, y);
  t1 = gf2_128_reduce(t1, t2);
  gf2_128_elt_t t0 = vmull_low(x, y);
  t0 = gf2_128_reduce(t0, t1);
  return t0;
}

}  // namespace proofs
#else
#error "unimplemented gf2k/sysdep.h"
#endif

#endif  // PRIVACY_PROOFS_ZK_LIB_GF2K_SYSDEP_H_