4-way Keccak-f[1600] using AVX2 to speed up Dilithium

Also speeds up Dilithium by adding more AVX2 optimised routines.

Non AVX2:

    BenchmarkPermutationFunction-8  3594723  326 ns/op

AVX2:

    BenchmarkF1600x4-8  2926183  399 ns/op

Dilithium Mode 3 using 4-way f[1600]:

    BenchmarkSkUnpack-8          37950   30844 ns/op
    BenchmarkPkUnpack-8          39169   30442 ns/op
    BenchmarkVerify-8            76672   15513 ns/op
    BenchmarkSign-8              10000  125782 ns/op
    BenchmarkGenerateKey-8       17419   68635 ns/op
    BenchmarkPublicFromPrivate-8 134430   9250 ns/op

Without:

    BenchmarkSkUnpack-8          18980   61840 ns/op
    BenchmarkPkUnpack-8          19743   60087 ns/op
    BenchmarkVerify-8            39998   30421 ns/op
    BenchmarkSign-8               5420  228631 ns/op
    BenchmarkGenerateKey-8       12037   99672 ns/op
    BenchmarkPublicFromPrivate-8 107148  11512 ns/op

See #113
cloudflare · May 21, 2020 · f2d9abd · f2d9abd
1 parent 7b3fb57
commit f2d9abd
Show file tree

Hide file tree

Showing 62 changed files with 4,635 additions and 320 deletions.
diff --git a/internal/shake/keccakf.go b/internal/shake/keccakf.go
@@ -6,34 +6,6 @@
 
 package shake
 
-// rc stores the round constants for use in the ι step.
-var rc = [24]uint64{
-	0x0000000000000001,
-	0x0000000000008082,
-	0x800000000000808A,
-	0x8000000080008000,
-	0x000000000000808B,
-	0x0000000080000001,
-	0x8000000080008081,
-	0x8000000000008009,
-	0x000000000000008A,
-	0x0000000000000088,
-	0x0000000080008009,
-	0x000000008000000A,
-	0x000000008000808B,
-	0x800000000000008B,
-	0x8000000000008089,
-	0x8000000000008003,
-	0x8000000000008002,
-	0x8000000000000080,
-	0x000000000000800A,
-	0x800000008000000A,
-	0x8000000080008081,
-	0x8000000000008080,
-	0x0000000080000001,
-	0x8000000080008008,
-}
-
 // keccakF1600 applies the Keccak permutation to a 1600b-wide
 // state represented as a slice of 25 uint64s.
 func keccakF1600(a *[25]uint64) {
@@ -66,7 +38,7 @@ func keccakF1600(a *[25]uint64) {
 		bc3 = t<<21 | t>>(64-21)
 		t = a[24] ^ d4
 		bc4 = t<<14 | t>>(64-14)
-		a[0] = bc0 ^ (bc2 &^ bc1) ^ rc[i]
+		a[0] = bc0 ^ (bc2 &^ bc1) ^ RC[i]
 		a[6] = bc1 ^ (bc3 &^ bc2)
 		a[12] = bc2 ^ (bc4 &^ bc3)
 		a[18] = bc3 ^ (bc0 &^ bc4)
@@ -157,7 +129,7 @@ func keccakF1600(a *[25]uint64) {
 		bc3 = t<<21 | t>>(64-21)
 		t = a[14] ^ d4
 		bc4 = t<<14 | t>>(64-14)
-		a[0] = bc0 ^ (bc2 &^ bc1) ^ rc[i+1]
+		a[0] = bc0 ^ (bc2 &^ bc1) ^ RC[i+1]
 		a[16] = bc1 ^ (bc3 &^ bc2)
 		a[7] = bc2 ^ (bc4 &^ bc3)
 		a[23] = bc3 ^ (bc0 &^ bc4)
@@ -248,7 +220,7 @@ func keccakF1600(a *[25]uint64) {
 		bc3 = t<<21 | t>>(64-21)
 		t = a[19] ^ d4
 		bc4 = t<<14 | t>>(64-14)
-		a[0] = bc0 ^ (bc2 &^ bc1) ^ rc[i+2]
+		a[0] = bc0 ^ (bc2 &^ bc1) ^ RC[i+2]
 		a[11] = bc1 ^ (bc3 &^ bc2)
 		a[22] = bc2 ^ (bc4 &^ bc3)
 		a[8] = bc3 ^ (bc0 &^ bc4)
@@ -339,7 +311,7 @@ func keccakF1600(a *[25]uint64) {
 		bc3 = t<<21 | t>>(64-21)
 		t = a[4] ^ d4
 		bc4 = t<<14 | t>>(64-14)
-		a[0] = bc0 ^ (bc2 &^ bc1) ^ rc[i+3]
+		a[0] = bc0 ^ (bc2 &^ bc1) ^ RC[i+3]
 		a[1] = bc1 ^ (bc3 &^ bc2)
 		a[2] = bc2 ^ (bc4 &^ bc3)
 		a[3] = bc3 ^ (bc0 &^ bc4)

diff --git a/internal/shake/shake.go b/internal/shake/shake.go
@@ -20,6 +20,34 @@ const (
 	rate128     = 168
 )
 
+// RC stores the round constants for use in the ι step.
+var RC = [24]uint64{
+	0x0000000000000001,
+	0x0000000000008082,
+	0x800000000000808A,
+	0x8000000080008000,
+	0x000000000000808B,
+	0x0000000080000001,
+	0x8000000080008081,
+	0x8000000000008009,
+	0x000000000000008A,
+	0x0000000000000088,
+	0x0000000080008009,
+	0x000000008000000A,
+	0x000000008000808B,
+	0x800000000000008B,
+	0x8000000000008089,
+	0x8000000000008003,
+	0x8000000000008002,
+	0x8000000000000080,
+	0x000000000000800A,
+	0x800000008000000A,
+	0x8000000080008081,
+	0x8000000000008080,
+	0x0000000080000001,
+	0x8000000080008008,
+}
+
 // NewShake256 creates a new SHAKE256 variable-output-length Shake.
 // Its generic security strength is 256 bits against all attacks if
 // at least 64 bytes of its output are used.

diff --git a/sign/dilithium/internal/aes.go → sign/dilithium/internal/common/aes.go b/sign/dilithium/internal/aes.go → sign/dilithium/internal/common/aes.go
@@ -1,4 +1,4 @@
-package internal
+package common
 
 import (
 	"crypto/aes"

diff --git a/sign/dilithium/internal/field.go → sign/dilithium/internal/common/field.go b/sign/dilithium/internal/field.go → sign/dilithium/internal/common/field.go
@@ -1,11 +1,11 @@
-package internal
+package common
 
 // Returns a y with y < 2q and y = x mod q.
 // Note that in general *not*: reduceLe2Q(reduceLe2Q(x)) == x.
 func reduceLe2Q(x uint32) uint32 {
-	// Note 2²³ = 2¹³ - 1 mod q. So, writing  x = x1 2²³ + x2 with x2 < 2²³
-	// and x1 < 2⁹, we have x = y (mod q) where
-	// y = x2 + x1 2¹³ - x1 ≤ 2²³ + 2¹³ < 2q.
+	// Note 2²³ = 2¹³ - 1 mod q. So, writing  x = x₁ 2²³ + x₂ with x₂ < 2²³
+	// and x₁ < 2⁹, we have x = y (mod q) where
+	// y = x₂ + x₁ 2¹³ - x₁ ≤ 2²³ + 2¹³ < 2q.
 	x1 := x >> 23
 	x2 := x & 0x7FFFFF // 2²³-1
 	return x2 + (x1 << 13) - x1
@@ -51,9 +51,9 @@ func power2round(a uint32) (a0plusQ, a1 uint32) {
 	return
 }
 
-// Splits 0 ≤ a < Q into a0 and a1 with a = a1*α + a0 with -α/2 < a0 ≤ α/2,
+// Splits 0 ≤ a < Q into a₀ and a₁ with a = a₁*α + a₀ with -α/2 < a₀ ≤ α/2,
 // except for when we would have a1 = (Q-1)/α = 16 in which case a1=0 is taken
-// and -α/2 ≤ a0 < 0.  Returns a0 + Q.  Note 0 ≤ a1 ≤ 15.
+// and -α/2 ≤ a₀ < 0.  Returns a₀ + Q.  Note 0 ≤ a₁ ≤ 15.
 // Note α = 2*γ₂ = γ₁ with the chosen parameters of Dilithium.
 func decompose(a uint32) (a0plusQ, a1 uint32) {
 	// Finds 0 ≤ t < 1.5α with t = a mod α.  (Recall α=2¹⁹ - 2⁹.)
@@ -69,20 +69,20 @@ func decompose(a uint32) (a0plusQ, a1 uint32) {
 
 	a1 = a - uint32(t)
 
-	// We want to divide α out of a1 (to get the proper value of a1).
+	// We want to divide α out of a₁ (to get the proper value of a₁).
 	// As our values are relatively small and α=2¹⁹-2⁹, we can simply
 	// divide by 2¹⁹ and add one.  There is one corner case we have to deal
-	// with: if a1=0 then 0/α=0≠1=0/2¹⁹+1, so we need to get rid of the +1.
-	u := ((a1 - 1) >> 31) & 1 // u=1 if a1=0
+	// with: if a₁=0 then 0/α=0≠1=0/2¹⁹+1, so we need to get rid of the +1.
+	u := ((a1 - 1) >> 31) & 1 // u=1 if a₁=0
 	a1 = (a1 >> 19) + 1
-	a1 -= u // correct for the case a1=0
+	a1 -= u // correct for the case a₁=0
 
 	a0plusQ = Q + uint32(t)
 
-	// Now deal with the corner case of the definition, if a1=(Q-1)/α,
-	// then we use a1=0.  Note (Q-1)/α=2⁴.
+	// Now deal with the corner case of the definition, if a₁=(Q-1)/α,
+	// then we use a₁=0.  Note (Q-1)/α=2⁴.
 	a0plusQ -= a1 >> 4 // to compensate, we only have to move the -1.
-	a1 &= 15           // set a0=0 if a1=16
+	a1 &= 15           // set a₁=0 if a₁=16
 	return
 }
 

diff --git a/sign/dilithium/internal/field_test.go → sign/dilithium/internal/common/field_test.go b/sign/dilithium/internal/field_test.go → sign/dilithium/internal/common/field_test.go
@@ -1,4 +1,4 @@
-package internal
+package common
 
 import (
 	"flag"

diff --git a/sign/dilithium/internal/ntt.go → sign/dilithium/internal/common/ntt.go b/sign/dilithium/internal/ntt.go → sign/dilithium/internal/common/ntt.go
@@ -1,12 +1,12 @@
-package internal
+package common
 
 // Zetas lists precomputed powers of the root of unity in Montgomery
 // representation used for the NTT:
 //
-//     Zetas[i] = zeta^brv(i) R mod q,
+//     Zetas[i] = zetaᵇʳᵛ⁽ⁱ⁾ R mod q,
 //
 // where zeta = 1753, brv(i) is the bitreversal of an 8-bit number
-// and R=2^32 mod q.
+// and R=2³² mod q.
 //
 // The following Python code generates the Zetas (and InvZetas) lists:
 //
@@ -59,10 +59,10 @@ var Zetas = [N]uint32{
 // InvZetas lists precomputed powers of the inverse root of unity in Montgomery
 // representation used for the inverse NTT:
 //
-//     InvZetas[i] = zeta^{brv(255-i)-256} R mod q,
+//     InvZetas[i] = zetaᵇʳᵛ⁽²⁵⁵⁻ⁱ⁾⁻²⁵⁶ R mod q,
 //
 // where zeta = 1753, brv(i) is the bitreversal of an 8-bit number
-// and R=2^32 mod q.
+// and R=2³² mod q.
 var InvZetas = [N]uint32{
 	6403635, 846154, 6979993, 4442679, 1362209, 48306, 4460757,
 	554416, 3545687, 6767575, 976891, 8196974, 2286327, 420899,
@@ -109,14 +109,14 @@ var InvZetas = [N]uint32{
 // by 2*Q.  The resulting coefficients are again in Montgomery representation,
 // but are only bounded by 18*Q.
 func (p *Poly) nttGeneric() {
-	// Writing z := zeta for our root of unity zeta := 1753, note z^256=-1
+	// Writing z := zeta for our root of unity zeta := 1753, note z²⁵⁶=-1
 	// (otherwise the order of z wouldn't be 512) and so
 	//
-	//  x^256 + 1 = x^256 - z^256
-	//            = (x^128 - z^128)(x^128 + z^128)
-	//            = (x^64 - z^64)(x^64 + z^64)(x^64 + z^192)(x^64 - z^192)
+	//  x²⁵⁶ + 1 = x²⁵⁶ - z²⁵⁶
+	//           = (x¹²⁸ - z¹²⁸)(x¹²⁸ + z¹²⁸)
+	//           = (x⁶⁴ - z⁶⁴)(x⁶⁴ + z⁶⁴)(x⁶⁴ + z¹⁹²)(x⁶⁴ - z¹⁹²)
 	//          ...
-	//            = (x-z)(x+z)(x - z^129)(x + z^129) ... (x - z^255)(x + z^255)
+	//           = (x-z)(x+z)(x - z¹²⁹)(x + z¹²⁹) ... (x - z²⁵⁵)(x + z²⁵⁵)
 	//
 	// Note that the powers of z that appear (from the second line) are
 	//  in binary
@@ -129,23 +129,23 @@ func (p *Poly) nttGeneric() {
 	// i.e. brv(2), brv(3), brv(4), ... and these powers of z are given by
 	// the Zetas array.
 	//
-	// The polynomials x ± z^i are irreducable and coprime, hence by the
+	// The polynomials x ± zⁱ are irreducible and coprime, hence by the
 	// Chinese Remainder Theorem we know
 	//
-	//  R[x]/(x^256+1) ---> R[x] / (x-z) x ... x R[x] / (x+z^255)
+	//  R[x]/(x²⁵⁶+1) → R[x] / (x-z) x ... x R[x] / (x+z²⁵⁵)
 	//                      ~= ∏_i R
 	//
 	// given by
 	//
-	//  a |---> = ( a mod x-z, ..., a mod x+z^255 )
-	//          ~ ( a(z), a(-z), a(z^129), a(-z^129), ..., a(z^255), a(-z^255) )
+	//  a ↦ ( a mod x-z, ..., a mod x+z²⁵⁵ )
+	//    ~ ( a(z), a(-z), a(z¹²⁹), a(-z¹²⁹), ..., a(z²⁵⁵), a(-z²⁵⁵) )
 	//
 	// is an isomorphism, which is the forward NTT.  It can be computed
 	// efficiently by computing
 	//
-	//  a |---> ( a mod x^128 - z^128, a mod x^128 + z^128 )
-	//    |---> ( a mod x^64 - z^64, a mod x^64 + z^64,
-	//            a mod x^64 - z^192, a mod x^64 + z^192 )
+	//  a ↦ ( a mod x¹²⁸ - z¹²⁸, a mod x¹²⁸ + z¹²⁸ )
+	//    ↦ ( a mod x⁶⁴ - z⁶⁴,  a mod x⁶⁴ + z⁶⁴,
+	//        a mod x⁶⁴ - z¹⁹², a mod x⁶⁴ + z¹⁹² )
 	//       et cetera
 	//
 	// If N was 8 then this can be pictured in the following diagram:
@@ -154,7 +154,7 @@ func (p *Poly) nttGeneric() {
 	//
 	// Each cross is a Cooley--Tukey butterfly: it's the map
 	//
-	//      (a, b) |--> (a + ζ, a - ζ)
+	//      (a, b) ↦ (a + ζb, a - ζb)
 	//
 	// for the appropriate ζ for that column and row group.
 
@@ -193,11 +193,11 @@ func (p *Poly) invNttGeneric() {
 
 	// We basically do the opposite of NTT, but postpone dividing by 2 in the
 	// inverse of the Cooley--Tukey butterfly and accumulate that to a big
-	// division by 2^8 at the end.  See comments in the NTT() function.
+	// division by 2⁸ at the end.  See comments in the NTT() function.
 
 	for l := uint(1); l < N; l <<= 1 {
 		// On the n-th iteration of the l-loop, the coefficients start off
-		// bounded by 2^(n-1)*2*Q, so by 256*Q on the last.
+		// bounded by 2ⁿ⁻¹*2*Q, so by 256*Q on the last.
 		for offset := uint(0); offset < N-l; offset += 2 * l {
 			zeta := uint64(InvZetas[k])
 			k++
@@ -211,7 +211,7 @@ func (p *Poly) invNttGeneric() {
 	}
 
 	for j := uint(0); j < N; j++ {
-		// ROver256 = 41978 = (256)^-1 R^2
+		// ROver256 = 41978 = (256)⁻¹ R²
 		p[j] = montReduceLe2Q(ROver256 * uint64(p[j]))
 	}
 }