Skip to content

Commit

Permalink
4-way Keccak-f[1600] using AVX2 to speed up Dilithium
Browse files Browse the repository at this point in the history
Also speeds up Dilithium by adding more AVX2 optimised routines.

Non AVX2:

    BenchmarkPermutationFunction-8   	 3594723	       326 ns/op

AVX2:

    BenchmarkF1600x4-8   	 2926183	       399 ns/op

Dilithium Mode 3 using 4-way f[1600]:

    BenchmarkSkUnpack-8                      	   37950	     30844 ns/op
    BenchmarkPkUnpack-8                      	   39169	     30442 ns/op
    BenchmarkVerify-8                        	   76672	     15513 ns/op
    BenchmarkSign-8                          	   10000	    125782 ns/op
    BenchmarkGenerateKey-8                   	   17419	     68635 ns/op
    BenchmarkPublicFromPrivate-8             	  134430	      9250 ns/op

Without:

    BenchmarkSkUnpack-8            	   18980	     61840 ns/op
    BenchmarkPkUnpack-8            	   19743	     60087 ns/op
    BenchmarkVerify-8              	   39998	     30421 ns/op
    BenchmarkSign-8                	    5420	    228631 ns/op
    BenchmarkGenerateKey-8         	   12037	     99672 ns/op
    BenchmarkPublicFromPrivate-8   	  107148	     11512 ns/op

See #113
  • Loading branch information
bwesterb authored May 21, 2020
1 parent 7b3fb57 commit f2d9abd
Show file tree
Hide file tree
Showing 62 changed files with 4,635 additions and 320 deletions.
36 changes: 4 additions & 32 deletions internal/shake/keccakf.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,34 +6,6 @@

package shake

// rc holds the 24 round constants of the permutation, indexed by round
// number; keccakF1600 XORs rc[i] into a[0] as the ι step of round i.
var rc = [24]uint64{
	// rounds 0-3
	0x0000000000000001, 0x0000000000008082, 0x800000000000808A, 0x8000000080008000,
	// rounds 4-7
	0x000000000000808B, 0x0000000080000001, 0x8000000080008081, 0x8000000000008009,
	// rounds 8-11
	0x000000000000008A, 0x0000000000000088, 0x0000000080008009, 0x000000008000000A,
	// rounds 12-15
	0x000000008000808B, 0x800000000000008B, 0x8000000000008089, 0x8000000000008003,
	// rounds 16-19
	0x8000000000008002, 0x8000000000000080, 0x000000000000800A, 0x800000008000000A,
	// rounds 20-23
	0x8000000080008081, 0x8000000000008080, 0x0000000080000001, 0x8000000080008008,
}

// keccakF1600 applies the Keccak permutation to a 1600b-wide
// state represented as a slice of 25 uint64s.
func keccakF1600(a *[25]uint64) {
Expand Down Expand Up @@ -66,7 +38,7 @@ func keccakF1600(a *[25]uint64) {
bc3 = t<<21 | t>>(64-21)
t = a[24] ^ d4
bc4 = t<<14 | t>>(64-14)
a[0] = bc0 ^ (bc2 &^ bc1) ^ rc[i]
a[0] = bc0 ^ (bc2 &^ bc1) ^ RC[i]
a[6] = bc1 ^ (bc3 &^ bc2)
a[12] = bc2 ^ (bc4 &^ bc3)
a[18] = bc3 ^ (bc0 &^ bc4)
Expand Down Expand Up @@ -157,7 +129,7 @@ func keccakF1600(a *[25]uint64) {
bc3 = t<<21 | t>>(64-21)
t = a[14] ^ d4
bc4 = t<<14 | t>>(64-14)
a[0] = bc0 ^ (bc2 &^ bc1) ^ rc[i+1]
a[0] = bc0 ^ (bc2 &^ bc1) ^ RC[i+1]
a[16] = bc1 ^ (bc3 &^ bc2)
a[7] = bc2 ^ (bc4 &^ bc3)
a[23] = bc3 ^ (bc0 &^ bc4)
Expand Down Expand Up @@ -248,7 +220,7 @@ func keccakF1600(a *[25]uint64) {
bc3 = t<<21 | t>>(64-21)
t = a[19] ^ d4
bc4 = t<<14 | t>>(64-14)
a[0] = bc0 ^ (bc2 &^ bc1) ^ rc[i+2]
a[0] = bc0 ^ (bc2 &^ bc1) ^ RC[i+2]
a[11] = bc1 ^ (bc3 &^ bc2)
a[22] = bc2 ^ (bc4 &^ bc3)
a[8] = bc3 ^ (bc0 &^ bc4)
Expand Down Expand Up @@ -339,7 +311,7 @@ func keccakF1600(a *[25]uint64) {
bc3 = t<<21 | t>>(64-21)
t = a[4] ^ d4
bc4 = t<<14 | t>>(64-14)
a[0] = bc0 ^ (bc2 &^ bc1) ^ rc[i+3]
a[0] = bc0 ^ (bc2 &^ bc1) ^ RC[i+3]
a[1] = bc1 ^ (bc3 &^ bc2)
a[2] = bc2 ^ (bc4 &^ bc3)
a[3] = bc3 ^ (bc0 &^ bc4)
Expand Down
28 changes: 28 additions & 0 deletions internal/shake/shake.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,34 @@ const (
rate128 = 168
)

// RC holds the 24 round constants of the permutation, indexed by round
// number; keccakF1600 XORs RC[i] into a[0] as the ι step of round i.
var RC = [24]uint64{
	// rounds 0-3
	0x0000000000000001, 0x0000000000008082, 0x800000000000808A, 0x8000000080008000,
	// rounds 4-7
	0x000000000000808B, 0x0000000080000001, 0x8000000080008081, 0x8000000000008009,
	// rounds 8-11
	0x000000000000008A, 0x0000000000000088, 0x0000000080008009, 0x000000008000000A,
	// rounds 12-15
	0x000000008000808B, 0x800000000000008B, 0x8000000000008089, 0x8000000000008003,
	// rounds 16-19
	0x8000000000008002, 0x8000000000000080, 0x000000000000800A, 0x800000008000000A,
	// rounds 20-23
	0x8000000080008081, 0x8000000000008080, 0x0000000080000001, 0x8000000080008008,
}

// NewShake256 creates a new SHAKE256 variable-output-length Shake.
// Its generic security strength is 256 bits against all attacks if
// at least 64 bytes of its output are used.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package internal
package common

import (
"crypto/aes"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
package internal
package common

// Returns a y with y < 2q and y = x mod q.
// Note that in general *not*: reduceLe2Q(reduceLe2Q(x)) == x.
func reduceLe2Q(x uint32) uint32 {
// Note 2²³ = 2¹³ - 1 mod q. So, writing x = x1 2²³ + x2 with x2 < 2²³
// and x1 < 2⁹, we have x = y (mod q) where
// y = x2 + x1 2¹³ - x1 ≤ 2²³ + 2¹³ < 2q.
// Note 2²³ = 2¹³ - 1 mod q. So, writing x = x₁ 2²³ + x₂ with x₂ < 2²³
// and x₁ < 2⁹, we have x = y (mod q) where
// y = x₂ + x₁ 2¹³ - x₁ < 2²³ + 2²² < 2q.
x1 := x >> 23
x2 := x & 0x7FFFFF // 2²³-1
return x2 + (x1 << 13) - x1
Expand Down Expand Up @@ -51,9 +51,9 @@ func power2round(a uint32) (a0plusQ, a1 uint32) {
return
}

// Splits 0 ≤ a < Q into a0 and a1 with a = a1*α + a0 with -α/2 < a0 ≤ α/2,
// Splits 0 ≤ a < Q into a₀ and a₁ with a = a₁*α + a₀ with -α/2 < a₀ ≤ α/2,
// except for when we would have a₁ = (Q-1)/α = 16 in which case a₁=0 is taken
// and -α/2 ≤ a0 < 0. Returns a0 + Q. Note 0 ≤ a1 ≤ 15.
// and -α/2 ≤ a₀ < 0. Returns a₀ + Q. Note 0 ≤ a₁ ≤ 15.
// Note α = 2*γ₂ = γ₁ with the chosen parameters of Dilithium.
func decompose(a uint32) (a0plusQ, a1 uint32) {
// Finds 0 ≤ t < 1.5α with t = a mod α. (Recall α=2¹⁹ - 2⁹.)
Expand All @@ -69,20 +69,20 @@ func decompose(a uint32) (a0plusQ, a1 uint32) {

a1 = a - uint32(t)

// We want to divide α out of a1 (to get the proper value of a1).
// We want to divide α out of a₁ (to get the proper value of a₁).
// As our values are relatively small and α=2¹⁹-2⁹, we can simply
// divide by 2¹⁹ and add one. There is one corner case we have to deal
// with: if a1=0 then 0/α=0≠1=0/2¹⁹+1, so we need to get rid of the +1.
u := ((a1 - 1) >> 31) & 1 // u=1 if a1=0
// with: if a₁=0 then 0/α=0≠1=0/2¹⁹+1, so we need to get rid of the +1.
u := ((a1 - 1) >> 31) & 1 // u=1 if a₁=0
a1 = (a1 >> 19) + 1
a1 -= u // correct for the case a1=0
a1 -= u // correct for the case a₁=0

a0plusQ = Q + uint32(t)

// Now deal with the corner case of the definition, if a1=(Q-1)/α,
// then we use a1=0. Note (Q-1)/α=2⁴.
// Now deal with the corner case of the definition, if a₁=(Q-1)/α,
// then we use a₁=0. Note (Q-1)/α=2⁴.
a0plusQ -= a1 >> 4 // to compensate, we only have to move the -1.
a1 &= 15 // set a0=0 if a1=16
a1 &= 15 // set a₁=0 if a₁=16
return
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package internal
package common

import (
"flag"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
package internal
package common

// Zetas lists precomputed powers of the root of unity in Montgomery
// representation used for the NTT:
//
// Zetas[i] = zeta^brv(i) R mod q,
// Zetas[i] = zetaᵇʳᵛ⁽ⁱ⁾ R mod q,
//
// where zeta = 1753, brv(i) is the bitreversal of an 8-bit number
// and R=2^32 mod q.
// and R=2³² mod q.
//
// The following Python code generates the Zetas (and InvZetas) lists:
//
Expand Down Expand Up @@ -59,10 +59,10 @@ var Zetas = [N]uint32{
// InvZetas lists precomputed powers of the inverse root of unity in Montgomery
// representation used for the inverse NTT:
//
// InvZetas[i] = zeta^{brv(255-i)-256} R mod q,
// InvZetas[i] = zetaᵇʳᵛ⁽²⁵⁵⁻ⁱ⁾⁻²⁵⁶ R mod q,
//
// where zeta = 1753, brv(i) is the bitreversal of an 8-bit number
// and R=2^32 mod q.
// and R=2³² mod q.
var InvZetas = [N]uint32{
6403635, 846154, 6979993, 4442679, 1362209, 48306, 4460757,
554416, 3545687, 6767575, 976891, 8196974, 2286327, 420899,
Expand Down Expand Up @@ -109,14 +109,14 @@ var InvZetas = [N]uint32{
// by 2*Q. The resulting coefficients are again in Montgomery representation,
// but are only bounded by 18*Q.
func (p *Poly) nttGeneric() {
// Writing z := zeta for our root of unity zeta := 1753, note z^256=-1
// Writing z := zeta for our root of unity zeta := 1753, note z²⁵⁶=-1
// (otherwise the order of z wouldn't be 512) and so
//
// x^256 + 1 = x^256 - z^256
// = (x^128 - z^128)(x^128 + z^128)
// = (x^64 - z^64)(x^64 + z^64)(x^64 + z^192)(x^64 - z^192)
// x²⁵⁶ + 1 = x²⁵⁶ - z²⁵⁶
// = (x¹²⁸ - z¹²⁸)(x¹²⁸ + z¹²⁸)
// = (x⁶⁴ - z⁶⁴)(x⁶⁴ + z⁶⁴)(x⁶⁴ + z¹⁹²)(x⁶⁴ - z¹⁹²)
// ...
// = (x-z)(x+z)(x - z^129)(x + z^129) ... (x - z^255)(x + z^255)
// = (x-z)(x+z)(x - z¹²⁹)(x + z¹²⁹) ... (x - z²⁵⁵)(x + z²⁵⁵)
//
// Note that the powers of z that appear (from the second line) are
// in binary
Expand All @@ -129,23 +129,23 @@ func (p *Poly) nttGeneric() {
// i.e. brv(2), brv(3), brv(4), ... and these powers of z are given by
// the Zetas array.
//
// The polynomials x ± z^i are irreducable and coprime, hence by the
// The polynomials x ± zⁱ are irreducible and coprime, hence by the
// Chinese Remainder Theorem we know
//
// R[x]/(x^256+1) ---> R[x] / (x-z) x ... x R[x] / (x+z^255)
// R[x]/(x²⁵⁶+1) ⟶ R[x] / (x-z) x ... x R[x] / (x+z²⁵⁵)
// ~= ∏_i R
//
// given by
//
// a |---> = ( a mod x-z, ..., a mod x+z^255 )
// ~ ( a(z), a(-z), a(z^129), a(-z^129), ..., a(z^255), a(-z^255) )
// a ↦ ( a mod x-z, ..., a mod x+z²⁵⁵ )
// ~ ( a(z), a(-z), a(z¹²⁹), a(-z¹²⁹), ..., a(z²⁵⁵), a(-z²⁵⁵) )
//
// is an isomorphism, which is the forward NTT. It can be computed
// efficiently by computing
//
// a |---> ( a mod x^128 - z^128, a mod x^128 + z^128 )
// |---> ( a mod x^64 - z^64, a mod x^64 + z^64,
// a mod x^64 - z^192, a mod x^64 + z^192 )
// a ↦ ( a mod x¹²⁸ - z¹²⁸, a mod x¹²⁸ + z¹²⁸ )
// ↦ ( a mod x⁶⁴ - z⁶⁴, a mod x⁶⁴ + z⁶⁴,
// a mod x⁶⁴ - z¹⁹², a mod x⁶⁴ + z¹⁹² )
// et cetera
//
// If N was 8 then this can be pictured in the following diagram:
Expand All @@ -154,7 +154,7 @@ func (p *Poly) nttGeneric() {
//
// Each cross is a Cooley--Tukey butterfly: it's the map
//
// (a, b) |--> (a + ζ, a - ζ)
// (a, b) ↦ (a + ζb, a - ζb)
//
// for the appropriate ζ for that column and row group.

Expand Down Expand Up @@ -193,11 +193,11 @@ func (p *Poly) invNttGeneric() {

// We basically do the opposite of NTT, but postpone dividing by 2 in the
// inverse of the Cooley--Tukey butterfly and accumulate that to a big
// division by 2^8 at the end. See comments in the NTT() function.
// division by 2⁸ at the end. See comments in the NTT() function.

for l := uint(1); l < N; l <<= 1 {
// On the n-th iteration of the l-loop, the coefficients start off
// bounded by 2^(n-1)*2*Q, so by 256*Q on the last.
// bounded by 2ⁿ⁻¹*2*Q, so by 256*Q on the last.
for offset := uint(0); offset < N-l; offset += 2 * l {
zeta := uint64(InvZetas[k])
k++
Expand All @@ -211,7 +211,7 @@ func (p *Poly) invNttGeneric() {
}

for j := uint(0); j < N; j++ {
// ROver256 = 41978 = (256)^-1 R^2
// ROver256 = 41978 = (256)⁻¹ R²
p[j] = montReduceLe2Q(ROver256 * uint64(p[j]))
}
}
Loading

0 comments on commit f2d9abd

Please sign in to comment.