Skip to content

Commit 5ff4065

Browse files
author
Andreas Auernhammer
committed
add AES-GCM-SIV amd64 assembler code for AES-CTR and POLYVAL
This commit adds AMD64 assembler implementations for AES-CTR (AES-GCM-SIV) and POLYVAL. The assembler implementations are still quite generic and do not yet use possible optimizations like combining decryption and authentication in `Open(...)`. Such more sophisticated optimizations will be introduced over time. The AMD64 assembler code significantly improves performance on machines with the AES-NI and PCLMULQDQ instructions: ``` name old time/op new time/op delta AES128GCMSeal64-4 5.24µs ± 0% 0.47µs ± 1% -91.09% (p=0.029 n=4+4) AES128GCMSeal1K-4 57.1µs ± 0% 1.3µs ± 0% -97.71% (p=0.029 n=4+4) AES128GCMSeal8K-4 445µs ± 0% 7µs ± 0% -98.34% (p=0.029 n=4+4) AES128GCMOpen64-4 5.27µs ± 0% 0.48µs ± 0% -90.82% (p=0.029 n=4+4) AES128GCMOpen1K-4 57.2µs ± 0% 1.3µs ± 1% -97.70% (p=0.029 n=4+4) AES128GCMOpen8K-4 444µs ± 0% 7µs ± 0% -98.34% (p=0.029 n=4+4) AES256GCMSeal64-4 5.49µs ± 1% 0.57µs ± 0% -89.66% (p=0.029 n=4+4) AES256GCMSeal1K-4 57.9µs ± 0% 1.5µs ± 0% -97.45% (p=0.029 n=4+4) AES256GCMSeal8K-4 449µs ± 0% 8µs ± 0% -98.18% (p=0.029 n=4+4) AES256GCMOpen64-4 5.49µs ± 0% 0.59µs ± 0% -89.32% (p=0.029 n=4+4) AES256GCMOpen1K-4 57.6µs ± 0% 1.5µs ± 0% -97.40% (p=0.029 n=4+4) AES256GCMOpen8K-4 446µs ± 0% 8µs ± 0% -98.16% (p=0.029 n=4+4) name old speed new speed delta AES128GCMSeal64-4 12.2MB/s ± 0% 137.1MB/s ± 1% +1021.43% (p=0.029 n=4+4) AES128GCMSeal1K-4 17.9MB/s ± 0% 784.8MB/s ± 0% +4273.17% (p=0.029 n=4+4) AES128GCMSeal8K-4 18.4MB/s ± 0% 1106.5MB/s ± 0% +5911.82% (p=0.029 n=4+4) AES128GCMOpen64-4 12.1MB/s ± 0% 132.2MB/s ± 0% +989.41% (p=0.029 n=4+4) AES128GCMOpen1K-4 17.9MB/s ± 0% 776.9MB/s ± 1% +4241.63% (p=0.029 n=4+4) AES128GCMOpen8K-4 18.4MB/s ± 0% 1107.2MB/s ± 0% +5907.46% (p=0.029 n=4+4) AES256GCMSeal64-4 11.7MB/s ± 1% 112.7MB/s ± 0% +866.88% (p=0.029 n=4+4) AES256GCMSeal1K-4 17.7MB/s ± 0% 692.6MB/s ± 0% +3813.22% (p=0.029 n=4+4) AES256GCMSeal8K-4 18.3MB/s ± 0% 1002.0MB/s ± 0% +5386.18% (p=0.029 n=4+4) AES256GCMOpen64-4 11.7MB/s ± 0% 109.1MB/s ± 0% +835.68% (p=0.029 n=4+4) 
AES256GCMOpen1K-4 17.8MB/s ± 0% 682.5MB/s ± 0% +3739.66% (p=0.029 n=4+4) AES256GCMOpen8K-4 18.4MB/s ± 0% 1000.1MB/s ± 0% +5347.14% (p=0.029 n=4+4) ```
1 parent 339dd21 commit 5ff4065

6 files changed

+414
-15
lines changed

aes_amd64.s

+5-5
Original file line numberDiff line numberDiff line change
@@ -43,17 +43,17 @@ TEXT ·encryptBlock(SB), 4, $0-80
4343
MOVQ keyLen+72(FP), DX
4444

4545
MOVUPS (0 * 16)(SI), X0
46-
CMPQ DX, $24
47-
JE aes_192
48-
JB aes_128
46+
CMPQ DX, $24
47+
JE aes_192
48+
JB aes_128
4949

5050
aes_256:
5151
AES_256(X0, X1, AX)
52-
JMP return
52+
JMP return
5353

5454
aes_192:
5555
AES_192(X0, X1, AX)
56-
JMP return
56+
JMP return
5757

5858
aes_128:
5959
AES_128(X0, X1, AX)

aes_gcm_amd64.go

+67
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,78 @@
77
package siv
88

99
import (
10+
"crypto/aes"
11+
"crypto/cipher"
12+
"crypto/subtle"
13+
1014
"golang.org/x/sys/cpu"
1115
)
1216

17+
// polyval is implemented in assembler (·polyval in aes_gcm_amd64.s).
// It starts from the 16 bytes currently stored in tag, absorbs additionalData
// and then plaintext in 16 byte blocks (a trailing partial block is padded
// with zeros), absorbs one final block holding both lengths in bits, and
// writes the 16 byte result back to tag. The first 16 bytes of key are used
// as the multiplication key.
func polyval(tag *[16]byte, additionalData, plaintext, key []byte)
18+
19+
// aesGcmXORKeyStream is implemented in assembler (·aesGcmXORKeyStream in
// aes_gcm_amd64.s). It XORs len(src) bytes of src with an AES-CTR key stream
// and writes the result to dst. iv supplies the initial 16 byte counter block;
// only its lowest 32 bit word is incremented from block to block. keys is
// handed to the AES round macros (the callers in this file fill it with
// keySchedule). A keyLen of 16 selects the AES-128 code path, every other
// value selects the AES-256 code path.
func aesGcmXORKeyStream(dst, src, iv, keys []byte, keyLen uint64)
20+
1321
// newGCM returns the aead implementation for key: the assembler-backed
// aesGcmSivAsm when the CPU reports both AES and PCLMULQDQ support, and the
// result of newGCMGeneric otherwise.
func newGCM(key []byte) aead {
1422
if cpu.X86.HasAES && cpu.X86.HasPCLMULQDQ {
23+
// NOTE(review): the error from aes.NewCipher is discarded, so an invalid
// key length would leave block nil. Presumably the caller has already
// validated len(key) - confirm.
block, _ := aes.NewCipher(key)
24+
return &aesGcmSivAsm{block: block, keyLen: len(key)}
1525
}
1626
return newGCMGeneric(key)
1727
}
28+
29+
var _ aead = (*aesGcmSivAsm)(nil)
30+
31+
// aesGcmSivAsm is the aead implementation whose seal and open methods call
// the assembler routines polyval and aesGcmXORKeyStream.
type aesGcmSivAsm struct {
32+
// block is the cipher created from the master key in newGCM; seal and open
// pass it to deriveKeys.
block cipher.Block
33+
// keyLen is len(key) as recorded by newGCM; it is passed to deriveKeys.
keyLen int
34+
}
35+
36+
// seal encrypts plaintext into ciphertext and copies the 16 byte tag to
// ciphertext[len(plaintext):].
//
// NOTE(review): presumably the caller sizes ciphertext to len(plaintext)+16
// and passes a nonce of at most 16 bytes (tag[i] is indexed by the nonce
// index) - confirm against the exported API.
func (c *aesGcmSivAsm) seal(ciphertext, nonce, plaintext, additionalData []byte) {
37+
// Per-nonce encryption and authentication keys.
encKey, authKey := deriveKeys(nonce, c.block, c.keyLen)
38+
39+
// POLYVAL over additionalData and plaintext, starting from an all-zero tag.
var tag [16]byte
40+
polyval(&tag, additionalData, plaintext, authKey)
41+
// Mix the nonce into the leading bytes of the POLYVAL result.
for i := range nonce {
42+
tag[i] ^= nonce[i]
43+
}
44+
// Clear the most significant bit of the last byte before encrypting.
tag[15] &= 0x7f
45+
46+
// encKeys is filled by keySchedule from encKey (presumably the expanded
// AES round keys - the helper is defined outside this view).
var encKeys [240]byte
47+
keySchedule(encKeys[:], encKey)
48+
// The tag is the encryption of the masked POLYVAL block, done in place.
encryptBlock(tag[:], tag[:], encKeys[:], uint64(len(encKey)))
49+
// The initial counter block is a copy of the tag with the most significant
// bit of the last byte set.
ctrBlock := tag
50+
ctrBlock[15] |= 0x80
51+
52+
aesGcmXORKeyStream(ciphertext, plaintext, ctrBlock[:], encKeys[:], uint64(len(encKey)))
53+
copy(ciphertext[len(plaintext):], tag[:])
54+
}
55+
56+
// open decrypts ciphertext (whose last 16 bytes are the tag) into plaintext
// and verifies the tag. It returns errOpen, after zeroing plaintext, when the
// recomputed tag does not match, and nil otherwise.
//
// NOTE(review): slicing off the tag requires len(ciphertext) >= 16;
// presumably the caller checks this - confirm.
func (c *aesGcmSivAsm) open(plaintext, nonce, ciphertext, additionalData []byte) error {
57+
// Split the trailing 16 byte tag from the encrypted payload.
tag := ciphertext[len(ciphertext)-16:]
58+
ciphertext = ciphertext[:len(ciphertext)-16]
59+
60+
encKey, authKey := deriveKeys(nonce, c.block, c.keyLen)
61+
// The initial counter block is the received tag with the most significant
// bit of the last byte set.
var ctrBlock [16]byte
62+
copy(ctrBlock[:], tag)
63+
ctrBlock[15] |= 0x80
64+
65+
var encKeys [240]byte
66+
keySchedule(encKeys[:], encKey)
67+
// Decrypt first: the tag is computed over the plaintext, so the plaintext
// is needed before it can be authenticated.
aesGcmXORKeyStream(plaintext, ciphertext, ctrBlock[:], encKeys[:], uint64(len(encKey)))
68+
69+
// Recompute the expected tag the same way seal derives it.
var sum [16]byte
70+
polyval(&sum, additionalData, plaintext, authKey)
71+
for i := range nonce {
72+
sum[i] ^= nonce[i]
73+
}
74+
sum[15] &= 0x7f
75+
76+
encryptBlock(sum[:], sum[:], encKeys[:], uint64(len(encKey)))
77+
// Constant-time comparison of the expected and the received tag.
if subtle.ConstantTimeCompare(sum[:], tag[:]) != 1 {
78+
// Do not hand unauthenticated plaintext back to the caller.
for i := range plaintext {
79+
plaintext[i] = 0
80+
}
81+
return errOpen
82+
}
83+
return nil
84+
}

aes_gcm_amd64.s

+229
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,229 @@
1+
// Copyright (c) 2018 Andreas Auernhammer. All rights reserved.
2+
// Use of this source code is governed by a license that can be
3+
// found in the LICENSE file.
4+
5+
// +build amd64,!gccgo,!appengine
6+
7+
#include "textflag.h"
8+
#include "aes_macros_amd64.s"
9+
10+
DATA ·one<>+0x00(SB)/8, $1
11+
DATA ·one<>+0x08(SB)/8, $0
12+
GLOBL ·one<>(SB), (NOPTR+RODATA), $16
13+
14+
DATA ·polyvalMask<>+0x00(SB)/8, $0x0000000000000001
15+
DATA ·polyvalMask<>+0x08(SB)/8, $0xc200000000000000
16+
GLOBL ·polyvalMask<>(SB), (NOPTR+RODATA), $16
17+
18+
// func aesGcmXORKeyStream(dst, src, iv, keys []byte, keyLen uint64)
19+
// aesGcmXORKeyStream XORs src_len bytes of src with an AES-CTR key stream and
// stores the result at dst.
//
// Register use:
//   DI = dst pointer, SI = src pointer, DX = remaining byte count
//   BX = iv pointer, AX = keys pointer (handed to the AES_* macros)
//   CX = keyLen: 16 selects the AES_128* macros, any other value AES_256*
//   X10 = current counter block, X9 = the constant ·one<>
//   X11 = scratch for unaligned loads of src
//
// PADDD adds per 32 bit lane and ·one<> is 1 in its lowest lane only, so only
// the lowest 32 bit word of the counter block is incremented (wrapping).
//
// Source data is always loaded with MOVUPS before it is XORed: PXOR with a
// memory operand requires a 16 byte aligned address, which an arbitrary byte
// slice does not guarantee.
//
// The AES_* macros come from aes_macros_amd64.s; presumably they encrypt the
// listed block registers in place using the round keys at AX and the extra
// register as scratch - confirm against that file.
TEXT ·aesGcmXORKeyStream(SB), 4, $0-104
20+
MOVQ dst+0(FP), DI
21+
MOVQ src+24(FP), SI
22+
MOVQ src_len+32(FP), DX
23+
MOVQ iv+48(FP), BX
24+
MOVQ keys+72(FP), AX
25+
MOVQ keyLen+96(FP), CX
26+
27+
// Nothing to do for empty input.
TESTQ DX, DX
28+
JZ return
29+
30+
MOVUPS (0 * 16)(BX), X10
31+
MOVUPS ·one<>(SB), X9
32+
33+
// Pick the widest loop the input length allows: 8 blocks (128 bytes),
// 4 blocks (64 bytes) or a single block.
CMPQ DX, $64
34+
JB loop_1
35+
CMPQ DX, $128
36+
JB loop_4
37+
38+
loop_8:
39+
// Materialize eight consecutive counter blocks in X0..X7.
MOVAPS X10, X0
40+
PADDD X9, X10
41+
MOVAPS X10, X1
42+
PADDD X9, X10
43+
MOVAPS X10, X2
44+
PADDD X9, X10
45+
MOVAPS X10, X3
46+
PADDD X9, X10
47+
MOVAPS X10, X4
48+
PADDD X9, X10
49+
MOVAPS X10, X5
50+
PADDD X9, X10
51+
MOVAPS X10, X6
52+
PADDD X9, X10
53+
MOVAPS X10, X7
54+
PADDD X9, X10
55+
56+
CMPQ CX, $16
57+
JE aes_128_8
58+
59+
aes_256_8:
60+
AES_256_8(X0, X1, X2, X3, X4, X5, X6, X7, X8, AX)
61+
JMP xor_8
62+
63+
aes_128_8:
64+
AES_128_8(X0, X1, X2, X3, X4, X5, X6, X7, X8, AX)
65+
66+
xor_8:
67+
MOVUPS (0 * 16)(SI), X11
PXOR X11, X0
68+
MOVUPS (1 * 16)(SI), X11
PXOR X11, X1
69+
MOVUPS (2 * 16)(SI), X11
PXOR X11, X2
70+
MOVUPS (3 * 16)(SI), X11
PXOR X11, X3
71+
MOVUPS (4 * 16)(SI), X11
PXOR X11, X4
72+
MOVUPS (5 * 16)(SI), X11
PXOR X11, X5
73+
MOVUPS (6 * 16)(SI), X11
PXOR X11, X6
74+
MOVUPS (7 * 16)(SI), X11
PXOR X11, X7
75+
MOVUPS X0, (0 * 16)(DI)
76+
MOVUPS X1, (1 * 16)(DI)
77+
MOVUPS X2, (2 * 16)(DI)
78+
MOVUPS X3, (3 * 16)(DI)
79+
MOVUPS X4, (4 * 16)(DI)
80+
MOVUPS X5, (5 * 16)(DI)
81+
MOVUPS X6, (6 * 16)(DI)
82+
MOVUPS X7, (7 * 16)(DI)
83+
ADDQ $128, SI
84+
ADDQ $128, DI
85+
SUBQ $128, DX
86+
CMPQ DX, $128
87+
JAE loop_8
88+
TESTQ DX, DX
89+
JZ return
90+
CMPQ DX, $64
91+
JB loop_1
92+
93+
loop_4:
94+
// Materialize four consecutive counter blocks in X0..X3.
MOVAPS X10, X0
95+
PADDD X9, X10
96+
MOVAPS X10, X1
97+
PADDD X9, X10
98+
MOVAPS X10, X2
99+
PADDD X9, X10
100+
MOVAPS X10, X3
101+
PADDD X9, X10
102+
103+
CMPQ CX, $16
104+
JE aes_128_4
105+
106+
aes_256_4:
107+
AES_256_4(X0, X1, X2, X3, X4, AX)
108+
JMP xor_4
109+
110+
aes_128_4:
111+
AES_128_4(X0, X1, X2, X3, X4, AX)
112+
113+
xor_4:
114+
MOVUPS (0 * 16)(SI), X11
PXOR X11, X0
115+
MOVUPS (1 * 16)(SI), X11
PXOR X11, X1
116+
MOVUPS (2 * 16)(SI), X11
PXOR X11, X2
117+
MOVUPS (3 * 16)(SI), X11
PXOR X11, X3
118+
MOVUPS X0, (0 * 16)(DI)
119+
MOVUPS X1, (1 * 16)(DI)
120+
MOVUPS X2, (2 * 16)(DI)
121+
MOVUPS X3, (3 * 16)(DI)
122+
ADDQ $64, SI
123+
ADDQ $64, DI
124+
SUBQ $64, DX
125+
CMPQ DX, $64
126+
JAE loop_4
127+
TESTQ DX, DX
128+
JZ return
129+
130+
loop_1:
131+
// One key stream block is produced before the remaining length is checked
// at xor_1, so the same block also serves a trailing partial block.
MOVAPS X10, X0
132+
PADDD X9, X10
133+
CMPQ CX, $16
134+
JE aes_128_1
135+
136+
aes_256_1:
137+
AES_256(X0, X1, AX)
138+
JMP xor_1
139+
140+
aes_128_1:
141+
AES_128(X0, X1, AX)
142+
143+
xor_1:
144+
CMPQ DX, $16
145+
JB finalize
146+
MOVUPS 0(SI), X11
PXOR X11, X0
147+
MOVUPS X0, 0(DI)
148+
149+
ADDQ $16, SI
150+
ADDQ $16, DI
151+
SUBQ $16, DX
152+
JMP loop_1
153+
154+
finalize:
155+
TESTQ DX, DX
156+
JZ return
157+
158+
finalize_loop:
159+
// Fewer than 16 bytes remain: consume the key stream block in X0 one byte
// at a time. MOVQ exposes the low 64 bits, PSRLDQ shifts the next key
// stream byte into position, and only the low byte of R10 is stored.
MOVQ X0, R10
160+
PSRLDQ $1, X0
161+
MOVB 0(SI), R11
162+
XORQ R11, R10
163+
MOVB R10, 0(DI)
164+
INCQ SI
165+
INCQ DI
166+
DECQ DX
167+
JNZ finalize_loop
168+
169+
return:
170+
RET
171+
172+
// func polyval(tag *[16]byte, additionalData, plaintext, key []byte)
173+
// polyval absorbs additionalData and then plaintext into the accumulator
// loaded from tag, followed by one block holding both lengths in bits, and
// stores the 16 byte result back to tag.
//
// The argument size is 80 bytes: one pointer (8) plus three slices (3 * 24).
//
// Register use:
//   DI = tag pointer (also used as a 16 byte scratch buffer)
//   SI / DX = pointer / remaining length of the input being absorbed
//   BX / CX = plaintext pointer / length, absorbed in the second pass
//   AX = key pointer, later reused as the pass counter (2 passes)
//   R14 / R15 = additionalData / plaintext length in bits
//   X0 = accumulator, X1 = first 16 bytes of key, X2 = ·polyvalMask<>
//   X7 = scratch for unaligned loads
//
// Memory is always loaded with MOVOU/MOVUPS before it is XORed: PXOR with a
// memory operand requires a 16 byte aligned address, which neither the input
// slices nor the tag buffer guarantee.
//
// MULTIPLY is a macro defined outside this file's visible part; presumably it
// multiplies X0 by X1 in the POLYVAL field using X2 as reduction constant and
// X3..X6 as scratch - confirm against aes_macros_amd64.s.
TEXT ·polyval(SB), $0-80
174+
MOVQ tag+0(FP), DI
175+
MOVQ additionalData+8(FP), SI
176+
MOVQ additionalData_len+16(FP), DX
177+
MOVQ plaintext+32(FP), BX
178+
MOVQ plaintext_len+40(FP), CX
179+
MOVQ key+56(FP), AX
180+
181+
// Byte lengths -> bit lengths for the final length block.
MOVQ DX, R14
182+
MOVQ CX, R15
183+
SHLQ $3, R14
184+
SHLQ $3, R15
185+
MOVOU 0(DI), X0
186+
MOVOU 0(AX), X1
187+
MOVOU ·polyvalMask<>(SB), X2
188+
189+
// Two passes: additionalData first, plaintext second.
MOVQ $2, AX
190+
191+
loop:
192+
CMPQ DX, $16
193+
JB finalize
194+
MOVUPS 0(SI), X7
195+
PXOR X7, X0
196+
MULTIPLY(X0, X1, X2, X3, X4, X5, X6)
197+
ADDQ $16, SI
198+
SUBQ $16, DX
199+
JMP loop
200+
201+
finalize:
202+
TESTQ DX, DX
203+
JZ process_next
204+
// Partial block: zero the tag buffer, copy the remaining bytes into it and
// absorb the zero-padded block.
MOVQ DI, R11
205+
PXOR X3, X3
206+
MOVOU X3, 0(R11)
207+
208+
finalize_loop:
209+
MOVB 0(SI), R10
210+
MOVB R10, 0(R11)
211+
INCQ SI
212+
INCQ R11
213+
DECQ DX
214+
JNZ finalize_loop
215+
MOVOU 0(DI), X7
PXOR X7, X0
216+
MULTIPLY(X0, X1, X2, X3, X4, X5, X6)
217+
218+
process_next:
219+
// Switch the input to plaintext and run the loop once more; the second time
// through, AX reaches zero and execution falls into the length block.
MOVQ BX, SI
220+
MOVQ CX, DX
221+
DECQ AX
222+
JNZ loop
223+
224+
// Length block: additionalData bits in the low half, plaintext bits in the
// high half, assembled in the tag buffer.
MOVQ R14, 0(DI)
225+
MOVQ R15, 8(DI)
226+
MOVOU 0(DI), X7
PXOR X7, X0
227+
MULTIPLY(X0, X1, X2, X3, X4, X5, X6)
228+
MOVOU X0, 0(DI)
229+
RET

aes_gcm_generic.go

+10-10
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ type aesGcmSivGeneric struct {
2424
}
2525

2626
func (c *aesGcmSivGeneric) seal(ciphertext, nonce, plaintext, additionalData []byte) {
27-
encKey, authKey := c.deriveKeys(nonce)
27+
encKey, authKey := deriveKeys(nonce, c.block, c.keyLen)
2828

2929
var tag [16]byte
3030
polyvalGeneric(&tag, additionalData, plaintext, authKey)
@@ -46,7 +46,7 @@ func (c *aesGcmSivGeneric) open(plaintext, nonce, ciphertext, additionalData []b
4646
tag := ciphertext[len(ciphertext)-16:]
4747
ciphertext = ciphertext[:len(ciphertext)-16]
4848

49-
encKey, authKey := c.deriveKeys(nonce)
49+
encKey, authKey := deriveKeys(nonce, c.block, c.keyLen)
5050
var ctrBlock [16]byte
5151
copy(ctrBlock[:], tag)
5252
ctrBlock[15] |= 0x80
@@ -70,39 +70,39 @@ func (c *aesGcmSivGeneric) open(plaintext, nonce, ciphertext, additionalData []b
7070
return nil
7171
}
7272

73-
func (c *aesGcmSivGeneric) deriveKeys(nonce []byte) (encKey, authKey []byte) {
73+
func deriveKeys(nonce []byte, block cipher.Block, keyLen int) (encKey, authKey []byte) {
7474
var counter [16]byte
7575
encKey = make([]byte, 32)
7676
authKey = make([]byte, 16)
7777
copy(counter[4:], nonce[:])
7878

7979
var tmp [16]byte
8080
binary.LittleEndian.PutUint32(counter[:4], 0)
81-
c.block.Encrypt(tmp[:], counter[:])
81+
block.Encrypt(tmp[:], counter[:])
8282
copy(authKey[0:], tmp[:8])
8383

8484
binary.LittleEndian.PutUint32(counter[:4], 1)
85-
c.block.Encrypt(tmp[:], counter[:])
85+
block.Encrypt(tmp[:], counter[:])
8686
copy(authKey[8:], tmp[:8])
8787

8888
binary.LittleEndian.PutUint32(counter[:4], 2)
89-
c.block.Encrypt(tmp[:], counter[:])
89+
block.Encrypt(tmp[:], counter[:])
9090
copy(encKey[0:], tmp[:8])
9191

9292
binary.LittleEndian.PutUint32(counter[:4], 3)
93-
c.block.Encrypt(tmp[:], counter[:])
93+
block.Encrypt(tmp[:], counter[:])
9494
copy(encKey[8:], tmp[:8])
9595

96-
if c.keyLen == 16 {
96+
if keyLen == 16 {
9797
return encKey[:16], authKey
9898
}
9999

100100
binary.LittleEndian.PutUint32(counter[:4], 4)
101-
c.block.Encrypt(tmp[:], counter[:])
101+
block.Encrypt(tmp[:], counter[:])
102102
copy(encKey[16:], tmp[:8])
103103

104104
binary.LittleEndian.PutUint32(counter[:4], 5)
105-
c.block.Encrypt(tmp[:], counter[:])
105+
block.Encrypt(tmp[:], counter[:])
106106
copy(encKey[24:], tmp[:8])
107107

108108
return encKey, authKey

0 commit comments

Comments
 (0)