1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
|
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2025 Xi Ruoyao <xry111@xry111.site>. All Rights Reserved.
*
* Based on arch/loongarch/vdso/vgetrandom-chacha.S.
*/
#include <asm/asm.h>
#include <linux/linkage.h>
.text
/*
 * ROTRI rd, rs, imm - rotate the low 32 bits of \rs right by \imm bits and
 * leave the result in \rd.
 *
 * The rotate is assembled from a 32-bit left shift by (32 - \imm) into t0,
 * a 32-bit right shift by \imm into \rd, and an OR of the two halves.
 *
 * t0 is clobbered on every expansion, so neither \rd nor \rs may be t0.
 * The shift into t0 is issued before \rd is written, which keeps the macro
 * correct when \rd and \rs name the same register (every expansion through
 * OP_4REG in this file does that).
 *
 * \imm must be in 1..31 so that both shift amounts are encodable; this file
 * only uses 16, 20, 24 and 25.
 */
.macro ROTRI rd rs imm
slliw t0, \rs, 32 - \imm
srliw \rd, \rs, \imm
or \rd, \rd, t0
.endm
/*
 * OP_4REG op, d0, d1, d2, d3, s0, s1, s2, s3 - apply one three-operand
 * operation to four destination/source pairs:
 *
 *	\dN = \op(\dN, \sN)	for N = 0, 1, 2, 3, emitted in that order.
 *
 * \op is either an instruction mnemonic (addw and xor are used in this file)
 * or a macro with the same "rd, rs, operand" shape such as ROTRI, in which
 * case each \sN is an immediate rather than a register.
 *
 * The comma-separated packs #defined inside the function below (line0,
 * line1_perm, _16, ...) are intended to be passed as the d0-d3 and s0-s3
 * groups.
 */
.macro OP_4REG op d0 d1 d2 d3 s0 s1 s2 s3
\op \d0, \d0, \s0
\op \d1, \d1, \s1
\op \d2, \d2, \s2
\op \d3, \d3, \s3
.endm
/*
 * __arch_chacha20_blocks_nostack - write ChaCha20 blocks to a buffer while
 * keeping the key and the cipher state in registers only.
 *
 *	a0: output bytes
 *	a1: 32-byte key input
 *	a2: 8-byte counter input/output
 *	a3: number of 64-byte blocks to write to output
 *
 * Each block is produced from the constant "expand 32-byte k", the eight key
 * words, the 64-bit counter in state[12,13] and zeros in state[14,15], run
 * through 10 iterations of an odd plus an even round. The counter is
 * incremented once per block and the final value is stored back through a2.
 *
 * a3 must be non-zero: the block loop decrements and tests the count only
 * after a block has been written.
 *
 * Register use on return:
 *	a0		advanced past the last block written
 *	a3, a4		zero (exhausted loop counters)
 *	a5-a7, t0, t1	zeroed explicitly (state[12..15] and the ROTRI scratch)
 *	t2		the updated counter value
 *	t3-t6		the four "expand 32-byte k" constant words
 *	s0-s11		restored from the stack
 *	a1, a2		unchanged
 *
 * Stack use: 12*SZREG bytes, holding only the caller's s0-s11.
 */
SYM_FUNC_START(__arch_chacha20_blocks_nostack)

#define output		a0
#define key		a1
#define counter		a2
#define nblocks		a3
#define i		a4
#define state0		s0
#define state1		s1
#define state2		s2
#define state3		s3
#define state4		s4
#define state5		s5
#define state6		s6
#define state7		s7
#define state8		s8
#define state9		s9
#define state10		s10
#define state11		s11
#define state12		a5
#define state13		a6
#define state14		a7
#define state15		t1
#define cnt		t2
#define copy0		t3
#define copy1		t4
#define copy2		t5
#define copy3		t6

/* Packs to be used with OP_4REG */
#define line0		state0, state1, state2, state3
#define line1		state4, state5, state6, state7
#define line2		state8, state9, state10, state11
#define line3		state12, state13, state14, state15

/* The same rows rotated by 1, 2 and 3 positions, used by the even round */
#define line1_perm	state5, state6, state7, state4
#define line2_perm	state10, state11, state8, state9
#define line3_perm	state15, state12, state13, state14

#define copy		copy0, copy1, copy2, copy3

/* Right-rotate amounts; equal to left rotates by 16, 12, 8 and 7 */
#define _16		16, 16, 16, 16
#define _20		20, 20, 20, 20
#define _24		24, 24, 24, 24
#define _25		25, 25, 25, 25

	/*
	 * The ABI requires s0-s11 saved.
	 * This does not violate the stack-less requirement: no sensitive data
	 * is spilled onto the stack.
	 */
	addi		sp, sp, -12*SZREG
	REG_S		s0, (sp)
	REG_S		s1, SZREG(sp)
	REG_S		s2, 2*SZREG(sp)
	REG_S		s3, 3*SZREG(sp)
	REG_S		s4, 4*SZREG(sp)
	REG_S		s5, 5*SZREG(sp)
	REG_S		s6, 6*SZREG(sp)
	REG_S		s7, 7*SZREG(sp)
	REG_S		s8, 8*SZREG(sp)
	REG_S		s9, 9*SZREG(sp)
	REG_S		s10, 10*SZREG(sp)
	REG_S		s11, 11*SZREG(sp)

	/* cnt carries the whole 64-bit counter across all blocks */
	ld		cnt, (counter)

	/* copy[0,1,2,3] = "expand 32-byte k", kept for the final addition */
	li		copy0, 0x61707865
	li		copy1, 0x3320646e
	li		copy2, 0x79622d32
	li		copy3, 0x6b206574

.Lblock:
	/* state[0,1,2,3] = "expand 32-byte k" */
	mv		state0, copy0
	mv		state1, copy1
	mv		state2, copy2
	mv		state3, copy3

	/* state[4,5,..,11] = key */
	lw		state4, (key)
	lw		state5, 4(key)
	lw		state6, 8(key)
	lw		state7, 12(key)
	lw		state8, 16(key)
	lw		state9, 20(key)
	lw		state10, 24(key)
	lw		state11, 28(key)

	/*
	 * state[12,13] = counter
	 * state12 keeps the high half of cnt in its upper bits; that is
	 * harmless because only 32-bit operations consume the state words.
	 */
	mv		state12, cnt
	srli		state13, cnt, 32

	/* state[14,15] = 0 */
	mv		state14, zero
	mv		state15, zero

	/* 10 iterations of (odd round, even round) */
	li		i, 10
.Lpermute:
	/* odd round */
	OP_4REG	addw	line0, line1
	OP_4REG	xor	line3, line0
	OP_4REG	ROTRI	line3, _16

	OP_4REG	addw	line2, line3
	OP_4REG	xor	line1, line2
	OP_4REG	ROTRI	line1, _20

	OP_4REG	addw	line0, line1
	OP_4REG	xor	line3, line0
	OP_4REG	ROTRI	line3, _24

	OP_4REG	addw	line2, line3
	OP_4REG	xor	line1, line2
	OP_4REG	ROTRI	line1, _25

	/* even round */
	OP_4REG	addw	line0, line1_perm
	OP_4REG	xor	line3_perm, line0
	OP_4REG	ROTRI	line3_perm, _16

	OP_4REG	addw	line2_perm, line3_perm
	OP_4REG	xor	line1_perm, line2_perm
	OP_4REG	ROTRI	line1_perm, _20

	OP_4REG	addw	line0, line1_perm
	OP_4REG	xor	line3_perm, line0
	OP_4REG	ROTRI	line3_perm, _24

	OP_4REG	addw	line2_perm, line3_perm
	OP_4REG	xor	line1_perm, line2_perm
	OP_4REG	ROTRI	line1_perm, _25

	addi		i, i, -1
	bnez		i, .Lpermute

	/* output[0,1,2,3] = copy[0,1,2,3] + state[0,1,2,3] */
	OP_4REG	addw	line0, copy
	sw		state0, (output)
	sw		state1, 4(output)
	sw		state2, 8(output)
	sw		state3, 12(output)

	/* from now on state[0,1,2,3] are scratch registers */

	/* state[0,1,2,3] = lo(key) */
	lw		state0, (key)
	lw		state1, 4(key)
	lw		state2, 8(key)
	lw		state3, 12(key)

	/* output[4,5,6,7] = state[0,1,2,3] + state[4,5,6,7] */
	OP_4REG	addw	line1, line0
	sw		state4, 16(output)
	sw		state5, 20(output)
	sw		state6, 24(output)
	sw		state7, 28(output)

	/* state[0,1,2,3] = hi(key) */
	lw		state0, 16(key)
	lw		state1, 20(key)
	lw		state2, 24(key)
	lw		state3, 28(key)

	/* output[8,9,10,11] = state[0,1,2,3] + state[8,9,10,11] */
	OP_4REG	addw	line2, line0
	sw		state8, 32(output)
	sw		state9, 36(output)
	sw		state10, 40(output)
	sw		state11, 44(output)

	/* output[12,13,14,15] = state[12,13,14,15] + [cnt_lo, cnt_hi, 0, 0] */
	addw		state12, state12, cnt
	srli		state0, cnt, 32
	addw		state13, state13, state0
	sw		state12, 48(output)
	sw		state13, 52(output)
	sw		state14, 56(output)
	sw		state15, 60(output)

	/* ++counter */
	addi		cnt, cnt, 1

	/* output += 64 */
	addi		output, output, 64

	/* --nblocks */
	addi		nblocks, nblocks, -1
	bnez		nblocks, .Lblock

	/* counter = [cnt_lo, cnt_hi] */
	sd		cnt, (counter)

	/*
	 * Zero out the potentially sensitive regs, in case nothing uses these
	 * again. At this point copy[0,1,2,3] just contain "expand 32-byte k"
	 * and state[0,...,11] are s0-s11, which the epilogue restores. That
	 * leaves state[12,...,15] and t0: ROTRI uses t0 as its scratch, so it
	 * still holds a shifted copy of a state word from the final round.
	 */
	mv		state12, zero
	mv		state13, zero
	mv		state14, zero
	mv		state15, zero
	mv		t0, zero

	REG_L		s0, (sp)
	REG_L		s1, SZREG(sp)
	REG_L		s2, 2*SZREG(sp)
	REG_L		s3, 3*SZREG(sp)
	REG_L		s4, 4*SZREG(sp)
	REG_L		s5, 5*SZREG(sp)
	REG_L		s6, 6*SZREG(sp)
	REG_L		s7, 7*SZREG(sp)
	REG_L		s8, 8*SZREG(sp)
	REG_L		s9, 9*SZREG(sp)
	REG_L		s10, 10*SZREG(sp)
	REG_L		s11, 11*SZREG(sp)
	addi		sp, sp, 12*SZREG

	ret
SYM_FUNC_END(__arch_chacha20_blocks_nostack)
|