/*
 * SSE2 implementation of MORUS-640
 *
 * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
 * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */
11
12#include <linux/linkage.h>
13#include <asm/frame.h>
14
/*
 * Register assignment.
 *
 * STATE0..STATE4 hold the five 128-bit words of the MORUS-640 state.
 * KEY and MSG deliberately alias the same register: KEY is only referenced
 * by the init routine, MSG by every other routine in this file.
 * T0 and T1 are scratch registers.
 */
#define STATE0	%xmm0
#define STATE1	%xmm1
#define STATE2	%xmm2
#define STATE3	%xmm3
#define STATE4	%xmm4
#define KEY	%xmm5
#define MSG	%xmm5
#define T0	%xmm6
#define T1	%xmm7

/*
 * Pack four 2-bit dword selectors into the 8-bit immediate used with pshufd:
 * destination dword n receives source dword i<n>.
 */
#define SHUFFLE_MASK(i0, i1, i2, i3) \
	((i0) | ((i1) << 2) | ((i2) << 4) | ((i3) << 6))

/* Rotate the four dwords by one, two and three lanes respectively. */
#define MASK1	SHUFFLE_MASK(3, 0, 1, 2)
#define MASK2	SHUFFLE_MASK(2, 3, 0, 1)
#define MASK3	SHUFFLE_MASK(1, 2, 3, 0)
31
/*
 * Initialization constants: 32 bytes holding the Fibonacci numbers
 * F(0)..F(31), each reduced modulo 256. crypto_morus640_sse2_init loads the
 * first 16 bytes into STATE3 and the second 16 bytes into STATE4.
 */
.section	.rodata.cst16.morus640_const, "aM", @progbits, 32
.align 16
.Lmorus640_const_0:
	.byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d, 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
.Lmorus640_const_1:
	.byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1, 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd

/*
 * Byte indices 0..15. crypto_morus640_sse2_dec_tail compares the broadcast
 * byte count against this table to build a mask of the valid bytes.
 */
.section	.rodata.cst16.morus640_counter, "aM", @progbits, 16
.align 16
.Lmorus640_counter:
	.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
46
.text

/*
 * morus640_round: update one state word.
 *
 *   s0 = rotl32(s0 ^ (s1 & s2) ^ s3, b)    (each of the four dwords rotated)
 *   s3 = pshufd(s3, w)
 *
 * Arguments:
 *   s0..s3 - state registers as used above (s0 and s3 are modified)
 *   s4     - accepted so that call sites list all five words; not referenced
 *   b      - left-rotate amount in bits, applied per 32-bit lane
 *   w      - pshufd immediate applied to s3
 * Clobbers T0.
 */
.macro morus640_round s0, s1, s2, s3, s4, b, w
	/* s0 ^= s3 ^ (s1 & s2) */
	pxor		\s3, \s0
	movdqa		\s1, T0
	pand		\s2, T0
	pxor		T0, \s0

	/* rotate every dword of s0 left by b: (x << b) ^ (x >> (32 - b)) */
	movdqa		\s0, T0
	pslld		$\b, T0
	psrld		$(32 - \b), \s0
	pxor		T0, \s0

	/* permute the dwords of s3 */
	pshufd		$\w, \s3, \s3
.endm
60
/*
 * __morus640_update: run the five state rounds, absorbing one message block
 * (internal ABI)
 *
 * In:
 *   STATE0..STATE4 - current state
 *   MSG            - message block (read only, left unmodified)
 * Out:
 *   STATE0..STATE4 - updated state
 * Clobbers:
 *   T0
 *
 * MSG is XORed into STATE1..STATE4 immediately before the round that
 * rewrites that word; the STATE0 round takes no message input.
 */
__morus640_update:
	/* round 0: STATE0 */
	morus640_round	STATE0, STATE1, STATE2, STATE3, STATE4,  5, MASK1

	/* round 1: STATE1 */
	pxor		MSG, STATE1
	morus640_round	STATE1, STATE2, STATE3, STATE4, STATE0, 31, MASK2

	/* round 2: STATE2 */
	pxor		MSG, STATE2
	morus640_round	STATE2, STATE3, STATE4, STATE0, STATE1,  7, MASK3

	/* round 3: STATE3 */
	pxor		MSG, STATE3
	morus640_round	STATE3, STATE4, STATE0, STATE1, STATE2, 22, MASK2

	/* round 4: STATE4 */
	pxor		MSG, STATE4
	morus640_round	STATE4, STATE0, STATE1, STATE2, STATE3, 13, MASK1

	ret
ENDPROC(__morus640_update)
83
84
/*
 * __morus640_update_zero: run the five state rounds with no message input
 * (internal ABI)
 *
 * Same round sequence as __morus640_update, but nothing is XORed into the
 * state between rounds and MSG/KEY is never touched.
 *
 * In:
 *   STATE0..STATE4 - current state
 * Out:
 *   STATE0..STATE4 - updated state
 * Clobbers:
 *   T0
 */
__morus640_update_zero:
	morus640_round	STATE0, STATE1, STATE2, STATE3, STATE4,  5, MASK1
	morus640_round	STATE1, STATE2, STATE3, STATE4, STATE0, 31, MASK2
	morus640_round	STATE2, STATE3, STATE4, STATE0, STATE1,  7, MASK3
	morus640_round	STATE3, STATE4, STATE0, STATE1, STATE2, 22, MASK2
	morus640_round	STATE4, STATE0, STATE1, STATE2, STATE3, 13, MASK1

	ret
ENDPROC(__morus640_update_zero)
102
/*
 * __load_partial: load a partial block, zero-padded to 16 bytes
 * (internal ABI)
 *
 * In:
 *   %rsi - src
 *   %rcx - number of bytes to load; only bits 0-4 are examined
 * Out:
 *   MSG  - the loaded bytes in the low lanes, remaining bytes zero
 * Clobbers:
 *   T0, %r8, %r9
 *
 * The length is decomposed by its bits: the 1-, 2- and 4-byte pieces are
 * read from the end of the buffer towards the start and packed into %r9,
 * which becomes the low quadword of MSG. If an 8-byte piece is present it
 * is read from the start of the buffer and the packed tail moves to the
 * high quadword. Only bytes inside [src, src + bytes) are read.
 */
__load_partial:
	xor		%r9d, %r9d		/* packed tail bytes */
	pxor		MSG, MSG

	/* 1-byte piece at offset (bytes & 0x1E) */
	test		$0x1, %cl
	jz		.Lld_partial_1

	mov		%rcx, %r8
	and		$0x1E, %r8
	mov		(%rsi,%r8), %r9b

.Lld_partial_1:
	/* 2-byte piece at offset (bytes & 0x1C) */
	test		$0x2, %cl
	jz		.Lld_partial_2

	mov		%rcx, %r8
	and		$0x1C, %r8
	shl		$16, %r9
	mov		(%rsi,%r8), %r9w

.Lld_partial_2:
	/* 4-byte piece at offset (bytes & 0x18) */
	test		$0x4, %cl
	jz		.Lld_partial_4

	mov		%rcx, %r8
	and		$0x18, %r8
	shl		$32, %r9
	mov		(%rsi,%r8), %r8d	/* 32-bit load zero-extends %r8 */
	xor		%r8, %r9

.Lld_partial_4:
	movq		%r9, MSG

	/* 8-byte piece at offset (bytes & 0x10) */
	test		$0x8, %cl
	jz		.Lld_partial_8

	mov		%rcx, %r8
	and		$0x10, %r8
	pslldq		$8, MSG			/* tail moves to the high quadword */
	movq		(%rsi,%r8), T0
	pxor		T0, MSG

.Lld_partial_8:
	ret
ENDPROC(__load_partial)
168
/*
 * __store_partial: store the leading bytes of a block (internal ABI)
 *
 * In:
 *   %rdx - dst
 *   %ecx - number of bytes to store (bits 63:32 of %rcx are ignored)
 *   T0   - block to store
 * Clobbers:
 *   T0 (shifted right by 8 bytes when at least 8 bytes are stored)
 *   %r8, %r9, %r10
 *
 * The block is written as an 8-, 4-, 2- and 1-byte piece, each emitted only
 * if that many bytes remain, so at most 15 bytes are ever written.
 *
 * The byte count originates from a 32-bit C argument, so it is zero-extended
 * here and compared as an unsigned quantity.
 */
__store_partial:
	mov		%ecx, %r8d		/* remaining byte count, zero-extended */
	mov		%rdx, %r9		/* write cursor */

	movq		T0, %r10		/* low quadword of the block */

	cmp		$8, %r8d
	jb		.Lst_partial_8

	mov		%r10, (%r9)
	psrldq		$8, T0			/* continue with the high quadword */
	movq		T0, %r10

	sub		$8, %r8d
	add		$8, %r9

.Lst_partial_8:
	cmp		$4, %r8d
	jb		.Lst_partial_4

	mov		%r10d, (%r9)
	shr		$32, %r10

	sub		$4, %r8d
	add		$4, %r9

.Lst_partial_4:
	cmp		$2, %r8d
	jb		.Lst_partial_2

	mov		%r10w, (%r9)
	shr		$16, %r10

	sub		$2, %r8d
	add		$2, %r9

.Lst_partial_2:
	cmp		$1, %r8d
	jb		.Lst_partial_1

	mov		%r10b, (%r9)

.Lst_partial_1:
	ret
ENDPROC(__store_partial)
226
/*
 * void crypto_morus640_sse2_init(void *state, const void *key, const void *iv);
 *
 * Builds the initial state from the IV and key and writes it to *state.
 *
 * %rdi - state (80 bytes, written)
 * %rsi - key   (16 bytes, read)
 * %rdx - iv    (16 bytes, read)
 */
ENTRY(crypto_morus640_sse2_init)
	FRAME_BEGIN

	/* STATE0 = IV */
	movdqu		(%rdx), STATE0
	/* STATE1 = key; KEY keeps a copy for the final xor below */
	movdqu		(%rsi), KEY
	movdqa		KEY, STATE1
	/* STATE2 = all ones */
	pcmpeqd		STATE2, STATE2
	/*
	 * STATE3/STATE4 = constants. Addressed RIP-relative so the code does
	 * not depend on the absolute link address of the rodata section.
	 */
	movdqa		.Lmorus640_const_0(%rip), STATE3
	movdqa		.Lmorus640_const_1(%rip), STATE4

	/* update 16 times with zero (KEY survives: only T0 is clobbered) */
	.rept 16
	call		__morus640_update_zero
	.endr

	/* xor-in the key again after updates: */
	pxor		KEY, STATE1

	/* store the state: */
	movdqu		STATE0, (0 * 16)(%rdi)
	movdqu		STATE1, (1 * 16)(%rdi)
	movdqu		STATE2, (2 * 16)(%rdi)
	movdqu		STATE3, (3 * 16)(%rdi)
	movdqu		STATE4, (4 * 16)(%rdi)

	FRAME_END
	ret
ENDPROC(crypto_morus640_sse2_init)
275
/*
 * void crypto_morus640_sse2_ad(void *state, const void *data,
 *                              unsigned int length);
 *
 * Absorbs every complete 16-byte block of data into the state. The trailing
 * (length % 16) bytes are not processed; nothing is done when length < 16.
 *
 * %rdi - state (read and written)
 * %rsi - data
 * %edx - length in bytes
 */
ENTRY(crypto_morus640_sse2_ad)
	FRAME_BEGIN

	/*
	 * length is a 32-bit argument, so bits 63:32 of %rdx are not
	 * guaranteed to be zero: only %edx is examined, and it is compared
	 * as an unsigned value throughout.
	 */
	cmp		$16, %edx
	jb		.Lad_out

	/* load the state: */
	movdqu		(0 * 16)(%rdi), STATE0
	movdqu		(1 * 16)(%rdi), STATE1
	movdqu		(2 * 16)(%rdi), STATE2
	movdqu		(3 * 16)(%rdi), STATE3
	movdqu		(4 * 16)(%rdi), STATE4

	/* pick the aligned-load loop only if data is 16-byte aligned */
	test		$0xF, %sil
	jnz		.Lad_u_loop

.align 4
.Lad_a_loop:
	movdqa		(%rsi), MSG
	call		__morus640_update
	sub		$16, %edx
	add		$16, %rsi
	cmp		$16, %edx
	jae		.Lad_a_loop

	jmp		.Lad_cont
.align 4
.Lad_u_loop:
	movdqu		(%rsi), MSG
	call		__morus640_update
	sub		$16, %edx
	add		$16, %rsi
	cmp		$16, %edx
	jae		.Lad_u_loop

.Lad_cont:
	/* store the state: */
	movdqu		STATE0, (0 * 16)(%rdi)
	movdqu		STATE1, (1 * 16)(%rdi)
	movdqu		STATE2, (2 * 16)(%rdi)
	movdqu		STATE3, (3 * 16)(%rdi)
	movdqu		STATE4, (4 * 16)(%rdi)

.Lad_out:
	FRAME_END
	ret
ENDPROC(crypto_morus640_sse2_ad)
328
/*
 * void crypto_morus640_sse2_enc(void *state, const void *src, void *dst,
 *                               unsigned int length);
 *
 * Encrypts every complete 16-byte block of src into dst and absorbs the
 * plaintext into the state. The trailing (length % 16) bytes are not
 * processed; nothing is done when length < 16.
 *
 * Per block:
 *   dst = src ^ STATE0 ^ pshufd(STATE1, MASK3) ^ (STATE2 & STATE3)
 *
 * %rdi - state (read and written)
 * %rsi - src
 * %rdx - dst
 * %ecx - length in bytes
 */
ENTRY(crypto_morus640_sse2_enc)
	FRAME_BEGIN

	/*
	 * length is a 32-bit argument, so bits 63:32 of %rcx are not
	 * guaranteed to be zero: only %ecx is examined, and it is compared
	 * as an unsigned value throughout.
	 */
	cmp		$16, %ecx
	jb		.Lenc_out

	/* load the state: */
	movdqu		(0 * 16)(%rdi), STATE0
	movdqu		(1 * 16)(%rdi), STATE1
	movdqu		(2 * 16)(%rdi), STATE2
	movdqu		(3 * 16)(%rdi), STATE3
	movdqu		(4 * 16)(%rdi), STATE4

	/* use aligned accesses only if both src and dst are 16-byte aligned */
	mov		%rsi, %r8
	or		%rdx, %r8
	test		$0xF, %r8b
	jnz		.Lenc_u_loop

.align 4
.Lenc_a_loop:
	movdqa		(%rsi), MSG
	/* T0 = ciphertext block */
	movdqa		MSG, T0
	pxor		STATE0, T0
	pshufd		$MASK3, STATE1, T1
	pxor		T1, T0
	movdqa		STATE2, T1
	pand		STATE3, T1
	pxor		T1, T0
	movdqa		T0, (%rdx)

	/* absorb the plaintext (MSG) */
	call		__morus640_update
	sub		$16, %ecx
	add		$16, %rsi
	add		$16, %rdx
	cmp		$16, %ecx
	jae		.Lenc_a_loop

	jmp		.Lenc_cont
.align 4
.Lenc_u_loop:
	movdqu		(%rsi), MSG
	/* T0 = ciphertext block */
	movdqa		MSG, T0
	pxor		STATE0, T0
	pshufd		$MASK3, STATE1, T1
	pxor		T1, T0
	movdqa		STATE2, T1
	pand		STATE3, T1
	pxor		T1, T0
	movdqu		T0, (%rdx)

	/* absorb the plaintext (MSG) */
	call		__morus640_update
	sub		$16, %ecx
	add		$16, %rsi
	add		$16, %rdx
	cmp		$16, %ecx
	jae		.Lenc_u_loop

.Lenc_cont:
	/* store the state: */
	movdqu		STATE0, (0 * 16)(%rdi)
	movdqu		STATE1, (1 * 16)(%rdi)
	movdqu		STATE2, (2 * 16)(%rdi)
	movdqu		STATE3, (3 * 16)(%rdi)
	movdqu		STATE4, (4 * 16)(%rdi)

.Lenc_out:
	FRAME_END
	ret
ENDPROC(crypto_morus640_sse2_enc)
402
/*
 * void crypto_morus640_sse2_enc_tail(void *state, const void *src, void *dst,
 *                                    unsigned int length);
 *
 * Encrypts a final partial block: loads length bytes from src (zero-padded
 * to 16 bytes), writes the first length bytes of the ciphertext to dst
 * (at most 15 bytes are written), and absorbs the zero-padded plaintext
 * into the state.
 *
 * %rdi - state (read and written)
 * %rsi - src
 * %rdx - dst
 * %ecx - length in bytes
 */
ENTRY(crypto_morus640_sse2_enc_tail)
	FRAME_BEGIN

	/*
	 * length is a 32-bit argument, so bits 63:32 of %rcx are not
	 * guaranteed to be zero. Zero-extend it once here because the
	 * helpers called below take the byte count in %rcx.
	 */
	mov		%ecx, %ecx

	/* load the state: */
	movdqu		(0 * 16)(%rdi), STATE0
	movdqu		(1 * 16)(%rdi), STATE1
	movdqu		(2 * 16)(%rdi), STATE2
	movdqu		(3 * 16)(%rdi), STATE3
	movdqu		(4 * 16)(%rdi), STATE4

	/* MSG = zero-padded plaintext */
	call		__load_partial

	/* T0 = MSG ^ STATE0 ^ pshufd(STATE1, MASK3) ^ (STATE2 & STATE3) */
	movdqa		MSG, T0
	pxor		STATE0, T0
	pshufd		$MASK3, STATE1, T1
	pxor		T1, T0
	movdqa		STATE2, T1
	pand		STATE3, T1
	pxor		T1, T0

	/* write only the first length bytes of the ciphertext */
	call		__store_partial

	/* absorb the zero-padded plaintext */
	call		__morus640_update

	/* store the state: */
	movdqu		STATE0, (0 * 16)(%rdi)
	movdqu		STATE1, (1 * 16)(%rdi)
	movdqu		STATE2, (2 * 16)(%rdi)
	movdqu		STATE3, (3 * 16)(%rdi)
	movdqu		STATE4, (4 * 16)(%rdi)

	FRAME_END
	ret
ENDPROC(crypto_morus640_sse2_enc_tail)
442
/*
 * void crypto_morus640_sse2_dec(void *state, const void *src, void *dst,
 *                               unsigned int length);
 *
 * Decrypts every complete 16-byte block of src into dst and absorbs the
 * recovered plaintext into the state. The trailing (length % 16) bytes are
 * not processed; nothing is done when length < 16.
 *
 * Per block:
 *   dst = src ^ STATE0 ^ pshufd(STATE1, MASK3) ^ (STATE2 & STATE3)
 *
 * %rdi - state (read and written)
 * %rsi - src
 * %rdx - dst
 * %ecx - length in bytes
 */
ENTRY(crypto_morus640_sse2_dec)
	FRAME_BEGIN

	/*
	 * length is a 32-bit argument, so bits 63:32 of %rcx are not
	 * guaranteed to be zero: only %ecx is examined, and it is compared
	 * as an unsigned value throughout.
	 */
	cmp		$16, %ecx
	jb		.Ldec_out

	/* load the state: */
	movdqu		(0 * 16)(%rdi), STATE0
	movdqu		(1 * 16)(%rdi), STATE1
	movdqu		(2 * 16)(%rdi), STATE2
	movdqu		(3 * 16)(%rdi), STATE3
	movdqu		(4 * 16)(%rdi), STATE4

	/* use aligned accesses only if both src and dst are 16-byte aligned */
	mov		%rsi, %r8
	or		%rdx, %r8
	test		$0xF, %r8b
	jnz		.Ldec_u_loop

.align 4
.Ldec_a_loop:
	movdqa		(%rsi), MSG
	/* MSG = plaintext block */
	pxor		STATE0, MSG
	pshufd		$MASK3, STATE1, T0
	pxor		T0, MSG
	movdqa		STATE2, T0
	pand		STATE3, T0
	pxor		T0, MSG
	movdqa		MSG, (%rdx)

	/* absorb the plaintext (MSG) */
	call		__morus640_update
	sub		$16, %ecx
	add		$16, %rsi
	add		$16, %rdx
	cmp		$16, %ecx
	jae		.Ldec_a_loop

	jmp		.Ldec_cont
.align 4
.Ldec_u_loop:
	movdqu		(%rsi), MSG
	/* MSG = plaintext block */
	pxor		STATE0, MSG
	pshufd		$MASK3, STATE1, T0
	pxor		T0, MSG
	movdqa		STATE2, T0
	pand		STATE3, T0
	pxor		T0, MSG
	movdqu		MSG, (%rdx)

	/* absorb the plaintext (MSG) */
	call		__morus640_update
	sub		$16, %ecx
	add		$16, %rsi
	add		$16, %rdx
	cmp		$16, %ecx
	jae		.Ldec_u_loop

.Ldec_cont:
	/* store the state: */
	movdqu		STATE0, (0 * 16)(%rdi)
	movdqu		STATE1, (1 * 16)(%rdi)
	movdqu		STATE2, (2 * 16)(%rdi)
	movdqu		STATE3, (3 * 16)(%rdi)
	movdqu		STATE4, (4 * 16)(%rdi)

.Ldec_out:
	FRAME_END
	ret
ENDPROC(crypto_morus640_sse2_dec)
514
/*
 * void crypto_morus640_sse2_dec_tail(void *state, const void *src, void *dst,
 *                                    unsigned int length);
 *
 * Decrypts a final partial block: loads length bytes from src (zero-padded
 * to 16 bytes), writes the first length bytes of the plaintext to dst
 * (at most 15 bytes are written), then clears the plaintext bytes at
 * positions >= length and absorbs the result into the state.
 *
 * %rdi - state (read and written)
 * %rsi - src
 * %rdx - dst
 * %ecx - length in bytes
 */
ENTRY(crypto_morus640_sse2_dec_tail)
	FRAME_BEGIN

	/*
	 * length is a 32-bit argument, so bits 63:32 of %rcx are not
	 * guaranteed to be zero. Zero-extend it once here because the
	 * helpers called below take the byte count in %rcx.
	 */
	mov		%ecx, %ecx

	/* load the state: */
	movdqu		(0 * 16)(%rdi), STATE0
	movdqu		(1 * 16)(%rdi), STATE1
	movdqu		(2 * 16)(%rdi), STATE2
	movdqu		(3 * 16)(%rdi), STATE3
	movdqu		(4 * 16)(%rdi), STATE4

	/* MSG = zero-padded ciphertext */
	call		__load_partial

	/* MSG ^= STATE0 ^ pshufd(STATE1, MASK3) ^ (STATE2 & STATE3) */
	pxor		STATE0, MSG
	pshufd		$MASK3, STATE1, T0
	pxor		T0, MSG
	movdqa		STATE2, T0
	pand		STATE3, T0
	pxor		T0, MSG

	/* write only the first length bytes (T0 is consumed by the helper) */
	movdqa		MSG, T0
	call		__store_partial

	/*
	 * Mask with byte count: broadcast the low byte of the length to all
	 * 16 bytes of T0, then set byte i of T0 to 0xff where length > i.
	 * The padding bytes of MSG hold keystream after the xor above and
	 * must be cleared before the block is absorbed.
	 */
	movq		%rcx, T0
	punpcklbw	T0, T0
	punpcklbw	T0, T0
	punpcklbw	T0, T0
	punpcklbw	T0, T0
	/* RIP-relative: no dependence on the absolute address of rodata */
	movdqa		.Lmorus640_counter(%rip), T1
	pcmpgtb		T1, T0
	pand		T0, MSG

	call		__morus640_update

	/* store the state: */
	movdqu		STATE0, (0 * 16)(%rdi)
	movdqu		STATE1, (1 * 16)(%rdi)
	movdqu		STATE2, (2 * 16)(%rdi)
	movdqu		STATE3, (3 * 16)(%rdi)
	movdqu		STATE4, (4 * 16)(%rdi)

	FRAME_END
	ret
ENDPROC(crypto_morus640_sse2_dec_tail)
564
/*
 * void crypto_morus640_sse2_final(void *state, void *tag_xor,
 *                                 u64 assoclen, u64 cryptlen);
 *
 * Finalizes the state with the two lengths and XORs the resulting 16-byte
 * tag into the buffer at tag_xor. The state in memory is only read; it is
 * not written back.
 *
 * %rdi - state (read)
 * %rsi - tag_xor (16 bytes, read and written)
 * %rdx - assoclen in bytes
 * %rcx - cryptlen in bytes
 */
ENTRY(crypto_morus640_sse2_final)
	FRAME_BEGIN

	/* bring the five state words into registers */
	movdqu		(0 * 16)(%rdi), STATE0
	movdqu		(1 * 16)(%rdi), STATE1
	movdqu		(2 * 16)(%rdi), STATE2
	movdqu		(3 * 16)(%rdi), STATE3
	movdqu		(4 * 16)(%rdi), STATE4

	/* STATE4 ^= STATE0 */
	pxor		STATE0, STATE4

	/*
	 * Length block: low quadword = assoclen * 8, high quadword =
	 * cryptlen * 8 (both lengths converted from bytes to bits).
	 */
	movq		%rdx, MSG
	movq		%rcx, T0
	punpcklqdq	T0, MSG
	psllq		$3, MSG

	/* ten updates with the same length block (MSG is left unmodified) */
	.rept 10
	call		__morus640_update
	.endr

	/*
	 * *tag_xor ^= STATE0 ^ pshufd(STATE1, MASK3) ^ (STATE2 & STATE3)
	 */
	movdqu		(%rsi), MSG

	pxor		STATE0, MSG
	pshufd		$MASK3, STATE1, T0
	pxor		T0, MSG
	movdqa		STATE2, T0
	pand		STATE3, T0
	pxor		T0, MSG

	movdqu		MSG, (%rsi)

	FRAME_END
	ret
ENDPROC(crypto_morus640_sse2_final)
615ENDPROC(crypto_morus640_sse2_final)