/*
 * Author: Anton Blanchard <anton@au.ibm.com>
 * Copyright 2015 IBM Corporation.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/ppc_asm.h>
#include <asm/export.h>
#include <asm/ppc-opcode.h>

#define off8	r6
#define off16	r7
#define off24	r8

#define rA	r9
#define rB	r10
#define rC	r11
#define rD	r27
#define rE	r28
#define rF	r29
#define rG	r30
#define rH	r31

#ifdef __LITTLE_ENDIAN__
#define LH	lhbrx
#define LW	lwbrx
#define LD	ldbrx
#define LVS	lvsr
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRB,_VRA,_VRC
#else
#define LH	lhzx
#define LW	lwzx
#define LD	ldx
#define LVS	lvsl
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRA,_VRB,_VRC
#endif

#define VMX_THRESH 4096
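/*
 * ENTER_VMX_OPS/EXIT_VMX_OPS wrap the calls to enter_vmx_ops()/
 * exit_vmx_ops() so that r3, r4, r5 and the link register survive the
 * call.  After ENTER_VMX_OPS, cr1 is EQ when enter_vmx_ops() returned 0,
 * i.e. VMX must not be used, and the callers below then fall back to
 * the non-VMX paths.
 */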
#define ENTER_VMX_OPS	\
	mflr	r0;	\
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std	r0,16(r1); \
	stdu	r1,-STACKFRAMESIZE(r1); \
	bl	enter_vmx_ops; \
	cmpwi	cr1,r3,0; \
	ld	r0,STACKFRAMESIZE+16(r1); \
	ld	r3,STK_REG(R31)(r1); \
	ld	r4,STK_REG(R30)(r1); \
	ld	r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr	r0

#define EXIT_VMX_OPS \
	mflr	r0; \
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std	r0,16(r1); \
	stdu	r1,-STACKFRAMESIZE(r1); \
	bl	exit_vmx_ops; \
	ld	r0,STACKFRAMESIZE+16(r1); \
	ld	r3,STK_REG(R31)(r1); \
	ld	r4,STK_REG(R30)(r1); \
	ld	r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr	r0

/*
 * LD_VSR_CROSS16B loads the 2nd 16 bytes for _vaddr, which is not aligned
 * to a 16-byte boundary, and permutes the result with the 1st 16 bytes.
 *
 * | y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
 * ^                                 ^                                 ^
 * 0xbbbb10                          0xbbbb20                          0xbbbb30
 *                             ^
 *                             _vaddr
 *
 *
 * _vmask is the mask generated by LVS.
 * _v1st_qw is the 1st aligned QW of the current addr, which is already loaded;
 *   for example: 0xyyyyyyyyyyyyy012 for big endian
 * _v2nd_qw is the 2nd aligned QW of _vaddr, to be loaded;
 *   for example: 0x3456789abcdefzzz for big endian
 * The permute result is saved in _v_res;
 *   for example: 0x0123456789abcdef for big endian.
 */
#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
	lvx	_v2nd_qw,_vaddr,off16; \
	VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)

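/*
 * Illustrative C sketch only (not part of the build): the combined
 * effect of the caller's aligned lvx load plus LD_VSR_CROSS16B, in
 * scalar terms.  load_cross16b() is a made-up name for this sketch;
 * the real code uses lvx and VPERM with the LVS-generated mask.
 *
 *	static inline void load_cross16b(const unsigned char *p,
 *					 unsigned char out[16])
 *	{
 *		const unsigned char *base =
 *			(const unsigned char *)((unsigned long)p & ~15UL);
 *		unsigned long off = (unsigned long)p & 15;
 *		int i;
 *
 *		// out[] receives the 16 bytes starting at the unaligned p,
 *		// assembled from the two aligned quadwords straddling it
 *		for (i = 0; i < 16; i++)
 *			out[i] = (off + i < 16) ? base[off + i]
 *						: (base + 16)[off + i - 16];
 *	}
 */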
/*
 * There are 2 categories for memcmp:
 * 1) src/dst have the same offset from the 8-byte boundary. The handlers
 *    are named like .Lsameoffset_xxxx
 * 2) src/dst have different offsets from the 8-byte boundary. The handlers
 *    are named like .Ldiffoffset_xxxx
 */
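/*
 * Illustrative C sketch only (not part of the build) of the dispatch
 * done right below; sameoffset_cmp()/diffoffset_cmp() are made-up names
 * standing for the .Lsameoffset_* and .Ldiffoffset_* paths, and s1/s2/n
 * are memcmp()'s arguments.
 *
 *	if ((((unsigned long)s1 ^ (unsigned long)s2) & 0x7) == 0)
 *		return sameoffset_cmp(s1, s2, n);
 *	else
 *		return diffoffset_cmp(s1, s2, n);
 */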
_GLOBAL_TOC(memcmp)
	cmpdi	cr1,r5,0

	/* Check whether the src/dst addresses share the same offset
	 * from an 8-byte alignment boundary; cr0 is used at .Lno_short
	 * below to select the .Ldiffoffset path when they do not.
	 */
	xor	r6,r3,r4
	andi.	r6,r6,7

	/* Fall back to the short loop when comparing fewer than
	 * 8 bytes.
	 */
	cmpdi	cr6,r5,7

	beq	cr1,.Lzero
	bgt	cr6,.Lno_short

.Lshort:
	mtctr	r5
1:	lbz	rA,0(r3)
	lbz	rB,0(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,1(r3)
	lbz	rB,1(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,2(r3)
	lbz	rB,2(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,3(r3)
	lbz	rB,3(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero

	addi	r3,r3,4
	addi	r4,r4,4

	bdnz	1b

.Lzero:
	li	r3,0
	blr

.Lno_short:
	dcbt	0,r3
	dcbt	0,r4
	bne	.Ldiffoffset_8bytes_make_align_start


.Lsameoffset_8bytes_make_align_start:
	/* Compare the leading bytes that are not 8-byte aligned so that
	 * the rest of the comparison can run on 8-byte alignment.
	 */
	andi.	r6,r3,7

	/* Try to compare the first double word which is not 8-byte aligned:
	 * load the first double word at (src & ~7UL) and shift left the
	 * appropriate number of bits before the comparison.
	 */
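	/*
	 * Illustrative C sketch only (not part of the build), assuming a
	 * hypothetical load_be64() helper that loads 8 bytes in memory
	 * order into an integer whose most significant byte is the first
	 * byte in memory (which is what LD yields on either endianness);
	 * s1/s2 are memcmp()'s arguments.
	 *
	 *	static unsigned long load_be64(const void *p)
	 *	{
	 *		const unsigned char *c = p;
	 *		unsigned long v = 0;
	 *		int i;
	 *
	 *		for (i = 0; i < 8; i++)
	 *			v = (v << 8) | c[i];
	 *		return v;
	 *	}
	 *
	 *	// head handling: s1/s2 share the same low 3 address bits
	 *	unsigned long bits = ((unsigned long)s1 & 0x7) * 8;
	 *	unsigned long a = load_be64((void *)((unsigned long)s1 & ~7UL));
	 *	unsigned long b = load_be64((void *)((unsigned long)s2 & ~7UL));
	 *
	 *	a <<= bits;	// discard the bytes that precede s1
	 *	b <<= bits;
	 *	if (a != b)
	 *		return a > b ? 1 : -1;
	 */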
	rlwinm	r6,r3,3,26,28
	beq	.Lsameoffset_8bytes_aligned
	clrrdi	r3,r3,3
	clrrdi	r4,r4,3
	LD	rA,0,r3
	LD	rB,0,r4
	sld	rA,rA,r6
	sld	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3
	bne	cr0,.LcmpAB_lightweight
	subfic	r6,r6,8
	subf.	r5,r6,r5
	addi	r3,r3,8
	addi	r4,r4,8
	beq	.Lzero

.Lsameoffset_8bytes_aligned:
	/* Now we are aligned to 8 bytes.
	 * Use the .Llong loop if 32 or more bytes remain to be compared.
	 */
	cmpdi	cr6,r5,31
	bgt	cr6,.Llong

.Lcmp_lt32bytes:
	/* compare 1 ~ 31 bytes; at least the r3 addr is 8-byte aligned now */
	cmpdi	cr5,r5,7
	srdi	r0,r5,3
	ble	cr5,.Lcmp_rest_lt8bytes

	/* handle 8 ~ 31 bytes */
	clrldi	r5,r5,61
	mtctr	r0
2:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	bdnz	2b

	cmpwi	r5,0
	beq	.Lzero

.Lcmp_rest_lt8bytes:
	/* Here we have fewer than 8 bytes to compare. At least the s1
	 * address is aligned to 8 bytes.
	 * The next double words are loaded and shifted right by the
	 * appropriate number of bits.
	 */
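	/*
	 * Illustrative C sketch only (not part of the build), using the
	 * hypothetical load_be64() sketched earlier: keep only the first
	 * n remaining bytes of each doubleword by shifting the rest out.
	 *
	 *	unsigned long bits = (8 - n) * 8;
	 *	unsigned long a = load_be64(s1) >> bits;
	 *	unsigned long b = load_be64(s2) >> bits;
	 *
	 *	if (a != b)
	 *		return a > b ? 1 : -1;
	 *	return 0;
	 */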
	subfic	r6,r5,8
	slwi	r6,r6,3
	LD	rA,0,r3
	LD	rB,0,r4
	srd	rA,rA,r6
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero

.Lnon_zero:
	mr	r3,rC
	blr

.Llong:
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* Use the vmx loop if the length is 4K or more */
	cmpldi	cr6,r5,VMX_THRESH
	bge	cr6,.Lsameoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Llong_novmx_cmp:
#endif
	/* At least s1 addr is aligned with 8 bytes */
	li	off8,8
	li	off16,16
	li	off24,24

	std	r31,-8(r1)
	std	r30,-16(r1)
	std	r29,-24(r1)
	std	r28,-32(r1)
	std	r27,-40(r1)

	srdi	r0,r5,5
	mtctr	r0
	andi.	r5,r5,31

	LD	rA,0,r3
	LD	rB,0,r4

	LD	rC,off8,r3
	LD	rD,off8,r4

	LD	rE,off16,r3
	LD	rF,off16,r4

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lfirst32

	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lsecond32

	.balign	16

1:	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdnz	1b

.Lsecond32:
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

.Ltail:
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)

	cmpdi	r5,0
	beq	.Lzero
	b	.Lshort

.Lfirst32:
	cmpld	cr1,rC,rD
	cmpld	cr6,rE,rF
	cmpld	cr7,rG,rH

	bne	cr0,.LcmpAB
	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

	b	.Ltail

.LcmpAB:
	li	r3,1
	bgt	cr0,.Lout
	li	r3,-1
	b	.Lout

.LcmpCD:
	li	r3,1
	bgt	cr1,.Lout
	li	r3,-1
	b	.Lout

.LcmpEF:
	li	r3,1
	bgt	cr6,.Lout
	li	r3,-1
	b	.Lout

.LcmpGH:
	li	r3,1
	bgt	cr7,.Lout
	li	r3,-1

.Lout:
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)
	blr

.LcmpAB_lightweight:	/* skip NV GPRS restore */
	li	r3,1
	bgtlr
	li	r3,-1
	blr

#ifdef CONFIG_ALTIVEC
.Lsameoffset_vmx_cmp:
	/* Entered with src/dst addresses that have the same offset from
	 * an 8-byte alignment boundary.
	 *
	 * There is an optimization based on the following fact: memcmp()
	 * tends to find a difference early, within the first 32 bytes.
	 * Before using VMX instructions, which incur a 32x128-bit VMX
	 * register save/restore penalty, we compare the first 32 bytes
	 * so that we can catch the ~80% of cases that differ within them.
	 */
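	/*
	 * Illustrative C sketch only (not part of the build) of the
	 * 32-byte pre-check, again using the hypothetical load_be64()
	 * sketched earlier:
	 *
	 *	for (int i = 0; i < 4; i++) {	// 4 x 8 = 32 bytes
	 *		unsigned long a = load_be64(s1), b = load_be64(s2);
	 *
	 *		if (a != b)
	 *			return a > b ? 1 : -1;
	 *		s1 += 8; s2 += 8; n -= 8;
	 *	}
	 *	// only now pay the enter_vmx_ops()/exit_vmx_ops() cost
	 */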

	li	r0,4
	mtctr	r0
.Lsameoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Lsameoffset_prechk_32B_loop

	ENTER_VMX_OPS
	beq	cr1,.Llong_novmx_cmp

3:
	/* Check whether r4 has the same offset as r3 from a 16-byte
	 * boundary.
	 */
	xor	r0,r3,r4
	andi.	r0,r0,0xf
	bne	.Ldiffoffset_vmx_cmp_start

	/* The length is no less than 4KB. Align to a 16-byte boundary
	 * before the vector loop.
	 */
	andi.	rA,r3,8
	LD	rA,0,r3
	beq	4f
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	addi	r5,r5,-8

	beq	cr0,4f
	/* save and restore cr0 */
	mfocrf	r5,128
	EXIT_VMX_OPS
	mtocrf	128,r5
	b	.LcmpAB_lightweight

4:
	/* compare 32 bytes in each loop iteration */
	srdi	r0,r5,5
	mtctr	r0
	clrldi	r5,r5,59
	li	off16,16

.balign 16
5:
	lvx	v0,0,r3
	lvx	v1,0,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,7f
	lvx	v0,off16,r3
	lvx	v1,off16,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,6f
	addi	r3,r3,32
	addi	r4,r4,32
	bdnz	5b

	EXIT_VMX_OPS
	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

6:
	addi	r3,r3,16
	addi	r4,r4,16

7:
	/* the difference is within these 16 bytes; recheck them with 8-byte loads */
	EXIT_VMX_OPS
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	li	off8,8
	bne	cr0,.LcmpAB_lightweight

	LD	rA,off8,r3
	LD	rB,off8,r4
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero
#endif

.Ldiffoffset_8bytes_make_align_start:
	/* now try to align s1 with 8 bytes */
	rlwinm	r6,r3,3,26,28
	beq	.Ldiffoffset_align_s1_8bytes

	clrrdi	r3,r3,3
	LD	rA,0,r3
	LD	rB,0,r4	/* unaligned load */
	sld	rA,rA,r6
	srd	rA,rA,r6
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3
	bne	cr0,.LcmpAB_lightweight

	subfic	r6,r6,8
	subf.	r5,r6,r5
	addi	r3,r3,8
	add	r4,r4,r6

	beq	.Lzero

.Ldiffoffset_align_s1_8bytes:
	/* now s1 is aligned with 8 bytes. */
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* only do vmx ops when the size is 4K bytes or more */
	cmpdi	cr5,r5,VMX_THRESH
	bge	cr5,.Ldiffoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Ldiffoffset_novmx_cmp:
#endif


	cmpdi	cr5,r5,31
	ble	cr5,.Lcmp_lt32bytes

#ifdef CONFIG_ALTIVEC
	b	.Llong_novmx_cmp
#else
	b	.Llong
#endif

#ifdef CONFIG_ALTIVEC
.Ldiffoffset_vmx_cmp:
	/* Perform a 32-byte pre-check before enabling
	 * VMX operations.
	 */
	li	r0,4
	mtctr	r0
.Ldiffoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Ldiffoffset_prechk_32B_loop

	ENTER_VMX_OPS
	beq	cr1,.Ldiffoffset_novmx_cmp

.Ldiffoffset_vmx_cmp_start:
	/* First, try to align r3 to 16 bytes */
	andi.	r6,r3,0xf
	li	off16,16
	beq	.Ldiffoffset_vmx_s1_16bytes_align

	LVS	v3,0,r3
	LVS	v4,0,r4

	lvx	v5,0,r3
	lvx	v6,0,r4
	LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)

	VCMPEQUB_RC(v7,v9,v10)
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	subfic	r6,r6,16
	subf	r5,r6,r5
	add	r3,r3,r6
	add	r4,r4,r6

.Ldiffoffset_vmx_s1_16bytes_align:
	/* now s1 is aligned with 16 bytes */
	lvx	v6,0,r4
	LVS	v4,0,r4
	srdi	r6,r5,5	/* loop for 32 bytes each */
	clrldi	r5,r5,59
	mtctr	r6

.balign 16
.Ldiffoffset_vmx_32bytesloop:
	/* the first qw of r4 was saved in v6 */
	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	bdnz	.Ldiffoffset_vmx_32bytesloop

	EXIT_VMX_OPS

	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

.Ldiffoffset_vmx_diff_found:
	EXIT_VMX_OPS
	/* either way, the difference will appear within the next 16 bytes */
	li	r5,16
	b	.Lcmp_lt32bytes

#endif
EXPORT_SYMBOL(memcmp)