/*
 * Copyright (C) 2002 Paul Mackerras, IBM Corp.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/processor.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>
#include <asm/asm-compat.h>
#include <asm/feature-fixups.h>

#ifndef SELFTEST_CASE
/* 0 == most CPUs, 1 == POWER6, 2 == Cell */
#define SELFTEST_CASE	0
#endif
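/*
 * SELFTEST_CASE and the test_feature assignments below are meant for the
 * user-space copy-loop selftests, which can build this file with the
 * feature-fixup macros stubbed out so that test_feature selects which
 * alternative is assembled.  In a kernel build the runtime CPU feature
 * bits decide instead.
 */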

#ifdef __BIG_ENDIAN__
#define sLd sld		/* Shift towards low-numbered address. */
#define sHd srd		/* Shift towards high-numbered address. */
#else
#define sLd srd		/* Shift towards low-numbered address. */
#define sHd sld		/* Shift towards high-numbered address. */
#endif

/*
 * These macros are used to generate exception table entries.
 * The exception handlers below use the original arguments
 * (stored on the stack) and the point where we're up to in
 * the destination buffer, i.e. the address of the first
 * unmodified byte.  Generally r3 points into the destination
 * buffer, but the first unmodified byte is at a variable
 * offset from r3.  In the code below, the symbol r3_offset
 * is set to indicate the current offset at each point in
 * the code.  This offset is then used as a negative offset
 * from the exception handler code, and those instructions
 * before the exception handlers are addi instructions that
 * adjust r3 to point to the correct place.
 */
	.macro lex		/* exception handler for load */
100:	EX_TABLE(100b, .Lld_exc - r3_offset)
	.endm

	.macro stex		/* exception handler for store */
100:	EX_TABLE(100b, .Lst_exc - r3_offset)
	.endm

	.align	7
_GLOBAL_TOC(__copy_tofrom_user)
#ifdef CONFIG_PPC_BOOK3S_64
BEGIN_FTR_SECTION
	nop
FTR_SECTION_ELSE
	b	__copy_tofrom_user_power7
ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
#endif
_GLOBAL(__copy_tofrom_user_base)
	/* first check for a 4kB copy on a 4kB boundary */
	cmpldi	cr1,r5,16
	cmpdi	cr6,r5,4096
	or	r0,r3,r4
	neg	r6,r3		/* LS 3 bits = # bytes to 8-byte dest bdry */
	andi.	r0,r0,4095
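	/*
	 * Stash the original arguments below the stack pointer; the
	 * exception handlers further down reload them to work out how
	 * many bytes were left uncopied.
	 */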
	std	r3,-24(r1)
	crand	cr0*4+2,cr0*4+2,cr6*4+2
	std	r4,-16(r1)
	std	r5,-8(r1)
	dcbt	0,r4
	beq	.Lcopy_page_4K
	andi.	r6,r6,7
	PPC_MTOCRF(0x01,r5)
	blt	cr1,.Lshort_copy
/* Below we want to nop out the bne if we're on a CPU that has the
 * CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit
 * cleared.
 * At the time of writing the only CPU that has this combination of bits
 * set is Power6.
 */
test_feature = (SELFTEST_CASE == 1)
BEGIN_FTR_SECTION
	nop
FTR_SECTION_ELSE
	bne	.Ldst_unaligned
ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
		    CPU_FTR_UNALIGNED_LD_STD)
.Ldst_aligned:
	addi	r3,r3,-16
r3_offset = 16
test_feature = (SELFTEST_CASE == 0)
BEGIN_FTR_SECTION
	andi.	r0,r4,7
	bne	.Lsrc_unaligned
END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
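	/*
	 * Main copy loop: moves 32 bytes per iteration (ctr = len / 32),
	 * with the loads issued a pair of doublewords ahead of the
	 * stores to hide load latency.  The 0x10 test below handles a
	 * leftover 16-byte chunk.
	 */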
	blt	cr1,.Ldo_tail		/* if < 16 bytes to copy */
	srdi	r0,r5,5
	cmpdi	cr1,r0,0
lex;	ld	r7,0(r4)
lex;	ld	r6,8(r4)
	addi	r4,r4,16
	mtctr	r0
	andi.	r0,r5,0x10
	beq	22f
	addi	r3,r3,16
r3_offset = 0
	addi	r4,r4,-16
	mr	r9,r7
	mr	r8,r6
	beq	cr1,72f
21:
lex;	ld	r7,16(r4)
lex;	ld	r6,24(r4)
	addi	r4,r4,32
stex;	std	r9,0(r3)
r3_offset = 8
stex;	std	r8,8(r3)
r3_offset = 16
22:
lex;	ld	r9,0(r4)
lex;	ld	r8,8(r4)
stex;	std	r7,16(r3)
r3_offset = 24
stex;	std	r6,24(r3)
	addi	r3,r3,32
r3_offset = 0
	bdnz	21b
72:
stex;	std	r9,0(r3)
r3_offset = 8
stex;	std	r8,8(r3)
r3_offset = 16
	andi.	r5,r5,0xf
	beq+	3f
	addi	r4,r4,16
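/*
 * Copy the final 0-15 bytes.  cr7 still holds the low bits of the length
 * (from the mtocrf above): bit 0 selects an 8-byte move, bit 1 a 4-byte,
 * bit 2 a 2-byte and bit 3 a 1-byte move.
 */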
.Ldo_tail:
	addi	r3,r3,16
r3_offset = 0
	bf	cr7*4+0,246f
lex;	ld	r9,0(r4)
	addi	r4,r4,8
stex;	std	r9,0(r3)
	addi	r3,r3,8
246:	bf	cr7*4+1,1f
lex;	lwz	r9,0(r4)
	addi	r4,r4,4
stex;	stw	r9,0(r3)
	addi	r3,r3,4
1:	bf	cr7*4+2,2f
lex;	lhz	r9,0(r4)
	addi	r4,r4,2
stex;	sth	r9,0(r3)
	addi	r3,r3,2
2:	bf	cr7*4+3,3f
lex;	lbz	r9,0(r4)
stex;	stb	r9,0(r3)
3:	li	r3,0
	blr

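/*
 * Source is not 8-byte aligned (but the destination is).  Round the
 * source pointer down to an 8-byte boundary and read whole aligned
 * doublewords; each destination doubleword is built by shifting a pair
 * of adjacent source doublewords with sLd/sHd (r10 = misalignment in
 * bits, r11 = 64 - r10) and OR-ing the two halves together.
 */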
.Lsrc_unaligned:
r3_offset = 16
	srdi	r6,r5,3
	addi	r5,r5,-16
	subf	r4,r0,r4
	srdi	r7,r5,4
	sldi	r10,r0,3
	cmpldi	cr6,r6,3
	andi.	r5,r5,7
	mtctr	r7
	subfic	r11,r10,64
	add	r5,r5,r0
	bt	cr7*4+0,28f

lex;	ld	r9,0(r4)	/* 3+2n loads, 2+2n stores */
lex;	ld	r0,8(r4)
	sLd	r6,r9,r10
lex;	ldu	r9,16(r4)
	sHd	r7,r0,r11
	sLd	r8,r0,r10
	or	r7,r7,r6
	blt	cr6,79f
lex;	ld	r0,8(r4)
	b	2f

28:
lex;	ld	r0,0(r4)	/* 4+2n loads, 3+2n stores */
lex;	ldu	r9,8(r4)
	sLd	r8,r0,r10
	addi	r3,r3,-8
r3_offset = 24
	blt	cr6,5f
lex;	ld	r0,8(r4)
	sHd	r12,r9,r11
	sLd	r6,r9,r10
lex;	ldu	r9,16(r4)
	or	r12,r8,r12
	sHd	r7,r0,r11
	sLd	r8,r0,r10
	addi	r3,r3,16
r3_offset = 8
	beq	cr6,78f

1:	or	r7,r7,r6
lex;	ld	r0,8(r4)
stex;	std	r12,8(r3)
r3_offset = 16
2:	sHd	r12,r9,r11
	sLd	r6,r9,r10
lex;	ldu	r9,16(r4)
	or	r12,r8,r12
stex;	stdu	r7,16(r3)
r3_offset = 8
	sHd	r7,r0,r11
	sLd	r8,r0,r10
	bdnz	1b

78:
stex;	std	r12,8(r3)
r3_offset = 16
	or	r7,r7,r6
79:
stex;	std	r7,16(r3)
r3_offset = 24
5:	sHd	r12,r9,r11
	or	r12,r8,r12
stex;	std	r12,24(r3)
r3_offset = 32
	bne	6f
	li	r3,0
	blr
6:	cmpwi	cr1,r5,8
	addi	r3,r3,32
r3_offset = 0
	sLd	r9,r9,r10
	ble	cr1,7f
lex;	ld	r0,8(r4)
	sHd	r7,r0,r11
	or	r9,r7,r9
7:
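	/*
	 * Store the final 1-7 bytes.  r9 holds the remaining data; the
	 * rotates below bring the bytes still to be stored to the
	 * low-order end of r9 before each partial store (before the
	 * store on big-endian, after it on little-endian).
	 */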
	bf	cr7*4+1,1f
#ifdef __BIG_ENDIAN__
	rotldi	r9,r9,32
#endif
stex;	stw	r9,0(r3)
#ifdef __LITTLE_ENDIAN__
	rotrdi	r9,r9,32
#endif
	addi	r3,r3,4
1:	bf	cr7*4+2,2f
#ifdef __BIG_ENDIAN__
	rotldi	r9,r9,16
#endif
stex;	sth	r9,0(r3)
#ifdef __LITTLE_ENDIAN__
	rotrdi	r9,r9,16
#endif
	addi	r3,r3,2
2:	bf	cr7*4+3,3f
#ifdef __BIG_ENDIAN__
	rotldi	r9,r9,8
#endif
stex;	stb	r9,0(r3)
#ifdef __LITTLE_ENDIAN__
	rotrdi	r9,r9,8
#endif
3:	li	r3,0
	blr

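/*
 * Destination is not 8-byte aligned.  Copy 1, 2 and/or 4 bytes (as
 * selected by the low bits of r6, moved into cr7) to bring it up to an
 * 8-byte boundary, then rejoin the aligned path.  r7 counts the bytes
 * copied here, which is why these exception entries use the _r7 fixup
 * handlers.
 */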
.Ldst_unaligned:
r3_offset = 0
	PPC_MTOCRF(0x01,r6)	/* put #bytes to 8B bdry into cr7 */
	subf	r5,r6,r5
	li	r7,0
	cmpldi	cr1,r5,16
	bf	cr7*4+3,1f
100:	EX_TABLE(100b, .Lld_exc_r7)
	lbz	r0,0(r4)
100:	EX_TABLE(100b, .Lst_exc_r7)
	stb	r0,0(r3)
	addi	r7,r7,1
1:	bf	cr7*4+2,2f
100:	EX_TABLE(100b, .Lld_exc_r7)
	lhzx	r0,r7,r4
100:	EX_TABLE(100b, .Lst_exc_r7)
	sthx	r0,r7,r3
	addi	r7,r7,2
2:	bf	cr7*4+1,3f
100:	EX_TABLE(100b, .Lld_exc_r7)
	lwzx	r0,r7,r4
100:	EX_TABLE(100b, .Lst_exc_r7)
	stwx	r0,r7,r3
3:	PPC_MTOCRF(0x01,r5)
	add	r4,r6,r4
	add	r3,r6,r3
	b	.Ldst_aligned

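/*
 * Copies of less than 16 bytes: cr7 (set from the low bits of the
 * length) selects 8-, 4-, 2- and 1-byte moves in turn.
 */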
.Lshort_copy:
r3_offset = 0
	bf	cr7*4+0,1f
lex;	lwz	r0,0(r4)
lex;	lwz	r9,4(r4)
	addi	r4,r4,8
stex;	stw	r0,0(r3)
stex;	stw	r9,4(r3)
	addi	r3,r3,8
1:	bf	cr7*4+1,2f
lex;	lwz	r0,0(r4)
	addi	r4,r4,4
stex;	stw	r0,0(r3)
	addi	r3,r3,4
2:	bf	cr7*4+2,3f
lex;	lhz	r0,0(r4)
	addi	r4,r4,2
stex;	sth	r0,0(r3)
	addi	r3,r3,2
3:	bf	cr7*4+3,4f
lex;	lbz	r0,0(r4)
stex;	stb	r0,0(r3)
4:	li	r3,0
	blr

/*
 * exception handlers follow
 * we have to return the number of bytes not copied
 * for an exception on a load we don't zero the rest of the destination
 * here; the C copy_from_user() wrapper does that where it is needed
 * Note that the number of bytes of instructions for adjusting r3 needs
 * to equal the amount of the adjustment, due to the trick of using
 * .Lld_exc - r3_offset as the handler address.
 */

.Lld_exc_r7:
	add	r3,r3,r7
	b	.Lld_exc

	/* adjust by 24 */
	addi	r3,r3,8
	nop
	/* adjust by 16 */
	addi	r3,r3,8
	nop
	/* adjust by 8 */
	addi	r3,r3,8
	nop
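	/*
	 * For example, with r3_offset = 16 the exception table entry
	 * points 16 bytes before .Lld_exc, so the last two addi/nop
	 * pairs above execute and add 16 to r3 before falling through
	 * to the handler proper.
	 */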

/*
 * Here we have had a fault on a load and r3 points to the first
 * unmodified byte of the destination.  We use the original arguments
 * and r3 to work out how much wasn't copied.  Since we load some
 * distance ahead of the stores, we continue copying byte-by-byte until
 * we hit the load fault again in order to copy as much as possible.
 */
.Lld_exc:
	ld	r6,-24(r1)
	ld	r4,-16(r1)
	ld	r5,-8(r1)
	subf	r6,r6,r3
	add	r4,r4,r6
	subf	r5,r6,r5	/* #bytes left to go */

/*
 * first see if we can copy any more bytes before hitting another exception
 */
	mtctr	r5
r3_offset = 0
100:	EX_TABLE(100b, .Ldone)
43:	lbz	r0,0(r4)
	addi	r4,r4,1
stex;	stb	r0,0(r3)
	addi	r3,r3,1
	bdnz	43b
	li	r3,0		/* huh? all copied successfully this time? */
	blr

/*
 * here we have trapped again, amount remaining is in ctr.
 */
.Ldone:
	mfctr	r3
	blr

/*
 * exception handlers for stores: we need to work out how many bytes
 * weren't copied, and we may need to copy some more.
 * Note that the number of bytes of instructions for adjusting r3 needs
 * to equal the amount of the adjustment, due to the trick of using
 * .Lst_exc - r3_offset as the handler address.
 */
.Lst_exc_r7:
	add	r3,r3,r7
	b	.Lst_exc

	/* adjust by 24 */
	addi	r3,r3,8
	nop
	/* adjust by 16 */
	addi	r3,r3,8
	nop
	/* adjust by 8 */
	addi	r3,r3,4
	/* adjust by 4 */
	addi	r3,r3,4
.Lst_exc:
	ld	r6,-24(r1)	/* original destination pointer */
	ld	r4,-16(r1)	/* original source pointer */
	ld	r5,-8(r1)	/* original number of bytes */
	add	r7,r6,r5
	/*
	 * If the destination pointer isn't 8-byte aligned,
	 * we may have got the exception as a result of a
	 * store that overlapped a page boundary, so we may be
	 * able to copy a few more bytes.
	 */
17:	andi.	r0,r3,7
	beq	19f
	subf	r8,r6,r3	/* #bytes copied */
100:	EX_TABLE(100b,19f)
	lbzx	r0,r8,r4
100:	EX_TABLE(100b,19f)
	stb	r0,0(r3)
	addi	r3,r3,1
	cmpld	r3,r7
	blt	17b
19:	subf	r3,r3,r7	/* #bytes not copied in r3 */
	blr

/*
 * Routine to copy a whole page of data, optimized for POWER4.
 * On POWER4 it is more than 50% faster than the simple loop
 * above (following the .Ldst_aligned label).
 */
	.macro exc
100:	EX_TABLE(100b, .Labort)
	.endm
.Lcopy_page_4K:
	std	r31,-32(1)
	std	r30,-40(1)
	std	r29,-48(1)
	std	r28,-56(1)
	std	r27,-64(1)
	std	r26,-72(1)
	std	r25,-80(1)
	std	r24,-88(1)
	std	r23,-96(1)
	std	r22,-104(1)
	std	r21,-112(1)
	std	r20,-120(1)
	li	r5,4096/32 - 1
	addi	r3,r3,-8
	li	r0,5
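	/*
	 * The unrolled loop below keeps six streams of loads and stores
	 * in flight, 128 bytes apart, so that several cache lines are
	 * being fetched at once; r5 tracks the remaining 32-byte chunks
	 * and r0/ctr paces the inner loop.
	 */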
0:	addi	r5,r5,-24
	mtctr	r0
exc;	ld	r22,640(4)
exc;	ld	r21,512(4)
exc;	ld	r20,384(4)
exc;	ld	r11,256(4)
exc;	ld	r9,128(4)
exc;	ld	r7,0(4)
exc;	ld	r25,648(4)
exc;	ld	r24,520(4)
exc;	ld	r23,392(4)
exc;	ld	r10,264(4)
exc;	ld	r8,136(4)
exc;	ldu	r6,8(4)
	cmpwi	r5,24
1:
exc;	std	r22,648(3)
exc;	std	r21,520(3)
exc;	std	r20,392(3)
exc;	std	r11,264(3)
exc;	std	r9,136(3)
exc;	std	r7,8(3)
exc;	ld	r28,648(4)
exc;	ld	r27,520(4)
exc;	ld	r26,392(4)
exc;	ld	r31,264(4)
exc;	ld	r30,136(4)
exc;	ld	r29,8(4)
exc;	std	r25,656(3)
exc;	std	r24,528(3)
exc;	std	r23,400(3)
exc;	std	r10,272(3)
exc;	std	r8,144(3)
exc;	std	r6,16(3)
exc;	ld	r22,656(4)
exc;	ld	r21,528(4)
exc;	ld	r20,400(4)
exc;	ld	r11,272(4)
exc;	ld	r9,144(4)
exc;	ld	r7,16(4)
exc;	std	r28,664(3)
exc;	std	r27,536(3)
exc;	std	r26,408(3)
exc;	std	r31,280(3)
exc;	std	r30,152(3)
exc;	stdu	r29,24(3)
exc;	ld	r25,664(4)
exc;	ld	r24,536(4)
exc;	ld	r23,408(4)
exc;	ld	r10,280(4)
exc;	ld	r8,152(4)
exc;	ldu	r6,24(4)
	bdnz	1b
exc;	std	r22,648(3)
exc;	std	r21,520(3)
exc;	std	r20,392(3)
exc;	std	r11,264(3)
exc;	std	r9,136(3)
exc;	std	r7,8(3)
	addi	r4,r4,640
	addi	r3,r3,648
	bge	0b
	mtctr	r5
exc;	ld	r7,0(4)
exc;	ld	r8,8(4)
exc;	ldu	r9,16(4)
3:
exc;	ld	r10,8(4)
exc;	std	r7,8(3)
exc;	ld	r7,16(4)
exc;	std	r8,16(3)
exc;	ld	r8,24(4)
exc;	std	r9,24(3)
exc;	ldu	r9,32(4)
exc;	stdu	r10,32(3)
	bdnz	3b
4:
exc;	ld	r10,8(4)
exc;	std	r7,8(3)
exc;	std	r8,16(3)
exc;	std	r9,24(3)
exc;	std	r10,32(3)
9:	ld	r20,-120(1)
	ld	r21,-112(1)
	ld	r22,-104(1)
	ld	r23,-96(1)
	ld	r24,-88(1)
	ld	r25,-80(1)
	ld	r26,-72(1)
	ld	r27,-64(1)
	ld	r28,-56(1)
	ld	r29,-48(1)
	ld	r30,-40(1)
	ld	r31,-32(1)
	li	r3,0
	blr

/*
 * on an exception, reset to the beginning and jump back into the
 * standard __copy_tofrom_user
 */
.Labort:
	ld	r20,-120(1)
	ld	r21,-112(1)
	ld	r22,-104(1)
	ld	r23,-96(1)
	ld	r24,-88(1)
	ld	r25,-80(1)
	ld	r26,-72(1)
	ld	r27,-64(1)
	ld	r28,-56(1)
	ld	r29,-48(1)
	ld	r30,-40(1)
	ld	r31,-32(1)
	ld	r3,-24(r1)
	ld	r4,-16(r1)
	li	r5,4096
	b	.Ldst_aligned
EXPORT_SYMBOL(__copy_tofrom_user)