/*
 * Memory copy functions for 32-bit PowerPC.
 *
 * Copyright (C) 1996-2005 Paul Mackerras.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>
#include <asm/code-patching-asm.h>

#define COPY_16_BYTES		\
	lwz	r7,4(r4);	\
	lwz	r8,8(r4);	\
	lwz	r9,12(r4);	\
	lwzu	r10,16(r4);	\
	stw	r7,4(r6);	\
	stw	r8,8(r6);	\
	stw	r9,12(r6);	\
	stwu	r10,16(r6)
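
/*
 * For reference, a rough C equivalent of one COPY_16_BYTES expansion
 * (an illustrative sketch only; src and dst stand for r4 and r6, which
 * both point one word below the next data to move):
 *
 *	unsigned int a = src[1], b = src[2], c = src[3], d = src[4];
 *	dst[1] = a; dst[2] = b; dst[3] = c; dst[4] = d;
 *	src += 4; dst += 4;
 *
 * The loads are grouped ahead of the stores so they can overlap in the
 * pipeline instead of serialising into load/store pairs.
 */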

#define COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:			\
	lwz	r7,4(r4);	\
8 ## n ## 1:			\
	lwz	r8,8(r4);	\
8 ## n ## 2:			\
	lwz	r9,12(r4);	\
8 ## n ## 3:			\
	lwzu	r10,16(r4);	\
8 ## n ## 4:			\
	stw	r7,4(r6);	\
8 ## n ## 5:			\
	stw	r8,8(r6);	\
8 ## n ## 6:			\
	stw	r9,12(r6);	\
8 ## n ## 7:			\
	stwu	r10,16(r6)

#define COPY_16_BYTES_EXCODE(n)	\
9 ## n ## 0:			\
	addi	r5,r5,-(16 * n);	\
	b	104f;		\
9 ## n ## 1:			\
	addi	r5,r5,-(16 * n);	\
	b	105f;		\
	EX_TABLE(8 ## n ## 0b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 1b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 2b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 3b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 4b,9 ## n ## 1b);	\
	EX_TABLE(8 ## n ## 5b,9 ## n ## 1b);	\
	EX_TABLE(8 ## n ## 6b,9 ## n ## 1b);	\
	EX_TABLE(8 ## n ## 7b,9 ## n ## 1b)
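
/*
 * How the two macros pair up (a sketch of the label scheme): every load
 * and store in COPY_16_BYTES_WITHEX(n) gets a label 8n0..8n7, and
 * COPY_16_BYTES_EXCODE(n) provides the matching fixups 9n0 (for a fault
 * in one of the loads 8n0..8n3) and 9n1 (for a fault in one of the
 * stores 8n4..8n7). Each fixup subtracts from r5 the 16*n bytes already
 * copied in the current cache line, so the final not-copied count stays
 * exact, then branches to the common read-fault (104f) or write-fault
 * (105f) handler.
 */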

	.text
	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
	.stabs	"copy_32.S",N_SO,0,0,0f
0:

CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)

_GLOBAL(memset16)
	rlwinm.	r0,r5,31,1,31	/* r0 = number of halfword pairs (r5 >> 1) */
	addi	r6,r3,-4
	beq-	2f
	rlwimi	r4,r4,16,0,15	/* duplicate the halfword into both halves of r4 */
	mtctr	r0
1:	stwu	r4,4(r6)
	bdnz	1b
2:	andi.	r0,r5,1		/* odd trailing halfword? */
	beqlr
	sth	r4,4(r6)
	blr
EXPORT_SYMBOL(memset16)

/*
 * Use dcbz on the complete cache lines in the destination
 * to set them to zero. This requires that the destination
 * area is cacheable. -- paulus
 *
 * During early init, the cache might not be active yet, so dcbz cannot be
 * used. We therefore skip the optimised block that uses dcbz. This jump is
 * replaced by a nop once the cache is active. This is done in machine_init()
 */
_GLOBAL(memset)
	cmplwi	0,r5,4
	blt	7f

	rlwimi	r4,r4,8,16,23	/* replicate the fill byte into bits 16-23 */
	rlwimi	r4,r4,16,0,15	/* ... and then into the top halfword */

	stw	r4,0(r3)
	beqlr
	andi.	r0,r3,3
	add	r5,r0,r5
	subf	r6,r0,r3
	cmplwi	0,r4,0
	/*
	 * Skip the optimised block until the cache is enabled. Replaced
	 * by 'bne' during boot so the normal path is used when r4 is not zero
	 */
5:	b	2f
	patch_site	5b, patch__memset_nocache

	clrlwi	r7,r6,32-LG_CACHELINE_BYTES
	add	r8,r7,r5
	srwi	r9,r8,LG_CACHELINE_BYTES
	addic.	r9,r9,-1	/* total number of complete cachelines */
	ble	2f
	xori	r0,r7,CACHELINE_MASK & ~3
	srwi.	r0,r0,2
	beq	3f
	mtctr	r0
4:	stwu	r4,4(r6)
	bdnz	4b
3:	mtctr	r9
	li	r7,4
10:	dcbz	r7,r6
	addi	r6,r6,CACHELINE_BYTES
	bdnz	10b
	clrlwi	r5,r8,32-LG_CACHELINE_BYTES
	addi	r5,r5,4

2:	srwi	r0,r5,2
	mtctr	r0
	bdz	6f
1:	stwu	r4,4(r6)
	bdnz	1b
6:	andi.	r5,r5,3
	beqlr
	mtctr	r5
	addi	r6,r6,3
8:	stbu	r4,1(r6)
	bdnz	8b
	blr

7:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r6,r3,-1
9:	stbu	r4,1(r6)
	bdnz	9b
	blr
EXPORT_SYMBOL(memset)
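
/*
 * A minimal C sketch of the flow above (illustrative only; it ignores
 * the dcbz fast path and the patch_site machinery, and memset_sketch
 * is a made-up name, not a kernel interface):
 *
 *	void *memset_sketch(void *s, int c, unsigned long n)
 *	{
 *		unsigned char *p = s;
 *		unsigned int w = (unsigned char)c;
 *
 *		w |= w << 8;	// replicate the byte, as the rlwimi pair does
 *		w |= w << 16;
 *		while (((unsigned long)p & 3) && n) {
 *			*p++ = c; n--;		// align to a word boundary
 *		}
 *		for (; n >= 4; n -= 4, p += 4)
 *			*(unsigned int *)p = w;	// the stwu word loop
 *		while (n--)
 *			*p++ = c;		// trailing bytes
 *		return s;
 *	}
 */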

/*
 * This version uses dcbz on the complete cache lines in the
 * destination area to reduce memory traffic. This requires that
 * the destination area is cacheable.
 * We only use this version if the source and dest don't overlap.
 * -- paulus.
 *
 * During early init, the cache might not be active yet, so dcbz cannot be
 * used. We therefore jump to generic_memcpy, which doesn't use dcbz. This
 * jump is replaced by a nop once the cache is active. This is done in
 * machine_init()
 */
_GLOBAL(memmove)
	cmplw	0,r3,r4
	bgt	backwards_memcpy
	/* fall through */

_GLOBAL(memcpy)
1:	b	generic_memcpy
	patch_site	1b, patch__memcpy_nocache

	add	r7,r3,r5	/* test if the src & dst overlap */
	add	r8,r4,r5
	cmplw	0,r4,r7
	cmplw	1,r3,r8
	crand	0,0,4		/* cr0.lt &= cr1.lt */
	blt	generic_memcpy	/* if regions overlap */
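
/*
 * The overlap test above, in C terms (illustrative):
 *
 *	if (src < dst + n && dst < src + n)
 *		goto generic_memcpy;	// regions overlap
 *
 * cr0 holds "src < dst + n" and cr1 holds "dst < src + n"; crand folds
 * the two LT bits so a single blt catches the overlapping case.
 */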

	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	subf	r5,r0,r5
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
	addi	r4,r4,1
	addi	r6,r6,1
	stb	r9,3(r6)
	bdnz	70b
61:	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
	stwu	r9,4(r6)
	bdnz	72b

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	mtctr	r0
	beq	63f
53:
	dcbz	r11,r6
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
#endif
#endif
#endif
	bdnz	53b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
	addi	r4,r4,3
	addi	r6,r6,3
40:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	40b
65:	blr
EXPORT_SYMBOL(memcpy)
EXPORT_SYMBOL(memmove)
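
/*
 * Fallback copy used when dcbz cannot be used (cache disabled, or the
 * regions overlap). Roughly, in C (a sketch under those assumptions;
 * the real code below also word-aligns the destination first when at
 * least 8 bytes remain):
 *
 *	unsigned char *d = dst;
 *	const unsigned char *s = src;
 *	while (n >= 8) {		// the unrolled word-pair loop
 *		((unsigned int *)d)[0] = ((const unsigned int *)s)[0];
 *		((unsigned int *)d)[1] = ((const unsigned int *)s)[1];
 *		d += 8; s += 8; n -= 8;
 *	}
 *	if (n >= 4) {
 *		*(unsigned int *)d = *(const unsigned int *)s;
 *		d += 4; s += 4; n -= 4;
 *	}
 *	while (n--)			// trailing bytes
 *		*d++ = *s++;
 */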

generic_memcpy:
	srwi.	r7,r5,3
	addi	r6,r3,-4
	addi	r4,r4,-4
	beq	2f		/* if less than 8 bytes to do */
	andi.	r0,r6,3		/* get dest word aligned */
	mtctr	r7
	bne	5f
1:	lwz	r7,4(r4)
	lwzu	r8,8(r4)
	stw	r7,4(r6)
	stwu	r8,8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,4(r4)
	addi	r5,r5,-4
	stwu	r0,4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r4,r4,3
	addi	r6,r6,3
4:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	4b
	blr
5:	subfic	r0,r0,4
	mtctr	r0
6:	lbz	r7,4(r4)
	addi	r4,r4,1
	stb	r7,4(r6)
	addi	r6,r6,1
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b

_GLOBAL(backwards_memcpy)
	rlwinm.	r7,r5,32-3,3,31	/* r7 = r5 >> 3 */
	add	r6,r3,r5
	add	r4,r4,r5
	beq	2f
	andi.	r0,r6,3
	mtctr	r7
	bne	5f
1:	lwz	r7,-4(r4)
	lwzu	r8,-8(r4)
	stw	r7,-4(r6)
	stwu	r8,-8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,-4(r4)
	subi	r5,r5,4
	stwu	r0,-4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
4:	lbzu	r0,-1(r4)
	stbu	r0,-1(r6)
	bdnz	4b
	blr
5:	mtctr	r0
6:	lbzu	r7,-1(r4)
	stbu	r7,-1(r6)
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b

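/*
 * Copy between kernel and user space (the direction depends on which of
 * src/dst is the user pointer). Returns the number of bytes NOT copied,
 * so 0 means success; faults in the labelled loads and stores below are
 * recovered through the EX_TABLE entries and the fixup code at the end.
 */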
_GLOBAL(__copy_tofrom_user)
	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
71:	stb	r9,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	70b
61:	subf	r5,r0,r5
	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
73:	stwu	r9,4(r6)
	bdnz	72b

	EX_TABLE(70b,100f)
	EX_TABLE(71b,101f)
	EX_TABLE(72b,102f)
	EX_TABLE(73b,103f)

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	beq	63f

	/* Here we decide how far ahead to prefetch the source */
	li	r3,4
	cmpwi	r0,1
	li	r7,0
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead. For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */
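
/*
 * At this point r7 is the prefetch depth in cache lines and r3 is the
 * byte offset (from r4) of the first line not yet touched by dcbt, so
 * the loop below keeps prefetching that far ahead of the reads.
 */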

114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8

53:	dcbt	r3,r4
54:	dcbz	r11,r6
	EX_TABLE(54b,105f)
/* the main body of the cacheline loop */
	COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_WITHEX(2)
	COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_WITHEX(4)
	COPY_16_BYTES_WITHEX(5)
	COPY_16_BYTES_WITHEX(6)
	COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0
	li	r3,4
	li	r7,0
	bne	114b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
40:	lbz	r0,4(r4)
41:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	40b
65:	li	r3,0
	blr

/* read fault, initial single-byte copy */
100:	li	r9,0
	b	90f
/* write fault, initial single-byte copy */
101:	li	r9,1
90:	subf	r5,r8,r5
	li	r3,0
	b	99f
/* read fault, initial word copy */
102:	li	r9,0
	b	91f
/* write fault, initial word copy */
103:	li	r9,1
91:	li	r3,2
	b	99f

/*
 * this stuff handles faults in the cacheline loop and branches to either
 * 104f (if in read part) or 105f (if in write part), after updating r5
 */
	COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_EXCODE(2)
	COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_EXCODE(4)
	COPY_16_BYTES_EXCODE(5)
	COPY_16_BYTES_EXCODE(6)
	COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

/* read fault in cacheline loop */
104:	li	r9,0
	b	92f
/* fault on dcbz (effectively a write fault) */
/* or write fault in cacheline loop */
105:	li	r9,1
92:	li	r3,LG_CACHELINE_BYTES
	mfctr	r8
	add	r0,r0,r8
	b	106f
/* read fault in final word loop */
108:	li	r9,0
	b	93f
/* write fault in final word loop */
109:	li	r9,1
93:	andi.	r5,r5,3
	li	r3,2
	b	99f
/* read fault in final byte loop */
110:	li	r9,0
	b	94f
/* write fault in final byte loop */
111:	li	r9,1
94:	li	r5,0
	li	r3,0
/*
 * At this stage the number of bytes not copied is
 * r5 + (ctr << r3), and r9 is 0 for read or 1 for write.
 */
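/*
 * (r3 is log2 of the unit size of the loop that faulted: 0 for the byte
 * loops, 2 for the word loops, LG_CACHELINE_BYTES for the cacheline
 * loop, so "ctr << r3" converts the loop count back into bytes.)
 */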
99:	mfctr	r0
106:	slw	r3,r0,r3
	add.	r3,r3,r5
	beq	120f		/* shouldn't happen */
	cmpwi	0,r9,0
	bne	120f
/* for a read fault, first try to continue the copy one byte at a time */
	mtctr	r3
130:	lbz	r0,4(r4)
131:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	130b
/* then return the number of bytes still not copied (0 if the retry completed) */
132:	mfctr	r3
120:	blr

	EX_TABLE(30b,108b)
	EX_TABLE(31b,109b)
	EX_TABLE(40b,110b)
	EX_TABLE(41b,111b)
	EX_TABLE(130b,132b)
	EX_TABLE(131b,120b)

EXPORT_SYMBOL(__copy_tofrom_user)