| 1 | /* |
| 2 | * M7memcpy: Optimized SPARC M7 memcpy |
| 3 | * |
| 4 | * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved. |
| 5 | */ |
| 6 | |
| 7 | .file "M7memcpy.S" |
| 8 | |
| 9 | /* |
| 10 | * memcpy(s1, s2, len) |
| 11 | * |
| 12 | * Copy s2 to s1, always copy n bytes. |
| 13 | * Note: this C code does not work for overlapped copies. |
| 14 | * |
| 15 | * Fast assembler language version of the following C-program for memcpy |
| 16 | * which represents the `standard' for the C-library. |
| 17 | * |
| 18 | * void * |
| 19 | * memcpy(void *s, const void *s0, size_t n) |
| 20 | * { |
| 21 | * if (n != 0) { |
| 22 | * char *s1 = s; |
| 23 | * const char *s2 = s0; |
| 24 | * do { |
| 25 | * *s1++ = *s2++; |
| 26 | * } while (--n != 0); |
| 27 | * } |
| 28 | * return (s); |
| 29 | * } |
| 30 | * |
| 31 | * |
| 32 | * SPARC T7/M7 Flow : |
| 33 | * |
| 34 | * if (count < SMALL_MAX) { |
| 35 | * if count < SHORTCOPY (SHORTCOPY=3) |
| 36 | * copy bytes; exit with dst addr |
| 37 | * if src & dst aligned on word boundary but not long word boundary, |
| 38 | * copy with ldw/stw; branch to finish_up |
| 39 | * if src & dst aligned on long word boundary |
| 40 | * copy with ldx/stx; branch to finish_up |
| 41 | * if src & dst not aligned and length <= SHORTCHECK (SHORTCHECK=14) |
| 42 | * copy bytes; exit with dst addr |
| 43 | * move enough bytes to get src to word boundary |
| 44 | * if dst now on word boundary |
| 45 | * move_words: |
| 46 | * copy words; branch to finish_up |
| 47 | * if dst now on half word boundary |
| 48 | * load words, shift half words, store words; branch to finish_up |
| 49 | * if dst on byte 1 |
| 50 | * load words, shift 3 bytes, store words; branch to finish_up |
| 51 | * if dst on byte 3 |
| 52 | * load words, shift 1 byte, store words; branch to finish_up |
| 53 | * finish_up: |
| 54 | * copy bytes; exit with dst addr |
| 55 | * } else { More than SMALL_MAX bytes |
| 56 | * move bytes until dst is on long word boundary |
| 57 | * if( src is on long word boundary ) { |
| 58 | * if (count < MED_MAX) { |
| 59 | * finish_long: src/dst aligned on 8 bytes |
| 60 | * copy with ldx/stx in 8-way unrolled loop; |
| 61 | * copy final 0-63 bytes; exit with dst addr |
| 62 | * } else { src/dst aligned; count > MED_MAX |
| 63 | * align dst on 64 byte boundary; for main data movement: |
| 64 | * prefetch src data to L2 cache; let HW prefetch move data to L1 cache |
| 65 | * Use BIS (block initializing store) to avoid fetching destination cache |
| 66 | * lines from memory. But pre-store the first element of each cache line, |
| 67 | * ST_CHUNK lines in advance of the rest of that cache line. That |
| 68 | * gives time for replacement cache lines to be written back without |
| 69 | * excess STQ and Miss Buffer filling. Repeat until near the end, |
| 70 | * then finish up storing before going to finish_long. |
| 71 | * } |
| 72 | * } else { src/dst not aligned on 8 bytes |
| 73 | * if src is word aligned and count < MED_WMAX |
| 74 | * move words in 8-way unrolled loop |
| 75 | * move final 0-31 bytes; exit with dst addr |
| 76 | * if count < MED_UMAX |
| 77 | * use alignaddr/faligndata combined with ldd/std in 8-way |
| 78 | * unrolled loop to move data. |
| 79 | * go to unalign_done |
| 80 | * else |
| 81 | * setup alignaddr for faligndata instructions |
| 82 | * align dst on 64 byte boundary; prefetch src data to L1 cache |
| 83 | * loadx8, falign, block-store, prefetch loop |
| 84 | * (only use block-init-store when src/dst on 8 byte boundaries.) |
| 85 | * unalign_done: |
| 86 | * move remaining bytes for unaligned cases. exit with dst addr. |
| 87 | * } |
| 88 | * |
| 89 | */ |
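/*
 * A rough C sketch of the size dispatch done at the entry point below
 * (illustration only; the copy_* helpers are hypothetical names standing
 * in for the .Ltiny_cp, .Lsmall_cp, .Lmedium_cp and .Lmedium labels, not
 * real functions in this file):
 *
 *	void *m7memcpy(void *dst, const void *src, size_t n)
 *	{
 *		if (n == 0)
 *			return dst;
 *		if (n <= 3)
 *			return copy_tiny(dst, src, n);		// .Ltiny_cp
 *		if (n <= 19)
 *			return copy_small(dst, src, n);		// .Lsmall_cp
 *		if (n < SMALL_MAX)
 *			return copy_medium(dst, src, n);	// .Lmedium_cp
 *		return copy_large(dst, src, n);			// .Lmedium
 *	}
 */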
| 90 | |
| 91 | #include <asm/visasm.h> |
| 92 | #include <asm/asi.h> |
| 93 | |
| 94 | #if !defined(EX_LD) && !defined(EX_ST) |
| 95 | #define NON_USER_COPY |
| 96 | #endif |
| 97 | |
| 98 | #ifndef EX_LD |
| 99 | #define EX_LD(x,y) x |
| 100 | #endif |
| 101 | #ifndef EX_LD_FP |
| 102 | #define EX_LD_FP(x,y) x |
| 103 | #endif |
| 104 | |
| 105 | #ifndef EX_ST |
| 106 | #define EX_ST(x,y) x |
| 107 | #endif |
| 108 | #ifndef EX_ST_FP |
| 109 | #define EX_ST_FP(x,y) x |
| 110 | #endif |
| 111 | |
| 112 | #ifndef EX_RETVAL |
| 113 | #define EX_RETVAL(x) x |
| 114 | #endif |
| 115 | |
| 116 | #ifndef LOAD |
| 117 | #define LOAD(type,addr,dest) type [addr], dest |
| 118 | #endif |
| 119 | |
| 120 | #ifndef STORE |
| 121 | #define STORE(type,src,addr) type src, [addr] |
| 122 | #endif |
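/*
 * With the default definitions above (the plain in-kernel memcpy build),
 * the wrappers expand to ordinary instructions and the fault label is
 * simply dropped, e.g.:
 *
 *	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)	->  ldx [%o1], %o4
 *	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2)	->  stx %o4, [%o0]
 *
 * The user-copy variants that include this file are expected to redefine
 * EX_LD/EX_ST so that each access also gets an exception-table entry
 * pointing at the named fixup (e.g. memcpy_retl_o2), which computes the
 * number of bytes left to copy.
 */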
| 123 | |
| 124 | /* |
| 125 | * ASI_BLK_INIT_QUAD_LDD_P/ASI_BLK_INIT_QUAD_LDD_S mark the cache |
| 126 | * line as "least recently used", which means that, if many threads are |
| 127 | * active, it has a high probability of being pushed out of the cache |
| 128 | * between the first initializing store and the final stores. |
| 129 | * Thus, we use ASI_ST_BLKINIT_MRU_P/ASI_ST_BLKINIT_MRU_S, which |
| 130 | * mark the cache line as "most recently used", for all |
| 131 | * but the last cache line. |
| 132 | */ |
| 133 | #ifndef STORE_ASI |
| 134 | #ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA |
| 135 | #define STORE_ASI ASI_BLK_INIT_QUAD_LDD_P |
| 136 | #else |
| 137 | #define STORE_ASI 0x80 /* ASI_P */ |
| 138 | #endif |
| 139 | #endif |
| 140 | |
| 141 | #ifndef STORE_MRU_ASI |
| 142 | #ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA |
| 143 | #define STORE_MRU_ASI ASI_ST_BLKINIT_MRU_P |
| 144 | #else |
| 145 | #define STORE_MRU_ASI 0x80 /* ASI_P */ |
| 146 | #endif |
| 147 | #endif |
| 148 | |
| 149 | #ifndef STORE_INIT |
| 150 | #define STORE_INIT(src,addr) stxa src, [addr] STORE_ASI |
| 151 | #endif |
| 152 | |
| 153 | #ifndef STORE_INIT_MRU |
| 154 | #define STORE_INIT_MRU(src,addr) stxa src, [addr] STORE_MRU_ASI |
| 155 | #endif |
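/*
 * Per-cache-line store pattern used by the large aligned-copy loop below
 * (sketch only): the first seven 8-byte stores to each 64-byte line use
 * STORE_INIT_MRU so the line stays resident while it is being filled, and
 * the eighth, final store uses STORE_INIT so the completed line is marked
 * LRU and may be evicted early:
 *
 *	for (i = 0; i < 7; i++)
 *		STORE_INIT_MRU(data[i], line + 8 * i);
 *	STORE_INIT(data[7], line + 56);
 */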
| 156 | |
| 157 | #ifndef FUNC_NAME |
| 158 | #define FUNC_NAME M7memcpy |
| 159 | #endif |
| 160 | |
| 161 | #ifndef PREAMBLE |
| 162 | #define PREAMBLE |
| 163 | #endif |
| 164 | |
| 165 | #define BLOCK_SIZE 64 |
| 166 | #define SHORTCOPY 3 |
| 167 | #define SHORTCHECK 14 |
| 168 | #define SHORT_LONG 64 /* max copy for short longword-aligned case */ |
| 169 | /* must be at least 64 */ |
| 170 | #define SMALL_MAX 128 |
| 171 | #define MED_UMAX 1024 /* max copy for medium un-aligned case */ |
| 172 | #define MED_WMAX 1024 /* max copy for medium word-aligned case */ |
| 173 | #define MED_MAX 1024 /* max copy for medium longword-aligned case */ |
| 174 | #define ST_CHUNK 24 /* ST_CHUNK - block of values for BIS Store */ |
| 175 | #define ALIGN_PRE 24 /* distance for aligned prefetch loop */ |
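/*
 * For example, with the thresholds above: a 100-byte copy (< SMALL_MAX)
 * is handled by the integer .Lmedium_cp path; a 1000-byte copy with both
 * pointers 8-byte aligned (<= MED_MAX) uses the unrolled .Lmedl64 loop;
 * and a larger aligned copy (> MED_MAX) takes .Llarge_align8_copy, which
 * prefetches and uses block-initializing stores.
 */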
| 176 | |
| 177 | .register %g2,#scratch |
| 178 | |
| 179 | .section ".text" |
| 180 | .global FUNC_NAME |
| 181 | .type FUNC_NAME, #function |
| 182 | .align 16 |
| 183 | FUNC_NAME: |
| 184 | srlx %o2, 31, %g2 |
| 185 | cmp %g2, 0 |
| 186 | tne %xcc, 5 |
| 187 | PREAMBLE |
| 188 | mov %o0, %g1 ! save %o0 |
| 189 | brz,pn %o2, .Lsmallx |
| 190 | cmp %o2, 3 |
| 191 | ble,pn %icc, .Ltiny_cp |
| 192 | cmp %o2, 19 |
| 193 | ble,pn %icc, .Lsmall_cp |
| 194 | or %o0, %o1, %g2 |
| 195 | cmp %o2, SMALL_MAX |
| 196 | bl,pn %icc, .Lmedium_cp |
| 197 | nop |
| 198 | |
| 199 | .Lmedium: |
| 200 | neg %o0, %o5 |
| 201 | andcc %o5, 7, %o5 ! bytes till DST 8 byte aligned |
| 202 | brz,pt %o5, .Ldst_aligned_on_8 |
| 203 | |
| 204 | ! %o5 has the bytes to be written in partial store. |
| 205 | sub %o2, %o5, %o2 |
| 206 | sub %o1, %o0, %o1 ! %o1 gets the difference |
| 207 | 7: ! dst aligning loop |
| 208 | add %o1, %o0, %o4 |
| 209 | EX_LD(LOAD(ldub, %o4, %o4), memcpy_retl_o2_plus_o5) ! load one byte |
| 210 | subcc %o5, 1, %o5 |
| 211 | EX_ST(STORE(stb, %o4, %o0), memcpy_retl_o2_plus_o5_plus_1) |
| 212 | bgu,pt %xcc, 7b |
| 213 | add %o0, 1, %o0 ! advance dst |
| 214 | add %o1, %o0, %o1 ! restore %o1 |
| 215 | .Ldst_aligned_on_8: |
| 216 | andcc %o1, 7, %o5 |
| 217 | brnz,pt %o5, .Lsrc_dst_unaligned_on_8 |
| 218 | nop |
| 219 | |
| 220 | .Lsrc_dst_aligned_on_8: |
| 221 | ! check if we are copying MED_MAX or more bytes |
| 222 | set MED_MAX, %o3 |
| 223 | cmp %o2, %o3 ! limit to store buffer size |
| 224 | bgu,pn %xcc, .Llarge_align8_copy |
| 225 | nop |
| 226 | |
| 227 | /* |
| 228 | * Special case for handling when src and dest are both long word aligned |
| 229 | * and total data to move is less than MED_MAX bytes |
| 230 | */ |
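/*
 * Equivalent C sketch of the loop below (illustration only): the count is
 * biased by -63 so a simple test decides whether another full 64-byte
 * iteration is safe, and the bias is added back afterwards to recover the
 * true remainder.
 *
 *	u64 *d = dst, *s = src;
 *	long cnt = n - 63;
 *	while (cnt > 0) {			// one 64-byte block per pass
 *		d[0] = s[0]; d[1] = s[1]; d[2] = s[2]; d[3] = s[3];
 *		d[4] = s[4]; d[5] = s[5]; d[6] = s[6]; d[7] = s[7];
 *		d += 8; s += 8;
 *		cnt -= 64;
 *	}
 *	n = cnt + 63;				// 0..63 residual bytes remain
 */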
| 231 | .Lmedlong: |
| 232 | subcc %o2, 63, %o2 ! adjust length to allow cc test |
| 233 | ble,pn %xcc, .Lmedl63 ! skip big loop if less than 64 bytes |
| 234 | nop |
| 235 | .Lmedl64: |
| 236 | EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_63) ! load |
| 237 | subcc %o2, 64, %o2 ! decrement length count |
| 238 | EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_63_64) ! and store |
| 239 | EX_LD(LOAD(ldx, %o1+8, %o3), memcpy_retl_o2_plus_63_56) ! a block of 64 |
| 240 | EX_ST(STORE(stx, %o3, %o0+8), memcpy_retl_o2_plus_63_56) |
| 241 | EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_63_48) |
| 242 | EX_ST(STORE(stx, %o4, %o0+16), memcpy_retl_o2_plus_63_48) |
| 243 | EX_LD(LOAD(ldx, %o1+24, %o3), memcpy_retl_o2_plus_63_40) |
| 244 | EX_ST(STORE(stx, %o3, %o0+24), memcpy_retl_o2_plus_63_40) |
| 245 | EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_63_32)! load and store |
| 246 | EX_ST(STORE(stx, %o4, %o0+32), memcpy_retl_o2_plus_63_32) |
| 247 | EX_LD(LOAD(ldx, %o1+40, %o3), memcpy_retl_o2_plus_63_24)! a block of 64 |
| 248 | add %o1, 64, %o1 ! increase src ptr by 64 |
| 249 | EX_ST(STORE(stx, %o3, %o0+40), memcpy_retl_o2_plus_63_24) |
| 250 | EX_LD(LOAD(ldx, %o1-16, %o4), memcpy_retl_o2_plus_63_16) |
| 251 | add %o0, 64, %o0 ! increase dst ptr by 64 |
| 252 | EX_ST(STORE(stx, %o4, %o0-16), memcpy_retl_o2_plus_63_16) |
| 253 | EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_63_8) |
| 254 | bgu,pt %xcc, .Lmedl64 ! repeat if at least 64 bytes left |
| 255 | EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_63_8) |
| 256 | .Lmedl63: |
| 257 | addcc %o2, 32, %o2 ! adjust remaining count |
| 258 | ble,pt %xcc, .Lmedl31 ! skip if 31 or fewer bytes left |
| 259 | nop |
| 260 | EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_31) ! load |
| 261 | sub %o2, 32, %o2 ! decrement length count |
| 262 | EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_31_32) ! and store |
| 263 | EX_LD(LOAD(ldx, %o1+8, %o3), memcpy_retl_o2_plus_31_24) ! a block of 32 |
| 264 | add %o1, 32, %o1 ! increase src ptr by 32 |
| 265 | EX_ST(STORE(stx, %o3, %o0+8), memcpy_retl_o2_plus_31_24) |
| 266 | EX_LD(LOAD(ldx, %o1-16, %o4), memcpy_retl_o2_plus_31_16) |
| 267 | add %o0, 32, %o0 ! increase dst ptr by 32 |
| 268 | EX_ST(STORE(stx, %o4, %o0-16), memcpy_retl_o2_plus_31_16) |
| 269 | EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_31_8) |
| 270 | EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_31_8) |
| 271 | .Lmedl31: |
| 272 | addcc %o2, 16, %o2 ! adjust remaining count |
| 273 | ble,pt %xcc, .Lmedl15 ! skip if 15 or fewer bytes left |
| 274 | nop ! |
| 275 | EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_15) |
| 276 | add %o1, 16, %o1 ! increase src ptr by 16 |
| 277 | EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_15) |
| 278 | sub %o2, 16, %o2 ! decrease count by 16 |
| 279 | EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_15_8) |
| 280 | add %o0, 16, %o0 ! increase dst ptr by 16 |
| 281 | EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_15_8) |
| 282 | .Lmedl15: |
| 283 | addcc %o2, 15, %o2 ! restore count |
| 284 | bz,pt %xcc, .Lsmallx ! exit if finished |
| 285 | cmp %o2, 8 |
| 286 | blt,pt %xcc, .Lmedw7 ! skip if 7 or fewer bytes left |
| 287 | tst %o2 |
| 288 | EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2) ! load 8 bytes |
| 289 | add %o1, 8, %o1 ! increase src ptr by 8 |
| 290 | add %o0, 8, %o0 ! increase dst ptr by 8 |
| 291 | subcc %o2, 8, %o2 ! decrease count by 8 |
| 292 | bnz,pn %xcc, .Lmedw7 |
| 293 | EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8) ! and store 8 |
| 294 | retl |
| 295 | mov EX_RETVAL(%g1), %o0 ! restore %o0 |
| 296 | |
| 297 | .align 16 |
| 298 | .Lsrc_dst_unaligned_on_8: |
| 299 | ! DST is 8-byte aligned, src is not |
| 300 | 2: |
| 301 | andcc %o1, 0x3, %o5 ! test word alignment |
| 302 | bnz,pt %xcc, .Lunalignsetup ! branch to skip if not word aligned |
| 303 | nop |
| 304 | |
| 305 | /* |
| 306 | * Handle all cases where src and dest are aligned on word |
| 307 | * boundaries. Use unrolled loops for better performance. |
| 308 | * This option wins over standard large data move when |
| 309 | * source and destination are in cache, for .Lmedium |
| 310 | * to short data moves. |
| 311 | */ |
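/*
 * Sketch of the merge used below: two 32-bit loads from the word-aligned
 * source are combined into one 64-bit value so the 8-byte-aligned
 * destination can be written with a single stx.  SPARC is big-endian, so
 * the earlier word becomes the upper half:
 *
 *	u32 hi = src32[0];
 *	u32 lo = src32[1];
 *	*dst64 = ((u64)hi << 32) | lo;		// one stx per word pair
 */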
| 312 | set MED_WMAX, %o3 |
| 313 | cmp %o2, %o3 ! limit to store buffer size |
| 314 | bge,pt %xcc, .Lunalignrejoin ! otherwise rejoin main loop |
| 315 | nop |
| 316 | |
| 317 | subcc %o2, 31, %o2 ! adjust length to allow cc test |
| 318 | ! for end of loop |
| 319 | ble,pt %xcc, .Lmedw31 ! skip big loop if fewer than 32 bytes left |
| 320 | .Lmedw32: |
| 321 | EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2_plus_31)! move a block of 32 |
| 322 | sllx %o4, 32, %o5 |
| 323 | EX_LD(LOAD(ld, %o1+4, %o4), memcpy_retl_o2_plus_31) |
| 324 | or %o4, %o5, %o5 |
| 325 | EX_ST(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_31) |
| 326 | subcc %o2, 32, %o2 ! decrement length count |
| 327 | EX_LD(LOAD(ld, %o1+8, %o4), memcpy_retl_o2_plus_31_24) |
| 328 | sllx %o4, 32, %o5 |
| 329 | EX_LD(LOAD(ld, %o1+12, %o4), memcpy_retl_o2_plus_31_24) |
| 330 | or %o4, %o5, %o5 |
| 331 | EX_ST(STORE(stx, %o5, %o0+8), memcpy_retl_o2_plus_31_24) |
| 332 | add %o1, 32, %o1 ! increase src ptr by 32 |
| 333 | EX_LD(LOAD(ld, %o1-16, %o4), memcpy_retl_o2_plus_31_16) |
| 334 | sllx %o4, 32, %o5 |
| 335 | EX_LD(LOAD(ld, %o1-12, %o4), memcpy_retl_o2_plus_31_16) |
| 336 | or %o4, %o5, %o5 |
| 337 | EX_ST(STORE(stx, %o5, %o0+16), memcpy_retl_o2_plus_31_16) |
| 338 | add %o0, 32, %o0 ! increase dst ptr by 32 |
| 339 | EX_LD(LOAD(ld, %o1-8, %o4), memcpy_retl_o2_plus_31_8) |
| 340 | sllx %o4, 32, %o5 |
| 341 | EX_LD(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_31_8) |
| 342 | or %o4, %o5, %o5 |
| 343 | bgu,pt %xcc, .Lmedw32 ! repeat if at least 32 bytes left |
| 344 | EX_ST(STORE(stx, %o5, %o0-8), memcpy_retl_o2_plus_31_8) |
| 345 | .Lmedw31: |
| 346 | addcc %o2, 31, %o2 ! restore count |
| 347 | |
| 348 | bz,pt %xcc, .Lsmallx ! exit if finished |
| 349 | nop |
| 350 | cmp %o2, 16 |
| 351 | blt,pt %xcc, .Lmedw15 |
| 352 | nop |
| 353 | EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)! move a block of 16 bytes |
| 354 | sllx %o4, 32, %o5 |
| 355 | subcc %o2, 16, %o2 ! decrement length count |
| 356 | EX_LD(LOAD(ld, %o1+4, %o4), memcpy_retl_o2_plus_16) |
| 357 | or %o4, %o5, %o5 |
| 358 | EX_ST(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_16) |
| 359 | add %o1, 16, %o1 ! increase src ptr by 16 |
| 360 | EX_LD(LOAD(ld, %o1-8, %o4), memcpy_retl_o2_plus_8) |
| 361 | add %o0, 16, %o0 ! increase dst ptr by 16 |
| 362 | sllx %o4, 32, %o5 |
| 363 | EX_LD(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_8) |
| 364 | or %o4, %o5, %o5 |
| 365 | EX_ST(STORE(stx, %o5, %o0-8), memcpy_retl_o2_plus_8) |
| 366 | .Lmedw15: |
| 367 | bz,pt %xcc, .Lsmallx ! exit if finished |
| 368 | cmp %o2, 8 |
| 369 | blt,pn %xcc, .Lmedw7 ! skip if 7 or fewer bytes left |
| 370 | tst %o2 |
| 371 | EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2) ! load 4 bytes |
| 372 | subcc %o2, 8, %o2 ! decrease count by 8 |
| 373 | EX_ST(STORE(stw, %o4, %o0), memcpy_retl_o2_plus_8)! and store 4 bytes |
| 374 | add %o1, 8, %o1 ! increase src ptr by 8 |
| 375 | EX_LD(LOAD(ld, %o1-4, %o3), memcpy_retl_o2_plus_4) ! load 4 bytes |
| 376 | add %o0, 8, %o0 ! increase dst ptr by 8 |
| 377 | EX_ST(STORE(stw, %o3, %o0-4), memcpy_retl_o2_plus_4)! and store 4 bytes |
| 378 | bz,pt %xcc, .Lsmallx ! exit if finished |
| 379 | .Lmedw7: ! count is ge 1, less than 8 |
| 380 | cmp %o2, 4 ! check for 4 bytes left |
| 381 | blt,pn %xcc, .Lsmallleft3 ! skip if 3 or fewer bytes left |
| 382 | nop ! |
| 383 | EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2) ! load 4 bytes |
| 384 | add %o1, 4, %o1 ! increase src ptr by 4 |
| 385 | add %o0, 4, %o0 ! increase dst ptr by 4 |
| 386 | subcc %o2, 4, %o2 ! decrease count by 4 |
| 387 | bnz .Lsmallleft3 |
| 388 | EX_ST(STORE(stw, %o4, %o0-4), memcpy_retl_o2_plus_4)! and store 4 bytes |
| 389 | retl |
| 390 | mov EX_RETVAL(%g1), %o0 |
| 391 | |
| 392 | .align 16 |
| 393 | .Llarge_align8_copy: ! Src and dst share 8 byte alignment |
| 394 | ! align dst to 64 byte boundary |
| 395 | andcc %o0, 0x3f, %o3 ! %o3 == 0 means dst is 64 byte aligned |
| 396 | brz,pn %o3, .Laligned_to_64 |
| 397 | andcc %o0, 8, %o3 ! odd long words to move? |
| 398 | brz,pt %o3, .Laligned_to_16 |
| 399 | nop |
| 400 | EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2) |
| 401 | sub %o2, 8, %o2 |
| 402 | add %o1, 8, %o1 ! increment src ptr |
| 403 | add %o0, 8, %o0 ! increment dst ptr |
| 404 | EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8) |
| 405 | .Laligned_to_16: |
| 406 | andcc %o0, 16, %o3 ! pair of long words to move? |
| 407 | brz,pt %o3, .Laligned_to_32 |
| 408 | nop |
| 409 | EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2) |
| 410 | sub %o2, 16, %o2 |
| 411 | EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_16) |
| 412 | add %o1, 16, %o1 ! increment src ptr |
| 413 | EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_8) |
| 414 | add %o0, 16, %o0 ! increment dst ptr |
| 415 | EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8) |
| 416 | .Laligned_to_32: |
| 417 | andcc %o0, 32, %o3 ! four long words to move? |
| 418 | brz,pt %o3, .Laligned_to_64 |
| 419 | nop |
| 420 | EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2) |
| 421 | sub %o2, 32, %o2 |
| 422 | EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_32) |
| 423 | EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_24) |
| 424 | EX_ST(STORE(stx, %o4, %o0+8), memcpy_retl_o2_plus_24) |
| 425 | EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_16) |
| 426 | EX_ST(STORE(stx, %o4, %o0+16), memcpy_retl_o2_plus_16) |
| 427 | add %o1, 32, %o1 ! increment src ptr |
| 428 | EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_8) |
| 429 | add %o0, 32, %o0 ! increment dst ptr |
| 430 | EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8) |
| 431 | .Laligned_to_64: |
| 432 | ! |
| 433 | ! Using block init store (BIS) instructions to avoid fetching cache |
| 434 | ! lines from memory. Use ST_CHUNK stores to the first element of each cache |
| 435 | ! line (similar to prefetching) to avoid overfilling the STQ or miss buffers. |
| 436 | ! This gives existing cache lines time to be moved out of L1/L2/L3 cache. |
| 437 | ! Initial stores use the MRU version of BIS to keep the cache line in |
| 438 | ! cache until we are ready to store the final element of the cache line. |
| 439 | ! Then store the last element using the LRU version of BIS. |
| 440 | ! |
| 441 | andn %o2, 0x3f, %o5 ! %o5 is multiple of block size |
| 442 | and %o2, 0x3f, %o2 ! residue bytes in %o2 |
| 443 | ! |
| 444 | ! We use STORE_MRU_ASI for the first seven stores to each cache line |
| 445 | ! followed by STORE_ASI (mark as LRU) for the last store. That |
| 446 | ! mixed approach reduces the probability that the cache line is removed |
| 447 | ! before we finish setting it, while minimizing the effects on |
| 448 | ! other cached values during a large memcpy |
| 449 | ! |
| 450 | ! ST_CHUNK batches up initial BIS operations for several cache lines |
| 451 | ! to allow multiple requests to proceed without overflowing |
| 452 | ! the store miss buffer. Then the matching stores for all those |
| 453 | ! BIS operations are executed. |
| 454 | ! |
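! A C-level sketch of the two-phase pattern below (illustration only;
! store_init_mru()/store_init() are hypothetical stand-ins for the
! STORE_INIT_MRU/STORE_INIT stxa wrappers defined above):
!
!	// phase 1 (.Lalign_loop_start): prime ST_CHUNK cache lines by
!	// BIS-storing only the first 8 bytes of each line
!	for (i = 0; i < ST_CHUNK; i++)
!		store_init_mru(src_block[i].word[0], &line[i].word[0]);
!
!	// phase 2 (.Lalign_loop_rest): go back and fill words 1..7 of
!	// those same lines; the last word uses the LRU (STORE_INIT) ASI
!	for (i = 0; i < ST_CHUNK; i++) {
!		for (j = 1; j < 7; j++)
!			store_init_mru(src_block[i].word[j], &line[i].word[j]);
!		store_init(src_block[i].word[7], &line[i].word[7]);
!	}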
| 455 | |
| 456 | sub %o0, 8, %o0 ! adjust %o0 for ASI alignment |
| 457 | .Lalign_loop: |
| 458 | cmp %o5, ST_CHUNK*64 |
| 459 | blu,pt %xcc, .Lalign_loop_fin |
| 460 | mov ST_CHUNK,%o3 |
| 461 | .Lalign_loop_start: |
| 462 | prefetch [%o1 + (ALIGN_PRE * BLOCK_SIZE)], 21 |
| 463 | subcc %o3, 1, %o3 |
| 464 | EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_o5) |
| 465 | add %o1, 64, %o1 |
| 466 | add %o0, 8, %o0 |
| 467 | EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5) |
| 468 | bgu %xcc,.Lalign_loop_start |
| 469 | add %o0, 56, %o0 |
| 470 | |
| 471 | mov ST_CHUNK,%o3 |
| 472 | sllx %o3, 6, %o4 ! ST_CHUNK*64 |
| 473 | sub %o1, %o4, %o1 ! reset %o1 |
| 474 | sub %o0, %o4, %o0 ! reset %o0 |
| 475 | |
| 476 | .Lalign_loop_rest: |
| 477 | EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_o5) |
| 478 | add %o0, 16, %o0 |
| 479 | EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5) |
| 480 | EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_o5) |
| 481 | add %o0, 8, %o0 |
| 482 | EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5) |
| 483 | subcc %o3, 1, %o3 |
| 484 | EX_LD(LOAD(ldx, %o1+24, %o4), memcpy_retl_o2_plus_o5) |
| 485 | add %o0, 8, %o0 |
| 486 | EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5) |
| 487 | EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_o5) |
| 488 | add %o0, 8, %o0 |
| 489 | EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5) |
| 490 | EX_LD(LOAD(ldx, %o1+40, %o4), memcpy_retl_o2_plus_o5) |
| 491 | add %o0, 8, %o0 |
| 492 | EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5) |
| 493 | EX_LD(LOAD(ldx, %o1+48, %o4), memcpy_retl_o2_plus_o5) |
| 494 | add %o1, 64, %o1 |
| 495 | add %o0, 8, %o0 |
| 496 | EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5) |
| 497 | add %o0, 8, %o0 |
| 498 | EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_o5) |
| 499 | sub %o5, 64, %o5 |
| 500 | bgu %xcc,.Lalign_loop_rest |
| 501 | ! mark cache line as LRU |
| 502 | EX_ST(STORE_INIT(%o4, %o0), memcpy_retl_o2_plus_o5_plus_64) |
| 503 | |
| 504 | cmp %o5, ST_CHUNK*64 |
| 505 | bgu,pt %xcc, .Lalign_loop_start |
| 506 | mov ST_CHUNK,%o3 |
| 507 | |
| 508 | cmp %o5, 0 |
| 509 | beq .Lalign_done |
| 510 | nop |
| 511 | .Lalign_loop_fin: |
| 512 | EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_o5) |
| 513 | EX_ST(STORE(stx, %o4, %o0+8), memcpy_retl_o2_plus_o5) |
| 514 | EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_o5) |
| 515 | EX_ST(STORE(stx, %o4, %o0+8+8), memcpy_retl_o2_plus_o5) |
| 516 | EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_o5) |
| 517 | EX_ST(STORE(stx, %o4, %o0+8+16), memcpy_retl_o2_plus_o5) |
| 518 | subcc %o5, 64, %o5 |
| 519 | EX_LD(LOAD(ldx, %o1+24, %o4), memcpy_retl_o2_plus_o5_64) |
| 520 | EX_ST(STORE(stx, %o4, %o0+8+24), memcpy_retl_o2_plus_o5_64) |
| 521 | EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_o5_64) |
| 522 | EX_ST(STORE(stx, %o4, %o0+8+32), memcpy_retl_o2_plus_o5_64) |
| 523 | EX_LD(LOAD(ldx, %o1+40, %o4), memcpy_retl_o2_plus_o5_64) |
| 524 | EX_ST(STORE(stx, %o4, %o0+8+40), memcpy_retl_o2_plus_o5_64) |
| 525 | EX_LD(LOAD(ldx, %o1+48, %o4), memcpy_retl_o2_plus_o5_64) |
| 526 | add %o1, 64, %o1 |
| 527 | EX_ST(STORE(stx, %o4, %o0+8+48), memcpy_retl_o2_plus_o5_64) |
| 528 | add %o0, 64, %o0 |
| 529 | EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_o5_64) |
| 530 | bgu %xcc,.Lalign_loop_fin |
| 531 | EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_o5_64) |
| 532 | |
| 533 | .Lalign_done: |
| 534 | add %o0, 8, %o0 ! restore %o0 from ASI alignment |
| 535 | membar #StoreStore |
| 536 | sub %o2, 63, %o2 ! adjust length to allow cc test |
| 537 | ba .Lmedl63 ! in .Lmedl63 |
| 538 | nop |
| 539 | |
| 540 | .align 16 |
| 541 | ! Dst is on 8 byte boundary; src is not; remaining count > SMALL_MAX |
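! The VIS loops below rely on alignaddr/faligndata: alignaddr %o1, %g0, %g0
! latches the low three bits of the source address in %gsr, and each
! faligndata then extracts the 8 destination bytes that straddle two
! aligned 8-byte source words.  A rough C equivalent of one step (sketch
! only; big-endian, and off is never 0 on this path because 8-byte-aligned
! sources were handled earlier):
!
!	unsigned off = (uintptr_t)src & 7;		// set up by alignaddr
!	u64 w0 = src_aligned[i], w1 = src_aligned[i + 1];
!	dst64[i] = (w0 << (8 * off)) | (w1 >> (64 - 8 * off));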
| 542 | .Lunalignsetup: |
| 543 | .Lunalignrejoin: |
| 544 | mov %g1, %o3 ! save %g1 as VISEntryHalf clobbers it |
| 545 | #ifdef NON_USER_COPY |
| 546 | VISEntryHalfFast(.Lmedium_vis_entry_fail_cp) |
| 547 | #else |
| 548 | VISEntryHalf |
| 549 | #endif |
| 550 | mov %o3, %g1 ! restore %g1 |
| 551 | |
| 552 | set MED_UMAX, %o3 |
| 553 | cmp %o2, %o3 ! check for .Lmedium unaligned limit |
| 554 | bge,pt %xcc,.Lunalign_large |
| 555 | prefetch [%o1 + (4 * BLOCK_SIZE)], 20 |
| 556 | andn %o2, 0x3f, %o5 ! %o5 is multiple of block size |
| 557 | and %o2, 0x3f, %o2 ! residue bytes in %o2 |
| 558 | cmp %o2, 8 ! Ensure we do not load beyond |
| 559 | bgt .Lunalign_adjust ! end of source buffer |
| 560 | andn %o1, 0x7, %o4 ! %o4 has long word aligned src address |
| 561 | add %o2, 64, %o2 ! adjust to leave loop |
| 562 | sub %o5, 64, %o5 ! early if necessary |
| 563 | .Lunalign_adjust: |
| 564 | alignaddr %o1, %g0, %g0 ! generate %gsr |
| 565 | add %o1, %o5, %o1 ! advance %o1 to after blocks |
| 566 | EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5) |
| 567 | .Lunalign_loop: |
| 568 | EX_LD_FP(LOAD(ldd, %o4+8, %f2), memcpy_retl_o2_plus_o5) |
| 569 | faligndata %f0, %f2, %f16 |
| 570 | EX_LD_FP(LOAD(ldd, %o4+16, %f4), memcpy_retl_o2_plus_o5) |
| 571 | subcc %o5, BLOCK_SIZE, %o5 |
| 572 | EX_ST_FP(STORE(std, %f16, %o0), memcpy_retl_o2_plus_o5_plus_64) |
| 573 | faligndata %f2, %f4, %f18 |
| 574 | EX_LD_FP(LOAD(ldd, %o4+24, %f6), memcpy_retl_o2_plus_o5_plus_56) |
| 575 | EX_ST_FP(STORE(std, %f18, %o0+8), memcpy_retl_o2_plus_o5_plus_56) |
| 576 | faligndata %f4, %f6, %f20 |
| 577 | EX_LD_FP(LOAD(ldd, %o4+32, %f8), memcpy_retl_o2_plus_o5_plus_48) |
| 578 | EX_ST_FP(STORE(std, %f20, %o0+16), memcpy_retl_o2_plus_o5_plus_48) |
| 579 | faligndata %f6, %f8, %f22 |
| 580 | EX_LD_FP(LOAD(ldd, %o4+40, %f10), memcpy_retl_o2_plus_o5_plus_40) |
| 581 | EX_ST_FP(STORE(std, %f22, %o0+24), memcpy_retl_o2_plus_o5_plus_40) |
| 582 | faligndata %f8, %f10, %f24 |
| 583 | EX_LD_FP(LOAD(ldd, %o4+48, %f12), memcpy_retl_o2_plus_o5_plus_32) |
| 584 | EX_ST_FP(STORE(std, %f24, %o0+32), memcpy_retl_o2_plus_o5_plus_32) |
| 585 | faligndata %f10, %f12, %f26 |
| 586 | EX_LD_FP(LOAD(ldd, %o4+56, %f14), memcpy_retl_o2_plus_o5_plus_24) |
| 587 | add %o4, BLOCK_SIZE, %o4 |
| 588 | EX_ST_FP(STORE(std, %f26, %o0+40), memcpy_retl_o2_plus_o5_plus_24) |
| 589 | faligndata %f12, %f14, %f28 |
| 590 | EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5_plus_16) |
| 591 | EX_ST_FP(STORE(std, %f28, %o0+48), memcpy_retl_o2_plus_o5_plus_16) |
| 592 | faligndata %f14, %f0, %f30 |
| 593 | EX_ST_FP(STORE(std, %f30, %o0+56), memcpy_retl_o2_plus_o5_plus_8) |
| 594 | add %o0, BLOCK_SIZE, %o0 |
| 595 | bgu,pt %xcc, .Lunalign_loop |
| 596 | prefetch [%o4 + (5 * BLOCK_SIZE)], 20 |
| 597 | ba .Lunalign_done |
| 598 | nop |
| 599 | |
| 600 | .Lunalign_large: |
| 601 | andcc %o0, 0x3f, %o3 ! is dst 64-byte block aligned? |
| 602 | bz %xcc, .Lunalignsrc |
| 603 | sub %o3, 64, %o3 ! %o3 will be multiple of 8 |
| 604 | neg %o3 ! bytes until dest is 64 byte aligned |
| 605 | sub %o2, %o3, %o2 ! update cnt with bytes to be moved |
| 606 | ! Move bytes according to source alignment |
| 607 | andcc %o1, 0x1, %o5 |
| 608 | bnz %xcc, .Lunalignbyte ! check for byte alignment |
| 609 | nop |
| 610 | andcc %o1, 2, %o5 ! check for half word alignment |
| 611 | bnz %xcc, .Lunalignhalf |
| 612 | nop |
| 613 | ! Src is word aligned |
| 614 | .Lunalignword: |
| 615 | EX_LD_FP(LOAD(ld, %o1, %o4), memcpy_retl_o2_plus_o3) ! load 4 bytes |
| 616 | add %o1, 8, %o1 ! increase src ptr by 8 |
| 617 | EX_ST_FP(STORE(stw, %o4, %o0), memcpy_retl_o2_plus_o3) ! and store 4 |
| 618 | subcc %o3, 8, %o3 ! decrease count by 8 |
| 619 | EX_LD_FP(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_o3_plus_4)! load 4 |
| 620 | add %o0, 8, %o0 ! increase dst ptr by 8 |
| 621 | bnz %xcc, .Lunalignword |
| 622 | EX_ST_FP(STORE(stw, %o4, %o0-4), memcpy_retl_o2_plus_o3_plus_4) |
| 623 | ba .Lunalignsrc |
| 624 | nop |
| 625 | |
| 626 | ! Src is half-word aligned |
| 627 | .Lunalignhalf: |
| 628 | EX_LD_FP(LOAD(lduh, %o1, %o4), memcpy_retl_o2_plus_o3) ! load 2 bytes |
| 629 | sllx %o4, 32, %o5 ! shift left |
| 630 | EX_LD_FP(LOAD(lduw, %o1+2, %o4), memcpy_retl_o2_plus_o3) |
| 631 | or %o4, %o5, %o5 |
| 632 | sllx %o5, 16, %o5 |
| 633 | EX_LD_FP(LOAD(lduh, %o1+6, %o4), memcpy_retl_o2_plus_o3) |
| 634 | or %o4, %o5, %o5 |
| 635 | EX_ST_FP(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_o3) |
| 636 | add %o1, 8, %o1 |
| 637 | subcc %o3, 8, %o3 |
| 638 | bnz %xcc, .Lunalignhalf |
| 639 | add %o0, 8, %o0 |
| 640 | ba .Lunalignsrc |
| 641 | nop |
| 642 | |
| 643 | ! Src is Byte aligned |
| 644 | .Lunalignbyte: |
| 645 | sub %o0, %o1, %o0 ! share pointer advance |
| 646 | .Lunalignbyte_loop: |
| 647 | EX_LD_FP(LOAD(ldub, %o1, %o4), memcpy_retl_o2_plus_o3) |
| 648 | sllx %o4, 56, %o5 |
| 649 | EX_LD_FP(LOAD(lduh, %o1+1, %o4), memcpy_retl_o2_plus_o3) |
| 650 | sllx %o4, 40, %o4 |
| 651 | or %o4, %o5, %o5 |
| 652 | EX_LD_FP(LOAD(lduh, %o1+3, %o4), memcpy_retl_o2_plus_o3) |
| 653 | sllx %o4, 24, %o4 |
| 654 | or %o4, %o5, %o5 |
| 655 | EX_LD_FP(LOAD(lduh, %o1+5, %o4), memcpy_retl_o2_plus_o3) |
| 656 | sllx %o4, 8, %o4 |
| 657 | or %o4, %o5, %o5 |
| 658 | EX_LD_FP(LOAD(ldub, %o1+7, %o4), memcpy_retl_o2_plus_o3) |
| 659 | or %o4, %o5, %o5 |
| 660 | add %o0, %o1, %o0 |
| 661 | EX_ST_FP(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_o3) |
| 662 | sub %o0, %o1, %o0 |
| 663 | subcc %o3, 8, %o3 |
| 664 | bnz %xcc, .Lunalignbyte_loop |
| 665 | add %o1, 8, %o1 |
| 666 | add %o0, %o1, %o0 ! restore pointer |
| 667 | |
| 668 | ! Destination is now block (64 byte) aligned |
| 669 | .Lunalignsrc: |
| 670 | andn %o2, 0x3f, %o5 ! %o5 is multiple of block size |
| 671 | and %o2, 0x3f, %o2 ! residue bytes in %o2 |
| 672 | add %o2, 64, %o2 ! Ensure we do not load beyond |
| 673 | sub %o5, 64, %o5 ! end of source buffer |
| 674 | |
| 675 | andn %o1, 0x7, %o4 ! %o4 has long word aligned src address |
| 676 | alignaddr %o1, %g0, %g0 ! generate %gsr |
| 677 | add %o1, %o5, %o1 ! advance %o1 to after blocks |
| 678 | |
| 679 | EX_LD_FP(LOAD(ldd, %o4, %f14), memcpy_retl_o2_plus_o5) |
| 680 | add %o4, 8, %o4 |
| 681 | .Lunalign_sloop: |
| 682 | EX_LD_FP(LOAD(ldd, %o4, %f16), memcpy_retl_o2_plus_o5) |
| 683 | faligndata %f14, %f16, %f0 |
| 684 | EX_LD_FP(LOAD(ldd, %o4+8, %f18), memcpy_retl_o2_plus_o5) |
| 685 | faligndata %f16, %f18, %f2 |
| 686 | EX_LD_FP(LOAD(ldd, %o4+16, %f20), memcpy_retl_o2_plus_o5) |
| 687 | faligndata %f18, %f20, %f4 |
| 688 | EX_ST_FP(STORE(std, %f0, %o0), memcpy_retl_o2_plus_o5) |
| 689 | subcc %o5, 64, %o5 |
| 690 | EX_LD_FP(LOAD(ldd, %o4+24, %f22), memcpy_retl_o2_plus_o5_plus_56) |
| 691 | faligndata %f20, %f22, %f6 |
| 692 | EX_ST_FP(STORE(std, %f2, %o0+8), memcpy_retl_o2_plus_o5_plus_56) |
| 693 | EX_LD_FP(LOAD(ldd, %o4+32, %f24), memcpy_retl_o2_plus_o5_plus_48) |
| 694 | faligndata %f22, %f24, %f8 |
| 695 | EX_ST_FP(STORE(std, %f4, %o0+16), memcpy_retl_o2_plus_o5_plus_48) |
| 696 | EX_LD_FP(LOAD(ldd, %o4+40, %f26), memcpy_retl_o2_plus_o5_plus_40) |
| 697 | faligndata %f24, %f26, %f10 |
| 698 | EX_ST_FP(STORE(std, %f6, %o0+24), memcpy_retl_o2_plus_o5_plus_40) |
| 699 | EX_LD_FP(LOAD(ldd, %o4+48, %f28), memcpy_retl_o2_plus_o5_plus_40) |
| 700 | faligndata %f26, %f28, %f12 |
| 701 | EX_ST_FP(STORE(std, %f8, %o0+32), memcpy_retl_o2_plus_o5_plus_40) |
| 702 | add %o4, 64, %o4 |
| 703 | EX_LD_FP(LOAD(ldd, %o4-8, %f30), memcpy_retl_o2_plus_o5_plus_40) |
| 704 | faligndata %f28, %f30, %f14 |
| 705 | EX_ST_FP(STORE(std, %f10, %o0+40), memcpy_retl_o2_plus_o5_plus_40) |
| 706 | EX_ST_FP(STORE(std, %f12, %o0+48), memcpy_retl_o2_plus_o5_plus_40) |
| 707 | add %o0, 64, %o0 |
| 708 | EX_ST_FP(STORE(std, %f14, %o0-8), memcpy_retl_o2_plus_o5_plus_40) |
| 709 | fsrc2 %f30, %f14 |
| 710 | bgu,pt %xcc, .Lunalign_sloop |
| 711 | prefetch [%o4 + (8 * BLOCK_SIZE)], 20 |
| 712 | |
| 713 | .Lunalign_done: |
| 714 | ! Handle trailing bytes, 64 to 127 |
| 715 | ! Dest long word aligned, Src not long word aligned |
| 716 | cmp %o2, 15 |
| 717 | bleu %xcc, .Lunalign_short |
| 718 | |
| 719 | andn %o2, 0x7, %o5 ! %o5 is multiple of 8 |
| 720 | and %o2, 0x7, %o2 ! residue bytes in %o2 |
| 721 | add %o2, 8, %o2 |
| 722 | sub %o5, 8, %o5 ! ensure we do not load past end of src |
| 723 | andn %o1, 0x7, %o4 ! %o4 has long word aligned src address |
| 724 | add %o1, %o5, %o1 ! advance %o1 to after multiple of 8 |
| 725 | EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5)! fetch partialword |
| 726 | .Lunalign_by8: |
| 727 | EX_LD_FP(LOAD(ldd, %o4+8, %f2), memcpy_retl_o2_plus_o5) |
| 728 | add %o4, 8, %o4 |
| 729 | faligndata %f0, %f2, %f16 |
| 730 | subcc %o5, 8, %o5 |
| 731 | EX_ST_FP(STORE(std, %f16, %o0), memcpy_retl_o2_plus_o5) |
| 732 | fsrc2 %f2, %f0 |
| 733 | bgu,pt %xcc, .Lunalign_by8 |
| 734 | add %o0, 8, %o0 |
| 735 | |
| 736 | .Lunalign_short: |
| 737 | #ifdef NON_USER_COPY |
| 738 | VISExitHalfFast |
| 739 | #else |
| 740 | VISExitHalf |
| 741 | #endif |
| 742 | ba .Lsmallrest |
| 743 | nop |
| 744 | |
| 745 | /* |
| 746 | * This is a special case of nested memcpy. This can happen when the kernel |
| 747 | * calls unaligned memcpy back to back without saving FP registers. We need |
| 748 | * traps (context switches) to save/restore FP registers. If the kernel calls |
| 749 | * memcpy without this trap sequence we will hit FP corruption. Let's use |
| 750 | * the normal integer load/store method in this case. |
| 751 | */ |
| 752 | |
| 753 | #ifdef NON_USER_COPY |
| 754 | .Lmedium_vis_entry_fail_cp: |
| 755 | or %o0, %o1, %g2 |
| 756 | #endif |
| 757 | .Lmedium_cp: |
| 758 | LOAD(prefetch, %o1 + 0x40, #n_reads_strong) |
| 759 | andcc %g2, 0x7, %g0 |
| 760 | bne,pn %xcc, .Lmedium_unaligned_cp |
| 761 | nop |
| 762 | |
| 763 | .Lmedium_noprefetch_cp: |
| 764 | andncc %o2, 0x20 - 1, %o5 |
| 765 | be,pn %xcc, 2f |
| 766 | sub %o2, %o5, %o2 |
| 767 | 1: EX_LD(LOAD(ldx, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5) |
| 768 | EX_LD(LOAD(ldx, %o1 + 0x08, %g2), memcpy_retl_o2_plus_o5) |
| 769 | EX_LD(LOAD(ldx, %o1 + 0x10, %g7), memcpy_retl_o2_plus_o5) |
| 770 | EX_LD(LOAD(ldx, %o1 + 0x18, %o4), memcpy_retl_o2_plus_o5) |
| 771 | add %o1, 0x20, %o1 |
| 772 | subcc %o5, 0x20, %o5 |
| 773 | EX_ST(STORE(stx, %o3, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_32) |
| 774 | EX_ST(STORE(stx, %g2, %o0 + 0x08), memcpy_retl_o2_plus_o5_plus_24) |
| 775 | EX_ST(STORE(stx, %g7, %o0 + 0x10), memcpy_retl_o2_plus_o5_plus_24) |
| 776 | EX_ST(STORE(stx, %o4, %o0 + 0x18), memcpy_retl_o2_plus_o5_plus_8) |
| 777 | bne,pt %xcc, 1b |
| 778 | add %o0, 0x20, %o0 |
| 779 | 2: andcc %o2, 0x18, %o5 |
| 780 | be,pt %xcc, 3f |
| 781 | sub %o2, %o5, %o2 |
| 782 | 1: EX_LD(LOAD(ldx, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5) |
| 783 | add %o1, 0x08, %o1 |
| 784 | add %o0, 0x08, %o0 |
| 785 | subcc %o5, 0x08, %o5 |
| 786 | bne,pt %xcc, 1b |
| 787 | EX_ST(STORE(stx, %o3, %o0 - 0x08), memcpy_retl_o2_plus_o5_plus_8) |
| 788 | 3: brz,pt %o2, .Lexit_cp |
| 789 | cmp %o2, 0x04 |
| 790 | bl,pn %xcc, .Ltiny_cp |
| 791 | nop |
| 792 | EX_LD(LOAD(lduw, %o1 + 0x00, %o3), memcpy_retl_o2) |
| 793 | add %o1, 0x04, %o1 |
| 794 | add %o0, 0x04, %o0 |
| 795 | subcc %o2, 0x04, %o2 |
| 796 | bne,pn %xcc, .Ltiny_cp |
| 797 | EX_ST(STORE(stw, %o3, %o0 - 0x04), memcpy_retl_o2_plus_4) |
| 798 | ba,a,pt %xcc, .Lexit_cp |
| 799 | |
| 800 | .Lmedium_unaligned_cp: |
| 801 | /* First get dest 8 byte aligned. */ |
| 802 | sub %g0, %o0, %o3 |
| 803 | and %o3, 0x7, %o3 |
| 804 | brz,pt %o3, 2f |
| 805 | sub %o2, %o3, %o2 |
| 806 | |
| 807 | 1: EX_LD(LOAD(ldub, %o1 + 0x00, %g2), memcpy_retl_o2_plus_g1) |
| 808 | add %o1, 1, %o1 |
| 809 | subcc %o3, 1, %o3 |
| 810 | add %o0, 1, %o0 |
| 811 | bne,pt %xcc, 1b |
| 812 | EX_ST(STORE(stb, %g2, %o0 - 0x01), memcpy_retl_o2_plus_g1_plus_1) |
| 813 | 2: |
| 814 | and %o1, 0x7, %o3 |
| 815 | brz,pn %o3, .Lmedium_noprefetch_cp |
| 816 | sll %o3, 3, %o3 |
| 817 | mov 64, %g2 |
| 818 | sub %g2, %o3, %g2 |
| 819 | andn %o1, 0x7, %o1 |
| 820 | EX_LD(LOAD(ldx, %o1 + 0x00, %o4), memcpy_retl_o2) |
| 821 | sllx %o4, %o3, %o4 |
| 822 | andn %o2, 0x08 - 1, %o5 |
| 823 | sub %o2, %o5, %o2 |
| 824 | |
| 825 | 1: EX_LD(LOAD(ldx, %o1 + 0x08, %g3), memcpy_retl_o2_plus_o5) |
| 826 | add %o1, 0x08, %o1 |
| 827 | subcc %o5, 0x08, %o5 |
| 828 | srlx %g3, %g2, %g7 |
| 829 | or %g7, %o4, %g7 |
| 830 | EX_ST(STORE(stx, %g7, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_8) |
| 831 | add %o0, 0x08, %o0 |
| 832 | bne,pt %xcc, 1b |
| 833 | sllx %g3, %o3, %o4 |
| 834 | srl %o3, 3, %o3 |
| 835 | add %o1, %o3, %o1 |
| 836 | brz,pn %o2, .Lexit_cp |
| 837 | nop |
| 838 | ba,pt %xcc, .Lsmall_unaligned_cp |
| 839 | |
| 840 | .Ltiny_cp: |
| 841 | EX_LD(LOAD(ldub, %o1 + 0x00, %o3), memcpy_retl_o2) |
| 842 | subcc %o2, 1, %o2 |
| 843 | be,pn %xcc, .Lexit_cp |
| 844 | EX_ST(STORE(stb, %o3, %o0 + 0x00), memcpy_retl_o2_plus_1) |
| 845 | EX_LD(LOAD(ldub, %o1 + 0x01, %o3), memcpy_retl_o2) |
| 846 | subcc %o2, 1, %o2 |
| 847 | be,pn %xcc, .Lexit_cp |
| 848 | EX_ST(STORE(stb, %o3, %o0 + 0x01), memcpy_retl_o2_plus_1) |
| 849 | EX_LD(LOAD(ldub, %o1 + 0x02, %o3), memcpy_retl_o2) |
| 850 | ba,pt %xcc, .Lexit_cp |
| 851 | EX_ST(STORE(stb, %o3, %o0 + 0x02), memcpy_retl_o2) |
| 852 | |
| 853 | .Lsmall_cp: |
| 854 | andcc %g2, 0x3, %g0 |
| 855 | bne,pn %xcc, .Lsmall_unaligned_cp |
| 856 | andn %o2, 0x4 - 1, %o5 |
| 857 | sub %o2, %o5, %o2 |
| 858 | 1: |
| 859 | EX_LD(LOAD(lduw, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5) |
| 860 | add %o1, 0x04, %o1 |
| 861 | subcc %o5, 0x04, %o5 |
| 862 | add %o0, 0x04, %o0 |
| 863 | bne,pt %xcc, 1b |
| 864 | EX_ST(STORE(stw, %o3, %o0 - 0x04), memcpy_retl_o2_plus_o5_plus_4) |
| 865 | brz,pt %o2, .Lexit_cp |
| 866 | nop |
| 867 | ba,a,pt %xcc, .Ltiny_cp |
| 868 | |
| 869 | .Lsmall_unaligned_cp: |
| 870 | 1: EX_LD(LOAD(ldub, %o1 + 0x00, %o3), memcpy_retl_o2) |
| 871 | add %o1, 1, %o1 |
| 872 | add %o0, 1, %o0 |
| 873 | subcc %o2, 1, %o2 |
| 874 | bne,pt %xcc, 1b |
| 875 | EX_ST(STORE(stb, %o3, %o0 - 0x01), memcpy_retl_o2_plus_1) |
| 876 | ba,a,pt %xcc, .Lexit_cp |
| 877 | |
| 878 | .Lsmallrest: |
| 879 | tst %o2 |
| 880 | bz,pt %xcc, .Lsmallx |
| 881 | cmp %o2, 4 |
| 882 | blt,pn %xcc, .Lsmallleft3 |
| 883 | nop |
| 884 | sub %o2, 3, %o2 |
| 885 | .Lsmallnotalign4: |
| 886 | EX_LD(LOAD(ldub, %o1, %o3), memcpy_retl_o2_plus_3)! read byte |
| 887 | subcc %o2, 4, %o2 ! reduce count by 4 |
| 888 | EX_ST(STORE(stb, %o3, %o0), memcpy_retl_o2_plus_7)! write byte & repeat |
| 889 | EX_LD(LOAD(ldub, %o1+1, %o3), memcpy_retl_o2_plus_6)! for total of 4 |
| 890 | add %o1, 4, %o1 ! advance SRC by 4 |
| 891 | EX_ST(STORE(stb, %o3, %o0+1), memcpy_retl_o2_plus_6) |
| 892 | EX_LD(LOAD(ldub, %o1-2, %o3), memcpy_retl_o2_plus_5) |
| 893 | add %o0, 4, %o0 ! advance DST by 4 |
| 894 | EX_ST(STORE(stb, %o3, %o0-2), memcpy_retl_o2_plus_5) |
| 895 | EX_LD(LOAD(ldub, %o1-1, %o3), memcpy_retl_o2_plus_4) |
| 896 | bgu,pt %xcc, .Lsmallnotalign4 ! loop until 3 or fewer bytes remain |
| 897 | EX_ST(STORE(stb, %o3, %o0-1), memcpy_retl_o2_plus_4) |
| 898 | addcc %o2, 3, %o2 ! restore count |
| 899 | bz,pt %xcc, .Lsmallx |
| 900 | .Lsmallleft3: ! 1, 2, or 3 bytes remain |
| 901 | subcc %o2, 1, %o2 |
| 902 | EX_LD(LOAD(ldub, %o1, %o3), memcpy_retl_o2_plus_1) ! load one byte |
| 903 | bz,pt %xcc, .Lsmallx |
| 904 | EX_ST(STORE(stb, %o3, %o0), memcpy_retl_o2_plus_1) ! store one byte |
| 905 | EX_LD(LOAD(ldub, %o1+1, %o3), memcpy_retl_o2) ! load second byte |
| 906 | subcc %o2, 1, %o2 |
| 907 | bz,pt %xcc, .Lsmallx |
| 908 | EX_ST(STORE(stb, %o3, %o0+1), memcpy_retl_o2_plus_1)! store second byte |
| 909 | EX_LD(LOAD(ldub, %o1+2, %o3), memcpy_retl_o2) ! load third byte |
| 910 | EX_ST(STORE(stb, %o3, %o0+2), memcpy_retl_o2) ! store third byte |
| 911 | .Lsmallx: |
| 912 | retl |
| 913 | mov EX_RETVAL(%g1), %o0 |
| 914 | .Lsmallfin: |
| 915 | tst %o2 |
| 916 | bnz,pn %xcc, .Lsmallleft3 |
| 917 | nop |
| 918 | retl |
| 919 | mov EX_RETVAL(%g1), %o0 ! restore %o0 |
| 920 | .Lexit_cp: |
| 921 | retl |
| 922 | mov EX_RETVAL(%g1), %o0 |
| 923 | .size FUNC_NAME, .-FUNC_NAME |