Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame^] | 1 | /* |
| 2 | * OpenRISC memset.S |
| 3 | * |
| 4 | * Hand-optimized assembler version of memset for OpenRISC. |
| 5 | * Algorithm inspired by several other arch-specific memset routines |
| 6 | * in the kernel tree |
| 7 | * |
| 8 | * Copyright (C) 2015 Olof Kindgren <olof.kindgren@gmail.com> |
| 9 | * |
| 10 | * This program is free software; you can redistribute it and/or |
| 11 | * modify it under the terms of the GNU General Public License |
| 12 | * as published by the Free Software Foundation; either version |
| 13 | * 2 of the License, or (at your option) any later version. |
| 14 | */ |
| 15 | |
| 16 | .global memset |
| 17 | .type memset, @function |
| 18 | memset: |
| 19 | /* memset(s, c, n): store the low byte of c into the n bytes starting at s; r11 = s on return. Arguments: |
| 20 | * r3 = *s (destination pointer; never written by this routine) |
| 21 | * r4 = c (only the low 8 bits are used) |
| 22 | * r5 = n (byte count, compared as unsigned) |
| 23 | * r13 (fill pattern), r15 (scratch), r17 (bytes left), r19 (write cursor) used as temp regs |
| 24 | */ |
| 25 | |
| 26 | /* Exit if n == 0; the l.andi below runs in the branch delay slot, harmlessly on the exit path */ |
| 27 | l.sfeqi r5, 0 |
| 28 | l.bf 4f |
| 29 | |
| 30 | /* Truncate c to its low 8 bits: r13 = 000c */ |
| 31 | l.andi r13, r4, 0xff |
| 32 | |
| 33 | /* Skip word extension if c is 0: r13 is then already the all-zero fill word */ |
| 34 | l.sfeqi r13, 0 |
| 35 | l.bf 1f |
| 36 | /* Delay slot, runs on both paths: flag = (n <= 7), i.e. fewer than two whole words (8 bytes). No other l.sf* compare occurs before the l.bf after 1:, which consumes it */ |
| 37 | l.sfleui r5, 7 |
| 38 | |
| 39 | /* Extend char c to 32-bit word cccc in r13 (r15 is scratch) */ |
| 40 | l.slli r15, r13, 16 // r13 = 000c, r15 = 0c00 |
| 41 | l.or r13, r13, r15 // r13 = 0c0c, r15 = 0c00 |
| 42 | l.slli r15, r13, 8 // r13 = 0c0c, r15 = c0c0 |
| 43 | l.or r13, r13, r15 // r13 = cccc, r15 = c0c0 |
| 44 | |
| 45 | 1: l.addi r19, r3, 0 // Set r19 = s (write cursor) |
| 46 | /* Jump to byte fill loop if n <= 7 (flag still holds the l.sfleui result from above) */ |
| 47 | l.bf 3f |
| 48 | l.or r17, r5, r0 // Delay slot: set r17 = n (bytes left), needed on both paths |
| 49 | |
| 50 | /* Mask out the two LSBs of s to check word alignment */ |
| 51 | l.andi r15, r3, 0x3 |
| 52 | |
| 53 | /* lsb == 00: s is already word aligned, jump to word fill loop */ |
| 54 | l.sfeqi r15, 0 |
| 55 | l.bf 2f |
| 56 | l.addi r19, r3, 0 // Delay slot: r19 = s |
| 57 | |
| 58 | /* lsb == 01, 10 or 11: at least one byte store is needed to reach alignment */ |
| 59 | l.sb 0(r3), r13 // *s = c |
| 60 | l.addi r17, r17, -1 // One byte fewer left |
| 61 | |
| 62 | l.sfeqi r15, 3 |
| 63 | l.bf 2f |
| 64 | l.addi r19, r3, 1 // Delay slot: r19 = s + 1 |
| 65 | |
| 66 | /* lsb == 01 or 10: a second byte store is needed */ |
| 67 | l.sb 1(r3), r13 // *(s + 1) = c |
| 68 | l.addi r17, r17, -1 // One byte fewer left |
| 69 | |
| 70 | l.sfeqi r15, 2 |
| 71 | l.bf 2f |
| 72 | l.addi r19, r3, 2 // Delay slot: r19 = s + 2 |
| 73 | |
| 74 | /* lsb == 01: a third byte store is needed */ |
| 75 | l.sb 2(r3), r13 // *(s + 2) = c |
| 76 | l.addi r17, r17, -1 // One byte fewer left |
| 77 | l.addi r19, r3, 3 // r19 = s + 3 |
| 78 | |
| 79 | /* Word fill loop: entered with r19 word aligned and r17 >= 5 (n >= 8 minus at most 3 alignment bytes) */ |
| 80 | 2: l.sw 0(r19), r13 // Store the word cccc at r19 |
| 81 | l.addi r17, r17, -4 // Four bytes fewer left |
| 82 | l.sfgeui r17, 4 |
| 83 | l.bf 2b |
| 84 | l.addi r19, r19, 4 // Delay slot: advance cursor one word (also runs on loop exit) |
| 85 | |
| 86 | /* 0-3 bytes remain: exit if none, otherwise fall into the byte loop. The l.addi at 3: is this branch's delay slot; on the exit path its result is unused */ |
| 87 | l.sfeqi r17, 0 |
| 88 | l.bf 4f |
| 89 | |
| 90 | /* Byte fill loop: r17 >= 1 on entry, stores one byte per iteration until r17 reaches 0 */ |
| 91 | 3: l.addi r17, r17, -1 // One byte fewer left |
| 92 | l.sb 0(r19), r13 // *r19 = c (l.sb stores only the low byte of r13) |
| 93 | l.sfnei r17, 0 |
| 94 | l.bf 3b |
| 95 | l.addi r19, r19, 1 // Delay slot: advance cursor one byte |
| 96 | |
| 97 | 4: l.jr r9 |
| 98 | l.ori r11, r3, 0 |