aboutsummaryrefslogtreecommitdiff
path: root/lib/aarch64
diff options
context:
space:
mode:
authorAndrew Thoelke <andrew.thoelke@arm.com>2014-04-25 10:49:30 +0100
committerAndrew Thoelke <andrew.thoelke@arm.com>2014-05-07 11:32:25 +0100
commit5f6032a8206bb88655367f96cc1270525bed9e48 (patch)
treeaca93bb3c4778253a238bc41b603a8760aee9783 /lib/aarch64
parente404d7f44a190b82332bb96daffa0c6239732218 (diff)
downloadtrusted-firmware-a-5f6032a8206bb88655367f96cc1270525bed9e48.tar.gz
Optimise data cache clean/invalidate operation
The data cache clean and invalidate operations dcsw_op_all() and dcsw_op_loius() were implemented to invoke a DSB and ISB barrier for every set/way operation. This adds a substantial performance penalty to an already expensive operation. These functions have been reworked to provide an optimised implementation derived from the code in section D3.4 of the ARMv8 ARM. The helper macro setup_dcsw_op_args has been moved and reworked alongside the implementation. Fixes ARM-software/tf-issues#146 Change-Id: Icd5df57816a83f0a842fce935320a369f7465c7f
Diffstat (limited to 'lib/aarch64')
-rw-r--r--lib/aarch64/cache_helpers.S152
1 files changed, 75 insertions, 77 deletions
diff --git a/lib/aarch64/cache_helpers.S b/lib/aarch64/cache_helpers.S
index 2649ad0e04..c272fc748b 100644
--- a/lib/aarch64/cache_helpers.S
+++ b/lib/aarch64/cache_helpers.S
@@ -138,94 +138,92 @@ inv_loop:
ret
- /* ------------------------------------------
- * Data cache operations by set/way to the
- * level specified
- * ------------------------------------------
- * ----------------------------------
- * Call this func with the clidr in
- * x0, starting cache level in x10,
- * last cache level in x3 & cm op in
- * x14
- * ----------------------------------
+ /* ---------------------------------------------------------------
+ * Data cache operations by set/way to the level specified
+ *
+ * The main function, do_dcsw_op requires:
+ * x0: The operation type (0-2), as defined in arch.h
+ * x3: The last cache level to operate on
+ * x9: clidr_el1
+ * and will carry out the operation on each data cache from level 0
+ * to the level in x3 in sequence
+ *
+ * The dcsw_op macro sets up the x3 and x9 parameters based on
+ * clidr_el1 cache information before invoking the main function
+ * ---------------------------------------------------------------
*/
-func dcsw_op
-all_start_at_level:
- add x2, x10, x10, lsr #1 // work out 3x current cache level
- lsr x1, x0, x2 // extract cache type bits from clidr
- and x1, x1, #7 // mask of the bits for current cache only
- cmp x1, #2 // see what cache we have at this level
- b.lt skip // skip if no cache, or just i-cache
- msr csselr_el1, x10 // select current cache level in csselr
- isb // isb to sych the new cssr&csidr
- mrs x1, ccsidr_el1 // read the new ccsidr
- and x2, x1, #7 // extract the length of the cache lines
- add x2, x2, #4 // add 4 (line length offset)
- mov x4, #0x3ff
- and x4, x4, x1, lsr #3 // find maximum number on the way size
- clz w5, w4 // find bit position of way size increment
- mov x7, #0x7fff
- and x7, x7, x1, lsr #13 // extract max number of the index size
-loop2:
- mov x9, x4 // create working copy of max way size
-loop3:
- lsl x6, x9, x5
- orr x11, x10, x6 // factor way and cache number into x11
- lsl x6, x7, x2
- orr x11, x11, x6 // factor index number into x11
- mov x12, x0
- mov x13, x30 // lr
- mov x0, x11
- blr x14
- mov x0, x12
- mov x30, x13 // lr
- subs x9, x9, #1 // decrement the way
- b.ge loop3
- subs x7, x7, #1 // decrement the index
- b.ge loop2
-skip:
- add x10, x10, #2 // increment cache number
- cmp x3, x10
- b.gt all_start_at_level
-finished:
- mov x10, #0 // swith back to cache level 0
- msr csselr_el1, x10 // select current cache level in csselr
- dsb sy
- isb
- ret
+ .macro dcsw_op shift, fw, ls
+ mrs x9, clidr_el1
+ ubfx x3, x9, \shift, \fw
+ lsl x3, x3, \ls
+ b do_dcsw_op
+ .endm
func do_dcsw_op
cbz x3, exit
- cmp x0, #DCISW
- b.eq dc_isw
- cmp x0, #DCCISW
- b.eq dc_cisw
- cmp x0, #DCCSW
- b.eq dc_csw
-dc_isw:
- mov x0, x9
- adr x14, dcisw
- b dcsw_op
-dc_cisw:
+ mov x10, xzr
+ adr x14, dcsw_loop_table // compute inner loop address
+ add x14, x14, x0, lsl #5 // inner loop is 8x32-bit instructions
mov x0, x9
- adr x14, dccisw
- b dcsw_op
-dc_csw:
- mov x0, x9
- adr x14, dccsw
- b dcsw_op
+ mov w8, #1
+loop1:
+ add x2, x10, x10, lsr #1 // work out 3x current cache level
+ lsr x1, x0, x2 // extract cache type bits from clidr
+ and x1, x1, #7 // mask the bits for current cache only
+ cmp x1, #2 // see what cache we have at this level
+ b.lt level_done // nothing to do if no cache or icache
+
+ msr csselr_el1, x10 // select current cache level in csselr
+ isb // isb to sych the new cssr&csidr
+ mrs x1, ccsidr_el1 // read the new ccsidr
+ and x2, x1, #7 // extract the length of the cache lines
+ add x2, x2, #4 // add 4 (line length offset)
+ ubfx x4, x1, #3, #10 // maximum way number
+ clz w5, w4 // bit position of way size increment
+ lsl w9, w4, w5 // w9 = aligned max way number
+ lsl w16, w8, w5 // w16 = way number loop decrement
+ orr w9, w10, w9 // w9 = combine way and cache number
+ ubfx w6, w1, #13, #15 // w6 = max set number
+ lsl w17, w8, w2 // w17 = set number loop decrement
+ dsb sy // barrier before we start this level
+ br x14 // jump to DC operation specific loop
+
+ .macro dcsw_loop _op
+loop2_\_op:
+ lsl w7, w6, w2 // w7 = aligned max set number
+
+loop3_\_op:
+ orr w11, w9, w7 // combine cache, way and set number
+ dc \_op, x11
+ subs w7, w7, w17 // decrement set number
+ b.ge loop3_\_op
+
+ subs x9, x9, x16 // decrement way number
+ b.ge loop2_\_op
+
+ b level_done
+ .endm
+
+level_done:
+ add x10, x10, #2 // increment cache number
+ cmp x3, x10
+ b.gt loop1
+ msr csselr_el1, xzr // select cache level 0 in csselr
+ dsb sy // barrier to complete final cache operation
+ isb
exit:
ret
+dcsw_loop_table:
+ dcsw_loop isw
+ dcsw_loop cisw
+ dcsw_loop csw
+
func dcsw_op_louis
- dsb sy
- setup_dcsw_op_args x10, x3, x9, #LOUIS_SHIFT, #CLIDR_FIELD_WIDTH, #LEVEL_SHIFT
- b do_dcsw_op
+ dcsw_op #LOUIS_SHIFT, #CLIDR_FIELD_WIDTH, #LEVEL_SHIFT
func dcsw_op_all
- dsb sy
- setup_dcsw_op_args x10, x3, x9, #LOC_SHIFT, #CLIDR_FIELD_WIDTH, #LEVEL_SHIFT
- b do_dcsw_op
+ dcsw_op #LOC_SHIFT, #CLIDR_FIELD_WIDTH, #LEVEL_SHIFT