/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef __MIPS_ASM_SYNC_H__
#define __MIPS_ASM_SYNC_H__

/*
 * sync types are defined by the MIPS64 Instruction Set documentation in Volume
 * II-A of the MIPS Architecture Reference Manual, which can be found here:
 *
 *   https://www.mips.com/?do-download=the-mips64-instruction-set-v6-06
 *
 * Two types of barrier are provided:
 *
 *   1) Completion barriers, which ensure that a memory operation has actually
 *      completed & often involve stalling the CPU pipeline to do so.
 *
 *   2) Ordering barriers, which only ensure that affected memory operations
 *      won't be reordered in the CPU pipeline in a manner that violates the
 *      restrictions imposed by the barrier.
 *
 * Ordering barriers can be more efficient than completion barriers, since:
 *
 *   a) Ordering barriers only require memory access instructions which precede
 *      them in program order (older instructions) to reach a point in the
 *      load/store datapath beyond which reordering is not possible before
 *      allowing memory access instructions which follow them (younger
 *      instructions) to be performed. That is, older instructions don't
 *      actually need to complete - they just need to get far enough that all
 *      other coherent CPUs will observe their completion before they observe
 *      the effects of younger instructions.
 *
 *   b) Multiple variants of ordering barrier are provided which allow the
 *      effects to be restricted to different combinations of older or younger
 *      loads or stores. By way of example, if we only care that stores older
 *      than a barrier are observed prior to stores that are younger than a
 *      barrier & don't care about the ordering of loads then the 'wmb'
 *      ordering barrier can be used. Limiting the barrier's effects to stores
 *      allows loads to continue unaffected & potentially allows the CPU to
 *      make progress faster than if younger loads had to wait for older stores
 *      to complete.
 */
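
/*
 * As a hedged illustration (register names & the weakly ordered CPU assumed
 * here are not taken from real callers), the difference between the two
 * barrier types looks roughly like this, using the stype values defined
 * further down in this file:
 *
 *	sw	$t0, 0($a0)
 *	sync	0x0		# full completion barrier: all older accesses
 *				# must complete before anything younger
 *	sw	$t1, 0($a1)
 *	lw	$t2, 0($a2)
 *
 *	sw	$t0, 0($a0)
 *	sync	0x4		# 'wmb' ordering barrier: older stores are
 *				# ordered before younger stores only
 *	sw	$t1, 0($a1)
 *	lw	$t2, 0($a2)	# unaffected; may be performed without waiting
 *				# for the older stores to complete
 */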

/*
 * No sync instruction at all; used to allow code to nullify the effect of the
 * __SYNC() macro without needing lots of #ifdefery.
 */
#define __SYNC_none	-1

/*
 * A full completion barrier; all memory accesses appearing prior to this sync
 * instruction in program order must complete before any memory accesses
 * appearing after this sync instruction in program order.
 */
#define __SYNC_full	0x00

/*
 * For now we use a full completion barrier to implement all sync types, until
 * we're satisfied that lightweight ordering barriers defined by MIPSr6 are
 * sufficient to uphold our desired memory model.
 */
#define __SYNC_aq	__SYNC_full
#define __SYNC_rl	__SYNC_full
#define __SYNC_mb	__SYNC_full

/*
 * ...except on Cavium Octeon CPUs, which have been using the 'wmb' ordering
 * barrier since 2010 & omit 'rmb' barriers because the CPUs don't perform
 * speculative reads.
 */
#ifdef CONFIG_CPU_CAVIUM_OCTEON
# define __SYNC_rmb	__SYNC_none
# define __SYNC_wmb	0x04
#else
# define __SYNC_rmb	__SYNC_full
# define __SYNC_wmb	__SYNC_full
#endif

/*
 * A GINV sync is a little different; it doesn't relate directly to loads or
 * stores, but instead causes synchronization of an icache or TLB global
 * invalidation operation triggered by the ginvi or ginvt instructions
 * respectively. In cases where we need to know that a ginvi or ginvt operation
 * has been performed by all coherent CPUs, we must issue a sync instruction of
 * this type. Once this instruction graduates, all coherent CPUs will have
 * observed the invalidation.
 */
#define __SYNC_ginv	0x14
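
/*
 * A hedged sketch of the intended usage pattern (the operands are
 * illustrative, not taken from any real caller): issue the global
 * invalidate, then a GINV sync to wait until every coherent CPU has
 * observed it.
 *
 *	ginvt	$a0, 0		# global TLB invalidate (MIPSr6)
 *	sync	0x14		# __SYNC_ginv: wait for global observation
 */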

/* Trivial; indicate that we always need this sync instruction. */
#define __SYNC_always	(1 << 0)

/*
 * Indicate that we need this sync instruction only on systems with weakly
 * ordered memory access. In general this is most MIPS systems, but there are
 * exceptions which provide strongly ordered memory.
 */
#ifdef CONFIG_WEAK_ORDERING
# define __SYNC_weak_ordering	(1 << 1)
#else
# define __SYNC_weak_ordering	0
#endif

/*
 * Indicate that we need this sync instruction only on systems where LL/SC
 * don't implicitly provide a memory barrier. In general this is most MIPS
 * systems.
 */
#ifdef CONFIG_WEAK_REORDERING_BEYOND_LLSC
# define __SYNC_weak_llsc	(1 << 2)
#else
# define __SYNC_weak_llsc	0
#endif

/*
 * Some Loongson 3 CPUs have a bug wherein execution of a memory access (load,
 * store or prefetch) in between an LL & SC can cause the SC instruction to
 * erroneously succeed, breaking atomicity. Whilst it's unusual to write code
 * containing such sequences, this bug bites harder than we might otherwise
 * expect due to reordering & speculation:
 *
 * 1) A memory access appearing prior to the LL in program order may actually
 *    be executed after the LL - this is the reordering case.
 *
 *    In order to avoid this we need to place a memory barrier (ie. a SYNC
 *    instruction) prior to every LL instruction, in between it and any earlier
 *    memory access instructions.
 *
 *    This reordering case is fixed by 3A R2 CPUs, ie. 3A2000 models and later.
 *
 * 2) If a conditional branch exists between an LL & SC with a target outside
 *    of the LL-SC loop, for example an exit upon value mismatch in cmpxchg()
 *    or similar, then misprediction of the branch may allow speculative
 *    execution of memory accesses from outside of the LL-SC loop.
 *
 *    In order to avoid this we need a memory barrier (ie. a SYNC instruction)
 *    at each affected branch target.
 *
 *    This case affects all current Loongson 3 CPUs.
 *
 * The cases described above manifest as an error in the cache coherence
 * protocol: the Invalidate from a competing LL-SC goes 'missing', so the SC
 * erroneously observes that its core still holds the line in the Exclusive
 * state and lets the SC proceed.
 *
 * Therefore the error only occurs on SMP systems.
 */
#ifdef CONFIG_CPU_LOONGSON3_WORKAROUNDS
# define __SYNC_loongson3_war	(1 << 31)
#else
# define __SYNC_loongson3_war	0
#endif
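
/*
 * A hedged sketch (not copied from any real caller) of where the extra sync
 * instructions described above are placed around a generic LL/SC loop;
 * register names & label layout are illustrative only:
 *
 *	sync			# barrier before the LL (reordering case)
 * 1:	ll	$t0, 0($a0)
 *	bne	$t0, $a1, 2f	# mispredictable exit from the loop
 *	move	$t1, $a2
 *	sc	$t1, 0($a0)
 *	beqz	$t1, 1b
 * 2:	sync			# barrier at the affected branch target
 *				# (speculation case)
 *
 * (Branch delay slots are omitted for clarity.)
 */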

/*
 * Some Cavium Octeon CPUs suffer from a bug that causes a single wmb ordering
 * barrier to be ineffective, requiring the use of 2 in sequence to provide an
 * effective barrier as noted by commit 6b07d38aaa52 ("MIPS: Octeon: Use
 * optimized memory barrier primitives."). Here we specify that the affected
 * sync instructions should be emitted twice.
 * Note that this expression is evaluated by the assembler (not the compiler),
 * and that the assembler evaluates '==' as 0 or -1, not 0 or 1.
 */
#ifdef CONFIG_CPU_CAVIUM_OCTEON
# define __SYNC_rpt(type)	(1 - (type == __SYNC_wmb))
#else
# define __SYNC_rpt(type)	1
#endif
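
/*
 * For example, on Octeon the assembler evaluates the repeat count as follows
 * (a sketch of the arithmetic, using only the values defined above):
 *
 *	__SYNC_rpt(__SYNC_wmb)  -> 1 - (-1) = 2	  # wmb barriers emitted twice
 *	__SYNC_rpt(__SYNC_full) -> 1 - 0    = 1	  # everything else emitted once
 */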

/*
 * The main event. Here we actually emit a sync instruction of a given type,
 * but only if the type is not __SYNC_none and the given reason is non-zero.
 *
 * In future we have the option of emitting entries in a fixups-style table
 * here that would allow us to opportunistically remove some sync instructions
 * when we detect at runtime that we're running on a CPU that doesn't need
 * them.
 */
#ifdef CONFIG_CPU_HAS_SYNC
# define ____SYNC(_type, _reason, _else)			\
	.if	(( _type ) != -1) && ( _reason );		\
	.set	push;						\
	.set	MIPS_ISA_LEVEL_RAW;				\
	.rept	__SYNC_rpt(_type);				\
	sync	_type;						\
	.endr;							\
	.set	pop;						\
	.else;							\
	_else;							\
	.endif
#else
# define ____SYNC(_type, _reason, _else)
#endif
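
/*
 * An illustrative sketch of how the condition above behaves, using values
 * defined earlier in this file (the 'nop' is just an example _else argument):
 *
 *	____SYNC(__SYNC_full, __SYNC_always, )	  emits 'sync 0x00' (within
 *						  .set push/pop)
 *	____SYNC(__SYNC_none, __SYNC_always, )	  emits nothing; type is -1, so
 *						  the empty _else is used
 *	____SYNC(__SYNC_full, 0, nop)		  emits the _else code, 'nop'
 */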

/*
 * Preprocessor magic to expand macros used as arguments before we insert them
 * into assembly code.
 */
#ifdef __ASSEMBLY__
# define ___SYNC(type, reason, else)				\
	____SYNC(type, reason, else)
#else
# define ___SYNC(type, reason, else)				\
	__stringify(____SYNC(type, reason, else))
#endif

#define __SYNC(type, reason)					\
	___SYNC(__SYNC_##type, __SYNC_##reason, )
#define __SYNC_ELSE(type, reason, else)				\
	___SYNC(__SYNC_##type, __SYNC_##reason, else)
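
/*
 * A hedged usage sketch (the function name below is hypothetical, not part of
 * this header): in C the macro stringifies into an inline asm statement,
 * while .S files can use __SYNC() directly. Here a full barrier is emitted
 * only when the kernel is configured for weakly ordered memory:
 *
 *	static inline void example_mb(void)
 *	{
 *		asm volatile(__SYNC(full, weak_ordering) ::: "memory");
 *	}
 */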

#endif /* __MIPS_ASM_SYNC_H__ */