// SPDX-License-Identifier: GPL-2.0-only
#include <linux/atomic.h>
#include <linux/percpu.h>
#include <linux/wait.h>
#include <linux/lockdep.h>
#include <linux/percpu-rwsem.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/errno.h>

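/*
 * Readers take the lock through a per-CPU read_count on the fast path.
 * A writer flips the rcu_sync state to push readers onto the slow path,
 * sets sem->block to exclude new readers and other writers, and then
 * waits for the per-CPU read counts to drain to zero.
 */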
int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
                        const char *name, struct lock_class_key *key)
{
        sem->read_count = alloc_percpu(int);
        if (unlikely(!sem->read_count))
                return -ENOMEM;

        rcu_sync_init(&sem->rss);
        rcuwait_init(&sem->writer);
        init_waitqueue_head(&sem->waiters);
        atomic_set(&sem->block, 0);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        debug_check_no_locks_freed((void *)sem, sizeof(*sem));
        lockdep_init_map(&sem->dep_map, name, key, 0);
#endif
        return 0;
}
EXPORT_SYMBOL_GPL(__percpu_init_rwsem);

void percpu_free_rwsem(struct percpu_rw_semaphore *sem)
{
        /*
         * XXX: temporary kludge. The error path in alloc_super()
         * assumes that percpu_free_rwsem() is safe after kzalloc().
         */
        if (!sem->read_count)
                return;

        rcu_sync_dtor(&sem->rss);
        free_percpu(sem->read_count);
        sem->read_count = NULL; /* catch use after free bugs */
}
EXPORT_SYMBOL_GPL(percpu_free_rwsem);

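/*
 * Reader fast path, called with preemption disabled: bump this CPU's
 * read_count, then check whether a writer has claimed sem->block. If so,
 * undo the increment and prod the writer to re-check the reader counts.
 */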
static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
{
        this_cpu_inc(*sem->read_count);

        /*
         * Due to having preemption disabled the decrement happens on
         * the same CPU as the increment, avoiding the
         * increment-on-one-CPU-and-decrement-on-another problem.
         *
         * If the reader misses the writer's assignment of sem->block, then the
         * writer is guaranteed to see the reader's increment.
         *
         * Conversely, any readers that increment their sem->read_count after
         * the writer looks are guaranteed to see the sem->block value, which
         * in turn means that they are guaranteed to immediately decrement
         * their sem->read_count, so that it doesn't matter that the writer
         * missed them.
         */

        smp_mb(); /* A matches D */

        /*
         * If !sem->block the critical section starts here, matched by the
         * release in percpu_up_write().
         */
        if (likely(!atomic_read_acquire(&sem->block)))
                return true;

        this_cpu_dec(*sem->read_count);

        /* Prod writer to re-evaluate readers_active_check() */
        rcuwait_wake_up(&sem->writer);

        return false;
}

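/*
 * Writer-side trylock on sem->block. The plain atomic_read() is a cheap
 * first check that skips the atomic_xchg() when the lock is already held.
 */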
static inline bool __percpu_down_write_trylock(struct percpu_rw_semaphore *sem)
{
        if (atomic_read(&sem->block))
                return false;

        return atomic_xchg(&sem->block, 1) == 0;
}

static bool __percpu_rwsem_trylock(struct percpu_rw_semaphore *sem, bool reader)
{
        if (reader) {
                bool ret;

                preempt_disable();
                ret = __percpu_down_read_trylock(sem);
                preempt_enable();

                return ret;
        }
        return __percpu_down_write_trylock(sem);
}

/*
 * The return value of wait_queue_entry::func means:
 *
 *  <0 - error, wakeup is terminated and the error is returned
 *   0 - no wakeup, a next waiter is tried
 *  >0 - woken, if EXCLUSIVE, counted towards @nr_exclusive.
 *
 * We use EXCLUSIVE for both readers and writers to preserve FIFO order,
 * and play games with the return value to allow waking multiple readers.
 *
 * Specifically, we wake readers until we've woken a single writer, or until a
 * trylock fails.
 */
static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry,
                                      unsigned int mode, int wake_flags,
                                      void *key)
{
        bool reader = wq_entry->flags & WQ_FLAG_CUSTOM;
        struct percpu_rw_semaphore *sem = key;
        struct task_struct *p;

        /* concurrent against percpu_down_write(), can get stolen */
        if (!__percpu_rwsem_trylock(sem, reader))
                return 1;

        p = get_task_struct(wq_entry->private);
        list_del_init(&wq_entry->entry);
        smp_store_release(&wq_entry->private, NULL);

        wake_up_process(p);
        put_task_struct(p);

        return !reader; /* wake (readers until) 1 writer */
}

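/*
 * Enqueue ourselves on sem->waiters in FIFO order and sleep until
 * percpu_rwsem_wake_function() takes the lock on our behalf; it signals
 * the hand-off by clearing wq_entry.private after unlinking the entry.
 */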
static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader)
{
        DEFINE_WAIT_FUNC(wq_entry, percpu_rwsem_wake_function);
        bool wait;

        spin_lock_irq(&sem->waiters.lock);
        /*
         * Serialize against the wakeup in percpu_up_write(); if we fail
         * the trylock, the wakeup must see us on the list.
         */
        wait = !__percpu_rwsem_trylock(sem, reader);
        if (wait) {
                wq_entry.flags |= WQ_FLAG_EXCLUSIVE | reader * WQ_FLAG_CUSTOM;
                __add_wait_queue_entry_tail(&sem->waiters, &wq_entry);
        }
        spin_unlock_irq(&sem->waiters.lock);

        while (wait) {
                set_current_state(TASK_UNINTERRUPTIBLE);
                if (!smp_load_acquire(&wq_entry.private))
                        break;
                schedule();
        }
        __set_current_state(TASK_RUNNING);
}

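/*
 * Reader slow path, called with preemption disabled when the fast path in
 * percpu_down_read() is unavailable. @try turns this into a trylock;
 * otherwise preemption is re-enabled while we sleep in percpu_rwsem_wait().
 */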
bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
{
        if (__percpu_down_read_trylock(sem))
                return true;

        if (try)
                return false;

        preempt_enable();
        percpu_rwsem_wait(sem, /* .reader = */ true);
        preempt_disable();

        return true;
}
EXPORT_SYMBOL_GPL(__percpu_down_read);

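/* Sum a per-CPU variable across all possible CPUs. */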
#define per_cpu_sum(var)                                                \
({                                                                      \
        typeof(var) __sum = 0;                                          \
        int cpu;                                                        \
        compiletime_assert_atomic_type(__sum);                          \
        for_each_possible_cpu(cpu)                                      \
                __sum += per_cpu(var, cpu);                             \
        __sum;                                                          \
})

/*
 * Return true if the modular sum of the sem->read_count per-CPU variable is
 * zero. If this sum is zero, then it is stable due to the fact that if any
 * newly arriving readers increment a given counter, they will immediately
 * decrement that same counter.
 *
 * Assumes sem->block is set.
 */
static bool readers_active_check(struct percpu_rw_semaphore *sem)
{
        if (per_cpu_sum(*sem->read_count) != 0)
                return false;

        /*
         * If we observed the decrement, ensure we see the entire critical
         * section.
         */

        smp_mb(); /* C matches B */

        return true;
}

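/*
 * Writer acquire: push readers onto the slow path, take sem->block for
 * writer-writer exclusion, then wait for every pre-existing reader to
 * drop its per-CPU read_count.
 */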
void percpu_down_write(struct percpu_rw_semaphore *sem)
{
        might_sleep();
        rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);

        /* Notify readers to take the slow path. */
        rcu_sync_enter(&sem->rss);

        /*
         * Try to set sem->block; this provides writer-writer exclusion.
         * Having sem->block set makes new readers block.
         */
        if (!__percpu_down_write_trylock(sem))
                percpu_rwsem_wait(sem, /* .reader = */ false);

        /* smp_mb() implied by __percpu_down_write_trylock() on success -- D matches A */

        /*
         * If they don't see our store of sem->block, then we are guaranteed to
         * see their sem->read_count increment, and therefore will wait for
         * them.
         */

        /* Wait for all active readers to complete. */
        rcuwait_wait_event(&sem->writer, readers_active_check(sem), TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL_GPL(percpu_down_write);

void percpu_up_write(struct percpu_rw_semaphore *sem)
{
        rwsem_release(&sem->dep_map, _RET_IP_);

        /*
         * Signal the writer is done, no fast path yet.
         *
         * One reason that we cannot just immediately flip to readers_fast is
         * that new readers might fail to see the results of this writer's
         * critical section.
         *
         * Therefore we force it through the slow path which guarantees an
         * acquire and thereby guarantees the critical section's consistency.
         */
        atomic_set_release(&sem->block, 0);

        /*
         * Prod any pending reader/writer to make progress.
         */
        __wake_up(&sem->waiters, TASK_NORMAL, 1, sem);

        /*
         * Once this completes (at least one RCU-sched grace period hence) the
         * reader fast path will be available again. Safe to use outside the
         * exclusive write lock because it's counting.
         */
        rcu_sync_exit(&sem->rss);
}
EXPORT_SYMBOL_GPL(percpu_up_write);