blob: 11ae2747701b55a044eea9a82c6cd22b41a3157e [file] [log] [blame]
// SPDX-License-Identifier: GPL-2.0+
/*
 * Read-Copy Update mechanism for mutual exclusion
 *
 * Copyright IBM Corporation, 2008
 *
 * Authors: Dipankar Sarma <dipankar@in.ibm.com>
 *	    Manfred Spraul <manfred@colorfullife.com>
 *	    Paul E. McKenney <paulmck@linux.ibm.com> Hierarchical version
 *
 * Based on the original work by Paul McKenney <paulmck@linux.ibm.com>
 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
 *
 * For detailed explanation of Read-Copy Update mechanism see -
 *	Documentation/RCU
 */
17
18#define pr_fmt(fmt) "rcu: " fmt
19
20#include <linux/types.h>
21#include <linux/kernel.h>
22#include <linux/init.h>
23#include <linux/spinlock.h>
24#include <linux/smp.h>
25#include <linux/rcupdate_wait.h>
26#include <linux/interrupt.h>
27#include <linux/sched.h>
28#include <linux/sched/debug.h>
29#include <linux/nmi.h>
30#include <linux/atomic.h>
31#include <linux/bitops.h>
32#include <linux/export.h>
33#include <linux/completion.h>
34#include <linux/moduleparam.h>
35#include <linux/percpu.h>
36#include <linux/notifier.h>
37#include <linux/cpu.h>
38#include <linux/mutex.h>
39#include <linux/time.h>
40#include <linux/kernel_stat.h>
41#include <linux/wait.h>
42#include <linux/kthread.h>
43#include <uapi/linux/sched/types.h>
44#include <linux/prefetch.h>
45#include <linux/delay.h>
46#include <linux/stop_machine.h>
47#include <linux/random.h>
48#include <linux/trace_events.h>
49#include <linux/suspend.h>
50#include <linux/ftrace.h>
David Brazdil0f672f62019-12-10 10:32:29 +000051#include <linux/tick.h>
52#include <linux/sysrq.h>
53#include <linux/kprobes.h>
54#include <linux/gfp.h>
55#include <linux/oom.h>
56#include <linux/smpboot.h>
57#include <linux/jiffies.h>
58#include <linux/sched/isolation.h>
59#include <linux/sched/clock.h>
60#include "../time/tick-internal.h"
Andrew Scullb4b6d4a2019-01-02 15:54:55 +000061
62#include "tree.h"
63#include "rcu.h"
64
65#ifdef MODULE_PARAM_PREFIX
66#undef MODULE_PARAM_PREFIX
67#endif
68#define MODULE_PARAM_PREFIX "rcutree."
69
70/* Data structures. */
71
/*
 * The bottom bit of ->dynticks is borrowed as a control flag for idle
 * entry/exit processing (initially for TLB flushing), so the counter
 * itself advances in units of RCU_DYNTICK_CTRL_CTR.
 */
#define RCU_DYNTICK_CTRL_MASK	0x1
#define RCU_DYNTICK_CTRL_CTR	(RCU_DYNTICK_CTRL_MASK + 1)
/* Supply a no-op rcu_eqs_special_exit() unless one is already defined. */
#ifndef rcu_eqs_special_exit
#define rcu_eqs_special_exit()	do { } while (0)
#endif
81
David Brazdil0f672f62019-12-10 10:32:29 +000082static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
83 .dynticks_nesting = 1,
84 .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
85 .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
86};
87struct rcu_state rcu_state = {
88 .level = { &rcu_state.node[0] },
89 .gp_state = RCU_GP_IDLE,
90 .gp_seq = (0UL - 300UL) << RCU_SEQ_CTR_SHIFT,
91 .barrier_mutex = __MUTEX_INITIALIZER(rcu_state.barrier_mutex),
92 .name = RCU_NAME,
93 .abbr = RCU_ABBR,
94 .exp_mutex = __MUTEX_INITIALIZER(rcu_state.exp_mutex),
95 .exp_wake_mutex = __MUTEX_INITIALIZER(rcu_state.exp_wake_mutex),
96 .ofl_lock = __RAW_SPIN_LOCK_UNLOCKED(rcu_state.ofl_lock),
97};
Andrew Scullb4b6d4a2019-01-02 15:54:55 +000098
99/* Dump rcu_node combining tree at boot to verify correct setup. */
100static bool dump_tree;
101module_param(dump_tree, bool, 0444);
David Brazdil0f672f62019-12-10 10:32:29 +0000102/* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */
103static bool use_softirq = 1;
104module_param(use_softirq, bool, 0444);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000105/* Control rcu_node-tree auto-balancing at boot time. */
106static bool rcu_fanout_exact;
107module_param(rcu_fanout_exact, bool, 0444);
108/* Increase (but not decrease) the RCU_FANOUT_LEAF at boot time. */
109static int rcu_fanout_leaf = RCU_FANOUT_LEAF;
110module_param(rcu_fanout_leaf, int, 0444);
111int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
112/* Number of rcu_nodes at specified level. */
113int num_rcu_lvl[] = NUM_RCU_LVL_INIT;
114int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000115
116/*
117 * The rcu_scheduler_active variable is initialized to the value
118 * RCU_SCHEDULER_INACTIVE and transitions RCU_SCHEDULER_INIT just before the
119 * first task is spawned. So when this variable is RCU_SCHEDULER_INACTIVE,
120 * RCU can assume that there is but one task, allowing RCU to (for example)
121 * optimize synchronize_rcu() to a simple barrier(). When this variable
122 * is RCU_SCHEDULER_INIT, RCU must actually do all the hard work required
123 * to detect real grace periods. This variable is also used to suppress
124 * boot-time false positives from lockdep-RCU error checking. Finally, it
125 * transitions from RCU_SCHEDULER_INIT to RCU_SCHEDULER_RUNNING after RCU
126 * is fully initialized, including all of its kthreads having been spawned.
127 */
128int rcu_scheduler_active __read_mostly;
129EXPORT_SYMBOL_GPL(rcu_scheduler_active);
130
131/*
132 * The rcu_scheduler_fully_active variable transitions from zero to one
133 * during the early_initcall() processing, which is after the scheduler
134 * is capable of creating new tasks. So RCU processing (for example,
135 * creating tasks for RCU priority boosting) must be delayed until after
136 * rcu_scheduler_fully_active transitions from zero to one. We also
137 * currently delay invocation of any RCU callbacks until after this point.
138 *
139 * It might later prove better for people registering RCU callbacks during
140 * early boot to take responsibility for these callbacks, but one step at
141 * a time.
142 */
143static int rcu_scheduler_fully_active __read_mostly;
144
/* Forward declarations for functions used before they are defined. */
static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
			      unsigned long gps, unsigned long flags);
static void rcu_init_new_rnp(struct rcu_node *rnp_leaf);
static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
					  int outgoingcpu);
static void invoke_rcu_core(void);
static void rcu_report_exp_rdp(struct rcu_data *rdp);
static void sync_sched_exp_online_cleanup(int cpu);
153
154/* rcuc/rcub kthread realtime priority */
155static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0;
David Brazdil0f672f62019-12-10 10:32:29 +0000156module_param(kthread_prio, int, 0444);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000157
158/* Delay in jiffies for grace-period initialization delays, debug only. */
159
160static int gp_preinit_delay;
161module_param(gp_preinit_delay, int, 0444);
162static int gp_init_delay;
163module_param(gp_init_delay, int, 0444);
164static int gp_cleanup_delay;
165module_param(gp_cleanup_delay, int, 0444);
166
David Brazdil0f672f62019-12-10 10:32:29 +0000167/* Retrieve RCU kthreads priority for rcutorture */
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000168int rcu_get_gp_kthreads_prio(void)
169{
170 return kthread_prio;
171}
172EXPORT_SYMBOL_GPL(rcu_get_gp_kthreads_prio);
173
/*
 * Number of grace periods between delays, normalized by the duration of
 * the delay: the longer the delay, the more grace periods separate
 * successive delays.  With this normalization, any non-zero delay slows
 * grace periods by the same overall factor regardless of its duration.
 * That balances long delays (which raise some race probabilities) against
 * fast grace periods (which raise others).
 */
#define PER_RCU_NODE_PERIOD	3	/* Number of grace periods between delays. */
184
185/*
186 * Compute the mask of online CPUs for the specified rcu_node structure.
187 * This will not be stable unless the rcu_node structure's ->lock is
188 * held, but the bit corresponding to the current CPU will be stable
189 * in most contexts.
190 */
191unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
192{
193 return READ_ONCE(rnp->qsmaskinitnext);
194}
195
196/*
197 * Return true if an RCU grace period is in progress. The READ_ONCE()s
198 * permit this function to be invoked without holding the root rcu_node
199 * structure's ->lock, but of course results can be subject to change.
200 */
David Brazdil0f672f62019-12-10 10:32:29 +0000201static int rcu_gp_in_progress(void)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000202{
David Brazdil0f672f62019-12-10 10:32:29 +0000203 return rcu_seq_state(rcu_seq_current(&rcu_state.gp_seq));
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000204}
205
206/*
David Brazdil0f672f62019-12-10 10:32:29 +0000207 * Return the number of callbacks queued on the specified CPU.
208 * Handles both the nocbs and normal cases.
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000209 */
David Brazdil0f672f62019-12-10 10:32:29 +0000210static long rcu_get_n_cbs_cpu(int cpu)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000211{
David Brazdil0f672f62019-12-10 10:32:29 +0000212 struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
213
214 if (rcu_segcblist_is_enabled(&rdp->cblist))
215 return rcu_segcblist_n_cbs(&rdp->cblist);
216 return 0;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000217}
218
David Brazdil0f672f62019-12-10 10:32:29 +0000219void rcu_softirq_qs(void)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000220{
David Brazdil0f672f62019-12-10 10:32:29 +0000221 rcu_qs();
222 rcu_preempt_deferred_qs(current);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000223}
224
225/*
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000226 * Record entry into an extended quiescent state. This is only to be
227 * called when not already in an extended quiescent state.
228 */
229static void rcu_dynticks_eqs_enter(void)
230{
David Brazdil0f672f62019-12-10 10:32:29 +0000231 struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000232 int seq;
233
234 /*
235 * CPUs seeing atomic_add_return() must see prior RCU read-side
236 * critical sections, and we also must force ordering with the
237 * next idle sojourn.
238 */
David Brazdil0f672f62019-12-10 10:32:29 +0000239 seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000240 /* Better be in an extended quiescent state! */
241 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
242 (seq & RCU_DYNTICK_CTRL_CTR));
243 /* Better not have special action (TLB flush) pending! */
244 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
245 (seq & RCU_DYNTICK_CTRL_MASK));
246}
247
248/*
249 * Record exit from an extended quiescent state. This is only to be
250 * called from an extended quiescent state.
251 */
252static void rcu_dynticks_eqs_exit(void)
253{
David Brazdil0f672f62019-12-10 10:32:29 +0000254 struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000255 int seq;
256
257 /*
258 * CPUs seeing atomic_add_return() must see prior idle sojourns,
259 * and we also must force ordering with the next RCU read-side
260 * critical section.
261 */
David Brazdil0f672f62019-12-10 10:32:29 +0000262 seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000263 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
264 !(seq & RCU_DYNTICK_CTRL_CTR));
265 if (seq & RCU_DYNTICK_CTRL_MASK) {
David Brazdil0f672f62019-12-10 10:32:29 +0000266 atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdp->dynticks);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000267 smp_mb__after_atomic(); /* _exit after clearing mask. */
268 /* Prefer duplicate flushes to losing a flush. */
269 rcu_eqs_special_exit();
270 }
271}
272
273/*
274 * Reset the current CPU's ->dynticks counter to indicate that the
275 * newly onlined CPU is no longer in an extended quiescent state.
276 * This will either leave the counter unchanged, or increment it
277 * to the next non-quiescent value.
278 *
279 * The non-atomic test/increment sequence works because the upper bits
280 * of the ->dynticks counter are manipulated only by the corresponding CPU,
281 * or when the corresponding CPU is offline.
282 */
283static void rcu_dynticks_eqs_online(void)
284{
David Brazdil0f672f62019-12-10 10:32:29 +0000285 struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000286
David Brazdil0f672f62019-12-10 10:32:29 +0000287 if (atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000288 return;
David Brazdil0f672f62019-12-10 10:32:29 +0000289 atomic_add(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000290}
291
292/*
293 * Is the current CPU in an extended quiescent state?
294 *
295 * No ordering, as we are sampling CPU-local information.
296 */
297bool rcu_dynticks_curr_cpu_in_eqs(void)
298{
David Brazdil0f672f62019-12-10 10:32:29 +0000299 struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000300
David Brazdil0f672f62019-12-10 10:32:29 +0000301 return !(atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000302}
303
304/*
305 * Snapshot the ->dynticks counter with full ordering so as to allow
306 * stable comparison of this counter with past and future snapshots.
307 */
David Brazdil0f672f62019-12-10 10:32:29 +0000308int rcu_dynticks_snap(struct rcu_data *rdp)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000309{
David Brazdil0f672f62019-12-10 10:32:29 +0000310 int snap = atomic_add_return(0, &rdp->dynticks);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000311
312 return snap & ~RCU_DYNTICK_CTRL_MASK;
313}
314
315/*
316 * Return true if the snapshot returned from rcu_dynticks_snap()
317 * indicates that RCU is in an extended quiescent state.
318 */
319static bool rcu_dynticks_in_eqs(int snap)
320{
321 return !(snap & RCU_DYNTICK_CTRL_CTR);
322}
323
324/*
David Brazdil0f672f62019-12-10 10:32:29 +0000325 * Return true if the CPU corresponding to the specified rcu_data
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000326 * structure has spent some time in an extended quiescent state since
327 * rcu_dynticks_snap() returned the specified snapshot.
328 */
David Brazdil0f672f62019-12-10 10:32:29 +0000329static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000330{
David Brazdil0f672f62019-12-10 10:32:29 +0000331 return snap != rcu_dynticks_snap(rdp);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000332}
333
334/*
335 * Set the special (bottom) bit of the specified CPU so that it
336 * will take special action (such as flushing its TLB) on the
337 * next exit from an extended quiescent state. Returns true if
338 * the bit was successfully set, or false if the CPU was not in
339 * an extended quiescent state.
340 */
341bool rcu_eqs_special_set(int cpu)
342{
343 int old;
344 int new;
David Brazdil0f672f62019-12-10 10:32:29 +0000345 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000346
347 do {
David Brazdil0f672f62019-12-10 10:32:29 +0000348 old = atomic_read(&rdp->dynticks);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000349 if (old & RCU_DYNTICK_CTRL_CTR)
350 return false;
351 new = old | RCU_DYNTICK_CTRL_MASK;
David Brazdil0f672f62019-12-10 10:32:29 +0000352 } while (atomic_cmpxchg(&rdp->dynticks, old, new) != old);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000353 return true;
354}
355
356/*
357 * Let the RCU core know that this CPU has gone through the scheduler,
358 * which is a quiescent state. This is called when the need for a
359 * quiescent state is urgent, so we burn an atomic operation and full
360 * memory barriers to let the RCU core know about it, regardless of what
361 * this CPU might (or might not) do in the near future.
362 *
363 * We inform the RCU core by emulating a zero-duration dyntick-idle period.
364 *
365 * The caller must have disabled interrupts and must not be idle.
366 */
David Brazdil0f672f62019-12-10 10:32:29 +0000367static void __maybe_unused rcu_momentary_dyntick_idle(void)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000368{
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000369 int special;
370
David Brazdil0f672f62019-12-10 10:32:29 +0000371 raw_cpu_write(rcu_data.rcu_need_heavy_qs, false);
372 special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR,
373 &this_cpu_ptr(&rcu_data)->dynticks);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000374 /* It is illegal to call this from idle state. */
375 WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR));
David Brazdil0f672f62019-12-10 10:32:29 +0000376 rcu_preempt_deferred_qs(current);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000377}
378
David Brazdil0f672f62019-12-10 10:32:29 +0000379/**
380 * rcu_is_cpu_rrupt_from_idle - see if interrupted from idle
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000381 *
David Brazdil0f672f62019-12-10 10:32:29 +0000382 * If the current CPU is idle and running at a first-level (not nested)
383 * interrupt from idle, return true. The caller must have at least
384 * disabled preemption.
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000385 */
David Brazdil0f672f62019-12-10 10:32:29 +0000386static int rcu_is_cpu_rrupt_from_idle(void)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000387{
David Brazdil0f672f62019-12-10 10:32:29 +0000388 /* Called only from within the scheduling-clock interrupt */
389 lockdep_assert_in_irq();
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000390
David Brazdil0f672f62019-12-10 10:32:29 +0000391 /* Check for counter underflows */
392 RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) < 0,
393 "RCU dynticks_nesting counter underflow!");
394 RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 0,
395 "RCU dynticks_nmi_nesting counter underflow/zero!");
396
397 /* Are we at first interrupt nesting level? */
398 if (__this_cpu_read(rcu_data.dynticks_nmi_nesting) != 1)
399 return false;
400
401 /* Does CPU appear to be idle from an RCU standpoint? */
402 return __this_cpu_read(rcu_data.dynticks_nesting) == 0;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000403}
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000404
David Brazdil0f672f62019-12-10 10:32:29 +0000405#define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch ... */
406#define DEFAULT_MAX_RCU_BLIMIT 10000 /* ... even during callback flood. */
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000407static long blimit = DEFAULT_RCU_BLIMIT;
408#define DEFAULT_RCU_QHIMARK 10000 /* If this many pending, ignore blimit. */
409static long qhimark = DEFAULT_RCU_QHIMARK;
410#define DEFAULT_RCU_QLOMARK 100 /* Once only this many pending, use blimit. */
411static long qlowmark = DEFAULT_RCU_QLOMARK;
412
413module_param(blimit, long, 0444);
414module_param(qhimark, long, 0444);
415module_param(qlowmark, long, 0444);
416
417static ulong jiffies_till_first_fqs = ULONG_MAX;
418static ulong jiffies_till_next_fqs = ULONG_MAX;
419static bool rcu_kick_kthreads;
David Brazdil0f672f62019-12-10 10:32:29 +0000420static int rcu_divisor = 7;
421module_param(rcu_divisor, int, 0644);
422
423/* Force an exit from rcu_do_batch() after 3 milliseconds. */
424static long rcu_resched_ns = 3 * NSEC_PER_MSEC;
425module_param(rcu_resched_ns, long, 0644);
426
427/*
428 * How long the grace period must be before we start recruiting
429 * quiescent-state help from rcu_note_context_switch().
430 */
431static ulong jiffies_till_sched_qs = ULONG_MAX;
432module_param(jiffies_till_sched_qs, ulong, 0444);
433static ulong jiffies_to_sched_qs; /* See adjust_jiffies_till_sched_qs(). */
434module_param(jiffies_to_sched_qs, ulong, 0444); /* Display only! */
435
436/*
437 * Make sure that we give the grace-period kthread time to detect any
438 * idle CPUs before taking active measures to force quiescent states.
439 * However, don't go below 100 milliseconds, adjusted upwards for really
440 * large systems.
441 */
442static void adjust_jiffies_till_sched_qs(void)
443{
444 unsigned long j;
445
446 /* If jiffies_till_sched_qs was specified, respect the request. */
447 if (jiffies_till_sched_qs != ULONG_MAX) {
448 WRITE_ONCE(jiffies_to_sched_qs, jiffies_till_sched_qs);
449 return;
450 }
451 /* Otherwise, set to third fqs scan, but bound below on large system. */
452 j = READ_ONCE(jiffies_till_first_fqs) +
453 2 * READ_ONCE(jiffies_till_next_fqs);
454 if (j < HZ / 10 + nr_cpu_ids / RCU_JIFFIES_FQS_DIV)
455 j = HZ / 10 + nr_cpu_ids / RCU_JIFFIES_FQS_DIV;
456 pr_info("RCU calculated value of scheduler-enlistment delay is %ld jiffies.\n", j);
457 WRITE_ONCE(jiffies_to_sched_qs, j);
458}
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000459
460static int param_set_first_fqs_jiffies(const char *val, const struct kernel_param *kp)
461{
462 ulong j;
463 int ret = kstrtoul(val, 0, &j);
464
David Brazdil0f672f62019-12-10 10:32:29 +0000465 if (!ret) {
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000466 WRITE_ONCE(*(ulong *)kp->arg, (j > HZ) ? HZ : j);
David Brazdil0f672f62019-12-10 10:32:29 +0000467 adjust_jiffies_till_sched_qs();
468 }
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000469 return ret;
470}
471
472static int param_set_next_fqs_jiffies(const char *val, const struct kernel_param *kp)
473{
474 ulong j;
475 int ret = kstrtoul(val, 0, &j);
476
David Brazdil0f672f62019-12-10 10:32:29 +0000477 if (!ret) {
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000478 WRITE_ONCE(*(ulong *)kp->arg, (j > HZ) ? HZ : (j ?: 1));
David Brazdil0f672f62019-12-10 10:32:29 +0000479 adjust_jiffies_till_sched_qs();
480 }
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000481 return ret;
482}
483
484static struct kernel_param_ops first_fqs_jiffies_ops = {
485 .set = param_set_first_fqs_jiffies,
486 .get = param_get_ulong,
487};
488
489static struct kernel_param_ops next_fqs_jiffies_ops = {
490 .set = param_set_next_fqs_jiffies,
491 .get = param_get_ulong,
492};
493
494module_param_cb(jiffies_till_first_fqs, &first_fqs_jiffies_ops, &jiffies_till_first_fqs, 0644);
495module_param_cb(jiffies_till_next_fqs, &next_fqs_jiffies_ops, &jiffies_till_next_fqs, 0644);
496module_param(rcu_kick_kthreads, bool, 0644);
497
/* Forward declarations for functions used before they are defined. */
static void force_qs_rnp(int (*f)(struct rcu_data *rdp));
static int rcu_pending(void);
500
501/*
502 * Return the number of RCU GPs completed thus far for debug & stats.
503 */
504unsigned long rcu_get_gp_seq(void)
505{
David Brazdil0f672f62019-12-10 10:32:29 +0000506 return READ_ONCE(rcu_state.gp_seq);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000507}
508EXPORT_SYMBOL_GPL(rcu_get_gp_seq);
509
510/*
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000511 * Return the number of RCU expedited batches completed thus far for
512 * debug & stats. Odd numbers mean that a batch is in progress, even
513 * numbers mean idle. The value returned will thus be roughly double
514 * the cumulative batches since boot.
515 */
516unsigned long rcu_exp_batches_completed(void)
517{
David Brazdil0f672f62019-12-10 10:32:29 +0000518 return rcu_state.expedited_sequence;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000519}
520EXPORT_SYMBOL_GPL(rcu_exp_batches_completed);
521
522/*
David Brazdil0f672f62019-12-10 10:32:29 +0000523 * Return the root node of the rcu_state structure.
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000524 */
David Brazdil0f672f62019-12-10 10:32:29 +0000525static struct rcu_node *rcu_get_root(void)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000526{
David Brazdil0f672f62019-12-10 10:32:29 +0000527 return &rcu_state.node[0];
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000528}
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000529
530/*
David Brazdil0f672f62019-12-10 10:32:29 +0000531 * Convert a ->gp_state value to a character string.
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000532 */
David Brazdil0f672f62019-12-10 10:32:29 +0000533static const char *gp_state_getname(short gs)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000534{
David Brazdil0f672f62019-12-10 10:32:29 +0000535 if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names))
536 return "???";
537 return gp_state_names[gs];
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000538}
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000539
540/*
541 * Send along grace-period-related data for rcutorture diagnostics.
542 */
543void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
544 unsigned long *gp_seq)
545{
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000546 switch (test_type) {
547 case RCU_FLAVOR:
David Brazdil0f672f62019-12-10 10:32:29 +0000548 *flags = READ_ONCE(rcu_state.gp_flags);
549 *gp_seq = rcu_seq_current(&rcu_state.gp_seq);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000550 break;
551 default:
552 break;
553 }
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000554}
555EXPORT_SYMBOL_GPL(rcutorture_get_gp_data);
556
557/*
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000558 * Enter an RCU extended quiescent state, which can be either the
559 * idle loop or adaptive-tickless usermode execution.
560 *
561 * We crowbar the ->dynticks_nmi_nesting field to zero to allow for
562 * the possibility of usermode upcalls having messed up our count
563 * of interrupt nesting level during the prior busy period.
564 */
565static void rcu_eqs_enter(bool user)
566{
David Brazdil0f672f62019-12-10 10:32:29 +0000567 struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000568
David Brazdil0f672f62019-12-10 10:32:29 +0000569 WARN_ON_ONCE(rdp->dynticks_nmi_nesting != DYNTICK_IRQ_NONIDLE);
570 WRITE_ONCE(rdp->dynticks_nmi_nesting, 0);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000571 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
David Brazdil0f672f62019-12-10 10:32:29 +0000572 rdp->dynticks_nesting == 0);
573 if (rdp->dynticks_nesting != 1) {
574 rdp->dynticks_nesting--;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000575 return;
576 }
577
578 lockdep_assert_irqs_disabled();
Olivier Deprez0e641232021-09-23 10:07:05 +0200579 trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks));
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000580 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
David Brazdil0f672f62019-12-10 10:32:29 +0000581 rdp = this_cpu_ptr(&rcu_data);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000582 rcu_prepare_for_idle();
David Brazdil0f672f62019-12-10 10:32:29 +0000583 rcu_preempt_deferred_qs(current);
584 WRITE_ONCE(rdp->dynticks_nesting, 0); /* Avoid irq-access tearing. */
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000585 rcu_dynticks_eqs_enter();
586 rcu_dynticks_task_enter();
587}
588
589/**
590 * rcu_idle_enter - inform RCU that current CPU is entering idle
591 *
592 * Enter idle mode, in other words, -leave- the mode in which RCU
593 * read-side critical sections can occur. (Though RCU read-side
594 * critical sections can occur in irq handlers in idle, a possibility
595 * handled by irq_enter() and irq_exit().)
596 *
597 * If you add or remove a call to rcu_idle_enter(), be sure to test with
598 * CONFIG_RCU_EQS_DEBUG=y.
599 */
600void rcu_idle_enter(void)
601{
602 lockdep_assert_irqs_disabled();
603 rcu_eqs_enter(false);
604}
605
#ifdef CONFIG_NO_HZ_FULL
/**
 * rcu_user_enter - inform RCU that we are resuming userspace.
 *
 * Enter RCU idle mode right before resuming userspace.  RCU must not be
 * used between this call and rcu_user_exit().  As a result, the CPU has
 * no need to keep the tick running for RCU's benefit while it executes
 * in userspace.
 *
 * Interrupts must be disabled by the caller.
 *
 * If you add or remove a call to rcu_user_enter(), be sure to test with
 * CONFIG_RCU_EQS_DEBUG=y.
 */
void rcu_user_enter(void)
{
	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);

	lockdep_assert_irqs_disabled();

	/* The deferred-wakeup call is bracketed as instrumentable code. */
	instrumentation_begin();
	do_nocb_deferred_wakeup(rdp);
	instrumentation_end();

	rcu_eqs_enter(true);
}
#endif /* CONFIG_NO_HZ_FULL */
631
David Brazdil0f672f62019-12-10 10:32:29 +0000632/*
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000633 * If we are returning from the outermost NMI handler that interrupted an
David Brazdil0f672f62019-12-10 10:32:29 +0000634 * RCU-idle period, update rdp->dynticks and rdp->dynticks_nmi_nesting
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000635 * to let the RCU grace-period handling know that the CPU is back to
636 * being RCU-idle.
637 *
David Brazdil0f672f62019-12-10 10:32:29 +0000638 * If you add or remove a call to rcu_nmi_exit_common(), be sure to test
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000639 * with CONFIG_RCU_EQS_DEBUG=y.
640 */
David Brazdil0f672f62019-12-10 10:32:29 +0000641static __always_inline void rcu_nmi_exit_common(bool irq)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000642{
David Brazdil0f672f62019-12-10 10:32:29 +0000643 struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000644
645 /*
646 * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks.
647 * (We are exiting an NMI handler, so RCU better be paying attention
648 * to us!)
649 */
David Brazdil0f672f62019-12-10 10:32:29 +0000650 WARN_ON_ONCE(rdp->dynticks_nmi_nesting <= 0);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000651 WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs());
652
653 /*
654 * If the nesting level is not 1, the CPU wasn't RCU-idle, so
655 * leave it in non-RCU-idle state.
656 */
David Brazdil0f672f62019-12-10 10:32:29 +0000657 if (rdp->dynticks_nmi_nesting != 1) {
Olivier Deprez0e641232021-09-23 10:07:05 +0200658 trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2,
659 atomic_read(&rdp->dynticks));
David Brazdil0f672f62019-12-10 10:32:29 +0000660 WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */
661 rdp->dynticks_nmi_nesting - 2);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000662 return;
663 }
664
665 /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
Olivier Deprez0e641232021-09-23 10:07:05 +0200666 trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks));
David Brazdil0f672f62019-12-10 10:32:29 +0000667 WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */
668
669 if (irq)
670 rcu_prepare_for_idle();
671
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000672 rcu_dynticks_eqs_enter();
David Brazdil0f672f62019-12-10 10:32:29 +0000673
674 if (irq)
675 rcu_dynticks_task_enter();
676}
677
678/**
679 * rcu_nmi_exit - inform RCU of exit from NMI context
680 *
681 * If you add or remove a call to rcu_nmi_exit(), be sure to test
682 * with CONFIG_RCU_EQS_DEBUG=y.
683 */
684void rcu_nmi_exit(void)
685{
686 rcu_nmi_exit_common(false);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000687}
688
689/**
690 * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
691 *
692 * Exit from an interrupt handler, which might possibly result in entering
693 * idle mode, in other words, leaving the mode in which read-side critical
694 * sections can occur. The caller must have disabled interrupts.
695 *
696 * This code assumes that the idle loop never does anything that might
697 * result in unbalanced calls to irq_enter() and irq_exit(). If your
698 * architecture's idle loop violates this assumption, RCU will give you what
699 * you deserve, good and hard. But very infrequently and irreproducibly.
700 *
701 * Use things like work queues to work around this limitation.
702 *
703 * You have been warned.
704 *
705 * If you add or remove a call to rcu_irq_exit(), be sure to test with
706 * CONFIG_RCU_EQS_DEBUG=y.
707 */
708void rcu_irq_exit(void)
709{
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000710 lockdep_assert_irqs_disabled();
David Brazdil0f672f62019-12-10 10:32:29 +0000711 rcu_nmi_exit_common(true);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000712}
713
/*
 * Variant of rcu_irq_exit() for callers running with interrupts enabled:
 * interrupts are disabled around the call and the previous interrupt
 * state is restored afterwards.
 *
 * Whenever a call to rcu_irq_exit_irqson() is added or removed, test the
 * result with CONFIG_RCU_EQS_DEBUG=y.
 */
void rcu_irq_exit_irqson(void)
{
	unsigned long irqflags;

	local_irq_save(irqflags);
	rcu_irq_exit();
	local_irq_restore(irqflags);
}
728
729/*
730 * Exit an RCU extended quiescent state, which can be either the
731 * idle loop or adaptive-tickless usermode execution.
732 *
733 * We crowbar the ->dynticks_nmi_nesting field to DYNTICK_IRQ_NONIDLE to
734 * allow for the possibility of usermode upcalls messing up our count of
735 * interrupt nesting level during the busy period that is just now starting.
736 */
737static void rcu_eqs_exit(bool user)
738{
David Brazdil0f672f62019-12-10 10:32:29 +0000739 struct rcu_data *rdp;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000740 long oldval;
741
742 lockdep_assert_irqs_disabled();
David Brazdil0f672f62019-12-10 10:32:29 +0000743 rdp = this_cpu_ptr(&rcu_data);
744 oldval = rdp->dynticks_nesting;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000745 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0);
746 if (oldval) {
David Brazdil0f672f62019-12-10 10:32:29 +0000747 rdp->dynticks_nesting++;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000748 return;
749 }
750 rcu_dynticks_task_exit();
751 rcu_dynticks_eqs_exit();
752 rcu_cleanup_after_idle();
Olivier Deprez0e641232021-09-23 10:07:05 +0200753 trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks));
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000754 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
David Brazdil0f672f62019-12-10 10:32:29 +0000755 WRITE_ONCE(rdp->dynticks_nesting, 1);
756 WARN_ON_ONCE(rdp->dynticks_nmi_nesting);
757 WRITE_ONCE(rdp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000758}
759
760/**
761 * rcu_idle_exit - inform RCU that current CPU is leaving idle
762 *
763 * Exit idle mode, in other words, -enter- the mode in which RCU
764 * read-side critical sections can occur.
765 *
766 * If you add or remove a call to rcu_idle_exit(), be sure to test with
767 * CONFIG_RCU_EQS_DEBUG=y.
768 */
769void rcu_idle_exit(void)
770{
771 unsigned long flags;
772
773 local_irq_save(flags);
774 rcu_eqs_exit(false);
775 local_irq_restore(flags);
776}
777
#ifdef CONFIG_NO_HZ_FULL
/**
 * rcu_user_exit - inform RCU that we are exiting userspace.
 *
 * Exit RCU idle mode while entering the kernel, because the kernel can
 * run an RCU read-side critical section at any time.
 *
 * rcu_eqs_exit() takes a bool, so pass "true" rather than the integer
 * literal 1; this also matches rcu_idle_exit(), which passes "false".
 *
 * If you add or remove a call to rcu_user_exit(), be sure to test with
 * CONFIG_RCU_EQS_DEBUG=y.
 */
void rcu_user_exit(void)
{
	rcu_eqs_exit(true);
}
#endif /* CONFIG_NO_HZ_FULL */
793
794/**
David Brazdil0f672f62019-12-10 10:32:29 +0000795 * rcu_nmi_enter_common - inform RCU of entry to NMI context
796 * @irq: Is this call from rcu_irq_enter?
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000797 *
David Brazdil0f672f62019-12-10 10:32:29 +0000798 * If the CPU was idle from RCU's viewpoint, update rdp->dynticks and
799 * rdp->dynticks_nmi_nesting to let the RCU grace-period handling know
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000800 * that the CPU is active. This implementation permits nested NMIs, as
801 * long as the nesting level does not overflow an int. (You will probably
802 * run out of stack space first.)
803 *
David Brazdil0f672f62019-12-10 10:32:29 +0000804 * If you add or remove a call to rcu_nmi_enter_common(), be sure to test
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000805 * with CONFIG_RCU_EQS_DEBUG=y.
806 */
David Brazdil0f672f62019-12-10 10:32:29 +0000807static __always_inline void rcu_nmi_enter_common(bool irq)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000808{
David Brazdil0f672f62019-12-10 10:32:29 +0000809 struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000810 long incby = 2;
811
812 /* Complain about underflow. */
David Brazdil0f672f62019-12-10 10:32:29 +0000813 WARN_ON_ONCE(rdp->dynticks_nmi_nesting < 0);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000814
815 /*
816 * If idle from RCU viewpoint, atomically increment ->dynticks
817 * to mark non-idle and increment ->dynticks_nmi_nesting by one.
818 * Otherwise, increment ->dynticks_nmi_nesting by two. This means
819 * if ->dynticks_nmi_nesting is equal to one, we are guaranteed
820 * to be in the outermost NMI handler that interrupted an RCU-idle
821 * period (observation due to Andy Lutomirski).
822 */
823 if (rcu_dynticks_curr_cpu_in_eqs()) {
David Brazdil0f672f62019-12-10 10:32:29 +0000824
825 if (irq)
826 rcu_dynticks_task_exit();
827
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000828 rcu_dynticks_eqs_exit();
David Brazdil0f672f62019-12-10 10:32:29 +0000829
830 if (irq)
831 rcu_cleanup_after_idle();
832
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000833 incby = 1;
834 }
835 trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="),
David Brazdil0f672f62019-12-10 10:32:29 +0000836 rdp->dynticks_nmi_nesting,
Olivier Deprez0e641232021-09-23 10:07:05 +0200837 rdp->dynticks_nmi_nesting + incby, atomic_read(&rdp->dynticks));
David Brazdil0f672f62019-12-10 10:32:29 +0000838 WRITE_ONCE(rdp->dynticks_nmi_nesting, /* Prevent store tearing. */
839 rdp->dynticks_nmi_nesting + incby);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000840 barrier();
841}
842
843/**
David Brazdil0f672f62019-12-10 10:32:29 +0000844 * rcu_nmi_enter - inform RCU of entry to NMI context
845 */
846void rcu_nmi_enter(void)
847{
848 rcu_nmi_enter_common(false);
849}
850NOKPROBE_SYMBOL(rcu_nmi_enter);
851
852/**
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000853 * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
854 *
855 * Enter an interrupt handler, which might possibly result in exiting
856 * idle mode, in other words, entering the mode in which read-side critical
857 * sections can occur. The caller must have disabled interrupts.
858 *
859 * Note that the Linux kernel is fully capable of entering an interrupt
860 * handler that it never exits, for example when doing upcalls to user mode!
861 * This code assumes that the idle loop never does upcalls to user mode.
862 * If your architecture's idle loop does do upcalls to user mode (or does
863 * anything else that results in unbalanced calls to the irq_enter() and
864 * irq_exit() functions), RCU will give you what you deserve, good and hard.
865 * But very infrequently and irreproducibly.
866 *
867 * Use things like work queues to work around this limitation.
868 *
869 * You have been warned.
870 *
871 * If you add or remove a call to rcu_irq_enter(), be sure to test with
872 * CONFIG_RCU_EQS_DEBUG=y.
873 */
874void rcu_irq_enter(void)
875{
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000876 lockdep_assert_irqs_disabled();
David Brazdil0f672f62019-12-10 10:32:29 +0000877 rcu_nmi_enter_common(true);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000878}
879
/*
 * Variant of rcu_irq_enter() for callers running with interrupts enabled:
 * interrupts are disabled around the call and the previous interrupt
 * state is restored afterwards.
 *
 * Whenever a call to rcu_irq_enter_irqson() is added or removed, test the
 * result with CONFIG_RCU_EQS_DEBUG=y.
 */
void rcu_irq_enter_irqson(void)
{
	unsigned long irqflags;

	local_irq_save(irqflags);
	rcu_irq_enter();
	local_irq_restore(irqflags);
}
894
895/**
David Brazdil0f672f62019-12-10 10:32:29 +0000896 * rcu_is_watching - see if RCU thinks that the current CPU is not idle
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000897 *
898 * Return true if RCU is watching the running CPU, which means that this
899 * CPU can safely enter RCU read-side critical sections. In other words,
David Brazdil0f672f62019-12-10 10:32:29 +0000900 * if the current CPU is not in its idle loop or is in an interrupt or
901 * NMI handler, return true.
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000902 */
903bool notrace rcu_is_watching(void)
904{
905 bool ret;
906
907 preempt_disable_notrace();
908 ret = !rcu_dynticks_curr_cpu_in_eqs();
909 preempt_enable_notrace();
910 return ret;
911}
912EXPORT_SYMBOL_GPL(rcu_is_watching);
913
914/*
915 * If a holdout task is actually running, request an urgent quiescent
916 * state from its CPU. This is unsynchronized, so migrations can cause
917 * the request to go to the wrong CPU. Which is OK, all that will happen
918 * is that the CPU's next context switch will be a bit slower and next
919 * time around this task will generate another request.
920 */
921void rcu_request_urgent_qs_task(struct task_struct *t)
922{
923 int cpu;
924
925 barrier();
926 cpu = task_cpu(t);
927 if (!task_curr(t))
928 return; /* This task is not running on that CPU. */
David Brazdil0f672f62019-12-10 10:32:29 +0000929 smp_store_release(per_cpu_ptr(&rcu_data.rcu_urgent_qs, cpu), true);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000930}
931
#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)

/*
 * Report whether RCU regards the current CPU as online.
 *
 * Preemption is disabled around the check.  Otherwise this task could
 * sample the CPU number, get preempted, have its old CPU go offline,
 * resume elsewhere, and then wrongly conclude that it is running on an
 * offline CPU.
 *
 * NMI handlers always get "true" because errors cannot be reported safely
 * from NMI context anyway.  The same holds until
 * rcu_scheduler_fully_active is set, because using RCU on an offline
 * processor is acceptable during initial boot.
 */
bool rcu_lockdep_current_cpu_online(void)
{
	struct rcu_data *rdp;
	bool online;

	if (in_nmi() || !rcu_scheduler_fully_active)
		return true;

	preempt_disable();
	rdp = this_cpu_ptr(&rcu_data);
	/* Online iff this CPU's bit is set in its leaf node's online mask. */
	online = (rdp->grpmask & rcu_rnp_online_cpus(rdp->mynode)) != 0;
	preempt_enable();

	return online;
}
EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);

#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */
966
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000967/*
968 * We are reporting a quiescent state on behalf of some other CPU, so
969 * it is our responsibility to check for and handle potential overflow
970 * of the rcu_node ->gp_seq counter with respect to the rcu_data counters.
971 * After all, the CPU might be in deep idle state, and thus executing no
972 * code whatsoever.
973 */
974static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp)
975{
976 raw_lockdep_assert_held_rcu_node(rnp);
977 if (ULONG_CMP_LT(rcu_seq_current(&rdp->gp_seq) + ULONG_MAX / 4,
978 rnp->gp_seq))
979 WRITE_ONCE(rdp->gpwrap, true);
980 if (ULONG_CMP_LT(rdp->rcu_iw_gp_seq + ULONG_MAX / 4, rnp->gp_seq))
981 rdp->rcu_iw_gp_seq = rnp->gp_seq + ULONG_MAX / 4;
982}
983
984/*
985 * Snapshot the specified CPU's dynticks counter so that we can later
986 * credit them with an implicit quiescent state. Return 1 if this CPU
987 * is in dynticks idle mode, which is an extended quiescent state.
988 */
989static int dyntick_save_progress_counter(struct rcu_data *rdp)
990{
David Brazdil0f672f62019-12-10 10:32:29 +0000991 rdp->dynticks_snap = rcu_dynticks_snap(rdp);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000992 if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) {
David Brazdil0f672f62019-12-10 10:32:29 +0000993 trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti"));
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000994 rcu_gpnum_ovf(rdp->mynode, rdp);
995 return 1;
996 }
997 return 0;
998}
999
1000/*
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001001 * Return true if the specified CPU has passed through a quiescent
1002 * state by virtue of being in or having passed through an dynticks
1003 * idle state since the last call to dyntick_save_progress_counter()
1004 * for this same CPU, or by virtue of having been offline.
1005 */
1006static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
1007{
1008 unsigned long jtsq;
1009 bool *rnhqp;
1010 bool *ruqp;
1011 struct rcu_node *rnp = rdp->mynode;
1012
1013 /*
1014 * If the CPU passed through or entered a dynticks idle phase with
1015 * no active irq/NMI handlers, then we can safely pretend that the CPU
1016 * already acknowledged the request to pass through a quiescent
1017 * state. Either way, that CPU cannot possibly be in an RCU
1018 * read-side critical section that started before the beginning
1019 * of the current RCU grace period.
1020 */
David Brazdil0f672f62019-12-10 10:32:29 +00001021 if (rcu_dynticks_in_eqs_since(rdp, rdp->dynticks_snap)) {
1022 trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti"));
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001023 rcu_gpnum_ovf(rnp, rdp);
1024 return 1;
1025 }
1026
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001027 /* If waiting too long on an offline CPU, complain. */
1028 if (!(rdp->grpmask & rcu_rnp_online_cpus(rnp)) &&
David Brazdil0f672f62019-12-10 10:32:29 +00001029 time_after(jiffies, rcu_state.gp_start + HZ)) {
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001030 bool onl;
1031 struct rcu_node *rnp1;
1032
1033 WARN_ON(1); /* Offline CPUs are supposed to report QS! */
1034 pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n",
1035 __func__, rnp->grplo, rnp->grphi, rnp->level,
1036 (long)rnp->gp_seq, (long)rnp->completedqs);
1037 for (rnp1 = rnp; rnp1; rnp1 = rnp1->parent)
1038 pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx ->rcu_gp_init_mask %#lx\n",
1039 __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext, rnp1->rcu_gp_init_mask);
1040 onl = !!(rdp->grpmask & rcu_rnp_online_cpus(rnp));
1041 pr_info("%s %d: %c online: %ld(%d) offline: %ld(%d)\n",
1042 __func__, rdp->cpu, ".o"[onl],
1043 (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags,
1044 (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags);
1045 return 1; /* Break things loose after complaining. */
1046 }
1047
1048 /*
1049 * A CPU running for an extended time within the kernel can
David Brazdil0f672f62019-12-10 10:32:29 +00001050 * delay RCU grace periods: (1) At age jiffies_to_sched_qs,
1051 * set .rcu_urgent_qs, (2) At age 2*jiffies_to_sched_qs, set
1052 * both .rcu_need_heavy_qs and .rcu_urgent_qs. Note that the
1053 * unsynchronized assignments to the per-CPU rcu_need_heavy_qs
1054 * variable are safe because the assignments are repeated if this
1055 * CPU failed to pass through a quiescent state. This code
1056 * also checks .jiffies_resched in case jiffies_to_sched_qs
1057 * is set way high.
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001058 */
David Brazdil0f672f62019-12-10 10:32:29 +00001059 jtsq = READ_ONCE(jiffies_to_sched_qs);
1060 ruqp = per_cpu_ptr(&rcu_data.rcu_urgent_qs, rdp->cpu);
1061 rnhqp = &per_cpu(rcu_data.rcu_need_heavy_qs, rdp->cpu);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001062 if (!READ_ONCE(*rnhqp) &&
David Brazdil0f672f62019-12-10 10:32:29 +00001063 (time_after(jiffies, rcu_state.gp_start + jtsq * 2) ||
1064 time_after(jiffies, rcu_state.jiffies_resched))) {
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001065 WRITE_ONCE(*rnhqp, true);
1066 /* Store rcu_need_heavy_qs before rcu_urgent_qs. */
1067 smp_store_release(ruqp, true);
David Brazdil0f672f62019-12-10 10:32:29 +00001068 } else if (time_after(jiffies, rcu_state.gp_start + jtsq)) {
1069 WRITE_ONCE(*ruqp, true);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001070 }
1071
1072 /*
David Brazdil0f672f62019-12-10 10:32:29 +00001073 * NO_HZ_FULL CPUs can run in-kernel without rcu_sched_clock_irq!
1074 * The above code handles this, but only for straight cond_resched().
1075 * And some in-kernel loops check need_resched() before calling
1076 * cond_resched(), which defeats the above code for CPUs that are
1077 * running in-kernel with scheduling-clock interrupts disabled.
1078 * So hit them over the head with the resched_cpu() hammer!
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001079 */
David Brazdil0f672f62019-12-10 10:32:29 +00001080 if (tick_nohz_full_cpu(rdp->cpu) &&
1081 time_after(jiffies,
1082 READ_ONCE(rdp->last_fqs_resched) + jtsq * 3)) {
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001083 resched_cpu(rdp->cpu);
David Brazdil0f672f62019-12-10 10:32:29 +00001084 WRITE_ONCE(rdp->last_fqs_resched, jiffies);
1085 }
1086
1087 /*
1088 * If more than halfway to RCU CPU stall-warning time, invoke
1089 * resched_cpu() more frequently to try to loosen things up a bit.
1090 * Also check to see if the CPU is getting hammered with interrupts,
1091 * but only once per grace period, just to keep the IPIs down to
1092 * a dull roar.
1093 */
1094 if (time_after(jiffies, rcu_state.jiffies_resched)) {
1095 if (time_after(jiffies,
1096 READ_ONCE(rdp->last_fqs_resched) + jtsq)) {
1097 resched_cpu(rdp->cpu);
1098 WRITE_ONCE(rdp->last_fqs_resched, jiffies);
1099 }
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001100 if (IS_ENABLED(CONFIG_IRQ_WORK) &&
1101 !rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq &&
1102 (rnp->ffmask & rdp->grpmask)) {
1103 init_irq_work(&rdp->rcu_iw, rcu_iw_handler);
1104 rdp->rcu_iw_pending = true;
1105 rdp->rcu_iw_gp_seq = rnp->gp_seq;
1106 irq_work_queue_on(&rdp->rcu_iw, rdp->cpu);
1107 }
1108 }
1109
1110 return 0;
1111}
1112
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001113/* Trace-event wrapper function for trace_rcu_future_grace_period. */
1114static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp,
1115 unsigned long gp_seq_req, const char *s)
1116{
David Brazdil0f672f62019-12-10 10:32:29 +00001117 trace_rcu_future_grace_period(rcu_state.name, rnp->gp_seq, gp_seq_req,
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001118 rnp->level, rnp->grplo, rnp->grphi, s);
1119}
1120
1121/*
1122 * rcu_start_this_gp - Request the start of a particular grace period
1123 * @rnp_start: The leaf node of the CPU from which to start.
1124 * @rdp: The rcu_data corresponding to the CPU from which to start.
1125 * @gp_seq_req: The gp_seq of the grace period to start.
1126 *
1127 * Start the specified grace period, as needed to handle newly arrived
1128 * callbacks. The required future grace periods are recorded in each
1129 * rcu_node structure's ->gp_seq_needed field. Returns true if there
1130 * is reason to awaken the grace-period kthread.
1131 *
1132 * The caller must hold the specified rcu_node structure's ->lock, which
1133 * is why the caller is responsible for waking the grace-period kthread.
1134 *
1135 * Returns true if the GP thread needs to be awakened else false.
1136 */
1137static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp,
1138 unsigned long gp_seq_req)
1139{
1140 bool ret = false;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001141 struct rcu_node *rnp;
1142
1143 /*
1144 * Use funnel locking to either acquire the root rcu_node
1145 * structure's lock or bail out if the need for this grace period
1146 * has already been recorded -- or if that grace period has in
1147 * fact already started. If there is already a grace period in
1148 * progress in a non-leaf node, no recording is needed because the
1149 * end of the grace period will scan the leaf rcu_node structures.
1150 * Note that rnp_start->lock must not be released.
1151 */
1152 raw_lockdep_assert_held_rcu_node(rnp_start);
1153 trace_rcu_this_gp(rnp_start, rdp, gp_seq_req, TPS("Startleaf"));
1154 for (rnp = rnp_start; 1; rnp = rnp->parent) {
1155 if (rnp != rnp_start)
1156 raw_spin_lock_rcu_node(rnp);
1157 if (ULONG_CMP_GE(rnp->gp_seq_needed, gp_seq_req) ||
1158 rcu_seq_started(&rnp->gp_seq, gp_seq_req) ||
1159 (rnp != rnp_start &&
1160 rcu_seq_state(rcu_seq_current(&rnp->gp_seq)))) {
1161 trace_rcu_this_gp(rnp, rdp, gp_seq_req,
1162 TPS("Prestarted"));
1163 goto unlock_out;
1164 }
1165 rnp->gp_seq_needed = gp_seq_req;
1166 if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq))) {
1167 /*
1168 * We just marked the leaf or internal node, and a
1169 * grace period is in progress, which means that
1170 * rcu_gp_cleanup() will see the marking. Bail to
1171 * reduce contention.
1172 */
1173 trace_rcu_this_gp(rnp_start, rdp, gp_seq_req,
1174 TPS("Startedleaf"));
1175 goto unlock_out;
1176 }
1177 if (rnp != rnp_start && rnp->parent != NULL)
1178 raw_spin_unlock_rcu_node(rnp);
1179 if (!rnp->parent)
1180 break; /* At root, and perhaps also leaf. */
1181 }
1182
1183 /* If GP already in progress, just leave, otherwise start one. */
David Brazdil0f672f62019-12-10 10:32:29 +00001184 if (rcu_gp_in_progress()) {
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001185 trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startedleafroot"));
1186 goto unlock_out;
1187 }
1188 trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startedroot"));
David Brazdil0f672f62019-12-10 10:32:29 +00001189 WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags | RCU_GP_FLAG_INIT);
1190 rcu_state.gp_req_activity = jiffies;
1191 if (!rcu_state.gp_kthread) {
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001192 trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("NoGPkthread"));
1193 goto unlock_out;
1194 }
David Brazdil0f672f62019-12-10 10:32:29 +00001195 trace_rcu_grace_period(rcu_state.name, READ_ONCE(rcu_state.gp_seq), TPS("newreq"));
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001196 ret = true; /* Caller must wake GP kthread. */
1197unlock_out:
1198 /* Push furthest requested GP to leaf node and rcu_data structure. */
1199 if (ULONG_CMP_LT(gp_seq_req, rnp->gp_seq_needed)) {
1200 rnp_start->gp_seq_needed = rnp->gp_seq_needed;
1201 rdp->gp_seq_needed = rnp->gp_seq_needed;
1202 }
1203 if (rnp != rnp_start)
1204 raw_spin_unlock_rcu_node(rnp);
1205 return ret;
1206}
1207
1208/*
1209 * Clean up any old requests for the just-ended grace period. Also return
1210 * whether any additional grace periods have been requested.
1211 */
David Brazdil0f672f62019-12-10 10:32:29 +00001212static bool rcu_future_gp_cleanup(struct rcu_node *rnp)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001213{
1214 bool needmore;
David Brazdil0f672f62019-12-10 10:32:29 +00001215 struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001216
1217 needmore = ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed);
1218 if (!needmore)
1219 rnp->gp_seq_needed = rnp->gp_seq; /* Avoid counter wrap. */
1220 trace_rcu_this_gp(rnp, rdp, rnp->gp_seq,
1221 needmore ? TPS("CleanupMore") : TPS("Cleanup"));
1222 return needmore;
1223}
1224
1225/*
David Brazdil0f672f62019-12-10 10:32:29 +00001226 * Awaken the grace-period kthread. Don't do a self-awaken (unless in
1227 * an interrupt or softirq handler), and don't bother awakening when there
1228 * is nothing for the grace-period kthread to do (as in several CPUs raced
1229 * to awaken, and we lost), and finally don't try to awaken a kthread that
1230 * has not yet been created. If all those checks are passed, track some
1231 * debug information and awaken.
1232 *
1233 * So why do the self-wakeup when in an interrupt or softirq handler
1234 * in the grace-period kthread's context? Because the kthread might have
1235 * been interrupted just as it was going to sleep, and just after the final
1236 * pre-sleep check of the awaken condition. In this case, a wakeup really
1237 * is required, and is therefore supplied.
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001238 */
David Brazdil0f672f62019-12-10 10:32:29 +00001239static void rcu_gp_kthread_wake(void)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001240{
David Brazdil0f672f62019-12-10 10:32:29 +00001241 if ((current == rcu_state.gp_kthread &&
1242 !in_irq() && !in_serving_softirq()) ||
1243 !READ_ONCE(rcu_state.gp_flags) ||
1244 !rcu_state.gp_kthread)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001245 return;
David Brazdil0f672f62019-12-10 10:32:29 +00001246 WRITE_ONCE(rcu_state.gp_wake_time, jiffies);
1247 WRITE_ONCE(rcu_state.gp_wake_seq, READ_ONCE(rcu_state.gp_seq));
1248 swake_up_one(&rcu_state.gp_wq);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001249}
1250
1251/*
1252 * If there is room, assign a ->gp_seq number to any callbacks on this
1253 * CPU that have not already been assigned. Also accelerate any callbacks
1254 * that were previously assigned a ->gp_seq number that has since proven
1255 * to be too conservative, which can happen if callbacks get assigned a
1256 * ->gp_seq number while RCU is idle, but with reference to a non-root
1257 * rcu_node structure. This function is idempotent, so it does not hurt
1258 * to call it repeatedly. Returns an flag saying that we should awaken
1259 * the RCU grace-period kthread.
1260 *
1261 * The caller must hold rnp->lock with interrupts disabled.
1262 */
David Brazdil0f672f62019-12-10 10:32:29 +00001263static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001264{
1265 unsigned long gp_seq_req;
1266 bool ret = false;
1267
David Brazdil0f672f62019-12-10 10:32:29 +00001268 rcu_lockdep_assert_cblist_protected(rdp);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001269 raw_lockdep_assert_held_rcu_node(rnp);
1270
1271 /* If no pending (not yet ready to invoke) callbacks, nothing to do. */
1272 if (!rcu_segcblist_pend_cbs(&rdp->cblist))
1273 return false;
1274
1275 /*
1276 * Callbacks are often registered with incomplete grace-period
1277 * information. Something about the fact that getting exact
1278 * information requires acquiring a global lock... RCU therefore
1279 * makes a conservative estimate of the grace period number at which
1280 * a given callback will become ready to invoke. The following
1281 * code checks this estimate and improves it when possible, thus
1282 * accelerating callback invocation to an earlier grace-period
1283 * number.
1284 */
David Brazdil0f672f62019-12-10 10:32:29 +00001285 gp_seq_req = rcu_seq_snap(&rcu_state.gp_seq);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001286 if (rcu_segcblist_accelerate(&rdp->cblist, gp_seq_req))
1287 ret = rcu_start_this_gp(rnp, rdp, gp_seq_req);
1288
1289 /* Trace depending on how much we were able to accelerate. */
1290 if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL))
David Brazdil0f672f62019-12-10 10:32:29 +00001291 trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("AccWaitCB"));
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001292 else
David Brazdil0f672f62019-12-10 10:32:29 +00001293 trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("AccReadyCB"));
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001294 return ret;
1295}
1296
1297/*
1298 * Similar to rcu_accelerate_cbs(), but does not require that the leaf
1299 * rcu_node structure's ->lock be held. It consults the cached value
1300 * of ->gp_seq_needed in the rcu_data structure, and if that indicates
1301 * that a new grace-period request be made, invokes rcu_accelerate_cbs()
1302 * while holding the leaf rcu_node structure's ->lock.
1303 */
David Brazdil0f672f62019-12-10 10:32:29 +00001304static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp,
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001305 struct rcu_data *rdp)
1306{
1307 unsigned long c;
1308 bool needwake;
1309
David Brazdil0f672f62019-12-10 10:32:29 +00001310 rcu_lockdep_assert_cblist_protected(rdp);
1311 c = rcu_seq_snap(&rcu_state.gp_seq);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001312 if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) {
1313 /* Old request still live, so mark recent callbacks. */
1314 (void)rcu_segcblist_accelerate(&rdp->cblist, c);
1315 return;
1316 }
1317 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
David Brazdil0f672f62019-12-10 10:32:29 +00001318 needwake = rcu_accelerate_cbs(rnp, rdp);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001319 raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
1320 if (needwake)
David Brazdil0f672f62019-12-10 10:32:29 +00001321 rcu_gp_kthread_wake();
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001322}
1323
1324/*
1325 * Move any callbacks whose grace period has completed to the
1326 * RCU_DONE_TAIL sublist, then compact the remaining sublists and
1327 * assign ->gp_seq numbers to any callbacks in the RCU_NEXT_TAIL
1328 * sublist. This function is idempotent, so it does not hurt to
1329 * invoke it repeatedly. As long as it is not invoked -too- often...
1330 * Returns true if the RCU grace-period kthread needs to be awakened.
1331 *
1332 * The caller must hold rnp->lock with interrupts disabled.
1333 */
David Brazdil0f672f62019-12-10 10:32:29 +00001334static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001335{
David Brazdil0f672f62019-12-10 10:32:29 +00001336 rcu_lockdep_assert_cblist_protected(rdp);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001337 raw_lockdep_assert_held_rcu_node(rnp);
1338
1339 /* If no pending (not yet ready to invoke) callbacks, nothing to do. */
1340 if (!rcu_segcblist_pend_cbs(&rdp->cblist))
1341 return false;
1342
1343 /*
1344 * Find all callbacks whose ->gp_seq numbers indicate that they
1345 * are ready to invoke, and put them into the RCU_DONE_TAIL sublist.
1346 */
1347 rcu_segcblist_advance(&rdp->cblist, rnp->gp_seq);
1348
1349 /* Classify any remaining callbacks. */
David Brazdil0f672f62019-12-10 10:32:29 +00001350 return rcu_accelerate_cbs(rnp, rdp);
1351}
1352
1353/*
1354 * Move and classify callbacks, but only if doing so won't require
1355 * that the RCU grace-period kthread be awakened.
1356 */
1357static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp,
1358 struct rcu_data *rdp)
1359{
1360 rcu_lockdep_assert_cblist_protected(rdp);
1361 if (!rcu_seq_state(rcu_seq_current(&rnp->gp_seq)) ||
1362 !raw_spin_trylock_rcu_node(rnp))
1363 return;
1364 WARN_ON_ONCE(rcu_advance_cbs(rnp, rdp));
1365 raw_spin_unlock_rcu_node(rnp);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001366}
1367
1368/*
1369 * Update CPU-local rcu_data state to record the beginnings and ends of
1370 * grace periods. The caller must hold the ->lock of the leaf rcu_node
1371 * structure corresponding to the current CPU, and must have irqs disabled.
1372 * Returns true if the grace-period kthread needs to be awakened.
1373 */
David Brazdil0f672f62019-12-10 10:32:29 +00001374static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001375{
David Brazdil0f672f62019-12-10 10:32:29 +00001376 bool ret = false;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001377 bool need_gp;
David Brazdil0f672f62019-12-10 10:32:29 +00001378 const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
1379 rcu_segcblist_is_offloaded(&rdp->cblist);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001380
1381 raw_lockdep_assert_held_rcu_node(rnp);
1382
1383 if (rdp->gp_seq == rnp->gp_seq)
1384 return false; /* Nothing to do. */
1385
1386 /* Handle the ends of any preceding grace periods first. */
1387 if (rcu_seq_completed_gp(rdp->gp_seq, rnp->gp_seq) ||
1388 unlikely(READ_ONCE(rdp->gpwrap))) {
David Brazdil0f672f62019-12-10 10:32:29 +00001389 if (!offloaded)
1390 ret = rcu_advance_cbs(rnp, rdp); /* Advance CBs. */
1391 trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuend"));
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001392 } else {
David Brazdil0f672f62019-12-10 10:32:29 +00001393 if (!offloaded)
1394 ret = rcu_accelerate_cbs(rnp, rdp); /* Recent CBs. */
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001395 }
1396
1397 /* Now handle the beginnings of any new-to-this-CPU grace periods. */
1398 if (rcu_seq_new_gp(rdp->gp_seq, rnp->gp_seq) ||
1399 unlikely(READ_ONCE(rdp->gpwrap))) {
1400 /*
1401 * If the current grace period is waiting for this CPU,
1402 * set up to detect a quiescent state, otherwise don't
1403 * go looking for one.
1404 */
David Brazdil0f672f62019-12-10 10:32:29 +00001405 trace_rcu_grace_period(rcu_state.name, rnp->gp_seq, TPS("cpustart"));
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001406 need_gp = !!(rnp->qsmask & rdp->grpmask);
1407 rdp->cpu_no_qs.b.norm = need_gp;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001408 rdp->core_needs_qs = need_gp;
1409 zero_cpu_stall_ticks(rdp);
1410 }
1411 rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */
David Brazdil0f672f62019-12-10 10:32:29 +00001412 if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001413 rdp->gp_seq_needed = rnp->gp_seq_needed;
1414 WRITE_ONCE(rdp->gpwrap, false);
1415 rcu_gpnum_ovf(rnp, rdp);
1416 return ret;
1417}
1418
David Brazdil0f672f62019-12-10 10:32:29 +00001419static void note_gp_changes(struct rcu_data *rdp)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001420{
1421 unsigned long flags;
1422 bool needwake;
1423 struct rcu_node *rnp;
1424
1425 local_irq_save(flags);
1426 rnp = rdp->mynode;
1427 if ((rdp->gp_seq == rcu_seq_current(&rnp->gp_seq) &&
1428 !unlikely(READ_ONCE(rdp->gpwrap))) || /* w/out lock. */
1429 !raw_spin_trylock_rcu_node(rnp)) { /* irqs already off, so later. */
1430 local_irq_restore(flags);
1431 return;
1432 }
David Brazdil0f672f62019-12-10 10:32:29 +00001433 needwake = __note_gp_changes(rnp, rdp);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001434 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1435 if (needwake)
David Brazdil0f672f62019-12-10 10:32:29 +00001436 rcu_gp_kthread_wake();
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001437}
1438
David Brazdil0f672f62019-12-10 10:32:29 +00001439static void rcu_gp_slow(int delay)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001440{
1441 if (delay > 0 &&
David Brazdil0f672f62019-12-10 10:32:29 +00001442 !(rcu_seq_ctr(rcu_state.gp_seq) %
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001443 (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay)))
1444 schedule_timeout_uninterruptible(delay);
1445}
1446
1447/*
1448 * Initialize a new grace period. Return false if no grace period required.
1449 */
David Brazdil0f672f62019-12-10 10:32:29 +00001450static bool rcu_gp_init(void)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001451{
1452 unsigned long flags;
1453 unsigned long oldmask;
1454 unsigned long mask;
1455 struct rcu_data *rdp;
David Brazdil0f672f62019-12-10 10:32:29 +00001456 struct rcu_node *rnp = rcu_get_root();
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001457
David Brazdil0f672f62019-12-10 10:32:29 +00001458 WRITE_ONCE(rcu_state.gp_activity, jiffies);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001459 raw_spin_lock_irq_rcu_node(rnp);
David Brazdil0f672f62019-12-10 10:32:29 +00001460 if (!READ_ONCE(rcu_state.gp_flags)) {
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001461 /* Spurious wakeup, tell caller to go back to sleep. */
1462 raw_spin_unlock_irq_rcu_node(rnp);
1463 return false;
1464 }
David Brazdil0f672f62019-12-10 10:32:29 +00001465 WRITE_ONCE(rcu_state.gp_flags, 0); /* Clear all flags: New GP. */
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001466
David Brazdil0f672f62019-12-10 10:32:29 +00001467 if (WARN_ON_ONCE(rcu_gp_in_progress())) {
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001468 /*
1469 * Grace period already in progress, don't start another.
1470 * Not supposed to be able to happen.
1471 */
1472 raw_spin_unlock_irq_rcu_node(rnp);
1473 return false;
1474 }
1475
1476 /* Advance to a new grace period and initialize state. */
David Brazdil0f672f62019-12-10 10:32:29 +00001477 record_gp_stall_check_time();
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001478 /* Record GP times before starting GP, hence rcu_seq_start(). */
David Brazdil0f672f62019-12-10 10:32:29 +00001479 rcu_seq_start(&rcu_state.gp_seq);
1480 trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start"));
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001481 raw_spin_unlock_irq_rcu_node(rnp);
1482
1483 /*
1484 * Apply per-leaf buffered online and offline operations to the
1485 * rcu_node tree. Note that this new grace period need not wait
1486 * for subsequent online CPUs, and that quiescent-state forcing
1487 * will handle subsequent offline CPUs.
1488 */
David Brazdil0f672f62019-12-10 10:32:29 +00001489 rcu_state.gp_state = RCU_GP_ONOFF;
1490 rcu_for_each_leaf_node(rnp) {
1491 raw_spin_lock(&rcu_state.ofl_lock);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001492 raw_spin_lock_irq_rcu_node(rnp);
1493 if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
1494 !rnp->wait_blkd_tasks) {
1495 /* Nothing to do on this leaf rcu_node structure. */
1496 raw_spin_unlock_irq_rcu_node(rnp);
David Brazdil0f672f62019-12-10 10:32:29 +00001497 raw_spin_unlock(&rcu_state.ofl_lock);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001498 continue;
1499 }
1500
1501 /* Record old state, apply changes to ->qsmaskinit field. */
1502 oldmask = rnp->qsmaskinit;
1503 rnp->qsmaskinit = rnp->qsmaskinitnext;
1504
1505 /* If zero-ness of ->qsmaskinit changed, propagate up tree. */
1506 if (!oldmask != !rnp->qsmaskinit) {
1507 if (!oldmask) { /* First online CPU for rcu_node. */
1508 if (!rnp->wait_blkd_tasks) /* Ever offline? */
1509 rcu_init_new_rnp(rnp);
1510 } else if (rcu_preempt_has_tasks(rnp)) {
1511 rnp->wait_blkd_tasks = true; /* blocked tasks */
1512 } else { /* Last offline CPU and can propagate. */
1513 rcu_cleanup_dead_rnp(rnp);
1514 }
1515 }
1516
1517 /*
1518 * If all waited-on tasks from prior grace period are
1519 * done, and if all this rcu_node structure's CPUs are
1520 * still offline, propagate up the rcu_node tree and
1521 * clear ->wait_blkd_tasks. Otherwise, if one of this
1522 * rcu_node structure's CPUs has since come back online,
1523 * simply clear ->wait_blkd_tasks.
1524 */
1525 if (rnp->wait_blkd_tasks &&
1526 (!rcu_preempt_has_tasks(rnp) || rnp->qsmaskinit)) {
1527 rnp->wait_blkd_tasks = false;
1528 if (!rnp->qsmaskinit)
1529 rcu_cleanup_dead_rnp(rnp);
1530 }
1531
1532 raw_spin_unlock_irq_rcu_node(rnp);
David Brazdil0f672f62019-12-10 10:32:29 +00001533 raw_spin_unlock(&rcu_state.ofl_lock);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001534 }
David Brazdil0f672f62019-12-10 10:32:29 +00001535 rcu_gp_slow(gp_preinit_delay); /* Races with CPU hotplug. */
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001536
1537 /*
1538 * Set the quiescent-state-needed bits in all the rcu_node
David Brazdil0f672f62019-12-10 10:32:29 +00001539 * structures for all currently online CPUs in breadth-first
1540 * order, starting from the root rcu_node structure, relying on the
1541 * layout of the tree within the rcu_state.node[] array. Note that
1542 * other CPUs will access only the leaves of the hierarchy, thus
1543 * seeing that no grace period is in progress, at least until the
1544 * corresponding leaf node has been initialized.
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001545 *
1546 * The grace period cannot complete until the initialization
1547 * process finishes, because this kthread handles both.
1548 */
David Brazdil0f672f62019-12-10 10:32:29 +00001549 rcu_state.gp_state = RCU_GP_INIT;
1550 rcu_for_each_node_breadth_first(rnp) {
1551 rcu_gp_slow(gp_init_delay);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001552 raw_spin_lock_irqsave_rcu_node(rnp, flags);
David Brazdil0f672f62019-12-10 10:32:29 +00001553 rdp = this_cpu_ptr(&rcu_data);
1554 rcu_preempt_check_blocked_tasks(rnp);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001555 rnp->qsmask = rnp->qsmaskinit;
David Brazdil0f672f62019-12-10 10:32:29 +00001556 WRITE_ONCE(rnp->gp_seq, rcu_state.gp_seq);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001557 if (rnp == rdp->mynode)
David Brazdil0f672f62019-12-10 10:32:29 +00001558 (void)__note_gp_changes(rnp, rdp);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001559 rcu_preempt_boost_start_gp(rnp);
David Brazdil0f672f62019-12-10 10:32:29 +00001560 trace_rcu_grace_period_init(rcu_state.name, rnp->gp_seq,
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001561 rnp->level, rnp->grplo,
1562 rnp->grphi, rnp->qsmask);
1563 /* Quiescent states for tasks on any now-offline CPUs. */
1564 mask = rnp->qsmask & ~rnp->qsmaskinitnext;
1565 rnp->rcu_gp_init_mask = mask;
1566 if ((mask || rnp->wait_blkd_tasks) && rcu_is_leaf_node(rnp))
David Brazdil0f672f62019-12-10 10:32:29 +00001567 rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001568 else
1569 raw_spin_unlock_irq_rcu_node(rnp);
1570 cond_resched_tasks_rcu_qs();
David Brazdil0f672f62019-12-10 10:32:29 +00001571 WRITE_ONCE(rcu_state.gp_activity, jiffies);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001572 }
1573
1574 return true;
1575}
1576
1577/*
1578 * Helper function for swait_event_idle_exclusive() wakeup at force-quiescent-state
1579 * time.
1580 */
David Brazdil0f672f62019-12-10 10:32:29 +00001581static bool rcu_gp_fqs_check_wake(int *gfp)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001582{
David Brazdil0f672f62019-12-10 10:32:29 +00001583 struct rcu_node *rnp = rcu_get_root();
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001584
1585 /* Someone like call_rcu() requested a force-quiescent-state scan. */
David Brazdil0f672f62019-12-10 10:32:29 +00001586 *gfp = READ_ONCE(rcu_state.gp_flags);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001587 if (*gfp & RCU_GP_FLAG_FQS)
1588 return true;
1589
1590 /* The current grace period has completed. */
1591 if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp))
1592 return true;
1593
1594 return false;
1595}
1596
1597/*
1598 * Do one round of quiescent-state forcing.
1599 */
David Brazdil0f672f62019-12-10 10:32:29 +00001600static void rcu_gp_fqs(bool first_time)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001601{
David Brazdil0f672f62019-12-10 10:32:29 +00001602 struct rcu_node *rnp = rcu_get_root();
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001603
David Brazdil0f672f62019-12-10 10:32:29 +00001604 WRITE_ONCE(rcu_state.gp_activity, jiffies);
1605 rcu_state.n_force_qs++;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001606 if (first_time) {
1607 /* Collect dyntick-idle snapshots. */
David Brazdil0f672f62019-12-10 10:32:29 +00001608 force_qs_rnp(dyntick_save_progress_counter);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001609 } else {
1610 /* Handle dyntick-idle and offline CPUs. */
David Brazdil0f672f62019-12-10 10:32:29 +00001611 force_qs_rnp(rcu_implicit_dynticks_qs);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001612 }
1613 /* Clear flag to prevent immediate re-entry. */
David Brazdil0f672f62019-12-10 10:32:29 +00001614 if (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) {
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001615 raw_spin_lock_irq_rcu_node(rnp);
David Brazdil0f672f62019-12-10 10:32:29 +00001616 WRITE_ONCE(rcu_state.gp_flags,
1617 READ_ONCE(rcu_state.gp_flags) & ~RCU_GP_FLAG_FQS);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001618 raw_spin_unlock_irq_rcu_node(rnp);
1619 }
1620}
1621
1622/*
David Brazdil0f672f62019-12-10 10:32:29 +00001623 * Loop doing repeated quiescent-state forcing until the grace period ends.
1624 */
1625static void rcu_gp_fqs_loop(void)
1626{
1627 bool first_gp_fqs;
1628 int gf;
1629 unsigned long j;
1630 int ret;
1631 struct rcu_node *rnp = rcu_get_root();
1632
1633 first_gp_fqs = true;
1634 j = READ_ONCE(jiffies_till_first_fqs);
1635 ret = 0;
1636 for (;;) {
1637 if (!ret) {
1638 rcu_state.jiffies_force_qs = jiffies + j;
1639 WRITE_ONCE(rcu_state.jiffies_kick_kthreads,
1640 jiffies + (j ? 3 * j : 2));
1641 }
1642 trace_rcu_grace_period(rcu_state.name,
1643 READ_ONCE(rcu_state.gp_seq),
1644 TPS("fqswait"));
1645 rcu_state.gp_state = RCU_GP_WAIT_FQS;
1646 ret = swait_event_idle_timeout_exclusive(
1647 rcu_state.gp_wq, rcu_gp_fqs_check_wake(&gf), j);
1648 rcu_state.gp_state = RCU_GP_DOING_FQS;
1649 /* Locking provides needed memory barriers. */
1650 /* If grace period done, leave loop. */
1651 if (!READ_ONCE(rnp->qsmask) &&
1652 !rcu_preempt_blocked_readers_cgp(rnp))
1653 break;
1654 /* If time for quiescent-state forcing, do it. */
1655 if (ULONG_CMP_GE(jiffies, rcu_state.jiffies_force_qs) ||
1656 (gf & RCU_GP_FLAG_FQS)) {
1657 trace_rcu_grace_period(rcu_state.name,
1658 READ_ONCE(rcu_state.gp_seq),
1659 TPS("fqsstart"));
1660 rcu_gp_fqs(first_gp_fqs);
1661 first_gp_fqs = false;
1662 trace_rcu_grace_period(rcu_state.name,
1663 READ_ONCE(rcu_state.gp_seq),
1664 TPS("fqsend"));
1665 cond_resched_tasks_rcu_qs();
1666 WRITE_ONCE(rcu_state.gp_activity, jiffies);
1667 ret = 0; /* Force full wait till next FQS. */
1668 j = READ_ONCE(jiffies_till_next_fqs);
1669 } else {
1670 /* Deal with stray signal. */
1671 cond_resched_tasks_rcu_qs();
1672 WRITE_ONCE(rcu_state.gp_activity, jiffies);
1673 WARN_ON(signal_pending(current));
1674 trace_rcu_grace_period(rcu_state.name,
1675 READ_ONCE(rcu_state.gp_seq),
1676 TPS("fqswaitsig"));
1677 ret = 1; /* Keep old FQS timing. */
1678 j = jiffies;
1679 if (time_after(jiffies, rcu_state.jiffies_force_qs))
1680 j = 1;
1681 else
1682 j = rcu_state.jiffies_force_qs - j;
1683 }
1684 }
1685}
1686
1687/*
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001688 * Clean up after the old grace period.
1689 */
David Brazdil0f672f62019-12-10 10:32:29 +00001690static void rcu_gp_cleanup(void)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001691{
1692 unsigned long gp_duration;
1693 bool needgp = false;
1694 unsigned long new_gp_seq;
David Brazdil0f672f62019-12-10 10:32:29 +00001695 bool offloaded;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001696 struct rcu_data *rdp;
David Brazdil0f672f62019-12-10 10:32:29 +00001697 struct rcu_node *rnp = rcu_get_root();
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001698 struct swait_queue_head *sq;
1699
David Brazdil0f672f62019-12-10 10:32:29 +00001700 WRITE_ONCE(rcu_state.gp_activity, jiffies);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001701 raw_spin_lock_irq_rcu_node(rnp);
David Brazdil0f672f62019-12-10 10:32:29 +00001702 rcu_state.gp_end = jiffies;
1703 gp_duration = rcu_state.gp_end - rcu_state.gp_start;
1704 if (gp_duration > rcu_state.gp_max)
1705 rcu_state.gp_max = gp_duration;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001706
1707 /*
1708 * We know the grace period is complete, but to everyone else
1709 * it appears to still be ongoing. But it is also the case
1710 * that to everyone else it looks like there is nothing that
1711 * they can do to advance the grace period. It is therefore
1712 * safe for us to drop the lock in order to mark the grace
1713 * period as completed in all of the rcu_node structures.
1714 */
1715 raw_spin_unlock_irq_rcu_node(rnp);
1716
1717 /*
1718 * Propagate new ->gp_seq value to rcu_node structures so that
1719 * other CPUs don't have to wait until the start of the next grace
1720 * period to process their callbacks. This also avoids some nasty
1721 * RCU grace-period initialization races by forcing the end of
1722 * the current grace period to be completely recorded in all of
1723 * the rcu_node structures before the beginning of the next grace
1724 * period is recorded in any of the rcu_node structures.
1725 */
David Brazdil0f672f62019-12-10 10:32:29 +00001726 new_gp_seq = rcu_state.gp_seq;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001727 rcu_seq_end(&new_gp_seq);
David Brazdil0f672f62019-12-10 10:32:29 +00001728 rcu_for_each_node_breadth_first(rnp) {
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001729 raw_spin_lock_irq_rcu_node(rnp);
1730 if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)))
David Brazdil0f672f62019-12-10 10:32:29 +00001731 dump_blkd_tasks(rnp, 10);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001732 WARN_ON_ONCE(rnp->qsmask);
1733 WRITE_ONCE(rnp->gp_seq, new_gp_seq);
David Brazdil0f672f62019-12-10 10:32:29 +00001734 rdp = this_cpu_ptr(&rcu_data);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001735 if (rnp == rdp->mynode)
David Brazdil0f672f62019-12-10 10:32:29 +00001736 needgp = __note_gp_changes(rnp, rdp) || needgp;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001737 /* smp_mb() provided by prior unlock-lock pair. */
David Brazdil0f672f62019-12-10 10:32:29 +00001738 needgp = rcu_future_gp_cleanup(rnp) || needgp;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001739 sq = rcu_nocb_gp_get(rnp);
1740 raw_spin_unlock_irq_rcu_node(rnp);
1741 rcu_nocb_gp_cleanup(sq);
1742 cond_resched_tasks_rcu_qs();
David Brazdil0f672f62019-12-10 10:32:29 +00001743 WRITE_ONCE(rcu_state.gp_activity, jiffies);
1744 rcu_gp_slow(gp_cleanup_delay);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001745 }
David Brazdil0f672f62019-12-10 10:32:29 +00001746 rnp = rcu_get_root();
1747 raw_spin_lock_irq_rcu_node(rnp); /* GP before ->gp_seq update. */
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001748
David Brazdil0f672f62019-12-10 10:32:29 +00001749 /* Declare grace period done, trace first to use old GP number. */
1750 trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("end"));
1751 rcu_seq_end(&rcu_state.gp_seq);
1752 rcu_state.gp_state = RCU_GP_IDLE;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001753 /* Check for GP requests since above loop. */
David Brazdil0f672f62019-12-10 10:32:29 +00001754 rdp = this_cpu_ptr(&rcu_data);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001755 if (!needgp && ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed)) {
1756 trace_rcu_this_gp(rnp, rdp, rnp->gp_seq_needed,
1757 TPS("CleanupMore"));
1758 needgp = true;
1759 }
1760 /* Advance CBs to reduce false positives below. */
David Brazdil0f672f62019-12-10 10:32:29 +00001761 offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
1762 rcu_segcblist_is_offloaded(&rdp->cblist);
1763 if ((offloaded || !rcu_accelerate_cbs(rnp, rdp)) && needgp) {
1764 WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT);
1765 rcu_state.gp_req_activity = jiffies;
1766 trace_rcu_grace_period(rcu_state.name,
1767 READ_ONCE(rcu_state.gp_seq),
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001768 TPS("newreq"));
1769 } else {
David Brazdil0f672f62019-12-10 10:32:29 +00001770 WRITE_ONCE(rcu_state.gp_flags,
1771 rcu_state.gp_flags & RCU_GP_FLAG_INIT);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001772 }
1773 raw_spin_unlock_irq_rcu_node(rnp);
1774}
1775
1776/*
1777 * Body of kthread that handles grace periods.
1778 */
David Brazdil0f672f62019-12-10 10:32:29 +00001779static int __noreturn rcu_gp_kthread(void *unused)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001780{
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001781 rcu_bind_gp_kthread();
1782 for (;;) {
1783
1784 /* Handle grace-period start. */
1785 for (;;) {
David Brazdil0f672f62019-12-10 10:32:29 +00001786 trace_rcu_grace_period(rcu_state.name,
1787 READ_ONCE(rcu_state.gp_seq),
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001788 TPS("reqwait"));
David Brazdil0f672f62019-12-10 10:32:29 +00001789 rcu_state.gp_state = RCU_GP_WAIT_GPS;
1790 swait_event_idle_exclusive(rcu_state.gp_wq,
1791 READ_ONCE(rcu_state.gp_flags) &
1792 RCU_GP_FLAG_INIT);
1793 rcu_state.gp_state = RCU_GP_DONE_GPS;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001794 /* Locking provides needed memory barrier. */
David Brazdil0f672f62019-12-10 10:32:29 +00001795 if (rcu_gp_init())
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001796 break;
1797 cond_resched_tasks_rcu_qs();
David Brazdil0f672f62019-12-10 10:32:29 +00001798 WRITE_ONCE(rcu_state.gp_activity, jiffies);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001799 WARN_ON(signal_pending(current));
David Brazdil0f672f62019-12-10 10:32:29 +00001800 trace_rcu_grace_period(rcu_state.name,
1801 READ_ONCE(rcu_state.gp_seq),
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001802 TPS("reqwaitsig"));
1803 }
1804
1805 /* Handle quiescent-state forcing. */
David Brazdil0f672f62019-12-10 10:32:29 +00001806 rcu_gp_fqs_loop();
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001807
1808 /* Handle grace-period end. */
David Brazdil0f672f62019-12-10 10:32:29 +00001809 rcu_state.gp_state = RCU_GP_CLEANUP;
1810 rcu_gp_cleanup();
1811 rcu_state.gp_state = RCU_GP_CLEANED;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001812 }
1813}
1814
1815/*
David Brazdil0f672f62019-12-10 10:32:29 +00001816 * Report a full set of quiescent states to the rcu_state data structure.
1817 * Invoke rcu_gp_kthread_wake() to awaken the grace-period kthread if
1818 * another grace period is required. Whether we wake the grace-period
1819 * kthread or it awakens itself for the next round of quiescent-state
1820 * forcing, that kthread will clean up after the just-completed grace
1821 * period. Note that the caller must hold rnp->lock, which is released
1822 * before return.
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001823 */
David Brazdil0f672f62019-12-10 10:32:29 +00001824static void rcu_report_qs_rsp(unsigned long flags)
1825 __releases(rcu_get_root()->lock)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001826{
David Brazdil0f672f62019-12-10 10:32:29 +00001827 raw_lockdep_assert_held_rcu_node(rcu_get_root());
1828 WARN_ON_ONCE(!rcu_gp_in_progress());
1829 WRITE_ONCE(rcu_state.gp_flags,
1830 READ_ONCE(rcu_state.gp_flags) | RCU_GP_FLAG_FQS);
1831 raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(), flags);
1832 rcu_gp_kthread_wake();
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001833}
1834
1835/*
1836 * Similar to rcu_report_qs_rdp(), for which it is a helper function.
1837 * Allows quiescent states for a group of CPUs to be reported at one go
1838 * to the specified rcu_node structure, though all the CPUs in the group
1839 * must be represented by the same rcu_node structure (which need not be a
1840 * leaf rcu_node structure, though it often will be). The gps parameter
1841 * is the grace-period snapshot, which means that the quiescent states
1842 * are valid only if rnp->gp_seq is equal to gps. That structure's lock
1843 * must be held upon entry, and it is released before return.
1844 *
1845 * As a special case, if mask is zero, the bit-already-cleared check is
1846 * disabled. This allows propagating quiescent state due to resumed tasks
1847 * during grace-period initialization.
1848 */
David Brazdil0f672f62019-12-10 10:32:29 +00001849static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
1850 unsigned long gps, unsigned long flags)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001851 __releases(rnp->lock)
1852{
1853 unsigned long oldmask = 0;
1854 struct rcu_node *rnp_c;
1855
1856 raw_lockdep_assert_held_rcu_node(rnp);
1857
1858 /* Walk up the rcu_node hierarchy. */
1859 for (;;) {
1860 if ((!(rnp->qsmask & mask) && mask) || rnp->gp_seq != gps) {
1861
1862 /*
1863 * Our bit has already been cleared, or the
1864 * relevant grace period is already over, so done.
1865 */
1866 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1867 return;
1868 }
1869 WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */
1870 WARN_ON_ONCE(!rcu_is_leaf_node(rnp) &&
1871 rcu_preempt_blocked_readers_cgp(rnp));
1872 rnp->qsmask &= ~mask;
David Brazdil0f672f62019-12-10 10:32:29 +00001873 trace_rcu_quiescent_state_report(rcu_state.name, rnp->gp_seq,
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001874 mask, rnp->qsmask, rnp->level,
1875 rnp->grplo, rnp->grphi,
1876 !!rnp->gp_tasks);
1877 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
1878
1879 /* Other bits still set at this level, so done. */
1880 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1881 return;
1882 }
1883 rnp->completedqs = rnp->gp_seq;
1884 mask = rnp->grpmask;
1885 if (rnp->parent == NULL) {
1886
1887 /* No more levels. Exit loop holding root lock. */
1888
1889 break;
1890 }
1891 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1892 rnp_c = rnp;
1893 rnp = rnp->parent;
1894 raw_spin_lock_irqsave_rcu_node(rnp, flags);
1895 oldmask = rnp_c->qsmask;
1896 }
1897
1898 /*
1899 * Get here if we are the last CPU to pass through a quiescent
1900 * state for this grace period. Invoke rcu_report_qs_rsp()
1901 * to clean up and start the next grace period if one is needed.
1902 */
David Brazdil0f672f62019-12-10 10:32:29 +00001903 rcu_report_qs_rsp(flags); /* releases rnp->lock. */
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001904}
1905
1906/*
1907 * Record a quiescent state for all tasks that were previously queued
1908 * on the specified rcu_node structure and that were blocking the current
David Brazdil0f672f62019-12-10 10:32:29 +00001909 * RCU grace period. The caller must hold the corresponding rnp->lock with
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001910 * irqs disabled, and this lock is released upon return, but irqs remain
1911 * disabled.
1912 */
1913static void __maybe_unused
David Brazdil0f672f62019-12-10 10:32:29 +00001914rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001915 __releases(rnp->lock)
1916{
1917 unsigned long gps;
1918 unsigned long mask;
1919 struct rcu_node *rnp_p;
1920
1921 raw_lockdep_assert_held_rcu_node(rnp);
David Brazdil0f672f62019-12-10 10:32:29 +00001922 if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPTION)) ||
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001923 WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) ||
1924 rnp->qsmask != 0) {
1925 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1926 return; /* Still need more quiescent states! */
1927 }
1928
1929 rnp->completedqs = rnp->gp_seq;
1930 rnp_p = rnp->parent;
1931 if (rnp_p == NULL) {
1932 /*
1933 * Only one rcu_node structure in the tree, so don't
1934 * try to report up to its nonexistent parent!
1935 */
David Brazdil0f672f62019-12-10 10:32:29 +00001936 rcu_report_qs_rsp(flags);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001937 return;
1938 }
1939
1940 /* Report up the rest of the hierarchy, tracking current ->gp_seq. */
1941 gps = rnp->gp_seq;
1942 mask = rnp->grpmask;
1943 raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
1944 raw_spin_lock_rcu_node(rnp_p); /* irqs already disabled. */
David Brazdil0f672f62019-12-10 10:32:29 +00001945 rcu_report_qs_rnp(mask, rnp_p, gps, flags);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001946}
1947
1948/*
1949 * Record a quiescent state for the specified CPU to that CPU's rcu_data
1950 * structure. This must be called from the specified CPU.
1951 */
1952static void
David Brazdil0f672f62019-12-10 10:32:29 +00001953rcu_report_qs_rdp(int cpu, struct rcu_data *rdp)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001954{
1955 unsigned long flags;
1956 unsigned long mask;
David Brazdil0f672f62019-12-10 10:32:29 +00001957 bool needwake = false;
1958 const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
1959 rcu_segcblist_is_offloaded(&rdp->cblist);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001960 struct rcu_node *rnp;
1961
1962 rnp = rdp->mynode;
1963 raw_spin_lock_irqsave_rcu_node(rnp, flags);
1964 if (rdp->cpu_no_qs.b.norm || rdp->gp_seq != rnp->gp_seq ||
1965 rdp->gpwrap) {
1966
1967 /*
1968 * The grace period in which this quiescent state was
1969 * recorded has ended, so don't report it upwards.
1970 * We will instead need a new quiescent state that lies
1971 * within the current grace period.
1972 */
1973 rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001974 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1975 return;
1976 }
1977 mask = rdp->grpmask;
David Brazdil0f672f62019-12-10 10:32:29 +00001978 rdp->core_needs_qs = false;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001979 if ((rnp->qsmask & mask) == 0) {
1980 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1981 } else {
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001982 /*
1983 * This GP can't end until cpu checks in, so all of our
1984 * callbacks can be processed during the next GP.
1985 */
David Brazdil0f672f62019-12-10 10:32:29 +00001986 if (!offloaded)
1987 needwake = rcu_accelerate_cbs(rnp, rdp);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001988
David Brazdil0f672f62019-12-10 10:32:29 +00001989 rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001990 /* ^^^ Released rnp->lock */
1991 if (needwake)
David Brazdil0f672f62019-12-10 10:32:29 +00001992 rcu_gp_kthread_wake();
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00001993 }
1994}
1995
1996/*
1997 * Check to see if there is a new grace period of which this CPU
1998 * is not yet aware, and if so, set up local rcu_data state for it.
1999 * Otherwise, see if this CPU has just passed through its first
2000 * quiescent state for this grace period, and record that fact if so.
2001 */
2002static void
David Brazdil0f672f62019-12-10 10:32:29 +00002003rcu_check_quiescent_state(struct rcu_data *rdp)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002004{
2005 /* Check for grace-period ends and beginnings. */
David Brazdil0f672f62019-12-10 10:32:29 +00002006 note_gp_changes(rdp);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002007
2008 /*
2009 * Does this CPU still need to do its part for current grace period?
2010 * If no, return and let the other CPUs do their part as well.
2011 */
2012 if (!rdp->core_needs_qs)
2013 return;
2014
2015 /*
2016 * Was there a quiescent state since the beginning of the grace
2017 * period? If no, then exit and wait for the next call.
2018 */
2019 if (rdp->cpu_no_qs.b.norm)
2020 return;
2021
2022 /*
2023 * Tell RCU we are done (but rcu_report_qs_rdp() will be the
2024 * judge of that).
2025 */
David Brazdil0f672f62019-12-10 10:32:29 +00002026 rcu_report_qs_rdp(rdp->cpu, rdp);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002027}
2028
2029/*
David Brazdil0f672f62019-12-10 10:32:29 +00002030 * Near the end of the offline process. Trace the fact that this CPU
2031 * is going offline.
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002032 */
David Brazdil0f672f62019-12-10 10:32:29 +00002033int rcutree_dying_cpu(unsigned int cpu)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002034{
David Brazdil0f672f62019-12-10 10:32:29 +00002035 bool blkd;
2036 struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
2037 struct rcu_node *rnp = rdp->mynode;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002038
2039 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
David Brazdil0f672f62019-12-10 10:32:29 +00002040 return 0;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002041
David Brazdil0f672f62019-12-10 10:32:29 +00002042 blkd = !!(rnp->qsmask & rdp->grpmask);
2043 trace_rcu_grace_period(rcu_state.name, rnp->gp_seq,
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002044 blkd ? TPS("cpuofl") : TPS("cpuofl-bgp"));
David Brazdil0f672f62019-12-10 10:32:29 +00002045 return 0;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002046}
2047
2048/*
2049 * All CPUs for the specified rcu_node structure have gone offline,
2050 * and all tasks that were preempted within an RCU read-side critical
2051 * section while running on one of those CPUs have since exited their RCU
2052 * read-side critical section. Some other CPU is reporting this fact with
2053 * the specified rcu_node structure's ->lock held and interrupts disabled.
2054 * This function therefore goes up the tree of rcu_node structures,
2055 * clearing the corresponding bits in the ->qsmaskinit fields. Note that
2056 * the leaf rcu_node structure's ->qsmaskinit field has already been
2057 * updated.
2058 *
2059 * This function does check that the specified rcu_node structure has
2060 * all CPUs offline and no blocked tasks, so it is OK to invoke it
2061 * prematurely. That said, invoking it after the fact will cost you
2062 * a needless lock acquisition. So once it has done its work, don't
2063 * invoke it again.
2064 */
2065static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
2066{
2067 long mask;
2068 struct rcu_node *rnp = rnp_leaf;
2069
2070 raw_lockdep_assert_held_rcu_node(rnp_leaf);
2071 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
2072 WARN_ON_ONCE(rnp_leaf->qsmaskinit) ||
2073 WARN_ON_ONCE(rcu_preempt_has_tasks(rnp_leaf)))
2074 return;
2075 for (;;) {
2076 mask = rnp->grpmask;
2077 rnp = rnp->parent;
2078 if (!rnp)
2079 break;
2080 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
2081 rnp->qsmaskinit &= ~mask;
2082 /* Between grace periods, so better already be zero! */
2083 WARN_ON_ONCE(rnp->qsmask);
2084 if (rnp->qsmaskinit) {
2085 raw_spin_unlock_rcu_node(rnp);
2086 /* irqs remain disabled. */
2087 return;
2088 }
2089 raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
2090 }
2091}
2092
2093/*
2094 * The CPU has been completely removed, and some other CPU is reporting
2095 * this fact from process context. Do the remainder of the cleanup.
2096 * There can only be one CPU hotplug operation at a time, so no need for
2097 * explicit locking.
2098 */
David Brazdil0f672f62019-12-10 10:32:29 +00002099int rcutree_dead_cpu(unsigned int cpu)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002100{
David Brazdil0f672f62019-12-10 10:32:29 +00002101 struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002102 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
2103
2104 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
David Brazdil0f672f62019-12-10 10:32:29 +00002105 return 0;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002106
2107 /* Adjust any no-longer-needed kthreads. */
2108 rcu_boost_kthread_setaffinity(rnp, -1);
David Brazdil0f672f62019-12-10 10:32:29 +00002109 /* Do any needed no-CB deferred wakeups from this CPU. */
2110 do_nocb_deferred_wakeup(per_cpu_ptr(&rcu_data, cpu));
2111 return 0;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002112}
2113
/*
 * Invoke any RCU callbacks that have made it to the end of their grace
 * period.  Throttle as specified by rdp->blimit.
 */
David Brazdil0f672f62019-12-10 10:32:29 +00002118static void rcu_do_batch(struct rcu_data *rdp)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002119{
2120 unsigned long flags;
David Brazdil0f672f62019-12-10 10:32:29 +00002121 const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
2122 rcu_segcblist_is_offloaded(&rdp->cblist);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002123 struct rcu_head *rhp;
2124 struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
2125 long bl, count;
David Brazdil0f672f62019-12-10 10:32:29 +00002126 long pending, tlimit = 0;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002127
2128 /* If no callbacks are ready, just return. */
2129 if (!rcu_segcblist_ready_cbs(&rdp->cblist)) {
David Brazdil0f672f62019-12-10 10:32:29 +00002130 trace_rcu_batch_start(rcu_state.name,
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002131 rcu_segcblist_n_lazy_cbs(&rdp->cblist),
2132 rcu_segcblist_n_cbs(&rdp->cblist), 0);
David Brazdil0f672f62019-12-10 10:32:29 +00002133 trace_rcu_batch_end(rcu_state.name, 0,
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002134 !rcu_segcblist_empty(&rdp->cblist),
2135 need_resched(), is_idle_task(current),
2136 rcu_is_callbacks_kthread());
2137 return;
2138 }
2139
2140 /*
2141 * Extract the list of ready callbacks, disabling to prevent
2142 * races with call_rcu() from interrupt handlers. Leave the
2143 * callback counts, as rcu_barrier() needs to be conservative.
2144 */
2145 local_irq_save(flags);
David Brazdil0f672f62019-12-10 10:32:29 +00002146 rcu_nocb_lock(rdp);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002147 WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
David Brazdil0f672f62019-12-10 10:32:29 +00002148 pending = rcu_segcblist_n_cbs(&rdp->cblist);
2149 bl = max(rdp->blimit, pending >> rcu_divisor);
2150 if (unlikely(bl > 100))
2151 tlimit = local_clock() + rcu_resched_ns;
2152 trace_rcu_batch_start(rcu_state.name,
2153 rcu_segcblist_n_lazy_cbs(&rdp->cblist),
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002154 rcu_segcblist_n_cbs(&rdp->cblist), bl);
2155 rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl);
David Brazdil0f672f62019-12-10 10:32:29 +00002156 if (offloaded)
2157 rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
2158 rcu_nocb_unlock_irqrestore(rdp, flags);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002159
2160 /* Invoke callbacks. */
2161 rhp = rcu_cblist_dequeue(&rcl);
2162 for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) {
2163 debug_rcu_head_unqueue(rhp);
David Brazdil0f672f62019-12-10 10:32:29 +00002164 if (__rcu_reclaim(rcu_state.name, rhp))
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002165 rcu_cblist_dequeued_lazy(&rcl);
2166 /*
2167 * Stop only if limit reached and CPU has something to do.
2168 * Note: The rcl structure counts down from zero.
2169 */
David Brazdil0f672f62019-12-10 10:32:29 +00002170 if (-rcl.len >= bl && !offloaded &&
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002171 (need_resched() ||
2172 (!is_idle_task(current) && !rcu_is_callbacks_kthread())))
2173 break;
David Brazdil0f672f62019-12-10 10:32:29 +00002174 if (unlikely(tlimit)) {
2175 /* only call local_clock() every 32 callbacks */
2176 if (likely((-rcl.len & 31) || local_clock() < tlimit))
2177 continue;
2178 /* Exceeded the time limit, so leave. */
2179 break;
2180 }
2181 if (offloaded) {
2182 WARN_ON_ONCE(in_serving_softirq());
2183 local_bh_enable();
2184 lockdep_assert_irqs_enabled();
2185 cond_resched_tasks_rcu_qs();
2186 lockdep_assert_irqs_enabled();
2187 local_bh_disable();
2188 }
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002189 }
2190
2191 local_irq_save(flags);
David Brazdil0f672f62019-12-10 10:32:29 +00002192 rcu_nocb_lock(rdp);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002193 count = -rcl.len;
David Brazdil0f672f62019-12-10 10:32:29 +00002194 trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(),
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002195 is_idle_task(current), rcu_is_callbacks_kthread());
2196
2197 /* Update counts and requeue any remaining callbacks. */
2198 rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl);
2199 smp_mb(); /* List handling before counting for rcu_barrier(). */
2200 rcu_segcblist_insert_count(&rdp->cblist, &rcl);
2201
2202 /* Reinstate batch limit if we have worked down the excess. */
2203 count = rcu_segcblist_n_cbs(&rdp->cblist);
David Brazdil0f672f62019-12-10 10:32:29 +00002204 if (rdp->blimit >= DEFAULT_MAX_RCU_BLIMIT && count <= qlowmark)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002205 rdp->blimit = blimit;
2206
2207 /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
2208 if (count == 0 && rdp->qlen_last_fqs_check != 0) {
2209 rdp->qlen_last_fqs_check = 0;
David Brazdil0f672f62019-12-10 10:32:29 +00002210 rdp->n_force_qs_snap = rcu_state.n_force_qs;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002211 } else if (count < rdp->qlen_last_fqs_check - qhimark)
2212 rdp->qlen_last_fqs_check = count;
2213
2214 /*
2215 * The following usually indicates a double call_rcu(). To track
2216 * this down, try building with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y.
2217 */
David Brazdil0f672f62019-12-10 10:32:29 +00002218 WARN_ON_ONCE(count == 0 && !rcu_segcblist_empty(&rdp->cblist));
2219 WARN_ON_ONCE(!IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
2220 count != 0 && rcu_segcblist_empty(&rdp->cblist));
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002221
David Brazdil0f672f62019-12-10 10:32:29 +00002222 rcu_nocb_unlock_irqrestore(rdp, flags);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002223
2224 /* Re-invoke RCU core processing if there are callbacks remaining. */
David Brazdil0f672f62019-12-10 10:32:29 +00002225 if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist))
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002226 invoke_rcu_core();
2227}
2228
/*
 * This function is invoked from each scheduling-clock interrupt,
 * and checks to see if this CPU is in a non-context-switch quiescent
 * state, for example, user mode or idle loop.  It also schedules RCU
 * core processing.  If the current grace period has gone on too long,
 * it will ask the scheduler to manufacture a context switch for the sole
 * purpose of providing the needed quiescent state.
 */
David Brazdil0f672f62019-12-10 10:32:29 +00002237void rcu_sched_clock_irq(int user)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002238{
2239 trace_rcu_utilization(TPS("Start scheduler-tick"));
David Brazdil0f672f62019-12-10 10:32:29 +00002240 raw_cpu_inc(rcu_data.ticks_this_gp);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002241 /* The load-acquire pairs with the store-release setting to true. */
David Brazdil0f672f62019-12-10 10:32:29 +00002242 if (smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) {
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002243 /* Idle and userspace execution already are quiescent states. */
2244 if (!rcu_is_cpu_rrupt_from_idle() && !user) {
2245 set_tsk_need_resched(current);
2246 set_preempt_need_resched();
2247 }
David Brazdil0f672f62019-12-10 10:32:29 +00002248 __this_cpu_write(rcu_data.rcu_urgent_qs, false);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002249 }
David Brazdil0f672f62019-12-10 10:32:29 +00002250 rcu_flavor_sched_clock_irq(user);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002251 if (rcu_pending())
2252 invoke_rcu_core();
2253
2254 trace_rcu_utilization(TPS("End scheduler-tick"));
2255}
2256
2257/*
David Brazdil0f672f62019-12-10 10:32:29 +00002258 * Scan the leaf rcu_node structures. For each structure on which all
2259 * CPUs have reported a quiescent state and on which there are tasks
2260 * blocking the current grace period, initiate RCU priority boosting.
2261 * Otherwise, invoke the specified function to check dyntick state for
2262 * each CPU that has not yet reported a quiescent state.
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002263 */
David Brazdil0f672f62019-12-10 10:32:29 +00002264static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002265{
2266 int cpu;
2267 unsigned long flags;
2268 unsigned long mask;
2269 struct rcu_node *rnp;
2270
David Brazdil0f672f62019-12-10 10:32:29 +00002271 rcu_for_each_leaf_node(rnp) {
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002272 cond_resched_tasks_rcu_qs();
2273 mask = 0;
2274 raw_spin_lock_irqsave_rcu_node(rnp, flags);
2275 if (rnp->qsmask == 0) {
David Brazdil0f672f62019-12-10 10:32:29 +00002276 if (!IS_ENABLED(CONFIG_PREEMPTION) ||
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002277 rcu_preempt_blocked_readers_cgp(rnp)) {
2278 /*
2279 * No point in scanning bits because they
2280 * are all zero. But we might need to
2281 * priority-boost blocked readers.
2282 */
2283 rcu_initiate_boost(rnp, flags);
2284 /* rcu_initiate_boost() releases rnp->lock */
2285 continue;
2286 }
2287 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2288 continue;
2289 }
2290 for_each_leaf_node_possible_cpu(rnp, cpu) {
2291 unsigned long bit = leaf_node_cpu_bit(rnp, cpu);
2292 if ((rnp->qsmask & bit) != 0) {
David Brazdil0f672f62019-12-10 10:32:29 +00002293 if (f(per_cpu_ptr(&rcu_data, cpu)))
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002294 mask |= bit;
2295 }
2296 }
2297 if (mask != 0) {
2298 /* Idle/offline CPUs, report (releases rnp->lock). */
David Brazdil0f672f62019-12-10 10:32:29 +00002299 rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002300 } else {
2301 /* Nothing to do here, so just drop the lock. */
2302 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2303 }
2304 }
2305}
2306
2307/*
2308 * Force quiescent states on reluctant CPUs, and also detect which
2309 * CPUs are in dyntick-idle mode.
2310 */
David Brazdil0f672f62019-12-10 10:32:29 +00002311void rcu_force_quiescent_state(void)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002312{
2313 unsigned long flags;
2314 bool ret;
2315 struct rcu_node *rnp;
2316 struct rcu_node *rnp_old = NULL;
2317
2318 /* Funnel through hierarchy to reduce memory contention. */
David Brazdil0f672f62019-12-10 10:32:29 +00002319 rnp = __this_cpu_read(rcu_data.mynode);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002320 for (; rnp != NULL; rnp = rnp->parent) {
David Brazdil0f672f62019-12-10 10:32:29 +00002321 ret = (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) ||
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002322 !raw_spin_trylock(&rnp->fqslock);
2323 if (rnp_old != NULL)
2324 raw_spin_unlock(&rnp_old->fqslock);
2325 if (ret)
2326 return;
2327 rnp_old = rnp;
2328 }
David Brazdil0f672f62019-12-10 10:32:29 +00002329 /* rnp_old == rcu_get_root(), rnp == NULL. */
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002330
2331 /* Reached the root of the rcu_node tree, acquire lock. */
2332 raw_spin_lock_irqsave_rcu_node(rnp_old, flags);
2333 raw_spin_unlock(&rnp_old->fqslock);
David Brazdil0f672f62019-12-10 10:32:29 +00002334 if (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) {
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002335 raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);
2336 return; /* Someone beat us to it. */
2337 }
David Brazdil0f672f62019-12-10 10:32:29 +00002338 WRITE_ONCE(rcu_state.gp_flags,
2339 READ_ONCE(rcu_state.gp_flags) | RCU_GP_FLAG_FQS);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002340 raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);
David Brazdil0f672f62019-12-10 10:32:29 +00002341 rcu_gp_kthread_wake();
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002342}
David Brazdil0f672f62019-12-10 10:32:29 +00002343EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002344
David Brazdil0f672f62019-12-10 10:32:29 +00002345/* Perform RCU core processing work for the current CPU. */
2346static __latent_entropy void rcu_core(void)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002347{
2348 unsigned long flags;
David Brazdil0f672f62019-12-10 10:32:29 +00002349 struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002350 struct rcu_node *rnp = rdp->mynode;
David Brazdil0f672f62019-12-10 10:32:29 +00002351 const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
2352 rcu_segcblist_is_offloaded(&rdp->cblist);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002353
2354 if (cpu_is_offline(smp_processor_id()))
2355 return;
2356 trace_rcu_utilization(TPS("Start RCU core"));
David Brazdil0f672f62019-12-10 10:32:29 +00002357 WARN_ON_ONCE(!rdp->beenonline);
2358
2359 /* Report any deferred quiescent states if preemption enabled. */
2360 if (!(preempt_count() & PREEMPT_MASK)) {
2361 rcu_preempt_deferred_qs(current);
2362 } else if (rcu_preempt_need_deferred_qs(current)) {
2363 set_tsk_need_resched(current);
2364 set_preempt_need_resched();
2365 }
2366
2367 /* Update RCU state based on any recent quiescent states. */
2368 rcu_check_quiescent_state(rdp);
2369
2370 /* No grace period and unregistered callbacks? */
2371 if (!rcu_gp_in_progress() &&
2372 rcu_segcblist_is_enabled(&rdp->cblist) && !offloaded) {
2373 local_irq_save(flags);
2374 if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
2375 rcu_accelerate_cbs_unlocked(rnp, rdp);
2376 local_irq_restore(flags);
2377 }
2378
2379 rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check());
2380
2381 /* If there are callbacks ready, invoke them. */
2382 if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist) &&
2383 likely(READ_ONCE(rcu_scheduler_fully_active)))
2384 rcu_do_batch(rdp);
2385
2386 /* Do any needed deferred wakeups of rcuo kthreads. */
2387 do_nocb_deferred_wakeup(rdp);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002388 trace_rcu_utilization(TPS("End RCU core"));
2389}
2390
static void rcu_core_si(struct softirq_action *h)
{
	/* Thin wrapper taking a softirq_action; @h itself is not used. */
	rcu_core();
}
2395
David Brazdil0f672f62019-12-10 10:32:29 +00002396static void rcu_wake_cond(struct task_struct *t, int status)
2397{
2398 /*
2399 * If the thread is yielding, only wake it when this
2400 * is invoked from idle
2401 */
2402 if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current)))
2403 wake_up_process(t);
2404}
2405
2406static void invoke_rcu_core_kthread(void)
2407{
2408 struct task_struct *t;
2409 unsigned long flags;
2410
2411 local_irq_save(flags);
2412 __this_cpu_write(rcu_data.rcu_cpu_has_work, 1);
2413 t = __this_cpu_read(rcu_data.rcu_cpu_kthread_task);
2414 if (t != NULL && t != current)
2415 rcu_wake_cond(t, __this_cpu_read(rcu_data.rcu_cpu_kthread_status));
2416 local_irq_restore(flags);
2417}
2418
/*
 * Schedule RCU core processing on this CPU, either by raising
 * RCU_SOFTIRQ or by waking up this CPU's rcuc kthread.
 */
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002422static void invoke_rcu_core(void)
2423{
David Brazdil0f672f62019-12-10 10:32:29 +00002424 if (!cpu_online(smp_processor_id()))
2425 return;
2426 if (use_softirq)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002427 raise_softirq(RCU_SOFTIRQ);
David Brazdil0f672f62019-12-10 10:32:29 +00002428 else
2429 invoke_rcu_core_kthread();
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002430}
2431
David Brazdil0f672f62019-12-10 10:32:29 +00002432static void rcu_cpu_kthread_park(unsigned int cpu)
2433{
2434 per_cpu(rcu_data.rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
2435}
2436
2437static int rcu_cpu_kthread_should_run(unsigned int cpu)
2438{
2439 return __this_cpu_read(rcu_data.rcu_cpu_has_work);
2440}
2441
2442/*
2443 * Per-CPU kernel thread that invokes RCU callbacks. This replaces
2444 * the RCU softirq used in configurations of RCU that do not support RCU
2445 * priority boosting.
2446 */
2447static void rcu_cpu_kthread(unsigned int cpu)
2448{
2449 unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status);
2450 char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work);
2451 int spincnt;
2452
2453 for (spincnt = 0; spincnt < 10; spincnt++) {
2454 trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
2455 local_bh_disable();
2456 *statusp = RCU_KTHREAD_RUNNING;
2457 local_irq_disable();
2458 work = *workp;
2459 *workp = 0;
2460 local_irq_enable();
2461 if (work)
2462 rcu_core();
2463 local_bh_enable();
2464 if (*workp == 0) {
2465 trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
2466 *statusp = RCU_KTHREAD_WAITING;
2467 return;
2468 }
2469 }
2470 *statusp = RCU_KTHREAD_YIELDING;
2471 trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
2472 schedule_timeout_interruptible(2);
2473 trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
2474 *statusp = RCU_KTHREAD_WAITING;
2475}
2476
2477static struct smp_hotplug_thread rcu_cpu_thread_spec = {
2478 .store = &rcu_data.rcu_cpu_kthread_task,
2479 .thread_should_run = rcu_cpu_kthread_should_run,
2480 .thread_fn = rcu_cpu_kthread,
2481 .thread_comm = "rcuc/%u",
2482 .setup = rcu_cpu_kthread_setup,
2483 .park = rcu_cpu_kthread_park,
2484};
2485
2486/*
2487 * Spawn per-CPU RCU core processing kthreads.
2488 */
2489static int __init rcu_spawn_core_kthreads(void)
2490{
2491 int cpu;
2492
2493 for_each_possible_cpu(cpu)
2494 per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0;
2495 if (!IS_ENABLED(CONFIG_RCU_BOOST) && use_softirq)
2496 return 0;
2497 WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec),
2498 "%s: Could not start rcuc kthread, OOM is now expected behavior\n", __func__);
2499 return 0;
2500}
David Brazdil0f672f62019-12-10 10:32:29 +00002501
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002502/*
2503 * Handle any core-RCU processing required by a call_rcu() invocation.
2504 */
David Brazdil0f672f62019-12-10 10:32:29 +00002505static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head,
2506 unsigned long flags)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002507{
2508 /*
2509 * If called from an extended quiescent state, invoke the RCU
2510 * core in order to force a re-evaluation of RCU's idleness.
2511 */
2512 if (!rcu_is_watching())
2513 invoke_rcu_core();
2514
2515 /* If interrupts were disabled or CPU offline, don't invoke RCU core. */
2516 if (irqs_disabled_flags(flags) || cpu_is_offline(smp_processor_id()))
2517 return;
2518
2519 /*
2520 * Force the grace period if too many callbacks or too long waiting.
David Brazdil0f672f62019-12-10 10:32:29 +00002521 * Enforce hysteresis, and don't invoke rcu_force_quiescent_state()
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002522 * if some other CPU has recently done so. Also, don't bother
David Brazdil0f672f62019-12-10 10:32:29 +00002523 * invoking rcu_force_quiescent_state() if the newly enqueued callback
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002524 * is the only one waiting for a grace period to complete.
2525 */
2526 if (unlikely(rcu_segcblist_n_cbs(&rdp->cblist) >
2527 rdp->qlen_last_fqs_check + qhimark)) {
2528
2529 /* Are we ignoring a completed grace period? */
David Brazdil0f672f62019-12-10 10:32:29 +00002530 note_gp_changes(rdp);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002531
2532 /* Start a new grace period if one not already started. */
David Brazdil0f672f62019-12-10 10:32:29 +00002533 if (!rcu_gp_in_progress()) {
2534 rcu_accelerate_cbs_unlocked(rdp->mynode, rdp);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002535 } else {
2536 /* Give the grace period a kick. */
David Brazdil0f672f62019-12-10 10:32:29 +00002537 rdp->blimit = DEFAULT_MAX_RCU_BLIMIT;
2538 if (rcu_state.n_force_qs == rdp->n_force_qs_snap &&
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002539 rcu_segcblist_first_pend_cb(&rdp->cblist) != head)
David Brazdil0f672f62019-12-10 10:32:29 +00002540 rcu_force_quiescent_state();
2541 rdp->n_force_qs_snap = rcu_state.n_force_qs;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002542 rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
2543 }
2544 }
2545}
2546
2547/*
2548 * RCU callback function to leak a callback.
2549 */
static void rcu_leak_callback(struct rcu_head *rhp)
{
	/* Intentionally empty: @rhp is neither freed nor touched. */
}
2553
/*
 * Helper function for call_rcu() and friends.  Queues @head, with @func
 * as its callback function, on the current CPU's callback list.  The
 * @lazy flag is passed through to rcu_segcblist_enqueue(): call_rcu()
 * passes false and kfree_call_rcu() passes true.
 */
2560static void
David Brazdil0f672f62019-12-10 10:32:29 +00002561__call_rcu(struct rcu_head *head, rcu_callback_t func, bool lazy)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002562{
2563 unsigned long flags;
2564 struct rcu_data *rdp;
David Brazdil0f672f62019-12-10 10:32:29 +00002565 bool was_alldone;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002566
2567 /* Misaligned rcu_head! */
2568 WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1));
2569
2570 if (debug_rcu_head_queue(head)) {
2571 /*
2572 * Probable double call_rcu(), so leak the callback.
2573 * Use rcu:rcu_callback trace event to find the previous
2574 * time callback was passed to __call_rcu().
2575 */
David Brazdil0f672f62019-12-10 10:32:29 +00002576 WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pS()!!!\n",
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002577 head, head->func);
2578 WRITE_ONCE(head->func, rcu_leak_callback);
2579 return;
2580 }
2581 head->func = func;
2582 head->next = NULL;
2583 local_irq_save(flags);
David Brazdil0f672f62019-12-10 10:32:29 +00002584 rdp = this_cpu_ptr(&rcu_data);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002585
2586 /* Add the callback to our list. */
David Brazdil0f672f62019-12-10 10:32:29 +00002587 if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist))) {
2588 // This can trigger due to call_rcu() from offline CPU:
2589 WARN_ON_ONCE(rcu_scheduler_active != RCU_SCHEDULER_INACTIVE);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002590 WARN_ON_ONCE(!rcu_is_watching());
David Brazdil0f672f62019-12-10 10:32:29 +00002591 // Very early boot, before rcu_init(). Initialize if needed
2592 // and then drop through to queue the callback.
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002593 if (rcu_segcblist_empty(&rdp->cblist))
2594 rcu_segcblist_init(&rdp->cblist);
2595 }
David Brazdil0f672f62019-12-10 10:32:29 +00002596 if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags))
2597 return; // Enqueued onto ->nocb_bypass, so just leave.
2598 /* If we get here, rcu_nocb_try_bypass() acquired ->nocb_lock. */
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002599 rcu_segcblist_enqueue(&rdp->cblist, head, lazy);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002600 if (__is_kfree_rcu_offset((unsigned long)func))
David Brazdil0f672f62019-12-10 10:32:29 +00002601 trace_rcu_kfree_callback(rcu_state.name, head,
2602 (unsigned long)func,
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002603 rcu_segcblist_n_lazy_cbs(&rdp->cblist),
2604 rcu_segcblist_n_cbs(&rdp->cblist));
2605 else
David Brazdil0f672f62019-12-10 10:32:29 +00002606 trace_rcu_callback(rcu_state.name, head,
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002607 rcu_segcblist_n_lazy_cbs(&rdp->cblist),
2608 rcu_segcblist_n_cbs(&rdp->cblist));
2609
2610 /* Go handle any RCU core processing required. */
David Brazdil0f672f62019-12-10 10:32:29 +00002611 if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
2612 unlikely(rcu_segcblist_is_offloaded(&rdp->cblist))) {
2613 __call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */
2614 } else {
2615 __call_rcu_core(rdp, head, flags);
2616 local_irq_restore(flags);
2617 }
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002618}
2619
2620/**
David Brazdil0f672f62019-12-10 10:32:29 +00002621 * call_rcu() - Queue an RCU callback for invocation after a grace period.
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002622 * @head: structure to be used for queueing the RCU updates.
2623 * @func: actual callback function to be invoked after the grace period
2624 *
2625 * The callback function will be invoked some time after a full grace
David Brazdil0f672f62019-12-10 10:32:29 +00002626 * period elapses, in other words after all pre-existing RCU read-side
2627 * critical sections have completed. However, the callback function
2628 * might well execute concurrently with RCU read-side critical sections
2629 * that started after call_rcu() was invoked. RCU read-side critical
2630 * sections are delimited by rcu_read_lock() and rcu_read_unlock(), and
2631 * may be nested. In addition, regions of code across which interrupts,
2632 * preemption, or softirqs have been disabled also serve as RCU read-side
2633 * critical sections. This includes hardware interrupt handlers, softirq
2634 * handlers, and NMI handlers.
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002635 *
David Brazdil0f672f62019-12-10 10:32:29 +00002636 * Note that all CPUs must agree that the grace period extended beyond
2637 * all pre-existing RCU read-side critical section. On systems with more
2638 * than one CPU, this means that when "func()" is invoked, each CPU is
2639 * guaranteed to have executed a full memory barrier since the end of its
2640 * last RCU read-side critical section whose beginning preceded the call
2641 * to call_rcu(). It also means that each CPU executing an RCU read-side
2642 * critical section that continues beyond the start of "func()" must have
2643 * executed a memory barrier after the call_rcu() but before the beginning
2644 * of that RCU read-side critical section. Note that these guarantees
2645 * include CPUs that are offline, idle, or executing in user mode, as
2646 * well as CPUs that are executing in the kernel.
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002647 *
David Brazdil0f672f62019-12-10 10:32:29 +00002648 * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
2649 * resulting RCU callback function "func()", then both CPU A and CPU B are
2650 * guaranteed to execute a full memory barrier during the time interval
2651 * between the call to call_rcu() and the invocation of "func()" -- even
2652 * if CPU A and CPU B are the same CPU (but again only if the system has
2653 * more than one CPU).
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002654 */
David Brazdil0f672f62019-12-10 10:32:29 +00002655void call_rcu(struct rcu_head *head, rcu_callback_t func)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002656{
David Brazdil0f672f62019-12-10 10:32:29 +00002657 __call_rcu(head, func, 0);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002658}
David Brazdil0f672f62019-12-10 10:32:29 +00002659EXPORT_SYMBOL_GPL(call_rcu);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002660
2661/*
2662 * Queue an RCU callback for lazy invocation after a grace period.
2663 * This will likely be later named something like "call_rcu_lazy()",
2664 * but this change will require some way of tagging the lazy RCU
2665 * callbacks in the list of pending callbacks. Until then, this
2666 * function may only be called from __kfree_rcu().
2667 */
David Brazdil0f672f62019-12-10 10:32:29 +00002668void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002669{
David Brazdil0f672f62019-12-10 10:32:29 +00002670 __call_rcu(head, func, 1);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002671}
2672EXPORT_SYMBOL_GPL(kfree_call_rcu);
2673
/*
 * During early boot, any blocking grace-period wait automatically
 * implies a grace period.  Later on, this is never the case for PREEMPT.
 *
 * However, because a context switch is a grace period for !PREEMPT, any
 * blocking grace-period wait automatically implies a grace period if
 * there is only one CPU online at any point in time during execution of
 * either synchronize_rcu() or synchronize_rcu_expedited().  It is OK to
 * occasionally incorrectly indicate that there are multiple CPUs online
 * when there was in fact only one the whole time, as this just adds some
 * overhead: RCU still operates correctly.
 */
2686static int rcu_blocking_is_gp(void)
2687{
2688 int ret;
2689
David Brazdil0f672f62019-12-10 10:32:29 +00002690 if (IS_ENABLED(CONFIG_PREEMPTION))
2691 return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002692 might_sleep(); /* Check for RCU read-side critical section. */
2693 preempt_disable();
2694 ret = num_online_cpus() <= 1;
2695 preempt_enable();
2696 return ret;
2697}
2698
2699/**
David Brazdil0f672f62019-12-10 10:32:29 +00002700 * synchronize_rcu - wait until a grace period has elapsed.
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002701 *
David Brazdil0f672f62019-12-10 10:32:29 +00002702 * Control will return to the caller some time after a full grace
2703 * period has elapsed, in other words after all currently executing RCU
2704 * read-side critical sections have completed. Note, however, that
2705 * upon return from synchronize_rcu(), the caller might well be executing
2706 * concurrently with new RCU read-side critical sections that began while
2707 * synchronize_rcu() was waiting. RCU read-side critical sections are
2708 * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
2709 * In addition, regions of code across which interrupts, preemption, or
2710 * softirqs have been disabled also serve as RCU read-side critical
2711 * sections. This includes hardware interrupt handlers, softirq handlers,
2712 * and NMI handlers.
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002713 *
2714 * Note that this guarantee implies further memory-ordering guarantees.
David Brazdil0f672f62019-12-10 10:32:29 +00002715 * On systems with more than one CPU, when synchronize_rcu() returns,
2716 * each CPU is guaranteed to have executed a full memory barrier since
2717 * the end of its last RCU read-side critical section whose beginning
2718 * preceded the call to synchronize_rcu(). In addition, each CPU having
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002719 * an RCU read-side critical section that extends beyond the return from
David Brazdil0f672f62019-12-10 10:32:29 +00002720 * synchronize_rcu() is guaranteed to have executed a full memory barrier
2721 * after the beginning of synchronize_rcu() and before the beginning of
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002722 * that RCU read-side critical section. Note that these guarantees include
2723 * CPUs that are offline, idle, or executing in user mode, as well as CPUs
2724 * that are executing in the kernel.
2725 *
David Brazdil0f672f62019-12-10 10:32:29 +00002726 * Furthermore, if CPU A invoked synchronize_rcu(), which returned
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002727 * to its caller on CPU B, then both CPU A and CPU B are guaranteed
2728 * to have executed a full memory barrier during the execution of
David Brazdil0f672f62019-12-10 10:32:29 +00002729 * synchronize_rcu() -- even if CPU A and CPU B are the same CPU (but
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002730 * again only if the system has more than one CPU).
2731 */
David Brazdil0f672f62019-12-10 10:32:29 +00002732void synchronize_rcu(void)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002733{
2734 RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
2735 lock_is_held(&rcu_lock_map) ||
2736 lock_is_held(&rcu_sched_lock_map),
David Brazdil0f672f62019-12-10 10:32:29 +00002737 "Illegal synchronize_rcu() in RCU read-side critical section");
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002738 if (rcu_blocking_is_gp())
2739 return;
2740 if (rcu_gp_is_expedited())
David Brazdil0f672f62019-12-10 10:32:29 +00002741 synchronize_rcu_expedited();
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002742 else
David Brazdil0f672f62019-12-10 10:32:29 +00002743 wait_rcu_gp(call_rcu);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002744}
David Brazdil0f672f62019-12-10 10:32:29 +00002745EXPORT_SYMBOL_GPL(synchronize_rcu);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002746
2747/**
2748 * get_state_synchronize_rcu - Snapshot current RCU state
2749 *
2750 * Returns a cookie that is used by a later call to cond_synchronize_rcu()
2751 * to determine whether or not a full grace period has elapsed in the
2752 * meantime.
2753 */
2754unsigned long get_state_synchronize_rcu(void)
2755{
2756 /*
2757 * Any prior manipulation of RCU-protected data must happen
2758 * before the load from ->gp_seq.
2759 */
2760 smp_mb(); /* ^^^ */
David Brazdil0f672f62019-12-10 10:32:29 +00002761 return rcu_seq_snap(&rcu_state.gp_seq);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002762}
2763EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
2764
2765/**
2766 * cond_synchronize_rcu - Conditionally wait for an RCU grace period
2767 *
2768 * @oldstate: return value from earlier call to get_state_synchronize_rcu()
2769 *
2770 * If a full RCU grace period has elapsed since the earlier call to
2771 * get_state_synchronize_rcu(), just return. Otherwise, invoke
2772 * synchronize_rcu() to wait for a full grace period.
2773 *
2774 * Yes, this function does not take counter wrap into account. But
2775 * counter wrap is harmless. If the counter wraps, we have waited for
2776 * more than 2 billion grace periods (and way more on a 64-bit system!),
2777 * so waiting for one additional grace period should be just fine.
2778 */
2779void cond_synchronize_rcu(unsigned long oldstate)
2780{
David Brazdil0f672f62019-12-10 10:32:29 +00002781 if (!rcu_seq_done(&rcu_state.gp_seq, oldstate))
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002782 synchronize_rcu();
2783 else
2784 smp_mb(); /* Ensure GP ends before subsequent accesses. */
2785}
2786EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
2787
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002788/*
David Brazdil0f672f62019-12-10 10:32:29 +00002789 * Check to see if there is any immediate RCU-related work to be done by
2790 * the current CPU, returning 1 if so and zero otherwise. The checks are
2791 * in order of increasing expense: checks that can be carried out against
2792 * CPU-local state are performed first. However, we must check for CPU
2793 * stalls first, else we might not get a chance.
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002794 */
David Brazdil0f672f62019-12-10 10:32:29 +00002795static int rcu_pending(void)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002796{
David Brazdil0f672f62019-12-10 10:32:29 +00002797 struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002798 struct rcu_node *rnp = rdp->mynode;
2799
2800 /* Check for CPU stalls, if enabled. */
David Brazdil0f672f62019-12-10 10:32:29 +00002801 check_cpu_stall(rdp);
2802
2803 /* Does this CPU need a deferred NOCB wakeup? */
2804 if (rcu_nocb_need_deferred_wakeup(rdp))
2805 return 1;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002806
2807 /* Is this CPU a NO_HZ_FULL CPU that should ignore RCU? */
David Brazdil0f672f62019-12-10 10:32:29 +00002808 if (rcu_nohz_full_cpu())
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002809 return 0;
2810
2811 /* Is the RCU core waiting for a quiescent state from this CPU? */
2812 if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm)
2813 return 1;
2814
2815 /* Does this CPU have callbacks ready to invoke? */
2816 if (rcu_segcblist_ready_cbs(&rdp->cblist))
2817 return 1;
2818
2819 /* Has RCU gone idle with this CPU needing another grace period? */
David Brazdil0f672f62019-12-10 10:32:29 +00002820 if (!rcu_gp_in_progress() &&
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002821 rcu_segcblist_is_enabled(&rdp->cblist) &&
David Brazdil0f672f62019-12-10 10:32:29 +00002822 (!IS_ENABLED(CONFIG_RCU_NOCB_CPU) ||
2823 !rcu_segcblist_is_offloaded(&rdp->cblist)) &&
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002824 !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
2825 return 1;
2826
2827 /* Have RCU grace period completed or started? */
2828 if (rcu_seq_current(&rnp->gp_seq) != rdp->gp_seq ||
2829 unlikely(READ_ONCE(rdp->gpwrap))) /* outside lock */
2830 return 1;
2831
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002832 /* nothing to do */
2833 return 0;
2834}
2835
2836/*
David Brazdil0f672f62019-12-10 10:32:29 +00002837 * Helper function for rcu_barrier() tracing. If tracing is disabled,
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002838 * the compiler is expected to optimize this away.
2839 */
David Brazdil0f672f62019-12-10 10:32:29 +00002840static void rcu_barrier_trace(const char *s, int cpu, unsigned long done)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002841{
David Brazdil0f672f62019-12-10 10:32:29 +00002842 trace_rcu_barrier(rcu_state.name, s, cpu,
2843 atomic_read(&rcu_state.barrier_cpu_count), done);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002844}
2845
2846/*
David Brazdil0f672f62019-12-10 10:32:29 +00002847 * RCU callback function for rcu_barrier(). If we are last, wake
2848 * up the task executing rcu_barrier().
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002849 */
2850static void rcu_barrier_callback(struct rcu_head *rhp)
2851{
David Brazdil0f672f62019-12-10 10:32:29 +00002852 if (atomic_dec_and_test(&rcu_state.barrier_cpu_count)) {
2853 rcu_barrier_trace(TPS("LastCB"), -1,
2854 rcu_state.barrier_sequence);
2855 complete(&rcu_state.barrier_completion);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002856 } else {
David Brazdil0f672f62019-12-10 10:32:29 +00002857 rcu_barrier_trace(TPS("CB"), -1, rcu_state.barrier_sequence);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002858 }
2859}
2860
2861/*
2862 * Called with preemption disabled, and from cross-cpu IRQ context.
2863 */
David Brazdil0f672f62019-12-10 10:32:29 +00002864static void rcu_barrier_func(void *unused)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002865{
David Brazdil0f672f62019-12-10 10:32:29 +00002866 struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002867
David Brazdil0f672f62019-12-10 10:32:29 +00002868 rcu_barrier_trace(TPS("IRQ"), -1, rcu_state.barrier_sequence);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002869 rdp->barrier_head.func = rcu_barrier_callback;
2870 debug_rcu_head_queue(&rdp->barrier_head);
David Brazdil0f672f62019-12-10 10:32:29 +00002871 rcu_nocb_lock(rdp);
2872 WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002873 if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) {
David Brazdil0f672f62019-12-10 10:32:29 +00002874 atomic_inc(&rcu_state.barrier_cpu_count);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002875 } else {
2876 debug_rcu_head_unqueue(&rdp->barrier_head);
David Brazdil0f672f62019-12-10 10:32:29 +00002877 rcu_barrier_trace(TPS("IRQNQ"), -1,
2878 rcu_state.barrier_sequence);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002879 }
David Brazdil0f672f62019-12-10 10:32:29 +00002880 rcu_nocb_unlock(rdp);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002881}
2882
David Brazdil0f672f62019-12-10 10:32:29 +00002883/**
2884 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
2885 *
2886 * Note that this primitive does not necessarily wait for an RCU grace period
2887 * to complete. For example, if there are no RCU callbacks queued anywhere
2888 * in the system, then rcu_barrier() is within its rights to return
2889 * immediately, without waiting for anything, much less an RCU grace period.
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002890 */
David Brazdil0f672f62019-12-10 10:32:29 +00002891void rcu_barrier(void)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002892{
2893 int cpu;
2894 struct rcu_data *rdp;
David Brazdil0f672f62019-12-10 10:32:29 +00002895 unsigned long s = rcu_seq_snap(&rcu_state.barrier_sequence);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002896
David Brazdil0f672f62019-12-10 10:32:29 +00002897 rcu_barrier_trace(TPS("Begin"), -1, s);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002898
2899 /* Take mutex to serialize concurrent rcu_barrier() requests. */
David Brazdil0f672f62019-12-10 10:32:29 +00002900 mutex_lock(&rcu_state.barrier_mutex);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002901
2902 /* Did someone else do our work for us? */
David Brazdil0f672f62019-12-10 10:32:29 +00002903 if (rcu_seq_done(&rcu_state.barrier_sequence, s)) {
2904 rcu_barrier_trace(TPS("EarlyExit"), -1,
2905 rcu_state.barrier_sequence);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002906 smp_mb(); /* caller's subsequent code after above check. */
David Brazdil0f672f62019-12-10 10:32:29 +00002907 mutex_unlock(&rcu_state.barrier_mutex);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002908 return;
2909 }
2910
2911 /* Mark the start of the barrier operation. */
David Brazdil0f672f62019-12-10 10:32:29 +00002912 rcu_seq_start(&rcu_state.barrier_sequence);
2913 rcu_barrier_trace(TPS("Inc1"), -1, rcu_state.barrier_sequence);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002914
2915 /*
2916 * Initialize the count to one rather than to zero in order to
2917 * avoid a too-soon return to zero in case of a short grace period
2918 * (or preemption of this task). Exclude CPU-hotplug operations
2919 * to ensure that no offline CPU has callbacks queued.
2920 */
David Brazdil0f672f62019-12-10 10:32:29 +00002921 init_completion(&rcu_state.barrier_completion);
2922 atomic_set(&rcu_state.barrier_cpu_count, 1);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002923 get_online_cpus();
2924
2925 /*
2926 * Force each CPU with callbacks to register a new callback.
2927 * When that callback is invoked, we will know that all of the
2928 * corresponding CPU's preceding callbacks have been invoked.
2929 */
2930 for_each_possible_cpu(cpu) {
David Brazdil0f672f62019-12-10 10:32:29 +00002931 rdp = per_cpu_ptr(&rcu_data, cpu);
2932 if (!cpu_online(cpu) &&
2933 !rcu_segcblist_is_offloaded(&rdp->cblist))
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002934 continue;
David Brazdil0f672f62019-12-10 10:32:29 +00002935 if (rcu_segcblist_n_cbs(&rdp->cblist)) {
2936 rcu_barrier_trace(TPS("OnlineQ"), cpu,
2937 rcu_state.barrier_sequence);
2938 smp_call_function_single(cpu, rcu_barrier_func, NULL, 1);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002939 } else {
David Brazdil0f672f62019-12-10 10:32:29 +00002940 rcu_barrier_trace(TPS("OnlineNQ"), cpu,
2941 rcu_state.barrier_sequence);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002942 }
2943 }
2944 put_online_cpus();
2945
2946 /*
2947 * Now that we have an rcu_barrier_callback() callback on each
2948 * CPU, and thus each counted, remove the initial count.
2949 */
David Brazdil0f672f62019-12-10 10:32:29 +00002950 if (atomic_dec_and_test(&rcu_state.barrier_cpu_count))
2951 complete(&rcu_state.barrier_completion);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002952
2953 /* Wait for all rcu_barrier_callback() callbacks to be invoked. */
David Brazdil0f672f62019-12-10 10:32:29 +00002954 wait_for_completion(&rcu_state.barrier_completion);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002955
2956 /* Mark the end of the barrier operation. */
David Brazdil0f672f62019-12-10 10:32:29 +00002957 rcu_barrier_trace(TPS("Inc2"), -1, rcu_state.barrier_sequence);
2958 rcu_seq_end(&rcu_state.barrier_sequence);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002959
2960 /* Other rcu_barrier() invocations can now safely proceed. */
David Brazdil0f672f62019-12-10 10:32:29 +00002961 mutex_unlock(&rcu_state.barrier_mutex);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002962}
David Brazdil0f672f62019-12-10 10:32:29 +00002963EXPORT_SYMBOL_GPL(rcu_barrier);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002964
2965/*
2966 * Propagate ->qsinitmask bits up the rcu_node tree to account for the
2967 * first CPU in a given leaf rcu_node structure coming online. The caller
2968 * must hold the corresponding leaf rcu_node ->lock with interrrupts
2969 * disabled.
2970 */
2971static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
2972{
2973 long mask;
2974 long oldmask;
2975 struct rcu_node *rnp = rnp_leaf;
2976
2977 raw_lockdep_assert_held_rcu_node(rnp_leaf);
2978 WARN_ON_ONCE(rnp->wait_blkd_tasks);
2979 for (;;) {
2980 mask = rnp->grpmask;
2981 rnp = rnp->parent;
2982 if (rnp == NULL)
2983 return;
2984 raw_spin_lock_rcu_node(rnp); /* Interrupts already disabled. */
2985 oldmask = rnp->qsmaskinit;
2986 rnp->qsmaskinit |= mask;
2987 raw_spin_unlock_rcu_node(rnp); /* Interrupts remain disabled. */
2988 if (oldmask)
2989 return;
2990 }
2991}
2992
2993/*
2994 * Do boot-time initialization of a CPU's per-CPU RCU data.
2995 */
2996static void __init
David Brazdil0f672f62019-12-10 10:32:29 +00002997rcu_boot_init_percpu_data(int cpu)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00002998{
David Brazdil0f672f62019-12-10 10:32:29 +00002999 struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003000
3001 /* Set up local state, ensuring consistent view of global state. */
3002 rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu);
David Brazdil0f672f62019-12-10 10:32:29 +00003003 WARN_ON_ONCE(rdp->dynticks_nesting != 1);
3004 WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp)));
3005 rdp->rcu_ofl_gp_seq = rcu_state.gp_seq;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003006 rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED;
David Brazdil0f672f62019-12-10 10:32:29 +00003007 rdp->rcu_onl_gp_seq = rcu_state.gp_seq;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003008 rdp->rcu_onl_gp_flags = RCU_GP_CLEANED;
3009 rdp->cpu = cpu;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003010 rcu_boot_init_nocb_percpu_data(rdp);
3011}
3012
3013/*
David Brazdil0f672f62019-12-10 10:32:29 +00003014 * Invoked early in the CPU-online process, when pretty much all services
3015 * are available. The incoming CPU is not present.
3016 *
3017 * Initializes a CPU's per-CPU RCU data. Note that only one online or
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003018 * offline event can be happening at a given time. Note also that we can
3019 * accept some slop in the rsp->gp_seq access due to the fact that this
David Brazdil0f672f62019-12-10 10:32:29 +00003020 * CPU cannot possibly have any non-offloaded RCU callbacks in flight yet.
3021 * And any offloaded callbacks are being numbered elsewhere.
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003022 */
David Brazdil0f672f62019-12-10 10:32:29 +00003023int rcutree_prepare_cpu(unsigned int cpu)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003024{
3025 unsigned long flags;
David Brazdil0f672f62019-12-10 10:32:29 +00003026 struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
3027 struct rcu_node *rnp = rcu_get_root();
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003028
3029 /* Set up local state, ensuring consistent view of global state. */
3030 raw_spin_lock_irqsave_rcu_node(rnp, flags);
3031 rdp->qlen_last_fqs_check = 0;
David Brazdil0f672f62019-12-10 10:32:29 +00003032 rdp->n_force_qs_snap = rcu_state.n_force_qs;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003033 rdp->blimit = blimit;
3034 if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */
David Brazdil0f672f62019-12-10 10:32:29 +00003035 !rcu_segcblist_is_offloaded(&rdp->cblist))
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003036 rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */
David Brazdil0f672f62019-12-10 10:32:29 +00003037 rdp->dynticks_nesting = 1; /* CPU not up, no tearing. */
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003038 rcu_dynticks_eqs_online();
3039 raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
3040
3041 /*
3042 * Add CPU to leaf rcu_node pending-online bitmask. Any needed
3043 * propagation up the rcu_node tree will happen at the beginning
3044 * of the next grace period.
3045 */
3046 rnp = rdp->mynode;
3047 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
3048 rdp->beenonline = true; /* We have now been online. */
3049 rdp->gp_seq = rnp->gp_seq;
3050 rdp->gp_seq_needed = rnp->gp_seq;
3051 rdp->cpu_no_qs.b.norm = true;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003052 rdp->core_needs_qs = false;
3053 rdp->rcu_iw_pending = false;
3054 rdp->rcu_iw_gp_seq = rnp->gp_seq - 1;
David Brazdil0f672f62019-12-10 10:32:29 +00003055 trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl"));
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003056 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003057 rcu_prepare_kthreads(cpu);
David Brazdil0f672f62019-12-10 10:32:29 +00003058 rcu_spawn_cpu_nocb_kthread(cpu);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003059
3060 return 0;
3061}
3062
3063/*
3064 * Update RCU priority boot kthread affinity for CPU-hotplug changes.
3065 */
3066static void rcutree_affinity_setting(unsigned int cpu, int outgoing)
3067{
David Brazdil0f672f62019-12-10 10:32:29 +00003068 struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003069
3070 rcu_boost_kthread_setaffinity(rdp->mynode, outgoing);
3071}
3072
3073/*
3074 * Near the end of the CPU-online process. Pretty much all services
3075 * enabled, and the CPU is now very much alive.
3076 */
3077int rcutree_online_cpu(unsigned int cpu)
3078{
3079 unsigned long flags;
3080 struct rcu_data *rdp;
3081 struct rcu_node *rnp;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003082
David Brazdil0f672f62019-12-10 10:32:29 +00003083 rdp = per_cpu_ptr(&rcu_data, cpu);
3084 rnp = rdp->mynode;
3085 raw_spin_lock_irqsave_rcu_node(rnp, flags);
3086 rnp->ffmask |= rdp->grpmask;
3087 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003088 if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
3089 return 0; /* Too early in boot for scheduler work. */
3090 sync_sched_exp_online_cleanup(cpu);
3091 rcutree_affinity_setting(cpu, -1);
3092 return 0;
3093}
3094
3095/*
3096 * Near the beginning of the process. The CPU is still very much alive
3097 * with pretty much all services enabled.
3098 */
3099int rcutree_offline_cpu(unsigned int cpu)
3100{
3101 unsigned long flags;
3102 struct rcu_data *rdp;
3103 struct rcu_node *rnp;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003104
David Brazdil0f672f62019-12-10 10:32:29 +00003105 rdp = per_cpu_ptr(&rcu_data, cpu);
3106 rnp = rdp->mynode;
3107 raw_spin_lock_irqsave_rcu_node(rnp, flags);
3108 rnp->ffmask &= ~rdp->grpmask;
3109 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003110
3111 rcutree_affinity_setting(cpu, cpu);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003112 return 0;
3113}
3114
3115static DEFINE_PER_CPU(int, rcu_cpu_started);
3116
3117/*
3118 * Mark the specified CPU as being online so that subsequent grace periods
3119 * (both expedited and normal) will wait on it. Note that this means that
3120 * incoming CPUs are not allowed to use RCU read-side critical sections
3121 * until this function is called. Failing to observe this restriction
3122 * will result in lockdep splats.
3123 *
3124 * Note that this function is special in that it is invoked directly
3125 * from the incoming CPU rather than from the cpuhp_step mechanism.
3126 * This is because this function must be invoked at a precise location.
3127 */
3128void rcu_cpu_starting(unsigned int cpu)
3129{
3130 unsigned long flags;
3131 unsigned long mask;
3132 int nbits;
3133 unsigned long oldmask;
3134 struct rcu_data *rdp;
3135 struct rcu_node *rnp;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003136
3137 if (per_cpu(rcu_cpu_started, cpu))
3138 return;
3139
3140 per_cpu(rcu_cpu_started, cpu) = 1;
3141
David Brazdil0f672f62019-12-10 10:32:29 +00003142 rdp = per_cpu_ptr(&rcu_data, cpu);
3143 rnp = rdp->mynode;
3144 mask = rdp->grpmask;
3145 raw_spin_lock_irqsave_rcu_node(rnp, flags);
3146 rnp->qsmaskinitnext |= mask;
3147 oldmask = rnp->expmaskinitnext;
3148 rnp->expmaskinitnext |= mask;
3149 oldmask ^= rnp->expmaskinitnext;
3150 nbits = bitmap_weight(&oldmask, BITS_PER_LONG);
3151 /* Allow lockless access for expedited grace periods. */
3152 smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + nbits); /* ^^^ */
3153 rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */
3154 rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq);
3155 rdp->rcu_onl_gp_flags = READ_ONCE(rcu_state.gp_flags);
3156 if (rnp->qsmask & mask) { /* RCU waiting on incoming CPU? */
3157 /* Report QS -after- changing ->qsmaskinitnext! */
3158 rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
3159 } else {
3160 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003161 }
3162 smp_mb(); /* Ensure RCU read-side usage follows above initialization. */
3163}
3164
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003165/*
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003166 * The outgoing function has no further need of RCU, so remove it from
David Brazdil0f672f62019-12-10 10:32:29 +00003167 * the rcu_node tree's ->qsmaskinitnext bit masks.
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003168 *
3169 * Note that this function is special in that it is invoked directly
3170 * from the outgoing CPU rather than from the cpuhp_step mechanism.
3171 * This is because this function must be invoked at a precise location.
3172 */
3173void rcu_report_dead(unsigned int cpu)
3174{
David Brazdil0f672f62019-12-10 10:32:29 +00003175 unsigned long flags;
3176 unsigned long mask;
3177 struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
3178 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003179
David Brazdil0f672f62019-12-10 10:32:29 +00003180 /* QS for any half-done expedited grace period. */
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003181 preempt_disable();
David Brazdil0f672f62019-12-10 10:32:29 +00003182 rcu_report_exp_rdp(this_cpu_ptr(&rcu_data));
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003183 preempt_enable();
David Brazdil0f672f62019-12-10 10:32:29 +00003184 rcu_preempt_deferred_qs(current);
3185
3186 /* Remove outgoing CPU from mask in the leaf rcu_node structure. */
3187 mask = rdp->grpmask;
3188 raw_spin_lock(&rcu_state.ofl_lock);
3189 raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
3190 rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq);
3191 rdp->rcu_ofl_gp_flags = READ_ONCE(rcu_state.gp_flags);
3192 if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */
3193 /* Report quiescent state -before- changing ->qsmaskinitnext! */
3194 rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
3195 raw_spin_lock_irqsave_rcu_node(rnp, flags);
3196 }
3197 rnp->qsmaskinitnext &= ~mask;
3198 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
3199 raw_spin_unlock(&rcu_state.ofl_lock);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003200
3201 per_cpu(rcu_cpu_started, cpu) = 0;
3202}
3203
#ifdef CONFIG_HOTPLUG_CPU
/*
 * The outgoing CPU has just passed through the dying-idle state, and we
 * are being invoked from the CPU that was IPIed to continue the offline
 * operation.  Migrate the outgoing CPU's callbacks to the current CPU.
 *
 * Nothing is done if the outgoing CPU's callback list is offloaded or
 * empty.  On return, the outgoing CPU's callback list has been disabled.
 */
void rcutree_migrate_callbacks(int cpu)
{
	unsigned long flags;
	bool needwake;
	struct rcu_data *my_rdp;
	struct rcu_node *my_rnp;
	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);

	/* No callbacks to migrate? */
	if (rcu_segcblist_is_offloaded(&rdp->cblist) ||
	    rcu_segcblist_empty(&rdp->cblist))
		return;

	local_irq_save(flags);
	my_rdp = this_cpu_ptr(&rcu_data);
	my_rnp = my_rdp->mynode;
	rcu_nocb_lock(my_rdp);			/* irqs already disabled. */
	WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies));
	raw_spin_lock_rcu_node(my_rnp);		/* irqs already disabled. */

	/*
	 * Leverage recent GPs and set GP for new callbacks.  Once a
	 * wakeup is known to be needed, later rcu_advance_cbs() calls
	 * are skipped.
	 */
	needwake = rcu_advance_cbs(my_rnp, rdp);
	if (!needwake)
		needwake = rcu_advance_cbs(my_rnp, my_rdp);
	rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist);
	if (!needwake)
		needwake = rcu_advance_cbs(my_rnp, my_rdp);
	rcu_segcblist_disable(&rdp->cblist);

	/* Emptiness and callback count of our list must agree. */
	WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) !=
		     !rcu_segcblist_n_cbs(&my_rdp->cblist));

	if (!rcu_segcblist_is_offloaded(&my_rdp->cblist)) {
		rcu_nocb_unlock(my_rdp);	/* irqs remain disabled. */
		raw_spin_unlock_irqrestore_rcu_node(my_rnp, flags);
	} else {
		raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */
		__call_rcu_nocb_wake(my_rdp, true, flags);
	}

	if (needwake)
		rcu_gp_kthread_wake();
	lockdep_assert_irqs_enabled();

	/* The outgoing CPU's list must now be completely empty. */
	WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 ||
		  !rcu_segcblist_empty(&rdp->cblist),
		  "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n",
		  cpu, rcu_segcblist_n_cbs(&rdp->cblist),
		  rcu_segcblist_first_cb(&rdp->cblist));
}
#endif
3253
3254/*
3255 * On non-huge systems, use expedited RCU grace periods to make suspend
3256 * and hibernation run faster.
3257 */
3258static int rcu_pm_notify(struct notifier_block *self,
3259 unsigned long action, void *hcpu)
3260{
3261 switch (action) {
3262 case PM_HIBERNATION_PREPARE:
3263 case PM_SUSPEND_PREPARE:
David Brazdil0f672f62019-12-10 10:32:29 +00003264 rcu_expedite_gp();
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003265 break;
3266 case PM_POST_HIBERNATION:
3267 case PM_POST_SUSPEND:
David Brazdil0f672f62019-12-10 10:32:29 +00003268 rcu_unexpedite_gp();
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003269 break;
3270 default:
3271 break;
3272 }
3273 return NOTIFY_OK;
3274}
3275
3276/*
David Brazdil0f672f62019-12-10 10:32:29 +00003277 * Spawn the kthreads that handle RCU's grace periods.
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003278 */
3279static int __init rcu_spawn_gp_kthread(void)
3280{
3281 unsigned long flags;
3282 int kthread_prio_in = kthread_prio;
3283 struct rcu_node *rnp;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003284 struct sched_param sp;
3285 struct task_struct *t;
3286
3287 /* Force priority into range. */
3288 if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 2
3289 && IS_BUILTIN(CONFIG_RCU_TORTURE_TEST))
3290 kthread_prio = 2;
3291 else if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1)
3292 kthread_prio = 1;
3293 else if (kthread_prio < 0)
3294 kthread_prio = 0;
3295 else if (kthread_prio > 99)
3296 kthread_prio = 99;
3297
3298 if (kthread_prio != kthread_prio_in)
3299 pr_alert("rcu_spawn_gp_kthread(): Limited prio to %d from %d\n",
3300 kthread_prio, kthread_prio_in);
3301
3302 rcu_scheduler_fully_active = 1;
David Brazdil0f672f62019-12-10 10:32:29 +00003303 t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name);
3304 if (WARN_ONCE(IS_ERR(t), "%s: Could not start grace-period kthread, OOM is now expected behavior\n", __func__))
3305 return 0;
3306 if (kthread_prio) {
3307 sp.sched_priority = kthread_prio;
3308 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003309 }
David Brazdil0f672f62019-12-10 10:32:29 +00003310 rnp = rcu_get_root();
3311 raw_spin_lock_irqsave_rcu_node(rnp, flags);
3312 rcu_state.gp_kthread = t;
3313 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
3314 wake_up_process(t);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003315 rcu_spawn_nocb_kthreads();
3316 rcu_spawn_boost_kthreads();
Olivier Deprez0e641232021-09-23 10:07:05 +02003317 rcu_spawn_core_kthreads();
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003318 return 0;
3319}
3320early_initcall(rcu_spawn_gp_kthread);
3321
3322/*
3323 * This function is invoked towards the end of the scheduler's
3324 * initialization process. Before this is called, the idle task might
3325 * contain synchronous grace-period primitives (during which time, this idle
3326 * task is booting the system, and such primitives are no-ops). After this
3327 * function is called, any synchronous grace-period primitives are run as
3328 * expedited, with the requesting task driving the grace period forward.
3329 * A later core_initcall() rcu_set_runtime_mode() will switch to full
3330 * runtime RCU functionality.
3331 */
3332void rcu_scheduler_starting(void)
3333{
3334 WARN_ON(num_online_cpus() != 1);
3335 WARN_ON(nr_context_switches() > 0);
3336 rcu_test_sync_prims();
3337 rcu_scheduler_active = RCU_SCHEDULER_INIT;
3338 rcu_test_sync_prims();
3339}
3340
3341/*
David Brazdil0f672f62019-12-10 10:32:29 +00003342 * Helper function for rcu_init() that initializes the rcu_state structure.
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003343 */
David Brazdil0f672f62019-12-10 10:32:29 +00003344static void __init rcu_init_one(void)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003345{
3346 static const char * const buf[] = RCU_NODE_NAME_INIT;
3347 static const char * const fqs[] = RCU_FQS_NAME_INIT;
3348 static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
3349 static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
3350
3351 int levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */
3352 int cpustride = 1;
3353 int i;
3354 int j;
3355 struct rcu_node *rnp;
3356
3357 BUILD_BUG_ON(RCU_NUM_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */
3358
3359 /* Silence gcc 4.8 false positive about array index out of range. */
3360 if (rcu_num_lvls <= 0 || rcu_num_lvls > RCU_NUM_LVLS)
3361 panic("rcu_init_one: rcu_num_lvls out of range");
3362
3363 /* Initialize the level-tracking arrays. */
3364
3365 for (i = 1; i < rcu_num_lvls; i++)
David Brazdil0f672f62019-12-10 10:32:29 +00003366 rcu_state.level[i] =
3367 rcu_state.level[i - 1] + num_rcu_lvl[i - 1];
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003368 rcu_init_levelspread(levelspread, num_rcu_lvl);
3369
3370 /* Initialize the elements themselves, starting from the leaves. */
3371
3372 for (i = rcu_num_lvls - 1; i >= 0; i--) {
3373 cpustride *= levelspread[i];
David Brazdil0f672f62019-12-10 10:32:29 +00003374 rnp = rcu_state.level[i];
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003375 for (j = 0; j < num_rcu_lvl[i]; j++, rnp++) {
3376 raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock));
3377 lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock),
3378 &rcu_node_class[i], buf[i]);
3379 raw_spin_lock_init(&rnp->fqslock);
3380 lockdep_set_class_and_name(&rnp->fqslock,
3381 &rcu_fqs_class[i], fqs[i]);
David Brazdil0f672f62019-12-10 10:32:29 +00003382 rnp->gp_seq = rcu_state.gp_seq;
3383 rnp->gp_seq_needed = rcu_state.gp_seq;
3384 rnp->completedqs = rcu_state.gp_seq;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003385 rnp->qsmask = 0;
3386 rnp->qsmaskinit = 0;
3387 rnp->grplo = j * cpustride;
3388 rnp->grphi = (j + 1) * cpustride - 1;
3389 if (rnp->grphi >= nr_cpu_ids)
3390 rnp->grphi = nr_cpu_ids - 1;
3391 if (i == 0) {
3392 rnp->grpnum = 0;
3393 rnp->grpmask = 0;
3394 rnp->parent = NULL;
3395 } else {
3396 rnp->grpnum = j % levelspread[i - 1];
David Brazdil0f672f62019-12-10 10:32:29 +00003397 rnp->grpmask = BIT(rnp->grpnum);
3398 rnp->parent = rcu_state.level[i - 1] +
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003399 j / levelspread[i - 1];
3400 }
3401 rnp->level = i;
3402 INIT_LIST_HEAD(&rnp->blkd_tasks);
3403 rcu_init_one_nocb(rnp);
3404 init_waitqueue_head(&rnp->exp_wq[0]);
3405 init_waitqueue_head(&rnp->exp_wq[1]);
3406 init_waitqueue_head(&rnp->exp_wq[2]);
3407 init_waitqueue_head(&rnp->exp_wq[3]);
3408 spin_lock_init(&rnp->exp_lock);
3409 }
3410 }
3411
David Brazdil0f672f62019-12-10 10:32:29 +00003412 init_swait_queue_head(&rcu_state.gp_wq);
3413 init_swait_queue_head(&rcu_state.expedited_wq);
3414 rnp = rcu_first_leaf_node();
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003415 for_each_possible_cpu(i) {
3416 while (i > rnp->grphi)
3417 rnp++;
David Brazdil0f672f62019-12-10 10:32:29 +00003418 per_cpu_ptr(&rcu_data, i)->mynode = rnp;
3419 rcu_boot_init_percpu_data(i);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003420 }
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003421}
3422
3423/*
3424 * Compute the rcu_node tree geometry from kernel parameters. This cannot
3425 * replace the definitions in tree.h because those are needed to size
3426 * the ->node array in the rcu_state structure.
3427 */
Olivier Deprez0e641232021-09-23 10:07:05 +02003428void rcu_init_geometry(void)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003429{
3430 ulong d;
3431 int i;
Olivier Deprez0e641232021-09-23 10:07:05 +02003432 static unsigned long old_nr_cpu_ids;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003433 int rcu_capacity[RCU_NUM_LVLS];
Olivier Deprez0e641232021-09-23 10:07:05 +02003434 static bool initialized;
3435
3436 if (initialized) {
3437 /*
3438 * Warn if setup_nr_cpu_ids() had not yet been invoked,
3439 * unless nr_cpus_ids == NR_CPUS, in which case who cares?
3440 */
3441 WARN_ON_ONCE(old_nr_cpu_ids != nr_cpu_ids);
3442 return;
3443 }
3444
3445 old_nr_cpu_ids = nr_cpu_ids;
3446 initialized = true;
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003447
3448 /*
3449 * Initialize any unspecified boot parameters.
3450 * The default values of jiffies_till_first_fqs and
3451 * jiffies_till_next_fqs are set to the RCU_JIFFIES_TILL_FORCE_QS
3452 * value, which is a function of HZ, then adding one for each
3453 * RCU_JIFFIES_FQS_DIV CPUs that might be on the system.
3454 */
3455 d = RCU_JIFFIES_TILL_FORCE_QS + nr_cpu_ids / RCU_JIFFIES_FQS_DIV;
3456 if (jiffies_till_first_fqs == ULONG_MAX)
3457 jiffies_till_first_fqs = d;
3458 if (jiffies_till_next_fqs == ULONG_MAX)
3459 jiffies_till_next_fqs = d;
David Brazdil0f672f62019-12-10 10:32:29 +00003460 adjust_jiffies_till_sched_qs();
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003461
3462 /* If the compile-time values are accurate, just leave. */
3463 if (rcu_fanout_leaf == RCU_FANOUT_LEAF &&
3464 nr_cpu_ids == NR_CPUS)
3465 return;
3466 pr_info("Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%u\n",
3467 rcu_fanout_leaf, nr_cpu_ids);
3468
3469 /*
3470 * The boot-time rcu_fanout_leaf parameter must be at least two
3471 * and cannot exceed the number of bits in the rcu_node masks.
3472 * Complain and fall back to the compile-time values if this
3473 * limit is exceeded.
3474 */
3475 if (rcu_fanout_leaf < 2 ||
3476 rcu_fanout_leaf > sizeof(unsigned long) * 8) {
3477 rcu_fanout_leaf = RCU_FANOUT_LEAF;
3478 WARN_ON(1);
3479 return;
3480 }
3481
3482 /*
3483 * Compute number of nodes that can be handled an rcu_node tree
3484 * with the given number of levels.
3485 */
3486 rcu_capacity[0] = rcu_fanout_leaf;
3487 for (i = 1; i < RCU_NUM_LVLS; i++)
3488 rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT;
3489
3490 /*
3491 * The tree must be able to accommodate the configured number of CPUs.
3492 * If this limit is exceeded, fall back to the compile-time values.
3493 */
3494 if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - 1]) {
3495 rcu_fanout_leaf = RCU_FANOUT_LEAF;
3496 WARN_ON(1);
3497 return;
3498 }
3499
3500 /* Calculate the number of levels in the tree. */
3501 for (i = 0; nr_cpu_ids > rcu_capacity[i]; i++) {
3502 }
3503 rcu_num_lvls = i + 1;
3504
3505 /* Calculate the number of rcu_nodes at each level of the tree. */
3506 for (i = 0; i < rcu_num_lvls; i++) {
3507 int cap = rcu_capacity[(rcu_num_lvls - 1) - i];
3508 num_rcu_lvl[i] = DIV_ROUND_UP(nr_cpu_ids, cap);
3509 }
3510
3511 /* Calculate the total number of rcu_node structures. */
3512 rcu_num_nodes = 0;
3513 for (i = 0; i < rcu_num_lvls; i++)
3514 rcu_num_nodes += num_rcu_lvl[i];
3515}
3516
3517/*
3518 * Dump out the structure of the rcu_node combining tree associated
David Brazdil0f672f62019-12-10 10:32:29 +00003519 * with the rcu_state structure.
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003520 */
David Brazdil0f672f62019-12-10 10:32:29 +00003521static void __init rcu_dump_rcu_node_tree(void)
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003522{
3523 int level = 0;
3524 struct rcu_node *rnp;
3525
3526 pr_info("rcu_node tree layout dump\n");
3527 pr_info(" ");
David Brazdil0f672f62019-12-10 10:32:29 +00003528 rcu_for_each_node_breadth_first(rnp) {
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003529 if (rnp->level != level) {
3530 pr_cont("\n");
3531 pr_info(" ");
3532 level = rnp->level;
3533 }
3534 pr_cont("%d:%d ^%d ", rnp->grplo, rnp->grphi, rnp->grpnum);
3535 }
3536 pr_cont("\n");
3537}
3538
3539struct workqueue_struct *rcu_gp_wq;
3540struct workqueue_struct *rcu_par_gp_wq;
3541
3542void __init rcu_init(void)
3543{
3544 int cpu;
3545
3546 rcu_early_boot_tests();
3547
3548 rcu_bootup_announce();
3549 rcu_init_geometry();
David Brazdil0f672f62019-12-10 10:32:29 +00003550 rcu_init_one();
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003551 if (dump_tree)
David Brazdil0f672f62019-12-10 10:32:29 +00003552 rcu_dump_rcu_node_tree();
3553 if (use_softirq)
3554 open_softirq(RCU_SOFTIRQ, rcu_core_si);
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003555
3556 /*
3557 * We don't need protection against CPU-hotplug here because
3558 * this is called early in boot, before either interrupts
3559 * or the scheduler are operational.
3560 */
3561 pm_notifier(rcu_pm_notify, 0);
3562 for_each_online_cpu(cpu) {
3563 rcutree_prepare_cpu(cpu);
3564 rcu_cpu_starting(cpu);
3565 rcutree_online_cpu(cpu);
3566 }
3567
3568 /* Create workqueue for expedited GPs and for Tree SRCU. */
3569 rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
3570 WARN_ON(!rcu_gp_wq);
3571 rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
3572 WARN_ON(!rcu_par_gp_wq);
David Brazdil0f672f62019-12-10 10:32:29 +00003573 srcu_init();
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003574}
3575
David Brazdil0f672f62019-12-10 10:32:29 +00003576#include "tree_stall.h"
Andrew Scullb4b6d4a2019-01-02 15:54:55 +00003577#include "tree_exp.h"
3578#include "tree_plugin.h"