source: mainline/kernel/generic/src/synch/rcu.c@ e3306d04

Last change on this file since e3306d04 was e3306d04, checked in by Jiří Zárevúcky <jiri.zarevucky@…>, 7 years ago

Convert atomic_t to atomic_size_t (4): Use atomic_store instead of atomic_set

[79d74fe]1/*
2 * Copyright (c) 2012 Adam Hraska
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * - Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * - Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * - The name of the author may not be used to endorse or promote products
15 * derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
[a35b458]28
29
[79d74fe]30/** @addtogroup sync
31 * @{
32 */
33
34/**
35 * @file
[181a746]36 * @brief Preemptible read-copy update. Usable from interrupt handlers.
[1b20da0]37 *
[fbe6b65]38 * @par Podzimek-preempt-RCU (RCU_PREEMPT_PODZIMEK)
[1b20da0]39 *
[fbe6b65]40 * Podzimek-preempt-RCU is a preemptible variant of Podzimek's non-preemptible
41 * RCU algorithm [1, 2]. Grace period (GP) detection is centralized into a
42 * single detector thread. The detector requests that each cpu announces
43 * that it passed a quiescent state (QS), ie a state when the cpu is
44 * outside of an rcu reader section (CS). Cpus check for QSs during context
[1b20da0]45 * switches and when entering and exiting rcu reader sections. Once all
46 * cpus announce a QS and if there were no threads preempted in a CS, the
[fbe6b65]47 * GP ends.
[1b20da0]48 *
49 * The detector increments the global GP counter, _rcu_cur_gp, in order
50 * to start a new GP. Readers notice the new GP by comparing the changed
[fbe6b65]51 * _rcu_cur_gp to a locally stored value last_seen_gp which denotes
 52 * the last GP number for which the cpu noted an explicit QS (and issued
53 * a memory barrier). Readers check for the change in the outer-most
[1b20da0]54 * (ie not nested) rcu_read_lock()/unlock() as these functions represent
55 * a QS. The reader first executes a memory barrier (MB) in order to contain
56 * memory references within a CS (and to make changes made by writers
57 * visible in the CS following rcu_read_lock()). Next, the reader notes
[fbe6b65]58 * that it reached a QS by updating the cpu local last_seen_gp to the
59 * global GP counter, _rcu_cur_gp. Cache coherency eventually makes
60 * the updated last_seen_gp visible to the detector cpu, much like it
61 * delivered the changed _rcu_cur_gp to all cpus.
[1b20da0]62 *
63 * The detector waits a while after starting a GP and then reads each
64 * cpu's last_seen_gp to see if it reached a QS. If a cpu did not record
[fbe6b65]65 * a QS (might be a long running thread without an RCU reader CS; or cache
66 * coherency has yet to make the most current last_seen_gp visible to
67 * the detector; or the cpu is still in a CS) the cpu is interrupted
68 * via an IPI. If the IPI handler finds the cpu still in a CS, it instructs
[82719589]69 * the cpu to notify the detector once it exits the CS via a semaphore
[1b20da0]70 * (CPU->rcu.is_delaying_gp).
[fbe6b65]71 * The detector then waits on the semaphore for any cpus to exit their
[1b20da0]72 * CSs. Lastly, it waits for the last reader preempted in a CS to
[fbe6b65]73 * exit its CS if there were any and signals the end of the GP to
74 * separate reclaimer threads wired to each cpu. Reclaimers then
75 * execute the callbacks queued on each of the cpus.
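 *
 * The QS acknowledgement a reader performs in the outermost
 * rcu_read_lock()/unlock() boils down to the following check (a simplified
 * sketch of the pattern also used by sample_local_cpu() and
 * rcu_after_thread_ran() below):
 *
 * @code
 * if (CPU->rcu.last_seen_gp != _rcu_cur_gp) {
 *     // Contain the CS's memory accesses and make the writer's changes
 *     // visible to this cpu before announcing the QS.
 *     memory_barrier();
 *     CPU->rcu.last_seen_gp = _rcu_cur_gp;
 * }
 * @endcode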
[1b20da0]76 *
77 *
[fbe6b65]78 * @par A-RCU algorithm (RCU_PREEMPT_A)
[1b20da0]79 *
[fbe6b65]80 * A-RCU is based on the user space rcu algorithm in [3] utilizing signals
[1b20da0]81 * (urcu) and Podzimek's rcu [1]. Like in Podzimek's rcu, callbacks are
82 * executed by cpu-bound reclaimer threads. There is however no dedicated
83 * detector thread and the reclaimers take on the responsibilities of the
84 * detector when they need to start a new GP. A new GP is again announced
[fbe6b65]85 * and acknowledged with _rcu_cur_gp and the cpu local last_seen_gp. Unlike
[1b20da0]86 * Podzimek's rcu, cpus check explicitly for QS only during context switches.
[fbe6b65]87 * Like in urcu, rcu_read_lock()/unlock() only maintain the nesting count
88 * and never issue any memory barriers. This makes rcu_read_lock()/unlock()
89 * simple and fast.
[1b20da0]90 *
[fbe6b65]91 * If a new callback is queued for a reclaimer and no GP is in progress,
[1b20da0]92 * the reclaimer takes on the role of a detector. The detector increments
93 * _rcu_cur_gp in order to start a new GP. It waits a while to give cpus
[fbe6b65]94 * a chance to switch a context (a natural QS). Then, it examines each
95 * non-idle cpu that has yet to pass a QS via an IPI. The IPI handler
96 * sees the most current _rcu_cur_gp and last_seen_gp and notes a QS
97 * with a memory barrier and an update to last_seen_gp. If the handler
 98 * finds the cpu in a CS, it does nothing and lets the detector poll/interrupt
99 * the cpu again after a short sleep.
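 *
 * Conceptually, the A-RCU reader fast path amounts to nothing more than
 * nesting count maintenance. The following is an illustrative sketch only;
 * the actual inline implementation lives in rcu.h and differs in details:
 *
 * @code
 * void rcu_read_lock(void)
 * {
 *     THE->rcu_nesting += RCU_CNT_INC;    // no memory barrier
 * }
 *
 * void rcu_read_unlock(void)
 * {
 *     THE->rcu_nesting -= RCU_CNT_INC;    // no memory barrier
 *     // A reader that was preempted in its outermost CS takes a slow path.
 *     if (THE->rcu_nesting == RCU_WAS_PREEMPTED)
 *         _rcu_preempted_unlock();
 * }
 * @endcode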
[1b20da0]100 *
[fbe6b65]101 * @par Caveats
[1b20da0]102 *
[fbe6b65]103 * last_seen_gp and _rcu_cur_gp are always 64bit variables and they
104 * are read non-atomically on 32bit machines. Reading a clobbered
105 * value of last_seen_gp or _rcu_cur_gp or writing a clobbered value
106 * of _rcu_cur_gp to last_seen_gp will at worst force the detector
[1b20da0]107 * to unnecessarily interrupt a cpu. Interrupting a cpu makes the
[fbe6b65]108 * correct value of _rcu_cur_gp visible to the cpu and correctly
109 * resets last_seen_gp in both algorithms.
[1b20da0]110 *
111 *
112 *
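 * @par Example usage
 *
 * An illustrative reader/updater pair built on the API implemented in this
 * file. The item type example_t, the pointer shared_example and the helper
 * functions are hypothetical:
 *
 * @code
 * typedef struct example {
 *     rcu_item_t rcu_item;
 *     int value;
 * } example_t;
 *
 * static example_t *shared_example;
 *
 * static void example_free(rcu_item_t *item)
 * {
 *     example_t *e = member_to_inst(item, example_t, rcu_item);
 *     free(e);
 * }
 *
 * // Reader - may run even in an interrupt handler.
 * static int read_value(void)
 * {
 *     rcu_read_lock();
 *     example_t *e = shared_example;
 *     int val = e ? e->value : 0;
 *     rcu_read_unlock();
 *     return val;
 * }
 *
 * // Updater - publishes a new item and defers freeing of the old one
 * // until all preexisting readers have finished.
 * static void update_value(example_t *new_item)
 * {
 *     example_t *old = shared_example;
 *     // Make the new item's initialization visible before publishing it.
 *     memory_barrier();
 *     shared_example = new_item;
 *     if (old)
 *         rcu_call(&old->rcu_item, example_free);
 * }
 * @endcode
 *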
[fbe6b65]113 * [1] Read-copy-update for OpenSolaris,
114 * 2010, Podzimek
115 * https://andrej.podzimek.org/thesis.pdf
[1b20da0]116 *
[fbe6b65]117 * [2] (podzimek-rcu) implementation file "rcu.patch"
118 * http://d3s.mff.cuni.cz/projects/operating_systems/rcu/rcu.patch
[1b20da0]119 *
[fbe6b65]120 * [3] User-level implementations of read-copy update,
121 * 2012, appendix
122 * http://www.rdrop.com/users/paulmck/RCU/urcu-supp-accepted.2011.08.30a.pdf
[1b20da0]123 *
[79d74fe]124 */
[63e27ef]125
126#include <assert.h>
[79d74fe]127#include <synch/rcu.h>
[181a746]128#include <synch/condvar.h>
129#include <synch/semaphore.h>
130#include <synch/spinlock.h>
[4ec9ea41]131#include <synch/mutex.h>
[181a746]132#include <proc/thread.h>
133#include <cpu/cpu_mask.h>
134#include <cpu.h>
135#include <smp/smp_call.h>
[05882233]136#include <barrier.h>
[181a746]137#include <atomic.h>
138#include <arch.h>
139#include <macros.h>
[79d74fe]140
[1b20da0]141/*
142 * Number of milliseconds to give to preexisting readers to finish
[181a746]143 * when non-expedited grace period detection is in progress.
144 */
[0cf813d]145#define DETECT_SLEEP_MS 10
[1b20da0]146/*
147 * Max number of pending callbacks in the local cpu's queue before
[181a746]148 * aggressively expediting the current grace period
149 */
[0cf813d]150#define EXPEDITE_THRESHOLD 2000
151/*
152 * Max number of callbacks to execute in one go with preemption
153 * enabled. If there are more callbacks to be executed they will
154 * be run with preemption disabled in order to prolong reclaimer's
155 * time slice and give it a chance to catch up with callback producers.
156 */
157#define CRITICAL_THRESHOLD 30000
[181a746]158/* Half the number of values a uint32 can hold. */
159#define UINT32_MAX_HALF 2147483648U
[79d74fe]160
[1b20da0]161/**
162 * The current grace period number. Increases monotonically.
[8e3ed06]163 * Lock rcu.gp_lock or rcu.preempt_lock to get a current value.
164 */
165rcu_gp_t _rcu_cur_gp;
[181a746]166
[0cf813d]167/** Global RCU data. */
[181a746]168typedef struct rcu_data {
 169 /** Detector uses this to signal reclaimers that a grace period ended. */
170 condvar_t gp_ended;
 171 /** Reclaimers use this to notify the detector to accelerate GP detection. */
172 condvar_t expedite_now;
[1b20da0]173 /**
[d4d36f9]174 * Protects: req_gp_end_cnt, req_expedited_cnt, completed_gp, _rcu_cur_gp;
175 * or: completed_gp, _rcu_cur_gp
[181a746]176 */
177 SPINLOCK_DECLARE(gp_lock);
178 /**
[1b20da0]179 * The number of the most recently completed grace period. At most
180 * one behind _rcu_cur_gp. If equal to _rcu_cur_gp, a grace period
[8e3ed06]181 * detection is not in progress and the detector is idle.
[181a746]182 */
183 rcu_gp_t completed_gp;
[a35b458]184
[0cf813d]185 /** Protects the following 3 fields. */
[181a746]186 IRQ_SPINLOCK_DECLARE(preempt_lock);
187 /** Preexisting readers that have been preempted. */
188 list_t cur_preempted;
 189 /** Readers that have been preempted and might delay the next grace period.*/
190 list_t next_preempted;
[1b20da0]191 /**
192 * The detector is waiting for the last preempted reader
193 * in cur_preempted to announce that it exited its reader
[181a746]194 * section by up()ing remaining_readers.
195 */
196 bool preempt_blocking_det;
[a35b458]197
[d4d36f9]198#ifdef RCU_PREEMPT_A
[a35b458]199
[1b20da0]200 /**
201 * The detector waits on this semaphore for any preempted readers
[d4d36f9]202 * delaying the grace period once all cpus pass a quiescent state.
203 */
204 semaphore_t remaining_readers;
205
206#elif defined(RCU_PREEMPT_PODZIMEK)
[a35b458]207
[d4d36f9]208 /** Reclaimers notify the detector when they request more grace periods.*/
209 condvar_t req_gp_changed;
210 /** Number of grace period ends the detector was requested to announce. */
211 size_t req_gp_end_cnt;
212 /** Number of consecutive grace periods to detect quickly and aggressively.*/
213 size_t req_expedited_cnt;
[1b20da0]214 /**
[181a746]215 * Number of cpus with readers that are delaying the current GP.
216 * They will up() remaining_readers.
217 */
218 atomic_t delaying_cpu_cnt;
[1b20da0]219 /**
[d4d36f9]220 * The detector waits on this semaphore for any readers delaying the GP.
[1b20da0]221 *
222 * Each of the cpus with readers that are delaying the current GP
223 * must up() this sema once they reach a quiescent state. If there
224 * are any readers in cur_preempted (ie preempted preexisting) and
[d4d36f9]225 * they are already delaying GP detection, the last to unlock its
226 * reader section must up() this sema once.
227 */
228 semaphore_t remaining_readers;
229#endif
[a35b458]230
[4ec9ea41]231 /** Excludes simultaneous rcu_barrier() calls. */
232 mutex_t barrier_mtx;
233 /** Number of cpus that we are waiting for to complete rcu_barrier(). */
234 atomic_t barrier_wait_cnt;
235 /** rcu_barrier() waits for the completion of barrier callbacks on this wq.*/
236 waitq_t barrier_wq;
[a35b458]237
[0cf813d]238 /** Interruptible attached detector thread pointer. */
[181a746]239 thread_t *detector_thr;
[a35b458]240
[181a746]241 /* Some statistics. */
242 size_t stat_expedited_cnt;
243 size_t stat_delayed_cnt;
244 size_t stat_preempt_blocking_cnt;
245 /* Does not contain self/local calls. */
246 size_t stat_smp_call_cnt;
247} rcu_data_t;
248
249
250static rcu_data_t rcu;
251
252static void start_reclaimers(void);
253static void synch_complete(rcu_item_t *rcu_item);
[1b20da0]254static inline void rcu_call_impl(bool expedite, rcu_item_t *rcu_item,
[18b6a88]255 rcu_func_t func);
[4ec9ea41]256static void add_barrier_cb(void *arg);
257static void barrier_complete(rcu_item_t *barrier_item);
[181a746]258static bool arriving_cbs_empty(void);
259static bool next_cbs_empty(void);
260static bool cur_cbs_empty(void);
261static bool all_cbs_empty(void);
262static void reclaimer(void *arg);
263static bool wait_for_pending_cbs(void);
264static bool advance_cbs(void);
265static void exec_completed_cbs(rcu_gp_t last_completed_gp);
266static void exec_cbs(rcu_item_t **phead);
267static bool wait_for_cur_cbs_gp_end(bool expedite, rcu_gp_t *last_completed_gp);
[0cf813d]268static void upd_missed_gp_in_wait(rcu_gp_t completed_gp);
[d4d36f9]269
270#ifdef RCU_PREEMPT_PODZIMEK
271static void start_detector(void);
272static void read_unlock_impl(size_t *pnesting_cnt);
273static void req_detection(size_t req_cnt);
[181a746]274static bool cv_wait_for_gp(rcu_gp_t wait_on_gp);
275static void detector(void *);
276static bool wait_for_detect_req(void);
277static void end_cur_gp(void);
278static bool wait_for_readers(void);
279static bool gp_sleep(void);
280static void interrupt_delaying_cpus(cpu_mask_t *cpu_mask);
281static bool wait_for_delaying_cpus(void);
[d4d36f9]282#elif defined(RCU_PREEMPT_A)
283static bool wait_for_readers(bool expedite);
284static bool gp_sleep(bool *expedite);
285#endif
286
287static void start_new_gp(void);
288static void rm_quiescent_cpus(cpu_mask_t *cpu_mask);
289static void sample_cpus(cpu_mask_t *reader_cpus, void *arg);
290static void sample_local_cpu(void *);
[181a746]291static bool wait_for_preempt_reader(void);
[d4d36f9]292static void note_preempted_reader(void);
293static void rm_preempted_reader(void);
[82719589]294static void upd_max_cbs_in_slice(size_t arriving_cbs_cnt);
[181a746]295
296
297
298/** Initializes global RCU structures. */
299void rcu_init(void)
300{
301 condvar_initialize(&rcu.gp_ended);
302 condvar_initialize(&rcu.expedite_now);
[d4d36f9]303
[181a746]304 spinlock_initialize(&rcu.gp_lock, "rcu.gp_lock");
[8e3ed06]305 _rcu_cur_gp = 0;
[181a746]306 rcu.completed_gp = 0;
[a35b458]307
[181a746]308 irq_spinlock_initialize(&rcu.preempt_lock, "rcu.preempt_lock");
309 list_initialize(&rcu.cur_preempted);
310 list_initialize(&rcu.next_preempted);
311 rcu.preempt_blocking_det = false;
[a35b458]312
[4ec9ea41]313 mutex_initialize(&rcu.barrier_mtx, MUTEX_PASSIVE);
[e3306d04]314 atomic_store(&rcu.barrier_wait_cnt, 0);
[4ec9ea41]315 waitq_initialize(&rcu.barrier_wq);
[d4d36f9]316
317 semaphore_initialize(&rcu.remaining_readers, 0);
[a35b458]318
[d4d36f9]319#ifdef RCU_PREEMPT_PODZIMEK
320 condvar_initialize(&rcu.req_gp_changed);
[a35b458]321
[d4d36f9]322 rcu.req_gp_end_cnt = 0;
323 rcu.req_expedited_cnt = 0;
[e3306d04]324 atomic_store(&rcu.delaying_cpu_cnt, 0);
[d4d36f9]325#endif
[a35b458]326
[205832b]327 rcu.detector_thr = NULL;
[a35b458]328
[181a746]329 rcu.stat_expedited_cnt = 0;
330 rcu.stat_delayed_cnt = 0;
331 rcu.stat_preempt_blocking_cnt = 0;
332 rcu.stat_smp_call_cnt = 0;
333}
334
 335/** Initializes per-CPU RCU data. If on the boot cpu, also inits the global data. */
336void rcu_cpu_init(void)
337{
338 if (config.cpu_active == 1) {
339 rcu_init();
340 }
[d4d36f9]341
[181a746]342 CPU->rcu.last_seen_gp = 0;
[d4d36f9]343
344#ifdef RCU_PREEMPT_PODZIMEK
[5b03a72]345 CPU->rcu.nesting_cnt = 0;
[d4d36f9]346 CPU->rcu.is_delaying_gp = false;
347 CPU->rcu.signal_unlock = false;
348#endif
[a35b458]349
[205832b]350 CPU->rcu.cur_cbs = NULL;
[0cf813d]351 CPU->rcu.cur_cbs_cnt = 0;
[205832b]352 CPU->rcu.next_cbs = NULL;
[0cf813d]353 CPU->rcu.next_cbs_cnt = 0;
[205832b]354 CPU->rcu.arriving_cbs = NULL;
[181a746]355 CPU->rcu.parriving_cbs_tail = &CPU->rcu.arriving_cbs;
356 CPU->rcu.arriving_cbs_cnt = 0;
357
358 CPU->rcu.cur_cbs_gp = 0;
359 CPU->rcu.next_cbs_gp = 0;
[a35b458]360
[181a746]361 semaphore_initialize(&CPU->rcu.arrived_flag, 0);
[0cf813d]362
363 /* BSP creates reclaimer threads before AP's rcu_cpu_init() runs. */
364 if (config.cpu_active == 1)
[205832b]365 CPU->rcu.reclaimer_thr = NULL;
[a35b458]366
[181a746]367 CPU->rcu.stat_max_cbs = 0;
368 CPU->rcu.stat_avg_cbs = 0;
369 CPU->rcu.stat_missed_gps = 0;
[0cf813d]370 CPU->rcu.stat_missed_gp_in_wait = 0;
371 CPU->rcu.stat_max_slice_cbs = 0;
372 CPU->rcu.last_arriving_cnt = 0;
[181a746]373}
374
375/** Completes RCU init. Creates and runs the detector and reclaimer threads.*/
376void rcu_kinit_init(void)
377{
[d4d36f9]378#ifdef RCU_PREEMPT_PODZIMEK
[181a746]379 start_detector();
[d4d36f9]380#endif
[a35b458]381
[181a746]382 start_reclaimers();
383}
384
385/** Initializes any per-thread RCU structures. */
386void rcu_thread_init(thread_t *thread)
387{
388 thread->rcu.nesting_cnt = 0;
[d4d36f9]389
390#ifdef RCU_PREEMPT_PODZIMEK
[181a746]391 thread->rcu.was_preempted = false;
[d4d36f9]392#endif
[a35b458]393
[181a746]394 link_initialize(&thread->rcu.preempt_link);
395}
396
[79d74fe]397
[1b20da0]398/** Cleans up global RCU resources and stops dispatching callbacks.
399 *
[181a746]400 * Call when shutting down the kernel. Outstanding callbacks will
401 * not be processed. Instead they will linger forever.
402 */
403void rcu_stop(void)
404{
405 /* Stop and wait for reclaimers. */
406 for (unsigned int cpu_id = 0; cpu_id < config.cpu_active; ++cpu_id) {
[63e27ef]407 assert(cpus[cpu_id].rcu.reclaimer_thr != NULL);
[a35b458]408
[181a746]409 if (cpus[cpu_id].rcu.reclaimer_thr) {
410 thread_interrupt(cpus[cpu_id].rcu.reclaimer_thr);
411 thread_join(cpus[cpu_id].rcu.reclaimer_thr);
412 thread_detach(cpus[cpu_id].rcu.reclaimer_thr);
[205832b]413 cpus[cpu_id].rcu.reclaimer_thr = NULL;
[181a746]414 }
415 }
416
[d4d36f9]417#ifdef RCU_PREEMPT_PODZIMEK
[181a746]418 /* Stop the detector and wait. */
419 if (rcu.detector_thr) {
420 thread_interrupt(rcu.detector_thr);
421 thread_join(rcu.detector_thr);
422 thread_detach(rcu.detector_thr);
[205832b]423 rcu.detector_thr = NULL;
[181a746]424 }
[d4d36f9]425#endif
[181a746]426}
[79d74fe]427
[d4d36f9]428/** Returns the number of elapsed grace periods since boot. */
429uint64_t rcu_completed_gps(void)
[181a746]430{
[d4d36f9]431 spinlock_lock(&rcu.gp_lock);
432 uint64_t completed = rcu.completed_gp;
433 spinlock_unlock(&rcu.gp_lock);
[a35b458]434
[d4d36f9]435 return completed;
[181a746]436}
437
438/** Creates and runs cpu-bound reclaimer threads. */
439static void start_reclaimers(void)
440{
441 for (unsigned int cpu_id = 0; cpu_id < config.cpu_count; ++cpu_id) {
[18b6a88]442 char name[THREAD_NAME_BUFLEN] = { 0 };
[a35b458]443
[181a746]444 snprintf(name, THREAD_NAME_BUFLEN - 1, "rcu-rec/%u", cpu_id);
[a35b458]445
[1b20da0]446 cpus[cpu_id].rcu.reclaimer_thr =
[18b6a88]447 thread_create(reclaimer, NULL, TASK, THREAD_FLAG_NONE, name);
[181a746]448
[1b20da0]449 if (!cpus[cpu_id].rcu.reclaimer_thr)
[181a746]450 panic("Failed to create RCU reclaimer thread on cpu%u.", cpu_id);
451
452 thread_wire(cpus[cpu_id].rcu.reclaimer_thr, &cpus[cpu_id]);
453 thread_ready(cpus[cpu_id].rcu.reclaimer_thr);
454 }
455}
456
[d4d36f9]457#ifdef RCU_PREEMPT_PODZIMEK
458
459/** Starts the detector thread. */
460static void start_detector(void)
[181a746]461{
[1b20da0]462 rcu.detector_thr =
[18b6a88]463 thread_create(detector, NULL, TASK, THREAD_FLAG_NONE, "rcu-det");
[a35b458]464
[1b20da0]465 if (!rcu.detector_thr)
[d4d36f9]466 panic("Failed to create RCU detector thread.");
[a35b458]467
[d4d36f9]468 thread_ready(rcu.detector_thr);
[181a746]469}
470
[4a6da62]471/** Returns true if in an rcu reader section. */
472bool rcu_read_locked(void)
473{
474 preemption_disable();
[5b03a72]475 bool locked = 0 < CPU->rcu.nesting_cnt;
[4a6da62]476 preemption_enable();
[a35b458]477
[4a6da62]478 return locked;
479}
480
[1b20da0]481/** Unlocks the local reader section using the given nesting count.
482 *
483 * Preemption or interrupts must be disabled.
484 *
 485 * @param pnesting_cnt Either &CPU->rcu.nesting_cnt or
[181a746]486 * &THREAD->rcu.nesting_cnt.
[79d74fe]487 */
[8e3ed06]488static void read_unlock_impl(size_t *pnesting_cnt)
[181a746]489{
[63e27ef]490 assert(PREEMPTION_DISABLED || interrupts_disabled());
[a35b458]491
[181a746]492 if (0 == --(*pnesting_cnt)) {
[8e3ed06]493 _rcu_record_qs();
[a35b458]494
[1b20da0]495 /*
496 * The thread was preempted while in a critical section or
497 * the detector is eagerly waiting for this cpu's reader
498 * to finish.
499 *
[205832b]500 * Note that THREAD may be NULL in scheduler() and not just during boot.
[181a746]501 */
502 if ((THREAD && THREAD->rcu.was_preempted) || CPU->rcu.is_delaying_gp) {
503 /* Rechecks with disabled interrupts. */
[8e3ed06]504 _rcu_signal_read_unlock();
[181a746]505 }
506 }
507}
508
509/** If necessary, signals the detector that we exited a reader section. */
[8e3ed06]510void _rcu_signal_read_unlock(void)
[181a746]511{
[63e27ef]512 assert(PREEMPTION_DISABLED || interrupts_disabled());
[a35b458]513
[181a746]514 /*
[82719589]515 * If an interrupt occurs here (even a NMI) it may beat us to
516 * resetting .is_delaying_gp or .was_preempted and up the semaphore
517 * for us.
[181a746]518 */
[a35b458]519
[1b20da0]520 /*
[181a746]521 * If the detector is eagerly waiting for this cpu's reader to unlock,
522 * notify it that the reader did so.
523 */
[82719589]524 if (local_atomic_exchange(&CPU->rcu.is_delaying_gp, false)) {
[181a746]525 semaphore_up(&rcu.remaining_readers);
526 }
[a35b458]527
[181a746]528 /*
529 * This reader was preempted while in a reader section.
530 * We might be holding up the current GP. Notify the
531 * detector if so.
532 */
[82719589]533 if (THREAD && local_atomic_exchange(&THREAD->rcu.was_preempted, false)) {
[63e27ef]534 assert(link_used(&THREAD->rcu.preempt_link));
[181a746]535
[d4d36f9]536 rm_preempted_reader();
[181a746]537 }
[a35b458]538
[f0fcb04]539 /* If there was something to signal to the detector we have done so. */
540 CPU->rcu.signal_unlock = false;
[181a746]541}
542
[d4d36f9]543#endif /* RCU_PREEMPT_PODZIMEK */
544
[181a746]545typedef struct synch_item {
546 waitq_t wq;
547 rcu_item_t rcu_item;
548} synch_item_t;
549
550/** Blocks until all preexisting readers exit their critical sections. */
[79d74fe]551void rcu_synchronize(void)
[4ec9ea41]552{
553 _rcu_synchronize(false);
554}
555
 556/** Like rcu_synchronize(), but also expedites grace period detection. */
557void rcu_synchronize_expedite(void)
558{
559 _rcu_synchronize(true);
560}
561
562/** Blocks until all preexisting readers exit their critical sections. */
563void _rcu_synchronize(bool expedite)
[79d74fe]564{
[181a746]565 /* Calling from a reader section will deadlock. */
[63e27ef]566 assert(!rcu_read_locked());
[a35b458]567
[1b20da0]568 synch_item_t completion;
[181a746]569
570 waitq_initialize(&completion.wq);
[4ec9ea41]571 _rcu_call(expedite, &completion.rcu_item, synch_complete);
[181a746]572 waitq_sleep(&completion.wq);
[79d74fe]573}
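
/*
 * An updater that may sleep can use rcu_synchronize() instead of rcu_call()
 * to free a retired item directly. An illustrative sketch reusing the
 * hypothetical example_t/shared_example from the usage example in the
 * file header:
 *
 *   static void retire_example(void)
 *   {
 *       example_t *old = shared_example;
 *       shared_example = NULL;
 *       // Wait for every preexisting reader that may still see old.
 *       rcu_synchronize();
 *       free(old);
 *   }
 */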
574
[181a746]575/** rcu_synchronize's callback. */
576static void synch_complete(rcu_item_t *rcu_item)
577{
578 synch_item_t *completion = member_to_inst(rcu_item, synch_item_t, rcu_item);
[63e27ef]579 assert(completion);
[181a746]580 waitq_wakeup(&completion->wq, WAKEUP_FIRST);
581}
[79d74fe]582
[4ec9ea41]583/** Waits for all outstanding rcu calls to complete. */
584void rcu_barrier(void)
585{
[1b20da0]586 /*
[4ec9ea41]587 * Serialize rcu_barrier() calls so we don't overwrite cpu.barrier_item
588 * currently in use by rcu_barrier().
589 */
590 mutex_lock(&rcu.barrier_mtx);
[a35b458]591
[1b20da0]592 /*
[4ec9ea41]593 * Ensure we queue a barrier callback on all cpus before the already
594 * enqueued barrier callbacks start signaling completion.
595 */
[e3306d04]596 atomic_store(&rcu.barrier_wait_cnt, 1);
[4ec9ea41]597
598 DEFINE_CPU_MASK(cpu_mask);
599 cpu_mask_active(cpu_mask);
[a35b458]600
[4ec9ea41]601 cpu_mask_for_each(*cpu_mask, cpu_id) {
[205832b]602 smp_call(cpu_id, add_barrier_cb, NULL);
[4ec9ea41]603 }
[a35b458]604
[4ec9ea41]605 if (0 < atomic_predec(&rcu.barrier_wait_cnt)) {
606 waitq_sleep(&rcu.barrier_wq);
607 }
[a35b458]608
[4ec9ea41]609 mutex_unlock(&rcu.barrier_mtx);
610}
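
/*
 * Typical use of rcu_barrier(): ensure no callback queued via rcu_call()
 * by a subsystem is still pending before tearing that subsystem down
 * (a sketch; stop_enqueueing_items() is hypothetical):
 *
 *   // Prevent further rcu_call()s from being issued.
 *   stop_enqueueing_items();
 *   // Wait until every already queued callback has been executed.
 *   rcu_barrier();
 *   // Only now is it safe to release the resources the callbacks use.
 */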
611
[1b20da0]612/** Issues a rcu_barrier() callback on the local cpu.
613 *
614 * Executed with interrupts disabled.
[4ec9ea41]615 */
616static void add_barrier_cb(void *arg)
617{
[63e27ef]618 assert(interrupts_disabled() || PREEMPTION_DISABLED);
[4ec9ea41]619 atomic_inc(&rcu.barrier_wait_cnt);
620 rcu_call(&CPU->rcu.barrier_item, barrier_complete);
621}
622
623/** Local cpu's rcu_barrier() completion callback. */
624static void barrier_complete(rcu_item_t *barrier_item)
625{
626 /* Is this the last barrier callback completed? */
627 if (0 == atomic_predec(&rcu.barrier_wait_cnt)) {
628 /* Notify rcu_barrier() that we're done. */
629 waitq_wakeup(&rcu.barrier_wq, WAKEUP_FIRST);
630 }
631}
632
[1b20da0]633/** Adds a callback to invoke after all preexisting readers finish.
634 *
[181a746]635 * May be called from within interrupt handlers or RCU reader sections.
[1b20da0]636 *
[181a746]637 * @param rcu_item Used by RCU to track the call. Must remain
638 * until the user callback function is entered.
639 * @param func User callback function that will be invoked once a full
640 * grace period elapsed, ie at a time when all preexisting
641 * readers have finished. The callback should be short and must
642 * not block. If you must sleep, enqueue your work in the system
643 * work queue from the callback (ie workq_global_enqueue()).
[79d74fe]644 */
[181a746]645void rcu_call(rcu_item_t *rcu_item, rcu_func_t func)
[79d74fe]646{
[3648ea56]647 rcu_call_impl(false, rcu_item, func);
[79d74fe]648}
649
[181a746]650/** rcu_call() implementation. See rcu_call() for comments. */
651void _rcu_call(bool expedite, rcu_item_t *rcu_item, rcu_func_t func)
[3648ea56]652{
653 rcu_call_impl(expedite, rcu_item, func);
654}
655
656/** rcu_call() inline-able implementation. See rcu_call() for comments. */
[1b20da0]657static inline void rcu_call_impl(bool expedite, rcu_item_t *rcu_item,
[18b6a88]658 rcu_func_t func)
[181a746]659{
[63e27ef]660 assert(rcu_item);
[a35b458]661
[181a746]662 rcu_item->func = func;
[205832b]663 rcu_item->next = NULL;
[a35b458]664
[181a746]665 preemption_disable();
[79d74fe]666
[3648ea56]667 rcu_cpu_data_t *r = &CPU->rcu;
[82719589]668
[18b6a88]669 rcu_item_t **prev_tail =
670 local_atomic_exchange(&r->parriving_cbs_tail, &rcu_item->next);
[82719589]671 *prev_tail = rcu_item;
[a35b458]672
[82719589]673 /* Approximate the number of callbacks present. */
674 ++r->arriving_cbs_cnt;
[a35b458]675
[181a746]676 if (expedite) {
[3648ea56]677 r->expedite_arriving = true;
[181a746]678 }
[a35b458]679
[82719589]680 bool first_cb = (prev_tail == &CPU->rcu.arriving_cbs);
[a35b458]681
[181a746]682 /* Added first callback - notify the reclaimer. */
[82719589]683 if (first_cb && !semaphore_count_get(&r->arrived_flag)) {
[3648ea56]684 semaphore_up(&r->arrived_flag);
[181a746]685 }
[a35b458]686
[181a746]687 preemption_enable();
688}
689
690static bool cur_cbs_empty(void)
691{
[63e27ef]692 assert(THREAD && THREAD->wired);
[205832b]693 return NULL == CPU->rcu.cur_cbs;
[181a746]694}
695
696static bool next_cbs_empty(void)
697{
[63e27ef]698 assert(THREAD && THREAD->wired);
[205832b]699 return NULL == CPU->rcu.next_cbs;
[181a746]700}
701
702/** Disable interrupts to get an up-to-date result. */
703static bool arriving_cbs_empty(void)
704{
[63e27ef]705 assert(THREAD && THREAD->wired);
[1b20da0]706 /*
707 * Accessing with interrupts enabled may at worst lead to
[181a746]708 * a false negative if we race with a local interrupt handler.
709 */
[205832b]710 return NULL == CPU->rcu.arriving_cbs;
[181a746]711}
712
713static bool all_cbs_empty(void)
714{
715 return cur_cbs_empty() && next_cbs_empty() && arriving_cbs_empty();
716}
717
[d4d36f9]718
[181a746]719/** Reclaimer thread dispatches locally queued callbacks once a GP ends. */
720static void reclaimer(void *arg)
721{
[63e27ef]722 assert(THREAD && THREAD->wired);
723 assert(THREAD == CPU->rcu.reclaimer_thr);
[181a746]724
725 rcu_gp_t last_compl_gp = 0;
726 bool ok = true;
[a35b458]727
[181a746]728 while (ok && wait_for_pending_cbs()) {
[63e27ef]729 assert(CPU->rcu.reclaimer_thr == THREAD);
[a35b458]730
[0cf813d]731 exec_completed_cbs(last_compl_gp);
732
[181a746]733 bool expedite = advance_cbs();
[a35b458]734
[181a746]735 ok = wait_for_cur_cbs_gp_end(expedite, &last_compl_gp);
736 }
737}
738
739/** Waits until there are callbacks waiting to be dispatched. */
740static bool wait_for_pending_cbs(void)
741{
[1b20da0]742 if (!all_cbs_empty())
[181a746]743 return true;
744
745 bool ok = true;
[a35b458]746
[181a746]747 while (arriving_cbs_empty() && ok) {
748 ok = semaphore_down_interruptable(&CPU->rcu.arrived_flag);
749 }
[a35b458]750
[181a746]751 return ok;
752}
753
[b68ae24]754static void upd_stat_missed_gp(rcu_gp_t compl)
[181a746]755{
[b68ae24]756 if (CPU->rcu.cur_cbs_gp < compl) {
757 CPU->rcu.stat_missed_gps += (size_t)(compl - CPU->rcu.cur_cbs_gp);
[181a746]758 }
759}
760
761/** Executes all callbacks for the given completed grace period. */
762static void exec_completed_cbs(rcu_gp_t last_completed_gp)
763{
[b68ae24]764 upd_stat_missed_gp(last_completed_gp);
[a35b458]765
[0cf813d]766 /* Both next_cbs and cur_cbs GP elapsed. */
[181a746]767 if (CPU->rcu.next_cbs_gp <= last_completed_gp) {
[63e27ef]768 assert(CPU->rcu.cur_cbs_gp <= CPU->rcu.next_cbs_gp);
[a35b458]769
[0cf813d]770 size_t exec_cnt = CPU->rcu.cur_cbs_cnt + CPU->rcu.next_cbs_cnt;
[a35b458]771
[0cf813d]772 if (exec_cnt < CRITICAL_THRESHOLD) {
773 exec_cbs(&CPU->rcu.cur_cbs);
[1b20da0]774 exec_cbs(&CPU->rcu.next_cbs);
[0cf813d]775 } else {
[1b20da0]776 /*
777 * Getting overwhelmed with too many callbacks to run.
778 * Disable preemption in order to prolong our time slice
[0cf813d]779 * and catch up with updaters posting new callbacks.
780 */
781 preemption_disable();
782 exec_cbs(&CPU->rcu.cur_cbs);
[1b20da0]783 exec_cbs(&CPU->rcu.next_cbs);
[0cf813d]784 preemption_enable();
785 }
[a35b458]786
[0cf813d]787 CPU->rcu.cur_cbs_cnt = 0;
788 CPU->rcu.next_cbs_cnt = 0;
789 } else if (CPU->rcu.cur_cbs_gp <= last_completed_gp) {
790
791 if (CPU->rcu.cur_cbs_cnt < CRITICAL_THRESHOLD) {
792 exec_cbs(&CPU->rcu.cur_cbs);
793 } else {
[1b20da0]794 /*
795 * Getting overwhelmed with too many callbacks to run.
796 * Disable preemption in order to prolong our time slice
[0cf813d]797 * and catch up with updaters posting new callbacks.
798 */
799 preemption_disable();
800 exec_cbs(&CPU->rcu.cur_cbs);
801 preemption_enable();
802 }
803
804 CPU->rcu.cur_cbs_cnt = 0;
[181a746]805 }
806}
807
808/** Executes callbacks in the single-linked list. The list is left empty. */
809static void exec_cbs(rcu_item_t **phead)
810{
811 rcu_item_t *rcu_item = *phead;
812
813 while (rcu_item) {
814 /* func() may free rcu_item. Get a local copy. */
815 rcu_item_t *next = rcu_item->next;
816 rcu_func_t func = rcu_item->func;
[a35b458]817
[181a746]818 func(rcu_item);
[a35b458]819
[181a746]820 rcu_item = next;
821 }
[a35b458]822
[205832b]823 *phead = NULL;
[181a746]824}
825
826static void upd_stat_cb_cnts(size_t arriving_cnt)
827{
828 CPU->rcu.stat_max_cbs = max(arriving_cnt, CPU->rcu.stat_max_cbs);
829 if (0 < arriving_cnt) {
[1b20da0]830 CPU->rcu.stat_avg_cbs =
[18b6a88]831 (99 * CPU->rcu.stat_avg_cbs + 1 * arriving_cnt) / 100;
[181a746]832 }
833}
834
 835/** Prepares another batch of callbacks to dispatch at the next grace period.
[1b20da0]836 *
[181a746]837 * @return True if the next batch of callbacks must be expedited quickly.
[79d74fe]838 */
[181a746]839static bool advance_cbs(void)
840{
841 /* Move next_cbs to cur_cbs. */
842 CPU->rcu.cur_cbs = CPU->rcu.next_cbs;
[0cf813d]843 CPU->rcu.cur_cbs_cnt = CPU->rcu.next_cbs_cnt;
[181a746]844 CPU->rcu.cur_cbs_gp = CPU->rcu.next_cbs_gp;
[a35b458]845
[82719589]846 /* Move arriving_cbs to next_cbs. */
[a35b458]847
[82719589]848 CPU->rcu.next_cbs_cnt = CPU->rcu.arriving_cbs_cnt;
849 CPU->rcu.arriving_cbs_cnt = 0;
[a35b458]850
[1b20da0]851 /*
[181a746]852 * Too many callbacks queued. Better speed up the detection
853 * or risk exhausting all system memory.
854 */
[18b6a88]855 bool expedite = (EXPEDITE_THRESHOLD < CPU->rcu.next_cbs_cnt) ||
856 CPU->rcu.expedite_arriving;
[181a746]857 CPU->rcu.expedite_arriving = false;
[82719589]858
859 /* Start moving the arriving_cbs list to next_cbs. */
[181a746]860 CPU->rcu.next_cbs = CPU->rcu.arriving_cbs;
[a35b458]861
[1b20da0]862 /*
[82719589]863 * At least one callback arrived. The tail therefore does not point
864 * to the head of arriving_cbs and we can safely reset it to NULL.
865 */
866 if (CPU->rcu.next_cbs) {
[63e27ef]867 assert(CPU->rcu.parriving_cbs_tail != &CPU->rcu.arriving_cbs);
[a35b458]868
[82719589]869 CPU->rcu.arriving_cbs = NULL;
870 /* Reset arriving_cbs before updating the tail pointer. */
871 compiler_barrier();
872 /* Updating the tail pointer completes the move of arriving_cbs. */
873 ACCESS_ONCE(CPU->rcu.parriving_cbs_tail) = &CPU->rcu.arriving_cbs;
874 } else {
[1b20da0]875 /*
876 * arriving_cbs was null and parriving_cbs_tail pointed to it
[82719589]877 * so leave it that way. Note that interrupt handlers may have
878 * added a callback in the meantime so it is not safe to reset
 879 * arriving_cbs or parriving_cbs_tail.
880 */
881 }
[0cf813d]882
883 /* Update statistics of arrived callbacks. */
884 upd_stat_cb_cnts(CPU->rcu.next_cbs_cnt);
[a35b458]885
[1b20da0]886 /*
887 * Make changes prior to queuing next_cbs visible to readers.
[181a746]888 * See comment in wait_for_readers().
889 */
890 memory_barrier(); /* MB A, B */
891
892 /* At the end of next_cbs_gp, exec next_cbs. Determine what GP that is. */
[a35b458]893
[181a746]894 if (!next_cbs_empty()) {
895 spinlock_lock(&rcu.gp_lock);
[a35b458]896
[181a746]897 /* Exec next_cbs at the end of the next GP. */
[8e3ed06]898 CPU->rcu.next_cbs_gp = _rcu_cur_gp + 1;
[a35b458]899
[1b20da0]900 /*
[181a746]901 * There are no callbacks to invoke before next_cbs. Instruct
902 * wait_for_cur_cbs_gp() to notify us of the nearest GP end.
[1b20da0]903 * That could be sooner than next_cbs_gp (if the current GP
[181a746]904 * had not yet completed), so we'll create a shorter batch
905 * of callbacks next time around.
906 */
907 if (cur_cbs_empty()) {
908 CPU->rcu.cur_cbs_gp = rcu.completed_gp + 1;
[1b20da0]909 }
[a35b458]910
[181a746]911 spinlock_unlock(&rcu.gp_lock);
912 } else {
913 CPU->rcu.next_cbs_gp = CPU->rcu.cur_cbs_gp;
914 }
[a35b458]915
[63e27ef]916 assert(CPU->rcu.cur_cbs_gp <= CPU->rcu.next_cbs_gp);
[a35b458]917
[1b20da0]918 return expedite;
[181a746]919}
920
[d4d36f9]921
922#ifdef RCU_PREEMPT_A
923
[1b20da0]924/** Waits for the grace period associated with callbacks cur_cbs to elapse.
925 *
926 * @param expedite Instructs the detector to aggressively speed up grace
[181a746]927 * period detection without any delay.
[1b20da0]928 * @param completed_gp Returns the most recent completed grace period
[181a746]929 * number.
930 * @return false if the thread was interrupted and should stop.
931 */
932static bool wait_for_cur_cbs_gp_end(bool expedite, rcu_gp_t *completed_gp)
933{
934 spinlock_lock(&rcu.gp_lock);
[d4d36f9]935
[63e27ef]936 assert(CPU->rcu.cur_cbs_gp <= CPU->rcu.next_cbs_gp);
937 assert(CPU->rcu.cur_cbs_gp <= _rcu_cur_gp + 1);
[a35b458]938
[d4d36f9]939 while (rcu.completed_gp < CPU->rcu.cur_cbs_gp) {
940 /* GP has not yet started - start a new one. */
941 if (rcu.completed_gp == _rcu_cur_gp) {
942 start_new_gp();
943 spinlock_unlock(&rcu.gp_lock);
[181a746]944
[d4d36f9]945 if (!wait_for_readers(expedite))
946 return false;
947
948 spinlock_lock(&rcu.gp_lock);
949 /* Notify any reclaimers this GP had ended. */
950 rcu.completed_gp = _rcu_cur_gp;
951 condvar_broadcast(&rcu.gp_ended);
952 } else {
[1b20da0]953 /* GP detection is in progress.*/
[a35b458]954
[1b20da0]955 if (expedite)
[d4d36f9]956 condvar_signal(&rcu.expedite_now);
[a35b458]957
[d4d36f9]958 /* Wait for the GP to complete. */
[1b20da0]959 errno_t ret = _condvar_wait_timeout_spinlock(&rcu.gp_ended, &rcu.gp_lock,
[18b6a88]960 SYNCH_NO_TIMEOUT, SYNCH_FLAGS_INTERRUPTIBLE);
[a35b458]961
[897fd8f1]962 if (ret == EINTR) {
[d4d36f9]963 spinlock_unlock(&rcu.gp_lock);
[1b20da0]964 return false;
[d4d36f9]965 }
966 }
967 }
[a35b458]968
[09737cc]969 upd_missed_gp_in_wait(rcu.completed_gp);
[a35b458]970
[181a746]971 *completed_gp = rcu.completed_gp;
[d4d36f9]972 spinlock_unlock(&rcu.gp_lock);
[a35b458]973
[d4d36f9]974 return true;
[181a746]975}
976
[d4d36f9]977static bool wait_for_readers(bool expedite)
[0cf813d]978{
[d4d36f9]979 DEFINE_CPU_MASK(reader_cpus);
[a35b458]980
[d4d36f9]981 cpu_mask_active(reader_cpus);
982 rm_quiescent_cpus(reader_cpus);
[a35b458]983
[d4d36f9]984 while (!cpu_mask_is_none(reader_cpus)) {
985 /* Give cpus a chance to context switch (a QS) and batch callbacks. */
[18b6a88]986 if (!gp_sleep(&expedite))
[d4d36f9]987 return false;
[a35b458]988
[d4d36f9]989 rm_quiescent_cpus(reader_cpus);
990 sample_cpus(reader_cpus, reader_cpus);
991 }
[a35b458]992
[d4d36f9]993 /* Update statistic. */
994 if (expedite) {
995 ++rcu.stat_expedited_cnt;
996 }
[a35b458]997
[1b20da0]998 /*
[d4d36f9]999 * All cpus have passed through a QS and see the most recent _rcu_cur_gp.
1000 * As a result newly preempted readers will associate with next_preempted
1001 * and the number of old readers in cur_preempted will monotonically
1002 * decrease. Wait for those old/preexisting readers.
1003 */
1004 return wait_for_preempt_reader();
[0cf813d]1005}
1006
[d4d36f9]1007static bool gp_sleep(bool *expedite)
[181a746]1008{
[d4d36f9]1009 if (*expedite) {
1010 scheduler();
1011 return true;
1012 } else {
1013 spinlock_lock(&rcu.gp_lock);
[181a746]1014
[b7fd2a0]1015 errno_t ret = 0;
[d4d36f9]1016 ret = _condvar_wait_timeout_spinlock(&rcu.expedite_now, &rcu.gp_lock,
[18b6a88]1017 DETECT_SLEEP_MS * 1000, SYNCH_FLAGS_INTERRUPTIBLE);
[d4d36f9]1018
1019 /* rcu.expedite_now was signaled. */
[897fd8f1]1020 if (ret == EOK) {
[d4d36f9]1021 *expedite = true;
[181a746]1022 }
[d4d36f9]1023
1024 spinlock_unlock(&rcu.gp_lock);
1025
[897fd8f1]1026 return (ret != EINTR);
[181a746]1027 }
1028}
1029
[d4d36f9]1030static void sample_local_cpu(void *arg)
[181a746]1031{
[63e27ef]1032 assert(interrupts_disabled());
[d4d36f9]1033 cpu_mask_t *reader_cpus = (cpu_mask_t *)arg;
[a35b458]1034
[d4d36f9]1035 bool locked = RCU_CNT_INC <= THE->rcu_nesting;
[853d613]1036 /* smp_call machinery makes the most current _rcu_cur_gp visible. */
[d4d36f9]1037 bool passed_qs = (CPU->rcu.last_seen_gp == _rcu_cur_gp);
[a35b458]1038
[d4d36f9]1039 if (locked && !passed_qs) {
[1b20da0]1040 /*
[d4d36f9]1041 * This cpu has not yet passed a quiescent state during this grace
1042 * period and it is currently in a reader section. We'll have to
1043 * try to sample this cpu again later.
1044 */
1045 } else {
1046 /* Either not in a reader section or already passed a QS. */
1047 cpu_mask_reset(reader_cpus, CPU->id);
1048 /* Contain new reader sections and make prior changes visible to them.*/
1049 memory_barrier();
1050 CPU->rcu.last_seen_gp = _rcu_cur_gp;
1051 }
1052}
1053
1054/** Called by the scheduler() when switching away from the current thread. */
1055void rcu_after_thread_ran(void)
1056{
[63e27ef]1057 assert(interrupts_disabled());
[d4d36f9]1058
[1b20da0]1059 /*
 1060 * In order not to worry about an NMI seeing rcu_nesting change, work
[853d613]1061 * with a local copy.
1062 */
[82719589]1063 size_t nesting_cnt = local_atomic_exchange(&THE->rcu_nesting, 0);
[a35b458]1064
[1b20da0]1065 /*
[82719589]1066 * Ensures NMIs see .rcu_nesting without the WAS_PREEMPTED mark and
1067 * do not accidentally call rm_preempted_reader() from unlock().
1068 */
1069 compiler_barrier();
[a35b458]1070
[d4d36f9]1071 /* Preempted a reader critical section for the first time. */
[853d613]1072 if (RCU_CNT_INC <= nesting_cnt && !(nesting_cnt & RCU_WAS_PREEMPTED)) {
1073 nesting_cnt |= RCU_WAS_PREEMPTED;
[d4d36f9]1074 note_preempted_reader();
1075 }
[a35b458]1076
[d4d36f9]1077 /* Save the thread's nesting count when it is not running. */
[853d613]1078 THREAD->rcu.nesting_cnt = nesting_cnt;
[d4d36f9]1079
1080 if (CPU->rcu.last_seen_gp != _rcu_cur_gp) {
[1b20da0]1081 /*
1082 * Contain any memory accesses of old readers before announcing a QS.
[d4d36f9]1083 * Also make changes from the previous GP visible to this cpu.
[1b20da0]1084 * Moreover it separates writing to last_seen_gp from
[853d613]1085 * note_preempted_reader().
[d4d36f9]1086 */
1087 memory_barrier();
[1b20da0]1088 /*
[853d613]1089 * The preempted reader has been noted globally. There are therefore
1090 * no readers running on this cpu so this is a quiescent state.
[1b20da0]1091 *
1092 * Reading the multiword _rcu_cur_gp non-atomically is benign.
[853d613]1093 * At worst, the read value will be different from the actual value.
1094 * As a result, both the detector and this cpu will believe
1095 * this cpu has not yet passed a QS although it really did.
[1b20da0]1096 *
[fbe6b65]1097 * Reloading _rcu_cur_gp is benign, because it cannot change
1098 * until this cpu acknowledges it passed a QS by writing to
1099 * last_seen_gp. Since interrupts are disabled, only this
 1100 * code may do so (IPIs won't get through).
[853d613]1101 */
[d4d36f9]1102 CPU->rcu.last_seen_gp = _rcu_cur_gp;
1103 }
1104
[1b20da0]1105 /*
[853d613]1106 * Forcefully associate the reclaimer with the highest priority
[d4d36f9]1107 * even if preempted due to its time slice running out.
1108 */
1109 if (THREAD == CPU->rcu.reclaimer_thr) {
1110 THREAD->priority = -1;
[1b20da0]1111 }
[a35b458]1112
[82719589]1113 upd_max_cbs_in_slice(CPU->rcu.arriving_cbs_cnt);
[d4d36f9]1114}
1115
1116/** Called by the scheduler() when switching to a newly scheduled thread. */
1117void rcu_before_thread_runs(void)
1118{
[63e27ef]1119 assert(!rcu_read_locked());
[a35b458]1120
[d4d36f9]1121 /* Load the thread's saved nesting count from before it was preempted. */
1122 THE->rcu_nesting = THREAD->rcu.nesting_cnt;
1123}
1124
[1b20da0]1125/** Called from scheduler() when exiting the current thread.
1126 *
[d4d36f9]1127 * Preemption or interrupts are disabled and the scheduler() already
1128 * switched away from the current thread, calling rcu_after_thread_ran().
1129 */
1130void rcu_thread_exiting(void)
1131{
[63e27ef]1132 assert(THE->rcu_nesting == 0);
[a35b458]1133
[1b20da0]1134 /*
1135 * The thread forgot to exit its reader critical section.
[d4d36f9]1136 * It is a bug, but rather than letting the entire system lock up,
[1b20da0]1137 * forcefully leave the reader section. The thread is not holding
[d4d36f9]1138 * any references anyway since it is exiting so it is safe.
1139 */
1140 if (RCU_CNT_INC <= THREAD->rcu.nesting_cnt) {
1141 /* Emulate _rcu_preempted_unlock() with the proper nesting count. */
1142 if (THREAD->rcu.nesting_cnt & RCU_WAS_PREEMPTED) {
1143 rm_preempted_reader();
1144 }
[fbe17545]1145
1146 printf("Bug: thread (id %" PRIu64 " \"%s\") exited while in RCU read"
[18b6a88]1147 " section.\n", THREAD->tid, THREAD->name);
[d4d36f9]1148 }
1149}
1150
1151/** Returns true if in an rcu reader section. */
1152bool rcu_read_locked(void)
1153{
1154 return RCU_CNT_INC <= THE->rcu_nesting;
1155}
1156
1157/** Invoked when a preempted reader finally exits its reader section. */
1158void _rcu_preempted_unlock(void)
1159{
[63e27ef]1160 assert(0 == THE->rcu_nesting || RCU_WAS_PREEMPTED == THE->rcu_nesting);
[a35b458]1161
[82719589]1162 size_t prev = local_atomic_exchange(&THE->rcu_nesting, 0);
1163 if (prev == RCU_WAS_PREEMPTED) {
[1b20da0]1164 /*
[fbe6b65]1165 * NMI handlers are never preempted but may call rm_preempted_reader()
1166 * if a NMI occurred in _rcu_preempted_unlock() of a preempted thread.
[82719589]1167 * The only other rcu code that may have been interrupted by the NMI
1168 * in _rcu_preempted_unlock() is: an IPI/sample_local_cpu() and
1169 * the initial part of rcu_after_thread_ran().
[1b20da0]1170 *
[fbe6b65]1171 * rm_preempted_reader() will not deadlock because none of the locks
[82719589]1172 * it uses are locked in this case. Neither _rcu_preempted_unlock()
1173 * nor sample_local_cpu() nor the initial part of rcu_after_thread_ran()
1174 * acquire any locks.
[fbe6b65]1175 */
[d4d36f9]1176 rm_preempted_reader();
1177 }
1178}
1179
1180#elif defined(RCU_PREEMPT_PODZIMEK)
1181
[1b20da0]1182/** Waits for the grace period associated with callbacks cur_cbs to elapse.
1183 *
1184 * @param expedite Instructs the detector to aggressively speed up grace
[d4d36f9]1185 * period detection without any delay.
[1b20da0]1186 * @param completed_gp Returns the most recent completed grace period
[d4d36f9]1187 * number.
1188 * @return false if the thread was interrupted and should stop.
1189 */
1190static bool wait_for_cur_cbs_gp_end(bool expedite, rcu_gp_t *completed_gp)
1191{
[1b20da0]1192 /*
[d4d36f9]1193 * Use a possibly outdated version of completed_gp to bypass checking
1194 * with the lock.
[1b20da0]1195 *
1196 * Note that loading and storing rcu.completed_gp is not atomic
1197 * (it is 64bit wide). Reading a clobbered value that is less than
1198 * rcu.completed_gp is harmless - we'll recheck with a lock. The
1199 * only way to read a clobbered value that is greater than the actual
1200 * value is if the detector increases the higher-order word first and
1201 * then decreases the lower-order word (or we see stores in that order),
1202 * eg when incrementing from 2^32 - 1 to 2^32. The loaded value
1203 * suddenly jumps by 2^32. It would take hours for such an increase
1204 * to occur so it is safe to discard the value. We allow increases
[d4d36f9]1205 * of up to half the maximum to generously accommodate for loading an
1206 * outdated lower word.
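 * For example, if the detector moves rcu.completed_gp from
 * 0x00000000ffffffff to 0x0000000100000000, we may observe the new
 * upper word combined with the old lower word, ie 0x00000001ffffffff,
 * which overshoots the actual value by almost 2^32 and is therefore
 * rejected by the range check below, falling back to the locked recheck.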
1207 */
1208 rcu_gp_t compl_gp = ACCESS_ONCE(rcu.completed_gp);
[18b6a88]1209 if (CPU->rcu.cur_cbs_gp <= compl_gp &&
1210 compl_gp <= CPU->rcu.cur_cbs_gp + UINT32_MAX_HALF) {
[d4d36f9]1211 *completed_gp = compl_gp;
1212 return true;
1213 }
[a35b458]1214
[d4d36f9]1215 spinlock_lock(&rcu.gp_lock);
[a35b458]1216
[d4d36f9]1217 if (CPU->rcu.cur_cbs_gp <= rcu.completed_gp) {
1218 *completed_gp = rcu.completed_gp;
1219 spinlock_unlock(&rcu.gp_lock);
1220 return true;
1221 }
[a35b458]1222
[63e27ef]1223 assert(CPU->rcu.cur_cbs_gp <= CPU->rcu.next_cbs_gp);
1224 assert(_rcu_cur_gp <= CPU->rcu.cur_cbs_gp);
[a35b458]1225
[1b20da0]1226 /*
1227 * Notify the detector of how many GP ends we intend to wait for, so
[d4d36f9]1228 * it can avoid going to sleep unnecessarily. Optimistically assume
1229 * new callbacks will arrive while we're waiting; hence +1.
1230 */
1231 size_t remaining_gp_ends = (size_t) (CPU->rcu.next_cbs_gp - _rcu_cur_gp);
1232 req_detection(remaining_gp_ends + (arriving_cbs_empty() ? 0 : 1));
[a35b458]1233
[1b20da0]1234 /*
1235 * Ask the detector to speed up GP detection if there are too many
[d4d36f9]1236 * pending callbacks and other reclaimers have not already done so.
1237 */
1238 if (expedite) {
[18b6a88]1239 if (0 == rcu.req_expedited_cnt)
[d4d36f9]1240 condvar_signal(&rcu.expedite_now);
[a35b458]1241
[1b20da0]1242 /*
 1243 * Expedite only cur_cbs. If there really is a surge of callbacks,
[d4d36f9]1244 * the arriving batch will expedite the GP for the huge number
 1245 * of callbacks currently in next_cbs.
1246 */
1247 rcu.req_expedited_cnt = 1;
1248 }
1249
1250 /* Wait for cur_cbs_gp to end. */
1251 bool interrupted = cv_wait_for_gp(CPU->rcu.cur_cbs_gp);
[a35b458]1252
[d4d36f9]1253 *completed_gp = rcu.completed_gp;
[1b20da0]1254 spinlock_unlock(&rcu.gp_lock);
[a35b458]1255
[09737cc]1256 if (!interrupted)
1257 upd_missed_gp_in_wait(*completed_gp);
[a35b458]1258
[d4d36f9]1259 return !interrupted;
1260}
1261
1262/** Waits for an announcement of the end of the grace period wait_on_gp. */
1263static bool cv_wait_for_gp(rcu_gp_t wait_on_gp)
1264{
[63e27ef]1265 assert(spinlock_locked(&rcu.gp_lock));
[a35b458]1266
[d4d36f9]1267 bool interrupted = false;
[a35b458]1268
[d4d36f9]1269 /* Wait until wait_on_gp ends. */
1270 while (rcu.completed_gp < wait_on_gp && !interrupted) {
[1b20da0]1271 int ret = _condvar_wait_timeout_spinlock(&rcu.gp_ended, &rcu.gp_lock,
[18b6a88]1272 SYNCH_NO_TIMEOUT, SYNCH_FLAGS_INTERRUPTIBLE);
[897fd8f1]1273 interrupted = (ret == EINTR);
[181a746]1274 }
[a35b458]1275
[181a746]1276 return interrupted;
1277}
1278
[d4d36f9]1279/** Requests the detector to detect at least req_cnt consecutive grace periods.*/
1280static void req_detection(size_t req_cnt)
1281{
1282 if (rcu.req_gp_end_cnt < req_cnt) {
1283 bool detector_idle = (0 == rcu.req_gp_end_cnt);
1284 rcu.req_gp_end_cnt = req_cnt;
1285
1286 if (detector_idle) {
[63e27ef]1287 assert(_rcu_cur_gp == rcu.completed_gp);
[d4d36f9]1288 condvar_signal(&rcu.req_gp_changed);
1289 }
1290 }
1291}
1292
1293
[181a746]1294/** The detector thread detects and notifies reclaimers of grace period ends. */
1295static void detector(void *arg)
1296{
1297 spinlock_lock(&rcu.gp_lock);
[a35b458]1298
[181a746]1299 while (wait_for_detect_req()) {
[1b20da0]1300 /*
[181a746]1301 * Announce new GP started. Readers start lazily acknowledging that
1302 * they passed a QS.
1303 */
1304 start_new_gp();
[a35b458]1305
[181a746]1306 spinlock_unlock(&rcu.gp_lock);
[a35b458]1307
[1b20da0]1308 if (!wait_for_readers())
[181a746]1309 goto unlocked_out;
[a35b458]1310
[181a746]1311 spinlock_lock(&rcu.gp_lock);
1312
1313 /* Notify reclaimers that they may now invoke queued callbacks. */
1314 end_cur_gp();
1315 }
[a35b458]1316
[181a746]1317 spinlock_unlock(&rcu.gp_lock);
[a35b458]1318
[181a746]1319unlocked_out:
1320 return;
1321}
1322
1323/** Waits for a request from a reclaimer thread to detect a grace period. */
1324static bool wait_for_detect_req(void)
1325{
[63e27ef]1326 assert(spinlock_locked(&rcu.gp_lock));
[a35b458]1327
[181a746]1328 bool interrupted = false;
[a35b458]1329
[181a746]1330 while (0 == rcu.req_gp_end_cnt && !interrupted) {
[1b20da0]1331 int ret = _condvar_wait_timeout_spinlock(&rcu.req_gp_changed,
[18b6a88]1332 &rcu.gp_lock, SYNCH_NO_TIMEOUT, SYNCH_FLAGS_INTERRUPTIBLE);
[a35b458]1333
[897fd8f1]1334 interrupted = (ret == EINTR);
[181a746]1335 }
[a35b458]1336
[d4d36f9]1337 return !interrupted;
1338}
1339
1340
1341static void end_cur_gp(void)
1342{
[63e27ef]1343 assert(spinlock_locked(&rcu.gp_lock));
[a35b458]1344
[d4d36f9]1345 rcu.completed_gp = _rcu_cur_gp;
1346 --rcu.req_gp_end_cnt;
[a35b458]1347
[d4d36f9]1348 condvar_broadcast(&rcu.gp_ended);
1349}
1350
1351/** Waits for readers that started before the current GP started to finish. */
1352static bool wait_for_readers(void)
1353{
1354 DEFINE_CPU_MASK(reading_cpus);
[a35b458]1355
[d4d36f9]1356 /* All running cpus have potential readers. */
1357 cpu_mask_active(reading_cpus);
1358
[1b20da0]1359 /*
1360 * Give readers time to pass through a QS. Also, batch arriving
[d4d36f9]1361 * callbacks in order to amortize detection overhead.
1362 */
1363 if (!gp_sleep())
1364 return false;
[a35b458]1365
[d4d36f9]1366 /* Non-intrusively determine which cpus have yet to pass a QS. */
1367 rm_quiescent_cpus(reading_cpus);
[a35b458]1368
[d4d36f9]1369 /* Actively interrupt cpus delaying the current GP and demand a QS. */
1370 interrupt_delaying_cpus(reading_cpus);
[a35b458]1371
[d4d36f9]1372 /* Wait for the interrupted cpus to notify us that they reached a QS. */
1373 if (!wait_for_delaying_cpus())
1374 return false;
1375 /*
1376 * All cpus recorded a QS or are still idle. Any new readers will be added
 1377 * to next_preempted if preempted, ie the number of readers in cur_preempted
 1378 * monotonically decreases.
1379 */
[a35b458]1380
[d4d36f9]1381 /* Wait for the last reader in cur_preempted to notify us it is done. */
1382 if (!wait_for_preempt_reader())
1383 return false;
[a35b458]1384
[d4d36f9]1385 return true;
1386}
1387
1388/** Sleeps a while if the current grace period is not to be expedited. */
1389static bool gp_sleep(void)
1390{
1391 spinlock_lock(&rcu.gp_lock);
1392
1393 int ret = 0;
1394 while (0 == rcu.req_expedited_cnt && 0 == ret) {
1395 /* minor bug: sleeps for the same duration if woken up spuriously. */
1396 ret = _condvar_wait_timeout_spinlock(&rcu.expedite_now, &rcu.gp_lock,
[18b6a88]1397 DETECT_SLEEP_MS * 1000, SYNCH_FLAGS_INTERRUPTIBLE);
[d4d36f9]1398 }
[a35b458]1399
[d4d36f9]1400 if (0 < rcu.req_expedited_cnt) {
1401 --rcu.req_expedited_cnt;
1402 /* Update statistic. */
1403 ++rcu.stat_expedited_cnt;
1404 }
[a35b458]1405
[d4d36f9]1406 spinlock_unlock(&rcu.gp_lock);
[a35b458]1407
[897fd8f1]1408 return (ret != EINTR);
[d4d36f9]1409}
1410
1411/** Actively interrupts and checks the offending cpus for quiescent states. */
1412static void interrupt_delaying_cpus(cpu_mask_t *cpu_mask)
1413{
[e3306d04]1414 atomic_store(&rcu.delaying_cpu_cnt, 0);
[a35b458]1415
[205832b]1416 sample_cpus(cpu_mask, NULL);
[d4d36f9]1417}
1418
[1b20da0]1419/** Invoked on a cpu delaying grace period detection.
1420 *
1421 * Induces a quiescent state for the cpu or it instructs remaining
[d4d36f9]1422 * readers to notify the detector once they finish.
1423 */
1424static void sample_local_cpu(void *arg)
1425{
[63e27ef]1426 assert(interrupts_disabled());
1427 assert(!CPU->rcu.is_delaying_gp);
[a35b458]1428
[d4d36f9]1429 /* Cpu did not pass a quiescent state yet. */
1430 if (CPU->rcu.last_seen_gp != _rcu_cur_gp) {
1431 /* Interrupted a reader in a reader critical section. */
1432 if (0 < CPU->rcu.nesting_cnt) {
[63e27ef]1433 assert(!CPU->idle);
[1b20da0]1434 /*
1435 * Note to notify the detector from rcu_read_unlock().
1436 *
[82719589]1437 * ACCESS_ONCE ensures the compiler writes to is_delaying_gp
1438 * only after it determines that we are in a reader CS.
[d4d36f9]1439 */
[82719589]1440 ACCESS_ONCE(CPU->rcu.is_delaying_gp) = true;
[d4d36f9]1441 CPU->rcu.signal_unlock = true;
[a35b458]1442
[d4d36f9]1443 atomic_inc(&rcu.delaying_cpu_cnt);
1444 } else {
[1b20da0]1445 /*
1446 * The cpu did not enter any rcu reader sections since
[d4d36f9]1447 * the start of the current GP. Record a quiescent state.
[1b20da0]1448 *
[d4d36f9]1449 * Or, we interrupted rcu_read_unlock_impl() right before
[1b20da0]1450 * it recorded a QS. Record a QS for it. The memory barrier
1451 * contains the reader section's mem accesses before
[d4d36f9]1452 * updating last_seen_gp.
[1b20da0]1453 *
[d4d36f9]1454 * Or, we interrupted rcu_read_lock() right after it recorded
1455 * a QS for the previous GP but before it got a chance to
1456 * increment its nesting count. The memory barrier again
1457 * stops the CS code from spilling out of the CS.
1458 */
1459 memory_barrier();
1460 CPU->rcu.last_seen_gp = _rcu_cur_gp;
1461 }
1462 } else {
[1b20da0]1463 /*
1464 * This cpu already acknowledged that it had passed through
1465 * a quiescent state since the start of cur_gp.
[d4d36f9]1466 */
1467 }
[a35b458]1468
[1b20da0]1469 /*
[d4d36f9]1470 * smp_call() makes sure any changes propagate back to the caller.
1471 * In particular, it makes the most current last_seen_gp visible
1472 * to the detector.
1473 */
1474}
1475
1476/** Waits for cpus delaying the current grace period if there are any. */
1477static bool wait_for_delaying_cpus(void)
1478{
[036e97c]1479 int delaying_cpu_cnt = atomic_load(&rcu.delaying_cpu_cnt);
[d4d36f9]1480
[18b6a88]1481 for (int i = 0; i < delaying_cpu_cnt; ++i) {
[d4d36f9]1482 if (!semaphore_down_interruptable(&rcu.remaining_readers))
1483 return false;
1484 }
[a35b458]1485
[d4d36f9]1486 /* Update statistic. */
1487 rcu.stat_delayed_cnt += delaying_cpu_cnt;
[a35b458]1488
[d4d36f9]1489 return true;
1490}
1491
1492/** Called by the scheduler() when switching away from the current thread. */
1493void rcu_after_thread_ran(void)
1494{
[63e27ef]1495 assert(interrupts_disabled());
[d4d36f9]1496
[1b20da0]1497 /*
[d4d36f9]1498 * Prevent NMI handlers from interfering. The detector will be notified
[1b20da0]1499 * in this function if CPU->rcu.is_delaying_gp. The current thread is
[82719589]1500 * no longer running so there is nothing else to signal to the detector.
[d4d36f9]1501 */
1502 CPU->rcu.signal_unlock = false;
[1b20da0]1503 /*
1504 * Separates clearing of .signal_unlock from accesses to
[82719589]1505 * THREAD->rcu.was_preempted and CPU->rcu.nesting_cnt.
1506 */
[d4d36f9]1507 compiler_barrier();
[a35b458]1508
[d4d36f9]1509 /* Save the thread's nesting count when it is not running. */
1510 THREAD->rcu.nesting_cnt = CPU->rcu.nesting_cnt;
[a35b458]1511
[d4d36f9]1512 /* Preempted a reader critical section for the first time. */
1513 if (0 < THREAD->rcu.nesting_cnt && !THREAD->rcu.was_preempted) {
1514 THREAD->rcu.was_preempted = true;
1515 note_preempted_reader();
1516 }
[a35b458]1517
[1b20da0]1518 /*
[d4d36f9]1519 * The preempted reader has been noted globally. There are therefore
1520 * no readers running on this cpu so this is a quiescent state.
1521 */
1522 _rcu_record_qs();
1523
[1b20da0]1524 /*
1525 * Interrupt handlers might use RCU while idle in scheduler().
1526 * The preempted reader has been noted globally, so the handlers
[82719589]1527 * may now start announcing quiescent states.
1528 */
1529 CPU->rcu.nesting_cnt = 0;
[a35b458]1530
[1b20da0]1531 /*
1532 * This cpu is holding up the current GP. Let the detector know
1533 * it has just passed a quiescent state.
1534 *
1535 * The detector waits separately for preempted readers, so we have
[d4d36f9]1536 * to notify the detector even if we have just preempted a reader.
1537 */
1538 if (CPU->rcu.is_delaying_gp) {
1539 CPU->rcu.is_delaying_gp = false;
1540 semaphore_up(&rcu.remaining_readers);
1541 }
1542
[1b20da0]1543 /*
[d4d36f9]1544 * Forcefully give the detector (and this cpu's reclaimer) the highest
1545 * priority, even if it was preempted because its time slice ran out.
[1b20da0]1546 *
[d4d36f9]1547 * todo: Replace with strict scheduler priority classes.
1548 */
1549 if (THREAD == rcu.detector_thr) {
1550 THREAD->priority = -1;
[18b6a88]1551 } else if (THREAD == CPU->rcu.reclaimer_thr) {
[d4d36f9]1552 THREAD->priority = -1;
[1b20da0]1553 }
[a35b458]1554
[82719589]1555 upd_max_cbs_in_slice(CPU->rcu.arriving_cbs_cnt);
[d4d36f9]1556}
1557
1558/** Called by the scheduler() when switching to a newly scheduled thread. */
1559void rcu_before_thread_runs(void)
1560{
[63e27ef]1561 assert(PREEMPTION_DISABLED || interrupts_disabled());
1562 assert(0 == CPU->rcu.nesting_cnt);
[a35b458]1563
[d4d36f9]1564 /* Load the thread's saved nesting count from before it was preempted. */
1565 CPU->rcu.nesting_cnt = THREAD->rcu.nesting_cnt;
[a35b458]1566
[1b20da0]1567 /*
[82719589]1568 * Ensures an NMI sees the proper nesting count before it sees .signal_unlock.
1569 * Otherwise the NMI may incorrectly signal that a preempted reader
1570 * exited its reader section.
1571 */
1572 compiler_barrier();
[a35b458]1573
[1b20da0]1574 /*
1575 * In the unlikely event that a NMI occurs between the loading of the
1576 * variables and setting signal_unlock, the NMI handler may invoke
[d4d36f9]1577 * rcu_read_unlock() and clear signal_unlock. In that case we will
1578 * incorrectly overwrite signal_unlock from false to true. This event
[1b20da0]1579 * is benign and the next rcu_read_unlock() will at worst
[d4d36f9]1580 * needlessly invoke _rcu_signal_unlock().
1581 */
1582 CPU->rcu.signal_unlock = THREAD->rcu.was_preempted || CPU->rcu.is_delaying_gp;
1583}
1584
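/*
 * Hedged sketch of how scheduler() is expected to drive the two hooks above;
 * the real call sites live in the scheduler itself, not in this file, so
 * treat the exact placement as an illustration only.
 */
#if 0
    /* Switching away from the current THREAD: */
    rcu_after_thread_ran();    /* save its nesting count, record a QS */
    /* ... select the next thread to run ... */
    rcu_before_thread_runs();  /* restore the new thread's nesting count */
#endif
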
[1b20da0]1585/** Called from scheduler() when exiting the current thread.
1586 *
[d4d36f9]1587 * Preemption or interrupts are disabled and the scheduler() already
1588 * switched away from the current thread, calling rcu_after_thread_ran().
1589 */
1590void rcu_thread_exiting(void)
1591{
[63e27ef]1592 assert(THREAD != NULL);
1593 assert(THREAD->state == Exiting);
1594 assert(PREEMPTION_DISABLED || interrupts_disabled());
[a35b458]1595
[1b20da0]1596 /*
1597 * The thread forgot to exit its reader critical section.
[d4d36f9]1598 * It is a bug, but rather than letting the entire system lock up,
[1b20da0]1599 * forcefully leave the reader section. The thread is not holding
[d4d36f9]1600 * any references anyway since it is exiting, so this is safe.
1601 */
1602 if (0 < THREAD->rcu.nesting_cnt) {
1603 THREAD->rcu.nesting_cnt = 1;
1604 read_unlock_impl(&THREAD->rcu.nesting_cnt);
[fbe17545]1605
1606 printf("Bug: thread (id %" PRIu64 " \"%s\") exited while in RCU read"
[18b6a88]1607 " section.\n", THREAD->tid, THREAD->name);
[d4d36f9]1608 }
[181a746]1609}
1610
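/*
 * The bug guarded against above, sketched with a hypothetical thread body
 * (leaky_thread() does not exist anywhere in the kernel):
 */
#if 0
static void leaky_thread(void *arg)
{
    rcu_read_lock();
    /* ... forgets to call rcu_read_unlock() ... */
    thread_exit();  /* rcu_thread_exiting() force-closes the CS and warns */
}
#endif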
[d4d36f9]1611
1612#endif /* RCU_PREEMPT_PODZIMEK */
1613
[181a746]1614/** Announces the start of a new grace period for preexisting readers to ack. */
1615static void start_new_gp(void)
1616{
[63e27ef]1617 assert(spinlock_locked(&rcu.gp_lock));
[a35b458]1618
[181a746]1619 irq_spinlock_lock(&rcu.preempt_lock, true);
[a35b458]1620
[181a746]1621 /* Start a new GP. Announce to readers that a quiescent state is needed. */
[8e3ed06]1622 ++_rcu_cur_gp;
[a35b458]1623
[1b20da0]1624 /*
[181a746]1625 * Readers preempted before the start of this GP (next_preempted)
[1b20da0]1626 * are preexisting readers now that a GP started and will hold up
[181a746]1627 * the current GP until they exit their reader sections.
[1b20da0]1628 *
1629 * Preempted readers from the previous GP have finished so
1630 * cur_preempted is empty, but see comment in _rcu_record_qs().
[181a746]1631 */
[c14762e]1632 list_concat(&rcu.cur_preempted, &rcu.next_preempted);
[a35b458]1633
[181a746]1634 irq_spinlock_unlock(&rcu.preempt_lock, true);
1635}
1636
[d4d36f9]1637/** Remove those cpus from the mask that have already passed a quiescent
1638 * state since the start of the current grace period.
1639 */
1640static void rm_quiescent_cpus(cpu_mask_t *cpu_mask)
[181a746]1641{
1642 /*
[1b20da0]1643 * Ensure the announcement of the start of a new GP (ie up-to-date
1644 * cur_gp) propagates to cpus that are just coming out of idle
[181a746]1645 * mode before we sample their idle state flag.
[1b20da0]1646 *
[181a746]1647 * Cpus guarantee that after they set CPU->idle = true they will not
1648 * execute any RCU reader sections without first setting idle to
1649 * false and issuing a memory barrier. Therefore, if rm_quiescent_cpus()
1650 * later on sees an idle cpu, but the cpu is just exiting its idle mode,
1651 * the cpu must not have yet executed its memory barrier (otherwise
1652 * it would pair up with this mem barrier and we would see idle == false).
1653 * That memory barrier will pair up with the one below and ensure
1654 * that a reader on the now-non-idle cpu will see the most current
1655 * cur_gp. As a result, such a reader will never attempt to semaphore_up(
1656 * remaining_readers) during this GP, which allows the detector to
1657 * ignore that cpu (the detector thinks it is idle). Moreover, any
1658 * changes made by RCU updaters will have propagated to readers
1659 * on the previously idle cpu -- again thanks to issuing a memory
1660 * barrier after returning from idle mode.
[1b20da0]1661 *
[181a746]1662 * idle -> non-idle cpu | detector | reclaimer
1663 * ------------------------------------------------------
1664 * rcu reader 1 | | rcu_call()
1665 * MB X | |
[1b20da0]1666 * idle = true | | rcu_call()
1667 * (no rcu readers allowed ) | | MB A in advance_cbs()
[181a746]1668 * MB Y | (...) | (...)
[1b20da0]1669 * (no rcu readers allowed) | | MB B in advance_cbs()
[181a746]1670 * idle = false | ++cur_gp |
1671 * (no rcu readers allowed) | MB C |
1672 * MB Z | signal gp_end |
1673 * rcu reader 2 | | exec_cur_cbs()
[1b20da0]1674 *
1675 *
[181a746]1676 * MB Y orders visibility of changes to idle for detector's sake.
[1b20da0]1677 *
1678 * MB Z pairs up with MB C. The cpu making a transition from idle
[181a746]1679 * will see the most current value of cur_gp and will not attempt
1680 * to notify the detector even if preempted during this GP.
[1b20da0]1681 *
[181a746]1682 * MB Z pairs up with MB A from the previous batch. Updaters' changes
[1b20da0]1683 * are visible to reader 2 even when the detector thinks the cpu is idle
[181a746]1684 * but it is not anymore.
[1b20da0]1685 *
[181a746]1686 * MB X pairs up with MB B. Late mem accesses of reader 1 are contained
[1b20da0]1687 * and visible before idling and before any callbacks are executed
[181a746]1688 * by reclaimers.
[1b20da0]1689 *
[181a746]1690 * In summary, the detector does not know of or wait for reader 2, but
1691 * it does not have to since it is a new reader that will not access
1692 * data from previous GPs and will see any changes.
1693 */
1694 memory_barrier(); /* MB C */
[a35b458]1695
[181a746]1696 cpu_mask_for_each(*cpu_mask, cpu_id) {
[1b20da0]1697 /*
1698 * The cpu already checked for and passed through a quiescent
[181a746]1699 * state since the beginning of this GP.
[1b20da0]1700 *
1701 * _rcu_cur_gp is modified by local detector thread only.
1702 * Therefore, it is up-to-date even without a lock.
1703 *
[853d613]1704 * cpu.last_seen_gp may not be up-to-date. At worst, we will
[1b20da0]1705 * unnecessarily sample its last_seen_gp with a smp_call.
[181a746]1706 */
[8e3ed06]1707 bool cpu_acked_gp = (cpus[cpu_id].rcu.last_seen_gp == _rcu_cur_gp);
[a35b458]1708
[181a746]1709 /*
1710 * Either the cpu is idle or it is exiting away from idle mode
[8e3ed06]1711 * and already sees the most current _rcu_cur_gp. See comment
[181a746]1712 * in wait_for_readers().
1713 */
1714 bool cpu_idle = cpus[cpu_id].idle;
[a35b458]1715
[181a746]1716 if (cpu_acked_gp || cpu_idle) {
1717 cpu_mask_reset(cpu_mask, cpu_id);
1718 }
1719 }
1720}
1721
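/*
 * The idle-cpu protocol the barrier argument above relies on, as a hedged
 * sketch; the real code lives in the cpu idle path (eg around cpu_sleep()),
 * not here, so the exact shape below is illustrative only.
 */
#if 0
    /* Entering idle: */
    memory_barrier();    /* MB X - contain the last reader's accesses */
    CPU->idle = true;    /* rm_quiescent_cpus() may now skip this cpu */
    /* ... halt until the next interrupt ... */
    /* Leaving idle: */
    CPU->idle = false;
    memory_barrier();    /* MB Z - pairs with MB C above */
    /* Only now may new rcu reader sections run on this cpu. */
#endif
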
[853d613]1722/** Serially invokes sample_local_cpu(arg) on each cpu of reader_cpus. */
[d4d36f9]1723static void sample_cpus(cpu_mask_t *reader_cpus, void *arg)
[181a746]1724{
[d4d36f9]1725 cpu_mask_for_each(*reader_cpus, cpu_id) {
[853d613]1726 smp_call(cpu_id, sample_local_cpu, arg);
[181a746]1727
1728 /* Update statistic. */
1729 if (CPU->id != cpu_id)
1730 ++rcu.stat_smp_call_cnt;
1731 }
1732}
1733
[d4d36f9]1734static void upd_missed_gp_in_wait(rcu_gp_t completed_gp)
[181a746]1735{
[63e27ef]1736 assert(CPU->rcu.cur_cbs_gp <= completed_gp);
[a35b458]1737
[d4d36f9]1738 size_t delta = (size_t)(completed_gp - CPU->rcu.cur_cbs_gp);
1739 CPU->rcu.stat_missed_gp_in_wait += delta;
1740}
1741
1742/** Globally note that the current thread was preempted in a reader section. */
1743static void note_preempted_reader(void)
1744{
1745 irq_spinlock_lock(&rcu.preempt_lock, false);
1746
[8e3ed06]1747 if (CPU->rcu.last_seen_gp != _rcu_cur_gp) {
[d4d36f9]1748 /* The reader started before the GP started - we must wait for it. */
1749 list_append(&THREAD->rcu.preempt_link, &rcu.cur_preempted);
[181a746]1750 } else {
[1b20da0]1751 /*
[d4d36f9]1752 * The reader started after the GP started and this cpu
1753 * already noted a quiescent state. We might block the next GP.
[181a746]1754 */
[d4d36f9]1755 list_append(&THREAD->rcu.preempt_link, &rcu.next_preempted);
[181a746]1756 }
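 /*
  * Worked example with made-up numbers: if _rcu_cur_gp == 42 and this cpu's
  * last_seen_gp is still 41, the preempted reader predates GP 42 and lands
  * in cur_preempted, holding GP 42 up. If last_seen_gp is already 42, the
  * reader can only delay GP 43, hence next_preempted.
  */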
[d4d36f9]1757
1758 irq_spinlock_unlock(&rcu.preempt_lock, false);
[181a746]1759}
1760
[d4d36f9]1761/** Remove the current thread from the global list of preempted readers. */
1762static void rm_preempted_reader(void)
[181a746]1763{
[82719589]1764 irq_spinlock_lock(&rcu.preempt_lock, true);
[a35b458]1765
[63e27ef]1766 assert(link_used(&THREAD->rcu.preempt_link));
[79d74fe]1767
[d4d36f9]1768 bool prev_empty = list_empty(&rcu.cur_preempted);
1769 list_remove(&THREAD->rcu.preempt_link);
1770 bool now_empty = list_empty(&rcu.cur_preempted);
1771
1772 /* This was the last reader in cur_preempted. */
1773 bool last_removed = now_empty && !prev_empty;
1774
[1b20da0]1775 /*
1776 * Preempted readers are blocking the detector and
1777 * this was the last reader blocking the current GP.
[d4d36f9]1778 */
1779 if (last_removed && rcu.preempt_blocking_det) {
1780 rcu.preempt_blocking_det = false;
1781 semaphore_up(&rcu.remaining_readers);
[181a746]1782 }
[d4d36f9]1783
[82719589]1784 irq_spinlock_unlock(&rcu.preempt_lock, true);
[181a746]1785}
1786
1787/** Waits for any preempted readers blocking this grace period to finish. */
1788static bool wait_for_preempt_reader(void)
1789{
1790 irq_spinlock_lock(&rcu.preempt_lock, true);
1791
1792 bool reader_exists = !list_empty(&rcu.cur_preempted);
1793 rcu.preempt_blocking_det = reader_exists;
[a35b458]1794
[181a746]1795 irq_spinlock_unlock(&rcu.preempt_lock, true);
[a35b458]1796
[181a746]1797 if (reader_exists) {
1798 /* Update statistic. */
1799 ++rcu.stat_preempt_blocking_cnt;
[a35b458]1800
[181a746]1801 return semaphore_down_interruptable(&rcu.remaining_readers);
[1b20da0]1802 }
[a35b458]1803
[181a746]1804 return true;
1805}
1806
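/*
 * Hedged sketch of one grace period as the detector presumably strings the
 * helpers above together; the authoritative flow is wait_for_readers() and
 * the detector thread elsewhere in this file. reader_cpus and arg below
 * stand for whatever the real caller passes and are not declared here.
 */
#if 0
    spinlock_lock(&rcu.gp_lock);
    start_new_gp();                  /* ++_rcu_cur_gp, shift next_preempted */
    spinlock_unlock(&rcu.gp_lock);

    /* Done without gp_lock - the waits below may sleep. */
    rm_quiescent_cpus(reader_cpus);  /* skip cpus that already acked the GP */
    sample_cpus(reader_cpus, arg);   /* ask the rest to ack or report a delay */
    wait_for_delaying_cpus();        /* cpus caught running a reader CS */
    wait_for_preempt_reader();       /* threads preempted inside a reader CS */
#endif
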
[82719589]1807static void upd_max_cbs_in_slice(size_t arriving_cbs_cnt)
[0cf813d]1808{
1809 rcu_cpu_data_t *cr = &CPU->rcu;
[a35b458]1810
[82719589]1811 if (arriving_cbs_cnt > cr->last_arriving_cnt) {
1812 size_t arrived_cnt = arriving_cbs_cnt - cr->last_arriving_cnt;
[0cf813d]1813 cr->stat_max_slice_cbs = max(arrived_cnt, cr->stat_max_slice_cbs);
1814 }
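 /*
  * E.g. with last_arriving_cnt == 10 and arriving_cbs_cnt == 25, 15 callbacks
  * arrived during the last time slice and the per-cpu maximum is raised to at
  * least 15.
  */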
[a35b458]1815
[82719589]1816 cr->last_arriving_cnt = arriving_cbs_cnt;
[181a746]1817}
1818
1819/** Prints RCU run-time statistics. */
1820void rcu_print_stat(void)
1821{
[1b20da0]1822 /*
1823 * Don't take locks. Worst case is we get outdated values.
1824 * CPU-local values are updated without any locks, so there
[0cf813d]1825 * are no locks we could take to get up-to-date values anyway.
1826 */
[a35b458]1827
[d4d36f9]1828#ifdef RCU_PREEMPT_PODZIMEK
1829 const char *algo = "podzimek-preempt-rcu";
1830#elif defined(RCU_PREEMPT_A)
1831 const char *algo = "a-preempt-rcu";
1832#endif
[a35b458]1833
[d4d36f9]1834 printf("Config: expedite_threshold=%d, critical_threshold=%d,"
[18b6a88]1835 " detect_sleep=%dms, %s\n",
1836 EXPEDITE_THRESHOLD, CRITICAL_THRESHOLD, DETECT_SLEEP_MS, algo);
[181a746]1837 printf("Completed GPs: %" PRIu64 "\n", rcu.completed_gp);
1838 printf("Expedited GPs: %zu\n", rcu.stat_expedited_cnt);
[1b20da0]1839 printf("Delayed GPs: %zu (cpus w/ still running readers after gp sleep)\n",
[18b6a88]1840 rcu.stat_delayed_cnt);
[181a746]1841 printf("Preempt blocked GPs: %zu (waited for preempted readers; "
[18b6a88]1842 "running or not)\n", rcu.stat_preempt_blocking_cnt);
[b68ae24]1843 printf("Smp calls: %zu\n", rcu.stat_smp_call_cnt);
[a35b458]1844
[0cf813d]1845 printf("Max arrived callbacks per GP and CPU:\n");
1846 for (unsigned int i = 0; i < config.cpu_count; ++i) {
[181a746]1847 printf(" %zu", cpus[i].rcu.stat_max_cbs);
1848 }
1849
[0cf813d]1850 printf("\nAvg arrived callbacks per GP and CPU (nonempty batches only):\n");
1851 for (unsigned int i = 0; i < config.cpu_count; ++i) {
[181a746]1852 printf(" %zu", cpus[i].rcu.stat_avg_cbs);
1853 }
[a35b458]1854
[0cf813d]1855 printf("\nMax arrived callbacks per time slice and CPU:\n");
1856 for (unsigned int i = 0; i < config.cpu_count; ++i) {
1857 printf(" %zu", cpus[i].rcu.stat_max_slice_cbs);
1858 }
1859
1860 printf("\nMissed GP notifications per CPU:\n");
1861 for (unsigned int i = 0; i < config.cpu_count; ++i) {
[181a746]1862 printf(" %zu", cpus[i].rcu.stat_missed_gps);
1863 }
[0cf813d]1864
1865 printf("\nMissed GP notifications per CPU while waking up:\n");
1866 for (unsigned int i = 0; i < config.cpu_count; ++i) {
1867 printf(" %zu", cpus[i].rcu.stat_missed_gp_in_wait);
1868 }
[181a746]1869 printf("\n");
1870}
1871
1872/** @}
1873 */