source: mainline/kernel/generic/src/synch/rcu.c@ 205832b

Last change on this file since 205832b was 205832b, checked in by Adam Hraska <adam.hraska+hos@…>, 13 years ago

Replaced 0 with NULL where appropriate (in rcu, cht, uspace hashtable, smp_call, workqueue).

1/*
2 * Copyright (c) 2012 Adam Hraska
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * - Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * - Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * - The name of the author may not be used to endorse or promote products
15 * derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29
30/** @addtogroup sync
31 * @{
32 */
33
34/**
35 * @file
36 * @brief Preemptible read-copy update. Usable from interrupt handlers.
37 */
38
39#include <synch/rcu.h>
40#include <synch/condvar.h>
41#include <synch/semaphore.h>
42#include <synch/spinlock.h>
43#include <synch/mutex.h>
44#include <proc/thread.h>
45#include <cpu/cpu_mask.h>
46#include <cpu.h>
47#include <smp/smp_call.h>
48#include <compiler/barrier.h>
49#include <atomic.h>
50#include <arch.h>
51#include <macros.h>
52
53/*
54 * Number of milliseconds to give to preexisting readers to finish
55 * when non-expedited grace period detection is in progress.
56 */
57#define DETECT_SLEEP_MS 10
58/*
59 * Max number of pending callbacks in the local cpu's queue before
60 * aggressively expediting the current grace period
61 */
62#define EXPEDITE_THRESHOLD 2000
63/*
64 * Max number of callbacks to execute in one go with preemption
65 * enabled. If there are more callbacks to be executed they will
66 * be run with preemption disabled in order to prolong reclaimer's
67 * time slice and give it a chance to catch up with callback producers.
68 */
69#define CRITICAL_THRESHOLD 30000
70/* Half the number of values a uint32 can hold. */
71#define UINT32_MAX_HALF 2147483648U
72
73/**
74 * The current grace period number. Increases monotonically.
75 * Lock rcu.gp_lock or rcu.preempt_lock to get a current value.
76 */
77rcu_gp_t _rcu_cur_gp;
78
79/** Global RCU data. */
80typedef struct rcu_data {
81 /** The detector uses this to signal reclaimers that a grace period ended. */
82 condvar_t gp_ended;
83 /** Reclaimers use this to notify the detector to accelerate GP detection. */
84 condvar_t expedite_now;
85 /**
86 * Protects: req_gp_end_cnt, req_expedited_cnt, completed_gp, _rcu_cur_gp;
87 * or (with RCU_PREEMPT_A) just: completed_gp, _rcu_cur_gp
88 */
89 SPINLOCK_DECLARE(gp_lock);
90 /**
91 * The number of the most recently completed grace period. At most
92 * one behind _rcu_cur_gp. If equal to _rcu_cur_gp, a grace period
93 * detection is not in progress and the detector is idle.
94 */
95 rcu_gp_t completed_gp;
96
97 /** Protects the following 3 fields. */
98 IRQ_SPINLOCK_DECLARE(preempt_lock);
99 /** Preexisting readers that have been preempted. */
100 list_t cur_preempted;
101 /** Readers that have been preempted and might delay the next grace period. */
102 list_t next_preempted;
103 /**
104 * The detector is waiting for the last preempted reader
105 * in cur_preempted to announce that it exited its reader
106 * section by up()ing remaining_readers.
107 */
108 bool preempt_blocking_det;
109
110#ifdef RCU_PREEMPT_A
111
112 /**
113 * The detector waits on this semaphore for any preempted readers
114 * delaying the grace period once all cpus pass a quiescent state.
115 */
116 semaphore_t remaining_readers;
117
118#elif defined(RCU_PREEMPT_PODZIMEK)
119
120 /** Reclaimers notify the detector when they request more grace periods.*/
121 condvar_t req_gp_changed;
122 /** Number of grace period ends the detector was requested to announce. */
123 size_t req_gp_end_cnt;
124 /** Number of consecutive grace periods to detect quickly and aggressively.*/
125 size_t req_expedited_cnt;
126 /**
127 * Number of cpus with readers that are delaying the current GP.
128 * They will up() remaining_readers.
129 */
130 atomic_t delaying_cpu_cnt;
131 /**
132 * The detector waits on this semaphore for any readers delaying the GP.
133 *
134 * Each of the cpus with readers that are delaying the current GP
135 * must up() this sema once they reach a quiescent state. If there
136 * are any readers in cur_preempted (ie preempted preexisting) and
137 * they are already delaying GP detection, the last to unlock its
138 * reader section must up() this sema once.
139 */
140 semaphore_t remaining_readers;
141#endif
142
143 /** Excludes simultaneous rcu_barrier() calls. */
144 mutex_t barrier_mtx;
145 /** Number of cpus that we are waiting for to complete rcu_barrier(). */
146 atomic_t barrier_wait_cnt;
147 /** rcu_barrier() waits for the completion of barrier callbacks on this wq.*/
148 waitq_t barrier_wq;
149
150 /** Interruptible attached detector thread pointer. */
151 thread_t *detector_thr;
152
153 /* Some statistics. */
154 size_t stat_expedited_cnt;
155 size_t stat_delayed_cnt;
156 size_t stat_preempt_blocking_cnt;
157 /* Does not contain self/local calls. */
158 size_t stat_smp_call_cnt;
159} rcu_data_t;
160
161
162static rcu_data_t rcu;
163
164static void start_reclaimers(void);
165static void synch_complete(rcu_item_t *rcu_item);
166static inline void rcu_call_impl(bool expedite, rcu_item_t *rcu_item,
167 rcu_func_t func);
168static void add_barrier_cb(void *arg);
169static void barrier_complete(rcu_item_t *barrier_item);
170static bool arriving_cbs_empty(void);
171static bool next_cbs_empty(void);
172static bool cur_cbs_empty(void);
173static bool all_cbs_empty(void);
174static void reclaimer(void *arg);
175static bool wait_for_pending_cbs(void);
176static bool advance_cbs(void);
177static void exec_completed_cbs(rcu_gp_t last_completed_gp);
178static void exec_cbs(rcu_item_t **phead);
179static bool wait_for_cur_cbs_gp_end(bool expedite, rcu_gp_t *last_completed_gp);
180static void upd_missed_gp_in_wait(rcu_gp_t completed_gp);
181
182#ifdef RCU_PREEMPT_PODZIMEK
183static void start_detector(void);
184static void read_unlock_impl(size_t *pnesting_cnt);
185static void req_detection(size_t req_cnt);
186static bool cv_wait_for_gp(rcu_gp_t wait_on_gp);
187static void detector(void *);
188static bool wait_for_detect_req(void);
189static void end_cur_gp(void);
190static bool wait_for_readers(void);
191static bool gp_sleep(void);
192static void interrupt_delaying_cpus(cpu_mask_t *cpu_mask);
193static bool wait_for_delaying_cpus(void);
194#elif defined(RCU_PREEMPT_A)
195static bool wait_for_readers(bool expedite);
196static bool gp_sleep(bool *expedite);
197#endif
198
199static void start_new_gp(void);
200static void rm_quiescent_cpus(cpu_mask_t *cpu_mask);
201static void sample_cpus(cpu_mask_t *reader_cpus, void *arg);
202static void sample_local_cpu(void *);
203static bool wait_for_preempt_reader(void);
204static void note_preempted_reader(void);
205static void rm_preempted_reader(void);
206static void upd_max_cbs_in_slice(void);
207
208
209
210/** Initializes global RCU structures. */
211void rcu_init(void)
212{
213 condvar_initialize(&rcu.gp_ended);
214 condvar_initialize(&rcu.expedite_now);
215
216 spinlock_initialize(&rcu.gp_lock, "rcu.gp_lock");
217 _rcu_cur_gp = 0;
218 rcu.completed_gp = 0;
219
220 irq_spinlock_initialize(&rcu.preempt_lock, "rcu.preempt_lock");
221 list_initialize(&rcu.cur_preempted);
222 list_initialize(&rcu.next_preempted);
223 rcu.preempt_blocking_det = false;
224
225 mutex_initialize(&rcu.barrier_mtx, MUTEX_PASSIVE);
226 atomic_set(&rcu.barrier_wait_cnt, 0);
227 waitq_initialize(&rcu.barrier_wq);
228
229 semaphore_initialize(&rcu.remaining_readers, 0);
230
231#ifdef RCU_PREEMPT_PODZIMEK
232 condvar_initialize(&rcu.req_gp_changed);
233
234 rcu.req_gp_end_cnt = 0;
235 rcu.req_expedited_cnt = 0;
236 atomic_set(&rcu.delaying_cpu_cnt, 0);
237#endif
238
239 rcu.detector_thr = NULL;
240
241 rcu.stat_expedited_cnt = 0;
242 rcu.stat_delayed_cnt = 0;
243 rcu.stat_preempt_blocking_cnt = 0;
244 rcu.stat_smp_call_cnt = 0;
245}
246
247/** Initializes per-CPU RCU data. If run on the boot cpu, also initializes global data. */
248void rcu_cpu_init(void)
249{
250 if (config.cpu_active == 1) {
251 rcu_init();
252 }
253
254 CPU->rcu.last_seen_gp = 0;
255
256#ifdef RCU_PREEMPT_PODZIMEK
257 CPU->rcu.nesting_cnt = 0;
258 CPU->rcu.is_delaying_gp = false;
259 CPU->rcu.signal_unlock = false;
260#endif
261
262 CPU->rcu.cur_cbs = NULL;
263 CPU->rcu.cur_cbs_cnt = 0;
264 CPU->rcu.next_cbs = NULL;
265 CPU->rcu.next_cbs_cnt = 0;
266 CPU->rcu.arriving_cbs = NULL;
267 CPU->rcu.parriving_cbs_tail = &CPU->rcu.arriving_cbs;
268 CPU->rcu.arriving_cbs_cnt = 0;
269
270 CPU->rcu.cur_cbs_gp = 0;
271 CPU->rcu.next_cbs_gp = 0;
272
273 semaphore_initialize(&CPU->rcu.arrived_flag, 0);
274
275 /* BSP creates reclaimer threads before AP's rcu_cpu_init() runs. */
276 if (config.cpu_active == 1)
277 CPU->rcu.reclaimer_thr = NULL;
278
279 CPU->rcu.stat_max_cbs = 0;
280 CPU->rcu.stat_avg_cbs = 0;
281 CPU->rcu.stat_missed_gps = 0;
282 CPU->rcu.stat_missed_gp_in_wait = 0;
283 CPU->rcu.stat_max_slice_cbs = 0;
284 CPU->rcu.last_arriving_cnt = 0;
285}
286
287/** Completes RCU init. Creates and runs the detector and reclaimer threads.*/
288void rcu_kinit_init(void)
289{
290#ifdef RCU_PREEMPT_PODZIMEK
291 start_detector();
292#endif
293
294 start_reclaimers();
295}
296
297/** Initializes any per-thread RCU structures. */
298void rcu_thread_init(thread_t *thread)
299{
300 thread->rcu.nesting_cnt = 0;
301
302#ifdef RCU_PREEMPT_PODZIMEK
303 thread->rcu.was_preempted = false;
304#endif
305
306 link_initialize(&thread->rcu.preempt_link);
307}
308
309
310/** Cleans up global RCU resources and stops dispatching callbacks.
311 *
312 * Call when shutting down the kernel. Outstanding callbacks will
313 * not be processed. Instead they will linger forever.
314 */
315void rcu_stop(void)
316{
317 /* Stop and wait for reclaimers. */
318 for (unsigned int cpu_id = 0; cpu_id < config.cpu_active; ++cpu_id) {
319 ASSERT(cpus[cpu_id].rcu.reclaimer_thr != NULL);
320
321 if (cpus[cpu_id].rcu.reclaimer_thr) {
322 thread_interrupt(cpus[cpu_id].rcu.reclaimer_thr);
323 thread_join(cpus[cpu_id].rcu.reclaimer_thr);
324 thread_detach(cpus[cpu_id].rcu.reclaimer_thr);
325 cpus[cpu_id].rcu.reclaimer_thr = NULL;
326 }
327 }
328
329#ifdef RCU_PREEMPT_PODZIMEK
330 /* Stop the detector and wait. */
331 if (rcu.detector_thr) {
332 thread_interrupt(rcu.detector_thr);
333 thread_join(rcu.detector_thr);
334 thread_detach(rcu.detector_thr);
335 rcu.detector_thr = NULL;
336 }
337#endif
338}
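/*
 * Illustrative sketch (the shutdown caller and its ordering are assumptions,
 * not something this file mandates): a caller that still needs its queued
 * callbacks to run can drain them with rcu_barrier() before stopping RCU,
 * since rcu_stop() leaves outstanding callbacks unprocessed forever.
 *
 *   static void example_rcu_shutdown(void)
 *   {
 *       // Wait until every callback queued so far has been invoked.
 *       rcu_barrier();
 *       // Only then stop the reclaimer (and detector) threads.
 *       rcu_stop();
 *   }
 */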
339
340/** Returns the number of elapsed grace periods since boot. */
341uint64_t rcu_completed_gps(void)
342{
343 spinlock_lock(&rcu.gp_lock);
344 uint64_t completed = rcu.completed_gp;
345 spinlock_unlock(&rcu.gp_lock);
346
347 return completed;
348}
349
350/** Creates and runs cpu-bound reclaimer threads. */
351static void start_reclaimers(void)
352{
353 for (unsigned int cpu_id = 0; cpu_id < config.cpu_count; ++cpu_id) {
354 char name[THREAD_NAME_BUFLEN] = {0};
355
356 snprintf(name, THREAD_NAME_BUFLEN - 1, "rcu-rec/%u", cpu_id);
357
358 cpus[cpu_id].rcu.reclaimer_thr =
359 thread_create(reclaimer, NULL, TASK, THREAD_FLAG_NONE, name);
360
361 if (!cpus[cpu_id].rcu.reclaimer_thr)
362 panic("Failed to create RCU reclaimer thread on cpu%u.", cpu_id);
363
364 thread_wire(cpus[cpu_id].rcu.reclaimer_thr, &cpus[cpu_id]);
365 thread_ready(cpus[cpu_id].rcu.reclaimer_thr);
366 }
367}
368
369#ifdef RCU_PREEMPT_PODZIMEK
370
371/** Starts the detector thread. */
372static void start_detector(void)
373{
374 rcu.detector_thr =
375 thread_create(detector, NULL, TASK, THREAD_FLAG_NONE, "rcu-det");
376
377 if (!rcu.detector_thr)
378 panic("Failed to create RCU detector thread.");
379
380 thread_ready(rcu.detector_thr);
381}
382
383/** Returns true if in an rcu reader section. */
384bool rcu_read_locked(void)
385{
386 preemption_disable();
387 bool locked = 0 < CPU->rcu.nesting_cnt;
388 preemption_enable();
389
390 return locked;
391}
392
393/** Unlocks the local reader section using the given nesting count.
394 *
395 * Preemption or interrupts must be disabled.
396 *
397 * @param pnesting_cnt Either &CPU->rcu.tmp_nesting_cnt or
398 * &THREAD->rcu.nesting_cnt.
399 */
400static void read_unlock_impl(size_t *pnesting_cnt)
401{
402 ASSERT(PREEMPTION_DISABLED || interrupts_disabled());
403
404 if (0 == --(*pnesting_cnt)) {
405 _rcu_record_qs();
406
407 /*
408 * The thread was preempted while in a critical section or
409 * the detector is eagerly waiting for this cpu's reader
410 * to finish.
411 *
412 * Note that THREAD may be NULL in scheduler() and not just during boot.
413 */
414 if ((THREAD && THREAD->rcu.was_preempted) || CPU->rcu.is_delaying_gp) {
415 /* Rechecks with disabled interrupts. */
416 _rcu_signal_read_unlock();
417 }
418 }
419}
420
421/** If necessary, signals the detector that we exited a reader section. */
422void _rcu_signal_read_unlock(void)
423{
424 ASSERT(PREEMPTION_DISABLED || interrupts_disabled());
425
426 /* todo: make NMI safe with cpu-local atomic ops. */
427
428 /*
429 * We have to disable interrupts in order to make checking
430 * and resetting was_preempted and is_delaying_gp atomic
431 * with respect to local interrupt handlers. Otherwise
432 * an interrupt could beat us to calling semaphore_up()
433 * before we reset the appropriate flag.
434 */
435 ipl_t ipl = interrupts_disable();
436
437 /*
438 * If the detector is eagerly waiting for this cpu's reader to unlock,
439 * notify it that the reader did so.
440 */
441 if (CPU->rcu.is_delaying_gp) {
442 CPU->rcu.is_delaying_gp = false;
443 semaphore_up(&rcu.remaining_readers);
444 }
445
446 /*
447 * This reader was preempted while in a reader section.
448 * We might be holding up the current GP. Notify the
449 * detector if so.
450 */
451 if (THREAD && THREAD->rcu.was_preempted) {
452 ASSERT(link_used(&THREAD->rcu.preempt_link));
453 THREAD->rcu.was_preempted = false;
454
455 rm_preempted_reader();
456 }
457
458 /* If there was something to signal to the detector we have done so. */
459 CPU->rcu.signal_unlock = false;
460
461 interrupts_restore(ipl);
462}
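/*
 * For orientation: the read-side primitives that pair with the code above
 * are inline functions in rcu.h. Conceptually (a rough sketch under that
 * assumption, not the literal header code) they reduce to manipulating the
 * cpu-local nesting count that read_unlock_impl() decrements:
 *
 *   // enter a reader section
 *   preemption_disable();
 *   ++CPU->rcu.nesting_cnt;
 *   preemption_enable();
 *
 *   // leave it; the outermost unlock records a quiescent state and,
 *   // if the detector asked for it, signals remaining_readers
 *   preemption_disable();
 *   read_unlock_impl(&CPU->rcu.nesting_cnt);
 *   preemption_enable();
 */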
463
464#endif /* RCU_PREEMPT_PODZIMEK */
465
466typedef struct synch_item {
467 waitq_t wq;
468 rcu_item_t rcu_item;
469} synch_item_t;
470
471/** Blocks until all preexisting readers exit their critical sections. */
472void rcu_synchronize(void)
473{
474 _rcu_synchronize(false);
475}
476
477/** Blocks until all preexisting readers exit their critical sections. */
478void rcu_synchronize_expedite(void)
479{
480 _rcu_synchronize(true);
481}
482
483/** Blocks until all preexisting readers exit their critical sections. */
484void _rcu_synchronize(bool expedite)
485{
486 /* Calling from a reader section will deadlock. */
487 ASSERT(!rcu_read_locked());
488
489 synch_item_t completion;
490
491 waitq_initialize(&completion.wq);
492 _rcu_call(expedite, &completion.rcu_item, synch_complete);
493 waitq_sleep(&completion.wq);
494 waitq_complete_wakeup(&completion.wq);
495}
496
497/** rcu_synchronize's callback. */
498static void synch_complete(rcu_item_t *rcu_item)
499{
500 synch_item_t *completion = member_to_inst(rcu_item, synch_item_t, rcu_item);
501 ASSERT(completion);
502 waitq_wakeup(&completion->wq, WAKEUP_FIRST);
503}
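/*
 * Typical updater pattern built on rcu_synchronize() (illustrative sketch;
 * example_node_t, example_list and example_lock are hypothetical):
 *
 *   static void example_remove_node(example_node_t *node)
 *   {
 *       irq_spinlock_lock(&example_lock, true);
 *       list_remove(&node->link);       // unpublish the node
 *       irq_spinlock_unlock(&example_lock, true);
 *
 *       rcu_synchronize();              // wait out preexisting readers
 *
 *       free(node);                     // no reader can still hold it
 *   }
 *
 * Readers walk example_list between rcu_read_lock() and rcu_read_unlock()
 * and therefore never dereference the node once it has been freed here.
 */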
504
505/** Waits for all outstanding rcu calls to complete. */
506void rcu_barrier(void)
507{
508 /*
509 * Serialize rcu_barrier() calls so we don't overwrite cpu.barrier_item
510 * currently in use by rcu_barrier().
511 */
512 mutex_lock(&rcu.barrier_mtx);
513
514 /*
515 * Ensure we queue a barrier callback on all cpus before the already
516 * enqueued barrier callbacks start signaling completion.
517 */
518 atomic_set(&rcu.barrier_wait_cnt, 1);
519
520 DEFINE_CPU_MASK(cpu_mask);
521 cpu_mask_active(cpu_mask);
522
523 cpu_mask_for_each(*cpu_mask, cpu_id) {
524 smp_call(cpu_id, add_barrier_cb, NULL);
525 }
526
527 if (0 < atomic_predec(&rcu.barrier_wait_cnt)) {
528 waitq_sleep(&rcu.barrier_wq);
529 }
530
531 mutex_unlock(&rcu.barrier_mtx);
532}
533
534/** Issues a rcu_barrier() callback on the local cpu.
535 *
536 * Executed with interrupts disabled.
537 */
538static void add_barrier_cb(void *arg)
539{
540 ASSERT(interrupts_disabled() || PREEMPTION_DISABLED);
541 atomic_inc(&rcu.barrier_wait_cnt);
542 rcu_call(&CPU->rcu.barrier_item, barrier_complete);
543}
544
545/** Local cpu's rcu_barrier() completion callback. */
546static void barrier_complete(rcu_item_t *barrier_item)
547{
548 /* Is this the last barrier callback completed? */
549 if (0 == atomic_predec(&rcu.barrier_wait_cnt)) {
550 /* Notify rcu_barrier() that we're done. */
551 waitq_wakeup(&rcu.barrier_wq, WAKEUP_FIRST);
552 }
553}
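/*
 * rcu_barrier() is what makes teardown of a callback-producing subsystem
 * safe (illustrative sketch; the example_* names are hypothetical):
 *
 *   static void example_cache_destroy(void)
 *   {
 *       example_cache_disable();   // stop issuing new rcu_call()s
 *       rcu_barrier();             // every already-queued callback has run
 *       example_release_backing(); // callbacks no longer touch the cache
 *   }
 */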
554
555/** Adds a callback to invoke after all preexisting readers finish.
556 *
557 * May be called from within interrupt handlers or RCU reader sections.
558 *
559 * @param rcu_item Used by RCU to track the call. Must remain
560 * until the user callback function is entered.
561 * @param func User callback function that will be invoked once a full
562 * grace period elapsed, ie at a time when all preexisting
563 * readers have finished. The callback should be short and must
564 * not block. If you must sleep, enqueue your work in the system
565 * work queue from the callback (ie workq_global_enqueue()).
566 */
567void rcu_call(rcu_item_t *rcu_item, rcu_func_t func)
568{
569 rcu_call_impl(false, rcu_item, func);
570}
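/*
 * Minimal rcu_call() usage sketch. example_item_t and example_reclaim() are
 * hypothetical; member_to_inst() and free() are primitives already used
 * elsewhere in the kernel:
 *
 *   typedef struct example_item {
 *       int payload;
 *       rcu_item_t rcu;            // storage RCU uses to track the call
 *   } example_item_t;
 *
 *   static void example_reclaim(rcu_item_t *rcu_item)
 *   {
 *       example_item_t *it = member_to_inst(rcu_item, example_item_t, rcu);
 *       free(it);                  // runs only after a full grace period
 *   }
 *
 *   // Updater side, eg right after unlinking the item from a shared list:
 *   //   rcu_call(&item->rcu, example_reclaim);
 */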
571
572/** rcu_call() implementation. See rcu_call() for comments. */
573void _rcu_call(bool expedite, rcu_item_t *rcu_item, rcu_func_t func)
574{
575 rcu_call_impl(expedite, rcu_item, func);
576}
577
578/** rcu_call() inline-able implementation. See rcu_call() for comments. */
579static inline void rcu_call_impl(bool expedite, rcu_item_t *rcu_item,
580 rcu_func_t func)
581{
582 ASSERT(rcu_item);
583
584 rcu_item->func = func;
585 rcu_item->next = NULL;
586
587 preemption_disable();
588
589 ipl_t ipl = interrupts_disable();
590
591 rcu_cpu_data_t *r = &CPU->rcu;
592 *r->parriving_cbs_tail = rcu_item;
593 r->parriving_cbs_tail = &rcu_item->next;
594
595 size_t cnt = ++r->arriving_cbs_cnt;
596 interrupts_restore(ipl);
597
598 if (expedite) {
599 r->expedite_arriving = true;
600 }
601
602 /* Added first callback - notify the reclaimer. */
603 if (cnt == 1 && !semaphore_count_get(&r->arrived_flag)) {
604 semaphore_up(&r->arrived_flag);
605 }
606
607 preemption_enable();
608}
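/*
 * The arriving_cbs queue manipulated above is the classic "pointer to the
 * last next field" singly-linked queue: appending is O(1) and draining it
 * is two stores. The same idiom in isolation (example_queue_t is
 * hypothetical):
 *
 *   typedef struct example_queue {
 *       rcu_item_t *head;          // NULL when empty
 *       rcu_item_t **ptail_next;   // &head when empty, else &last->next
 *   } example_queue_t;
 *
 *   static void example_append(example_queue_t *q, rcu_item_t *item)
 *   {
 *       item->next = NULL;
 *       *q->ptail_next = item;         // link behind the current tail
 *       q->ptail_next = &item->next;   // item becomes the new tail
 *   }
 *
 * advance_cbs() below resets (arriving_cbs, parriving_cbs_tail) to
 * (NULL, &arriving_cbs), which is exactly this sketch's empty state.
 */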
609
610static bool cur_cbs_empty(void)
611{
612 ASSERT(THREAD && THREAD->wired);
613 return NULL == CPU->rcu.cur_cbs;
614}
615
616static bool next_cbs_empty(void)
617{
618 ASSERT(THREAD && THREAD->wired);
619 return NULL == CPU->rcu.next_cbs;
620}
621
622/** Disable interrupts to get an up-to-date result. */
623static bool arriving_cbs_empty(void)
624{
625 ASSERT(THREAD && THREAD->wired);
626 /*
627 * Accessing with interrupts enabled may at worst lead to
628 * a false negative if we race with a local interrupt handler.
629 */
630 return NULL == CPU->rcu.arriving_cbs;
631}
632
633static bool all_cbs_empty(void)
634{
635 return cur_cbs_empty() && next_cbs_empty() && arriving_cbs_empty();
636}
637
638
639/** Reclaimer thread dispatches locally queued callbacks once a GP ends. */
640static void reclaimer(void *arg)
641{
642 ASSERT(THREAD && THREAD->wired);
643 ASSERT(THREAD == CPU->rcu.reclaimer_thr);
644
645 rcu_gp_t last_compl_gp = 0;
646 bool ok = true;
647
648 while (ok && wait_for_pending_cbs()) {
649 ASSERT(CPU->rcu.reclaimer_thr == THREAD);
650
651 exec_completed_cbs(last_compl_gp);
652
653 bool expedite = advance_cbs();
654
655 ok = wait_for_cur_cbs_gp_end(expedite, &last_compl_gp);
656 }
657}
658
659/** Waits until there are callbacks waiting to be dispatched. */
660static bool wait_for_pending_cbs(void)
661{
662 if (!all_cbs_empty())
663 return true;
664
665 bool ok = true;
666
667 while (arriving_cbs_empty() && ok) {
668 ok = semaphore_down_interruptable(&CPU->rcu.arrived_flag);
669 }
670
671 return ok;
672}
673
674static void upd_stat_missed_gp(rcu_gp_t compl)
675{
676 if (CPU->rcu.cur_cbs_gp < compl) {
677 CPU->rcu.stat_missed_gps += (size_t)(compl - CPU->rcu.cur_cbs_gp);
678 }
679}
680
681/** Executes all callbacks for the given completed grace period. */
682static void exec_completed_cbs(rcu_gp_t last_completed_gp)
683{
684 upd_stat_missed_gp(last_completed_gp);
685
686 /* Both next_cbs and cur_cbs GP elapsed. */
687 if (CPU->rcu.next_cbs_gp <= last_completed_gp) {
688 ASSERT(CPU->rcu.cur_cbs_gp <= CPU->rcu.next_cbs_gp);
689
690 size_t exec_cnt = CPU->rcu.cur_cbs_cnt + CPU->rcu.next_cbs_cnt;
691
692 if (exec_cnt < CRITICAL_THRESHOLD) {
693 exec_cbs(&CPU->rcu.cur_cbs);
694 exec_cbs(&CPU->rcu.next_cbs);
695 } else {
696 /*
697 * Getting overwhelmed with too many callbacks to run.
698 * Disable preemption in order to prolong our time slice
699 * and catch up with updaters posting new callbacks.
700 */
701 preemption_disable();
702 exec_cbs(&CPU->rcu.cur_cbs);
703 exec_cbs(&CPU->rcu.next_cbs);
704 preemption_enable();
705 }
706
707 CPU->rcu.cur_cbs_cnt = 0;
708 CPU->rcu.next_cbs_cnt = 0;
709 } else if (CPU->rcu.cur_cbs_gp <= last_completed_gp) {
710
711 if (CPU->rcu.cur_cbs_cnt < CRITICAL_THRESHOLD) {
712 exec_cbs(&CPU->rcu.cur_cbs);
713 } else {
714 /*
715 * Getting overwhelmed with too many callbacks to run.
716 * Disable preemption in order to prolong our time slice
717 * and catch up with updaters posting new callbacks.
718 */
719 preemption_disable();
720 exec_cbs(&CPU->rcu.cur_cbs);
721 preemption_enable();
722 }
723
724 CPU->rcu.cur_cbs_cnt = 0;
725 }
726}
727
728/** Executes callbacks in the singly-linked list. The list is left empty. */
729static void exec_cbs(rcu_item_t **phead)
730{
731 rcu_item_t *rcu_item = *phead;
732
733 while (rcu_item) {
734 /* func() may free rcu_item. Get a local copy. */
735 rcu_item_t *next = rcu_item->next;
736 rcu_func_t func = rcu_item->func;
737
738 func(rcu_item);
739
740 rcu_item = next;
741 }
742
743 *phead = NULL;
744}
745
746static void upd_stat_cb_cnts(size_t arriving_cnt)
747{
748 CPU->rcu.stat_max_cbs = max(arriving_cnt, CPU->rcu.stat_max_cbs);
749 if (0 < arriving_cnt) {
750 CPU->rcu.stat_avg_cbs =
751 (99 * CPU->rcu.stat_avg_cbs + 1 * arriving_cnt) / 100;
752 }
753}
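/*
 * The average above is an exponential moving average with weight 1/100,
 * ie stat_avg_cbs_new = (99 * stat_avg_cbs_old + arriving_cnt) / 100 in
 * integer arithmetic, so old batches decay geometrically and the value
 * tracks the recent callback load without storing any history.
 */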
754
755/** Prepares another batch of callbacks to dispatch at the next grace period.
756 *
757 * @return True if the next batch of callbacks must be expedited quickly.
758 */
759static bool advance_cbs(void)
760{
761 /* Move next_cbs to cur_cbs. */
762 CPU->rcu.cur_cbs = CPU->rcu.next_cbs;
763 CPU->rcu.cur_cbs_cnt = CPU->rcu.next_cbs_cnt;
764 CPU->rcu.cur_cbs_gp = CPU->rcu.next_cbs_gp;
765
766 /* Move arriving_cbs to next_cbs. Empties arriving_cbs. */
767 ipl_t ipl = interrupts_disable();
768
769 /*
770 * Too many callbacks queued. Better speed up the detection
771 * or risk exhausting all system memory.
772 */
773 bool expedite = (EXPEDITE_THRESHOLD < CPU->rcu.arriving_cbs_cnt)
774 || CPU->rcu.expedite_arriving;
775
776 CPU->rcu.expedite_arriving = false;
777
778 CPU->rcu.next_cbs = CPU->rcu.arriving_cbs;
779 CPU->rcu.next_cbs_cnt = CPU->rcu.arriving_cbs_cnt;
780
781 CPU->rcu.arriving_cbs = NULL;
782 CPU->rcu.parriving_cbs_tail = &CPU->rcu.arriving_cbs;
783 CPU->rcu.arriving_cbs_cnt = 0;
784
785 interrupts_restore(ipl);
786
787 /* Update statistics of arrived callbacks. */
788 upd_stat_cb_cnts(CPU->rcu.next_cbs_cnt);
789
790 /*
791 * Make changes prior to queuing next_cbs visible to readers.
792 * See comment in wait_for_readers().
793 */
794 memory_barrier(); /* MB A, B */
795
796 /* At the end of next_cbs_gp, exec next_cbs. Determine what GP that is. */
797
798 if (!next_cbs_empty()) {
799 spinlock_lock(&rcu.gp_lock);
800
801 /* Exec next_cbs at the end of the next GP. */
802 CPU->rcu.next_cbs_gp = _rcu_cur_gp + 1;
803
804 /*
805 * There are no callbacks to invoke before next_cbs. Instruct
806 * wait_for_cur_cbs_gp() to notify us of the nearest GP end.
807 * That could be sooner than next_cbs_gp (if the current GP
808 * had not yet completed), so we'll create a shorter batch
809 * of callbacks next time around.
810 */
811 if (cur_cbs_empty()) {
812 CPU->rcu.cur_cbs_gp = rcu.completed_gp + 1;
813 }
814
815 spinlock_unlock(&rcu.gp_lock);
816 } else {
817 CPU->rcu.next_cbs_gp = CPU->rcu.cur_cbs_gp;
818 }
819
820 ASSERT(CPU->rcu.cur_cbs_gp <= CPU->rcu.next_cbs_gp);
821
822 return expedite;
823}
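/*
 * Summary of the batch pipeline that advance_cbs() drives, one stage per
 * reclaimer iteration:
 *
 *   rcu_call() --> arriving_cbs --advance_cbs()--> next_cbs
 *              --advance_cbs()--> cur_cbs --exec_completed_cbs()--> invoked
 *
 * with cur_cbs waiting for cur_cbs_gp and next_cbs for next_cbs_gp, and
 * cur_cbs_gp <= next_cbs_gp <= _rcu_cur_gp + 1 holding throughout.
 */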
824
825
826#ifdef RCU_PREEMPT_A
827
828/** Waits for the grace period associated with callbacks cur_cbs to elapse.
829 *
830 * @param expedite Instructs the detector to aggressively speed up grace
831 * period detection without any delay.
832 * @param completed_gp Returns the most recent completed grace period
833 * number.
834 * @return false if the thread was interrupted and should stop.
835 */
836static bool wait_for_cur_cbs_gp_end(bool expedite, rcu_gp_t *completed_gp)
837{
838 spinlock_lock(&rcu.gp_lock);
839
840 ASSERT(CPU->rcu.cur_cbs_gp <= CPU->rcu.next_cbs_gp);
841 ASSERT(CPU->rcu.cur_cbs_gp <= _rcu_cur_gp + 1);
842
843 while (rcu.completed_gp < CPU->rcu.cur_cbs_gp) {
844 /* GP has not yet started - start a new one. */
845 if (rcu.completed_gp == _rcu_cur_gp) {
846 start_new_gp();
847 spinlock_unlock(&rcu.gp_lock);
848
849 if (!wait_for_readers(expedite))
850 return false;
851
852 spinlock_lock(&rcu.gp_lock);
853 /* Notify any reclaimers this GP had ended. */
854 rcu.completed_gp = _rcu_cur_gp;
855 condvar_broadcast(&rcu.gp_ended);
856 } else {
857 /* GP detection is in progress.*/
858
859 if (expedite)
860 condvar_signal(&rcu.expedite_now);
861
862 /* Wait for the GP to complete. */
863 int ret = _condvar_wait_timeout_spinlock(&rcu.gp_ended,
864 &rcu.gp_lock, SYNCH_NO_TIMEOUT, SYNCH_FLAGS_INTERRUPTIBLE);
865
866 if (ret == ESYNCH_INTERRUPTED) {
867 spinlock_unlock(&rcu.gp_lock);
868 return false;
869 }
870 }
871 }
872
873 upd_missed_gp_in_wait(rcu.completed_gp);
874
875 *completed_gp = rcu.completed_gp;
876 spinlock_unlock(&rcu.gp_lock);
877
878 return true;
879}
880
881static bool wait_for_readers(bool expedite)
882{
883 DEFINE_CPU_MASK(reader_cpus);
884
885 cpu_mask_active(reader_cpus);
886 rm_quiescent_cpus(reader_cpus);
887
888 while (!cpu_mask_is_none(reader_cpus)) {
889 /* Give cpus a chance to context switch (a QS) and batch callbacks. */
890 if (!gp_sleep(&expedite))
891 return false;
892
893 rm_quiescent_cpus(reader_cpus);
894 sample_cpus(reader_cpus, reader_cpus);
895 }
896
897 /* Update statistic. */
898 if (expedite) {
899 ++rcu.stat_expedited_cnt;
900 }
901
902 /*
903 * All cpus have passed through a QS and see the most recent _rcu_cur_gp.
904 * As a result newly preempted readers will associate with next_preempted
905 * and the number of old readers in cur_preempted will monotonically
906 * decrease. Wait for those old/preexisting readers.
907 */
908 return wait_for_preempt_reader();
909}
910
911static bool gp_sleep(bool *expedite)
912{
913 if (*expedite) {
914 scheduler();
915 return true;
916 } else {
917 spinlock_lock(&rcu.gp_lock);
918
919 int ret = 0;
920 ret = _condvar_wait_timeout_spinlock(&rcu.expedite_now, &rcu.gp_lock,
921 DETECT_SLEEP_MS * 1000, SYNCH_FLAGS_INTERRUPTIBLE);
922
923 /* rcu.expedite_now was signaled. */
924 if (ret == ESYNCH_OK_BLOCKED) {
925 *expedite = true;
926 }
927
928 spinlock_unlock(&rcu.gp_lock);
929
930 return (ret != ESYNCH_INTERRUPTED);
931 }
932}
933
934static void sample_local_cpu(void *arg)
935{
936 ASSERT(interrupts_disabled());
937 cpu_mask_t *reader_cpus = (cpu_mask_t *)arg;
938
939 bool locked = RCU_CNT_INC <= THE->rcu_nesting;
940 bool passed_qs = (CPU->rcu.last_seen_gp == _rcu_cur_gp);
941
942 if (locked && !passed_qs) {
943 /*
944 * This cpu has not yet passed a quiescent state during this grace
945 * period and it is currently in a reader section. We'll have to
946 * try to sample this cpu again later.
947 */
948 } else {
949 /* Either not in a reader section or already passed a QS. */
950 cpu_mask_reset(reader_cpus, CPU->id);
951 /* Contain new reader sections and make prior changes visible to them.*/
952 memory_barrier();
953 CPU->rcu.last_seen_gp = _rcu_cur_gp;
954 }
955}
956
957/** Called by the scheduler() when switching away from the current thread. */
958void rcu_after_thread_ran(void)
959{
960 ASSERT(interrupts_disabled());
961
962 /* Preempted a reader critical section for the first time. */
963 if (rcu_read_locked() && !(THE->rcu_nesting & RCU_WAS_PREEMPTED)) {
964 THE->rcu_nesting |= RCU_WAS_PREEMPTED;
965 note_preempted_reader();
966 }
967
968 /* Save the thread's nesting count when it is not running. */
969 THREAD->rcu.nesting_cnt = THE->rcu_nesting;
970
971 /* Clear rcu_nesting only after noting that a thread was preempted. */
972 compiler_barrier();
973 THE->rcu_nesting = 0;
974
975 if (CPU->rcu.last_seen_gp != _rcu_cur_gp) {
976 /*
977 * Contain any memory accesses of old readers before announcing a QS.
978 * Also make changes from the previous GP visible to this cpu.
979 */
980 memory_barrier();
981 /*
982 * The preempted reader has been noted globally. There are therefore
983 * no readers running on this cpu so this is a quiescent state.
984 */
985 CPU->rcu.last_seen_gp = _rcu_cur_gp;
986 }
987
988 /*
989 * Forcefully associate the reclaimer with the highest priority
990 * even if preempted due to its time slice running out.
991 */
992 if (THREAD == CPU->rcu.reclaimer_thr) {
993 THREAD->priority = -1;
994 }
995
996 upd_max_cbs_in_slice();
997}
998
999/** Called by the scheduler() when switching to a newly scheduled thread. */
1000void rcu_before_thread_runs(void)
1001{
1002 ASSERT(PREEMPTION_DISABLED || interrupts_disabled());
1003 ASSERT(!rcu_read_locked());
1004
1005 /* Load the thread's saved nesting count from before it was preempted. */
1006 THE->rcu_nesting = THREAD->rcu.nesting_cnt;
1007}
1008
1009/** Called from scheduler() when exiting the current thread.
1010 *
1011 * Preemption or interrupts are disabled and the scheduler() already
1012 * switched away from the current thread, calling rcu_after_thread_ran().
1013 */
1014void rcu_thread_exiting(void)
1015{
1016 ASSERT(PREEMPTION_DISABLED || interrupts_disabled());
1017 /*
1018 * The thread forgot to exit its reader critical section.
1019 * It is a bug, but rather than letting the entire system lock up,
1020 * forcefully leave the reader section. The thread is not holding
1021 * any references anyway since it is exiting, so it is safe.
1022 */
1023 if (RCU_CNT_INC <= THREAD->rcu.nesting_cnt) {
1024 /* Emulate _rcu_preempted_unlock() with the proper nesting count. */
1025 if (THREAD->rcu.nesting_cnt & RCU_WAS_PREEMPTED) {
1026 ipl_t ipl = interrupts_disable();
1027 rm_preempted_reader();
1028 interrupts_restore(ipl);
1029 }
1030
1031 printf("Bug: thread (id %" PRIu64 " \"%s\") exited while in RCU read"
1032 " section.\n", THREAD->tid, THREAD->name);
1033 }
1034}
1035
1036/** Returns true if in an rcu reader section. */
1037bool rcu_read_locked(void)
1038{
1039 return RCU_CNT_INC <= THE->rcu_nesting;
1040}
1041
1042/** Invoked when a preempted reader finally exits its reader section. */
1043void _rcu_preempted_unlock(void)
1044{
1045 ipl_t ipl = interrupts_disable();
1046
1047 if (THE->rcu_nesting == RCU_WAS_PREEMPTED) {
1048 THE->rcu_nesting = 0;
1049 rm_preempted_reader();
1050 }
1051
1052 interrupts_restore(ipl);
1053}
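/*
 * In this variant THE->rcu_nesting packs both pieces of reader state into a
 * single word: the nesting depth counted in multiples of RCU_CNT_INC plus
 * the RCU_WAS_PREEMPTED flag below it (both come from rcu.h; their exact
 * values are assumed here, not restated). A sketch of how the word is read,
 * assuming RCU_CNT_INC is a power of two above the flag bit:
 *
 *   size_t word = THE->rcu_nesting;
 *   size_t depth = word / RCU_CNT_INC;               // reader nesting depth
 *   bool was_preempted = word & RCU_WAS_PREEMPTED;   // preempted in a CS?
 *
 * Hence rcu_read_locked() tests RCU_CNT_INC <= THE->rcu_nesting, and
 * _rcu_preempted_unlock() checks for the exact value RCU_WAS_PREEMPTED
 * (flag set, depth zero).
 */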
1054
1055#elif defined(RCU_PREEMPT_PODZIMEK)
1056
1057/** Waits for the grace period associated with callbacks cur_cbs to elapse.
1058 *
1059 * @param expedite Instructs the detector to aggressively speed up grace
1060 * period detection without any delay.
1061 * @param completed_gp Returns the most recent completed grace period
1062 * number.
1063 * @return false if the thread was interrupted and should stop.
1064 */
1065static bool wait_for_cur_cbs_gp_end(bool expedite, rcu_gp_t *completed_gp)
1066{
1067 /*
1068 * Use a possibly outdated version of completed_gp to bypass checking
1069 * with the lock.
1070 *
1071 * Note that loading and storing rcu.completed_gp is not atomic
1072 * (it is 64bit wide). Reading a clobbered value that is less than
1073 * rcu.completed_gp is harmless - we'll recheck with a lock. The
1074 * only way to read a clobbered value that is greater than the actual
1075 * value is if the detector increases the higher-order word first and
1076 * then decreases the lower-order word (or we see stores in that order),
1077 * eg when incrementing from 2^32 - 1 to 2^32. The loaded value
1078 * suddenly jumps by 2^32. It would take hours for such an increase
1079 * to occur so it is safe to discard the value. We allow increases
1080 * of up to half the maximum to generously accommodate for loading an
1081 * outdated lower word.
1082 */
1083 rcu_gp_t compl_gp = ACCESS_ONCE(rcu.completed_gp);
1084 if (CPU->rcu.cur_cbs_gp <= compl_gp
1085 && compl_gp <= CPU->rcu.cur_cbs_gp + UINT32_MAX_HALF) {
1086 *completed_gp = compl_gp;
1087 return true;
1088 }
1089
1090 spinlock_lock(&rcu.gp_lock);
1091
1092 if (CPU->rcu.cur_cbs_gp <= rcu.completed_gp) {
1093 *completed_gp = rcu.completed_gp;
1094 spinlock_unlock(&rcu.gp_lock);
1095 return true;
1096 }
1097
1098 ASSERT(CPU->rcu.cur_cbs_gp <= CPU->rcu.next_cbs_gp);
1099 ASSERT(_rcu_cur_gp <= CPU->rcu.cur_cbs_gp);
1100
1101 /*
1102 * Notify the detector of how many GP ends we intend to wait for, so
1103 * it can avoid going to sleep unnecessarily. Optimistically assume
1104 * new callbacks will arrive while we're waiting; hence +1.
1105 */
1106 size_t remaining_gp_ends = (size_t) (CPU->rcu.next_cbs_gp - _rcu_cur_gp);
1107 req_detection(remaining_gp_ends + (arriving_cbs_empty() ? 0 : 1));
1108
1109 /*
1110 * Ask the detector to speed up GP detection if there are too many
1111 * pending callbacks and other reclaimers have not already done so.
1112 */
1113 if (expedite) {
1114 if (0 == rcu.req_expedited_cnt)
1115 condvar_signal(&rcu.expedite_now);
1116
1117 /*
1118 * Expedite only cur_cbs. If there really is a surge of callbacks,
1119 * the arriving batch will expedite the GP for the huge number
1120 * of callbacks currently in next_cbs.
1121 */
1122 rcu.req_expedited_cnt = 1;
1123 }
1124
1125 /* Wait for cur_cbs_gp to end. */
1126 bool interrupted = cv_wait_for_gp(CPU->rcu.cur_cbs_gp);
1127
1128 *completed_gp = rcu.completed_gp;
1129 spinlock_unlock(&rcu.gp_lock);
1130
1131 if (!interrupted)
1132 upd_missed_gp_in_wait(*completed_gp);
1133
1134 return !interrupted;
1135}
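/*
 * The unlocked fast path above tolerates torn reads of the 64bit
 * rcu.completed_gp by only trusting values within half the uint32 range
 * ahead of cur_cbs_gp. The check in isolation:
 *
 *   rcu_gp_t observed = ACCESS_ONCE(rcu.completed_gp);
 *   bool plausible = (CPU->rcu.cur_cbs_gp <= observed)
 *       && (observed <= CPU->rcu.cur_cbs_gp + UINT32_MAX_HALF);
 *   // A clobbered read that jumps ahead by ~2^32 fails the second test
 *   // and we fall back to rechecking under rcu.gp_lock.
 */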
1136
1137/** Waits for an announcement of the end of the grace period wait_on_gp. */
1138static bool cv_wait_for_gp(rcu_gp_t wait_on_gp)
1139{
1140 ASSERT(spinlock_locked(&rcu.gp_lock));
1141
1142 bool interrupted = false;
1143
1144 /* Wait until wait_on_gp ends. */
1145 while (rcu.completed_gp < wait_on_gp && !interrupted) {
1146 int ret = _condvar_wait_timeout_spinlock(&rcu.gp_ended, &rcu.gp_lock,
1147 SYNCH_NO_TIMEOUT, SYNCH_FLAGS_INTERRUPTIBLE);
1148 interrupted = (ret == ESYNCH_INTERRUPTED);
1149 }
1150
1151 return interrupted;
1152}
1153
1154/** Requests the detector to detect at least req_cnt consecutive grace periods.*/
1155static void req_detection(size_t req_cnt)
1156{
1157 if (rcu.req_gp_end_cnt < req_cnt) {
1158 bool detector_idle = (0 == rcu.req_gp_end_cnt);
1159 rcu.req_gp_end_cnt = req_cnt;
1160
1161 if (detector_idle) {
1162 ASSERT(_rcu_cur_gp == rcu.completed_gp);
1163 condvar_signal(&rcu.req_gp_changed);
1164 }
1165 }
1166}
1167
1168
1169/** The detector thread detects and notifies reclaimers of grace period ends. */
1170static void detector(void *arg)
1171{
1172 spinlock_lock(&rcu.gp_lock);
1173
1174 while (wait_for_detect_req()) {
1175 /*
1176 * Announce new GP started. Readers start lazily acknowledging that
1177 * they passed a QS.
1178 */
1179 start_new_gp();
1180
1181 spinlock_unlock(&rcu.gp_lock);
1182
1183 if (!wait_for_readers())
1184 goto unlocked_out;
1185
1186 spinlock_lock(&rcu.gp_lock);
1187
1188 /* Notify reclaimers that they may now invoke queued callbacks. */
1189 end_cur_gp();
1190 }
1191
1192 spinlock_unlock(&rcu.gp_lock);
1193
1194unlocked_out:
1195 return;
1196}
1197
1198/** Waits for a request from a reclaimer thread to detect a grace period. */
1199static bool wait_for_detect_req(void)
1200{
1201 ASSERT(spinlock_locked(&rcu.gp_lock));
1202
1203 bool interrupted = false;
1204
1205 while (0 == rcu.req_gp_end_cnt && !interrupted) {
1206 int ret = _condvar_wait_timeout_spinlock(&rcu.req_gp_changed,
1207 &rcu.gp_lock, SYNCH_NO_TIMEOUT, SYNCH_FLAGS_INTERRUPTIBLE);
1208
1209 interrupted = (ret == ESYNCH_INTERRUPTED);
1210 }
1211
1212 return !interrupted;
1213}
1214
1215
1216static void end_cur_gp(void)
1217{
1218 ASSERT(spinlock_locked(&rcu.gp_lock));
1219
1220 rcu.completed_gp = _rcu_cur_gp;
1221 --rcu.req_gp_end_cnt;
1222
1223 condvar_broadcast(&rcu.gp_ended);
1224}
1225
1226/** Waits for readers that started before the current GP started to finish. */
1227static bool wait_for_readers(void)
1228{
1229 DEFINE_CPU_MASK(reading_cpus);
1230
1231 /* All running cpus have potential readers. */
1232 cpu_mask_active(reading_cpus);
1233
1234 /*
1235 * Give readers time to pass through a QS. Also, batch arriving
1236 * callbacks in order to amortize detection overhead.
1237 */
1238 if (!gp_sleep())
1239 return false;
1240
1241 /* Non-intrusively determine which cpus have yet to pass a QS. */
1242 rm_quiescent_cpus(reading_cpus);
1243
1244 /* Actively interrupt cpus delaying the current GP and demand a QS. */
1245 interrupt_delaying_cpus(reading_cpus);
1246
1247 /* Wait for the interrupted cpus to notify us that they reached a QS. */
1248 if (!wait_for_delaying_cpus())
1249 return false;
1250 /*
1251 * All cpus recorded a QS or are still idle. Any new readers will be added
1252 * to next_preempted if preempted, ie the number of readers in cur_preempted
1253 * monotonically decreases.
1254 */
1255
1256 /* Wait for the last reader in cur_preempted to notify us it is done. */
1257 if (!wait_for_preempt_reader())
1258 return false;
1259
1260 return true;
1261}
1262
1263/** Sleeps a while if the current grace period is not to be expedited. */
1264static bool gp_sleep(void)
1265{
1266 spinlock_lock(&rcu.gp_lock);
1267
1268 int ret = 0;
1269 while (0 == rcu.req_expedited_cnt && 0 == ret) {
1270 /* minor bug: sleeps for the same duration if woken up spuriously. */
1271 ret = _condvar_wait_timeout_spinlock(&rcu.expedite_now, &rcu.gp_lock,
1272 DETECT_SLEEP_MS * 1000, SYNCH_FLAGS_INTERRUPTIBLE);
1273 }
1274
1275 if (0 < rcu.req_expedited_cnt) {
1276 --rcu.req_expedited_cnt;
1277 /* Update statistic. */
1278 ++rcu.stat_expedited_cnt;
1279 }
1280
1281 spinlock_unlock(&rcu.gp_lock);
1282
1283 return (ret != ESYNCH_INTERRUPTED);
1284}
1285
1286/** Actively interrupts and checks the offending cpus for quiescent states. */
1287static void interrupt_delaying_cpus(cpu_mask_t *cpu_mask)
1288{
1289 atomic_set(&rcu.delaying_cpu_cnt, 0);
1290
1291 sample_cpus(cpu_mask, NULL);
1292}
1293
1294/** Invoked on a cpu delaying grace period detection.
1295 *
1296 * Induces a quiescent state for the cpu, or instructs the remaining
1297 * readers to notify the detector once they finish.
1298 */
1299static void sample_local_cpu(void *arg)
1300{
1301 ASSERT(interrupts_disabled());
1302 ASSERT(!CPU->rcu.is_delaying_gp);
1303
1304 /* Cpu did not pass a quiescent state yet. */
1305 if (CPU->rcu.last_seen_gp != _rcu_cur_gp) {
1306 /* Interrupted a reader in a reader critical section. */
1307 if (0 < CPU->rcu.nesting_cnt) {
1308 ASSERT(!CPU->idle);
1309 /* Note to notify the detector from rcu_read_unlock(). */
1310 CPU->rcu.is_delaying_gp = true;
1311 /*
1312 * Set signal_unlock only after setting is_delaying_gp so
1313 * that NMI handlers do not accidentally clear it in unlock()
1314 * before seeing and acting upon is_delaying_gp.
1315 */
1316 compiler_barrier();
1317 CPU->rcu.signal_unlock = true;
1318
1319 atomic_inc(&rcu.delaying_cpu_cnt);
1320 } else {
1321 /*
1322 * The cpu did not enter any rcu reader sections since
1323 * the start of the current GP. Record a quiescent state.
1324 *
1325 * Or, we interrupted rcu_read_unlock_impl() right before
1326 * it recorded a QS. Record a QS for it. The memory barrier
1327 * contains the reader section's mem accesses before
1328 * updating last_seen_gp.
1329 *
1330 * Or, we interrupted rcu_read_lock() right after it recorded
1331 * a QS for the previous GP but before it got a chance to
1332 * increment its nesting count. The memory barrier again
1333 * stops the CS code from spilling out of the CS.
1334 */
1335 memory_barrier();
1336 CPU->rcu.last_seen_gp = _rcu_cur_gp;
1337 }
1338 } else {
1339 /*
1340 * This cpu already acknowledged that it had passed through
1341 * a quiescent state since the start of cur_gp.
1342 */
1343 }
1344
1345 /*
1346 * smp_call() makes sure any changes propagate back to the caller.
1347 * In particular, it makes the most current last_seen_gp visible
1348 * to the detector.
1349 */
1350}
1351
1352/** Waits for cpus delaying the current grace period if there are any. */
1353static bool wait_for_delaying_cpus(void)
1354{
1355 int delaying_cpu_cnt = atomic_get(&rcu.delaying_cpu_cnt);
1356
1357 for (int i = 0; i < delaying_cpu_cnt; ++i){
1358 if (!semaphore_down_interruptable(&rcu.remaining_readers))
1359 return false;
1360 }
1361
1362 /* Update statistic. */
1363 rcu.stat_delayed_cnt += delaying_cpu_cnt;
1364
1365 return true;
1366}
1367
1368/** Called by the scheduler() when switching away from the current thread. */
1369void rcu_after_thread_ran(void)
1370{
1371 ASSERT(interrupts_disabled());
1372 /* todo: make is_delaying_gp and was_preempted NMI safe via local atomics.*/
1373
1374 /*
1375 * Prevent NMI handlers from interfering. If CPU->rcu.is_delaying_gp is
1376 * set, the detector is notified below; and since the current thread is
1377 * no longer running, there is nothing else to signal to the detector.
1378 */
1379 CPU->rcu.signal_unlock = false;
1380 /* Separates clearing of .signal_unlock from CPU->rcu.nesting_cnt = 0. */
1381 compiler_barrier();
1382
1383 /* Save the thread's nesting count when it is not running. */
1384 THREAD->rcu.nesting_cnt = CPU->rcu.nesting_cnt;
1385 /* Interrupt handlers might use RCU while idle in scheduler(). */
1386 CPU->rcu.nesting_cnt = 0;
1387
1388 /* Preempted a reader critical section for the first time. */
1389 if (0 < THREAD->rcu.nesting_cnt && !THREAD->rcu.was_preempted) {
1390 THREAD->rcu.was_preempted = true;
1391 note_preempted_reader();
1392 }
1393
1394 /*
1395 * The preempted reader has been noted globally. There are therefore
1396 * no readers running on this cpu so this is a quiescent state.
1397 */
1398 _rcu_record_qs();
1399
1400 /*
1401 * This cpu is holding up the current GP. Let the detector know
1402 * it has just passed a quiescent state.
1403 *
1404 * The detector waits separately for preempted readers, so we have
1405 * to notify the detector even if we have just preempted a reader.
1406 */
1407 if (CPU->rcu.is_delaying_gp) {
1408 CPU->rcu.is_delaying_gp = false;
1409 semaphore_up(&rcu.remaining_readers);
1410 }
1411
1412 /*
1413 * Forcefully associate the detector with the highest priority
1414 * even if preempted due to its time slice running out.
1415 *
1416 * todo: Replace with strict scheduler priority classes.
1417 */
1418 if (THREAD == rcu.detector_thr) {
1419 THREAD->priority = -1;
1420 }
1421 else if (THREAD == CPU->rcu.reclaimer_thr) {
1422 THREAD->priority = -1;
1423 }
1424
1425 upd_max_cbs_in_slice();
1426}
1427
1428/** Called by the scheduler() when switching to a newly scheduled thread. */
1429void rcu_before_thread_runs(void)
1430{
1431 ASSERT(PREEMPTION_DISABLED || interrupts_disabled());
1432 ASSERT(0 == CPU->rcu.nesting_cnt);
1433
1434 /* Load the thread's saved nesting count from before it was preempted. */
1435 CPU->rcu.nesting_cnt = THREAD->rcu.nesting_cnt;
1436 /*
1437 * In the unlikely event that an NMI occurs between loading the
1438 * variables and setting signal_unlock, the NMI handler may invoke
1439 * rcu_read_unlock() and clear signal_unlock. In that case we will
1440 * incorrectly overwrite signal_unlock from false to true. This
1441 * situation is benign and the next rcu_read_unlock() will at worst
1442 * needlessly invoke _rcu_signal_read_unlock().
1443 */
1444 CPU->rcu.signal_unlock = THREAD->rcu.was_preempted || CPU->rcu.is_delaying_gp;
1445}
1446
1447/** Called from scheduler() when exiting the current thread.
1448 *
1449 * Preemption or interrupts are disabled and the scheduler() already
1450 * switched away from the current thread, calling rcu_after_thread_ran().
1451 */
1452void rcu_thread_exiting(void)
1453{
1454 ASSERT(THREAD != NULL);
1455 ASSERT(THREAD->state == Exiting);
1456 ASSERT(PREEMPTION_DISABLED || interrupts_disabled());
1457
1458 /*
1459 * The thread forgot to exit its reader critical section.
1460 * It is a bug, but rather than letting the entire system lock up,
1461 * forcefully leave the reader section. The thread is not holding
1462 * any references anyway since it is exiting, so it is safe.
1463 */
1464 if (0 < THREAD->rcu.nesting_cnt) {
1465 THREAD->rcu.nesting_cnt = 1;
1466 read_unlock_impl(&THREAD->rcu.nesting_cnt);
1467
1468 printf("Bug: thread (id %" PRIu64 " \"%s\") exited while in RCU read"
1469 " section.\n", THREAD->tid, THREAD->name);
1470 }
1471}
1472
1473
1474#endif /* RCU_PREEMPT_PODZIMEK */
1475
1476/** Announces the start of a new grace period for preexisting readers to ack. */
1477static void start_new_gp(void)
1478{
1479 ASSERT(spinlock_locked(&rcu.gp_lock));
1480
1481 irq_spinlock_lock(&rcu.preempt_lock, true);
1482
1483 /* Start a new GP. Announce to readers that a quiescent state is needed. */
1484 ++_rcu_cur_gp;
1485
1486 /*
1487 * Readers preempted before the start of this GP (next_preempted)
1488 * are preexisting readers now that a GP started and will hold up
1489 * the current GP until they exit their reader sections.
1490 *
1491 * Preempted readers from the previous GP have finished so
1492 * cur_preempted is empty, but see comment in _rcu_record_qs().
1493 */
1494 list_concat(&rcu.cur_preempted, &rcu.next_preempted);
1495
1496 irq_spinlock_unlock(&rcu.preempt_lock, true);
1497}
1498
1499/** Remove those cpus from the mask that have already passed a quiescent
1500 * state since the start of the current grace period.
1501 */
1502static void rm_quiescent_cpus(cpu_mask_t *cpu_mask)
1503{
1504 /*
1505 * Ensure the announcement of the start of a new GP (ie up-to-date
1506 * cur_gp) propagates to cpus that are just coming out of idle
1507 * mode before we sample their idle state flag.
1508 *
1509 * Cpus guarantee that after they set CPU->idle = true they will not
1510 * execute any RCU reader sections without first setting idle to
1511 * false and issuing a memory barrier. Therefore, if rm_quiescent_cpus()
1512 * later on sees an idle cpu, but the cpu is just exiting its idle mode,
1513 * the cpu must not have yet executed its memory barrier (otherwise
1514 * it would pair up with this mem barrier and we would see idle == false).
1515 * That memory barrier will pair up with the one below and ensure
1516 * that a reader on the now-non-idle cpu will see the most current
1517 * cur_gp. As a result, such a reader will never attempt to semaphore_up(
1518 * remaining_readers) during this GP, which allows the detector to
1519 * ignore that cpu (the detector thinks it is idle). Moreover, any
1520 * changes made by RCU updaters will have propagated to readers
1521 * on the previously idle cpu -- again thanks to issuing a memory
1522 * barrier after returning from idle mode.
1523 *
1524 * idle -> non-idle cpu | detector | reclaimer
1525 * ------------------------------------------------------
1526 * rcu reader 1 | | rcu_call()
1527 * MB X | |
1528 * idle = true | | rcu_call()
1529 * (no rcu readers allowed ) | | MB A in advance_cbs()
1530 * MB Y | (...) | (...)
1531 * (no rcu readers allowed) | | MB B in advance_cbs()
1532 * idle = false | ++cur_gp |
1533 * (no rcu readers allowed) | MB C |
1534 * MB Z | signal gp_end |
1535 * rcu reader 2 | | exec_cur_cbs()
1536 *
1537 *
1538 * MB Y orders visibility of changes to idle for detector's sake.
1539 *
1540 * MB Z pairs up with MB C. The cpu making a transition from idle
1541 * will see the most current value of cur_gp and will not attempt
1542 * to notify the detector even if preempted during this GP.
1543 *
1544 * MB Z pairs up with MB A from the previous batch. Updaters' changes
1545 * are visible to reader 2 even when the detector thinks the cpu is idle
1546 * but it is not anymore.
1547 *
1548 * MB X pairs up with MB B. Late mem accesses of reader 1 are contained
1549 * and visible before idling and before any callbacks are executed
1550 * by reclaimers.
1551 *
1552 * In summary, the detector does not know of or wait for reader 2, but
1553 * it does not have to since it is a new reader that will not access
1554 * data from previous GPs and will see any changes.
1555 */
1556 memory_barrier(); /* MB C */
1557
1558 cpu_mask_for_each(*cpu_mask, cpu_id) {
1559 /*
1560 * The cpu already checked for and passed through a quiescent
1561 * state since the beginning of this GP.
1562 *
1563 * _rcu_cur_gp is modified by local detector thread only.
1564 * Therefore, it is up-to-date even without a lock.
1565 */
1566 bool cpu_acked_gp = (cpus[cpu_id].rcu.last_seen_gp == _rcu_cur_gp);
1567
1568 /*
1569 * Either the cpu is idle or it is exiting away from idle mode
1570 * and already sees the most current _rcu_cur_gp. See comment
1571 * in wait_for_readers().
1572 */
1573 bool cpu_idle = cpus[cpu_id].idle;
1574
1575 if (cpu_acked_gp || cpu_idle) {
1576 cpu_mask_reset(cpu_mask, cpu_id);
1577 }
1578 }
1579}
1580
1581/** Invokes sample_local_cpu(arg) on each cpu of reader_cpus. */
1582static void sample_cpus(cpu_mask_t *reader_cpus, void *arg)
1583{
1584 const size_t max_concurrent_calls = 16;
1585 smp_call_t call[max_concurrent_calls];
1586 size_t outstanding_calls = 0;
1587
1588 cpu_mask_for_each(*reader_cpus, cpu_id) {
1589 smp_call_async(cpu_id, sample_local_cpu, arg, &call[outstanding_calls]);
1590 ++outstanding_calls;
1591
1592 /* Update statistic. */
1593 if (CPU->id != cpu_id)
1594 ++rcu.stat_smp_call_cnt;
1595
1596 if (outstanding_calls == max_concurrent_calls) {
1597 for (size_t k = 0; k < outstanding_calls; ++k) {
1598 smp_call_wait(&call[k]);
1599 }
1600
1601 outstanding_calls = 0;
1602 }
1603 }
1604
1605 for (size_t k = 0; k < outstanding_calls; ++k) {
1606 smp_call_wait(&call[k]);
1607 }
1608}
1609
1610static void upd_missed_gp_in_wait(rcu_gp_t completed_gp)
1611{
1612 ASSERT(CPU->rcu.cur_cbs_gp <= completed_gp);
1613
1614 size_t delta = (size_t)(completed_gp - CPU->rcu.cur_cbs_gp);
1615 CPU->rcu.stat_missed_gp_in_wait += delta;
1616}
1617
1618/** Globally note that the current thread was preempted in a reader section. */
1619static void note_preempted_reader(void)
1620{
1621 irq_spinlock_lock(&rcu.preempt_lock, false);
1622
1623 if (CPU->rcu.last_seen_gp != _rcu_cur_gp) {
1624 /* The reader started before the GP started - we must wait for it.*/
1625 list_append(&THREAD->rcu.preempt_link, &rcu.cur_preempted);
1626 } else {
1627 /*
1628 * The reader started after the GP started and this cpu
1629 * already noted a quiescent state. We might block the next GP.
1630 */
1631 list_append(&THREAD->rcu.preempt_link, &rcu.next_preempted);
1632 }
1633
1634 irq_spinlock_unlock(&rcu.preempt_lock, false);
1635}
1636
1637/** Remove the current thread from the global list of preempted readers. */
1638static void rm_preempted_reader(void)
1639{
1640 irq_spinlock_lock(&rcu.preempt_lock, false);
1641
1642 ASSERT(link_used(&THREAD->rcu.preempt_link));
1643
1644 bool prev_empty = list_empty(&rcu.cur_preempted);
1645 list_remove(&THREAD->rcu.preempt_link);
1646 bool now_empty = list_empty(&rcu.cur_preempted);
1647
1648 /* This was the last reader in cur_preempted. */
1649 bool last_removed = now_empty && !prev_empty;
1650
1651 /*
1652 * Preempted readers are blocking the detector and
1653 * this was the last reader blocking the current GP.
1654 */
1655 if (last_removed && rcu.preempt_blocking_det) {
1656 rcu.preempt_blocking_det = false;
1657 semaphore_up(&rcu.remaining_readers);
1658 }
1659
1660 irq_spinlock_unlock(&rcu.preempt_lock, false);
1661}
1662
1663/** Waits for any preempted readers blocking this grace period to finish.*/
1664static bool wait_for_preempt_reader(void)
1665{
1666 irq_spinlock_lock(&rcu.preempt_lock, true);
1667
1668 bool reader_exists = !list_empty(&rcu.cur_preempted);
1669 rcu.preempt_blocking_det = reader_exists;
1670
1671 irq_spinlock_unlock(&rcu.preempt_lock, true);
1672
1673 if (reader_exists) {
1674 /* Update statistic. */
1675 ++rcu.stat_preempt_blocking_cnt;
1676
1677 return semaphore_down_interruptable(&rcu.remaining_readers);
1678 }
1679
1680 return true;
1681}
1682
1683static void upd_max_cbs_in_slice(void)
1684{
1685 rcu_cpu_data_t *cr = &CPU->rcu;
1686
1687 if (cr->arriving_cbs_cnt > cr->last_arriving_cnt) {
1688 size_t arrived_cnt = cr->arriving_cbs_cnt - cr->last_arriving_cnt;
1689 cr->stat_max_slice_cbs = max(arrived_cnt, cr->stat_max_slice_cbs);
1690 }
1691
1692 cr->last_arriving_cnt = cr->arriving_cbs_cnt;
1693}
1694
1695/** Prints RCU run-time statistics. */
1696void rcu_print_stat(void)
1697{
1698 /*
1699 * Don't take locks. Worst case is we get outdated values.
1700 * CPU local values are updated without any locks, so there
1701 * are no locks to lock in order to get up-to-date values.
1702 */
1703
1704#ifdef RCU_PREEMPT_PODZIMEK
1705 const char *algo = "podzimek-preempt-rcu";
1706#elif defined(RCU_PREEMPT_A)
1707 const char *algo = "a-preempt-rcu";
1708#endif
1709
1710 printf("Config: expedite_threshold=%d, critical_threshold=%d,"
1711 " detect_sleep=%dms, %s\n",
1712 EXPEDITE_THRESHOLD, CRITICAL_THRESHOLD, DETECT_SLEEP_MS, algo);
1713 printf("Completed GPs: %" PRIu64 "\n", rcu.completed_gp);
1714 printf("Expedited GPs: %zu\n", rcu.stat_expedited_cnt);
1715 printf("Delayed GPs: %zu (cpus w/ still running readers after gp sleep)\n",
1716 rcu.stat_delayed_cnt);
1717 printf("Preempt blocked GPs: %zu (waited for preempted readers; "
1718 "running or not)\n", rcu.stat_preempt_blocking_cnt);
1719 printf("Smp calls: %zu\n", rcu.stat_smp_call_cnt);
1720
1721 printf("Max arrived callbacks per GP and CPU:\n");
1722 for (unsigned int i = 0; i < config.cpu_count; ++i) {
1723 printf(" %zu", cpus[i].rcu.stat_max_cbs);
1724 }
1725
1726 printf("\nAvg arrived callbacks per GP and CPU (nonempty batches only):\n");
1727 for (unsigned int i = 0; i < config.cpu_count; ++i) {
1728 printf(" %zu", cpus[i].rcu.stat_avg_cbs);
1729 }
1730
1731 printf("\nMax arrived callbacks per time slice and CPU:\n");
1732 for (unsigned int i = 0; i < config.cpu_count; ++i) {
1733 printf(" %zu", cpus[i].rcu.stat_max_slice_cbs);
1734 }
1735
1736 printf("\nMissed GP notifications per CPU:\n");
1737 for (unsigned int i = 0; i < config.cpu_count; ++i) {
1738 printf(" %zu", cpus[i].rcu.stat_missed_gps);
1739 }
1740
1741 printf("\nMissed GP notifications per CPU while waking up:\n");
1742 for (unsigned int i = 0; i < config.cpu_count; ++i) {
1743 printf(" %zu", cpus[i].rcu.stat_missed_gp_in_wait);
1744 }
1745 printf("\n");
1746}
1747
1748/** @}
1749 */
Note: See TracBrowser for help on using the repository browser.