source: mainline/kernel/generic/src/synch/rcu.c@ 3648ea56

lfn serial ticket/834-toolchain-update topic/msim-upgrade topic/simplify-dev-export
Last change on this file since 3648ea56 was 3648ea56, checked in by Adam Hraska <adam.hraska+hos@…>, 13 years ago

rcu: Cosmetic change. Shaved off some overhead in rcu_call().

  • Property mode set to 100644
File size: 49.8 KB
Line 
1/*
2 * Copyright (c) 2012 Adam Hraska
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * - Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * - Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * - The name of the author may not be used to endorse or promote products
15 * derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29
30/** @addtogroup sync
31 * @{
32 */
33
34/**
35 * @file
36 * @brief Preemptible read-copy update. Usable from interrupt handlers.
37 */
38
39#include <synch/rcu.h>
40#include <synch/condvar.h>
41#include <synch/semaphore.h>
42#include <synch/spinlock.h>
43#include <synch/mutex.h>
44#include <proc/thread.h>
45#include <cpu/cpu_mask.h>
46#include <cpu.h>
47#include <smp/smp_call.h>
48#include <compiler/barrier.h>
49#include <atomic.h>
50#include <arch.h>
51#include <macros.h>
52
53/*
54 * Number of milliseconds to give to preexisting readers to finish
55 * when non-expedited grace period detection is in progress.
56 */
57#define DETECT_SLEEP_MS 10
58/*
59 * Max number of pending callbacks in the local cpu's queue before
60 * aggressively expediting the current grace period
61 */
62#define EXPEDITE_THRESHOLD 2000
63/*
64 * Max number of callbacks to execute in one go with preemption
65 * enabled. If there are more callbacks to be executed they will
66 * be run with preemption disabled in order to prolong reclaimer's
67 * time slice and give it a chance to catch up with callback producers.
68 */
69#define CRITICAL_THRESHOLD 30000
70/* Half the number of values a uint32 can hold. */
71#define UINT32_MAX_HALF 2147483648U
72
73/**
74 * The current grace period number. Increases monotonically.
75 * Lock rcu.gp_lock or rcu.preempt_lock to get a current value.
76 */
77rcu_gp_t _rcu_cur_gp;
78
79/** Global RCU data. */
80typedef struct rcu_data {
81 /** Detector uses so signal reclaimers that a grace period ended. */
82 condvar_t gp_ended;
83 /** Reclaimers use to notify the detector to accelerate GP detection. */
84 condvar_t expedite_now;
85 /**
86 * Protects: req_gp_end_cnt, req_expedited_cnt, completed_gp, _rcu_cur_gp;
87 * or: completed_gp, _rcu_cur_gp
88 */
89 SPINLOCK_DECLARE(gp_lock);
90 /**
91 * The number of the most recently completed grace period. At most
92 * one behind _rcu_cur_gp. If equal to _rcu_cur_gp, a grace period
93 * detection is not in progress and the detector is idle.
94 */
95 rcu_gp_t completed_gp;
96
97 /** Protects the following 3 fields. */
98 IRQ_SPINLOCK_DECLARE(preempt_lock);
99 /** Preexisting readers that have been preempted. */
100 list_t cur_preempted;
101 /** Reader that have been preempted and might delay the next grace period.*/
102 list_t next_preempted;
103 /**
104 * The detector is waiting for the last preempted reader
105 * in cur_preempted to announce that it exited its reader
106 * section by up()ing remaining_readers.
107 */
108 bool preempt_blocking_det;
109
110#ifdef RCU_PREEMPT_A
111
112 /**
113 * The detector waits on this semaphore for any preempted readers
114 * delaying the grace period once all cpus pass a quiescent state.
115 */
116 semaphore_t remaining_readers;
117
118#elif defined(RCU_PREEMPT_PODZIMEK)
119
120 /** Reclaimers notify the detector when they request more grace periods.*/
121 condvar_t req_gp_changed;
122 /** Number of grace period ends the detector was requested to announce. */
123 size_t req_gp_end_cnt;
124 /** Number of consecutive grace periods to detect quickly and aggressively.*/
125 size_t req_expedited_cnt;
126 /**
127 * Number of cpus with readers that are delaying the current GP.
128 * They will up() remaining_readers.
129 */
130 atomic_t delaying_cpu_cnt;
131 /**
132 * The detector waits on this semaphore for any readers delaying the GP.
133 *
134 * Each of the cpus with readers that are delaying the current GP
135 * must up() this sema once they reach a quiescent state. If there
136 * are any readers in cur_preempted (ie preempted preexisting) and
137 * they are already delaying GP detection, the last to unlock its
138 * reader section must up() this sema once.
139 */
140 semaphore_t remaining_readers;
141#endif
142
143 /** Excludes simultaneous rcu_barrier() calls. */
144 mutex_t barrier_mtx;
145 /** Number of cpus that we are waiting for to complete rcu_barrier(). */
146 atomic_t barrier_wait_cnt;
147 /** rcu_barrier() waits for the completion of barrier callbacks on this wq.*/
148 waitq_t barrier_wq;
149
150 /** Interruptible attached detector thread pointer. */
151 thread_t *detector_thr;
152
153 /* Some statistics. */
154 size_t stat_expedited_cnt;
155 size_t stat_delayed_cnt;
156 size_t stat_preempt_blocking_cnt;
157 /* Does not contain self/local calls. */
158 size_t stat_smp_call_cnt;
159} rcu_data_t;
160
161
162static rcu_data_t rcu;
163
164static void start_reclaimers(void);
165static void synch_complete(rcu_item_t *rcu_item);
166static inline void rcu_call_impl(bool expedite, rcu_item_t *rcu_item,
167 rcu_func_t func);
168static void add_barrier_cb(void *arg);
169static void barrier_complete(rcu_item_t *barrier_item);
170static bool arriving_cbs_empty(void);
171static bool next_cbs_empty(void);
172static bool cur_cbs_empty(void);
173static bool all_cbs_empty(void);
174static void reclaimer(void *arg);
175static bool wait_for_pending_cbs(void);
176static bool advance_cbs(void);
177static void exec_completed_cbs(rcu_gp_t last_completed_gp);
178static void exec_cbs(rcu_item_t **phead);
179static bool wait_for_cur_cbs_gp_end(bool expedite, rcu_gp_t *last_completed_gp);
180static void upd_missed_gp_in_wait(rcu_gp_t completed_gp);
181
182#ifdef RCU_PREEMPT_PODZIMEK
183static void start_detector(void);
184static void read_unlock_impl(size_t *pnesting_cnt);
185static void req_detection(size_t req_cnt);
186static bool cv_wait_for_gp(rcu_gp_t wait_on_gp);
187static void detector(void *);
188static bool wait_for_detect_req(void);
189static void end_cur_gp(void);
190static bool wait_for_readers(void);
191static bool gp_sleep(void);
192static void interrupt_delaying_cpus(cpu_mask_t *cpu_mask);
193static bool wait_for_delaying_cpus(void);
194#elif defined(RCU_PREEMPT_A)
195static bool wait_for_readers(bool expedite);
196static bool gp_sleep(bool *expedite);
197#endif
198
199static void start_new_gp(void);
200static void rm_quiescent_cpus(cpu_mask_t *cpu_mask);
201static void sample_cpus(cpu_mask_t *reader_cpus, void *arg);
202static void sample_local_cpu(void *);
203static bool wait_for_preempt_reader(void);
204static void note_preempted_reader(void);
205static void rm_preempted_reader(void);
206static void upd_max_cbs_in_slice(void);
207
208
209
210/** Initializes global RCU structures. */
211void rcu_init(void)
212{
213 condvar_initialize(&rcu.gp_ended);
214 condvar_initialize(&rcu.expedite_now);
215
216 spinlock_initialize(&rcu.gp_lock, "rcu.gp_lock");
217 _rcu_cur_gp = 0;
218 rcu.completed_gp = 0;
219
220 irq_spinlock_initialize(&rcu.preempt_lock, "rcu.preempt_lock");
221 list_initialize(&rcu.cur_preempted);
222 list_initialize(&rcu.next_preempted);
223 rcu.preempt_blocking_det = false;
224
225 mutex_initialize(&rcu.barrier_mtx, MUTEX_PASSIVE);
226 atomic_set(&rcu.barrier_wait_cnt, 0);
227 waitq_initialize(&rcu.barrier_wq);
228
229 semaphore_initialize(&rcu.remaining_readers, 0);
230
231#ifdef RCU_PREEMPT_PODZIMEK
232 condvar_initialize(&rcu.req_gp_changed);
233
234 rcu.req_gp_end_cnt = 0;
235 rcu.req_expedited_cnt = 0;
236 atomic_set(&rcu.delaying_cpu_cnt, 0);
237#endif
238
239 rcu.detector_thr = 0;
240
241 rcu.stat_expedited_cnt = 0;
242 rcu.stat_delayed_cnt = 0;
243 rcu.stat_preempt_blocking_cnt = 0;
244 rcu.stat_smp_call_cnt = 0;
245}
246
247/** Initializes per-CPU RCU data. If on the boot cpu inits global data too.*/
248void rcu_cpu_init(void)
249{
250 if (config.cpu_active == 1) {
251 rcu_init();
252 }
253
254 CPU->rcu.last_seen_gp = 0;
255
256#ifdef RCU_PREEMPT_PODZIMEK
257 CPU->rcu.nesting_cnt = 0;
258 CPU->rcu.is_delaying_gp = false;
259 CPU->rcu.signal_unlock = false;
260#endif
261
262 CPU->rcu.cur_cbs = 0;
263 CPU->rcu.cur_cbs_cnt = 0;
264 CPU->rcu.next_cbs = 0;
265 CPU->rcu.next_cbs_cnt = 0;
266 CPU->rcu.arriving_cbs = 0;
267 CPU->rcu.parriving_cbs_tail = &CPU->rcu.arriving_cbs;
268 CPU->rcu.arriving_cbs_cnt = 0;
269
270 CPU->rcu.cur_cbs_gp = 0;
271 CPU->rcu.next_cbs_gp = 0;
272
273 semaphore_initialize(&CPU->rcu.arrived_flag, 0);
274
275 /* BSP creates reclaimer threads before AP's rcu_cpu_init() runs. */
276 if (config.cpu_active == 1)
277 CPU->rcu.reclaimer_thr = 0;
278
279 CPU->rcu.stat_max_cbs = 0;
280 CPU->rcu.stat_avg_cbs = 0;
281 CPU->rcu.stat_missed_gps = 0;
282 CPU->rcu.stat_missed_gp_in_wait = 0;
283 CPU->rcu.stat_max_slice_cbs = 0;
284 CPU->rcu.last_arriving_cnt = 0;
285}
286
287/** Completes RCU init. Creates and runs the detector and reclaimer threads.*/
288void rcu_kinit_init(void)
289{
290#ifdef RCU_PREEMPT_PODZIMEK
291 start_detector();
292#endif
293
294 start_reclaimers();
295}
296
297/** Initializes any per-thread RCU structures. */
298void rcu_thread_init(thread_t *thread)
299{
300 thread->rcu.nesting_cnt = 0;
301
302#ifdef RCU_PREEMPT_PODZIMEK
303 thread->rcu.was_preempted = false;
304#endif
305
306 link_initialize(&thread->rcu.preempt_link);
307}
308
309
310/** Cleans up global RCU resources and stops dispatching callbacks.
311 *
312 * Call when shutting down the kernel. Outstanding callbacks will
313 * not be processed. Instead they will linger forever.
314 */
315void rcu_stop(void)
316{
317 /* Stop and wait for reclaimers. */
318 for (unsigned int cpu_id = 0; cpu_id < config.cpu_active; ++cpu_id) {
319 ASSERT(cpus[cpu_id].rcu.reclaimer_thr != 0);
320
321 if (cpus[cpu_id].rcu.reclaimer_thr) {
322 thread_interrupt(cpus[cpu_id].rcu.reclaimer_thr);
323 thread_join(cpus[cpu_id].rcu.reclaimer_thr);
324 thread_detach(cpus[cpu_id].rcu.reclaimer_thr);
325 cpus[cpu_id].rcu.reclaimer_thr = 0;
326 }
327 }
328
329#ifdef RCU_PREEMPT_PODZIMEK
330 /* Stop the detector and wait. */
331 if (rcu.detector_thr) {
332 thread_interrupt(rcu.detector_thr);
333 thread_join(rcu.detector_thr);
334 thread_detach(rcu.detector_thr);
335 rcu.detector_thr = 0;
336 }
337#endif
338}
339
340/** Returns the number of elapsed grace periods since boot. */
341uint64_t rcu_completed_gps(void)
342{
343 spinlock_lock(&rcu.gp_lock);
344 uint64_t completed = rcu.completed_gp;
345 spinlock_unlock(&rcu.gp_lock);
346
347 return completed;
348}
349
350/** Creates and runs cpu-bound reclaimer threads. */
351static void start_reclaimers(void)
352{
353 for (unsigned int cpu_id = 0; cpu_id < config.cpu_count; ++cpu_id) {
354 char name[THREAD_NAME_BUFLEN] = {0};
355
356 snprintf(name, THREAD_NAME_BUFLEN - 1, "rcu-rec/%u", cpu_id);
357
358 cpus[cpu_id].rcu.reclaimer_thr =
359 thread_create(reclaimer, 0, TASK, THREAD_FLAG_NONE, name);
360
361 if (!cpus[cpu_id].rcu.reclaimer_thr)
362 panic("Failed to create RCU reclaimer thread on cpu%u.", cpu_id);
363
364 thread_wire(cpus[cpu_id].rcu.reclaimer_thr, &cpus[cpu_id]);
365 thread_ready(cpus[cpu_id].rcu.reclaimer_thr);
366 }
367}
368
369#ifdef RCU_PREEMPT_PODZIMEK
370
371/** Starts the detector thread. */
372static void start_detector(void)
373{
374 rcu.detector_thr =
375 thread_create(detector, 0, TASK, THREAD_FLAG_NONE, "rcu-det");
376
377 if (!rcu.detector_thr)
378 panic("Failed to create RCU detector thread.");
379
380 thread_ready(rcu.detector_thr);
381}
382
383/** Returns true if in an rcu reader section. */
384bool rcu_read_locked(void)
385{
386 preemption_disable();
387 bool locked = 0 < CPU->rcu.nesting_cnt;
388 preemption_enable();
389
390 return locked;
391}
392
393/** Unlocks the local reader section using the given nesting count.
394 *
395 * Preemption or interrupts must be disabled.
396 *
397 * @param pnesting_cnt Either &CPU->rcu.tmp_nesting_cnt or
398 * THREAD->rcu.nesting_cnt.
399 */
400static void read_unlock_impl(size_t *pnesting_cnt)
401{
402 ASSERT(PREEMPTION_DISABLED || interrupts_disabled());
403
404 if (0 == --(*pnesting_cnt)) {
405 _rcu_record_qs();
406
407 /*
408 * The thread was preempted while in a critical section or
409 * the detector is eagerly waiting for this cpu's reader
410 * to finish.
411 *
412 * Note that THREAD may be 0 in scheduler() and not just during boot.
413 */
414 if ((THREAD && THREAD->rcu.was_preempted) || CPU->rcu.is_delaying_gp) {
415 /* Rechecks with disabled interrupts. */
416 _rcu_signal_read_unlock();
417 }
418 }
419}
420
421/** If necessary, signals the detector that we exited a reader section. */
422void _rcu_signal_read_unlock(void)
423{
424 ASSERT(PREEMPTION_DISABLED || interrupts_disabled());
425
426 /* todo: make NMI safe with cpu-local atomic ops. */
427
428 /*
429 * We have to disable interrupts in order to make checking
430 * and resetting was_preempted and is_delaying_gp atomic
431 * with respect to local interrupt handlers. Otherwise
432 * an interrupt could beat us to calling semaphore_up()
433 * before we reset the appropriate flag.
434 */
435 ipl_t ipl = interrupts_disable();
436
437 /*
438 * If the detector is eagerly waiting for this cpu's reader to unlock,
439 * notify it that the reader did so.
440 */
441 if (CPU->rcu.is_delaying_gp) {
442 CPU->rcu.is_delaying_gp = false;
443 semaphore_up(&rcu.remaining_readers);
444 }
445
446 /*
447 * This reader was preempted while in a reader section.
448 * We might be holding up the current GP. Notify the
449 * detector if so.
450 */
451 if (THREAD && THREAD->rcu.was_preempted) {
452 ASSERT(link_used(&THREAD->rcu.preempt_link));
453 THREAD->rcu.was_preempted = false;
454
455 rm_preempted_reader();
456 }
457
458 /* If there was something to signal to the detector we have done so. */
459 CPU->rcu.signal_unlock = false;
460
461 interrupts_restore(ipl);
462}
463
464#endif /* RCU_PREEMPT_PODZIMEK */
465
466typedef struct synch_item {
467 waitq_t wq;
468 rcu_item_t rcu_item;
469} synch_item_t;
470
471/** Blocks until all preexisting readers exit their critical sections. */
472void rcu_synchronize(void)
473{
474 _rcu_synchronize(false);
475}
476
477/** Blocks until all preexisting readers exit their critical sections. */
478void rcu_synchronize_expedite(void)
479{
480 _rcu_synchronize(true);
481}
482
483/** Blocks until all preexisting readers exit their critical sections. */
484void _rcu_synchronize(bool expedite)
485{
486 /* Calling from a reader section will deadlock. */
487 ASSERT(!rcu_read_locked());
488
489 synch_item_t completion;
490
491 waitq_initialize(&completion.wq);
492 _rcu_call(expedite, &completion.rcu_item, synch_complete);
493 waitq_sleep(&completion.wq);
494 waitq_complete_wakeup(&completion.wq);
495}
496
497/** rcu_synchronize's callback. */
498static void synch_complete(rcu_item_t *rcu_item)
499{
500 synch_item_t *completion = member_to_inst(rcu_item, synch_item_t, rcu_item);
501 ASSERT(completion);
502 waitq_wakeup(&completion->wq, WAKEUP_FIRST);
503}
504
505/** Waits for all outstanding rcu calls to complete. */
506void rcu_barrier(void)
507{
508 /*
509 * Serialize rcu_barrier() calls so we don't overwrite cpu.barrier_item
510 * currently in use by rcu_barrier().
511 */
512 mutex_lock(&rcu.barrier_mtx);
513
514 /*
515 * Ensure we queue a barrier callback on all cpus before the already
516 * enqueued barrier callbacks start signaling completion.
517 */
518 atomic_set(&rcu.barrier_wait_cnt, 1);
519
520 DEFINE_CPU_MASK(cpu_mask);
521 cpu_mask_active(cpu_mask);
522
523 cpu_mask_for_each(*cpu_mask, cpu_id) {
524 smp_call(cpu_id, add_barrier_cb, 0);
525 }
526
527 if (0 < atomic_predec(&rcu.barrier_wait_cnt)) {
528 waitq_sleep(&rcu.barrier_wq);
529 }
530
531 mutex_unlock(&rcu.barrier_mtx);
532}
533
534/** Issues a rcu_barrier() callback on the local cpu.
535 *
536 * Executed with interrupts disabled.
537 */
538static void add_barrier_cb(void *arg)
539{
540 ASSERT(interrupts_disabled() || PREEMPTION_DISABLED);
541 atomic_inc(&rcu.barrier_wait_cnt);
542 rcu_call(&CPU->rcu.barrier_item, barrier_complete);
543}
544
545/** Local cpu's rcu_barrier() completion callback. */
546static void barrier_complete(rcu_item_t *barrier_item)
547{
548 /* Is this the last barrier callback completed? */
549 if (0 == atomic_predec(&rcu.barrier_wait_cnt)) {
550 /* Notify rcu_barrier() that we're done. */
551 waitq_wakeup(&rcu.barrier_wq, WAKEUP_FIRST);
552 }
553}
554
555/** Adds a callback to invoke after all preexisting readers finish.
556 *
557 * May be called from within interrupt handlers or RCU reader sections.
558 *
559 * @param rcu_item Used by RCU to track the call. Must remain
560 * until the user callback function is entered.
561 * @param func User callback function that will be invoked once a full
562 * grace period elapsed, ie at a time when all preexisting
563 * readers have finished. The callback should be short and must
564 * not block. If you must sleep, enqueue your work in the system
565 * work queue from the callback (ie workq_global_enqueue()).
566 */
567void rcu_call(rcu_item_t *rcu_item, rcu_func_t func)
568{
569 rcu_call_impl(false, rcu_item, func);
570}
571
572/** rcu_call() implementation. See rcu_call() for comments. */
573void _rcu_call(bool expedite, rcu_item_t *rcu_item, rcu_func_t func)
574{
575 rcu_call_impl(expedite, rcu_item, func);
576}
577
578/** rcu_call() inline-able implementation. See rcu_call() for comments. */
579static inline void rcu_call_impl(bool expedite, rcu_item_t *rcu_item,
580 rcu_func_t func)
581{
582 ASSERT(rcu_item);
583
584 rcu_item->func = func;
585 rcu_item->next = 0;
586
587 preemption_disable();
588
589 ipl_t ipl = interrupts_disable();
590
591 rcu_cpu_data_t *r = &CPU->rcu;
592 *r->parriving_cbs_tail = rcu_item;
593 r->parriving_cbs_tail = &rcu_item->next;
594
595 size_t cnt = ++r->arriving_cbs_cnt;
596 interrupts_restore(ipl);
597
598 if (expedite) {
599 r->expedite_arriving = true;
600 }
601
602 /* Added first callback - notify the reclaimer. */
603 if (cnt == 1 && !semaphore_count_get(&r->arrived_flag)) {
604 semaphore_up(&r->arrived_flag);
605 }
606
607 preemption_enable();
608}
609
610static bool cur_cbs_empty(void)
611{
612 ASSERT(THREAD && THREAD->wired);
613 return 0 == CPU->rcu.cur_cbs;
614}
615
616static bool next_cbs_empty(void)
617{
618 ASSERT(THREAD && THREAD->wired);
619 return 0 == CPU->rcu.next_cbs;
620}
621
622/** Disable interrupts to get an up-to-date result. */
623static bool arriving_cbs_empty(void)
624{
625 ASSERT(THREAD && THREAD->wired);
626 /*
627 * Accessing with interrupts enabled may at worst lead to
628 * a false negative if we race with a local interrupt handler.
629 */
630 return 0 == CPU->rcu.arriving_cbs;
631}
632
633static bool all_cbs_empty(void)
634{
635 return cur_cbs_empty() && next_cbs_empty() && arriving_cbs_empty();
636}
637
638
639/** Reclaimer thread dispatches locally queued callbacks once a GP ends. */
640static void reclaimer(void *arg)
641{
642 ASSERT(THREAD && THREAD->wired);
643 ASSERT(THREAD == CPU->rcu.reclaimer_thr);
644
645 rcu_gp_t last_compl_gp = 0;
646 bool ok = true;
647
648 while (ok && wait_for_pending_cbs()) {
649 ASSERT(CPU->rcu.reclaimer_thr == THREAD);
650
651 exec_completed_cbs(last_compl_gp);
652
653 bool expedite = advance_cbs();
654
655 ok = wait_for_cur_cbs_gp_end(expedite, &last_compl_gp);
656 }
657}
658
659/** Waits until there are callbacks waiting to be dispatched. */
660static bool wait_for_pending_cbs(void)
661{
662 if (!all_cbs_empty())
663 return true;
664
665 bool ok = true;
666
667 while (arriving_cbs_empty() && ok) {
668 ok = semaphore_down_interruptable(&CPU->rcu.arrived_flag);
669 }
670
671 return ok;
672}
673
674static void upd_stat_missed_gp(rcu_gp_t compl)
675{
676 if (CPU->rcu.cur_cbs_gp < compl) {
677 CPU->rcu.stat_missed_gps += (size_t)(compl - CPU->rcu.cur_cbs_gp);
678 }
679}
680
681/** Executes all callbacks for the given completed grace period. */
682static void exec_completed_cbs(rcu_gp_t last_completed_gp)
683{
684 upd_stat_missed_gp(last_completed_gp);
685
686 /* Both next_cbs and cur_cbs GP elapsed. */
687 if (CPU->rcu.next_cbs_gp <= last_completed_gp) {
688 ASSERT(CPU->rcu.cur_cbs_gp <= CPU->rcu.next_cbs_gp);
689
690 size_t exec_cnt = CPU->rcu.cur_cbs_cnt + CPU->rcu.next_cbs_cnt;
691
692 if (exec_cnt < CRITICAL_THRESHOLD) {
693 exec_cbs(&CPU->rcu.cur_cbs);
694 exec_cbs(&CPU->rcu.next_cbs);
695 } else {
696 /*
697 * Getting overwhelmed with too many callbacks to run.
698 * Disable preemption in order to prolong our time slice
699 * and catch up with updaters posting new callbacks.
700 */
701 preemption_disable();
702 exec_cbs(&CPU->rcu.cur_cbs);
703 exec_cbs(&CPU->rcu.next_cbs);
704 preemption_enable();
705 }
706
707 CPU->rcu.cur_cbs_cnt = 0;
708 CPU->rcu.next_cbs_cnt = 0;
709 } else if (CPU->rcu.cur_cbs_gp <= last_completed_gp) {
710
711 if (CPU->rcu.cur_cbs_cnt < CRITICAL_THRESHOLD) {
712 exec_cbs(&CPU->rcu.cur_cbs);
713 } else {
714 /*
715 * Getting overwhelmed with too many callbacks to run.
716 * Disable preemption in order to prolong our time slice
717 * and catch up with updaters posting new callbacks.
718 */
719 preemption_disable();
720 exec_cbs(&CPU->rcu.cur_cbs);
721 preemption_enable();
722 }
723
724 CPU->rcu.cur_cbs_cnt = 0;
725 }
726}
727
728/** Executes callbacks in the single-linked list. The list is left empty. */
729static void exec_cbs(rcu_item_t **phead)
730{
731 rcu_item_t *rcu_item = *phead;
732
733 while (rcu_item) {
734 /* func() may free rcu_item. Get a local copy. */
735 rcu_item_t *next = rcu_item->next;
736 rcu_func_t func = rcu_item->func;
737
738 func(rcu_item);
739
740 rcu_item = next;
741 }
742
743 *phead = 0;
744}
745
746static void upd_stat_cb_cnts(size_t arriving_cnt)
747{
748 CPU->rcu.stat_max_cbs = max(arriving_cnt, CPU->rcu.stat_max_cbs);
749 if (0 < arriving_cnt) {
750 CPU->rcu.stat_avg_cbs =
751 (99 * CPU->rcu.stat_avg_cbs + 1 * arriving_cnt) / 100;
752 }
753}
754
755/** Prepares another batch of callbacks to dispatch at the nest grace period.
756 *
757 * @return True if the next batch of callbacks must be expedited quickly.
758 */
759static bool advance_cbs(void)
760{
761 /* Move next_cbs to cur_cbs. */
762 CPU->rcu.cur_cbs = CPU->rcu.next_cbs;
763 CPU->rcu.cur_cbs_cnt = CPU->rcu.next_cbs_cnt;
764 CPU->rcu.cur_cbs_gp = CPU->rcu.next_cbs_gp;
765
766 /* Move arriving_cbs to next_cbs. Empties arriving_cbs. */
767 ipl_t ipl = interrupts_disable();
768
769 /*
770 * Too many callbacks queued. Better speed up the detection
771 * or risk exhausting all system memory.
772 */
773 bool expedite = (EXPEDITE_THRESHOLD < CPU->rcu.arriving_cbs_cnt)
774 || CPU->rcu.expedite_arriving;
775
776 CPU->rcu.expedite_arriving = false;
777
778 CPU->rcu.next_cbs = CPU->rcu.arriving_cbs;
779 CPU->rcu.next_cbs_cnt = CPU->rcu.arriving_cbs_cnt;
780
781 CPU->rcu.arriving_cbs = 0;
782 CPU->rcu.parriving_cbs_tail = &CPU->rcu.arriving_cbs;
783 CPU->rcu.arriving_cbs_cnt = 0;
784
785 interrupts_restore(ipl);
786
787 /* Update statistics of arrived callbacks. */
788 upd_stat_cb_cnts(CPU->rcu.next_cbs_cnt);
789
790 /*
791 * Make changes prior to queuing next_cbs visible to readers.
792 * See comment in wait_for_readers().
793 */
794 memory_barrier(); /* MB A, B */
795
796 /* At the end of next_cbs_gp, exec next_cbs. Determine what GP that is. */
797
798 if (!next_cbs_empty()) {
799 spinlock_lock(&rcu.gp_lock);
800
801 /* Exec next_cbs at the end of the next GP. */
802 CPU->rcu.next_cbs_gp = _rcu_cur_gp + 1;
803
804 /*
805 * There are no callbacks to invoke before next_cbs. Instruct
806 * wait_for_cur_cbs_gp() to notify us of the nearest GP end.
807 * That could be sooner than next_cbs_gp (if the current GP
808 * had not yet completed), so we'll create a shorter batch
809 * of callbacks next time around.
810 */
811 if (cur_cbs_empty()) {
812 CPU->rcu.cur_cbs_gp = rcu.completed_gp + 1;
813 }
814
815 spinlock_unlock(&rcu.gp_lock);
816 } else {
817 CPU->rcu.next_cbs_gp = CPU->rcu.cur_cbs_gp;
818 }
819
820 ASSERT(CPU->rcu.cur_cbs_gp <= CPU->rcu.next_cbs_gp);
821
822 return expedite;
823}
824
825
826#ifdef RCU_PREEMPT_A
827
828/** Waits for the grace period associated with callbacks cub_cbs to elapse.
829 *
830 * @param expedite Instructs the detector to aggressively speed up grace
831 * period detection without any delay.
832 * @param completed_gp Returns the most recent completed grace period
833 * number.
834 * @return false if the thread was interrupted and should stop.
835 */
836static bool wait_for_cur_cbs_gp_end(bool expedite, rcu_gp_t *completed_gp)
837{
838 spinlock_lock(&rcu.gp_lock);
839
840 ASSERT(CPU->rcu.cur_cbs_gp <= CPU->rcu.next_cbs_gp);
841 ASSERT(CPU->rcu.cur_cbs_gp <= _rcu_cur_gp + 1);
842
843 while (rcu.completed_gp < CPU->rcu.cur_cbs_gp) {
844 /* GP has not yet started - start a new one. */
845 if (rcu.completed_gp == _rcu_cur_gp) {
846 start_new_gp();
847 spinlock_unlock(&rcu.gp_lock);
848
849 if (!wait_for_readers(expedite))
850 return false;
851
852 spinlock_lock(&rcu.gp_lock);
853 /* Notify any reclaimers this GP had ended. */
854 rcu.completed_gp = _rcu_cur_gp;
855 condvar_broadcast(&rcu.gp_ended);
856 } else {
857 /* GP detection is in progress.*/
858
859 if (expedite)
860 condvar_signal(&rcu.expedite_now);
861
862 /* Wait for the GP to complete. */
863 int ret = _condvar_wait_timeout_spinlock(&rcu.gp_ended,
864 &rcu.gp_lock, SYNCH_NO_TIMEOUT, SYNCH_FLAGS_INTERRUPTIBLE);
865
866 if (ret == ESYNCH_INTERRUPTED) {
867 spinlock_unlock(&rcu.gp_lock);
868 return false;
869 }
870
871 upd_missed_gp_in_wait(rcu.completed_gp);
872 }
873 }
874
875 *completed_gp = rcu.completed_gp;
876 spinlock_unlock(&rcu.gp_lock);
877
878 return true;
879}
880
881static bool wait_for_readers(bool expedite)
882{
883 DEFINE_CPU_MASK(reader_cpus);
884
885 cpu_mask_active(reader_cpus);
886 rm_quiescent_cpus(reader_cpus);
887
888 while (!cpu_mask_is_none(reader_cpus)) {
889 /* Give cpus a chance to context switch (a QS) and batch callbacks. */
890 if(!gp_sleep(&expedite))
891 return false;
892
893 rm_quiescent_cpus(reader_cpus);
894 sample_cpus(reader_cpus, reader_cpus);
895 }
896
897 /* Update statistic. */
898 if (expedite) {
899 ++rcu.stat_expedited_cnt;
900 }
901
902 /*
903 * All cpus have passed through a QS and see the most recent _rcu_cur_gp.
904 * As a result newly preempted readers will associate with next_preempted
905 * and the number of old readers in cur_preempted will monotonically
906 * decrease. Wait for those old/preexisting readers.
907 */
908 return wait_for_preempt_reader();
909}
910
911static bool gp_sleep(bool *expedite)
912{
913 if (*expedite) {
914 scheduler();
915 return true;
916 } else {
917 spinlock_lock(&rcu.gp_lock);
918
919 int ret = 0;
920 ret = _condvar_wait_timeout_spinlock(&rcu.expedite_now, &rcu.gp_lock,
921 DETECT_SLEEP_MS * 1000, SYNCH_FLAGS_INTERRUPTIBLE);
922
923 /* rcu.expedite_now was signaled. */
924 if (ret == ESYNCH_OK_BLOCKED) {
925 *expedite = true;
926 }
927
928 spinlock_unlock(&rcu.gp_lock);
929
930 return (ret != ESYNCH_INTERRUPTED);
931 }
932}
933
934static void sample_local_cpu(void *arg)
935{
936 ASSERT(interrupts_disabled());
937 cpu_mask_t *reader_cpus = (cpu_mask_t *)arg;
938
939 bool locked = RCU_CNT_INC <= THE->rcu_nesting;
940 bool passed_qs = (CPU->rcu.last_seen_gp == _rcu_cur_gp);
941
942 if (locked && !passed_qs) {
943 /*
944 * This cpu has not yet passed a quiescent state during this grace
945 * period and it is currently in a reader section. We'll have to
946 * try to sample this cpu again later.
947 */
948 } else {
949 /* Either not in a reader section or already passed a QS. */
950 cpu_mask_reset(reader_cpus, CPU->id);
951 /* Contain new reader sections and make prior changes visible to them.*/
952 memory_barrier();
953 CPU->rcu.last_seen_gp = _rcu_cur_gp;
954 }
955}
956
957/** Called by the scheduler() when switching away from the current thread. */
958void rcu_after_thread_ran(void)
959{
960 ASSERT(interrupts_disabled());
961
962 /* Preempted a reader critical section for the first time. */
963 if (rcu_read_locked() && !(THE->rcu_nesting & RCU_WAS_PREEMPTED)) {
964 THE->rcu_nesting |= RCU_WAS_PREEMPTED;
965 note_preempted_reader();
966 }
967
968 /* Save the thread's nesting count when it is not running. */
969 THREAD->rcu.nesting_cnt = THE->rcu_nesting;
970
971 /* Clear rcu_nesting only after noting that a thread was preempted. */
972 compiler_barrier();
973 THE->rcu_nesting = 0;
974
975 if (CPU->rcu.last_seen_gp != _rcu_cur_gp) {
976 /*
977 * Contain any memory accesses of old readers before announcing a QS.
978 * Also make changes from the previous GP visible to this cpu.
979 */
980 memory_barrier();
981 /*
982 * The preempted reader has been noted globally. There are therefore
983 * no readers running on this cpu so this is a quiescent state.
984 */
985 CPU->rcu.last_seen_gp = _rcu_cur_gp;
986 }
987
988 /*
989 * Forcefully associate the reclaime with the highest priority
990 * even if preempted due to its time slice running out.
991 */
992 if (THREAD == CPU->rcu.reclaimer_thr) {
993 THREAD->priority = -1;
994 }
995
996 upd_max_cbs_in_slice();
997}
998
999/** Called by the scheduler() when switching to a newly scheduled thread. */
1000void rcu_before_thread_runs(void)
1001{
1002 ASSERT(PREEMPTION_DISABLED || interrupts_disabled());
1003 ASSERT(!rcu_read_locked());
1004
1005 /* Load the thread's saved nesting count from before it was preempted. */
1006 THE->rcu_nesting = THREAD->rcu.nesting_cnt;
1007}
1008
1009/** Called from scheduler() when exiting the current thread.
1010 *
1011 * Preemption or interrupts are disabled and the scheduler() already
1012 * switched away from the current thread, calling rcu_after_thread_ran().
1013 */
1014void rcu_thread_exiting(void)
1015{
1016 ASSERT(PREEMPTION_DISABLED || interrupts_disabled());
1017 /*
1018 * The thread forgot to exit its reader critical section.
1019 * It is a bug, but rather than letting the entire system lock up
1020 * forcefully leave the reader section. The thread is not holding
1021 * any references anyway since it is exiting so it is safe.
1022 */
1023 if (RCU_CNT_INC <= THREAD->rcu.nesting_cnt) {
1024 /* Emulate _rcu_preempted_unlock() with the proper nesting count. */
1025 if (THREAD->rcu.nesting_cnt & RCU_WAS_PREEMPTED) {
1026 ipl_t ipl = interrupts_disable();
1027 rm_preempted_reader();
1028 interrupts_restore(ipl);
1029 }
1030 }
1031}
1032
1033/** Returns true if in an rcu reader section. */
1034bool rcu_read_locked(void)
1035{
1036 return RCU_CNT_INC <= THE->rcu_nesting;
1037}
1038
1039/** Invoked when a preempted reader finally exits its reader section. */
1040void _rcu_preempted_unlock(void)
1041{
1042 ipl_t ipl = interrupts_disable();
1043
1044 if (THE->rcu_nesting == RCU_WAS_PREEMPTED) {
1045 THE->rcu_nesting = 0;
1046 rm_preempted_reader();
1047 }
1048
1049 interrupts_restore(ipl);
1050}
1051
1052#elif defined(RCU_PREEMPT_PODZIMEK)
1053
1054/** Waits for the grace period associated with callbacks cub_cbs to elapse.
1055 *
1056 * @param expedite Instructs the detector to aggressively speed up grace
1057 * period detection without any delay.
1058 * @param completed_gp Returns the most recent completed grace period
1059 * number.
1060 * @return false if the thread was interrupted and should stop.
1061 */
1062static bool wait_for_cur_cbs_gp_end(bool expedite, rcu_gp_t *completed_gp)
1063{
1064 /*
1065 * Use a possibly outdated version of completed_gp to bypass checking
1066 * with the lock.
1067 *
1068 * Note that loading and storing rcu.completed_gp is not atomic
1069 * (it is 64bit wide). Reading a clobbered value that is less than
1070 * rcu.completed_gp is harmless - we'll recheck with a lock. The
1071 * only way to read a clobbered value that is greater than the actual
1072 * value is if the detector increases the higher-order word first and
1073 * then decreases the lower-order word (or we see stores in that order),
1074 * eg when incrementing from 2^32 - 1 to 2^32. The loaded value
1075 * suddenly jumps by 2^32. It would take hours for such an increase
1076 * to occur so it is safe to discard the value. We allow increases
1077 * of up to half the maximum to generously accommodate for loading an
1078 * outdated lower word.
1079 */
1080 rcu_gp_t compl_gp = ACCESS_ONCE(rcu.completed_gp);
1081 if (CPU->rcu.cur_cbs_gp <= compl_gp
1082 && compl_gp <= CPU->rcu.cur_cbs_gp + UINT32_MAX_HALF) {
1083 *completed_gp = compl_gp;
1084 return true;
1085 }
1086
1087 spinlock_lock(&rcu.gp_lock);
1088
1089 if (CPU->rcu.cur_cbs_gp <= rcu.completed_gp) {
1090 *completed_gp = rcu.completed_gp;
1091 spinlock_unlock(&rcu.gp_lock);
1092 return true;
1093 }
1094
1095 ASSERT(CPU->rcu.cur_cbs_gp <= CPU->rcu.next_cbs_gp);
1096 ASSERT(_rcu_cur_gp <= CPU->rcu.cur_cbs_gp);
1097
1098 /*
1099 * Notify the detector of how many GP ends we intend to wait for, so
1100 * it can avoid going to sleep unnecessarily. Optimistically assume
1101 * new callbacks will arrive while we're waiting; hence +1.
1102 */
1103 size_t remaining_gp_ends = (size_t) (CPU->rcu.next_cbs_gp - _rcu_cur_gp);
1104 req_detection(remaining_gp_ends + (arriving_cbs_empty() ? 0 : 1));
1105
1106 /*
1107 * Ask the detector to speed up GP detection if there are too many
1108 * pending callbacks and other reclaimers have not already done so.
1109 */
1110 if (expedite) {
1111 if(0 == rcu.req_expedited_cnt)
1112 condvar_signal(&rcu.expedite_now);
1113
1114 /*
1115 * Expedite only cub_cbs. If there really is a surge of callbacks
1116 * the arriving batch will expedite the GP for the huge number
1117 * of callbacks currently in next_cbs
1118 */
1119 rcu.req_expedited_cnt = 1;
1120 }
1121
1122 /* Wait for cur_cbs_gp to end. */
1123 bool interrupted = cv_wait_for_gp(CPU->rcu.cur_cbs_gp);
1124
1125 *completed_gp = rcu.completed_gp;
1126 spinlock_unlock(&rcu.gp_lock);
1127
1128 upd_missed_gp_in_wait(*completed_gp);
1129
1130 return !interrupted;
1131}
1132
1133/** Waits for an announcement of the end of the grace period wait_on_gp. */
1134static bool cv_wait_for_gp(rcu_gp_t wait_on_gp)
1135{
1136 ASSERT(spinlock_locked(&rcu.gp_lock));
1137
1138 bool interrupted = false;
1139
1140 /* Wait until wait_on_gp ends. */
1141 while (rcu.completed_gp < wait_on_gp && !interrupted) {
1142 int ret = _condvar_wait_timeout_spinlock(&rcu.gp_ended, &rcu.gp_lock,
1143 SYNCH_NO_TIMEOUT, SYNCH_FLAGS_INTERRUPTIBLE);
1144 interrupted = (ret == ESYNCH_INTERRUPTED);
1145 }
1146
1147 ASSERT(wait_on_gp <= rcu.completed_gp);
1148
1149 return interrupted;
1150}
1151
1152/** Requests the detector to detect at least req_cnt consecutive grace periods.*/
1153static void req_detection(size_t req_cnt)
1154{
1155 if (rcu.req_gp_end_cnt < req_cnt) {
1156 bool detector_idle = (0 == rcu.req_gp_end_cnt);
1157 rcu.req_gp_end_cnt = req_cnt;
1158
1159 if (detector_idle) {
1160 ASSERT(_rcu_cur_gp == rcu.completed_gp);
1161 condvar_signal(&rcu.req_gp_changed);
1162 }
1163 }
1164}
1165
1166
1167/** The detector thread detects and notifies reclaimers of grace period ends. */
1168static void detector(void *arg)
1169{
1170 spinlock_lock(&rcu.gp_lock);
1171
1172 while (wait_for_detect_req()) {
1173 /*
1174 * Announce new GP started. Readers start lazily acknowledging that
1175 * they passed a QS.
1176 */
1177 start_new_gp();
1178
1179 spinlock_unlock(&rcu.gp_lock);
1180
1181 if (!wait_for_readers())
1182 goto unlocked_out;
1183
1184 spinlock_lock(&rcu.gp_lock);
1185
1186 /* Notify reclaimers that they may now invoke queued callbacks. */
1187 end_cur_gp();
1188 }
1189
1190 spinlock_unlock(&rcu.gp_lock);
1191
1192unlocked_out:
1193 return;
1194}
1195
1196/** Waits for a request from a reclaimer thread to detect a grace period. */
1197static bool wait_for_detect_req(void)
1198{
1199 ASSERT(spinlock_locked(&rcu.gp_lock));
1200
1201 bool interrupted = false;
1202
1203 while (0 == rcu.req_gp_end_cnt && !interrupted) {
1204 int ret = _condvar_wait_timeout_spinlock(&rcu.req_gp_changed,
1205 &rcu.gp_lock, SYNCH_NO_TIMEOUT, SYNCH_FLAGS_INTERRUPTIBLE);
1206
1207 interrupted = (ret == ESYNCH_INTERRUPTED);
1208 }
1209
1210 return !interrupted;
1211}
1212
1213
1214static void end_cur_gp(void)
1215{
1216 ASSERT(spinlock_locked(&rcu.gp_lock));
1217
1218 rcu.completed_gp = _rcu_cur_gp;
1219 --rcu.req_gp_end_cnt;
1220
1221 condvar_broadcast(&rcu.gp_ended);
1222}
1223
1224/** Waits for readers that started before the current GP started to finish. */
1225static bool wait_for_readers(void)
1226{
1227 DEFINE_CPU_MASK(reading_cpus);
1228
1229 /* All running cpus have potential readers. */
1230 cpu_mask_active(reading_cpus);
1231
1232 /*
1233 * Give readers time to pass through a QS. Also, batch arriving
1234 * callbacks in order to amortize detection overhead.
1235 */
1236 if (!gp_sleep())
1237 return false;
1238
1239 /* Non-intrusively determine which cpus have yet to pass a QS. */
1240 rm_quiescent_cpus(reading_cpus);
1241
1242 /* Actively interrupt cpus delaying the current GP and demand a QS. */
1243 interrupt_delaying_cpus(reading_cpus);
1244
1245 /* Wait for the interrupted cpus to notify us that they reached a QS. */
1246 if (!wait_for_delaying_cpus())
1247 return false;
1248 /*
1249 * All cpus recorded a QS or are still idle. Any new readers will be added
1250 * to next_preempt if preempted, ie the number of readers in cur_preempted
1251 * monotonically descreases.
1252 */
1253
1254 /* Wait for the last reader in cur_preempted to notify us it is done. */
1255 if (!wait_for_preempt_reader())
1256 return false;
1257
1258 return true;
1259}
1260
1261/** Sleeps a while if the current grace period is not to be expedited. */
1262static bool gp_sleep(void)
1263{
1264 spinlock_lock(&rcu.gp_lock);
1265
1266 int ret = 0;
1267 while (0 == rcu.req_expedited_cnt && 0 == ret) {
1268 /* minor bug: sleeps for the same duration if woken up spuriously. */
1269 ret = _condvar_wait_timeout_spinlock(&rcu.expedite_now, &rcu.gp_lock,
1270 DETECT_SLEEP_MS * 1000, SYNCH_FLAGS_INTERRUPTIBLE);
1271 }
1272
1273 if (0 < rcu.req_expedited_cnt) {
1274 --rcu.req_expedited_cnt;
1275 /* Update statistic. */
1276 ++rcu.stat_expedited_cnt;
1277 }
1278
1279 spinlock_unlock(&rcu.gp_lock);
1280
1281 return (ret != ESYNCH_INTERRUPTED);
1282}
1283
1284/** Actively interrupts and checks the offending cpus for quiescent states. */
1285static void interrupt_delaying_cpus(cpu_mask_t *cpu_mask)
1286{
1287 atomic_set(&rcu.delaying_cpu_cnt, 0);
1288
1289 sample_cpus(cpu_mask, 0);
1290}
1291
1292/** Invoked on a cpu delaying grace period detection.
1293 *
1294 * Induces a quiescent state for the cpu or it instructs remaining
1295 * readers to notify the detector once they finish.
1296 */
1297static void sample_local_cpu(void *arg)
1298{
1299 ASSERT(interrupts_disabled());
1300 ASSERT(!CPU->rcu.is_delaying_gp);
1301
1302 /* Cpu did not pass a quiescent state yet. */
1303 if (CPU->rcu.last_seen_gp != _rcu_cur_gp) {
1304 /* Interrupted a reader in a reader critical section. */
1305 if (0 < CPU->rcu.nesting_cnt) {
1306 ASSERT(!CPU->idle);
1307 /* Note to notify the detector from rcu_read_unlock(). */
1308 CPU->rcu.is_delaying_gp = true;
1309 /*
1310 * Set signal_unlock only after setting is_delaying_gp so
1311 * that NMI handlers do not accidentally clear it in unlock()
1312 * before seeing and acting upon is_delaying_gp.
1313 */
1314 compiler_barrier();
1315 CPU->rcu.signal_unlock = true;
1316
1317 atomic_inc(&rcu.delaying_cpu_cnt);
1318 } else {
1319 /*
1320 * The cpu did not enter any rcu reader sections since
1321 * the start of the current GP. Record a quiescent state.
1322 *
1323 * Or, we interrupted rcu_read_unlock_impl() right before
1324 * it recorded a QS. Record a QS for it. The memory barrier
1325 * contains the reader section's mem accesses before
1326 * updating last_seen_gp.
1327 *
1328 * Or, we interrupted rcu_read_lock() right after it recorded
1329 * a QS for the previous GP but before it got a chance to
1330 * increment its nesting count. The memory barrier again
1331 * stops the CS code from spilling out of the CS.
1332 */
1333 memory_barrier();
1334 CPU->rcu.last_seen_gp = _rcu_cur_gp;
1335 }
1336 } else {
1337 /*
1338 * This cpu already acknowledged that it had passed through
1339 * a quiescent state since the start of cur_gp.
1340 */
1341 }
1342
1343 /*
1344 * smp_call() makes sure any changes propagate back to the caller.
1345 * In particular, it makes the most current last_seen_gp visible
1346 * to the detector.
1347 */
1348}
1349
1350/** Waits for cpus delaying the current grace period if there are any. */
1351static bool wait_for_delaying_cpus(void)
1352{
1353 int delaying_cpu_cnt = atomic_get(&rcu.delaying_cpu_cnt);
1354
1355 for (int i = 0; i < delaying_cpu_cnt; ++i){
1356 if (!semaphore_down_interruptable(&rcu.remaining_readers))
1357 return false;
1358 }
1359
1360 /* Update statistic. */
1361 rcu.stat_delayed_cnt += delaying_cpu_cnt;
1362
1363 return true;
1364}
1365
1366/** Called by the scheduler() when switching away from the current thread. */
1367void rcu_after_thread_ran(void)
1368{
1369 ASSERT(interrupts_disabled());
1370 /* todo: make is_delaying_gp and was_preempted NMI safe via local atomics.*/
1371
1372 /*
1373 * Prevent NMI handlers from interfering. The detector will be notified
1374 * here if CPU->rcu.is_delaying_gp and the current thread is no longer
1375 * running so there is nothing to signal to the detector.
1376 */
1377 CPU->rcu.signal_unlock = false;
1378 /* Separates clearing of .signal_unlock from CPU->rcu.nesting_cnt = 0. */
1379 compiler_barrier();
1380
1381 /* Save the thread's nesting count when it is not running. */
1382 THREAD->rcu.nesting_cnt = CPU->rcu.nesting_cnt;
1383 /* Interrupt handlers might use RCU while idle in scheduler(). */
1384 CPU->rcu.nesting_cnt = 0;
1385
1386 /* Preempted a reader critical section for the first time. */
1387 if (0 < THREAD->rcu.nesting_cnt && !THREAD->rcu.was_preempted) {
1388 THREAD->rcu.was_preempted = true;
1389 note_preempted_reader();
1390 }
1391
1392 /*
1393 * The preempted reader has been noted globally. There are therefore
1394 * no readers running on this cpu so this is a quiescent state.
1395 */
1396 _rcu_record_qs();
1397
1398 /*
1399 * This cpu is holding up the current GP. Let the detector know
1400 * it has just passed a quiescent state.
1401 *
1402 * The detector waits separately for preempted readers, so we have
1403 * to notify the detector even if we have just preempted a reader.
1404 */
1405 if (CPU->rcu.is_delaying_gp) {
1406 CPU->rcu.is_delaying_gp = false;
1407 semaphore_up(&rcu.remaining_readers);
1408 }
1409
1410 /*
1411 * Forcefully associate the detector with the highest priority
1412 * even if preempted due to its time slice running out.
1413 *
1414 * todo: Replace with strict scheduler priority classes.
1415 */
1416 if (THREAD == rcu.detector_thr) {
1417 THREAD->priority = -1;
1418 }
1419 else if (THREAD == CPU->rcu.reclaimer_thr) {
1420 THREAD->priority = -1;
1421 }
1422
1423 upd_max_cbs_in_slice();
1424}
1425
1426/** Called by the scheduler() when switching to a newly scheduled thread. */
1427void rcu_before_thread_runs(void)
1428{
1429 ASSERT(PREEMPTION_DISABLED || interrupts_disabled());
1430 ASSERT(0 == CPU->rcu.nesting_cnt);
1431
1432 /* Load the thread's saved nesting count from before it was preempted. */
1433 CPU->rcu.nesting_cnt = THREAD->rcu.nesting_cnt;
1434 /*
1435 * In the unlikely event that a NMI occurs between the loading of the
1436 * variables and setting signal_unlock, the NMI handler may invoke
1437 * rcu_read_unlock() and clear signal_unlock. In that case we will
1438 * incorrectly overwrite signal_unlock from false to true. This event
1439 * situation benign and the next rcu_read_unlock() will at worst
1440 * needlessly invoke _rcu_signal_unlock().
1441 */
1442 CPU->rcu.signal_unlock = THREAD->rcu.was_preempted || CPU->rcu.is_delaying_gp;
1443}
1444
1445/** Called from scheduler() when exiting the current thread.
1446 *
1447 * Preemption or interrupts are disabled and the scheduler() already
1448 * switched away from the current thread, calling rcu_after_thread_ran().
1449 */
1450void rcu_thread_exiting(void)
1451{
1452 ASSERT(THREAD != 0);
1453 ASSERT(THREAD->state == Exiting);
1454 ASSERT(PREEMPTION_DISABLED || interrupts_disabled());
1455 /*
1456 * The thread forgot to exit its reader critical section.
1457 * It is a bug, but rather than letting the entire system lock up
1458 * forcefully leave the reader section. The thread is not holding
1459 * any references anyway since it is exiting so it is safe.
1460 */
1461 if (0 < THREAD->rcu.nesting_cnt) {
1462 THREAD->rcu.nesting_cnt = 1;
1463 read_unlock_impl(&THREAD->rcu.nesting_cnt);
1464 }
1465}
1466
1467
1468#endif /* RCU_PREEMPT_PODZIMEK */
1469
1470/** Announces the start of a new grace period for preexisting readers to ack. */
1471static void start_new_gp(void)
1472{
1473 ASSERT(spinlock_locked(&rcu.gp_lock));
1474
1475 irq_spinlock_lock(&rcu.preempt_lock, true);
1476
1477 /* Start a new GP. Announce to readers that a quiescent state is needed. */
1478 ++_rcu_cur_gp;
1479
1480 /*
1481 * Readers preempted before the start of this GP (next_preempted)
1482 * are preexisting readers now that a GP started and will hold up
1483 * the current GP until they exit their reader sections.
1484 *
1485 * Preempted readers from the previous GP have finished so
1486 * cur_preempted is empty, but see comment in _rcu_record_qs().
1487 */
1488 list_concat(&rcu.cur_preempted, &rcu.next_preempted);
1489
1490 irq_spinlock_unlock(&rcu.preempt_lock, true);
1491}
1492
1493/** Remove those cpus from the mask that have already passed a quiescent
1494 * state since the start of the current grace period.
1495 */
1496static void rm_quiescent_cpus(cpu_mask_t *cpu_mask)
1497{
1498 /*
1499 * Ensure the announcement of the start of a new GP (ie up-to-date
1500 * cur_gp) propagates to cpus that are just coming out of idle
1501 * mode before we sample their idle state flag.
1502 *
1503 * Cpus guarantee that after they set CPU->idle = true they will not
1504 * execute any RCU reader sections without first setting idle to
1505 * false and issuing a memory barrier. Therefore, if rm_quiescent_cpus()
1506 * later on sees an idle cpu, but the cpu is just exiting its idle mode,
1507 * the cpu must not have yet executed its memory barrier (otherwise
1508 * it would pair up with this mem barrier and we would see idle == false).
1509 * That memory barrier will pair up with the one below and ensure
1510 * that a reader on the now-non-idle cpu will see the most current
1511 * cur_gp. As a result, such a reader will never attempt to semaphore_up(
1512 * pending_readers) during this GP, which allows the detector to
1513 * ignore that cpu (the detector thinks it is idle). Moreover, any
1514 * changes made by RCU updaters will have propagated to readers
1515 * on the previously idle cpu -- again thanks to issuing a memory
1516 * barrier after returning from idle mode.
1517 *
1518 * idle -> non-idle cpu | detector | reclaimer
1519 * ------------------------------------------------------
1520 * rcu reader 1 | | rcu_call()
1521 * MB X | |
1522 * idle = true | | rcu_call()
1523 * (no rcu readers allowed ) | | MB A in advance_cbs()
1524 * MB Y | (...) | (...)
1525 * (no rcu readers allowed) | | MB B in advance_cbs()
1526 * idle = false | ++cur_gp |
1527 * (no rcu readers allowed) | MB C |
1528 * MB Z | signal gp_end |
1529 * rcu reader 2 | | exec_cur_cbs()
1530 *
1531 *
1532 * MB Y orders visibility of changes to idle for detector's sake.
1533 *
1534 * MB Z pairs up with MB C. The cpu making a transition from idle
1535 * will see the most current value of cur_gp and will not attempt
1536 * to notify the detector even if preempted during this GP.
1537 *
1538 * MB Z pairs up with MB A from the previous batch. Updaters' changes
1539 * are visible to reader 2 even when the detector thinks the cpu is idle
1540 * but it is not anymore.
1541 *
1542 * MB X pairs up with MB B. Late mem accesses of reader 1 are contained
1543 * and visible before idling and before any callbacks are executed
1544 * by reclaimers.
1545 *
1546 * In summary, the detector does not know of or wait for reader 2, but
1547 * it does not have to since it is a new reader that will not access
1548 * data from previous GPs and will see any changes.
1549 */
1550 memory_barrier(); /* MB C */
1551
1552 cpu_mask_for_each(*cpu_mask, cpu_id) {
1553 /*
1554 * The cpu already checked for and passed through a quiescent
1555 * state since the beginning of this GP.
1556 *
1557 * _rcu_cur_gp is modified by local detector thread only.
1558 * Therefore, it is up-to-date even without a lock.
1559 */
1560 bool cpu_acked_gp = (cpus[cpu_id].rcu.last_seen_gp == _rcu_cur_gp);
1561
1562 /*
1563 * Either the cpu is idle or it is exiting away from idle mode
1564 * and already sees the most current _rcu_cur_gp. See comment
1565 * in wait_for_readers().
1566 */
1567 bool cpu_idle = cpus[cpu_id].idle;
1568
1569 if (cpu_acked_gp || cpu_idle) {
1570 cpu_mask_reset(cpu_mask, cpu_id);
1571 }
1572 }
1573}
1574
1575/** Invokes sample_local_cpu(arg) on each cpu of reader_cpus. */
1576static void sample_cpus(cpu_mask_t *reader_cpus, void *arg)
1577{
1578 const size_t max_conconcurrent_calls = 16;
1579 smp_call_t call[max_conconcurrent_calls];
1580 size_t outstanding_calls = 0;
1581
1582 cpu_mask_for_each(*reader_cpus, cpu_id) {
1583 smp_call_async(cpu_id, sample_local_cpu, arg, &call[outstanding_calls]);
1584 ++outstanding_calls;
1585
1586 /* Update statistic. */
1587 if (CPU->id != cpu_id)
1588 ++rcu.stat_smp_call_cnt;
1589
1590 if (outstanding_calls == max_conconcurrent_calls) {
1591 for (size_t k = 0; k < outstanding_calls; ++k) {
1592 smp_call_wait(&call[k]);
1593 }
1594
1595 outstanding_calls = 0;
1596 }
1597 }
1598
1599 for (size_t k = 0; k < outstanding_calls; ++k) {
1600 smp_call_wait(&call[k]);
1601 }
1602}
1603
1604static void upd_missed_gp_in_wait(rcu_gp_t completed_gp)
1605{
1606 ASSERT(CPU->rcu.cur_cbs_gp <= completed_gp);
1607
1608 size_t delta = (size_t)(completed_gp - CPU->rcu.cur_cbs_gp);
1609 CPU->rcu.stat_missed_gp_in_wait += delta;
1610}
1611
1612/** Globally note that the current thread was preempted in a reader section. */
1613static void note_preempted_reader(void)
1614{
1615 irq_spinlock_lock(&rcu.preempt_lock, false);
1616
1617 if (CPU->rcu.last_seen_gp != _rcu_cur_gp) {
1618 /* The reader started before the GP started - we must wait for it.*/
1619 list_append(&THREAD->rcu.preempt_link, &rcu.cur_preempted);
1620 } else {
1621 /*
1622 * The reader started after the GP started and this cpu
1623 * already noted a quiescent state. We might block the next GP.
1624 */
1625 list_append(&THREAD->rcu.preempt_link, &rcu.next_preempted);
1626 }
1627
1628 irq_spinlock_unlock(&rcu.preempt_lock, false);
1629}
1630
1631/** Remove the current thread from the global list of preempted readers. */
1632static void rm_preempted_reader(void)
1633{
1634 irq_spinlock_lock(&rcu.preempt_lock, false);
1635
1636 ASSERT(link_used(&THREAD->rcu.preempt_link));
1637
1638 bool prev_empty = list_empty(&rcu.cur_preempted);
1639 list_remove(&THREAD->rcu.preempt_link);
1640 bool now_empty = list_empty(&rcu.cur_preempted);
1641
1642 /* This was the last reader in cur_preempted. */
1643 bool last_removed = now_empty && !prev_empty;
1644
1645 /*
1646 * Preempted readers are blocking the detector and
1647 * this was the last reader blocking the current GP.
1648 */
1649 if (last_removed && rcu.preempt_blocking_det) {
1650 rcu.preempt_blocking_det = false;
1651 semaphore_up(&rcu.remaining_readers);
1652 }
1653
1654 irq_spinlock_unlock(&rcu.preempt_lock, false);
1655}
1656
1657/** Waits for any preempted readers blocking this grace period to finish.*/
1658static bool wait_for_preempt_reader(void)
1659{
1660 irq_spinlock_lock(&rcu.preempt_lock, true);
1661
1662 bool reader_exists = !list_empty(&rcu.cur_preempted);
1663 rcu.preempt_blocking_det = reader_exists;
1664
1665 irq_spinlock_unlock(&rcu.preempt_lock, true);
1666
1667 if (reader_exists) {
1668 /* Update statistic. */
1669 ++rcu.stat_preempt_blocking_cnt;
1670
1671 return semaphore_down_interruptable(&rcu.remaining_readers);
1672 }
1673
1674 return true;
1675}
1676
1677static void upd_max_cbs_in_slice(void)
1678{
1679 rcu_cpu_data_t *cr = &CPU->rcu;
1680
1681 if (cr->arriving_cbs_cnt > cr->last_arriving_cnt) {
1682 size_t arrived_cnt = cr->arriving_cbs_cnt - cr->last_arriving_cnt;
1683 cr->stat_max_slice_cbs = max(arrived_cnt, cr->stat_max_slice_cbs);
1684 }
1685
1686 cr->last_arriving_cnt = cr->arriving_cbs_cnt;
1687}
1688
1689/** Prints RCU run-time statistics. */
1690void rcu_print_stat(void)
1691{
1692 /*
1693 * Don't take locks. Worst case is we get out-dated values.
1694 * CPU local values are updated without any locks, so there
1695 * are no locks to lock in order to get up-to-date values.
1696 */
1697
1698#ifdef RCU_PREEMPT_PODZIMEK
1699 const char *algo = "podzimek-preempt-rcu";
1700#elif defined(RCU_PREEMPT_A)
1701 const char *algo = "a-preempt-rcu";
1702#endif
1703
1704 printf("Config: expedite_threshold=%d, critical_threshold=%d,"
1705 " detect_sleep=%dms, %s\n",
1706 EXPEDITE_THRESHOLD, CRITICAL_THRESHOLD, DETECT_SLEEP_MS, algo);
1707 printf("Completed GPs: %" PRIu64 "\n", rcu.completed_gp);
1708 printf("Expedited GPs: %zu\n", rcu.stat_expedited_cnt);
1709 printf("Delayed GPs: %zu (cpus w/ still running readers after gp sleep)\n",
1710 rcu.stat_delayed_cnt);
1711 printf("Preempt blocked GPs: %zu (waited for preempted readers; "
1712 "running or not)\n", rcu.stat_preempt_blocking_cnt);
1713 printf("Smp calls: %zu\n", rcu.stat_smp_call_cnt);
1714
1715 printf("Max arrived callbacks per GP and CPU:\n");
1716 for (unsigned int i = 0; i < config.cpu_count; ++i) {
1717 printf(" %zu", cpus[i].rcu.stat_max_cbs);
1718 }
1719
1720 printf("\nAvg arrived callbacks per GP and CPU (nonempty batches only):\n");
1721 for (unsigned int i = 0; i < config.cpu_count; ++i) {
1722 printf(" %zu", cpus[i].rcu.stat_avg_cbs);
1723 }
1724
1725 printf("\nMax arrived callbacks per time slice and CPU:\n");
1726 for (unsigned int i = 0; i < config.cpu_count; ++i) {
1727 printf(" %zu", cpus[i].rcu.stat_max_slice_cbs);
1728 }
1729
1730 printf("\nMissed GP notifications per CPU:\n");
1731 for (unsigned int i = 0; i < config.cpu_count; ++i) {
1732 printf(" %zu", cpus[i].rcu.stat_missed_gps);
1733 }
1734
1735 printf("\nMissed GP notifications per CPU while waking up:\n");
1736 for (unsigned int i = 0; i < config.cpu_count; ++i) {
1737 printf(" %zu", cpus[i].rcu.stat_missed_gp_in_wait);
1738 }
1739 printf("\n");
1740}
1741
1742/** @}
1743 */
Note: See TracBrowser for help on using the repository browser.