source: mainline/kernel/generic/src/synch/rcu.c@ 0cf813d

Last change on this file since 0cf813d was 0cf813d, checked in by Adam Hraska <adam.hraska+hos@…>, 13 years ago

rcu: Added new statistics. Changed reclaimers to run callbacks with preemption disabled if too many callbacks are pending in order to avoid exhausting system memory.

1/*
2 * Copyright (c) 2012 Adam Hraska
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * - Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * - Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * - The name of the author may not be used to endorse or promote products
15 * derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29
30/** @addtogroup sync
31 * @{
32 */
33
34/**
35 * @file
36 * @brief Preemptible read-copy update. Usable from interrupt handlers.
37 */
38
39#include <synch/rcu.h>
40#include <synch/condvar.h>
41#include <synch/semaphore.h>
42#include <synch/spinlock.h>
43#include <proc/thread.h>
44#include <cpu/cpu_mask.h>
45#include <cpu.h>
46#include <smp/smp_call.h>
47#include <compiler/barrier.h>
48#include <atomic.h>
49#include <arch.h>
50#include <macros.h>
51
52/*
53 * Number of milliseconds to give to preexisting readers to finish
54 * when non-expedited grace period detection is in progress.
55 */
56#define DETECT_SLEEP_MS 10
57/*
58 * Max number of pending callbacks in the local cpu's queue before
59 * aggressively expediting the current grace period
60 */
61#define EXPEDITE_THRESHOLD 2000
62/*
63 * Max number of callbacks to execute in one go with preemption
64 * enabled. If there are more callbacks to be executed they will
65 * be run with preemption disabled in order to prolong reclaimer's
66 * time slice and give it a chance to catch up with callback producers.
67 */
68#define CRITICAL_THRESHOLD 30000
69/* Half the number of values a uint32 can hold. */
70#define UINT32_MAX_HALF 2147483648U
71
72
73/** Global RCU data. */
74typedef struct rcu_data {
75 /** The detector uses this to signal reclaimers that a grace period ended. */
76 condvar_t gp_ended;
77 /** Reclaimers notify the detector when they request more grace periods.*/
78 condvar_t req_gp_changed;
80 /** Reclaimers use this to notify the detector to accelerate GP detection. */
80 condvar_t expedite_now;
81 /**
82 * The detector waits on this semaphore for any readers delaying the GP.
83 *
84 * Each of the cpus with readers that are delaying the current GP
85 * must up() this sema once they reach a quiescent state. If there
86 * are any readers in cur_preempted (ie preempted preexisting readers) and
87 * they are already delaying GP detection, the last to unlock its
88 * reader section must up() this sema once.
89 */
90 semaphore_t remaining_readers;
91
92 /** Protects the 4 fields below. */
93 SPINLOCK_DECLARE(gp_lock);
94 /** Number of grace period ends the detector was requested to announce. */
95 size_t req_gp_end_cnt;
96 /** Number of consecutive grace periods to detect quickly and aggressively.*/
97 size_t req_expedited_cnt;
98 /**
99 * The current grace period number. Increases monotonically.
100 * Lock gp_lock or preempt_lock to get a current value.
101 */
102 rcu_gp_t cur_gp;
103 /**
104 * The number of the most recently completed grace period.
105 * At most one behind cur_gp. If equal to cur_gp, a grace
106 * period detection is not in progress and the detector
107 * is idle.
108 */
109 rcu_gp_t completed_gp;
110
111 /** Protects the following 3 fields. */
112 IRQ_SPINLOCK_DECLARE(preempt_lock);
113 /** Preexisting readers that have been preempted. */
114 list_t cur_preempted;
115 /** Readers that have been preempted and might delay the next grace period.*/
116 list_t next_preempted;
117 /**
118 * The detector is waiting for the last preempted reader
119 * in cur_preempted to announce that it exited its reader
120 * section by up()ing remaining_readers.
121 */
122 bool preempt_blocking_det;
123
124 /**
125 * Number of cpus with readers that are delaying the current GP.
126 * They will up() remaining_readers.
127 */
128 atomic_t delaying_cpu_cnt;
129
130 /** Pointer to the interruptible, attached detector thread. */
131 thread_t *detector_thr;
132
133 /* Some statistics. */
134 size_t stat_expedited_cnt;
135 size_t stat_delayed_cnt;
136 size_t stat_preempt_blocking_cnt;
137 /* Does not contain self/local calls. */
138 size_t stat_smp_call_cnt;
139} rcu_data_t;
140
141
142static rcu_data_t rcu;
143
144static void start_detector(void);
145static void start_reclaimers(void);
146static void rcu_read_unlock_impl(size_t *pnesting_cnt);
147static void synch_complete(rcu_item_t *rcu_item);
148static void check_qs(void);
149static void record_qs(void);
150static void signal_read_unlock(void);
151static bool arriving_cbs_empty(void);
152static bool next_cbs_empty(void);
153static bool cur_cbs_empty(void);
154static bool all_cbs_empty(void);
155static void reclaimer(void *arg);
156static bool wait_for_pending_cbs(void);
157static bool advance_cbs(void);
158static void exec_completed_cbs(rcu_gp_t last_completed_gp);
159static void exec_cbs(rcu_item_t **phead);
160static void req_detection(size_t req_cnt);
161static bool wait_for_cur_cbs_gp_end(bool expedite, rcu_gp_t *last_completed_gp);
162static void upd_missed_gp_in_wait(rcu_gp_t completed_gp);
163static bool cv_wait_for_gp(rcu_gp_t wait_on_gp);
164static void detector(void *);
165static bool wait_for_detect_req(void);
166static void start_new_gp(void);
167static void end_cur_gp(void);
168static bool wait_for_readers(void);
169static void rm_quiescent_cpus(cpu_mask_t *cpu_mask);
170static bool gp_sleep(void);
171static void interrupt_delaying_cpus(cpu_mask_t *cpu_mask);
172static void sample_local_cpu(void *);
173static bool wait_for_delaying_cpus(void);
174static bool wait_for_preempt_reader(void);
175static void upd_max_cbs_in_slice(void);
176
177
178
179/** Initializes global RCU structures. */
180void rcu_init(void)
181{
182 condvar_initialize(&rcu.gp_ended);
183 condvar_initialize(&rcu.req_gp_changed);
184 condvar_initialize(&rcu.expedite_now);
185 semaphore_initialize(&rcu.remaining_readers, 0);
186
187 spinlock_initialize(&rcu.gp_lock, "rcu.gp_lock");
188 rcu.req_gp_end_cnt = 0;
189 rcu.req_expedited_cnt = 0;
190 rcu.cur_gp = 0;
191 rcu.completed_gp = 0;
192
193 irq_spinlock_initialize(&rcu.preempt_lock, "rcu.preempt_lock");
194 list_initialize(&rcu.cur_preempted);
195 list_initialize(&rcu.next_preempted);
196 rcu.preempt_blocking_det = false;
197
198 atomic_set(&rcu.delaying_cpu_cnt, 0);
199
200 rcu.detector_thr = 0;
201
202 rcu.stat_expedited_cnt = 0;
203 rcu.stat_delayed_cnt = 0;
204 rcu.stat_preempt_blocking_cnt = 0;
205 rcu.stat_smp_call_cnt = 0;
206}
207
208/** Initializes per-CPU RCU data. If on the boot cpu, also initializes global data.*/
209void rcu_cpu_init(void)
210{
211 if (config.cpu_active == 1) {
212 rcu_init();
213 }
214
215 CPU->rcu.last_seen_gp = 0;
216
217 CPU->rcu.pnesting_cnt = &CPU->rcu.tmp_nesting_cnt;
218 CPU->rcu.tmp_nesting_cnt = 0;
219
220 CPU->rcu.cur_cbs = 0;
221 CPU->rcu.cur_cbs_cnt = 0;
222 CPU->rcu.next_cbs = 0;
223 CPU->rcu.next_cbs_cnt = 0;
224 CPU->rcu.arriving_cbs = 0;
225 CPU->rcu.parriving_cbs_tail = &CPU->rcu.arriving_cbs;
226 CPU->rcu.arriving_cbs_cnt = 0;
227
228 CPU->rcu.cur_cbs_gp = 0;
229 CPU->rcu.next_cbs_gp = 0;
230
231 CPU->rcu.is_delaying_gp = false;
232
233 semaphore_initialize(&CPU->rcu.arrived_flag, 0);
234
235 /* BSP creates reclaimer threads before AP's rcu_cpu_init() runs. */
236 if (config.cpu_active == 1)
237 CPU->rcu.reclaimer_thr = 0;
238
239 CPU->rcu.stat_max_cbs = 0;
240 CPU->rcu.stat_avg_cbs = 0;
241 CPU->rcu.stat_missed_gps = 0;
242 CPU->rcu.stat_missed_gp_in_wait = 0;
243 CPU->rcu.stat_max_slice_cbs = 0;
244 CPU->rcu.last_arriving_cnt = 0;
245}
246
247/** Completes RCU init. Creates and runs the detector and reclaimer threads.*/
248void rcu_kinit_init(void)
249{
250 start_detector();
251 start_reclaimers();
252}
253
254/** Initializes any per-thread RCU structures. */
255void rcu_thread_init(thread_t *thread)
256{
257 thread->rcu.nesting_cnt = 0;
258 thread->rcu.was_preempted = false;
259 link_initialize(&thread->rcu.preempt_link);
260}
261
262/** Called from scheduler() when exiting the current thread.
263 *
264 * Preemption or interrupts are disabled and the scheduler() already
265 * switched away from the current thread, calling rcu_after_thread_ran().
266 */
267void rcu_thread_exiting(void)
268{
269 ASSERT(THREAD != 0);
270 ASSERT(THREAD->state == Exiting);
271 ASSERT(PREEMPTION_DISABLED || interrupts_disabled());
272 /*
273 * The scheduler() must have already switched to a temporary
274 * nesting counter for interrupt handlers (we could be idle)
275 * so that interrupt handlers do not modify the exiting thread's
276 * reader section nesting count while we examine/process it.
277 */
278 ASSERT(&CPU->rcu.tmp_nesting_cnt == CPU->rcu.pnesting_cnt);
279
280 /*
281 * The thread forgot to exit its reader critical section.
282 * It is a bug, but rather than letting the entire system lock up,
283 * forcefully leave the reader section. The thread is not holding
284 * any references anyway since it is exiting, so this is safe.
285 */
286 if (0 < THREAD->rcu.nesting_cnt) {
287 THREAD->rcu.nesting_cnt = 1;
288 rcu_read_unlock_impl(&THREAD->rcu.nesting_cnt);
289 }
290}
291
292/** Cleans up global RCU resources and stops dispatching callbacks.
293 *
294 * Call when shutting down the kernel. Outstanding callbacks will
295 * not be processed. Instead they will linger forever.
296 */
297void rcu_stop(void)
298{
299 /* todo: stop accepting new callbacks instead of just letting them linger?*/
300
301 /* Stop and wait for reclaimers. */
302 for (unsigned int cpu_id = 0; cpu_id < config.cpu_active; ++cpu_id) {
303 ASSERT(cpus[cpu_id].rcu.reclaimer_thr != 0);
304
305 if (cpus[cpu_id].rcu.reclaimer_thr) {
306 thread_interrupt(cpus[cpu_id].rcu.reclaimer_thr);
307 thread_join(cpus[cpu_id].rcu.reclaimer_thr);
308 thread_detach(cpus[cpu_id].rcu.reclaimer_thr);
309 cpus[cpu_id].rcu.reclaimer_thr = 0;
310 }
311 }
312
313 /* Stop the detector and wait. */
314 if (rcu.detector_thr) {
315 thread_interrupt(rcu.detector_thr);
316 thread_join(rcu.detector_thr);
317 thread_detach(rcu.detector_thr);
318 rcu.detector_thr = 0;
319 }
320}
321
322/** Starts the detector thread. */
323static void start_detector(void)
324{
325 rcu.detector_thr =
326 thread_create(detector, 0, TASK, THREAD_FLAG_NONE, "rcu-det");
327
328 if (!rcu.detector_thr)
329 panic("Failed to create RCU detector thread.");
330
331 thread_ready(rcu.detector_thr);
332}
333
334/** Creates and runs cpu-bound reclaimer threads. */
335static void start_reclaimers(void)
336{
337 for (unsigned int cpu_id = 0; cpu_id < config.cpu_count; ++cpu_id) {
338 char name[THREAD_NAME_BUFLEN] = {0};
339
340 snprintf(name, THREAD_NAME_BUFLEN - 1, "rcu-rec/%u", cpu_id);
341
342 cpus[cpu_id].rcu.reclaimer_thr =
343 thread_create(reclaimer, 0, TASK, THREAD_FLAG_NONE, name);
344
345 if (!cpus[cpu_id].rcu.reclaimer_thr)
346 panic("Failed to create RCU reclaimer thread on cpu%u.", cpu_id);
347
348 thread_wire(cpus[cpu_id].rcu.reclaimer_thr, &cpus[cpu_id]);
349 thread_ready(cpus[cpu_id].rcu.reclaimer_thr);
350 }
351}
352
353/** Returns the number of elapsed grace periods since boot. */
354uint64_t rcu_completed_gps(void)
355{
356 spinlock_lock(&rcu.gp_lock);
357 uint64_t completed = rcu.completed_gp;
358 spinlock_unlock(&rcu.gp_lock);
359
360 return completed;
361}
362
363/** Delimits the start of an RCU reader critical section.
364 *
365 * Reader sections may be nested and are preemptable. You must not,
366 * however, block/sleep within reader sections.
367 */
368void rcu_read_lock(void)
369{
370 ASSERT(CPU);
371 preemption_disable();
372
373 check_qs();
374 ++(*CPU->rcu.pnesting_cnt);
375
376 preemption_enable();
377}
378
379/** Delimits the end of an RCU reader critical section. */
380void rcu_read_unlock(void)
381{
382 ASSERT(CPU);
383 preemption_disable();
384
385 rcu_read_unlock_impl(CPU->rcu.pnesting_cnt);
386
387 preemption_enable();
388}
389
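/*
 * Example of a reader section (an illustrative sketch only, not part of the
 * original file; cfg_t, shared_cfg and use_cfg() are hypothetical names).
 * The reader must not block or sleep between rcu_read_lock() and
 * rcu_read_unlock():
 *
 *   rcu_read_lock();
 *   cfg_t *cfg = ACCESS_ONCE(shared_cfg);
 *   if (cfg)
 *       use_cfg(cfg);
 *   rcu_read_unlock();
 */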
390/** Unlocks the local reader section using the given nesting count.
391 *
392 * Preemption or interrupts must be disabled.
393 *
394 * @param pnesting_cnt Either &CPU->rcu.tmp_nesting_cnt or
395 * THREAD->rcu.nesting_cnt.
396 */
397static void rcu_read_unlock_impl(size_t *pnesting_cnt)
398{
399 ASSERT(PREEMPTION_DISABLED || interrupts_disabled());
400
401 if (0 == --(*pnesting_cnt)) {
402 record_qs();
403
404 /*
405 * The thread was preempted while in a critical section or
406 * the detector is eagerly waiting for this cpu's reader
407 * to finish.
408 *
409 * Note that THREAD may be 0 in scheduler() and not just during boot.
410 */
411 if ((THREAD && THREAD->rcu.was_preempted) || CPU->rcu.is_delaying_gp) {
412 /* Rechecks with disabled interrupts. */
413 signal_read_unlock();
414 }
415 }
416}
417
418/** Records a QS if not in a reader critical section. */
419static void check_qs(void)
420{
421 ASSERT(PREEMPTION_DISABLED || interrupts_disabled());
422
423 if (0 == *CPU->rcu.pnesting_cnt)
424 record_qs();
425}
426
427/** Unconditionally records a quiescent state for the local cpu. */
428static void record_qs(void)
429{
430 ASSERT(PREEMPTION_DISABLED || interrupts_disabled());
431
432 /*
433 * A new GP was started since the last time we passed a QS.
434 * Notify the detector we have reached a new QS.
435 */
436 if (CPU->rcu.last_seen_gp != rcu.cur_gp) {
437 rcu_gp_t cur_gp = ACCESS_ONCE(rcu.cur_gp);
438 /*
439 * Contain memory accesses within a reader critical section.
440 * If we are in rcu_read_lock() it also makes changes prior to the
441 * start of the GP visible in the reader section.
442 */
443 memory_barrier();
444 /*
445 * Acknowledge we passed a QS since the beginning of rcu.cur_gp.
446 * Cache coherency will lazily transport the value to the
447 * detector while it sleeps in gp_sleep().
448 *
449 * Note that there is a theoretical possibility that we
450 * overwrite a more recent/greater last_seen_gp here with
451 * an older/smaller value. If this cpu is interrupted here
452 * while in rcu_read_lock(), reader sections in the interrupt handler
453 * will update last_seen_gp to the same value as is currently
454 * in local cur_gp. However, if the cpu continues processing
455 * interrupts and the detector starts a new GP immediately,
456 * local interrupt handlers may update last_seen_gp again (ie
457 * properly ack the new GP) with a value greater than local cur_gp.
458 * Resetting last_seen_gp to a previous value here is however
459 * benign and we only have to remember that this reader may end up
460 * in cur_preempted even after the GP ends. That is why we
461 * append next_preempted to cur_preempted rather than overwriting
462 * it as if cur_preempted were empty.
463 */
464 CPU->rcu.last_seen_gp = cur_gp;
465 }
466}
467
468/** If necessary, signals the detector that we exited a reader section. */
469static void signal_read_unlock(void)
470{
471 ASSERT(PREEMPTION_DISABLED || interrupts_disabled());
472
473 /*
474 * We have to disable interrupts in order to make checking
475 * and resetting was_preempted and is_delaying_gp atomic
476 * with respect to local interrupt handlers. Otherwise
477 * an interrupt could beat us to calling semaphore_up()
478 * before we reset the appropriate flag.
479 */
480 ipl_t ipl = interrupts_disable();
481
482 /*
483 * If the detector is eagerly waiting for this cpu's reader to unlock,
484 * notify it that the reader did so.
485 */
486 if (CPU->rcu.is_delaying_gp) {
487 CPU->rcu.is_delaying_gp = false;
488 semaphore_up(&rcu.remaining_readers);
489 }
490
491 /*
492 * This reader was preempted while in a reader section.
493 * We might be holding up the current GP. Notify the
494 * detector if so.
495 */
496 if (THREAD && THREAD->rcu.was_preempted) {
497 ASSERT(link_used(&THREAD->rcu.preempt_link));
498 THREAD->rcu.was_preempted = false;
499
500 irq_spinlock_lock(&rcu.preempt_lock, false);
501
502 bool prev_empty = list_empty(&rcu.cur_preempted);
503 list_remove(&THREAD->rcu.preempt_link);
504 bool now_empty = list_empty(&rcu.cur_preempted);
505
506 /* This was the last reader in cur_preempted. */
507 bool last_removed = now_empty && !prev_empty;
508
509 /*
510 * Preempted readers are blocking the detector and
511 * this was the last reader blocking the current GP.
512 */
513 if (last_removed && rcu.preempt_blocking_det) {
514 rcu.preempt_blocking_det = false;
515 semaphore_up(&rcu.remaining_readers);
516 }
517
518 irq_spinlock_unlock(&rcu.preempt_lock, false);
519 }
520 interrupts_restore(ipl);
521}
522
523typedef struct synch_item {
524 waitq_t wq;
525 rcu_item_t rcu_item;
526} synch_item_t;
527
528/** Blocks until all preexisting readers exit their critical sections. */
529void rcu_synchronize(void)
530{
531 /* Calling from a reader section will deadlock. */
532 ASSERT(THREAD == 0 || 0 == THREAD->rcu.nesting_cnt);
533
534 synch_item_t completion;
535
536 waitq_initialize(&completion.wq);
537 rcu_call(&completion.rcu_item, synch_complete);
538 waitq_sleep(&completion.wq);
539 waitq_complete_wakeup(&completion.wq);
540}
541
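/*
 * Typical update built on rcu_synchronize() (an illustrative sketch only,
 * not part of the original file; node, node_link, list_lock and the use of
 * free() are hypothetical here). First unlink the item so no new reader can
 * reach it, then wait for all preexisting readers before reclaiming it:
 *
 *   irq_spinlock_lock(&list_lock, true);
 *   list_remove(&node->node_link);
 *   irq_spinlock_unlock(&list_lock, true);
 *
 *   rcu_synchronize();
 *   free(node);
 */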
542/** rcu_synchronize's callback. */
543static void synch_complete(rcu_item_t *rcu_item)
544{
545 synch_item_t *completion = member_to_inst(rcu_item, synch_item_t, rcu_item);
546 ASSERT(completion);
547 waitq_wakeup(&completion->wq, WAKEUP_FIRST);
548}
549
550/** Adds a callback to invoke after all preexisting readers finish.
551 *
552 * May be called from within interrupt handlers or RCU reader sections.
553 *
554 * @param rcu_item Used by RCU to track the call. Must remain allocated
555 * until the user callback function is entered.
556 * @param func User callback function that will be invoked once a full
557 * grace period has elapsed, ie at a time when all preexisting
558 * readers have finished. The callback should be short and must
559 * not block. If you must sleep, enqueue your work in the system
560 * work queue from the callback (ie workq_global_enqueue()).
561 */
562void rcu_call(rcu_item_t *rcu_item, rcu_func_t func)
563{
564 _rcu_call(false, rcu_item, func);
565}
566
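/*
 * Example of deferred reclamation with rcu_call() (an illustrative sketch
 * only, not part of the original file; item_t, item_reclaim() and the use of
 * free() are hypothetical). The rcu_item_t is embedded in the tracked
 * structure and recovered in the callback with member_to_inst(), just like
 * synch_complete() does above:
 *
 *   typedef struct item {
 *       rcu_item_t rcu;
 *       int payload;
 *   } item_t;
 *
 *   static void item_reclaim(rcu_item_t *rcu_item)
 *   {
 *       item_t *it = member_to_inst(rcu_item, item_t, rcu);
 *       free(it);
 *   }
 *
 * After unlinking an item from the shared structure, the updater hands it
 * off and returns immediately, even from an interrupt handler or a reader
 * section:
 *
 *   rcu_call(&it->rcu, item_reclaim);
 */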
567/** rcu_call() implementation. See rcu_call() for comments. */
568void _rcu_call(bool expedite, rcu_item_t *rcu_item, rcu_func_t func)
569{
570 ASSERT(rcu_item);
571
572 rcu_item->func = func;
573 rcu_item->next = 0;
574
575 preemption_disable();
576
577 ipl_t ipl = interrupts_disable();
578
579 *CPU->rcu.parriving_cbs_tail = rcu_item;
580 CPU->rcu.parriving_cbs_tail = &rcu_item->next;
581
582 size_t cnt = ++CPU->rcu.arriving_cbs_cnt;
583 interrupts_restore(ipl);
584
585 if (expedite) {
586 CPU->rcu.expedite_arriving = true;
587 }
588
589 /* Added first callback - notify the reclaimer. */
590 if (cnt == 1 && !semaphore_count_get(&CPU->rcu.arrived_flag)) {
591 semaphore_up(&CPU->rcu.arrived_flag);
592 }
593
594 preemption_enable();
595}
596
597static bool cur_cbs_empty(void)
598{
599 ASSERT(THREAD && THREAD->wired);
600 return 0 == CPU->rcu.cur_cbs;
601}
602
603static bool next_cbs_empty(void)
604{
605 ASSERT(THREAD && THREAD->wired);
606 return 0 == CPU->rcu.next_cbs;
607}
608
609/** Disable interrupts to get an up-to-date result. */
610static bool arriving_cbs_empty(void)
611{
612 ASSERT(THREAD && THREAD->wired);
613 /*
614 * Accessing with interrupts enabled may at worst lead to
615 * a false negative if we race with a local interrupt handler.
616 */
617 return 0 == CPU->rcu.arriving_cbs;
618}
619
620static bool all_cbs_empty(void)
621{
622 return cur_cbs_empty() && next_cbs_empty() && arriving_cbs_empty();
623}
624
625/** Reclaimer thread dispatches locally queued callbacks once a GP ends. */
626static void reclaimer(void *arg)
627{
628 ASSERT(THREAD && THREAD->wired);
629 ASSERT(THREAD == CPU->rcu.reclaimer_thr);
630
631 rcu_gp_t last_compl_gp = 0;
632 bool ok = true;
633
634 while (ok && wait_for_pending_cbs()) {
635 ASSERT(CPU->rcu.reclaimer_thr == THREAD);
636
637 exec_completed_cbs(last_compl_gp);
638
639 bool expedite = advance_cbs();
640
641 ok = wait_for_cur_cbs_gp_end(expedite, &last_compl_gp);
642 }
643}
644
645/** Waits until there are callbacks waiting to be dispatched. */
646static bool wait_for_pending_cbs(void)
647{
648 if (!all_cbs_empty())
649 return true;
650
651 bool ok = true;
652
653 while (arriving_cbs_empty() && ok) {
654 ok = semaphore_down_interruptable(&CPU->rcu.arrived_flag);
655 }
656
657 return ok;
658}
659
660static void upd_stat_missed_gp(rcu_gp_t compl)
661{
662 if (CPU->rcu.cur_cbs_gp < compl) {
663 CPU->rcu.stat_missed_gps += (size_t)(compl - CPU->rcu.cur_cbs_gp);
664 }
665}
666
667/** Executes all callbacks for the given completed grace period. */
668static void exec_completed_cbs(rcu_gp_t last_completed_gp)
669{
670 upd_stat_missed_gp(last_completed_gp);
671
672 /* Both next_cbs and cur_cbs GP elapsed. */
673 if (CPU->rcu.next_cbs_gp <= last_completed_gp) {
674 ASSERT(CPU->rcu.cur_cbs_gp <= CPU->rcu.next_cbs_gp);
675
676 size_t exec_cnt = CPU->rcu.cur_cbs_cnt + CPU->rcu.next_cbs_cnt;
677
678 if (exec_cnt < CRITICAL_THRESHOLD) {
679 exec_cbs(&CPU->rcu.cur_cbs);
680 exec_cbs(&CPU->rcu.next_cbs);
681 } else {
682 /*
683 * Getting overwhelmed with too many callbacks to run.
684 * Disable preemption in order to prolong our time slice
685 * and catch up with updaters posting new callbacks.
686 */
687 preemption_disable();
688 exec_cbs(&CPU->rcu.cur_cbs);
689 exec_cbs(&CPU->rcu.next_cbs);
690 preemption_enable();
691 }
692
693 CPU->rcu.cur_cbs_cnt = 0;
694 CPU->rcu.next_cbs_cnt = 0;
695 } else if (CPU->rcu.cur_cbs_gp <= last_completed_gp) {
696
697 if (CPU->rcu.cur_cbs_cnt < CRITICAL_THRESHOLD) {
698 exec_cbs(&CPU->rcu.cur_cbs);
699 } else {
700 /*
701 * Getting overwhelmed with too many callbacks to run.
702 * Disable preemption in order to prolong our time slice
703 * and catch up with updaters posting new callbacks.
704 */
705 preemption_disable();
706 exec_cbs(&CPU->rcu.cur_cbs);
707 preemption_enable();
708 }
709
710 CPU->rcu.cur_cbs_cnt = 0;
711 }
712}
713
714/** Executes callbacks in the singly-linked list. The list is left empty. */
715static void exec_cbs(rcu_item_t **phead)
716{
717 rcu_item_t *rcu_item = *phead;
718
719 while (rcu_item) {
720 /* func() may free rcu_item. Get a local copy. */
721 rcu_item_t *next = rcu_item->next;
722 rcu_func_t func = rcu_item->func;
723
724 func(rcu_item);
725
726 rcu_item = next;
727 }
728
729 *phead = 0;
730}
731
732static void upd_stat_cb_cnts(size_t arriving_cnt)
733{
734 CPU->rcu.stat_max_cbs = max(arriving_cnt, CPU->rcu.stat_max_cbs);
735 if (0 < arriving_cnt) {
736 CPU->rcu.stat_avg_cbs =
737 (99 * CPU->rcu.stat_avg_cbs + 1 * arriving_cnt) / 100;
738 }
739}
740
741
742/** Prepares another batch of callbacks to dispatch at the next grace period.
743 *
744 * @return True if the next batch of callbacks must be expedited quickly.
745 */
746static bool advance_cbs(void)
747{
748 /* Move next_cbs to cur_cbs. */
749 CPU->rcu.cur_cbs = CPU->rcu.next_cbs;
750 CPU->rcu.cur_cbs_cnt = CPU->rcu.next_cbs_cnt;
751 CPU->rcu.cur_cbs_gp = CPU->rcu.next_cbs_gp;
752
753 /* Move arriving_cbs to next_cbs. Empties arriving_cbs. */
754 ipl_t ipl = interrupts_disable();
755
756 /*
757 * Too many callbacks queued. Better speed up the detection
758 * or risk exhausting all system memory.
759 */
760 bool expedite = (EXPEDITE_THRESHOLD < CPU->rcu.arriving_cbs_cnt)
761 || CPU->rcu.expedite_arriving;
762
763 CPU->rcu.expedite_arriving = false;
764
765 CPU->rcu.next_cbs = CPU->rcu.arriving_cbs;
766 CPU->rcu.next_cbs_cnt = CPU->rcu.arriving_cbs_cnt;
767
768 CPU->rcu.arriving_cbs = 0;
769 CPU->rcu.parriving_cbs_tail = &CPU->rcu.arriving_cbs;
770 CPU->rcu.arriving_cbs_cnt = 0;
771
772 interrupts_restore(ipl);
773
774 /* Update statistics of arrived callbacks. */
775 upd_stat_cb_cnts(CPU->rcu.next_cbs_cnt);
776
777 /*
778 * Make changes prior to queuing next_cbs visible to readers.
779 * See comment in wait_for_readers().
780 */
781 memory_barrier(); /* MB A, B */
782
783 /* At the end of next_cbs_gp, exec next_cbs. Determine what GP that is. */
784
785 if (!next_cbs_empty()) {
786 spinlock_lock(&rcu.gp_lock);
787
788 /* Exec next_cbs at the end of the next GP. */
789 CPU->rcu.next_cbs_gp = rcu.cur_gp + 1;
790
791 /*
792 * There are no callbacks to invoke before next_cbs. Instruct
793 * wait_for_cur_cbs_gp_end() to notify us of the nearest GP end.
794 * That could be sooner than next_cbs_gp (if the current GP
795 * had not yet completed), so we'll create a shorter batch
796 * of callbacks next time around.
797 */
798 if (cur_cbs_empty()) {
799 CPU->rcu.cur_cbs_gp = rcu.completed_gp + 1;
800 }
801
802 spinlock_unlock(&rcu.gp_lock);
803 } else {
804 CPU->rcu.next_cbs_gp = CPU->rcu.cur_cbs_gp;
805 }
806
807 ASSERT(CPU->rcu.cur_cbs_gp <= CPU->rcu.next_cbs_gp);
808
809 return expedite;
810}
811
812/** Waits for the grace period associated with the callbacks in cur_cbs to elapse.
813 *
814 * @param expedite Instructs the detector to aggressively speed up grace
815 * period detection without any delay.
816 * @param completed_gp Returns the most recent completed grace period
817 * number.
818 * @return false if the thread was interrupted and should stop.
819 */
820static bool wait_for_cur_cbs_gp_end(bool expedite, rcu_gp_t *completed_gp)
821{
822 /*
823 * Use a possibly outdated version of completed_gp to bypass checking
824 * with the lock.
825 *
826 * Note that loading and storing rcu.completed_gp is not atomic
827 * (it is 64bit wide). Reading a clobbered value that is less than
828 * rcu.completed_gp is harmless - we'll recheck with a lock. The
829 * only way to read a clobbered value that is greater than the actual
830 * value is if the detector increases the higher-order word first and
831 * then decreases the lower-order word (or we see stores in that order),
832 * eg when incrementing from 2^32 - 1 to 2^32. The loaded value
833 * suddenly jumps by 2^32. It would take hours for such an increase
834 * to occur so it is safe to discard the value. We allow increases
835 * of up to half the maximum to generously accommodate for loading an
836 * outdated lower word.
837 */
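 /*
  * A concrete instance of the race tolerated above (added illustration, not
  * in the original): if the actual completed_gp changes from 0xffffffff to
  * 0x100000000, a torn read may combine the new upper word with the old
  * lower word and yield 0x1ffffffff, ie a value almost 2^32 too large.
  * Such a value fails the second half of the check below, so we fall
  * through to the locked recheck instead of trusting it.
  */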
838 rcu_gp_t compl_gp = ACCESS_ONCE(rcu.completed_gp);
839 if (CPU->rcu.cur_cbs_gp <= compl_gp
840 && compl_gp <= CPU->rcu.cur_cbs_gp + UINT32_MAX_HALF) {
841 *completed_gp = compl_gp;
842 return true;
843 }
844
845 spinlock_lock(&rcu.gp_lock);
846
847 if (CPU->rcu.cur_cbs_gp <= rcu.completed_gp) {
848 *completed_gp = rcu.completed_gp;
849 spinlock_unlock(&rcu.gp_lock);
850 return true;
851 }
852
853 ASSERT(CPU->rcu.cur_cbs_gp <= CPU->rcu.next_cbs_gp);
854 ASSERT(rcu.cur_gp <= CPU->rcu.cur_cbs_gp);
855
856 /*
857 * Notify the detector of how many GP ends we intend to wait for, so
858 * it can avoid going to sleep unnecessarily. Optimistically assume
859 * new callbacks will arrive while we're waiting; hence +1.
860 */
861 size_t remaining_gp_ends = (size_t) (CPU->rcu.next_cbs_gp - rcu.cur_gp);
862 req_detection(remaining_gp_ends + (arriving_cbs_empty() ? 0 : 1));
863
864 /*
865 * Ask the detector to speed up GP detection if there are too many
866 * pending callbacks and other reclaimers have not already done so.
867 */
868 if (expedite) {
869 if (0 == rcu.req_expedited_cnt)
870 condvar_signal(&rcu.expedite_now);
871
872 /*
873 * Expedite only cur_cbs. If there really is a surge of callbacks,
874 * the arriving batch will expedite the GP for the huge number
875 * of callbacks currently in next_cbs.
876 */
877 rcu.req_expedited_cnt = 1;
878 }
879
880 /* Wait for cur_cbs_gp to end. */
881 bool interrupted = cv_wait_for_gp(CPU->rcu.cur_cbs_gp);
882
883 *completed_gp = rcu.completed_gp;
884 spinlock_unlock(&rcu.gp_lock);
885
886 upd_missed_gp_in_wait(*completed_gp);
887
888 return !interrupted;
889}
890
891static void upd_missed_gp_in_wait(rcu_gp_t completed_gp)
892{
893 ASSERT(CPU->rcu.cur_cbs_gp <= completed_gp);
894
895 size_t delta = (size_t)(completed_gp - CPU->rcu.cur_cbs_gp);
896 CPU->rcu.stat_missed_gp_in_wait += delta;
897}
898
899
900/** Requests the detector to detect at least req_cnt consecutive grace periods.*/
901static void req_detection(size_t req_cnt)
902{
903 if (rcu.req_gp_end_cnt < req_cnt) {
904 bool detector_idle = (0 == rcu.req_gp_end_cnt);
905 rcu.req_gp_end_cnt = req_cnt;
906
907 if (detector_idle) {
908 ASSERT(rcu.cur_gp == rcu.completed_gp);
909 condvar_signal(&rcu.req_gp_changed);
910 }
911 }
912}
913
914/** Waits for an announcement of the end of the grace period wait_on_gp. */
915static bool cv_wait_for_gp(rcu_gp_t wait_on_gp)
916{
917 ASSERT(spinlock_locked(&rcu.gp_lock));
918
919 bool interrupted = false;
920
921 /* Wait until wait_on_gp ends. */
922 while (rcu.completed_gp < wait_on_gp && !interrupted) {
923 int ret = _condvar_wait_timeout_spinlock(&rcu.gp_ended, &rcu.gp_lock,
924 SYNCH_NO_TIMEOUT, SYNCH_FLAGS_INTERRUPTIBLE);
925 interrupted = (ret == ESYNCH_INTERRUPTED);
926 }
927
928 ASSERT(wait_on_gp <= rcu.completed_gp);
929
930 return interrupted;
931}
932
933/** The detector thread detects and notifies reclaimers of grace period ends. */
934static void detector(void *arg)
935{
936 spinlock_lock(&rcu.gp_lock);
937
938 while (wait_for_detect_req()) {
939 /*
940 * Announce new GP started. Readers start lazily acknowledging that
941 * they passed a QS.
942 */
943 start_new_gp();
944
945 spinlock_unlock(&rcu.gp_lock);
946
947 if (!wait_for_readers())
948 goto unlocked_out;
949
950 spinlock_lock(&rcu.gp_lock);
951
952 /* Notify reclaimers that they may now invoke queued callbacks. */
953 end_cur_gp();
954 }
955
956 spinlock_unlock(&rcu.gp_lock);
957
958unlocked_out:
959 return;
960}
961
962/** Waits for a request from a reclaimer thread to detect a grace period. */
963static bool wait_for_detect_req(void)
964{
965 ASSERT(spinlock_locked(&rcu.gp_lock));
966
967 bool interrupted = false;
968
969 while (0 == rcu.req_gp_end_cnt && !interrupted) {
970 int ret = _condvar_wait_timeout_spinlock(&rcu.req_gp_changed,
971 &rcu.gp_lock, SYNCH_NO_TIMEOUT, SYNCH_FLAGS_INTERRUPTIBLE);
972
973 interrupted = (ret == ESYNCH_INTERRUPTED);
974 }
975
976 return !interrupted;
977}
978
979/** Announces the start of a new grace period for preexisting readers to ack. */
980static void start_new_gp(void)
981{
982 ASSERT(spinlock_locked(&rcu.gp_lock));
983
984 irq_spinlock_lock(&rcu.preempt_lock, true);
985
986 /* Start a new GP. Announce to readers that a quiescent state is needed. */
987 ++rcu.cur_gp;
988
989 /*
990 * Readers preempted before the start of this GP (next_preempted)
991 * are preexisting readers now that a GP started and will hold up
992 * the current GP until they exit their reader sections.
993 *
994 * Preempted readers from the previous GP have finished so
995 * cur_preempted is empty, but see comment in record_qs().
996 */
997 list_concat(&rcu.cur_preempted, &rcu.next_preempted);
998
999 irq_spinlock_unlock(&rcu.preempt_lock, true);
1000}
1001
1002static void end_cur_gp(void)
1003{
1004 ASSERT(spinlock_locked(&rcu.gp_lock));
1005
1006 rcu.completed_gp = rcu.cur_gp;
1007 --rcu.req_gp_end_cnt;
1008
1009 condvar_broadcast(&rcu.gp_ended);
1010}
1011
1012/** Waits for readers that started before the current GP started to finish. */
1013static bool wait_for_readers(void)
1014{
1015 DEFINE_CPU_MASK(reading_cpus);
1016
1017 /* All running cpus have potential readers. */
1018 cpu_mask_active(reading_cpus);
1019
1020 /*
1021 * Ensure the announcement of the start of a new GP (ie up-to-date
1022 * cur_gp) propagates to cpus that are just coming out of idle
1023 * mode before we sample their idle state flag.
1024 *
1025 * Cpus guarantee that after they set CPU->idle = true they will not
1026 * execute any RCU reader sections without first setting idle to
1027 * false and issuing a memory barrier. Therefore, if rm_quiescent_cpus()
1028 * later on sees an idle cpu, but the cpu is just exiting its idle mode,
1029 * the cpu must not have yet executed its memory barrier (otherwise
1030 * it would pair up with this mem barrier and we would see idle == false).
1031 * That memory barrier will pair up with the one below and ensure
1032 * that a reader on the now-non-idle cpu will see the most current
1033 * cur_gp. As a result, such a reader will never attempt to semaphore_up(
1034 * remaining_readers) during this GP, which allows the detector to
1035 * ignore that cpu (the detector thinks it is idle). Moreover, any
1036 * changes made by RCU updaters will have propagated to readers
1037 * on the previously idle cpu -- again thanks to issuing a memory
1038 * barrier after returning from idle mode.
1039 *
1040 * idle -> non-idle cpu | detector | reclaimer
1041 * ------------------------------------------------------
1042 * rcu reader 1 | | rcu_call()
1043 * MB X | |
1044 * idle = true | | rcu_call()
1045 * (no rcu readers allowed ) | | MB A in advance_cbs()
1046 * MB Y | (...) | (...)
1047 * (no rcu readers allowed) | | MB B in advance_cbs()
1048 * idle = false | ++cur_gp |
1049 * (no rcu readers allowed) | MB C |
1050 * MB Z | signal gp_end |
1051 * rcu reader 2 | | exec_cur_cbs()
1052 *
1053 *
1054 * MB Y orders visibility of changes to idle for detector's sake.
1055 *
1056 * MB Z pairs up with MB C. The cpu making a transition from idle
1057 * will see the most current value of cur_gp and will not attempt
1058 * to notify the detector even if preempted during this GP.
1059 *
1060 * MB Z pairs up with MB A from the previous batch. Updaters' changes
1061 * are visible to reader 2 even when the detector thinks the cpu is idle
1062 * but it is not anymore.
1063 *
1064 * MB X pairs up with MB B. Late mem accesses of reader 1 are contained
1065 * and visible before idling and before any callbacks are executed
1066 * by reclaimers.
1067 *
1068 * In summary, the detector does not know of or wait for reader 2, but
1069 * it does not have to since it is a new reader that will not access
1070 * data from previous GPs and will see any changes.
1071 */
1072 memory_barrier(); /* MB C */
1073
1074 /*
1075 * Give readers time to pass through a QS. Also, batch arriving
1076 * callbacks in order to amortize detection overhead.
1077 */
1078 if (!gp_sleep())
1079 return false;
1080
1081 /* Non-intrusively determine which cpus have yet to pass a QS. */
1082 rm_quiescent_cpus(reading_cpus);
1083
1084 /* Actively interrupt cpus delaying the current GP and demand a QS. */
1085 interrupt_delaying_cpus(reading_cpus);
1086
1087 /* Wait for the interrupted cpus to notify us that they reached a QS. */
1088 if (!wait_for_delaying_cpus())
1089 return false;
1090 /*
1091 * All cpus recorded a QS or are still idle. Any new readers will be added
1092 * to next_preempted if preempted, ie the number of readers in cur_preempted
1093 * monotonically decreases.
1094 */
1095
1096 /* Wait for the last reader in cur_preempted to notify us it is done. */
1097 if (!wait_for_preempt_reader())
1098 return false;
1099
1100 return true;
1101}
1102
1103/** Removes those cpus from the mask that have already passed a quiescent
1104 * state since the start of the current grace period.
1105 */
1106static void rm_quiescent_cpus(cpu_mask_t *cpu_mask)
1107{
1108 cpu_mask_for_each(*cpu_mask, cpu_id) {
1109 /*
1110 * The cpu already checked for and passed through a quiescent
1111 * state since the beginning of this GP.
1112 *
1113 * rcu.cur_gp is modified only by the detector thread, ie by us.
1114 * Therefore, it is up-to-date even without a lock.
1115 */
1116 bool cpu_acked_gp = (cpus[cpu_id].rcu.last_seen_gp == rcu.cur_gp);
1117
1118 /*
1119 * Either the cpu is idle or it is just exiting idle mode
1120 * and already sees the most current rcu.cur_gp. See comment
1121 * in wait_for_readers().
1122 */
1123 bool cpu_idle = cpus[cpu_id].idle;
1124
1125 if (cpu_acked_gp || cpu_idle) {
1126 cpu_mask_reset(cpu_mask, cpu_id);
1127 }
1128 }
1129}
1130
1131/** Sleeps a while if the current grace period is not to be expedited. */
1132static bool gp_sleep(void)
1133{
1134 spinlock_lock(&rcu.gp_lock);
1135
1136 int ret = 0;
1137 while (0 == rcu.req_expedited_cnt && 0 == ret) {
1138 /* minor bug: sleeps for the same duration if woken up spuriously. */
1139 ret = _condvar_wait_timeout_spinlock(&rcu.expedite_now, &rcu.gp_lock,
1140 DETECT_SLEEP_MS * 1000, SYNCH_FLAGS_INTERRUPTIBLE);
1141 }
1142
1143 if (0 < rcu.req_expedited_cnt) {
1144 --rcu.req_expedited_cnt;
1145 /* Update statistic. */
1146 ++rcu.stat_expedited_cnt;
1147 }
1148
1149 spinlock_unlock(&rcu.gp_lock);
1150
1151 return (ret != ESYNCH_INTERRUPTED);
1152}
1153
1154/** Actively interrupts and checks the offending cpus for quiescent states. */
1155static void interrupt_delaying_cpus(cpu_mask_t *cpu_mask)
1156{
1157 const size_t max_concurrent_calls = 16;
1158 smp_call_t call[max_concurrent_calls];
1159 size_t outstanding_calls = 0;
1160
1161 atomic_set(&rcu.delaying_cpu_cnt, 0);
1162
1163 cpu_mask_for_each(*cpu_mask, cpu_id) {
1164 smp_call_async(cpu_id, sample_local_cpu, 0, &call[outstanding_calls]);
1165 ++outstanding_calls;
1166
1167 /* Update statistic. */
1168 if (CPU->id != cpu_id)
1169 ++rcu.stat_smp_call_cnt;
1170
1171 if (outstanding_calls == max_concurrent_calls) {
1172 for (size_t k = 0; k < outstanding_calls; ++k) {
1173 smp_call_wait(&call[k]);
1174 }
1175
1176 outstanding_calls = 0;
1177 }
1178 }
1179
1180 for (size_t k = 0; k < outstanding_calls; ++k) {
1181 smp_call_wait(&call[k]);
1182 }
1183}
1184
1185/** Invoked on a cpu delaying grace period detection.
1186 *
1187 * Induces a quiescent state for the cpu or instructs its remaining
1188 * readers to notify the detector once they finish.
1189 */
1190static void sample_local_cpu(void *arg)
1191{
1192 ASSERT(interrupts_disabled());
1193 ASSERT(!CPU->rcu.is_delaying_gp);
1194
1195 /* Cpu did not pass a quiescent state yet. */
1196 if (CPU->rcu.last_seen_gp != rcu.cur_gp) {
1197 /* Interrupted a reader in a reader critical section. */
1198 if (0 < (*CPU->rcu.pnesting_cnt)) {
1199 ASSERT(!CPU->idle);
1200 /* Note to notify the detector from rcu_read_unlock(). */
1201 CPU->rcu.is_delaying_gp = true;
1202 atomic_inc(&rcu.delaying_cpu_cnt);
1203 } else {
1204 /*
1205 * The cpu did not enter any rcu reader sections since
1206 * the start of the current GP. Record a quiescent state.
1207 *
1208 * Or, we interrupted rcu_read_unlock_impl() right before
1209 * it recorded a QS. Record a QS for it. The memory barrier
1210 * contains the reader section's mem accesses before
1211 * updating last_seen_gp.
1212 *
1213 * Or, we interrupted rcu_read_lock() right after it recorded
1214 * a QS for the previous GP but before it got a chance to
1215 * increment its nesting count. The memory barrier again
1216 * stops the CS code from spilling out of the CS.
1217 */
1218 memory_barrier();
1219 CPU->rcu.last_seen_gp = rcu.cur_gp;
1220 }
1221 } else {
1222 /*
1223 * This cpu already acknowledged that it had passed through
1224 * a quiescent state since the start of cur_gp.
1225 */
1226 }
1227
1228 /*
1229 * smp_call() makes sure any changes propagate back to the caller.
1230 * In particular, it makes the most current last_seen_gp visible
1231 * to the detector.
1232 */
1233}
1234
1235/** Waits for cpus delaying the current grace period if there are any. */
1236static bool wait_for_delaying_cpus(void)
1237{
1238 int delaying_cpu_cnt = atomic_get(&rcu.delaying_cpu_cnt);
1239
1240 for (int i = 0; i < delaying_cpu_cnt; ++i){
1241 if (!semaphore_down_interruptable(&rcu.remaining_readers))
1242 return false;
1243 }
1244
1245 /* Update statistic. */
1246 rcu.stat_delayed_cnt += delaying_cpu_cnt;
1247
1248 return true;
1249}
1250
1251/** Waits for any preempted readers blocking this grace period to finish.*/
1252static bool wait_for_preempt_reader(void)
1253{
1254 irq_spinlock_lock(&rcu.preempt_lock, true);
1255
1256 bool reader_exists = !list_empty(&rcu.cur_preempted);
1257 rcu.preempt_blocking_det = reader_exists;
1258
1259 irq_spinlock_unlock(&rcu.preempt_lock, true);
1260
1261 if (reader_exists) {
1262 /* Update statistic. */
1263 ++rcu.stat_preempt_blocking_cnt;
1264
1265 return semaphore_down_interruptable(&rcu.remaining_readers);
1266 }
1267
1268 return true;
1269}
1270
1271/** Called by the scheduler() when switching away from the current thread. */
1272void rcu_after_thread_ran(void)
1273{
1274 ASSERT(interrupts_disabled());
1275 ASSERT(CPU->rcu.pnesting_cnt == &THREAD->rcu.nesting_cnt);
1276
1277 /* Preempted a reader critical section for the first time. */
1278 if (0 < THREAD->rcu.nesting_cnt && !THREAD->rcu.was_preempted) {
1279 THREAD->rcu.was_preempted = true;
1280
1281 irq_spinlock_lock(&rcu.preempt_lock, false);
1282
1283 if (CPU->rcu.last_seen_gp != rcu.cur_gp) {
1284 /* The reader started before the GP started - we must wait for it.*/
1285 list_append(&THREAD->rcu.preempt_link, &rcu.cur_preempted);
1286 } else {
1287 /*
1288 * The reader started after the GP started and this cpu
1289 * already noted a quiescent state. We might block the next GP.
1290 */
1291 list_append(&THREAD->rcu.preempt_link, &rcu.next_preempted);
1292 }
1293
1294 irq_spinlock_unlock(&rcu.preempt_lock, false);
1295 }
1296
1297 /*
1298 * The preempted reader has been noted globally. There are therefore
1299 * no readers running on this cpu so this is a quiescent state.
1300 */
1301 record_qs();
1302
1303 /*
1304 * This cpu is holding up the current GP. Let the detector know
1305 * it has just passed a quiescent state.
1306 *
1307 * The detector waits separately for preempted readers, so we have
1308 * to notify the detector even if we have just preempted a reader.
1309 */
1310 if (CPU->rcu.is_delaying_gp) {
1311 CPU->rcu.is_delaying_gp = false;
1312 semaphore_up(&rcu.remaining_readers);
1313 }
1314
1315 /*
1316 * After this point THREAD is 0 and stays 0 until the scheduler()
1317 * switches to a new thread. Use a temporary nesting counter for readers
1318 * in handlers of interrupts that are raised while idle in the scheduler.
1319 */
1320 CPU->rcu.pnesting_cnt = &CPU->rcu.tmp_nesting_cnt;
1321
1322 /*
1323 * Forcefully associate the detector and reclaimer threads with the
1324 * highest priority even if preempted due to their time slices running out.
1325 *
1326 * todo: Replace with strict scheduler priority classes.
1327 */
1328 if (THREAD == rcu.detector_thr) {
1329 THREAD->priority = -1;
1330 }
1331 else if (THREAD == CPU->rcu.reclaimer_thr) {
1332 THREAD->priority = -1;
1333 }
1334
1335 upd_max_cbs_in_slice();
1336}
1337
1338static void upd_max_cbs_in_slice(void)
1339{
1340 rcu_cpu_data_t *cr = &CPU->rcu;
1341
1342 if (cr->arriving_cbs_cnt > cr->last_arriving_cnt) {
1343 size_t arrived_cnt = cr->arriving_cbs_cnt - cr->last_arriving_cnt;
1344 cr->stat_max_slice_cbs = max(arrived_cnt, cr->stat_max_slice_cbs);
1345 }
1346
1347 cr->last_arriving_cnt = cr->arriving_cbs_cnt;
1348}
1349
1350/** Called by the scheduler() when switching to a newly scheduled thread. */
1351void rcu_before_thread_runs(void)
1352{
1353 ASSERT(PREEMPTION_DISABLED || interrupts_disabled());
1354 ASSERT(&CPU->rcu.tmp_nesting_cnt == CPU->rcu.pnesting_cnt);
1355
1356 CPU->rcu.pnesting_cnt = &THREAD->rcu.nesting_cnt;
1357}
1358
1359
1360/** Prints RCU run-time statistics. */
1361void rcu_print_stat(void)
1362{
1363 /*
1364 * Don't take locks. Worst case is we get out-dated values.
1365 * CPU local values are updated without any locks, so there
1366 * are no locks to lock in order to get up-to-date values.
1367 */
1368
1369 printf("Configuration: expedite_threshold=%d, critical_threshold=%d,"
1370 " detect_sleep=%dms\n",
1371 EXPEDITE_THRESHOLD, CRITICAL_THRESHOLD, DETECT_SLEEP_MS);
1372 printf("Completed GPs: %" PRIu64 "\n", rcu.completed_gp);
1373 printf("Expedited GPs: %zu\n", rcu.stat_expedited_cnt);
1374 printf("Delayed GPs: %zu (cpus w/ still running readers after gp sleep)\n",
1375 rcu.stat_delayed_cnt);
1376 printf("Preempt blocked GPs: %zu (waited for preempted readers; "
1377 "running or not)\n", rcu.stat_preempt_blocking_cnt);
1378 printf("Smp calls: %zu\n", rcu.stat_smp_call_cnt);
1379
1380 printf("Max arrived callbacks per GP and CPU:\n");
1381 for (unsigned int i = 0; i < config.cpu_count; ++i) {
1382 printf(" %zu", cpus[i].rcu.stat_max_cbs);
1383 }
1384
1385 printf("\nAvg arrived callbacks per GP and CPU (nonempty batches only):\n");
1386 for (unsigned int i = 0; i < config.cpu_count; ++i) {
1387 printf(" %zu", cpus[i].rcu.stat_avg_cbs);
1388 }
1389
1390 printf("\nMax arrived callbacks per time slice and CPU:\n");
1391 for (unsigned int i = 0; i < config.cpu_count; ++i) {
1392 printf(" %zu", cpus[i].rcu.stat_max_slice_cbs);
1393 }
1394
1395 printf("\nMissed GP notifications per CPU:\n");
1396 for (unsigned int i = 0; i < config.cpu_count; ++i) {
1397 printf(" %zu", cpus[i].rcu.stat_missed_gps);
1398 }
1399
1400 printf("\nMissed GP notifications per CPU while waking up:\n");
1401 for (unsigned int i = 0; i < config.cpu_count; ++i) {
1402 printf(" %zu", cpus[i].rcu.stat_missed_gp_in_wait);
1403 }
1404 printf("\n");
1405}
1406
1407/** @}
1408 */