source: mainline/kernel/generic/src/synch/rcu.c@ 8e3ed06

Last change on this file since 8e3ed06 was 8e3ed06, checked in by Adam Hraska <adam.hraska+hos@…>, 13 years ago

rcu: Allowed inlining of the RCU read side.

[79d74fe]1/*
2 * Copyright (c) 2012 Adam Hraska
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * - Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * - Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * - The name of the author may not be used to endorse or promote products
15 * derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29
30/** @addtogroup sync
31 * @{
32 */
33
34/**
35 * @file
[181a746]36 * @brief Preemptible read-copy update. Usable from interrupt handlers.
[79d74fe]37 */
38
39#include <synch/rcu.h>
[181a746]40#include <synch/condvar.h>
41#include <synch/semaphore.h>
42#include <synch/spinlock.h>
[4ec9ea41]43#include <synch/mutex.h>
[181a746]44#include <proc/thread.h>
45#include <cpu/cpu_mask.h>
46#include <cpu.h>
47#include <smp/smp_call.h>
48#include <compiler/barrier.h>
49#include <atomic.h>
50#include <arch.h>
51#include <macros.h>
[79d74fe]52
[181a746]53/*
54 * Number of milliseconds to give to preexisting readers to finish
55 * when non-expedited grace period detection is in progress.
56 */
[0cf813d]57#define DETECT_SLEEP_MS 10
[181a746]58/*
59 * Max number of pending callbacks in the local cpu's queue before
60 * aggressively expediting the current grace period
61 */
[0cf813d]62#define EXPEDITE_THRESHOLD 2000
63/*
64 * Max number of callbacks to execute in one go with preemption
65 * enabled. If there are more callbacks to be executed they will
66 * be run with preemption disabled in order to prolong reclaimer's
67 * time slice and give it a chance to catch up with callback producers.
68 */
69#define CRITICAL_THRESHOLD 30000
[181a746]70/* Half the number of values a uint32 can hold. */
71#define UINT32_MAX_HALF 2147483648U
[79d74fe]72
[8e3ed06]73/**
74 * The current grace period number. Increases monotonically.
75 * Lock rcu.gp_lock or rcu.preempt_lock to get a current value.
76 */
77rcu_gp_t _rcu_cur_gp;
[181a746]78
[0cf813d]79/** Global RCU data. */
[181a746]80typedef struct rcu_data {
81 /** The detector uses this to signal reclaimers that a grace period ended. */
82 condvar_t gp_ended;
83 /** Reclaimers notify the detector when they request more grace periods.*/
84 condvar_t req_gp_changed;
85 /** Reclaimers use this to notify the detector to accelerate GP detection. */
86 condvar_t expedite_now;
87 /**
88 * The detector waits on this semaphore for any readers delaying the GP.
89 *
90 * Each of the cpus with readers that are delaying the current GP
91 * must up() this sema once they reach a quiescent state. If there
92 * are any readers in cur_preempted (ie preempted preexisting readers) and
93 * they are already delaying GP detection, the last to unlock its
94 * reader section must up() this sema once.
95 */
96 semaphore_t remaining_readers;
97
98 /** Protects the 4 fields below. */
99 SPINLOCK_DECLARE(gp_lock);
100 /** Number of grace period ends the detector was requested to announce. */
101 size_t req_gp_end_cnt;
102 /** Number of consecutive grace periods to detect quickly and aggressively.*/
103 size_t req_expedited_cnt;
104 /**
[8e3ed06]105 * The number of the most recently completed grace period. At most
106 * one behind _rcu_cur_gp. If equal to _rcu_cur_gp, a grace period
107 * detection is not in progress and the detector is idle.
[181a746]108 */
109 rcu_gp_t completed_gp;
110
[0cf813d]111 /** Protects the following 3 fields. */
[181a746]112 IRQ_SPINLOCK_DECLARE(preempt_lock);
113 /** Preexisting readers that have been preempted. */
114 list_t cur_preempted;
115 /** Readers that have been preempted and might delay the next grace period.*/
116 list_t next_preempted;
117 /**
118 * The detector is waiting for the last preempted reader
119 * in cur_preempted to announce that it exited its reader
120 * section by up()ing remaining_readers.
121 */
122 bool preempt_blocking_det;
123
124 /**
125 * Number of cpus with readers that are delaying the current GP.
126 * They will up() remaining_readers.
127 */
128 atomic_t delaying_cpu_cnt;
129
[4ec9ea41]130 /** Excludes simultaneous rcu_barrier() calls. */
131 mutex_t barrier_mtx;
132 /** Number of cpus that we are waiting for to complete rcu_barrier(). */
133 atomic_t barrier_wait_cnt;
134 /** rcu_barrier() waits for the completion of barrier callbacks on this wq.*/
135 waitq_t barrier_wq;
136
[0cf813d]137 /** Pointer to the detector thread, which is attached and interruptible. */
[181a746]138 thread_t *detector_thr;
139
140 /* Some statistics. */
141 size_t stat_expedited_cnt;
142 size_t stat_delayed_cnt;
143 size_t stat_preempt_blocking_cnt;
144 /* Does not contain self/local calls. */
145 size_t stat_smp_call_cnt;
146} rcu_data_t;
147
148
149static rcu_data_t rcu;
150
151static void start_detector(void);
152static void start_reclaimers(void);
[8e3ed06]153static void read_unlock_impl(size_t *pnesting_cnt);
[181a746]154static void synch_complete(rcu_item_t *rcu_item);
[4ec9ea41]155static void add_barrier_cb(void *arg);
156static void barrier_complete(rcu_item_t *barrier_item);
[181a746]157static bool arriving_cbs_empty(void);
158static bool next_cbs_empty(void);
159static bool cur_cbs_empty(void);
160static bool all_cbs_empty(void);
161static void reclaimer(void *arg);
162static bool wait_for_pending_cbs(void);
163static bool advance_cbs(void);
164static void exec_completed_cbs(rcu_gp_t last_completed_gp);
165static void exec_cbs(rcu_item_t **phead);
166static void req_detection(size_t req_cnt);
167static bool wait_for_cur_cbs_gp_end(bool expedite, rcu_gp_t *last_completed_gp);
[0cf813d]168static void upd_missed_gp_in_wait(rcu_gp_t completed_gp);
[181a746]169static bool cv_wait_for_gp(rcu_gp_t wait_on_gp);
170static void detector(void *);
171static bool wait_for_detect_req(void);
172static void start_new_gp(void);
173static void end_cur_gp(void);
174static bool wait_for_readers(void);
175static void rm_quiescent_cpus(cpu_mask_t *cpu_mask);
176static bool gp_sleep(void);
177static void interrupt_delaying_cpus(cpu_mask_t *cpu_mask);
178static void sample_local_cpu(void *);
179static bool wait_for_delaying_cpus(void);
180static bool wait_for_preempt_reader(void);
[0cf813d]181static void upd_max_cbs_in_slice(void);
[181a746]182
183
184
185/** Initializes global RCU structures. */
186void rcu_init(void)
187{
188 condvar_initialize(&rcu.gp_ended);
189 condvar_initialize(&rcu.req_gp_changed);
190 condvar_initialize(&rcu.expedite_now);
191 semaphore_initialize(&rcu.remaining_readers, 0);
192
193 spinlock_initialize(&rcu.gp_lock, "rcu.gp_lock");
194 rcu.req_gp_end_cnt = 0;
195 rcu.req_expedited_cnt = 0;
[8e3ed06]196 _rcu_cur_gp = 0;
[181a746]197 rcu.completed_gp = 0;
198
199 irq_spinlock_initialize(&rcu.preempt_lock, "rcu.preempt_lock");
200 list_initialize(&rcu.cur_preempted);
201 list_initialize(&rcu.next_preempted);
202 rcu.preempt_blocking_det = false;
203
[4ec9ea41]204 mutex_initialize(&rcu.barrier_mtx, MUTEX_PASSIVE);
205 atomic_set(&rcu.barrier_wait_cnt, 0);
206 waitq_initialize(&rcu.barrier_wq);
207
[181a746]208 atomic_set(&rcu.delaying_cpu_cnt, 0);
209
210 rcu.detector_thr = 0;
211
212 rcu.stat_expedited_cnt = 0;
213 rcu.stat_delayed_cnt = 0;
214 rcu.stat_preempt_blocking_cnt = 0;
215 rcu.stat_smp_call_cnt = 0;
216}
217
218/** Initializes per-CPU RCU data. On the boot cpu it also initializes the global data. */
219void rcu_cpu_init(void)
220{
221 if (config.cpu_active == 1) {
222 rcu_init();
223 }
224
225 CPU->rcu.last_seen_gp = 0;
226
227 CPU->rcu.pnesting_cnt = &CPU->rcu.tmp_nesting_cnt;
228 CPU->rcu.tmp_nesting_cnt = 0;
229
230 CPU->rcu.cur_cbs = 0;
[0cf813d]231 CPU->rcu.cur_cbs_cnt = 0;
[181a746]232 CPU->rcu.next_cbs = 0;
[0cf813d]233 CPU->rcu.next_cbs_cnt = 0;
[181a746]234 CPU->rcu.arriving_cbs = 0;
235 CPU->rcu.parriving_cbs_tail = &CPU->rcu.arriving_cbs;
236 CPU->rcu.arriving_cbs_cnt = 0;
237
238 CPU->rcu.cur_cbs_gp = 0;
239 CPU->rcu.next_cbs_gp = 0;
240
241 CPU->rcu.is_delaying_gp = false;
242
243 semaphore_initialize(&CPU->rcu.arrived_flag, 0);
[0cf813d]244
245 /* BSP creates reclaimer threads before AP's rcu_cpu_init() runs. */
246 if (config.cpu_active == 1)
247 CPU->rcu.reclaimer_thr = 0;
[181a746]248
249 CPU->rcu.stat_max_cbs = 0;
250 CPU->rcu.stat_avg_cbs = 0;
251 CPU->rcu.stat_missed_gps = 0;
[0cf813d]252 CPU->rcu.stat_missed_gp_in_wait = 0;
253 CPU->rcu.stat_max_slice_cbs = 0;
254 CPU->rcu.last_arriving_cnt = 0;
[181a746]255}
256
257/** Completes RCU init. Creates and runs the detector and reclaimer threads.*/
258void rcu_kinit_init(void)
259{
260 start_detector();
261 start_reclaimers();
262}
263
264/** Initializes any per-thread RCU structures. */
265void rcu_thread_init(thread_t *thread)
266{
267 thread->rcu.nesting_cnt = 0;
268 thread->rcu.was_preempted = false;
269 link_initialize(&thread->rcu.preempt_link);
270}
271
272/** Called from scheduler() when exiting the current thread.
273 *
274 * Preemption or interrupts are disabled and the scheduler() has already
275 * switched away from the current thread after calling rcu_after_thread_ran().
[79d74fe]276 */
[181a746]277void rcu_thread_exiting(void)
[79d74fe]278{
[181a746]279 ASSERT(THREAD != 0);
280 ASSERT(THREAD->state == Exiting);
281 ASSERT(PREEMPTION_DISABLED || interrupts_disabled());
282 /*
283 * The scheduler() must have already switched to a temporary
284 * nesting counter for interrupt handlers (we could be idle)
285 * so that interrupt handlers do not modify the exiting thread's
286 * reader section nesting count while we examine/process it.
287 */
288 ASSERT(&CPU->rcu.tmp_nesting_cnt == CPU->rcu.pnesting_cnt);
289
290 /*
291 * The thread forgot to exit its reader critical section.
292 * It is a bug, but rather than letting the entire system lock up,
293 * forcefully leave the reader section. The thread is not holding
294 * any references anyway since it is exiting, so this is safe.
295 */
296 if (0 < THREAD->rcu.nesting_cnt) {
297 THREAD->rcu.nesting_cnt = 1;
[8e3ed06]298 read_unlock_impl(&THREAD->rcu.nesting_cnt);
[181a746]299 }
[79d74fe]300}
301
[181a746]302/** Cleans up global RCU resources and stops dispatching callbacks.
303 *
304 * Call when shutting down the kernel. Outstanding callbacks will
305 * not be processed. Instead they will linger forever.
306 */
307void rcu_stop(void)
308{
309 /* Stop and wait for reclaimers. */
310 for (unsigned int cpu_id = 0; cpu_id < config.cpu_active; ++cpu_id) {
311 ASSERT(cpus[cpu_id].rcu.reclaimer_thr != 0);
312
313 if (cpus[cpu_id].rcu.reclaimer_thr) {
314 thread_interrupt(cpus[cpu_id].rcu.reclaimer_thr);
315 thread_join(cpus[cpu_id].rcu.reclaimer_thr);
316 thread_detach(cpus[cpu_id].rcu.reclaimer_thr);
317 cpus[cpu_id].rcu.reclaimer_thr = 0;
318 }
319 }
320
321 /* Stop the detector and wait. */
322 if (rcu.detector_thr) {
323 thread_interrupt(rcu.detector_thr);
324 thread_join(rcu.detector_thr);
325 thread_detach(rcu.detector_thr);
326 rcu.detector_thr = 0;
327 }
328}
[79d74fe]329
[181a746]330/** Starts the detector thread. */
331static void start_detector(void)
332{
333 rcu.detector_thr =
334 thread_create(detector, 0, TASK, THREAD_FLAG_NONE, "rcu-det");
335
336 if (!rcu.detector_thr)
337 panic("Failed to create RCU detector thread.");
338
339 thread_ready(rcu.detector_thr);
340}
341
342/** Creates and runs cpu-bound reclaimer threads. */
343static void start_reclaimers(void)
344{
345 for (unsigned int cpu_id = 0; cpu_id < config.cpu_count; ++cpu_id) {
346 char name[THREAD_NAME_BUFLEN] = {0};
347
348 snprintf(name, THREAD_NAME_BUFLEN - 1, "rcu-rec/%u", cpu_id);
349
350 cpus[cpu_id].rcu.reclaimer_thr =
351 thread_create(reclaimer, 0, TASK, THREAD_FLAG_NONE, name);
352
353 if (!cpus[cpu_id].rcu.reclaimer_thr)
354 panic("Failed to create RCU reclaimer thread on cpu%u.", cpu_id);
355
356 thread_wire(cpus[cpu_id].rcu.reclaimer_thr, &cpus[cpu_id]);
357 thread_ready(cpus[cpu_id].rcu.reclaimer_thr);
358 }
359}
360
361/** Returns the number of elapsed grace periods since boot. */
362uint64_t rcu_completed_gps(void)
363{
364 spinlock_lock(&rcu.gp_lock);
365 uint64_t completed = rcu.completed_gp;
366 spinlock_unlock(&rcu.gp_lock);
367
368 return completed;
369}
370
[4a6da62]371/** Returns true if in an rcu reader section. */
372bool rcu_read_locked(void)
373{
374 preemption_disable();
375 bool locked = 0 < *CPU->rcu.pnesting_cnt;
376 preemption_enable();
377
378 return locked;
379}
380
[181a746]381/** Unlocks the local reader section using the given nesting count.
382 *
383 * Preemption or interrupts must be disabled.
384 *
385 * @param pnesting_cnt Either &CPU->rcu.tmp_nesting_cnt or
386 * &THREAD->rcu.nesting_cnt.
[79d74fe]387 */
[8e3ed06]388static void read_unlock_impl(size_t *pnesting_cnt)
[181a746]389{
390 ASSERT(PREEMPTION_DISABLED || interrupts_disabled());
391
392 if (0 == --(*pnesting_cnt)) {
[8e3ed06]393 _rcu_record_qs();
[181a746]394
395 /*
396 * The thread was preempted while in a critical section or
397 * the detector is eagerly waiting for this cpu's reader
398 * to finish.
399 *
400 * Note that THREAD may be 0 in scheduler() and not just during boot.
401 */
402 if ((THREAD && THREAD->rcu.was_preempted) || CPU->rcu.is_delaying_gp) {
403 /* Rechecks with disabled interrupts. */
[8e3ed06]404 _rcu_signal_read_unlock();
[181a746]405 }
406 }
407}
408
409/** If necessary, signals the detector that we exited a reader section. */
[8e3ed06]410void _rcu_signal_read_unlock(void)
[181a746]411{
412 ASSERT(PREEMPTION_DISABLED || interrupts_disabled());
413
414 /*
415 * We have to disable interrupts in order to make checking
416 * and resetting was_preempted and is_delaying_gp atomic
417 * with respect to local interrupt handlers. Otherwise
418 * an interrupt could beat us to calling semaphore_up()
419 * before we reset the appropriate flag.
420 */
421 ipl_t ipl = interrupts_disable();
422
423 /*
424 * If the detector is eagerly waiting for this cpu's reader to unlock,
425 * notify it that the reader did so.
426 */
427 if (CPU->rcu.is_delaying_gp) {
428 CPU->rcu.is_delaying_gp = false;
429 semaphore_up(&rcu.remaining_readers);
430 }
431
432 /*
433 * This reader was preempted while in a reader section.
434 * We might be holding up the current GP. Notify the
435 * detector if so.
436 */
437 if (THREAD && THREAD->rcu.was_preempted) {
438 ASSERT(link_used(&THREAD->rcu.preempt_link));
439 THREAD->rcu.was_preempted = false;
440
441 irq_spinlock_lock(&rcu.preempt_lock, false);
442
443 bool prev_empty = list_empty(&rcu.cur_preempted);
444 list_remove(&THREAD->rcu.preempt_link);
445 bool now_empty = list_empty(&rcu.cur_preempted);
446
447 /* This was the last reader in cur_preempted. */
448 bool last_removed = now_empty && !prev_empty;
449
450 /*
451 * Preempted readers are blocking the detector and
452 * this was the last reader blocking the current GP.
453 */
454 if (last_removed && rcu.preempt_blocking_det) {
455 rcu.preempt_blocking_det = false;
456 semaphore_up(&rcu.remaining_readers);
457 }
458
459 irq_spinlock_unlock(&rcu.preempt_lock, false);
460 }
461 interrupts_restore(ipl);
462}
463
464typedef struct synch_item {
465 waitq_t wq;
466 rcu_item_t rcu_item;
467} synch_item_t;
468
469/** Blocks until all preexisting readers exit their critical sections. */
[79d74fe]470void rcu_synchronize(void)
[4ec9ea41]471{
472 _rcu_synchronize(false);
473}
474
475/** Blocks until all preexisting readers exit their critical sections; expedites GP detection. */
476void rcu_synchronize_expedite(void)
477{
478 _rcu_synchronize(true);
479}
480
481/** Blocks until all preexisting readers exit their critical sections. */
482void _rcu_synchronize(bool expedite)
[79d74fe]483{
[181a746]484 /* Calling from a reader section will deadlock. */
485 ASSERT(THREAD == 0 || 0 == THREAD->rcu.nesting_cnt);
486
487 synch_item_t completion;
488
489 waitq_initialize(&completion.wq);
[4ec9ea41]490 _rcu_call(expedite, &completion.rcu_item, synch_complete);
[181a746]491 waitq_sleep(&completion.wq);
492 waitq_complete_wakeup(&completion.wq);
[79d74fe]493}
494
[181a746]495/** rcu_synchronize's callback. */
496static void synch_complete(rcu_item_t *rcu_item)
497{
498 synch_item_t *completion = member_to_inst(rcu_item, synch_item_t, rcu_item);
499 ASSERT(completion);
500 waitq_wakeup(&completion->wq, WAKEUP_FIRST);
501}
[79d74fe]502
[4ec9ea41]503/** Waits for all outstanding rcu calls to complete. */
504void rcu_barrier(void)
505{
506 /*
507 * Serialize rcu_barrier() calls so we don't overwrite cpu.barrier_item
508 * currently in use by rcu_barrier().
509 */
510 mutex_lock(&rcu.barrier_mtx);
511
512 /*
513 * Ensure we queue a barrier callback on all cpus before the already
514 * enqueued barrier callbacks start signaling completion.
515 */
516 atomic_set(&rcu.barrier_wait_cnt, 1);
517
518 DEFINE_CPU_MASK(cpu_mask);
519 cpu_mask_active(cpu_mask);
520
521 cpu_mask_for_each(*cpu_mask, cpu_id) {
522 smp_call(cpu_id, add_barrier_cb, 0);
523 }
524
525 if (0 < atomic_predec(&rcu.barrier_wait_cnt)) {
526 waitq_sleep(&rcu.barrier_wq);
527 }
528
529 mutex_unlock(&rcu.barrier_mtx);
530}
531
532/** Issues a rcu_barrier() callback on the local cpu.
533 *
534 * Executed with interrupts disabled.
535 */
536static void add_barrier_cb(void *arg)
537{
538 ASSERT(interrupts_disabled() || PREEMPTION_DISABLED);
539 atomic_inc(&rcu.barrier_wait_cnt);
540 rcu_call(&CPU->rcu.barrier_item, barrier_complete);
541}
542
543/** Local cpu's rcu_barrier() completion callback. */
544static void barrier_complete(rcu_item_t *barrier_item)
545{
546 /* Is this the last barrier callback completed? */
547 if (0 == atomic_predec(&rcu.barrier_wait_cnt)) {
548 /* Notify rcu_barrier() that we're done. */
549 waitq_wakeup(&rcu.barrier_wq, WAKEUP_FIRST);
550 }
551}
552
[181a746]553/** Adds a callback to invoke after all preexisting readers finish.
554 *
555 * May be called from within interrupt handlers or RCU reader sections.
556 *
557 * @param rcu_item Used by RCU to track the call. Must remain allocated
558 * until the user callback function is entered.
559 * @param func User callback function that will be invoked once a full
560 * grace period has elapsed, ie at a time when all preexisting
561 * readers have finished. The callback should be short and must
562 * not block. If you must sleep, enqueue your work in the system
563 * work queue from the callback (ie workq_global_enqueue()).
[79d74fe]564 */
[181a746]565void rcu_call(rcu_item_t *rcu_item, rcu_func_t func)
[79d74fe]566{
[181a746]567 _rcu_call(false, rcu_item, func);
[79d74fe]568}
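
/*
 * Illustrative rcu_call() usage sketch. The tracked structure embeds an
 * rcu_item_t and the callback recovers the enclosing object via
 * member_to_inst(), just like synch_complete() above; example_t and
 * example_unlink() are hypothetical.
 *
 *   typedef struct example {
 *       rcu_item_t rcu;
 *       int payload;
 *   } example_t;
 *
 *   static void example_reclaim(rcu_item_t *item)
 *   {
 *       example_t *e = member_to_inst(item, example_t, rcu);
 *       free(e);
 *   }
 *
 *   // Updater: unlink the object from shared structures, then defer the free.
 *   example_unlink(e);
 *   rcu_call(&e->rcu, example_reclaim);
 */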
569
[181a746]570/** rcu_call() implementation. See rcu_call() for comments. */
571void _rcu_call(bool expedite, rcu_item_t *rcu_item, rcu_func_t func)
572{
573 ASSERT(rcu_item);
574
575 rcu_item->func = func;
576 rcu_item->next = 0;
577
578 preemption_disable();
579
580 ipl_t ipl = interrupts_disable();
[79d74fe]581
[181a746]582 *CPU->rcu.parriving_cbs_tail = rcu_item;
583 CPU->rcu.parriving_cbs_tail = &rcu_item->next;
584
585 size_t cnt = ++CPU->rcu.arriving_cbs_cnt;
586 interrupts_restore(ipl);
587
588 if (expedite) {
589 CPU->rcu.expedite_arriving = true;
590 }
591
592 /* Added first callback - notify the reclaimer. */
593 if (cnt == 1 && !semaphore_count_get(&CPU->rcu.arrived_flag)) {
594 semaphore_up(&CPU->rcu.arrived_flag);
595 }
596
597 preemption_enable();
598}
599
600static bool cur_cbs_empty(void)
601{
602 ASSERT(THREAD && THREAD->wired);
603 return 0 == CPU->rcu.cur_cbs;
604}
605
606static bool next_cbs_empty(void)
607{
608 ASSERT(THREAD && THREAD->wired);
609 return 0 == CPU->rcu.next_cbs;
610}
611
612/** Disable interrupts to get an up-to-date result. */
613static bool arriving_cbs_empty(void)
614{
615 ASSERT(THREAD && THREAD->wired);
616 /*
617 * Accessing with interrupts enabled may at worst lead to
618 * a false negative if we race with a local interrupt handler.
619 */
620 return 0 == CPU->rcu.arriving_cbs;
621}
622
623static bool all_cbs_empty(void)
624{
625 return cur_cbs_empty() && next_cbs_empty() && arriving_cbs_empty();
626}
627
628/** Reclaimer thread dispatches locally queued callbacks once a GP ends. */
629static void reclaimer(void *arg)
630{
631 ASSERT(THREAD && THREAD->wired);
[0cf813d]632 ASSERT(THREAD == CPU->rcu.reclaimer_thr);
[181a746]633
634 rcu_gp_t last_compl_gp = 0;
635 bool ok = true;
636
637 while (ok && wait_for_pending_cbs()) {
[0cf813d]638 ASSERT(CPU->rcu.reclaimer_thr == THREAD);
[181a746]639
[0cf813d]640 exec_completed_cbs(last_compl_gp);
641
[181a746]642 bool expedite = advance_cbs();
643
644 ok = wait_for_cur_cbs_gp_end(expedite, &last_compl_gp);
645 }
646}
647
648/** Waits until there are callbacks waiting to be dispatched. */
649static bool wait_for_pending_cbs(void)
650{
651 if (!all_cbs_empty())
652 return true;
653
654 bool ok = true;
655
656 while (arriving_cbs_empty() && ok) {
657 ok = semaphore_down_interruptable(&CPU->rcu.arrived_flag);
658 }
659
660 return ok;
661}
662
[b68ae24]663static void upd_stat_missed_gp(rcu_gp_t compl)
[181a746]664{
[b68ae24]665 if (CPU->rcu.cur_cbs_gp < compl) {
666 CPU->rcu.stat_missed_gps += (size_t)(compl - CPU->rcu.cur_cbs_gp);
[181a746]667 }
668}
669
670/** Executes all callbacks for the given completed grace period. */
671static void exec_completed_cbs(rcu_gp_t last_completed_gp)
672{
[b68ae24]673 upd_stat_missed_gp(last_completed_gp);
674
[0cf813d]675 /* Both next_cbs and cur_cbs GP elapsed. */
[181a746]676 if (CPU->rcu.next_cbs_gp <= last_completed_gp) {
[0cf813d]677 ASSERT(CPU->rcu.cur_cbs_gp <= CPU->rcu.next_cbs_gp);
678
679 size_t exec_cnt = CPU->rcu.cur_cbs_cnt + CPU->rcu.next_cbs_cnt;
680
681 if (exec_cnt < CRITICAL_THRESHOLD) {
682 exec_cbs(&CPU->rcu.cur_cbs);
683 exec_cbs(&CPU->rcu.next_cbs);
684 } else {
685 /*
686 * Getting overwhelmed with too many callbacks to run.
687 * Disable preemption in order to prolong our time slice
688 * and catch up with updaters posting new callbacks.
689 */
690 preemption_disable();
691 exec_cbs(&CPU->rcu.cur_cbs);
692 exec_cbs(&CPU->rcu.next_cbs);
693 preemption_enable();
694 }
695
696 CPU->rcu.cur_cbs_cnt = 0;
697 CPU->rcu.next_cbs_cnt = 0;
698 } else if (CPU->rcu.cur_cbs_gp <= last_completed_gp) {
699
700 if (CPU->rcu.cur_cbs_cnt < CRITICAL_THRESHOLD) {
701 exec_cbs(&CPU->rcu.cur_cbs);
702 } else {
703 /*
704 * Getting overwhelmed with too many callbacks to run.
705 * Disable preemption in order to prolong our time slice
706 * and catch up with updaters posting new callbacks.
707 */
708 preemption_disable();
709 exec_cbs(&CPU->rcu.cur_cbs);
710 preemption_enable();
711 }
712
713 CPU->rcu.cur_cbs_cnt = 0;
[181a746]714 }
715}
716
717/** Executes callbacks in the singly-linked list. The list is left empty. */
718static void exec_cbs(rcu_item_t **phead)
719{
720 rcu_item_t *rcu_item = *phead;
721
722 while (rcu_item) {
723 /* func() may free rcu_item. Get a local copy. */
724 rcu_item_t *next = rcu_item->next;
725 rcu_func_t func = rcu_item->func;
726
727 func(rcu_item);
728
729 rcu_item = next;
730 }
731
732 *phead = 0;
733}
734
735static void upd_stat_cb_cnts(size_t arriving_cnt)
736{
737 CPU->rcu.stat_max_cbs = max(arriving_cnt, CPU->rcu.stat_max_cbs);
738 if (0 < arriving_cnt) {
739 CPU->rcu.stat_avg_cbs =
740 (99 * CPU->rcu.stat_avg_cbs + 1 * arriving_cnt) / 100;
741 }
742}
743
744
745/** Prepares another batch of callbacks to dispatch at the next grace period.
746 *
747 * @return True if the next batch of callbacks must be expedited.
[79d74fe]748 */
[181a746]749static bool advance_cbs(void)
750{
751 /* Move next_cbs to cur_cbs. */
752 CPU->rcu.cur_cbs = CPU->rcu.next_cbs;
[0cf813d]753 CPU->rcu.cur_cbs_cnt = CPU->rcu.next_cbs_cnt;
[181a746]754 CPU->rcu.cur_cbs_gp = CPU->rcu.next_cbs_gp;
755
756 /* Move arriving_cbs to next_cbs. Empties arriving_cbs. */
757 ipl_t ipl = interrupts_disable();
758
759 /*
760 * Too many callbacks queued. Better speed up the detection
761 * or risk exhausting all system memory.
762 */
763 bool expedite = (EXPEDITE_THRESHOLD < CPU->rcu.arriving_cbs_cnt)
764 || CPU->rcu.expedite_arriving;
765
766 CPU->rcu.expedite_arriving = false;
[0cf813d]767
[181a746]768 CPU->rcu.next_cbs = CPU->rcu.arriving_cbs;
[0cf813d]769 CPU->rcu.next_cbs_cnt = CPU->rcu.arriving_cbs_cnt;
770
[181a746]771 CPU->rcu.arriving_cbs = 0;
772 CPU->rcu.parriving_cbs_tail = &CPU->rcu.arriving_cbs;
773 CPU->rcu.arriving_cbs_cnt = 0;
774
775 interrupts_restore(ipl);
[0cf813d]776
777 /* Update statistics of arrived callbacks. */
778 upd_stat_cb_cnts(CPU->rcu.next_cbs_cnt);
[181a746]779
780 /*
781 * Make changes prior to queuing next_cbs visible to readers.
782 * See comment in wait_for_readers().
783 */
784 memory_barrier(); /* MB A, B */
785
786 /* At the end of next_cbs_gp, exec next_cbs. Determine what GP that is. */
787
788 if (!next_cbs_empty()) {
789 spinlock_lock(&rcu.gp_lock);
790
791 /* Exec next_cbs at the end of the next GP. */
[8e3ed06]792 CPU->rcu.next_cbs_gp = _rcu_cur_gp + 1;
[181a746]793
794 /*
795 * There are no callbacks to invoke before next_cbs. Instruct
796 * wait_for_cur_cbs_gp_end() to notify us of the nearest GP end.
797 * That could be sooner than next_cbs_gp (if the current GP
798 * had not yet completed), so we'll create a shorter batch
799 * of callbacks next time around.
800 */
801 if (cur_cbs_empty()) {
802 CPU->rcu.cur_cbs_gp = rcu.completed_gp + 1;
803 }
804
805 spinlock_unlock(&rcu.gp_lock);
806 } else {
807 CPU->rcu.next_cbs_gp = CPU->rcu.cur_cbs_gp;
808 }
809
810 ASSERT(CPU->rcu.cur_cbs_gp <= CPU->rcu.next_cbs_gp);
811
812 return expedite;
813}
814
815/** Waits for the grace period associated with the callbacks in cur_cbs to elapse.
816 *
817 * @param expedite Instructs the detector to aggressively speed up grace
818 * period detection without any delay.
819 * @param completed_gp Returns the most recent completed grace period
820 * number.
821 * @return false if the thread was interrupted and should stop.
822 */
823static bool wait_for_cur_cbs_gp_end(bool expedite, rcu_gp_t *completed_gp)
824{
825 /*
826 * Use a possibly outdated version of completed_gp to bypass checking
827 * with the lock.
828 *
829 * Note that loading and storing rcu.completed_gp is not atomic
830 * (it is 64bit wide). Reading a clobbered value that is less than
831 * rcu.completed_gp is harmless - we'll recheck with a lock. The
832 * only way to read a clobbered value that is greater than the actual
833 * value is if the detector increases the higher-order word first and
834 * then decreases the lower-order word (or we see stores in that order),
835 * eg when incrementing from 2^32 - 1 to 2^32. The loaded value
836 * suddenly jumps by 2^32. It would take hours for such an increase
837 * to occur so it is safe to discard the value. We allow increases
838 * of up to half the maximum to generously accommodate for loading an
839 * outdated lower word.
840 */
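	/*
	 * Concrete illustration of the clobbered read discussed above
	 * (hypothetical values): the detector increments completed_gp from
	 * 0x00000000ffffffff to 0x0000000100000000. Reading the new upper
	 * word together with the old lower word yields 0x00000001ffffffff,
	 * an apparent jump of almost 2^32, which the check below rejects
	 * because it exceeds cur_cbs_gp + UINT32_MAX_HALF.
	 */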
841 rcu_gp_t compl_gp = ACCESS_ONCE(rcu.completed_gp);
842 if (CPU->rcu.cur_cbs_gp <= compl_gp
843 && compl_gp <= CPU->rcu.cur_cbs_gp + UINT32_MAX_HALF) {
844 *completed_gp = compl_gp;
845 return true;
846 }
847
848 spinlock_lock(&rcu.gp_lock);
849
850 if (CPU->rcu.cur_cbs_gp <= rcu.completed_gp) {
851 *completed_gp = rcu.completed_gp;
852 spinlock_unlock(&rcu.gp_lock);
853 return true;
854 }
855
856 ASSERT(CPU->rcu.cur_cbs_gp <= CPU->rcu.next_cbs_gp);
[8e3ed06]857 ASSERT(_rcu_cur_gp <= CPU->rcu.cur_cbs_gp);
[181a746]858
859 /*
860 * Notify the detector of how many GP ends we intend to wait for, so
861 * it can avoid going to sleep unnecessarily. Optimistically assume
862 * new callbacks will arrive while we're waiting; hence +1.
863 */
[8e3ed06]864 size_t remaining_gp_ends = (size_t) (CPU->rcu.next_cbs_gp - _rcu_cur_gp);
[181a746]865 req_detection(remaining_gp_ends + (arriving_cbs_empty() ? 0 : 1));
866
867 /*
868 * Ask the detector to speed up GP detection if there are too many
869 * pending callbacks and other reclaimers have not already done so.
870 */
871 if (expedite) {
872 if (0 == rcu.req_expedited_cnt)
873 condvar_signal(&rcu.expedite_now);
874
875 /*
876 * Expedite only cur_cbs. If there really is a surge of callbacks,
877 * the arriving batch will expedite the GP for the huge number
878 * of callbacks currently in next_cbs.
879 */
880 rcu.req_expedited_cnt = 1;
881 }
882
883 /* Wait for cur_cbs_gp to end. */
884 bool interrupted = cv_wait_for_gp(CPU->rcu.cur_cbs_gp);
[0cf813d]885
[181a746]886 *completed_gp = rcu.completed_gp;
887 spinlock_unlock(&rcu.gp_lock);
888
[0cf813d]889 upd_missed_gp_in_wait(*completed_gp);
890
[181a746]891 return !interrupted;
892}
893
[0cf813d]894static void upd_missed_gp_in_wait(rcu_gp_t completed_gp)
895{
896 ASSERT(CPU->rcu.cur_cbs_gp <= completed_gp);
897
898 size_t delta = (size_t)(completed_gp - CPU->rcu.cur_cbs_gp);
899 CPU->rcu.stat_missed_gp_in_wait += delta;
900}
901
902
[181a746]903/** Requests the detector to detect at least req_cnt consecutive grace periods.*/
904static void req_detection(size_t req_cnt)
905{
906 if (rcu.req_gp_end_cnt < req_cnt) {
907 bool detector_idle = (0 == rcu.req_gp_end_cnt);
908 rcu.req_gp_end_cnt = req_cnt;
909
910 if (detector_idle) {
[8e3ed06]911 ASSERT(_rcu_cur_gp == rcu.completed_gp);
[181a746]912 condvar_signal(&rcu.req_gp_changed);
913 }
914 }
915}
916
917/** Waits for an announcement of the end of the grace period wait_on_gp. */
918static bool cv_wait_for_gp(rcu_gp_t wait_on_gp)
919{
920 ASSERT(spinlock_locked(&rcu.gp_lock));
921
922 bool interrupted = false;
923
924 /* Wait until wait_on_gp ends. */
925 while (rcu.completed_gp < wait_on_gp && !interrupted) {
926 int ret = _condvar_wait_timeout_spinlock(&rcu.gp_ended, &rcu.gp_lock,
927 SYNCH_NO_TIMEOUT, SYNCH_FLAGS_INTERRUPTIBLE);
928 interrupted = (ret == ESYNCH_INTERRUPTED);
929 }
930
931 ASSERT(wait_on_gp <= rcu.completed_gp);
932
933 return interrupted;
934}
935
936/** The detector thread detects and notifies reclaimers of grace period ends. */
937static void detector(void *arg)
938{
939 spinlock_lock(&rcu.gp_lock);
940
941 while (wait_for_detect_req()) {
942 /*
943 * Announce new GP started. Readers start lazily acknowledging that
944 * they passed a QS.
945 */
946 start_new_gp();
947
948 spinlock_unlock(&rcu.gp_lock);
949
950 if (!wait_for_readers())
951 goto unlocked_out;
952
953 spinlock_lock(&rcu.gp_lock);
954
955 /* Notify reclaimers that they may now invoke queued callbacks. */
956 end_cur_gp();
957 }
958
959 spinlock_unlock(&rcu.gp_lock);
960
961unlocked_out:
962 return;
963}
964
965/** Waits for a request from a reclaimer thread to detect a grace period. */
966static bool wait_for_detect_req(void)
967{
968 ASSERT(spinlock_locked(&rcu.gp_lock));
969
970 bool interrupted = false;
971
972 while (0 == rcu.req_gp_end_cnt && !interrupted) {
973 int ret = _condvar_wait_timeout_spinlock(&rcu.req_gp_changed,
974 &rcu.gp_lock, SYNCH_NO_TIMEOUT, SYNCH_FLAGS_INTERRUPTIBLE);
975
976 interrupted = (ret == ESYNCH_INTERRUPTED);
977 }
978
979 return !interrupted;
980}
981
982/** Announces the start of a new grace period for preexisting readers to ack. */
983static void start_new_gp(void)
984{
985 ASSERT(spinlock_locked(&rcu.gp_lock));
986
987 irq_spinlock_lock(&rcu.preempt_lock, true);
988
989 /* Start a new GP. Announce to readers that a quiescent state is needed. */
[8e3ed06]990 ++_rcu_cur_gp;
[181a746]991
992 /*
993 * Readers preempted before the start of this GP (next_preempted)
994 * are preexisting readers now that a GP started and will hold up
995 * the current GP until they exit their reader sections.
996 *
997 * Preempted readers from the previous GP have finished so
[8e3ed06]998 * cur_preempted is empty, but see comment in _rcu_record_qs().
[181a746]999 */
[c14762e]1000 list_concat(&rcu.cur_preempted, &rcu.next_preempted);
[181a746]1001
1002 irq_spinlock_unlock(&rcu.preempt_lock, true);
1003}
1004
1005static void end_cur_gp(void)
1006{
1007 ASSERT(spinlock_locked(&rcu.gp_lock));
1008
[8e3ed06]1009 rcu.completed_gp = _rcu_cur_gp;
[181a746]1010 --rcu.req_gp_end_cnt;
1011
1012 condvar_broadcast(&rcu.gp_ended);
1013}
1014
1015/** Waits for readers that started before the current GP started to finish. */
1016static bool wait_for_readers(void)
1017{
1018 DEFINE_CPU_MASK(reading_cpus);
1019
1020 /* All running cpus have potential readers. */
1021 cpu_mask_active(reading_cpus);
1022
1023 /*
1024 * Ensure the announcement of the start of a new GP (ie up-to-date
1025 * cur_gp) propagates to cpus that are just coming out of idle
1026 * mode before we sample their idle state flag.
1027 *
1028 * Cpus guarantee that after they set CPU->idle = true they will not
1029 * execute any RCU reader sections without first setting idle to
1030 * false and issuing a memory barrier. Therefore, if rm_quiescent_cpus()
1031 * later on sees an idle cpu, but the cpu is just exiting its idle mode,
1032 * the cpu must not have yet executed its memory barrier (otherwise
1033 * it would pair up with this mem barrier and we would see idle == false).
1034 * That memory barrier will pair up with the one below and ensure
1035 * that a reader on the now-non-idle cpu will see the most current
1036 * cur_gp. As a result, such a reader will never attempt to semaphore_up(
1037 * remaining_readers) during this GP, which allows the detector to
1038 * ignore that cpu (the detector thinks it is idle). Moreover, any
1039 * changes made by RCU updaters will have propagated to readers
1040 * on the previously idle cpu -- again thanks to issuing a memory
1041 * barrier after returning from idle mode.
1042 *
1043 * idle -> non-idle cpu | detector | reclaimer
1044 * ------------------------------------------------------
1045 * rcu reader 1 | | rcu_call()
1046 * MB X | |
1047 * idle = true | | rcu_call()
1048 * (no rcu readers allowed ) | | MB A in advance_cbs()
1049 * MB Y | (...) | (...)
1050 * (no rcu readers allowed) | | MB B in advance_cbs()
1051 * idle = false | ++cur_gp |
1052 * (no rcu readers allowed) | MB C |
1053 * MB Z | signal gp_end |
1054 * rcu reader 2 | | exec_cur_cbs()
1055 *
1056 *
1057 * MB Y orders visibility of changes to idle for detector's sake.
1058 *
1059 * MB Z pairs up with MB C. The cpu making a transition from idle
1060 * will see the most current value of cur_gp and will not attempt
1061 * to notify the detector even if preempted during this GP.
1062 *
1063 * MB Z pairs up with MB A from the previous batch. Updaters' changes
1064 * are visible to reader 2 even when the detector thinks the cpu is idle
1065 * but it is not anymore.
1066 *
1067 * MB X pairs up with MB B. Late mem accesses of reader 1 are contained
1068 * and visible before idling and before any callbacks are executed
1069 * by reclaimers.
1070 *
1071 * In summary, the detector does not know of or wait for reader 2, but
1072 * it does not have to since it is a new reader that will not access
1073 * data from previous GPs and will see any changes.
1074 */
1075 memory_barrier(); /* MB C */
1076
1077 /*
1078 * Give readers time to pass through a QS. Also, batch arriving
1079 * callbacks in order to amortize detection overhead.
1080 */
1081 if (!gp_sleep())
1082 return false;
1083
1084 /* Non-intrusively determine which cpus have yet to pass a QS. */
1085 rm_quiescent_cpus(reading_cpus);
1086
1087 /* Actively interrupt cpus delaying the current GP and demand a QS. */
1088 interrupt_delaying_cpus(reading_cpus);
1089
1090 /* Wait for the interrupted cpus to notify us that they reached a QS. */
1091 if (!wait_for_delaying_cpus())
1092 return false;
1093 /*
1094 * All cpus recorded a QS or are still idle. Any new readers will be added
1095 * to next_preempted if preempted, ie the number of readers in cur_preempted
1096 * monotonically decreases.
1097 */
1098
1099 /* Wait for the last reader in cur_preempted to notify us it is done. */
1100 if (!wait_for_preempt_reader())
1101 return false;
1102
1103 return true;
1104}
1105
1106/** Remove those cpus from the mask that have already passed a quiescent
1107 * state since the start of the current grace period.
1108 */
1109static void rm_quiescent_cpus(cpu_mask_t *cpu_mask)
1110{
1111 cpu_mask_for_each(*cpu_mask, cpu_id) {
1112 /*
1113 * The cpu already checked for and passed through a quiescent
1114 * state since the beginning of this GP.
1115 *
[8e3ed06]1116 * _rcu_cur_gp is modified by local detector thread only.
[181a746]1117 * Therefore, it is up-to-date even without a lock.
1118 */
[8e3ed06]1119 bool cpu_acked_gp = (cpus[cpu_id].rcu.last_seen_gp == _rcu_cur_gp);
[181a746]1120
1121 /*
1122 * Either the cpu is idle or it is exiting away from idle mode
[8e3ed06]1123 * and already sees the most current _rcu_cur_gp. See comment
[181a746]1124 * in wait_for_readers().
1125 */
1126 bool cpu_idle = cpus[cpu_id].idle;
1127
1128 if (cpu_acked_gp || cpu_idle) {
1129 cpu_mask_reset(cpu_mask, cpu_id);
1130 }
1131 }
1132}
1133
1134/** Sleeps a while if the current grace period is not to be expedited. */
1135static bool gp_sleep(void)
1136{
1137 spinlock_lock(&rcu.gp_lock);
1138
1139 int ret = 0;
1140 while (0 == rcu.req_expedited_cnt && 0 == ret) {
1141 /* minor bug: sleeps for the same duration if woken up spuriously. */
1142 ret = _condvar_wait_timeout_spinlock(&rcu.expedite_now, &rcu.gp_lock,
1143 DETECT_SLEEP_MS * 1000, SYNCH_FLAGS_INTERRUPTIBLE);
1144 }
1145
1146 if (0 < rcu.req_expedited_cnt) {
1147 --rcu.req_expedited_cnt;
1148 /* Update statistic. */
1149 ++rcu.stat_expedited_cnt;
1150 }
1151
1152 spinlock_unlock(&rcu.gp_lock);
1153
1154 return (ret != ESYNCH_INTERRUPTED);
1155}
1156
1157/** Actively interrupts and checks the offending cpus for quiescent states. */
1158static void interrupt_delaying_cpus(cpu_mask_t *cpu_mask)
1159{
1160 const size_t max_concurrent_calls = 16;
1161 smp_call_t call[max_concurrent_calls];
1162 size_t outstanding_calls = 0;
1163
1164 atomic_set(&rcu.delaying_cpu_cnt, 0);
1165
1166 cpu_mask_for_each(*cpu_mask, cpu_id) {
1167 smp_call_async(cpu_id, sample_local_cpu, 0, &call[outstanding_calls]);
1168 ++outstanding_calls;
1169
1170 /* Update statistic. */
1171 if (CPU->id != cpu_id)
1172 ++rcu.stat_smp_call_cnt;
1173
1174 if (outstanding_calls == max_concurrent_calls) {
1175 for (size_t k = 0; k < outstanding_calls; ++k) {
1176 smp_call_wait(&call[k]);
1177 }
1178
1179 outstanding_calls = 0;
1180 }
1181 }
1182
1183 for (size_t k = 0; k < outstanding_calls; ++k) {
1184 smp_call_wait(&call[k]);
1185 }
1186}
1187
1188/** Invoked on a cpu delaying grace period detection.
1189 *
1190 * Induces a quiescent state for the cpu or instructs the remaining
1191 * readers to notify the detector once they finish.
1192 */
1193static void sample_local_cpu(void *arg)
1194{
1195 ASSERT(interrupts_disabled());
1196 ASSERT(!CPU->rcu.is_delaying_gp);
1197
1198 /* Cpu did not pass a quiescent state yet. */
[8e3ed06]1199 if (CPU->rcu.last_seen_gp != _rcu_cur_gp) {
[181a746]1200 /* Interrupted a reader in a reader critical section. */
1201 if (0 < (*CPU->rcu.pnesting_cnt)) {
1202 ASSERT(!CPU->idle);
1203 /* Note to notify the detector from rcu_read_unlock(). */
1204 CPU->rcu.is_delaying_gp = true;
1205 atomic_inc(&rcu.delaying_cpu_cnt);
1206 } else {
1207 /*
1208 * The cpu did not enter any rcu reader sections since
1209 * the start of the current GP. Record a quiescent state.
1210 *
1211 * Or, we interrupted rcu_read_unlock() right before
1212 * it recorded a QS. Record a QS for it. The memory barrier
1213 * contains the reader section's mem accesses before
1214 * updating last_seen_gp.
1215 *
1216 * Or, we interrupted rcu_read_lock() right after it recorded
1217 * a QS for the previous GP but before it got a chance to
1218 * increment its nesting count. The memory barrier again
1219 * stops the CS code from spilling out of the CS.
1220 */
1221 memory_barrier();
[8e3ed06]1222 CPU->rcu.last_seen_gp = _rcu_cur_gp;
[181a746]1223 }
1224 } else {
1225 /*
1226 * This cpu already acknowledged that it had passed through
1227 * a quiescent state since the start of cur_gp.
1228 */
1229 }
1230
1231 /*
1232 * smp_call() makes sure any changes propagate back to the caller.
1233 * In particular, it makes the most current last_seen_gp visible
1234 * to the detector.
1235 */
1236}
1237
1238/** Waits for cpus delaying the current grace period if there are any. */
1239static bool wait_for_delaying_cpus(void)
1240{
1241 int delaying_cpu_cnt = atomic_get(&rcu.delaying_cpu_cnt);
[79d74fe]1242
[181a746]1243 for (int i = 0; i < delaying_cpu_cnt; ++i){
1244 if (!semaphore_down_interruptable(&rcu.remaining_readers))
1245 return false;
1246 }
1247
1248 /* Update statistic. */
1249 rcu.stat_delayed_cnt += delaying_cpu_cnt;
1250
1251 return true;
1252}
1253
1254/** Waits for any preempted readers blocking this grace period to finish.*/
1255static bool wait_for_preempt_reader(void)
1256{
1257 irq_spinlock_lock(&rcu.preempt_lock, true);
1258
1259 bool reader_exists = !list_empty(&rcu.cur_preempted);
1260 rcu.preempt_blocking_det = reader_exists;
1261
1262 irq_spinlock_unlock(&rcu.preempt_lock, true);
1263
1264 if (reader_exists) {
1265 /* Update statistic. */
1266 ++rcu.stat_preempt_blocking_cnt;
1267
1268 return semaphore_down_interruptable(&rcu.remaining_readers);
1269 }
1270
1271 return true;
1272}
1273
1274/** Called by the scheduler() when switching away from the current thread. */
1275void rcu_after_thread_ran(void)
1276{
1277 ASSERT(interrupts_disabled());
1278 ASSERT(CPU->rcu.pnesting_cnt == &THREAD->rcu.nesting_cnt);
1279
1280 /* Preempted a reader critical section for the first time. */
1281 if (0 < THREAD->rcu.nesting_cnt && !THREAD->rcu.was_preempted) {
1282 THREAD->rcu.was_preempted = true;
1283
1284 irq_spinlock_lock(&rcu.preempt_lock, false);
1285
[8e3ed06]1286 if (CPU->rcu.last_seen_gp != _rcu_cur_gp) {
[181a746]1287 /* The reader started before the GP started - we must wait for it.*/
1288 list_append(&THREAD->rcu.preempt_link, &rcu.cur_preempted);
1289 } else {
1290 /*
1291 * The reader started after the GP started and this cpu
1292 * already noted a quiescent state. We might block the next GP.
1293 */
1294 list_append(&THREAD->rcu.preempt_link, &rcu.next_preempted);
1295 }
1296
1297 irq_spinlock_unlock(&rcu.preempt_lock, false);
1298 }
1299
1300 /*
1301 * The preempted reader has been noted globally. There are therefore
1302 * no readers running on this cpu so this is a quiescent state.
1303 */
[8e3ed06]1304 _rcu_record_qs();
[181a746]1305
1306 /*
1307 * This cpu is holding up the current GP. Let the detector know
1308 * it has just passed a quiescent state.
1309 *
1310 * The detector waits separately for preempted readers, so we have
1311 * to notify the detector even if we have just preempted a reader.
1312 */
1313 if (CPU->rcu.is_delaying_gp) {
1314 CPU->rcu.is_delaying_gp = false;
1315 semaphore_up(&rcu.remaining_readers);
1316 }
1317
1318 /*
1319 * After this point THREAD is 0 and stays 0 until the scheduler()
1320 * switches to a new thread. Use a temporary nesting counter for readers
1321 * in handlers of interrupts that are raised while idle in the scheduler.
1322 */
1323 CPU->rcu.pnesting_cnt = &CPU->rcu.tmp_nesting_cnt;
1324
1325 /*
1326 * Forcefully associate the detector with the highest priority
1327 * even if preempted due to its time slice running out.
1328 *
1329 * todo: Replace with strict scheduler priority classes.
1330 */
1331 if (THREAD == rcu.detector_thr) {
1332 THREAD->priority = -1;
1333 }
[0cf813d]1334 else if (THREAD == CPU->rcu.reclaimer_thr) {
1335 THREAD->priority = -1;
1336 }
1337
1338 upd_max_cbs_in_slice();
1339}
1340
1341static void upd_max_cbs_in_slice(void)
1342{
1343 rcu_cpu_data_t *cr = &CPU->rcu;
1344
1345 if (cr->arriving_cbs_cnt > cr->last_arriving_cnt) {
1346 size_t arrived_cnt = cr->arriving_cbs_cnt - cr->last_arriving_cnt;
1347 cr->stat_max_slice_cbs = max(arrived_cnt, cr->stat_max_slice_cbs);
1348 }
1349
1350 cr->last_arriving_cnt = cr->arriving_cbs_cnt;
[181a746]1351}
1352
1353/** Called by the scheduler() when switching to a newly scheduled thread. */
1354void rcu_before_thread_runs(void)
1355{
1356 ASSERT(PREEMPTION_DISABLED || interrupts_disabled());
1357 ASSERT(&CPU->rcu.tmp_nesting_cnt == CPU->rcu.pnesting_cnt);
1358
1359 CPU->rcu.pnesting_cnt = &THREAD->rcu.nesting_cnt;
1360}
1361
1362
1363/** Prints RCU run-time statistics. */
1364void rcu_print_stat(void)
1365{
[0cf813d]1366 /*
1367 * Don't take locks. Worst case: we print outdated values.
1368 * CPU-local values are updated without any locks anyway, so there
1369 * are no locks to take in order to get up-to-date values.
1370 */
1371
1372 printf("Configuration: expedite_threshold=%d, critical_threshold=%d,"
1373 " detect_sleep=%dms\n",
1374 EXPEDITE_THRESHOLD, CRITICAL_THRESHOLD, DETECT_SLEEP_MS);
[181a746]1375 printf("Completed GPs: %" PRIu64 "\n", rcu.completed_gp);
1376 printf("Expedited GPs: %zu\n", rcu.stat_expedited_cnt);
1377 printf("Delayed GPs: %zu (cpus w/ still running readers after gp sleep)\n",
1378 rcu.stat_delayed_cnt);
1379 printf("Preempt blocked GPs: %zu (waited for preempted readers; "
[b68ae24]1380 "running or not)\n", rcu.stat_preempt_blocking_cnt);
1381 printf("Smp calls: %zu\n", rcu.stat_smp_call_cnt);
[181a746]1382
[0cf813d]1383 printf("Max arrived callbacks per GP and CPU:\n");
1384 for (unsigned int i = 0; i < config.cpu_count; ++i) {
[181a746]1385 printf(" %zu", cpus[i].rcu.stat_max_cbs);
1386 }
1387
[0cf813d]1388 printf("\nAvg arrived callbacks per GP and CPU (nonempty batches only):\n");
1389 for (unsigned int i = 0; i < config.cpu_count; ++i) {
[181a746]1390 printf(" %zu", cpus[i].rcu.stat_avg_cbs);
1391 }
1392
[0cf813d]1393 printf("\nMax arrived callbacks per time slice and CPU:\n");
1394 for (unsigned int i = 0; i < config.cpu_count; ++i) {
1395 printf(" %zu", cpus[i].rcu.stat_max_slice_cbs);
1396 }
1397
1398 printf("\nMissed GP notifications per CPU:\n");
1399 for (unsigned int i = 0; i < config.cpu_count; ++i) {
[181a746]1400 printf(" %zu", cpus[i].rcu.stat_missed_gps);
1401 }
[0cf813d]1402
1403 printf("\nMissed GP notifications per CPU while waking up:\n");
1404 for (unsigned int i = 0; i < config.cpu_count; ++i) {
1405 printf(" %zu", cpus[i].rcu.stat_missed_gp_in_wait);
1406 }
[181a746]1407 printf("\n");
1408}
1409
1410/** @}
1411 */
Note: See TracBrowser for help on using the repository browser.