/*
 * Copyright (c) 2012 Adam Hraska
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 * - Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in the
 *   documentation and/or other materials provided with the distribution.
 * - The name of the author may not be used to endorse or promote products
 *   derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/** @addtogroup liburcu
 * @{
 */
/**
 * @file
 *
 * User space RCU is based on URCU utilizing signals [1]. This
 * implementation, however, does not signal each thread of the process
 * to issue a memory barrier. Instead, we introduced a syscall that
 * issues memory barriers (via IPIs) on cpus that are running threads
 * of the current process. This has two advantages: first, it does not
 * require us to schedule and run every thread of the process; second,
 * IPIs are less intrusive than switching contexts and entering user
 * space.
 *
 * This algorithm is further modified to require a single reader group
 * change per grace period instead of two. Signal-URCU flips
 * the reader group and waits for readers of the previous group
 * twice in succession in order to wait for new readers that were
 * delayed and mistakenly associated with the previous reader group.
 * The modified algorithm ensures that the new reader group is
 * always empty (by explicitly waiting for it to become empty).
 * Only then does it flip the reader group and wait for preexisting
 * readers of the old reader group (an invariant of SRCU [2, 3]).
 *
 * [1] User-level implementations of read-copy update,
 *     2012, appendix
 *     http://www.rdrop.com/users/paulmck/RCU/urcu-supp-accepted.2011.08.30a.pdf
 *
 * [2] linux/kernel/srcu.c in Linux 3.5-rc2,
 *     2012
 *     http://tomoyo.sourceforge.jp/cgi-bin/lxr/source/kernel/srcu.c?v=linux-3.5-rc2-ccs-1.8.3
 *
 * [3] [RFC PATCH 5/5 single-thread-version] implement
 *     per-domain single-thread state machine,
 *     2012, Lai
 *     https://lkml.org/lkml/2012/3/6/586
 */
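
/*
 * Minimal usage sketch (illustrative only, not compiled as part of this
 * file): a reader fibril registers itself once, brackets accesses to
 * RCU-protected data with rcu_read_lock()/rcu_read_unlock(), and an
 * updater publishes a new version and calls _rcu_synchronize() before
 * freeing the old one. The shared_data/old/new_version names are
 * hypothetical.
 *
 * @code
 *	rcu_register_fibril();
 *
 *	rcu_read_lock();
 *	data_t *d = shared_data;   // read an RCU-protected pointer
 *	process(d);                // safe to use until rcu_read_unlock()
 *	rcu_read_unlock();
 *
 *	// Updater side:
 *	data_t *old = shared_data;
 *	shared_data = new_version;
 *	_rcu_synchronize(BM_BLOCK_FIBRIL);  // wait for preexisting readers
 *	free(old);
 *
 *	rcu_deregister_fibril();
 * @endcode
 */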

#include "rcu.h"
#include <fibril_synch.h>
#include <fibril.h>
#include <stdio.h>
#include <stddef.h>
#include <compiler/barrier.h>
#include <libarch/barrier.h>
#include <futex.h>
#include <macros.h>
#include <async.h>
#include <adt/list.h>
#include <smp_memory_barrier.h>
#include <assert.h>
#include <time.h>
#include <thread.h>

#include "private/fibril.h"


/** RCU sleeps for RCU_SLEEP_MS before polling an active RCU reader again. */
#define RCU_SLEEP_MS  10

#define RCU_NESTING_SHIFT  1
#define RCU_NESTING_INC  (1 << RCU_NESTING_SHIFT)
#define RCU_GROUP_BIT_MASK  (size_t)(RCU_NESTING_INC - 1)
#define RCU_GROUP_A  (size_t)(0 | RCU_NESTING_INC)
#define RCU_GROUP_B  (size_t)(1 | RCU_NESTING_INC)

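/*
 * A fibril's nesting_cnt packs two fields: bit 0 selects the reader
 * group (A or B) and the bits above RCU_NESTING_SHIFT count nested
 * rcu_read_lock() calls. RCU_GROUP_A/RCU_GROUP_B already include one
 * nesting increment, so an outermost rcu_read_lock() can assign the
 * whole word in a single store.
 */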

/** Fibril local RCU data. */
typedef struct fibril_rcu_data {
	size_t nesting_cnt;
	link_t link;
	bool registered;
} fibril_rcu_data_t;

/** Process global RCU data. */
typedef struct rcu_data {
	size_t cur_gp;
	size_t reader_group;
	futex_t list_futex;
	list_t fibrils_list;
	struct {
		futex_t futex;
		bool locked;
		list_t blocked_fibrils;
		size_t blocked_thread_cnt;
		futex_t futex_blocking_threads;
	} sync_lock;
} rcu_data_t;

typedef struct blocked_fibril {
	fid_t id;
	link_t link;
	bool is_ready;
} blocked_fibril_t;


/** Fibril local RCU data. */
static fibril_local fibril_rcu_data_t fibril_rcu = {
	.nesting_cnt = 0,
	.link = {
		.next = NULL,
		.prev = NULL
	},
	.registered = false
};

/** Process global RCU data. */
static rcu_data_t rcu = {
	.cur_gp = 0,
	.reader_group = RCU_GROUP_A,
	.list_futex = FUTEX_INITIALIZER,
	.fibrils_list = LIST_INITIALIZER(rcu.fibrils_list),
	.sync_lock = {
		.futex = FUTEX_INITIALIZER,
		.locked = false,
		.blocked_fibrils = LIST_INITIALIZER(rcu.sync_lock.blocked_fibrils),
		.blocked_thread_cnt = 0,
		.futex_blocking_threads = FUTEX_INITIALIZE(0),
	},
};


static void wait_for_readers(size_t reader_group, blocking_mode_t blocking_mode);
static void force_mb_in_all_threads(void);
static bool is_preexisting_reader(const fibril_rcu_data_t *fib, size_t group);

static void lock_sync(blocking_mode_t blocking_mode);
static void unlock_sync(void);
static void sync_sleep(blocking_mode_t blocking_mode);

static bool is_in_group(size_t nesting_cnt, size_t group);
static bool is_in_reader_section(size_t nesting_cnt);
static size_t get_other_group(size_t group);


/** Registers a fibril so it may start using RCU read sections.
 *
 * A fibril must be registered with rcu before it can enter RCU critical
 * sections delineated by rcu_read_lock() and rcu_read_unlock().
 */
void rcu_register_fibril(void)
{
	assert(!fibril_rcu.registered);

	futex_down(&rcu.list_futex);
	list_append(&fibril_rcu.link, &rcu.fibrils_list);
	futex_up(&rcu.list_futex);

	fibril_rcu.registered = true;
}

/** Deregisters a fibril that had been using RCU read sections.
 *
 * A fibril must be deregistered before it exits if it had
 * been registered with rcu via rcu_register_fibril().
 */
void rcu_deregister_fibril(void)
{
	assert(fibril_rcu.registered);

	/*
	 * Forcefully unlock any reader sections. The fibril is exiting
	 * so it is not holding any references to data protected by the
	 * rcu section. Therefore, it is safe to unlock. Otherwise,
	 * rcu_synchronize() would wait indefinitely.
	 */
	memory_barrier();
	fibril_rcu.nesting_cnt = 0;

	futex_down(&rcu.list_futex);
	list_remove(&fibril_rcu.link);
	futex_up(&rcu.list_futex);

	fibril_rcu.registered = false;
}

/** Delimits the start of an RCU reader critical section.
 *
 * RCU reader sections may be nested.
 */
void rcu_read_lock(void)
{
	assert(fibril_rcu.registered);

	size_t nesting_cnt = ACCESS_ONCE(fibril_rcu.nesting_cnt);

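	/*
	 * An outermost reader section adopts the current reader group,
	 * which already carries one nesting increment; nested sections
	 * merely bump the nesting count.
	 */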
	if (0 == (nesting_cnt >> RCU_NESTING_SHIFT)) {
		ACCESS_ONCE(fibril_rcu.nesting_cnt) = ACCESS_ONCE(rcu.reader_group);
		/* Required by MB_FORCE_L */
		compiler_barrier(); /* CC_BAR_L */
	} else {
		ACCESS_ONCE(fibril_rcu.nesting_cnt) = nesting_cnt + RCU_NESTING_INC;
	}
}

/** Delimits the end of an RCU reader critical section. */
void rcu_read_unlock(void)
{
	assert(fibril_rcu.registered);
	assert(rcu_read_locked());

	/* Required by MB_FORCE_U */
	compiler_barrier(); /* CC_BAR_U */
	/* todo: ACCESS_ONCE(nesting_cnt) ? */
	fibril_rcu.nesting_cnt -= RCU_NESTING_INC;
}

/** Returns true if the current fibril is in an RCU reader section. */
bool rcu_read_locked(void)
{
	return 0 != (fibril_rcu.nesting_cnt >> RCU_NESTING_SHIFT);
}

/** Blocks until all preexisting readers exit their critical sections. */
void _rcu_synchronize(blocking_mode_t blocking_mode)
{
	assert(!rcu_read_locked());

	/* Contain load of rcu.cur_gp. */
	memory_barrier();

	/* Approximately the number of the GP in progress. */
	size_t gp_in_progress = ACCESS_ONCE(rcu.cur_gp);

	lock_sync(blocking_mode);

	/*
	 * Exit early if we were stuck waiting for the mutex for a full grace
	 * period. We started waiting during gp_in_progress (or gp_in_progress + 1
	 * if the value propagated to this cpu too late), so we wait for the next
	 * full GP, gp_in_progress + 1, to finish. I.e. don't wait if the GP
	 * after that, gp_in_progress + 2, already started.
	 */
	/* rcu.cur_gp >= gp_in_progress + 2, but tolerates overflows. */
	if (rcu.cur_gp != gp_in_progress && rcu.cur_gp != gp_in_progress + 1) {
		unlock_sync();
		return;
	}

	++ACCESS_ONCE(rcu.cur_gp);

	/*
	 * Pairs up with MB_FORCE_L (ie CC_BAR_L). Makes changes prior
	 * to rcu_synchronize() visible to new readers.
	 */
	memory_barrier(); /* MB_A */

	/*
	 * Pairs up with MB_A.
	 *
	 * If the memory barrier is issued before CC_BAR_L in the target
	 * thread, it pairs up with MB_A and the thread sees all changes
	 * prior to rcu_synchronize(). I.e. any reader sections it enters
	 * are new rcu readers.
	 *
	 * If the memory barrier is issued after CC_BAR_L, it pairs up
	 * with MB_B and it will make the most recent nesting_cnt visible
	 * in this thread. Since the reader may have already accessed
	 * memory protected by RCU (it ran instructions past CC_BAR_L),
	 * it is a preexisting reader. Seeing the most recent nesting_cnt
	 * ensures the thread will be identified as a preexisting reader
	 * and we will wait for it in wait_for_readers(old_reader_group).
	 */
	force_mb_in_all_threads(); /* MB_FORCE_L */

	/*
	 * Pairs with MB_FORCE_L (ie CC_BAR_L, CC_BAR_U) and makes the most
	 * current fibril.nesting_cnt visible to this cpu.
	 */
	read_barrier(); /* MB_B */

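	/*
	 * SRCU invariant: before flipping, drain the group that new readers
	 * will be flipped into, so that a single flip per grace period
	 * suffices (see the file-level comment).
	 */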
	size_t new_reader_group = get_other_group(rcu.reader_group);
	wait_for_readers(new_reader_group, blocking_mode);

	/* Separates waiting for readers in new_reader_group from group flip. */
	memory_barrier();

	/* Flip the group new readers should associate with. */
	size_t old_reader_group = rcu.reader_group;
	rcu.reader_group = new_reader_group;

	/* Flip the group before waiting for preexisting readers in the old group. */
	memory_barrier();

	wait_for_readers(old_reader_group, blocking_mode);

	force_mb_in_all_threads(); /* MB_FORCE_U */

	unlock_sync();
}

/** Issues a memory barrier in each thread of this process. */
static void force_mb_in_all_threads(void)
{
	/*
	 * Only issue barriers in running threads. The scheduler will
	 * execute additional memory barriers when switching to threads
	 * of the process that are currently not running.
	 */
	smp_memory_barrier();
}

/** Waits for readers of reader_group to exit their reader sections. */
static void wait_for_readers(size_t reader_group, blocking_mode_t blocking_mode)
{
	futex_down(&rcu.list_futex);

	list_t quiescent_fibrils;
	list_initialize(&quiescent_fibrils);

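	/*
	 * Repeatedly scan the registered fibrils: move those that are not
	 * preexisting readers of reader_group onto the private quiescent
	 * list; whenever an active preexisting reader is found, sleep and
	 * rescan. The lists are merged back once no such reader remains.
	 */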
	while (!list_empty(&rcu.fibrils_list)) {
		list_foreach_safe(rcu.fibrils_list, fibril_it, next_fibril) {
			fibril_rcu_data_t *fib = member_to_inst(fibril_it,
			    fibril_rcu_data_t, link);

			if (is_preexisting_reader(fib, reader_group)) {
				futex_up(&rcu.list_futex);
				sync_sleep(blocking_mode);
				futex_down(&rcu.list_futex);
				/* Break out to the enclosing while loop. */
				break;
			} else {
				list_remove(fibril_it);
				list_append(fibril_it, &quiescent_fibrils);
			}
		}
	}

	list_concat(&rcu.fibrils_list, &quiescent_fibrils);
	futex_up(&rcu.list_futex);
}

static void lock_sync(blocking_mode_t blocking_mode)
{
	futex_down(&rcu.sync_lock.futex);
	if (rcu.sync_lock.locked) {
		if (blocking_mode == BM_BLOCK_FIBRIL) {
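			/*
			 * Enqueue this fibril and switch to the manager until
			 * unlock_sync() marks it ready. Recheck .locked after
			 * waking, since the lock may have been handed to a
			 * blocked thread instead.
			 */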
			blocked_fibril_t blocked_fib;
			blocked_fib.id = fibril_get_id();

			list_append(&blocked_fib.link, &rcu.sync_lock.blocked_fibrils);

			do {
				blocked_fib.is_ready = false;
				futex_up(&rcu.sync_lock.futex);
				fibril_switch(FIBRIL_TO_MANAGER);
				futex_down(&rcu.sync_lock.futex);
			} while (rcu.sync_lock.locked);

			list_remove(&blocked_fib.link);
			rcu.sync_lock.locked = true;
		} else {
			assert(blocking_mode == BM_BLOCK_THREAD);
			rcu.sync_lock.blocked_thread_cnt++;
			futex_up(&rcu.sync_lock.futex);
			futex_down(&rcu.sync_lock.futex_blocking_threads);
		}
	} else {
		rcu.sync_lock.locked = true;
	}
}

static void unlock_sync(void)
{
	assert(rcu.sync_lock.locked);

	/*
	 * Blocked threads have priority over fibrils when acquiring
	 * the sync lock. Pass the lock on to a waiting thread.
	 */
	if (0 < rcu.sync_lock.blocked_thread_cnt) {
		--rcu.sync_lock.blocked_thread_cnt;
		futex_up(&rcu.sync_lock.futex_blocking_threads);
	} else {
		/* Unlock, but wake up any fibrils waiting for the lock. */

		if (!list_empty(&rcu.sync_lock.blocked_fibrils)) {
			blocked_fibril_t *blocked_fib = member_to_inst(
			    list_first(&rcu.sync_lock.blocked_fibrils), blocked_fibril_t, link);

			if (!blocked_fib->is_ready) {
				blocked_fib->is_ready = true;
				fibril_add_ready(blocked_fib->id);
			}
		}

		rcu.sync_lock.locked = false;
		futex_up(&rcu.sync_lock.futex);
	}
}

static void sync_sleep(blocking_mode_t blocking_mode)
{
	assert(rcu.sync_lock.locked);
	/*
	 * Release the futex to avoid deadlocks in single-threaded apps,
	 * but keep sync locked.
	 */
	futex_up(&rcu.sync_lock.futex);

	if (blocking_mode == BM_BLOCK_FIBRIL) {
		async_usleep(RCU_SLEEP_MS * 1000);
	} else {
		thread_usleep(RCU_SLEEP_MS * 1000);
	}

	futex_down(&rcu.sync_lock.futex);
}


static bool is_preexisting_reader(const fibril_rcu_data_t *fib, size_t group)
{
	size_t nesting_cnt = ACCESS_ONCE(fib->nesting_cnt);

	return is_in_group(nesting_cnt, group) && is_in_reader_section(nesting_cnt);
}

static size_t get_other_group(size_t group)
{
	if (group == RCU_GROUP_A)
		return RCU_GROUP_B;
	else
		return RCU_GROUP_A;
}

static bool is_in_reader_section(size_t nesting_cnt)
{
	return RCU_NESTING_INC <= nesting_cnt;
}

static bool is_in_group(size_t nesting_cnt, size_t group)
{
	return (nesting_cnt & RCU_GROUP_BIT_MASK) == (group & RCU_GROUP_BIT_MASK);
}
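
/*
 * Worked example: with RCU_GROUP_A == 0b10 and RCU_GROUP_B == 0b11,
 * a fibril whose nesting_cnt is 0b101 is two reader sections deep
 * (0b101 >> RCU_NESTING_SHIFT == 2) and belongs to group B
 * (0b101 & RCU_GROUP_BIT_MASK == 1), so it satisfies both
 * is_in_reader_section() and is_in_group(0b101, RCU_GROUP_B).
 */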


/** @}
 */