1 | /*
|
---|
2 | * Copyright (c) 2012 Adam Hraska
|
---|
3 | * All rights reserved.
|
---|
4 | *
|
---|
5 | * Redistribution and use in source and binary forms, with or without
|
---|
6 | * modification, are permitted provided that the following conditions
|
---|
7 | * are met:
|
---|
8 | *
|
---|
9 | * - Redistributions of source code must retain the above copyright
|
---|
10 | * notice, this list of conditions and the following disclaimer.
|
---|
11 | * - Redistributions in binary form must reproduce the above copyright
|
---|
12 | * notice, this list of conditions and the following disclaimer in the
|
---|
13 | * documentation and/or other materials provided with the distribution.
|
---|
14 | * - The name of the author may not be used to endorse or promote products
|
---|
15 | * derived from this software without specific prior written permission.
|
---|
16 | *
|
---|
17 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
|
---|
18 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
---|
19 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
---|
20 | * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
|
---|
21 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
---|
22 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
---|
23 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
---|
24 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
---|
25 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
---|
26 | * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
---|
27 | */
|
---|
28 |
|
---|
29 | /** @addtogroup liburcu
|
---|
30 | * @{
|
---|
31 | */
|
---|
32 | /**
|
---|
33 | * @file
|
---|
34 | *
|
---|
35 | * User space RCU is based on URCU utilizing signals [1]. This
|
---|
36 | * implementation does not however signal each thread of the process
|
---|
37 | * to issue a memory barrier. Instead, we introduced a syscall that
|
---|
38 | * issues memory barriers (via IPIs) on cpus that are running threads
|
---|
39 | * of the current process. First, it does not require us to schedule
|
---|
40 | * and run every thread of the process. Second, IPIs are less intrusive
|
---|
41 | * than switching contexts and entering user space.
|
---|
42 | *
|
---|
43 | * This algorithm is further modified to require a single instead of
|
---|
44 | * two reader group changes per grace period. Signal-URCU flips
|
---|
45 | * the reader group and waits for readers of the previous group
|
---|
46 | * twice in succession in order to wait for new readers that were
|
---|
47 | * delayed and mistakenly associated with the previous reader group.
|
---|
48 | * The modified algorithm ensures that the new reader group is
|
---|
49 | * always empty (by explicitly waiting for it to become empty).
|
---|
50 | * Only then does it flip the reader group and wait for preexisting
|
---|
51 | * readers of the old reader group (invariant of SRCU [2, 3]).
|
---|
52 | *
|
---|
53 | *
|
---|
54 | * [1] User-level implementations of read-copy update,
|
---|
55 | * 2012, appendix
|
---|
56 | * http://www.rdrop.com/users/paulmck/RCU/urcu-supp-accepted.2011.08.30a.pdf
|
---|
57 | *
|
---|
58 | * [2] linux/kernel/srcu.c in Linux 3.5-rc2,
|
---|
59 | * 2012
|
---|
60 | * http://tomoyo.sourceforge.jp/cgi-bin/lxr/source/kernel/srcu.c?v=linux-3.5-rc2-ccs-1.8.3
|
---|
61 | *
|
---|
62 | * [3] [RFC PATCH 5/5 single-thread-version] implement
|
---|
63 | * per-domain single-thread state machine,
|
---|
64 | * 2012, Lai
|
---|
65 | * https://lkml.org/lkml/2012/3/6/586
|
---|
66 | */
|
---|
67 |
|
---|
68 | #include "rcu.h"
|
---|
69 | #include <fibril_synch.h>
|
---|
70 | #include <fibril.h>
|
---|
71 | #include <stdio.h>
|
---|
72 | #include <stddef.h>
|
---|
73 | #include <barrier.h>
|
---|
74 | #include <macros.h>
|
---|
75 | #include <async.h>
|
---|
76 | #include <adt/list.h>
|
---|
77 | #include <barrier.h>
|
---|
78 | #include <assert.h>
|
---|
79 | #include <time.h>
|
---|
80 |
|
---|
81 | #include "../private/fibril.h"
|
---|
82 |
|
---|
83 |
|
---|
/** Time in milliseconds to sleep before polling an active RCU reader again. */
#define RCU_SLEEP_MS        10

/*
 * Layout of a nesting_cnt word: the bits below RCU_NESTING_SHIFT select
 * the reader group, the remaining bits hold the reader section nesting
 * depth. Entering a nested reader section adds RCU_NESTING_INC.
 *
 * RCU_GROUP_A and RCU_GROUP_B already include one RCU_NESTING_INC, ie they
 * are the values of nesting_cnt for a reader of the given group at
 * nesting depth 1.
 */
#define RCU_NESTING_SHIFT   1
#define RCU_NESTING_INC     (1 << RCU_NESTING_SHIFT)
#define RCU_GROUP_BIT_MASK  ((size_t) (RCU_NESTING_INC - 1))
#define RCU_GROUP_A         ((size_t) (0 | RCU_NESTING_INC))
#define RCU_GROUP_B         ((size_t) (1 | RCU_NESTING_INC))
|
---|
92 |
|
---|
93 |
|
---|
94 | /** Fibril local RCU data. */
|
---|
95 | typedef struct fibril_rcu_data {
|
---|
96 | size_t nesting_cnt;
|
---|
97 | link_t link;
|
---|
98 | bool registered;
|
---|
99 | } fibril_rcu_data_t;
|
---|
100 |
|
---|
101 | /** Process global RCU data. */
|
---|
102 | typedef struct rcu_data {
|
---|
103 | size_t cur_gp;
|
---|
104 | size_t reader_group;
|
---|
105 | fibril_rmutex_t list_mutex;
|
---|
106 | list_t fibrils_list;
|
---|
107 | struct {
|
---|
108 | fibril_rmutex_t mutex;
|
---|
109 | bool locked;
|
---|
110 | list_t blocked_fibrils;
|
---|
111 | } sync_lock;
|
---|
112 | } rcu_data_t;
|
---|
113 |
|
---|
114 | typedef struct blocked_fibril {
|
---|
115 | fibril_event_t unblock;
|
---|
116 | link_t link;
|
---|
117 | bool is_ready;
|
---|
118 | } blocked_fibril_t;
|
---|
119 |
|
---|
120 |
|
---|
121 | /** Fibril local RCU data. */
|
---|
122 | static fibril_local fibril_rcu_data_t fibril_rcu = {
|
---|
123 | .nesting_cnt = 0,
|
---|
124 | .link = {
|
---|
125 | .next = NULL,
|
---|
126 | .prev = NULL
|
---|
127 | },
|
---|
128 | .registered = false
|
---|
129 | };
|
---|
130 |
|
---|
131 | /** Process global RCU data. */
|
---|
132 | static rcu_data_t rcu = {
|
---|
133 | .cur_gp = 0,
|
---|
134 | .reader_group = RCU_GROUP_A,
|
---|
135 | .list_mutex = FIBRIL_RMUTEX_INITIALIZER(rcu.list_mutex),
|
---|
136 | .fibrils_list = LIST_INITIALIZER(rcu.fibrils_list),
|
---|
137 | .sync_lock = {
|
---|
138 | .mutex = FIBRIL_RMUTEX_INITIALIZER(rcu.sync_lock.mutex),
|
---|
139 | .locked = false,
|
---|
140 | .blocked_fibrils = LIST_INITIALIZER(rcu.sync_lock.blocked_fibrils),
|
---|
141 | },
|
---|
142 | };
|
---|
143 |
|
---|
144 |
|
---|
145 | static void wait_for_readers(size_t reader_group);
|
---|
146 | static void force_mb_in_all_threads(void);
|
---|
147 | static bool is_preexisting_reader(const fibril_rcu_data_t *fib, size_t group);
|
---|
148 |
|
---|
149 | static void lock_sync(void);
|
---|
150 | static void unlock_sync(void);
|
---|
151 | static void sync_sleep(void);
|
---|
152 |
|
---|
153 | static bool is_in_group(size_t nesting_cnt, size_t group);
|
---|
154 | static bool is_in_reader_section(size_t nesting_cnt);
|
---|
155 | static size_t get_other_group(size_t group);
|
---|
156 |
|
---|
157 |
|
---|
158 | /** Registers a fibril so it may start using RCU read sections.
|
---|
159 | *
|
---|
160 | * A fibril must be registered with rcu before it can enter RCU critical
|
---|
161 | * sections delineated by rcu_read_lock() and rcu_read_unlock().
|
---|
162 | */
|
---|
163 | void rcu_register_fibril(void)
|
---|
164 | {
|
---|
165 | assert(!fibril_rcu.registered);
|
---|
166 |
|
---|
167 | fibril_rmutex_lock(&rcu.list_mutex);
|
---|
168 | list_append(&fibril_rcu.link, &rcu.fibrils_list);
|
---|
169 | fibril_rmutex_unlock(&rcu.list_mutex);
|
---|
170 |
|
---|
171 | fibril_rcu.registered = true;
|
---|
172 | }
|
---|
173 |
|
---|
174 | /** Deregisters a fibril that had been using RCU read sections.
|
---|
175 | *
|
---|
176 | * A fibril must be deregistered before it exits if it had
|
---|
177 | * been registered with rcu via rcu_register_fibril().
|
---|
178 | */
|
---|
179 | void rcu_deregister_fibril(void)
|
---|
180 | {
|
---|
181 | assert(fibril_rcu.registered);
|
---|
182 |
|
---|
183 | /*
|
---|
184 | * Forcefully unlock any reader sections. The fibril is exiting
|
---|
185 | * so it is not holding any references to data protected by the
|
---|
186 | * rcu section. Therefore, it is safe to unlock. Otherwise,
|
---|
187 | * rcu_synchronize() would wait indefinitely.
|
---|
188 | */
|
---|
189 | memory_barrier();
|
---|
190 | fibril_rcu.nesting_cnt = 0;
|
---|
191 |
|
---|
192 | fibril_rmutex_lock(&rcu.list_mutex);
|
---|
193 | list_remove(&fibril_rcu.link);
|
---|
194 | fibril_rmutex_unlock(&rcu.list_mutex);
|
---|
195 |
|
---|
196 | fibril_rcu.registered = false;
|
---|
197 | }
|
---|
198 |
|
---|
199 | /** Delimits the start of an RCU reader critical section.
|
---|
200 | *
|
---|
201 | * RCU reader sections may be nested.
|
---|
202 | */
|
---|
203 | void rcu_read_lock(void)
|
---|
204 | {
|
---|
205 | assert(fibril_rcu.registered);
|
---|
206 |
|
---|
207 | size_t nesting_cnt = ACCESS_ONCE(fibril_rcu.nesting_cnt);
|
---|
208 |
|
---|
209 | if (0 == (nesting_cnt >> RCU_NESTING_SHIFT)) {
|
---|
210 | ACCESS_ONCE(fibril_rcu.nesting_cnt) = ACCESS_ONCE(rcu.reader_group);
|
---|
211 | /* Required by MB_FORCE_L */
|
---|
212 | compiler_barrier(); /* CC_BAR_L */
|
---|
213 | } else {
|
---|
214 | ACCESS_ONCE(fibril_rcu.nesting_cnt) = nesting_cnt + RCU_NESTING_INC;
|
---|
215 | }
|
---|
216 | }
|
---|
217 |
|
---|
218 | /** Delimits the end of an RCU reader critical section. */
|
---|
219 | void rcu_read_unlock(void)
|
---|
220 | {
|
---|
221 | assert(fibril_rcu.registered);
|
---|
222 | assert(rcu_read_locked());
|
---|
223 |
|
---|
224 | /* Required by MB_FORCE_U */
|
---|
225 | compiler_barrier(); /* CC_BAR_U */
|
---|
226 | /* todo: ACCESS_ONCE(nesting_cnt) ? */
|
---|
227 | fibril_rcu.nesting_cnt -= RCU_NESTING_INC;
|
---|
228 | }
|
---|
229 |
|
---|
230 | /** Returns true if the current fibril is in an RCU reader section. */
|
---|
231 | bool rcu_read_locked(void)
|
---|
232 | {
|
---|
233 | return 0 != (fibril_rcu.nesting_cnt >> RCU_NESTING_SHIFT);
|
---|
234 | }
|
---|
235 |
|
---|
236 | /** Blocks until all preexisting readers exit their critical sections. */
|
---|
237 | void rcu_synchronize(void)
|
---|
238 | {
|
---|
239 | assert(!rcu_read_locked());
|
---|
240 |
|
---|
241 | /* Contain load of rcu.cur_gp. */
|
---|
242 | memory_barrier();
|
---|
243 |
|
---|
244 | /* Approximately the number of the GP in progress. */
|
---|
245 | size_t gp_in_progress = ACCESS_ONCE(rcu.cur_gp);
|
---|
246 |
|
---|
247 | lock_sync();
|
---|
248 |
|
---|
249 | /*
|
---|
250 | * Exit early if we were stuck waiting for the mutex for a full grace
|
---|
251 | * period. Started waiting during gp_in_progress (or gp_in_progress + 1
|
---|
252 | * if the value propagated to this cpu too late) so wait for the next
|
---|
253 | * full GP, gp_in_progress + 1, to finish. Ie don't wait if the GP
|
---|
254 | * after that, gp_in_progress + 2, already started.
|
---|
255 | */
|
---|
256 | /* rcu.cur_gp >= gp_in_progress + 2, but tolerates overflows. */
|
---|
257 | if (rcu.cur_gp != gp_in_progress && rcu.cur_gp + 1 != gp_in_progress) {
|
---|
258 | unlock_sync();
|
---|
259 | return;
|
---|
260 | }
|
---|
261 |
|
---|
262 | ++ACCESS_ONCE(rcu.cur_gp);
|
---|
263 |
|
---|
264 | /*
|
---|
265 | * Pairs up with MB_FORCE_L (ie CC_BAR_L). Makes changes prior
|
---|
266 | * to rcu_synchronize() visible to new readers.
|
---|
267 | */
|
---|
268 | memory_barrier(); /* MB_A */
|
---|
269 |
|
---|
270 | /*
|
---|
271 | * Pairs up with MB_A.
|
---|
272 | *
|
---|
273 | * If the memory barrier is issued before CC_BAR_L in the target
|
---|
274 | * thread, it pairs up with MB_A and the thread sees all changes
|
---|
275 | * prior to rcu_synchronize(). Ie any reader sections are new
|
---|
276 | * rcu readers.
|
---|
277 | *
|
---|
278 | * If the memory barrier is issued after CC_BAR_L, it pairs up
|
---|
279 | * with MB_B and it will make the most recent nesting_cnt visible
|
---|
280 | * in this thread. Since the reader may have already accessed
|
---|
281 | * memory protected by RCU (it ran instructions passed CC_BAR_L),
|
---|
282 | * it is a preexisting reader. Seeing the most recent nesting_cnt
|
---|
283 | * ensures the thread will be identified as a preexisting reader
|
---|
284 | * and we will wait for it in wait_for_readers(old_reader_group).
|
---|
285 | */
|
---|
286 | force_mb_in_all_threads(); /* MB_FORCE_L */
|
---|
287 |
|
---|
288 | /*
|
---|
289 | * Pairs with MB_FORCE_L (ie CC_BAR_L, CC_BAR_U) and makes the most
|
---|
290 | * current fibril.nesting_cnt visible to this cpu.
|
---|
291 | */
|
---|
292 | read_barrier(); /* MB_B */
|
---|
293 |
|
---|
294 | size_t new_reader_group = get_other_group(rcu.reader_group);
|
---|
295 | wait_for_readers(new_reader_group);
|
---|
296 |
|
---|
297 | /* Separates waiting for readers in new_reader_group from group flip. */
|
---|
298 | memory_barrier();
|
---|
299 |
|
---|
300 | /* Flip the group new readers should associate with. */
|
---|
301 | size_t old_reader_group = rcu.reader_group;
|
---|
302 | rcu.reader_group = new_reader_group;
|
---|
303 |
|
---|
304 | /* Flip the group before waiting for preexisting readers in the old group.*/
|
---|
305 | memory_barrier();
|
---|
306 |
|
---|
307 | wait_for_readers(old_reader_group);
|
---|
308 |
|
---|
309 | /* MB_FORCE_U */
|
---|
310 | force_mb_in_all_threads(); /* MB_FORCE_U */
|
---|
311 |
|
---|
312 | unlock_sync();
|
---|
313 | }
|
---|
314 |
|
---|
/** Forces a memory barrier in every thread of this process. */
static void force_mb_in_all_threads(void)
{
	/*
	 * Barriers are only issued in threads that are running right now.
	 * Threads of the process that are not running get their memory
	 * barriers from the scheduler once it switches to them.
	 */
	smp_memory_barrier();
}
|
---|
325 |
|
---|
326 | /** Waits for readers of reader_group to exit their readers sections. */
|
---|
327 | static void wait_for_readers(size_t reader_group)
|
---|
328 | {
|
---|
329 | fibril_rmutex_lock(&rcu.list_mutex);
|
---|
330 |
|
---|
331 | list_t quiescent_fibrils;
|
---|
332 | list_initialize(&quiescent_fibrils);
|
---|
333 |
|
---|
334 | while (!list_empty(&rcu.fibrils_list)) {
|
---|
335 | list_foreach_safe(rcu.fibrils_list, fibril_it, next_fibril) {
|
---|
336 | fibril_rcu_data_t *fib = member_to_inst(fibril_it,
|
---|
337 | fibril_rcu_data_t, link);
|
---|
338 |
|
---|
339 | if (is_preexisting_reader(fib, reader_group)) {
|
---|
340 | fibril_rmutex_unlock(&rcu.list_mutex);
|
---|
341 | sync_sleep();
|
---|
342 | fibril_rmutex_lock(&rcu.list_mutex);
|
---|
343 | /* Break to while loop. */
|
---|
344 | break;
|
---|
345 | } else {
|
---|
346 | list_remove(fibril_it);
|
---|
347 | list_append(fibril_it, &quiescent_fibrils);
|
---|
348 | }
|
---|
349 | }
|
---|
350 | }
|
---|
351 |
|
---|
352 | list_concat(&rcu.fibrils_list, &quiescent_fibrils);
|
---|
353 | fibril_rmutex_unlock(&rcu.list_mutex);
|
---|
354 | }
|
---|
355 |
|
---|
356 | static void lock_sync(void)
|
---|
357 | {
|
---|
358 | fibril_rmutex_lock(&rcu.sync_lock.mutex);
|
---|
359 | if (rcu.sync_lock.locked) {
|
---|
360 | blocked_fibril_t blocked_fib;
|
---|
361 | blocked_fib.unblock = FIBRIL_EVENT_INIT;
|
---|
362 |
|
---|
363 | list_append(&blocked_fib.link, &rcu.sync_lock.blocked_fibrils);
|
---|
364 |
|
---|
365 | do {
|
---|
366 | blocked_fib.is_ready = false;
|
---|
367 | fibril_rmutex_unlock(&rcu.sync_lock.mutex);
|
---|
368 | fibril_wait_for(&blocked_fib.unblock);
|
---|
369 | fibril_rmutex_lock(&rcu.sync_lock.mutex);
|
---|
370 | } while (rcu.sync_lock.locked);
|
---|
371 |
|
---|
372 | list_remove(&blocked_fib.link);
|
---|
373 | rcu.sync_lock.locked = true;
|
---|
374 | } else {
|
---|
375 | rcu.sync_lock.locked = true;
|
---|
376 | }
|
---|
377 | }
|
---|
378 |
|
---|
379 | static void unlock_sync(void)
|
---|
380 | {
|
---|
381 | assert(rcu.sync_lock.locked);
|
---|
382 |
|
---|
383 | /* Unlock but wake up any fibrils waiting for the lock. */
|
---|
384 |
|
---|
385 | if (!list_empty(&rcu.sync_lock.blocked_fibrils)) {
|
---|
386 | blocked_fibril_t *blocked_fib = member_to_inst(
|
---|
387 | list_first(&rcu.sync_lock.blocked_fibrils), blocked_fibril_t, link);
|
---|
388 |
|
---|
389 | if (!blocked_fib->is_ready) {
|
---|
390 | blocked_fib->is_ready = true;
|
---|
391 | fibril_notify(&blocked_fib->unblock);
|
---|
392 | }
|
---|
393 | }
|
---|
394 |
|
---|
395 | rcu.sync_lock.locked = false;
|
---|
396 | fibril_rmutex_unlock(&rcu.sync_lock.mutex);
|
---|
397 | }
|
---|
398 |
|
---|
399 | static void sync_sleep(void)
|
---|
400 | {
|
---|
401 | assert(rcu.sync_lock.locked);
|
---|
402 | /*
|
---|
403 | * Release the futex to avoid deadlocks in singlethreaded apps
|
---|
404 | * but keep sync locked.
|
---|
405 | */
|
---|
406 | fibril_rmutex_unlock(&rcu.sync_lock.mutex);
|
---|
407 | fibril_usleep(RCU_SLEEP_MS * 1000);
|
---|
408 | fibril_rmutex_lock(&rcu.sync_lock.mutex);
|
---|
409 | }
|
---|
410 |
|
---|
411 |
|
---|
412 | static bool is_preexisting_reader(const fibril_rcu_data_t *fib, size_t group)
|
---|
413 | {
|
---|
414 | size_t nesting_cnt = ACCESS_ONCE(fib->nesting_cnt);
|
---|
415 |
|
---|
416 | return is_in_group(nesting_cnt, group) && is_in_reader_section(nesting_cnt);
|
---|
417 | }
|
---|
418 |
|
---|
419 | static size_t get_other_group(size_t group)
|
---|
420 | {
|
---|
421 | if (group == RCU_GROUP_A)
|
---|
422 | return RCU_GROUP_B;
|
---|
423 | else
|
---|
424 | return RCU_GROUP_A;
|
---|
425 | }
|
---|
426 |
|
---|
427 | static bool is_in_reader_section(size_t nesting_cnt)
|
---|
428 | {
|
---|
429 | return RCU_NESTING_INC <= nesting_cnt;
|
---|
430 | }
|
---|
431 |
|
---|
432 | static bool is_in_group(size_t nesting_cnt, size_t group)
|
---|
433 | {
|
---|
434 | return (nesting_cnt & RCU_GROUP_BIT_MASK) == (group & RCU_GROUP_BIT_MASK);
|
---|
435 | }
|
---|
436 |
|
---|
437 |
|
---|
438 |
|
---|
439 | /** @}
|
---|
440 | */
|
---|