source: mainline/uspace/lib/c/generic/rcu.c@514d561

Last change on this file: 514d561, checked in by Jiří Zárevúcky <jiri.zarevucky@…>, 7 years ago

Fibril/async implementation overhaul.

This commit marks the move towards treating the fibril library as a mere
implementation of a generic threading interface. Understood as a layer that
wraps the kernel threads, the library has to wrap not only threading itself, but
also every syscall that blocks the kernel thread (by blocking, we mean the thread
not doing useful work until an external event happens — e.g. locking a kernel
mutex or sleeping counts as blocking, but an as_area_create() call does not,
despite potentially taking a long time to complete).

Consequently, we implement fibril_ipc_wait() as a fibril-native wrapper for the
kernel's ipc_wait(), and also implement timer functionality like timeouts
as part of the fibril library. This removes the interdependency between the
fibril implementation and the async framework — in theory, the fibril API could
be reimplemented as a simple 1:1 shim, and the async framework would continue
working normally (note that the current implementation of the loader complicates
this).

To better isolate the fibril internals from the implementation of high-level
synchronization, a fibril_event_t is added. This object conceptually acts
like a single slot wait queue. All other synchronization is implemented in
terms of this primitive.
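To make that last point concrete, the following is a minimal sketch of the pattern this primitive enables: a one-shot hand-off between two fibrils. It uses only names that appear in the rcu.c listing below (fibril_event_t, FIBRIL_EVENT_INIT, fibril_wait_for(), fibril_notify()); the producer/consumer framing, the way the two fibrils are started, and the memory-ordering details are assumptions made for the example, not part of the commit.

#include <fibril.h>

/* A one-shot hand-off built on the single-slot wait queue. */
static fibril_event_t done = FIBRIL_EVENT_INIT;
static int result;

/* Producer fibril: publish the result, then wake whoever waits on 'done'. */
static void produce(int value)
{
    result = value;
    fibril_notify(&done);
}

/* Consumer fibril: park on 'done' until the producer has notified it. */
static void consume(void)
{
    fibril_wait_for(&done);
    /* 'result' is published; memory-ordering details are glossed over here. */
}

lock_sync() and unlock_sync() in the listing below use the same wait/notify pairing to build a fibril-blocking lock.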

/*
 * Copyright (c) 2012 Adam Hraska
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 * - Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in the
 *   documentation and/or other materials provided with the distribution.
 * - The name of the author may not be used to endorse or promote products
 *   derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/** @addtogroup liburcu
 * @{
 */
/**
 * @file
 *
 * User space RCU is based on URCU utilizing signals [1]. This
 * implementation does not, however, signal each thread of the process
 * to issue a memory barrier. Instead, we introduced a syscall that
 * issues memory barriers (via IPIs) on cpus that are running threads
 * of the current process. This has two advantages: first, it does not
 * require us to schedule and run every thread of the process; second,
 * IPIs are less intrusive than switching contexts and entering user
 * space.
 *
 * The algorithm of [1] is further modified to require a single reader
 * group change per grace period instead of two. Signal-URCU flips
 * the reader group and waits for readers of the previous group
 * twice in succession in order to wait for new readers that were
 * delayed and mistakenly associated with the previous reader group.
 * The modified algorithm ensures that the new reader group is
 * always empty (by explicitly waiting for it to become empty).
 * Only then does it flip the reader group and wait for preexisting
 * readers of the old reader group (an invariant of SRCU [2, 3]).
 *
 *
 * [1] User-level implementations of read-copy update,
 *     2012, appendix
 *     http://www.rdrop.com/users/paulmck/RCU/urcu-supp-accepted.2011.08.30a.pdf
 *
 * [2] linux/kernel/srcu.c in Linux 3.5-rc2,
 *     2012
 *     http://tomoyo.sourceforge.jp/cgi-bin/lxr/source/kernel/srcu.c?v=linux-3.5-rc2-ccs-1.8.3
 *
 * [3] [RFC PATCH 5/5 single-thread-version] implement
 *     per-domain single-thread state machine,
 *     2012, Lai
 *     https://lkml.org/lkml/2012/3/6/586
 */

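/*
 * In terms of the code below: rcu_read_lock() and rcu_read_unlock() form the
 * reader side, rcu_synchronize() detects a grace period by forcing a memory
 * barrier in every thread, waiting for the unused (new) reader group to
 * become empty, flipping rcu.reader_group and then waiting for preexisting
 * readers of the old group, and wait_for_readers() polls any still-active
 * reader every RCU_SLEEP_MS milliseconds.
 */
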
#include "rcu.h"
#include <fibril_synch.h>
#include <fibril.h>
#include <stdio.h>
#include <stddef.h>
#include <compiler/barrier.h>
#include <libarch/barrier.h>
#include <futex.h>
#include <macros.h>
#include <async.h>
#include <adt/list.h>
#include <smp_memory_barrier.h>
#include <assert.h>
#include <time.h>

#include "private/fibril.h"


/** RCU sleeps for RCU_SLEEP_MS before polling an active RCU reader again. */
#define RCU_SLEEP_MS 10

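/*
 * Note on the encoding used below: bit 0 of a fibril's nesting_cnt records
 * which reader group (A or B) the fibril associated with when entering its
 * outermost reader section, and the bits above RCU_NESTING_SHIFT count how
 * deeply its reader sections are nested. A value below RCU_NESTING_INC thus
 * means the fibril is currently outside all reader sections.
 */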
#define RCU_NESTING_SHIFT 1
#define RCU_NESTING_INC (1 << RCU_NESTING_SHIFT)
#define RCU_GROUP_BIT_MASK (size_t)(RCU_NESTING_INC - 1)
#define RCU_GROUP_A (size_t)(0 | RCU_NESTING_INC)
#define RCU_GROUP_B (size_t)(1 | RCU_NESTING_INC)


/** Fibril local RCU data. */
typedef struct fibril_rcu_data {
    size_t nesting_cnt;
    link_t link;
    bool registered;
} fibril_rcu_data_t;

/** Process global RCU data. */
typedef struct rcu_data {
    size_t cur_gp;
    size_t reader_group;
    fibril_rmutex_t list_mutex;
    list_t fibrils_list;
    struct {
        fibril_rmutex_t mutex;
        bool locked;
        list_t blocked_fibrils;
    } sync_lock;
} rcu_data_t;

typedef struct blocked_fibril {
    fibril_event_t unblock;
    link_t link;
    bool is_ready;
} blocked_fibril_t;


/** Fibril local RCU data. */
static fibril_local fibril_rcu_data_t fibril_rcu = {
    .nesting_cnt = 0,
    .link = {
        .next = NULL,
        .prev = NULL
    },
    .registered = false
};

/** Process global RCU data. */
static rcu_data_t rcu = {
    .cur_gp = 0,
    .reader_group = RCU_GROUP_A,
    .list_mutex = FIBRIL_RMUTEX_INITIALIZER(rcu.list_mutex),
    .fibrils_list = LIST_INITIALIZER(rcu.fibrils_list),
    .sync_lock = {
        .mutex = FIBRIL_RMUTEX_INITIALIZER(rcu.sync_lock.mutex),
        .locked = false,
        .blocked_fibrils = LIST_INITIALIZER(rcu.sync_lock.blocked_fibrils),
    },
};


static void wait_for_readers(size_t reader_group);
static void force_mb_in_all_threads(void);
static bool is_preexisting_reader(const fibril_rcu_data_t *fib, size_t group);

static void lock_sync(void);
static void unlock_sync(void);
static void sync_sleep(void);

static bool is_in_group(size_t nesting_cnt, size_t group);
static bool is_in_reader_section(size_t nesting_cnt);
static size_t get_other_group(size_t group);


/** Registers a fibril so it may start using RCU read sections.
 *
 * A fibril must be registered with rcu before it can enter RCU critical
 * sections delineated by rcu_read_lock() and rcu_read_unlock().
 */
void rcu_register_fibril(void)
{
    assert(!fibril_rcu.registered);

    fibril_rmutex_lock(&rcu.list_mutex);
    list_append(&fibril_rcu.link, &rcu.fibrils_list);
    fibril_rmutex_unlock(&rcu.list_mutex);

    fibril_rcu.registered = true;
}

/** Deregisters a fibril that had been using RCU read sections.
 *
 * A fibril must be deregistered before it exits if it had
 * been registered with rcu via rcu_register_fibril().
 */
void rcu_deregister_fibril(void)
{
    assert(fibril_rcu.registered);

    /*
     * Forcefully unlock any reader sections. The fibril is exiting
     * so it is not holding any references to data protected by the
     * rcu section. Therefore, it is safe to unlock. Otherwise,
     * rcu_synchronize() would wait indefinitely.
     */
    memory_barrier();
    fibril_rcu.nesting_cnt = 0;

    fibril_rmutex_lock(&rcu.list_mutex);
    list_remove(&fibril_rcu.link);
    fibril_rmutex_unlock(&rcu.list_mutex);

    fibril_rcu.registered = false;
}

/** Delimits the start of an RCU reader critical section.
 *
 * RCU reader sections may be nested.
 */
void rcu_read_lock(void)
{
    assert(fibril_rcu.registered);

    size_t nesting_cnt = ACCESS_ONCE(fibril_rcu.nesting_cnt);

    if (0 == (nesting_cnt >> RCU_NESTING_SHIFT)) {
        ACCESS_ONCE(fibril_rcu.nesting_cnt) = ACCESS_ONCE(rcu.reader_group);
        /* Required by MB_FORCE_L */
        compiler_barrier(); /* CC_BAR_L */
    } else {
        ACCESS_ONCE(fibril_rcu.nesting_cnt) = nesting_cnt + RCU_NESTING_INC;
    }
}

/** Delimits the end of an RCU reader critical section. */
void rcu_read_unlock(void)
{
    assert(fibril_rcu.registered);
    assert(rcu_read_locked());

    /* Required by MB_FORCE_U */
    compiler_barrier(); /* CC_BAR_U */
    /* todo: ACCESS_ONCE(nesting_cnt) ? */
    fibril_rcu.nesting_cnt -= RCU_NESTING_INC;
}

/** Returns true if the current fibril is in an RCU reader section. */
bool rcu_read_locked(void)
{
    return 0 != (fibril_rcu.nesting_cnt >> RCU_NESTING_SHIFT);
}

/** Blocks until all preexisting readers exit their critical sections. */
void rcu_synchronize(void)
{
    assert(!rcu_read_locked());

    /* Contain load of rcu.cur_gp. */
    memory_barrier();

    /* Approximately the number of the GP in progress. */
    size_t gp_in_progress = ACCESS_ONCE(rcu.cur_gp);

    lock_sync();

    /*
     * Exit early if we were stuck waiting for the mutex for a full grace
     * period. Started waiting during gp_in_progress (or gp_in_progress + 1
     * if the value propagated to this cpu too late) so wait for the next
     * full GP, gp_in_progress + 1, to finish. I.e. don't wait if the GP
     * after that, gp_in_progress + 2, already started.
     */
    /* rcu.cur_gp >= gp_in_progress + 2, but tolerates overflows. */
    if (rcu.cur_gp != gp_in_progress && rcu.cur_gp + 1 != gp_in_progress) {
        unlock_sync();
        return;
    }

    ++ACCESS_ONCE(rcu.cur_gp);

    /*
     * Pairs up with MB_FORCE_L (ie CC_BAR_L). Makes changes prior
     * to rcu_synchronize() visible to new readers.
     */
    memory_barrier(); /* MB_A */

    /*
     * Pairs up with MB_A.
     *
     * If the memory barrier is issued before CC_BAR_L in the target
     * thread, it pairs up with MB_A and the thread sees all changes
     * prior to rcu_synchronize(). I.e. any reader sections are new
     * rcu readers.
     *
     * If the memory barrier is issued after CC_BAR_L, it pairs up
     * with MB_B and it will make the most recent nesting_cnt visible
     * in this thread. Since the reader may have already accessed
     * memory protected by RCU (it ran instructions past CC_BAR_L),
     * it is a preexisting reader. Seeing the most recent nesting_cnt
     * ensures the thread will be identified as a preexisting reader
     * and we will wait for it in wait_for_readers(old_reader_group).
     */
    force_mb_in_all_threads(); /* MB_FORCE_L */

    /*
     * Pairs with MB_FORCE_L (ie CC_BAR_L, CC_BAR_U) and makes the most
     * current fibril.nesting_cnt visible to this cpu.
     */
    read_barrier(); /* MB_B */

    size_t new_reader_group = get_other_group(rcu.reader_group);
    wait_for_readers(new_reader_group);

    /* Separates waiting for readers in new_reader_group from group flip. */
    memory_barrier();

    /* Flip the group new readers should associate with. */
    size_t old_reader_group = rcu.reader_group;
    rcu.reader_group = new_reader_group;

    /* Flip the group before waiting for preexisting readers in the old group. */
    memory_barrier();

    wait_for_readers(old_reader_group);

    /* MB_FORCE_U */
    force_mb_in_all_threads(); /* MB_FORCE_U */

    unlock_sync();
}

/** Issues a memory barrier in each thread of this process. */
static void force_mb_in_all_threads(void)
{
    /*
     * Only issue barriers in running threads. The scheduler will
     * execute additional memory barriers when switching to threads
     * of the process that are currently not running.
     */
    smp_memory_barrier();
}

/** Waits for readers of reader_group to exit their reader sections. */
static void wait_for_readers(size_t reader_group)
{
    fibril_rmutex_lock(&rcu.list_mutex);

    list_t quiescent_fibrils;
    list_initialize(&quiescent_fibrils);

    while (!list_empty(&rcu.fibrils_list)) {
        list_foreach_safe(rcu.fibrils_list, fibril_it, next_fibril) {
            fibril_rcu_data_t *fib = member_to_inst(fibril_it,
                fibril_rcu_data_t, link);

            if (is_preexisting_reader(fib, reader_group)) {
                fibril_rmutex_unlock(&rcu.list_mutex);
                sync_sleep();
                fibril_rmutex_lock(&rcu.list_mutex);
                /* Break to the outer while loop. */
                break;
            } else {
                list_remove(fibril_it);
                list_append(fibril_it, &quiescent_fibrils);
            }
        }
    }

    list_concat(&rcu.fibrils_list, &quiescent_fibrils);
    fibril_rmutex_unlock(&rcu.list_mutex);
}

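/*
 * sync_lock serializes callers of rcu_synchronize(). A fibril that finds the
 * lock taken parks on its own fibril_event_t; unlock_sync() wakes the first
 * parked fibril, which then retries taking the lock. sync_sleep() temporarily
 * drops the backing mutex while an active reader is being polled, so that a
 * single-threaded program does not deadlock.
 */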
static void lock_sync(void)
{
    fibril_rmutex_lock(&rcu.sync_lock.mutex);
    if (rcu.sync_lock.locked) {
        blocked_fibril_t blocked_fib;
        blocked_fib.unblock = FIBRIL_EVENT_INIT;

        list_append(&blocked_fib.link, &rcu.sync_lock.blocked_fibrils);

        do {
            blocked_fib.is_ready = false;
            fibril_rmutex_unlock(&rcu.sync_lock.mutex);
            fibril_wait_for(&blocked_fib.unblock);
            fibril_rmutex_lock(&rcu.sync_lock.mutex);
        } while (rcu.sync_lock.locked);

        list_remove(&blocked_fib.link);
        rcu.sync_lock.locked = true;
    } else {
        rcu.sync_lock.locked = true;
    }
}

static void unlock_sync(void)
{
    assert(rcu.sync_lock.locked);

    /* Unlock but wake up any fibrils waiting for the lock. */

    if (!list_empty(&rcu.sync_lock.blocked_fibrils)) {
        blocked_fibril_t *blocked_fib = member_to_inst(
            list_first(&rcu.sync_lock.blocked_fibrils), blocked_fibril_t, link);

        if (!blocked_fib->is_ready) {
            blocked_fib->is_ready = true;
            fibril_notify(&blocked_fib->unblock);
        }
    }

    rcu.sync_lock.locked = false;
    fibril_rmutex_unlock(&rcu.sync_lock.mutex);
}

static void sync_sleep(void)
{
    assert(rcu.sync_lock.locked);
    /*
     * Release the futex to avoid deadlocks in singlethreaded apps
     * but keep sync locked.
     */
    fibril_rmutex_unlock(&rcu.sync_lock.mutex);
    fibril_usleep(RCU_SLEEP_MS * 1000);
    fibril_rmutex_lock(&rcu.sync_lock.mutex);
}


static bool is_preexisting_reader(const fibril_rcu_data_t *fib, size_t group)
{
    size_t nesting_cnt = ACCESS_ONCE(fib->nesting_cnt);

    return is_in_group(nesting_cnt, group) && is_in_reader_section(nesting_cnt);
}

static size_t get_other_group(size_t group)
{
    if (group == RCU_GROUP_A)
        return RCU_GROUP_B;
    else
        return RCU_GROUP_A;
}

static bool is_in_reader_section(size_t nesting_cnt)
{
    return RCU_NESTING_INC <= nesting_cnt;
}

static bool is_in_group(size_t nesting_cnt, size_t group)
{
    return (nesting_cnt & RCU_GROUP_BIT_MASK) == (group & RCU_GROUP_BIT_MASK);
}



/** @}
 */
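
To show how the pieces above fit together, here is a minimal usage sketch of the public API defined in this file (rcu_register_fibril(), rcu_read_lock()/rcu_read_unlock(), rcu_synchronize(), rcu_deregister_fibril()). The config_t type, the shared pointer, and the way it is published are illustrative assumptions, not part of this library; a real writer would publish the new pointer with an appropriate atomic store or barrier, which the sketch only marks with comments.

#include "rcu.h"
#include <stdlib.h>

/* Hypothetical shared state: a pointer readers follow and a writer replaces. */
typedef struct config {
    int value;
} config_t;

static config_t *shared_cfg;

/* Reader fibril: must register before entering reader sections.
 * (In real code a fibril would typically register once, when it starts.) */
static int read_value(void)
{
    rcu_register_fibril();

    rcu_read_lock();
    config_t *cfg = shared_cfg;    /* stays valid until rcu_read_unlock() */
    int v = cfg->value;
    rcu_read_unlock();

    rcu_deregister_fibril();
    return v;
}

/* Writer: install a new copy, wait out preexisting readers, free the old one. */
static void update_value(int new_value)
{
    config_t *new_cfg = malloc(sizeof(config_t));
    new_cfg->value = new_value;

    config_t *old_cfg = shared_cfg;
    shared_cfg = new_cfg;    /* publication; barriers omitted in this sketch */

    rcu_synchronize();       /* all readers that could see old_cfg are done */
    free(old_cfg);
}

The key property used here is the one documented above: once rcu_synchronize() returns, every reader section that could still observe old_cfg has exited, so freeing the old copy is safe.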