| 1 | /*
|
|---|
| 2 | * Copyright (c) 2012 Adam Hraska
|
|---|
| 3 | * All rights reserved.
|
|---|
| 4 | *
|
|---|
| 5 | * Redistribution and use in source and binary forms, with or without
|
|---|
| 6 | * modification, are permitted provided that the following conditions
|
|---|
| 7 | * are met:
|
|---|
| 8 | *
|
|---|
| 9 | * - Redistributions of source code must retain the above copyright
|
|---|
| 10 | * notice, this list of conditions and the following disclaimer.
|
|---|
| 11 | * - Redistributions in binary form must reproduce the above copyright
|
|---|
| 12 | * notice, this list of conditions and the following disclaimer in the
|
|---|
| 13 | * documentation and/or other materials provided with the distribution.
|
|---|
| 14 | * - The name of the author may not be used to endorse or promote products
|
|---|
| 15 | * derived from this software without specific prior written permission.
|
|---|
| 16 | *
|
|---|
| 17 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
|
|---|
| 18 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
|---|
| 19 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
|---|
| 20 | * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|---|
| 21 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
|---|
| 22 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|---|
| 23 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|---|
| 24 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|---|
| 25 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
|---|
| 26 | * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|---|
| 27 | */
|
|---|
| 28 |
|
|---|
| 29 | /** @addtogroup liburcu
|
|---|
| 30 | * @{
|
|---|
| 31 | */
|
|---|
| 32 | /**
|
|---|
| 33 | * @file
|
|---|
| 34 | *
|
|---|
 * User space RCU is based on URCU utilizing signals [1]. This
 * implementation, however, does not signal each thread of the process
 * to issue a memory barrier. Instead, we introduced a syscall that
 * issues memory barriers (via IPIs) on the cpus that are running threads
 * of the current process. This has two advantages: first, it does not
 * require us to schedule and run every thread of the process; second,
 * IPIs are less intrusive than switching contexts and entering user
 * space.
|
|---|
| 42 | *
|
|---|
| 43 | * This algorithm is further modified to require a single instead of
|
|---|
| 44 | * two reader group changes per grace period. Signal-URCU flips
|
|---|
| 45 | * the reader group and waits for readers of the previous group
|
|---|
| 46 | * twice in succession in order to wait for new readers that were
|
|---|
| 47 | * delayed and mistakenly associated with the previous reader group.
|
|---|
| 48 | * The modified algorithm ensures that the new reader group is
|
|---|
| 49 | * always empty (by explicitly waiting for it to become empty).
|
|---|
| 50 | * Only then does it flip the reader group and wait for preexisting
|
|---|
| 51 | * readers of the old reader group (invariant of SRCU [2, 3]).
|
|---|
| 52 | *
|
|---|
| 53 | *
|
|---|
| 54 | * [1] User-level implementations of read-copy update,
|
|---|
| 55 | * 2012, appendix
|
|---|
| 56 | * http://www.rdrop.com/users/paulmck/RCU/urcu-supp-accepted.2011.08.30a.pdf
|
|---|
| 57 | *
|
|---|
| 58 | * [2] linux/kernel/srcu.c in Linux 3.5-rc2,
|
|---|
| 59 | * 2012
|
|---|
| 60 | * http://tomoyo.sourceforge.jp/cgi-bin/lxr/source/kernel/srcu.c?v=linux-3.5-rc2-ccs-1.8.3
|
|---|
| 61 | *
|
|---|
| 62 | * [3] [RFC PATCH 5/5 single-thread-version] implement
|
|---|
| 63 | * per-domain single-thread state machine,
|
|---|
| 64 | * 2012, Lai
|
|---|
| 65 | * https://lkml.org/lkml/2012/3/6/586
|
|---|
| 66 | */
|
|---|
| 67 |
|
|---|
| 68 | #include "rcu.h"
|
|---|
| 69 | #include <fibril_synch.h>
|
|---|
| 70 | #include <fibril.h>
|
|---|
| 71 | #include <stdio.h>
|
|---|
| 72 | #include <compiler/barrier.h>
|
|---|
| 73 | #include <libarch/barrier.h>
|
|---|
| 74 | #include <adt/list.h>
|
|---|
| 75 | #include <futex.h>
|
|---|
| 76 | #include <macros.h>
|
|---|
| 77 | #include <async.h>
|
|---|
| 78 | #include <smp_memory_barrier.h>
|
|---|
| 79 |
|
|---|
| 80 | #define RCU_SLEEP_MS 10
|
|---|
| 81 |
|
|---|
| 82 | #define RCU_NESTING_SHIFT 1
|
|---|
| 83 | #define RCU_NESTING_INC (1 << RCU_NESTING_SHIFT)
|
|---|
| 84 | #define RCU_GROUP_BIT_MASK (size_t)(RCU_NESTING_INC - 1)
|
|---|
| 85 | #define RCU_GROUP_A (size_t)(0 | RCU_NESTING_INC)
|
|---|
| 86 | #define RCU_GROUP_B (size_t)(1 | RCU_NESTING_INC)
|
|---|
| 87 |
|
|---|
| 88 |
|
|---|
| 89 | typedef struct rcu_fibril_data {
|
|---|
| 90 | size_t nesting_cnt;
|
|---|
| 91 | link_t link;
|
|---|
| 92 | } rcu_fibril_data_t;
|
|---|
| 93 |
|
|---|
| 94 | typedef struct rcu_data {
|
|---|
| 95 | fibril_mutex_t mtx;
|
|---|
| 96 | size_t cur_gp;
|
|---|
| 97 | size_t reader_group;
|
|---|
| 98 | futex_t list_futex;
|
|---|
| 99 | list_t fibrils_list;
|
|---|
| 100 | } rcu_data_t;
|
|---|
| 101 |
|
|---|
| 102 |
|
|---|
| 103 | static fibril_local rcu_fibril_data_t rcu_fibril = {
|
|---|
| 104 | .nesting_cnt = 0
|
|---|
| 105 | };
|
|---|
| 106 |
|
|---|
| 107 | static rcu_data_t rcu = {
|
|---|
| 108 | .mtx = FIBRIL_MUTEX_INITIALIZER(rcu.mtx),
|
|---|
| 109 | .cur_gp = 0,
|
|---|
| 110 | .reader_group = RCU_GROUP_A,
|
|---|
| 111 | .list_futex = FUTEX_INITIALIZER,
|
|---|
| 112 | .fibrils_list = LIST_INITIALIZER(rcu.fibrils_list),
|
|---|
| 113 | };
|
|---|
| 114 |
|
|---|
| 115 |
|
|---|
| 116 | static void wait_for_readers(size_t reader_group);
|
|---|
| 117 | static void force_mb_in_all_threads(void);
|
|---|
| 118 | static bool is_preexisting_reader(const rcu_fibril_data_t *fib, size_t group);
|
|---|
| 119 |
|
|---|
| 120 | static bool is_in_group(size_t nesting_cnt, size_t group);
|
|---|
| 121 | static bool is_in_reader_section(size_t nesting_cnt);
|
|---|
| 122 | static size_t get_other_group(size_t group);
|
|---|
| 123 |
|
|---|
| 124 |
|
|---|
| 125 | /** Registers a fibril so it may start using RCU read sections.
|
|---|
| 126 | *
|
|---|
| 127 | * A fibril must be registered with rcu before it can enter RCU critical
|
|---|
| 128 | * sections delineated by rcu_read_lock() and rcu_read_unlock().
|
|---|
| 129 | */
|
|---|
| 130 | void rcu_register_fibril(void)
|
|---|
| 131 | {
|
|---|
| 132 | futex_down(&rcu.list_futex);
|
|---|
| 133 | list_append(&rcu_fibril.link, &rcu.fibrils_list);
|
|---|
| 134 | futex_up(&rcu.list_futex);
|
|---|
| 135 | }
|
|---|
| 136 |
|
|---|
| 137 | /** Deregisters a fibril that had been using RCU read sections.
|
|---|
| 138 | *
|
|---|
| 139 | * A fibril must be deregistered before it exits if it had
|
|---|
| 140 | * been registered with rcu via rcu_register_fibril().
|
|---|
| 141 | */
|
|---|
| 142 | void rcu_deregister_fibril(void)
|
|---|
| 143 | {
|
|---|
| 144 | /*
|
|---|
| 145 | * Forcefully unlock any reader sections. The fibril is exiting
|
|---|
| 146 | * so it is not holding any references to data protected by the
|
|---|
| 147 | * rcu section. Therefore, it is safe to unlock. Otherwise,
|
|---|
| 148 | * rcu_synchronize() would wait indefinitely.
|
|---|
| 149 | */
|
|---|
| 150 | memory_barrier();
|
|---|
| 151 | rcu_fibril.nesting_cnt = 0;
|
|---|
| 152 |
|
|---|
| 153 | futex_down(&rcu.list_futex);
|
|---|
| 154 | list_remove(&rcu_fibril.link);
|
|---|
| 155 | futex_up(&rcu.list_futex);
|
|---|
| 156 | }
|
|---|
| 157 |
|
|---|
| 158 | /** Delimits the start of an RCU reader critical section.
|
|---|
| 159 | *
|
|---|
| 160 | * RCU reader sections may be nested.
|
|---|
| 161 | */
|
|---|
| 162 | void rcu_read_lock(void)
|
|---|
| 163 | {
|
|---|
| 164 | size_t nesting_cnt = ACCESS_ONCE(rcu_fibril.nesting_cnt);
|
|---|
| 165 |
|
|---|
| 166 | if (0 == (nesting_cnt >> RCU_NESTING_SHIFT)) {
|
|---|
| 167 | ACCESS_ONCE(rcu_fibril.nesting_cnt) = ACCESS_ONCE(rcu.reader_group);
|
|---|
| 168 | /* Required by MB_FORCE_L */
|
|---|
| 169 | compiler_barrier(); /* CC_BAR_L */
|
|---|
| 170 | } else {
|
|---|
| 171 | ACCESS_ONCE(rcu_fibril.nesting_cnt) = nesting_cnt + RCU_NESTING_INC;
|
|---|
| 172 | }
|
|---|
| 173 | }
|
|---|
| 174 |
|
|---|
| 175 | /** Delimits the start of an RCU reader critical section. */
|
|---|
| 176 | void rcu_read_unlock(void)
|
|---|
| 177 | {
|
|---|
| 178 | /* Required by MB_FORCE_U */
|
|---|
| 179 | compiler_barrier(); /* CC_BAR_U */
|
|---|
| 180 | /* todo: ACCESS_ONCE(nesting_cnt) ? */
|
|---|
| 181 | rcu_fibril.nesting_cnt -= RCU_NESTING_INC;
|
|---|
| 182 | }
|
|---|
| 183 |
|
|---|
| 184 | /** Blocks until all preexisting readers exit their critical sections. */
|
|---|
| 185 | void rcu_synchronize(void)
|
|---|
| 186 | {
|
|---|
| 187 | /* Contain load of rcu.cur_gp. */
|
|---|
| 188 | memory_barrier();
|
|---|
| 189 |
|
|---|
| 190 | /* Approximately the number of the GP in progress. */
|
|---|
| 191 | size_t gp_in_progress = ACCESS_ONCE(rcu.cur_gp);
|
|---|
| 192 |
|
|---|
| 193 | /* todo: early exit for batched sync()s */
|
|---|
| 194 | fibril_mutex_lock(&rcu.mtx);
|
|---|
| 195 |
|
|---|
| 196 | /*
|
|---|
| 197 | * Exit early if we were stuck waiting for the mutex for a full grace
|
|---|
| 198 | * period. Started waiting during gp_in_progress (or gp_in_progress + 1
|
|---|
| 199 | * if the value propagated to this cpu too late) so wait for the next
|
|---|
| 200 | * full GP, gp_in_progress + 1, to finish. Ie don't wait if the GP
|
|---|
| 201 | * after that, gp_in_progress + 2, already started.
|
|---|
| 202 | */
|
|---|
| 203 | if (rcu.cur_gp + 2 >= gp_in_progress) {
|
|---|
| 204 | fibril_mutex_unlock(&rcu.mtx);
|
|---|
| 205 | return;
|
|---|
| 206 | }
|
|---|
| 207 |
|
|---|
| 208 | ++ACCESS_ONCE(rcu.cur_gp);
|
|---|
| 209 |
|
|---|
| 210 | /*
|
|---|
| 211 | * Pairs up with MB_FORCE_L (ie CC_BAR_L). Makes changes prior
|
|---|
| 212 | * to rcu_synchronize() visible to new readers.
|
|---|
| 213 | */
|
|---|
| 214 | memory_barrier(); /* MB_A */
|
|---|
| 215 |
|
|---|
| 216 | /*
|
|---|
| 217 | * Pairs up with MB_A.
|
|---|
| 218 | *
|
|---|
| 219 | * If the memory barrier is issued before CC_BAR_L in the target
|
|---|
| 220 | * thread, it pairs up with MB_A and the thread sees all changes
|
|---|
| 221 | * prior to rcu_synchronize(). Ie any reader sections are new
|
|---|
| 222 | * rcu readers.
|
|---|
| 223 | *
|
|---|
| 224 | * If the memory barrier is issued after CC_BAR_L, it pairs up
|
|---|
| 225 | * with MB_B and it will make the most recent nesting_cnt visible
|
|---|
| 226 | * in this thread. Since the reader may have already accessed
|
|---|
| 227 | * memory protected by RCU (it ran instructions passed CC_BAR_L),
|
|---|
| 228 | * it is a preexisting reader. Seeing the most recent nesting_cnt
|
|---|
| 229 | * ensures the thread will be identified as a preexisting reader
|
|---|
| 230 | * and we will wait for it in wait_for_readers(old_reader_group).
|
|---|
| 231 | */
|
|---|
| 232 | force_mb_in_all_threads(); /* MB_FORCE_L */
|
|---|
| 233 |
|
|---|
| 234 | /*
|
|---|
| 235 | * Pairs with MB_FORCE_L (ie CC_BAR_L, CC_BAR_U) and makes the most
|
|---|
| 236 | * current fibril.nesting_cnt visible to this cpu.
|
|---|
| 237 | */
|
|---|
| 238 | read_barrier(); /* MB_B */
|
|---|
| 239 |
|
|---|
| 240 | size_t new_reader_group = get_other_group(rcu.reader_group);
|
|---|
| 241 | wait_for_readers(new_reader_group);
|
|---|
| 242 |
|
|---|
| 243 | /* Separates waiting for readers in new_reader_group from group flip. */
|
|---|
| 244 | memory_barrier();
|
|---|
| 245 |
|
|---|
| 246 | /* Flip the group new readers should associate with. */
|
|---|
| 247 | size_t old_reader_group = rcu.reader_group;
|
|---|
| 248 | rcu.reader_group = new_reader_group;
|
|---|
| 249 |
|
|---|
| 250 | /* Flip the group before waiting for preexisting readers in the old group.*/
|
|---|
| 251 | memory_barrier();
|
|---|
| 252 |
|
|---|
| 253 | wait_for_readers(old_reader_group);
|
|---|
| 254 |
|
|---|
| 255 | /* MB_FORCE_U */
|
|---|
| 256 | force_mb_in_all_threads(); /* MB_FORCE_U */
|
|---|
| 257 |
|
|---|
| 258 | fibril_mutex_unlock(&rcu.mtx);
|
|---|
| 259 | }
|
|---|
| 260 |
|
|---|
| 261 | /** Issues a memory barrier in each thread of this process. */
|
|---|
| 262 | static void force_mb_in_all_threads(void)
|
|---|
| 263 | {
|
|---|
| 264 | /*
|
|---|
| 265 | * Only issue barriers in running threads. The scheduler will
|
|---|
| 266 | * execute additional memory barriers when switching to threads
|
|---|
| 267 | * of the process that are currently not running.
|
|---|
| 268 | */
|
|---|
| 269 | smp_memory_barrier();
|
|---|
| 270 | }
|
|---|
| 271 |
|
|---|
| 272 | /** Waits for readers of reader_group to exit their readers sections. */
|
|---|
| 273 | static void wait_for_readers(size_t reader_group)
|
|---|
| 274 | {
|
|---|
| 275 | futex_down(&rcu.list_futex);
|
|---|
| 276 |
|
|---|
| 277 | list_t quiescent_fibrils;
|
|---|
| 278 | list_initialize(&quiescent_fibrils);
|
|---|
| 279 |
|
|---|
| 280 | while (!list_empty(&rcu.fibrils_list)) {
|
|---|
| 281 | list_foreach_safe(rcu.fibrils_list, fibril_it, next_fibril) {
|
|---|
| 282 | rcu_fibril_data_t *fib = member_to_inst(fibril_it,
|
|---|
| 283 | rcu_fibril_data_t, link);
|
|---|
| 284 |
|
|---|
| 285 | if (is_preexisting_reader(fib, reader_group)) {
|
|---|
| 286 | futex_up(&rcu.list_futex);
|
|---|
| 287 | async_usleep(RCU_SLEEP_MS * 1000);
|
|---|
| 288 | futex_down(&rcu.list_futex);
|
|---|
| 289 | break;
|
|---|
| 290 | } else {
|
|---|
| 291 | list_remove(fibril_it);
|
|---|
| 292 | list_append(fibril_it, &quiescent_fibrils);
|
|---|
| 293 | }
|
|---|
| 294 | }
|
|---|
| 295 | }
|
|---|
| 296 |
|
|---|
| 297 | list_concat(&rcu.fibrils_list, &quiescent_fibrils);
|
|---|
| 298 | futex_up(&rcu.list_futex);
|
|---|
| 299 | }
|
|---|
| 300 |
|
|---|
| 301 | static bool is_preexisting_reader(const rcu_fibril_data_t *fib, size_t group)
|
|---|
| 302 | {
|
|---|
| 303 | size_t nesting_cnt = ACCESS_ONCE(fib->nesting_cnt);
|
|---|
| 304 |
|
|---|
| 305 | return is_in_group(nesting_cnt, group) && is_in_reader_section(nesting_cnt);
|
|---|
| 306 | }
|
|---|
| 307 |
|
|---|
| 308 | static size_t get_other_group(size_t group)
|
|---|
| 309 | {
|
|---|
| 310 | if (group == RCU_GROUP_A)
|
|---|
| 311 | return RCU_GROUP_B;
|
|---|
| 312 | else
|
|---|
| 313 | return RCU_GROUP_A;
|
|---|
| 314 | }
|
|---|
| 315 |
|
|---|
| 316 | static bool is_in_reader_section(size_t nesting_cnt)
|
|---|
| 317 | {
|
|---|
| 318 | return RCU_NESTING_INC <= nesting_cnt;
|
|---|
| 319 | }
|
|---|
| 320 |
|
|---|
| 321 | static bool is_in_group(size_t nesting_cnt, size_t group)
|
|---|
| 322 | {
|
|---|
| 323 | return (nesting_cnt & RCU_GROUP_BIT_MASK) == (group & RCU_GROUP_BIT_MASK);
|
|---|
| 324 | }
|
|---|
| 325 |
|
|---|
| 326 |
|
|---|
| 327 |
|
|---|
| 328 | /** @}
|
|---|
| 329 | */
|
|---|