Index: HelenOS.config
===================================================================
--- HelenOS.config	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ HelenOS.config	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -399,4 +399,9 @@
 ! [COMPILER=gcc_cross|COMPILER=gcc_native] CONFIG_LTO (n/y)
 
+% Kernel RCU algorithm
+@ "PREEMPT_PODZIMEK" Preemptible Podzimek-RCU
+@ "PREEMPT_A" Preemptible A-RCU
+! RCU (choice)
+
 
 ## Hardware support
Index: abi/include/syscall.h
===================================================================
--- abi/include/syscall.h	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ abi/include/syscall.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -55,4 +55,5 @@
 	SYS_FUTEX_WAKEUP,
 	SYS_SMC_COHERENCE,
+	SYS_SMP_MEMORY_BARRIER,
 	
 	SYS_AS_AREA_CREATE,
Index: boot/Makefile.common
===================================================================
--- boot/Makefile.common	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ boot/Makefile.common	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -174,4 +174,5 @@
 	$(USPACE_PATH)/app/mkexfat/mkexfat \
 	$(USPACE_PATH)/app/mkmfs/mkmfs \
+	$(USPACE_PATH)/app/rcutest/rcutest \
 	$(USPACE_PATH)/app/sbi/sbi \
 	$(USPACE_PATH)/app/sportdmp/sportdmp \
Index: defaults/amd64/Makefile.config
===================================================================
--- defaults/amd64/Makefile.config	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ defaults/amd64/Makefile.config	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -38,4 +38,7 @@
 CONFIG_TEST = y
 
+# Kernel RCU implementation
+RCU = PREEMPT_A
+
 # Input device class
 CONFIG_HID_IN = generic
Index: defaults/arm32/Makefile.config
===================================================================
--- defaults/arm32/Makefile.config	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ defaults/arm32/Makefile.config	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -29,4 +29,7 @@
 CONFIG_TEST = y
 
+# Kernel RCU implementation
+RCU = PREEMPT_A
+
 # What is your input device?
 CONFIG_HID_IN = generic
Index: defaults/ia32/Makefile.config
===================================================================
--- defaults/ia32/Makefile.config	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ defaults/ia32/Makefile.config	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -44,4 +44,7 @@
 CONFIG_TEST = y
 
+# Kernel RCU implementation
+RCU = PREEMPT_A
+
 # Input device class
 CONFIG_HID_IN = generic
Index: defaults/ia64/Makefile.config
===================================================================
--- defaults/ia64/Makefile.config	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ defaults/ia64/Makefile.config	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -41,4 +41,7 @@
 CONFIG_TEST = y
 
+# Kernel RCU implementation
+RCU = PREEMPT_A
+
 # Input device class
 CONFIG_HID_IN = generic
Index: defaults/mips32/Makefile.config
===================================================================
--- defaults/mips32/Makefile.config	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ defaults/mips32/Makefile.config	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -35,4 +35,7 @@
 CONFIG_TEST = y
 
+# Kernel RCU implementation
+RCU = PREEMPT_A
+
 # Input device class
 CONFIG_HID_IN = generic
Index: defaults/mips64/Makefile.config
===================================================================
--- defaults/mips64/Makefile.config	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ defaults/mips64/Makefile.config	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -35,4 +35,7 @@
 CONFIG_TEST = y
 
+# Kernel RCU implementation
+RCU = PREEMPT_A
+
 # Input device class
 CONFIG_HID_IN = generic
Index: defaults/ppc32/Makefile.config
===================================================================
--- defaults/ppc32/Makefile.config	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ defaults/ppc32/Makefile.config	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -29,4 +29,7 @@
 CONFIG_TEST = y
 
+# Kernel RCU implementation
+RCU = PREEMPT_A
+
 # Input device class
 CONFIG_HID_IN = generic
Index: defaults/sparc64/Makefile.config
===================================================================
--- defaults/sparc64/Makefile.config	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ defaults/sparc64/Makefile.config	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -44,4 +44,7 @@
 CONFIG_TEST = y
 
+# Kernel RCU implementation
+RCU = PREEMPT_A
+
 # Input device class
 CONFIG_HID_IN = generic
Index: defaults/special/Makefile.config
===================================================================
--- defaults/special/Makefile.config	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ defaults/special/Makefile.config	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -29,4 +29,7 @@
 CONFIG_TEST = y
 
+# Kernel RCU implementation
+RCU = PREEMPT_A
+
 # Load disk drivers on startup
 CONFIG_START_BD = n
Index: kernel/Makefile
===================================================================
--- kernel/Makefile	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/Makefile	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -192,4 +192,5 @@
 	generic/src/adt/bitmap.c \
 	generic/src/adt/btree.c \
+	generic/src/adt/cht.c \
 	generic/src/adt/hash_table.c \
 	generic/src/adt/list.c \
@@ -198,4 +199,5 @@
 	generic/src/console/prompt.c \
 	generic/src/cpu/cpu.c \
+	generic/src/cpu/cpu_mask.c \
 	generic/src/ddi/ddi.c \
 	generic/src/ddi/irq.c \
@@ -251,8 +253,12 @@
 	generic/src/synch/semaphore.c \
 	generic/src/synch/smc.c \
+	generic/src/synch/smp_memory_barrier.c \
 	generic/src/synch/waitq.c \
 	generic/src/synch/futex.c \
+	generic/src/synch/workqueue.c \
+	generic/src/synch/rcu.c \
 	generic/src/smp/ipi.c \
 	generic/src/smp/smp.c \
+	generic/src/smp/smp_call.c \
 	generic/src/ipc/ipc.c \
 	generic/src/ipc/sysipc.c \
@@ -304,4 +310,5 @@
 		test/atomic/atomic1.c \
 		test/btree/btree1.c \
+		test/cht/cht1.c \
 		test/avltree/avltree1.c \
 		test/fault/fault1.c \
@@ -313,4 +320,7 @@
 		test/synch/semaphore1.c \
 		test/synch/semaphore2.c \
+		test/synch/workqueue2.c \
+		test/synch/workqueue3.c \
+		test/synch/rcu1.c \
 		test/print/print1.c \
 		test/print/print2.c \
@@ -318,5 +328,6 @@
 		test/print/print4.c \
 		test/print/print5.c \
-		test/thread/thread1.c
+		test/thread/thread1.c \
+		test/smpcall/smpcall1.c
 	
 	ifeq ($(KARCH),mips32)
Index: kernel/arch/amd64/Makefile.inc
===================================================================
--- kernel/arch/amd64/Makefile.inc	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/arch/amd64/Makefile.inc	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -85,4 +85,5 @@
 		arch/$(KARCH)/src/smp/ipi.c \
 		arch/$(KARCH)/src/smp/mps.c \
+		arch/$(KARCH)/src/smp/smp_call.c \
 		arch/$(KARCH)/src/smp/smp.c
 endif
Index: kernel/arch/amd64/include/atomic.h
===================================================================
--- kernel/arch/amd64/include/atomic.h	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/arch/amd64/include/atomic.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -1,4 +1,5 @@
 /*
  * Copyright (c) 2001-2004 Jakub Jermar
+ * Copyright (c) 2012      Adam Hraska
  * All rights reserved.
  *
@@ -140,6 +141,124 @@
 }
 
+
+#define _atomic_cas_impl(pptr, exp_val, new_val, old_val, prefix) \
+({ \
+	switch (sizeof(typeof(*(pptr)))) { \
+	case 1: \
+		asm volatile ( \
+			prefix " cmpxchgb %[newval], %[ptr]\n" \
+			: /* Output operands. */ \
+			/* Old/current value is returned in eax. */ \
+			[oldval] "=a" (old_val), \
+			/* (*ptr) will be read and written to, hence "+" */ \
+			[ptr] "+m" (*pptr) \
+			: /* Input operands. */ \
+			/* Expected value must be in eax. */ \
+			[expval] "a" (exp_val), \
+			/* The new value may be in any register. */ \
+			[newval] "r" (new_val) \
+			: "memory" \
+		); \
+		break; \
+	case 2: \
+		asm volatile ( \
+			prefix " cmpxchgw %[newval], %[ptr]\n" \
+			: /* Output operands. */ \
+			/* Old/current value is returned in eax. */ \
+			[oldval] "=a" (old_val), \
+			/* (*ptr) will be read and written to, hence "+" */ \
+			[ptr] "+m" (*pptr) \
+			: /* Input operands. */ \
+			/* Expected value must be in eax. */ \
+			[expval] "a" (exp_val), \
+			/* The new value may be in any register. */ \
+			[newval] "r" (new_val) \
+			: "memory" \
+		); \
+		break; \
+	case 4: \
+		asm volatile ( \
+			prefix " cmpxchgl %[newval], %[ptr]\n" \
+			: /* Output operands. */ \
+			/* Old/current value is returned in eax. */ \
+			[oldval] "=a" (old_val), \
+			/* (*ptr) will be read and written to, hence "+" */ \
+			[ptr] "+m" (*pptr) \
+			: /* Input operands. */ \
+			/* Expected value must be in eax. */ \
+			[expval] "a" (exp_val), \
+			/* The new value may be in any register. */ \
+			[newval] "r" (new_val) \
+			: "memory" \
+		); \
+		break; \
+	case 8: \
+		asm volatile ( \
+			prefix " cmpxchgq %[newval], %[ptr]\n" \
+			: /* Output operands. */ \
+			/* Old/current value is returned in eax. */ \
+			[oldval] "=a" (old_val), \
+			/* (*ptr) will be read and written to, hence "+" */ \
+			[ptr] "+m" (*pptr) \
+			: /* Input operands. */ \
+			/* Expected value must be in eax. */ \
+			[expval] "a" (exp_val), \
+			/* The new value may be in any register. */ \
+			[newval] "r" (new_val) \
+			: "memory" \
+		); \
+		break; \
+	} \
+})
+
+
+#ifndef local_atomic_cas
+
+#define local_atomic_cas(pptr, exp_val, new_val) \
+({ \
+	/* Use proper types and avoid name clashes */ \
+	typeof(*(pptr)) _old_val_cas; \
+	typeof(*(pptr)) _exp_val_cas = exp_val; \
+	typeof(*(pptr)) _new_val_cas = new_val; \
+	_atomic_cas_impl(pptr, _exp_val_cas, _new_val_cas, _old_val_cas, ""); \
+	\
+	_old_val_cas; \
+})
+
+#else
+/* Check that arch/atomic.h did not accidentally include the generic atomic.h. */
+#error Architecture specific cpu local atomics already defined! Check your includes.
 #endif
 
+
+#ifndef local_atomic_exchange
+/* 
+ * Issuing a xchg instruction always implies lock prefix semantics.
+ * Therefore, it is cheaper to use a cmpxchg without a lock prefix 
+ * in a loop.
+ */
+#define local_atomic_exchange(pptr, new_val) \
+({ \
+	/* Use proper types and avoid name clashes */ \
+	typeof(*(pptr)) _exp_val_x; \
+	typeof(*(pptr)) _old_val_x; \
+	typeof(*(pptr)) _new_val_x = new_val; \
+	\
+	do { \
+		_exp_val_x = *pptr; \
+		_old_val_x = local_atomic_cas(pptr, _exp_val_x, _new_val_x); \
+	} while (_old_val_x != _exp_val_x); \
+	\
+	_old_val_x; \
+})
+
+#else
+/* Check that arch/atomic.h did not accidentally include the generic atomic.h. */
+#error Architecture specific cpu local atomics already defined! Check your includes.
+#endif
+
+
+#endif
+
 /** @}
  */
Index: kernel/arch/amd64/include/cpu.h
===================================================================
--- kernel/arch/amd64/include/cpu.h	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/arch/amd64/include/cpu.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -73,4 +73,6 @@
 	tss_t *tss;
 	
+	unsigned int id; /**< CPU's local, i.e. physical, APIC ID. */
+	
 	size_t iomapver_copy;  /** Copy of TASK's I/O Permission bitmap generation count. */
 } cpu_arch_t;
Index: kernel/arch/amd64/include/interrupt.h
===================================================================
--- kernel/arch/amd64/include/interrupt.h	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/arch/amd64/include/interrupt.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -69,4 +69,5 @@
 #define VECTOR_TLB_SHOOTDOWN_IPI  (IVT_FREEBASE + 1)
 #define VECTOR_DEBUG_IPI          (IVT_FREEBASE + 2)
+#define VECTOR_SMP_CALL_IPI       (IVT_FREEBASE + 3)
 
 extern void (* disable_irqs_function)(uint16_t);
Index: kernel/arch/amd64/src/amd64.c
===================================================================
--- kernel/arch/amd64/src/amd64.c	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/arch/amd64/src/amd64.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -171,5 +171,5 @@
 }
 
-void arch_post_cpu_init()
+void arch_post_cpu_init(void)
 {
 #ifdef CONFIG_SMP
Index: kernel/arch/amd64/src/cpu/cpu.c
===================================================================
--- kernel/arch/amd64/src/cpu/cpu.c	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/arch/amd64/src/cpu/cpu.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -158,7 +158,7 @@
 void cpu_print_report(cpu_t* m)
 {
-	printf("cpu%d: (%s family=%d model=%d stepping=%d) %dMHz\n",
+	printf("cpu%d: (%s family=%d model=%d stepping=%d apicid=%u) %dMHz\n",
 	    m->id, vendor_str[m->arch.vendor], m->arch.family, m->arch.model,
-	    m->arch.stepping, m->frequency_mhz);
+	    m->arch.stepping, m->arch.id, m->frequency_mhz);
 }
 
Index: kernel/arch/amd64/src/interrupt.c
===================================================================
--- kernel/arch/amd64/src/interrupt.c	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/arch/amd64/src/interrupt.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -54,4 +54,5 @@
 #include <symtab.h>
 #include <stacktrace.h>
+#include <smp/smp_call.h>
 
 /*
@@ -161,4 +162,10 @@
 	tlb_shootdown_ipi_recv();
 }
+
+static void arch_smp_call_ipi_recv(unsigned int n, istate_t *istate)
+{
+	trap_virtual_eoi();
+	smp_call_ipi_recv();
+}
 #endif
 
@@ -222,4 +229,6 @@
 	exc_register(VECTOR_TLB_SHOOTDOWN_IPI, "tlb_shootdown", true,
 	    (iroutine_t) tlb_shootdown_ipi);
+	exc_register(VECTOR_SMP_CALL_IPI, "smp_call", true, 
+		(iroutine_t) arch_smp_call_ipi_recv);
 #endif
 }
Index: kernel/arch/amd64/src/smp/smp_call.c
===================================================================
--- kernel/arch/amd64/src/smp/smp_call.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
+++ kernel/arch/amd64/src/smp/smp_call.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -0,0 +1,1 @@
+../../../ia32/src/smp/smp_call.c
Index: kernel/arch/arm32/Makefile.inc
===================================================================
--- kernel/arch/arm32/Makefile.inc	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/arch/arm32/Makefile.inc	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -60,4 +60,5 @@
 	arch/$(KARCH)/src/mm/tlb.c \
 	arch/$(KARCH)/src/mm/page_fault.c \
+	arch/$(KARCH)/src/atomic.c \
 	arch/$(KARCH)/src/ras.c
 
Index: kernel/arch/arm32/src/atomic.c
===================================================================
--- kernel/arch/arm32/src/atomic.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
+++ kernel/arch/arm32/src/atomic.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup arm32
+ * @{
+ */
+/** @file
+ *  @brief Atomic operations emulation.
+ */
+
+#include <synch/spinlock.h>
+
+
+IRQ_SPINLOCK_STATIC_INITIALIZE_NAME(cas_lock, "arm-cas-lock");
+
+/** Implements GCC's missing compare-and-swap intrinsic for ARM.
+ *
+ * Sets \a *ptr to \a new_val if it is equal to \a expected. In any case,
+ * returns the previous value of \a *ptr.
+ */
+void * __sync_val_compare_and_swap_4(void **ptr, void *expected, void *new_val)
+{
+	/* 
+	 * Using an interrupt disabling spinlock might still lead to deadlock
+	 * if CAS() is used in an exception handler. Eg. if a CAS() results
+	 * in a page fault exception and the exception handler again tries
+	 * to invoke CAS() (even for a different memory location), the spinlock
+	 * would deadlock.
+	 */
+	irq_spinlock_lock(&cas_lock, true);
+	
+	void * cur_val = *ptr;
+	
+	if (cur_val == expected) {
+		*ptr = new_val;
+	}
+	
+	irq_spinlock_unlock(&cas_lock, true);
+	
+	return cur_val;
+}
+
+
+/** @}
+ */
Index: kernel/arch/ia32/Makefile.inc
===================================================================
--- kernel/arch/ia32/Makefile.inc	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/arch/ia32/Makefile.inc	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -86,4 +86,5 @@
 	arch/$(KARCH)/src/smp/mps.c \
 	arch/$(KARCH)/src/smp/smp.c \
+	arch/$(KARCH)/src/smp/smp_call.c \
 	arch/$(KARCH)/src/atomic.S \
 	arch/$(KARCH)/src/smp/ipi.c \
Index: kernel/arch/ia32/include/atomic.h
===================================================================
--- kernel/arch/ia32/include/atomic.h	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/arch/ia32/include/atomic.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -1,4 +1,5 @@
 /*
  * Copyright (c) 2001-2004 Jakub Jermar
+ * Copyright (c) 2012      Adam Hraska
  * All rights reserved.
  *
@@ -113,4 +114,5 @@
 }
 
+
 /** ia32 specific fast spinlock */
 NO_TRACE static inline void atomic_lock_arch(atomic_t *val)
@@ -142,4 +144,106 @@
 }
 
+
+#define _atomic_cas_impl(pptr, exp_val, new_val, old_val, prefix) \
+({ \
+	switch (sizeof(typeof(*(pptr)))) { \
+	case 1: \
+		asm volatile ( \
+			prefix " cmpxchgb %[newval], %[ptr]\n" \
+			: /* Output operands. */ \
+			/* Old/current value is returned in eax. */ \
+			[oldval] "=a" (old_val), \
+			/* (*ptr) will be read and written to, hence "+" */ \
+			[ptr] "+m" (*pptr) \
+			: /* Input operands. */ \
+			/* Expected value must be in eax. */ \
+			[expval] "a" (exp_val), \
+			/* The new value may be in any register. */ \
+			[newval] "r" (new_val) \
+			: "memory" \
+		); \
+		break; \
+	case 2: \
+		asm volatile ( \
+			prefix " cmpxchgw %[newval], %[ptr]\n" \
+			: /* Output operands. */ \
+			/* Old/current value is returned in eax. */ \
+			[oldval] "=a" (old_val), \
+			/* (*ptr) will be read and written to, hence "+" */ \
+			[ptr] "+m" (*pptr) \
+			: /* Input operands. */ \
+			/* Expected value must be in eax. */ \
+			[expval] "a" (exp_val), \
+			/* The new value may be in any register. */ \
+			[newval] "r" (new_val) \
+			: "memory" \
+		); \
+		break; \
+	case 4: \
+		asm volatile ( \
+			prefix " cmpxchgl %[newval], %[ptr]\n" \
+			: /* Output operands. */ \
+			/* Old/current value is returned in eax. */ \
+			[oldval] "=a" (old_val), \
+			/* (*ptr) will be read and written to, hence "+" */ \
+			[ptr] "+m" (*pptr) \
+			: /* Input operands. */ \
+			/* Expected value must be in eax. */ \
+			[expval] "a" (exp_val), \
+			/* The new value may be in any register. */ \
+			[newval] "r" (new_val) \
+			: "memory" \
+		); \
+		break; \
+	} \
+})
+
+
+#ifndef local_atomic_cas
+
+#define local_atomic_cas(pptr, exp_val, new_val) \
+({ \
+	/* Use proper types and avoid name clashes */ \
+	typeof(*(pptr)) _old_val_cas; \
+	typeof(*(pptr)) _exp_val_cas = exp_val; \
+	typeof(*(pptr)) _new_val_cas = new_val; \
+	_atomic_cas_impl(pptr, _exp_val_cas, _new_val_cas, _old_val_cas, ""); \
+	\
+	_old_val_cas; \
+})
+
+#else
+/* Check that arch/atomic.h did not accidentally include the generic atomic.h. */
+#error Architecture specific cpu local atomics already defined! Check your includes.
+#endif
+
+
+#ifndef local_atomic_exchange
+/* 
+ * Issuing a xchg instruction always implies lock prefix semantics.
+ * Therefore, it is cheaper to use a cmpxchg without a lock prefix 
+ * in a loop.
+ */
+#define local_atomic_exchange(pptr, new_val) \
+({ \
+	/* Use proper types and avoid name clashes */ \
+	typeof(*(pptr)) _exp_val_x; \
+	typeof(*(pptr)) _old_val_x; \
+	typeof(*(pptr)) _new_val_x = new_val; \
+	\
+	do { \
+		_exp_val_x = *pptr; \
+		_old_val_x = local_atomic_cas(pptr, _exp_val_x, _new_val_x); \
+	} while (_old_val_x != _exp_val_x); \
+	\
+	_old_val_x; \
+})
+
+#else
+/* Check that arch/atomic.h did not accidentally include the generic atomic.h. */
+#error Architecture specific cpu local atomics already defined! Check your includes.
+#endif
+
+
 #endif
 
Index: kernel/arch/ia32/include/cpu.h
===================================================================
--- kernel/arch/ia32/include/cpu.h	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/arch/ia32/include/cpu.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -60,4 +60,6 @@
 	unsigned int stepping;
 	cpuid_feature_info fi;
+	
+	unsigned int id; /**< CPU's local, i.e. physical, APIC ID. */
 
 	tss_t *tss;
Index: kernel/arch/ia32/include/interrupt.h
===================================================================
--- kernel/arch/ia32/include/interrupt.h	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/arch/ia32/include/interrupt.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -69,4 +69,5 @@
 #define VECTOR_TLB_SHOOTDOWN_IPI  (IVT_FREEBASE + 1)
 #define VECTOR_DEBUG_IPI          (IVT_FREEBASE + 2)
+#define VECTOR_SMP_CALL_IPI       (IVT_FREEBASE + 3)
 
 extern void (* disable_irqs_function)(uint16_t);
Index: kernel/arch/ia32/include/smp/apic.h
===================================================================
--- kernel/arch/ia32/include/smp/apic.h	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/arch/ia32/include/smp/apic.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -353,4 +353,5 @@
 extern void l_apic_init(void);
 extern void l_apic_eoi(void);
+extern int l_apic_send_custom_ipi(uint8_t, uint8_t);
 extern int l_apic_broadcast_custom_ipi(uint8_t);
 extern int l_apic_send_init_ipi(uint8_t);
Index: kernel/arch/ia32/src/cpu/cpu.c
===================================================================
--- kernel/arch/ia32/src/cpu/cpu.c	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/arch/ia32/src/cpu/cpu.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -160,7 +160,7 @@
 void cpu_print_report(cpu_t* cpu)
 {
-	printf("cpu%u: (%s family=%u model=%u stepping=%u) %" PRIu16 " MHz\n",
-		cpu->id, vendor_str[cpu->arch.vendor], cpu->arch.family,
-		cpu->arch.model, cpu->arch.stepping, cpu->frequency_mhz);
+	printf("cpu%u: (%s family=%u model=%u stepping=%u apicid=%u) %" PRIu16 
+		" MHz\n", cpu->id, vendor_str[cpu->arch.vendor], cpu->arch.family,
+		cpu->arch.model, cpu->arch.stepping, cpu->arch.id, cpu->frequency_mhz);
 }
 
Index: kernel/arch/ia32/src/ia32.c
===================================================================
--- kernel/arch/ia32/src/ia32.c	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/arch/ia32/src/ia32.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -125,5 +125,5 @@
 }
 
-void arch_post_cpu_init()
+void arch_post_cpu_init(void)
 {
 #ifdef CONFIG_SMP
Index: kernel/arch/ia32/src/interrupt.c
===================================================================
--- kernel/arch/ia32/src/interrupt.c	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/arch/ia32/src/interrupt.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -54,4 +54,6 @@
 #include <symtab.h>
 #include <stacktrace.h>
+#include <smp/smp_call.h>
+#include <proc/task.h>
 
 /*
@@ -170,4 +172,10 @@
 	tlb_shootdown_ipi_recv();
 }
+
+static void arch_smp_call_ipi_recv(unsigned int n, istate_t *istate)
+{
+	trap_virtual_eoi();
+	smp_call_ipi_recv();
+}
 #endif
 
@@ -230,4 +238,6 @@
 	exc_register(VECTOR_TLB_SHOOTDOWN_IPI, "tlb_shootdown", true,
 	    (iroutine_t) tlb_shootdown_ipi);
+	exc_register(VECTOR_SMP_CALL_IPI, "smp_call", true,
+	    (iroutine_t) arch_smp_call_ipi_recv);
 #endif
 }
Index: kernel/arch/ia32/src/smp/apic.c
===================================================================
--- kernel/arch/ia32/src/smp/apic.c	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/arch/ia32/src/smp/apic.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -259,22 +259,44 @@
 }
 
-#define DELIVS_PENDING_SILENT_RETRIES	4	
-
+/* Waits for the destination cpu to accept the previous ipi. */
 static void l_apic_wait_for_delivery(void)
 {
 	icr_t icr;
-	unsigned retries = 0;
-
+	
 	do {
-		if (retries++ > DELIVS_PENDING_SILENT_RETRIES) {
-			retries = 0;
-#ifdef CONFIG_DEBUG
-			printf("IPI is pending.\n");
-#endif
-			delay(20);
-		}
 		icr.lo = l_apic[ICRlo];
-	} while (icr.delivs == DELIVS_PENDING);
-	
+	} while (icr.delivs != DELIVS_IDLE);
+}
+
+/** Send one CPU an IPI vector.
+ *
+ * @param apicid Physical APIC ID of the destination CPU.
+ * @param vector Interrupt vector to be sent.
+ *
+ * @return 0 on failure, 1 on success.
+ */
+int l_apic_send_custom_ipi(uint8_t apicid, uint8_t vector)
+{
+	icr_t icr;
+
+	/* Wait for a destination cpu to accept our previous ipi. */
+	l_apic_wait_for_delivery();
+	
+	icr.lo = l_apic[ICRlo];
+	icr.hi = l_apic[ICRhi];
+	
+	icr.delmod = DELMOD_FIXED;
+	icr.destmod = DESTMOD_PHYS;
+	icr.level = LEVEL_ASSERT;
+	icr.shorthand = SHORTHAND_NONE;
+	icr.trigger_mode = TRIGMOD_LEVEL;
+	icr.vector = vector;
+	icr.dest = apicid;
+
+	/* Send the IPI by writing to l_apic[ICRlo]. */
+	l_apic[ICRhi] = icr.hi;
+	l_apic[ICRlo] = icr.lo;
+	
+	return apic_poll_errors();
 }
 
@@ -289,4 +311,7 @@
 {
 	icr_t icr;
+
+	/* Wait for a destination cpu to accept our previous ipi. */
+	l_apic_wait_for_delivery();
 	
 	icr.lo = l_apic[ICRlo];
@@ -299,6 +324,4 @@
 	
 	l_apic[ICRlo] = icr.lo;
-
-	l_apic_wait_for_delivery();
 	
 	return apic_poll_errors();
Index: kernel/arch/ia32/src/smp/smp.c
===================================================================
--- kernel/arch/ia32/src/smp/smp.c	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/arch/ia32/src/smp/smp.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -55,4 +55,5 @@
 #include <memstr.h>
 #include <arch/drivers/i8259.h>
+#include <cpu.h>
 
 #ifdef CONFIG_SMP
@@ -77,4 +78,14 @@
 		io_apic = (uint32_t *) km_map((uintptr_t) io_apic, PAGE_SIZE,
 		    PAGE_WRITE | PAGE_NOT_CACHEABLE);
+	}
+}
+
+static void cpu_arch_id_init(void)
+{
+	ASSERT(ops != NULL);
+	ASSERT(cpus != NULL);
+	
+	for (unsigned int i = 0; i < config.cpu_count; ++i) {
+		cpus[i].arch.id = ops->cpu_apic_id(i);
 	}
 }
@@ -92,4 +103,10 @@
 	
 	ASSERT(ops != NULL);
+
+	/*
+	 * SMP initialized, cpus array allocated. Assign each CPU its 
+	 * physical APIC ID.
+	 */
+	cpu_arch_id_init();
 	
 	/*
Index: kernel/arch/ia32/src/smp/smp_call.c
===================================================================
--- kernel/arch/ia32/src/smp/smp_call.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
+++ kernel/arch/ia32/src/smp/smp_call.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -0,0 +1,14 @@
+#include <smp/smp_call.h>
+#include <arch/smp/apic.h>
+#include <arch/interrupt.h>
+#include <cpu.h>
+
+#ifdef CONFIG_SMP
+
+void arch_smp_call_ipi(unsigned int cpu_id)
+{
+	(void) l_apic_send_custom_ipi(cpus[cpu_id].arch.id, VECTOR_SMP_CALL_IPI);
+}
+
+#endif /* CONFIG_SMP */
+
Index: kernel/arch/ia64/Makefile.inc
===================================================================
--- kernel/arch/ia64/Makefile.inc	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/arch/ia64/Makefile.inc	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -71,4 +71,5 @@
 	arch/$(KARCH)/src/ddi/ddi.c \
 	arch/$(KARCH)/src/smp/smp.c \
+	arch/$(KARCH)/src/smp/smp_call.c \
 	arch/$(KARCH)/src/drivers/it.c
 
Index: kernel/arch/ia64/src/smp/smp_call.c
===================================================================
--- kernel/arch/ia64/src/smp/smp_call.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
+++ kernel/arch/ia64/src/smp/smp_call.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup ia64
+ * @{
+ */
+/** @file
+ */
+
+#include <smp/smp_call.h>
+#include <panic.h>
+
+#ifdef CONFIG_SMP
+
+void arch_smp_call_ipi(unsigned int cpu_id)
+{
+	panic("smp_call IPI not implemented.");
+}
+
+#endif /* CONFIG_SMP */
+
+/** @}
+ */
Index: kernel/arch/mips32/Makefile.inc
===================================================================
--- kernel/arch/mips32/Makefile.inc	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/arch/mips32/Makefile.inc	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -70,3 +70,4 @@
 	arch/$(KARCH)/src/ddi/ddi.c \
 	arch/$(KARCH)/src/smp/dorder.c \
+	arch/$(KARCH)/src/smp/smp_call.c \
 	arch/$(KARCH)/src/smp/smp.c
Index: kernel/arch/mips32/src/smp/smp_call.c
===================================================================
--- kernel/arch/mips32/src/smp/smp_call.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
+++ kernel/arch/mips32/src/smp/smp_call.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup mips32
+ * @{
+ */
+/** @file
+ */
+
+#include <smp/smp_call.h>
+#include <panic.h>
+
+#ifdef CONFIG_SMP
+
+void arch_smp_call_ipi(unsigned int cpu_id)
+{
+	panic("smp_call IPI not implemented.");
+}
+
+#endif /* CONFIG_SMP */
+
+/** @}
+ */
Index: kernel/arch/mips64/Makefile.inc
===================================================================
--- kernel/arch/mips64/Makefile.inc	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/arch/mips64/Makefile.inc	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -62,3 +62,4 @@
 	arch/$(KARCH)/src/ddi/ddi.c \
 	arch/$(KARCH)/src/smp/dorder.c \
+	arch/$(KARCH)/src/smp/smp_call.c \
 	arch/$(KARCH)/src/smp/smp.c
Index: kernel/arch/mips64/src/smp/smp_call.c
===================================================================
--- kernel/arch/mips64/src/smp/smp_call.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
+++ kernel/arch/mips64/src/smp/smp_call.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup mips64
+ * @{
+ */
+/** @file
+ */
+
+#include <smp/smp_call.h>
+#include <panic.h>
+
+#ifdef CONFIG_SMP
+
+void arch_smp_call_ipi(unsigned int cpu_id)
+{
+	panic("smp_call IPI not implemented.");
+}
+
+#endif /* CONFIG_SMP */
+
+/** @}
+ */
Index: kernel/arch/sparc64/Makefile.inc
===================================================================
--- kernel/arch/sparc64/Makefile.inc	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/arch/sparc64/Makefile.inc	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -101,4 +101,5 @@
 	ARCH_SOURCES += \
 		arch/$(KARCH)/src/smp/$(USARCH)/smp.c \
+		arch/$(KARCH)/src/smp/$(USARCH)/smp_call.c \
 		arch/$(KARCH)/src/smp/$(USARCH)/ipi.c
 endif
Index: kernel/arch/sparc64/include/barrier.h
===================================================================
--- kernel/arch/sparc64/include/barrier.h	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/arch/sparc64/include/barrier.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -37,4 +37,10 @@
 
 #include <trace.h>
+
+#ifdef KERNEL
+#include <arch/common.h>
+#else
+#include <libarch/common.h>
+#endif
 
 /*
Index: kernel/arch/sparc64/include/interrupt.h
===================================================================
--- kernel/arch/sparc64/include/interrupt.h	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/arch/sparc64/include/interrupt.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -47,5 +47,6 @@
 
 enum {
-	IPI_TLB_SHOOTDOWN = VECTOR_TLB_SHOOTDOWN_IPI
+	IPI_TLB_SHOOTDOWN = VECTOR_TLB_SHOOTDOWN_IPI,
+	IPI_SMP_CALL
 };
 
Index: kernel/arch/sparc64/include/smp/sun4u/ipi.h
===================================================================
--- kernel/arch/sparc64/include/smp/sun4u/ipi.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
+++ kernel/arch/sparc64/include/smp/sun4u/ipi.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup sparc64	
+ * @{
+ */
+/**
+ * @file
+ * @brief	IPI functions specific to Sun4U.
+ */
+
+#ifndef KERN_sparc64_sun4u_IPI_H_
+#define KERN_sparc64_sun4u_IPI_H_
+
+extern void ipi_unicast_arch(unsigned int, int);
+
+#endif
+
+/** @}
+ */
Index: kernel/arch/sparc64/src/debug/stacktrace.c
===================================================================
--- kernel/arch/sparc64/src/debug/stacktrace.c	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/arch/sparc64/src/debug/stacktrace.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -36,4 +36,5 @@
 #include <syscall/copy.h>
 #include <typedefs.h>
+#include <proc/thread.h>
 
 #include <arch.h>
Index: kernel/arch/sparc64/src/smp/sun4u/ipi.c
===================================================================
--- kernel/arch/sparc64/src/smp/sun4u/ipi.c	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/arch/sparc64/src/smp/sun4u/ipi.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -34,4 +34,5 @@
 
 #include <smp/ipi.h>
+#include <arch/smp/sun4u/ipi.h>
 #include <cpu.h>
 #include <arch.h>
@@ -40,4 +41,5 @@
 #include <config.h>
 #include <mm/tlb.h>
+#include <smp/smp_call.h>
 #include <arch/interrupt.h>
 #include <arch/trap/interrupt.h>
@@ -171,4 +173,26 @@
 }
 
+
+/*
+ * Deliver an IPI to the specified processors (except the current one).
+ *
+ * Interrupts must be disabled.
+ *
+ * @param cpu_id Destination cpu id (index into cpus array). Must not 
+ *               be the current cpu.
+ * @param ipi    IPI number.
+ */
+void ipi_unicast_arch(unsigned int cpu_id, int ipi)
+{
+	ASSERT(&cpus[cpu_id] != CPU);
+	
+	if (ipi == IPI_SMP_CALL) {
+		cross_call(cpus[cpu_id].arch.mid, smp_call_ipi_recv);
+	} else {
+		panic("Unknown IPI (%d).\n", ipi);
+		return;
+	}
+}
+
 /** @}
  */
Index: kernel/arch/sparc64/src/smp/sun4u/smp_call.c
===================================================================
--- kernel/arch/sparc64/src/smp/sun4u/smp_call.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
+++ kernel/arch/sparc64/src/smp/sun4u/smp_call.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup sparc64
+ * @{
+ */
+
+/**
+ * @file
+ * @brief Sun4u specific smp call support.
+ */
+
+#include <smp/smp_call.h>
+#include <arch/smp/sun4u/ipi.h>
+#include <arch/interrupt.h>
+
+void arch_smp_call_ipi(unsigned int cpu_id)
+{
+	/* 
+	 * Required by ipi_unicast_arch(). That function resolves a potential
+	 * deadlock should both the destination and source cpus be sending
+	 * unicast ipis to each other with interrupts disabled.
+	 */
+	ipl_t ipl = interrupts_disable();
+	ipi_unicast_arch(cpu_id, IPI_SMP_CALL);
+	interrupts_restore(ipl);
+}
+
+/** @}
+ */
Index: kernel/generic/include/adt/cht.h
===================================================================
--- kernel/generic/include/adt/cht.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
+++ kernel/generic/include/adt/cht.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup genericadt
+ * @{
+ */
+/** @file
+ */
+
+#ifndef KERN_CONC_HASH_TABLE_H_
+#define KERN_CONC_HASH_TABLE_H_
+
+#include <stdint.h>
+#include <adt/list.h>
+#include <synch/rcu_types.h>
+#include <macros.h>
+#include <synch/workqueue.h>
+
+typedef uintptr_t cht_ptr_t;
+
+/** Concurrent hash table node link. */
+typedef struct cht_link {
+	/* Must be placed first. 
+	 * 
+	 * The function pointer (rcu_link.func) is used to store the item's 
+	 * mixed memoized hash. If in use by RCU (ie waiting for deferred 
+	 * destruction) the hash will contain the value of 
+	 * cht_t.op->remove_callback.
+	 */
+	union {
+		rcu_item_t rcu_link;
+		size_t hash;
+	};
+	/** Link to the next item in the bucket including any marks. */
+	cht_ptr_t link;
+} cht_link_t;
+
+/** Set of operations for a concurrent hash table. */
+typedef struct cht_ops {
+	/** Returns the hash of the item.
+	 * 
+	 * Applicable also to items that were logically deleted from the table
+	 * but have yet to be physically removed by means of remove_callback().
+	 */
+	size_t (*hash)(const cht_link_t *item);
+	/** Returns the hash value of the key used to search for entries. */
+	size_t (*key_hash)(void *key);
+	/** Returns true if the two items store equal search keys. */
+	bool (*equal)(const cht_link_t *item1, const cht_link_t *item2);
+	/** Returns true if the item contains an equal search key. */
+	bool (*key_equal)(void *key, const cht_link_t *item);
+	/** Invoked to free a removed item once all references to it are dropped. */
+	void (*remove_callback)(cht_link_t *item);
+} cht_ops_t;
+
+/** Groups hash table buckets with their count.
+ * 
+ * It allows both the number of buckets as well as the bucket array
+ * to be swapped atomically when resizing the table.
+ */
+typedef struct cht_buckets {
+	/** The number of buckets is 2^order. */
+	size_t order;
+	/** Array of single linked list bucket heads along with any marks. */
+	cht_ptr_t head[1];
+} cht_buckets_t;
+
+/** Concurrent hash table structure. */
+typedef struct {
+	/** Item specific operations. */
+	cht_ops_t *op;
+	
+	/** Buckets currently in use. */
+	cht_buckets_t *b;
+	/** Resized table buckets that will replace b once resize is complete. */
+	cht_buckets_t *new_b;
+	/** Invalid memoized hash value. 
+	 * 
+	 * If cht_link.hash contains this value the item had been logically
+	 * removed and is waiting to be freed. Such hashes (and the associated
+	 * items) are disregarded and skipped or the actual hash must be 
+	 * determined via op->hash().
+	 */
+	size_t invalid_hash;
+
+	/** Minimum number of buckets is 2^min_order. */
+	size_t min_order;
+	/** Maximum number of items per bucket before the table grows. */
+	size_t max_load;
+	/** Table is resized in the background in a work queue. */
+	work_t resize_work;
+	/** If positive the table should grow or shrink.
+	 * 
+	 * If not 0 resize work had already been posted to the system work queue.
+	 */
+	atomic_t resize_reqs;
+	
+	/** Number of items in the table that have not been logically deleted. */
+	atomic_t item_cnt;
+} cht_t;
+
+#define cht_get_inst(item, type, member) \
+	member_to_inst((item), type, member)
+
+
+#define cht_read_lock()     rcu_read_lock()
+#define cht_read_unlock()   rcu_read_unlock()
+
+extern bool cht_create_simple(cht_t *h, cht_ops_t *op);
+extern bool cht_create(cht_t *h, size_t init_size, size_t min_size, 
+	size_t max_load, bool can_block, cht_ops_t *op);
+extern void cht_destroy(cht_t *h);
+extern void cht_destroy_unsafe(cht_t *h);
+
+extern cht_link_t *cht_find(cht_t *h, void *key);
+extern cht_link_t *cht_find_lazy(cht_t *h, void *key);
+extern cht_link_t *cht_find_next(cht_t *h, const cht_link_t *item);
+extern cht_link_t *cht_find_next_lazy(cht_t *h, const cht_link_t *item);
+
+extern void cht_insert(cht_t *h, cht_link_t *item);
+extern bool cht_insert_unique(cht_t *h, cht_link_t *item, cht_link_t **dup_item);
+extern size_t cht_remove_key(cht_t *h, void *key);
+extern bool cht_remove_item(cht_t *h, cht_link_t *item);
+
+#endif
+
+/** @}
+ */
Index: kernel/generic/include/adt/hash.h
===================================================================
--- kernel/generic/include/adt/hash.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
+++ kernel/generic/include/adt/hash.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup genericadt
+ * @{
+ */
+/** @file
+ */
+#ifndef KERN_HASH_H_
+#define KERN_HASH_H_
+
+#include <stdint.h>
+
+/** Produces a uniform hash affecting all output bits from the skewed input. */
+static inline uint32_t hash_mix32(uint32_t hash)
+{
+	/*
+	 * Thomas Wang's modification of Bob Jenkin's hash mixing function:
+	 * http://www.concentric.net/~Ttwang/tech/inthash.htm
+	 * Public domain.
+	 */
+	hash = ~hash + (hash << 15); 
+	hash = hash ^ (hash >> 12);
+	hash = hash + (hash << 2);
+	hash = hash ^ (hash >> 4);
+	hash = hash * 2057; 
+	hash = hash ^ (hash >> 16);
+	return hash;	
+}
+
+/** Produces a uniform hash affecting all output bits from the skewed input. */
+static inline uint64_t hash_mix64(uint64_t hash)
+{
+	/*
+	 * Thomas Wang's public domain 64-bit hash mixing function:
+	 * http://www.concentric.net/~Ttwang/tech/inthash.htm
+	 */
+	hash = (hash ^ 61) ^ (hash >> 16);
+	hash = hash + (hash << 3);
+	hash = hash ^ (hash >> 4);
+	hash = hash * 0x27d4eb2d;
+	hash = hash ^ (hash >> 15);	
+	/* 
+	 * Lower order bits are mixed more thoroughly. Swap them with
+	 * the higher order bits and make the resulting higher order bits
+	 * more usable.
+	 */
+	return (hash << 32) | (hash >> 32);
+}
+
+/** Produces a uniform hash affecting all output bits from the skewed input. */
+static inline size_t hash_mix(size_t hash) 
+{
+#ifdef __32_BITS__
+	return hash_mix32(hash);
+#elif defined(__64_BITS__)
+	return hash_mix64(hash);
+#else
+#error Unknown size_t size - cannot select proper hash mix function.
+#endif
+}
+
+/** Use to create a hash from multiple values.
+ * 
+ * Typical usage:
+ * @code
+ * int car_id;
+ * bool car_convertible;
+ * // ..
+ * size_t hash = 0;
+ * hash = hash_combine(hash, car_id);
+ * hash = hash_combine(hash, car_convertible);
+ * // Now use hash as a hash of both car_id and car_convertible.
+ * @endcode
+ */
+static inline size_t hash_combine(size_t seed, size_t hash)
+{
+	/* 
+	 * todo: use Bob Jenkin's proper mixing hash pass:
+	 * http://burtleburtle.net/bob/c/lookup3.c
+	 */
+	seed ^= hash + 0x9e3779b9 
+		+ ((seed << 5) | (seed >> (sizeof(size_t) * 8 - 5)));
+	return seed;	
+}
+
+#endif
Index: kernel/generic/include/adt/list.h
===================================================================
--- kernel/generic/include/adt/list.h	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/generic/include/adt/list.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -51,4 +51,10 @@
 } list_t;
 
+
+extern int list_member(const link_t *, const list_t *);
+extern void list_splice(list_t *, link_t *);
+extern unsigned int list_count(const list_t *);
+
+
 /** Declare and initialize statically allocated list.
  *
@@ -71,4 +77,36 @@
 	    iterator != &(list).head; iterator = iterator->next)
 
+/** Unlike list_foreach(), allows removing items while traversing a list.
+ * 
+ * @code
+ * list_t mylist;
+ * typedef struct item {
+ *     int value;
+ *     link_t item_link;
+ * } item_t;
+ * 
+ * //..
+ * 
+ * // Print each list element's value and remove the element from the list.
+ * list_foreach_safe(mylist, cur_link, next_link) {
+ *     item_t *cur_item = list_get_instance(cur_link, item_t, item_link);
+ *     printf("%d\n", cur_item->value);
+ *     list_remove(cur_link);
+ * }
+ * @endcode
+ * 
+ * @param list List to traverse.
+ * @param iterator Iterator to the current element of the list.
+ *             The item this iterator points may be safely removed
+ *             from the list.
+ * @param next_iter Iterator to the next element of the list.
+ */
+#define list_foreach_safe(list, iterator, next_iter) \
+	for (link_t *iterator = (list).head.next, \
+		*next_iter = iterator->next; \
+		iterator != &(list).head; \
+		iterator = next_iter, next_iter = iterator->next)
+
+	
 #define assert_link_not_used(link) \
 	ASSERT(((link)->prev == NULL) && ((link)->next == NULL))
@@ -85,4 +123,15 @@
 	link->prev = NULL;
 	link->next = NULL;
+}
+
+/** Returns true if the initialized link is already in use by any list.
+ * 
+ * @param link Link to examine whether it belongs to a list or not.
+ * @return 1 if the link is part of a list. 
+ * @return 0 otherwise.
+ */
+NO_TRACE static inline int link_used(const link_t *link)
+{
+	return link->prev != NULL || link->next != NULL;
 }
 
@@ -256,4 +305,19 @@
 {
 	headless_list_split_or_concat(part1, part2);
+}
+
+/** Concatenate two lists
+ *
+ * Concatenate lists @a list1 and @a list2, producing a single
+ * list @a list1 containing items from both (in @a list1, @a list2
+ * order) and empty list @a list2.
+ *
+ * @param list1		First list and concatenated output
+ * @param list2 	Second list and empty output.
+ *
+ */
+NO_TRACE static inline void list_concat(list_t *list1, list_t *list2)
+{
+	list_splice(list2, list1->head.prev);
 }
 
@@ -281,8 +345,4 @@
 }
 
-extern int list_member(const link_t *, const list_t *);
-extern void list_concat(list_t *, list_t *);
-extern unsigned int list_count(const list_t *);
-
 #endif
 
Index: kernel/generic/include/arch.h
===================================================================
--- kernel/generic/include/arch.h	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/generic/include/arch.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -36,8 +36,8 @@
 #define KERN_ARCH_H_
 
-#include <arch/arch.h>
-#include <proc/thread.h>
-#include <proc/task.h>
-#include <mm/as.h>
+#include <arch/arch.h>  /* arch_pre_main() */
+#include <arch/asm.h>   /* get_stack_base() */
+#include <config.h>
+
 
 /*
@@ -49,9 +49,4 @@
 #define THE  ((the_t * )(get_stack_base()))
 
-#define CPU                  THE->cpu
-#define THREAD               THE->thread
-#define TASK                 THE->task
-#define AS                   THE->as
-#define PREEMPTION_DISABLED  THE->preemption_disabled
 #define MAGIC                UINT32_C(0xfacefeed)
 
@@ -62,4 +57,10 @@
 	((THE->task) ? (THE->task->container) : (DEFAULT_CONTAINER))
 
+/* Fwd decl. to avoid include hell. */
+struct thread;
+struct task;
+struct cpu;
+struct as;
+
 /**
  * For each possible kernel stack, structure
@@ -68,10 +69,13 @@
  */
 typedef struct {
-	size_t preemption_disabled;  /**< Preemption disabled counter. */
-	thread_t *thread;            /**< Current thread. */
-	task_t *task;                /**< Current task. */
-	cpu_t *cpu;                  /**< Executing cpu. */
-	as_t *as;                    /**< Current address space. */
-	uint32_t magic;              /**< Magic value */
+	size_t preemption;     /**< Preemption disabled counter and flag. */
+#ifdef RCU_PREEMPT_A
+	size_t rcu_nesting;    /**< RCU nesting count and flag. */
+#endif 
+	struct thread *thread; /**< Current thread. */
+	struct task *task;     /**< Current task. */
+	struct cpu *cpu;       /**< Executing cpu. */
+	struct as *as;         /**< Current address space. */
+	uint32_t magic;        /**< Magic value */
 } the_t;
 
@@ -91,4 +95,5 @@
 extern void *arch_construct_function(fncptr_t *, void *, void *);
 
+
 #endif
 
Index: kernel/generic/include/atomic.h
===================================================================
--- kernel/generic/include/atomic.h	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/generic/include/atomic.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -53,4 +53,18 @@
 }
 
+
+/*
+ * If the architecture does not provide operations that are atomic
+ * only with respect to the local cpu (eg exception handlers) and
+ * not other cpus, implement these cpu local atomic operations with
+ * full blown smp-safe atomics.
+ */
+#ifndef local_atomic_exchange
+#define local_atomic_exchange(var_addr, new_val) \
+	__atomic_exchange_n((var_addr), (new_val), __ATOMIC_RELAXED)
+#endif
+
+
+
 #endif
 
Index: kernel/generic/include/compiler/barrier.h
===================================================================
--- kernel/generic/include/compiler/barrier.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
+++ kernel/generic/include/compiler/barrier.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef KERN_COMPILER_BARRIER_H_
+#define KERN_COMPILER_BARRIER_H_
+
+#define compiler_barrier() asm volatile ("" ::: "memory")
+
+/** Forces the compiler to access (ie load/store) the variable only once. */
+#define ACCESS_ONCE(var) (*((volatile typeof(var)*)&(var)))
+
+#endif /* KERN_COMPILER_BARRIER_H_ */
Index: kernel/generic/include/cpu.h
===================================================================
--- kernel/generic/include/cpu.h	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/generic/include/cpu.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -38,7 +38,13 @@
 #include <mm/tlb.h>
 #include <synch/spinlock.h>
+#include <synch/rcu_types.h>
 #include <proc/scheduler.h>
 #include <arch/cpu.h>
 #include <arch/context.h>
+#include <adt/list.h>
+#include <arch.h>
+
+#define CPU                  THE->cpu
+
 
 /** CPU structure.
@@ -94,4 +100,13 @@
 	
 	/**
+	 * SMP calls to invoke on this CPU.
+	 */
+	SPINLOCK_DECLARE(smp_calls_lock);
+	list_t smp_pending_calls;
+	
+	/** RCU per-cpu data. Uses own locking. */
+	rcu_cpu_data_t rcu;
+	
+	/**
 	 * Stack used by scheduler when there is no running thread.
 	 */
Index: kernel/generic/include/cpu/cpu_mask.h
===================================================================
--- kernel/generic/include/cpu/cpu_mask.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
+++ kernel/generic/include/cpu/cpu_mask.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup generic
+ * @{
+ */
+/** @file
+ */
+#ifndef KERN_CPU_CPU_MASK_H_
+#define KERN_CPU_CPU_MASK_H_
+
+#include <cpu.h>
+#include <config.h>
+#include <lib/memfnc.h>
+
+/** Iterates over all cpu ids whose bit is included in the cpu mask.
+ * 
+ * Example usage:
+ * @code
+ * DEFINE_CPU_MASK(cpu_mask);
+ * cpu_mask_active(&cpu_mask);
+ * 
+ * cpu_mask_for_each(cpu_mask, cpu_id) {
+ *     printf("Cpu with logical id %u is active.\n", cpu_id);
+ * }
+ * @endcode
+ */
+#define cpu_mask_for_each(mask, cpu_id) \
+	for (unsigned int (cpu_id) = 0; (cpu_id) < config.cpu_count; ++(cpu_id)) \
+		if (cpu_mask_is_set(&(mask), (cpu_id))) 
+
+/** Allocates a cpu_mask_t on stack. */
+#define DEFINE_CPU_MASK(cpu_mask) \
+	cpu_mask_t *(cpu_mask) = (cpu_mask_t*) alloca(cpu_mask_size())
+
+/** If used with DEFINE_CPU_MASK, the mask is large enough for all detected cpus.*/
+typedef struct cpu_mask {
+	unsigned int mask[1];
+} cpu_mask_t;
+
+
+extern size_t cpu_mask_size(void);
+extern void cpu_mask_active(cpu_mask_t *);
+extern void cpu_mask_all(cpu_mask_t *);
+extern void cpu_mask_none(cpu_mask_t *);
+extern void cpu_mask_set(cpu_mask_t *, unsigned int);
+extern void cpu_mask_reset(cpu_mask_t *, unsigned int);
+extern bool cpu_mask_is_set(cpu_mask_t *, unsigned int);
+extern bool cpu_mask_is_none(cpu_mask_t *);
+
+#endif /* KERN_CPU_CPU_MASK_H_ */ 
+
+/** @}
+ */
Index: kernel/generic/include/lib/memfnc.h
===================================================================
--- kernel/generic/include/lib/memfnc.h	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/generic/include/lib/memfnc.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -41,4 +41,6 @@
 extern void *memcpy(void *, const void *, size_t);
 
+#define alloca(size) __builtin_alloca((size))
+
 #endif
 
Index: kernel/generic/include/macros.h
===================================================================
--- kernel/generic/include/macros.h	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/generic/include/macros.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -150,4 +150,11 @@
 	})
 
+
+#ifndef member_to_inst
+#define member_to_inst(ptr_member, type, member_identif) \
+	((type*) (((void*)(ptr_member)) - ((void*)&(((type*)0)->member_identif))))
+#endif
+
+
 #endif
 
Index: kernel/generic/include/memstr.h
===================================================================
--- kernel/generic/include/memstr.h	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/generic/include/memstr.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -40,4 +40,5 @@
 #define memset(dst, val, cnt)  __builtin_memset((dst), (val), (cnt))
 #define memcpy(dst, src, cnt)  __builtin_memcpy((dst), (src), (cnt))
+#define bzero(dst, cnt)        memset((dst), 0, (cnt))
 
 extern void memsetb(void *, size_t, uint8_t);
Index: kernel/generic/include/mm/as.h
===================================================================
--- kernel/generic/include/mm/as.h	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/generic/include/mm/as.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -48,4 +48,8 @@
 #include <adt/btree.h>
 #include <lib/elf.h>
+#include <arch.h>
+
+#define AS                   THE->as
+
 
 /**
Index: kernel/generic/include/preemption.h
===================================================================
--- kernel/generic/include/preemption.h	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/generic/include/preemption.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -36,6 +36,27 @@
 #define KERN_PREEMPTION_H_
 
-extern void preemption_disable(void);
-extern void preemption_enable(void);
+#include <arch.h>
+#include <compiler/barrier.h>
+#include <debug.h>
+
+#define PREEMPTION_INC         (1 << 0)
+#define PREEMPTION_DISABLED    (PREEMPTION_INC <= THE->preemption)
+#define PREEMPTION_ENABLED     (!PREEMPTION_DISABLED)
+
+/** Increment preemption disabled counter. */
+#define preemption_disable() \
+	do { \
+		THE->preemption += PREEMPTION_INC; \
+		compiler_barrier(); \
+	} while (0)
+
+/** Restores preemption but never reschedules. */
+#define preemption_enable() \
+	do { \
+		ASSERT(PREEMPTION_DISABLED); \
+		compiler_barrier(); \
+		THE->preemption -= PREEMPTION_INC; \
+	} while (0)
+
 
 #endif
Index: kernel/generic/include/proc/task.h
===================================================================
--- kernel/generic/include/proc/task.h	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/generic/include/proc/task.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -43,6 +43,8 @@
 #include <synch/mutex.h>
 #include <synch/futex.h>
+#include <synch/workqueue.h>
 #include <adt/avl.h>
 #include <adt/btree.h>
+#include <adt/cht.h>
 #include <adt/list.h>
 #include <security/cap.h>
@@ -57,4 +59,8 @@
 #include <mm/as.h>
 #include <abi/sysinfo.h>
+#include <arch.h>
+
+#define TASK                 THE->task
+
 
 struct thread;
@@ -123,11 +129,13 @@
 	task_arch_t arch;
 	
-	/**
-	 * Serializes access to the B+tree of task's futexes. This mutex is
-	 * independent on the task spinlock.
-	 */
-	mutex_t futexes_lock;
-	/** B+tree of futexes referenced by this task. */
-	btree_t futexes;
+	struct futex_cache {
+		/** CHT mapping virtual addresses of futex variables to futex objects.*/
+		cht_t ht;
+		/** Serializes access to futex_list.*/
+		mutex_t list_lock;
+		/** List of all futexes accessed by this task. */
+		list_t list;
+		work_t destroy_work;
+	} *futexes;
 	
 	/** Accumulated accounting. */
Index: kernel/generic/include/proc/thread.h
===================================================================
--- kernel/generic/include/proc/thread.h	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/generic/include/proc/thread.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -41,4 +41,5 @@
 #include <cpu.h>
 #include <synch/spinlock.h>
+#include <synch/rcu_types.h>
 #include <adt/avl.h>
 #include <mm/slab.h>
@@ -48,4 +49,8 @@
 #include <udebug/udebug.h>
 #include <abi/sysinfo.h>
+#include <arch.h>
+
+
+#define THREAD              THE->thread
 
 #define THREAD_NAME_BUFLEN  20
@@ -180,4 +185,16 @@
 	/** Thread ID. */
 	thread_id_t tid;
+
+	/** Work queue this thread belongs to or NULL. Immutable. */
+	struct work_queue *workq;
+	/** Links work queue threads. Protected by workq->lock. */
+	link_t workq_link; 
+	/** True if the worker was blocked and is not running. Use thread->lock. */
+	bool workq_blocked;
+	/** True if the worker will block in order to become idle. Use workq->lock. */
+	bool workq_idling;
+	
+	/** RCU thread related data. Protected by its own locks. */
+	rcu_thread_data_t rcu;
 	
 	/** Architecture-specific data. */
@@ -217,4 +234,6 @@
 extern void thread_ready(thread_t *);
 extern void thread_exit(void) __attribute__((noreturn));
+extern void thread_interrupt(thread_t *);
+extern bool thread_interrupted(thread_t *);
 
 #ifndef thread_create_arch
Index: kernel/generic/include/smp/smp_call.h
===================================================================
--- kernel/generic/include/smp/smp_call.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
+++ kernel/generic/include/smp/smp_call.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup generic
+ * @{
+ */
+/** @file
+ */
+
+#ifndef KERN_SMP_CALL_H_
+#define	KERN_SMP_CALL_H_
+
+#include <adt/list.h>
+#include <synch/spinlock.h>
+#include <atomic.h>
+
+typedef void (*smp_call_func_t)(void *);
+
+typedef struct smp_call {
+	smp_call_func_t func;
+	void *arg;
+	link_t calls_link;
+	atomic_t pending;
+} smp_call_t;
+
+
+
+extern void smp_call(unsigned int, smp_call_func_t, void *);
+extern void smp_call_async(unsigned int, smp_call_func_t, void *, smp_call_t *);
+extern void smp_call_wait(smp_call_t *);
+
+extern void smp_call_init(void);
+
+#ifdef CONFIG_SMP
+extern void smp_call_ipi_recv(void);
+extern void arch_smp_call_ipi(unsigned int);
+#endif
+
+
+
+
+#endif	/* KERN_SMP_CALL_H_ */
+
+/** @}
+ */
+
Index: kernel/generic/include/synch/condvar.h
===================================================================
--- kernel/generic/include/synch/condvar.h	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/generic/include/synch/condvar.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -39,4 +39,5 @@
 #include <synch/waitq.h>
 #include <synch/mutex.h>
+#include <synch/spinlock.h>
 #include <abi/synch.h>
 
@@ -50,4 +51,12 @@
 	_condvar_wait_timeout((cv), (mtx), (usec), SYNCH_FLAGS_NONE)
 
+#ifdef CONFIG_SMP
+#define _condvar_wait_timeout_spinlock(cv, lock, usec, flags) \
+	_condvar_wait_timeout_spinlock_impl((cv), (lock), (usec), (flags))
+#else
+#define _condvar_wait_timeout_spinlock(cv, lock, usec, flags) \
+	_condvar_wait_timeout_spinlock_impl((cv), NULL, (usec), (flags))
+#endif
+
 extern void condvar_initialize(condvar_t *cv);
 extern void condvar_signal(condvar_t *cv);
@@ -55,4 +64,9 @@
 extern int _condvar_wait_timeout(condvar_t *cv, mutex_t *mtx, uint32_t usec,
     int flags);
+extern int _condvar_wait_timeout_spinlock_impl(condvar_t *cv, spinlock_t *lock, 
+	uint32_t usec, int flags);
+extern int _condvar_wait_timeout_irq_spinlock(condvar_t *cv, 
+	irq_spinlock_t *irq_lock, uint32_t usec, int flags);
+
 
 #endif
Index: kernel/generic/include/synch/futex.h
===================================================================
--- kernel/generic/include/synch/futex.h	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/generic/include/synch/futex.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -55,5 +55,7 @@
 extern sysarg_t sys_futex_wakeup(uintptr_t);
 
-extern void futex_cleanup(void);
+extern void futex_task_cleanup(void);
+extern void futex_task_init(struct task *);
+extern void futex_task_deinit(struct task *);
 
 #endif
Index: kernel/generic/include/synch/rcu.h
===================================================================
--- kernel/generic/include/synch/rcu.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
+++ kernel/generic/include/synch/rcu.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -0,0 +1,247 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup sync
+ * @{
+ */
+/** @file
+ */
+
+#ifndef KERN_RCU_H_
+#define KERN_RCU_H_
+
+#include <synch/rcu_types.h>
+#include <compiler/barrier.h>
+
+
+/** Use to assign a pointer to newly initialized data to a rcu reader 
+ * accessible pointer.
+ * 
+ * Example:
+ * @code
+ * typedef struct exam {
+ *     struct exam *next;
+ *     int grade;
+ * } exam_t;
+ * 
+ * exam_t *exam_list;
+ * // ..
+ * 
+ * // Insert at the beginning of the list.
+ * exam_t *my_exam = malloc(sizeof(exam_t), 0);
+ * my_exam->grade = 5;
+ * my_exam->next = exam_list;
+ * rcu_assign(exam_list, my_exam);
+ * 
+ * // Changes properly propagate. Every reader either sees
+ * // the old version of exam_list or the new version with
+ * // the fully initialized my_exam.
+ * rcu_synchronize();
+ * // Now we can be sure every reader sees my_exam.
+ * 
+ * @endcode
+ */
+#define rcu_assign(ptr, value) \
+	do { \
+		memory_barrier(); \
+		(ptr) = (value); \
+	} while (0)
+
+/** Use to access RCU protected data in a reader section.
+ * 
+ * Example:
+ * @code
+ * exam_t *exam_list;
+ * // ...
+ * 
+ * rcu_read_lock();
+ * exam_t *first_exam = rcu_access(exam_list);
+ * // We can now safely use first_exam, it won't change 
+ * // under us while we're using it.
+ *
+ * // ..
+ * rcu_read_unlock();
+ * @endcode
+ */
+#define rcu_access(ptr) ACCESS_ONCE(ptr)
+
+
+
+
+#include <debug.h>
+#include <preemption.h>
+#include <cpu.h>
+#include <proc/thread.h>
+
+
+extern bool rcu_read_locked(void);
+extern void rcu_synchronize(void);
+extern void rcu_synchronize_expedite(void);
+extern void rcu_call(rcu_item_t *rcu_item, rcu_func_t func);
+extern void rcu_barrier(void);
+
+extern void rcu_print_stat(void);
+
+extern void rcu_init(void);
+extern void rcu_stop(void);
+extern void rcu_cpu_init(void);
+extern void rcu_kinit_init(void);
+extern void rcu_thread_init(struct thread*);
+extern void rcu_thread_exiting(void);
+extern void rcu_after_thread_ran(void);
+extern void rcu_before_thread_runs(void);
+
+extern uint64_t rcu_completed_gps(void);
+extern void _rcu_call(bool expedite, rcu_item_t *rcu_item, rcu_func_t func);
+extern void _rcu_synchronize(bool expedite);
+
+
+#ifdef RCU_PREEMPT_A
+
+#define RCU_CNT_INC       (1 << 1)
+#define RCU_WAS_PREEMPTED (1 << 0)
+
+/* Fwd. decl. because of inlining. */
+void _rcu_preempted_unlock(void);
+
+/** Delimits the start of an RCU reader critical section. 
+ * 
+ * Reader sections may be nested and are preemptible. You must not
+ * however block/sleep within reader sections.
+ */
+static inline void rcu_read_lock(void)
+{
+	THE->rcu_nesting += RCU_CNT_INC;
+	compiler_barrier();
+}
+
+/** Delimits the end of an RCU reader critical section. */
+static inline void rcu_read_unlock(void)
+{
+	compiler_barrier();
+	THE->rcu_nesting -= RCU_CNT_INC;
+	
+	if (RCU_WAS_PREEMPTED == THE->rcu_nesting) {
+		_rcu_preempted_unlock();
+	}
+}
+
+#elif defined(RCU_PREEMPT_PODZIMEK)
+
+/* Fwd decl. required by the inlined implementation. Not part of public API. */
+extern rcu_gp_t _rcu_cur_gp;
+extern void _rcu_signal_read_unlock(void);
+
+
+/** Unconditionally records a quiescent state for the local cpu. */
+static inline void _rcu_record_qs(void)
+{
+	ASSERT(PREEMPTION_DISABLED || interrupts_disabled());
+	
+	/* 
+	 * A new GP was started since the last time we passed a QS. 
+	 * Notify the detector we have reached a new QS.
+	 */
+	if (CPU->rcu.last_seen_gp != _rcu_cur_gp) {
+		rcu_gp_t cur_gp = ACCESS_ONCE(_rcu_cur_gp);
+		/* 
+		 * Contain memory accesses within a reader critical section. 
+	 * If we are in rcu_read_lock() it also makes changes prior to the
+		 * start of the GP visible in the reader section.
+		 */
+		memory_barrier();
+		/*
+		 * Acknowledge we passed a QS since the beginning of rcu.cur_gp.
+		 * Cache coherency will lazily transport the value to the
+		 * detector while it sleeps in gp_sleep(). 
+		 * 
+		 * Note that there is a theoretical possibility that we
+		 * overwrite a more recent/greater last_seen_gp here with 
+		 * an older/smaller value. If this cpu is interrupted here
+		 * while in rcu_read_lock() reader sections in the interrupt handler
+		 * will update last_seen_gp to the same value as is currently 
+		 * in local cur_gp. However, if the cpu continues processing 
+		 * interrupts and the detector starts a new GP immediately, 
+		 * local interrupt handlers may update last_seen_gp again (ie 
+		 * properly ack the new GP) with a value greater than local cur_gp. 
+		 * Resetting last_seen_gp to a previous value here is however 
+		 * benign and we only have to remember that this reader may end up 
+		 * in cur_preempted even after the GP ends. That is why we
+		 * append next_preempted to cur_preempted rather than overwriting 
+		 * it as if cur_preempted were empty.
+		 */
+		CPU->rcu.last_seen_gp = cur_gp;
+	}
+}
+
+/** Delimits the start of an RCU reader critical section. 
+ * 
+ * Reader sections may be nested and are preemptible. You must not
+ * however block/sleep within reader sections.
+ */
+static inline void rcu_read_lock(void)
+{
+	ASSERT(CPU);
+	preemption_disable();
+
+	/* Record a QS if not in a reader critical section. */
+	if (0 == CPU->rcu.nesting_cnt)
+		_rcu_record_qs();
+
+	++CPU->rcu.nesting_cnt;
+
+	preemption_enable();
+}
+
+/** Delimits the end of an RCU reader critical section. */
+static inline void rcu_read_unlock(void)
+{
+	ASSERT(CPU);
+	preemption_disable();
+	
+	if (0 == --CPU->rcu.nesting_cnt) {
+		_rcu_record_qs();
+		
+		/* 
+		 * The thread was preempted while in a critical section or 
+		 * the detector is eagerly waiting for this cpu's reader to finish. 
+		 */
+		if (CPU->rcu.signal_unlock) {
+			/* Rechecks with disabled interrupts. */
+			_rcu_signal_read_unlock();
+		}
+	}
+	
+	preemption_enable();
+}
+#endif
+
+#endif
+
+/** @}
+ */
Index: kernel/generic/include/synch/rcu_types.h
===================================================================
--- kernel/generic/include/synch/rcu_types.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
+++ kernel/generic/include/synch/rcu_types.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup sync
+ * @{
+ */
+/** @file
+ */
+
+#ifndef KERN_RCU_TYPES_H_
+#define KERN_RCU_TYPES_H_
+
+#include <adt/list.h>
+#include <synch/semaphore.h>
+
+#if !defined(RCU_PREEMPT_PODZIMEK) && !defined(RCU_PREEMPT_A)
+#error You must select an RCU algorithm.
+#endif
+
+
+/* Fwd decl. */
+struct thread;
+struct rcu_item;
+
+/** Grace period number typedef. */
+typedef uint64_t rcu_gp_t;
+
+/** RCU callback type. The passed rcu_item_t may be freed. */
+typedef void (*rcu_func_t)(struct rcu_item *rcu_item);
+
+typedef struct rcu_item {
+	rcu_func_t func;
+	struct rcu_item *next;
+} rcu_item_t;
+
+
+/** RCU related per-cpu data. */
+typedef struct rcu_cpu_data {
+	/** The cpu recorded a quiescent state last time during this grace period.*/
+	rcu_gp_t last_seen_gp;
+
+#ifdef RCU_PREEMPT_PODZIMEK
+	/** This cpu has not yet passed a quiescent state and it is delaying the
+	 * detector. Once it reaches a QS it must sema_up(rcu.remaining_readers).
+	 */
+	bool is_delaying_gp;
+	
+	/** True if we should signal the detector that we exited a reader section.
+	 * 
+	 * Equal to (THREAD->rcu.was_preempted || CPU->rcu.is_delaying_gp).
+	 */
+	bool signal_unlock;
+
+	/** The number of times an RCU reader section is nested on this cpu. 
+	 * 
+	 * If positive, it is definitely executing reader code. If zero, 
+	 * the thread might already be executing reader code thanks to
+	 * cpu instruction reordering.
+	 */
+	size_t nesting_cnt;
+#endif
+	
+	/** Callbacks to invoke once the current grace period ends, ie cur_cbs_gp.
+	 * Accessed by the local reclaimer only.
+	 */
+	rcu_item_t *cur_cbs;
+	/** Number of callbacks in cur_cbs. */
+	size_t cur_cbs_cnt;
+	/** Callbacks to invoke once the next grace period ends, ie next_cbs_gp. 
+	 * Accessed by the local reclaimer only.
+	 */
+	rcu_item_t *next_cbs;
+	/** Number of callbacks in next_cbs. */
+	size_t next_cbs_cnt;
+	/** New callbacks are placed at the end of this list. */
+	rcu_item_t *arriving_cbs;
+	/** Tail of arriving_cbs list. Disable interrupts to access. */
+	rcu_item_t **parriving_cbs_tail;
+	/** Number of callbacks currently in arriving_cbs. 
+	 * Disable interrupts to access.
+	 */
+	size_t arriving_cbs_cnt;
+
+	/** At the end of this grace period callbacks in cur_cbs will be invoked.*/
+	rcu_gp_t cur_cbs_gp;
+	/** At the end of this grace period callbacks in next_cbs will be invoked.
+	 * 
+	 * Should be the next grace period but it allows the reclaimer to 
+	 * notice if it missed a grace period end announcement. In that
+	 * case it can execute next_cbs without waiting for another GP.
+	 * 
+	 * Invariant: next_cbs_gp >= cur_cbs_gp
+	 */
+	rcu_gp_t next_cbs_gp;
+	
+	/** Positive if there are callbacks pending in arriving_cbs. */
+	semaphore_t arrived_flag;
+	
+	/** The reclaimer should expedite GPs for cbs in arriving_cbs. */
+	bool expedite_arriving;
+	
+	/** Protected by global rcu.barrier_mtx. */
+	rcu_item_t barrier_item;
+	
+	/** Interruptible attached reclaimer thread. */
+	struct thread *reclaimer_thr;
+	
+	/* Some statistics. */
+	size_t stat_max_cbs;
+	size_t stat_avg_cbs;
+	size_t stat_missed_gps;
+	size_t stat_missed_gp_in_wait;
+	size_t stat_max_slice_cbs;
+	size_t last_arriving_cnt;
+} rcu_cpu_data_t;
+
+
+/** RCU related per-thread data. */
+typedef struct rcu_thread_data {
+	/** 
+	 * Nesting count of the thread's RCU read sections when the thread 
+	 * is not running.
+	 */
+	size_t nesting_cnt;
+
+#ifdef RCU_PREEMPT_PODZIMEK
+	
+	/** True if the thread was preempted in a reader section. 
+	 *
+	 * The thread is placed into rcu.cur_preempted or rcu.next_preempted
+	 * and must remove itself in rcu_read_unlock(). 
+	 * 
+	 * Access with interrupts disabled.
+	 */
+	bool was_preempted;
+#endif
+	
+	/** Preempted threads link. Access with rcu.preempt_lock. */
+	link_t preempt_link;
+} rcu_thread_data_t;
+
+
+#endif
+
+/** @}
+ */
Index: kernel/generic/include/synch/semaphore.h
===================================================================
--- kernel/generic/include/synch/semaphore.h	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/generic/include/synch/semaphore.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -53,4 +53,8 @@
 	_semaphore_down_timeout((s), (usec), SYNCH_FLAGS_NONE)
 
+#define semaphore_down_interruptable(s) \
+	(ESYNCH_INTERRUPTED != _semaphore_down_timeout((s), SYNCH_NO_TIMEOUT, \
+		SYNCH_FLAGS_INTERRUPTIBLE))
+
 extern void semaphore_initialize(semaphore_t *, int);
 extern int _semaphore_down_timeout(semaphore_t *, uint32_t, unsigned int);
Index: kernel/generic/include/synch/smp_memory_barrier.h
===================================================================
--- kernel/generic/include/synch/smp_memory_barrier.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
+++ kernel/generic/include/synch/smp_memory_barrier.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup sync
+ * @{
+ */
+/** @file
+ */
+
+#ifndef KERN_SMP_MEM_BAR_H_
+#define KERN_SMP_MEM_BAR_H_
+
+#include <typedefs.h>
+
+extern sysarg_t sys_smp_memory_barrier(void);
+
+#endif
+
+/** @}
+ */
Index: kernel/generic/include/synch/spinlock.h
===================================================================
--- kernel/generic/include/synch/spinlock.h	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/generic/include/synch/spinlock.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -45,5 +45,5 @@
 #ifdef CONFIG_SMP
 
-typedef struct {
+typedef struct spinlock {
 	atomic_t val;
 	
@@ -162,4 +162,7 @@
 /* On UP systems, spinlocks are effectively left out. */
 
+/* Allow the use of spinlock_t as an incomplete type. */
+typedef struct spinlock spinlock_t;
+
 #define SPINLOCK_DECLARE(name)
 #define SPINLOCK_EXTERN(name)
@@ -176,5 +179,5 @@
 
 #define spinlock_lock(lock)     preemption_disable()
-#define spinlock_trylock(lock)  (preemption_disable(), 1)
+#define spinlock_trylock(lock)  ({ preemption_disable(); 1; })
 #define spinlock_unlock(lock)   preemption_enable()
 #define spinlock_locked(lock)	1
Index: kernel/generic/include/synch/workqueue.h
===================================================================
--- kernel/generic/include/synch/workqueue.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
+++ kernel/generic/include/synch/workqueue.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup generic
+ * @{
+ */
+/** @file
+ */
+
+#ifndef KERN_WORKQUEUE_H_
+#define KERN_WORKQUEUE_H_
+
+#include <adt/list.h>
+
+/* Fwd decl. */
+struct thread;
+struct work_item;
+struct work_queue;
+typedef struct work_queue work_queue_t;
+
+typedef void (*work_func_t)(struct work_item *);
+
+typedef struct work_item {
+	link_t queue_link;
+	work_func_t func;
+	
+#ifdef CONFIG_DEBUG
+	/* Magic number for integrity checks. */
+	uint32_t cookie;
+#endif 
+} work_t;
+
+
+
+extern void workq_global_init(void);
+extern void workq_global_worker_init(void);
+extern void workq_global_stop(void);
+extern int workq_global_enqueue_noblock(work_t *, work_func_t);
+extern int workq_global_enqueue(work_t *, work_func_t);
+
+extern struct work_queue * workq_create(const char *);
+extern void workq_destroy(struct work_queue *);
+extern int workq_init(struct work_queue *, const char *);
+extern void workq_stop(struct work_queue *);
+extern int workq_enqueue_noblock(struct work_queue *, work_t *, work_func_t);
+extern int workq_enqueue(struct work_queue *, work_t *, work_func_t);
+
+extern void workq_print_info(struct work_queue *);
+extern void workq_global_print_info(void);
+
+
+extern void workq_after_thread_ran(void);
+extern void workq_before_thread_is_ready(struct thread *);
+
+#endif /* KERN_WORKQUEUE_H_ */
+
+/** @}
+ */
Index: kernel/generic/src/adt/cht.c
===================================================================
--- kernel/generic/src/adt/cht.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
+++ kernel/generic/src/adt/cht.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -0,0 +1,2704 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+/** @addtogroup genericadt
+ * @{
+ */
+
+/**
+ * @file
+ * @brief Scalable resizable concurrent lock-free hash table.
+ * 
+ * CHT is a concurrent hash table that is scalable resizable and lock-free.
+ * resizable = the number of buckets of the table increases or decreases
+ *     depending on the average number of elements per bucket (ie load)
+ * scalable = accessing the table from more cpus increases performance
+ *     almost linearly
+ * lock-free = common operations never block; even if any of the operations
+ *     is preempted or interrupted at any time, other operations will still
+ *     make forward progress
+ *
+ * CHT is designed for read mostly scenarios. Performance degrades as the
+ * fraction of updates (insert/remove) increases. Other data structures
+ * significantly outperform CHT if the fraction of updates exceeds ~40%.
+ * 
+ * CHT tolerates hardware exceptions and may be accessed from exception
+ * handlers as long as the underlying RCU implementation is exception safe.
+ * 
+ * @par Caveats
+ * 
+ * 0) Never assume an item is still in the table.
+ * The table may be accessed concurrently; therefore, other threads may
+ * insert or remove an item at any time. Do not assume an item is still
+ * in the table if cht_find() just returned it to you. Similarly, an
+ * item may have already been inserted by the time cht_find() returns NULL.
+ * 
+ * 1) Always use RCU read locks when searching the table.
+ * Holding an RCU lock guarantees that an item found in the table remains
+ * valid (eg is not freed) even if the item was removed from the table
+ * in the meantime by another thread.
+ * 
+ * 2) Never update values in place.
+ * Do not update items in the table in place, ie directly. The changes
+ * will not propagate to other readers (on other cpus) immediately or even
+ * correctly. Some readers may then encounter items that have only some
+ * of their fields changed or are completely inconsistent. 
+ * 
+ * Instead consider inserting an updated/changed copy of the item and 
+ * removing the original item. Or contact the maintainer to provide
+ * you with a function that atomically replaces an item with a copy.
+ * 
+ * 3) Use cht_insert_unique() instead of checking for duplicates with cht_find()
+ * The following code is prone to race conditions:
+ * @code
+ * if (NULL == cht_find(&h, key)) {
+ *     // If another thread inserts an item here, we'll insert a duplicate.
+ *     cht_insert(&h, item);
+ * }
+ * @endcode
+ * See cht_insert_unique() on how to correctly fix this.
+ * 
+ *
+ * @par Semantics
+ * 
+ * Lazy readers = cht_find_lazy(), cht_find_next_lazy()
+ * Readers = lazy readers, cht_find(), cht_find_next()
+ * Updates = cht_insert(), cht_insert_unique(), cht_remove_key(), 
+ *     cht_remove_item()
+ * 
+ * Readers (but not lazy readers) are guaranteed to see the effects 
+ * of @e completed updates. In other words, if cht_find() is invoked 
+ * after a cht_insert() @e returned eg on another cpu, cht_find() is 
+ * guaranteed to see the inserted item. 
+ * 
+ * Similarly, updates see the effects of @e completed updates. For example,
+ * issuing cht_remove() after a cht_insert() for that key returned (even 
+ * on another cpu) is guaranteed to remove the inserted item.
+ * 
+ * Reading or updating the table concurrently with other updates
+ * always returns consistent data and never corrupts the table.
+ * However the effects of concurrent updates may or may not be
+ * visible to all other concurrent readers or updaters. Eg, not
+ * all readers may see that an item has already been inserted 
+ * if cht_insert() has not yet returned. 
+ * 
+ * Lazy readers are guaranteed to eventually see updates but it
+ * may take some time (possibly milliseconds) after the update
+ * completes for the change to propagate to lazy readers on all
+ * cpus.
+ * 
+ * @par Implementation
+ * 
+ * Collisions in CHT are resolved with chaining. The number of buckets
+ * is always a power of 2. Each bucket is represented with a single linked 
+ * lock-free list [1]. Items in buckets are sorted by their mixed hashes 
+ * in ascending order. All buckets are terminated with a single global 
+ * sentinel node whose mixed hash value is the greatest possible. 
+ *
+ * CHT with 2^k buckets uses the k most significant bits of a hash value
+ * to determine the bucket number where an item is to be stored. To
+ * avoid storing all items in a single bucket if the user supplied
+ * hash function does not produce uniform hashes, hash values are
+ * mixed first so that the top bits of a mixed hash change even if hash
+ * values differ only in the least significant bits. The mixed hash 
+ * values are cached in cht_link.hash (which is overwritten once the 
+ * item is scheduled for removal via rcu_call).
+ * 
+ * A new item is inserted before all other existing items in the bucket
+ * with the same hash value as the newly inserted item (a la the original
+ * lock-free list [2]). Placing new items at the start of a same-hash 
+ * sequence of items (eg duplicates) allows us to easily check for duplicates 
+ * in cht_insert_unique(). The function can first check that there are 
+ * no duplicates of the newly inserted item amongst the items with the 
+ * same hash as the new item. If there were no duplicates the new item 
+ * is linked before the same-hash items. Inserting a duplicate while 
+ * the function is checking for duplicates is detected as a change of 
+ * the link to the first checked same-hash item (and the search for 
+ * duplicates can be restarted).
+ * 
+ * @par Table resize algorithm
+ * 
+ * Table resize is based on [3] and [5]. First, a new bucket head array
+ * is allocated and initialized. Second, old bucket heads are moved
+ * to the new bucket head array with the protocol mentioned in [5]. 
+ * At this point updaters start using the new bucket heads. Third,
+ * buckets are split (or joined) so that the table can make use of
+ * the extra bucket head slots in the new array (or stop wasting space
+ * with the unnecessary extra slots in the old array). Splitting
+ * or joining buckets employs a custom protocol. Last, the new array 
+ * replaces the original bucket array.
+ * 
+ * A single background work item (of the system work queue) guides
+ * resizing of the table. If an updater detects that the bucket it
+ * is about to access is undergoing a resize (ie its head is moving
+ * or it needs to be split/joined), it helps out and completes the
+ * head move or the bucket split/join.
+ * 
+ * The table always grows or shrinks by a factor of 2. Because items 
+ * are assigned a bucket based on the top k bits of their mixed hash 
+ * values, when growing the table each bucket is split into two buckets 
+ * and all items of the two new buckets come from the single bucket in the 
+ * original table. Ie items from separate buckets in the original table
+ * never intermix in the new buckets. Moreover 
+ * since the buckets are sorted by their mixed hash values the items 
+ * at the beginning of the old bucket will end up in the first new 
+ * bucket while all the remaining items of the old bucket will end up
+ * in the second new bucket. Therefore, there is a single point where 
+ * to split the linked list of the old bucket into two correctly sorted 
+ * linked lists of the new buckets:
+ *                            .- bucket split
+ *                            | 
+ *             <-- first -->  v  <-- second --> 
+ *   [old] --> [00b] -> [01b] -> [10b] -> [11b] -> sentinel
+ *              ^                 ^    
+ *   [new0] -- -+                 |  
+ *   [new1] -- -- -- -- -- -- -- -+
+ * 
+ * Resize in greater detail:
+ * 
+ * a) First, a resizer (a single background system work queue item 
+ * in charge of resizing the table) allocates and initializes a new 
+ * bucket head array. New bucket heads are pointed to the sentinel 
+ * and marked Invalid (in the lower order bits of the pointer to the 
+ * next item, ie the sentinel in this case):
+ * 
+ *   [old, N] --> [00b] -> [01b] -> [10b] -> [11b] -> sentinel
+ *                                                    ^ ^
+ *   [new0, Inv] -------------------------------------+ |
+ *   [new1, Inv] ---------------------------------------+
+ * 
+ * 
+ * b) Second, the resizer starts moving old bucket heads with the following 
+ * lock-free protocol (from [5]) where cas(variable, expected_val, new_val) 
+ * is short for compare-and-swap:
+ * 
+ *   old head     new0 head      transition to next state
+ *   --------     ---------      ------------------------
+ *   addr, N      sentinel, Inv  cas(old, (addr, N), (addr, Const))
+ *                               .. mark the old head as immutable, so that 
+ *                                  updaters do not relink it to other nodes 
+ *                                  until the head move is done.
+ *   addr, Const  sentinel, Inv  cas(new0, (sentinel, Inv), (addr, N))
+ *                               .. move the address to the new head and mark 
+ *                                  the new head normal so updaters can start
+ *                                  using it.
+ *   addr, Const  addr, N        cas(old, (addr, Const), (addr, Inv))
+ *                               .. mark the old head Invalid to signify
+ *                                  the head move is done.
+ *   addr, Inv    addr, N
+ * 
+ * Notice that concurrent updaters may step in at any point and correctly
+ * complete the head move without disrupting the resizer. At worst, the
+ * resizer or other concurrent updaters will attempt a number of CAS() that 
+ * will correctly fail.
+ * 
+ *   [old, Inv] -> [00b] -> [01b] -> [10b] -> [11b] -> sentinel
+ *                 ^                                   ^
+ *   [new0, N] ----+                                   |
+ *   [new1, Inv] --------------------------------------+
+ * 
+ *  
+ * c) Third, buckets are split if the table is growing; or joined if 
+ * shrinking (by the resizer or updaters depending on whoever accesses 
+ * the bucket first). See split_bucket() and join_buckets() for details.
+ * 
+ *  1) Mark the last item of new0 with JOIN_FOLLOWS:
+ *   [old, Inv] -> [00b] -> [01b, JF] -> [10b] -> [11b] -> sentinel
+ *                 ^                                       ^
+ *   [new0, N] ----+                                       |
+ *   [new1, Inv] ------------------------------------------+
+ * 
+ *  2) Mark the first item of new1 with JOIN_NODE:
+ *   [old, Inv] -> [00b] -> [01b, JF] -> [10b, JN] -> [11b] -> sentinel
+ *                 ^                                           ^
+ *   [new0, N] ----+                                           |
+ *   [new1, Inv] ----------------------------------------------+
+ * 
+ *  3) Point new1 to the join-node and mark new1 NORMAL.
+ *   [old, Inv] -> [00b] -> [01b, JF] -> [10b, JN] -> [11b] -> sentinel
+ *                 ^                     ^
+ *   [new0, N] ----+                     |
+ *   [new1, N] --------------------------+
+ * 
+ * 
+ * d) Fourth, the resizer cleans up extra marks added during bucket 
+ * splits/joins but only when it is sure all updaters are accessing
+ * the table via the new bucket heads only (ie it is certain there
+ * are no delayed updaters unaware of the resize and accessing the 
+ * table via the old bucket head).
+ * 
+ *   [old, Inv] ---+
+ *                 v
+ *   [new0, N] --> [00b] -> [01b, N] ---+
+ *                                      v
+ *   [new1, N] --> [10b, N] -> [11b] -> sentinel
+ * 
+ * 
+ * e) Last, the resizer publishes the new bucket head array for everyone
+ * to see and use. This signals the end of the resize and the old bucket
+ * array is freed. 
+ * 
+ * 
+ * To understand details of how the table is resized, read [1, 3, 5]
+ * and comments in join_buckets(), split_bucket().
+ *  
+ * 
+ * [1] High performance dynamic lock-free hash tables and list-based sets, 
+ *     Michael, 2002
+ *     http://www.research.ibm.com/people/m/michael/spaa-2002.pdf
+ * [2] Lock-free linked lists using compare-and-swap,
+ *     Valois, 1995
+ *     http://people.csail.mit.edu/bushl2/rpi/portfolio/lockfree-grape/documents/lock-free-linked-lists.pdf
+ * [3] Resizable, scalable, concurrent hash tables via relativistic programming,
+ *     Triplett, 2011
+ *     http://www.usenix.org/event/atc11/tech/final_files/Triplett.pdf
+ * [4] Split-ordered Lists: Lock-free Extensible Hash Tables,
+ *     Shavit, 2006
+ *     http://www.cs.ucf.edu/~dcm/Teaching/COT4810-Spring2011/Literature/SplitOrderedLists.pdf
+ * [5] Towards a Scalable Non-blocking Coding Style,
+ *     Click, 2008
+ *     http://www.azulsystems.com/events/javaone_2008/2008_CodingNonBlock.pdf
+ */
+
+
+#include <adt/cht.h>
+#include <adt/hash.h>
+#include <debug.h>
+#include <memstr.h>
+#include <mm/slab.h>
+#include <arch/barrier.h>
+#include <compiler/barrier.h>
+#include <atomic.h>
+#include <synch/rcu.h>
+
+
+/* Logarithm of the min bucket count. Must be at least 3. 2^6 == 64 buckets. */
+#define CHT_MIN_ORDER 6
+/* Logarithm of the max bucket count. */
+#define CHT_MAX_ORDER (8 * sizeof(size_t))
+/* Minimum number of hash table buckets. */
+#define CHT_MIN_BUCKET_CNT (1 << CHT_MIN_ORDER)
+/* Does not have to be a power of 2. */
+#define CHT_MAX_LOAD 2 
+
+typedef cht_ptr_t marked_ptr_t;
+typedef bool (*equal_pred_t)(void *arg, const cht_link_t *item);
+
+/** The following mark items and bucket heads. 
+ * 
+ * They are stored in the two low order bits of the next item pointers.
+ * Some marks may be combined. Some marks share the same binary value and
+ * are distinguished only by context (eg bucket head vs an ordinary item),
+ * in particular by walk_mode_t.
+ */
+typedef enum mark {
+	/** Normal non-deleted item or a valid bucket head. */
+	N_NORMAL = 0,
+	/** Logically deleted item that might have already been unlinked.
+	 * 
+	 * May be combined with N_JOIN and N_JOIN_FOLLOWS. Applicable only 
+	 * to items; never to bucket heads. 
+	 * 
+	 * Once marked deleted an item remains marked deleted.	 
+	 */
+	N_DELETED = 1,
+	/** Immutable bucket head. 
+	 * 
+	 * The bucket is being moved or joined with another and its (old) head 
+	 * must not be modified.
+	 * 
+	 * May be combined with N_INVALID. Applicable only to old bucket heads,
+	 * ie cht_t.b and not cht_t.new_b.
+	 */
+	N_CONST = 1,
+	/** Invalid bucket head. The bucket head must not be modified. 
+	 * 
+	 * Old bucket heads (ie cht_t.b) are marked invalid if they have
+	 * already been moved to cht_t.new_b or if the bucket had already
+	 * been merged with another when shrinking the table. New bucket
+	 * heads (ie cht_t.new_b) are marked invalid if the old bucket had
+	 * not yet been moved or if an old bucket had not yet been split
+	 * when growing the table.
+	 */
+	N_INVALID = 3,
+	/** The item is a join node, ie joining two buckets
+	 * 
+	 * A join node is either the first node of the second part of
+	 * a bucket to be split; or it is the first node of the bucket
+	 * to be merged into/appended to/joined with another bucket.
+	 * 
+	 * May be combined with N_DELETED. Applicable only to items, never 
+	 * to bucket heads.
+	 * 
+	 * Join nodes are referred to from two different buckets and may,
+	 * therefore, not be safely/atomically unlinked from both buckets.
+	 * As a result join nodes are not unlinked but rather just marked
+	 * deleted. Once resize completes join nodes marked deleted are
+	 * garbage collected.
+	 */
+	N_JOIN = 2,
+	/** The next node is a join node and will soon be marked so. 
+	 * 
+	 * A join-follows node is the last node of the first part of bucket
+	 * that is to be split, ie it is the last node that will remain
+	 * in the same bucket after splitting it.
+	 * 
+	 * May be combined with N_DELETED. Applicable to items as well
+	 * as to bucket heads of the bucket to be split (but only in cht_t.new_b). 
+	 */
+	N_JOIN_FOLLOWS = 2,
+	/** Bit mask to filter out the address to the next item from the next ptr. */
+	N_MARK_MASK = 3
+} mark_t;
+
+/** Determines how a bucket's chain is to be traversed, eg during a resize. */
+typedef enum walk_mode {
+	/** The table is not resizing. */
+	WM_NORMAL = 4,
+	/** The table is undergoing a resize. Join nodes may be encountered. */
+	WM_LEAVE_JOIN,
+	/** The table is growing. A join-follows node may be encountered. */
+	WM_MOVE_JOIN_FOLLOWS
+} walk_mode_t;
+
+/** Bucket position window. */
+typedef struct wnd {
+	/** Pointer to cur's predecessor. */
+	marked_ptr_t *ppred;
+	/** Current item. */
+	cht_link_t *cur;
+	/** Last encountered item. Deleted or not. */
+	cht_link_t *last;
+} wnd_t;
+
+
+/* Sentinel node used by all buckets. Stores the greatest possible hash value.*/
+static const cht_link_t sentinel = {
+	/* NULL and N_NORMAL */
+	.link = 0 | N_NORMAL,
+	.hash = -1
+};
+
+
+static size_t size_to_order(size_t bucket_cnt, size_t min_order);
+static cht_buckets_t *alloc_buckets(size_t order, bool set_invalid, 
+	bool can_block);
+static inline cht_link_t *find_lazy(cht_t *h, void *key);
+static cht_link_t *search_bucket(cht_t *h, marked_ptr_t head, void *key, 
+	size_t search_hash);
+static cht_link_t *find_resizing(cht_t *h, void *key, size_t hash, 
+	marked_ptr_t old_head, size_t old_idx);
+static bool insert_impl(cht_t *h, cht_link_t *item, cht_link_t **dup_item);
+static bool insert_at(cht_link_t *item, const wnd_t *wnd, walk_mode_t walk_mode,
+	bool *resizing);
+static bool has_duplicate(cht_t *h, const cht_link_t *item, size_t hash, 
+	cht_link_t *cur, cht_link_t **dup_item);
+static cht_link_t *find_duplicate(cht_t *h, const cht_link_t *item, size_t hash, 
+	cht_link_t *start);
+static bool remove_pred(cht_t *h, size_t hash, equal_pred_t pred, void *pred_arg);
+static bool delete_at(cht_t *h, wnd_t *wnd, walk_mode_t walk_mode, 
+	bool *deleted_but_gc, bool *resizing);
+static bool mark_deleted(cht_link_t *cur, walk_mode_t walk_mode, bool *resizing);
+static bool unlink_from_pred(wnd_t *wnd, walk_mode_t walk_mode, bool *resizing);
+static bool find_wnd_and_gc_pred(cht_t *h, size_t hash, walk_mode_t walk_mode, 
+	equal_pred_t pred, void *pred_arg, wnd_t *wnd, bool *resizing);
+static bool find_wnd_and_gc(cht_t *h, size_t hash, walk_mode_t walk_mode, 
+	wnd_t *wnd, bool *resizing);
+static bool gc_deleted_node(cht_t *h, walk_mode_t walk_mode, wnd_t *wnd,
+	bool *resizing);
+static bool join_completed(cht_t *h, const wnd_t *wnd);
+static void upd_resizing_head(cht_t *h, size_t hash, marked_ptr_t **phead, 
+	bool *join_finishing,  walk_mode_t *walk_mode);
+static void item_removed(cht_t *h);
+static void item_inserted(cht_t *h);
+static void free_later(cht_t *h, cht_link_t *item);
+static void help_head_move(marked_ptr_t *psrc_head, marked_ptr_t *pdest_head);
+static void start_head_move(marked_ptr_t *psrc_head);
+static void mark_const(marked_ptr_t *psrc_head);
+static void complete_head_move(marked_ptr_t *psrc_head, marked_ptr_t *pdest_head);
+static void split_bucket(cht_t *h, marked_ptr_t *psrc_head, 
+	marked_ptr_t *pdest_head, size_t split_hash);
+static void mark_join_follows(cht_t *h, marked_ptr_t *psrc_head, 
+	size_t split_hash, wnd_t *wnd);
+static void mark_join_node(cht_link_t *join_node);
+static void join_buckets(cht_t *h, marked_ptr_t *psrc_head, 
+	marked_ptr_t *pdest_head, size_t split_hash);
+static void link_to_join_node(cht_t *h, marked_ptr_t *pdest_head, 
+	cht_link_t *join_node, size_t split_hash);
+static void resize_table(work_t *arg);
+static void grow_table(cht_t *h);
+static void shrink_table(cht_t *h);
+static void cleanup_join_node(cht_t *h, marked_ptr_t *new_head);
+static void clear_join_and_gc(cht_t *h, cht_link_t *join_node, 
+	marked_ptr_t *new_head);
+static void cleanup_join_follows(cht_t *h, marked_ptr_t *new_head);
+static marked_ptr_t make_link(const cht_link_t *next, mark_t mark);
+static cht_link_t * get_next(marked_ptr_t link);
+static mark_t get_mark(marked_ptr_t link);
+static void next_wnd(wnd_t *wnd);
+static bool same_node_pred(void *node, const cht_link_t *item2);
+static size_t calc_key_hash(cht_t *h, void *key);
+static size_t node_hash(cht_t *h, const cht_link_t *item);
+static size_t calc_node_hash(cht_t *h, const cht_link_t *item);
+static void memoize_node_hash(cht_t *h, cht_link_t *item);
+static size_t calc_split_hash(size_t split_idx, size_t order);
+static size_t calc_bucket_idx(size_t hash, size_t order);
+static size_t grow_to_split_idx(size_t old_idx);
+static size_t grow_idx(size_t idx);
+static size_t shrink_idx(size_t idx);
+static marked_ptr_t cas_link(marked_ptr_t *link, const cht_link_t *cur_next, 
+	mark_t cur_mark, const cht_link_t *new_next, mark_t new_mark);
+static marked_ptr_t _cas_link(marked_ptr_t *link, marked_ptr_t cur, 
+	marked_ptr_t new);
+static void cas_order_barrier(void);
+
+static void dummy_remove_callback(cht_link_t *item)
+{
+	/* empty */
+}
+
+/** Creates a concurrent hash table.
+ * 
+ * @param h         Valid pointer to a cht_t instance.
+ * @param op        Item specific operations. All operations are compulsory.
+ * @return True if successfully created the table. False otherwise.
+ */
+bool cht_create_simple(cht_t *h, cht_ops_t *op)
+{
+	return cht_create(h, 0, 0, 0, false, op); 
+}
+
+/** Creates a concurrent hash table.
+ * 
+ * @param h         Valid pointer to a cht_t instance.
+ * @param init_size The initial number of buckets the table should contain.
+ *                  The table may be shrunk below this value if deemed necessary.
+ *                  Uses the default value if 0.
+ * @param min_size  Minimum number of buckets that the table should contain.
+ *                  The number of buckets never drops below this value,
+ *                  although it may be rounded up internally as appropriate.
+ *                  Uses the default value if 0.
+ * @param max_load  Maximum average number of items per bucket allowed
+ *                  before the table grows.
+ * @param can_block If true creating the table blocks until enough memory
+ *                  is available (possibly indefinitely). Otherwise, 
+ *                  table creation does not block and returns immediately
+ *                  even if not enough memory is available. 
+ * @param op        Item specific operations. All operations are compulsory.
+ * @return True if successfully created the table. False otherwise.
+ */
+bool cht_create(cht_t *h, size_t init_size, size_t min_size, size_t max_load, 
+	bool can_block, cht_ops_t *op)
+{
+	ASSERT(h);
+	ASSERT(op && op->hash && op->key_hash && op->equal && op->key_equal);
+	/* Memoized hashes are stored in the rcu_link.func function pointer. */
+	ASSERT(sizeof(size_t) == sizeof(rcu_func_t));
+	ASSERT(sentinel.hash == (uintptr_t)sentinel.rcu_link.func);
+
+	/* All operations are compulsory. */
+	if (!op || !op->hash || !op->key_hash || !op->equal || !op->key_equal)
+		return false;
+	
+	size_t min_order = size_to_order(min_size, CHT_MIN_ORDER);
+	size_t order = size_to_order(init_size, min_order);
+	
+	h->b = alloc_buckets(order, false, can_block);
+	
+	if (!h->b)
+		return false;
+	
+	h->max_load = (max_load == 0) ? CHT_MAX_LOAD : max_load;
+	h->min_order = min_order;
+	h->new_b = NULL;
+	h->op = op;
+	atomic_set(&h->item_cnt, 0);
+	atomic_set(&h->resize_reqs, 0);
+	
+	if (NULL == op->remove_callback) {
+		h->op->remove_callback = dummy_remove_callback;
+	}
+	
+	/* 
+	 * Cached item hashes are stored in item->rcu_link.func. Once the item
+	 * is deleted rcu_link.func will contain the value of invalid_hash.
+	 */
+	h->invalid_hash = (uintptr_t)h->op->remove_callback;
+	
+	/* Ensure the initialization takes place before we start using the table. */
+	write_barrier();
+	
+	return true;
+}
+
+/** Allocates and initializes 2^order buckets.
+ * 
+ * All bucket heads are initialized to point to the sentinel node.
+ * 
+ * @param order       The number of buckets to allocate is 2^order.
+ * @param set_invalid Bucket heads are marked invalid if true; otherwise
+ *                    they are marked N_NORMAL.
+ * @param can_block   If true memory allocation blocks until enough memory
+ *                    is available (possibly indefinitely). Otherwise, 
+ *                    memory allocation does not block. 
+ * @return Newly allocated and initialized buckets or NULL if not enough memory.
+ */
+static cht_buckets_t *alloc_buckets(size_t order, bool set_invalid, bool can_block)
+{
+	size_t bucket_cnt = (1 << order);
+	size_t bytes = 
+		sizeof(cht_buckets_t) + (bucket_cnt - 1) * sizeof(marked_ptr_t);
+	cht_buckets_t *b = malloc(bytes, can_block ? 0 : FRAME_ATOMIC);
+	
+	if (!b)
+		return NULL;
+	
+	b->order = order;
+	
+	marked_ptr_t head_link = set_invalid 
+		? make_link(&sentinel, N_INVALID) 
+		: make_link(&sentinel, N_NORMAL);
+	
+	for (size_t i = 0; i < bucket_cnt; ++i) {
+		b->head[i] = head_link;
+	}
+	
+	return b;
+}
+
+/** Returns the smallest k such that bucket_cnt <= 2^k and min_order <= k.*/
+static size_t size_to_order(size_t bucket_cnt, size_t min_order)
+{
+	size_t order = min_order;
+
+	/* Find a power of two such that bucket_cnt <= 2^order */
+	do {
+		if (bucket_cnt <= ((size_t)1 << order))
+			return order;
+		
+		++order;
+	} while (order < CHT_MAX_ORDER);
+	
+	return order;
+}
+
+/** Destroys a CHT successfully created via cht_create().
+ * 
+ * Waits for all outstanding concurrent operations to complete and
+ * frees internal allocated resources. The table is however not cleared
+ * and items already present in the table (if any) are leaked.
+ */
+void cht_destroy(cht_t *h)
+{
+	cht_destroy_unsafe(h);
+	
+	/* You must clear the table of items. Otherwise cht_destroy will leak. */
+	ASSERT(atomic_get(&h->item_cnt) == 0);
+}
+
+/** Destroys a successfully created CHT but does no error checking. */
+void cht_destroy_unsafe(cht_t *h)
+{
+	/* Wait for resize to complete. */
+	while (0 < atomic_get(&h->resize_reqs)) {
+		rcu_barrier();
+	}
+	
+	/* Wait for all remove_callback()s to complete. */
+	rcu_barrier();
+	
+	free(h->b);
+	h->b = NULL;
+}
+
+/** Returns the first item equal to the search key or NULL if not found.
+ * 
+ * The call must be enclosed in a rcu_read_lock() unlock() pair. The 
+ * returned item is guaranteed to be allocated until rcu_read_unlock()
+ * although the item may be concurrently removed from the table by another
+ * cpu.
+ * 
+ * Further items matching the key may be retrieved via cht_find_next().
+ * 
+ * cht_find() sees the effects of any completed cht_remove(), cht_insert().
+ * If a concurrent remove or insert had not yet completed cht_find() may
+ * or may not see the effects of it (eg it may find an item being removed).
+ * 
+ * @param h   CHT to operate on.
+ * @param key Search key as defined by cht_ops_t.key_equal() and .key_hash().
+ * @return First item equal to the key or NULL if such an item does not exist.
+ */
+cht_link_t *cht_find(cht_t *h, void *key)
+{
+	/* find_lazy() asserts that the caller holds the rcu read lock. */
+	/* Make the most recent changes to the table visible. */
+	read_barrier();
+	return cht_find_lazy(h, key);
+}
+
+/** Returns the first item equal to the search key or NULL if not found.
+ * 
+ * Unlike cht_find(), cht_find_lazy() may not see the effects of 
+ * cht_remove() or cht_insert() even though they have already completed.
+ * It may take a couple of milliseconds for those changes to propagate
+ * and become visible to cht_find_lazy(). On the other hand, cht_find_lazy() 
+ * operates a bit faster than cht_find().
+ * 
+ * See cht_find() for more details.
+ */
+cht_link_t *cht_find_lazy(cht_t *h, void *key)
+{
+	/* Same lookup as cht_find() but without the preceding read_barrier(). */
+	return find_lazy(h, key);
+}
+
+/** Finds the first item equal to the search key. */
+static inline cht_link_t *find_lazy(cht_t *h, void *key)
+{
+	ASSERT(h);
+	/* See docs to cht_find() and cht_find_lazy(). */
+	ASSERT(rcu_read_locked());
+	
+	size_t hash = calc_key_hash(h, key);
+	
+	/* Load the current bucket array under rcu protection. */
+	cht_buckets_t *b = rcu_access(h->b);
+	size_t idx = calc_bucket_idx(hash, b->order);
+	/* 
+	 * No need for access_once. b->head[idx] will point to an allocated node 
+	 * even if marked invalid until we exit rcu read section.
+	 */
+	marked_ptr_t head = b->head[idx];
+	
+	/* Undergoing a resize - take the slow path. */
+	if (N_INVALID == get_mark(head))
+		return find_resizing(h, key, hash, head, idx);
+	
+	return search_bucket(h, head, key, hash);
+}
+
+/** Returns the next item matching \a item. 
+ * 
+ * Must be enclosed in a rcu_read_lock()/unlock() pair. Effects of 
+ * completed cht_remove(), cht_insert() are guaranteed to be visible
+ * to cht_find_next().
+ * 
+ * See cht_find() for more details.  
+ */
+cht_link_t *cht_find_next(cht_t *h, const cht_link_t *item)
+{
+	/* cht_find_next_lazy() asserts that the rcu read lock is held. */
+	/* Make the most recent changes to the table visible. */
+	read_barrier();
+	return cht_find_next_lazy(h, item);
+}
+
+/** Returns the next item matching \a item. 
+ * 
+ * Must be enclosed in a rcu_read_lock()/unlock() pair. Effects of 
+ * completed cht_remove(), cht_insert() may or may not be visible
+ * to cht_find_next_lazy().
+ * 
+ * See cht_find_lazy() for more details.  
+ */
+cht_link_t *cht_find_next_lazy(cht_t *h, const cht_link_t *item)
+{
+	ASSERT(h);
+	ASSERT(rcu_read_locked());
+	ASSERT(item);
+	
+	/* Resume the scan right after \a item in its bucket chain. */
+	return find_duplicate(h, item, calc_node_hash(h, item), get_next(item->link));
+}
+
+/** Searches the bucket at head for key using search_hash. */
+static inline cht_link_t *search_bucket(cht_t *h, marked_ptr_t head, void *key, 
+	size_t search_hash)
+{
+	/* 
+	 * It is safe to access nodes even outside of this bucket (eg when
+	 * splitting the bucket). The resizer makes sure that any node we 
+	 * may find by following the next pointers is allocated.
+	 */
+
+	cht_link_t *cur = NULL;
+	/* prev holds the last link value we followed (initially the head). */
+	marked_ptr_t prev = head;
+
+try_again:
+	/* Filter out items with different hashes. */
+	do {
+		cur = get_next(prev);
+		ASSERT(cur);
+		prev = cur->link;
+	} while (node_hash(h, cur) < search_hash);
+	
+	/* 
+	 * Only search for an item with an equal key if cur is not the sentinel
+	 * node or a node with a different hash. 
+	 */
+	while (node_hash(h, cur) == search_hash) {
+		if (h->op->key_equal(key, cur)) {
+			if (!(N_DELETED & get_mark(cur->link)))
+				return cur;
+		}
+		
+		cur = get_next(cur->link);
+		ASSERT(cur);
+	} 
+	
+	/* 
+	 * In the unlikely case that we have encountered a node whose cached
+	 * hash has been overwritten due to a pending rcu_call for it, skip
+	 * the node and try again.
+	 */
+	if (node_hash(h, cur) == h->invalid_hash) {
+		prev = cur->link;
+		goto try_again;
+	}
+	
+	return NULL;
+}
+
+/** Searches for the key while the table is undergoing a resize. */
+static cht_link_t *find_resizing(cht_t *h, void *key, size_t hash, 
+	marked_ptr_t old_head, size_t old_idx)
+{
+	ASSERT(N_INVALID == get_mark(old_head)); 
+	ASSERT(h->new_b);
+	
+	/* The old head is invalid, so the new table is being populated. */
+	size_t new_idx = calc_bucket_idx(hash, h->new_b->order);
+	marked_ptr_t new_head = h->new_b->head[new_idx];
+	marked_ptr_t search_head = new_head;
+	
+	/* Growing. */
+	if (h->b->order < h->new_b->order) {
+		/* 
+		 * Old bucket head is invalid, so it must have been already
+		 * moved. Make the new head visible if still not visible, ie
+		 * invalid.
+		 */
+		if (N_INVALID == get_mark(new_head)) {
+			/* 
+			 * We should be searching a newly added bucket but the old
+			 * moved bucket has not yet been split (its marked invalid) 
+			 * or we have not yet seen the split. 
+			 */
+			if (grow_idx(old_idx) != new_idx) {
+				/* 
+				 * Search the moved bucket. It is guaranteed to contain
+				 * items of the newly added bucket that were present
+				 * before the moved bucket was split.
+				 */
+				new_head = h->new_b->head[grow_idx(old_idx)];
+			}
+			
+			/* new_head is now the moved bucket, either valid or invalid. */
+			
+			/* 
+			 * The old bucket was definitely moved to new_head but the
+			 * change of new_head had not yet propagated to this cpu.
+			 */
+			if (N_INVALID == get_mark(new_head)) {
+				/*
+				 * We could issue a read_barrier() and make the now valid
+				 * moved bucket head new_head visible, but instead fall back
+				 * on using the old bucket. Although the old bucket head is 
+				 * invalid, it points to a node that is allocated and in the 
+				 * right bucket. Before the node can be freed, it must be
+				 * unlinked from the head (or another item after that item
+				 * modified the new_head) and a grace period must elapse. 
+				 * As a result had the node been already freed the grace
+				 * period preceeding the free() would make the unlink and
+				 * any changes to new_head visible. Therefore, it is safe
+				 * to use the node pointed to from the old bucket head.
+				 */
+
+				search_head = old_head;
+			} else {
+				search_head = new_head;
+			}
+		}
+		
+		return search_bucket(h, search_head, key, hash);
+	} else if (h->b->order > h->new_b->order) {
+		/* Shrinking. */
+		
+		/* Index of the bucket in the old table that was moved. */
+		size_t move_src_idx = grow_idx(new_idx);
+		marked_ptr_t moved_old_head = h->b->head[move_src_idx];
+		
+		/*
+		 * h->b->head[move_src_idx] had already been moved to new_head 
+		 * but the change to new_head had not yet propagated to us.
+		 */
+		if (N_INVALID == get_mark(new_head)) {
+			/*
+			 * new_head is definitely valid and we could make it visible 
+			 * to this cpu with a read_barrier(). Instead, use the bucket 
+			 * in the old table that was moved even though it is now marked 
+			 * as invalid. The node it points to must be allocated because
+			 * a grace period would have to elapse before it could be freed;
+			 * and the grace period would make the now valid new_head 
+			 * visible to all cpus. 
+			 * 
+			 * Note that move_src_idx may not be the same as old_idx.
+			 * If move_src_idx != old_idx then old_idx is the bucket
+			 * in the old table that is not moved but instead it is
+			 * appended to the moved bucket, ie it is added at the tail
+			 * of new_head. In that case an invalid old_head notes that
+			 * it had already been merged into (the moved) new_head. 
+			 * We will try to search that bucket first because it
+			 * may contain some newly added nodes after the bucket 
+			 * join. Moreover, the bucket joining link may already be 
+			 * visible even if new_head is not. Therefore, if we're
+			 * lucky we'll find the item via moved_old_head. In any
+			 * case, we'll retry in proper old_head if not found.
+			 */
+			search_head = moved_old_head;
+		}
+		
+		cht_link_t *ret = search_bucket(h, search_head, key, hash);
+		
+		if (ret)
+			return ret;
+		/*
+		 * Bucket old_head was already joined with moved_old_head
+		 * in the new table but we have not yet seen change of the
+		 * joining link (or the item is not in the table).
+		 */
+		if (move_src_idx != old_idx && get_next(old_head) != &sentinel) {
+			/*
+			 * Note that old_head (the bucket to be merged into new_head) 
+			 * points to an allocated join node (if non-null) even if marked 
+			 * invalid. Before the resizer lets join nodes to be unlinked
+			 * (and freed) it sets old_head to NULL and waits for a grace period.
+			 * So either the invalid old_head points to join node; or old_head
+			 * is null and we would have seen a completed bucket join while
+			 * traversing search_head.
+			 */
+			ASSERT(N_JOIN & get_mark(get_next(old_head)->link));
+			return search_bucket(h, old_head, key, hash);
+		}
+		
+		return NULL;
+	} else {
+		/* 
+		 * Resize is almost done. The resizer is waiting to make
+		 * sure all cpus see that the new table replaced the old one.
+		 */
+		ASSERT(h->b->order == h->new_b->order);
+		/* 
+		 * The resizer must ensure all new bucket heads are visible before
+		 * replacing the old table.
+		 */
+		ASSERT(N_NORMAL == get_mark(new_head));
+		return search_bucket(h, new_head, key, hash);
+	}
+}
+
+/** Inserts an item. Succeeds even if an equal item is already present. */
+void cht_insert(cht_t *h, cht_link_t *item)
+{
+	/* NULL dup_item - insert_impl() skips the duplicate check. */
+	insert_impl(h, item, NULL);
+}
+
+/** Inserts a unique item. Returns false if an equal item was already present. 
+ * 
+ * Use this function to atomically check if an equal/duplicate item had
+ * not yet been inserted into the table and to insert this item into the 
+ * table.
+ * 
+ * The following is @e NOT thread-safe, so do not use:
+ * @code
+ * if (!cht_find(h, key)) {
+ *     // A concurrent insert here may go unnoticed by cht_find() above.
+ *     item = malloc(..);
+ *     cht_insert(h, item);
+ *     // Now we may have two items with equal search keys.
+ * }
+ * @endcode
+ * 
+ * Replace such code with:
+ * @code
+ * item = malloc(..);
+ * if (!cht_insert_unique(h, item, &dup_item)) {
+ *     // Whoops, someone beat us to it - an equal item 'dup_item'
+ *     // had already been inserted.
+ *     free(item); 
+ * } else {
+ *     // Successfully inserted the item and we are guaranteed that
+ *     // there are no other equal items.
+ * }
+ * @endcode
+ * 
+ */
+bool cht_insert_unique(cht_t *h, cht_link_t *item, cht_link_t **dup_item)
+{
+	ASSERT(rcu_read_locked());
+	/* Unlike in cht_insert(), dup_item is mandatory here. */
+	ASSERT(dup_item);
+	return insert_impl(h, item, dup_item);
+}
+
+/** Inserts the item into the table and checks for duplicates if dup_item.
+ *
+ * @param h        CHT to operate on.
+ * @param item     Item to insert; its node hash is memoized first.
+ * @param dup_item If not NULL, insertion fails when an equal item is
+ *                 already present and *dup_item is set to point to it.
+ * @return True if the item was inserted; false if a duplicate was found
+ *         (only possible when dup_item is not NULL).
+ */
+static bool insert_impl(cht_t *h, cht_link_t *item, cht_link_t **dup_item)
+{
+	rcu_read_lock();
+
+	cht_buckets_t *b = rcu_access(h->b);
+	memoize_node_hash(h, item);
+	size_t hash = node_hash(h, item);
+	size_t idx = calc_bucket_idx(hash, b->order);
+	marked_ptr_t *phead = &b->head[idx];
+
+	bool resizing = false;
+	bool inserted = false;
+	
+	do {
+		walk_mode_t walk_mode = WM_NORMAL;
+		bool join_finishing;
+		
+		/* A head mark other than N_NORMAL signals an ongoing resize. */
+		resizing = resizing || (N_NORMAL != get_mark(*phead));
+		
+		/* The table is resizing. Get the correct bucket head. */
+		if (resizing) {
+			upd_resizing_head(h, hash, &phead, &join_finishing, &walk_mode);
+		}
+		
+		wnd_t wnd = {
+			.ppred = phead,
+			.cur = get_next(*phead),
+			.last = NULL
+		};
+		
+		if (!find_wnd_and_gc(h, hash, walk_mode, &wnd, &resizing)) {
+			/* Could not GC a node; or detected an unexpected resize. */
+			continue;
+		}
+		
+		if (dup_item && has_duplicate(h, item, hash, wnd.cur, dup_item)) {
+			rcu_read_unlock();
+			return false;
+		}
+		
+		inserted = insert_at(item, &wnd, walk_mode, &resizing);		
+	} while (!inserted);
+	
+	rcu_read_unlock();
+
+	item_inserted(h);
+	return true;
+}
+
+/** Inserts item between wnd.ppred and wnd.cur. 
+ * 
+ * @param item      Item to link to wnd.ppred and wnd.cur.
+ * @param wnd       The item will be inserted before wnd.cur. Wnd.ppred
+ *                  must be N_NORMAL.
+ * @param walk_mode Bucket chain walk mode.
+ * @param resizing  Set to true only if the table is undergoing resize 
+ *         and it was not expected (ie walk_mode == WM_NORMAL).
+ * @return True if the item was successfully linked to wnd.ppred. False
+ *         if whole insert operation must be retried because the predecessor
+ *         of wnd.cur has changed.
+ */
+inline static bool insert_at(cht_link_t *item, const wnd_t *wnd, 
+	walk_mode_t walk_mode, bool *resizing)
+{
+	marked_ptr_t ret;
+	
+	if (walk_mode == WM_NORMAL) {
+		item->link = make_link(wnd->cur, N_NORMAL);
+		/* Initialize the item before adding it to a bucket. */
+		memory_barrier();
+		
+		/* Link a clean/normal predecessor to the item. */
+		ret = cas_link(wnd->ppred, wnd->cur, N_NORMAL, item, N_NORMAL);
+		
+		if (ret == make_link(wnd->cur, N_NORMAL)) {
+			return true;
+		} else {
+			/* This includes an invalid head but not a const head. */
+			*resizing = ((N_JOIN_FOLLOWS | N_JOIN) & get_mark(ret));
+			return false;
+		}
+	} else if (walk_mode == WM_MOVE_JOIN_FOLLOWS) {
+		/* Move JOIN_FOLLOWS mark but filter out the DELETED mark. */
+		mark_t jf_mark = get_mark(*wnd->ppred) & N_JOIN_FOLLOWS;
+		item->link = make_link(wnd->cur, jf_mark);
+		/* Initialize the item before adding it to a bucket. */
+		memory_barrier();
+		
+		/* Link the not-deleted predecessor to the item. Move its JF mark. */
+		ret = cas_link(wnd->ppred, wnd->cur, jf_mark, item, N_NORMAL);
+		
+		return ret == make_link(wnd->cur, jf_mark);
+	} else {
+		ASSERT(walk_mode == WM_LEAVE_JOIN);
+
+		item->link = make_link(wnd->cur, N_NORMAL);
+		/* Initialize the item before adding it to a bucket. */
+		memory_barrier();
+		
+		mark_t pred_mark = get_mark(*wnd->ppred);
+		/* If the predecessor is a join node it may be marked deleted.*/
+		mark_t exp_pred_mark = (N_JOIN & pred_mark) ? pred_mark : N_NORMAL;
+
+		ret = cas_link(wnd->ppred, wnd->cur, exp_pred_mark, item, exp_pred_mark);
+		return ret == make_link(wnd->cur, exp_pred_mark);
+	}
+}
+
+/** Returns true if the chain starting at cur has an item equal to \a item.
+ * 
+ * @param h    CHT to operate on.
+ * @param item Item whose duplicates the function looks for.
+ * @param hash Hash of \a item.
+ * @param[in] cur  The first node with a hash greater than or equal to item's hash.
+ * @param[out] dup_item The first duplicate item encountered.
+ * @return True if a non-deleted item equal to \a item exists in the table.
+ */
+static inline bool has_duplicate(cht_t *h, const cht_link_t *item, size_t hash, 
+	cht_link_t *cur, cht_link_t **dup_item)
+{
+	ASSERT(cur);
+	ASSERT(cur == &sentinel || hash <= node_hash(h, cur)
+		|| node_hash(h, cur) == h->invalid_hash);
+	
+	/* hash < node_hash(h, cur) */
+	if (hash != node_hash(h, cur) && h->invalid_hash != node_hash(h, cur))
+		return false;
+
+	/* 
+	 * Load the most recent node marks. Otherwise we might pronounce a 
+	 * logically deleted node for a duplicate of the item just because 
+	 * the deleted node's DEL mark had not yet propagated to this cpu.
+	 */
+	read_barrier();
+	
+	*dup_item = find_duplicate(h, item, hash, cur);
+	return NULL != *dup_item;
+}
+
+/** Returns an item that is equal to \a item starting in a chain at \a start. */
+static cht_link_t *find_duplicate(cht_t *h, const cht_link_t *item, size_t hash, 
+	cht_link_t *start)
+{
+	ASSERT(hash <= node_hash(h, start) || h->invalid_hash == node_hash(h, start));
+
+	cht_link_t *cur = start;
+	
+try_again:	
+	/* cur is always an allocated node; the sentinel terminates the walk. */
+	ASSERT(cur);
+
+	while (node_hash(h, cur) == hash) {
+		ASSERT(cur != &sentinel);
+		
+		bool deleted = (N_DELETED & get_mark(cur->link));
+		
+		/* Skip logically deleted nodes. */
+		if (!deleted && h->op->equal(item, cur))
+			return cur;
+		
+		cur = get_next(cur->link);
+		ASSERT(cur);
+	} 
+
+	/* Skip logically deleted nodes with rcu_call() in progress. */
+	if (h->invalid_hash == node_hash(h, cur)) {
+		cur = get_next(cur->link);
+		goto try_again;
+	}
+	
+	return NULL;
+}
+
+/** Removes all items matching the search key. Returns the number of items removed.*/
+size_t cht_remove_key(cht_t *h, void *key)
+{
+	ASSERT(h);
+	
+	size_t hash = calc_key_hash(h, key);
+	size_t removed = 0;
+	
+	/* remove_pred() enters its own rcu read section for each removal. */
+	while (remove_pred(h, hash, h->op->key_equal, key)) 
+		++removed;
+	
+	return removed;
+}
+
+/** Removes a specific item from the table. 
+ * 
+ * The caller must hold the rcu read lock. 
+ * 
+ * @param item Item presumably present in the table and to be removed.
+ * @return True if the item was removed successfully; or false if it had
+ *     already been deleted. 
+ */
+bool cht_remove_item(cht_t *h, cht_link_t *item)
+{
+	ASSERT(h);
+	ASSERT(item);
+	/* Otherwise a concurrent cht_remove_key might free the item. */
+	ASSERT(rcu_read_locked());
+
+	/* 
+	 * Even though we know the node we want to delete we must unlink it
+	 * from the correct bucket and from a clean/normal predecessor. Therefore, 
+	 * we search for it again from the beginning of the correct bucket.
+	 */
+	size_t hash = calc_node_hash(h, item);
+	return remove_pred(h, hash, same_node_pred, item);
+}
+
+/** Removes an item equal to pred_arg according to the predicate pred.
+ *
+ * @param h        CHT to operate on.
+ * @param hash     Hash of the item to remove.
+ * @param pred     Equality predicate; invoked as pred(pred_arg, node).
+ * @param pred_arg Opaque argument passed to \a pred.
+ * @return True if a matching item was deleted; false if none was found.
+ */
+static bool remove_pred(cht_t *h, size_t hash, equal_pred_t pred, void *pred_arg)
+{
+	rcu_read_lock();
+	
+	bool resizing = false;
+	bool deleted = false;
+	bool deleted_but_gc = false;
+	
+	cht_buckets_t *b = rcu_access(h->b);
+	size_t idx = calc_bucket_idx(hash, b->order);
+	marked_ptr_t *phead = &b->head[idx];
+	
+	do {
+		walk_mode_t walk_mode = WM_NORMAL;
+		bool join_finishing = false;
+		
+		/* A head mark other than N_NORMAL signals an ongoing resize. */
+		resizing = resizing || (N_NORMAL != get_mark(*phead));
+		
+		/* The table is resizing. Get the correct bucket head. */
+		if (resizing) {
+			upd_resizing_head(h, hash, &phead, &join_finishing, &walk_mode);
+		}
+		
+		wnd_t wnd = {
+			.ppred = phead,
+			.cur = get_next(*phead),
+			.last = NULL
+		};
+		
+		if (!find_wnd_and_gc_pred(
+			h, hash, walk_mode, pred, pred_arg, &wnd, &resizing)) {
+			/* Could not GC a node; or detected an unexpected resize. */
+			continue;
+		}
+		
+		/* 
+		 * The item lookup is affected by a bucket join but effects of
+		 * the bucket join have not been seen while searching for the item.
+		 */
+		if (join_finishing && !join_completed(h, &wnd)) {
+			/* 
+			 * Bucket was appended at the end of another but the next 
+			 * ptr linking them together was not visible on this cpu. 
+			 * join_completed() makes this appended bucket visible.
+			 */
+			continue;
+		}
+		
+		/* Already deleted, but delete_at() requested one GC pass. */
+		if (deleted_but_gc)
+			break;
+		
+		bool found = (wnd.cur != &sentinel && pred(pred_arg, wnd.cur));
+		
+		if (!found) {
+			rcu_read_unlock();
+			return false;
+		}
+		
+		deleted = delete_at(h, &wnd, walk_mode, &deleted_but_gc, &resizing);		
+	} while (!deleted || deleted_but_gc);
+	
+	rcu_read_unlock();
+	return true;
+}
+
+/** Unlinks wnd.cur from wnd.ppred and schedules a deferred free for the item.
+ * 
+ * Ignores nodes marked N_JOIN if walk mode is WM_LEAVE_JOIN.
+ * 
+ * @param h   CHT to operate on.
+ * @param wnd Points to the item to delete and its N_NORMAL predecessor.
+ * @param walk_mode Bucket chain walk mode.
+ * @param deleted_but_gc Set to true if the item had been logically deleted, 
+ *         but a garbage collecting walk of the bucket is in order for
+ *         it to be fully unlinked.         
+ * @param resizing Set to true if the table is undergoing an unexpected
+ *         resize (ie walk_mode == WM_NORMAL).
+ * @return False if the wnd.ppred changed in the meantime and the whole
+ *         delete operation must be retried.
+ */
+static inline bool delete_at(cht_t *h, wnd_t *wnd, walk_mode_t walk_mode, 
+	bool *deleted_but_gc, bool *resizing)
+{
+	ASSERT(wnd->cur && wnd->cur != &sentinel);
+	
+	*deleted_but_gc = false;
+	
+	if (!mark_deleted(wnd->cur, walk_mode, resizing)) {
+		/* Already deleted, or unexpectedly marked as JOIN/JOIN_FOLLOWS. */
+		return false;
+	}
+	
+	/* Marked deleted. Unlink from the bucket. */
+	
+	/* Never unlink join nodes. */
+	if (walk_mode == WM_LEAVE_JOIN && (N_JOIN & get_mark(wnd->cur->link)))
+		return true;
+	
+	cas_order_barrier();
+	
+	if (unlink_from_pred(wnd, walk_mode, resizing)) {
+		free_later(h, wnd->cur);
+	} else {
+		*deleted_but_gc = true;
+	}
+	
+	return true;
+}
+
+/** Marks cur logically deleted. Returns false to request a retry. */
+static inline bool mark_deleted(cht_link_t *cur, walk_mode_t walk_mode, 
+	bool *resizing)
+{
+	ASSERT(cur && cur != &sentinel);
+	
+	/* 
+	 * Btw, we could loop here if the cas fails but let's not complicate
+	 * things and let's retry from the head of the bucket. 
+	 */
+	
+	cht_link_t *next = get_next(cur->link);
+	
+	if (walk_mode == WM_NORMAL) {
+		/* Only mark clean/normal nodes - JF/JN is used only during resize. */
+		marked_ptr_t ret = cas_link(&cur->link, next, N_NORMAL, next, N_DELETED);
+		
+		if (ret != make_link(next, N_NORMAL)) {
+			/* A resize is only unexpected (and reported) in WM_NORMAL mode. */
+			*resizing = (N_JOIN | N_JOIN_FOLLOWS) & get_mark(ret);
+			return false;
+		}
+	} else {
+		ASSERT(N_JOIN == N_JOIN_FOLLOWS);
+		
+		/* Keep the N_JOIN/N_JOIN_FOLLOWS mark but strip N_DELETED. */
+		mark_t cur_mark = get_mark(cur->link) & N_JOIN_FOLLOWS;
+		
+		marked_ptr_t ret = 
+			cas_link(&cur->link, next, cur_mark, next, cur_mark | N_DELETED);
+		
+		if (ret != make_link(next, cur_mark))
+			return false;
+	} 
+	
+	return true;
+}
+
+/** Unlinks wnd.cur from wnd.ppred. Returns false if it should be retried. */
+static inline bool unlink_from_pred(wnd_t *wnd, walk_mode_t walk_mode, 
+	bool *resizing)
+{
+	ASSERT(wnd->cur != &sentinel);
+	ASSERT(wnd->cur && (N_DELETED & get_mark(wnd->cur->link)));
+	
+	cht_link_t *next = get_next(wnd->cur->link);
+		
+	if (walk_mode == WM_LEAVE_JOIN) {
+		/* Never try to unlink join nodes. */
+		ASSERT(!(N_JOIN & get_mark(wnd->cur->link)));
+
+		mark_t pred_mark = get_mark(*wnd->ppred);
+		/* Succeed only if the predecessor is clean/normal or a join node. */
+		mark_t exp_pred_mark = (N_JOIN & pred_mark) ? pred_mark : N_NORMAL;
+		
+		marked_ptr_t pred_link = make_link(wnd->cur, exp_pred_mark);
+		marked_ptr_t next_link = make_link(next, exp_pred_mark);
+		
+		/* A failed CAS is simply retried; *resizing is left untouched here. */
+		if (pred_link != _cas_link(wnd->ppred, pred_link, next_link))
+			return false;
+	} else {
+		ASSERT(walk_mode == WM_MOVE_JOIN_FOLLOWS || walk_mode == WM_NORMAL);
+		/* Move the JF mark if set. Clear DEL mark. */
+		mark_t cur_mark = N_JOIN_FOLLOWS & get_mark(wnd->cur->link);
+		
+		/* The predecessor must be clean/normal. */
+		marked_ptr_t pred_link = make_link(wnd->cur, N_NORMAL);
+		/* Link to cur's successor keeping/copying cur's JF mark. */
+		marked_ptr_t next_link = make_link(next, cur_mark);		
+		
+		marked_ptr_t ret = _cas_link(wnd->ppred, pred_link, next_link);
+		
+		if (pred_link != ret) {
+			/* If we're not resizing the table there are no JF/JN nodes. */
+			*resizing = (walk_mode == WM_NORMAL) 
+				&& (N_JOIN_FOLLOWS & get_mark(ret));
+			return false;
+		}
+	}
+	
+	return true;
+}
+
+/** Finds the first non-deleted item equal to \a pred_arg according to \a pred.
+ * 
+ * The function returns the candidate item in \a wnd. Logically deleted
+ * nodes are garbage collected so the predecessor will most likely not
+ * be marked as deleted. 
+ * 
+ * Unlike find_wnd_and_gc(), this function never returns a node that
+ * is known to have already been marked N_DELETED.
+ *
+ * Any logically deleted nodes (ie those marked N_DELETED) are garbage
+ * collected, ie free in the background via rcu_call (except for join-nodes
+ * if walk_mode == WM_LEAVE_JOIN).
+ * 
+ * @param h         CHT to operate on.
+ * @param hash      Hash to search for.
+ * @param walk_mode Bucket chain walk mode.
+ * @param pred      Predicate used to find an item equal to pred_arg.
+ * @param pred_arg  Argument to pass to the equality predicate \a pred.
+ * @param[in,out] wnd The search starts with wnd.cur. If the desired
+ *                  item is found wnd.cur will point to it.
+ * @param resizing  Set to true if the table is resizing but it was not
+ *                  expected (ie walk_mode == WM_NORMAL).
+ * @return False if the operation has to be retried. True otherwise 
+ *        (even if an equal item had not been found).
+ */
+static bool find_wnd_and_gc_pred(cht_t *h, size_t hash, walk_mode_t walk_mode, 
+	equal_pred_t pred, void *pred_arg, wnd_t *wnd, bool *resizing)
+{
+	ASSERT(wnd->cur);
+	
+	if (wnd->cur == &sentinel)
+		return true;
+	
+	/* 
+	 * A read barrier is not needed here to bring up the most recent 
+	 * node marks (esp the N_DELETED). At worst we'll try to delete
+	 * an already deleted node; fail in delete_at(); and retry.
+	 */
+	
+	size_t cur_hash;
+
+try_again:	
+	cur_hash = node_hash(h, wnd->cur);
+		
+	while (cur_hash <= hash) {
+		ASSERT(wnd->cur && wnd->cur != &sentinel);
+		
+		/* GC any deleted nodes on the way. */
+		if (N_DELETED & get_mark(wnd->cur->link)) {
+			if (!gc_deleted_node(h, walk_mode, wnd, resizing)) {
+				/* Retry from the head of a bucket. */
+				return false;
+			}
+		} else {
+			/* Is this the node we were looking for? */
+			if (cur_hash == hash && pred(pred_arg, wnd->cur))
+				return true;
+			
+			next_wnd(wnd);
+		}
+		
+		cur_hash = node_hash(h, wnd->cur);
+	}
+	
+	if (cur_hash == h->invalid_hash) {
+		next_wnd(wnd);
+		ASSERT(wnd->cur);
+		goto try_again;
+	}
+	
+	/* The searched for node is not in the current bucket. */
+	return true;
+}
+
+/** Find the first item (deleted or not) with a hash greater or equal to \a hash.
+ * 
+ * The function returns the first item with a hash that is greater or 
+ * equal to \a hash in \a wnd. Moreover it garbage collects logically
+ * deleted nodes that have not yet been unlinked and freed. Therefore,
+ * the returned node's predecessor will most likely be N_NORMAL.
+ * 
+ * Unlike find_wnd_and_gc_pred(), this function may return a node
+ * that is known to have been marked N_DELETED.
+ *  
+ * @param h         CHT to operate on.
+ * @param hash      Hash of the item to find.
+ * @param walk_mode Bucket chain walk mode.
+ * @param[in,out] wnd wnd.cur denotes the first node of the chain. If the 
+ *                  the operation is successful, \a wnd points to the desired 
+ *                  item.
+ * @param resizing  Set to true if a table resize was detected but walk_mode
+ *                  suggested the table was not undergoing a resize.
+ * @return False indicates the operation must be retried. True otherwise 
+ *       (even if an item with exactly the same hash was not found).
+ */
+static bool find_wnd_and_gc(cht_t *h, size_t hash, walk_mode_t walk_mode, 
+	wnd_t *wnd, bool *resizing)
+{
+try_again:
+	ASSERT(wnd->cur);
+
+	while (node_hash(h, wnd->cur) < hash) {
+		/* GC any deleted nodes along the way to our desired node. */
+		if (N_DELETED & get_mark(wnd->cur->link)) {
+			if (!gc_deleted_node(h, walk_mode, wnd, resizing)) {
+				/* Failed to remove the garbage node. Retry. */
+				return false;
+			}
+		} else {
+			next_wnd(wnd);
+		}
+		
+		ASSERT(wnd->cur);
+	}
+	
+	if (node_hash(h, wnd->cur) == h->invalid_hash) {
+		next_wnd(wnd);
+		goto try_again;
+	}
+
+	/* wnd->cur may be NULL or even marked N_DELETED. */
+	return true;
+}
+
+/** Garbage collects the N_DELETED node at \a wnd skipping join nodes. */
+static bool gc_deleted_node(cht_t *h, walk_mode_t walk_mode, wnd_t *wnd,
+	bool *resizing)
+{
+	ASSERT(N_DELETED & get_mark(wnd->cur->link));
+
+	/* Skip deleted JOIN nodes. */
+	if (walk_mode == WM_LEAVE_JOIN && (N_JOIN & get_mark(wnd->cur->link))) {
+		/* Deleted join nodes are left in place and merely stepped over. */
+		next_wnd(wnd);
+	} else {
+		/* Ordinary deleted node or a deleted JOIN_FOLLOWS. */
+		ASSERT(walk_mode != WM_LEAVE_JOIN 
+			|| !((N_JOIN | N_JOIN_FOLLOWS) & get_mark(wnd->cur->link)));
+
+		/* Unlink an ordinary deleted node, move JOIN_FOLLOWS mark. */
+		if (!unlink_from_pred(wnd, walk_mode, resizing)) {
+			/* Retry. The predecessor was deleted, invalid, const, join_follows. */
+			return false;
+		}
+
+		free_later(h, wnd->cur);
+
+		/* Leave ppred as is. */
+		wnd->last = wnd->cur;
+		wnd->cur = get_next(wnd->cur->link);
+	}
+	
+	return true;
+}
+
+/** Returns true if a bucket join had already completed.
+ * 
+ * May only be called if upd_resizing_head() indicates a bucket join 
+ * may be in progress.
+ * 
+ * If it returns false, the search must be retried in order to guarantee
+ * all item that should have been encountered have been seen.
+ */
+static bool join_completed(cht_t *h, const wnd_t *wnd)
+{
+	/* 
+	 * The table is shrinking and the searched for item is in a bucket 
+	 * appended to another. Check that the link joining these two buckets 
+	 * is visible and if not, make it visible to this cpu.
+	 */
+	
+	/* 
+	 * Resizer ensures h->b->order stays the same for the duration of this 
+	 * func. We got here because there was an alternative head to search.
+	 * The resizer waits for all preexisting readers to finish after
+	 * it publishes the new table, so both tables remain valid while we
+	 * run. (NOTE(review): the original comment was cut off mid-sentence;
+	 * confirm the intended wording.)
+	 */
+	ASSERT(h->b->order > h->new_b->order);
+	ASSERT(wnd->cur);
+	
+	/* Either we did not need the joining link or we have already followed it.*/
+	if (wnd->cur != &sentinel)
+		return true;
+	
+	/* We have reached the end of a bucket. */
+	
+	if (wnd->last != &sentinel) {
+		size_t last_seen_hash = node_hash(h, wnd->last);
+		
+		if (last_seen_hash == h->invalid_hash) {
+			last_seen_hash = calc_node_hash(h, wnd->last);
+		}
+		
+		size_t last_old_idx = calc_bucket_idx(last_seen_hash, h->b->order);
+		size_t move_src_idx = grow_idx(shrink_idx(last_old_idx));
+		
+		/* 
+		 * Last node seen was in the joining bucket - if the searched 
+		 * for node is there we will find it. 
+		 */
+		if (move_src_idx != last_old_idx) 
+			return true;
+	}
+	
+	/* 
+	 * Reached the end of the bucket but no nodes from the joining bucket
+	 * were seen. There should have at least been a JOIN node so we have
+	 * definitely not seen (and followed) the joining link. Make the link
+	 * visible and retry.
+	 */
+	read_barrier();
+	return false;
+}
+
+/** When resizing returns the bucket head to start the search with in \a phead.
+ * 
+ * If a resize has been detected (eg cht_t.b.head[idx] is marked immutable),
+ * upd_resizing_head() moves the bucket for \a hash from the old head
+ * to the new head. Moreover, it splits or joins buckets as necessary.
+ * 
+ * @param h     CHT to operate on.
+ * @param hash  Hash of an item whose chain we would like to traverse.
+ * @param[out] phead Head of the bucket to search for \a hash.
+ * @param[out] join_finishing Set to true if a bucket join might be
+ *              in progress and the bucket may have to be traversed again
+ *              as indicated by join_completed().
+ * @param[out] walk_mode Specifies how to interpret node marks.  
+ */
+static void upd_resizing_head(cht_t *h, size_t hash, marked_ptr_t **phead, 
+	bool *join_finishing,  walk_mode_t *walk_mode)
+{
+	cht_buckets_t *b = rcu_access(h->b);
+	size_t old_idx = calc_bucket_idx(hash, b->order);
+	size_t new_idx = calc_bucket_idx(hash, h->new_b->order);
+	
+	marked_ptr_t *pold_head = &b->head[old_idx];
+	marked_ptr_t *pnew_head = &h->new_b->head[new_idx];
+	
+	/* In any case, use the bucket in the new table. */
+	*phead = pnew_head;
+
+	/* Growing the table. */
+	if (b->order < h->new_b->order) {
+		size_t move_dest_idx = grow_idx(old_idx);
+		marked_ptr_t *pmoved_head = &h->new_b->head[move_dest_idx];
+		
+		/* Complete moving the bucket from the old to the new table. */
+		help_head_move(pold_head, pmoved_head);
+		
+		/* The hash belongs to the moved bucket. */
+		if (move_dest_idx == new_idx) {
+			ASSERT(pmoved_head == pnew_head);
+			/* 
+			 * move_head() makes the new head of the moved bucket visible. 
+			 * The new head may be marked with a JOIN_FOLLOWS.
+			 */
+			ASSERT(!(N_CONST & get_mark(*pmoved_head)));
+			*walk_mode = WM_MOVE_JOIN_FOLLOWS;
+		} else {
+			ASSERT(pmoved_head != pnew_head);
+			/* 
+			 * The hash belongs to the bucket that is the result of splitting 
+			 * the old/moved bucket, ie the bucket that contains the second
+			 * half of the split/old/moved bucket.
+			 */
+			
+			/* The moved bucket has not yet been split. */
+			if (N_NORMAL != get_mark(*pnew_head)) {
+				size_t split_hash = calc_split_hash(new_idx, h->new_b->order);
+				split_bucket(h, pmoved_head, pnew_head, split_hash);
+				/* 
+				 * split_bucket() makes the new head visible. No 
+				 * JOIN_FOLLOWS in this part of split bucket.
+				 */
+				ASSERT(N_NORMAL == get_mark(*pnew_head));
+			}
+			
+			*walk_mode = WM_LEAVE_JOIN;
+		}
+	} else if (h->new_b->order < b->order ) {
+		/* Shrinking the table. */
+		
+		size_t move_src_idx = grow_idx(new_idx);
+		
+		/* 
+		 * Complete moving the bucket from the old to the new table. 
+		 * Makes a valid pnew_head visible if already moved.
+		 */
+		help_head_move(&b->head[move_src_idx], pnew_head);
+		
+		/* Hash belongs to the bucket to be joined with the moved bucket. */
+		if (move_src_idx != old_idx) {
+			/* Bucket join not yet completed. */
+			if (N_INVALID != get_mark(*pold_head)) {
+				size_t split_hash = calc_split_hash(old_idx, b->order);
+				join_buckets(h, pold_head, pnew_head, split_hash);
+			}
+			
+			/* 
+			 * The resizer sets pold_head to &sentinel when all cpus are
+			 * guaranteed to see the bucket join.
+			 */
+			*join_finishing = (&sentinel != get_next(*pold_head));
+		}
+		
+		/* move_head() or join_buckets() makes it so or makes the mark visible.*/
+		ASSERT(N_INVALID == get_mark(*pold_head));
+		/* move_head() makes it visible. No JOIN_FOLLOWS used when shrinking. */
+		ASSERT(N_NORMAL == get_mark(*pnew_head));
+
+		*walk_mode = WM_LEAVE_JOIN;
+	} else {
+		/* 
+		 * Final stage of resize. The resizer is waiting for all 
+		 * readers to notice that the old table had been replaced.
+		 */
+		ASSERT(b == h->new_b);
+		*walk_mode = WM_NORMAL;
+	}
+}
+
+
+/*
+ * Reference implementation of a complete head move, performed in one go by
+ * a single thread. Compiled out: the code below uses start_head_move() +
+ * help_head_move()/complete_head_move() instead so that updaters can help
+ * the resizer finish moves. NOTE(review): presumably kept for documentation
+ * only - confirm before deleting.
+ */
+#if 0
+static void move_head(marked_ptr_t *psrc_head, marked_ptr_t *pdest_head)
+{
+	start_head_move(psrc_head);
+	cas_order_barrier();
+	complete_head_move(psrc_head, pdest_head);
+}
+#endif
+
+/** Moves an immutable head \a psrc_head of cht_t.b to \a pdest_head of cht_t.new_b. 
+ * 
+ * The function guarantees the move will be visible on this cpu once
+ * it completes. In particular, *pdest_head will not be N_INVALID.
+ * 
+ * Unlike complete_head_move(), help_head_move() checks if the head had already
+ * been moved and tries to avoid moving the bucket heads if possible.
+ */
+static inline void help_head_move(marked_ptr_t *psrc_head, 
+	marked_ptr_t *pdest_head)
+{
+	/* Head move has to be in progress already when calling this func. */
+	ASSERT(N_CONST & get_mark(*psrc_head));
+	
+	/* Head already moved. */
+	if (N_INVALID == get_mark(*psrc_head)) {
+		/* Effects of the head move have not yet propagated to this cpu. */
+		if (N_INVALID == get_mark(*pdest_head)) {
+			/* Make the move visible on this cpu. */
+			read_barrier();
+		}
+	} else {
+		complete_head_move(psrc_head, pdest_head);
+	}
+	
+	ASSERT(!(N_CONST & get_mark(*pdest_head)));
+}
+
+/** Initiates the move of the old head \a psrc_head.
+ * 
+ * Only marks the head immutable (N_CONST); the actual relinking is done
+ * later. The move may be completed with help_head_move(). 
+ */
+static void start_head_move(marked_ptr_t *psrc_head)
+{
+	/* Mark src head immutable. */
+	mark_const(psrc_head);
+}
+
+/** Marks the head immutable. */
+static void mark_const(marked_ptr_t *psrc_head)
+{
+	marked_ptr_t ret, src_link;
+	
+	/* 
+	 * Retry until our CAS succeeds or some other cpu has already marked 
+	 * the head const/immutable (in which case our work is done).
+	 */
+	do {
+		cht_link_t *next = get_next(*psrc_head);
+		src_link = make_link(next, N_NORMAL);
+		
+		/* Mark the normal/clean src link immutable/const. */
+		ret = cas_link(psrc_head, next, N_NORMAL, next, N_CONST);
+	} while(ret != src_link && !(N_CONST & get_mark(ret)));
+}
+
+/** Completes moving head psrc_head to pdest_head (started by start_head_move()).*/
+static void complete_head_move(marked_ptr_t *psrc_head, marked_ptr_t *pdest_head)
+{
+	ASSERT(N_JOIN_FOLLOWS != get_mark(*psrc_head));
+	ASSERT(N_CONST & get_mark(*psrc_head));
+	
+	cht_link_t *next = get_next(*psrc_head);
+	marked_ptr_t ret;
+	
+	/* Publish the dest head; benign no-op if another cpu already did so. */
+	ret = cas_link(pdest_head, &sentinel, N_INVALID, next, N_NORMAL);
+	ASSERT(ret == make_link(&sentinel, N_INVALID) || (N_NORMAL == get_mark(ret)));
+	cas_order_barrier();
+	
+	/* Invalidate the src head to signal the move has completed. */
+	ret = cas_link(psrc_head, next, N_CONST, next, N_INVALID);	
+	ASSERT(ret == make_link(next, N_CONST) || (N_INVALID == get_mark(ret)));
+	cas_order_barrier();
+}
+
+/** Splits the bucket at psrc_head and links to the remainder from pdest_head.
+ * 
+ * Items with hashes greater or equal to \a split_hash are moved to bucket
+ * with head at \a pdest_head. 
+ * 
+ * @param h           CHT to operate on.
+ * @param psrc_head   Head of the bucket to split (in cht_t.new_b).
+ * @param pdest_head  Head of the bucket that points to the second part
+ *                    of the split bucket in psrc_head. (in cht_t.new_b)
+ * @param split_hash  Hash of the first possible item in the remainder of 
+ *                    psrc_head, ie the smallest hash pdest_head is allowed
+ *                    to point to.
+ */
+static void split_bucket(cht_t *h, marked_ptr_t *psrc_head, 
+	marked_ptr_t *pdest_head, size_t split_hash)
+{
+	/* Already split. */
+	if (N_NORMAL == get_mark(*pdest_head))
+		return;
+	
+	/*
+	 * L == Last node of the first part of the split bucket. That part
+	 *      remains in the original/src bucket. 
+	 * F == First node of the second part of the split bucket. That part
+	 *      will be referenced from the dest bucket head.
+	 *
+	 * We want to first mark a clean L as JF so that updaters unaware of 
+	 * the split (or table resize):
+	 * - do not insert a new node between L and F
+	 * - do not unlink L (that is why it has to be clean/normal)
+	 * - do not unlink F
+	 *
+	 * Then we can safely mark F as JN even if it has been marked deleted. 
+	 * Once F is marked as JN updaters aware of table resize will not 
+	 * attempt to unlink it (JN will have two predecessors - we cannot
+	 * safely unlink from both at the same time). Updaters unaware of 
+	 * ongoing resize can reach F only via L and that node is already 
+	 * marked JF, so they won't unlink F.
+	 * 
+	 * Last, link the new/dest head to F.
+	 * 
+	 * 
+	 * 0)                           ,-- split_hash, first hash of the dest bucket 
+	 *                              v  
+	 *  [src_head | N] -> .. -> [L] -> [F]
+	 *  [dest_head | Inv]
+	 * 
+	 * 1)                             ,-- split_hash
+	 *                                v  
+	 *  [src_head | N] -> .. -> [JF] -> [F]
+	 *  [dest_head | Inv]
+	 * 
+	 * 2)                             ,-- split_hash
+	 *                                v  
+	 *  [src_head | N] -> .. -> [JF] -> [JN]
+	 *  [dest_head | Inv]
+	 * 
+	 * 3)                             ,-- split_hash
+	 *                                v  
+	 *  [src_head | N] -> .. -> [JF] -> [JN]
+	 *                                   ^
+	 *  [dest_head | N] -----------------'
+	 */
+	wnd_t wnd;
+	
+	rcu_read_lock();
+	
+	/* Mark the last node of the first part of the split bucket as JF. */
+	mark_join_follows(h, psrc_head, split_hash, &wnd);
+	cas_order_barrier();
+	
+	/* There are nodes in the dest bucket, ie the second part of the split. */
+	if (wnd.cur != &sentinel) {
+		/* 
+		 * Mark the first node of the dest bucket as a join node so 
+		 * updaters do not attempt to unlink it if it is deleted. 
+		 */
+		mark_join_node(wnd.cur);
+		cas_order_barrier();
+	} else {
+		/* 
+		 * Second part of the split bucket is empty. There are no nodes
+		 * to mark as JOIN nodes and there never will be.
+		 */
+	}
+	
+	/* Link the dest head to the second part of the split. */
+	marked_ptr_t ret = 
+		cas_link(pdest_head, &sentinel, N_INVALID, wnd.cur, N_NORMAL);
+	ASSERT(ret == make_link(&sentinel, N_INVALID) || (N_NORMAL == get_mark(ret)));
+	cas_order_barrier();
+	
+	rcu_read_unlock();
+}
+
+/** Finds and marks the last node of psrc_head w/ hash less than split_hash.
+ * 
+ * Finds a node in psrc_head with the greatest hash that is strictly less 
+ * than split_hash and marks it with N_JOIN_FOLLOWS. 
+ * 
+ * Returns a window pointing to that node. 
+ * 
+ * Any logically deleted nodes along the way are 
+ * garbage collected; therefore, the predecessor node (if any) will most 
+ * likely not be marked N_DELETED.
+ * 
+ * @param h          CHT to operate on.
+ * @param psrc_head  Bucket head.
+ * @param split_hash The smallest hash a join node (ie the node following
+ *                   the desired join-follows node) may have.
+ * @param[out] wnd   Points to the node marked with N_JOIN_FOLLOWS.
+ */
+static void mark_join_follows(cht_t *h, marked_ptr_t *psrc_head, 
+	size_t split_hash, wnd_t *wnd)
+{
+	/* See comment in split_bucket(). */
+	
+	bool done;
+	do {
+		bool resizing = false;
+		wnd->ppred = psrc_head;
+		wnd->cur = get_next(*psrc_head);
+		
+		/* 
+		 * Find the split window, ie the last node of the first part of
+		 * the split bucket and its successor - the first node of
+		 * the second part of the split bucket. Retry if GC failed. 
+		 */
+		if (!find_wnd_and_gc(h, split_hash, WM_MOVE_JOIN_FOLLOWS, wnd, &resizing))
+			continue;
+		
+		/* Must not report that the table is resizing if WM_MOVE_JOIN_FOLLOWS.*/
+		ASSERT(!resizing);
+		/* 
+		 * Mark the last node of the first half of the split bucket 
+		 * that a join node follows. It must be clean/normal.
+		 */
+		marked_ptr_t ret
+			= cas_link(wnd->ppred, wnd->cur, N_NORMAL, wnd->cur, N_JOIN_FOLLOWS);
+
+		/* 
+		 * Successfully marked as a JF node or already marked that way (even 
+		 * if also marked deleted - unlinking the node will move the JF mark). 
+		 */
+		done = (ret == make_link(wnd->cur, N_NORMAL))
+			|| (N_JOIN_FOLLOWS & get_mark(ret));
+	} while (!done);
+}
+
+/** Marks join_node with N_JOIN, preserving any N_DELETED mark it carries. */
+static void mark_join_node(cht_link_t *join_node)
+{
+	/* See comment in split_bucket(). */
+	
+	bool done;
+	do {
+		cht_link_t *next = get_next(join_node->link);
+		mark_t mark = get_mark(join_node->link);
+		
+		/* 
+		 * May already be marked as deleted, but it won't be unlinked 
+		 * because its predecessor is marked with JOIN_FOLLOWS or CONST.
+		 */
+		marked_ptr_t ret 
+			= cas_link(&join_node->link, next, mark, next, mark | N_JOIN);
+		
+		/* Successfully marked or already marked as a join node. */
+		done = (ret == make_link(next, mark))
+			|| (N_JOIN & get_mark(ret));
+	} while(!done);
+}
+
+/** Appends the bucket at psrc_head to the bucket at pdest_head.
+ * 
+ * @param h          CHT to operate on.
+ * @param psrc_head  Bucket to merge with pdest_head.
+ * @param pdest_head Bucket to be joined by psrc_head.
+ * @param split_hash The smallest hash psrc_head may contain.
+ */
+static void join_buckets(cht_t *h, marked_ptr_t *psrc_head, 
+	marked_ptr_t *pdest_head, size_t split_hash)
+{
+	/* Buckets already joined. */
+	if (N_INVALID == get_mark(*psrc_head))
+		return;
+	/*
+	 * F == First node of psrc_head, ie the bucket we want to append 
+	 *      to (ie join with) the bucket starting at pdest_head.
+	 * L == Last node of pdest_head, ie the bucket that psrc_head will
+	 *      be appended to. 
+	 *
+	 * (1) We first mark psrc_head immutable to signal that a join is 
+	 * in progress and so that updaters unaware of the join (or table 
+	 * resize):
+	 * - do not insert new nodes between the head psrc_head and F
+	 * - do not unlink F (it may already be marked deleted)
+	 * 
+	 * (2) Next, F is marked as a join node. Updaters aware of table resize
+	 * will not attempt to unlink it. We cannot safely/atomically unlink 
+	 * the join node because it will be pointed to from two different 
+	 * buckets. Updaters unaware of resize will fail to unlink the join
+	 * node due to the head being marked immutable.
+	 *
+	 * (3) Then the tail of the bucket at pdest_head is linked to the join
+	 * node. From now on, nodes in both buckets can be found via pdest_head.
+	 * 
+	 * (4) Last, mark immutable psrc_head as invalid. It signals updaters
+	 * that the join is complete and they can insert new nodes (originally
+	 * destined for psrc_head) into pdest_head. 
+	 * 
+	 * Note that pdest_head keeps pointing at the join node. This allows
+	 * lookups and updaters to determine if they should see a link between
+	 * the tail L and F when searching for nodes originally in psrc_head
+	 * via pdest_head. If they reach the tail of pdest_head without 
+	 * encountering any nodes of psrc_head, either there were no nodes
+	 * in psrc_head to begin with or the link between L and F did not
+	 * yet propagate to their cpus. If psrc_head was empty, it keeps
+	 * pointing to the sentinel. Otherwise psrc_head points to a join node
+	 * (it will not be unlinked until table resize completes) and
+	 * updaters/lookups should issue a read_barrier() to make the link
+	 * [L]->[JN] visible.
+	 * 
+	 * 0)                           ,-- split_hash, first hash of the src bucket 
+	 *                              v  
+	 *  [dest_head | N]-> .. -> [L]
+	 *  [src_head | N]--> [F] -> .. 
+	 *  ^
+	 *  ` split_hash, first hash of the src bucket
+	 * 
+	 * 1)                            ,-- split_hash
+	 *                               v  
+	 *  [dest_head | N]-> .. -> [L]
+	 *  [src_head | C]--> [F] -> .. 
+	 * 
+	 * 2)                            ,-- split_hash
+	 *                               v  
+	 *  [dest_head | N]-> .. -> [L]
+	 *  [src_head | C]--> [JN] -> .. 
+	 * 
+	 * 3)                            ,-- split_hash
+	 *                               v  
+	 *  [dest_head | N]-> .. -> [L] --+
+	 *                                v
+	 *  [src_head | C]-------------> [JN] -> .. 
+	 * 
+	 * 4)                            ,-- split_hash
+	 *                               v  
+	 *  [dest_head | N]-> .. -> [L] --+
+	 *                                v
+	 *  [src_head | Inv]-----------> [JN] -> .. 
+	 */
+	
+	rcu_read_lock();
+	
+	/* Mark src_head immutable - signals updaters that bucket join started. */
+	mark_const(psrc_head);
+	cas_order_barrier();
+	
+	cht_link_t *join_node = get_next(*psrc_head);
+
+	if (join_node != &sentinel) {
+		mark_join_node(join_node);
+		cas_order_barrier();
+		
+		link_to_join_node(h, pdest_head, join_node, split_hash);
+		cas_order_barrier();
+	} 
+	
+	marked_ptr_t ret = 
+		cas_link(psrc_head, join_node, N_CONST, join_node, N_INVALID);
+	ASSERT(ret == make_link(join_node, N_CONST) || (N_INVALID == get_mark(ret)));
+	cas_order_barrier();
+	
+	rcu_read_unlock();
+}
+
+/** Links the tail of pdest_head to join_node.
+ * 
+ * @param h          CHT to operate on.
+ * @param pdest_head Head of the bucket whose tail is to be linked to join_node.
+ * @param join_node  A node marked N_JOIN with a hash greater or equal to
+ *                   split_hash.
+ * @param split_hash The least hash that is greater than the hash of any items
+ *                   (originally) in pdest_head.
+ */
+static void link_to_join_node(cht_t *h, marked_ptr_t *pdest_head, 
+	cht_link_t *join_node, size_t split_hash)
+{
+	bool done;
+	do {
+		wnd_t wnd = {
+			.ppred = pdest_head,
+			.cur = get_next(*pdest_head)
+		};
+		
+		bool resizing = false;
+		
+		if (!find_wnd_and_gc(h, split_hash, WM_LEAVE_JOIN, &wnd, &resizing))
+			continue;
+
+		ASSERT(!resizing);
+		
+		if (wnd.cur != &sentinel) {
+			/* Must be from the new appended bucket. */
+			ASSERT(split_hash <= node_hash(h, wnd.cur) 
+				|| h->invalid_hash == node_hash(h, wnd.cur));
+			return;
+		}
+		
+		/* Reached the tail of pdest_head - link it to the join node. */
+		marked_ptr_t ret = 
+			cas_link(wnd.ppred, &sentinel, N_NORMAL, join_node, N_NORMAL);
+		
+		/* Retry if the CAS failed, ie the tail link changed in the meantime. */
+		done = (ret == make_link(&sentinel, N_NORMAL));
+	} while (!done);
+}
+
+/** Instructs RCU to free the item once all preexisting references are dropped. 
+ * 
+ * The item is freed via op->remove_callback(). Also decrements the item
+ * count and may schedule a background shrink (see item_removed()).
+ */
+static void free_later(cht_t *h, cht_link_t *item)
+{
+	ASSERT(item != &sentinel);
+	
+	/* 
+	 * remove_callback only works as rcu_func_t because rcu_link is the first
+	 * field in cht_link_t.
+	 */
+	rcu_call(&item->rcu_link, (rcu_func_t)h->op->remove_callback);
+	
+	item_removed(h);
+}
+
+/** Notes that an item had been unlinked from the table and shrinks it if needed.
+ * 
+ * If the number of items in the table drops below 1/4 of the maximum 
+ * allowed load the table is shrunk in the background.
+ */
+static inline void item_removed(cht_t *h)
+{
+	size_t items = (size_t) atomic_predec(&h->item_cnt);
+	/* NOTE(review): int shift - assumes order < bit width of int; confirm. */
+	size_t bucket_cnt = (1 << h->b->order);
+	
+	/* missed_shrink catches the case where the need_shrink threshold raced by.*/
+	bool need_shrink = (items == h->max_load * bucket_cnt / 4);
+	bool missed_shrink = (items == h->max_load * bucket_cnt / 8);
+	
+	if ((need_shrink || missed_shrink) && h->b->order > h->min_order) {
+		atomic_count_t resize_reqs = atomic_preinc(&h->resize_reqs);
+		/* The first resize request. Start the resizer. */
+		if (1 == resize_reqs) {
+			workq_global_enqueue_noblock(&h->resize_work, resize_table);
+		}
+	}
+}
+
+/** Notes an item had been inserted and grows the table if needed. 
+ * 
+ * The table is resized in the background.
+ */
+static inline void item_inserted(cht_t *h)
+{
+	size_t items = (size_t) atomic_preinc(&h->item_cnt);
+	/* NOTE(review): int shift - assumes order < bit width of int; confirm. */
+	size_t bucket_cnt = (1 << h->b->order);
+	
+	/* missed_grow catches the case where the need_grow threshold raced by. */
+	bool need_grow = (items == h->max_load * bucket_cnt);
+	bool missed_grow = (items == 2 * h->max_load * bucket_cnt);
+	
+	if ((need_grow || missed_grow) && h->b->order < CHT_MAX_ORDER) {
+		atomic_count_t resize_reqs = atomic_preinc(&h->resize_reqs);
+		/* The first resize request. Start the resizer. */
+		if (1 == resize_reqs) {
+			workq_global_enqueue_noblock(&h->resize_work, resize_table);
+		}
+	}
+}
+
+/** Resize request handler. Invoked on the system work queue. 
+ * 
+ * Repeatedly grows or shrinks the table until the item count matches the
+ * table size and the last outstanding resize request has been consumed.
+ */
+static void resize_table(work_t *arg)
+{
+	cht_t *h = member_to_inst(arg, cht_t, resize_work);
+	
+#ifdef CONFIG_DEBUG
+	ASSERT(h->b);
+	/* Make resize_reqs visible. */
+	read_barrier();
+	ASSERT(0 < atomic_get(&h->resize_reqs));
+#endif
+
+	/* 
+	 * Must start false: the grow/shrink branches below leave done unset, 
+	 * so without the initializer the loop condition would read an 
+	 * indeterminate value (undefined behavior) after a resize iteration.
+	 */
+	bool done = false;
+	
+	do {
+		/* Load the most recent h->item_cnt. */
+		read_barrier();
+		size_t cur_items = (size_t) atomic_get(&h->item_cnt);
+		size_t bucket_cnt = (1 << h->b->order);
+		size_t max_items = h->max_load * bucket_cnt;
+
+		if (cur_items >= max_items && h->b->order < CHT_MAX_ORDER) {
+			grow_table(h);
+		} else if (cur_items <= max_items / 4 && h->b->order > h->min_order) {
+			shrink_table(h);
+		} else {
+			/* Table is just the right size. */
+			atomic_count_t reqs = atomic_predec(&h->resize_reqs);
+			done = (reqs == 0);
+		}
+	} while (!done);
+}
+
+/** Increases the number of buckets two-fold. Blocks until done. 
+ * 
+ * Called from resize_table(); interleaves head moves, bucket splits and
+ * rcu_synchronize() waits so that concurrent updaters always see a 
+ * consistent view of the table.
+ */
+static void grow_table(cht_t *h)
+{
+	if (h->b->order >= CHT_MAX_ORDER)
+		return;
+	
+	h->new_b = alloc_buckets(h->b->order + 1, true, false);
+
+	/* Failed to alloc a new table - try next time the resizer is run. */
+	if (!h->new_b) 
+		return;
+
+	/* Wait for all readers and updaters to see the initialized new table. */
+	rcu_synchronize();
+	size_t old_bucket_cnt = (1 << h->b->order);
+	
+	/* 
+	 * Give updaters a chance to help out with the resize. Do the minimum 
+	 * work needed to announce a resize is in progress, ie start moving heads.
+	 */
+	for (size_t idx = 0; idx < old_bucket_cnt; ++idx) {
+		start_head_move(&h->b->head[idx]);
+	}
+	
+	/* Order start_head_move() wrt complete_head_move(). */
+	cas_order_barrier();
+	
+	/* Complete moving heads and split any buckets not yet split by updaters. */
+	for (size_t old_idx = 0; old_idx < old_bucket_cnt; ++old_idx) {
+		marked_ptr_t *move_dest_head = &h->new_b->head[grow_idx(old_idx)];
+		marked_ptr_t *move_src_head = &h->b->head[old_idx];
+
+		/* Head move not yet completed. */
+		if (N_INVALID != get_mark(*move_src_head)) {
+			complete_head_move(move_src_head, move_dest_head);
+		}
+
+		size_t split_idx = grow_to_split_idx(old_idx);
+		size_t split_hash = calc_split_hash(split_idx, h->new_b->order);
+		marked_ptr_t *split_dest_head = &h->new_b->head[split_idx];
+
+		split_bucket(h, move_dest_head, split_dest_head, split_hash);
+	}
+	
+	/* 
+	 * Wait for all updaters to notice the new heads. Once everyone sees
+	 * the invalid old bucket heads they will know a resize is in progress
+	 * and updaters will modify the correct new buckets. 
+	 */
+	rcu_synchronize();
+	
+	/* Clear the JOIN_FOLLOWS mark and remove the link between the split buckets.*/
+	for (size_t old_idx = 0; old_idx < old_bucket_cnt; ++old_idx) {
+		size_t new_idx = grow_idx(old_idx);
+		
+		cleanup_join_follows(h, &h->new_b->head[new_idx]);
+	}
+
+	/* 
+	 * Wait for everyone to notice that buckets were split, ie link connecting
+	 * the join follows and join node has been cut. 
+	 */
+	rcu_synchronize();
+	
+	/* Clear the JOIN mark and GC any deleted join nodes. */
+	for (size_t old_idx = 0; old_idx < old_bucket_cnt; ++old_idx) {
+		size_t new_idx = grow_to_split_idx(old_idx);
+		
+		cleanup_join_node(h, &h->new_b->head[new_idx]);
+	}
+
+	/* Wait for everyone to see that the table is clear of any resize marks. */
+	rcu_synchronize();
+	
+	cht_buckets_t *old_b = h->b;
+	rcu_assign(h->b, h->new_b);
+
+	/* Wait for everyone to start using the new table. */
+	rcu_synchronize();
+	
+	free(old_b);
+	
+	/* Not needed; just for increased readability. */
+	h->new_b = NULL;
+}
+
+/** Halves the number of buckets. Blocks until done. 
+ * 
+ * Called from resize_table(); interleaves head moves, bucket joins and
+ * rcu_synchronize() waits so that concurrent updaters always see a 
+ * consistent view of the table.
+ */
+static void shrink_table(cht_t *h)
+{
+	if (h->b->order <= h->min_order)
+		return;
+	
+	h->new_b = alloc_buckets(h->b->order - 1, true, false);
+
+	/* Failed to alloc a new table - try next time the resizer is run. */
+	if (!h->new_b) 
+		return;
+
+	/* Wait for all readers and updaters to see the initialized new table. */
+	rcu_synchronize();
+	
+	size_t old_bucket_cnt = (1 << h->b->order);
+	
+	/* 
+	 * Give updaters a chance to help out with the resize. Do the minimum 
+	 * work needed to announce a resize is in progress, ie start moving heads.
+	 */
+	for (size_t old_idx = 0; old_idx < old_bucket_cnt; ++old_idx) {
+		size_t new_idx = shrink_idx(old_idx);
+		
+		/* This bucket should be moved. */
+		if (grow_idx(new_idx) == old_idx) {
+			start_head_move(&h->b->head[old_idx]);
+		} else {
+			/* This bucket should join the moved bucket once the move is done.*/
+		}
+	}
+	
+	/* Order start_head_move() wrt to complete_head_move(). */
+	cas_order_barrier();
+	
+	/* Complete moving heads and join buckets with the moved buckets. */
+	for (size_t old_idx = 0; old_idx < old_bucket_cnt; ++old_idx) {
+		size_t new_idx = shrink_idx(old_idx);
+		size_t move_src_idx = grow_idx(new_idx);
+		
+		/* This bucket should be moved. */
+		if (move_src_idx == old_idx) {
+			/* Head move not yet completed. */
+			if (N_INVALID != get_mark(h->b->head[old_idx])) {
+				complete_head_move(&h->b->head[old_idx], &h->new_b->head[new_idx]);
+			}
+		} else {
+			/* This bucket should join the moved bucket. */
+			size_t split_hash = calc_split_hash(old_idx, h->b->order);
+			join_buckets(h, &h->b->head[old_idx], &h->new_b->head[new_idx], 
+				split_hash);
+		}
+	}
+	
+	/* 
+	 * Wait for all updaters to notice the new heads. Once everyone sees
+	 * the invalid old bucket heads they will know a resize is in progress
+	 * and updaters will modify the correct new buckets. 
+	 */
+	rcu_synchronize();
+	
+	/* Let everyone know joins are complete and fully visible. */
+	for (size_t old_idx = 0; old_idx < old_bucket_cnt; ++old_idx) {
+		size_t move_src_idx = grow_idx(shrink_idx(old_idx));
+	
+		/* Set the invalid joinee head to NULL. */
+		if (old_idx != move_src_idx) {
+			ASSERT(N_INVALID == get_mark(h->b->head[old_idx]));
+			
+			if (&sentinel != get_next(h->b->head[old_idx]))
+				h->b->head[old_idx] = make_link(&sentinel, N_INVALID);
+		}
+	}
+	
+	/* 
+	 * Wait for everyone to see that the joinee heads were reset to the 
+	 * sentinel. Only then is it safe to clear the JOIN marks of the join 
+	 * nodes below - cf. the join_finishing logic in upd_resizing_head(), 
+	 * which keys off the joinee head pointing past the sentinel.
+	 */
+	rcu_synchronize();
+
+	size_t new_bucket_cnt = (1 << h->new_b->order);
+		
+	/* Clear the JOIN mark and GC any deleted join nodes. */
+	for (size_t new_idx = 0; new_idx < new_bucket_cnt; ++new_idx) {
+		cleanup_join_node(h, &h->new_b->head[new_idx]);
+	}
+
+	/* Wait for everyone to see that the table is clear of any resize marks. */
+	rcu_synchronize();
+	
+	cht_buckets_t *old_b = h->b;
+	rcu_assign(h->b, h->new_b);
+	
+	/* Wait for everyone to start using the new table. */
+	rcu_synchronize();
+	
+	free(old_b);
+	
+	/* Not needed; just for increased readability. */
+	h->new_b = NULL;
+}
+
+/** Finds and clears the N_JOIN mark from a node in new_head (if present). */
+static void cleanup_join_node(cht_t *h, marked_ptr_t *new_head)
+{
+	rcu_read_lock();
+
+	cht_link_t *cur = get_next(*new_head);
+		
+	while (cur != &sentinel) {
+		/* Clear the join node's JN mark - even if it is marked as deleted. */
+		if (N_JOIN & get_mark(cur->link)) {
+			clear_join_and_gc(h, cur, new_head);
+			/* At most one join node is expected per bucket, so stop here. */
+			break;
+		}
+		
+		cur = get_next(cur->link);
+	}
+	
+	rcu_read_unlock();
+}
+
+/** Clears the join_node's N_JOIN mark and frees it if marked N_DELETED as well.*/
+static void clear_join_and_gc(cht_t *h, cht_link_t *join_node, 
+	marked_ptr_t *new_head)
+{
+	ASSERT(join_node != &sentinel);
+	ASSERT(join_node && (N_JOIN & get_mark(join_node->link)));
+	
+	bool done;
+	
+	/* Clear the JN mark. */
+	do {
+		marked_ptr_t jn_link = join_node->link;
+		cht_link_t *next = get_next(jn_link);
+		/* Clear the JOIN mark but keep the DEL mark if present. */
+		mark_t cleared_mark = get_mark(jn_link) & N_DELETED;
+
+		marked_ptr_t ret = 
+			_cas_link(&join_node->link, jn_link, make_link(next, cleared_mark));
+
+		/* Done if the mark was cleared. Retry if a new node was inserted. */
+		done = (ret == jn_link);
+		ASSERT(ret == jn_link || (get_mark(ret) & N_JOIN));
+	} while (!done);
+	
+	if (!(N_DELETED & get_mark(join_node->link)))
+		return;
+
+	/* The join node had been marked as deleted - GC it. */
+
+	/* Clear the JOIN mark before trying to unlink the deleted join node.*/
+	cas_order_barrier();
+	
+	size_t jn_hash = node_hash(h, join_node);
+	do {
+		bool resizing = false;
+		
+		wnd_t wnd = {
+			.ppred = new_head,
+			.cur = get_next(*new_head)
+		};
+		
+		/* Search for exactly this node so it can be unlinked and freed. */
+		done = find_wnd_and_gc_pred(h, jn_hash, WM_NORMAL, same_node_pred, 
+			join_node, &wnd, &resizing);
+		
+		ASSERT(!resizing);
+	} while (!done);
+}
+
+/** Finds a non-deleted node with N_JOIN_FOLLOWS and clears the mark.
+ * 
+ * Any deleted nodes encountered on the way are garbage collected.
+ */
+static void cleanup_join_follows(cht_t *h, marked_ptr_t *new_head)
+{
+	ASSERT(new_head);
+	
+	rcu_read_lock();
+
+	wnd_t wnd = {
+		.ppred = NULL,
+		.cur = NULL
+	};
+	marked_ptr_t *cur_link = new_head;
+		
+	/*
+	 * Find the non-deleted node with a JF mark and clear the JF mark.
+	 * The JF node may be deleted and/or the mark moved to its neighbors
+	 * at any time. Therefore, we GC deleted nodes until we find the JF 
+	 * node in order to remove stale/deleted JF nodes left behind eg by 
+	 * delayed threads that did not yet get a chance to unlink the deleted 
+	 * JF node and move its mark. 
+	 * 
+	 * Note that the head may be marked JF (but never DELETED).
+	 */
+	while (true) {
+		bool is_jf_node = N_JOIN_FOLLOWS & get_mark(*cur_link);
+		
+		/* GC any deleted nodes on the way - even deleted JOIN_FOLLOWS. */
+		if (N_DELETED & get_mark(*cur_link)) {
+			ASSERT(cur_link != new_head);
+			ASSERT(wnd.ppred && wnd.cur && wnd.cur != &sentinel);
+			ASSERT(cur_link == &wnd.cur->link);
+
+			bool dummy;
+			bool deleted = gc_deleted_node(h, WM_MOVE_JOIN_FOLLOWS, &wnd, &dummy);
+
+			/* Failed to GC or collected a deleted JOIN_FOLLOWS. */
+			if (!deleted || is_jf_node) {
+				/* Retry from the head of the bucket. */
+				cur_link = new_head;
+				continue;
+			}
+		} else {
+			/* Found a non-deleted JF. Clear its JF mark. */
+			if (is_jf_node) {
+				cht_link_t *next = get_next(*cur_link);
+				marked_ptr_t ret = 
+					cas_link(cur_link, next, N_JOIN_FOLLOWS, &sentinel, N_NORMAL);
+				
+				ASSERT(next == &sentinel 
+					|| ((N_JOIN | N_JOIN_FOLLOWS) & get_mark(ret)));
+
+				/* Successfully cleared the JF mark of a non-deleted node. */
+				if (ret == make_link(next, N_JOIN_FOLLOWS)) {
+					break;
+				} else {
+					/* 
+					 * The JF node had been deleted or a new node inserted 
+					 * right after it. Retry from the head.
+					 */
+					cur_link = new_head;
+					continue;
+				}
+			} else {
+				wnd.ppred = cur_link;
+				wnd.cur = get_next(*cur_link);				
+			}
+		}
+
+		/* We must encounter a JF node before we reach the end of the bucket. */
+		ASSERT(wnd.cur && wnd.cur != &sentinel);
+		cur_link = &wnd.cur->link;
+	}
+	
+	rcu_read_unlock();
+}
+
+/** Returns the first possible hash following a bucket split point. 
+ * 
+ * In other words the returned hash is the smallest possible hash
+ * the remainder of the split bucket may contain. Buckets are indexed
+ * by the top \a order bits of the hash (see calc_bucket_idx()).
+ */
+static inline size_t calc_split_hash(size_t split_idx, size_t order)
+{
+	ASSERT(1 <= order && order <= 8 * sizeof(size_t));
+	return split_idx << (8 * sizeof(size_t) - order);
+}
+
+/** Returns the bucket head index given the table size order and item hash. 
+ * 
+ * Uses the top \a order bits of the hash as the index.
+ */
+static inline size_t calc_bucket_idx(size_t hash, size_t order)
+{
+	ASSERT(1 <= order && order <= 8 * sizeof(size_t));
+	return hash >> (8 * sizeof(size_t) - order);
+}
+
+/** Returns the destination index of the second half of a bucket being split.
+ * 
+ * When growing, the bucket at old_idx moves to grow_idx(old_idx) and its 
+ * upper half is split off into the odd sibling returned here.
+ */
+static inline size_t grow_to_split_idx(size_t old_idx)
+{
+	return grow_idx(old_idx) | 1;
+}
+
+/** Returns the destination index of a bucket head when the table is growing. */
+static inline size_t grow_idx(size_t idx)
+{
+	return idx << 1;
+}
+
+/** Returns the destination index of a bucket head when the table is shrinking.*/
+static inline size_t shrink_idx(size_t idx)
+{
+	return idx >> 1;
+}
+
+/** Returns a mixed hash of the search key.
+ * 
+ * Clears the lowest bit to mimic calc_node_hash(), so key hashes compare
+ * consistently with memoized node hashes.
+ */
+static inline size_t calc_key_hash(cht_t *h, void *key)
+{
+	/* Mimic calc_node_hash. */
+	return hash_mix(h->op->key_hash(key)) & ~(size_t)1;
+}
+
+/** Returns a memoized mixed hash of the item. */
+static inline size_t node_hash(cht_t *h, const cht_link_t *item)
+{
+	/* The memoized hash must be either special or match a fresh computation.*/
+	ASSERT(item->hash == h->invalid_hash 
+		|| item->hash == sentinel.hash
+		|| item->hash == calc_node_hash(h, item));
+	
+	return item->hash;
+}
+
+/** Calculates and mixes the hash of the item. */
+static inline size_t calc_node_hash(cht_t *h, const cht_link_t *item)
+{
+	ASSERT(item != &sentinel);
+	/* 
+	 * Clear the lowest order bit in order for sentinel's node hash
+	 * to be the greatest possible.
+	 */
+	return hash_mix(h->op->hash(item)) & ~(size_t)1;
+}
+
+/** Computes and memoizes the hash of the item (stored in item->hash). */
+static inline void memoize_node_hash(cht_t *h, cht_link_t *item)
+{
+	item->hash = calc_node_hash(h, item);
+}
+
+/** Packs the next pointer address and the mark into a single pointer. 
+ * 
+ * Relies on cht_link_t alignment leaving the low N_MARK_MASK bits of the 
+ * address free for the mark (both asserted below).
+ */
+static inline marked_ptr_t make_link(const cht_link_t *next, mark_t mark)
+{
+	marked_ptr_t ptr = (marked_ptr_t) next;
+	
+	ASSERT(!(ptr & N_MARK_MASK));
+	ASSERT(!((unsigned)mark & ~N_MARK_MASK));
+	
+	return ptr | mark;
+}
+
+/** Strips any marks from the next item link and returns the next item's address.*/
+static inline cht_link_t * get_next(marked_ptr_t link)
+{
+	return (cht_link_t*)(link & ~N_MARK_MASK);
+}
+
+/** Returns the current node's mark stored in the next item link. */
+static inline mark_t get_mark(marked_ptr_t link)
+{
+	return (mark_t)(link & N_MARK_MASK);
+}
+
+/** Moves the window by one item so that it points to the next item. */
+static inline void next_wnd(wnd_t *wnd)
+{
+	ASSERT(wnd);
+	ASSERT(wnd->cur);
+
+	wnd->last = wnd->cur;
+	wnd->ppred = &wnd->cur->link;
+	wnd->cur = get_next(wnd->cur->link);
+}
+
+/** Predicate that matches only exactly the same node. */
+static bool same_node_pred(void *node, const cht_link_t *item2)
+{
+	const cht_link_t *item1 = (const cht_link_t*) node;
+	return item1 == item2;
+}
+
+/** Compare-and-swaps a next item link. */
+static inline marked_ptr_t cas_link(marked_ptr_t *link, const cht_link_t *cur_next, 
+	mark_t cur_mark, const cht_link_t *new_next, mark_t new_mark)
+{
+	return _cas_link(link, make_link(cur_next, cur_mark), 
+		make_link(new_next, new_mark));
+}
+
+/** Compare-and-swaps a next item link. */
+static inline marked_ptr_t _cas_link(marked_ptr_t *link, marked_ptr_t cur, 
+	marked_ptr_t new)
+{
+	ASSERT(link != &sentinel.link);
+	/*
+	 * cas(x) on the same location x on one cpu must be ordered, but do not
+	 * have to be ordered wrt to other cas(y) to a different location y
+	 * on the same cpu.
+	 * 
+	 * cas(x) must act as a write barrier on x, ie if cas(x) succeeds 
+	 * and is observed by another cpu, then all cpus must be able to 
+	 * make the effects of cas(x) visible just by issuing a load barrier.
+	 * For example:
+	 * cpu1         cpu2            cpu3
+	 *                              cas(x, 0 -> 1), succeeds 
+	 *              cas(x, 0 -> 1), fails
+	 *              MB, to order load of x in cas and store to y
+	 *              y = 7
+	 * sees y == 7
+	 * loadMB must be enough to make cas(x) on cpu3 visible to cpu1, ie x == 1.
+	 * 
+	 * If cas() did not work this way:
+	 * a) our head move protocol would not be correct.
+	 * b) freeing an item linked to a moved head after another item was
+	 *   inserted in front of it, would require more than one grace period.
+	 * 
+	 * Ad (a): In the following example, cpu1 starts moving old_head
+	 * to new_head, cpu2 completes the move and cpu3 notices cpu2
+	 * completed the move before cpu1 gets a chance to notice cpu2
+	 * had already completed the move. Our requirements for cas() 
+	 * assume cpu3 will see a valid and mutable value in new_head 
+	 * after issuing a load memory barrier once it has determined 
+	 * the old_head's value had been successfully moved to new_head 
+	 * (because it sees old_head marked invalid).
+	 * 
+	 *  cpu1             cpu2             cpu3
+	 *   cas(old_head, <addr, N>, <addr, Const>), succeeds
+	 *   cas-order-barrier
+	 *   // Move from old_head to new_head started, now the interesting stuff:
+	 *   cas(new_head, <0, Inv>, <addr, N>), succeeds
+	 * 
+	 *                    cas(new_head, <0, Inv>, <addr, N>), but fails
+	 *                    cas-order-barrier
+	 *                    cas(old_head, <addr, Const>, <addr, Inv>), succeeds
+	 *                                     
+	 *                                     Sees old_head marked Inv (by cpu2)
+	 *                                     load-MB
+	 *                                     assert(new_head == <addr, N>)
+	 *   
+	 *   cas-order-barrier
+	 *  
+	 * Even though cpu1 did not yet issue a cas-order-barrier, cpu1's store
+	 * to new_head (successful cas()) must be made visible to cpu3 with
+	 * a load memory barrier if cpu1's store to new_head is visible
+	 * on another cpu (cpu2) and that cpu's (cpu2's) store to old_head
+	 * is already visible to cpu3.
+	 */
+	void *expected = (void*)cur;
+	
+	/* 
+	 * Use the acquire-release model, although we could probably
+	 * get away even with the relaxed memory model due to our use
+	 * of explicit memory barriers.
+	 */
+	__atomic_compare_exchange_n((void**)link, &expected, (void *)new, false,
+		__ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE);
+	
+	return (marked_ptr_t) expected;
+}
+
+/** Orders compare-and-swaps to different memory locations. */
+static inline void cas_order_barrier(void)
+{
+	/* Make sure CAS to different memory locations are ordered. */
+	write_barrier();
+}
+
+
+/** @}
+ */
Index: kernel/generic/src/adt/list.c
===================================================================
--- kernel/generic/src/adt/list.c	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/generic/src/adt/list.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -68,24 +68,26 @@
 }
 
-/** Concatenate two lists
- *
- * Concatenate lists @a list1 and @a list2, producing a single
- * list @a list1 containing items from both (in @a list1, @a list2
- * order) and empty list @a list2.
- *
- * @param list1		First list and concatenated output
- * @param list2 	Second list and empty output.
- *
+/** Moves items of one list into another after the specified item.
+ * 
+ * Inserts all items of @a list after item at @a pos in another list. 
+ * Both lists may be empty. 
+ * 
+ * @param list Source list to move after pos. Empty afterwards.
+ * @param pos Source items will be placed after this item.
  */
-void list_concat(list_t *list1, list_t *list2)
+void list_splice(list_t *list, link_t *pos)
 {
-	if (list_empty(list2))
+	if (list_empty(list)) 
 		return;
-
-	list2->head.next->prev = list1->head.prev;
-	list2->head.prev->next = &list1->head;
-	list1->head.prev->next = list2->head.next;
-	list1->head.prev = list2->head.prev;
-	list_initialize(list2);
+	
+	/* Attach list to destination. */
+	list->head.next->prev = pos;
+	list->head.prev->next = pos->next;
+	
+	/* Link destination list to the added list. */
+	pos->next->prev = list->head.prev;
+	pos->next = list->head.next;
+	
+	list_initialize(list);
 }
 
Index: kernel/generic/src/console/chardev.c
===================================================================
--- kernel/generic/src/console/chardev.c	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/generic/src/console/chardev.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -39,5 +39,5 @@
 #include <print.h>
 #include <func.h>
-#include <arch.h>
+#include <cpu.h>
 
 /** Initialize input character device.
Index: kernel/generic/src/console/cmd.c
===================================================================
--- kernel/generic/src/console/cmd.c	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/generic/src/console/cmd.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -69,4 +69,6 @@
 #include <sysinfo/sysinfo.h>
 #include <symtab.h>
+#include <synch/workqueue.h>
+#include <synch/rcu.h>
 #include <errno.h>
 
@@ -525,4 +527,22 @@
 };
 
+/* Data and methods for the 'workq' command */
+static int cmd_workq(cmd_arg_t *argv);
+static cmd_info_t workq_info = {
+	.name = "workq",
+	.description = "Show global workq information.",
+	.func = cmd_workq,
+	.argc = 0
+};
+
+/* Data and methods for the 'rcu' command */
+static int cmd_rcu(cmd_arg_t *argv);
+static cmd_info_t rcu_info = {
+	.name = "rcu",
+	.description = "Show RCU run-time statistics.",
+	.func = cmd_rcu,
+	.argc = 0
+};
+
 /* Data and methods for 'ipc' command */
 static int cmd_ipc(cmd_arg_t *argv);
@@ -588,4 +608,5 @@
 	&physmem_info,
 	&reboot_info,
+	&rcu_info,
 	&sched_info,
 	&set4_info,
@@ -598,4 +619,5 @@
 	&uptime_info,
 	&version_info,
+	&workq_info,
 	&zones_info,
 	&zone_info,
@@ -1280,4 +1302,28 @@
 }
 
+/** Prints information about the global work queue.
+ *
+ * @param argv Ignored
+ *
+ * @return Always 1
+ */
+int cmd_workq(cmd_arg_t *argv)
+{
+	workq_global_print_info();
+	return 1;
+}
+
+/** Prints RCU statistics.
+ *
+ * @param argv Ignored
+ *
+ * @return Always 1
+ */
+int cmd_rcu(cmd_arg_t *argv)
+{
+	rcu_print_stat();
+	return 1;
+}
+
 /** Command for listing memory zones
  *
Index: kernel/generic/src/console/console.c
===================================================================
--- kernel/generic/src/console/console.c	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/generic/src/console/console.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -52,4 +52,6 @@
 #include <errno.h>
 #include <str.h>
+#include <mm/frame.h> /* SIZE2FRAMES */
+#include <mm/slab.h>  /* malloc */
 
 #define KLOG_PAGES    8
Index: kernel/generic/src/console/kconsole.c
===================================================================
--- kernel/generic/src/console/kconsole.c	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/generic/src/console/kconsole.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -60,4 +60,5 @@
 #include <putchar.h>
 #include <str.h>
+#include <mm/slab.h>
 
 /** Simple kernel console.
Index: kernel/generic/src/cpu/cpu.c
===================================================================
--- kernel/generic/src/cpu/cpu.c	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/generic/src/cpu/cpu.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -50,4 +50,5 @@
 #include <sysinfo/sysinfo.h>
 #include <arch/cycle.h>
+#include <synch/rcu.h>
 
 cpu_t *cpus;
@@ -102,4 +103,5 @@
 	cpu_identify();
 	cpu_arch_init();
+	rcu_cpu_init();
 }
 
Index: kernel/generic/src/cpu/cpu_mask.c
===================================================================
--- kernel/generic/src/cpu/cpu_mask.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
+++ kernel/generic/src/cpu/cpu_mask.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup generic
+ * @{
+ */
+
+/**
+ * @file
+ * @brief CPU mask manipulation functions.
+ */
+#include <cpu/cpu_mask.h>
+#include <cpu.h>
+#include <config.h>
+
+static const size_t word_size = sizeof(unsigned int);
+static const size_t word_bit_cnt = 8 * sizeof(unsigned int);
+
+/** Returns the size of cpu_mask_t for the detected number of cpus in bytes. */
+size_t cpu_mask_size(void)
+{
+	size_t word_cnt = (config.cpu_count + word_bit_cnt - 1) / word_bit_cnt;
+	return word_cnt * word_size;
+}
+
+/** Add first cpu_cnt cpus to the mask, ie sets the first cpu_cnt bits. */
+static void cpu_mask_count(cpu_mask_t *cpus, size_t cpu_cnt)
+{
+	ASSERT(NULL != cpus);
+	ASSERT(cpu_cnt <= config.cpu_count);
+	
+	for (size_t active_word = 0; 
+		(active_word + 1) * word_bit_cnt <= cpu_cnt;
+		++active_word) {
+		/* Set all bits in the cell/word. */
+		cpus->mask[active_word] = -1;
+	}
+	
+	size_t remaining_bits = (cpu_cnt % word_bit_cnt);
+	if (0 < remaining_bits) {
+		/* Set lower remaining_bits of the last word. */
+		cpus->mask[cpu_cnt / word_bit_cnt] = (1 << remaining_bits) - 1;
+	}
+}
+
+/** Sets bits corresponding to the active cpus, ie the first 
+ * config.cpu_active cpus. 
+ */
+void cpu_mask_active(cpu_mask_t *cpus)
+{
+	cpu_mask_none(cpus);
+	cpu_mask_count(cpus, config.cpu_active);
+}
+
+/** Sets bits for all cpus of the mask. */
+void cpu_mask_all(cpu_mask_t *cpus)
+{
+	cpu_mask_count(cpus, config.cpu_count);
+}
+
+/** Resets/removes all bits. */
+void cpu_mask_none(cpu_mask_t *cpus)
+{
+	ASSERT(cpus);
+	
+	size_t word_cnt = cpu_mask_size() / word_size;
+		
+	for (size_t word = 0; word < word_cnt; ++word) {
+		cpus->mask[word] = 0;
+	}
+}
+
+/** Sets the bit corresponding to cpu_id to true. */
+void cpu_mask_set(cpu_mask_t *cpus, unsigned int cpu_id)
+{
+	size_t word = cpu_id / word_bit_cnt;
+	size_t word_pos = cpu_id % word_bit_cnt;
+	
+	cpus->mask[word] |= (1U << word_pos);
+}
+
+/** Resets the bit corresponding to cpu_id to false. */
+void cpu_mask_reset(cpu_mask_t *cpus, unsigned int cpu_id)
+{
+	size_t word = cpu_id / word_bit_cnt;
+	size_t word_pos = cpu_id % word_bit_cnt;
+	
+	cpus->mask[word] &= ~(1U << word_pos);
+}
+
+/** Returns true if the bit corresponding to cpu_id is set. */
+bool cpu_mask_is_set(cpu_mask_t *cpus, unsigned int cpu_id)
+{
+	size_t word = cpu_id / word_bit_cnt;
+	size_t word_pos = cpu_id % word_bit_cnt;
+	
+	return 0 != (cpus->mask[word] & (1U << word_pos));
+}
+
+/** Returns true if no bits are set. */
+bool cpu_mask_is_none(cpu_mask_t *cpus)
+{
+	size_t word_cnt = cpu_mask_size() / word_size;
+
+	for (size_t word = 0; word < word_cnt; ++word) {
+		if (cpus->mask[word])
+			return false;
+	}
+	
+	return true;
+}
+
+/** @}
+ */
Index: kernel/generic/src/debug/panic.c
===================================================================
--- kernel/generic/src/debug/panic.c	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/generic/src/debug/panic.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -97,5 +97,5 @@
 	if (THE != NULL) {
 		printf("pe=%" PRIun " thr=%p task=%p cpu=%p as=%p"
-		    " magic=%#" PRIx32 "\n", THE->preemption_disabled,
+		    " magic=%#" PRIx32 "\n", THE->preemption,
 		    THE->thread, THE->task, THE->cpu, THE->as, THE->magic);
 	} else
Index: kernel/generic/src/interrupt/interrupt.c
===================================================================
--- kernel/generic/src/interrupt/interrupt.c	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/generic/src/interrupt/interrupt.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -111,13 +111,11 @@
 	}
 	
-	/* Account CPU usage if it has waked up from sleep */
-	if (CPU) {
+	/* Account CPU usage if it woke up from sleep */
+	if (CPU && CPU->idle) {
 		irq_spinlock_lock(&CPU->lock, false);
-		if (CPU->idle) {
-			uint64_t now = get_cycle();
-			CPU->idle_cycles += now - CPU->last_cycle;
-			CPU->last_cycle = now;
-			CPU->idle = false;
-		}
+		uint64_t now = get_cycle();
+		CPU->idle_cycles += now - CPU->last_cycle;
+		CPU->last_cycle = now;
+		CPU->idle = false;
 		irq_spinlock_unlock(&CPU->lock, false);
 	}
Index: kernel/generic/src/ipc/kbox.c
===================================================================
--- kernel/generic/src/ipc/kbox.c	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/generic/src/ipc/kbox.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -44,4 +44,5 @@
 #include <ipc/kbox.h>
 #include <print.h>
+#include <proc/thread.h>
 
 void ipc_kbox_cleanup(void)
Index: kernel/generic/src/lib/str.c
===================================================================
--- kernel/generic/src/lib/str.c	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/generic/src/lib/str.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -111,4 +111,5 @@
 #include <debug.h>
 #include <macros.h>
+#include <mm/slab.h>
 
 /** Check the condition if wchar_t is signed */
@@ -567,4 +568,5 @@
 	/* There must be space for a null terminator in the buffer. */
 	ASSERT(size > 0);
+	ASSERT(src != NULL);
 	
 	size_t src_off = 0;
Index: kernel/generic/src/main/kinit.c
===================================================================
--- kernel/generic/src/main/kinit.c	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/generic/src/main/kinit.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -78,4 +78,6 @@
 #include <synch/waitq.h>
 #include <synch/spinlock.h>
+#include <synch/workqueue.h>
+#include <synch/rcu.h>
 
 #define ALIVE_CHARS  4
@@ -104,6 +106,14 @@
 	 */
 	thread_detach(THREAD);
-	
+
 	interrupts_disable();
+	
+	/* Start processing RCU callbacks. RCU is fully functional afterwards. */
+	rcu_kinit_init();
+	
+	/*
+	 * Start processing work queue items. Some may have been queued during boot.
+	 */
+	workq_global_worker_init();
 	
 #ifdef CONFIG_SMP
Index: kernel/generic/src/main/main.c
===================================================================
--- kernel/generic/src/main/main.c	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/generic/src/main/main.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -75,4 +75,6 @@
 #include <synch/waitq.h>
 #include <synch/futex.h>
+#include <synch/workqueue.h>
+#include <smp/smp_call.h>
 #include <arch/arch.h>
 #include <arch.h>
@@ -244,6 +246,9 @@
 	
 	cpu_init();
-	
 	calibrate_delay_loop();
+	arch_post_cpu_init();
+
+	smp_call_init();
+	workq_global_init();
 	clock_counter_init();
 	timeout_init();
@@ -347,4 +352,6 @@
 void main_ap_separated_stack(void)
 {
+	smp_call_init();
+	
 	/*
 	 * Configure timeouts for this cpu.
Index: kernel/generic/src/main/shutdown.c
===================================================================
--- kernel/generic/src/main/shutdown.c	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/generic/src/main/shutdown.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -37,4 +37,5 @@
 
 #include <arch.h>
+#include <proc/task.h>
 #include <func.h>
 #include <print.h>
Index: kernel/generic/src/mm/frame.c
===================================================================
--- kernel/generic/src/mm/frame.c	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/generic/src/mm/frame.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -61,4 +61,5 @@
 #include <config.h>
 #include <str.h>
+#include <proc/thread.h> /* THREAD */
 
 zones_t zones;
Index: kernel/generic/src/mm/km.c
===================================================================
--- kernel/generic/src/mm/km.c	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/generic/src/mm/km.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -49,4 +49,5 @@
 #include <macros.h>
 #include <bitops.h>
+#include <proc/thread.h>
 
 static ra_arena_t *km_ni_arena;
Index: kernel/generic/src/mm/slab.c
===================================================================
--- kernel/generic/src/mm/slab.c	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/generic/src/mm/slab.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -114,4 +114,5 @@
 #include <bitops.h>
 #include <macros.h>
+#include <cpu.h>
 
 IRQ_SPINLOCK_STATIC_INITIALIZE(slab_cache_lock);
Index: kernel/generic/src/preempt/preemption.c
===================================================================
--- kernel/generic/src/preempt/preemption.c	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/generic/src/preempt/preemption.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -37,23 +37,5 @@
 
 #include <preemption.h>
-#include <arch.h>
-#include <arch/asm.h>
-#include <arch/barrier.h>
-#include <debug.h>
 
-/** Increment preemption disabled counter. */
-void preemption_disable(void)
-{
-	THE->preemption_disabled++;
-	memory_barrier();
-}
-
-/** Decrement preemption disabled counter. */
-void preemption_enable(void)
-{
-	ASSERT(PREEMPTION_DISABLED);
-	memory_barrier();
-	THE->preemption_disabled--;
-}
 
 /** @}
Index: kernel/generic/src/proc/scheduler.c
===================================================================
--- kernel/generic/src/proc/scheduler.c	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/generic/src/proc/scheduler.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -52,4 +52,6 @@
 #include <atomic.h>
 #include <synch/spinlock.h>
+#include <synch/workqueue.h>
+#include <synch/rcu.h>
 #include <config.h>
 #include <context.h>
@@ -63,4 +65,5 @@
 #include <debug.h>
 #include <stacktrace.h>
+#include <cpu.h>
 
 static void scheduler_separated_stack(void);
@@ -86,4 +89,5 @@
 {
 	before_thread_runs_arch();
+	rcu_before_thread_runs();
 	
 #ifdef CONFIG_FPU_LAZY
@@ -126,4 +130,6 @@
 static void after_thread_ran(void)
 {
+	workq_after_thread_ran();
+	rcu_after_thread_ran();
 	after_thread_ran_arch();
 }
@@ -218,4 +224,6 @@
 		goto loop;
 	}
+
+	ASSERT(!CPU->idle);
 	
 	unsigned int i;
@@ -397,4 +405,5 @@
 	ASSERT((!THREAD) || (irq_spinlock_locked(&THREAD->lock)));
 	ASSERT(CPU != NULL);
+	ASSERT(interrupts_disabled());
 	
 	/*
@@ -420,4 +429,5 @@
 		
 		case Exiting:
+			rcu_thread_exiting();
 repeat:
 			if (THREAD->detached) {
Index: kernel/generic/src/proc/task.c
===================================================================
--- kernel/generic/src/proc/task.c	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/generic/src/proc/task.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -41,4 +41,5 @@
 #include <mm/slab.h>
 #include <atomic.h>
+#include <synch/futex.h>
 #include <synch/spinlock.h>
 #include <synch/waitq.h>
@@ -153,5 +154,4 @@
 	
 	irq_spinlock_initialize(&task->lock, "task_t_lock");
-	mutex_initialize(&task->futexes_lock, MUTEX_PASSIVE);
 	
 	list_initialize(&task->threads);
@@ -165,5 +165,5 @@
 	spinlock_initialize(&task->active_calls_lock, "active_calls_lock");
 	list_initialize(&task->active_calls);
-	
+		
 #ifdef CONFIG_UDEBUG
 	/* Init kbox stuff */
@@ -221,5 +221,5 @@
 		(void) ipc_phone_connect(&task->phones[0], ipc_phone_0);
 	
-	btree_create(&task->futexes);
+	futex_task_init(task);
 	
 	/*
@@ -262,5 +262,5 @@
 	 * Free up dynamically allocated state.
 	 */
-	btree_destroy(&task->futexes);
+	futex_task_deinit(task);
 	
 	/*
Index: kernel/generic/src/proc/the.c
===================================================================
--- kernel/generic/src/proc/the.c	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/generic/src/proc/the.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -43,4 +43,5 @@
 
 #include <arch.h>
+#include <debug.h>
 
 /** Initialize THE structure
@@ -53,5 +54,5 @@
 void the_initialize(the_t *the)
 {
-	the->preemption_disabled = 0;
+	the->preemption = 0;
 	the->cpu = NULL;
 	the->thread = NULL;
@@ -59,4 +60,7 @@
 	the->as = NULL;
 	the->magic = MAGIC;
+#ifdef RCU_PREEMPT_A	
+	the->rcu_nesting = 0;
+#endif
 }
 
Index: kernel/generic/src/proc/thread.c
===================================================================
--- kernel/generic/src/proc/thread.c	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/generic/src/proc/thread.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -46,4 +46,6 @@
 #include <synch/spinlock.h>
 #include <synch/waitq.h>
+#include <synch/workqueue.h>
+#include <synch/rcu.h>
 #include <cpu.h>
 #include <str.h>
@@ -260,4 +262,11 @@
 }
 
+/** Invoked right before thread_ready() readies the thread. thread is locked. */
+static void before_thread_is_ready(thread_t *thread)
+{
+	ASSERT(irq_spinlock_locked(&thread->lock));
+	workq_before_thread_is_ready(thread);
+}
+
 /** Make thread ready
  *
@@ -272,13 +281,20 @@
 	
 	ASSERT(thread->state != Ready);
+
+	before_thread_is_ready(thread);
 	
 	int i = (thread->priority < RQ_COUNT - 1) ?
 	    ++thread->priority : thread->priority;
-	
-	cpu_t *cpu;
-	if (thread->wired || thread->nomigrate || thread->fpu_context_engaged) {
-		ASSERT(thread->cpu != NULL);
-		cpu = thread->cpu;
-	} else
+
+	/* Check that thread->cpu is set whenever it needs to be. */
+	ASSERT(thread->cpu != NULL || 
+		(!thread->wired && !thread->nomigrate && !thread->fpu_context_engaged));
+
+	/* 
+	 * Prefer to run on the same cpu as the last time. Used by wired 
+	 * threads as well as threads with disabled migration.
+	 */
+	cpu_t *cpu = thread->cpu;
+	if (cpu == NULL) 
 		cpu = CPU;
 	
@@ -374,4 +390,6 @@
 	thread->task = task;
 	
+	thread->workq = NULL;
+	
 	thread->fpu_context_exists = false;
 	thread->fpu_context_engaged = false;
@@ -388,4 +406,6 @@
 	/* Might depend on previous initialization */
 	thread_create_arch(thread);
+	
+	rcu_thread_init(thread);
 	
 	if ((flags & THREAD_FLAG_NOATTACH) != THREAD_FLAG_NOATTACH)
@@ -498,5 +518,5 @@
 			 */
 			ipc_cleanup();
-			futex_cleanup();
+			futex_task_cleanup();
 			LOG("Cleanup of task %" PRIu64" completed.", TASK->taskid);
 		}
@@ -518,4 +538,52 @@
 	/* Not reached */
 	while (true);
+}
+
+/** Interrupts an existing thread so that it may exit as soon as possible.
+ * 
+ * Threads that are blocked waiting for a synchronization primitive 
+ * are woken up with a return code of ESYNCH_INTERRUPTED if the
+ * blocking call was interruptable. See waitq_sleep_timeout().
+ * 
+ * The caller must guarantee the thread object is valid during the entire
+ * function, eg by holding the threads_lock lock.
+ * 
+ * Interrupted threads automatically exit when returning back to user space.
+ * 
+ * @param thread A valid thread object. The caller must guarantee it
+ *               will remain valid until thread_interrupt() exits.
+ */
+void thread_interrupt(thread_t *thread)
+{
+	ASSERT(thread != NULL);
+	
+	irq_spinlock_lock(&thread->lock, true);
+	
+	thread->interrupted = true;
+	bool sleeping = (thread->state == Sleeping);
+	
+	irq_spinlock_unlock(&thread->lock, true);
+	
+	if (sleeping)
+		waitq_interrupt_sleep(thread);
+}
+
+/** Returns true if the thread was interrupted.
+ * 
+ * @param thread A valid thread object. User must guarantee it will
+ *               be alive during the entire call.
+ * @return true if the thread was already interrupted via thread_interrupt().
+ */
+bool thread_interrupted(thread_t *thread)
+{
+	ASSERT(thread != NULL);
+	
+	bool interrupted;
+	
+	irq_spinlock_lock(&thread->lock, true);
+	interrupted = thread->interrupted;
+	irq_spinlock_unlock(&thread->lock, true);
+	
+	return interrupted;
 }
 
Index: kernel/generic/src/smp/smp_call.c
===================================================================
--- kernel/generic/src/smp/smp_call.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
+++ kernel/generic/src/smp/smp_call.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -0,0 +1,278 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup generic
+ * @{
+ */
+
+/**
+ * @file
+ * @brief Facility to invoke functions on other cpus via IPIs.
+ */
+
+#include <smp/smp_call.h>
+#include <arch/barrier.h>
+#include <arch/asm.h>  /* interrupt_disable */
+#include <arch.h>
+#include <config.h>
+#include <preemption.h>
+#include <debug.h>
+#include <cpu.h>
+
+static void call_start(smp_call_t *call_info, smp_call_func_t func, void *arg);
+static void call_done(smp_call_t *call_info);
+static void call_wait(smp_call_t *call_info);
+
+
+/** Init smp_call() on the local cpu. */
+void smp_call_init(void)
+{
+	ASSERT(CPU);
+	ASSERT(PREEMPTION_DISABLED || interrupts_disabled());
+	
+	spinlock_initialize(&CPU->smp_calls_lock, "cpu[].smp_calls_lock");
+	list_initialize(&CPU->smp_pending_calls);
+}
+
+/** Invokes a function on a specific cpu and waits for it to complete.
+ * 
+ * Calls @a func on the CPU denoted by its logical id @a cpu_id.
+ * The function will execute with interrupts disabled. It should 
+ * be a quick and simple function and must never block. 
+ * 
+ * If @a cpu_id is the local CPU, the function will be invoked
+ * directly.
+ * 
+ * All memory accesses made prior to smp_call() will be visible
+ * to @a func on cpu @a cpu_id. Similarly, any changes @a func
+ * makes on cpu @a cpu_id will be visible on this cpu once
+ * smp_call() returns.
+ * 
+ * Invoking @a func on the destination cpu acts as a memory barrier
+ * on that cpu.
+ * 
+ * @param cpu_id Destination CPU's logical id (eg CPU->id)
+ * @param func Function to call.
+ * @param arg Argument to pass to the user supplied function @a func.
+ */
+void smp_call(unsigned int cpu_id, smp_call_func_t func, void *arg)
+{
+	smp_call_t call_info;
+	smp_call_async(cpu_id, func, arg, &call_info);
+	smp_call_wait(&call_info);
+}
+
+/** Invokes a function on a specific cpu asynchronously.
+ * 
+ * Calls @a func on the CPU denoted by its logical id @a cpu_id.
+ * The function will execute with interrupts disabled. It should 
+ * be a quick and simple function and must never block. 
+ * 
+ * Pass @a call_info to smp_call_wait() in order to wait for 
+ * @a func to complete.
+ * 
+ * @a call_info must be valid until/after @a func returns. Use
+ * smp_call_wait() to wait until it is safe to free @a call_info.
+ * 
+ * If @a cpu_id is the local CPU, the function will be invoked
+ * directly. If the destination cpu id @a cpu_id is invalid
+ * or denotes an inactive cpu, the call is discarded immediately.
+ * 
+ * All memory accesses of the caller prior to smp_call_async()
+ * will be made visible to @a func on the other cpu. Similarly, 
+ * any changes @a func makes on cpu @a cpu_id will be visible
+ * to this cpu when smp_call_wait() returns.
+ * 
+ * Invoking @a func on the destination cpu acts as a memory barrier
+ * on that cpu.
+ * 
+ * Interrupts must be enabled. Otherwise you run the risk
+ * of a deadlock.
+ * 
+ * @param cpu_id Destination CPU's logical id (eg CPU->id).
+ * @param func Function to call.
+ * @param arg Argument to pass to the user supplied function @a func.
+ * @param call_info Use it to wait for the function to complete. Must
+ *          be valid until the function completes.
+ */
+void smp_call_async(unsigned int cpu_id, smp_call_func_t func, void *arg, 
+	smp_call_t *call_info)
+{
+	/* 
+	 * Interrupts must not be disabled or you run the risk of a deadlock 
+	 * if both the destination and source cpus try to send an IPI to each
+	 * other with interrupts disabled. Because the interrupts are disabled 
+	 * the IPIs cannot be delivered and both cpus will forever busy wait 
+	 * for an acknowledgment of the IPI from the other cpu.
+	 */
+	ASSERT(!interrupts_disabled());
+	ASSERT(call_info != NULL);
+	
+	/* Discard invalid calls. */
+	if (config.cpu_count <= cpu_id || !cpus[cpu_id].active) {
+		call_start(call_info, func, arg);
+		call_done(call_info);
+		return;
+	}
+	
+	/* Protect cpu->id against migration. */
+	preemption_disable();
+
+	call_start(call_info, func, arg);
+	
+	if (cpu_id != CPU->id) {
+#ifdef CONFIG_SMP
+		spinlock_lock(&cpus[cpu_id].smp_calls_lock);
+		list_append(&call_info->calls_link, &cpus[cpu_id].smp_pending_calls);
+		spinlock_unlock(&cpus[cpu_id].smp_calls_lock);
+
+		/*
+		 * If a platform supports SMP it must implement arch_smp_call_ipi().
+		 * It should issue an IPI on cpu_id and invoke smp_call_ipi_recv()
+		 * on cpu_id in turn. 
+		 * 
+		 * Do not implement as just an empty dummy function. Instead
+		 * consider providing a full implementation or at least a version 
+		 * that panics if invoked. Note that smp_call_async() never
+		 * calls arch_smp_call_ipi() on uniprocessors even if CONFIG_SMP.
+		 */
+		arch_smp_call_ipi(cpu_id);
+#endif
+	} else {
+		/* Invoke local smp calls in place. */
+		ipl_t ipl = interrupts_disable();
+		func(arg);
+		interrupts_restore(ipl);
+		
+		call_done(call_info);
+	}
+	
+	preemption_enable();
+}
+
+/** Waits for a function invoked on another CPU asynchronously to complete.
+ * 
+ * Does not sleep but rather spins.
+ * 
+ * Example usage:
+ * @code
+ * void hello(void *p) {
+ *     puts((char*)p);
+ * }
+ * 
+ * smp_call_t call_info;
+ * smp_call_async(cpus[2].id, hello, "hi!\n", &call_info);
+ * // Do some work. In the meantime, hello() is executed on cpu2.
+ * smp_call_wait(&call_info);
+ * @endcode
+ * 
+ * @param call_info Initialized by smp_call_async().
+ */
+void smp_call_wait(smp_call_t *call_info)
+{
+	call_wait(call_info);
+}
+
+#ifdef CONFIG_SMP
+
+/** Architecture independent smp call IPI handler.
+ * 
+ * Interrupts must be disabled. Tolerates spurious calls.
+ */
+void smp_call_ipi_recv(void)
+{
+	ASSERT(interrupts_disabled());
+	ASSERT(CPU);
+	
+	list_t calls_list;
+	list_initialize(&calls_list);
+	
+	/* 
+	 * Acts as a load memory barrier. Any changes made by the cpu that
+	 * added the smp_call to calls_list will be made visible to this cpu.
+	 */
+	spinlock_lock(&CPU->smp_calls_lock);
+	list_concat(&calls_list, &CPU->smp_pending_calls);
+	spinlock_unlock(&CPU->smp_calls_lock);
+
+	/* Walk the list manually, so that we can safely remove list items. */
+	for (link_t *cur = calls_list.head.next, *next = cur->next; 
+		!list_empty(&calls_list); cur = next, next = cur->next) {
+		
+		smp_call_t *call_info = list_get_instance(cur, smp_call_t, calls_link);
+		list_remove(cur);
+		
+		call_info->func(call_info->arg);
+		call_done(call_info);
+	}
+}
+
+#endif /* CONFIG_SMP */
+
+static void call_start(smp_call_t *call_info, smp_call_func_t func, void *arg)
+{
+	link_initialize(&call_info->calls_link);
+	call_info->func = func;
+	call_info->arg = arg;
+	
+	/*
+	 * We can't use standard spinlocks here because we want to lock
+	 * the structure on one cpu and unlock it on another (without
+	 * messing up the preemption count).
+	 */
+	atomic_set(&call_info->pending, 1);
+	
+	/* Let initialization complete before continuing. */
+	memory_barrier();
+}
+
+static void call_done(smp_call_t *call_info)
+{
+	/* 
+	 * Separate memory accesses of the called function from the 
+	 * announcement of its completion.
+	 */
+	memory_barrier();
+	atomic_set(&call_info->pending, 0);
+}
+
+static void call_wait(smp_call_t *call_info)
+{
+	do {
+		/* 
+		 * Ensure memory accesses following call_wait() are ordered
+		 * after completion of the called function on another cpu. 
+		 * Also, speed up loading of call_info->pending.
+		 */
+		memory_barrier();
+	} while (atomic_get(&call_info->pending));
+}
+
+
+/** @}
+ */
Index: kernel/generic/src/synch/condvar.c
===================================================================
--- kernel/generic/src/synch/condvar.c	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/generic/src/synch/condvar.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -38,4 +38,5 @@
 #include <synch/condvar.h>
 #include <synch/mutex.h>
+#include <synch/spinlock.h>
 #include <synch/waitq.h>
 #include <arch.h>
@@ -90,4 +91,5 @@
 
 	ipl = waitq_sleep_prepare(&cv->wq);
+	/* Unlock only after the waitq is locked so we don't miss a wakeup. */
 	mutex_unlock(mtx);
 
@@ -95,10 +97,93 @@
 	rc = waitq_sleep_timeout_unsafe(&cv->wq, usec, flags);
 
+	waitq_sleep_finish(&cv->wq, rc, ipl);
+	/* Lock only after releasing the waitq to avoid a possible deadlock. */
 	mutex_lock(mtx);
-	waitq_sleep_finish(&cv->wq, rc, ipl);
 
 	return rc;
 }
 
+/** Wait for the condition to become true with a locked spinlock.
+ * 
+ * The function is not aware of irq_spinlock. Therefore do not even
+ * try passing irq_spinlock_t to it. Use _condvar_wait_timeout_irq_spinlock()
+ * instead.
+ *
+ * @param cv		Condition variable.
+ * @param lock		Locked spinlock.
+ * @param usec		Timeout value in microseconds.
+ * @param flags		Select mode of operation.
+ *
+ * For exact description of meaning of possible combinations of usec and flags,
+ * see comment for waitq_sleep_timeout().  Note that when
+ * SYNCH_FLAGS_NON_BLOCKING is specified here, ESYNCH_WOULD_BLOCK is always
+ * returned.
+ *
+ * @return See comment for waitq_sleep_timeout().
+ */
+int _condvar_wait_timeout_spinlock_impl(condvar_t *cv, spinlock_t *lock, 
+	uint32_t usec, int flags)
+{
+	int rc;
+	ipl_t ipl;
+	
+	ipl = waitq_sleep_prepare(&cv->wq);
+
+	/* Unlock only after the waitq is locked so we don't miss a wakeup. */
+	spinlock_unlock(lock);
+
+	cv->wq.missed_wakeups = 0;	/* Enforce blocking. */
+	rc = waitq_sleep_timeout_unsafe(&cv->wq, usec, flags);
+
+	waitq_sleep_finish(&cv->wq, rc, ipl);
+	/* Lock only after releasing the waitq to avoid a possible deadlock. */
+	spinlock_lock(lock);
+	
+	return rc;
+}
+
+/** Wait for the condition to become true with a locked irq spinlock.
+ * 
+ * @param cv		Condition variable.
+ * @param lock		Locked irq spinlock.
+ * @param usec		Timeout value in microseconds.
+ * @param flags		Select mode of operation.
+ *
+ * For exact description of meaning of possible combinations of usec and flags,
+ * see comment for waitq_sleep_timeout().  Note that when
+ * SYNCH_FLAGS_NON_BLOCKING is specified here, ESYNCH_WOULD_BLOCK is always
+ * returned.
+ *
+ * @return See comment for waitq_sleep_timeout().
+ */
+int _condvar_wait_timeout_irq_spinlock(condvar_t *cv, irq_spinlock_t *irq_lock, 
+	uint32_t usec, int flags)
+{
+	int rc;
+	/* Save spinlock's state so we can restore it correctly later on. */
+	ipl_t ipl = irq_lock->ipl;
+	bool guard = irq_lock->guard;
+	
+	irq_lock->guard = false;
+	
+	/* 
+	 * waitq_sleep_prepare() restores interrupts to the current state, 
+	 * ie disabled. Therefore, interrupts will remain disabled while 
+	 * it spins waiting for a pending timeout handler to complete. 
+	 * Although it spins with interrupts disabled there can only
+	 * be a pending timeout if we failed to cancel an imminent
+	 * timeout (on another cpu) during a wakeup. As a result the 
+	 * timeout handler is guaranteed to run (it is most likely already 
+	 * running) and there is no danger of a deadlock.
+	 */
+	rc = _condvar_wait_timeout_spinlock(cv, &irq_lock->lock, usec, flags);
+	
+	irq_lock->guard = guard;
+	irq_lock->ipl = ipl;
+	
+	return rc;
+}
+
+
 /** @}
  */
Index: kernel/generic/src/synch/futex.c
===================================================================
--- kernel/generic/src/synch/futex.c	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/generic/src/synch/futex.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -1,4 +1,5 @@
 /*
  * Copyright (c) 2006 Jakub Jermar
+ * Copyright (c) 2012 Adam Hraska
  * All rights reserved.
  *
@@ -34,4 +35,28 @@
  * @file
  * @brief	Kernel backend for futexes.
+ * 
+ * Kernel futex objects are stored in a global hash table futex_ht 
+ * where the physical address of the futex variable (futex_t.paddr)
+ * is used as the lookup key. As a result multiple address spaces 
+ * may share the same futex variable. 
+ * 
+ * A kernel futex object is created the first time a task accesses
+ * the futex (having a futex variable at a physical address not 
+ * encountered before). Futex object's lifetime is governed by
+ * a reference count that represents the number of all the different
+ * user space virtual addresses from all tasks that map to the
+ * physical address of the futex variable. A futex object is freed
+ * when the last task having accessed the futex exits.
+ * 
+ * Each task keeps track of the futex objects it accessed in a list
+ * of pointers (futex_ptr_t, task->futex_list) to the different futex 
+ * objects.
+ * 
+ * To speed up translation of futex variables' virtual addresses
+ * to their physical addresses, futex pointers accessed by the
+ * task are furthermore stored in a concurrent hash table (CHT,
+ * task->futexes->ht). A single lookup without locks or accesses
+ * to the page table translates a futex variable's virtual address 
+ * into its futex kernel object. 
  */
 
@@ -39,4 +64,5 @@
 #include <synch/mutex.h>
 #include <synch/spinlock.h>
+#include <synch/rcu.h>
 #include <mm/frame.h>
 #include <mm/page.h>
@@ -46,4 +72,5 @@
 #include <genarch/mm/page_pt.h>
 #include <genarch/mm/page_ht.h>
+#include <adt/cht.h>
 #include <adt/hash_table.h>
 #include <adt/list.h>
@@ -52,26 +79,55 @@
 #include <panic.h>
 #include <errno.h>
-#include <print.h>
 
 #define FUTEX_HT_SIZE	1024	/* keep it a power of 2 */
 
-static void futex_initialize(futex_t *futex);
-
-static futex_t *futex_find(uintptr_t paddr);
+/** Task specific pointer to a global kernel futex object. */
+typedef struct futex_ptr {
+	/** CHT link. */
+	cht_link_t cht_link;
+	/** List of all futex pointers used by the task. */
+	link_t all_link;
+	/** Kernel futex object. */
+	futex_t *futex;
+	/** User space virtual address of the futex variable in the task. */
+	uintptr_t uaddr;
+} futex_ptr_t;
+
+
+static void destroy_task_cache(work_t *work);
+
+static void futex_initialize(futex_t *futex, uintptr_t paddr);
+static void futex_add_ref(futex_t *futex);
+static void futex_release_ref(futex_t *futex);
+static void futex_release_ref_locked(futex_t *futex);
+
+static futex_t *get_futex(uintptr_t uaddr);
+static futex_t *find_cached_futex(uintptr_t uaddr);
+static futex_t *get_and_cache_futex(uintptr_t phys_addr, uintptr_t uaddr);
+static bool find_futex_paddr(uintptr_t uaddr, uintptr_t *phys_addr);
+
 static size_t futex_ht_hash(sysarg_t *key);
 static bool futex_ht_compare(sysarg_t *key, size_t keys, link_t *item);
 static void futex_ht_remove_callback(link_t *item);
 
-/**
- * Mutex protecting global futex hash table.
- * It is also used to serialize access to all futex_t structures.
- * Must be acquired before the task futex B+tree lock.
+static size_t task_fut_ht_hash(const cht_link_t *link);
+static size_t task_fut_ht_key_hash(void *key);
+static bool task_fut_ht_equal(const cht_link_t *item1, const cht_link_t *item2);
+static bool task_fut_ht_key_equal(void *key, const cht_link_t *item);
+
+
+/** Mutex protecting the global futex hash table.
+ * 
+ * Acquire task specific TASK->futex_list_lock before this mutex.
  */
 static mutex_t futex_ht_lock;
 
-/** Futex hash table. */
+/** Global kernel futex hash table. Lock futex_ht_lock before accessing.
+ * 
+ * Physical address of the futex variable is the lookup key.
+ */
 static hash_table_t futex_ht;
 
-/** Futex hash table operations. */
+/** Global kernel futex hash table operations. */
 static hash_table_operations_t futex_ht_ops = {
 	.hash = futex_ht_hash,
@@ -80,4 +136,13 @@
 };
 
+/** Task futex cache CHT operations. */
+static cht_ops_t task_futex_ht_ops = {
+	.hash = task_fut_ht_hash,
+	.key_hash = task_fut_ht_key_hash,
+	.equal = task_fut_ht_equal,
+	.key_equal = task_fut_ht_key_equal,
+	.remove_callback = NULL
+};
+
 /** Initialize futex subsystem. */
 void futex_init(void)
@@ -87,14 +152,233 @@
 }
 
-/** Initialize kernel futex structure.
- *
- * @param futex		Kernel futex structure.
- */
-void futex_initialize(futex_t *futex)
+/** Initializes the futex structures for the new task. */
+void futex_task_init(struct task *task)
+{
+	task->futexes = malloc(sizeof(struct futex_cache), 0);
+	
+	cht_create(&task->futexes->ht, 0, 0, 0, true, &task_futex_ht_ops);
+	
+	list_initialize(&task->futexes->list);
+	mutex_initialize(&task->futexes->list_lock, MUTEX_PASSIVE);
+}
+
+/** Destroys the futex structures for the dying task. */
+void futex_task_deinit(task_t *task)
+{
+	/* Interrupts are disabled so we must not block (cannot run cht_destroy). */
+	if (interrupts_disabled()) {
+		/* Invoke the blocking cht_destroy in the background. */
+		workq_global_enqueue_noblock(&task->futexes->destroy_work, 
+			destroy_task_cache);
+	} else {
+		/* We can block. Invoke cht_destroy in this thread. */
+		destroy_task_cache(&task->futexes->destroy_work);
+	}
+}
+
+/** Deallocates a task's CHT futex cache (must already be empty). */
+static void destroy_task_cache(work_t *work)
+{
+	struct futex_cache *cache = 
+		member_to_inst(work, struct futex_cache, destroy_work);
+	
+	/* 
+	 * Destroy the cache before manually freeing items of the cache in case
+	 * table resize is in progress.
+	 */
+	cht_destroy_unsafe(&cache->ht);
+	
+	/* Manually free futex_ptr cache items. */
+	list_foreach_safe(cache->list, cur_link, next_link) {
+		futex_ptr_t *fut_ptr = member_to_inst(cur_link, futex_ptr_t, all_link);
+
+		list_remove(cur_link);
+		free(fut_ptr);
+	}
+	
+	free(cache);
+}
+
+/** Remove references from futexes known to the current task. */
+void futex_task_cleanup(void)
+{
+	struct futex_cache *futexes = TASK->futexes;
+	
+	/* All threads of this task have terminated. This is the last thread. */
+	mutex_lock(&futexes->list_lock);
+	
+	list_foreach_safe(futexes->list, cur_link, next_link) {
+		futex_ptr_t *fut_ptr = member_to_inst(cur_link, futex_ptr_t, all_link);
+
+		/*
+		 * The function is free to free the futex. All other threads of this
+		 * task have already terminated, so they have also definitely
+		 * exited their CHT futex cache protecting rcu reader sections.
+		 * Moreover release_ref() only frees the futex if this is the 
+		 * last task referencing the futex. Therefore, only threads
+		 * of this task may have referenced the futex if it is to be freed.
+		 */
+		futex_release_ref_locked(fut_ptr->futex);
+	}
+	
+	mutex_unlock(&futexes->list_lock);
+}
+
+
+/** Initialize the kernel futex structure.
+ *
+ * @param futex	Kernel futex structure.
+ * @param paddr Physical address of the futex variable.
+ */
+static void futex_initialize(futex_t *futex, uintptr_t paddr)
 {
 	waitq_initialize(&futex->wq);
 	link_initialize(&futex->ht_link);
-	futex->paddr = 0;
+	futex->paddr = paddr;
 	futex->refcount = 1;
+}
+
+/** Increments the counter of tasks referencing the futex. */
+static void futex_add_ref(futex_t *futex)
+{
+	ASSERT(mutex_locked(&futex_ht_lock));
+	ASSERT(0 < futex->refcount);
+	++futex->refcount;
+}
+
+/** Decrements the counter of tasks referencing the futex. May free the futex.*/
+static void futex_release_ref(futex_t *futex)
+{
+	ASSERT(mutex_locked(&futex_ht_lock));
+	ASSERT(0 < futex->refcount);
+	
+	--futex->refcount;
+	
+	if (0 == futex->refcount) {
+		hash_table_remove(&futex_ht, &futex->paddr, 1);
+	}
+}
+
+/** Decrements the counter of tasks referencing the futex. May free the futex.*/
+static void futex_release_ref_locked(futex_t *futex)
+{
+	mutex_lock(&futex_ht_lock);
+	futex_release_ref(futex);
+	mutex_unlock(&futex_ht_lock);
+}
+
+/** Returns a futex for the virtual address @a uaddr (or creates one). */
+static futex_t *get_futex(uintptr_t uaddr)
+{
+	futex_t *futex = find_cached_futex(uaddr);
+	
+	if (futex) 
+		return futex;
+
+	uintptr_t paddr;
+
+	if (!find_futex_paddr(uaddr, &paddr)) 
+		return 0;
+
+	return get_and_cache_futex(paddr, uaddr);
+}
+
+
+/** Finds the physical address of the futex variable. */
+static bool find_futex_paddr(uintptr_t uaddr, uintptr_t *paddr)
+{
+	page_table_lock(AS, true);
+
+	bool found = false;
+	pte_t *t = page_mapping_find(AS, ALIGN_DOWN(uaddr, PAGE_SIZE), false);
+	
+	if (t && PTE_VALID(t) && PTE_PRESENT(t)) {
+		found = true;
+		*paddr = PTE_GET_FRAME(t) + (uaddr - ALIGN_DOWN(uaddr, PAGE_SIZE));
+	}
+	
+	page_table_unlock(AS, true);
+	
+	return found;
+}
+
+/** Returns the futex cached in this task with the virtual address uaddr. */
+static futex_t *find_cached_futex(uintptr_t uaddr)
+{
+	cht_read_lock();
+	
+	futex_t *futex;
+	cht_link_t *futex_ptr_link = cht_find_lazy(&TASK->futexes->ht, &uaddr);
+
+	if (futex_ptr_link) {
+		futex_ptr_t *futex_ptr 
+			= member_to_inst(futex_ptr_link, futex_ptr_t, cht_link);
+		
+		futex = futex_ptr->futex;
+	} else {
+		futex = NULL;
+	}
+	
+	cht_read_unlock();
+	
+	return futex;
+}
+
+
+/** 
+ * Returns a kernel futex for the physical address @a phys_addr and caches 
+ * it in this task under the virtual address @a uaddr (if not already cached).
+ */
+static futex_t *get_and_cache_futex(uintptr_t phys_addr, uintptr_t uaddr)
+{
+	futex_t *futex = malloc(sizeof(futex_t), 0);
+	
+	/* 
+	 * Find the futex object in the global futex table (or insert it 
+	 * if it is not present).
+	 */
+	mutex_lock(&futex_ht_lock);
+	
+	link_t *fut_link = hash_table_find(&futex_ht, &phys_addr);
+	
+	if (fut_link) {
+		free(futex);
+		futex = member_to_inst(fut_link, futex_t, ht_link);
+		futex_add_ref(futex);
+	} else {
+		futex_initialize(futex, phys_addr);
+		hash_table_insert(&futex_ht, &phys_addr, &futex->ht_link);
+	}
+	
+	mutex_unlock(&futex_ht_lock);
+	
+	/* 
+	 * Cache the link to the futex object for this task. 
+	 */
+	futex_ptr_t *fut_ptr = malloc(sizeof(futex_ptr_t), 0);
+	cht_link_t *dup_link;
+	
+	fut_ptr->futex = futex;
+	fut_ptr->uaddr = uaddr;
+	
+	cht_read_lock();
+	
+	/* Cache the mapping from the virtual address to the futex for this task. */
+	if (cht_insert_unique(&TASK->futexes->ht, &fut_ptr->cht_link, &dup_link)) {
+		mutex_lock(&TASK->futexes->list_lock);
+		list_append(&fut_ptr->all_link, &TASK->futexes->list);
+		mutex_unlock(&TASK->futexes->list_lock);
+	} else {
+		/* Another thread of this task beat us to it. Use that mapping instead.*/
+		free(fut_ptr);
+		futex_release_ref_locked(futex);
+		
+		futex_ptr_t *dup = member_to_inst(dup_link, futex_ptr_t, cht_link);
+		futex = dup->futex;		
+	}
+
+	cht_read_unlock();
+	
+	return futex;
 }
 
@@ -109,27 +393,13 @@
 sysarg_t sys_futex_sleep(uintptr_t uaddr)
 {
-	futex_t *futex;
-	uintptr_t paddr;
-	pte_t *t;
-	int rc;
-	
-	/*
-	 * Find physical address of futex counter.
-	 */
-	page_table_lock(AS, true);
-	t = page_mapping_find(AS, ALIGN_DOWN(uaddr, PAGE_SIZE), false);
-	if (!t || !PTE_VALID(t) || !PTE_PRESENT(t)) {
-		page_table_unlock(AS, true);
+	futex_t *futex = get_futex(uaddr);
+	
+	if (!futex) 
 		return (sysarg_t) ENOENT;
-	}
-	paddr = PTE_GET_FRAME(t) + (uaddr - ALIGN_DOWN(uaddr, PAGE_SIZE));
-	page_table_unlock(AS, true);
-	
-	futex = futex_find(paddr);
 
 #ifdef CONFIG_UDEBUG
 	udebug_stoppable_begin();
 #endif
-	rc = waitq_sleep_timeout(&futex->wq, 0, SYNCH_FLAGS_INTERRUPTIBLE); 
+	int rc = waitq_sleep_timeout(&futex->wq, 0, SYNCH_FLAGS_INTERRUPTIBLE); 
 #ifdef CONFIG_UDEBUG
 	udebug_stoppable_end();
@@ -146,84 +416,14 @@
 sysarg_t sys_futex_wakeup(uintptr_t uaddr)
 {
-	futex_t *futex;
-	uintptr_t paddr;
-	pte_t *t;
-	
-	/*
-	 * Find physical address of futex counter.
-	 */
-	page_table_lock(AS, true);
-	t = page_mapping_find(AS, ALIGN_DOWN(uaddr, PAGE_SIZE), false);
-	if (!t || !PTE_VALID(t) || !PTE_PRESENT(t)) {
-		page_table_unlock(AS, true);
+	futex_t *futex = get_futex(uaddr);
+	
+	if (futex) {
+		waitq_wakeup(&futex->wq, WAKEUP_FIRST);
+		return 0;
+	} else {
 		return (sysarg_t) ENOENT;
 	}
-	paddr = PTE_GET_FRAME(t) + (uaddr - ALIGN_DOWN(uaddr, PAGE_SIZE));
-	page_table_unlock(AS, true);
-	
-	futex = futex_find(paddr);
-		
-	waitq_wakeup(&futex->wq, WAKEUP_FIRST);
-	
-	return 0;
-}
-
-/** Find kernel address of the futex structure corresponding to paddr.
- *
- * If the structure does not exist already, a new one is created.
- *
- * @param paddr		Physical address of the userspace futex counter.
- *
- * @return		Address of the kernel futex structure.
- */
-futex_t *futex_find(uintptr_t paddr)
-{
-	link_t *item;
-	futex_t *futex;
-	btree_node_t *leaf;
-	
-	/*
-	 * Find the respective futex structure
-	 * or allocate new one if it does not exist already.
-	 */
-	mutex_lock(&futex_ht_lock);
-	item = hash_table_find(&futex_ht, &paddr);
-	if (item) {
-		futex = hash_table_get_instance(item, futex_t, ht_link);
-
-		/*
-		 * See if the current task knows this futex.
-		 */
-		mutex_lock(&TASK->futexes_lock);
-		if (!btree_search(&TASK->futexes, paddr, &leaf)) {
-			/*
-			 * The futex is new to the current task.
-			 * Upgrade its reference count and put it to the
-			 * current task's B+tree of known futexes.
-			 */
-			futex->refcount++;
-			btree_insert(&TASK->futexes, paddr, futex, leaf);
-		}
-		mutex_unlock(&TASK->futexes_lock);
-	} else {
-		futex = (futex_t *) malloc(sizeof(futex_t), 0);
-		futex_initialize(futex);
-		futex->paddr = paddr;
-		hash_table_insert(&futex_ht, &paddr, &futex->ht_link);
-			
-		/*
-		 * This is the first task referencing the futex.
-		 * It can be directly inserted into its
-		 * B+tree of known futexes.
-		 */
-		mutex_lock(&TASK->futexes_lock);
-		btree_insert(&TASK->futexes, paddr, futex, NULL);
-		mutex_unlock(&TASK->futexes_lock);
-		
-	}
-	mutex_unlock(&futex_ht_lock);
-	
-	return futex;
-}
+}
+
 
 /** Compute hash index into futex hash table.
@@ -268,28 +468,36 @@
 }
 
-/** Remove references from futexes known to the current task. */
-void futex_cleanup(void)
-{
-	mutex_lock(&futex_ht_lock);
-	mutex_lock(&TASK->futexes_lock);
-
-	list_foreach(TASK->futexes.leaf_list, cur) {
-		btree_node_t *node;
-		unsigned int i;
-		
-		node = list_get_instance(cur, btree_node_t, leaf_link);
-		for (i = 0; i < node->keys; i++) {
-			futex_t *ftx;
-			uintptr_t paddr = node->key[i];
-			
-			ftx = (futex_t *) node->value[i];
-			if (--ftx->refcount == 0)
-				hash_table_remove(&futex_ht, &paddr, 1);
-		}
-	}
-	
-	mutex_unlock(&TASK->futexes_lock);
-	mutex_unlock(&futex_ht_lock);
-}
+/*
+ * Operations of a task's CHT that caches mappings of futex user space 
+ * virtual addresses to kernel futex objects.
+ */
+
+static size_t task_fut_ht_hash(const cht_link_t *link)
+{
+	const futex_ptr_t *fut_ptr = member_to_inst(link, futex_ptr_t, cht_link);
+	return fut_ptr->uaddr;
+}
+
+static size_t task_fut_ht_key_hash(void *key)
+{
+	return *(uintptr_t*)key;
+}
+
+static bool task_fut_ht_equal(const cht_link_t *item1, const cht_link_t *item2)
+{
+	const futex_ptr_t *fut_ptr1 = member_to_inst(item1, futex_ptr_t, cht_link);
+	const futex_ptr_t *fut_ptr2 = member_to_inst(item2, futex_ptr_t, cht_link);
+	
+	return fut_ptr1->uaddr == fut_ptr2->uaddr;
+}
+
+static bool task_fut_ht_key_equal(void *key, const cht_link_t *item)
+{
+	const futex_ptr_t *fut_ptr = member_to_inst(item, futex_ptr_t, cht_link);
+	uintptr_t uaddr = *(uintptr_t*)key;
+	
+	return fut_ptr->uaddr == uaddr;
+}
+
 
 /** @}
Index: kernel/generic/src/synch/mutex.c
===================================================================
--- kernel/generic/src/synch/mutex.c	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/generic/src/synch/mutex.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -41,4 +41,6 @@
 #include <arch.h>
 #include <stacktrace.h>
+#include <cpu.h>
+#include <proc/thread.h>
 
 /** Initialize mutex.
Index: kernel/generic/src/synch/rcu.c
===================================================================
--- kernel/generic/src/synch/rcu.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
+++ kernel/generic/src/synch/rcu.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -0,0 +1,1873 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+ 
+ 
+/** @addtogroup sync
+ * @{
+ */
+
+/**
+ * @file
+ * @brief Preemptible read-copy update. Usable from interrupt handlers.
+ * 
+ * @par Podzimek-preempt-RCU (RCU_PREEMPT_PODZIMEK)
+ * 
+ * Podzimek-preempt-RCU is a preemptible variant of Podzimek's non-preemptible
+ * RCU algorithm [1, 2]. Grace period (GP) detection is centralized into a
+ * single detector thread. The detector requests that each cpu announces
+ * that it passed a quiescent state (QS), ie a state when the cpu is
+ * outside of an rcu reader section (CS). Cpus check for QSs during context
+ * switches and when entering and exiting rcu reader sections. Once all 
+ * cpus announce a QS and if there were no threads preempted in a CS, the 
+ * GP ends.
+ * 
+ * The detector increments the global GP counter, _rcu_cur_gp, in order 
+ * to start a new GP. Readers notice the new GP by comparing the changed 
+ * _rcu_cur_gp to a locally stored value last_seen_gp which denotes the
+ * last GP number for which the cpu noted an explicit QS (and issued
+ * a memory barrier). Readers check for the change in the outer-most
+ * (ie not nested) rcu_read_lock()/unlock() as these functions represent 
+ * a QS. The reader first executes a memory barrier (MB) in order to contain 
+ * memory references within a CS (and to make changes made by writers 
+ * visible in the CS following rcu_read_lock()). Next, the reader notes 
+ * that it reached a QS by updating the cpu local last_seen_gp to the
+ * global GP counter, _rcu_cur_gp. Cache coherency eventually makes
+ * the updated last_seen_gp visible to the detector cpu, much like it
+ * delivered the changed _rcu_cur_gp to all cpus.
+ * 
+ * The detector waits a while after starting a GP and then reads each 
+ * cpu's last_seen_gp to see if it reached a QS. If a cpu did not record 
+ * a QS (might be a long running thread without an RCU reader CS; or cache
+ * coherency has yet to make the most current last_seen_gp visible to
+ * the detector; or the cpu is still in a CS) the cpu is interrupted
+ * via an IPI. If the IPI handler finds the cpu still in a CS, it instructs
+ * the cpu to notify the detector that it had exited the CS via a semaphore
+ * (CPU->rcu.is_delaying_gp). 
+ * The detector then waits on the semaphore for any cpus to exit their
+ * CSs. Lastly, it waits for the last reader preempted in a CS to 
+ * exit its CS if there were any and signals the end of the GP to
+ * separate reclaimer threads wired to each cpu. Reclaimers then
+ * execute the callbacks queued on each of the cpus.
+ * 
+ * 
+ * @par A-RCU algorithm (RCU_PREEMPT_A)
+ * 
+ * A-RCU is based on the user space rcu algorithm in [3] utilizing signals
+ * (urcu) and Podzimek's rcu [1]. Like in Podzimek's rcu, callbacks are 
+ * executed by cpu-bound reclaimer threads. There is however no dedicated 
+ * detector thread and the reclaimers take on the responsibilities of the 
+ * detector when they need to start a new GP. A new GP is again announced 
+ * and acknowledged with _rcu_cur_gp and the cpu local last_seen_gp. Unlike
+ * Podzimek's rcu, cpus check explicitly for QS only during context switches. 
+ * Like in urcu, rcu_read_lock()/unlock() only maintain the nesting count
+ * and never issue any memory barriers. This makes rcu_read_lock()/unlock()
+ * simple and fast.
+ * 
+ * If a new callback is queued for a reclaimer and no GP is in progress,
+ * the reclaimer takes on the role of a detector. The detector increments 
+ * _rcu_cur_gp in order to start a new GP. It waits a while to give cpus 
+ * a chance to switch a context (a natural QS). Then, it examines each
+ * non-idle cpu that has yet to pass a QS via an IPI. The IPI handler
+ * sees the most current _rcu_cur_gp and last_seen_gp and notes a QS
+ * with a memory barrier and an update to last_seen_gp. If the handler
+ * finds the cpu in a CS it does nothing and let the detector poll/interrupt
+ * the cpu again after a short sleep.
+ * 
+ * @par Caveats
+ * 
+ * last_seen_gp and _rcu_cur_gp are always 64bit variables and they
+ * are read non-atomically on 32bit machines. Reading a clobbered
+ * value of last_seen_gp or _rcu_cur_gp or writing a clobbered value
+ * of _rcu_cur_gp to last_seen_gp will at worst force the detector
+ * to unnecessarily interrupt a cpu. Interrupting a cpu makes the 
+ * correct value of _rcu_cur_gp visible to the cpu and correctly
+ * resets last_seen_gp in both algorithms.
+ * 
+ * 
+ * 
+ * [1] Read-copy-update for opensolaris,
+ *     2010, Podzimek
+ *     https://andrej.podzimek.org/thesis.pdf
+ * 
+ * [2] (podzimek-rcu) implementation file "rcu.patch"
+ *     http://d3s.mff.cuni.cz/projects/operating_systems/rcu/rcu.patch
+ * 
+ * [3] User-level implementations of read-copy update,
+ *     2012, appendix
+ *     http://www.rdrop.com/users/paulmck/RCU/urcu-supp-accepted.2011.08.30a.pdf
+ * 
+ */
+ 
+#include <synch/rcu.h>
+#include <synch/condvar.h>
+#include <synch/semaphore.h>
+#include <synch/spinlock.h>
+#include <synch/mutex.h>
+#include <proc/thread.h>
+#include <cpu/cpu_mask.h>
+#include <cpu.h>
+#include <smp/smp_call.h>
+#include <compiler/barrier.h>
+#include <atomic.h>
+#include <arch.h>
+#include <macros.h>
+
+/* 
+ * Number of milliseconds to give to preexisting readers to finish 
+ * when non-expedited grace period detection is in progress.
+ */
+#define DETECT_SLEEP_MS    10
+/* 
+ * Max number of pending callbacks in the local cpu's queue before 
+ * aggressively expediting the current grace period
+ */
+#define EXPEDITE_THRESHOLD 2000
+/*
+ * Max number of callbacks to execute in one go with preemption
+ * enabled. If there are more callbacks to be executed they will
+ * be run with preemption disabled in order to prolong reclaimer's
+ * time slice and give it a chance to catch up with callback producers.
+ */
+#define CRITICAL_THRESHOLD 30000
+/* Half the number of values a uint32 can hold. */
+#define UINT32_MAX_HALF    2147483648U
+
+/** 
+ * The current grace period number. Increases monotonically. 
+ * Lock rcu.gp_lock or rcu.preempt_lock to get a current value.
+ */
+rcu_gp_t _rcu_cur_gp;
+
+/** Global RCU data. */
+typedef struct rcu_data {
+	/** Detector uses to signal reclaimers that a grace period ended. */
+	condvar_t gp_ended;
+	/** Reclaimers use to notify the detector to accelerate GP detection. */
+	condvar_t expedite_now;
+	/** 
+	 * Protects: req_gp_end_cnt, req_expedited_cnt, completed_gp, _rcu_cur_gp;
+	 * or: completed_gp, _rcu_cur_gp
+	 */
+	SPINLOCK_DECLARE(gp_lock);
+	/**
+	 * The number of the most recently completed grace period. At most 
+	 * one behind _rcu_cur_gp. If equal to _rcu_cur_gp, a grace period 
+	 * detection is not in progress and the detector is idle.
+	 */
+	rcu_gp_t completed_gp;
+	
+	/** Protects the following 3 fields. */
+	IRQ_SPINLOCK_DECLARE(preempt_lock);
+	/** Preexisting readers that have been preempted. */
+	list_t cur_preempted;
+	/** Readers that have been preempted and might delay the next grace period.*/
+	list_t next_preempted;
+	/** 
+	 * The detector is waiting for the last preempted reader 
+	 * in cur_preempted to announce that it exited its reader 
+	 * section by up()ing remaining_readers.
+	 */
+	bool preempt_blocking_det;
+	
+#ifdef RCU_PREEMPT_A
+	
+	/** 
+	 * The detector waits on this semaphore for any preempted readers 
+	 * delaying the grace period once all cpus pass a quiescent state.
+	 */
+	semaphore_t remaining_readers;
+
+#elif defined(RCU_PREEMPT_PODZIMEK)
+	
+	/** Reclaimers notify the detector when they request more grace periods.*/
+	condvar_t req_gp_changed;
+	/** Number of grace period ends the detector was requested to announce. */
+	size_t req_gp_end_cnt;
+	/** Number of consecutive grace periods to detect quickly and aggressively.*/
+	size_t req_expedited_cnt;
+	/** 
+	 * Number of cpus with readers that are delaying the current GP.
+	 * They will up() remaining_readers.
+	 */
+	atomic_t delaying_cpu_cnt;
+	/** 
+	 * The detector waits on this semaphore for any readers delaying the GP.
+	 * 
+	 * Each of the cpus with readers that are delaying the current GP 
+	 * must up() this sema once they reach a quiescent state. If there 
+	 * are any readers in cur_preempted (ie preempted preexisting) and 
+	 * they are already delaying GP detection, the last to unlock its
+	 * reader section must up() this sema once.
+	 */
+	semaphore_t remaining_readers;
+#endif
+	
+	/** Excludes simultaneous rcu_barrier() calls. */
+	mutex_t barrier_mtx;
+	/** Number of cpus that we are waiting for to complete rcu_barrier(). */
+	atomic_t barrier_wait_cnt;
+	/** rcu_barrier() waits for the completion of barrier callbacks on this wq.*/
+	waitq_t barrier_wq;
+	
+	/** Interruptible attached detector thread pointer. */
+	thread_t *detector_thr;
+	
+	/* Some statistics. */
+	size_t stat_expedited_cnt;
+	size_t stat_delayed_cnt;
+	size_t stat_preempt_blocking_cnt;
+	/* Does not contain self/local calls. */
+	size_t stat_smp_call_cnt;
+} rcu_data_t;
+
+
+static rcu_data_t rcu;
+
+static void start_reclaimers(void);
+static void synch_complete(rcu_item_t *rcu_item);
+static inline void rcu_call_impl(bool expedite, rcu_item_t *rcu_item, 
+	rcu_func_t func);
+static void add_barrier_cb(void *arg);
+static void barrier_complete(rcu_item_t *barrier_item);
+static bool arriving_cbs_empty(void);
+static bool next_cbs_empty(void);
+static bool cur_cbs_empty(void);
+static bool all_cbs_empty(void);
+static void reclaimer(void *arg);
+static bool wait_for_pending_cbs(void);
+static bool advance_cbs(void);
+static void exec_completed_cbs(rcu_gp_t last_completed_gp);
+static void exec_cbs(rcu_item_t **phead);
+static bool wait_for_cur_cbs_gp_end(bool expedite, rcu_gp_t *last_completed_gp);
+static void upd_missed_gp_in_wait(rcu_gp_t completed_gp);
+
+#ifdef RCU_PREEMPT_PODZIMEK
+static void start_detector(void);
+static void read_unlock_impl(size_t *pnesting_cnt);
+static void req_detection(size_t req_cnt);
+static bool cv_wait_for_gp(rcu_gp_t wait_on_gp);
+static void detector(void *);
+static bool wait_for_detect_req(void);
+static void end_cur_gp(void);
+static bool wait_for_readers(void);
+static bool gp_sleep(void);
+static void interrupt_delaying_cpus(cpu_mask_t *cpu_mask);
+static bool wait_for_delaying_cpus(void);
+#elif defined(RCU_PREEMPT_A)
+static bool wait_for_readers(bool expedite);
+static bool gp_sleep(bool *expedite);
+#endif
+
+static void start_new_gp(void);
+static void rm_quiescent_cpus(cpu_mask_t *cpu_mask);
+static void sample_cpus(cpu_mask_t *reader_cpus, void *arg);
+static void sample_local_cpu(void *);
+static bool wait_for_preempt_reader(void);
+static void note_preempted_reader(void);
+static void rm_preempted_reader(void);
+static void upd_max_cbs_in_slice(size_t arriving_cbs_cnt);
+
+
+
+/** Initializes global RCU structures. */
+void rcu_init(void)
+{
+	condvar_initialize(&rcu.gp_ended);
+	condvar_initialize(&rcu.expedite_now);
+
+	spinlock_initialize(&rcu.gp_lock, "rcu.gp_lock");
+	_rcu_cur_gp = 0;
+	rcu.completed_gp = 0;
+	
+	irq_spinlock_initialize(&rcu.preempt_lock, "rcu.preempt_lock");
+	list_initialize(&rcu.cur_preempted);
+	list_initialize(&rcu.next_preempted);
+	rcu.preempt_blocking_det = false;
+	
+	mutex_initialize(&rcu.barrier_mtx, MUTEX_PASSIVE);
+	atomic_set(&rcu.barrier_wait_cnt, 0);
+	waitq_initialize(&rcu.barrier_wq);
+
+	semaphore_initialize(&rcu.remaining_readers, 0);
+	
+#ifdef RCU_PREEMPT_PODZIMEK
+	condvar_initialize(&rcu.req_gp_changed);
+	
+	rcu.req_gp_end_cnt = 0;
+	rcu.req_expedited_cnt = 0;
+	atomic_set(&rcu.delaying_cpu_cnt, 0);
+#endif
+	
+	rcu.detector_thr = NULL;
+	
+	rcu.stat_expedited_cnt = 0;
+	rcu.stat_delayed_cnt = 0;
+	rcu.stat_preempt_blocking_cnt = 0;
+	rcu.stat_smp_call_cnt = 0;
+}
+
+/** Initializes per-CPU RCU data. If on the boot cpu inits global data too.*/
+void rcu_cpu_init(void)
+{
+	if (config.cpu_active == 1) {
+		rcu_init();
+	}
+
+	CPU->rcu.last_seen_gp = 0;
+
+#ifdef RCU_PREEMPT_PODZIMEK
+	CPU->rcu.nesting_cnt = 0;
+	CPU->rcu.is_delaying_gp = false;
+	CPU->rcu.signal_unlock = false;
+#endif
+	
+	CPU->rcu.cur_cbs = NULL;
+	CPU->rcu.cur_cbs_cnt = 0;
+	CPU->rcu.next_cbs = NULL;
+	CPU->rcu.next_cbs_cnt = 0;
+	CPU->rcu.arriving_cbs = NULL;
+	CPU->rcu.parriving_cbs_tail = &CPU->rcu.arriving_cbs;
+	CPU->rcu.arriving_cbs_cnt = 0;
+
+	CPU->rcu.cur_cbs_gp = 0;
+	CPU->rcu.next_cbs_gp = 0;
+	
+	semaphore_initialize(&CPU->rcu.arrived_flag, 0);
+
+	/* BSP creates reclaimer threads before AP's rcu_cpu_init() runs. */
+	if (config.cpu_active == 1)
+		CPU->rcu.reclaimer_thr = NULL;
+	
+	CPU->rcu.stat_max_cbs = 0;
+	CPU->rcu.stat_avg_cbs = 0;
+	CPU->rcu.stat_missed_gps = 0;
+	CPU->rcu.stat_missed_gp_in_wait = 0;
+	CPU->rcu.stat_max_slice_cbs = 0;
+	CPU->rcu.last_arriving_cnt = 0;
+}
+
+/** Completes RCU init. Creates and runs the detector and reclaimer threads.*/
+void rcu_kinit_init(void)
+{
+#ifdef RCU_PREEMPT_PODZIMEK
+	start_detector();
+#endif
+	
+	start_reclaimers();
+}
+
+/** Initializes any per-thread RCU structures. */
+void rcu_thread_init(thread_t *thread)
+{
+	thread->rcu.nesting_cnt = 0;
+
+#ifdef RCU_PREEMPT_PODZIMEK
+	thread->rcu.was_preempted = false;
+#endif
+	
+	link_initialize(&thread->rcu.preempt_link);
+}
+
+
+/** Cleans up global RCU resources and stops dispatching callbacks. 
+ * 
+ * Call when shutting down the kernel. Outstanding callbacks will
+ * not be processed. Instead they will linger forever.
+ */
+void rcu_stop(void)
+{
+	/* Stop and wait for reclaimers. */
+	for (unsigned int cpu_id = 0; cpu_id < config.cpu_active; ++cpu_id) {
+		ASSERT(cpus[cpu_id].rcu.reclaimer_thr != NULL);
+	
+		if (cpus[cpu_id].rcu.reclaimer_thr) {
+			thread_interrupt(cpus[cpu_id].rcu.reclaimer_thr);
+			thread_join(cpus[cpu_id].rcu.reclaimer_thr);
+			thread_detach(cpus[cpu_id].rcu.reclaimer_thr);
+			cpus[cpu_id].rcu.reclaimer_thr = NULL;
+		}
+	}
+
+#ifdef RCU_PREEMPT_PODZIMEK
+	/* Stop the detector and wait. */
+	if (rcu.detector_thr) {
+		thread_interrupt(rcu.detector_thr);
+		thread_join(rcu.detector_thr);
+		thread_detach(rcu.detector_thr);
+		rcu.detector_thr = NULL;
+	}
+#endif
+}
+
+/** Returns the number of elapsed grace periods since boot. */
+uint64_t rcu_completed_gps(void)
+{
+	spinlock_lock(&rcu.gp_lock);
+	uint64_t completed = rcu.completed_gp;
+	spinlock_unlock(&rcu.gp_lock);
+	
+	return completed;
+}
+
+/** Creates and runs cpu-bound reclaimer threads. */
+static void start_reclaimers(void)
+{
+	for (unsigned int cpu_id = 0; cpu_id < config.cpu_count; ++cpu_id) {
+		char name[THREAD_NAME_BUFLEN] = {0};
+		
+		snprintf(name, THREAD_NAME_BUFLEN - 1, "rcu-rec/%u", cpu_id);
+		
+		cpus[cpu_id].rcu.reclaimer_thr = 
+			thread_create(reclaimer, NULL, TASK, THREAD_FLAG_NONE, name);
+
+		if (!cpus[cpu_id].rcu.reclaimer_thr) 
+			panic("Failed to create RCU reclaimer thread on cpu%u.", cpu_id);
+
+		thread_wire(cpus[cpu_id].rcu.reclaimer_thr, &cpus[cpu_id]);
+		thread_ready(cpus[cpu_id].rcu.reclaimer_thr);
+	}
+}
+
+#ifdef RCU_PREEMPT_PODZIMEK
+
+/** Starts the detector thread. */
+static void start_detector(void)
+{
+	rcu.detector_thr = 
+		thread_create(detector, NULL, TASK, THREAD_FLAG_NONE, "rcu-det");
+	
+	if (!rcu.detector_thr) 
+		panic("Failed to create RCU detector thread.");
+	
+	thread_ready(rcu.detector_thr);
+}
+
+/** Returns true if in an rcu reader section. */
+bool rcu_read_locked(void)
+{
+	preemption_disable();
+	bool locked = 0 < CPU->rcu.nesting_cnt;
+	preemption_enable();
+	
+	return locked;
+}
+
+/** Unlocks the local reader section using the given nesting count. 
+ * 
+ * Preemption or interrupts must be disabled. 
+ * 
+ * @param pnesting_cnt Either &CPU->rcu.tmp_nesting_cnt or 
+ *           THREAD->rcu.nesting_cnt.
+ */
+static void read_unlock_impl(size_t *pnesting_cnt)
+{
+	ASSERT(PREEMPTION_DISABLED || interrupts_disabled());
+	
+	if (0 == --(*pnesting_cnt)) {
+		_rcu_record_qs();
+		
+		/* 
+		 * The thread was preempted while in a critical section or 
+		 * the detector is eagerly waiting for this cpu's reader 
+		 * to finish. 
+		 * 
+		 * Note that THREAD may be NULL in scheduler() and not just during boot.
+		 */
+		if ((THREAD && THREAD->rcu.was_preempted) || CPU->rcu.is_delaying_gp) {
+			/* Rechecks with disabled interrupts. */
+			_rcu_signal_read_unlock();
+		}
+	}
+}
+
+/** If necessary, signals the detector that we exited a reader section. */
+void _rcu_signal_read_unlock(void)
+{
+	ASSERT(PREEMPTION_DISABLED || interrupts_disabled());
+	
+	/*
+	 * If an interrupt occurs here (even a NMI) it may beat us to
+	 * resetting .is_delaying_gp or .was_preempted and up the semaphore
+	 * for us.
+	 */
+	
+	/* 
+	 * If the detector is eagerly waiting for this cpu's reader to unlock,
+	 * notify it that the reader did so.
+	 */
+	if (local_atomic_exchange(&CPU->rcu.is_delaying_gp, false)) {
+		semaphore_up(&rcu.remaining_readers);
+	}
+	
+	/*
+	 * This reader was preempted while in a reader section.
+	 * We might be holding up the current GP. Notify the
+	 * detector if so.
+	 */
+	if (THREAD && local_atomic_exchange(&THREAD->rcu.was_preempted, false)) {
+		ASSERT(link_used(&THREAD->rcu.preempt_link));
+
+		rm_preempted_reader();
+	}
+	
+	/* If there was something to signal to the detector we have done so. */
+	CPU->rcu.signal_unlock = false;
+}
+
+#endif /* RCU_PREEMPT_PODZIMEK */
+
+typedef struct synch_item {
+	waitq_t wq;
+	rcu_item_t rcu_item;
+} synch_item_t;
+
+/** Blocks until all preexisting readers exit their critical sections. */
+void rcu_synchronize(void)
+{
+	_rcu_synchronize(false);
+}
+
+/** Blocks until all preexisting readers exit their critical sections. */
+void rcu_synchronize_expedite(void)
+{
+	_rcu_synchronize(true);
+}
+
+/** Blocks until all preexisting readers exit their critical sections. */
+void _rcu_synchronize(bool expedite)
+{
+	/* Calling from a reader section will deadlock. */
+	ASSERT(!rcu_read_locked());
+	
+	synch_item_t completion; 
+
+	waitq_initialize(&completion.wq);
+	_rcu_call(expedite, &completion.rcu_item, synch_complete);
+	waitq_sleep(&completion.wq);
+}
+
+/** rcu_synchronize's callback. */
+static void synch_complete(rcu_item_t *rcu_item)
+{
+	synch_item_t *completion = member_to_inst(rcu_item, synch_item_t, rcu_item);
+	ASSERT(completion);
+	waitq_wakeup(&completion->wq, WAKEUP_FIRST);
+}
+
+/** Waits for all outstanding rcu calls to complete. */
+void rcu_barrier(void)
+{
+	/* 
+	 * Serialize rcu_barrier() calls so we don't overwrite cpu.barrier_item
+	 * currently in use by rcu_barrier().
+	 */
+	mutex_lock(&rcu.barrier_mtx);
+	
+	/* 
+	 * Ensure we queue a barrier callback on all cpus before the already
+	 * enqueued barrier callbacks start signaling completion.
+	 */
+	atomic_set(&rcu.barrier_wait_cnt, 1);
+
+	DEFINE_CPU_MASK(cpu_mask);
+	cpu_mask_active(cpu_mask);
+	
+	cpu_mask_for_each(*cpu_mask, cpu_id) {
+		smp_call(cpu_id, add_barrier_cb, NULL);
+	}
+	
+	if (0 < atomic_predec(&rcu.barrier_wait_cnt)) {
+		waitq_sleep(&rcu.barrier_wq);
+	}
+	
+	mutex_unlock(&rcu.barrier_mtx);
+}
+
+/** Issues a rcu_barrier() callback on the local cpu. 
+ * 
+ * Executed with interrupts disabled.  
+ */
+static void add_barrier_cb(void *arg)
+{
+	ASSERT(interrupts_disabled() || PREEMPTION_DISABLED);
+	atomic_inc(&rcu.barrier_wait_cnt);
+	rcu_call(&CPU->rcu.barrier_item, barrier_complete);
+}
+
+/** Local cpu's rcu_barrier() completion callback. */
+static void barrier_complete(rcu_item_t *barrier_item)
+{
+	/* Is this the last barrier callback completed? */
+	if (0 == atomic_predec(&rcu.barrier_wait_cnt)) {
+		/* Notify rcu_barrier() that we're done. */
+		waitq_wakeup(&rcu.barrier_wq, WAKEUP_FIRST);
+	}
+}
+
+/** Adds a callback to invoke after all preexisting readers finish. 
+ * 
+ * May be called from within interrupt handlers or RCU reader sections.
+ * 
+ * @param rcu_item Used by RCU to track the call. Must remain
+ *         until the user callback function is entered.
+ * @param func User callback function that will be invoked once a full
+ *         grace period elapsed, ie at a time when all preexisting
+ *         readers have finished. The callback should be short and must
+ *         not block. If you must sleep, enqueue your work in the system
+ *         work queue from the callback (ie workq_global_enqueue()).
+ */
+void rcu_call(rcu_item_t *rcu_item, rcu_func_t func)
+{
+	rcu_call_impl(false, rcu_item, func);
+}
+
+/** rcu_call() implementation. See rcu_call() for comments. */
+void _rcu_call(bool expedite, rcu_item_t *rcu_item, rcu_func_t func)
+{
+	rcu_call_impl(expedite, rcu_item, func);
+}
+
+/** rcu_call() inline-able implementation. See rcu_call() for comments. */
+static inline void rcu_call_impl(bool expedite, rcu_item_t *rcu_item, 
+	rcu_func_t func)
+{
+	ASSERT(rcu_item);
+	
+	rcu_item->func = func;
+	rcu_item->next = NULL;
+	
+	preemption_disable();
+
+	rcu_cpu_data_t *r = &CPU->rcu;
+
+	rcu_item_t **prev_tail 
+		= local_atomic_exchange(&r->parriving_cbs_tail, &rcu_item->next);
+	*prev_tail = rcu_item;
+	
+	/* Approximate the number of callbacks present. */
+	++r->arriving_cbs_cnt;
+	
+	if (expedite) {
+		r->expedite_arriving = true;
+	}
+	
+	bool first_cb = (prev_tail == &CPU->rcu.arriving_cbs);
+	
+	/* Added first callback - notify the reclaimer. */
+	if (first_cb && !semaphore_count_get(&r->arrived_flag)) {
+		semaphore_up(&r->arrived_flag);
+	}
+	
+	preemption_enable();
+}
+
+static bool cur_cbs_empty(void)
+{
+	ASSERT(THREAD && THREAD->wired);
+	return NULL == CPU->rcu.cur_cbs;
+}
+
+static bool next_cbs_empty(void)
+{
+	ASSERT(THREAD && THREAD->wired);
+	return NULL == CPU->rcu.next_cbs;
+}
+
+/** Disable interrupts to get an up-to-date result. */
+static bool arriving_cbs_empty(void)
+{
+	ASSERT(THREAD && THREAD->wired);
+	/* 
+	 * Accessing with interrupts enabled may at worst lead to 
+	 * a false negative if we race with a local interrupt handler.
+	 */
+	return NULL == CPU->rcu.arriving_cbs;
+}
+
+static bool all_cbs_empty(void)
+{
+	return cur_cbs_empty() && next_cbs_empty() && arriving_cbs_empty();
+}
+
+
+/** Reclaimer thread dispatches locally queued callbacks once a GP ends. */
+static void reclaimer(void *arg)
+{
+	ASSERT(THREAD && THREAD->wired);
+	ASSERT(THREAD == CPU->rcu.reclaimer_thr);
+
+	rcu_gp_t last_compl_gp = 0;
+	bool ok = true;
+	
+	while (ok && wait_for_pending_cbs()) {
+		ASSERT(CPU->rcu.reclaimer_thr == THREAD);
+		
+		exec_completed_cbs(last_compl_gp);
+
+		bool expedite = advance_cbs();
+		
+		ok = wait_for_cur_cbs_gp_end(expedite, &last_compl_gp);
+	}
+}
+
+/** Waits until there are callbacks waiting to be dispatched. */
+static bool wait_for_pending_cbs(void)
+{
+	if (!all_cbs_empty()) 
+		return true;
+
+	bool ok = true;
+	
+	while (arriving_cbs_empty() && ok) {
+		ok = semaphore_down_interruptable(&CPU->rcu.arrived_flag);
+	}
+	
+	return ok;
+}
+
+static void upd_stat_missed_gp(rcu_gp_t compl)
+{
+	if (CPU->rcu.cur_cbs_gp < compl) {
+		CPU->rcu.stat_missed_gps += (size_t)(compl - CPU->rcu.cur_cbs_gp);
+	}
+}
+
+/** Executes all callbacks for the given completed grace period. */
+static void exec_completed_cbs(rcu_gp_t last_completed_gp)
+{
+	upd_stat_missed_gp(last_completed_gp);
+	
+	/* Both next_cbs and cur_cbs GP elapsed. */
+	if (CPU->rcu.next_cbs_gp <= last_completed_gp) {
+		ASSERT(CPU->rcu.cur_cbs_gp <= CPU->rcu.next_cbs_gp);
+		
+		size_t exec_cnt = CPU->rcu.cur_cbs_cnt + CPU->rcu.next_cbs_cnt;
+		
+		if (exec_cnt < CRITICAL_THRESHOLD) {
+			exec_cbs(&CPU->rcu.cur_cbs);
+			exec_cbs(&CPU->rcu.next_cbs);	
+		} else {
+			/* 
+			 * Getting overwhelmed with too many callbacks to run. 
+			 * Disable preemption in order to prolong our time slice 
+			 * and catch up with updaters posting new callbacks.
+			 */
+			preemption_disable();
+			exec_cbs(&CPU->rcu.cur_cbs);
+			exec_cbs(&CPU->rcu.next_cbs);	
+			preemption_enable();
+		}
+		
+		CPU->rcu.cur_cbs_cnt = 0;
+		CPU->rcu.next_cbs_cnt = 0;
+	} else if (CPU->rcu.cur_cbs_gp <= last_completed_gp) {
+
+		if (CPU->rcu.cur_cbs_cnt < CRITICAL_THRESHOLD) {
+			exec_cbs(&CPU->rcu.cur_cbs);
+		} else {
+			/* 
+			 * Getting overwhelmed with too many callbacks to run. 
+			 * Disable preemption in order to prolong our time slice 
+			 * and catch up with updaters posting new callbacks.
+			 */
+			preemption_disable();
+			exec_cbs(&CPU->rcu.cur_cbs);
+			preemption_enable();
+		}
+
+		CPU->rcu.cur_cbs_cnt = 0;
+	}
+}
+
+/** Executes callbacks in the single-linked list. The list is left empty. */
+static void exec_cbs(rcu_item_t **phead)
+{
+	rcu_item_t *rcu_item = *phead;
+
+	while (rcu_item) {
+		/* func() may free rcu_item. Get a local copy. */
+		rcu_item_t *next = rcu_item->next;
+		rcu_func_t func = rcu_item->func;
+		
+		func(rcu_item);
+		
+		rcu_item = next;
+	}
+	
+	*phead = NULL;
+}
+
+static void upd_stat_cb_cnts(size_t arriving_cnt)
+{
+	CPU->rcu.stat_max_cbs = max(arriving_cnt, CPU->rcu.stat_max_cbs);
+	if (0 < arriving_cnt) {
+		CPU->rcu.stat_avg_cbs = 
+			(99 * CPU->rcu.stat_avg_cbs + 1 * arriving_cnt) / 100;
+	}
+}
+
+/** Prepares another batch of callbacks to dispatch at the next grace period.
+ * 
+ * @return True if the next batch of callbacks must be expedited quickly.
+ */
+static bool advance_cbs(void)
+{
+	/* Move next_cbs to cur_cbs. */
+	CPU->rcu.cur_cbs = CPU->rcu.next_cbs;
+	CPU->rcu.cur_cbs_cnt = CPU->rcu.next_cbs_cnt;
+	CPU->rcu.cur_cbs_gp = CPU->rcu.next_cbs_gp;
+	
+	/* Move arriving_cbs to next_cbs. */
+	
+	CPU->rcu.next_cbs_cnt = CPU->rcu.arriving_cbs_cnt;
+	CPU->rcu.arriving_cbs_cnt = 0;
+	
+	/* 
+	 * Too many callbacks queued. Better speed up the detection
+	 * or risk exhausting all system memory.
+	 */
+	bool expedite = (EXPEDITE_THRESHOLD < CPU->rcu.next_cbs_cnt)
+		|| CPU->rcu.expedite_arriving;	
+	CPU->rcu.expedite_arriving = false;
+
+	/* Start moving the arriving_cbs list to next_cbs. */
+	CPU->rcu.next_cbs = CPU->rcu.arriving_cbs;
+	
+	/* 
+	 * At least one callback arrived. The tail therefore does not point
+	 * to the head of arriving_cbs and we can safely reset it to NULL.
+	 */
+	if (CPU->rcu.next_cbs) {
+		ASSERT(CPU->rcu.parriving_cbs_tail != &CPU->rcu.arriving_cbs);
+		
+		CPU->rcu.arriving_cbs = NULL;
+		/* Reset arriving_cbs before updating the tail pointer. */
+		compiler_barrier();
+		/* Updating the tail pointer completes the move of arriving_cbs. */
+		ACCESS_ONCE(CPU->rcu.parriving_cbs_tail) = &CPU->rcu.arriving_cbs;
+	} else {
+		/* 
+		 * arriving_cbs was null and parriving_cbs_tail pointed to it 
+		 * so leave it that way. Note that interrupt handlers may have
+		 * added a callback in the meantime so it is not safe to reset
+		 * arriving_cbs or parriving_cbs.
+		 */
+	}
+
+	/* Update statistics of arrived callbacks. */
+	upd_stat_cb_cnts(CPU->rcu.next_cbs_cnt);
+	
+	/* 
+	 * Make changes prior to queuing next_cbs visible to readers. 
+	 * See comment in wait_for_readers().
+	 */
+	memory_barrier(); /* MB A, B */
+
+	/* At the end of next_cbs_gp, exec next_cbs. Determine what GP that is. */
+	
+	if (!next_cbs_empty()) {
+		spinlock_lock(&rcu.gp_lock);
+	
+		/* Exec next_cbs at the end of the next GP. */
+		CPU->rcu.next_cbs_gp = _rcu_cur_gp + 1;
+		
+		/* 
+		 * There are no callbacks to invoke before next_cbs. Instruct
+		 * wait_for_cur_cbs_gp() to notify us of the nearest GP end.
+		 * That could be sooner than next_cbs_gp (if the current GP 
+		 * had not yet completed), so we'll create a shorter batch
+		 * of callbacks next time around.
+		 */
+		if (cur_cbs_empty()) {
+			CPU->rcu.cur_cbs_gp = rcu.completed_gp + 1;
+		} 
+		
+		spinlock_unlock(&rcu.gp_lock);
+	} else {
+		CPU->rcu.next_cbs_gp = CPU->rcu.cur_cbs_gp;
+	}
+	
+	ASSERT(CPU->rcu.cur_cbs_gp <= CPU->rcu.next_cbs_gp);
+	
+	return expedite;	
+}
+
+
+#ifdef RCU_PREEMPT_A
+
+/** Waits for the grace period associated with callbacks cur_cbs to elapse. 
+ * 
+ * @param expedite Instructs the detector to aggressively speed up grace 
+ *            period detection without any delay.
+ * @param completed_gp Returns the most recent completed grace period 
+ *            number.
+ * @return false if the thread was interrupted and should stop.
+ */
+static bool wait_for_cur_cbs_gp_end(bool expedite, rcu_gp_t *completed_gp)
+{
+	spinlock_lock(&rcu.gp_lock);
+
+	ASSERT(CPU->rcu.cur_cbs_gp <= CPU->rcu.next_cbs_gp);
+	ASSERT(CPU->rcu.cur_cbs_gp <= _rcu_cur_gp + 1);
+	
+	while (rcu.completed_gp < CPU->rcu.cur_cbs_gp) {
+		/* GP has not yet started - start a new one. */
+		if (rcu.completed_gp == _rcu_cur_gp) {
+			start_new_gp();
+			spinlock_unlock(&rcu.gp_lock);
+
+			if (!wait_for_readers(expedite))
+				return false;
+
+			spinlock_lock(&rcu.gp_lock);
+			/* Notify any reclaimers this GP had ended. */
+			rcu.completed_gp = _rcu_cur_gp;
+			condvar_broadcast(&rcu.gp_ended);
+		} else {
+			/* GP detection is in progress.*/ 
+			
+			if (expedite) 
+				condvar_signal(&rcu.expedite_now);
+			
+			/* Wait for the GP to complete. */
+			int ret = _condvar_wait_timeout_spinlock(&rcu.gp_ended, &rcu.gp_lock, 
+				SYNCH_NO_TIMEOUT, SYNCH_FLAGS_INTERRUPTIBLE);
+			
+			if (ret == ESYNCH_INTERRUPTED) {
+				spinlock_unlock(&rcu.gp_lock);
+				return false;			
+			}
+		}
+	}
+	
+	upd_missed_gp_in_wait(rcu.completed_gp);
+	
+	*completed_gp = rcu.completed_gp;
+	spinlock_unlock(&rcu.gp_lock);
+	
+	return true;
+}
+
+/** Waits for all preexisting readers of the current GP to exit their sections.
+ *
+ * @param expedite If true, polls cpus without sleeping between samples.
+ * @return false if the thread was interrupted while waiting.
+ */
+static bool wait_for_readers(bool expedite)
+{
+	DEFINE_CPU_MASK(reader_cpus);
+	
+	cpu_mask_active(reader_cpus);
+	rm_quiescent_cpus(reader_cpus);
+	
+	while (!cpu_mask_is_none(reader_cpus)) {
+		/* Give cpus a chance to context switch (a QS) and batch callbacks. */
+		if(!gp_sleep(&expedite)) 
+			return false;
+		
+		rm_quiescent_cpus(reader_cpus);
+		sample_cpus(reader_cpus, reader_cpus);
+	}
+	
+	/* Update statistic. */
+	if (expedite) {
+		++rcu.stat_expedited_cnt;
+	}
+	
+	/* 
+	 * All cpus have passed through a QS and see the most recent _rcu_cur_gp.
+	 * As a result newly preempted readers will associate with next_preempted
+	 * and the number of old readers in cur_preempted will monotonically
+	 * decrease. Wait for those old/preexisting readers.
+	 */
+	return wait_for_preempt_reader();
+}
+
+/** Sleeps a while unless the grace period is expedited.
+ *
+ * @param expedite In/out; set to true if rcu.expedite_now was signaled
+ *            while sleeping, so subsequent iterations skip the sleep.
+ * @return false if the sleep was interrupted and the caller should stop.
+ */
+static bool gp_sleep(bool *expedite)
+{
+	if (*expedite) {
+		/* Yield instead of sleeping so cpus pass a QS quickly. */
+		scheduler();
+		return true;
+	} else {
+		spinlock_lock(&rcu.gp_lock);
+
+		int ret = 0;
+		ret = _condvar_wait_timeout_spinlock(&rcu.expedite_now, &rcu.gp_lock,
+			DETECT_SLEEP_MS * 1000, SYNCH_FLAGS_INTERRUPTIBLE);
+
+		/* rcu.expedite_now was signaled. */
+		if (ret == ESYNCH_OK_BLOCKED) {
+			*expedite = true;
+		}
+
+		spinlock_unlock(&rcu.gp_lock);
+
+		return (ret != ESYNCH_INTERRUPTED);
+	}
+}
+
+/** Invoked on each sampled cpu via smp_call(); checks for a quiescent state.
+ *
+ * @param arg The cpu_mask_t of cpus that have yet to pass a QS; this cpu
+ *            removes itself from the mask if it is not inside a reader section.
+ */
+static void sample_local_cpu(void *arg)
+{
+	ASSERT(interrupts_disabled());
+	cpu_mask_t *reader_cpus = (cpu_mask_t *)arg;
+	
+	bool locked = RCU_CNT_INC <= THE->rcu_nesting;
+	/* smp_call machinery makes the most current _rcu_cur_gp visible. */
+	bool passed_qs = (CPU->rcu.last_seen_gp == _rcu_cur_gp);
+		
+	if (locked && !passed_qs) {
+		/* 
+		 * This cpu has not yet passed a quiescent state during this grace
+		 * period and it is currently in a reader section. We'll have to
+		 * try to sample this cpu again later.
+		 */
+	} else {
+		/* Either not in a reader section or already passed a QS. */
+		cpu_mask_reset(reader_cpus, CPU->id);
+		/* Contain new reader sections and make prior changes visible to them.*/
+		memory_barrier();
+		CPU->rcu.last_seen_gp = _rcu_cur_gp;
+	}
+}
+
+/** Called by the scheduler() when switching away from the current thread. */
+void rcu_after_thread_ran(void)
+{
+	ASSERT(interrupts_disabled());
+
+	/* 
+	 * In order not to worry about NMI seeing rcu_nesting change work 
+	 * with a local copy.
+	 */
+	size_t nesting_cnt = local_atomic_exchange(&THE->rcu_nesting, 0);
+	
+	/* 
+	 * Ensures NMIs see .rcu_nesting without the WAS_PREEMPTED mark and
+	 * do not accidentally call rm_preempted_reader() from unlock().
+	 */
+	compiler_barrier();
+	
+	/* Preempted a reader critical section for the first time. */
+	if (RCU_CNT_INC <= nesting_cnt && !(nesting_cnt & RCU_WAS_PREEMPTED)) {
+		nesting_cnt |= RCU_WAS_PREEMPTED;
+		note_preempted_reader();
+	}
+	
+	/* Save the thread's nesting count when it is not running. */
+	THREAD->rcu.nesting_cnt = nesting_cnt;
+
+	if (CPU->rcu.last_seen_gp != _rcu_cur_gp) {
+		/* 
+		 * Contain any memory accesses of old readers before announcing a QS. 
+		 * Also make changes from the previous GP visible to this cpu.
+		 * Moreover it separates writing to last_seen_gp from 
+		 * note_preempted_reader().
+		 */
+		memory_barrier();
+		/* 
+		 * The preempted reader has been noted globally. There are therefore
+		 * no readers running on this cpu so this is a quiescent state.
+		 * 
+		 * Reading the multiword _rcu_cur_gp non-atomically is benign. 
+		 * At worst, the read value will be different from the actual value.
+		 * As a result, both the detector and this cpu will believe
+		 * this cpu has not yet passed a QS although it really did.
+		 * 
+		 * Reloading _rcu_cur_gp is benign, because it cannot change
+		 * until this cpu acknowledges it passed a QS by writing to
+		 * last_seen_gp. Since interrupts are disabled, only this
+		 * code may do so (IPIs won't get through).
+		 */
+		CPU->rcu.last_seen_gp = _rcu_cur_gp;
+	}
+
+	/* 
+	 * Forcefully associate the reclaimer with the highest priority
+	 * even if preempted due to its time slice running out.
+	 */
+	if (THREAD == CPU->rcu.reclaimer_thr) {
+		THREAD->priority = -1;
+	} 
+	
+	upd_max_cbs_in_slice(CPU->rcu.arriving_cbs_cnt);
+}
+
+/** Called by the scheduler() when switching to a newly scheduled thread.
+ *
+ * Restores the thread's reader nesting count into the cpu-local THE area.
+ */
+void rcu_before_thread_runs(void)
+{
+	ASSERT(!rcu_read_locked());
+	
+	/* Load the thread's saved nesting count from before it was preempted. */
+	THE->rcu_nesting = THREAD->rcu.nesting_cnt;
+}
+
+/** Called from scheduler() when exiting the current thread. 
+ * 
+ * Preemption or interrupts are disabled and the scheduler() already
+ * switched away from the current thread, calling rcu_after_thread_ran().
+ */
+void rcu_thread_exiting(void)
+{
+	/* rcu_after_thread_ran() already moved the count to THREAD->rcu. */
+	ASSERT(THE->rcu_nesting == 0);
+	
+	/* 
+	 * The thread forgot to exit its reader critical section. 
+	 * It is a bug, but rather than letting the entire system lock up
+	 * forcefully leave the reader section. The thread is not holding 
+	 * any references anyway since it is exiting so it is safe.
+	 */
+	if (RCU_CNT_INC <= THREAD->rcu.nesting_cnt) {
+		/* Emulate _rcu_preempted_unlock() with the proper nesting count. */
+		if (THREAD->rcu.nesting_cnt & RCU_WAS_PREEMPTED) {
+			rm_preempted_reader();
+		}
+
+		printf("Bug: thread (id %" PRIu64 " \"%s\") exited while in RCU read"
+			" section.\n", THREAD->tid, THREAD->name);
+	}
+}
+
+/** Returns true if the current thread is in an rcu reader section. */
+bool rcu_read_locked(void)
+{
+	return RCU_CNT_INC <= THE->rcu_nesting;
+}
+
+/** Invoked when a preempted reader finally exits its reader section. */
+void _rcu_preempted_unlock(void)
+{
+	ASSERT(0 == THE->rcu_nesting || RCU_WAS_PREEMPTED == THE->rcu_nesting);
+	
+	/* Atomically clear the nesting count wrt local NMIs. */
+	size_t prev = local_atomic_exchange(&THE->rcu_nesting, 0);
+	if (prev == RCU_WAS_PREEMPTED) {
+		/* 
+		 * NMI handlers are never preempted but may call rm_preempted_reader()
+		 * if a NMI occurred in _rcu_preempted_unlock() of a preempted thread.
+		 * The only other rcu code that may have been interrupted by the NMI
+		 * in _rcu_preempted_unlock() is: an IPI/sample_local_cpu() and
+		 * the initial part of rcu_after_thread_ran().
+		 * 
+		 * rm_preempted_reader() will not deadlock because none of the locks
+		 * it uses are locked in this case. Neither _rcu_preempted_unlock()
+		 * nor sample_local_cpu() nor the initial part of rcu_after_thread_ran()
+		 * acquire any locks.
+		 */
+		rm_preempted_reader();
+	}
+}
+
+#elif defined(RCU_PREEMPT_PODZIMEK)
+
+/** Waits for the grace period associated with callbacks cur_cbs to elapse.
+ *
+ * @param expedite Instructs the detector to aggressively speed up grace
+ *            period detection without any delay.
+ * @param completed_gp Returns the most recent completed grace period 
+ *            number.
+ * @return false if the thread was interrupted and should stop.
+ */
+static bool wait_for_cur_cbs_gp_end(bool expedite, rcu_gp_t *completed_gp)
+{
+	/* 
+	 * Use a possibly outdated version of completed_gp to bypass checking
+	 * with the lock.
+	 * 
+	 * Note that loading and storing rcu.completed_gp is not atomic 
+	 * (it is 64bit wide). Reading a clobbered value that is less than 
+	 * rcu.completed_gp is harmless - we'll recheck with a lock. The 
+	 * only way to read a clobbered value that is greater than the actual 
+	 * value is if the detector increases the higher-order word first and 
+	 * then decreases the lower-order word (or we see stores in that order), 
+	 * eg when incrementing from 2^32 - 1 to 2^32. The loaded value 
+	 * suddenly jumps by 2^32. It would take hours for such an increase 
+	 * to occur so it is safe to discard the value. We allow increases 
+	 * of up to half the maximum to generously accommodate for loading an
+	 * outdated lower word.
+	 */
+	rcu_gp_t compl_gp = ACCESS_ONCE(rcu.completed_gp);
+	if (CPU->rcu.cur_cbs_gp <= compl_gp 
+		&& compl_gp <= CPU->rcu.cur_cbs_gp + UINT32_MAX_HALF) {
+		*completed_gp = compl_gp;
+		return true;
+	}
+	
+	spinlock_lock(&rcu.gp_lock);
+	
+	/* Recheck under the lock; the fast path may have raced. */
+	if (CPU->rcu.cur_cbs_gp <= rcu.completed_gp) {
+		*completed_gp = rcu.completed_gp;
+		spinlock_unlock(&rcu.gp_lock);
+		return true;
+	}
+	
+	ASSERT(CPU->rcu.cur_cbs_gp <= CPU->rcu.next_cbs_gp);
+	ASSERT(_rcu_cur_gp <= CPU->rcu.cur_cbs_gp);
+	
+	/* 
+	 * Notify the detector of how many GP ends we intend to wait for, so 
+	 * it can avoid going to sleep unnecessarily. Optimistically assume
+	 * new callbacks will arrive while we're waiting; hence +1.
+	 */
+	size_t remaining_gp_ends = (size_t) (CPU->rcu.next_cbs_gp - _rcu_cur_gp);
+	req_detection(remaining_gp_ends + (arriving_cbs_empty() ? 0 : 1));
+	
+	/* 
+	 * Ask the detector to speed up GP detection if there are too many 
+	 * pending callbacks and other reclaimers have not already done so.
+	 */
+	if (expedite) {
+		if(0 == rcu.req_expedited_cnt) 
+			condvar_signal(&rcu.expedite_now);
+		
+		/* 
+		 * Expedite only cur_cbs. If there really is a surge of callbacks 
+		 * the arriving batch will expedite the GP for the huge number
+		 * of callbacks currently in next_cbs.
+		 */
+		rcu.req_expedited_cnt = 1;
+	}
+
+	/* Wait for cur_cbs_gp to end. */
+	bool interrupted = cv_wait_for_gp(CPU->rcu.cur_cbs_gp);
+	
+	*completed_gp = rcu.completed_gp;
+	spinlock_unlock(&rcu.gp_lock);	
+	
+	if (!interrupted)
+		upd_missed_gp_in_wait(*completed_gp);
+	
+	return !interrupted;
+}
+
+/** Waits for an announcement of the end of the grace period wait_on_gp.
+ *
+ * @return true if the wait was interrupted (note: NOT success), so callers
+ *         typically negate the result.
+ */
+static bool cv_wait_for_gp(rcu_gp_t wait_on_gp)
+{
+	ASSERT(spinlock_locked(&rcu.gp_lock));
+	
+	bool interrupted = false;
+	
+	/* Wait until wait_on_gp ends. */
+	while (rcu.completed_gp < wait_on_gp && !interrupted) {
+		int ret = _condvar_wait_timeout_spinlock(&rcu.gp_ended, &rcu.gp_lock, 
+			SYNCH_NO_TIMEOUT, SYNCH_FLAGS_INTERRUPTIBLE);
+		interrupted = (ret == ESYNCH_INTERRUPTED);
+	}
+	
+	return interrupted;
+}
+
+/** Requests the detector to detect at least req_cnt consecutive grace periods.
+ *
+ * Must be called with rcu.gp_lock held. Only wakes the detector if it
+ * was idle (no outstanding requests).
+ */
+static void req_detection(size_t req_cnt)
+{
+	if (rcu.req_gp_end_cnt < req_cnt) {
+		bool detector_idle = (0 == rcu.req_gp_end_cnt);
+		rcu.req_gp_end_cnt = req_cnt;
+
+		if (detector_idle) {
+			ASSERT(_rcu_cur_gp == rcu.completed_gp);
+			condvar_signal(&rcu.req_gp_changed);
+		}
+	}
+}
+
+
+/** The detector thread detects and notifies reclaimers of grace period ends.
+ *
+ * @param arg Unused; matches the kernel thread entry point signature.
+ */
+static void detector(void *arg)
+{
+	spinlock_lock(&rcu.gp_lock);
+	
+	while (wait_for_detect_req()) {
+		/* 
+		 * Announce new GP started. Readers start lazily acknowledging that
+		 * they passed a QS.
+		 */
+		start_new_gp();
+		
+		spinlock_unlock(&rcu.gp_lock);
+		
+		/* Interrupted - exit with gp_lock already released. */
+		if (!wait_for_readers()) 
+			goto unlocked_out;
+		
+		spinlock_lock(&rcu.gp_lock);
+
+		/* Notify reclaimers that they may now invoke queued callbacks. */
+		end_cur_gp();
+	}
+	
+	spinlock_unlock(&rcu.gp_lock);
+	
+unlocked_out:
+	return;
+}
+
+/** Waits for a request from a reclaimer thread to detect a grace period.
+ *
+ * Must be called with rcu.gp_lock held.
+ * @return false if the wait was interrupted and the detector should exit.
+ */
+static bool wait_for_detect_req(void)
+{
+	ASSERT(spinlock_locked(&rcu.gp_lock));
+	
+	bool interrupted = false;
+	
+	while (0 == rcu.req_gp_end_cnt && !interrupted) {
+		int ret = _condvar_wait_timeout_spinlock(&rcu.req_gp_changed, 
+			&rcu.gp_lock, SYNCH_NO_TIMEOUT, SYNCH_FLAGS_INTERRUPTIBLE);
+		
+		interrupted = (ret == ESYNCH_INTERRUPTED);
+	}
+	
+	return !interrupted;
+}
+
+
+/** Marks the current GP completed and wakes all reclaimers waiting for it.
+ *
+ * Must be called with rcu.gp_lock held.
+ */
+static void end_cur_gp(void)
+{
+	ASSERT(spinlock_locked(&rcu.gp_lock));
+	
+	rcu.completed_gp = _rcu_cur_gp;
+	--rcu.req_gp_end_cnt;
+	
+	condvar_broadcast(&rcu.gp_ended);
+}
+
+/** Waits for readers that started before the current GP started to finish.
+ *
+ * @return false if the detector was interrupted and should exit.
+ */
+static bool wait_for_readers(void)
+{
+	DEFINE_CPU_MASK(reading_cpus);
+	
+	/* All running cpus have potential readers. */
+	cpu_mask_active(reading_cpus);
+
+	/* 
+	 * Give readers time to pass through a QS. Also, batch arriving 
+	 * callbacks in order to amortize detection overhead.
+	 */
+	if (!gp_sleep())
+		return false;
+	
+	/* Non-intrusively determine which cpus have yet to pass a QS. */
+	rm_quiescent_cpus(reading_cpus);
+	
+	/* Actively interrupt cpus delaying the current GP and demand a QS. */
+	interrupt_delaying_cpus(reading_cpus);
+	
+	/* Wait for the interrupted cpus to notify us that they reached a QS. */
+	if (!wait_for_delaying_cpus())
+		return false;
+	/*
+	 * All cpus recorded a QS or are still idle. Any new readers will be added
+	 * to next_preempted if preempted, ie the number of readers in cur_preempted
+	 * monotonically decreases.
+	 */
+	
+	/* Wait for the last reader in cur_preempted to notify us it is done. */
+	if (!wait_for_preempt_reader())
+		return false;
+	
+	return true;
+}
+
+/** Sleeps a while if the current grace period is not to be expedited.
+ *
+ * @return false if the sleep was interrupted and the detector should exit.
+ */
+static bool gp_sleep(void)
+{
+	spinlock_lock(&rcu.gp_lock);
+
+	int ret = 0;
+	while (0 == rcu.req_expedited_cnt && 0 == ret) {
+		/* minor bug: sleeps for the same duration if woken up spuriously. */
+		ret = _condvar_wait_timeout_spinlock(&rcu.expedite_now, &rcu.gp_lock,
+			DETECT_SLEEP_MS * 1000, SYNCH_FLAGS_INTERRUPTIBLE);
+	}
+	
+	/* Consume one expedite request (skipped the sleep or was woken early). */
+	if (0 < rcu.req_expedited_cnt) {
+		--rcu.req_expedited_cnt;
+		/* Update statistic. */
+		++rcu.stat_expedited_cnt;
+	}
+	
+	spinlock_unlock(&rcu.gp_lock);
+	
+	return (ret != ESYNCH_INTERRUPTED);
+}
+
+/** Actively interrupts and checks the offending cpus for quiescent states.
+ *
+ * Resets rcu.delaying_cpu_cnt first; sample_local_cpu() increments it for
+ * every cpu caught inside a reader section.
+ */
+static void interrupt_delaying_cpus(cpu_mask_t *cpu_mask)
+{
+	atomic_set(&rcu.delaying_cpu_cnt, 0);
+	
+	sample_cpus(cpu_mask, NULL);
+}
+
+/** Invoked on a cpu delaying grace period detection. 
+ * 
+ * Induces a quiescent state for the cpu or it instructs remaining 
+ * readers to notify the detector once they finish.
+ *
+ * @param arg Unused (NULL is passed by interrupt_delaying_cpus()).
+ */
+static void sample_local_cpu(void *arg)
+{
+	ASSERT(interrupts_disabled());
+	ASSERT(!CPU->rcu.is_delaying_gp);
+	
+	/* Cpu did not pass a quiescent state yet. */
+	if (CPU->rcu.last_seen_gp != _rcu_cur_gp) {
+		/* Interrupted a reader in a reader critical section. */
+		if (0 < CPU->rcu.nesting_cnt) {
+			ASSERT(!CPU->idle);
+			/* 
+			 * Note to notify the detector from rcu_read_unlock(). 
+			 * 
+			 * ACCESS_ONCE ensures the compiler writes to is_delaying_gp
+			 * only after it determines that we are in a reader CS.
+			 */
+			ACCESS_ONCE(CPU->rcu.is_delaying_gp) = true;
+			CPU->rcu.signal_unlock = true;
+			
+			atomic_inc(&rcu.delaying_cpu_cnt);
+		} else {
+			/* 
+			 * The cpu did not enter any rcu reader sections since 
+			 * the start of the current GP. Record a quiescent state.
+			 * 
+			 * Or, we interrupted rcu_read_unlock_impl() right before
+			 * it recorded a QS. Record a QS for it. The memory barrier 
+			 * contains the reader section's mem accesses before 
+			 * updating last_seen_gp.
+			 * 
+			 * Or, we interrupted rcu_read_lock() right after it recorded
+			 * a QS for the previous GP but before it got a chance to
+			 * increment its nesting count. The memory barrier again
+			 * stops the CS code from spilling out of the CS.
+			 */
+			memory_barrier();
+			CPU->rcu.last_seen_gp = _rcu_cur_gp;
+		}
+	} else {
+		/* 
+		 * This cpu already acknowledged that it had passed through 
+		 * a quiescent state since the start of cur_gp. 
+		 */
+	}
+	
+	/* 
+	 * smp_call() makes sure any changes propagate back to the caller.
+	 * In particular, it makes the most current last_seen_gp visible
+	 * to the detector.
+	 */
+}
+
+/** Waits for cpus delaying the current grace period if there are any.
+ *
+ * Each delaying cpu ups rcu.remaining_readers once it reaches a QS;
+ * down the semaphore once per delaying cpu counted by sample_local_cpu().
+ *
+ * @return false if interrupted while waiting.
+ */
+static bool wait_for_delaying_cpus(void)
+{
+	int delaying_cpu_cnt = atomic_get(&rcu.delaying_cpu_cnt);
+
+	for (int i = 0; i < delaying_cpu_cnt; ++i){
+		if (!semaphore_down_interruptable(&rcu.remaining_readers))
+			return false;
+	}
+	
+	/* Update statistic. */
+	rcu.stat_delayed_cnt += delaying_cpu_cnt;
+	
+	return true;
+}
+
+/** Called by the scheduler() when switching away from the current thread. */
+void rcu_after_thread_ran(void)
+{
+	ASSERT(interrupts_disabled());
+
+	/* 
+	 * Prevent NMI handlers from interfering. The detector will be notified
+	 * in this function if CPU->rcu.is_delaying_gp. The current thread is 
+	 * no longer running so there is nothing else to signal to the detector.
+	 */
+	CPU->rcu.signal_unlock = false;
+	/* 
+	 * Separates clearing of .signal_unlock from accesses to 
+	 * THREAD->rcu.was_preempted and CPU->rcu.nesting_cnt.
+	 */
+	compiler_barrier();
+	
+	/* Save the thread's nesting count when it is not running. */
+	THREAD->rcu.nesting_cnt = CPU->rcu.nesting_cnt;
+	
+	/* Preempted a reader critical section for the first time. */
+	if (0 < THREAD->rcu.nesting_cnt && !THREAD->rcu.was_preempted) {
+		THREAD->rcu.was_preempted = true;
+		note_preempted_reader();
+	}
+	
+	/* 
+	 * The preempted reader has been noted globally. There are therefore
+	 * no readers running on this cpu so this is a quiescent state.
+	 */
+	_rcu_record_qs();
+
+	/* 
+	 * Interrupt handlers might use RCU while idle in scheduler(). 
+	 * The preempted reader has been noted globally, so the handlers 
+	 * may now start announcing quiescent states.
+	 */
+	CPU->rcu.nesting_cnt = 0;
+	
+	/* 
+	 * This cpu is holding up the current GP. Let the detector know 
+	 * it has just passed a quiescent state. 
+	 * 
+	 * The detector waits separately for preempted readers, so we have 
+	 * to notify the detector even if we have just preempted a reader.
+	 */
+	if (CPU->rcu.is_delaying_gp) {
+		CPU->rcu.is_delaying_gp = false;
+		semaphore_up(&rcu.remaining_readers);
+	}
+
+	/* 
+	 * Forcefully associate the detector with the highest priority
+	 * even if preempted due to its time slice running out.
+	 * 
+	 * todo: Replace with strict scheduler priority classes.
+	 */
+	if (THREAD == rcu.detector_thr) {
+		THREAD->priority = -1;
+	} 
+	else if (THREAD == CPU->rcu.reclaimer_thr) {
+		THREAD->priority = -1;
+	} 
+	
+	upd_max_cbs_in_slice(CPU->rcu.arriving_cbs_cnt);
+}
+
+/** Called by the scheduler() when switching to a newly scheduled thread. */
+void rcu_before_thread_runs(void)
+{
+	ASSERT(PREEMPTION_DISABLED || interrupts_disabled());
+	ASSERT(0 == CPU->rcu.nesting_cnt);
+	
+	/* Load the thread's saved nesting count from before it was preempted. */
+	CPU->rcu.nesting_cnt = THREAD->rcu.nesting_cnt;
+	
+	/* 
+	 * Ensures NMI see the proper nesting count before .signal_unlock.
+	 * Otherwise the NMI may incorrectly signal that a preempted reader
+	 * exited its reader section.
+	 */
+	compiler_barrier();
+	
+	/* 
+	 * In the unlikely event that a NMI occurs between the loading of the 
+	 * variables and setting signal_unlock, the NMI handler may invoke 
+	 * rcu_read_unlock() and clear signal_unlock. In that case we will
+	 * incorrectly overwrite signal_unlock from false to true. This event
+	 * is benign and the next rcu_read_unlock() will at worst 
+	 * needlessly invoke _rcu_signal_unlock().
+	 */
+	CPU->rcu.signal_unlock = THREAD->rcu.was_preempted || CPU->rcu.is_delaying_gp;
+}
+
+/** Called from scheduler() when exiting the current thread. 
+ * 
+ * Preemption or interrupts are disabled and the scheduler() already
+ * switched away from the current thread, calling rcu_after_thread_ran().
+ */
+void rcu_thread_exiting(void)
+{
+	ASSERT(THREAD != NULL);
+	ASSERT(THREAD->state == Exiting);
+	ASSERT(PREEMPTION_DISABLED || interrupts_disabled());
+	
+	/* 
+	 * The thread forgot to exit its reader critical section. 
+	 * It is a bug, but rather than letting the entire system lock up
+	 * forcefully leave the reader section. The thread is not holding 
+	 * any references anyway since it is exiting so it is safe.
+	 */
+	if (0 < THREAD->rcu.nesting_cnt) {
+		/* Collapse any nested sections and exit via the regular unlock path. */
+		THREAD->rcu.nesting_cnt = 1;
+		read_unlock_impl(&THREAD->rcu.nesting_cnt);
+
+		printf("Bug: thread (id %" PRIu64 " \"%s\") exited while in RCU read"
+			" section.\n", THREAD->tid, THREAD->name);
+	}
+}
+
+
+#endif /* RCU_PREEMPT_PODZIMEK */
+
+/** Announces the start of a new grace period for preexisting readers to ack.
+ *
+ * Must be called with rcu.gp_lock held.
+ */
+static void start_new_gp(void)
+{
+	ASSERT(spinlock_locked(&rcu.gp_lock));
+	
+	irq_spinlock_lock(&rcu.preempt_lock, true);
+	
+	/* Start a new GP. Announce to readers that a quiescent state is needed. */
+	++_rcu_cur_gp;
+	
+	/* 
+	 * Readers preempted before the start of this GP (next_preempted)
+	 * are preexisting readers now that a GP started and will hold up 
+	 * the current GP until they exit their reader sections.
+	 * 
+	 * Preempted readers from the previous GP have finished so 
+	 * cur_preempted is empty, but see comment in _rcu_record_qs(). 
+	 */
+	list_concat(&rcu.cur_preempted, &rcu.next_preempted);
+	
+	irq_spinlock_unlock(&rcu.preempt_lock, true);
+}
+
+/** Remove those cpus from the mask that have already passed a quiescent
+ * state since the start of the current grace period.
+ */
+static void rm_quiescent_cpus(cpu_mask_t *cpu_mask)
+{
+	/*
+	 * Ensure the announcement of the start of a new GP (ie up-to-date 
+	 * cur_gp) propagates to cpus that are just coming out of idle 
+	 * mode before we sample their idle state flag.
+	 * 
+	 * Cpus guarantee that after they set CPU->idle = true they will not
+	 * execute any RCU reader sections without first setting idle to
+	 * false and issuing a memory barrier. Therefore, if rm_quiescent_cpus()
+	 * later on sees an idle cpu, but the cpu is just exiting its idle mode,
+	 * the cpu must not have yet executed its memory barrier (otherwise
+	 * it would pair up with this mem barrier and we would see idle == false).
+	 * That memory barrier will pair up with the one below and ensure
+	 * that a reader on the now-non-idle cpu will see the most current
+	 * cur_gp. As a result, such a reader will never attempt to semaphore_up(
+	 * pending_readers) during this GP, which allows the detector to
+	 * ignore that cpu (the detector thinks it is idle). Moreover, any
+	 * changes made by RCU updaters will have propagated to readers
+	 * on the previously idle cpu -- again thanks to issuing a memory
+	 * barrier after returning from idle mode.
+	 * 
+	 * idle -> non-idle cpu      | detector      | reclaimer
+	 * ------------------------------------------------------
+	 * rcu reader 1              |               | rcu_call()
+	 * MB X                      |               |
+	 * idle = true               |               | rcu_call() 
+	 * (no rcu readers allowed ) |               | MB A in advance_cbs() 
+	 * MB Y                      | (...)         | (...)
+	 * (no rcu readers allowed)  |               | MB B in advance_cbs() 
+	 * idle = false              | ++cur_gp      |
+	 * (no rcu readers allowed)  | MB C          |
+	 * MB Z                      | signal gp_end |
+	 * rcu reader 2              |               | exec_cur_cbs()
+	 * 
+	 * 
+	 * MB Y orders visibility of changes to idle for detector's sake.
+	 * 
+	 * MB Z pairs up with MB C. The cpu making a transition from idle 
+	 * will see the most current value of cur_gp and will not attempt
+	 * to notify the detector even if preempted during this GP.
+	 * 
+	 * MB Z pairs up with MB A from the previous batch. Updaters' changes
+	 * are visible to reader 2 even when the detector thinks the cpu is idle 
+	 * but it is not anymore.
+	 * 
+	 * MB X pairs up with MB B. Late mem accesses of reader 1 are contained
+	 * and visible before idling and before any callbacks are executed 
+	 * by reclaimers.
+	 * 
+	 * In summary, the detector does not know of or wait for reader 2, but
+	 * it does not have to since it is a new reader that will not access
+	 * data from previous GPs and will see any changes.
+	 */
+	memory_barrier(); /* MB C */
+	
+	cpu_mask_for_each(*cpu_mask, cpu_id) {
+		/* 
+		 * The cpu already checked for and passed through a quiescent 
+		 * state since the beginning of this GP.
+		 * 
+		 * _rcu_cur_gp is modified by local detector thread only. 
+		 * Therefore, it is up-to-date even without a lock. 
+		 * 
+		 * cpu.last_seen_gp may not be up-to-date. At worst, we will
+		 * unnecessarily sample its last_seen_gp with a smp_call. 
+		 */
+		bool cpu_acked_gp = (cpus[cpu_id].rcu.last_seen_gp == _rcu_cur_gp);
+		
+		/*
+		 * Either the cpu is idle or it is exiting away from idle mode
+		 * and already sees the most current _rcu_cur_gp. See comment
+		 * in the block above.
+		 */
+		bool cpu_idle = cpus[cpu_id].idle;
+		
+		if (cpu_acked_gp || cpu_idle) {
+			cpu_mask_reset(cpu_mask, cpu_id);
+		}
+	}
+}
+
+/** Serially invokes sample_local_cpu(arg) on each cpu of reader_cpus.
+ *
+ * smp_call() blocks until the remote handler completes, so the calls
+ * are sequential, not concurrent.
+ */
+static void sample_cpus(cpu_mask_t *reader_cpus, void *arg)
+{
+	cpu_mask_for_each(*reader_cpus, cpu_id) {
+		smp_call(cpu_id, sample_local_cpu, arg);
+
+		/* Update statistic. */
+		if (CPU->id != cpu_id)
+			++rcu.stat_smp_call_cnt;
+	}
+}
+
+/** Accounts GPs that completed while this reclaimer slept waiting for one. */
+static void upd_missed_gp_in_wait(rcu_gp_t completed_gp)
+{
+	ASSERT(CPU->rcu.cur_cbs_gp <= completed_gp);
+	
+	size_t delta = (size_t)(completed_gp - CPU->rcu.cur_cbs_gp);
+	CPU->rcu.stat_missed_gp_in_wait += delta;
+}
+
+/** Globally note that the current thread was preempted in a reader section.
+ *
+ * Interrupts are expected to be disabled by the caller (preempt_lock is
+ * taken with irq_dis == false).
+ */
+static void note_preempted_reader(void)
+{
+	irq_spinlock_lock(&rcu.preempt_lock, false);
+
+	if (CPU->rcu.last_seen_gp != _rcu_cur_gp) {
+		/* The reader started before the GP started - we must wait for it.*/
+		list_append(&THREAD->rcu.preempt_link, &rcu.cur_preempted);
+	} else {
+		/* 
+		 * The reader started after the GP started and this cpu
+		 * already noted a quiescent state. We might block the next GP.
+		 */
+		list_append(&THREAD->rcu.preempt_link, &rcu.next_preempted);
+	}
+
+	irq_spinlock_unlock(&rcu.preempt_lock, false);
+}
+
+/** Remove the current thread from the global list of preempted readers. */
+static void rm_preempted_reader(void)
+{
+	irq_spinlock_lock(&rcu.preempt_lock, true);
+	
+	ASSERT(link_used(&THREAD->rcu.preempt_link));
+
+	/* The thread may be on either cur_preempted or next_preempted. */
+	bool prev_empty = list_empty(&rcu.cur_preempted);
+	list_remove(&THREAD->rcu.preempt_link);
+	bool now_empty = list_empty(&rcu.cur_preempted);
+
+	/* This was the last reader in cur_preempted. */
+	bool last_removed = now_empty && !prev_empty;
+
+	/* 
+	 * Preempted readers are blocking the detector and 
+	 * this was the last reader blocking the current GP. 
+	 */
+	if (last_removed && rcu.preempt_blocking_det) {
+		rcu.preempt_blocking_det = false;
+		semaphore_up(&rcu.remaining_readers);
+	}
+
+	irq_spinlock_unlock(&rcu.preempt_lock, true);
+}
+
+/** Waits for any preempted readers blocking this grace period to finish.
+ *
+ * @return false if interrupted while waiting on the semaphore.
+ */
+static bool wait_for_preempt_reader(void)
+{
+	irq_spinlock_lock(&rcu.preempt_lock, true);
+
+	bool reader_exists = !list_empty(&rcu.cur_preempted);
+	rcu.preempt_blocking_det = reader_exists;
+	
+	irq_spinlock_unlock(&rcu.preempt_lock, true);
+	
+	if (reader_exists) {
+		/* Update statistic. */
+		++rcu.stat_preempt_blocking_cnt;
+		
+		/* The last exiting reader in cur_preempted ups the semaphore. */
+		return semaphore_down_interruptable(&rcu.remaining_readers);
+	} 	
+	
+	return true;
+}
+
+/** Tracks the maximum number of callbacks that arrived in a single
+ * thread time slice on this cpu (statistics only).
+ */
+static void upd_max_cbs_in_slice(size_t arriving_cbs_cnt)
+{
+	rcu_cpu_data_t *cr = &CPU->rcu;
+	
+	if (arriving_cbs_cnt > cr->last_arriving_cnt) {
+		size_t arrived_cnt = arriving_cbs_cnt - cr->last_arriving_cnt;
+		cr->stat_max_slice_cbs = max(arrived_cnt, cr->stat_max_slice_cbs);
+	}
+	
+	cr->last_arriving_cnt = arriving_cbs_cnt;
+}
+
+/** Prints RCU run-time statistics. */
+void rcu_print_stat(void)
+{
+	/* 
+	 * Don't take locks. Worst case is we get out-dated values. 
+	 * CPU local values are updated without any locks, so there 
+	 * are no locks to lock in order to get up-to-date values.
+	 */
+	
+	/*
+	 * Note: there is no #else branch; if neither RCU_PREEMPT_* macro is
+	 * defined, `algo` is undeclared and compilation fails. The build
+	 * configuration (RCU choice) presumably guarantees exactly one is set.
+	 */
+#ifdef RCU_PREEMPT_PODZIMEK
+	const char *algo = "podzimek-preempt-rcu";
+#elif defined(RCU_PREEMPT_A)
+	const char *algo = "a-preempt-rcu";
+#endif
+	
+	printf("Config: expedite_threshold=%d, critical_threshold=%d,"
+		" detect_sleep=%dms, %s\n",	
+		EXPEDITE_THRESHOLD, CRITICAL_THRESHOLD, DETECT_SLEEP_MS, algo);
+	printf("Completed GPs: %" PRIu64 "\n", rcu.completed_gp);
+	printf("Expedited GPs: %zu\n", rcu.stat_expedited_cnt);
+	printf("Delayed GPs:   %zu (cpus w/ still running readers after gp sleep)\n", 
+		rcu.stat_delayed_cnt);
+	printf("Preempt blocked GPs: %zu (waited for preempted readers; "
+		"running or not)\n", rcu.stat_preempt_blocking_cnt);
+	printf("Smp calls:     %zu\n", rcu.stat_smp_call_cnt);
+	
+	printf("Max arrived callbacks per GP and CPU:\n");
+	for (unsigned int i = 0; i < config.cpu_count; ++i) {
+		printf(" %zu", cpus[i].rcu.stat_max_cbs);
+	}
+
+	printf("\nAvg arrived callbacks per GP and CPU (nonempty batches only):\n");
+	for (unsigned int i = 0; i < config.cpu_count; ++i) {
+		printf(" %zu", cpus[i].rcu.stat_avg_cbs);
+	}
+	
+	printf("\nMax arrived callbacks per time slice and CPU:\n");
+	for (unsigned int i = 0; i < config.cpu_count; ++i) {
+		printf(" %zu", cpus[i].rcu.stat_max_slice_cbs);
+	}
+
+	printf("\nMissed GP notifications per CPU:\n");
+	for (unsigned int i = 0; i < config.cpu_count; ++i) {
+		printf(" %zu", cpus[i].rcu.stat_missed_gps);
+	}
+
+	printf("\nMissed GP notifications per CPU while waking up:\n");
+	for (unsigned int i = 0; i < config.cpu_count; ++i) {
+		printf(" %zu", cpus[i].rcu.stat_missed_gp_in_wait);
+	}
+	printf("\n");
+}
+
+/** @}
+ */
Index: kernel/generic/src/synch/smc.c
===================================================================
--- kernel/generic/src/synch/smc.c	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/generic/src/synch/smc.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -41,4 +41,5 @@
 #include <arch/barrier.h>
 #include <synch/smc.h>
+#include <mm/as.h>
 
 sysarg_t sys_smc_coherence(uintptr_t va, size_t size)
Index: kernel/generic/src/synch/smp_memory_barrier.c
===================================================================
--- kernel/generic/src/synch/smp_memory_barrier.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
+++ kernel/generic/src/synch/smp_memory_barrier.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup generic
+ * @{
+ */
+
+/**
+ * @file
+ * @brief Syscall implementation that issues a memory barrier on all cpus.
+ */
+
+#include <synch/smp_memory_barrier.h>
+#include <smp/smp_call.h>
+#include <config.h>
+
+
+/** Empty smp_call handler; the barrier comes from the smp_call machinery. */
+static void issue_mem_bar(void *arg)
+{
+	/* smp_call already issues memory barriers on return from this function */
+}
+
+/** Issues a memory barrier on all active cpus.
+ * 
+ * NOTE(review): this does not restrict itself to cpus running threads of
+ * the current task - it smp_calls every active cpu. Also assumes active
+ * cpus occupy ids 0 .. config.cpu_active - 1; confirm against cpu bring-up.
+ * 
+ * @return Irrelevant.
+ */
+sysarg_t sys_smp_memory_barrier(void)
+{
+	for (unsigned int cpu_id = 0; cpu_id < config.cpu_active; ++cpu_id) {
+		smp_call(cpu_id, issue_mem_bar, NULL);
+	}
+	
+	return 0;
+}
+
+/** @}
+ */
Index: kernel/generic/src/synch/spinlock.c
===================================================================
--- kernel/generic/src/synch/spinlock.c	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/generic/src/synch/spinlock.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -45,4 +45,5 @@
 #include <symtab.h>
 #include <stacktrace.h>
+#include <cpu.h>
 
 #ifdef CONFIG_SMP
@@ -198,7 +199,6 @@
  *
  * @param lock    IRQ spinlock to be locked.
- * @param irq_dis If true, interrupts are actually disabled
- *                prior locking the spinlock. If false, interrupts
- *                are expected to be already disabled.
+ * @param irq_dis If true, disables interrupts before locking the spinlock.
+ *                If false, interrupts are expected to be already disabled.
  *
  */
Index: kernel/generic/src/synch/waitq.c
===================================================================
--- kernel/generic/src/synch/waitq.c	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/generic/src/synch/waitq.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -57,4 +57,6 @@
 
 static void waitq_sleep_timed_out(void *);
+static void waitq_complete_wakeup(waitq_t *);
+
 
 /** Initialize wait queue
@@ -330,4 +332,18 @@
 		break;
 	default:
+		/* 
+		 * Wait for a waitq_wakeup() or waitq_unsleep() to complete
+		 * before returning from waitq_sleep() to the caller. Otherwise
+		 * the caller might expect that the wait queue is no longer used 
+		 * and deallocate it (although the wakeup on another cpu has
+		 * not yet completed and is using the wait queue). 
+		 * 
+		 * Note that we have to do this for ESYNCH_OK_BLOCKED and
+		 * ESYNCH_INTERRUPTED, but not necessarily for ESYNCH_TIMEOUT
+		 * where the timeout handler stops using the waitq before waking 
+		 * us up. To be on the safe side, ensure the waitq is not in use 
+		 * anymore in this case as well.
+		 */
+		waitq_complete_wakeup(wq);
 		break;
 	}
@@ -357,5 +373,5 @@
 	} else {
 		if (PARAM_NON_BLOCKING(flags, usec)) {
-			/* Return immediatelly instead of going to sleep */
+			/* Return immediately instead of going to sleep */
 			return ESYNCH_WOULD_BLOCK;
 		}
@@ -442,4 +458,48 @@
 	irq_spinlock_unlock(&wq->lock, true);
 }
+
+/** If there is a wakeup in progress actively waits for it to complete.
+ * 
+ * The function returns once the concurrently running waitq_wakeup()
+ * exits. It returns immediately if there are no concurrent wakeups 
+ * at the time.
+ * 
+ * Interrupts must be disabled.
+ * 
+ * Example usage:
+ * @code
+ * void callback(waitq *wq)
+ * {
+ *     // Do something and notify wait_for_completion() that we're done.
+ *     waitq_wakeup(wq);
+ * }
+ * void wait_for_completion(void) 
+ * {
+ *     waitq wq;
+ *     waitq_initialize(&wq);
+ *     // Run callback() in the background, pass it wq.
+ *     do_asynchronously(callback, &wq);
+ *     // Wait for callback() to complete its work.
+ *     waitq_sleep(&wq);
+ *     // callback() completed its work, but it may still be accessing 
+ *     // wq in waitq_wakeup(). Therefore it is not yet safe to return 
+ *     // from waitq_sleep() or it would clobber up our stack (where wq 
+ *     // is stored). waitq_sleep() ensures the wait queue is no longer
+ *     // in use by invoking waitq_complete_wakeup() internally.
+ *     
+ *     // waitq_sleep() returned, it is safe to free wq.
+ * }
+ * @endcode
+ * 
+ * @param wq  Pointer to a wait queue.
+ */
+static void waitq_complete_wakeup(waitq_t *wq)
+{
+	ASSERT(interrupts_disabled());
+	
+	irq_spinlock_lock(&wq->lock, false);
+	irq_spinlock_unlock(&wq->lock, false);
+}
+
 
 /** Internal SMP- and IRQ-unsafe version of waitq_wakeup()
Index: kernel/generic/src/synch/workqueue.c
===================================================================
--- kernel/generic/src/synch/workqueue.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
+++ kernel/generic/src/synch/workqueue.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -0,0 +1,976 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup generic
+ * @{
+ */
+
+/**
+ * @file
+ * @brief Work queue/thread pool that automatically adjusts its size
+ *        depending on the current load. Queued work functions may sleep.
+ */
+
+#include <synch/workqueue.h>
+#include <synch/spinlock.h>
+#include <synch/condvar.h>
+#include <synch/mutex.h>
+#include <proc/thread.h>
+#include <config.h>
+#include <arch.h>
+#include <cpu.h>
+#include <macros.h>
+
+#define WORKQ_MAGIC      0xf00c1333U
+#define WORK_ITEM_MAGIC  0xfeec1777U
+
+
+struct work_queue {
+	/* 
+	 * Protects everything except activate_worker. 
+	 * Must be acquired after any thread->locks.
+	 */
+	IRQ_SPINLOCK_DECLARE(lock);
+	
+	/* Activates a worker if new work arrives or if shutting down the queue. */
+	condvar_t activate_worker;
+	
+	/* Queue of work_items ready to be dispatched. */
+	list_t queue;
+	
+	/* List of worker threads. */
+	list_t workers;
+	
+	/* Number of work items queued. */
+	size_t item_cnt;
+	
+	/* Indicates the work queue is shutting down. */
+	bool stopping;
+	const char *name;
+
+	/* Total number of created worker threads. */
+	size_t cur_worker_cnt;
+	/* Number of workers waiting for work to arrive. */
+	size_t idle_worker_cnt;
+	/* Number of idle workers signaled that have not yet been woken up. */
+	size_t activate_pending;
+	/* Number of blocked workers sleeping in work func() (ie not idle). */
+	size_t blocked_worker_cnt;
+	
+	/* Number of pending signal_worker_op() operations. */
+	size_t pending_op_cnt;
+	
+	link_t nb_link;
+	
+#ifdef CONFIG_DEBUG
+	/* Magic cookie for integrity checks. Immutable. Accessed without lock. */
+	uint32_t cookie;
+#endif 
+};
+
+
+/** Min number of idle workers to keep. */
+static size_t min_worker_cnt;
+/** Max total number of workers - be it blocked, idle, or active. */
+static size_t max_worker_cnt;
+/** Max number of concurrently running active workers, ie not blocked nor idle. */
+static size_t max_concurrent_workers;
+/** Max number of work items per active worker before a new worker is activated.*/
+static const size_t max_items_per_worker = 8;
+	
+/** System wide work queue. */
+static struct work_queue g_work_queue;
+
+static int booting = true;
+
+
+typedef struct {
+	IRQ_SPINLOCK_DECLARE(lock);
+	condvar_t req_cv;
+	thread_t *thread;
+	list_t work_queues;
+} nonblock_adder_t;
+
+static nonblock_adder_t nonblock_adder;
+
+
+
+/** Typedef a worker thread signaling operation prototype. */
+typedef void (*signal_op_t)(struct work_queue *workq);
+
+
+/* Fwd decl. */
+static void workq_preinit(struct work_queue *workq, const char *name);
+static bool add_worker(struct work_queue *workq);
+static void interrupt_workers(struct work_queue *workq);
+static void wait_for_workers(struct work_queue *workq);
+static int _workq_enqueue(struct work_queue *workq, work_t *work_item, 
+	work_func_t func, bool can_block);
+static void init_work_item(work_t *work_item, work_func_t func);
+static signal_op_t signal_worker_logic(struct work_queue *workq, bool can_block);
+static void worker_thread(void *arg);
+static bool dequeue_work(struct work_queue *workq, work_t **pwork_item);
+static bool worker_unnecessary(struct work_queue *workq);
+static void cv_wait(struct work_queue *workq);
+static void nonblock_init(void);
+static bool workq_corrupted(struct work_queue *workq);
+static bool work_item_corrupted(work_t *work_item);
+
+
+/** Creates worker thread for the system-wide worker queue. */
+void workq_global_worker_init(void)
+{
+	/* 
+	 * No need for additional synchronization. Stores to word-sized 
+	 * variables are atomic and the change will eventually propagate.
+	 * Moreover add_worker() includes the necessary memory barriers
+	 * in spinlock lock/unlock().
+	 */
+	booting = false;
+	
+	nonblock_init();
+	
+	if (!add_worker(&g_work_queue))
+		panic("Could not create a single global work queue worker!\n");
+	
+}
+
+/** Initializes the system wide work queue and support for other work queues. */
+void workq_global_init(void)
+{
+	/* Keep idle workers on 1/4-th of cpus, but at least 2 threads. */
+	min_worker_cnt = max(2, config.cpu_count / 4);
+	/* Allow max 8 sleeping work items per cpu. */
+	max_worker_cnt = max(32, 8 * config.cpu_count);
+	/* Maximum concurrency without slowing down the system. */
+	max_concurrent_workers = max(2, config.cpu_count);
+	
+	workq_preinit(&g_work_queue, "kworkq");
+}
+
+/** Stops the system global work queue and waits for all work items to complete.*/
+void workq_global_stop(void)
+{
+	workq_stop(&g_work_queue);
+}
+
+/** Creates and initializes a work queue. Returns NULL upon failure. */
+struct work_queue * workq_create(const char *name)
+{
+	struct work_queue *workq = malloc(sizeof(struct work_queue), 0);
+	
+	if (workq) {
+		if (workq_init(workq, name)) {
+			ASSERT(!workq_corrupted(workq));
+			return workq;
+		}
+		
+		free(workq);
+	}
+	
+	return NULL;
+}
+
+/** Frees work queue resources and stops it if it had not been done so already.*/
+void workq_destroy(struct work_queue *workq)
+{
+	ASSERT(!workq_corrupted(workq));
+	
+	irq_spinlock_lock(&workq->lock, true);
+	bool stopped = workq->stopping;
+	size_t running_workers = workq->cur_worker_cnt;
+	irq_spinlock_unlock(&workq->lock, true);
+	
+	if (!stopped) {
+		workq_stop(workq);
+	} else {
+		ASSERT(0 == running_workers);
+	}
+	
+#ifdef CONFIG_DEBUG
+	workq->cookie = 0;
+#endif 
+	
+	free(workq);
+}
+
+/** Initializes workq structure without creating any workers. */
+static void workq_preinit(struct work_queue *workq, const char *name)
+{
+#ifdef CONFIG_DEBUG
+	workq->cookie = WORKQ_MAGIC;
+#endif 
+	
+	irq_spinlock_initialize(&workq->lock, name);
+	condvar_initialize(&workq->activate_worker);
+	
+	list_initialize(&workq->queue);
+	list_initialize(&workq->workers);
+	
+	workq->item_cnt = 0;
+	workq->stopping = false;
+	workq->name = name;
+	
+	workq->cur_worker_cnt = 1;
+	workq->idle_worker_cnt = 0;
+	workq->activate_pending = 0;
+	workq->blocked_worker_cnt = 0;
+	
+	workq->pending_op_cnt = 0;
+	link_initialize(&workq->nb_link);
+}
+
+/** Initializes a work queue. Returns true if successful.  
+ * 
+ * Before destroying a work queue it must be stopped via
+ * workq_stop().
+ */
+int workq_init(struct work_queue *workq, const char *name)
+{
+	workq_preinit(workq, name);
+	return add_worker(workq);
+}
+
+/** Add a new worker thread. Returns false if the thread could not be created. */
+static bool add_worker(struct work_queue *workq)
+{
+	ASSERT(!workq_corrupted(workq));
+
+	thread_t *thread = thread_create(worker_thread, workq, TASK, 
+		THREAD_FLAG_NONE, workq->name);
+	
+	if (!thread) {
+		irq_spinlock_lock(&workq->lock, true);
+		
+		/* cur_worker_cnt proactively increased in signal_worker_logic(). */
+		ASSERT(0 < workq->cur_worker_cnt);
+		--workq->cur_worker_cnt;
+		
+		irq_spinlock_unlock(&workq->lock, true);
+		return false;
+	}
+	
+	/* Respect lock ordering. */
+	irq_spinlock_lock(&thread->lock, true);
+	irq_spinlock_lock(&workq->lock, false);
+
+	bool success;
+
+	if (!workq->stopping) {
+		success = true;
+		
+		/* Try to distribute workers among cpus right away. */
+		unsigned int cpu_id = (workq->cur_worker_cnt) % config.cpu_active;
+		
+		if (!cpus[cpu_id].active)
+			cpu_id = CPU->id;
+
+		thread->workq = workq;	
+		thread->cpu = &cpus[cpu_id];
+		thread->workq_blocked = false;
+		thread->workq_idling = false;
+		link_initialize(&thread->workq_link);
+
+		list_append(&thread->workq_link, &workq->workers);
+	} else {
+		/* 
+		 * Work queue is shutting down - we must not add the worker
+		 * and we cannot destroy it without ready-ing it. Mark it
+		 * interrupted so the worker exits right away without even
+		 * touching workq.
+		 */
+		success = false;
+		
+		/* cur_worker_cnt proactively increased in signal_worker(). */
+		ASSERT(0 < workq->cur_worker_cnt);
+		--workq->cur_worker_cnt;
+	}
+	
+	irq_spinlock_unlock(&workq->lock, false);
+	irq_spinlock_unlock(&thread->lock, true);
+
+	if (!success) {
+		thread_interrupt(thread);
+	}
+		
+	thread_ready(thread);
+	
+	return success;
+}
+
+/** Shuts down the work queue. Waits for all pending work items to complete.  
+ *
+ * workq_stop() may only be run once. 
+ */
+void workq_stop(struct work_queue *workq)
+{
+	ASSERT(!workq_corrupted(workq));
+	
+	interrupt_workers(workq);
+	wait_for_workers(workq);
+}
+
+/** Notifies worker threads the work queue is shutting down. */
+static void interrupt_workers(struct work_queue *workq)
+{
+	irq_spinlock_lock(&workq->lock, true);
+
+	/* workq_stop() may only be called once. */
+	ASSERT(!workq->stopping);
+	workq->stopping = true;
+	
+	/* Respect lock ordering - do not hold workq->lock during broadcast. */
+	irq_spinlock_unlock(&workq->lock, true);
+	
+	condvar_broadcast(&workq->activate_worker);
+}
+
+/** Waits for all worker threads to exit. */
+static void wait_for_workers(struct work_queue *workq)
+{
+	ASSERT(!PREEMPTION_DISABLED);
+	
+	irq_spinlock_lock(&workq->lock, true);
+	
+	list_foreach_safe(workq->workers, cur_worker, next_worker) {
+		thread_t *worker = list_get_instance(cur_worker, thread_t, workq_link);
+		list_remove(cur_worker);
+
+		/* Wait without the lock. */
+		irq_spinlock_unlock(&workq->lock, true);
+		
+		thread_join(worker);
+		thread_detach(worker);
+		
+		irq_spinlock_lock(&workq->lock, true);
+	}
+	
+	ASSERT(list_empty(&workq->workers));
+	
+	/* Wait for deferred add_worker_op(), signal_worker_op() to finish. */
+	while (0 < workq->cur_worker_cnt || 0 < workq->pending_op_cnt) {
+		irq_spinlock_unlock(&workq->lock, true);
+		
+		scheduler();
+		
+		irq_spinlock_lock(&workq->lock, true);
+	}
+	
+	irq_spinlock_unlock(&workq->lock, true);
+}
+
+/** Queues a function into the global wait queue without blocking. 
+ * 
+ * See workq_enqueue_noblock() for more details.
+ */
+int workq_global_enqueue_noblock(work_t *work_item, work_func_t func)
+{
+	return workq_enqueue_noblock(&g_work_queue, work_item, func);
+}
+
+/** Queues a function into the global wait queue; may block. 
+ * 
+ * See workq_enqueue() for more details.
+ */
+int workq_global_enqueue(work_t *work_item, work_func_t func)
+{
+	return workq_enqueue(&g_work_queue, work_item, func);
+}
+
+/** Adds a function to be invoked in a separate thread without blocking. 
+ * 
+ * workq_enqueue_noblock() is guaranteed not to block. It is safe 
+ * to invoke from interrupt handlers.
+ * 
+ * Consider using workq_enqueue() instead if at all possible. Otherwise,
+ * your work item may have to wait for previously enqueued sleeping 
+ * work items to complete if you are unlucky.
+ * 
+ * @param workq     Work queue where to queue the work item.
+ * @param work_item Work item bookkeeping structure. Must be valid
+ *                  until func() is entered.
+ * @param func      User supplied function to invoke in a worker thread.
+ *
+ * @return false if work queue is shutting down; function is not 
+ *               queued for further processing. 
+ * @return true  Otherwise. func() will be invoked in a separate thread.
+ */
+int workq_enqueue_noblock(struct work_queue *workq, work_t *work_item, 
+	work_func_t func)
+{
+	return _workq_enqueue(workq, work_item, func, false);
+}
+
+/** Adds a function to be invoked in a separate thread; may block. 
+ * 
+ * While the workq_enqueue() is unlikely to block, it may do so if too 
+ * many previous work items blocked sleeping.
+ * 
+ * @param workq     Work queue where to queue the work item.
+ * @param work_item Work item bookkeeping structure. Must be valid
+ *                  until func() is entered.
+ * @param func      User supplied function to invoke in a worker thread.
+ *
+ * @return false if work queue is shutting down; function is not 
+ *               queued for further processing. 
+ * @return true  Otherwise. func() will be invoked in a separate thread.
+ */
+int workq_enqueue(struct work_queue *workq, work_t *work_item, work_func_t func)
+{
+	return _workq_enqueue(workq, work_item, func, true);
+}
+
+/** Adds a work item that will be processed by a separate worker thread.
+ * 
+ * func() will be invoked in another kernel thread and may block. 
+ * 
+ * Prefer to call _workq_enqueue() with can_block set. Otherwise
+ * your work item may have to wait for sleeping work items to complete.
+ * If all worker threads are blocked/sleeping a new worker thread cannot
+ * be create without can_block set because creating a thread might
+ * block due to low memory conditions.
+ * 
+ * @param workq     Work queue where to queue the work item.
+ * @param work_item Work item bookkeeping structure. Must be valid
+ *                  until func() is entered.
+ * @param func      User supplied function to invoke in a worker thread.
+ * @param can_block May adding this work item block?
+ *
+ * @return false if work queue is shutting down; function is not 
+ *               queued for further processing. 
+ * @return true  Otherwise.
+ */
+static int _workq_enqueue(struct work_queue *workq, work_t *work_item, 
+	work_func_t func, bool can_block)
+{
+	ASSERT(!workq_corrupted(workq));
+	
+	bool success = true;
+	signal_op_t signal_op = NULL;
+	
+	irq_spinlock_lock(&workq->lock, true);
+	
+	if (workq->stopping) {
+		success = false;
+	} else {
+		init_work_item(work_item, func);
+		list_append(&work_item->queue_link, &workq->queue);
+		++workq->item_cnt;
+		success = true;
+		
+		if (!booting) {
+			signal_op = signal_worker_logic(workq, can_block);
+		} else {
+			/* 
+			 * During boot there are no workers to signal. Just queue 
+			 * the work and let future workers take care of it.
+			 */
+		}
+	}
+	
+	irq_spinlock_unlock(&workq->lock, true);
+
+	if (signal_op) {
+		signal_op(workq);
+	}
+	
+	return success;
+}
+
+/** Prepare an item to be added to the work item queue. */
+static void init_work_item(work_t *work_item, work_func_t func)
+{
+#ifdef CONFIG_DEBUG
+	work_item->cookie = WORK_ITEM_MAGIC;
+#endif 
+	
+	link_initialize(&work_item->queue_link);
+	work_item->func = func;
+}
+
+/** Returns the number of workers running work func() that are not blocked. */
+static size_t active_workers_now(struct work_queue *workq)
+{
+	ASSERT(irq_spinlock_locked(&workq->lock));
+	
+	/* Workers blocked are sleeping in the work function (ie not idle). */
+	ASSERT(workq->blocked_worker_cnt <= workq->cur_worker_cnt);
+	/* Idle workers are waiting for more work to arrive in condvar_wait. */
+	ASSERT(workq->idle_worker_cnt <= workq->cur_worker_cnt);
+	
+	/* Idle + blocked workers == sleeping worker threads. */
+	size_t sleeping_workers = workq->blocked_worker_cnt + workq->idle_worker_cnt;
+	
+	ASSERT(sleeping_workers	<= workq->cur_worker_cnt);
+	/* Workers pending activation are idle workers not yet given a time slice. */
+	ASSERT(workq->activate_pending <= workq->idle_worker_cnt);
+	
+	/* 
+	 * Workers actively running the work func() this very moment and 
+	 * are neither blocked nor idle. Exclude ->activate_pending workers 
+	 * since they will run their work func() once they get a time slice 
+	 * and are not running it right now.
+	 */
+	return workq->cur_worker_cnt - sleeping_workers;
+}
+
+/** 
+ * Returns the number of workers that are running or are about to run work 
+ * func() and that are not blocked. 
+ */
+static size_t active_workers(struct work_queue *workq)
+{
+	ASSERT(irq_spinlock_locked(&workq->lock));
+	
+	/* 
+	 * Workers actively running the work func() and are neither blocked nor 
+	 * idle. ->activate_pending workers will run their work func() once they
+	 * get a time slice after waking from a condvar wait, so count them
+	 * as well.
+	 */
+	return active_workers_now(workq) + workq->activate_pending;
+}
+
+static void add_worker_noblock_op(struct work_queue *workq)
+{
+	condvar_signal(&nonblock_adder.req_cv);
+}
+
+static void add_worker_op(struct work_queue *workq)
+{
+	add_worker(workq);
+}
+
+static void signal_worker_op(struct work_queue *workq)
+{
+	ASSERT(!workq_corrupted(workq));
+
+	condvar_signal(&workq->activate_worker);
+	
+	irq_spinlock_lock(&workq->lock, true);
+	ASSERT(0 < workq->pending_op_cnt);
+	--workq->pending_op_cnt;
+	irq_spinlock_unlock(&workq->lock, true);
+}
+
+/** Determines how to signal workers if at all.
+ * 
+ * @param workq     Work queue where a new work item was queued.
+ * @param can_block True if we may block while signaling a worker or creating 
+ *                  a new worker.
+ * 
+ * @return Function that will notify workers or NULL if no action is needed.
+ */
+static signal_op_t signal_worker_logic(struct work_queue *workq, bool can_block)
+{
+	ASSERT(!workq_corrupted(workq));
+	ASSERT(irq_spinlock_locked(&workq->lock));
+	
+	/* Only signal workers if really necessary. */
+	signal_op_t signal_op = NULL;
+
+	/* 
+	 * Workers actively running the work func() and neither blocked nor idle. 
+	 * Including ->activate_pending workers that will run their work func() 
+	 * once they get a time slice.
+	 */
+	size_t active = active_workers(workq);
+	/* Max total allowed number of work items queued for active workers. */
+	size_t max_load = active * max_items_per_worker;
+
+	/* Active workers are getting overwhelmed - activate another. */
+	if (max_load < workq->item_cnt) {
+
+		size_t remaining_idle = 
+			workq->idle_worker_cnt - workq->activate_pending;
+
+		/* Idle workers still exist - activate one. */
+		if (remaining_idle > 0) {
+			/* 
+			 * Directly changing idle_worker_cnt here would not allow
+			 * workers to recognize spurious wake-ups. Change 
+			 * activate_pending instead.
+			 */
+			++workq->activate_pending;
+			++workq->pending_op_cnt;
+			signal_op = signal_worker_op;
+		} else {
+			/* No idle workers remain. Request that a new one be created. */
+			bool need_worker = (active < max_concurrent_workers)
+				&& (workq->cur_worker_cnt < max_worker_cnt);
+			
+			if (need_worker && can_block) {
+				signal_op = add_worker_op;
+				/* 
+				 * It may take some time to actually create the worker.
+				 * We don't want to swamp the thread pool with superfluous
+				 * worker creation requests so pretend it was already
+				 * created and proactively increase the worker count.
+				 */
+				++workq->cur_worker_cnt;
+			}
+			
+			/* 
+			 * We cannot create a new worker but we need one desperately
+			 * because all workers are blocked in their work functions.
+			 */
+			if (need_worker && !can_block && 0 == active) {
+				ASSERT(0 == workq->idle_worker_cnt);
+				
+				irq_spinlock_lock(&nonblock_adder.lock, true);
+
+				if (nonblock_adder.thread && !link_used(&workq->nb_link)) {
+					signal_op = add_worker_noblock_op;
+					++workq->cur_worker_cnt;
+					list_append(&workq->nb_link, &nonblock_adder.work_queues);
+				}
+
+				irq_spinlock_unlock(&nonblock_adder.lock, true);
+			}
+		}
+	} else {
+		/* 
+		 * There are enough active/running workers to process the queue. 
+		 * No need to signal/activate any new workers.
+		 */
+		signal_op = NULL;
+	}
+	
+	return signal_op;
+}
+
+/** Executes queued work items. */
+static void worker_thread(void *arg)
+{
+	/* 
+	 * The thread has been created after the work queue was ordered to stop. 
+	 * Do not access the work queue and return immediately. 
+	 */
+	if (thread_interrupted(THREAD)) {
+		thread_detach(THREAD);
+		return;
+	}
+	
+	ASSERT(arg != NULL);
+	
+	struct work_queue *workq = arg;
+	work_t *work_item;
+	
+	while (dequeue_work(workq, &work_item)) {
+		/* Copy the func field so func() can safely free work_item. */
+		work_func_t func = work_item->func;
+
+		func(work_item);
+	}
+}
+
+/** Waits and retrieves a work item. Returns false if the worker should exit. */
+static bool dequeue_work(struct work_queue *workq, work_t **pwork_item)
+{
+	ASSERT(!workq_corrupted(workq));
+	
+	irq_spinlock_lock(&workq->lock, true);
+	
+	/* Check if we should exit if load is low. */
+	if (!workq->stopping && worker_unnecessary(workq)) {
+		/* There are too many workers for this load. Exit. */
+		ASSERT(0 < workq->cur_worker_cnt);
+		--workq->cur_worker_cnt;
+		list_remove(&THREAD->workq_link);
+		irq_spinlock_unlock(&workq->lock, true);
+		
+		thread_detach(THREAD);
+		return false;
+	}
+	
+	bool stop = false;
+	
+	/* Wait for work to arrive. */
+	while (list_empty(&workq->queue) && !workq->stopping) {
+		cv_wait(workq);
+		
+		if (0 < workq->activate_pending)
+			--workq->activate_pending;
+	}
+
+	/* Process remaining work even if requested to stop. */
+	if (!list_empty(&workq->queue)) {
+		link_t *work_link = list_first(&workq->queue);
+		*pwork_item = list_get_instance(work_link, work_t, queue_link);
+		
+#ifdef CONFIG_DEBUG
+		ASSERT(!work_item_corrupted(*pwork_item));
+		(*pwork_item)->cookie = 0;
+#endif
+		list_remove(work_link);
+		--workq->item_cnt;
+		
+		stop = false;
+	} else {
+		/* Requested to stop and no more work queued. */
+		ASSERT(workq->stopping);
+		--workq->cur_worker_cnt;
+		stop = true;
+	}
+	
+	irq_spinlock_unlock(&workq->lock, true);
+	
+	return !stop;
+}
+
+/** Returns true if for the given load there are too many workers. */
+static bool worker_unnecessary(struct work_queue *workq)
+{
+	ASSERT(irq_spinlock_locked(&workq->lock));
+	
+	/* No work is pending. We don't need too many idle threads. */
+	if (list_empty(&workq->queue)) {
+		/* There are too many idle workers. Exit. */
+		return (min_worker_cnt <= workq->idle_worker_cnt);
+	} else {
+		/* 
+		 * There is work but we are swamped with too many active workers
+		 * that were woken up from sleep at around the same time. We
+		 * don't need another worker fighting for cpu time.
+		 */
+		size_t active = active_workers_now(workq);
+		return (max_concurrent_workers < active);
+	}
+}
+
+/** Waits for a signal to activate_worker. Thread marked idle while waiting. */
+static void cv_wait(struct work_queue *workq)
+{
+	++workq->idle_worker_cnt;
+	THREAD->workq_idling = true;
+	
+	/* Ignore lock ordering just here. */
+	ASSERT(irq_spinlock_locked(&workq->lock));
+	
+	_condvar_wait_timeout_irq_spinlock(&workq->activate_worker,
+		&workq->lock, SYNCH_NO_TIMEOUT, SYNCH_FLAGS_NONE);
+
+	ASSERT(!workq_corrupted(workq));
+	ASSERT(irq_spinlock_locked(&workq->lock));
+	
+	THREAD->workq_idling = false;
+	--workq->idle_worker_cnt;
+}
+
+
+/** Invoked from thread_ready() right before the thread is woken up. */
+void workq_before_thread_is_ready(thread_t *thread)
+{
+	ASSERT(thread);
+	ASSERT(irq_spinlock_locked(&thread->lock));
+
+	/* Worker's work func() is about to wake up from sleeping. */
+	if (thread->workq && thread->workq_blocked) {
+		/* Must be blocked in user work func() and not be waiting for work. */
+		ASSERT(!thread->workq_idling);
+		ASSERT(thread->state == Sleeping);
+		ASSERT(THREAD != thread);
+		ASSERT(!workq_corrupted(thread->workq));
+		
+		/* Protected by thread->lock */
+		thread->workq_blocked = false;
+		
+		irq_spinlock_lock(&thread->workq->lock, true);
+		--thread->workq->blocked_worker_cnt;
+		irq_spinlock_unlock(&thread->workq->lock, true);
+	}
+}
+
+/** Invoked from scheduler() before switching away from a thread. */
+void workq_after_thread_ran(void)
+{
+	ASSERT(THREAD);
+	ASSERT(irq_spinlock_locked(&THREAD->lock));
+
+	/* Worker's work func() is about to sleep/block. */
+	if (THREAD->workq && THREAD->state == Sleeping && !THREAD->workq_idling) {
+		ASSERT(!THREAD->workq_blocked);
+		ASSERT(!workq_corrupted(THREAD->workq));
+		
+		THREAD->workq_blocked = true;
+		
+		irq_spinlock_lock(&THREAD->workq->lock, false);
+
+		++THREAD->workq->blocked_worker_cnt;
+		
+		bool can_block = false;
+		signal_op_t op = signal_worker_logic(THREAD->workq, can_block);
+		
+		irq_spinlock_unlock(&THREAD->workq->lock, false);
+		
+		if (op) {
+			ASSERT(add_worker_noblock_op == op || signal_worker_op == op);
+			op(THREAD->workq);
+		}
+	}
+}
+
+/** Prints stats of the work queue to the kernel console. */
+void workq_print_info(struct work_queue *workq)
+{
+	irq_spinlock_lock(&workq->lock, true);
+
+	size_t total = workq->cur_worker_cnt;
+	size_t blocked = workq->blocked_worker_cnt;
+	size_t idle = workq->idle_worker_cnt;
+	size_t active = active_workers(workq);
+	size_t items = workq->item_cnt;
+	bool stopping = workq->stopping;
+	bool worker_surplus = worker_unnecessary(workq);
+	const char *load_str = worker_surplus ? "decreasing" : 
+		(0 < workq->activate_pending) ? "increasing" : "stable";
+	
+	irq_spinlock_unlock(&workq->lock, true);
+	
+	printf(
+		"Configuration: max_worker_cnt=%zu, min_worker_cnt=%zu,\n"
+		" max_concurrent_workers=%zu, max_items_per_worker=%zu\n"
+		"Workers: %zu\n"
+		"Active:  %zu (workers currently processing work)\n"
+		"Blocked: %zu (work functions sleeping/blocked)\n"
+		"Idle:    %zu (idle workers waiting for more work)\n"
+		"Items:   %zu (queued not yet dispatched work)\n"
+		"Stopping: %d\n"
+		"Load: %s\n",
+		max_worker_cnt, min_worker_cnt, 
+		max_concurrent_workers, max_items_per_worker,
+		total,
+		active,
+		blocked,
+		idle,
+		items,
+		stopping,
+		load_str
+	);
+}
+
+/** Prints stats of the global work queue. */
+void workq_global_print_info(void)
+{
+	workq_print_info(&g_work_queue);
+}
+
+
+static bool dequeue_add_req(nonblock_adder_t *info, struct work_queue **pworkq)
+{
+	bool stop = false;
+
+	irq_spinlock_lock(&info->lock, true);
+	
+	while (list_empty(&info->work_queues) && !stop) {
+		int ret = _condvar_wait_timeout_irq_spinlock(&info->req_cv, 
+			&info->lock, SYNCH_NO_TIMEOUT, SYNCH_FLAGS_INTERRUPTIBLE);
+		
+		stop = (ret == ESYNCH_INTERRUPTED);
+	}
+	
+	if (!stop) {
+		*pworkq = list_get_instance(list_first(&info->work_queues), 
+			struct work_queue, nb_link);
+
+		ASSERT(!workq_corrupted(*pworkq));
+		
+		list_remove(&(*pworkq)->nb_link);
+	}
+	
+	irq_spinlock_unlock(&info->lock, true);
+	
+	return !stop;
+}
+
+static void thr_nonblock_add_worker(void *arg)
+{
+	nonblock_adder_t *info = arg;
+	struct work_queue *workq;
+	
+	while (dequeue_add_req(info, &workq)) {
+		add_worker(workq);
+	}
+}
+
+
+static void nonblock_init(void)
+{
+	irq_spinlock_initialize(&nonblock_adder.lock, "kworkq-nb.lock");
+	condvar_initialize(&nonblock_adder.req_cv);
+	list_initialize(&nonblock_adder.work_queues);
+	
+	nonblock_adder.thread = thread_create(thr_nonblock_add_worker, 
+		&nonblock_adder, TASK, THREAD_FLAG_NONE, "kworkq-nb");
+	
+	if (nonblock_adder.thread) {
+		thread_ready(nonblock_adder.thread);
+	} else {
+		/* 
+		 * We won't be able to add workers without blocking if all workers
+		 * sleep, but at least boot the system.
+		 */
+		printf("Failed to create kworkq-nb. Sleeping work may stall the workq.\n");
+	}
+}
+
+/** Returns true if the workq is definitely corrupted; false if not sure. 
+ * 
+ * Can be used outside of any locks.
+ */
+static bool workq_corrupted(struct work_queue *workq)
+{
+#ifdef CONFIG_DEBUG
+	/* 
+	 * Needed to make the most current cookie value set by workq_preinit()
+	 * visible even if we access the workq right after it is created but
+	 * on a different cpu. Otherwise, workq_corrupted() would not work
+	 * outside a lock.
+	 */
+	memory_barrier();
+	return NULL == workq || workq->cookie != WORKQ_MAGIC;
+#else
+	return false;
+#endif
+}
+
+/** Returns true if the work_item is definitely corrupted; false if not sure. 
+ * 
+ * Must be used with the work queue protecting spinlock locked.
+ */
+static bool work_item_corrupted(work_t *work_item)
+{
+#ifdef CONFIG_DEBUG
+	return NULL == work_item || work_item->cookie != WORK_ITEM_MAGIC;
+#else
+	return false;
+#endif
+}
+
+/** @}
+ */
Index: kernel/generic/src/syscall/syscall.c
===================================================================
--- kernel/generic/src/syscall/syscall.c	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/generic/src/syscall/syscall.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -50,4 +50,5 @@
 #include <synch/futex.h>
 #include <synch/smc.h>
+#include <synch/smp_memory_barrier.h>
 #include <ddi/ddi.h>
 #include <ipc/event.h>
@@ -140,4 +141,5 @@
 	(syshandler_t) sys_futex_wakeup,
 	(syshandler_t) sys_smc_coherence,
+	(syshandler_t) sys_smp_memory_barrier,
 	
 	/* Address space related syscalls. */
Index: kernel/generic/src/time/clock.c
===================================================================
--- kernel/generic/src/time/clock.c	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/generic/src/time/clock.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -212,5 +212,5 @@
 		irq_spinlock_unlock(&THREAD->lock, false);
 		
-		if ((!ticks) && (!PREEMPTION_DISABLED)) {
+		if (ticks == 0 && PREEMPTION_ENABLED) {
 			scheduler();
 #ifdef CONFIG_UDEBUG
Index: kernel/generic/src/udebug/udebug.c
===================================================================
--- kernel/generic/src/udebug/udebug.c	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/generic/src/udebug/udebug.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -44,4 +44,6 @@
 #include <print.h>
 #include <arch.h>
+#include <proc/task.h>
+#include <proc/thread.h>
 
 /** Initialize udebug part of task structure.
Index: kernel/test/cht/cht1.c
===================================================================
--- kernel/test/cht/cht1.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
+++ kernel/test/cht/cht1.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -0,0 +1,573 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <test.h>
+#include <print.h>
+#include <debug.h>
+#include <adt/cht.h>
+#include <synch/rcu.h>
+
+/* Test item stored in the concurrent hash table during the sanity test. */
+typedef struct val {
+	/* Place at the top to simplify re-casting. */
+	cht_link_t link;
+	size_t hash;       /* Always unique_id % 10; verified in val_hash(). */
+	size_t unique_id;  /* Doubles as the lookup key. */
+	bool deleted;      /* Set by val_rm_callback() to catch double removal. */
+	bool mark;         /* Scratch flag used when walking duplicate chains. */
+} val_t;
+
+/** Returns the item's hash; sanity-checks that it equals unique_id % 10. */
+static size_t val_hash(const cht_link_t *item)
+{
+	val_t *v = member_to_inst(item, val_t, link);
+	ASSERT(v->hash == (v->unique_id % 10));
+	return v->hash;
+}
+
+/** Computes the hash of a bare key; keys hash to (key % 10). */
+static size_t val_key_hash(void *key)
+{
+	uintptr_t uid = (uintptr_t)key;
+	return uid % 10;
+}
+
+/** Two table items are equal iff their unique ids match. */
+static bool val_equal(const cht_link_t *item1, const cht_link_t *item2)
+{
+	val_t *a = member_to_inst(item1, val_t, link);
+	val_t *b = member_to_inst(item2, val_t, link);
+	
+	return a->unique_id == b->unique_id;
+}
+
+/** A key matches an item iff it equals the item's unique id. */
+static bool val_key_equal(void *key, const cht_link_t *item2)
+{
+	val_t *val = member_to_inst(item2, val_t, link);
+	return val->unique_id == (uintptr_t)key;
+}
+
+/** Removal callback: asserts single removal, marks the item deleted, frees it. */
+static void val_rm_callback(cht_link_t *item)
+{
+	val_t *v = member_to_inst(item, val_t, link);
+	ASSERT(!v->deleted);
+	v->deleted = true;
+	free(v);
+}
+
+
+/* Hash table operations for the sanity test's val_t items. */
+static cht_ops_t val_ops = {
+	.hash = val_hash,
+	.key_hash = val_key_hash,
+	.equal = val_equal,
+	.key_equal = val_key_equal,
+	.remove_callback = val_rm_callback,
+};
+
+/** Initializes a test item with the given hash and unique id (live, unmarked). */
+static void set_val(val_t *v, size_t h, size_t uid)
+{
+	v->unique_id = uid;
+	v->hash = h;
+	v->mark = false;
+	v->deleted = false;
+}
+
+/*-------------------------------------------------------------------*/
+
+
+/** Exercises basic cht find/insert/remove operations on an empty table.
+ *
+ * Returns NULL on success, or a static error string describing the first
+ * failed check. Must be called from within an rcu read section (see
+ * sanity_test()); the caller owns and later destroys the table.
+ */
+static const char * do_sanity_test(cht_t *h)
+{
+	if (cht_find_lazy(h, (void*)0))
+		return "Found lazy in empty table.";
+	
+	if (cht_find(h, (void*)0))
+		return "Found in empty table.";
+	
+	if (cht_remove_key(h, (void*)0))
+		return "Removed from empty table.";
+	
+	const int val_cnt = 6;
+	val_t *v[6] = { NULL };
+	
+	/* NOTE(review): flags == 0 -- presumably may block but not fail; confirm. */
+	for (int i = 0; i < val_cnt; ++i)
+		v[i] = malloc(sizeof(val_t), 0);
+	
+	size_t key[] = { 1, 1, 1, 11, 12, 13 };
+	
+	/* First three are identical */
+	for (int i = 0; i < 3; ++i)
+		set_val(v[i], 1, key[i]);
+	
+	/* Same hash, different key.*/
+	set_val(v[3], 1, key[3]);
+	
+	/* Different hashes and keys. */
+	set_val(v[4], 2, key[4]);
+	set_val(v[5], 3, key[5]);
+	
+	cht_link_t *dup;
+			
+	if (!cht_insert_unique(h, &v[0]->link, &dup))
+		return "Duplicates in empty";
+
+	if (cht_insert_unique(h, &v[1]->link, &dup))
+		return "Inserted a duplicate";
+	
+	if (dup != &v[0]->link)
+		return "Returned wrong duplicate";
+
+	if (!cht_insert_unique(h, &v[3]->link, &dup))
+		return "Refused non-equal item but with a hash in table.";
+	
+	cht_insert(h, &v[1]->link);
+	cht_insert(h, &v[2]->link);
+	
+	bool ok = true;
+	ok = ok && cht_insert_unique(h, &v[4]->link, &dup);
+	ok = ok && cht_insert_unique(h, &v[5]->link, &dup);
+	
+	if (!ok)
+		return "Refused unique ins 4, 5.";
+	
+	if (cht_find(h, (void*)0))
+		return "Phantom find.";
+	
+	cht_link_t *item = cht_find(h, (void*)v[5]->unique_id);
+	if (!item || item != &v[5]->link)
+		return "Missing 5.";
+
+	item = cht_find_next(h, &v[5]->link);
+	if (item)
+		return "Found nonexisting duplicate 5";
+	
+	item = cht_find(h, (void*)v[3]->unique_id);
+	if (!item || item != &v[3]->link)
+		return "Missing 3.";
+
+	item = cht_find_next(h, &v[3]->link);
+	if (item)
+		return "Found nonexisting duplicate 3, same hash as others.";
+	
+	/* Walk all three duplicates of key[0], marking each exactly once. */
+	item = cht_find(h, (void*)v[0]->unique_id);
+	((val_t*)item)->mark = true;
+	
+	for (int k = 1; k < 3; ++k) {
+		item = cht_find_next(h, item);
+		if (!item)
+			return "Did not find an inserted duplicate";
+		
+		val_t *val = ((val_t*)item);
+		
+		if (val->unique_id != v[0]->unique_id)
+			return "Found item with a different key.";
+		if (val->mark) 
+			return "Found twice the same node.";
+		val->mark = true;
+	}
+	
+	for (int i = 0; i < 3; ++i) {
+		if (!v[i]->mark) 
+			return "Did not find all duplicates";
+		
+		v[i]->mark = false;
+	}
+
+	if (cht_find_next(h, item))
+		return "Found non-existing duplicate.";
+
+	/* Remove the middle duplicate; the two unmarked ones must remain. */
+	item = cht_find_next(h, cht_find(h, (void*)key[0]));
+	
+	((val_t*)item)->mark = true;
+	if (!cht_remove_item(h, item))
+		return "Failed to remove inserted item";
+	
+	item = cht_find(h, (void*)key[0]);
+	if (!item || ((val_t*)item)->mark)
+		return "Did not find proper item.";
+	
+	item = cht_find_next(h, item);
+	if (!item || ((val_t*)item)->mark)
+		return "Did not find proper duplicate.";
+
+	item = cht_find_next(h, item);
+	if (item)
+		return "Found removed duplicate";
+	
+	if (2 != cht_remove_key(h, (void*)key[0]))
+		return "Failed to remove all duplicates";
+	
+	if (cht_find(h, (void*)key[0]))
+		return "Found removed key";
+	
+	if (!cht_find(h, (void*)key[3]))
+		return "Removed incorrect key";
+	
+	/* Empty the table; removed items are freed via val_rm_callback(). */
+	for (size_t k = 0; k < sizeof(v) / sizeof(v[0]); ++k) {
+		cht_remove_key(h, (void*)key[k]);
+	}
+	
+	for (size_t k = 0; k < sizeof(v) / sizeof(v[0]); ++k) {
+		if (cht_find(h, (void*)key[k]))
+			return "Found a key in a cleared table";
+	}
+
+	return NULL;
+}
+
+/** Creates a table, runs do_sanity_test() under an rcu read section, cleans up. */
+static const char * sanity_test(void)
+{
+	cht_t h;
+	
+	if (!cht_create_simple(&h, &val_ops))
+		return "Could not create the table.";
+	
+	/* Table readers must run inside an rcu read-side critical section. */
+	rcu_read_lock();
+	const char *err = do_sanity_test(&h);
+	rcu_read_unlock();
+	
+	cht_destroy(&h);
+	return err;
+}
+
+/*-------------------------------------------------------------------*/
+
+/** Linear congruential PRNG (classic rand()-style constants); 31-bit result. */
+static size_t next_rand(size_t seed)
+{
+	return (seed * 1103515245 + 12345) & ((1U << 31) - 1);
+}
+
+/*-------------------------------------------------------------------*/
+/* Item inserted into the table by the stress threads. */
+typedef struct {
+	cht_link_t link;
+	size_t key;     /* (index << 8) | owning thread id; see do_stress(). */
+	bool free;      /* If true, stress_rm_callback() frees the item. */
+	bool inserted;  /* Tracked only by the single owning op_stresser thread. */
+	bool deleted;
+} stress_t;
+
+/* Per-thread work description shared with op_stresser()/resize_stresser(). */
+typedef struct {
+	cht_t *h;
+	int *stop;          /* Shared stop flag; set by resize_stresser(). */
+	stress_t *elem;     /* This thread's private slice of table items. */
+	size_t elem_cnt;
+	size_t upd_prob;    /* Percent chance an iteration updates vs looks up. */
+	size_t wave_cnt;
+	size_t wave_elems;
+	size_t id;
+	bool failed;
+} stress_work_t;
+
+/** Item hash drops the low byte (the owner's id), so same-index items collide. */
+static size_t stress_hash(const cht_link_t *item)
+{
+	return ((stress_t*)item)->key >> 8;
+}
+/** Key hash; must mirror stress_hash(). */
+static size_t stress_key_hash(void *key)
+{
+	return ((size_t)key) >> 8;
+}
+/** Full-key equality (includes the owner id in the low byte). */
+static bool stress_equal(const cht_link_t *item1, const cht_link_t *item2)
+{
+	return ((stress_t*)item1)->key == ((stress_t*)item2)->key;
+}
+/** Full-key equality against a bare key. */
+static bool stress_key_equal(void *key, const cht_link_t *item)
+{
+	return ((size_t)key) == ((stress_t*)item)->key;
+}
+/** Frees heap items; statically allocated items are only flagged deleted. */
+static void stress_rm_callback(cht_link_t *item)
+{
+	if (((stress_t*)item)->free)
+		free(item);
+	else
+		((stress_t*)item)->deleted = true;
+}
+
+/* Hash table operations shared by all stress test threads.
+ * static: file-private, consistent with val_ops above (was leaking into the
+ * kernel-wide namespace); also drops the stray trailing whitespace.
+ */
+static cht_ops_t stress_ops = {
+	.hash = stress_hash,
+	.key_hash = stress_key_hash,
+	.equal = stress_equal,
+	.key_equal = stress_key_equal,
+	.remove_callback = stress_rm_callback
+};
+
+/** Stress thread: inserts/removes large waves of items to force table resizes.
+ *
+ * On error, control deliberately falls through from the failed: label into
+ * the out_of_mem: cleanup below -- both paths request all threads to stop
+ * and remove whatever this thread inserted.
+ */
+static void resize_stresser(void *arg)
+{
+	stress_work_t *work = (stress_work_t *)arg;
+
+	for (size_t k = 0; k < work->wave_cnt; ++k) {
+		TPRINTF("I{");
+		for (size_t i = 0; i < work->wave_elems; ++i) {
+			stress_t *s = malloc(sizeof(stress_t), FRAME_ATOMIC);
+			if (!s) {
+				TPRINTF("[out-of-mem]\n");
+				goto out_of_mem;				
+			}
+			
+			s->free = true;
+			s->key = (i << 8) + work->id;
+			
+			cht_insert(work->h, &s->link);
+		}
+		TPRINTF("}");
+		
+		thread_sleep(2);
+
+		TPRINTF("R<");
+		for (size_t i = 0; i < work->wave_elems; ++i) {
+			size_t key = (i << 8) + work->id;
+			
+			if (1 != cht_remove_key(work->h, (void*)key)) {
+				TPRINTF("Err: Failed to remove inserted item\n");
+				goto failed;
+			}
+		}
+		TPRINTF(">");
+	}
+	
+	/* Request that others stop. */
+	*work->stop = 1;
+	return;
+
+failed:
+	work->failed = true;
+
+out_of_mem:
+	/* Request that others stop. */
+	*work->stop = 1;
+
+	/* Remove anything we may have inserted. */
+	for (size_t i = 0; i < work->wave_elems; ++i) {
+		size_t key = (i << 8) + work->id;
+		cht_remove_key(work->h, (void*)key);
+	}
+}
+
+/** Stress thread: randomly inserts/removes/looks up its private items.
+ *
+ * Each thread owns a disjoint elem[] slice (disjoint keys), so the
+ * inserted/deleted flags need no locking. Runs until *work->stop is set
+ * by a resize_stresser() or until this thread records a failure.
+ */
+static void op_stresser(void *arg)
+{
+	stress_work_t *work = (stress_work_t *)arg;
+	ASSERT(0 == *work->stop);
+	
+	size_t loops = 0;
+	size_t seed = work->id;
+		
+	while (0 == *work->stop && !work->failed) {
+		seed = next_rand(seed);
+		bool upd = ((seed % 100) <= work->upd_prob);
+		seed = next_rand(seed);
+		size_t elem_idx = seed % work->elem_cnt;
+		
+		++loops;
+		if (0 == loops % (1024 * 1024)) {
+			/* Make the most current work->stop visible. */
+			read_barrier();
+			TPRINTF("*");
+		}
+			
+		if (upd) {
+			seed = next_rand(seed);
+			bool item_op = seed & 1;
+			
+			if (work->elem[elem_idx].inserted) {
+				if (item_op) {
+					rcu_read_lock();
+					cht_remove_item(work->h, &work->elem[elem_idx].link);
+					rcu_read_unlock();
+				} else {
+					void *key = (void*)work->elem[elem_idx].key;
+					if (1 != cht_remove_key(work->h, key)) {
+						TPRINTF("Err: did not rm the key\n");
+						work->failed = true;
+					}
+				}
+				work->elem[elem_idx].inserted = false;
+			} else if (work->elem[elem_idx].deleted) {
+				work->elem[elem_idx].deleted = false;
+				
+				if (item_op) {
+					rcu_read_lock();
+					cht_link_t *dup;
+					if (!cht_insert_unique(work->h, &work->elem[elem_idx].link, 
+						&dup)) {
+						TPRINTF("Err: already inserted\n");
+						work->failed = true;
+					}
+					rcu_read_unlock();
+				} else {
+					cht_insert(work->h, &work->elem[elem_idx].link);
+				}
+				
+				work->elem[elem_idx].inserted = true;
+			}
+		} else {
+			/* Plain lookup; result must agree with our private bookkeeping. */
+			rcu_read_lock();
+			cht_link_t *item = 
+				cht_find(work->h, (void*)work->elem[elem_idx].key);
+			rcu_read_unlock();
+
+			if (item) {
+				if (!work->elem[elem_idx].inserted) {
+					TPRINTF("Err: found but not inserted!");
+					work->failed = true;
+				}
+				if (item != &work->elem[elem_idx].link) {
+					TPRINTF("Err: found but incorrect item\n");
+					work->failed = true;
+				}
+			} else {
+				if (work->elem[elem_idx].inserted) {
+					TPRINTF("Err: inserted but not found!");
+					work->failed = true;
+				}
+			}
+		}
+	}
+
+
+	/* Remove anything we may have inserted. */
+	for (size_t i = 0; i < work->elem_cnt; ++i) {
+		void *key = (void*) work->elem[i].key;
+		cht_remove_key(work->h, key);
+	}
+}
+
+/** Runs op_stresser and resize_stresser threads over one shared table.
+ *
+ * Returns true iff no thread reported a failure. All per-thread state lives
+ * in one allocation laid out as [work items][table items][stop flag].
+ */
+static bool do_stress(void)
+{
+	cht_t h;
+	
+	if (!cht_create_simple(&h, &stress_ops)) {
+		TPRINTF("Failed to create the table\n");
+		return false;
+	}
+
+	const size_t wave_cnt = 10;
+	const size_t max_thread_cnt = 8;
+	const size_t resize_thread_cnt = 2;
+	size_t op_thread_cnt = min(max_thread_cnt, 2 * config.cpu_active);
+	size_t total_thr_cnt = op_thread_cnt + resize_thread_cnt;
+	size_t items_per_thread = 1024;
+	
+	size_t work_cnt = op_thread_cnt + resize_thread_cnt;
+	size_t item_cnt = op_thread_cnt * items_per_thread;
+	
+	/* Alloc hash table items. */
+	size_t size = item_cnt * sizeof(stress_t) + work_cnt * sizeof(stress_work_t)
+		+ sizeof(int);
+		
+	TPRINTF("Alloc and init table items. \n");
+	void *p = malloc(size, FRAME_ATOMIC);
+	if (!p) {
+		TPRINTF("Failed to alloc items\n");
+		cht_destroy(&h);
+		return false;
+	}
+	
+	/* Carve the single allocation into its three regions. */
+	stress_t *pitem = p + work_cnt * sizeof(stress_work_t);
+	stress_work_t *pwork = p;
+	int *pstop = (int*)(pitem + item_cnt);
+	
+	*pstop = 0;
+	
+	/* Init work items. */
+	for (size_t i = 0; i < op_thread_cnt; ++i) {
+		pwork[i].h = &h;
+		pwork[i].stop = pstop;
+		pwork[i].elem = &pitem[i * items_per_thread];
+		pwork[i].upd_prob = (i + 1) * 100 / op_thread_cnt;
+		pwork[i].id = i;
+		pwork[i].elem_cnt = items_per_thread;
+		pwork[i].failed = false;
+	}
+	
+	for (size_t i = op_thread_cnt; i < op_thread_cnt + resize_thread_cnt; ++i) {
+		pwork[i].h = &h;
+		pwork[i].stop = pstop;
+		pwork[i].wave_cnt = wave_cnt;
+		pwork[i].wave_elems = item_cnt * 4;
+		pwork[i].id = i;
+		pwork[i].failed = false;
+	}
+	
+	/* Init table elements. */
+	for (size_t k = 0; k < op_thread_cnt; ++k) {
+		for (size_t i = 0; i < items_per_thread; ++i) {
+			pwork[k].elem[i].key = (i << 8) + k;
+			pwork[k].elem[i].free = false;
+			pwork[k].elem[i].inserted = false;
+			pwork[k].elem[i].deleted = true;
+		}
+	}
+	
+	TPRINTF("Running %zu ins/del/find stress threads + %zu resizers.\n",
+		op_thread_cnt, resize_thread_cnt);
+	
+	/* Create and run threads. */
+	thread_t *thr[max_thread_cnt + resize_thread_cnt];
+	
+	for (size_t i = 0; i < total_thr_cnt; ++i) {
+		if (i < op_thread_cnt)
+			thr[i] = thread_create(op_stresser, &pwork[i], TASK, 0, "cht-op-stress");
+		else 
+			thr[i] = thread_create(resize_stresser, &pwork[i], TASK, 0, "cht-resize");
+		
+		ASSERT(thr[i]);
+		thread_wire(thr[i], &cpus[i % config.cpu_active]);
+		thread_ready(thr[i]);
+	}
+	
+	bool failed = false;
+	
+	/* Wait for all threads to return. */
+	TPRINTF("Joining resize stressers.\n");
+	for (size_t i = op_thread_cnt; i < total_thr_cnt; ++i) {
+		thread_join(thr[i]);
+		thread_detach(thr[i]);
+		failed = pwork[i].failed || failed;
+	}
+	
+	TPRINTF("Joining op stressers.\n");
+	for (int i = (int)op_thread_cnt - 1; i >= 0; --i) {
+		TPRINTF("%d threads remain\n", i);
+		thread_join(thr[i]);
+		thread_detach(thr[i]);
+		failed = pwork[i].failed || failed;
+	}
+	
+	cht_destroy(&h);
+	free(p);
+
+	return !failed;
+}
+
+/*-------------------------------------------------------------------*/
+
+
+/** Test entry point: runs the sanity test, then the concurrent stress test. */
+const char *test_cht1(void)
+{
+	const char *err = sanity_test();
+	
+	if (err != NULL)
+		return err;
+	
+	printf("Basic sanity test: ok.\n");
+	
+	return do_stress() ? NULL : "CHT stress test failed.";
+}
Index: kernel/test/cht/cht1.def
===================================================================
--- kernel/test/cht/cht1.def	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
+++ kernel/test/cht/cht1.def	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -0,0 +1,6 @@
+{
+	"cht",
+	"Concurrent hash table test",
+	&test_cht1,
+	true
+},
Index: kernel/test/smpcall/smpcall1.c
===================================================================
--- kernel/test/smpcall/smpcall1.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
+++ kernel/test/smpcall/smpcall1.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -0,0 +1,158 @@
+/*
+ */
+
+#include <print.h>
+#include <debug.h>
+
+#include <test.h>
+#include <smp/smp_call.h>
+#include <cpu.h>
+#include <macros.h>
+#include <config.h>
+#include <arch.h>
+#include <proc/thread.h>
+
+/* 
+ * Maximum total number of smp_calls in the system is: 
+ *  162000 == 9^2 * 1000 * 2 
+ *  == MAX_CPUS^2 * ITERATIONS * EACH_CPU_INC_PER_ITER
+ */
+#define MAX_CPUS   9
+#define ITERATIONS 1000
+#define EACH_CPU_INC_PER_ITER 2
+
+
+/** smp_call handler: bumps the counter it is passed; interrupts are off. */
+static void inc(void *p)
+{
+	ASSERT(interrupts_disabled());
+
+	size_t *pcall_cnt = (size_t*)p;
+	/* 
+	 * No synchronization. Tests if smp_calls makes changes 
+	 * visible to the caller. 
+	 */
+	++*pcall_cnt;
+}
+
+
+/** Test thread: each iteration issues one sync and one async smp_call
+ * to every active cpu, accumulating acknowledgements into *p.
+ */
+static void test_thread(void *p)
+{
+	size_t *pcall_cnt = (size_t*)p;
+	smp_call_t call_info[MAX_CPUS];
+	
+	unsigned int cpu_count = min(config.cpu_active, MAX_CPUS);
+	
+	for (int iter = 0; iter < ITERATIONS; ++iter) {
+		/* Synchronous version. */
+		for (unsigned cpu_id = 0; cpu_id < cpu_count; ++cpu_id) {
+			/* 
+			 * smp_call should make changes by inc() visible on this cpu. 
+			 * As a result we can pass it our pcall_cnt and not worry 
+			 * about other synchronization.
+			 */
+			smp_call(cpu_id, inc, pcall_cnt);
+		}
+		
+		/* 
+		 * Async calls run in parallel on different cpus, so passing the 
+		 * same counter would clobber it without additional synchronization.
+		 */
+		size_t local_cnt[MAX_CPUS] = {0};
+		
+		/* Now start asynchronous calls. */
+		for (unsigned cpu_id = 0; cpu_id < cpu_count; ++cpu_id) {
+			smp_call_async(cpu_id, inc, &local_cnt[cpu_id], &call_info[cpu_id]);
+		}
+		
+		/* And wait for all async calls to complete. */
+		for (unsigned cpu_id = 0; cpu_id < cpu_count; ++cpu_id) {
+			smp_call_wait(&call_info[cpu_id]);
+			*pcall_cnt += local_cnt[cpu_id];
+		}
+
+		/* Give other threads a chance to run. */
+		thread_usleep(10000);
+	}
+}
+
+/** Expected number of acknowledged calls for @a thread_cnt running threads. */
+static size_t calc_exp_calls(size_t thread_cnt)
+{
+	return EACH_CPU_INC_PER_ITER * ITERATIONS * thread_cnt;
+}
+
+/** Test entry point: spawns one wired test_thread() per cpu and verifies
+ * that every issued smp_call was acknowledged exactly once.
+ */
+const char *test_smpcall1(void)
+{
+	/* Number of received calls that were sent by cpu[i]. */
+	size_t call_cnt[MAX_CPUS] = {0};
+	thread_t *thread[MAX_CPUS] = { NULL };
+	
+	unsigned int cpu_count = min(config.cpu_active, MAX_CPUS);
+	size_t running_thread_cnt = 0;
+
+	TPRINTF("Spawning threads on %u cpus.\n", cpu_count);
+	
+	/* Create a wired thread on each cpu. */
+	for (unsigned int id = 0; id < cpu_count; ++id) {
+		thread[id] = thread_create(test_thread, &call_cnt[id], TASK, 
+			THREAD_FLAG_NONE, "smp-call-test");
+		
+		if (thread[id]) {
+			thread_wire(thread[id], &cpus[id]);
+			++running_thread_cnt;
+		} else {
+			TPRINTF("Failed to create thread on cpu%u.\n", id);
+		}
+	}
+
+	/* 
+	 * NOTE(review): per-thread expected count assumes running_thread_cnt ==
+	 * cpu_count; each thread actually issues cpu_count * 2 calls per
+	 * iteration, so the check below misfires if any thread_create() failed.
+	 */
+	size_t exp_calls = calc_exp_calls(running_thread_cnt);
+	size_t exp_calls_sum = exp_calls * cpu_count;
+	
+	TPRINTF("Running %zu wired threads. Expecting %zu calls. Be patient.\n", 
+		running_thread_cnt, exp_calls_sum);
+
+	for (unsigned int i = 0; i < cpu_count; ++i) {
+		if (thread[i] != NULL) {
+			thread_ready(thread[i]);
+		}
+	}
+	
+	/* Wait for threads to complete. */
+	for (unsigned int i = 0; i < cpu_count; ++i) {
+		if (thread[i] != NULL) {
+			thread_join(thread[i]);
+			thread_detach(thread[i]);
+		}
+	}
+
+	TPRINTF("Threads finished. Checking number of smp_call()s.\n");
+	
+	bool ok = true;
+	size_t calls_sum = 0;
+	
+	for (size_t i = 0; i < cpu_count; ++i) {
+		if (thread[i] != NULL) {
+			if (call_cnt[i] != exp_calls) {
+				ok = false;
+				TPRINTF("Error: %zu instead of %zu cpu%zu's calls were"
+					" acknowledged.\n", call_cnt[i], exp_calls, i);
+			} 
+		}
+		
+		calls_sum += call_cnt[i];
+	}
+	
+	if (calls_sum != exp_calls_sum) {
+		TPRINTF("Error: total acknowledged sum: %zu instead of %zu.\n",
+			calls_sum, exp_calls_sum);
+		
+		ok = false;
+	}
+	
+	if (ok) {
+		TPRINTF("Success: number of received smp_calls is as expected (%zu).\n",
+			exp_calls_sum);
+		return NULL;
+	} else
+		return "Failed: incorrect acknowledged smp_calls.\n";
+	
+}
Index: kernel/test/smpcall/smpcall1.def
===================================================================
--- kernel/test/smpcall/smpcall1.def	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
+++ kernel/test/smpcall/smpcall1.def	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -0,0 +1,6 @@
+{
+	"smpcall1",
+	"smp_call() test",
+	&test_smpcall1,
+	true
+},
Index: kernel/test/synch/rcu1.c
===================================================================
--- kernel/test/synch/rcu1.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
+++ kernel/test/synch/rcu1.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -0,0 +1,1052 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <test.h>
+#include <arch.h>
+#include <atomic.h>
+#include <print.h>
+#include <proc/thread.h>
+#include <macros.h>
+#include <str.h>
+#include <errno.h>
+#include <time/delay.h>
+
+#include <synch/rcu.h>
+
+
+#define MAX_THREADS 32
+
+static int one_idx = 0;
+static thread_t *thread[MAX_THREADS] = { NULL };
+
+typedef struct {
+	rcu_item_t rcu;
+	bool exited;
+} exited_t;
+
+/* Callback raced with preexisting readers. */
+#define ERACE   123
+/* Waited for too long for the callback to exit; consider it lost. */
+#define ECBLOST 432
+
+/*-------------------------------------------------------------------*/
+/** Polls for up to @a secs seconds for p->exited to become true.
+ *
+ * Sets *presult to ECBLOST if the callback did not run in time.
+ * (Replaces the stale "4 secs max" comment -- the limit is the parameter.)
+ */
+static void wait_for_cb_exit(size_t secs, exited_t *p, int *presult)
+{
+	const size_t sleep_ms = 500;
+	size_t max_loops = (secs * 1000 + sleep_ms - 1) / sleep_ms;
+	size_t loops = 0;
+
+	while (loops < max_loops && !p->exited) {
+		++loops;
+		thread_usleep(sleep_ms * 1000);
+		TPRINTF(".");
+	}
+	
+	if (!p->exited) {
+		*presult = ECBLOST;
+	}
+}
+
+/** Number of test threads to run: four per active cpu, capped at MAX_THREADS. */
+static size_t get_thread_cnt(void)
+{
+	return min(MAX_THREADS, 4 * config.cpu_active);
+}
+
+/** Creates and starts test thread k, wired round-robin to a cpu.
+ *
+ * On thread_create() failure the slot stays NULL; join_all()/join_one()
+ * tolerate NULL entries.
+ */
+static void run_thread(size_t k, void (*func)(void*), void *arg)
+{
+	ASSERT(thread[k] == NULL);
+	
+	thread[k] = thread_create(func, arg, TASK, THREAD_FLAG_NONE, 
+		"test-rcu-thread");
+		
+	if(thread[k]) {
+		/* Distribute evenly. */
+		thread_wire(thread[k], &cpus[k % config.cpu_active]);
+		thread_ready(thread[k]);
+	}
+}
+
+/** Starts get_thread_cnt() copies of @a func and resets the run_one() index. */
+static void run_all(void (*func)(void*))
+{
+	size_t thread_cnt = get_thread_cnt();
+	
+	one_idx = 0;
+	
+	for (size_t i = 0; i < thread_cnt; ++i) {
+		run_thread(i, func, NULL);
+	}
+}
+
+/** Joins and detaches all test threads, printing progress on slow joins. */
+static void join_all(void)
+{
+	size_t thread_cnt = get_thread_cnt();
+	
+	one_idx = 0;
+	
+	for (size_t i = 0; i < thread_cnt; ++i) {
+		if (thread[i]) {
+			bool joined = false;
+			do {
+				int ret = thread_join_timeout(thread[i], 5 * 1000 * 1000, 0);
+				joined = (ret != ESYNCH_TIMEOUT);
+				
+				if (ret == ESYNCH_OK_BLOCKED) {
+					TPRINTF("%zu threads remain\n", thread_cnt - i - 1);
+				}
+			} while (!joined);
+			
+			thread_detach(thread[i]);
+			thread[i] = NULL;
+		}
+	}
+}
+
+/** Starts a single thread in the next free slot (paired with join_one()). */
+static void run_one(void (*func)(void*), void *arg)
+{
+	ASSERT(one_idx < MAX_THREADS);
+	run_thread(one_idx, func, arg);
+	++one_idx;
+}
+
+
+/** Joins the most recently run_one()-started thread (LIFO order). */
+static void join_one(void)
+{
+	ASSERT(0 < one_idx && one_idx <= MAX_THREADS);
+
+	--one_idx;
+	
+	if (thread[one_idx]) {
+		thread_join(thread[one_idx]);
+		thread_detach(thread[one_idx]);
+		thread[one_idx] = NULL;
+	}
+}
+
+/*-------------------------------------------------------------------*/
+
+
+/** Reader thread: enters and exits (size_t)arg empty rcu read sections. */
+static void nop_reader(void *arg)
+{
+	size_t nop_iters = (size_t)arg;
+	
+	TPRINTF("Enter nop-reader\n");
+	
+	for (size_t i = 0; i < nop_iters; ++i) {
+		rcu_read_lock();
+		rcu_read_unlock();
+	}
+	
+	TPRINTF("Exit nop-reader\n");
+}
+
+/** Fills @a seq with @a steps values spread evenly over [from, to].
+ *
+ * The last element is always exactly @a to; steps == 1 yields just { to }.
+ * Fix: the original divided by (steps - 1) unconditionally, which is a
+ * division by zero for steps == 1 (the ASSERT only requires 0 < steps).
+ */
+static void get_seq(size_t from, size_t to, size_t steps, size_t *seq)
+{
+	ASSERT(0 < steps && from <= to && 0 < to);
+	
+	if (1 < steps) {
+		size_t inc = (to - from) / (steps - 1);
+		
+		for (size_t i = 0; i < steps - 1; ++i) {
+			seq[i] = i * inc + from;
+		}
+	}
+	
+	seq[steps - 1] = to;
+}
+
+/** Runs one nop_reader per test thread with varying iteration counts. */
+static bool do_nop_readers(void)
+{
+	size_t seq[MAX_THREADS] = {0};
+	get_seq(100, 100000, get_thread_cnt(), seq);
+	
+	TPRINTF("\nRun %zu thr: repeat empty no-op reader sections\n", get_thread_cnt());
+	
+	for (size_t k = 0; k < get_thread_cnt(); ++k)
+		run_one(nop_reader, (void*)seq[k]);
+	
+	TPRINTF("\nJoining %zu no-op readers\n", get_thread_cnt());
+	join_all();
+	
+	return true;
+}
+
+/*-------------------------------------------------------------------*/
+
+
+
+/** Reader thread: spends (size_t)arg busy-loop iterations inside each
+ * rcu read section, so the reader is likely preempted while in it.
+ */
+static void long_reader(void *arg)
+{
+	const size_t iter_cnt = 100 * 1000 * 1000;
+	size_t nop_iters = (size_t)arg;
+	size_t outer_iters = iter_cnt / nop_iters;
+	
+	TPRINTF("Enter long-reader\n");
+	
+	for (size_t i = 0; i < outer_iters; ++i) {
+		rcu_read_lock();
+		
+		for (volatile size_t k = 0; k < nop_iters; ++k) {
+			/* nop, but increment volatile k */
+		}
+		
+		rcu_read_unlock();
+	}
+	
+	TPRINTF("Exit long-reader\n");
+}
+
+/** Runs one long_reader per test thread with section lengths from 10 to 1M. */
+static bool do_long_readers(void)
+{
+	size_t seq[MAX_THREADS] = {0};
+	get_seq(10, 1000 * 1000, get_thread_cnt(), seq);
+	
+	TPRINTF("\nRun %zu thr: repeat long reader sections, will preempt, no cbs.\n", 
+		get_thread_cnt());
+	
+	for (size_t k = 0; k < get_thread_cnt(); ++k)
+		run_one(long_reader, (void*)seq[k]);
+	
+	TPRINTF("\nJoining %zu readers with long reader sections.\n", get_thread_cnt());
+	join_all();
+	
+	return true;
+}
+
+/*-------------------------------------------------------------------*/
+
+
+/* Total number of count_cb() invocations across all updater threads. */
+static atomic_t nop_callbacks_cnt = {0};
+/* Must be even. */
+static const int nop_updater_iters = 10000;
+
+/** rcu callback: counts its invocation and frees the posted item. */
+static void count_cb(rcu_item_t *item)
+{
+	atomic_inc(&nop_callbacks_cnt);
+	free(item);
+}
+
+/** Updater thread: posts nop_updater_iters count_cb callbacks in pairs. */
+static void nop_updater(void *arg)
+{
+	for (int i = 0; i < nop_updater_iters; i += 2){
+		rcu_item_t *a = malloc(sizeof(rcu_item_t), FRAME_ATOMIC);
+		rcu_item_t *b = malloc(sizeof(rcu_item_t), FRAME_ATOMIC);
+		
+		if (a && b) {
+			rcu_call(a, count_cb);
+			rcu_call(b, count_cb);
+		} else {
+			TPRINTF("[out-of-mem]\n");
+			free(a);
+			free(b);
+			return;
+		}
+	}
+}
+
+/** Posts callbacks from all threads, then waits up to 15 s for all to run. */
+static bool do_nop_callbacks(void)
+{
+	atomic_set(&nop_callbacks_cnt, 0);
+
+	size_t exp_cnt = nop_updater_iters * get_thread_cnt();
+	size_t max_used_mem = sizeof(rcu_item_t) * exp_cnt;
+	
+	TPRINTF("\nRun %zu thr: post %zu no-op callbacks (%zu B used), no readers.\n", 
+		get_thread_cnt(), exp_cnt, max_used_mem);
+	
+	run_all(nop_updater);
+	TPRINTF("\nJoining %zu no-op callback threads\n", get_thread_cnt());
+	join_all();
+	
+	size_t loop_cnt = 0, max_loops = 15;
+
+	while (exp_cnt != atomic_get(&nop_callbacks_cnt) && loop_cnt < max_loops) {
+		++loop_cnt;
+		TPRINTF(".");
+		thread_sleep(1);
+	}
+	
+	return loop_cnt < max_loops;
+}
+
+/*-------------------------------------------------------------------*/
+
+/* Item carrying a cookie so the callback can verify it got the right data. */
+typedef struct {
+	rcu_item_t rcu_item;
+	int cookie;
+} item_w_cookie_t;
+
+/* static added: file-private constant had external linkage (inconsistent
+ * with the surrounding static file-scope definitions). */
+static const int magic_cookie = 0x01234567;
+/* NOTE(review): plain int flag polled cross-thread; presumably the rcu
+ * grace period provides the needed visibility -- confirm. */
+static int one_cb_is_done = 0;
+
+/** rcu callback: checks the cookie, signals completion, frees the item. */
+static void one_cb_done(rcu_item_t *item)
+{
+	ASSERT( ((item_w_cookie_t *)item)->cookie == magic_cookie);
+	one_cb_is_done = 1;
+	TPRINTF("Callback()\n");
+	free(item);
+}
+
+/** Reader thread: posts one callback from inside a long read section. */
+static void one_cb_reader(void *arg)
+{
+	TPRINTF("Enter one-cb-reader\n");
+	
+	rcu_read_lock();
+	
+	item_w_cookie_t *item = malloc(sizeof(item_w_cookie_t), FRAME_ATOMIC);
+	
+	if (item) {
+		item->cookie = magic_cookie;
+		rcu_call(&item->rcu_item, one_cb_done);
+	} else {
+		TPRINTF("\n[out-of-mem]\n");
+	}
+	
+	thread_sleep(1);
+	
+	rcu_read_unlock();
+	
+	TPRINTF("Exit one-cb-reader\n");
+}
+
+/** Runs one_cb_reader and waits up to 200 ms for its callback to fire. */
+static bool do_one_cb(void)
+{
+	one_cb_is_done = 0;
+	
+	TPRINTF("\nRun a single reader that posts one callback.\n");
+	run_one(one_cb_reader, NULL);
+	join_one();
+	
+	TPRINTF("\nJoined one-cb reader, wait for callback.\n");
+	size_t loop_cnt = 0;
+	size_t max_loops = 4; /* 200 ms total */
+	
+	while (!one_cb_is_done && loop_cnt < max_loops) {
+		thread_usleep(50 * 1000);
+		++loop_cnt;
+	}
+	
+	return one_cb_is_done;
+}
+
+/*-------------------------------------------------------------------*/
+
+/* Per-thread workload: how many reads and updates to do per iteration. */
+typedef struct {
+	size_t update_cnt;
+	size_t read_cnt;
+	size_t iters;
+} seq_work_t;
+
+/* Callback item stamped with the logical time its update started. */
+typedef struct {
+	rcu_item_t rcu;
+	atomic_count_t start_time;
+} seq_item_t;
+
+
+static int seq_test_result = EOK;
+
+/* Logical clock; ticked by both readers and updaters. */
+static atomic_t cur_time = {1};
+/* Latest start_time of any completed callback; updated racily (see seq_cb). */
+static atomic_count_t max_upd_done_time = {0};
+
+/** rcu callback: records (racily, conservatively) its update's start time. */
+static void seq_cb(rcu_item_t *rcu_item)
+{
+	seq_item_t *item = member_to_inst(rcu_item, seq_item_t, rcu);
+	
+	/* Racy but errs to the conservative side, so it is ok. */
+	if (max_upd_done_time < item->start_time) {
+		max_upd_done_time = item->start_time;
+		
+		/* Make updated time visible */
+		memory_barrier();
+	}
+
+	free(item);
+}
+
+/** Thread body: alternates reader and updater roles; a callback completing
+ * before a concurrently running older read section flags ERACE.
+ */
+static void seq_func(void *arg)
+{
+	seq_work_t *work = (seq_work_t*)arg;
+	
+	/* Alternate between reader and updater roles. */
+	for (size_t k = 0; k < work->iters; ++k) {
+		/* Reader */
+		for (size_t i = 0; i < work->read_cnt; ++i) {
+			rcu_read_lock();
+			atomic_count_t start_time = atomic_postinc(&cur_time);
+			
+			for (volatile size_t d = 0; d < 10 * i; ++d ){
+				/* no-op */
+			}
+			
+			/* Get most recent max_upd_done_time. */
+			memory_barrier();
+			
+			/* A callback posted after we entered this read section must
+			 * not have completed while we are still inside it. */
+			if (start_time < max_upd_done_time) {
+				seq_test_result = ERACE;
+			}
+			
+			rcu_read_unlock();
+			
+			if (seq_test_result != EOK) 
+				return;
+		}
+		
+		/* Updater */
+		for (size_t i = 0; i < work->update_cnt; ++i) {
+			seq_item_t *a = malloc(sizeof(seq_item_t), FRAME_ATOMIC);
+			seq_item_t *b = malloc(sizeof(seq_item_t), FRAME_ATOMIC);
+			
+			if (a && b) {
+				a->start_time = atomic_postinc(&cur_time);
+				rcu_call(&a->rcu, seq_cb);
+				
+				b->start_time = atomic_postinc(&cur_time);
+				rcu_call(&b->rcu, seq_cb);
+			} else {
+				TPRINTF("\n[out-of-mem]\n");
+				seq_test_result = ENOMEM;
+				free(a);
+				free(b);
+				return;
+			}
+		}
+		
+	} 
+}
+
+static bool do_seq_check(void)
+{
+	seq_test_result = EOK;
+	max_upd_done_time = 0;
+	atomic_set(&cur_time, 1);
+
+	const size_t iters = 100;
+	const size_t total_cnt = 1000;
+	size_t read_cnt[MAX_THREADS] = {0};
+	seq_work_t item[MAX_THREADS];
+	
+	size_t total_cbs = 0;
+	size_t max_used_mem = 0;
+	
+	get_seq(0, total_cnt, get_thread_cnt(), read_cnt);
+	
+
+	for (size_t i = 0; i < get_thread_cnt(); ++i) {
+		item[i].update_cnt = total_cnt - read_cnt[i];
+		item[i].read_cnt = read_cnt[i];
+		item[i].iters = iters;
+		
+		total_cbs += 2 * iters * item[i].update_cnt;
+	}
+	
+	max_used_mem = total_cbs * sizeof(seq_item_t);
+
+	const char *mem_suffix;
+	uint64_t mem_units;
+	bin_order_suffix(max_used_mem, &mem_units, &mem_suffix, false);
+	
+	TPRINTF("\nRun %zu th: check callback completion time in readers. "
+		"%zu callbacks total (max %" PRIu64 " %s used). Be patient.\n", 
+		get_thread_cnt(), total_cbs, mem_units, mem_suffix);
+	
+	for (size_t i = 0; i < get_thread_cnt(); ++i) {
+		run_one(seq_func, &item[i]);
+	}
+	
+	TPRINTF("\nJoining %zu seq-threads\n", get_thread_cnt());
+	join_all();
+	
+	if (seq_test_result == ENOMEM) {
+		TPRINTF("\nErr: out-of mem\n");
+	} else if (seq_test_result == ERACE) {
+		TPRINTF("\nERROR: race detected!!\n");
+	} 
+	
+	return seq_test_result == EOK;
+}
+
+/*-------------------------------------------------------------------*/
+
+
+static void reader_unlocked(rcu_item_t *item)
+{
+	exited_t *p = (exited_t*)item;
+	p->exited = true;
+}
+
+static void reader_exit(void *arg)
+{
+	rcu_read_lock();
+	rcu_read_lock();
+	rcu_read_lock();
+	rcu_read_unlock();
+	
+	rcu_call((rcu_item_t*)arg, reader_unlocked);
+	
+	rcu_read_lock();
+	rcu_read_lock();
+	
+	/* Exit without unlocking the rcu reader section. */
+}
+
+static bool do_reader_exit(void)
+{
+	TPRINTF("\nReader exits thread with rcu_lock\n");
+	
+	exited_t *p = malloc(sizeof(exited_t), FRAME_ATOMIC);
+	if (!p) {
+		TPRINTF("[out-of-mem]\n");
+		return false;
+	}
+		
+	p->exited = false;
+	
+	run_one(reader_exit, p);	
+	join_one();
+	
+	int result = EOK;
+	wait_for_cb_exit(2 /* secs */, p, &result);
+	
+	if (result != EOK) {
+		TPRINTF("Err: RCU locked up after exiting from within a reader\n");
+		/* Leak the mem. */
+	} else {
+		free(p);
+	}
+	
+	return result == EOK;
+}
+
+/*-------------------------------------------------------------------*/
+
+/*-------------------------------------------------------------------*/
+
+typedef struct preempt_struct {
+	exited_t e;
+	int result;
+} preempt_t;
+
+
+static void preempted_unlocked(rcu_item_t *item)
+{
+	preempt_t *p = member_to_inst(item, preempt_t, e.rcu);
+	p->e.exited = true;
+	TPRINTF("Callback().\n");
+}
+
+static void preempted_reader_prev(void *arg)
+{
+	preempt_t *p = (preempt_t*)arg;
+	ASSERT(!p->e.exited);
+
+	TPRINTF("reader_prev{ ");
+	
+	rcu_read_lock();
+	scheduler();
+	rcu_read_unlock();
+
+	/* 
+	 * Start GP after exiting reader section w/ preemption. 
+	 * Just check that the callback does not lock up and is not lost.
+	 */
+	rcu_call(&p->e.rcu, preempted_unlocked);
+
+	TPRINTF("}reader_prev\n");
+}
+
+static void preempted_reader_inside_cur(void *arg)
+{
+	preempt_t *p = (preempt_t*)arg;
+	ASSERT(!p->e.exited);
+	
+	TPRINTF("reader_inside_cur{ ");
+	/* 
+	 * Start a GP and try to finish the reader before 
+	 * the GP ends (including preemption). 
+	 */
+	rcu_call(&p->e.rcu, preempted_unlocked);
+
+	/* Give RCU threads a chance to start up. */
+	scheduler();
+	scheduler();
+
+	rcu_read_lock();
+	/* Come back as soon as possible to complete before GP ends. */
+	thread_usleep(2);
+	rcu_read_unlock();
+
+	TPRINTF("}reader_inside_cur\n");
+}
+
+
+static void preempted_reader_cur(void *arg)
+{
+	preempt_t *p = (preempt_t*)arg;
+	ASSERT(!p->e.exited);
+	
+	TPRINTF("reader_cur{ ");
+	rcu_read_lock();
+
+	/* Start GP. */
+	rcu_call(&p->e.rcu, preempted_unlocked);
+
+	/* Preempt while cur GP detection is running */
+	thread_sleep(1);
+	
+	/* Err: exited before this reader completed. */
+	if (p->e.exited)
+		p->result = ERACE;
+
+	rcu_read_unlock();
+	TPRINTF("}reader_cur\n");
+}
+
+static void preempted_reader_next1(void *arg)
+{
+	preempt_t *p = (preempt_t*)arg;
+	ASSERT(!p->e.exited);
+	
+	TPRINTF("reader_next1{ ");
+	rcu_read_lock();
+
+	/* Preempt before cur GP detection starts. */
+	scheduler();
+	
+	/* Start GP. */
+	rcu_call(&p->e.rcu, preempted_unlocked);
+
+	/* Err: exited before this reader completed. */
+	if (p->e.exited)
+		p->result = ERACE;
+
+	rcu_read_unlock();
+	TPRINTF("}reader_next1\n");
+}
+
+static void preempted_reader_next2(void *arg)
+{
+	preempt_t *p = (preempt_t*)arg;
+	ASSERT(!p->e.exited);
+	
+	TPRINTF("reader_next2{ ");
+	rcu_read_lock();
+
+	/* Preempt before cur GP detection starts. */
+	scheduler();
+	
+	/* Start GP. */
+	rcu_call(&p->e.rcu, preempted_unlocked);
+
+	/* 
+	 * Preempt twice while GP is running after we've been known 
+	 * to hold up the GP just to make sure multiple preemptions
+	 * are properly tracked if a reader is delaying the cur GP.
+	 */
+	thread_sleep(1);
+	thread_sleep(1);
+
+	/* Err: exited before this reader completed. */
+	if (p->e.exited)
+		p->result = ERACE;
+
+	rcu_read_unlock();
+	TPRINTF("}reader_next2\n");
+}
+
+
+static bool do_one_reader_preempt(void (*f)(void*), const char *err)
+{
+	preempt_t *p = malloc(sizeof(preempt_t), FRAME_ATOMIC);
+	if (!p) {
+		TPRINTF("[out-of-mem]\n");
+		return false;
+	}
+	
+	p->e.exited = false;
+	p->result = EOK;
+	
+	run_one(f, p);	
+	join_one();
+	
+	/* Wait at most 4 secs. */
+	wait_for_cb_exit(4, &p->e, &p->result);
+	
+	if (p->result == EOK) {
+		free(p);
+		return true;
+	} else {
+		TPRINTF(err);
+		/* Leak a bit of mem. */
+		return false;
+	}
+}
+
+static bool do_reader_preempt(void)
+{
+	TPRINTF("\nReaders will be preempted.\n");
+	
+	bool success = true;
+	bool ok = true;
+	
+	ok = do_one_reader_preempt(preempted_reader_prev, 
+		"Err: preempted_reader_prev()\n");
+	success = success && ok;
+	
+	ok = do_one_reader_preempt(preempted_reader_inside_cur, 
+		"Err: preempted_reader_inside_cur()\n");
+	success = success && ok;
+	
+	ok = do_one_reader_preempt(preempted_reader_cur, 
+		"Err: preempted_reader_cur()\n");
+	success = success && ok;
+	
+	ok = do_one_reader_preempt(preempted_reader_next1, 
+		"Err: preempted_reader_next1()\n");
+	success = success && ok;
+
+	ok = do_one_reader_preempt(preempted_reader_next2, 
+		"Err: preempted_reader_next2()\n");
+	success = success && ok;
+	
+	return success;
+}
+
+/*-------------------------------------------------------------------*/
+typedef struct {
+	bool reader_done;
+	bool reader_running;
+	bool synch_running;
+} synch_t;
+
+static void synch_reader(void *arg)
+{
+	synch_t *synch = (synch_t *) arg;
+	
+	rcu_read_lock();
+
+	/* Order accesses of synch after the reader section begins. */
+	memory_barrier();
+	
+	synch->reader_running = true;
+	
+	while (!synch->synch_running) {
+		/* 0.5 sec */
+		delay(500 * 1000);
+	}
+	
+	/* Run for 1 sec */
+	delay(1000 * 1000);
+	/* thread_join() propagates done to do_synch() */
+	synch->reader_done = true;
+	
+	rcu_read_unlock();
+}
+
+
+static bool do_synch(void)
+{
+	TPRINTF("\nSynchronize with long reader\n");
+	
+	synch_t *synch = malloc(sizeof(synch_t), FRAME_ATOMIC);
+	
+	if (!synch) {
+		TPRINTF("[out-of-mem]\n");
+		return false;
+	}
+	
+	synch->reader_done = false;
+	synch->reader_running = false;
+	synch->synch_running = false;
+	
+	run_one(synch_reader, synch);	
+	
+	/* Wait for the reader to enter its critical section. */
+	scheduler();
+	while (!synch->reader_running) {
+		thread_usleep(500 * 1000);
+	}
+	
+	synch->synch_running = true;
+	
+	rcu_synchronize();
+	join_one();
+	
+	
+	if (synch->reader_done) {
+		free(synch);
+		return true;
+	} else {
+		TPRINTF("Err: synchronize() exited prematurely \n");
+		/* Leak some mem. */
+		return false;
+	}
+}
+
+/*-------------------------------------------------------------------*/
+typedef struct {
+	rcu_item_t rcu_item;
+	atomic_t done;
+} barrier_t;
+
+static void barrier_callback(rcu_item_t *item)
+{
+	barrier_t *b = member_to_inst(item, barrier_t, rcu_item);
+	atomic_set(&b->done, 1);
+}
+
+static bool do_barrier(void)
+{
+	TPRINTF("\nrcu_barrier: Wait for outstanding rcu callbacks to complete\n");
+	
+	barrier_t *barrier = malloc(sizeof(barrier_t), FRAME_ATOMIC);
+	
+	if (!barrier) {
+		TPRINTF("[out-of-mem]\n");
+		return false;
+	}
+	
+	atomic_set(&barrier->done, 0);
+	
+	rcu_call(&barrier->rcu_item, barrier_callback);
+	rcu_barrier();
+	
+	if (1 == atomic_get(&barrier->done)) {
+		free(barrier);
+		return true;
+	} else {
+		TPRINTF("rcu_barrier() exited prematurely.\n");
+		/* Leak some mem. */
+		return false;
+	}
+}
+
+/*-------------------------------------------------------------------*/
+
+typedef struct {
+	size_t iters;
+	bool master;
+} stress_t;
+
+
+static void stress_reader(void *arg)
+{
+	bool *done = (bool*) arg;
+	
+	while (!*done) {
+		rcu_read_lock();
+		rcu_read_unlock();
+		
+		/*
+		 * Do some work outside of the reader section so that we are
+		 * not always preempted while inside the reader section.
+		 */
+		delay(5);
+	}
+}
+
+static void stress_cb(rcu_item_t *item)
+{
+	/* 5 us * 1000 * 1000 iters == 5 sec per updater thread */
+	delay(5);
+	free(item);
+}
+
+static void stress_updater(void *arg)
+{
+	stress_t *s = (stress_t *)arg;
+	
+	for (size_t i = 0; i < s->iters; ++i) {
+		rcu_item_t *item = malloc(sizeof(rcu_item_t), FRAME_ATOMIC);
+		
+		if (item) {
+			rcu_call(item, stress_cb);
+		} else {
+			TPRINTF("[out-of-mem]\n");
+			return;
+		}
+		
+		/* Print a dot each time we make another 1% of progress. */
+		if (s->master && 0 == (i % (s->iters/100)))
+			TPRINTF(".");
+	}
+}
+
+static bool do_stress(void)
+{
+	size_t cb_per_thread = 1000 * 1000;
+	bool done = false;
+	stress_t master = { .iters = cb_per_thread, .master = true }; 
+	stress_t worker = { .iters = cb_per_thread, .master = false }; 
+	
+	size_t thread_cnt = min(MAX_THREADS / 2, config.cpu_active);
+	/* Each cpu has one reader and one updater. */
+	size_t reader_cnt = thread_cnt;
+	size_t updater_cnt = thread_cnt;
+	
+	size_t exp_upd_calls = updater_cnt * cb_per_thread;
+	size_t max_used_mem = exp_upd_calls * sizeof(rcu_item_t);
+	
+	const char *mem_suffix;
+	uint64_t mem_units;
+	bin_order_suffix(max_used_mem, &mem_units, &mem_suffix, false);
+
+	TPRINTF("\nStress: Run %zu nop-readers and %zu updaters. %zu callbacks"
+		" total (max %" PRIu64 " %s used). Be very patient.\n", 
+		reader_cnt, updater_cnt, exp_upd_calls, mem_units, mem_suffix);
+	
+	for (size_t k = 0; k < reader_cnt; ++k) {
+		run_one(stress_reader, &done);
+	}
+
+	for (size_t k = 0; k < updater_cnt; ++k) {
+		run_one(stress_updater, k > 0 ? &worker : &master);
+	}
+	
+	TPRINTF("\nJoining %zu stress updaters.\n", updater_cnt);
+	
+	for (size_t k = 0; k < updater_cnt; ++k) {
+		join_one();
+	}
+	
+	done = true;
+
+	TPRINTF("\nJoining %zu stress nop-readers.\n", reader_cnt);
+	
+	join_all();
+	return true;
+}
+/*-------------------------------------------------------------------*/
+
+typedef struct {
+	rcu_item_t r;
+	size_t total_cnt;
+	size_t count_down;
+	bool expedite;
+} expedite_t;
+
+static void expedite_cb(rcu_item_t *arg)
+{
+	expedite_t *e = (expedite_t *)arg;
+	
+	if (1 < e->count_down) {
+		--e->count_down;
+		
+		if (0 == (e->count_down % (e->total_cnt/100))) {
+			TPRINTF("*");
+		}
+		
+		_rcu_call(e->expedite, &e->r, expedite_cb);
+	} else {
+		/* Do not touch any of e's mem after we declare we're done with it. */
+		memory_barrier();
+		e->count_down = 0;
+	}
+}
+
+static void run_expedite(bool exp, size_t cnt)
+{
+	expedite_t e;
+	e.total_cnt = cnt;
+	e.count_down = cnt;
+	e.expedite = exp;
+	
+	_rcu_call(e.expedite, &e.r, expedite_cb);
+	
+	while (0 < e.count_down) {
+		thread_sleep(1);
+		TPRINTF(".");
+	}
+}
+
+static bool do_expedite(void)
+{
+	size_t exp_cnt = 1000 * 1000;
+	size_t normal_cnt = 1 * 1000;
+	
+	TPRINTF("Expedited: sequence of %zu rcu_calls\n", exp_cnt);
+	run_expedite(true, exp_cnt);
+	TPRINTF("Normal/non-expedited: sequence of %zu rcu_calls\n", normal_cnt);
+	run_expedite(false, normal_cnt);
+	return true;
+}
+/*-------------------------------------------------------------------*/
+
+struct test_func {
+	bool include;
+	bool (*func)(void);
+	const char *desc;
+};
+
+
+const char *test_rcu1(void)
+{
+	struct test_func test_func[] = {
+		{ 1, do_one_cb, "do_one_cb" },
+		{ 1, do_reader_preempt, "do_reader_preempt" },
+		{ 1, do_synch, "do_synch" },
+		{ 1, do_barrier, "do_barrier" },
+		{ 1, do_reader_exit, "do_reader_exit" },
+		{ 1, do_nop_readers, "do_nop_readers" },
+		{ 1, do_seq_check, "do_seq_check" },
+		{ 0, do_long_readers, "do_long_readers" },
+		{ 1, do_nop_callbacks, "do_nop_callbacks" },
+		{ 0, do_expedite, "do_expedite" },
+		{ 1, do_stress, "do_stress" },
+		{ 0, NULL, NULL }
+	};
+	
+	bool success = true;
+	bool ok = true;
+	uint64_t completed_gps = rcu_completed_gps();
+	uint64_t delta_gps = 0;
+	
+	for (int i = 0; test_func[i].func; ++i) {
+		if (!test_func[i].include) {
+			TPRINTF("\nSubtest %s() skipped.\n", test_func[i].desc);
+			continue;
+		} else {
+			TPRINTF("\nRunning subtest %s.\n", test_func[i].desc);
+		}
+		
+		ok = test_func[i].func();
+		success = success && ok;
+		
+		delta_gps = rcu_completed_gps() - completed_gps;
+		completed_gps += delta_gps;
+
+		if (ok) {  
+			TPRINTF("\nSubtest %s() ok (GPs: %" PRIu64 ").\n", 
+				test_func[i].desc, delta_gps);
+		} else {
+			TPRINTF("\nFailed: %s(). Pausing for 5 secs.\n", test_func[i].desc);
+			thread_sleep(5);
+		} 
+	}
+
+	if (success)
+		return NULL;
+	else
+		return "One of the tests failed.";
+}
Index: kernel/test/synch/rcu1.def
===================================================================
--- kernel/test/synch/rcu1.def	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
+++ kernel/test/synch/rcu1.def	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -0,0 +1,6 @@
+{
+	"rcu1",
+	"Basic RCU test",
+	&test_rcu1,
+	true
+},
Index: kernel/test/synch/workq-test-core.h
===================================================================
--- kernel/test/synch/workq-test-core.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
+++ kernel/test/synch/workq-test-core.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -0,0 +1,207 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <test.h>
+#include <arch.h>
+#include <atomic.h>
+#include <print.h>
+#include <proc/thread.h>
+#include <memstr.h>
+#include <synch/workqueue.h>
+
+
+typedef struct test_work {
+	work_t work_item;
+	int master;
+	int wave;
+	int count_down;
+} test_work_t;
+
+static atomic_t call_cnt[WAVES];
+
+
+/* Forward declaration -- implement this in the actual test file that includes us. */
+static int core_workq_enqueue(work_t *work_item, work_func_t func);
+
+
+static bool new_wave(test_work_t *work)
+{
+	++work->wave;
+	
+	if (work->wave < WAVES) {
+		work->count_down = COUNT;
+		return true;
+	} else {
+		return false;
+	}
+}
+
+
+static int is_pow2(int num)
+{
+	unsigned n = (unsigned)num;
+	return (n != 0) && 0 == (n & (n-1));
+}
+
+static test_work_t * create_child(test_work_t *work)
+{
+	test_work_t *child = malloc(sizeof(test_work_t), 0);
+	ASSERT(child);
+	if (child) {
+		child->master = false;
+		child->wave = work->wave;
+		child->count_down = work->count_down;
+	}
+	
+	return child;
+}
+
+static void free_work(test_work_t *work)
+{
+	memsetb(work, sizeof(test_work_t), 0xfa);
+	free(work);
+}
+
+static void reproduce(work_t *work_item)
+{
+	/* Ensure work_item is ours for the taking. */
+	memsetb(work_item, sizeof(work_t), 0xec);
+	
+	test_work_t *work = (test_work_t *)work_item;
+	
+	atomic_inc(&call_cnt[work->wave]);
+	
+	if (0 < work->count_down) {
+		/* Sleep right before creating the last generation. */
+		if (1 == work->count_down) {
+			bool sleeping_wave = ((work->wave % 2) == 1);
+
+			/* Master never sleeps. */
+			if (sleeping_wave && !work->master) {
+				thread_usleep(WAVE_SLEEP_MS * 1000);
+			}
+		}
+		
+		--work->count_down;
+
+		/*
+		 * Enqueue a child whenever count_down is a power of 2.
+		 * This leads to exponential growth in the number of work items.
+		 */
+		if (is_pow2(work->count_down + 1)) {
+			test_work_t *child = create_child(work);
+			if (child) {
+				if (!core_workq_enqueue(&child->work_item, reproduce))
+					free_work(child);
+			}
+		}
+		
+		if (!core_workq_enqueue(work_item, reproduce)) {
+			if (work->master) 
+				TPRINTF("\nErr: Master work item exiting prematurely!\n");
+
+			free_work(work);
+		}
+	} else {
+		/* We're done with this wave - only the master survives. */
+		
+		if (work->master && new_wave(work)) {
+			if (!core_workq_enqueue(work_item, reproduce)) {
+				TPRINTF("\nErr: Master work could not start a new wave!\n");
+				free_work(work);
+			}
+		} else {
+			if (work->master)
+				TPRINTF("\nMaster work item done.\n");
+				
+			free_work(work);
+		}
+	}
+}
+
+static const char *run_workq_core(bool end_prematurely)
+{
+	for (int i = 0; i < WAVES; ++i) {
+		atomic_set(&call_cnt[i], 0);
+	}
+
+	test_work_t *work = malloc(sizeof(test_work_t), 0);
+
+	work->master = true;
+	work->wave = 0;
+	work->count_down = COUNT;
+	
+	/*
+	 * k == COUNT_POW
+	 * 2^k == COUNT + 1
+	 * 
+	 * We have "k" branching points. Therefore:
+	 * exp_call_cnt == k*2^(k-1) + 2^k == (k + 2) * 2^(k-1)
+	 */
+	size_t exp_call_cnt = (COUNT_POW + 2) * (1 << (COUNT_POW - 1));
+	
+	TPRINTF("waves: %d, count_down: %d, total expected calls: %zu\n", 
+		WAVES, COUNT, exp_call_cnt * WAVES);
+	
+
+	core_workq_enqueue(&work->work_item, reproduce);
+	
+	size_t sleep_cnt = 0;
+	/* At most 40 seconds total (or at most 2 sec when ending prematurely while work remains). */
+	size_t max_sleep_secs = end_prematurely ? 2 : MAIN_MAX_SLEEP_SEC;
+	size_t max_sleep_cnt = (max_sleep_secs * 1000) / MAIN_POLL_SLEEP_MS;
+	
+	for (int i = 0; i < WAVES; ++i) {
+		while (atomic_get(&call_cnt[i]) < exp_call_cnt 
+			&& sleep_cnt < max_sleep_cnt) {
+			TPRINTF(".");
+			thread_usleep(MAIN_POLL_SLEEP_MS * 1000);
+			++sleep_cnt;
+		}
+	}
+	
+	bool success = true;
+	
+	for (int i = 0; i < WAVES; ++i) {
+		if (atomic_get(&call_cnt[i]) == exp_call_cnt) {
+			TPRINTF("Ok: %" PRIua " calls in wave %d, as expected.\n",
+				atomic_get(&call_cnt[i]), i);
+		} else {
+			success = false;
+			TPRINTF("Error: %" PRIua " calls in wave %d, but %zu expected.\n",
+				atomic_get(&call_cnt[i]), i, exp_call_cnt);
+		} 
+	}
+	
+	
+	if (success)
+		return NULL;
+	else {
+		return "Failed to invoke the expected number of calls.\n";
+	}
+}
Index: kernel/test/synch/workqueue2.c
===================================================================
--- kernel/test/synch/workqueue2.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
+++ kernel/test/synch/workqueue2.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <test.h>
+#include <arch.h>
+#include <print.h>
+#include <memstr.h>
+#include <synch/workqueue.h>
+
+
+#define WAVES 10
+#define COUNT_POW 12
+#define COUNT ((1 << COUNT_POW) - 1)
+#define WAVE_SLEEP_MS 100
+#define MAIN_POLL_SLEEP_MS 100
+#define MAIN_MAX_SLEEP_SEC 40
+
+/*
+ * Include the test implementation.
+ */
+#include "workq-test-core.h"
+
+
+/*-------------------------------------------------------------------*/
+
+static work_t basic_work;
+static int basic_done = 0;
+
+static void basic_test_work(work_t *work_item)
+{
+	basic_done = 1;
+	TPRINTF("basic_test_work()");
+}
+
+
+static void basic_test(void)
+{
+	TPRINTF("Issue a single work item.\n");
+	basic_done = 0;
+	workq_global_enqueue(&basic_work, basic_test_work);
+	
+	while (!basic_done) {
+		TPRINTF(".");
+		thread_sleep(1);
+	}
+
+	TPRINTF("\nBasic test done\n");
+}
+
+/*-------------------------------------------------------------------*/
+
+
+struct work_queue *workq = NULL;
+
+static int core_workq_enqueue(work_t *work_item, work_func_t func)
+{
+	return workq_enqueue(workq, work_item, func);
+}
+/*-------------------------------------------------------------------*/
+
+
+static const char *test_custom_workq_impl(bool stop, const char *qname)
+{
+	workq = workq_create(qname);
+	
+	if (!workq) {
+		return "Failed to create a work queue.\n";
+	}
+	
+	const char *ret = run_workq_core(stop);
+	
+	TPRINTF("Stopping work queue...\n");
+	workq_stop(workq);
+	
+	TPRINTF("Destroying work queue...\n");
+	workq_destroy(workq);
+	return ret;
+}
+
+static const char *test_custom_workq(void)
+{
+	TPRINTF("Stress testing a custom queue.\n");
+	return test_custom_workq_impl(false, "test-workq");
+}
+
+
+static const char *test_custom_workq_stop(void)
+{
+	TPRINTF("Stress testing a custom queue. Stops prematurely. "
+		"Errors are expected.\n");
+	test_custom_workq_impl(true, "test-workq-stop");
+	/* Errors are expected. */
+	return NULL;
+}
+
+
+const char *test_workqueue_all(void)
+{
+	const char *err = NULL;
+	const char *res;
+	
+	basic_test();
+	
+	res = test_custom_workq();
+	if (res) {
+		TPRINTF(res);
+		err = res;
+	}
+	
+	res = test_custom_workq_stop();
+	if (res) {
+		TPRINTF(res);
+		err = res;
+	}
+	
+	res = test_workqueue3();
+	if (res) {
+		TPRINTF(res);
+		err = res;
+	}
+
+	return err;
+}
Index: kernel/test/synch/workqueue2.def
===================================================================
--- kernel/test/synch/workqueue2.def	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
+++ kernel/test/synch/workqueue2.def	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -0,0 +1,6 @@
+{
+	"workqueue",
+	"Separate and system work queue stress test",
+	&test_workqueue_all,
+	true
+},
Index: kernel/test/synch/workqueue3.c
===================================================================
--- kernel/test/synch/workqueue3.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
+++ kernel/test/synch/workqueue3.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <test.h>
+#include <arch.h>
+#include <print.h>
+#include <memstr.h>
+#include <synch/workqueue.h>
+
+
+#define WAVES 10
+#define COUNT_POW 12
+#define COUNT ((1 << COUNT_POW) - 1)
+#define WAVE_SLEEP_MS 100
+#define MAIN_POLL_SLEEP_MS 100
+#define MAIN_MAX_SLEEP_SEC 40
+
+/*
+ * Include the test implementation.
+ */
+#include "workq-test-core.h"
+
+
+static int core_workq_enqueue(work_t *work_item, work_func_t func)
+{
+	return workq_global_enqueue(work_item, func);
+}
+
+
+
+static const char *do_test(bool exit_early)
+{
+	const char *err = NULL;
+	TPRINTF("Stress testing system queue.\n");
+	TPRINTF("First run:\n");
+	err = run_workq_core(exit_early);
+
+	if (!err) {
+		TPRINTF("\nSecond run:\n");
+		err = run_workq_core(exit_early);
+	} 
+
+	TPRINTF("Done.\n");
+	
+	return err;
+}
+
+const char *test_workqueue3(void)
+{
+	return do_test(false);
+}
+
+const char *test_workqueue3quit(void)
+{
+	return do_test(true);
+}
Index: kernel/test/synch/workqueue3.def
===================================================================
--- kernel/test/synch/workqueue3.def	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
+++ kernel/test/synch/workqueue3.def	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -0,0 +1,6 @@
+{
+	"workqueue3quit",
+	"Global work queue test, exits early",
+	&test_workqueue3quit,
+	true
+},
Index: kernel/test/test.c
===================================================================
--- kernel/test/test.c	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/test/test.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -41,4 +41,5 @@
 #include <avltree/avltree1.def>
 #include <btree/btree1.def>
+#include <cht/cht1.def>
 #include <debug/mips1.def>
 #include <fault/fault1.def>
@@ -50,4 +51,7 @@
 #include <synch/semaphore1.def>
 #include <synch/semaphore2.def>
+#include <synch/rcu1.def>
+#include <synch/workqueue2.def>
+#include <synch/workqueue3.def>
 #include <print/print1.def>
 #include <print/print2.def>
@@ -56,4 +60,5 @@
 #include <print/print5.def>
 #include <thread/thread1.def>
+#include <smpcall/smpcall1.def>
 	{
 		.name = NULL,
Index: kernel/test/test.h
===================================================================
--- kernel/test/test.h	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ kernel/test/test.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -59,4 +59,5 @@
 extern const char *test_avltree1(void);
 extern const char *test_btree1(void);
+extern const char *test_cht1(void);
 extern const char *test_mips1(void);
 extern const char *test_fault1(void);
@@ -75,4 +76,10 @@
 extern const char *test_print5(void);
 extern const char *test_thread1(void);
+extern const char *test_smpcall1(void);
+extern const char *test_workqueue_all(void);
+extern const char *test_workqueue3(void);
+extern const char *test_workqueue3quit(void);
+extern const char *test_rcu1(void);
+
 
 extern test_t tests[];
Index: uspace/Makefile
===================================================================
--- uspace/Makefile	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ uspace/Makefile	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -53,4 +53,5 @@
 	app/nterm \
 	app/redir \
+	app/rcutest \
 	app/sbi \
 	app/sportdmp \
@@ -217,4 +218,5 @@
 	lib/nic \
 	lib/ext4 \
+	lib/urcu \
 	lib/usb \
 	lib/usbhost \
Index: uspace/Makefile.common
===================================================================
--- uspace/Makefile.common	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ uspace/Makefile.common	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -135,4 +135,6 @@
 LIBBITHENGE_PREFIX = $(LIB_PREFIX)/bithenge
 
+LIBURCU_PREFIX = $(LIB_PREFIX)/urcu
+
 ifeq ($(STATIC_NEEDED),y)
 	STATIC_BUILD = y
Index: uspace/app/rcutest/Makefile
===================================================================
--- uspace/app/rcutest/Makefile	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
+++ uspace/app/rcutest/Makefile	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -0,0 +1,41 @@
+#
+# Copyright (c) 2012 Adam Hraska
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# - Redistributions of source code must retain the above copyright
+#   notice, this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright
+#   notice, this list of conditions and the following disclaimer in the
+#   documentation and/or other materials provided with the distribution.
+# - The name of the author may not be used to endorse or promote products
+#   derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+USPACE_PREFIX = ../..
+
+LIBS = $(LIBURCU_PREFIX)/liburcu.a
+
+EXTRA_CFLAGS += -I$(LIBURCU_PREFIX)
+	
+BINARY = rcutest
+
+SOURCES = \
+	rcutest.c
+
+include $(USPACE_PREFIX)/Makefile.common
+
Index: uspace/app/rcutest/rcutest.c
===================================================================
--- uspace/app/rcutest/rcutest.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
+++ uspace/app/rcutest/rcutest.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -0,0 +1,909 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup test
+ * @{
+ */
+
+/**
+ * @file rcutest.c
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <mem.h>
+#include <errno.h>
+#include <thread.h>
+#include <assert.h>
+#include <async.h>
+#include <fibril.h>
+#include <fibril_synch.h>
+#include <compiler/barrier.h>
+#include <futex.h>
+
+#include <rcu.h>
+
+
+
+#define USECS_PER_SEC (1000 * 1000)
+#define USECS_PER_MS  1000
+
+/* fwd decl. */
+struct test_info;
+
+typedef struct test_desc {
+	/* Aggregate test that runs other tests already in the table test_desc. */
+	bool aggregate;
+	enum {
+		T_OTHER,
+		T_SANITY,
+		T_STRESS
+	} type;
+	bool (*func)(struct test_info*);
+	const char *name;
+	const char *desc;
+} test_desc_t;
+
+
+typedef struct test_info {
+	size_t thread_cnt;
+	test_desc_t *desc;
+} test_info_t;
+
+
+
+static bool run_all_tests(struct test_info*);
+static bool run_sanity_tests(struct test_info*);
+static bool run_stress_tests(struct test_info*);
+
+static bool wait_for_one_reader(struct test_info*);
+static bool basic_sanity_check(struct test_info*);
+static bool dont_wait_for_new_reader(struct test_info*);
+static bool wait_for_exiting_reader(struct test_info*);
+static bool seq_test(struct test_info*);
+
+
+static test_desc_t test_desc[] = {
+	{
+		.aggregate = true,
+		.type = T_OTHER,
+		.func = run_all_tests,
+		.name = "*",
+		.desc = "Runs all tests.",
+	},
+	{
+		.aggregate = true,
+		.type = T_SANITY,
+		.func = run_sanity_tests,
+		.name = "sanity-tests",
+		.desc = "Runs all RCU sanity tests.",
+	},
+	{
+		.aggregate = true,
+		.type = T_STRESS,
+		.func = run_stress_tests,
+		.name = "stress-tests",
+		.desc = "Runs all RCU stress tests.",
+	},
+
+	{
+		.aggregate = false,
+		.type = T_SANITY,
+		.func = basic_sanity_check,
+		.name = "basic-sanity",
+		.desc = "Locks/unlocks and syncs in 1 fibril, no contention.",
+	},
+	{
+		.aggregate = false,
+		.type = T_SANITY,
+		.func = wait_for_one_reader,
+		.name = "wait-for-one",
+		.desc = "Syncs with one 2 secs sleeping reader.",
+	},
+	{
+		.aggregate = false,
+		.type = T_SANITY,
+		.func = dont_wait_for_new_reader,
+		.name = "ignore-new-r",
+		.desc = "Syncs with preexisting reader; ignores new reader.",
+	},
+	{
+		.aggregate = false,
+		.type = T_SANITY,
+		.func = wait_for_exiting_reader,
+		.name = "dereg-unlocks",
+		.desc = "Lets deregister_fibril unlock the reader section.",
+	},
+	{
+		.aggregate = false,
+		.type = T_STRESS,
+		.func = seq_test,
+		.name = "seq",
+		.desc = "Checks lock/unlock/sync w/ global time sequence.",
+	},
+	{
+		.aggregate = false,
+		.type = T_OTHER,
+		.func = NULL,
+		.name = "(null)",
+		.desc = "",
+	},
+};
+
+static const size_t test_desc_cnt = sizeof(test_desc) / sizeof(test_desc[0]);
+
+/*--------------------------------------------------------------------*/
+
+static size_t next_rand(size_t seed)
+{
+	return (seed * 1103515245 + 12345) & ((1U << 31) - 1);
+}
+
+
+typedef int (*fibril_func_t)(void *);
+
+static bool create_fibril(int (*func)(void*), void *arg)
+{
+	fid_t fid = fibril_create(func, arg);
+	
+	if (0 == fid) {
+		printf("Failed to create a fibril!\n");
+		return false;
+	}
+	
+	fibril_add_ready(fid);
+	return true;
+}
+
+/*--------------------------------------------------------------------*/
+
+static bool run_tests(test_info_t *info, bool (*include_filter)(test_desc_t *)) 
+{
+	size_t failed_cnt = 0;
+	size_t ok_cnt = 0;
+	
+	for (size_t i = 0; i < test_desc_cnt; ++i) {
+		test_desc_t *t = &test_desc[i];
+		
+		if (t->func && !t->aggregate && include_filter(t)) {
+			printf("Running \'%s\'...\n", t->name);
+			bool ok = test_desc[i].func(info);
+			
+			if (ok) {
+				++ok_cnt;
+				printf("Passed: \'%s\'\n", t->name);
+			} else {
+				++failed_cnt;
+				printf("FAILED: \'%s\'\n", t->name);
+			}
+		}
+	}
+	
+	printf("\n");
+
+	printf("%zu tests passed\n", ok_cnt);
+
+	if (failed_cnt) {
+		printf("%zu tests failed\n", failed_cnt);
+	} 
+	
+	return 0 == failed_cnt;
+}
+
+/*--------------------------------------------------------------------*/
+
+static bool all_tests_include_filter(test_desc_t *desc)
+{
+	return true;
+}
+
+/* Runs all available tests one-by-one. */
+static bool run_all_tests(test_info_t *test_info)
+{
+	printf("Running all tests...\n");
+	return run_tests(test_info, all_tests_include_filter);
+}
+
+/*--------------------------------------------------------------------*/
+
+static bool stress_tests_include_filter(test_desc_t *desc)
+{
+	return desc->type == T_STRESS;
+}
+
+/* Runs all available stress tests one-by-one. */
+static bool run_stress_tests(test_info_t *test_info)
+{
+	printf("Running stress tests...\n");
+	return run_tests(test_info, stress_tests_include_filter);
+}
+
+/*--------------------------------------------------------------------*/
+
+static bool sanity_tests_include_filter(test_desc_t *desc)
+{
+	return desc->type == T_SANITY;
+}
+
+/* Runs all available sanity tests one-by-one. */
+static bool run_sanity_tests(test_info_t *test_info)
+{
+	printf("Running sanity tests...\n");
+	return run_tests(test_info, sanity_tests_include_filter);
+}
+
+/*--------------------------------------------------------------------*/
+
+/* Locks/unlocks rcu and synchronizes without contention in a single fibril. */
+static bool basic_sanity_check(test_info_t *test_info)
+{
+	rcu_read_lock();
+	/* nop */
+	rcu_read_unlock();
+
+	rcu_read_lock();
+	/* nop */
+	rcu_read_unlock();
+	
+	rcu_synchronize();
+
+	/* Nested lock with yield(). */
+	rcu_read_lock();
+	fibril_yield();
+	rcu_read_lock();
+	fibril_yield();
+	rcu_read_unlock();
+	fibril_yield();
+	rcu_read_unlock();
+	
+	fibril_yield();
+	rcu_synchronize();
+	rcu_synchronize();
+	
+	rcu_read_lock();
+	/* nop */
+	if (!rcu_read_locked())
+		return false;
+
+	rcu_read_unlock();
+	
+	return !rcu_read_locked();
+}
+
+typedef struct one_reader_info {
+	bool entered_cs;
+	bool exited_cs;
+	size_t done_sleeps_cnt;
+	bool synching;
+	bool synched;
+	size_t failed;
+} one_reader_info_t;
+
+
+static int sleeping_reader(one_reader_info_t *arg)
+{
+	rcu_register_fibril();
+	
+	printf("lock{");
+	rcu_read_lock();
+	rcu_read_lock();
+	arg->entered_cs = true;
+	rcu_read_unlock();
+
+	printf("r-sleep{");
+	/* 2 sec */
+	async_usleep(2 * USECS_PER_SEC);
+	++arg->done_sleeps_cnt;
+	printf("}");
+	
+	if (arg->synched) {
+		arg->failed = 1;
+		printf("Error: rcu_sync exited prematurely.\n");
+	}
+	
+	arg->exited_cs = true;
+	rcu_read_unlock();
+	printf("}");
+	
+	rcu_deregister_fibril();
+	return 0;
+}
+
+static bool wait_for_one_reader(test_info_t *test_info)
+{
+	one_reader_info_t info = { 0 };
+	
+	if (!create_fibril((fibril_func_t) sleeping_reader, &info))
+		return false;
+	
+	/* 1 sec, waits for the reader to enter its critical section and sleep. */
+	async_usleep(1 * USECS_PER_SEC);
+	
+	if (!info.entered_cs || info.exited_cs) {
+		printf("Error: reader is unexpectedly outside of critical section.\n");
+		return false;
+	}
+	
+	info.synching = true;
+	printf("sync[");
+	rcu_synchronize();
+	printf("]\n");
+	info.synched = true;
+
+	/* Load info.exited_cs */
+	memory_barrier();
+	
+	if (!info.exited_cs || info.failed) {
+		printf("Error: rcu_sync() returned before the reader exited its CS.\n");
+		/* 
+		 * Sleep some more so we don't free info on stack while the reader 
+		 * is using it.
+		 */
+		/* 1.5 sec */
+		async_usleep(1500 * 1000);
+		return false;
+	} else {
+		return true;
+	}
+}
+
+/*--------------------------------------------------------------------*/
+
+#define WAIT_STEP_US  (500 * USECS_PER_MS)
+
+typedef struct two_reader_info {
+	bool new_entered_cs;
+	bool new_exited_cs;
+	bool old_entered_cs;
+	bool old_exited_cs;
+	bool synching;
+	bool synched;
+	size_t failed;
+} two_reader_info_t;
+
+
+static int preexisting_reader(two_reader_info_t *arg)
+{
+	rcu_register_fibril();
+	
+	printf("old-lock{");
+	rcu_read_lock();
+	arg->old_entered_cs = true;
+	
+	printf("wait-for-sync{");
+	/* Wait for rcu_sync() to start waiting for us. */
+	while (!arg->synching) {
+		async_usleep(WAIT_STEP_US);
+	}
+	printf(" }");
+	
+	/* A new reader starts while rcu_sync() is in progress. */
+	
+	printf("wait-for-new-R{");
+	/* Wait for the new reader to enter its reader section. */
+	while (!arg->new_entered_cs) {
+		async_usleep(WAIT_STEP_US);
+	}
+	printf(" }");
+	
+	arg->old_exited_cs = true;
+	
+	assert(!arg->new_exited_cs);
+	
+	if (arg->synched) {
+		arg->failed = 1;
+		printf("Error: rcu_sync() did not wait for preexisting reader.\n");
+	}
+	
+	rcu_read_unlock();
+	printf(" }");
+	
+	rcu_deregister_fibril();
+	return 0;
+}
+
+static int new_reader(two_reader_info_t *arg)
+{
+	rcu_register_fibril();
+	
+	/* Wait until rcu_sync() starts. */
+	while (!arg->synching) {
+		async_usleep(WAIT_STEP_US);
+	}
+	
+	/* 
+	 * synching is set when rcu_sync() is about to be entered so wait
+	 * some more to make sure it really does start executing.
+	 */
+	async_usleep(WAIT_STEP_US);
+	
+	printf("new-lock(");
+	rcu_read_lock();
+	arg->new_entered_cs = true;
+
+	/* Wait for rcu_sync() exit, ie stop waiting for the preexisting reader. */
+	while (!arg->synched) {
+		async_usleep(WAIT_STEP_US);
+	}
+	
+	arg->new_exited_cs = true;
+	/* Write new_exited_cs before exiting reader section. */
+	memory_barrier();
+	
+	/* 
+	 * Preexisting reader should have exited by now, so rcu_synchronize() 
+	 * must have returned.
+	 */
+	if (!arg->old_exited_cs) {
+		arg->failed = 1;
+		printf("Error: preexisting reader should have exited by now!\n");
+	}
+	
+	rcu_read_unlock();
+	printf(")");
+
+	rcu_deregister_fibril();
+	return 0;
+}
+
+static bool dont_wait_for_new_reader(test_info_t *test_info)
+{
+	two_reader_info_t info = { 0 };
+	
+	if (!create_fibril((fibril_func_t) preexisting_reader, &info))
+		return false;
+
+	if (!create_fibril((fibril_func_t) new_reader, &info))
+		return false;
+	
+	/* Waits for the preexisting_reader to enter its CS.*/
+	while (!info.old_entered_cs) {
+		async_usleep(WAIT_STEP_US);
+	}
+	
+	assert(!info.old_exited_cs);
+	assert(!info.new_entered_cs);
+	assert(!info.new_exited_cs);
+	
+	printf("sync[");
+	info.synching = true;
+	rcu_synchronize();
+	printf(" ]");
+	
+	/* Load info.exited_cs */
+	memory_barrier();
+	
+	if (!info.old_exited_cs) {
+		printf("Error: rcu_sync() returned before preexisting reader exited.\n");
+		info.failed = 1;
+	}
+	
+	bool new_outside_cs = !info.new_entered_cs || info.new_exited_cs;
+	
+	/* Test if new reader is waiting in CS before setting synched. */
+	compiler_barrier();
+	info.synched = true;
+		
+	if (new_outside_cs) {
+		printf("Error: new reader CS held up rcu_sync(). (4)\n");
+		info.failed = 1;
+	} else {
+		/* Wait for the new reader. */
+		rcu_synchronize();
+		
+		if (!info.new_exited_cs) {
+			printf("Error: 2nd rcu_sync() returned before new reader exited.\n");
+			info.failed = 1;
+		}
+		
+		printf("\n");
+	}
+	
+	if (info.failed) {
+		/* 
+		 * Sleep some more so we don't free info on stack while readers 
+		 * are using it.
+		 */
+		async_usleep(WAIT_STEP_US);
+	}
+	
+	return 0 == info.failed;
+}
+
+#undef WAIT_STEP_US
+
+/*--------------------------------------------------------------------*/
+#define WAIT_STEP_US  (500 * USECS_PER_MS)
+
+typedef struct exit_reader_info {
+	bool entered_cs;
+	bool exited_cs;
+	bool synching;
+	bool synched;
+} exit_reader_info_t;
+
+
+static int exiting_locked_reader(exit_reader_info_t *arg)
+{
+	rcu_register_fibril();
+	
+	printf("old-lock{");
+	rcu_read_lock();
+	rcu_read_lock();
+	rcu_read_lock();
+	arg->entered_cs = true;
+	
+	printf("wait-for-sync{");
+	/* Wait for rcu_sync() to start waiting for us. */
+	while (!arg->synching) {
+		async_usleep(WAIT_STEP_US);
+	}
+	printf(" }");
+	
+	rcu_read_unlock();
+	printf(" }");
+
+	arg->exited_cs = true;
+	/* Store exited_cs before unlocking reader section in deregister. */
+	memory_barrier();
+	
+	/* Deregister forcefully unlocks the reader section. */
+	rcu_deregister_fibril();
+	return 0;
+}
+
+
+static bool wait_for_exiting_reader(test_info_t *test_info)
+{
+	exit_reader_info_t info = { 0 };
+	
+	if (!create_fibril((fibril_func_t) exiting_locked_reader, &info))
+		return false;
+	
+	/* Waits for the preexisting_reader to enter its CS.*/
+	while (!info.entered_cs) {
+		async_usleep(WAIT_STEP_US);
+	}
+	
+	assert(!info.exited_cs);
+	
+	printf("sync[");
+	info.synching = true;
+	rcu_synchronize();
+	info.synched = true;
+	printf(" ]\n");
+	
+	/* Load info.exited_cs */
+	memory_barrier();
+	
+	if (!info.exited_cs) {
+		printf("Error: rcu_deregister_fibril did not unlock the CS.\n");
+		return false;
+	}	
+	
+	return true;
+}
+
+#undef WAIT_STEP_US
+
+
+/*--------------------------------------------------------------------*/
+
+typedef struct {
+	atomic_t time;
+	atomic_t max_start_time_of_done_sync;
+	
+	size_t total_workers;
+	size_t done_reader_cnt;
+	size_t done_updater_cnt;
+	fibril_mutex_t done_cnt_mtx;
+	fibril_condvar_t done_cnt_changed;
+
+	size_t read_iters;
+	size_t upd_iters;
+	
+	atomic_t seed;
+	int failed;
+} seq_test_info_t;
+
+
+static void signal_seq_fibril_done(seq_test_info_t *arg, size_t *cnt)
+{
+	fibril_mutex_lock(&arg->done_cnt_mtx);
+	++*cnt;
+	
+	if (arg->total_workers == arg->done_reader_cnt + arg->done_updater_cnt) {
+		fibril_condvar_signal(&arg->done_cnt_changed);
+	}
+	
+	fibril_mutex_unlock(&arg->done_cnt_mtx);
+}
+
+static int seq_reader(seq_test_info_t *arg)
+{
+	rcu_register_fibril();
+	
+	size_t seed = (size_t) atomic_preinc(&arg->seed);
+	bool first = (seed == 1);
+	
+	for (size_t k = 0; k < arg->read_iters; ++k) {
+		/* Print progress if the first reader fibril. */
+		if (first && 0 == k % (arg->read_iters/100 + 1)) {
+			printf(".");
+		}
+		
+		rcu_read_lock();
+		atomic_count_t start_time = atomic_preinc(&arg->time);
+		
+		/* Do some work. */
+		seed = next_rand(seed);
+		size_t idle_iters = seed % 8;
+		
+		for (size_t i = 0; i < idle_iters; ++i) {
+			fibril_yield();
+		}
+		
+		/* 
+		 * Check if the most recently started rcu_sync of the already
+		 * finished rcu_syncs did not happen to start after this reader
+		 * and, therefore, should have waited for this reader to exit
+		 * (but did not - since it already announced it completed).
+		 */
+		if (start_time <= atomic_get(&arg->max_start_time_of_done_sync)) {
+			arg->failed = 1;
+		}
+		
+		rcu_read_unlock();
+	}
+	
+	rcu_deregister_fibril();
+
+	signal_seq_fibril_done(arg, &arg->done_reader_cnt);
+	return 0;
+}
+
+static int seq_updater(seq_test_info_t *arg)
+{
+	rcu_register_fibril();
+	
+	for (size_t k = 0; k < arg->upd_iters; ++k) {
+		atomic_count_t start_time = atomic_get(&arg->time);
+		rcu_synchronize();
+		
+		/* This is prone to a race but if it happens it errs to the safe side.*/
+		if (atomic_get(&arg->max_start_time_of_done_sync) < start_time) {
+			atomic_set(&arg->max_start_time_of_done_sync, start_time);
+		}
+	}
+	
+	rcu_deregister_fibril();
+	
+	signal_seq_fibril_done(arg, &arg->done_updater_cnt);
+	return 0;
+}
+
+static bool seq_test(test_info_t *test_info)
+{
+	size_t reader_cnt = test_info->thread_cnt; 
+	size_t updater_cnt = test_info->thread_cnt; 
+		
+	seq_test_info_t info = {
+		.time = {0},
+		.max_start_time_of_done_sync = {0},
+		.read_iters = 10 * 1000,
+		.upd_iters = 5 * 1000,
+		.total_workers = updater_cnt + reader_cnt,
+		.done_reader_cnt = 0,
+		.done_updater_cnt = 0,
+		.done_cnt_mtx = FIBRIL_MUTEX_INITIALIZER(info.done_cnt_mtx),
+		.done_cnt_changed = FIBRIL_CONDVAR_INITIALIZER(info.done_cnt_changed),
+		.seed = {0},
+		.failed = 0,
+	};
+	
+	/* Create and start worker fibrils. */
+	for (size_t k = 0; k + k < reader_cnt + updater_cnt; ++k) {
+		bool ok = create_fibril((fibril_func_t) seq_reader, &info);
+		ok = ok && create_fibril((fibril_func_t) seq_updater, &info);
+		
+		if (!ok) {
+			/* Let the already created fibrils corrupt the stack. */
+			return false;
+		}
+	}
+	
+	/* Wait for all worker fibrils to complete their work. */
+	fibril_mutex_lock(&info.done_cnt_mtx);
+	
+	while (info.total_workers != info.done_reader_cnt + info.done_updater_cnt) {
+		fibril_condvar_wait(&info.done_cnt_changed, &info.done_cnt_mtx);
+	}
+	
+	fibril_mutex_unlock(&info.done_cnt_mtx);
+	
+	if (info.failed) {
+		printf("Error: rcu_sync() did not wait for a preexisting reader.\n");
+	}
+	
+	return 0 == info.failed;
+}
+
+/*--------------------------------------------------------------------*/
+
+static FIBRIL_MUTEX_INITIALIZE(blocking_mtx);
+
+static void dummy_fibril(void *arg)
+{
+	/* Block on an already locked mutex - enters the fibril manager. */
+	fibril_mutex_lock(&blocking_mtx);
+	assert(false);
+}
+
+static bool create_threads(size_t cnt)
+{
+	/* Sanity check. */
+	assert(cnt < 1024);
+	
+	/* Keep this mutex locked so that dummy fibrils never exit. */
+	bool success = fibril_mutex_trylock(&blocking_mtx);
+	assert(success);
+	
+	for (size_t k = 0; k < cnt; ++k) {
+		thread_id_t tid;
+		
+		int ret = thread_create(dummy_fibril, NULL, "urcu-test-worker", &tid);
+		if (EOK != ret) {
+			printf("Failed to create thread '%zu' (error: %d)\n", k + 1, ret);
+			return false;
+		}
+	}
+	
+	return true;
+}
+
+/*--------------------------------------------------------------------*/
+static test_desc_t *find_test(const char *name)
+{
+	/* First match for test name. */
+	for (size_t k = 0; k < test_desc_cnt; ++k) {
+		test_desc_t *t = &test_desc[k];
+		
+		if (t->func && 0 == str_cmp(t->name, name))
+			return t;
+	}
+	
+	/* Try to match the test number. */
+	uint32_t test_num = 0;
+	
+	if (EOK == str_uint32_t(name, NULL, 0, true, &test_num)) {
+		if (test_num < test_desc_cnt && test_desc[test_num].func) {
+			return &test_desc[test_num];
+		}
+	}
+	
+	return NULL;
+}
+
+static void list_tests(void)
+{
+	printf("Available tests: \n");
+	
+	for (size_t i = 0; i < test_desc_cnt; ++i) {
+		test_desc_t *t = &test_desc[i];
+		
+		if (!t->func) 
+			continue;
+		
+		const char *type = "";
+		
+		if (t->type == T_SANITY)
+			type = " (sanity)";
+		if (t->type == T_STRESS)
+			type = " (stress)";
+
+		printf("%zu: %s ..%s %s\n", i, t->name, type, t->desc);
+	}
+}
+
+
+static void print_usage(void)
+{
+	printf("Usage: rcutest [test_name|test_number] {number_of_threads}\n");
+	list_tests();
+	
+	printf("\nExample usage:\n");
+	printf("\trcutest *\n");
+	printf("\trcutest sanity-tests\n");
+}
+
+
+static bool parse_cmd_line(int argc, char **argv, test_info_t *info)
+{
+	if (argc != 2 && argc != 3) {
+		print_usage();
+		return false;
+	}
+	
+	info->desc = find_test(argv[1]);
+
+	if (!info->desc) {
+		printf("Non-existent test '%s'.\n", argv[1]);
+		list_tests();
+		return false;
+	}
+	
+	if (argc == 3) {
+		uint32_t thread_cnt = 0;
+		int ret = str_uint32_t(argv[2], NULL, 0, true, &thread_cnt);
+		
+		if (ret == EOK && 1 <= thread_cnt && thread_cnt <= 64) {
+			info->thread_cnt = thread_cnt;
+		} else {
+			info->thread_cnt = 1;
+			printf("Err: Invalid number of threads '%s'; using 1.\n", argv[2]);
+		} 
+	} else {
+		info->thread_cnt = 1;
+	}
+	
+	return true;
+}
+
+int main(int argc, char **argv)
+{
+	rcu_register_fibril();
+	
+	test_info_t info;
+	
+	bool ok = parse_cmd_line(argc, argv, &info);
+	ok = ok && create_threads(info.thread_cnt - 1);
+	
+	if (ok) {
+		assert(1 <= info.thread_cnt);
+		test_desc_t *t = info.desc;
+		
+		printf("Running '%s' (in %zu threads)...\n", t->name, info.thread_cnt);
+		ok = t->func(&info);
+
+		printf("%s: '%s'\n", ok ? "Passed" : "FAILED", t->name);
+
+		rcu_deregister_fibril();
+		
+		/* Let the kernel clean up the created background threads. */
+		return ok ? 0 : 1;
+	} else {
+		rcu_deregister_fibril();
+		return 2;
+	}
+}
+
+
+/**
+ * @}
+ */
Index: uspace/lib/c/Makefile
===================================================================
--- uspace/lib/c/Makefile	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ uspace/lib/c/Makefile	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -90,4 +90,5 @@
 	generic/pcb.c \
 	generic/smc.c \
+	generic/smp_memory_barrier.c \
 	generic/thread.c \
 	generic/tls.c \
Index: uspace/lib/c/generic/adt/hash_table.c
===================================================================
--- uspace/lib/c/generic/adt/hash_table.c	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ uspace/lib/c/generic/adt/hash_table.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -133,5 +133,5 @@
 	free(h->bucket);
 
-	h->bucket = 0;
+	h->bucket = NULL;
 	h->bucket_cnt = 0;
 }
Index: uspace/lib/c/generic/smp_memory_barrier.c
===================================================================
--- uspace/lib/c/generic/smp_memory_barrier.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
+++ uspace/lib/c/generic/smp_memory_barrier.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup libc
+ * @{
+ */
+/** @file
+ */
+
+#include <smp_memory_barrier.h>
+#include <libc.h>
+
+void smp_memory_barrier(void)
+{
+	__SYSCALL0(SYS_SMP_MEMORY_BARRIER);
+}
+
+/** @}
+ */
Index: uspace/lib/c/include/adt/list.h
===================================================================
--- uspace/lib/c/include/adt/list.h	(revision 7462674581270cbc4c5e0b2d1075ebff0d1aec55)
+++ uspace/lib/c/include/adt/list.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -57,5 +57,23 @@
  */
 #define LIST_INITIALIZE(name) \
-	list_t name = { \
+	list_t name = LIST_INITIALIZER(name)
+
+/** Initializer for statically allocated list.
+ * 
+ * @code
+ * struct named_list {
+ *     const char *name;
+ *     list_t list;
+ * } var = { 
+ *     .name = "default name", 
+ *     .list = LIST_INITIALIZER(name_list.list) 
+ * };
+ * @endcode
+ *
+ * @param name Name of the new statically allocated list.
+ *
+ */
+#define LIST_INITIALIZER(name) \
+	{ \
 		.head = { \
 			.prev = &(name).head, \
Index: uspace/lib/c/include/compiler/barrier.h
===================================================================
--- uspace/lib/c/include/compiler/barrier.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
+++ uspace/lib/c/include/compiler/barrier.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup libc
+ * @{
+ */
+/** @file
+ */
+
+#ifndef LIBC_COMPILER_BARRIER_H_
+#define LIBC_COMPILER_BARRIER_H_
+
+#define compiler_barrier() asm volatile ("" ::: "memory")
+
+/** Forces the compiler to access (ie load/store) the variable only once. */
+#define ACCESS_ONCE(var) (*((volatile typeof(var)*)&(var)))
+
+#endif /* LIBC_COMPILER_BARRIER_H_ */
Index: uspace/lib/c/include/smp_memory_barrier.h
===================================================================
--- uspace/lib/c/include/smp_memory_barrier.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
+++ uspace/lib/c/include/smp_memory_barrier.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup libc
+ * @{
+ */
+/** @file
+ */
+
+#ifndef LIBC_SMP_MEM_BAR_H_
+#define LIBC_SMP_MEM_BAR_H_
+
+extern void smp_memory_barrier(void);
+
+#endif
+
+/** @}
+ */
Index: uspace/lib/urcu/Makefile
===================================================================
--- uspace/lib/urcu/Makefile	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
+++ uspace/lib/urcu/Makefile	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -0,0 +1,40 @@
+#
+# Copyright (c) 2012 Adam Hraska
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# - Redistributions of source code must retain the above copyright
+#   notice, this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright
+#   notice, this list of conditions and the following disclaimer in the
+#   documentation and/or other materials provided with the distribution.
+# - The name of the author may not be used to endorse or promote products
+#   derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+USPACE_PREFIX = ../..
+
+LIBS = $(LIBC_PREFIX)/libc.a 
+
+EXTRA_CFLAGS = -I. -I$(LIBC_PREFIX)/include
+
+LIBRARY = liburcu
+
+SOURCES = \
+	rcu.c
+
+include $(USPACE_PREFIX)/Makefile.common
Index: uspace/lib/urcu/rcu.c
===================================================================
--- uspace/lib/urcu/rcu.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
+++ uspace/lib/urcu/rcu.c	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -0,0 +1,363 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup liburcu
+ * @{
+ */
+/**
+ * @file
+ * 
+ * User space RCU is based on URCU utilizing signals [1]. This 
+ * implementation does not however signal each thread of the process 
+ * to issue a memory barrier. Instead, we introduced a syscall that
+ * issues memory barriers (via IPIs) on cpus that are running threads
+ * of the current process. First, it does not require us to schedule
+ * and run every thread of the process. Second, IPIs are less intrusive 
+ * than switching contexts and entering user space.
+ * 
+ * This algorithm is further modified to require a single instead of
+ * two reader group changes per grace period. Signal-URCU flips
+ * the reader group and waits for readers of the previous group 
+ * twice in succession in order to wait for new readers that were
+ * delayed and mistakenly associated with the previous reader group. 
+ * The modified algorithm ensures that the new reader group is
+ * always empty (by explicitly waiting for it to become empty).
+ * Only then does it flip the reader group and wait for preexisting
+ * readers of the old reader group (invariant of SRCU [2, 3]).
+ * 
+ * 
+ * [1] User-level implementations of read-copy update,
+ *     2012, appendix
+ *     http://www.rdrop.com/users/paulmck/RCU/urcu-supp-accepted.2011.08.30a.pdf
+ * 
+ * [2] linux/kernel/srcu.c in Linux 3.5-rc2,
+ *     2012
+ *     http://tomoyo.sourceforge.jp/cgi-bin/lxr/source/kernel/srcu.c?v=linux-3.5-rc2-ccs-1.8.3
+ *
+ * [3] [RFC PATCH 5/5 single-thread-version] implement 
+ *     per-domain single-thread state machine,
+ *     2012, Lai
+ *     https://lkml.org/lkml/2012/3/6/586
+ */
+
+#include "rcu.h"
+#include <fibril_synch.h>
+#include <fibril.h>
+#include <stdio.h>
+#include <compiler/barrier.h>
+#include <libarch/barrier.h>
+#include <futex.h>
+#include <macros.h>
+#include <async.h>
+#include <adt/list.h>
+#include <smp_memory_barrier.h>
+#include <assert.h>
+
+
+/** RCU sleeps for RCU_SLEEP_MS before polling an active RCU reader again. */
+#define RCU_SLEEP_MS        10
+
+#define RCU_NESTING_SHIFT   1
+#define RCU_NESTING_INC     (1 << RCU_NESTING_SHIFT)
+#define RCU_GROUP_BIT_MASK  (size_t)(RCU_NESTING_INC - 1)
+#define RCU_GROUP_A         (size_t)(0 | RCU_NESTING_INC)
+#define RCU_GROUP_B         (size_t)(1 | RCU_NESTING_INC)
+
+
+/** Fibril local RCU data. */
+typedef struct fibril_rcu_data {
+	size_t nesting_cnt;
+	link_t link;
+	bool registered;
+} fibril_rcu_data_t;
+
+/** Process global RCU data. */
+typedef struct rcu_data {
+	fibril_mutex_t mtx;
+	size_t cur_gp;
+	size_t reader_group;
+	futex_t list_futex;
+	list_t fibrils_list;
+} rcu_data_t;
+
+
+/** Fibril local RCU data. */
+static fibril_local fibril_rcu_data_t fibril_rcu = {
+	.nesting_cnt = 0,
+	.link = {
+		.next = NULL,
+		.prev = NULL
+	},
+	.registered = false
+};
+
+/** Process global RCU data. */
+static rcu_data_t rcu = {
+	.mtx = FIBRIL_MUTEX_INITIALIZER(rcu.mtx),
+	.cur_gp = 0,
+	.reader_group = RCU_GROUP_A,
+	.list_futex = FUTEX_INITIALIZER,
+	.fibrils_list = LIST_INITIALIZER(rcu.fibrils_list),
+};
+
+
+static void wait_for_readers(size_t reader_group);
+static void force_mb_in_all_threads(void);
+static bool is_preexisting_reader(const fibril_rcu_data_t *fib, size_t group);
+
+static bool is_in_group(size_t nesting_cnt, size_t group);
+static bool is_in_reader_section(size_t nesting_cnt);
+static size_t get_other_group(size_t group);
+
+
+/** Registers a fibril so it may start using RCU read sections.
+ * 
+ * A fibril must be registered with rcu before it can enter RCU critical
+ * sections delineated by rcu_read_lock() and rcu_read_unlock().
+ */
+void rcu_register_fibril(void)
+{
+	assert(!fibril_rcu.registered);
+	
+	futex_down(&rcu.list_futex);
+	list_append(&fibril_rcu.link, &rcu.fibrils_list);
+	futex_up(&rcu.list_futex);
+	
+	fibril_rcu.registered = true;
+}
+
+/** Deregisters a fibril that had been using RCU read sections.
+ * 
+ * A fibril must be deregistered before it exits if it had
+ * been registered with rcu via rcu_register_fibril().
+ */
+void rcu_deregister_fibril(void)
+{
+	assert(fibril_rcu.registered);
+	
+	/* 
+	 * Forcefully unlock any reader sections. The fibril is exiting
+	 * so it is not holding any references to data protected by the
+	 * rcu section. Therefore, it is safe to unlock. Otherwise, 
+	 * rcu_synchronize() would wait indefinitely.
+	 */
+	memory_barrier();
+	fibril_rcu.nesting_cnt = 0;
+	
+	futex_down(&rcu.list_futex);
+	list_remove(&fibril_rcu.link);
+	futex_up(&rcu.list_futex);
+
+	fibril_rcu.registered = false;
+}
+
+/** Delimits the start of an RCU reader critical section. 
+ * 
+ * RCU reader sections may be nested.  
+ */
+void rcu_read_lock(void)
+{
+	assert(fibril_rcu.registered);
+	
+	size_t nesting_cnt = ACCESS_ONCE(fibril_rcu.nesting_cnt);
+	
+	if (0 == (nesting_cnt >> RCU_NESTING_SHIFT)) {
+		ACCESS_ONCE(fibril_rcu.nesting_cnt) = ACCESS_ONCE(rcu.reader_group);
+		/* Required by MB_FORCE_L */
+		compiler_barrier(); /* CC_BAR_L */
+	} else {
+		ACCESS_ONCE(fibril_rcu.nesting_cnt) = nesting_cnt + RCU_NESTING_INC;
+	}
+}
+
+/** Delimits the end of an RCU reader critical section. */
+void rcu_read_unlock(void)
+{
+	assert(fibril_rcu.registered);
+	
+	/* Required by MB_FORCE_U */
+	compiler_barrier(); /* CC_BAR_U */
+	/* todo: ACCESS_ONCE(nesting_cnt) ? */
+	fibril_rcu.nesting_cnt -= RCU_NESTING_INC;
+}
+
+/** Returns true if the current fibril is in an RCU reader section. */
+bool rcu_read_locked(void)
+{
+	return 0 != (fibril_rcu.nesting_cnt >> RCU_NESTING_SHIFT);
+}
+
+/** Blocks until all preexisting readers exit their critical sections. */
+void rcu_synchronize(void)
+{
+	assert(!rcu_read_locked());
+	
+	/* Contain load of rcu.cur_gp. */
+	memory_barrier();
+
+	/* Approximately the number of the GP in progress. */
+	size_t gp_in_progress = ACCESS_ONCE(rcu.cur_gp);
+	
+	/* todo: early exit for batched sync()s */
+	fibril_mutex_lock(&rcu.mtx);
+	
+	/* 
+	 * Exit early if we were stuck waiting for the mutex for a full grace 
+	 * period. Started waiting during gp_in_progress (or gp_in_progress + 1
+	 * if the value propagated to this cpu too late) so wait for the next
+	 * full GP, gp_in_progress + 1, to finish. Ie don't wait if the GP
+	 * after that, gp_in_progress + 2, already started.
+	 */
+	/* rcu.cur_gp >= gp_in_progress + 2, but tolerates overflows. */
+	if (rcu.cur_gp != gp_in_progress && rcu.cur_gp + 1 != gp_in_progress) {
+		fibril_mutex_unlock(&rcu.mtx);
+		return;
+	}
+	
+	++ACCESS_ONCE(rcu.cur_gp);
+	
+	/* 
+	 * Pairs up with MB_FORCE_L (ie CC_BAR_L). Makes changes prior 
+	 * to rcu_synchronize() visible to new readers. 
+	 */
+	memory_barrier(); /* MB_A */
+	
+	/* 
+	 * Pairs up with MB_A. 
+	 * 
+	 * If the memory barrier is issued before CC_BAR_L in the target
+	 * thread, it pairs up with MB_A and the thread sees all changes
+	 * prior to rcu_synchronize(). Ie any reader sections are new
+	 * rcu readers.  
+	 * 
+	 * If the memory barrier is issued after CC_BAR_L, it pairs up
+	 * with MB_B and it will make the most recent nesting_cnt visible
+	 * in this thread. Since the reader may have already accessed
+	 * memory protected by RCU (it ran instructions past CC_BAR_L),
+	 * it is a preexisting reader. Seeing the most recent nesting_cnt 
+	 * ensures the thread will be identified as a preexisting reader
+	 * and we will wait for it in wait_for_readers(old_reader_group).
+	 */
+	force_mb_in_all_threads(); /* MB_FORCE_L */
+	
+	/* 
+	 * Pairs with MB_FORCE_L (ie CC_BAR_L, CC_BAR_U) and makes the most
+	 * current fibril_rcu.nesting_cnt visible to this cpu.
+	 */
+	read_barrier(); /* MB_B */
+	
+	size_t new_reader_group = get_other_group(rcu.reader_group);
+	wait_for_readers(new_reader_group);
+	
+	/* Separates waiting for readers in new_reader_group from group flip. */
+	memory_barrier();
+	
+	/* Flip the group new readers should associate with. */
+	size_t old_reader_group = rcu.reader_group;
+	rcu.reader_group = new_reader_group;
+
+	/* Flip the group before waiting for preexisting readers in the old group.*/
+	memory_barrier();
+	
+	wait_for_readers(old_reader_group);
+	
+	/* MB_FORCE_U  */
+	force_mb_in_all_threads(); /* MB_FORCE_U */
+	
+	fibril_mutex_unlock(&rcu.mtx);
+}
+
+/** Issues a memory barrier in each thread of this process. */
+static void force_mb_in_all_threads(void)
+{
+	/* 
+	 * Only issue barriers in running threads. The scheduler will 
+	 * execute additional memory barriers when switching to threads
+	 * of the process that are currently not running.
+	 */
+	smp_memory_barrier();
+}
+
+/** Waits for readers of reader_group to exit their reader sections. */
+static void wait_for_readers(size_t reader_group)
+{
+	futex_down(&rcu.list_futex);
+	
+	list_t quiescent_fibrils;
+	list_initialize(&quiescent_fibrils);
+	
+	while (!list_empty(&rcu.fibrils_list)) {
+		list_foreach_safe(rcu.fibrils_list, fibril_it, next_fibril) {
+			fibril_rcu_data_t *fib = member_to_inst(fibril_it, 
+				fibril_rcu_data_t, link);
+			
+			if (is_preexisting_reader(fib, reader_group)) {
+				futex_up(&rcu.list_futex);
+				async_usleep(RCU_SLEEP_MS * 1000);
+				futex_down(&rcu.list_futex);
+				break;
+			} else {
+				list_remove(fibril_it);
+				list_append(fibril_it, &quiescent_fibrils);
+			}
+		}
+	}
+	
+	list_concat(&rcu.fibrils_list, &quiescent_fibrils);
+	futex_up(&rcu.list_futex);
+}
+
+static bool is_preexisting_reader(const fibril_rcu_data_t *fib, size_t group)
+{
+	size_t nesting_cnt = ACCESS_ONCE(fib->nesting_cnt);
+	
+	return is_in_group(nesting_cnt, group) && is_in_reader_section(nesting_cnt);
+}
+
+static size_t get_other_group(size_t group)
+{
+	if (group == RCU_GROUP_A) 
+		return RCU_GROUP_B;
+	else
+		return RCU_GROUP_A;
+}
+
+static bool is_in_reader_section(size_t nesting_cnt)
+{
+	return RCU_NESTING_INC <= nesting_cnt;
+}
+
+static bool is_in_group(size_t nesting_cnt, size_t group)
+{
+	return (nesting_cnt & RCU_GROUP_BIT_MASK) == (group & RCU_GROUP_BIT_MASK);
+}
+
+
+
+/** @}
+ */
Index: uspace/lib/urcu/rcu.h
===================================================================
--- uspace/lib/urcu/rcu.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
+++ uspace/lib/urcu/rcu.h	(revision 69146b93ef5ca7beba675e46730ac8d6d5a24316)
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup liburcu
+ * @{
+ */
+/**
+ * @file
+ */
+
+#ifndef LIBURCU_RCU_H_
+#define LIBURCU_RCU_H_
+
+#include <compiler/barrier.h>
+#include <libarch/barrier.h>
+#include <bool.h>
+
+/** Use to assign a pointer to newly initialized data to a rcu reader 
+ * accessible pointer.
+ * 
+ * Example:
+ * @code
+ * typedef struct exam {
+ *     struct exam *next;
+ *     int grade;
+ * } exam_t;
+ * 
+ * exam_t *exam_list;
+ * // ..
+ * 
+ * // Insert at the beginning of the list.
+ * exam_t *my_exam = malloc(sizeof(exam_t), 0);
+ * my_exam->grade = 5;
+ * my_exam->next = exam_list;
+ * rcu_assign(exam_list, my_exam);
+ * 
+ * // Changes properly propagate. Every reader either sees
+ * // the old version of exam_list or the new version with
+ * // the fully initialized my_exam.
+ * rcu_synchronize();
+ * // Now we can be sure every reader sees my_exam.
+ * 
+ * @endcode
+ */
+#define rcu_assign(ptr, value) \
+	do { \
+		memory_barrier(); \
+		(ptr) = (value); \
+	} while (0)
+
+/** Use to access RCU protected data in a reader section.
+ * 
+ * Example:
+ * @code
+ * exam_t *exam_list;
+ * // ...
+ * 
+ * rcu_read_lock();
+ * exam_t *first_exam = rcu_access(exam_list);
+ * // We can now safely use first_exam, it won't change 
+ * // under us while we're using it.
+ *
+ * // ..
+ * rcu_read_unlock();
+ * @endcode
+ */
+#define rcu_access(ptr) ACCESS_ONCE(ptr)
+
+
+extern void rcu_register_fibril(void);
+extern void rcu_deregister_fibril(void);
+
+extern void rcu_read_lock(void);
+extern void rcu_read_unlock(void);
+
+extern bool rcu_read_locked(void);
+
+extern void rcu_synchronize(void);
+
+#endif
+
+/** @}
+ */
