Index: kernel/Makefile
===================================================================
--- kernel/Makefile	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/Makefile	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -198,4 +198,5 @@
 	generic/src/adt/bitmap.c \
 	generic/src/adt/btree.c \
+	generic/src/adt/cht.c \
 	generic/src/adt/hash_table.c \
 	generic/src/adt/list.c \
@@ -204,4 +205,5 @@
 	generic/src/console/prompt.c \
 	generic/src/cpu/cpu.c \
+	generic/src/cpu/cpu_mask.c \
 	generic/src/ddi/ddi.c \
 	generic/src/ddi/irq.c \
@@ -257,8 +259,12 @@
 	generic/src/synch/semaphore.c \
 	generic/src/synch/smc.c \
+	generic/src/synch/smp_memory_barrier.c \
 	generic/src/synch/waitq.c \
 	generic/src/synch/futex.c \
+	generic/src/synch/workqueue.c \
+	generic/src/synch/rcu.c \
 	generic/src/smp/ipi.c \
 	generic/src/smp/smp.c \
+	generic/src/smp/smp_call.c \
 	generic/src/ipc/ipc.c \
 	generic/src/ipc/sysipc.c \
@@ -310,4 +316,5 @@
 		test/atomic/atomic1.c \
 		test/btree/btree1.c \
+		test/cht/cht1.c \
 		test/avltree/avltree1.c \
 		test/fault/fault1.c \
@@ -319,4 +326,7 @@
 		test/synch/semaphore1.c \
 		test/synch/semaphore2.c \
+		test/synch/workqueue2.c \
+		test/synch/workqueue3.c \
+		test/synch/rcu1.c \
 		test/print/print1.c \
 		test/print/print2.c \
@@ -324,5 +334,6 @@
 		test/print/print4.c \
 		test/print/print5.c \
-		test/thread/thread1.c
+		test/thread/thread1.c \
+		test/smpcall/smpcall1.c
 	
 	ifeq ($(KARCH),mips32)
Index: kernel/arch/abs32le/Makefile.inc
===================================================================
--- kernel/arch/abs32le/Makefile.inc	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/abs32le/Makefile.inc	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -51,4 +51,5 @@
 	arch/$(KARCH)/src/cpu/cpu.c \
 	arch/$(KARCH)/src/smp/smp.c \
+	arch/$(KARCH)/src/smp/smp_call.c \
 	arch/$(KARCH)/src/smp/ipi.c \
 	arch/$(KARCH)/src/mm/km.c \
Index: kernel/arch/abs32le/src/cpu/cpu.c
===================================================================
--- kernel/arch/abs32le/src/cpu/cpu.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/abs32le/src/cpu/cpu.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -34,4 +34,5 @@
 
 #include <arch/cpu.h>
+#include <cpu.h>
 #include <arch.h>
 #include <typedefs.h>
Index: kernel/arch/abs32le/src/smp/smp_call.c
===================================================================
--- kernel/arch/abs32le/src/smp/smp_call.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
+++ kernel/arch/abs32le/src/smp/smp_call.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup abs32le 
+ * @{
+ */
+/** @file
+ */
+
+#include <smp/smp_call.h>
+#include <panic.h>
+
+#ifdef CONFIG_SMP
+/** Stub: the smp_call IPI is not implemented on abs32le, so any call panics. */
+void arch_smp_call_ipi(unsigned int cpu_id)
+{
+	panic("smp_call IPI not implemented.");
+}
+
+#endif /* CONFIG_SMP */
+
+/** @}
+ */
Index: kernel/arch/amd64/Makefile.inc
===================================================================
--- kernel/arch/amd64/Makefile.inc	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/amd64/Makefile.inc	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -84,4 +84,5 @@
 		arch/$(KARCH)/src/smp/ipi.c \
 		arch/$(KARCH)/src/smp/mps.c \
+		arch/$(KARCH)/src/smp/smp_call.c \
 		arch/$(KARCH)/src/smp/smp.c
 endif
Index: kernel/arch/amd64/include/arch/atomic.h
===================================================================
--- kernel/arch/amd64/include/arch/atomic.h	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/amd64/include/arch/atomic.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -1,4 +1,5 @@
 /*
  * Copyright (c) 2001-2004 Jakub Jermar
+ * Copyright (c) 2012      Adam Hraska
  * All rights reserved.
  *
@@ -140,6 +141,124 @@
 }
 
+/* cmpxchg (with prefix) on *pptr sized 1/2/4/8 bytes; old_val gets the prior value. */
+#define _atomic_cas_impl(pptr, exp_val, new_val, old_val, prefix) \
+({ \
+	switch (sizeof(typeof(*(pptr)))) { \
+	case 1: \
+		asm volatile ( \
+			prefix " cmpxchgb %[newval], %[ptr]\n" \
+			: /* Output operands. */ \
+			/* Old/current value is returned in al. */ \
+			[oldval] "=a" (old_val), \
+			/* (*pptr) will be read and written to, hence "+" */ \
+			[ptr] "+m" (*(pptr)) \
+			: /* Input operands. */ \
+			/* Expected value must be in al. */ \
+			[expval] "a" (exp_val), \
+			/* The new value may be in any register. */ \
+			[newval] "r" (new_val) \
+			: "memory" \
+		); \
+		break; \
+	case 2: \
+		asm volatile ( \
+			prefix " cmpxchgw %[newval], %[ptr]\n" \
+			: /* Output operands. */ \
+			/* Old/current value is returned in ax. */ \
+			[oldval] "=a" (old_val), \
+			/* (*pptr) will be read and written to, hence "+" */ \
+			[ptr] "+m" (*(pptr)) \
+			: /* Input operands. */ \
+			/* Expected value must be in ax. */ \
+			[expval] "a" (exp_val), \
+			/* The new value may be in any register. */ \
+			[newval] "r" (new_val) \
+			: "memory" \
+		); \
+		break; \
+	case 4: \
+		asm volatile ( \
+			prefix " cmpxchgl %[newval], %[ptr]\n" \
+			: /* Output operands. */ \
+			/* Old/current value is returned in eax. */ \
+			[oldval] "=a" (old_val), \
+			/* (*pptr) will be read and written to, hence "+" */ \
+			[ptr] "+m" (*(pptr)) \
+			: /* Input operands. */ \
+			/* Expected value must be in eax. */ \
+			[expval] "a" (exp_val), \
+			/* The new value may be in any register. */ \
+			[newval] "r" (new_val) \
+			: "memory" \
+		); \
+		break; \
+	case 8: \
+		asm volatile ( \
+			prefix " cmpxchgq %[newval], %[ptr]\n" \
+			: /* Output operands. */ \
+			/* Old/current value is returned in rax. */ \
+			[oldval] "=a" (old_val), \
+			/* (*pptr) will be read and written to, hence "+" */ \
+			[ptr] "+m" (*(pptr)) \
+			: /* Input operands. */ \
+			/* Expected value must be in rax. */ \
+			[expval] "a" (exp_val), \
+			/* The new value may be in any register. */ \
+			[newval] "r" (new_val) \
+			: "memory" \
+		); \
+		break; \
+	} \
+})
+
+
+#ifndef local_atomic_cas
+/* CPU-local CAS: cmpxchg without a lock prefix; evaluates to the old *pptr. */
+#define local_atomic_cas(pptr, exp_val, new_val) \
+({ \
+	/* Use proper types and avoid name clashes */ \
+	typeof(*(pptr)) _old_val_cas; \
+	typeof(*(pptr)) _exp_val_cas = (exp_val); \
+	typeof(*(pptr)) _new_val_cas = (new_val); \
+	_atomic_cas_impl((pptr), _exp_val_cas, _new_val_cas, _old_val_cas, ""); \
+	\
+	_old_val_cas; \
+})
+
+#else
+/* Verify that arch/atomic.h does not accidentally include /atomic.h. */
+#error Architecture specific cpu local atomics already defined! Check your includes.
 #endif
 
+
+#ifndef local_atomic_exchange
+/*
+ * Stores new_val into *pptr (atomic with respect to the local cpu only)
+ * and evaluates to the previous value. An xchg with a memory operand
+ * always implies lock semantics, so an unlocked cmpxchg loop is cheaper.
+ */
+#define local_atomic_exchange(pptr, new_val) \
+({ \
+	/* Evaluate pptr once; use proper types and avoid name clashes */ \
+	typeof(pptr) _ptr_x = (pptr); \
+	typeof(*_ptr_x) _exp_val_x; \
+	typeof(*_ptr_x) _old_val_x; \
+	typeof(*_ptr_x) _new_val_x = (new_val); \
+	do { \
+		_exp_val_x = *_ptr_x; \
+		_old_val_x = local_atomic_cas(_ptr_x, _exp_val_x, _new_val_x); \
+	} while (_old_val_x != _exp_val_x); \
+	\
+	_old_val_x; \
+})
+
+#else
+/* Verify that arch/atomic.h does not accidentally include /atomic.h. */
+#error Architecture specific cpu local atomics already defined! Check your includes.
+#endif
+
+
+#endif
+
 /** @}
  */
Index: kernel/arch/amd64/include/arch/cpu.h
===================================================================
--- kernel/arch/amd64/include/arch/cpu.h	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/amd64/include/arch/cpu.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -73,4 +73,6 @@
 	tss_t *tss;
 	
+	unsigned int id; /** CPU's local, ie physical, APIC ID. */
+	
 	size_t iomapver_copy;  /** Copy of TASK's I/O Permission bitmap generation count. */
 } cpu_arch_t;
Index: kernel/arch/amd64/include/arch/interrupt.h
===================================================================
--- kernel/arch/amd64/include/arch/interrupt.h	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/amd64/include/arch/interrupt.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -71,4 +71,5 @@
 #define VECTOR_TLB_SHOOTDOWN_IPI  (IVT_FREEBASE + 1)
 #define VECTOR_DEBUG_IPI          (IVT_FREEBASE + 2)
+#define VECTOR_SMP_CALL_IPI       (IVT_FREEBASE + 3)
 
 extern void (* disable_irqs_function)(uint16_t);
Index: kernel/arch/amd64/src/amd64.c
===================================================================
--- kernel/arch/amd64/src/amd64.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/amd64/src/amd64.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -170,5 +170,5 @@
 }
 
-void arch_post_cpu_init()
+void arch_post_cpu_init(void)
 {
 #ifdef CONFIG_SMP
Index: kernel/arch/amd64/src/cpu/cpu.c
===================================================================
--- kernel/arch/amd64/src/cpu/cpu.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/amd64/src/cpu/cpu.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -158,7 +158,7 @@
 void cpu_print_report(cpu_t* m)
 {
-	printf("cpu%d: (%s family=%d model=%d stepping=%d) %dMHz\n",
+	printf("cpu%d: (%s family=%d model=%d stepping=%d apicid=%u) %dMHz\n",
 	    m->id, vendor_str[m->arch.vendor], m->arch.family, m->arch.model,
-	    m->arch.stepping, m->frequency_mhz);
+	    m->arch.stepping, m->arch.id, m->frequency_mhz);
 }
 
Index: kernel/arch/amd64/src/interrupt.c
===================================================================
--- kernel/arch/amd64/src/interrupt.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/amd64/src/interrupt.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -55,4 +55,5 @@
 #include <symtab.h>
 #include <stacktrace.h>
+#include <smp/smp_call.h>
 
 /*
@@ -161,4 +162,10 @@
 	trap_virtual_eoi();
 	tlb_shootdown_ipi_recv();
+}
+/** smp_call IPI handler: calls trap_virtual_eoi(), then smp_call_ipi_recv(). */
+static void arch_smp_call_ipi_recv(unsigned int n, istate_t *istate)
+{
+	trap_virtual_eoi();
+	smp_call_ipi_recv();
 }
 #endif
@@ -224,4 +231,6 @@
 	exc_register(VECTOR_TLB_SHOOTDOWN_IPI, "tlb_shootdown", true,
 	    (iroutine_t) tlb_shootdown_ipi);
+	exc_register(VECTOR_SMP_CALL_IPI, "smp_call", true, 
+		(iroutine_t) arch_smp_call_ipi_recv);
 #endif
 }
Index: kernel/arch/amd64/src/smp/smp_call.c
===================================================================
--- kernel/arch/amd64/src/smp/smp_call.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
+++ kernel/arch/amd64/src/smp/smp_call.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -0,0 +1,1 @@
+../../../ia32/src/smp/smp_call.c
Index: kernel/arch/arm32/Makefile.inc
===================================================================
--- kernel/arch/arm32/Makefile.inc	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/arm32/Makefile.inc	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -65,4 +65,5 @@
 	arch/$(KARCH)/src/mm/tlb.c \
 	arch/$(KARCH)/src/mm/page_fault.c \
+	arch/$(KARCH)/src/atomic.c \
 	arch/$(KARCH)/src/ras.c
 
Index: kernel/arch/arm32/include/arch/cp15.h
===================================================================
--- kernel/arch/arm32/include/arch/cp15.h	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/arm32/include/arch/cp15.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -37,4 +37,9 @@
 #define KERN_arm32_CP15_H_
 
+#if defined(KERNEL) || defined(BOOT)
+#include <typedefs.h>
+#else
+#include <sys/types.h>
+#endif
 
 /** See ARM Architecture reference manual ch. B3.17.1 page B3-1456
Index: kernel/arch/arm32/src/atomic.c
===================================================================
--- kernel/arch/arm32/src/atomic.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
+++ kernel/arch/arm32/src/atomic.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup arm32
+ * @{
+ */
+/** @file
+ *  @brief Atomic operations emulation.
+ */
+
+#include <synch/spinlock.h>
+
+
+IRQ_SPINLOCK_STATIC_INITIALIZE_NAME(cas_lock, "arm-cas-lock");
+
+/** Provides the compare-and-swap intrinsic that GCC lacks on ARM.
+ *
+ * Stores \a new_val into \a *ptr only if \a *ptr equals \a expected.
+ * The value \a *ptr held on entry is returned in either case.
+ */
+void * __sync_val_compare_and_swap_4(void **ptr, void *expected, void *new_val)
+{
+	/*
+	 * Caveat: even an interrupt disabling spinlock may deadlock when
+	 * CAS() is reached from an exception handler. Eg. should a CAS()
+	 * raise a page fault exception whose handler invokes CAS() again
+	 * (even for a different memory location), the spinlock would
+	 * deadlock.
+	 */
+	irq_spinlock_lock(&cas_lock, true);
+	
+	void *previous = *ptr;
+	
+	if (expected == previous) {
+		*ptr = new_val;
+	}
+	
+	irq_spinlock_unlock(&cas_lock, true);
+	
+	return previous;
+}
+
+
+/** @}
+ */
Index: kernel/arch/arm32/src/cpu/cpu.c
===================================================================
--- kernel/arch/arm32/src/cpu/cpu.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/arm32/src/cpu/cpu.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -41,4 +41,8 @@
 #include <print.h>
 
+#ifdef CONFIG_FPU
+#include <arch/fpu_context.h>
+#endif
+
 static inline unsigned log2(unsigned val)
 {
@@ -60,6 +64,5 @@
 static const char * implementer(unsigned id)
 {
-	switch (id)
-	{
+	switch (id) {
 	case 0x41: return "ARM Limited";
 	case 0x44: return "Digital Equipment Corporation";
Index: kernel/arch/arm32/src/exception.c
===================================================================
--- kernel/arch/arm32/src/exception.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/arm32/src/exception.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -88,6 +88,8 @@
 static void swi_exception(unsigned int exc_no, istate_t *istate)
 {
+	interrupts_enable();	/* the syscall handler runs with interrupts on */
 	istate->r0 = syscall_handler(istate->r0, istate->r1, istate->r2,
 	    istate->r3, istate->r4, istate->r5, istate->r6);
+	interrupts_disable();	/* and they are off again before returning */
 }
 
Index: kernel/arch/ia32/Makefile.inc
===================================================================
--- kernel/arch/ia32/Makefile.inc	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/ia32/Makefile.inc	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -85,4 +85,5 @@
 	arch/$(KARCH)/src/smp/mps.c \
 	arch/$(KARCH)/src/smp/smp.c \
+	arch/$(KARCH)/src/smp/smp_call.c \
 	arch/$(KARCH)/src/atomic.S \
 	arch/$(KARCH)/src/smp/ipi.c \
Index: kernel/arch/ia32/include/arch/atomic.h
===================================================================
--- kernel/arch/ia32/include/arch/atomic.h	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/ia32/include/arch/atomic.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -1,4 +1,5 @@
 /*
  * Copyright (c) 2001-2004 Jakub Jermar
+ * Copyright (c) 2012      Adam Hraska
  * All rights reserved.
  *
@@ -113,4 +114,5 @@
 }
 
+
 /** ia32 specific fast spinlock */
 NO_TRACE static inline void atomic_lock_arch(atomic_t *val)
@@ -142,4 +144,106 @@
 }
 
+/* cmpxchg (with prefix) on *pptr sized 1/2/4 bytes; old_val gets the prior value. */
+#define _atomic_cas_impl(pptr, exp_val, new_val, old_val, prefix) \
+({ \
+	switch (sizeof(typeof(*(pptr)))) { \
+	case 1: \
+		asm volatile ( \
+			prefix " cmpxchgb %[newval], %[ptr]\n" \
+			: /* Output operands. */ \
+			/* Old/current value is returned in al. */ \
+			[oldval] "=a" (old_val), \
+			/* (*pptr) will be read and written to, hence "+" */ \
+			[ptr] "+m" (*(pptr)) \
+			: /* Input operands. */ \
+			/* Expected value must be in al. */ \
+			[expval] "a" (exp_val), \
+			/* Byte operand: only a, b, c, d have 8-bit forms, hence "q" */ \
+			[newval] "q" (new_val) \
+			: "memory" \
+		); \
+		break; \
+	case 2: \
+		asm volatile ( \
+			prefix " cmpxchgw %[newval], %[ptr]\n" \
+			: /* Output operands. */ \
+			/* Old/current value is returned in ax. */ \
+			[oldval] "=a" (old_val), \
+			/* (*pptr) will be read and written to, hence "+" */ \
+			[ptr] "+m" (*(pptr)) \
+			: /* Input operands. */ \
+			/* Expected value must be in ax. */ \
+			[expval] "a" (exp_val), \
+			/* The new value may be in any register. */ \
+			[newval] "r" (new_val) \
+			: "memory" \
+		); \
+		break; \
+	case 4: \
+		asm volatile ( \
+			prefix " cmpxchgl %[newval], %[ptr]\n" \
+			: /* Output operands. */ \
+			/* Old/current value is returned in eax. */ \
+			[oldval] "=a" (old_val), \
+			/* (*pptr) will be read and written to, hence "+" */ \
+			[ptr] "+m" (*(pptr)) \
+			: /* Input operands. */ \
+			/* Expected value must be in eax. */ \
+			[expval] "a" (exp_val), \
+			/* The new value may be in any register. */ \
+			[newval] "r" (new_val) \
+			: "memory" \
+		); \
+		break; \
+	} \
+})
+
+
+#ifndef local_atomic_cas
+/* CPU-local CAS: cmpxchg without a lock prefix; evaluates to the old *pptr. */
+#define local_atomic_cas(pptr, exp_val, new_val) \
+({ \
+	/* Use proper types and avoid name clashes */ \
+	typeof(*(pptr)) _old_val_cas; \
+	typeof(*(pptr)) _exp_val_cas = (exp_val); \
+	typeof(*(pptr)) _new_val_cas = (new_val); \
+	_atomic_cas_impl((pptr), _exp_val_cas, _new_val_cas, _old_val_cas, ""); \
+	\
+	_old_val_cas; \
+})
+
+#else
+/* Verify that arch/atomic.h does not accidentally include /atomic.h. */
+#error Architecture specific cpu local atomics already defined! Check your includes.
+#endif
+
+
+#ifndef local_atomic_exchange
+/*
+ * Stores new_val into *pptr (atomic with respect to the local cpu only)
+ * and evaluates to the previous value. An xchg with a memory operand
+ * always implies lock semantics, so an unlocked cmpxchg loop is cheaper.
+ */
+#define local_atomic_exchange(pptr, new_val) \
+({ \
+	/* Evaluate pptr once; use proper types and avoid name clashes */ \
+	typeof(pptr) _ptr_x = (pptr); \
+	typeof(*_ptr_x) _exp_val_x; \
+	typeof(*_ptr_x) _old_val_x; \
+	typeof(*_ptr_x) _new_val_x = (new_val); \
+	do { \
+		_exp_val_x = *_ptr_x; \
+		_old_val_x = local_atomic_cas(_ptr_x, _exp_val_x, _new_val_x); \
+	} while (_old_val_x != _exp_val_x); \
+	\
+	_old_val_x; \
+})
+
+#else
+/* Verify that arch/atomic.h does not accidentally include /atomic.h. */
+#error Architecture specific cpu local atomics already defined! Check your includes.
+#endif
+
+
 #endif
 
Index: kernel/arch/ia32/include/arch/cpu.h
===================================================================
--- kernel/arch/ia32/include/arch/cpu.h	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/ia32/include/arch/cpu.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -61,5 +61,7 @@
 	unsigned int stepping;
 	cpuid_feature_info_t fi;
-	
+
+	unsigned int id; /** CPU's local, ie physical, APIC ID. */
+
 	tss_t *tss;
 	
Index: kernel/arch/ia32/include/arch/interrupt.h
===================================================================
--- kernel/arch/ia32/include/arch/interrupt.h	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/ia32/include/arch/interrupt.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -71,4 +71,5 @@
 #define VECTOR_TLB_SHOOTDOWN_IPI  (IVT_FREEBASE + 1)
 #define VECTOR_DEBUG_IPI          (IVT_FREEBASE + 2)
+#define VECTOR_SMP_CALL_IPI       (IVT_FREEBASE + 3)
 
 extern void (* disable_irqs_function)(uint16_t);
Index: kernel/arch/ia32/include/arch/smp/apic.h
===================================================================
--- kernel/arch/ia32/include/arch/smp/apic.h	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/ia32/include/arch/smp/apic.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -353,4 +353,5 @@
 extern void l_apic_init(void);
 extern void l_apic_eoi(void);
+extern int l_apic_send_custom_ipi(uint8_t, uint8_t);
 extern int l_apic_broadcast_custom_ipi(uint8_t);
 extern int l_apic_send_init_ipi(uint8_t);
Index: kernel/arch/ia32/src/cpu/cpu.c
===================================================================
--- kernel/arch/ia32/src/cpu/cpu.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/ia32/src/cpu/cpu.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -160,7 +160,7 @@
 void cpu_print_report(cpu_t* cpu)
 {
-	printf("cpu%u: (%s family=%u model=%u stepping=%u) %" PRIu16 " MHz\n",
-		cpu->id, vendor_str[cpu->arch.vendor], cpu->arch.family,
-		cpu->arch.model, cpu->arch.stepping, cpu->frequency_mhz);
+	printf("cpu%u: (%s family=%u model=%u stepping=%u apicid=%u) %" PRIu16 
+		" MHz\n", cpu->id, vendor_str[cpu->arch.vendor], cpu->arch.family,
+		cpu->arch.model, cpu->arch.stepping, cpu->arch.id, cpu->frequency_mhz);
 }
 
Index: kernel/arch/ia32/src/ia32.c
===================================================================
--- kernel/arch/ia32/src/ia32.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/ia32/src/ia32.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -124,5 +124,5 @@
 }
 
-void arch_post_cpu_init()
+void arch_post_cpu_init(void)
 {
 #ifdef CONFIG_SMP
Index: kernel/arch/ia32/src/interrupt.c
===================================================================
--- kernel/arch/ia32/src/interrupt.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/ia32/src/interrupt.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -54,4 +54,6 @@
 #include <symtab.h>
 #include <stacktrace.h>
+#include <smp/smp_call.h>
+#include <proc/task.h>
 
 /*
@@ -170,4 +172,10 @@
 	tlb_shootdown_ipi_recv();
 }
+/** smp_call IPI handler: calls trap_virtual_eoi(), then smp_call_ipi_recv(). */
+static void arch_smp_call_ipi_recv(unsigned int n, istate_t *istate)
+{
+	trap_virtual_eoi();
+	smp_call_ipi_recv();
+}
 #endif
 
@@ -230,4 +238,6 @@
 	exc_register(VECTOR_TLB_SHOOTDOWN_IPI, "tlb_shootdown", true,
 	    (iroutine_t) tlb_shootdown_ipi);
+	exc_register(VECTOR_SMP_CALL_IPI, "smp_call", true,
+	    (iroutine_t) arch_smp_call_ipi_recv);
 #endif
 }
Index: kernel/arch/ia32/src/smp/apic.c
===================================================================
--- kernel/arch/ia32/src/smp/apic.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/ia32/src/smp/apic.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -264,22 +264,44 @@
 }
 
-#define DELIVS_PENDING_SILENT_RETRIES	4	
-
+/* Spins on ICRlo until the delivery status of the previous ipi is idle. */
 static void l_apic_wait_for_delivery(void)
 {
 	icr_t icr;
-	unsigned retries = 0;
-
+	
 	do {
-		if (retries++ > DELIVS_PENDING_SILENT_RETRIES) {
-			retries = 0;
-#ifdef CONFIG_DEBUG
-			log(LF_ARCH, LVL_DEBUG, "IPI is pending.");
-#endif
-			delay(20);
-		}
 		icr.lo = l_apic[ICRlo];
-	} while (icr.delivs == DELIVS_PENDING);
-	
+	} while (icr.delivs != DELIVS_IDLE);
+}
+
+/** Deliver an interrupt vector to a single CPU.
+ *
+ * @param apicid Physical APIC ID identifying the target CPU.
+ * @param vector Interrupt vector the target CPU should receive.
+ *
+ * @return 0 on failure, 1 on success.
+ */
+int l_apic_send_custom_ipi(uint8_t apicid, uint8_t vector)
+{
+	icr_t cmd;
+
+	/* The previous ipi must have been accepted before sending another. */
+	l_apic_wait_for_delivery();
+	
+	cmd.lo = l_apic[ICRlo];
+	cmd.hi = l_apic[ICRhi];
+	
+	cmd.dest = apicid;
+	cmd.vector = vector;
+	cmd.shorthand = SHORTHAND_NONE;
+	cmd.destmod = DESTMOD_PHYS;
+	cmd.delmod = DELMOD_FIXED;
+	cmd.trigger_mode = TRIGMOD_LEVEL;
+	cmd.level = LEVEL_ASSERT;
+
+	/* Writing the low word of the ICR is what actually sends the IPI. */
+	l_apic[ICRhi] = cmd.hi;
+	l_apic[ICRlo] = cmd.lo;
+	
+	return apic_poll_errors();
 }
 
@@ -294,4 +316,7 @@
 {
 	icr_t icr;
+
+	/* Wait for a destination cpu to accept our previous ipi. */
+	l_apic_wait_for_delivery();
 	
 	icr.lo = l_apic[ICRlo];
@@ -304,6 +329,4 @@
 	
 	l_apic[ICRlo] = icr.lo;
-
-	l_apic_wait_for_delivery();
 	
 	return apic_poll_errors();
Index: kernel/arch/ia32/src/smp/smp.c
===================================================================
--- kernel/arch/ia32/src/smp/smp.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/ia32/src/smp/smp.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -55,4 +55,5 @@
 #include <memstr.h>
 #include <arch/drivers/i8259.h>
+#include <cpu.h>
 
 #ifdef CONFIG_SMP
@@ -77,4 +78,14 @@
 		io_apic = (uint32_t *) km_map((uintptr_t) io_apic, PAGE_SIZE,
 		    PAGE_WRITE | PAGE_NOT_CACHEABLE);
+	}
+}
+
+static void cpu_arch_id_init(void)
+{
+	/* Store the APIC ID that ops reports for every configured cpu. */
+	ASSERT(ops != NULL);
+	ASSERT(cpus != NULL);
+	
+	for (unsigned int cpu_idx = 0; cpu_idx < config.cpu_count; cpu_idx++) {
+		cpus[cpu_idx].arch.id = ops->cpu_apic_id(cpu_idx);
 	}
 }
@@ -92,4 +103,10 @@
 	
 	ASSERT(ops != NULL);
+
+	/*
+	 * SMP initialized, cpus array allocated. Assign each CPU its 
+	 * physical APIC ID.
+	 */
+	cpu_arch_id_init();
 	
 	/*
Index: kernel/arch/ia32/src/smp/smp_call.c
===================================================================
--- kernel/arch/ia32/src/smp/smp_call.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
+++ kernel/arch/ia32/src/smp/smp_call.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup ia32
+ * @{
+ */
+/** @file
+ */
+
+#include <smp/smp_call.h>
+#include <arch/smp/apic.h>
+#include <arch/interrupt.h>
+#include <cpu.h>
+
+#ifdef CONFIG_SMP
+/** Sends VECTOR_SMP_CALL_IPI to cpus[cpu_id] via its stored APIC ID; result ignored. */
+void arch_smp_call_ipi(unsigned int cpu_id)
+{
+	(void) l_apic_send_custom_ipi(cpus[cpu_id].arch.id, VECTOR_SMP_CALL_IPI);
+}
+
+#endif /* CONFIG_SMP */
+
+/** @}
+ */
Index: kernel/arch/ia64/Makefile.inc
===================================================================
--- kernel/arch/ia64/Makefile.inc	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/ia64/Makefile.inc	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -61,4 +61,5 @@
 	arch/$(KARCH)/src/ddi/ddi.c \
 	arch/$(KARCH)/src/smp/smp.c \
+	arch/$(KARCH)/src/smp/smp_call.c \
 	arch/$(KARCH)/src/drivers/it.c
 
Index: kernel/arch/ia64/include/arch/interrupt.h
===================================================================
--- kernel/arch/ia64/include/arch/interrupt.h	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/ia64/include/arch/interrupt.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -36,16 +36,27 @@
 #define KERN_ia64_INTERRUPT_H_
 
+#ifndef __ASM__
 #include <typedefs.h>
 #include <arch/istate.h>
+#endif
+
+#define EXC_ALT_ITLB_FAULT	0xc
+#define EXC_ALT_DTLB_FAULT	0x10
+#define EXC_NESTED_TLB_FAULT	0x14
+#define EXC_DATA_D_BIT_FAULT	0x20
+#define EXC_INST_A_BIT_FAULT	0x24
+#define EXC_DATA_A_BIT_FAULT	0x28
+#define EXC_BREAK_INSTRUCTION	0x2c
+#define EXC_EXT_INTERRUPT	0x30
+#define EXC_PAGE_NOT_PRESENT	0x50
+#define EXC_DATA_AR_FAULT	0x53
+#define EXC_GENERAL_EXCEPTION	0x54
+#define EXC_DISABLED_FP_REG	0x55
+#define EXC_SPECULATION		0x57
 
 /** ia64 has 256 INRs. */
 #define INR_COUNT  256
 
-/*
- * We need to keep this just to compile.
- * We might eventually move interrupt/ stuff
- * to genarch.
- */
-#define IVT_ITEMS  0
+#define IVT_ITEMS  128 
 #define IVT_FIRST  0
 
@@ -72,14 +83,17 @@
 #define EOI  0  /**< The actual value doesn't matter. */
 
+#ifndef __ASM__
 extern void *ivt;
 
-extern void general_exception(uint64_t, istate_t *);
-extern int break_instruction(uint64_t, istate_t *);
-extern void universal_handler(uint64_t, istate_t *);
-extern void nop_handler(uint64_t, istate_t *);
-extern void external_interrupt(uint64_t, istate_t *);
-extern void disabled_fp_register(uint64_t, istate_t *);
+extern void general_exception(unsigned int, istate_t *);
+extern sysarg_t break_instruction(unsigned int, istate_t *);
+extern void universal_handler(unsigned int, istate_t *);
+extern void external_interrupt(unsigned int, istate_t *);
+extern void disabled_fp_register(unsigned int, istate_t *);
 
 extern void trap_virtual_enable_irqs(uint16_t);
+
+void exception_init(void);
+#endif
 
 #endif
Index: kernel/arch/ia64/include/arch/mm/tlb.h
===================================================================
--- kernel/arch/ia64/include/arch/mm/tlb.h	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/ia64/include/arch/mm/tlb.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -86,12 +86,12 @@
 extern void itc_pte_copy(pte_t *t);
 
-extern void alternate_instruction_tlb_fault(uint64_t vector, istate_t *istate);
-extern void alternate_data_tlb_fault(uint64_t vector, istate_t *istate);
-extern void data_nested_tlb_fault(uint64_t vector, istate_t *istate);
-extern void data_dirty_bit_fault(uint64_t vector, istate_t *istate);
-extern void instruction_access_bit_fault(uint64_t vector, istate_t *istate);
-extern void data_access_bit_fault(uint64_t vector, istate_t *istate);
-extern void data_access_rights_fault(uint64_t vector, istate_t *istate);
-extern void page_not_present(uint64_t vector, istate_t *istate);
+extern void alternate_instruction_tlb_fault(unsigned int, istate_t *);
+extern void alternate_data_tlb_fault(unsigned int, istate_t *);
+extern void data_nested_tlb_fault(unsigned int, istate_t *);
+extern void data_dirty_bit_fault(unsigned int, istate_t *);
+extern void instruction_access_bit_fault(unsigned int, istate_t *);
+extern void data_access_bit_fault(unsigned int, istate_t *);
+extern void data_access_rights_fault(unsigned int, istate_t *);
+extern void page_not_present(unsigned int, istate_t *);
 
 #endif
Index: kernel/arch/ia64/src/ia64.c
===================================================================
--- kernel/arch/ia64/src/ia64.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/ia64/src/ia64.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -37,4 +37,5 @@
 #include <errno.h>
 #include <interrupt.h>
+#include <arch/interrupt.h>
 #include <macros.h>
 #include <str.h>
@@ -85,4 +86,6 @@
 void arch_pre_mm_init(void)
 {
+	if (config.cpu_active == 1)
+		exception_init();
 }
 
Index: kernel/arch/ia64/src/interrupt.c
===================================================================
--- kernel/arch/ia64/src/interrupt.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/ia64/src/interrupt.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -54,4 +54,5 @@
 #include <synch/spinlock.h>
 #include <mm/tlb.h>
+#include <arch/mm/tlb.h>
 #include <symtab.h>
 #include <putchar.h>
@@ -59,9 +60,7 @@
 #define VECTORS_64_BUNDLE        20
 #define VECTORS_16_BUNDLE        48
-#define VECTORS_16_BUNDLE_START  0x5000
-
-#define VECTOR_MAX  0x7f00
-
-#define BUNDLE_SIZE  16
+#define VECTORS_16_BUNDLE_START  0x50
+
+#define VECTOR_MAX  0x7f
 
 static const char *vector_names_64_bundle[VECTORS_64_BUNDLE] = {
@@ -122,13 +121,12 @@
 };
 
-static const char *vector_to_string(uint16_t vector)
-{
-	ASSERT(vector <= VECTOR_MAX);
-	
-	if (vector >= VECTORS_16_BUNDLE_START)
-		return vector_names_16_bundle[(vector -
-		    VECTORS_16_BUNDLE_START) / (16 * BUNDLE_SIZE)];
+static const char *vector_to_string(unsigned int n)
+{
+	ASSERT(n <= VECTOR_MAX);
+	/* 16-bundle names: one per vector; 64-bundle names: one per 4. */
+	if (n >= VECTORS_16_BUNDLE_START)
+		return vector_names_16_bundle[n - VECTORS_16_BUNDLE_START];
 	else
-		return vector_names_64_bundle[vector / (64 * BUNDLE_SIZE)];
+		return vector_names_64_bundle[n / 4];
 }
 
@@ -153,5 +151,5 @@
 }
 
-void general_exception(uint64_t vector, istate_t *istate)
+void general_exception(unsigned int n, istate_t *istate)
 {
 	const char *desc;
@@ -182,8 +180,8 @@
 	
 	fault_if_from_uspace(istate, "General Exception (%s).", desc);
-	panic_badtrap(istate, vector, "General Exception (%s).", desc);
-}
-
-void disabled_fp_register(uint64_t vector, istate_t *istate)
+	panic_badtrap(istate, n, "General Exception (%s).", desc);
+}
+
+void disabled_fp_register(unsigned int n, istate_t *istate)
 {
 #ifdef CONFIG_FPU_LAZY
@@ -191,17 +189,15 @@
 #else
 	fault_if_from_uspace(istate, "Interruption: %#hx (%s).",
-	    (uint16_t) vector, vector_to_string(vector));
-	panic_badtrap(istate, vector, "Interruption: %#hx (%s).",
-	    (uint16_t) vector, vector_to_string(vector));
+	    (uint16_t) n, vector_to_string(n));
+	panic_badtrap(istate, n, "Interruption: %#hx (%s).",
+	    (uint16_t) n, vector_to_string(n));
 #endif
 }
 
-void nop_handler(uint64_t vector, istate_t *istate)
-{
-}
-
 /** Handle syscall. */
-int break_instruction(uint64_t vector, istate_t *istate)
-{
+sysarg_t break_instruction(unsigned int n, istate_t *istate)
+{
+	sysarg_t ret;
+
 	/*
 	 * Move to next instruction after BREAK.
@@ -214,14 +210,18 @@
 	}
 	
-	return syscall_handler(istate->in0, istate->in1, istate->in2,
+	interrupts_enable();
+	ret = syscall_handler(istate->in0, istate->in1, istate->in2,
 	    istate->in3, istate->in4, istate->in5, istate->in6);
-}
-
-void universal_handler(uint64_t vector, istate_t *istate)
+	interrupts_disable();
+
+	return ret;
+}
+
+void universal_handler(unsigned int n, istate_t *istate)
 {
 	fault_if_from_uspace(istate, "Interruption: %#hx (%s).",
-	    (uint16_t) vector, vector_to_string(vector));
-	panic_badtrap(istate, vector, "Interruption: %#hx (%s).",
-	    (uint16_t) vector, vector_to_string(vector));
+	    n, vector_to_string(n));
+	panic_badtrap(istate, n, "Interruption: %#hx (%s).",
+	    n, vector_to_string(n));
 }
 
@@ -229,9 +229,9 @@
 {
 	asm volatile (
-		"mov cr.eoi=r0;;"
+		"mov cr.eoi = r0 ;;"
 	);
 }
 
-void external_interrupt(uint64_t vector, istate_t *istate)
+void external_interrupt(unsigned int n, istate_t *istate)
 {
 	cr_ivr_t ivr;
@@ -298,4 +298,47 @@
 }
 
+void exception_init(void)
+{
+	unsigned int i;
+	/* Default every one of the IVT_ITEMS slots to universal_handler. */
+	for (i = 0; i < IVT_ITEMS; i++)
+		exc_register(i, "universal_handler", false, universal_handler);
+	/* Then override the vectors that have a dedicated handler. */
+	exc_register(EXC_ALT_ITLB_FAULT,
+	    vector_to_string(EXC_ALT_ITLB_FAULT), true,
+	    alternate_instruction_tlb_fault);
+	exc_register(EXC_ALT_DTLB_FAULT,
+	    vector_to_string(EXC_ALT_DTLB_FAULT), true,
+	    alternate_data_tlb_fault);
+	exc_register(EXC_NESTED_TLB_FAULT,
+	    vector_to_string(EXC_NESTED_TLB_FAULT), false,
+	    data_nested_tlb_fault);
+	exc_register(EXC_DATA_D_BIT_FAULT,
+	    vector_to_string(EXC_DATA_D_BIT_FAULT), true,
+	    data_dirty_bit_fault);
+	exc_register(EXC_INST_A_BIT_FAULT,
+	    vector_to_string(EXC_INST_A_BIT_FAULT), true,
+	    instruction_access_bit_fault);
+	exc_register(EXC_DATA_A_BIT_FAULT, 
+	    vector_to_string(EXC_DATA_A_BIT_FAULT), true,
+	    data_access_bit_fault);
+	exc_register(EXC_EXT_INTERRUPT,
+	    vector_to_string(EXC_EXT_INTERRUPT), true,
+	    external_interrupt);
+
+	exc_register(EXC_PAGE_NOT_PRESENT,
+	    vector_to_string(EXC_PAGE_NOT_PRESENT), true,
+	    page_not_present);
+	exc_register(EXC_DATA_AR_FAULT,
+	    vector_to_string(EXC_DATA_AR_FAULT), true,
+	    data_access_rights_fault);
+	exc_register(EXC_GENERAL_EXCEPTION,
+	    vector_to_string(EXC_GENERAL_EXCEPTION), false,
+	    general_exception);
+	exc_register(EXC_DISABLED_FP_REG,
+	    vector_to_string(EXC_DISABLED_FP_REG), true,
+	    disabled_fp_register);
+}
+
 /** @}
  */
Index: kernel/arch/ia64/src/ivt.S
===================================================================
--- kernel/arch/ia64/src/ivt.S	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/ia64/src/ivt.S	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -31,4 +31,5 @@
 #include <arch/register.h>
 #include <arch/mm/page.h>
+#include <arch/interrupt.h>
 #include <arch/istate_struct.h>
 #include <align.h>
@@ -39,14 +40,13 @@
 
 /** Partitioning of bank 0 registers. */
-#define R_OFFS 		r16
+#define R_VECTOR	r16
 #define R_HANDLER	r17
 #define R_RET		r18
-#define R_TMP		r19
 #define R_KSTACK_BSP	r22	/* keep in sync with before_thread_runs_arch() */
 #define R_KSTACK	r23	/* keep in sync with before_thread_runs_arch() */
 
 /* Speculation vector handler */
-.macro SPECULATION_VECTOR_HANDLER offs
-    .org ivt + \offs
+.macro SPECULATION_VECTOR_HANDLER vector 
+    .org ivt + \vector * 0x100
 
     /* 1. Save predicates, IIM, IIP, IPSR and ISR CR's in bank 0 registers. */
@@ -94,7 +94,7 @@
  * @param handler Interrupt handler address.
  */
-.macro HEAVYWEIGHT_HANDLER offs, handler=universal_handler
-    .org ivt + \offs
-	mov R_OFFS = \offs
+.macro HEAVYWEIGHT_HANDLER vector, handler=exc_dispatch
+    .org ivt + \vector * 0x100
+	mov R_VECTOR = \vector
 	movl R_HANDLER = \handler ;;
 	br heavyweight_handler
@@ -165,6 +165,5 @@
 	 * copy input parameters to stack.
 	 */
-    	mov R_TMP = 0x2c00 ;;
-	cmp.eq p6, p5 = R_OFFS, R_TMP ;;
+	cmp.eq p6, p5 = EXC_BREAK_INSTRUCTION, R_VECTOR ;;
 	
 	/*
@@ -309,5 +308,5 @@
 	mov loc1 = R_RET	/* b0 belonging to interrupted context */
 	mov loc2 = R_HANDLER
-	mov out0 = R_OFFS
+	mov out0 = R_VECTOR
 	
 	add out1 = STACK_SCRATCH_AREA_SIZE, r12
@@ -543,73 +542,73 @@
 .align 32768
 ivt:
-	HEAVYWEIGHT_HANDLER 0x0000
-	HEAVYWEIGHT_HANDLER 0x0400
-	HEAVYWEIGHT_HANDLER 0x0800
-	HEAVYWEIGHT_HANDLER 0x0c00 alternate_instruction_tlb_fault
-	HEAVYWEIGHT_HANDLER 0x1000 alternate_data_tlb_fault
-	HEAVYWEIGHT_HANDLER 0x1400 data_nested_tlb_fault
-	HEAVYWEIGHT_HANDLER 0x1800
-	HEAVYWEIGHT_HANDLER 0x1c00
-	HEAVYWEIGHT_HANDLER 0x2000 data_dirty_bit_fault
-	HEAVYWEIGHT_HANDLER 0x2400 instruction_access_bit_fault
-	HEAVYWEIGHT_HANDLER 0x2800 data_access_bit_fault
-	HEAVYWEIGHT_HANDLER 0x2c00 break_instruction
-	HEAVYWEIGHT_HANDLER 0x3000 external_interrupt	/* For external interrupt, heavyweight handler is used. */
-	HEAVYWEIGHT_HANDLER 0x3400
-	HEAVYWEIGHT_HANDLER 0x3800
-	HEAVYWEIGHT_HANDLER 0x3c00
-	HEAVYWEIGHT_HANDLER 0x4000
-	HEAVYWEIGHT_HANDLER 0x4400
-	HEAVYWEIGHT_HANDLER 0x4800
-	HEAVYWEIGHT_HANDLER 0x4c00
-
-	HEAVYWEIGHT_HANDLER 0x5000 page_not_present
-	HEAVYWEIGHT_HANDLER 0x5100
-	HEAVYWEIGHT_HANDLER 0x5200
-	HEAVYWEIGHT_HANDLER 0x5300 data_access_rights_fault
-	HEAVYWEIGHT_HANDLER 0x5400 general_exception
-	HEAVYWEIGHT_HANDLER 0x5500 disabled_fp_register
-	HEAVYWEIGHT_HANDLER 0x5600
-	SPECULATION_VECTOR_HANDLER 0x5700
-	HEAVYWEIGHT_HANDLER 0x5800
-	HEAVYWEIGHT_HANDLER 0x5900
-	HEAVYWEIGHT_HANDLER 0x5a00
-	HEAVYWEIGHT_HANDLER 0x5b00
-	HEAVYWEIGHT_HANDLER 0x5c00
-	HEAVYWEIGHT_HANDLER 0x5d00 
-	HEAVYWEIGHT_HANDLER 0x5e00
-	HEAVYWEIGHT_HANDLER 0x5f00
-	
-	HEAVYWEIGHT_HANDLER 0x6000
-	HEAVYWEIGHT_HANDLER 0x6100
-	HEAVYWEIGHT_HANDLER 0x6200
-	HEAVYWEIGHT_HANDLER 0x6300
-	HEAVYWEIGHT_HANDLER 0x6400
-	HEAVYWEIGHT_HANDLER 0x6500
-	HEAVYWEIGHT_HANDLER 0x6600
-	HEAVYWEIGHT_HANDLER 0x6700
-	HEAVYWEIGHT_HANDLER 0x6800
-	HEAVYWEIGHT_HANDLER 0x6900
-	HEAVYWEIGHT_HANDLER 0x6a00
-	HEAVYWEIGHT_HANDLER 0x6b00
-	HEAVYWEIGHT_HANDLER 0x6c00
-	HEAVYWEIGHT_HANDLER 0x6d00
-	HEAVYWEIGHT_HANDLER 0x6e00
-	HEAVYWEIGHT_HANDLER 0x6f00
-
-	HEAVYWEIGHT_HANDLER 0x7000
-	HEAVYWEIGHT_HANDLER 0x7100
-	HEAVYWEIGHT_HANDLER 0x7200
-	HEAVYWEIGHT_HANDLER 0x7300
-	HEAVYWEIGHT_HANDLER 0x7400
-	HEAVYWEIGHT_HANDLER 0x7500
-	HEAVYWEIGHT_HANDLER 0x7600
-	HEAVYWEIGHT_HANDLER 0x7700
-	HEAVYWEIGHT_HANDLER 0x7800
-	HEAVYWEIGHT_HANDLER 0x7900
-	HEAVYWEIGHT_HANDLER 0x7a00
-	HEAVYWEIGHT_HANDLER 0x7b00
-	HEAVYWEIGHT_HANDLER 0x7c00
-	HEAVYWEIGHT_HANDLER 0x7d00
-	HEAVYWEIGHT_HANDLER 0x7e00
-	HEAVYWEIGHT_HANDLER 0x7f00
+	HEAVYWEIGHT_HANDLER 0x00
+	HEAVYWEIGHT_HANDLER 0x04
+	HEAVYWEIGHT_HANDLER 0x08
+	HEAVYWEIGHT_HANDLER 0x0c
+	HEAVYWEIGHT_HANDLER 0x10
+	HEAVYWEIGHT_HANDLER 0x14
+	HEAVYWEIGHT_HANDLER 0x18
+	HEAVYWEIGHT_HANDLER 0x1c
+	HEAVYWEIGHT_HANDLER 0x20
+	HEAVYWEIGHT_HANDLER 0x24
+	HEAVYWEIGHT_HANDLER 0x28
+	HEAVYWEIGHT_HANDLER 0x2c break_instruction
+	HEAVYWEIGHT_HANDLER 0x30
+	HEAVYWEIGHT_HANDLER 0x34
+	HEAVYWEIGHT_HANDLER 0x38
+	HEAVYWEIGHT_HANDLER 0x3c
+	HEAVYWEIGHT_HANDLER 0x40
+	HEAVYWEIGHT_HANDLER 0x44
+	HEAVYWEIGHT_HANDLER 0x48
+	HEAVYWEIGHT_HANDLER 0x4c
+
+	HEAVYWEIGHT_HANDLER 0x50
+	HEAVYWEIGHT_HANDLER 0x51
+	HEAVYWEIGHT_HANDLER 0x52
+	HEAVYWEIGHT_HANDLER 0x53
+	HEAVYWEIGHT_HANDLER 0x54
+	HEAVYWEIGHT_HANDLER 0x55
+	HEAVYWEIGHT_HANDLER 0x56
+	SPECULATION_VECTOR_HANDLER 0x57
+	HEAVYWEIGHT_HANDLER 0x58
+	HEAVYWEIGHT_HANDLER 0x59
+	HEAVYWEIGHT_HANDLER 0x5a
+	HEAVYWEIGHT_HANDLER 0x5b
+	HEAVYWEIGHT_HANDLER 0x5c
+	HEAVYWEIGHT_HANDLER 0x5d 
+	HEAVYWEIGHT_HANDLER 0x5e
+	HEAVYWEIGHT_HANDLER 0x5f
+	
+	HEAVYWEIGHT_HANDLER 0x60
+	HEAVYWEIGHT_HANDLER 0x61
+	HEAVYWEIGHT_HANDLER 0x62
+	HEAVYWEIGHT_HANDLER 0x63
+	HEAVYWEIGHT_HANDLER 0x64
+	HEAVYWEIGHT_HANDLER 0x65
+	HEAVYWEIGHT_HANDLER 0x66
+	HEAVYWEIGHT_HANDLER 0x67
+	HEAVYWEIGHT_HANDLER 0x68
+	HEAVYWEIGHT_HANDLER 0x69
+	HEAVYWEIGHT_HANDLER 0x6a
+	HEAVYWEIGHT_HANDLER 0x6b
+	HEAVYWEIGHT_HANDLER 0x6c
+	HEAVYWEIGHT_HANDLER 0x6d
+	HEAVYWEIGHT_HANDLER 0x6e
+	HEAVYWEIGHT_HANDLER 0x6f
+
+	HEAVYWEIGHT_HANDLER 0x70
+	HEAVYWEIGHT_HANDLER 0x71
+	HEAVYWEIGHT_HANDLER 0x72
+	HEAVYWEIGHT_HANDLER 0x73
+	HEAVYWEIGHT_HANDLER 0x74
+	HEAVYWEIGHT_HANDLER 0x75
+	HEAVYWEIGHT_HANDLER 0x76
+	HEAVYWEIGHT_HANDLER 0x77
+	HEAVYWEIGHT_HANDLER 0x78
+	HEAVYWEIGHT_HANDLER 0x79
+	HEAVYWEIGHT_HANDLER 0x7a
+	HEAVYWEIGHT_HANDLER 0x7b
+	HEAVYWEIGHT_HANDLER 0x7c
+	HEAVYWEIGHT_HANDLER 0x7d
+	HEAVYWEIGHT_HANDLER 0x7e
+	HEAVYWEIGHT_HANDLER 0x7f
Index: kernel/arch/ia64/src/mm/tlb.c
===================================================================
--- kernel/arch/ia64/src/mm/tlb.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/ia64/src/mm/tlb.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -477,9 +477,9 @@
 /** Instruction TLB fault handler for faults with VHPT turned off.
  *
- * @param vector Interruption vector.
- * @param istate Structure with saved interruption state.
- *
- */
-void alternate_instruction_tlb_fault(uint64_t vector, istate_t *istate)
+ * @param n Interruption vector.
+ * @param istate Structure with saved interruption state.
+ *
+ */
+void alternate_instruction_tlb_fault(unsigned int n, istate_t *istate)
 {
 	uintptr_t va;
@@ -566,9 +566,9 @@
 /** Data TLB fault handler for faults with VHPT turned off.
  *
- * @param vector Interruption vector.
- * @param istate Structure with saved interruption state.
- *
- */
-void alternate_data_tlb_fault(uint64_t vector, istate_t *istate)
+ * @param n Interruption vector.
+ * @param istate Structure with saved interruption state.
+ *
+ */
+void alternate_data_tlb_fault(unsigned int n, istate_t *istate)
 {
 	if (istate->cr_isr.sp) {
@@ -623,9 +623,9 @@
  * This fault should not occur.
  *
- * @param vector Interruption vector.
- * @param istate Structure with saved interruption state.
- *
- */
-void data_nested_tlb_fault(uint64_t vector, istate_t *istate)
+ * @param n Interruption vector.
+ * @param istate Structure with saved interruption state.
+ *
+ */
+void data_nested_tlb_fault(unsigned int n, istate_t *istate)
 {
 	ASSERT(false);
@@ -634,9 +634,9 @@
 /** Data Dirty bit fault handler.
  *
- * @param vector Interruption vector.
- * @param istate Structure with saved interruption state.
- *
- */
-void data_dirty_bit_fault(uint64_t vector, istate_t *istate)
+ * @param n Interruption vector.
+ * @param istate Structure with saved interruption state.
+ *
+ */
+void data_dirty_bit_fault(unsigned int n, istate_t *istate)
 {
 	uintptr_t va;
@@ -665,9 +665,9 @@
 /** Instruction access bit fault handler.
  *
- * @param vector Interruption vector.
- * @param istate Structure with saved interruption state.
- *
- */
-void instruction_access_bit_fault(uint64_t vector, istate_t *istate)
+ * @param n Interruption vector.
+ * @param istate Structure with saved interruption state.
+ *
+ */
+void instruction_access_bit_fault(unsigned int n, istate_t *istate)
 {
 	uintptr_t va;
@@ -694,9 +694,9 @@
 /** Data access bit fault handler.
  *
- * @param vector Interruption vector.
- * @param istate Structure with saved interruption state.
- *
- */
-void data_access_bit_fault(uint64_t vector, istate_t *istate)
+ * @param n Interruption vector.
+ * @param istate Structure with saved interruption state.
+ *
+ */
+void data_access_bit_fault(unsigned int n, istate_t *istate)
 {
 	uintptr_t va;
@@ -729,9 +729,9 @@
 /** Data access rights fault handler.
  *
- * @param vector Interruption vector.
- * @param istate Structure with saved interruption state.
- *
- */
-void data_access_rights_fault(uint64_t vector, istate_t *istate)
+ * @param n Interruption vector.
+ * @param istate Structure with saved interruption state.
+ *
+ */
+void data_access_rights_fault(unsigned int n, istate_t *istate)
 {
 	uintptr_t va;
@@ -753,9 +753,9 @@
 /** Page not present fault handler.
  *
- * @param vector Interruption vector.
- * @param istate Structure with saved interruption state.
- *
- */
-void page_not_present(uint64_t vector, istate_t *istate)
+ * @param n Interruption vector.
+ * @param istate Structure with saved interruption state.
+ *
+ */
+void page_not_present(unsigned int n, istate_t *istate)
 {
 	uintptr_t va;
Index: kernel/arch/ia64/src/smp/smp_call.c
===================================================================
--- kernel/arch/ia64/src/smp/smp_call.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
+++ kernel/arch/ia64/src/smp/smp_call.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup ia64
+ * @{
+ */
+/** @file
+ */
+
+#include <smp/smp_call.h>
+#include <panic.h>
+
+#ifdef CONFIG_SMP
+/** Stub: the smp_call IPI is not implemented here, so any call panics. */
+void arch_smp_call_ipi(unsigned int cpu_id)
+{
+	panic("smp_call IPI not implemented.");
+}
+
+#endif /* CONFIG_SMP */
+
+/** @}
+ */
Index: kernel/arch/mips32/Makefile.inc
===================================================================
--- kernel/arch/mips32/Makefile.inc	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/mips32/Makefile.inc	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -71,4 +71,5 @@
 	arch/$(KARCH)/src/fpu_context.c \
 	arch/$(KARCH)/src/smp/smp.c \
+	arch/$(KARCH)/src/smp/smp_call.c \
 	arch/$(KARCH)/src/machine_func.c
 
Index: kernel/arch/mips32/src/smp/smp_call.c
===================================================================
--- kernel/arch/mips32/src/smp/smp_call.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
+++ kernel/arch/mips32/src/smp/smp_call.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup mips32
+ * @{
+ */
+/** @file
+ */
+
+#include <smp/smp_call.h>
+#include <panic.h>
+
+#ifdef CONFIG_SMP
+/** Stub: the smp_call IPI is not implemented here, so any call panics. */
+void arch_smp_call_ipi(unsigned int cpu_id)
+{
+	panic("smp_call IPI not implemented.");
+}
+
+#endif /* CONFIG_SMP */
+
+/** @}
+ */
Index: kernel/arch/sparc32/include/arch/stack.h
===================================================================
--- kernel/arch/sparc32/include/arch/stack.h	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/sparc32/include/arch/stack.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -54,15 +54,4 @@
 #define STACK_ARG_SAVE_AREA_SIZE  (6 * STACK_ITEM_SIZE)
 
-/**
- * Offsets of arguments on stack.
- */
-#define STACK_ARG0  0
-#define STACK_ARG1  4
-#define STACK_ARG2  8
-#define STACK_ARG3  12
-#define STACK_ARG4  16
-#define STACK_ARG5  20
-#define STACK_ARG6  24
-
 #endif
 
Index: kernel/arch/sparc32/src/cpu/cpu.c
===================================================================
--- kernel/arch/sparc32/src/cpu/cpu.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/sparc32/src/cpu/cpu.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -34,4 +34,5 @@
 
 #include <arch/cpu.h>
+#include <cpu.h>
 #include <arch.h>
 #include <typedefs.h>
Index: kernel/arch/sparc32/src/debug/stacktrace.c
===================================================================
--- kernel/arch/sparc32/src/debug/stacktrace.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/sparc32/src/debug/stacktrace.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -39,4 +39,5 @@
 #include <arch.h>
 #include <arch/stack.h>
+#include <proc/thread.h>
 
 #define FRAME_OFFSET_FP_PREV  14
Index: kernel/arch/sparc32/src/sparc32.c
===================================================================
--- kernel/arch/sparc32/src/sparc32.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/sparc32/src/sparc32.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -37,4 +37,5 @@
 #include <arch/interrupt.h>
 #include <arch/asm.h>
+#include <arch/barrier.h>
 #include <arch/machine_func.h>
 #include <func.h>
@@ -169,4 +170,27 @@
 }
 
+bool __atomic_compare_exchange_4(uint32_t *ptr, uint32_t *expected,
+    uint32_t desired, bool weak, int success_mm, int failure_mm)
+{
+	ipl_t ipl;
+	bool success;
+
+	/* XXX: Dummy version; weak, success_mm and failure_mm are ignored. */
+
+	ipl = interrupts_disable();	/* compare and store run with IRQs off */
+	memory_barrier();
+	if (*ptr == *expected) {
+		success = true;
+		*ptr = desired;
+	} else {
+		success = false;
+		*expected = *ptr;	/* hand the value found back to the caller */
+	}
+	memory_barrier();
+	interrupts_restore(ipl);
+
+	return success;
+}
+
 /** @}
  */
Index: kernel/arch/sparc64/Makefile.inc
===================================================================
--- kernel/arch/sparc64/Makefile.inc	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/sparc64/Makefile.inc	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -100,4 +100,5 @@
 	ARCH_SOURCES += \
 		arch/$(KARCH)/src/smp/$(USARCH)/smp.c \
+		arch/$(KARCH)/src/smp/$(USARCH)/smp_call.c \
 		arch/$(KARCH)/src/smp/$(USARCH)/ipi.c
 endif
Index: kernel/arch/sparc64/include/arch/barrier.h
===================================================================
--- kernel/arch/sparc64/include/arch/barrier.h	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/sparc64/include/arch/barrier.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -37,4 +37,10 @@
 
 #include <trace.h>
+
+#ifdef KERNEL
+#include <arch/common.h>
+#else
+#include <libarch/common.h>
+#endif
 
 /*
Index: kernel/arch/sparc64/include/arch/interrupt.h
===================================================================
--- kernel/arch/sparc64/include/arch/interrupt.h	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/sparc64/include/arch/interrupt.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -40,6 +40,6 @@
 #include <arch/istate.h>
 
-#define IVT_ITEMS  15
-#define IVT_FIRST  1
+#define IVT_ITEMS  512
+#define IVT_FIRST  0 
 
 /* This needs to be defined for inter-architecture API portability. */
@@ -47,6 +47,9 @@
 
 enum {
-	IPI_TLB_SHOOTDOWN = VECTOR_TLB_SHOOTDOWN_IPI
+	IPI_TLB_SHOOTDOWN = VECTOR_TLB_SHOOTDOWN_IPI,
+	IPI_SMP_CALL
 };
+
+extern void exc_arch_init(void);
 
 #endif
Index: kernel/arch/sparc64/include/arch/istate_struct.ag
===================================================================
--- kernel/arch/sparc64/include/arch/istate_struct.ag	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/sparc64/include/arch/istate_struct.ag	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -41,4 +41,98 @@
 
         members : [
+
+                #
+                # Window save area for locals and inputs. Required by ABI.
+                # Before using these, make sure that the corresponding register
+                # window has been spilled into memory, otherwise l0-l7 and
+                # i0-i7 will have undefined values.
+                #
+                {
+                        name : l0,
+                        type : uint64_t,
+                },
+                {
+                        name : l1,
+                        type : uint64_t,
+                },
+                {
+                        name : l2,
+                        type : uint64_t,
+                },
+                {
+                        name : l3,
+                        type : uint64_t,
+                },
+                {
+                        name : l4,
+                        type : uint64_t,
+                },
+                {
+                        name : l5,
+                        type : uint64_t,
+                },
+                {
+                        name : l6,
+                        type : uint64_t,
+                },
+                {
+                        name : l7,
+                        type : uint64_t,
+                },
+                {
+                        name : i0,
+                        type : uint64_t,
+                },
+                {
+                        name : i1,
+                        type : uint64_t,
+                },
+                {
+                        name : i2,
+                        type : uint64_t,
+                },
+                {
+                        name : i3,
+                        type : uint64_t,
+                },
+                {
+                        name : i4,
+                        type : uint64_t,
+                },
+                {
+                        name : i5,
+                        type : uint64_t,
+                },
+                {
+                        name : i6,
+                        type : uint64_t,
+                },
+                {
+                        name : i7,
+                        type : uint64_t,
+                },
+
+                #
+                # Six mandatory argument slots, required by the ABI, plus an
+                # optional argument slot for the 7th argument used by our
+                # syscalls. Since the preemptible handler is always passing
+                # integral arguments, undef_arg[0] - undef_arg[5] are always
+                # undefined.
+                #
+                {
+                        name : undef_arg,
+                        type : uint64_t,
+                        elements : 6,
+                },
+                {
+                        name : arg6,
+                        type : uint64_t,
+                },
+
+                #
+                # From this point onwards, the istate layout is not dictated by
+                # the ABI. The only requirement is the stack alignment.
+                #
+
                 {
                         name : tnpc,
@@ -51,4 +145,53 @@
                 {
                         name : tstate,
+                        type : uint64_t
+                },
+                {
+                        name : y,
+                        type : uint64_t,
+                },
+
+                #
+                # At the moment, these are defined only when needed by the
+                # preemptible handler, so consider them undefined for now.
+                #
+                {
+                        name : o0,
+                        type : uint64_t,
+                },
+                {
+                        name : o1,
+                        type : uint64_t,
+                },
+                {
+                        name : o2,
+                        type : uint64_t,
+                },
+                {
+                        name : o3,
+                        type : uint64_t,
+                },
+                {
+                        name : o4,
+                        type : uint64_t,
+                },
+                {
+                        name : o5,
+                        type : uint64_t,
+                },
+                {
+                        name : o6,
+                        type : uint64_t,
+                },
+                {
+                        name : o7,
+                        type : uint64_t,
+                },
+
+                #
+                # I/DTLB Tag Access register or zero for non-MMU traps.
+                #
+                {
+                        name : tlb_tag_access,
                         type : uint64_t
                 }
Index: kernel/arch/sparc64/include/arch/mm/sun4u/tlb.h
===================================================================
--- kernel/arch/sparc64/include/arch/mm/sun4u/tlb.h	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/sparc64/include/arch/mm/sun4u/tlb.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -678,7 +678,7 @@
 }
 
-extern void fast_instruction_access_mmu_miss(sysarg_t, istate_t *);
-extern void fast_data_access_mmu_miss(tlb_tag_access_reg_t, istate_t *);
-extern void fast_data_access_protection(tlb_tag_access_reg_t , istate_t *);
+extern void fast_instruction_access_mmu_miss(unsigned int, istate_t *);
+extern void fast_data_access_mmu_miss(unsigned int, istate_t *);
+extern void fast_data_access_protection(unsigned int, istate_t *);
 
 extern void dtlb_insert_mapping(uintptr_t, uintptr_t, int, bool, bool);
Index: kernel/arch/sparc64/include/arch/mm/sun4v/tlb.h
===================================================================
--- kernel/arch/sparc64/include/arch/mm/sun4v/tlb.h	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/sparc64/include/arch/mm/sun4v/tlb.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -141,7 +141,7 @@
 }
 
-extern void fast_instruction_access_mmu_miss(sysarg_t, istate_t *);
-extern void fast_data_access_mmu_miss(sysarg_t, istate_t *);
-extern void fast_data_access_protection(sysarg_t, istate_t *);
+extern void fast_instruction_access_mmu_miss(unsigned int, istate_t *);
+extern void fast_data_access_mmu_miss(unsigned int, istate_t *);
+extern void fast_data_access_protection(unsigned int, istate_t *);
 
 extern void dtlb_insert_mapping(uintptr_t, uintptr_t, int, bool, bool);
Index: kernel/arch/sparc64/include/arch/smp/sun4u/ipi.h
===================================================================
--- kernel/arch/sparc64/include/arch/smp/sun4u/ipi.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
+++ kernel/arch/sparc64/include/arch/smp/sun4u/ipi.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup sparc64	
+ * @{
+ */
+/**
+ * @file
+ * @brief	IPI functions specific to Sun4U.
+ */
+
+#ifndef KERN_sparc64_sun4u_IPI_H_
+#define KERN_sparc64_sun4u_IPI_H_
+
+extern void ipi_unicast_arch(unsigned int, int);
+
+#endif
+
+/** @}
+ */
Index: kernel/arch/sparc64/include/arch/trap/exception.h
===================================================================
--- kernel/arch/sparc64/include/arch/trap/exception.h	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/sparc64/include/arch/trap/exception.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -71,23 +71,23 @@
 extern void dump_istate(istate_t *istate);
 
-extern void instruction_access_exception(int n, istate_t *istate);
-extern void instruction_access_error(int n, istate_t *istate);
-extern void illegal_instruction(int n, istate_t *istate);
-extern void privileged_opcode(int n, istate_t *istate);
-extern void unimplemented_LDD(int n, istate_t *istate);
-extern void unimplemented_STD(int n, istate_t *istate);
-extern void fp_disabled(int n, istate_t *istate);
-extern void fp_exception_ieee_754(int n, istate_t *istate);
-extern void fp_exception_other(int n, istate_t *istate);
-extern void tag_overflow(int n, istate_t *istate);
-extern void division_by_zero(int n, istate_t *istate);
-extern void data_access_exception(int n, istate_t *istate);
-extern void data_access_error(int n, istate_t *istate);
-extern void mem_address_not_aligned(int n, istate_t *istate);
-extern void LDDF_mem_address_not_aligned(int n, istate_t *istate);
-extern void STDF_mem_address_not_aligned(int n, istate_t *istate);
-extern void privileged_action(int n, istate_t *istate);
-extern void LDQF_mem_address_not_aligned(int n, istate_t *istate);
-extern void STQF_mem_address_not_aligned(int n, istate_t *istate);
+extern void instruction_access_exception(unsigned int, istate_t *);
+extern void instruction_access_error(unsigned int, istate_t *);
+extern void illegal_instruction(unsigned int, istate_t *);
+extern void privileged_opcode(unsigned int, istate_t *);
+extern void unimplemented_LDD(unsigned int, istate_t *);
+extern void unimplemented_STD(unsigned int, istate_t *);
+extern void fp_disabled(unsigned int, istate_t *);
+extern void fp_exception_ieee_754(unsigned int, istate_t *);
+extern void fp_exception_other(unsigned int, istate_t *);
+extern void tag_overflow(unsigned int, istate_t *);
+extern void division_by_zero(unsigned int, istate_t *);
+extern void data_access_exception(unsigned int, istate_t *);
+extern void data_access_error(unsigned int, istate_t *);
+extern void mem_address_not_aligned(unsigned int, istate_t *);
+extern void LDDF_mem_address_not_aligned(unsigned int, istate_t *);
+extern void STDF_mem_address_not_aligned(unsigned int, istate_t *);
+extern void privileged_action(unsigned int, istate_t *);
+extern void LDQF_mem_address_not_aligned(unsigned int, istate_t *);
+extern void STQF_mem_address_not_aligned(unsigned int, istate_t *);
 
 #endif /* !__ASM__ */
Index: kernel/arch/sparc64/include/arch/trap/interrupt.h
===================================================================
--- kernel/arch/sparc64/include/arch/trap/interrupt.h	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/sparc64/include/arch/trap/interrupt.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -63,17 +63,10 @@
 #define IGN_SHIFT	6
 
-
-#ifdef __ASM__
-.macro INTERRUPT_LEVEL_N_HANDLER n
-	mov \n - 1, %g2
-	PREEMPTIBLE_HANDLER exc_dispatch
-.endm
-#endif
-
 #ifndef __ASM__
 
 #include <arch/interrupt.h>
 
-extern void interrupt(int n, istate_t *istate);
+extern void interrupt(unsigned int n, istate_t *istate);
+
 #endif /* !def __ASM__ */
 
Index: kernel/arch/sparc64/include/arch/trap/sun4u/interrupt.h
===================================================================
--- kernel/arch/sparc64/include/arch/trap/sun4u/interrupt.h	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/sparc64/include/arch/trap/sun4u/interrupt.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -92,11 +92,4 @@
 #define INTERRUPT_VECTOR_TRAP_HANDLER_SIZE	TRAP_TABLE_ENTRY_SIZE
 
-#ifdef __ASM__
-.macro INTERRUPT_VECTOR_TRAP_HANDLER
-	PREEMPTIBLE_HANDLER interrupt
-.endm
-#endif /* __ASM__ */
-
-
 #endif
 
Index: kernel/arch/sparc64/include/arch/trap/sun4u/mmu.h
===================================================================
--- kernel/arch/sparc64/include/arch/trap/sun4u/mmu.h	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/sparc64/include/arch/trap/sun4u/mmu.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -74,5 +74,8 @@
 0:
 	wrpr %g0, PSTATE_PRIV_BIT | PSTATE_AG_BIT, %pstate
-	PREEMPTIBLE_HANDLER fast_instruction_access_mmu_miss
+	mov TT_FAST_INSTRUCTION_ACCESS_MMU_MISS, %g2
+	mov VA_IMMU_TAG_ACCESS, %g5
+	ldxa [%g5] ASI_IMMU, %g5			! read the faulting Context and VPN
+	PREEMPTIBLE_HANDLER exc_dispatch 
 .endm
 
@@ -107,5 +110,5 @@
 	wr %g0, ASI_DMMU, %asi
 	ldxa [VA_DMMU_TAG_ACCESS] %asi, %g1		! read the faulting Context and VPN
-	set TLB_TAG_ACCESS_CONTEXT_MASK, %g2
+	ldx [%g7 + %lo(tlb_tag_access_context_mask)], %g2
 	andcc %g1, %g2, %g3				! get Context
 	bnz %xcc, 0f					! Context is non-zero
@@ -138,14 +141,7 @@
 	wrpr %g0, PSTATE_PRIV_BIT | PSTATE_AG_BIT, %pstate
 
-	/*
-	 * Read the Tag Access register for the higher-level handler.
-	 * This is necessary to survive nested DTLB misses.
-	 */	
-	ldxa [VA_DMMU_TAG_ACCESS] %asi, %g2
-
-	/*
-	 * g2 will be passed as an argument to fast_data_access_mmu_miss().
-	 */
-	PREEMPTIBLE_HANDLER fast_data_access_mmu_miss
+	mov TT_FAST_DATA_ACCESS_MMU_MISS, %g2
+	ldxa [VA_DMMU_TAG_ACCESS] %asi, %g5		! read the faulting Context and VPN
+	PREEMPTIBLE_HANDLER exc_dispatch 
 .endm
 
@@ -164,15 +160,8 @@
 	wrpr %g0, PSTATE_PRIV_BIT | PSTATE_AG_BIT, %pstate
 
-	/*
-	 * Read the Tag Access register for the higher-level handler.
-	 * This is necessary to survive nested DTLB misses.
-	 */	
-	mov VA_DMMU_TAG_ACCESS, %g2
-	ldxa [%g2] ASI_DMMU, %g2
-
-	/*
-	 * g2 will be passed as an argument to fast_data_access_mmu_miss().
-	 */
-	PREEMPTIBLE_HANDLER fast_data_access_protection
+	mov TT_FAST_DATA_ACCESS_PROTECTION, %g2
+	mov VA_DMMU_TAG_ACCESS, %g5
+	ldxa [%g5] ASI_DMMU, %g5			! read the faulting Context and VPN
+	PREEMPTIBLE_HANDLER exc_dispatch 
 .endm
 
Index: kernel/arch/sparc64/include/arch/trap/sun4v/interrupt.h
===================================================================
--- kernel/arch/sparc64/include/arch/trap/sun4v/interrupt.h	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/sparc64/include/arch/trap/sun4v/interrupt.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -40,6 +40,8 @@
 #ifndef __ASM__
 
+#include <arch/istate_struct.h>
+
 extern void sun4v_ipi_init(void);
-extern void cpu_mondo(void);
+extern void cpu_mondo(unsigned int, istate_t *);
 
 #endif
Index: kernel/arch/sparc64/include/arch/trap/sun4v/mmu.h
===================================================================
--- kernel/arch/sparc64/include/arch/trap/sun4v/mmu.h	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/sparc64/include/arch/trap/sun4v/mmu.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -73,5 +73,7 @@
 
 .macro FAST_INSTRUCTION_ACCESS_MMU_MISS_HANDLER
-	PREEMPTIBLE_HANDLER fast_instruction_access_mmu_miss
+	mov TT_FAST_INSTRUCTION_ACCESS_MMU_MISS, %g2
+	clr %g5		! XXX
+	PREEMPTIBLE_HANDLER exc_dispatch 
 .endm
 
@@ -123,7 +125,7 @@
 	 * mapped. In such a case, this handler will be called from TL = 1.
 	 * We handle the situation by pretending that the MMU miss occurred
-	 * on TL = 0. Once the MMU miss trap is services, the instruction which
+	 * on TL = 0. Once the MMU miss trap is serviced, the instruction which
 	 * caused the spill/fill trap is restarted, the spill/fill trap occurs,
-	 * but this time its handler accesse memory which IS mapped.
+	 * but this time its handler accesses memory which is mapped.
 	 */
 	.if (\tl > 0)
@@ -131,16 +133,18 @@
 	.endif
 
+	mov TT_FAST_DATA_ACCESS_MMU_MISS, %g2
+
 	/*
-	 * Save the faulting virtual page and faulting context to the %g2
-	 * register. The most significant 51 bits of the %g2 register will
+	 * Save the faulting virtual page and faulting context to the %g5
+	 * register. The most significant 51 bits of the %g5 register will
 	 * contain the virtual address which caused the fault truncated to the
-	 * page boundary. The least significant 13 bits of the %g2 register
+	 * page boundary. The least significant 13 bits of the %g5 register
 	 * will contain the number of the context in which the fault occurred.
-	 * The value of the %g2 register will be passed as a parameter to the
-	 * higher level service routine.
+	 * The value of the %g5 register will be stored in the istate structure
+	 * for inspection by the higher level service routine.
 	 */
-	or %g1, %g3, %g2
+	or %g1, %g3, %g5
 
-	PREEMPTIBLE_HANDLER fast_data_access_mmu_miss
+	PREEMPTIBLE_HANDLER exc_dispatch
 .endm
 
@@ -170,8 +174,10 @@
 	sllx %g1, TTE_DATA_TADDR_OFFSET, %g1
 
+	mov TT_FAST_DATA_ACCESS_PROTECTION, %g2
+
 	/* the same as for FAST_DATA_ACCESS_MMU_MISS_HANDLER */
-	or %g1, %g3, %g2
+	or %g1, %g3, %g5
 
-	PREEMPTIBLE_HANDLER fast_data_access_protection
+	PREEMPTIBLE_HANDLER exc_dispatch 
 .endm
 #endif /* __ASM__ */
Index: kernel/arch/sparc64/include/arch/trap/trap_table.h
===================================================================
--- kernel/arch/sparc64/include/arch/trap/trap_table.h	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/sparc64/include/arch/trap/trap_table.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -43,29 +43,4 @@
 #define TRAP_TABLE_SIZE		(TRAP_TABLE_ENTRY_COUNT * TRAP_TABLE_ENTRY_SIZE)
 
-#define ISTATE_END_OFFSET(o)	((o) - ISTATE_SIZE)
-
-/*
- * The one STACK_ITEM_SIZE is counted for space holding the 7th
- * argument to syscall_handler (i.e. syscall number) and the other
- * STACK_ITEM_SIZE is counted because of the required alignment.
- */
-#define PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE \
-    (STACK_WINDOW_SAVE_AREA_SIZE + STACK_ARG_SAVE_AREA_SIZE + \
-    (2 * STACK_ITEM_SIZE) + (ISTATE_SIZE + 9 * 8))
-/* <-- istate_t ends here */
-#define SAVED_TSTATE	ISTATE_END_OFFSET(ISTATE_OFFSET_TSTATE)	
-#define SAVED_TPC	ISTATE_END_OFFSET(ISTATE_OFFSET_TPC)
-#define SAVED_TNPC	ISTATE_END_OFFSET(ISTATE_OFFSET_TNPC)
-/* <-- istate_t begins here */
-#define SAVED_Y		-(1 * 8 + ISTATE_SIZE)
-#define SAVED_I0	-(2 * 8 + ISTATE_SIZE)
-#define SAVED_I1	-(3 * 8 + ISTATE_SIZE)
-#define SAVED_I2	-(4 * 8 + ISTATE_SIZE)
-#define SAVED_I3	-(5 * 8 + ISTATE_SIZE)
-#define SAVED_I4	-(6 * 8 + ISTATE_SIZE)
-#define SAVED_I5	-(7 * 8 + ISTATE_SIZE)
-#define SAVED_I6	-(8 * 8 + ISTATE_SIZE)
-#define SAVED_I7	-(9 * 8 + ISTATE_SIZE)
-
 #ifndef __ASM__
 
@@ -80,4 +55,5 @@
 extern trap_table_entry_t trap_table[TRAP_TABLE_ENTRY_COUNT];
 extern trap_table_entry_t trap_table_save[TRAP_TABLE_ENTRY_COUNT];
+
 #endif /* !__ASM__ */
 
Index: kernel/arch/sparc64/src/debug/stacktrace.c
===================================================================
--- kernel/arch/sparc64/src/debug/stacktrace.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/sparc64/src/debug/stacktrace.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -36,8 +36,11 @@
 #include <syscall/copy.h>
 #include <typedefs.h>
+#include <proc/thread.h>
 
 #include <arch.h>
 #include <arch/stack.h>
 #include <arch/trap/trap_table.h>
+
+#include <arch/istate_struct.h>
 
 #if defined(SUN4V)
@@ -61,5 +64,5 @@
 
 	kstack += STACK_BIAS;
-	kstack -= PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE;
+	kstack -= ISTATE_SIZE;
 
 	if (THREAD && (ctx->fp == kstack))
Index: kernel/arch/sparc64/src/drivers/tick.c
===================================================================
--- kernel/arch/sparc64/src/drivers/tick.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/sparc64/src/drivers/tick.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -35,4 +35,5 @@
 #include <arch/drivers/tick.h>
 #include <arch/interrupt.h>
+#include <arch/trap/interrupt.h>
 #include <arch/sparc64.h>
 #include <arch/asm.h>
@@ -51,5 +52,4 @@
 	softint_reg_t clear;
 
-	interrupt_register(14, "tick_int", tick_interrupt);
 	compare.int_dis = false;
 	compare.tick_cmpr = tick_counter_read() +
@@ -79,5 +79,5 @@
 /** Process tick interrupt.
  *
- * @param n      Interrupt Level (14, can be ignored)
+ * @param n      Trap type (0x4e, can be ignored)
  * @param istate Interrupted state.
  *
@@ -93,5 +93,5 @@
 	 * Make sure we are servicing interrupt_level_14
 	 */
-	ASSERT(n == 14);
+	ASSERT(n == TT_INTERRUPT_LEVEL_14);
 	
 	/*
Index: kernel/arch/sparc64/src/mm/sun4u/tlb.c
===================================================================
--- kernel/arch/sparc64/src/mm/sun4u/tlb.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/sparc64/src/mm/sun4u/tlb.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -194,5 +194,5 @@
 
 /** ITLB miss handler. */
-void fast_instruction_access_mmu_miss(sysarg_t unused, istate_t *istate)
+void fast_instruction_access_mmu_miss(unsigned int tt, istate_t *istate)
 {
 	size_t index = (istate->tpc >> MMU_PAGE_WIDTH) % MMU_PAGES_PER_PAGE;
@@ -224,12 +224,10 @@
  * low-level, assembly language part of the fast_data_access_mmu_miss handler.
  *
- * @param tag		Content of the TLB Tag Access register as it existed
- * 			when the trap happened. This is to prevent confusion
- * 			created by clobbered Tag Access register during a nested
- * 			DTLB miss.
+ * @param tt		Trap type.
  * @param istate	Interrupted state saved on the stack.
  */
-void fast_data_access_mmu_miss(tlb_tag_access_reg_t tag, istate_t *istate)
-{
+void fast_data_access_mmu_miss(unsigned int tt, istate_t *istate)
+{
+	tlb_tag_access_reg_t tag;
 	uintptr_t page_8k;
 	uintptr_t page_16k;
@@ -238,4 +236,5 @@
 	as_t *as = AS;
 
+	tag.value = istate->tlb_tag_access;
 	page_8k = (uint64_t) tag.vpn << MMU_PAGE_WIDTH;
 	page_16k = ALIGN_DOWN(page_8k, PAGE_SIZE);
@@ -276,12 +275,10 @@
 /** DTLB protection fault handler.
  *
- * @param tag		Content of the TLB Tag Access register as it existed
- * 			when the trap happened. This is to prevent confusion
- * 			created by clobbered Tag Access register during a nested
- * 			DTLB miss.
+ * @param tt		Trap type.
  * @param istate	Interrupted state saved on the stack.
  */
-void fast_data_access_protection(tlb_tag_access_reg_t tag, istate_t *istate)
-{
+void fast_data_access_protection(unsigned int tt, istate_t *istate)
+{
+	tlb_tag_access_reg_t tag;
 	uintptr_t page_16k;
 	size_t index;
@@ -289,4 +286,5 @@
 	as_t *as = AS;
 
+	tag.value = istate->tlb_tag_access;
 	page_16k = ALIGN_DOWN((uint64_t) tag.vpn << MMU_PAGE_WIDTH, PAGE_SIZE);
 	index = tag.vpn % MMU_PAGES_PER_PAGE;	/* 16K-page emulation */
Index: kernel/arch/sparc64/src/mm/sun4v/tlb.c
===================================================================
--- kernel/arch/sparc64/src/mm/sun4v/tlb.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/sparc64/src/mm/sun4v/tlb.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -208,5 +208,5 @@
 
 /** ITLB miss handler. */
-void fast_instruction_access_mmu_miss(sysarg_t unused, istate_t *istate)
+void fast_instruction_access_mmu_miss(unsigned int tt, istate_t *istate)
 {
 	uintptr_t va = ALIGN_DOWN(istate->tpc, PAGE_SIZE);
@@ -239,17 +239,12 @@
  * low-level, assembly language part of the fast_data_access_mmu_miss handler.
  *
- * @param page_and_ctx	A 64-bit value describing the fault. The most
- * 			significant 51 bits of the value contain the virtual
- * 			address which caused the fault truncated to the page
- * 			boundary. The least significant 13 bits of the value
- * 			contain the number of the context in which the fault
- * 			occurred.
+ * @param tt		Trap type.
  * @param istate	Interrupted state saved on the stack.
  */
-void fast_data_access_mmu_miss(uint64_t page_and_ctx, istate_t *istate)
+void fast_data_access_mmu_miss(unsigned int tt, istate_t *istate)
 {
 	pte_t *t;
-	uintptr_t va = DMISS_ADDRESS(page_and_ctx);
-	uint16_t ctx = DMISS_CONTEXT(page_and_ctx);
+	uintptr_t va = DMISS_ADDRESS(istate->tlb_tag_access);
+	uint16_t ctx = DMISS_CONTEXT(istate->tlb_tag_access);
 	as_t *as = AS;
 
@@ -288,17 +283,12 @@
 /** DTLB protection fault handler.
  *
- * @param page_and_ctx	A 64-bit value describing the fault. The most
- * 			significant 51 bits of the value contain the virtual
- * 			address which caused the fault truncated to the page
- * 			boundary. The least significant 13 bits of the value
- * 			contain the number of the context in which the fault
- * 			occurred.
+ * @param tt		Trap type.
  * @param istate	Interrupted state saved on the stack.
  */
-void fast_data_access_protection(uint64_t page_and_ctx, istate_t *istate)
+void fast_data_access_protection(unsigned int tt, istate_t *istate)
 {
 	pte_t *t;
-	uintptr_t va = DMISS_ADDRESS(page_and_ctx);
-	uint16_t ctx = DMISS_CONTEXT(page_and_ctx);
+	uintptr_t va = DMISS_ADDRESS(istate->tlb_tag_access);
+	uint16_t ctx = DMISS_CONTEXT(istate->tlb_tag_access);
 	as_t *as = AS;
 
Index: kernel/arch/sparc64/src/smp/sun4u/ipi.c
===================================================================
--- kernel/arch/sparc64/src/smp/sun4u/ipi.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/sparc64/src/smp/sun4u/ipi.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -34,4 +34,5 @@
 
 #include <smp/ipi.h>
+#include <arch/smp/sun4u/ipi.h>
 #include <cpu.h>
 #include <arch.h>
@@ -40,4 +41,5 @@
 #include <config.h>
 #include <mm/tlb.h>
+#include <smp/smp_call.h>
 #include <arch/interrupt.h>
 #include <arch/trap/interrupt.h>
@@ -171,4 +173,26 @@
 }
 
+
+/**
+ * Deliver an IPI to one processor other than the current one.
+ *
+ * Interrupts must be disabled.
+ *
+ * @param cpu_id Destination cpu id (index into cpus array). Must not
+ *               be the current cpu.
+ * @param ipi    IPI number; anything but IPI_SMP_CALL causes a panic.
+ */
+void ipi_unicast_arch(unsigned int cpu_id, int ipi)
+{
+	ASSERT(&cpus[cpu_id] != CPU);
+	
+	if (ipi == IPI_SMP_CALL) {
+		cross_call(cpus[cpu_id].arch.mid, smp_call_ipi_recv);
+	} else {
+		panic("Unknown IPI (%d).\n", ipi);
+		return;
+	}
+}
+
 /** @}
  */
Index: kernel/arch/sparc64/src/smp/sun4u/smp_call.c
===================================================================
--- kernel/arch/sparc64/src/smp/sun4u/smp_call.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
+++ kernel/arch/sparc64/src/smp/sun4u/smp_call.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup sparc64
+ * @{
+ */
+
+/**
+ * @file
+ * @brief Sun4u specific smp call support.
+ */
+
+#include <smp/smp_call.h>
+#include <arch/smp/sun4u/ipi.h>
+#include <arch/interrupt.h>
+
+void arch_smp_call_ipi(unsigned int cpu_id)
+{
+	/*
+	 * Required by ipi_unicast_arch(). That function resolves a potential
+	 * deadlock should both the destination and source cpus be sending
+	 * unicast ipis to each other with interrupts disabled.
+	 */
+	ipl_t ipl = interrupts_disable();
+	ipi_unicast_arch(cpu_id, IPI_SMP_CALL);
+	interrupts_restore(ipl);
+}
+
+/** @}
+ */
Index: kernel/arch/sparc64/src/sun4u/sparc64.c
===================================================================
--- kernel/arch/sparc64/src/sun4u/sparc64.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/sparc64/src/sun4u/sparc64.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -86,6 +86,8 @@
 void arch_pre_mm_init(void)
 {
-	if (config.cpu_active == 1)
+	if (config.cpu_active == 1) {
 		trap_init();
+		exc_arch_init();
+	}
 }
 
Index: kernel/arch/sparc64/src/sun4u/start.S
===================================================================
--- kernel/arch/sparc64/src/sun4u/start.S	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/sparc64/src/sun4u/start.S	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -401,13 +401,14 @@
 
 /*
- * The fast_data_access_mmu_miss_data_hi label and the end_of_identity and
- * kernel_8k_tlb_data_template variables are meant to stay together,
- * aligned on 16B boundary.
+ * The fast_data_access_mmu_miss_data_hi label, the end_of_identity,
+ * kernel_8k_tlb_data_template and tlb_tag_access_context_mask variables
+ * are meant to stay together, aligned on a 32B boundary.
  */
 .global fast_data_access_mmu_miss_data_hi
 .global end_of_identity 
 .global kernel_8k_tlb_data_template
-
-.align 16
+.global tlb_tag_access_context_mask
+
+.align 32 
 /*
  * This label is used by the fast_data_access_MMU_miss trap handler.
@@ -435,2 +436,9 @@
 #endif /* CONFIG_VIRT_IDX_DCACHE */
 
+/*
+ * This variable is used by the fast_data_access_MMU_miss trap handler.
+ * It allows us to save one precious instruction slot of this handler.
+ */
+tlb_tag_access_context_mask:
+	.quad TLB_TAG_ACCESS_CONTEXT_MASK
+
Index: kernel/arch/sparc64/src/sun4v/sparc64.c
===================================================================
--- kernel/arch/sparc64/src/sun4v/sparc64.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/sparc64/src/sun4v/sparc64.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -84,6 +84,8 @@
 void arch_pre_mm_init(void)
 {
-	if (config.cpu_active == 1)
+	if (config.cpu_active == 1) {
 		trap_init();
+		exc_arch_init();
+	}
 }
 
Index: kernel/arch/sparc64/src/trap/exception.c
===================================================================
--- kernel/arch/sparc64/src/trap/exception.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/sparc64/src/trap/exception.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -55,5 +55,5 @@
 
 /** Handle instruction_access_exception. (0x8) */
-void instruction_access_exception(int n, istate_t *istate)
+void instruction_access_exception(unsigned int n, istate_t *istate)
 {
 	fault_if_from_uspace(istate, "%s.", __func__);
@@ -62,5 +62,5 @@
 
 /** Handle instruction_access_error. (0xa) */
-void instruction_access_error(int n, istate_t *istate)
+void instruction_access_error(unsigned int n, istate_t *istate)
 {
 	fault_if_from_uspace(istate, "%s.", __func__);
@@ -69,5 +69,5 @@
 
 /** Handle illegal_instruction. (0x10) */
-void illegal_instruction(int n, istate_t *istate)
+void illegal_instruction(unsigned int n, istate_t *istate)
 {
 	fault_if_from_uspace(istate, "%s.", __func__);
@@ -76,5 +76,5 @@
 
 /** Handle privileged_opcode. (0x11) */
-void privileged_opcode(int n, istate_t *istate)
+void privileged_opcode(unsigned int n, istate_t *istate)
 {
 	fault_if_from_uspace(istate, "%s.", __func__);
@@ -83,5 +83,5 @@
 
 /** Handle unimplemented_LDD. (0x12) */
-void unimplemented_LDD(int n, istate_t *istate)
+void unimplemented_LDD(unsigned int n, istate_t *istate)
 {
 	fault_if_from_uspace(istate, "%s.", __func__);
@@ -90,5 +90,5 @@
 
 /** Handle unimplemented_STD. (0x13) */
-void unimplemented_STD(int n, istate_t *istate)
+void unimplemented_STD(unsigned int n, istate_t *istate)
 {
 	fault_if_from_uspace(istate, "%s.", __func__);
@@ -97,5 +97,5 @@
 
 /** Handle fp_disabled. (0x20) */
-void fp_disabled(int n, istate_t *istate)
+void fp_disabled(unsigned int n, istate_t *istate)
 {
 	fprs_reg_t fprs;
@@ -117,5 +117,5 @@
 
 /** Handle fp_exception_ieee_754. (0x21) */
-void fp_exception_ieee_754(int n, istate_t *istate)
+void fp_exception_ieee_754(unsigned int n, istate_t *istate)
 {
 	fault_if_from_uspace(istate, "%s.", __func__);
@@ -124,5 +124,5 @@
 
 /** Handle fp_exception_other. (0x22) */
-void fp_exception_other(int n, istate_t *istate)
+void fp_exception_other(unsigned int n, istate_t *istate)
 {
 	fault_if_from_uspace(istate, "%s.", __func__);
@@ -131,5 +131,5 @@
 
 /** Handle tag_overflow. (0x23) */
-void tag_overflow(int n, istate_t *istate)
+void tag_overflow(unsigned int n, istate_t *istate)
 {
 	fault_if_from_uspace(istate, "%s.", __func__);
@@ -138,5 +138,5 @@
 
 /** Handle division_by_zero. (0x28) */
-void division_by_zero(int n, istate_t *istate)
+void division_by_zero(unsigned int n, istate_t *istate)
 {
 	fault_if_from_uspace(istate, "%s.", __func__);
@@ -145,5 +145,5 @@
 
 /** Handle data_access_exception. (0x30) */
-void data_access_exception(int n, istate_t *istate)
+void data_access_exception(unsigned int n, istate_t *istate)
 {
 	fault_if_from_uspace(istate, "%s.", __func__);
@@ -152,5 +152,5 @@
 
 /** Handle data_access_error. (0x32) */
-void data_access_error(int n, istate_t *istate)
+void data_access_error(unsigned int n, istate_t *istate)
 {
 	fault_if_from_uspace(istate, "%s.", __func__);
@@ -159,5 +159,5 @@
 
 /** Handle mem_address_not_aligned. (0x34) */
-void mem_address_not_aligned(int n, istate_t *istate)
+void mem_address_not_aligned(unsigned int n, istate_t *istate)
 {
 	fault_if_from_uspace(istate, "%s.", __func__);
@@ -166,5 +166,5 @@
 
 /** Handle LDDF_mem_address_not_aligned. (0x35) */
-void LDDF_mem_address_not_aligned(int n, istate_t *istate)
+void LDDF_mem_address_not_aligned(unsigned int n, istate_t *istate)
 {
 	fault_if_from_uspace(istate, "%s.", __func__);
@@ -173,5 +173,5 @@
 
 /** Handle STDF_mem_address_not_aligned. (0x36) */
-void STDF_mem_address_not_aligned(int n, istate_t *istate)
+void STDF_mem_address_not_aligned(unsigned int n, istate_t *istate)
 {
 	fault_if_from_uspace(istate, "%s.", __func__);
@@ -180,5 +180,5 @@
 
 /** Handle privileged_action. (0x37) */
-void privileged_action(int n, istate_t *istate)
+void privileged_action(unsigned int n, istate_t *istate)
 {
 	fault_if_from_uspace(istate, "%s.", __func__);
@@ -187,5 +187,5 @@
 
 /** Handle LDQF_mem_address_not_aligned. (0x38) */
-void LDQF_mem_address_not_aligned(int n, istate_t *istate)
+void LDQF_mem_address_not_aligned(unsigned int n, istate_t *istate)
 {
 	fault_if_from_uspace(istate, "%s.", __func__);
@@ -194,5 +194,5 @@
 
 /** Handle STQF_mem_address_not_aligned. (0x39) */
-void STQF_mem_address_not_aligned(int n, istate_t *istate)
+void STQF_mem_address_not_aligned(unsigned int n, istate_t *istate)
 {
 	fault_if_from_uspace(istate, "%s.", __func__);
Index: kernel/arch/sparc64/src/trap/interrupt.c
===================================================================
--- kernel/arch/sparc64/src/trap/interrupt.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/sparc64/src/trap/interrupt.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -36,4 +36,6 @@
 #include <arch/interrupt.h>
 #include <arch/trap/interrupt.h>
+#include <arch/trap/exception.h>
+#include <arch/trap/mmu.h>
 #include <arch/sparc64.h>
 #include <interrupt.h>
@@ -43,4 +45,5 @@
 #include <arch/asm.h>
 #include <arch/barrier.h>
+#include <arch/drivers/tick.h>
 #include <print.h>
 #include <arch.h>
@@ -49,17 +52,122 @@
 #include <synch/spinlock.h>
 
-/** Register Interrupt Level Handler.
- *
- * @param n       Interrupt Level (1 - 15).
- * @param name    Short descriptive string.
- * @param handler Handler.
- *
- */
-void interrupt_register(unsigned int n, const char *name, iroutine_t handler)
+/** Register sparc64 trap handlers.
+ *
+ * Binds each handled trap type (TT_*) to its C handler via exc_register().
+ * Registrations specific to sun4u or sun4v are guarded by SUN4U / SUN4V.
+ *
+ */
+void exc_arch_init(void)
 {
-	ASSERT(n >= IVT_FIRST);
-	ASSERT(n <= IVT_ITEMS);
-	
-	exc_register(n - IVT_FIRST, name, true, handler);
+	exc_register(TT_INSTRUCTION_ACCESS_EXCEPTION,
+	    "instruction_access_exception", false,
+	    instruction_access_exception);
+	exc_register(TT_INSTRUCTION_ACCESS_ERROR,
+	    "instruction_access_error", false,
+	    instruction_access_error);
+
+#ifdef SUN4V
+	exc_register(TT_IAE_UNAUTH_ACCESS,
+	    "iae_unauth_access", false,
+	    instruction_access_exception);
+	exc_register(TT_IAE_NFO_PAGE,
+	    "iae_nfo_page", false,
+	    instruction_access_exception);
+#endif
+
+	exc_register(TT_ILLEGAL_INSTRUCTION,
+	    "illegal_instruction", false,
+	    illegal_instruction);
+	exc_register(TT_PRIVILEGED_OPCODE,
+	    "privileged_opcode", false,
+	    privileged_opcode);
+	exc_register(TT_UNIMPLEMENTED_LDD,
+	    "unimplemented_LDD", false,
+	    unimplemented_LDD);
+	exc_register(TT_UNIMPLEMENTED_STD,
+	    "unimplemented_STD", false,
+	    unimplemented_STD);
+
+#ifdef SUN4V
+	exc_register(TT_DAE_INVALID_ASI,
+	    "dae_invalid_asi", false,
+	    data_access_exception);
+	exc_register(TT_DAE_PRIVILEGE_VIOLATION,
+	    "dae_privilege_violation", false,
+	    data_access_exception);
+	exc_register(TT_DAE_NC_PAGE,
+	    "dae_nc_page", false,
+	    data_access_exception);
+	exc_register(TT_DAE_NFO_PAGE,
+	    "dae_nfo_page", false,
+	    data_access_exception);
+#endif
+
+	exc_register(TT_FP_DISABLED,
+	    "fp_disabled", true,
+	    fp_disabled);
+	exc_register(TT_FP_EXCEPTION_IEEE_754,
+	    "fp_exception_ieee_754", false,
+	    fp_exception_ieee_754);
+	exc_register(TT_FP_EXCEPTION_OTHER,
+	    "fp_exception_other", false,
+	    fp_exception_other);
+	exc_register(TT_TAG_OVERFLOW,
+	    "tag_overflow", false,
+	    tag_overflow);
+	exc_register(TT_DIVISION_BY_ZERO,
+	    "division_by_zero", false,
+	    division_by_zero);
+	exc_register(TT_DATA_ACCESS_EXCEPTION,
+	    "data_access_exception", false,
+	    data_access_exception);
+	exc_register(TT_DATA_ACCESS_ERROR,
+	    "data_access_error", false,
+	    data_access_error);
+	exc_register(TT_MEM_ADDRESS_NOT_ALIGNED,
+	    "mem_address_not_aligned", false,
+	    mem_address_not_aligned);
+	exc_register(TT_LDDF_MEM_ADDRESS_NOT_ALIGNED,
+	    "LDDF_mem_address_not_aligned", false,
+	    LDDF_mem_address_not_aligned);
+	exc_register(TT_STDF_MEM_ADDRESS_NOT_ALIGNED,
+	    "STDF_mem_address_not_aligned", false,
+	    STDF_mem_address_not_aligned);
+	exc_register(TT_PRIVILEGED_ACTION,
+	    "privileged_action", false,
+	    privileged_action);
+	exc_register(TT_LDQF_MEM_ADDRESS_NOT_ALIGNED,
+	    "LDQF_mem_address_not_aligned", false,
+	    LDQF_mem_address_not_aligned);
+	exc_register(TT_STQF_MEM_ADDRESS_NOT_ALIGNED,
+	    "STQF_mem_address_not_aligned", false,
+	    STQF_mem_address_not_aligned);
+
+	exc_register(TT_INTERRUPT_LEVEL_14,
+	    "interrupt_level_14", true,
+	    tick_interrupt);
+
+#ifdef SUN4U
+	exc_register(TT_INTERRUPT_VECTOR_TRAP,
+	    "interrupt_vector_trap", true,
+	    interrupt);
+#endif
+
+	exc_register(TT_FAST_INSTRUCTION_ACCESS_MMU_MISS,
+	    "fast_instruction_access_mmu_miss", true,
+	    fast_instruction_access_mmu_miss);
+	exc_register(TT_FAST_DATA_ACCESS_MMU_MISS,
+	    "fast_data_access_mmu_miss", true,
+	    fast_data_access_mmu_miss);
+	exc_register(TT_FAST_DATA_ACCESS_PROTECTION,
+	    "fast_data_access_protection", true,
+	    fast_data_access_protection);
+
+#ifdef SUN4V
+	exc_register(TT_CPU_MONDO,
+	    "cpu_mondo", true,
+	    cpu_mondo);
+#endif
+
 }
 
Index: kernel/arch/sparc64/src/trap/sun4u/interrupt.c
===================================================================
--- kernel/arch/sparc64/src/trap/sun4u/interrupt.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/sparc64/src/trap/sun4u/interrupt.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -53,5 +53,5 @@
  * @param istate Ignored.
  */
-void interrupt(int n, istate_t *istate)
+void interrupt(unsigned int n, istate_t *istate)
 {
 	uint64_t status = asi_u64_read(ASI_INTR_DISPATCH_STATUS, 0);
Index: kernel/arch/sparc64/src/trap/sun4u/trap_table.S
===================================================================
--- kernel/arch/sparc64/src/trap/sun4u/trap_table.S	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/sparc64/src/trap/sun4u/trap_table.S	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -63,5 +63,7 @@
 instruction_access_exception_tl0:
 	wrpr %g0, PSTATE_AG_BIT | PSTATE_PRIV_BIT, %pstate
-	PREEMPTIBLE_HANDLER instruction_access_exception
+	mov TT_INSTRUCTION_ACCESS_EXCEPTION, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x0a, TL = 0, instruction_access_error */
@@ -69,5 +71,7 @@
 .global instruction_access_error_tl0
 instruction_access_error_tl0:
-	PREEMPTIBLE_HANDLER instruction_access_error
+	mov TT_INSTRUCTION_ACCESS_ERROR, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x10, TL = 0, illegal_instruction */
@@ -75,5 +79,7 @@
 .global illegal_instruction_tl0
 illegal_instruction_tl0:
-	PREEMPTIBLE_HANDLER illegal_instruction
+	mov TT_ILLEGAL_INSTRUCTION, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x11, TL = 0, privileged_opcode */
@@ -81,5 +87,7 @@
 .global privileged_opcode_tl0
 privileged_opcode_tl0:
-	PREEMPTIBLE_HANDLER privileged_opcode
+	mov TT_PRIVILEGED_OPCODE, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x12, TL = 0, unimplemented_LDD */
@@ -87,5 +95,7 @@
 .global unimplemented_LDD_tl0
 unimplemented_LDD_tl0:
-	PREEMPTIBLE_HANDLER unimplemented_LDD
+	mov TT_UNIMPLEMENTED_LDD, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x13, TL = 0, unimplemented_STD */
@@ -93,5 +103,7 @@
 .global unimplemented_STD_tl0
 unimplemented_STD_tl0:
-	PREEMPTIBLE_HANDLER unimplemented_STD
+	mov TT_UNIMPLEMENTED_STD, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x20, TL = 0, fb_disabled handler */
@@ -99,5 +111,7 @@
 .global fb_disabled_tl0
 fp_disabled_tl0:
-	PREEMPTIBLE_HANDLER fp_disabled
+	mov TT_FP_DISABLED, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x21, TL = 0, fb_exception_ieee_754 handler */
@@ -105,5 +119,7 @@
 .global fb_exception_ieee_754_tl0
 fp_exception_ieee_754_tl0:
-	PREEMPTIBLE_HANDLER fp_exception_ieee_754
+	mov TT_FP_EXCEPTION_IEEE_754, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x22, TL = 0, fb_exception_other handler */
@@ -111,5 +127,7 @@
 .global fb_exception_other_tl0
 fp_exception_other_tl0:
-	PREEMPTIBLE_HANDLER fp_exception_other
+	mov TT_FP_EXCEPTION_OTHER, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x23, TL = 0, tag_overflow */
@@ -117,5 +135,7 @@
 .global tag_overflow_tl0
 tag_overflow_tl0:
-	PREEMPTIBLE_HANDLER tag_overflow
+	mov TT_TAG_OVERFLOW, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x24, TL = 0, clean_window handler */
@@ -129,5 +149,7 @@
 .global division_by_zero_tl0
 division_by_zero_tl0:
-	PREEMPTIBLE_HANDLER division_by_zero
+	mov TT_DIVISION_BY_ZERO, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x30, TL = 0, data_access_exception */
@@ -136,5 +158,7 @@
 data_access_exception_tl0:
 	wrpr %g0, PSTATE_AG_BIT | PSTATE_PRIV_BIT, %pstate
-	PREEMPTIBLE_HANDLER data_access_exception
+	mov TT_DATA_ACCESS_EXCEPTION, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x32, TL = 0, data_access_error */
@@ -142,5 +166,7 @@
 .global data_access_error_tl0
 data_access_error_tl0:
-	PREEMPTIBLE_HANDLER data_access_error
+	mov TT_DATA_ACCESS_ERROR, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x34, TL = 0, mem_address_not_aligned */
@@ -148,5 +174,7 @@
 .global mem_address_not_aligned_tl0
 mem_address_not_aligned_tl0:
-	PREEMPTIBLE_HANDLER mem_address_not_aligned
+	mov TT_MEM_ADDRESS_NOT_ALIGNED, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x35, TL = 0, LDDF_mem_address_not_aligned */
@@ -154,5 +182,7 @@
 .global LDDF_mem_address_not_aligned_tl0
 LDDF_mem_address_not_aligned_tl0:
-	PREEMPTIBLE_HANDLER LDDF_mem_address_not_aligned
+	mov TT_LDDF_MEM_ADDRESS_NOT_ALIGNED, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x36, TL = 0, STDF_mem_address_not_aligned */
@@ -160,5 +190,7 @@
 .global STDF_mem_address_not_aligned_tl0
 STDF_mem_address_not_aligned_tl0:
-	PREEMPTIBLE_HANDLER STDF_mem_address_not_aligned
+	mov TT_STDF_MEM_ADDRESS_NOT_ALIGNED, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x37, TL = 0, privileged_action */
@@ -166,5 +198,7 @@
 .global privileged_action_tl0
 privileged_action_tl0:
-	PREEMPTIBLE_HANDLER privileged_action
+	mov TT_PRIVILEGED_ACTION, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x38, TL = 0, LDQF_mem_address_not_aligned */
@@ -172,5 +206,7 @@
 .global LDQF_mem_address_not_aligned_tl0
 LDQF_mem_address_not_aligned_tl0:
-	PREEMPTIBLE_HANDLER LDQF_mem_address_not_aligned
+	mov TT_LDQF_MEM_ADDRESS_NOT_ALIGNED, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x39, TL = 0, STQF_mem_address_not_aligned */
@@ -178,5 +214,7 @@
 .global STQF_mem_address_not_aligned_tl0
 STQF_mem_address_not_aligned_tl0:
-	PREEMPTIBLE_HANDLER STQF_mem_address_not_aligned
+	mov TT_STQF_MEM_ADDRESS_NOT_ALIGNED, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x41, TL = 0, interrupt_level_1 handler */
@@ -184,5 +222,7 @@
 .global interrupt_level_1_handler_tl0
 interrupt_level_1_handler_tl0:
-	INTERRUPT_LEVEL_N_HANDLER 1
+	mov TT_INTERRUPT_LEVEL_1, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x42, TL = 0, interrupt_level_2 handler */
@@ -190,5 +230,7 @@
 .global interrupt_level_2_handler_tl0
 interrupt_level_2_handler_tl0:
-	INTERRUPT_LEVEL_N_HANDLER 2
+	mov TT_INTERRUPT_LEVEL_2, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x43, TL = 0, interrupt_level_3 handler */
@@ -196,5 +238,7 @@
 .global interrupt_level_3_handler_tl0
 interrupt_level_3_handler_tl0:
-	INTERRUPT_LEVEL_N_HANDLER 3
+	mov TT_INTERRUPT_LEVEL_3, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x44, TL = 0, interrupt_level_4 handler */
@@ -202,5 +246,7 @@
 .global interrupt_level_4_handler_tl0
 interrupt_level_4_handler_tl0:
-	INTERRUPT_LEVEL_N_HANDLER 4
+	mov TT_INTERRUPT_LEVEL_4, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x45, TL = 0, interrupt_level_5 handler */
@@ -208,5 +254,7 @@
 .global interrupt_level_5_handler_tl0
 interrupt_level_5_handler_tl0:
-	INTERRUPT_LEVEL_N_HANDLER 5
+	mov TT_INTERRUPT_LEVEL_5, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x46, TL = 0, interrupt_level_6 handler */
@@ -214,5 +262,7 @@
 .global interrupt_level_6_handler_tl0
 interrupt_level_6_handler_tl0:
-	INTERRUPT_LEVEL_N_HANDLER 6
+	mov TT_INTERRUPT_LEVEL_6, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x47, TL = 0, interrupt_level_7 handler */
@@ -220,5 +270,7 @@
 .global interrupt_level_7_handler_tl0
 interrupt_level_7_handler_tl0:
-	INTERRUPT_LEVEL_N_HANDLER 7
+	mov TT_INTERRUPT_LEVEL_7, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x48, TL = 0, interrupt_level_8 handler */
@@ -226,5 +278,7 @@
 .global interrupt_level_8_handler_tl0
 interrupt_level_8_handler_tl0:
-	INTERRUPT_LEVEL_N_HANDLER 8
+	mov TT_INTERRUPT_LEVEL_8, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x49, TL = 0, interrupt_level_9 handler */
@@ -232,5 +286,7 @@
 .global interrupt_level_9_handler_tl0
 interrupt_level_9_handler_tl0:
-	INTERRUPT_LEVEL_N_HANDLER 9
+	mov TT_INTERRUPT_LEVEL_9, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x4a, TL = 0, interrupt_level_10 handler */
@@ -238,5 +294,7 @@
 .global interrupt_level_10_handler_tl0
 interrupt_level_10_handler_tl0:
-	INTERRUPT_LEVEL_N_HANDLER 10
+	mov TT_INTERRUPT_LEVEL_10, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x4b, TL = 0, interrupt_level_11 handler */
@@ -244,5 +302,7 @@
 .global interrupt_level_11_handler_tl0
 interrupt_level_11_handler_tl0:
-	INTERRUPT_LEVEL_N_HANDLER 11
+	mov TT_INTERRUPT_LEVEL_11, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x4c, TL = 0, interrupt_level_12 handler */
@@ -250,5 +310,7 @@
 .global interrupt_level_12_handler_tl0
 interrupt_level_12_handler_tl0:
-	INTERRUPT_LEVEL_N_HANDLER 12
+	mov TT_INTERRUPT_LEVEL_12, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x4d, TL = 0, interrupt_level_13 handler */
@@ -256,5 +318,7 @@
 .global interrupt_level_13_handler_tl0
 interrupt_level_13_handler_tl0:
-	INTERRUPT_LEVEL_N_HANDLER 13
+	mov TT_INTERRUPT_LEVEL_13, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x4e, TL = 0, interrupt_level_14 handler */
@@ -262,5 +326,7 @@
 .global interrupt_level_14_handler_tl0
 interrupt_level_14_handler_tl0:
-	INTERRUPT_LEVEL_N_HANDLER 14
+	mov TT_INTERRUPT_LEVEL_14, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x4f, TL = 0, interrupt_level_15 handler */
@@ -268,5 +334,7 @@
 .global interrupt_level_15_handler_tl0
 interrupt_level_15_handler_tl0:
-	INTERRUPT_LEVEL_N_HANDLER 15
+	mov TT_INTERRUPT_LEVEL_15, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x60, TL = 0, interrupt_vector_trap handler */
@@ -274,5 +342,7 @@
 .global interrupt_vector_trap_handler_tl0
 interrupt_vector_trap_handler_tl0:
-	INTERRUPT_VECTOR_TRAP_HANDLER
+	mov TT_INTERRUPT_VECTOR_TRAP, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x64, TL = 0, fast_instruction_access_MMU_miss */
@@ -342,6 +412,7 @@
 .global trap_instruction_\cur\()_tl0
 trap_instruction_\cur\()_tl0:
+	mov \cur, %g2
 	ba %xcc, trap_instruction_handler
-	mov \cur, %g2
+	clr %g5
 .endr
 
@@ -356,5 +427,7 @@
 	wrpr %g0, 1, %tl
 	wrpr %g0, PSTATE_AG_BIT | PSTATE_PRIV_BIT, %pstate
-	PREEMPTIBLE_HANDLER instruction_access_exception
+	mov TT_INSTRUCTION_ACCESS_EXCEPTION, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x0a, TL > 0, instruction_access_error */
@@ -363,5 +436,7 @@
 instruction_access_error_tl1:
 	wrpr %g0, 1, %tl
-	PREEMPTIBLE_HANDLER instruction_access_error
+	mov TT_INSTRUCTION_ACCESS_ERROR, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x10, TL > 0, illegal_instruction */
@@ -370,5 +445,7 @@
 illegal_instruction_tl1:
 	wrpr %g0, 1, %tl
-	PREEMPTIBLE_HANDLER illegal_instruction
+	mov TT_ILLEGAL_INSTRUCTION, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x24, TL > 0, clean_window handler */
@@ -383,5 +460,7 @@
 division_by_zero_tl1:
 	wrpr %g0, 1, %tl
-	PREEMPTIBLE_HANDLER division_by_zero
+	mov TT_DIVISION_BY_ZERO, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x30, TL > 0, data_access_exception */
@@ -391,5 +470,7 @@
 	wrpr %g0, 1, %tl
 	wrpr %g0, PSTATE_AG_BIT | PSTATE_PRIV_BIT, %pstate
-	PREEMPTIBLE_HANDLER data_access_exception
+	mov TT_DATA_ACCESS_EXCEPTION, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x32, TL > 0, data_access_error */
@@ -398,5 +479,7 @@
 data_access_error_tl1:
 	wrpr %g0, 1, %tl
-	PREEMPTIBLE_HANDLER data_access_error
+	mov TT_DATA_ACCESS_ERROR, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x34, TL > 0, mem_address_not_aligned */
@@ -405,5 +488,7 @@
 mem_address_not_aligned_tl1:
 	wrpr %g0, 1, %tl
-	PREEMPTIBLE_HANDLER mem_address_not_aligned
+	mov TT_MEM_ADDRESS_NOT_ALIGNED, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x68, TL > 0, fast_data_access_MMU_miss */
@@ -470,19 +555,9 @@
  *	%g1		Address of function to call if this is not a syscall.
  * 	%g2	 	First argument for the function.
+ *	%g5		I/DTLB_TAG_ACCESS register if applicable.
  *	%g6		Pre-set as kernel stack base if trap from userspace.
  *	%g7		Pre-set as address of the userspace window buffer.
  */
 .macro PREEMPTIBLE_HANDLER_TEMPLATE is_syscall
-	/*
-	 * ASSERT(%tl == 1)
-	 */
-	rdpr %tl, %g3
-	cmp %g3, 1
-	be %xcc, 1f
-	nop
-	! this is for debugging, if we ever get here it will be easy to find
-0:	ba,a %xcc, 0b
-
-1:
 .if NOT(\is_syscall)
 	rdpr %tstate, %g3
@@ -502,6 +577,6 @@
 	bnz %xcc, 0f				! ...skip setting of kernel stack and primary context
 	nop
-	
 .endif
+
 	/*
 	 * Normal window spills will go to the userspace window buffer.
@@ -516,5 +591,5 @@
 	 * and the new window's %fp.
 	 */
-	save %g6, -PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE, %sp
+	save %g6, -ISTATE_SIZE, %sp
 
 .if \is_syscall
@@ -548,5 +623,5 @@
 	ba,a %xcc, 1f
 0:
-	save %sp, -PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE, %sp
+	save %sp, -ISTATE_SIZE, %sp
 
 	/*
@@ -570,9 +645,9 @@
 .else
 	! store the syscall number on the stack as 7th argument
-	stx %g2, [%sp + STACK_WINDOW_SAVE_AREA_SIZE + STACK_BIAS + STACK_ARG6] 
+	stx %g2, [%sp + STACK_BIAS + ISTATE_OFFSET_ARG6] 
 .endif
 
 	/*
-	 * Save TSTATE, TPC and TNPC aside.
+	 * Save TSTATE, TPC, TNPC and I/DTLB_TAG_ACCESS aside.
 	 */
 	rdpr %tstate, %g1
@@ -581,16 +656,13 @@
 	rd %y, %g4
 
-	stx %g1, [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_TSTATE]
-	stx %g2, [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_TPC]
-	stx %g3, [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_TNPC]
+	stx %g1, [%sp + STACK_BIAS + ISTATE_OFFSET_TSTATE]
+	stx %g2, [%sp + STACK_BIAS + ISTATE_OFFSET_TPC]
+	stx %g3, [%sp + STACK_BIAS + ISTATE_OFFSET_TNPC]
+	stx %g5, [%sp + STACK_BIAS + ISTATE_OFFSET_TLB_TAG_ACCESS]
 
 	/*
 	 * Save the Y register.
-	 * This register is deprecated according to SPARC V9 specification
-	 * and is only present for backward compatibility with previous
-	 * versions of the SPARC architecture.
-	 * Surprisingly, gcc makes use of this register without a notice.
-	 */
-	stx %g4, [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_Y]
+	 */
+	stx %g4, [%sp + STACK_BIAS + ISTATE_OFFSET_Y]
 	
 	wrpr %g0, 0, %tl
@@ -603,5 +675,5 @@
 	 */
 	call %l0
-	add %sp, PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_TNPC, %o1
+	add %sp, STACK_BIAS, %o1
 .else
 	/*
@@ -621,7 +693,7 @@
 	 * Read TSTATE, TPC and TNPC from saved copy.
 	 */
-	ldx [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_TSTATE], %g1
-	ldx [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_TPC], %g2
-	ldx [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_TNPC], %g3
+	ldx [%sp + STACK_BIAS + ISTATE_OFFSET_TSTATE], %g1
+	ldx [%sp + STACK_BIAS + ISTATE_OFFSET_TPC], %g2
+	ldx [%sp + STACK_BIAS + ISTATE_OFFSET_TNPC], %g3
 
 	/*
@@ -644,5 +716,5 @@
 	 * Restore Y.
 	 */
-	ldx [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_Y], %g4
+	ldx [%sp + STACK_BIAS + ISTATE_OFFSET_Y], %g4
 	wr %g4, %y
 
@@ -684,22 +756,22 @@
 	 */
 	mov %sp, %g2
-	stx %i0, [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I0]
-	stx %i1, [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I1]
-	stx %i2, [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I2]
-	stx %i3, [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I3]
-	stx %i4, [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I4]
-	stx %i5, [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I5]
-	stx %i6, [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I6]
-	stx %i7, [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I7]
+	stx %i0, [%sp + STACK_BIAS + ISTATE_OFFSET_O0]
+	stx %i1, [%sp + STACK_BIAS + ISTATE_OFFSET_O1]
+	stx %i2, [%sp + STACK_BIAS + ISTATE_OFFSET_O2]
+	stx %i3, [%sp + STACK_BIAS + ISTATE_OFFSET_O3]
+	stx %i4, [%sp + STACK_BIAS + ISTATE_OFFSET_O4]
+	stx %i5, [%sp + STACK_BIAS + ISTATE_OFFSET_O5]
+	stx %i6, [%sp + STACK_BIAS + ISTATE_OFFSET_O6]
+	stx %i7, [%sp + STACK_BIAS + ISTATE_OFFSET_O7]
 	wrpr %l0, 0, %cwp
 	mov %g2, %sp
-	ldx [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I0], %i0
-	ldx [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I1], %i1
-	ldx [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I2], %i2
-	ldx [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I3], %i3
-	ldx [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I4], %i4
-	ldx [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I5], %i5
-	ldx [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I6], %i6
-	ldx [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I7], %i7
+	ldx [%sp + STACK_BIAS + ISTATE_OFFSET_O0], %i0
+	ldx [%sp + STACK_BIAS + ISTATE_OFFSET_O1], %i1
+	ldx [%sp + STACK_BIAS + ISTATE_OFFSET_O2], %i2
+	ldx [%sp + STACK_BIAS + ISTATE_OFFSET_O3], %i3
+	ldx [%sp + STACK_BIAS + ISTATE_OFFSET_O4], %i4
+	ldx [%sp + STACK_BIAS + ISTATE_OFFSET_O5], %i5
+	ldx [%sp + STACK_BIAS + ISTATE_OFFSET_O6], %i6
+	ldx [%sp + STACK_BIAS + ISTATE_OFFSET_O7], %i7
 
 	/*
@@ -807,5 +879,5 @@
 	 * If the:
 	 *
-	 * 	save %g6, -PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE, %sp
+	 * 	save %g6, -ISTATE_SIZE, %sp
 	 *
 	 * instruction trapped and spilled a register window into the userspace
Index: kernel/arch/sparc64/src/trap/sun4v/interrupt.c
===================================================================
--- kernel/arch/sparc64/src/trap/sun4v/interrupt.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/sparc64/src/trap/sun4v/interrupt.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -95,5 +95,5 @@
  * register and processes the message (invokes a function call).
  */
-void cpu_mondo(void)
+void cpu_mondo(unsigned int tt, istate_t *istate)
 {
 #ifdef CONFIG_SMP
Index: kernel/arch/sparc64/src/trap/sun4v/trap_table.S
===================================================================
--- kernel/arch/sparc64/src/trap/sun4v/trap_table.S	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/arch/sparc64/src/trap/sun4v/trap_table.S	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -66,5 +66,7 @@
 .global instruction_access_exception_tl0
 instruction_access_exception_tl0:
-	PREEMPTIBLE_HANDLER instruction_access_exception
+	mov TT_INSTRUCTION_ACCESS_EXCEPTION, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x09, TL = 0, instruction_access_mmu_miss */
@@ -77,5 +79,7 @@
 .global instruction_access_error_tl0
 instruction_access_error_tl0:
-	PREEMPTIBLE_HANDLER instruction_access_error
+	mov TT_INSTRUCTION_ACCESS_ERROR, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x0b, TL = 0, IAE_unauth_access */
@@ -83,5 +87,7 @@
 .global iae_unauth_access_tl0
 iae_unauth_access_tl0:
-	PREEMPTIBLE_HANDLER instruction_access_exception
+	mov TT_IAE_UNAUTH_ACCESS, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x0c, TL = 0, IAE_nfo_page */
@@ -89,5 +95,7 @@
 .global iae_nfo_page_tl0
 iae_nfo_page_tl0:
-	PREEMPTIBLE_HANDLER instruction_access_exception
+	mov TT_IAE_NFO_PAGE, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x10, TL = 0, illegal_instruction */
@@ -95,5 +103,7 @@
 .global illegal_instruction_tl0
 illegal_instruction_tl0:
-	PREEMPTIBLE_HANDLER illegal_instruction
+	mov TT_ILLEGAL_INSTRUCTION, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x11, TL = 0, privileged_opcode */
@@ -101,5 +111,7 @@
 .global privileged_opcode_tl0
 privileged_opcode_tl0:
-	PREEMPTIBLE_HANDLER privileged_opcode
+	mov TT_PRIVILEGED_OPCODE, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x12, TL = 0, unimplemented_LDD */
@@ -107,5 +119,7 @@
 .global unimplemented_LDD_tl0
 unimplemented_LDD_tl0:
-	PREEMPTIBLE_HANDLER unimplemented_LDD
+	mov TT_UNIMPLEMENTED_LDD, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x13, TL = 0, unimplemented_STD */
@@ -113,5 +127,7 @@
 .global unimplemented_STD_tl0
 unimplemented_STD_tl0:
-	PREEMPTIBLE_HANDLER unimplemented_STD
+	mov TT_UNIMPLEMENTED_STD, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x14, TL = 0, DAE_invalid_asi */
@@ -119,5 +135,7 @@
 .global dae_invalid_asi_tl0
 dae_invalid_asi_tl0:
-	PREEMPTIBLE_HANDLER data_access_exception
+	mov TT_DAE_INVALID_ASI, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x15, TL = 0, DAE_privilege_violation */
@@ -125,5 +143,7 @@
 .global dae_privilege_violation_tl0
 dae_privilege_violation_tl0:
-	PREEMPTIBLE_HANDLER data_access_exception
+	mov TT_DAE_PRIVILEGE_VIOLATION, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x16, TL = 0, DAE_nc_page */
@@ -131,5 +151,7 @@
 .global dae_nc_page_tl0
 dae_nc_page_tl0:
-	PREEMPTIBLE_HANDLER data_access_exception
+	mov TT_DAE_NC_PAGE, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x17, TL = 0, DAE_nfo_page */
@@ -137,5 +159,7 @@
 .global dae_nfo_page_tl0
 dae_nfo_page_tl0:
-	PREEMPTIBLE_HANDLER data_access_exception
+	mov TT_DAE_NFO_PAGE, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x20, TL = 0, fb_disabled handler */
@@ -143,5 +167,7 @@
 .global fb_disabled_tl0
 fp_disabled_tl0:
-	PREEMPTIBLE_HANDLER fp_disabled
+	mov TT_FP_DISABLED, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x21, TL = 0, fb_exception_ieee_754 handler */
@@ -149,5 +175,7 @@
 .global fb_exception_ieee_754_tl0
 fp_exception_ieee_754_tl0:
-	PREEMPTIBLE_HANDLER fp_exception_ieee_754
+	mov TT_FP_EXCEPTION_IEEE_754, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x22, TL = 0, fb_exception_other handler */
@@ -155,5 +183,7 @@
 .global fb_exception_other_tl0
 fp_exception_other_tl0:
-	PREEMPTIBLE_HANDLER fp_exception_other
+	mov TT_FP_EXCEPTION_OTHER, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x23, TL = 0, tag_overflow */
@@ -161,5 +191,7 @@
 .global tag_overflow_tl0
 tag_overflow_tl0:
-	PREEMPTIBLE_HANDLER tag_overflow
+	mov TT_TAG_OVERFLOW, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x24, TL = 0, clean_window handler */
@@ -173,5 +205,7 @@
 .global division_by_zero_tl0
 division_by_zero_tl0:
-	PREEMPTIBLE_HANDLER division_by_zero
+	mov TT_DIVISION_BY_ZERO, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x30, TL = 0, data_access_exception */
@@ -180,5 +214,7 @@
 .global data_access_exception_tl0
 data_access_exception_tl0:
-	PREEMPTIBLE_HANDLER data_access_exception
+	mov TT_DATA_ACCESS_EXCEPTION, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x31, TL = 0, data_access_mmu_miss */
@@ -192,5 +228,7 @@
 .global data_access_error_tl0
 data_access_error_tl0:
-	PREEMPTIBLE_HANDLER data_access_error
+	mov TT_DATA_ACCESS_ERROR, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x34, TL = 0, mem_address_not_aligned */
@@ -198,5 +236,7 @@
 .global mem_address_not_aligned_tl0
 mem_address_not_aligned_tl0:
-	PREEMPTIBLE_HANDLER mem_address_not_aligned
+	mov TT_MEM_ADDRESS_NOT_ALIGNED, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x35, TL = 0, LDDF_mem_address_not_aligned */
@@ -204,5 +244,7 @@
 .global LDDF_mem_address_not_aligned_tl0
 LDDF_mem_address_not_aligned_tl0:
-	PREEMPTIBLE_HANDLER LDDF_mem_address_not_aligned
+	mov TT_LDDF_MEM_ADDRESS_NOT_ALIGNED, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x36, TL = 0, STDF_mem_address_not_aligned */
@@ -210,5 +252,7 @@
 .global STDF_mem_address_not_aligned_tl0
 STDF_mem_address_not_aligned_tl0:
-	PREEMPTIBLE_HANDLER STDF_mem_address_not_aligned
+	mov TT_STDF_MEM_ADDRESS_NOT_ALIGNED, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x37, TL = 0, privileged_action */
@@ -216,5 +260,7 @@
 .global privileged_action_tl0
 privileged_action_tl0:
-	PREEMPTIBLE_HANDLER privileged_action
+	mov TT_PRIVILEGED_ACTION, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x38, TL = 0, LDQF_mem_address_not_aligned */
@@ -222,5 +268,7 @@
 .global LDQF_mem_address_not_aligned_tl0
 LDQF_mem_address_not_aligned_tl0:
-	PREEMPTIBLE_HANDLER LDQF_mem_address_not_aligned
+	mov TT_LDQF_MEM_ADDRESS_NOT_ALIGNED, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x39, TL = 0, STQF_mem_address_not_aligned */
@@ -228,5 +276,7 @@
 .global STQF_mem_address_not_aligned_tl0
 STQF_mem_address_not_aligned_tl0:
-	PREEMPTIBLE_HANDLER STQF_mem_address_not_aligned
+	mov TT_STQF_MEM_ADDRESS_NOT_ALIGNED, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x41, TL = 0, interrupt_level_1 handler */
@@ -234,5 +284,7 @@
 .global interrupt_level_1_handler_tl0
 interrupt_level_1_handler_tl0:
-	INTERRUPT_LEVEL_N_HANDLER 1
+	mov TT_INTERRUPT_LEVEL_1, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x42, TL = 0, interrupt_level_2 handler */
@@ -240,5 +292,7 @@
 .global interrupt_level_2_handler_tl0
 interrupt_level_2_handler_tl0:
-	INTERRUPT_LEVEL_N_HANDLER 2
+	mov TT_INTERRUPT_LEVEL_2, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x43, TL = 0, interrupt_level_3 handler */
@@ -246,5 +300,7 @@
 .global interrupt_level_3_handler_tl0
 interrupt_level_3_handler_tl0:
-	INTERRUPT_LEVEL_N_HANDLER 3
+	mov TT_INTERRUPT_LEVEL_3, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x44, TL = 0, interrupt_level_4 handler */
@@ -252,5 +308,7 @@
 .global interrupt_level_4_handler_tl0
 interrupt_level_4_handler_tl0:
-	INTERRUPT_LEVEL_N_HANDLER 4
+	mov TT_INTERRUPT_LEVEL_4, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x45, TL = 0, interrupt_level_5 handler */
@@ -258,5 +316,7 @@
 .global interrupt_level_5_handler_tl0
 interrupt_level_5_handler_tl0:
-	INTERRUPT_LEVEL_N_HANDLER 5
+	mov TT_INTERRUPT_LEVEL_5, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x46, TL = 0, interrupt_level_6 handler */
@@ -264,5 +324,7 @@
 .global interrupt_level_6_handler_tl0
 interrupt_level_6_handler_tl0:
-	INTERRUPT_LEVEL_N_HANDLER 6
+	mov TT_INTERRUPT_LEVEL_6, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x47, TL = 0, interrupt_level_7 handler */
@@ -270,5 +332,7 @@
 .global interrupt_level_7_handler_tl0
 interrupt_level_7_handler_tl0:
-	INTERRUPT_LEVEL_N_HANDLER 7
+	mov TT_INTERRUPT_LEVEL_7, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x48, TL = 0, interrupt_level_8 handler */
@@ -276,5 +340,7 @@
 .global interrupt_level_8_handler_tl0
 interrupt_level_8_handler_tl0:
-	INTERRUPT_LEVEL_N_HANDLER 8
+	mov TT_INTERRUPT_LEVEL_8, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x49, TL = 0, interrupt_level_9 handler */
@@ -282,5 +348,7 @@
 .global interrupt_level_9_handler_tl0
 interrupt_level_9_handler_tl0:
-	INTERRUPT_LEVEL_N_HANDLER 9
+	mov TT_INTERRUPT_LEVEL_9, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x4a, TL = 0, interrupt_level_10 handler */
@@ -288,5 +356,7 @@
 .global interrupt_level_10_handler_tl0
 interrupt_level_10_handler_tl0:
-	INTERRUPT_LEVEL_N_HANDLER 10
+	mov TT_INTERRUPT_LEVEL_10, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x4b, TL = 0, interrupt_level_11 handler */
@@ -294,5 +364,7 @@
 .global interrupt_level_11_handler_tl0
 interrupt_level_11_handler_tl0:
-	INTERRUPT_LEVEL_N_HANDLER 11
+	mov TT_INTERRUPT_LEVEL_11, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x4c, TL = 0, interrupt_level_12 handler */
@@ -300,5 +372,7 @@
 .global interrupt_level_12_handler_tl0
 interrupt_level_12_handler_tl0:
-	INTERRUPT_LEVEL_N_HANDLER 12
+	mov TT_INTERRUPT_LEVEL_12, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x4d, TL = 0, interrupt_level_13 handler */
@@ -306,5 +380,7 @@
 .global interrupt_level_13_handler_tl0
 interrupt_level_13_handler_tl0:
-	INTERRUPT_LEVEL_N_HANDLER 13
+	mov TT_INTERRUPT_LEVEL_13, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x4e, TL = 0, interrupt_level_14 handler */
@@ -312,5 +388,7 @@
 .global interrupt_level_14_handler_tl0
 interrupt_level_14_handler_tl0:
-	INTERRUPT_LEVEL_N_HANDLER 14
+	mov TT_INTERRUPT_LEVEL_14, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x4f, TL = 0, interrupt_level_15 handler */
@@ -318,5 +396,7 @@
 .global interrupt_level_15_handler_tl0
 interrupt_level_15_handler_tl0:
-	INTERRUPT_LEVEL_N_HANDLER 15
+	mov TT_INTERRUPT_LEVEL_15, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch
 
 /* TT = 0x64, TL = 0, fast_instruction_access_MMU_miss */
@@ -342,5 +422,7 @@
 .global cpu_mondo_handler_tl0
 cpu_mondo_handler_tl0:
-PREEMPTIBLE_HANDLER cpu_mondo
+	mov TT_CPU_MONDO, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x80, TL = 0, spill_0_normal handler */
@@ -392,6 +474,7 @@
 .global trap_instruction_\cur\()_tl0
 trap_instruction_\cur\()_tl0:
+	mov \cur, %g2
 	ba %xcc, trap_instruction_handler
-	mov \cur, %g2
+	clr %g5
 .endr
 
@@ -406,5 +489,7 @@
 instruction_access_exception_tl1:
 	wrpr %g0, 1, %tl
-	PREEMPTIBLE_HANDLER instruction_access_exception
+	mov TT_INSTRUCTION_ACCESS_EXCEPTION, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x09, TL > 0, instruction_access_mmu_miss */
@@ -419,5 +504,7 @@
 instruction_access_error_tl1:
 	wrpr %g0, 1, %tl
-	PREEMPTIBLE_HANDLER instruction_access_error
+	mov TT_INSTRUCTION_ACCESS_ERROR, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x0b, TL > 0, IAE_unauth_access */
@@ -426,5 +513,7 @@
 iae_unauth_access_tl1:
 	wrpr %g0, 1, %tl
-	PREEMPTIBLE_HANDLER instruction_access_exception
+	mov TT_IAE_UNAUTH_ACCESS, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x0c, TL > 0, IAE_nfo_page */
@@ -433,5 +522,7 @@
 iae_nfo_page_tl1:
 	wrpr %g0, 1, %tl
-	PREEMPTIBLE_HANDLER instruction_access_exception
+	mov TT_IAE_NFO_PAGE, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x10, TL > 0, illegal_instruction */
@@ -440,5 +531,7 @@
 illegal_instruction_tl1:
 	wrpr %g0, 1, %tl
-	PREEMPTIBLE_HANDLER illegal_instruction
+	mov TT_ILLEGAL_INSTRUCTION, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x14, TL > 0, DAE_invalid_asi */
@@ -447,5 +540,7 @@
 dae_invalid_asi_tl1:
 	wrpr %g0, 1, %tl
-	PREEMPTIBLE_HANDLER data_access_exception
+	mov TT_DAE_INVALID_ASI, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x15, TL > 0, DAE_privilege_violation */
@@ -454,5 +549,7 @@
 dae_privilege_violation_tl1:
 	wrpr %g0, 1, %tl
-	PREEMPTIBLE_HANDLER data_access_exception
+	mov TT_DAE_PRIVILEGE_VIOLATION, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x16, TL > 0, DAE_nc_page */
@@ -461,5 +558,7 @@
 dae_nc_page_tl1:
 	wrpr %g0, 1, %tl
-	PREEMPTIBLE_HANDLER data_access_exception
+	mov TT_DAE_NC_PAGE, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x17, TL > 0, DAE_nfo_page */
@@ -468,5 +567,7 @@
 dae_nfo_page_tl1:
 	wrpr %g0, 1, %tl
-	PREEMPTIBLE_HANDLER data_access_exception
+	mov TT_DAE_NFO_PAGE, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x24, TL > 0, clean_window handler */
@@ -481,5 +582,7 @@
 division_by_zero_tl1:
 	wrpr %g0, 1, %tl
-	PREEMPTIBLE_HANDLER division_by_zero
+	mov TT_DIVISION_BY_ZERO, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x30, TL > 0, data_access_exception */
@@ -487,7 +590,8 @@
 .global data_access_exception_tl1
 data_access_exception_tl1:
-	/*wrpr %g0, 1, %tl
-	wrpr %g0, PSTATE_AG_BIT | PSTATE_PRIV_BIT, %pstate
-	PREEMPTIBLE_HANDLER data_access_exception*/
+	wrpr %g0, 1, %tl
+	mov TT_DATA_ACCESS_EXCEPTION, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x31, TL > 0, data_access_mmu_miss */
@@ -502,5 +606,7 @@
 data_access_error_tl1:
 	wrpr %g0, 1, %tl
-	PREEMPTIBLE_HANDLER data_access_error
+	mov TT_DATA_ACCESS_ERROR, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x34, TL > 0, mem_address_not_aligned */
@@ -509,5 +615,7 @@
 mem_address_not_aligned_tl1:
 	wrpr %g0, 1, %tl
-	PREEMPTIBLE_HANDLER mem_address_not_aligned
+	mov TT_MEM_ADDRESS_NOT_ALIGNED, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x68, TL > 0, fast_data_access_MMU_miss */
@@ -528,5 +636,7 @@
 cpu_mondo_handler_tl1:
 	wrpr %g0, %tl
-	PREEMPTIBLE_HANDLER cpu_mondo
+	mov TT_CPU_MONDO, %g2
+	clr %g5
+	PREEMPTIBLE_HANDLER exc_dispatch 
 
 /* TT = 0x80, TL > 0, spill_0_normal handler */
@@ -654,5 +764,5 @@
 .else
 	! store the syscall number on the stack as 7th argument
-	stx %g2, [%sp + STACK_WINDOW_SAVE_AREA_SIZE + STACK_BIAS + STACK_ARG6] 
+	stx %g2, [%sp + STACK_BIAS + ISTATE_OFFSET_ARG6] 
 .endif
 
@@ -664,17 +774,18 @@
 	rdpr %tnpc, %g3
 
-	stx %g1, [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_TSTATE]
-	stx %g2, [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_TPC]
-	stx %g3, [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_TNPC]
+	stx %g1, [%sp + STACK_BIAS + ISTATE_OFFSET_TSTATE]
+	stx %g2, [%sp + STACK_BIAS + ISTATE_OFFSET_TPC]
+	stx %g3, [%sp + STACK_BIAS + ISTATE_OFFSET_TNPC]
 
 	/*
 	 * Save the Y register.
-	 * This register is deprecated according to SPARC V9 specification
-	 * and is only present for backward compatibility with previous
-	 * versions of the SPARC architecture.
-	 * Surprisingly, gcc makes use of this register without a notice.
 	 */
 	rd %y, %g4
-	stx %g4, [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_Y]
+	stx %g4, [%sp + STACK_BIAS + ISTATE_OFFSET_Y]
+
+	/*
+	 * Save the faulting page and context.
+	 */
+	stx %g5, [%sp + STACK_BIAS + ISTATE_OFFSET_TLB_TAG_ACCESS]
 
 	/* switch to TL = 0, explicitly enable FPU */
@@ -689,5 +800,5 @@
 	/* call higher-level service routine, pass istate as its 2nd parameter */
 	call %l0
-	add %sp, PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_TNPC, %o1
+	add %sp, STACK_BIAS, %o1
 .else
 	/* Call the higher-level syscall handler. */
@@ -711,7 +822,7 @@
 
 	/* Read TSTATE, TPC and TNPC from saved copy. */
-	ldx [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_TSTATE], %g1
-	ldx [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_TPC], %g2
-	ldx [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_TNPC], %g3
+	ldx [%sp + STACK_BIAS + ISTATE_OFFSET_TSTATE], %g1
+	ldx [%sp + STACK_BIAS + ISTATE_OFFSET_TPC], %g2
+	ldx [%sp + STACK_BIAS + ISTATE_OFFSET_TNPC], %g3
 
 	/* Copy PSTATE.PEF to the in-register copy of TSTATE. */
@@ -728,5 +839,5 @@
 
 	/* Restore Y. */
-	ldx [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_Y], %g4
+	ldx [%sp + STACK_BIAS + ISTATE_OFFSET_Y], %g4
 	wr %g4, %y
 	
@@ -750,22 +861,22 @@
 	 */
 	mov %sp, %g2
-	stx %i0, [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I0]
-	stx %i1, [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I1]
-	stx %i2, [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I2]
-	stx %i3, [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I3]
-	stx %i4, [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I4]
-	stx %i5, [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I5]
-	stx %i6, [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I6]
-	stx %i7, [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I7]
+	stx %i0, [%sp + STACK_BIAS + ISTATE_OFFSET_O0]
+	stx %i1, [%sp + STACK_BIAS + ISTATE_OFFSET_O1]
+	stx %i2, [%sp + STACK_BIAS + ISTATE_OFFSET_O2]
+	stx %i3, [%sp + STACK_BIAS + ISTATE_OFFSET_O3]
+	stx %i4, [%sp + STACK_BIAS + ISTATE_OFFSET_O4]
+	stx %i5, [%sp + STACK_BIAS + ISTATE_OFFSET_O5]
+	stx %i6, [%sp + STACK_BIAS + ISTATE_OFFSET_O6]
+	stx %i7, [%sp + STACK_BIAS + ISTATE_OFFSET_O7]
 	wrpr %l0, 0, %cwp
 	mov %g2, %sp
-	ldx [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I0], %i0
-	ldx [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I1], %i1
-	ldx [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I2], %i2
-	ldx [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I3], %i3
-	ldx [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I4], %i4
-	ldx [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I5], %i5
-	ldx [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I6], %i6
-	ldx [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I7], %i7
+	ldx [%sp + STACK_BIAS + ISTATE_OFFSET_O0], %i0
+	ldx [%sp + STACK_BIAS + ISTATE_OFFSET_O1], %i1
+	ldx [%sp + STACK_BIAS + ISTATE_OFFSET_O2], %i2
+	ldx [%sp + STACK_BIAS + ISTATE_OFFSET_O3], %i3
+	ldx [%sp + STACK_BIAS + ISTATE_OFFSET_O4], %i4
+	ldx [%sp + STACK_BIAS + ISTATE_OFFSET_O5], %i5
+	ldx [%sp + STACK_BIAS + ISTATE_OFFSET_O6], %i6
+	ldx [%sp + STACK_BIAS + ISTATE_OFFSET_O7], %i7
 .endm
 
@@ -774,17 +885,4 @@
  */
 .macro PREEMPTIBLE_HANDLER_KERNEL
-
-	/*
-	 * ASSERT(%tl == 1)
-	 */
-	rdpr %tl, %g3
-	cmp %g3, 1
-	be %xcc, 1f
-	nop
-
-	! this is for debugging, if we ever get here it will be easy to find
-0:	ba,a %xcc, 0b
-
-1:
 	/* prevent unnecessary CLEANWIN exceptions */
 	wrpr %g0, NWINDOWS - 1, %cleanwin
@@ -799,9 +897,19 @@
 	brnz %g3, 2f
 	nop
+	rdpr %otherwin, %g4
+	brnz %g4, 1f
+	nop
+
+	/* OTHERWIN is zero, we are spilling a kernel window. */
 	INLINE_SPILL %g3, %g4
+	ba,a %xcc, 2f
+
+1:
+	/* OTHERWIN is non-zero, we are spilling a uspace window. */
+	INLINE_SPILL_TO_WBUF %g3, %g4, %g7
 
 2:
 	/* ask for new register window */
-	save %sp, -PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE, %sp
+	save %sp, -ISTATE_SIZE, %sp
 
 	MIDDLE_PART 0
@@ -882,5 +990,5 @@
 	set SCRATCHPAD_KSTACK, %g4
 	ldxa [%g4] ASI_SCRATCHPAD, %g6
-	save %g6, -PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE, %sp
+	save %g6, -ISTATE_SIZE, %sp
 
 .if \is_syscall
@@ -1015,5 +1123,5 @@
 	 * If the:
 	 *
-	 * 	save %g6, -PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE, %sp
+	 * 	save %g6, -ISTATE_SIZE, %sp
 	 *
 	 * instruction trapped and spilled a register window into the userspace
Index: kernel/generic/include/adt/cht.h
===================================================================
--- kernel/generic/include/adt/cht.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
+++ kernel/generic/include/adt/cht.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup genericadt
+ * @{
+ */
+/** @file
+ */
+
+#ifndef KERN_CONC_HASH_TABLE_H_
+#define KERN_CONC_HASH_TABLE_H_
+
+#include <stdint.h>
+#include <adt/list.h>
+#include <synch/rcu_types.h>
+#include <macros.h>
+#include <synch/workqueue.h>
+
+typedef uintptr_t cht_ptr_t;
+
+/** Concurrent hash table node link. */
+typedef struct cht_link {
+	/* Must be placed first. 
+	 * 
+	 * The function pointer (rcu_link.func) is used to store the item's 
+	 * mixed memoized hash. If in use by RCU (ie waiting for deferred 
+	 * destruction) the hash will contain the value of 
+	 * cht_t.op->remove_callback.
+	 */
+	union {
+		rcu_item_t rcu_link;
+		size_t hash;
+	};
+	/** Link to the next item in the bucket including any marks. */
+	cht_ptr_t link;
+} cht_link_t;
+
+/** Set of operations for a concurrent hash table. */
+typedef struct cht_ops {
+	/** Returns the hash of the item.
+	 * 
+	 * Applicable also to items that were logically deleted from the table
+	 * but have yet to be physically removed by means of remove_callback().
+	 */
+	size_t (*hash)(const cht_link_t *item);
+	/** Returns the hash value of the key used to search for entries. */
+	size_t (*key_hash)(void *key);
+	/** Returns true if the two items store equal search keys. */
+	bool (*equal)(const cht_link_t *item1, const cht_link_t *item2);
+	/** Returns true if the item contains an equal search key. */
+	bool (*key_equal)(void *key, const cht_link_t *item);
+	/** Invoked to free a removed item once all references to it are dropped. */
+	void (*remove_callback)(cht_link_t *item);
+} cht_ops_t;
+
+/** Groups hash table buckets with their count.
+ * 
+ * It allows both the number of buckets as well as the bucket array
+ * to be swapped atomically when resizing the table.
+ */
+typedef struct cht_buckets {
+	/** The number of buckets is 2^order. */
+	size_t order;
+	/** Array of single linked list bucket heads along with any marks. */
+	cht_ptr_t head[1];
+} cht_buckets_t;
+
+/** Concurrent hash table structure. */
+typedef struct {
+	/** Item specific operations. */
+	cht_ops_t *op;
+	
+	/** Buckets currently in use. */
+	cht_buckets_t *b;
+	/** Resized table buckets that will replace b once resize is complete. */
+	cht_buckets_t *new_b;
+	/** Invalid memoized hash value. 
+	 * 
+	 * If cht_link.hash contains this value the item had been logically
+	 * removed and is waiting to be freed. Such hashes (and the associated
+	 * items) are disregarded and skipped or the actual hash must be 
+	 * determined via op->hash().
+	 */
+	size_t invalid_hash;
+
+	/** Minimum number of buckets is 2^min_order. */
+	size_t min_order;
+	/** Maximum number of items per bucket before the table grows. */
+	size_t max_load;
+	/** Table is resized in the background in a work queue. */
+	work_t resize_work;
+	/** If positive the table should grow or shrink.
+	 * 
+	 * If not 0 resize work had already been posted to the system work queue.
+	 */
+	atomic_t resize_reqs;
+	
+	/** Number of items in the table that have not been logically deleted. */
+	atomic_t item_cnt;
+} cht_t;
+
+#define cht_get_inst(item, type, member) \
+	member_to_inst((item), type, member)
+
+
+#define cht_read_lock()     rcu_read_lock()
+#define cht_read_unlock()   rcu_read_unlock()
+
+extern bool cht_create_simple(cht_t *h, cht_ops_t *op);
+extern bool cht_create(cht_t *h, size_t init_size, size_t min_size, 
+	size_t max_load, bool can_block, cht_ops_t *op);
+extern void cht_destroy(cht_t *h);
+extern void cht_destroy_unsafe(cht_t *h);
+
+extern cht_link_t *cht_find(cht_t *h, void *key);
+extern cht_link_t *cht_find_lazy(cht_t *h, void *key);
+extern cht_link_t *cht_find_next(cht_t *h, const cht_link_t *item);
+extern cht_link_t *cht_find_next_lazy(cht_t *h, const cht_link_t *item);
+
+extern void cht_insert(cht_t *h, cht_link_t *item);
+extern bool cht_insert_unique(cht_t *h, cht_link_t *item, cht_link_t **dup_item);
+extern size_t cht_remove_key(cht_t *h, void *key);
+extern bool cht_remove_item(cht_t *h, cht_link_t *item);
+
+#endif
+
+/** @}
+ */
Index: kernel/generic/include/adt/hash.h
===================================================================
--- kernel/generic/include/adt/hash.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
+++ kernel/generic/include/adt/hash.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup genericadt
+ * @{
+ */
+/** @file
+ */
+#ifndef KERN_HASH_H_
+#define KERN_HASH_H_
+
+#include <stdint.h>
+
+/** Scrambles a skewed 32-bit input so that every output bit is affected. */
+static inline uint32_t hash_mix32(uint32_t hash)
+{
+	/*
+	 * Thomas Wang's modification of Bob Jenkins' hash mixing function
+	 * (public domain), see:
+	 * http://www.concentric.net/~Ttwang/tech/inthash.htm
+	 */
+	hash = (hash << 15) + ~hash;
+	hash ^= hash >> 12;
+	hash += hash << 2;
+	hash ^= hash >> 4;
+	hash *= 2057;
+	hash ^= hash >> 16;
+	return hash;
+}
+
+/** Scrambles a skewed 64-bit input so that every output bit is affected. */
+static inline uint64_t hash_mix64(uint64_t hash)
+{
+	/*
+	 * Thomas Wang's public domain 64-bit hash mixing function:
+	 * http://www.concentric.net/~Ttwang/tech/inthash.htm
+	 */
+	hash ^= 61 ^ (hash >> 16);
+	hash += hash << 3;
+	hash ^= hash >> 4;
+	hash *= 0x27d4eb2d;
+	hash ^= hash >> 15;
+	/*
+	 * The lower order bits end up mixed more thoroughly than the
+	 * higher order ones. Exchange the two 32-bit halves so that the
+	 * resulting higher order bits become more usable.
+	 */
+	return (hash >> 32) | (hash << 32);
+}
+
+/** Produces a uniform hash affecting all output bits from the skewed input. */
+static inline size_t hash_mix(size_t hash) 
+{
+#ifdef __32_BITS__
+	return hash_mix32(hash);
+#elif defined(__64_BITS__)
+	return hash_mix64(hash);
+#else
+#error Unknown size_t size - cannot select proper hash mix function.
+#endif
+}
+
+/** Folds another hash value into an accumulated seed.
+ *
+ * Call repeatedly to derive a single hash from multiple values:
+ * @code
+ * int car_id;
+ * bool car_convertible;
+ * // ..
+ * size_t hash = 0;
+ * hash = hash_combine(hash, car_id);
+ * hash = hash_combine(hash, car_convertible);
+ * // Now use hash as a hash of both car_id and car_convertible.
+ * @endcode
+ */
+static inline size_t hash_combine(size_t seed, size_t hash)
+{
+	/*
+	 * todo: use Bob Jenkins' proper mixing hash pass:
+	 * http://burtleburtle.net/bob/c/lookup3.c
+	 */
+	const size_t rotated =
+		(seed << 5) | (seed >> (sizeof(size_t) * 8 - 5));
+	return seed ^ (hash + 0x9e3779b9 + rotated);
+}
+
+#endif
Index: kernel/generic/include/adt/list.h
===================================================================
--- kernel/generic/include/adt/list.h	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/generic/include/adt/list.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -52,4 +52,10 @@
 } list_t;
 
+
+extern int list_member(const link_t *, const list_t *);
+extern void list_splice(list_t *, link_t *);
+extern unsigned int list_count(const list_t *);
+
+
 /** Declare and initialize statically allocated list.
  *
@@ -80,4 +86,36 @@
 	    _link != &(list).head; _link = _link->prev)
 
+/** Unlike list_foreach(), allows removing items while traversing a list.
+ * 
+ * @code
+ * list_t mylist;
+ * typedef struct item {
+ *     int value;
+ *     link_t item_link;
+ * } item_t;
+ * 
+ * //..
+ * 
+ * // Print each list element's value and remove the element from the list.
+ * list_foreach_safe(mylist, cur_link, next_link) {
+ *     item_t *cur_item = list_get_instance(cur_link, item_t, item_link);
+ *     printf("%d\n", cur_item->value);
+ *     list_remove(cur_link);
+ * }
+ * @endcode
+ * 
+ * @param list List to traverse.
+ * @param iterator Iterator to the current element of the list.
+ *             The item this iterator points to may be safely removed
+ *             from the list.
+ * @param next_iter Iterator to the next element of the list.
+ */
+#define list_foreach_safe(list, iterator, next_iter) \
+	for (link_t *iterator = (list).head.next, \
+		*next_iter = iterator->next; \
+		iterator != &(list).head; \
+		iterator = next_iter, next_iter = iterator->next)
+
+	
 #define assert_link_not_used(link) \
 	ASSERT(!link_used(link))
@@ -289,4 +327,19 @@
 {
 	headless_list_split_or_concat(part1, part2);
+}
+
+/** Concatenate two lists
+ *
+ * Concatenate lists @a list1 and @a list2, producing a single
+ * list @a list1 containing items from both (in @a list1, @a list2
+ * order) and empty list @a list2.
+ *
+ * @param list1		First list and concatenated output
+ * @param list2 	Second list and empty output.
+ *
+ */
+NO_TRACE static inline void list_concat(list_t *list1, list_t *list2)
+{
+	list_splice(list2, list1->head.prev);
 }
 
@@ -340,8 +393,4 @@
 }
 
-extern int list_member(const link_t *, const list_t *);
-extern void list_concat(list_t *, list_t *);
-extern unsigned int list_count(const list_t *);
-
 #endif
 
Index: kernel/generic/include/arch.h
===================================================================
--- kernel/generic/include/arch.h	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/generic/include/arch.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -36,8 +36,8 @@
 #define KERN_ARCH_H_
 
-#include <arch/arch.h>
-#include <proc/thread.h>
-#include <proc/task.h>
-#include <mm/as.h>
+#include <arch/arch.h>  /* arch_pre_main() */
+#include <arch/asm.h>   /* get_stack_base() */
+#include <config.h>
+
 
 /*
@@ -49,9 +49,4 @@
 #define THE  ((the_t * )(get_stack_base()))
 
-#define CPU                  THE->cpu
-#define THREAD               THE->thread
-#define TASK                 THE->task
-#define AS                   THE->as
-#define PREEMPTION_DISABLED  THE->preemption_disabled
 #define MAGIC                UINT32_C(0xfacefeed)
 
@@ -62,4 +57,10 @@
 	((THE->task) ? (THE->task->container) : (DEFAULT_CONTAINER))
 
+/* Fwd decl. to avoid include hell. */
+struct thread;
+struct task;
+struct cpu;
+struct as;
+
 /**
  * For each possible kernel stack, structure
@@ -68,10 +69,13 @@
  */
 typedef struct {
-	size_t preemption_disabled;  /**< Preemption disabled counter. */
-	thread_t *thread;            /**< Current thread. */
-	task_t *task;                /**< Current task. */
-	cpu_t *cpu;                  /**< Executing cpu. */
-	as_t *as;                    /**< Current address space. */
-	uint32_t magic;              /**< Magic value */
+	size_t preemption;     /**< Preemption disabled counter and flag. */
+#ifdef RCU_PREEMPT_A
+	size_t rcu_nesting;    /**< RCU nesting count and flag. */
+#endif 
+	struct thread *thread; /**< Current thread. */
+	struct task *task;     /**< Current task. */
+	struct cpu *cpu;       /**< Executing cpu. */
+	struct as *as;         /**< Current address space. */
+	uint32_t magic;        /**< Magic value */
 } the_t;
 
@@ -91,4 +95,5 @@
 extern void *arch_construct_function(fncptr_t *, void *, void *);
 
+
 #endif
 
Index: kernel/generic/include/atomic.h
===================================================================
--- kernel/generic/include/atomic.h	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/generic/include/atomic.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -53,4 +53,18 @@
 }
 
+
+/*
+ * If the architecture does not provide operations that are atomic
+ * only with respect to the local cpu (eg exception handlers) and
+ * not other cpus, implement these cpu local atomic operations with
+ * full blown smp-safe atomics.
+ */
+#ifndef local_atomic_exchange
+#define local_atomic_exchange(var_addr, new_val) \
+	__atomic_exchange_n((var_addr), (new_val), __ATOMIC_RELAXED)
+#endif
+
+
+
 #endif
 
Index: kernel/generic/include/compiler/barrier.h
===================================================================
--- kernel/generic/include/compiler/barrier.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
+++ kernel/generic/include/compiler/barrier.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef KERN_COMPILER_BARRIER_H_
+#define KERN_COMPILER_BARRIER_H_
+
+#define compiler_barrier() asm volatile ("" ::: "memory")
+
+/** Forces the compiler to access (ie load/store) the variable only once. */
+#define ACCESS_ONCE(var) (*((volatile typeof(var)*)&(var)))
+
+#endif /* KERN_COMPILER_BARRIER_H_ */
Index: kernel/generic/include/cpu.h
===================================================================
--- kernel/generic/include/cpu.h	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/generic/include/cpu.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -38,7 +38,13 @@
 #include <mm/tlb.h>
 #include <synch/spinlock.h>
+#include <synch/rcu_types.h>
 #include <proc/scheduler.h>
 #include <arch/cpu.h>
 #include <arch/context.h>
+#include <adt/list.h>
+#include <arch.h>
+
+#define CPU                  THE->cpu
+
 
 /** CPU structure.
@@ -94,4 +100,13 @@
 	
 	/**
+	 * SMP calls to invoke on this CPU.
+	 */
+	SPINLOCK_DECLARE(smp_calls_lock);
+	list_t smp_pending_calls;
+	
+	/** RCU per-cpu data. Uses own locking. */
+	rcu_cpu_data_t rcu;
+	
+	/**
 	 * Stack used by scheduler when there is no running thread.
 	 */
Index: kernel/generic/include/cpu/cpu_mask.h
===================================================================
--- kernel/generic/include/cpu/cpu_mask.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
+++ kernel/generic/include/cpu/cpu_mask.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup generic
+ * @{
+ */
+/** @file
+ */
+#ifndef KERN_CPU_CPU_MASK_H_
+#define KERN_CPU_CPU_MASK_H_
+
+#include <cpu.h>
+#include <config.h>
+#include <lib/memfnc.h>
+
+/** Iterates over all cpu id's whose bit is included in the cpu mask. 
+ * 
+ * Example usage:
+ * @code
+ * DEFINE_CPU_MASK(cpu_mask);
+ * cpu_mask_active(cpu_mask);
+ * 
+ * cpu_mask_for_each(*cpu_mask, cpu_id) {
+ *     printf("Cpu with logical id %u is active.\n", cpu_id);
+ * }
+ * @endcode
+ */
+#define cpu_mask_for_each(mask, cpu_id) \
+	for (unsigned int (cpu_id) = 0; (cpu_id) < config.cpu_count; ++(cpu_id)) \
+		if (cpu_mask_is_set(&(mask), (cpu_id))) 
+
+/** Allocates a cpu_mask_t on stack. */
+#define DEFINE_CPU_MASK(cpu_mask) \
+	cpu_mask_t *(cpu_mask) = (cpu_mask_t*) alloca(cpu_mask_size())
+
+/** If used with DEFINE_CPU_MASK, the mask is large enough for all detected cpus.*/
+typedef struct cpu_mask {
+	unsigned int mask[1];
+} cpu_mask_t;
+
+
+extern size_t cpu_mask_size(void);
+extern void cpu_mask_active(cpu_mask_t *);
+extern void cpu_mask_all(cpu_mask_t *);
+extern void cpu_mask_none(cpu_mask_t *);
+extern void cpu_mask_set(cpu_mask_t *, unsigned int);
+extern void cpu_mask_reset(cpu_mask_t *, unsigned int);
+extern bool cpu_mask_is_set(cpu_mask_t *, unsigned int);
+extern bool cpu_mask_is_none(cpu_mask_t *);
+
+#endif /* KERN_CPU_CPU_MASK_H_ */ 
+
+/** @}
+ */
Index: kernel/generic/include/lib/memfnc.h
===================================================================
--- kernel/generic/include/lib/memfnc.h	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/generic/include/lib/memfnc.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -50,4 +50,6 @@
     ATTRIBUTE_OPTIMIZE("-fno-tree-loop-distribute-patterns") DO_NOT_DISCARD;
 
+#define alloca(size) __builtin_alloca((size))
+
 #endif
 
Index: kernel/generic/include/macros.h
===================================================================
--- kernel/generic/include/macros.h	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/generic/include/macros.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -157,4 +157,11 @@
 	})
 
+
+#ifndef member_to_inst
+#define member_to_inst(ptr_member, type, member_identif) \
+	((type*) (((void*)(ptr_member)) - ((void*)&(((type*)0)->member_identif))))
+#endif
+
+
 #endif
 
Index: kernel/generic/include/mm/as.h
===================================================================
--- kernel/generic/include/mm/as.h	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/generic/include/mm/as.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -48,4 +48,8 @@
 #include <adt/btree.h>
 #include <lib/elf.h>
+#include <arch.h>
+
+#define AS                   THE->as
+
 
 /**
Index: kernel/generic/include/preemption.h
===================================================================
--- kernel/generic/include/preemption.h	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/generic/include/preemption.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -36,6 +36,27 @@
 #define KERN_PREEMPTION_H_
 
-extern void preemption_disable(void);
-extern void preemption_enable(void);
+#include <arch.h>
+#include <compiler/barrier.h>
+#include <debug.h>
+
+#define PREEMPTION_INC         (1 << 0)
+#define PREEMPTION_DISABLED    (PREEMPTION_INC <= THE->preemption)
+#define PREEMPTION_ENABLED     (!PREEMPTION_DISABLED)
+
+/** Increment preemption disabled counter. */
+#define preemption_disable() \
+	do { \
+		THE->preemption += PREEMPTION_INC; \
+		compiler_barrier(); \
+	} while (0)
+
+/** Restores preemption but never reschedules. */
+#define preemption_enable() \
+	do { \
+		ASSERT(PREEMPTION_DISABLED); \
+		compiler_barrier(); \
+		THE->preemption -= PREEMPTION_INC; \
+	} while (0)
+
 
 #endif
Index: kernel/generic/include/proc/task.h
===================================================================
--- kernel/generic/include/proc/task.h	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/generic/include/proc/task.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -43,6 +43,8 @@
 #include <synch/mutex.h>
 #include <synch/futex.h>
+#include <synch/workqueue.h>
 #include <adt/avl.h>
 #include <adt/btree.h>
+#include <adt/cht.h>
 #include <adt/list.h>
 #include <security/cap.h>
@@ -57,4 +59,8 @@
 #include <mm/as.h>
 #include <abi/sysinfo.h>
+#include <arch.h>
+
+#define TASK                 THE->task
+
 
 struct thread;
@@ -123,11 +129,13 @@
 	task_arch_t arch;
 	
-	/**
-	 * Serializes access to the B+tree of task's futexes. This mutex is
-	 * independent on the task spinlock.
-	 */
-	mutex_t futexes_lock;
-	/** B+tree of futexes referenced by this task. */
-	btree_t futexes;
+	struct futex_cache {
+		/** CHT mapping virtual addresses of futex variables to futex objects.*/
+		cht_t ht;
+		/** Serializes access to list. */
+		SPINLOCK_DECLARE(list_lock);
+		/** List of all futexes accessed by this task. */
+		list_t list;
+		work_t destroy_work;
+	} *futexes;
 	
 	/** Accumulated accounting. */
Index: kernel/generic/include/proc/thread.h
===================================================================
--- kernel/generic/include/proc/thread.h	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/generic/include/proc/thread.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -41,4 +41,5 @@
 #include <cpu.h>
 #include <synch/spinlock.h>
+#include <synch/rcu_types.h>
 #include <adt/avl.h>
 #include <mm/slab.h>
@@ -48,4 +49,8 @@
 #include <udebug/udebug.h>
 #include <abi/sysinfo.h>
+#include <arch.h>
+
+
+#define THREAD              THE->thread
 
 #define THREAD_NAME_BUFLEN  20
@@ -180,4 +185,16 @@
 	/** Thread ID. */
 	thread_id_t tid;
+
+	/** Work queue this thread belongs to or NULL. Immutable. */
+	struct work_queue *workq;
+	/** Links work queue threads. Protected by workq->lock. */
+	link_t workq_link; 
+	/** True if the worker was blocked and is not running. Use thread->lock. */
+	bool workq_blocked;
+	/** True if the worker will block in order to become idle. Use workq->lock. */
+	bool workq_idling;
+	
+	/** RCU thread related data. Protected by its own locks. */
+	rcu_thread_data_t rcu;
 	
 	/** Architecture-specific data. */
@@ -217,4 +234,6 @@
 extern void thread_ready(thread_t *);
 extern void thread_exit(void) __attribute__((noreturn));
+extern void thread_interrupt(thread_t *);
+extern bool thread_interrupted(thread_t *);
 
 #ifndef thread_create_arch
Index: kernel/generic/include/smp/smp_call.h
===================================================================
--- kernel/generic/include/smp/smp_call.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
+++ kernel/generic/include/smp/smp_call.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup generic
+ * @{
+ */
+/** @file
+ */
+
+#ifndef KERN_SMP_CALL_H_
+#define	KERN_SMP_CALL_H_
+
+#include <adt/list.h>
+#include <synch/spinlock.h>
+#include <atomic.h>
+
+typedef void (*smp_call_func_t)(void *);
+
+typedef struct smp_call {
+	smp_call_func_t func;
+	void *arg;
+	link_t calls_link;
+	atomic_t pending;
+} smp_call_t;
+
+
+
+extern void smp_call(unsigned int, smp_call_func_t, void *);
+extern void smp_call_async(unsigned int, smp_call_func_t, void *, smp_call_t *);
+extern void smp_call_wait(smp_call_t *);
+
+extern void smp_call_init(void);
+
+#ifdef CONFIG_SMP
+extern void smp_call_ipi_recv(void);
+extern void arch_smp_call_ipi(unsigned int);
+#endif
+
+
+
+
+#endif	/* KERN_SMP_CALL_H_ */
+
+/** @}
+ */
+
Index: kernel/generic/include/synch/condvar.h
===================================================================
--- kernel/generic/include/synch/condvar.h	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/generic/include/synch/condvar.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -39,4 +39,5 @@
 #include <synch/waitq.h>
 #include <synch/mutex.h>
+#include <synch/spinlock.h>
 #include <abi/synch.h>
 
@@ -50,4 +51,12 @@
 	_condvar_wait_timeout((cv), (mtx), (usec), SYNCH_FLAGS_NONE)
 
+#ifdef CONFIG_SMP
+#define _condvar_wait_timeout_spinlock(cv, lock, usec, flags) \
+	_condvar_wait_timeout_spinlock_impl((cv), (lock), (usec), (flags))
+#else
+#define _condvar_wait_timeout_spinlock(cv, lock, usec, flags) \
+	_condvar_wait_timeout_spinlock_impl((cv), NULL, (usec), (flags))
+#endif
+
 extern void condvar_initialize(condvar_t *cv);
 extern void condvar_signal(condvar_t *cv);
@@ -55,4 +64,9 @@
 extern int _condvar_wait_timeout(condvar_t *cv, mutex_t *mtx, uint32_t usec,
     int flags);
+extern int _condvar_wait_timeout_spinlock_impl(condvar_t *cv, spinlock_t *lock, 
+	uint32_t usec, int flags);
+extern int _condvar_wait_timeout_irq_spinlock(condvar_t *cv, 
+	irq_spinlock_t *irq_lock, uint32_t usec, int flags);
+
 
 #endif
Index: kernel/generic/include/synch/futex.h
===================================================================
--- kernel/generic/include/synch/futex.h	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/generic/include/synch/futex.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -55,5 +55,7 @@
 extern sysarg_t sys_futex_wakeup(uintptr_t);
 
-extern void futex_cleanup(void);
+extern void futex_task_cleanup(void);
+extern void futex_task_init(struct task *);
+extern void futex_task_deinit(struct task *);
 
 #endif
Index: kernel/generic/include/synch/rcu.h
===================================================================
--- kernel/generic/include/synch/rcu.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
+++ kernel/generic/include/synch/rcu.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -0,0 +1,247 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup sync
+ * @{
+ */
+/** @file
+ */
+
+#ifndef KERN_RCU_H_
+#define KERN_RCU_H_
+
+#include <synch/rcu_types.h>
+#include <compiler/barrier.h>
+
+
+/** Use to assign a pointer to newly initialized data to a rcu reader 
+ * accessible pointer.
+ * 
+ * Example:
+ * @code
+ * typedef struct exam {
+ *     struct exam *next;
+ *     int grade;
+ * } exam_t;
+ * 
+ * exam_t *exam_list;
+ * // ..
+ * 
+ * // Insert at the beginning of the list.
+ * exam_t *my_exam = malloc(sizeof(exam_t), 0);
+ * my_exam->grade = 5;
+ * my_exam->next = exam_list;
+ * rcu_assign(exam_list, my_exam);
+ * 
+ * // Changes properly propagate. Every reader either sees
+ * // the old version of exam_list or the new version with
+ * // the fully initialized my_exam.
+ * rcu_synchronize();
+ * // Now we can be sure every reader sees my_exam.
+ * 
+ * @endcode
+ */
+#define rcu_assign(ptr, value) \
+	do { \
+		memory_barrier(); \
+		(ptr) = (value); \
+	} while (0)
+
+/** Use to access RCU protected data in a reader section.
+ * 
+ * Example:
+ * @code
+ * exam_t *exam_list;
+ * // ...
+ * 
+ * rcu_read_lock();
+ * exam_t *first_exam = rcu_access(exam_list);
+ * // We can now safely use first_exam, it won't change 
+ * // under us while we're using it.
+ *
+ * // ..
+ * rcu_read_unlock();
+ * @endcode
+ */
+#define rcu_access(ptr) ACCESS_ONCE(ptr)
+
+
+
+
+#include <debug.h>
+#include <preemption.h>
+#include <cpu.h>
+#include <proc/thread.h>
+
+
+extern bool rcu_read_locked(void);
+extern void rcu_synchronize(void);
+extern void rcu_synchronize_expedite(void);
+extern void rcu_call(rcu_item_t *rcu_item, rcu_func_t func);
+extern void rcu_barrier(void);
+
+extern void rcu_print_stat(void);
+
+extern void rcu_init(void);
+extern void rcu_stop(void);
+extern void rcu_cpu_init(void);
+extern void rcu_kinit_init(void);
+extern void rcu_thread_init(struct thread*);
+extern void rcu_thread_exiting(void);
+extern void rcu_after_thread_ran(void);
+extern void rcu_before_thread_runs(void);
+
+extern uint64_t rcu_completed_gps(void);
+extern void _rcu_call(bool expedite, rcu_item_t *rcu_item, rcu_func_t func);
+extern void _rcu_synchronize(bool expedite);
+
+
+#ifdef RCU_PREEMPT_A
+
+#define RCU_CNT_INC       (1 << 1)
+#define RCU_WAS_PREEMPTED (1 << 0)
+
+/* Fwd. decl. because of inlining. */
+void _rcu_preempted_unlock(void);
+
+/** Delimits the start of an RCU reader critical section. 
+ * 
+ * Reader sections may be nested and are preemptible. You must not
+ * however block/sleep within reader sections.
+ */
+static inline void rcu_read_lock(void)
+{
+	THE->rcu_nesting += RCU_CNT_INC;
+	compiler_barrier();
+}
+
+/** Delimits the end of an RCU reader critical section. */
+static inline void rcu_read_unlock(void)
+{
+	compiler_barrier();
+	THE->rcu_nesting -= RCU_CNT_INC;
+	
+	if (RCU_WAS_PREEMPTED == THE->rcu_nesting) {
+		_rcu_preempted_unlock();
+	}
+}
+
+#elif defined(RCU_PREEMPT_PODZIMEK)
+
+/* Fwd decl. required by the inlined implementation. Not part of public API. */
+extern rcu_gp_t _rcu_cur_gp;
+extern void _rcu_signal_read_unlock(void);
+
+
+/** Unconditionally records a quiescent state for the local cpu. */
+static inline void _rcu_record_qs(void)
+{
+	ASSERT(PREEMPTION_DISABLED || interrupts_disabled());
+	
+	/* 
+	 * A new GP was started since the last time we passed a QS. 
+	 * Notify the detector we have reached a new QS.
+	 */
+	if (CPU->rcu.last_seen_gp != _rcu_cur_gp) {
+		rcu_gp_t cur_gp = ACCESS_ONCE(_rcu_cur_gp);
+		/* 
+		 * Contain memory accesses within a reader critical section. 
+		 * If we are in rcu_read_lock() it also makes changes prior to the
+		 * start of the GP visible in the reader section.
+		 */
+		memory_barrier();
+		/*
+		 * Acknowledge we passed a QS since the beginning of rcu.cur_gp.
+		 * Cache coherency will lazily transport the value to the
+		 * detector while it sleeps in gp_sleep(). 
+		 * 
+		 * Note that there is a theoretical possibility that we
+		 * overwrite a more recent/greater last_seen_gp here with 
+		 * an older/smaller value. If this cpu is interrupted here
+		 * while in rcu_read_lock(), reader sections in the interrupt handler 
+		 * will update last_seen_gp to the same value as is currently 
+		 * in local cur_gp. However, if the cpu continues processing 
+		 * interrupts and the detector starts a new GP immediately, 
+		 * local interrupt handlers may update last_seen_gp again (ie 
+		 * properly ack the new GP) with a value greater than local cur_gp. 
+		 * Resetting last_seen_gp to a previous value here is however 
+		 * benign and we only have to remember that this reader may end up 
+		 * in cur_preempted even after the GP ends. That is why we
+		 * append next_preempted to cur_preempted rather than overwriting 
+		 * it as if cur_preempted were empty.
+		 */
+		CPU->rcu.last_seen_gp = cur_gp;
+	}
+}
+
+/** Delimits the start of an RCU reader critical section. 
+ * 
+ * Reader sections may be nested and are preemptible. You must not
+ * however block/sleep within reader sections.
+ */
+static inline void rcu_read_lock(void)
+{
+	ASSERT(CPU);
+	preemption_disable();
+
+	/* Record a QS if not in a reader critical section. */
+	if (0 == CPU->rcu.nesting_cnt)
+		_rcu_record_qs();
+
+	++CPU->rcu.nesting_cnt;
+
+	preemption_enable();
+}
+
+/** Delimits the end of an RCU reader critical section. */
+static inline void rcu_read_unlock(void)
+{
+	ASSERT(CPU);
+	preemption_disable();
+	
+	if (0 == --CPU->rcu.nesting_cnt) {
+		_rcu_record_qs();
+		
+		/* 
+		 * The thread was preempted while in a critical section or 
+		 * the detector is eagerly waiting for this cpu's reader to finish. 
+		 */
+		if (CPU->rcu.signal_unlock) {
+			/* Rechecks with disabled interrupts. */
+			_rcu_signal_read_unlock();
+		}
+	}
+	
+	preemption_enable();
+}
+#endif
+
+#endif
+
+/** @}
+ */
Index: kernel/generic/include/synch/rcu_types.h
===================================================================
--- kernel/generic/include/synch/rcu_types.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
+++ kernel/generic/include/synch/rcu_types.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup sync
+ * @{
+ */
+/** @file
+ */
+
+#ifndef KERN_RCU_TYPES_H_
+#define KERN_RCU_TYPES_H_
+
+#include <adt/list.h>
+#include <synch/semaphore.h>
+
+#if !defined(RCU_PREEMPT_PODZIMEK) && !defined(RCU_PREEMPT_A)
+#error You must select an RCU algorithm.
+#endif
+
+
+/* Fwd decl. */
+struct thread;
+struct rcu_item;
+
+/** Grace period number typedef. */
+typedef uint64_t rcu_gp_t;
+
+/** RCU callback type. The passed rcu_item_t may be freed. */
+typedef void (*rcu_func_t)(struct rcu_item *rcu_item);
+
+typedef struct rcu_item {
+	rcu_func_t func;
+	struct rcu_item *next;
+} rcu_item_t;
+
+
+/** RCU related per-cpu data. */
+typedef struct rcu_cpu_data {
+	/** The cpu recorded a quiescent state last time during this grace period.*/
+	rcu_gp_t last_seen_gp;
+
+#ifdef RCU_PREEMPT_PODZIMEK
+	/** This cpu has not yet passed a quiescent state and it is delaying the
+	 * detector. Once it reaches a QS it must sema_up(rcu.remaining_readers).
+	 */
+	bool is_delaying_gp;
+	
+	/** True if we should signal the detector that we exited a reader section.
+	 * 
+	 * Equal to (THREAD->rcu.was_preempted || CPU->rcu.is_delaying_gp).
+	 */
+	bool signal_unlock;
+
+	/** The number of times an RCU reader section is nested on this cpu. 
+	 * 
+	 * If positive, it is definitely executing reader code. If zero, 
+	 * the thread might already be executing reader code thanks to
+	 * cpu instruction reordering.
+	 */
+	size_t nesting_cnt;
+#endif
+	
+	/** Callbacks to invoke once the current grace period ends, ie cur_cbs_gp.
+	 * Accessed by the local reclaimer only.
+	 */
+	rcu_item_t *cur_cbs;
+	/** Number of callbacks in cur_cbs. */
+	size_t cur_cbs_cnt;
+	/** Callbacks to invoke once the next grace period ends, ie next_cbs_gp. 
+	 * Accessed by the local reclaimer only.
+	 */
+	rcu_item_t *next_cbs;
+	/** Number of callbacks in next_cbs. */
+	size_t next_cbs_cnt;
+	/** New callbacks are placed at the end of this list. */
+	rcu_item_t *arriving_cbs;
+	/** Tail of arriving_cbs list. Disable interrupts to access. */
+	rcu_item_t **parriving_cbs_tail;
+	/** Number of callbacks currently in arriving_cbs. 
+	 * Disable interrupts to access.
+	 */
+	size_t arriving_cbs_cnt;
+
+	/** At the end of this grace period callbacks in cur_cbs will be invoked.*/
+	rcu_gp_t cur_cbs_gp;
+	/** At the end of this grace period callbacks in next_cbs will be invoked.
+	 * 
+	 * Should be the next grace period but it allows the reclaimer to 
+	 * notice if it missed a grace period end announcement. In that
+	 * case it can execute next_cbs without waiting for another GP.
+	 * 
+	 * Invariant: next_cbs_gp >= cur_cbs_gp
+	 */
+	rcu_gp_t next_cbs_gp;
+	
+	/** Positive if there are callbacks pending in arriving_cbs. */
+	semaphore_t arrived_flag;
+	
+	/** The reclaimer should expedite GPs for cbs in arriving_cbs. */
+	bool expedite_arriving;
+	
+	/** Protected by global rcu.barrier_mtx. */
+	rcu_item_t barrier_item;
+	
+	/** Interruptible attached reclaimer thread. */
+	struct thread *reclaimer_thr;
+	
+	/* Some statistics. */
+	size_t stat_max_cbs;
+	size_t stat_avg_cbs;
+	size_t stat_missed_gps;
+	size_t stat_missed_gp_in_wait;
+	size_t stat_max_slice_cbs;
+	size_t last_arriving_cnt;
+} rcu_cpu_data_t;
+
+
+/** RCU related per-thread data. */
+typedef struct rcu_thread_data {
+	/** 
+	 * Nesting count of the thread's RCU read sections when the thread 
+	 * is not running.
+	 */
+	size_t nesting_cnt;
+
+#ifdef RCU_PREEMPT_PODZIMEK
+	
+	/** True if the thread was preempted in a reader section. 
+	 *
+	 * The thread is placed into rcu.cur_preempted or rcu.next_preempted
+	 * and must remove itself in rcu_read_unlock(). 
+	 * 
+	 * Access with interrupts disabled.
+	 */
+	bool was_preempted;
+#endif
+	
+	/** Preempted threads link. Access with rcu.preempt_lock. */
+	link_t preempt_link;
+} rcu_thread_data_t;
+
+
+#endif
+
+/** @}
+ */
Index: kernel/generic/include/synch/semaphore.h
===================================================================
--- kernel/generic/include/synch/semaphore.h	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/generic/include/synch/semaphore.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -53,4 +53,8 @@
 	_semaphore_down_timeout((s), (usec), SYNCH_FLAGS_NONE)
 
+#define semaphore_down_interruptable(s) \
+	(ESYNCH_INTERRUPTED != _semaphore_down_timeout((s), SYNCH_NO_TIMEOUT, \
+		SYNCH_FLAGS_INTERRUPTIBLE))
+
 extern void semaphore_initialize(semaphore_t *, int);
 extern int _semaphore_down_timeout(semaphore_t *, uint32_t, unsigned int);
Index: kernel/generic/include/synch/smp_memory_barrier.h
===================================================================
--- kernel/generic/include/synch/smp_memory_barrier.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
+++ kernel/generic/include/synch/smp_memory_barrier.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup sync
+ * @{
+ */
+/** @file
+ */
+
+#ifndef KERN_SMP_MEM_BAR_H_
+#define KERN_SMP_MEM_BAR_H_
+
+#include <typedefs.h>
+
+extern sysarg_t sys_smp_memory_barrier(void);
+
+#endif
+
+/** @}
+ */
Index: kernel/generic/include/synch/spinlock.h
===================================================================
--- kernel/generic/include/synch/spinlock.h	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/generic/include/synch/spinlock.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -45,5 +45,5 @@
 #ifdef CONFIG_SMP
 
-typedef struct {
+typedef struct spinlock {
 	atomic_t val;
 	
@@ -163,4 +163,7 @@
 /* On UP systems, spinlocks are effectively left out. */
 
+/* Allow the use of spinlock_t as an incomplete type. */
+typedef struct spinlock spinlock_t;
+
 #define SPINLOCK_DECLARE(name)
 #define SPINLOCK_EXTERN(name)
@@ -177,5 +180,5 @@
 
 #define spinlock_lock(lock)     preemption_disable()
-#define spinlock_trylock(lock)  (preemption_disable(), 1)
+#define spinlock_trylock(lock)  ({ preemption_disable(); 1; })
 #define spinlock_unlock(lock)   preemption_enable()
 #define spinlock_locked(lock)	1
Index: kernel/generic/include/synch/workqueue.h
===================================================================
--- kernel/generic/include/synch/workqueue.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
+++ kernel/generic/include/synch/workqueue.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup generic
+ * @{
+ */
+/** @file
+ */
+
+#ifndef KERN_WORKQUEUE_H_
+#define KERN_WORKQUEUE_H_
+
+#include <adt/list.h>
+
+/* Fwd decl. */
+struct thread;
+struct work_item;
+struct work_queue;
+typedef struct work_queue work_queue_t;
+
+typedef void (*work_func_t)(struct work_item *);
+
+typedef struct work_item {
+	link_t queue_link;
+	work_func_t func;
+	
+#ifdef CONFIG_DEBUG
+	/* Magic number for integrity checks. */
+	uint32_t cookie;
+#endif 
+} work_t;
+
+
+
+extern void workq_global_init(void);
+extern void workq_global_worker_init(void);
+extern void workq_global_stop(void);
+extern int workq_global_enqueue_noblock(work_t *, work_func_t);
+extern int workq_global_enqueue(work_t *, work_func_t);
+
+extern struct work_queue * workq_create(const char *);
+extern void workq_destroy(struct work_queue *);
+extern int workq_init(struct work_queue *, const char *);
+extern void workq_stop(struct work_queue *);
+extern int workq_enqueue_noblock(struct work_queue *, work_t *, work_func_t);
+extern int workq_enqueue(struct work_queue *, work_t *, work_func_t);
+
+extern void workq_print_info(struct work_queue *);
+extern void workq_global_print_info(void);
+
+
+extern void workq_after_thread_ran(void);
+extern void workq_before_thread_is_ready(struct thread *);
+
+#endif /* KERN_WORKQUEUE_H_ */
+
+/** @}
+ */
Index: kernel/generic/src/adt/cht.c
===================================================================
--- kernel/generic/src/adt/cht.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
+++ kernel/generic/src/adt/cht.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -0,0 +1,2711 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+/** @addtogroup genericadt
+ * @{
+ */
+
+/**
+ * @file
+ * @brief Scalable resizable concurrent lock-free hash table.
+ * 
+ * CHT is a concurrent hash table that is scalable, resizable and lock-free.
+ * resizable = the number of buckets of the table increases or decreases
+ *     depending on the average number of elements per bucket (ie load)
+ * scalable = accessing the table from more cpus increases performance
+ *     almost linearly
+ * lock-free = common operations never block; even if any of the operations
+ *     is preempted or interrupted at any time, other operations will still
+ *     make forward progress
+ *
+ * CHT is designed for read mostly scenarios. Performance degrades as the
+ * fraction of updates (insert/remove) increases. Other data structures
+ * significantly outperform CHT if the fraction of updates exceeds ~40%.
+ * 
+ * CHT tolerates hardware exceptions and may be accessed from exception
+ * handlers as long as the underlying RCU implementation is exception safe.
+ * 
+ * @par Caveats
+ * 
+ * 0) Never assume an item is still in the table.
+ * The table may be accessed concurrently; therefore, other threads may
+ * insert or remove an item at any time. Do not assume an item is still
+ * in the table if cht_find() just returned it to you. Similarly, an
+ * item may have already been inserted by the time cht_find() returns NULL.
+ * 
+ * 1) Always use RCU read locks when searching the table.
+ * Holding an RCU lock guarantees that an item found in the table remains
+ * valid (eg is not freed) even if the item was removed from the table
+ * in the meantime by another thread.
+ * 
+ * 2) Never update values in place.
+ * Do not update items in the table in place, ie directly. The changes
+ * will not propagate to other readers (on other cpus) immediately or even
+ * correctly. Some readers may then encounter items that have only some
+ * of their fields changed or are completely inconsistent. 
+ * 
+ * Instead consider inserting an updated/changed copy of the item and 
+ * removing the original item. Or contact the maintainer to provide
+ * you with a function that atomically replaces an item with a copy.
+ * 
+ * 3) Use cht_insert_unique() instead of checking for duplicates with cht_find()
+ * The following code is prone to race conditions:
+ * @code
+ * if (NULL == cht_find(&h, key)) {
+ *     // If another thread inserts an item here, we'll insert a duplicate.
+ *     cht_insert(&h, item);
+ * }
+ * @endcode
+ * See cht_insert_unique() on how to correctly fix this.
+ * 
+ *
+ * @par Semantics
+ * 
+ * Lazy readers = cht_find_lazy(), cht_find_next_lazy()
+ * Readers = lazy readers, cht_find(), cht_find_next()
+ * Updates = cht_insert(), cht_insert_unique(), cht_remove_key(), 
+ *     cht_remove_item()
+ * 
+ * Readers (but not lazy readers) are guaranteed to see the effects 
+ * of @e completed updates. In other words, if cht_find() is invoked 
+ * after a cht_insert() @e returned eg on another cpu, cht_find() is 
+ * guaranteed to see the inserted item. 
+ * 
+ * Similarly, updates see the effects of @e completed updates. For example,
+ * issuing cht_remove_key() after a cht_insert() for that key returned (even 
+ * on another cpu) is guaranteed to remove the inserted item.
+ * 
+ * Reading or updating the table concurrently with other updates
+ * always returns consistent data and never corrupts the table.
+ * However the effects of concurrent updates may or may not be
+ * visible to all other concurrent readers or updaters. Eg, not
+ * all readers may see that an item has already been inserted 
+ * if cht_insert() has not yet returned. 
+ * 
+ * Lazy readers are guaranteed to eventually see updates but it
+ * may take some time (possibly milliseconds) after the update
+ * completes for the change to propagate to lazy readers on all
+ * cpus.
+ * 
+ * @par Implementation
+ * 
+ * Collisions in CHT are resolved with chaining. The number of buckets
+ * is always a power of 2. Each bucket is represented with a singly linked 
+ * lock-free list [1]. Items in buckets are sorted by their mixed hashes 
+ * in ascending order. All buckets are terminated with a single global 
+ * sentinel node whose mixed hash value is the greatest possible. 
+ *
+ * CHT with 2^k buckets uses the k most significant bits of a hash value
+ * to determine the bucket number where an item is to be stored. To
+ * avoid storing all items in a single bucket if the user supplied
+ * hash function does not produce uniform hashes, hash values are
+ * mixed first so that the top bits of a mixed hash change even if hash
+ * values differ only in the least significant bits. The mixed hash 
+ * values are cached in cht_link.hash (which is overwritten once the 
+ * item is scheduled for removal via rcu_call).
+ * 
+ * A new item is inserted before all other existing items in the bucket
+ * with the same hash value as the newly inserted item (a la the original
+ * lock-free list [2]). Placing new items at the start of a same-hash 
+ * sequence of items (eg duplicates) allows us to easily check for duplicates 
+ * in cht_insert_unique(). The function can first check that there are 
+ * no duplicates of the newly inserted item amongst the items with the 
+ * same hash as the new item. If there were no duplicates the new item 
+ * is linked before the same-hash items. Inserting a duplicate while 
+ * the function is checking for duplicates is detected as a change of 
+ * the link to the first checked same-hash item (and the search for 
+ * duplicates can be restarted).
+ * 
+ * @par Table resize algorithm
+ * 
+ * Table resize is based on [3] and [5]. First, a new bucket head array
+ * is allocated and initialized. Second, old bucket heads are moved
+ * to the new bucket head array with the protocol mentioned in [5]. 
+ * At this point updaters start using the new bucket heads. Third,
+ * buckets are split (or joined) so that the table can make use of
+ * the extra bucket head slots in the new array (or stop wasting space
+ * with the unnecessary extra slots in the old array). Splitting
+ * or joining buckets employs a custom protocol. Last, the new array 
+ * replaces the original bucket array.
+ * 
+ * A single background work item (of the system work queue) guides
+ * resizing of the table. If an updater detects that the bucket it
+ * is about to access is undergoing a resize (ie its head is moving
+ * or it needs to be split/joined), it helps out and completes the
+ * head move or the bucket split/join.
+ * 
+ * The table always grows or shrinks by a factor of 2. Because items 
+ * are assigned a bucket based on the top k bits of their mixed hash 
+ * values, when growing the table each bucket is split into two buckets 
+ * and all items of the two new buckets come from the single bucket in the 
+ * original table. Ie items from separate buckets in the original table
+ * never intermix in the new buckets. Moreover 
+ * since the buckets are sorted by their mixed hash values the items 
+ * at the beginning of the old bucket will end up in the first new 
+ * bucket while all the remaining items of the old bucket will end up
+ * in the second new bucket. Therefore, there is a single point where 
+ * to split the linked list of the old bucket into two correctly sorted 
+ * linked lists of the new buckets:
+ *                            .- bucket split
+ *                            | 
+ *             <-- first -->  v  <-- second --> 
+ *   [old] --> [00b] -> [01b] -> [10b] -> [11b] -> sentinel
+ *              ^                 ^    
+ *   [new0] -- -+                 |  
+ *   [new1] -- -- -- -- -- -- -- -+
+ * 
+ * Resize in greater detail:
+ * 
+ * a) First, a resizer (a single background system work queue item 
+ * in charge of resizing the table) allocates and initializes a new 
+ * bucket head array. New bucket heads are pointed to the sentinel 
+ * and marked Invalid (in the lower order bits of the pointer to the 
+ * next item, ie the sentinel in this case):
+ * 
+ *   [old, N] --> [00b] -> [01b] -> [10b] -> [11b] -> sentinel
+ *                                                    ^ ^
+ *   [new0, Inv] -------------------------------------+ |
+ *   [new1, Inv] ---------------------------------------+
+ * 
+ * 
+ * b) Second, the resizer starts moving old bucket heads with the following 
+ * lock-free protocol (from [5]) where cas(variable, expected_val, new_val) 
+ * is short for compare-and-swap:
+ * 
+ *   old head     new0 head      transition to next state
+ *   --------     ---------      ------------------------
+ *   addr, N      sentinel, Inv  cas(old, (addr, N), (addr, Const))
+ *                               .. mark the old head as immutable, so that 
+ *                                  updaters do not relink it to other nodes 
+ *                                  until the head move is done.
+ *   addr, Const  sentinel, Inv  cas(new0, (sentinel, Inv), (addr, N))
+ *                               .. move the address to the new head and mark 
+ *                                  the new head normal so updaters can start
+ *                                  using it.
+ *   addr, Const  addr, N        cas(old, (addr, Const), (addr, Inv))
+ *                               .. mark the old head Invalid to signify
+ *                                  the head move is done.
+ *   addr, Inv    addr, N
+ * 
+ * Notice that concurrent updaters may step in at any point and correctly
+ * complete the head move without disrupting the resizer. At worst, the
+ * resizer or other concurrent updaters will attempt a number of CAS() that 
+ * will correctly fail.
+ * 
+ *   [old, Inv] -> [00b] -> [01b] -> [10b] -> [11b] -> sentinel
+ *                 ^                                   ^
+ *   [new0, N] ----+                                   |
+ *   [new1, Inv] --------------------------------------+
+ * 
+ *  
+ * c) Third, buckets are split if the table is growing; or joined if 
+ * shrinking (by the resizer or updaters depending on whoever accesses 
+ * the bucket first). See split_bucket() and join_buckets() for details.
+ * 
+ *  1) Mark the last item of new0 with JOIN_FOLLOWS:
+ *   [old, Inv] -> [00b] -> [01b, JF] -> [10b] -> [11b] -> sentinel
+ *                 ^                                       ^
+ *   [new0, N] ----+                                       |
+ *   [new1, Inv] ------------------------------------------+
+ * 
+ *  2) Mark the first item of new1 with JOIN_NODE:
+ *   [old, Inv] -> [00b] -> [01b, JF] -> [10b, JN] -> [11b] -> sentinel
+ *                 ^                                           ^
+ *   [new0, N] ----+                                           |
+ *   [new1, Inv] ----------------------------------------------+
+ * 
+ *  3) Point new1 to the join-node and mark new1 NORMAL.
+ *   [old, Inv] -> [00b] -> [01b, JF] -> [10b, JN] -> [11b] -> sentinel
+ *                 ^                     ^
+ *   [new0, N] ----+                     |
+ *   [new1, N] --------------------------+
+ * 
+ * 
+ * d) Fourth, the resizer cleans up extra marks added during bucket 
+ * splits/joins but only when it is sure all updaters are accessing
+ * the table via the new bucket heads only (ie it is certain there
+ * are no delayed updaters unaware of the resize and accessing the 
+ * table via the old bucket head).
+ * 
+ *   [old, Inv] ---+
+ *                 v
+ *   [new0, N] --> [00b] -> [01b, N] ---+
+ *                                      v
+ *   [new1, N] --> [10b, N] -> [11b] -> sentinel
+ * 
+ * 
+ * e) Last, the resizer publishes the new bucket head array for everyone
+ * to see and use. This signals the end of the resize and the old bucket
+ * array is freed. 
+ * 
+ * 
+ * To understand details of how the table is resized, read [1, 3, 5]
+ * and comments in join_buckets(), split_bucket().
+ *  
+ * 
+ * [1] High performance dynamic lock-free hash tables and list-based sets, 
+ *     Michael, 2002
+ *     http://www.research.ibm.com/people/m/michael/spaa-2002.pdf
+ * [2] Lock-free linked lists using compare-and-swap,
+ *     Valois, 1995
+ *     http://people.csail.mit.edu/bushl2/rpi/portfolio/lockfree-grape/documents/lock-free-linked-lists.pdf
+ * [3] Resizable, scalable, concurrent hash tables via relativistic programming,
+ *     Triplett, 2011
+ *     http://www.usenix.org/event/atc11/tech/final_files/Triplett.pdf
+ * [4] Split-ordered Lists: Lock-free Extensible Hash Tables,
+ *     Shavit, 2006
+ *     http://www.cs.ucf.edu/~dcm/Teaching/COT4810-Spring2011/Literature/SplitOrderedLists.pdf
+ * [5] Towards a Scalable Non-blocking Coding Style,
+ *     Click, 2008
+ *     http://www.azulsystems.com/events/javaone_2008/2008_CodingNonBlock.pdf
+ */
+
+
+#include <adt/cht.h>
+#include <adt/hash.h>
+#include <debug.h>
+#include <memstr.h>
+#include <mm/slab.h>
+#include <arch/barrier.h>
+#include <compiler/barrier.h>
+#include <atomic.h>
+#include <synch/rcu.h>
+
+#ifdef CONFIG_DEBUG
+/* Do not enclose in parentheses. */
+#define DBG(x) x
+#else
+#define DBG(x)
+#endif
+
+/* Logarithm of the min bucket count. Must be at least 3. 2^6 == 64 buckets. */
+#define CHT_MIN_ORDER 6
+/* Logarithm of the max bucket count. */
+#define CHT_MAX_ORDER (8 * sizeof(size_t))
+/* Minimum number of hash table buckets. */
+#define CHT_MIN_BUCKET_CNT (1 << CHT_MIN_ORDER)
+/* Does not have to be a power of 2. */
+#define CHT_MAX_LOAD 2 
+
+typedef cht_ptr_t marked_ptr_t;
+typedef bool (*equal_pred_t)(void *arg, const cht_link_t *item);
+
+/** The following marks tag items and bucket heads. 
+ * 
+ * They are stored in the two low order bits of the next item pointers.
+ * Some marks may be combined. Some marks share the same binary value and
+ * are distinguished only by context (eg bucket head vs an ordinary item),
+ * in particular by walk_mode_t.
+ */
+typedef enum mark {
+	/** Normal non-deleted item or a valid bucket head. */
+	N_NORMAL = 0,
+	/** Logically deleted item that might have already been unlinked.
+	 * 
+	 * May be combined with N_JOIN and N_JOIN_FOLLOWS. Applicable only 
+	 * to items; never to bucket heads. 
+	 * 
+	 * Once marked deleted an item remains marked deleted.
+	 */
+	N_DELETED = 1,
+	/** Immutable bucket head. 
+	 * 
+	 * The bucket is being moved or joined with another and its (old) head 
+	 * must not be modified.
+	 * 
+	 * May be combined with N_INVALID. Applicable only to old bucket heads,
+	 * ie cht_t.b and not cht_t.new_b. Has the same value as N_DELETED.
+	 */
+	N_CONST = 1,
+	/** Invalid bucket head. The bucket head must not be modified. 
+	 * 
+	 * Old bucket heads (ie cht_t.b) are marked invalid if they have
+	 * already been moved to cht_t.new_b or if the bucket had already
+	 * been merged with another when shrinking the table. New bucket
+	 * heads (ie cht_t.new_b) are marked invalid if the old bucket had
+	 * not yet been moved or if an old bucket had not yet been split
+	 * when growing the table.
+	 */
+	N_INVALID = 3,
+	/** The item is a join node, ie a node joining two buckets.
+	 * 
+	 * A join node is either the first node of the second part of
+	 * a bucket to be split; or it is the first node of the bucket
+	 * to be merged into/appended to/joined with another bucket.
+	 * 
+	 * May be combined with N_DELETED. Applicable only to items, never 
+	 * to bucket heads.
+	 * 
+	 * Join nodes are referred to from two different buckets and may,
+	 * therefore, not be safely/atomically unlinked from both buckets.
+	 * As a result join nodes are not unlinked but rather just marked
+	 * deleted. Once resize completes join nodes marked deleted are
+	 * garbage collected. Has the same value as N_JOIN_FOLLOWS.
+	 */
+	N_JOIN = 2,
+	/** The next node is a join node and will soon be marked so. 
+	 * 
+	 * A join-follows node is the last node of the first part of bucket
+	 * that is to be split, ie it is the last node that will remain
+	 * in the same bucket after splitting it.
+	 * 
+	 * May be combined with N_DELETED. Applicable to items as well
+	 * as to bucket heads of the bucket to be split (but only in cht_t.new_b). 
+	 */
+	N_JOIN_FOLLOWS = 2,
+	/** Bit mask covering the mark bits, ie the two low order bits of a link. */
+	N_MARK_MASK = 3
+} mark_t;
+
+/** Determines which marks may be encountered when walking a bucket. */
+typedef enum walk_mode {
+	/** The table is not resizing. */
+	WM_NORMAL = 4,
+	/** The table is undergoing a resize. Join nodes may be encountered. */
+	WM_LEAVE_JOIN,
+	/** The table is growing. A join-follows node may be encountered. */
+	WM_MOVE_JOIN_FOLLOWS
+} walk_mode_t;
+
+/** Bucket position window. */
+typedef struct wnd {
+	/** Pointer to cur's predecessor. */
+	marked_ptr_t *ppred;
+	/** Current item. */
+	cht_link_t *cur;
+	/** Last encountered item. Deleted or not. */
+	cht_link_t *last;
+} wnd_t;
+
+
+/* Sentinel node used by all buckets. Stores the greatest possible hash value.*/
+static const cht_link_t sentinel = {
+	/* NULL and N_NORMAL */
+	.link = 0 | N_NORMAL,
+	.hash = -1
+};
+
+
+static size_t size_to_order(size_t bucket_cnt, size_t min_order);
+static cht_buckets_t *alloc_buckets(size_t order, bool set_invalid, 
+	bool can_block);
+static inline cht_link_t *find_lazy(cht_t *h, void *key);
+static cht_link_t *search_bucket(cht_t *h, marked_ptr_t head, void *key, 
+	size_t search_hash);
+static cht_link_t *find_resizing(cht_t *h, void *key, size_t hash, 
+	marked_ptr_t old_head, size_t old_idx);
+static bool insert_impl(cht_t *h, cht_link_t *item, cht_link_t **dup_item);
+static bool insert_at(cht_link_t *item, const wnd_t *wnd, walk_mode_t walk_mode,
+	bool *resizing);
+static bool has_duplicate(cht_t *h, const cht_link_t *item, size_t hash, 
+	cht_link_t *cur, cht_link_t **dup_item);
+static cht_link_t *find_duplicate(cht_t *h, const cht_link_t *item, size_t hash, 
+	cht_link_t *start);
+static bool remove_pred(cht_t *h, size_t hash, equal_pred_t pred, void *pred_arg);
+static bool delete_at(cht_t *h, wnd_t *wnd, walk_mode_t walk_mode, 
+	bool *deleted_but_gc, bool *resizing);
+static bool mark_deleted(cht_link_t *cur, walk_mode_t walk_mode, bool *resizing);
+static bool unlink_from_pred(wnd_t *wnd, walk_mode_t walk_mode, bool *resizing);
+static bool find_wnd_and_gc_pred(cht_t *h, size_t hash, walk_mode_t walk_mode, 
+	equal_pred_t pred, void *pred_arg, wnd_t *wnd, bool *resizing);
+static bool find_wnd_and_gc(cht_t *h, size_t hash, walk_mode_t walk_mode, 
+	wnd_t *wnd, bool *resizing);
+static bool gc_deleted_node(cht_t *h, walk_mode_t walk_mode, wnd_t *wnd,
+	bool *resizing);
+static bool join_completed(cht_t *h, const wnd_t *wnd);
+static void upd_resizing_head(cht_t *h, size_t hash, marked_ptr_t **phead, 
+	bool *join_finishing,  walk_mode_t *walk_mode);
+static void item_removed(cht_t *h);
+static void item_inserted(cht_t *h);
+static void free_later(cht_t *h, cht_link_t *item);
+static void help_head_move(marked_ptr_t *psrc_head, marked_ptr_t *pdest_head);
+static void start_head_move(marked_ptr_t *psrc_head);
+static void mark_const(marked_ptr_t *psrc_head);
+static void complete_head_move(marked_ptr_t *psrc_head, marked_ptr_t *pdest_head);
+static void split_bucket(cht_t *h, marked_ptr_t *psrc_head, 
+	marked_ptr_t *pdest_head, size_t split_hash);
+static void mark_join_follows(cht_t *h, marked_ptr_t *psrc_head, 
+	size_t split_hash, wnd_t *wnd);
+static void mark_join_node(cht_link_t *join_node);
+static void join_buckets(cht_t *h, marked_ptr_t *psrc_head, 
+	marked_ptr_t *pdest_head, size_t split_hash);
+static void link_to_join_node(cht_t *h, marked_ptr_t *pdest_head, 
+	cht_link_t *join_node, size_t split_hash);
+static void resize_table(work_t *arg);
+static void grow_table(cht_t *h);
+static void shrink_table(cht_t *h);
+static void cleanup_join_node(cht_t *h, marked_ptr_t *new_head);
+static void clear_join_and_gc(cht_t *h, cht_link_t *join_node, 
+	marked_ptr_t *new_head);
+static void cleanup_join_follows(cht_t *h, marked_ptr_t *new_head);
+static marked_ptr_t make_link(const cht_link_t *next, mark_t mark);
+static cht_link_t * get_next(marked_ptr_t link);
+static mark_t get_mark(marked_ptr_t link);
+static void next_wnd(wnd_t *wnd);
+static bool same_node_pred(void *node, const cht_link_t *item2);
+static size_t calc_key_hash(cht_t *h, void *key);
+static size_t node_hash(cht_t *h, const cht_link_t *item);
+static size_t calc_node_hash(cht_t *h, const cht_link_t *item);
+static void memoize_node_hash(cht_t *h, cht_link_t *item);
+static size_t calc_split_hash(size_t split_idx, size_t order);
+static size_t calc_bucket_idx(size_t hash, size_t order);
+static size_t grow_to_split_idx(size_t old_idx);
+static size_t grow_idx(size_t idx);
+static size_t shrink_idx(size_t idx);
+static marked_ptr_t cas_link(marked_ptr_t *link, const cht_link_t *cur_next, 
+	mark_t cur_mark, const cht_link_t *new_next, mark_t new_mark);
+static marked_ptr_t _cas_link(marked_ptr_t *link, marked_ptr_t cur, 
+	marked_ptr_t new);
+static void cas_order_barrier(void);
+
+static void dummy_remove_callback(cht_link_t *item)
+{
+	/* Intentionally empty; cht_create() installs it if no callback was given. */
+}
+
+/** Creates a concurrent hash table with default sizes and load; never blocks.
+ * 
+ * @param h         Valid pointer to a cht_t instance.
+ * @param op        Item specific operations, forwarded to cht_create().
+ * @return True if successfully created the table. False otherwise.
+ */
+bool cht_create_simple(cht_t *h, cht_ops_t *op)
+{
+	return cht_create(h, 0, 0, 0, false, op); 
+}
+
+/** Creates a concurrent hash table.
+ * 
+ * @param h         Valid pointer to a cht_t instance.
+ * @param init_size The initial number of buckets the table should contain.
+ *                  The table may be shrunk below this value if deemed necessary.
+ *                  Uses the default value if 0.
+ * @param min_size  Minimum number of buckets that the table should contain.
+ *                  The number of buckets never drops below this value,
+ *                  although it may be rounded up internally as appropriate.
+ *                  Uses the default value if 0.
+ * @param max_load  Maximum average number of items per bucket that is allowed
+ *                  before the table grows. Uses the default value if 0.
+ * @param can_block If true creating the table blocks until enough memory
+ *                  is available (possibly indefinitely). Otherwise, 
+ *                  table creation does not block and returns immediately
+ *                  even if not enough memory is available. 
+ * @param op        Item specific operations; remove_callback is optional.
+ * @return True if successfully created the table. False otherwise.
+ */
+bool cht_create(cht_t *h, size_t init_size, size_t min_size, size_t max_load, 
+	bool can_block, cht_ops_t *op)
+{
+	ASSERT(h);
+	ASSERT(op && op->hash && op->key_hash && op->equal && op->key_equal);
+	/* Memoized hashes are stored in the rcu_link.func function pointer. */
+	ASSERT(sizeof(size_t) == sizeof(rcu_func_t));
+	ASSERT(sentinel.hash == (uintptr_t)sentinel.rcu_link.func);
+
+	/* hash, key_hash, equal and key_equal are compulsory. */
+	if (!op || !op->hash || !op->key_hash || !op->equal || !op->key_equal)
+		return false;
+	
+	size_t min_order = size_to_order(min_size, CHT_MIN_ORDER);
+	size_t order = size_to_order(init_size, min_order);
+	
+	h->b = alloc_buckets(order, false, can_block);
+	
+	if (!h->b)
+		return false;
+	
+	h->max_load = (max_load == 0) ? CHT_MAX_LOAD : max_load;
+	h->min_order = min_order;
+	h->new_b = NULL;
+	h->op = op;
+	atomic_set(&h->item_cnt, 0);
+	atomic_set(&h->resize_reqs, 0);
+	/* Note: a missing remove_callback is filled in in the caller's op. */
+	if (NULL == op->remove_callback) {
+		h->op->remove_callback = dummy_remove_callback;
+	}
+	
+	/* 
+	 * Cached item hashes are stored in item->rcu_link.func. Once the item
+	 * is deleted rcu_link.func will contain the value of invalid_hash.
+	 */
+	h->invalid_hash = (uintptr_t)h->op->remove_callback;
+	
+	/* Ensure the initialization takes place before we start using the table. */
+	write_barrier();
+	
+	return true;
+}
+
+/** Allocates and initializes 2^order buckets.
+ * 
+ * All bucket heads are initialized to point to the sentinel node.
+ * 
+ * @param order       The number of buckets to allocate is 2^order.
+ * @param set_invalid Bucket heads are marked invalid if true; otherwise
+ *                    they are marked N_NORMAL.
+ * @param can_block   If true memory allocation blocks until enough memory
+ *                    is available (possibly indefinitely). Otherwise, 
+ *                    memory allocation does not block. 
+ * @return Newly allocated and initialized buckets or NULL if not enough memory.
+ */
+static cht_buckets_t *alloc_buckets(size_t order, bool set_invalid, bool can_block)
+{
+	size_t bucket_cnt = ((size_t)1 << order);	/* Shift in size_t, not int. */
+	size_t bytes = 
+		sizeof(cht_buckets_t) + (bucket_cnt - 1) * sizeof(marked_ptr_t);
+	cht_buckets_t *b = malloc(bytes, can_block ? 0 : FRAME_ATOMIC);
+	
+	if (!b)
+		return NULL;
+	
+	b->order = order;
+	
+	marked_ptr_t head_link = set_invalid 
+		? make_link(&sentinel, N_INVALID) 
+		: make_link(&sentinel, N_NORMAL);
+	
+	for (size_t i = 0; i < bucket_cnt; ++i) {
+		b->head[i] = head_link;
+	}
+	
+	return b;
+}
+
+/** Computes the least order k >= min_order for which bucket_cnt <= 2^k. */
+static size_t size_to_order(size_t bucket_cnt, size_t min_order)
+{
+	size_t k = min_order;
+	
+	/* min_order itself is always tested; stop as soon as 2^k suffices. */
+	while (((size_t)1 << k) < bucket_cnt) {
+		/* Give up searching once CHT_MAX_ORDER is reached. */
+		if (++k >= CHT_MAX_ORDER)
+			break;
+	}
+	
+	/* Either 2^k accommodates bucket_cnt or the search was abandoned. */
+	return k;
+}
+
+/** Destroys a CHT successfully created via cht_create().
+ * 
+ * Waits for all outstanding concurrent operations to complete and
+ * frees internal allocated resources. Items still present in the table
+ * are not freed, ie they are leaked; an ASSERT checks the table is empty.
+ */
+void cht_destroy(cht_t *h)
+{
+	cht_destroy_unsafe(h);
+	
+	/* You must clear the table of items. Otherwise cht_destroy will leak. */
+	ASSERT(atomic_get(&h->item_cnt) == 0);
+}
+
+/** Destroys a successfully created CHT without checking that it is empty. */
+void cht_destroy_unsafe(cht_t *h)
+{
+	/* Wait for pending resize requests to complete. */
+	while (0 < atomic_get(&h->resize_reqs)) {
+		rcu_barrier();
+	}
+	
+	/* Wait for all remove_callback()s to complete. */
+	rcu_barrier();
+	
+	free(h->b);
+	h->b = NULL;
+}
+
+/** Returns the first item equal to the search key or NULL if not found.
+ * 
+ * The call must be enclosed in a rcu_read_lock()/rcu_read_unlock() pair.
+ * The returned item is guaranteed to be allocated until rcu_read_unlock()
+ * although the item may be concurrently removed from the table by another
+ * cpu.
+ * 
+ * Further items matching the key may be retrieved via cht_find_next().
+ * 
+ * cht_find() sees the effects of any completed cht_remove(), cht_insert().
+ * If a concurrent remove or insert had not yet completed cht_find() may
+ * or may not see the effects of it (eg it may find an item being removed).
+ * 
+ * @param h   CHT to operate on.
+ * @param key Search key as defined by cht_ops_t.key_equal() and .key_hash().
+ * @return First item equal to the key or NULL if such an item does not exist.
+ */
+cht_link_t *cht_find(cht_t *h, void *key)
+{
+	/* Make the most recent changes to the table visible before searching. */
+	read_barrier();
+	return cht_find_lazy(h, key);
+}
+
+/** Returns the first item equal to the search key or NULL if not found.
+ * 
+ * Unlike cht_find(), cht_find_lazy() may not see the effects of 
+ * cht_remove() or cht_insert() even though they have already completed.
+ * It may take a couple of milliseconds for those changes to propagate
+ * and become visible to cht_find_lazy(). On the other hand, cht_find_lazy() 
+ * operates a bit faster than cht_find().
+ * 
+ * See cht_find() for the parameters, return value and locking requirements.
+ */
+cht_link_t *cht_find_lazy(cht_t *h, void *key)
+{
+	return find_lazy(h, key);
+}
+
+/** Finds the first item equal to the key; requires an rcu reader section. */
+static inline cht_link_t *find_lazy(cht_t *h, void *key)
+{
+	ASSERT(h);
+	/* Callers must hold an rcu read lock; see cht_find(), cht_find_lazy(). */
+	ASSERT(rcu_read_locked());
+	
+	size_t hash = calc_key_hash(h, key);
+	
+	cht_buckets_t *b = rcu_access(h->b);
+	size_t idx = calc_bucket_idx(hash, b->order);
+	/* 
+	 * No need for access_once. b->head[idx] will point to an allocated node 
+	 * even if marked invalid until we exit rcu read section.
+	 */
+	marked_ptr_t head = b->head[idx];
+	
+	/* The head is invalid, ie a resize is under way - take the slow path. */
+	if (N_INVALID == get_mark(head))
+		return find_resizing(h, key, hash, head, idx);
+	
+	return search_bucket(h, head, key, hash);
+}
+
+/** Returns the next item matching \a item. 
+ * 
+ * Must be enclosed in a rcu_read_lock()/rcu_read_unlock() pair. Effects
+ * of completed cht_remove(), cht_insert() are guaranteed to be visible
+ * to cht_find_next().
+ * 
+ * See cht_find() for more details.  
+ */
+cht_link_t *cht_find_next(cht_t *h, const cht_link_t *item)
+{
+	/* Make the most recent changes to the table visible before searching. */
+	read_barrier();
+	return cht_find_next_lazy(h, item);
+}
+
+/** Returns the next item matching \a item. 
+ * 
+ * Must be enclosed in a rcu_read_lock()/rcu_read_unlock() pair; \a h and
+ * \a item must not be NULL. Effects of completed cht_remove(), cht_insert()
+ * may or may not be visible to cht_find_next_lazy().
+ * 
+ * See cht_find_lazy() for more details.  
+ */
+cht_link_t *cht_find_next_lazy(cht_t *h, const cht_link_t *item)
+{
+	ASSERT(h);
+	ASSERT(rcu_read_locked());
+	ASSERT(item);
+	
+	return find_duplicate(h, item, calc_node_hash(h, item), get_next(item->link));
+}
+
+/** Returns the first non-deleted item equal to key in bucket head, or NULL. */
+static inline cht_link_t *search_bucket(cht_t *h, marked_ptr_t head, void *key, 
+	size_t search_hash)
+{
+	/* 
+	 * It is safe to access nodes even outside of this bucket (eg when
+	 * splitting the bucket). The resizer makes sure that any node we 
+	 * may find by following the next pointers is allocated.
+	 */
+
+	cht_link_t *cur = NULL;
+	marked_ptr_t prev = head;
+
+try_again:
+	/* Skip items with smaller hashes; stops at the first hash >= search_hash. */
+	do {
+		cur = get_next(prev);
+		ASSERT(cur);
+		prev = cur->link;
+	} while (node_hash(h, cur) < search_hash);
+	
+	/* 
+	 * Only search for an item with an equal key if cur is not the sentinel
+	 * node or a node with a different hash. 
+	 */
+	while (node_hash(h, cur) == search_hash) {
+		if (h->op->key_equal(key, cur)) {
+			/* Items marked N_DELETED are skipped, never returned. */
+			if (!(N_DELETED & get_mark(cur->link)))
+				return cur;
+		}
+		
+		cur = get_next(cur->link);
+		ASSERT(cur);
+	} 
+	
+	/* 
+	 * In the unlikely case that we have encountered a node whose cached
+	 * hash has been overwritten due to a pending rcu_call for it, skip
+	 * the node and try again.
+	 */
+	if (node_hash(h, cur) == h->invalid_hash) {
+		prev = cur->link;
+		goto try_again;
+	}
+	
+	return NULL;
+}
+
+/** Looks up key while the table is being resized; returns the item or NULL. */
+static cht_link_t *find_resizing(cht_t *h, void *key, size_t hash, 
+	marked_ptr_t old_head, size_t old_idx)
+{
+	ASSERT(N_INVALID == get_mark(old_head)); 
+	ASSERT(h->new_b);
+	
+	size_t new_idx = calc_bucket_idx(hash, h->new_b->order);
+	marked_ptr_t new_head = h->new_b->head[new_idx];
+	marked_ptr_t search_head = new_head;
+	
+	/* Growing. */
+	if (h->b->order < h->new_b->order) {
+		/* 
+		 * Old bucket head is invalid, so it must have been already
+		 * moved. Make the new head visible if still not visible, ie
+		 * invalid.
+		 */
+		if (N_INVALID == get_mark(new_head)) {
+			/* 
+			 * We should be searching a newly added bucket but the old
+			 * moved bucket has not yet been split (it's marked invalid) 
+			 * or we have not yet seen the split. 
+			 */
+			if (grow_idx(old_idx) != new_idx) {
+				/* 
+				 * Search the moved bucket. It is guaranteed to contain
+				 * items of the newly added bucket that were present
+				 * before the moved bucket was split.
+				 */
+				new_head = h->new_b->head[grow_idx(old_idx)];
+			}
+			
+			/* new_head is now the moved bucket, either valid or invalid. */
+			
+			/* 
+			 * The old bucket was definitely moved to new_head but the
+			 * change of new_head had not yet propagated to this cpu.
+			 */
+			if (N_INVALID == get_mark(new_head)) {
+				/*
+				 * We could issue a read_barrier() and make the now valid
+				 * moved bucket head new_head visible, but instead fall back
+				 * on using the old bucket. Although the old bucket head is 
+				 * invalid, it points to a node that is allocated and in the 
+				 * right bucket. Before the node can be freed, it must be
+				 * unlinked from the head (or another item after that item
+				 * modified the new_head) and a grace period must elapse. 
+				 * As a result had the node been already freed the grace
+				 * period preceding the free() would make the unlink and
+				 * any changes to new_head visible. Therefore, it is safe
+				 * to use the node pointed to from the old bucket head.
+				 */
+
+				search_head = old_head;
+			} else {
+				search_head = new_head;
+			}
+		}
+		
+		return search_bucket(h, search_head, key, hash);
+	} else if (h->b->order > h->new_b->order) {
+		/* Shrinking. */
+		
+		/* Index of the bucket in the old table that was moved. */
+		size_t move_src_idx = grow_idx(new_idx);
+		marked_ptr_t moved_old_head = h->b->head[move_src_idx];
+		
+		/*
+		 * h->b->head[move_src_idx] had already been moved to new_head 
+		 * but the change to new_head had not yet propagated to us.
+		 */
+		if (N_INVALID == get_mark(new_head)) {
+			/*
+			 * new_head is definitely valid and we could make it visible 
+			 * to this cpu with a read_barrier(). Instead, use the bucket 
+			 * in the old table that was moved even though it is now marked 
+			 * as invalid. The node it points to must be allocated because
+			 * a grace period would have to elapse before it could be freed;
+			 * and the grace period would make the now valid new_head 
+			 * visible to all cpus. 
+			 * 
+			 * Note that move_src_idx may not be the same as old_idx.
+			 * If move_src_idx != old_idx then old_idx is the bucket
+			 * in the old table that is not moved but instead it is
+			 * appended to the moved bucket, ie it is added at the tail
+			 * of new_head. In that case an invalid old_head notes that
+			 * it had already been merged into (the moved) new_head. 
+			 * We will try to search that bucket first because it
+			 * may contain some newly added nodes after the bucket 
+			 * join. Moreover, the bucket joining link may already be 
+			 * visible even if new_head is not. Therefore, if we're
+			 * lucky we'll find the item via moved_old_head. In any
+			 * case, we'll retry in proper old_head if not found.
+			 */
+			search_head = moved_old_head;
+		}
+		
+		cht_link_t *ret = search_bucket(h, search_head, key, hash);
+		
+		if (ret)
+			return ret;
+		/*
+		 * Bucket old_head was already joined with moved_old_head
+		 * in the new table but we have not yet seen change of the
+		 * joining link (or the item is not in the table).
+		 */
+		if (move_src_idx != old_idx && get_next(old_head) != &sentinel) {
+			/*
+			 * Note that old_head (the bucket to be merged into new_head) 
+			 * points to an allocated join node (if non-null) even if marked 
+			 * invalid. Before the resizer lets join nodes be unlinked
+			 * (and freed) it sets old_head to NULL and waits for a grace period.
+			 * So either the invalid old_head points to join node; or old_head
+			 * is null and we would have seen a completed bucket join while
+			 * traversing search_head.
+			 */
+			ASSERT(N_JOIN & get_mark(get_next(old_head)->link));
+			return search_bucket(h, old_head, key, hash);
+		}
+		
+		return NULL;
+	} else {
+		/* 
+		 * Resize is almost done. The resizer is waiting to make
+		 * sure all cpus see that the new table replaced the old one.
+		 */
+		ASSERT(h->b->order == h->new_b->order);
+		/* 
+		 * The resizer must ensure all new bucket heads are visible before
+		 * replacing the old table.
+		 */
+		ASSERT(N_NORMAL == get_mark(new_head));
+		return search_bucket(h, new_head, key, hash);
+	}
+}
+
+/** Inserts \a item without a duplicate check, so equal items may coexist. */
+void cht_insert(cht_t *h, cht_link_t *item)
+{
+	insert_impl(h, item, NULL);
+}
+
+/** Inserts a unique item. Returns false if an equal item was already present. 
+ * 
+ * Use this function to atomically check if an equal/duplicate item had
+ * not yet been inserted into the table and to insert this item into the 
+ * table.
+ * 
+ * The following is @e NOT thread-safe, so do not use:
+ * @code
+ * if (!cht_find(h, key)) {
+ *     // A concurrent insert here may go unnoticed by cht_find() above.
+ *     item = malloc(..);
+ *     cht_insert(h, item);
+ *     // Now we may have two items with equal search keys.
+ * }
+ * @endcode
+ * 
+ * Replace such code with:
+ * @code
+ * item = malloc(..);
+ * if (!cht_insert_unique(h, item, &dup_item)) {
+ *     // Whoops, someone beat us to it - an equal item 'dup_item'
+ *     // had already been inserted.
+ *     free(item); 
+ * } else {
+ *     // Successfully inserted the item and we are guaranteed that
+ *     // there are no other equal items.
+ * }
+ * @endcode
+ * The caller must hold the RCU read lock and pass a non-NULL \a dup_item.
+ */
+bool cht_insert_unique(cht_t *h, cht_link_t *item, cht_link_t **dup_item)
+{
+	ASSERT(rcu_read_locked());
+	ASSERT(dup_item);
+	return insert_impl(h, item, dup_item);
+}
+
+/** Inserts \a item; if \a dup_item is set, returns false when an equal item exists. */
+static bool insert_impl(cht_t *h, cht_link_t *item, cht_link_t **dup_item)
+{
+	rcu_read_lock();
+	/* Memoize the item's hash, then pick its bucket in the current table. */
+	cht_buckets_t *b = rcu_access(h->b);
+	memoize_node_hash(h, item);
+	size_t hash = node_hash(h, item);
+	size_t idx = calc_bucket_idx(hash, b->order);
+	marked_ptr_t *phead = &b->head[idx];
+
+	bool resizing = false;
+	bool inserted = false;
+	
+	do {
+		walk_mode_t walk_mode = WM_NORMAL;
+		bool join_finishing = false; /* upd_resizing_head() may not set it. */
+		
+		resizing = resizing || (N_NORMAL != get_mark(*phead));
+		
+		/* The table is resizing. Get the correct bucket head. */
+		if (resizing) {
+			upd_resizing_head(h, hash, &phead, &join_finishing, &walk_mode);
+		}
+		
+		wnd_t wnd = {
+			.ppred = phead,
+			.cur = get_next(*phead),
+			.last = NULL
+		};
+		
+		if (!find_wnd_and_gc(h, hash, walk_mode, &wnd, &resizing)) {
+			/* Could not GC a node; or detected an unexpected resize. */
+			continue;
+		}
+		
+		if (dup_item && has_duplicate(h, item, hash, wnd.cur, dup_item)) {
+			rcu_read_unlock();
+			return false;
+		}
+		
+		inserted = insert_at(item, &wnd, walk_mode, &resizing);		
+	} while (!inserted);
+	
+	rcu_read_unlock();
+
+	item_inserted(h);
+	return true;
+}
+
+/** Inserts item between wnd.ppred and wnd.cur. 
+ * 
+ * @param item      Item to link to wnd.ppred and wnd.cur.
+ * @param wnd       The item will be inserted before wnd.cur. Wnd.ppred
+ *                  must be N_NORMAL.
+ * @param walk_mode Selects which predecessor marks the linking CAS expects.
+ * @param resizing  Set to true only if the table is undergoing resize 
+ *         and it was not expected (ie walk_mode == WM_NORMAL).
+ * @return True if the item was successfully linked to wnd.ppred. False
+ *         if whole insert operation must be retried because the predecessor
+ *         of wnd.cur has changed.
+ */
+inline static bool insert_at(cht_link_t *item, const wnd_t *wnd, 
+	walk_mode_t walk_mode, bool *resizing)
+{
+	marked_ptr_t ret;
+	
+	if (walk_mode == WM_NORMAL) {
+		item->link = make_link(wnd->cur, N_NORMAL);
+		/* Initialize the item before adding it to a bucket. */
+		memory_barrier();
+		
+		/* Link a clean/normal predecessor to the item. */
+		ret = cas_link(wnd->ppred, wnd->cur, N_NORMAL, item, N_NORMAL);
+		
+		if (ret == make_link(wnd->cur, N_NORMAL)) {
+			return true;
+		} else {
+			/* This includes an invalid head but not a const head. */
+			*resizing = ((N_JOIN_FOLLOWS | N_JOIN) & get_mark(ret));
+			return false;
+		}
+	} else if (walk_mode == WM_MOVE_JOIN_FOLLOWS) {
+		/* Move JOIN_FOLLOWS mark but filter out the DELETED mark. */
+		mark_t jf_mark = get_mark(*wnd->ppred) & N_JOIN_FOLLOWS;
+		item->link = make_link(wnd->cur, jf_mark);
+		/* Initialize the item before adding it to a bucket. */
+		memory_barrier();
+		
+		/* Link the not-deleted predecessor to the item. Move its JF mark. */
+		ret = cas_link(wnd->ppred, wnd->cur, jf_mark, item, N_NORMAL);
+		
+		return ret == make_link(wnd->cur, jf_mark);
+	} else {
+		ASSERT(walk_mode == WM_LEAVE_JOIN);
+
+		item->link = make_link(wnd->cur, N_NORMAL);
+		/* Initialize the item before adding it to a bucket. */
+		memory_barrier();
+		
+		mark_t pred_mark = get_mark(*wnd->ppred);
+		/* If the predecessor is a join node it may be marked deleted.*/
+		mark_t exp_pred_mark = (N_JOIN & pred_mark) ? pred_mark : N_NORMAL;
+
+		ret = cas_link(wnd->ppred, wnd->cur, exp_pred_mark, item, exp_pred_mark);
+		return ret == make_link(wnd->cur, exp_pred_mark);
+	}
+}
+
+/** Returns true if the chain starting at cur has an item equal to \a item.
+ * 
+ * @param h    CHT to operate on.
+ * @param item Item whose duplicates the function looks for.
+ * @param hash Hash of \a item.
+ * @param[in] cur  The first node with a hash greater than or equal to item's hash.
+ * @param[out] dup_item The first duplicate item encountered; written only if
+ *             the chain is actually searched (NULL when none is found).
+ */
+static inline bool has_duplicate(cht_t *h, const cht_link_t *item, size_t hash, 
+	cht_link_t *cur, cht_link_t **dup_item)
+{
+	ASSERT(cur);
+	ASSERT(cur == &sentinel || hash <= node_hash(h, cur)
+		|| node_hash(h, cur) == h->invalid_hash);
+	
+	/* hash < node_hash(h, cur) */
+	if (hash != node_hash(h, cur) && h->invalid_hash != node_hash(h, cur))
+		return false;
+
+	/* 
+	 * Load the most recent node marks. Otherwise we might pronounce a 
+	 * logically deleted node for a duplicate of the item just because 
+	 * the deleted node's DEL mark had not yet propagated to this cpu.
+	 */
+	read_barrier();
+	
+	*dup_item = find_duplicate(h, item, hash, cur);
+	return NULL != *dup_item;
+}
+
+/** Returns a non-deleted item equal to \a item in the chain at \a start, or NULL. */
+static cht_link_t *find_duplicate(cht_t *h, const cht_link_t *item, size_t hash, 
+	cht_link_t *start)
+{
+	ASSERT(hash <= node_hash(h, start) || h->invalid_hash == node_hash(h, start));
+
+	cht_link_t *cur = start;
+	
+try_again:	
+	ASSERT(cur);
+
+	while (node_hash(h, cur) == hash) {
+		ASSERT(cur != &sentinel);
+		
+		bool deleted = (N_DELETED & get_mark(cur->link));
+		
+		/* Skip logically deleted nodes. */
+		if (!deleted && h->op->equal(item, cur))
+			return cur;
+		
+		cur = get_next(cur->link);
+		ASSERT(cur);
+	} 
+
+	/* Skip logically deleted nodes with rcu_call() in progress. */
+	if (h->invalid_hash == node_hash(h, cur)) {
+		cur = get_next(cur->link);
+		goto try_again;
+	}
+	
+	return NULL;
+}
+
+/** Deletes every item equal to \a key and returns how many were deleted. */
+size_t cht_remove_key(cht_t *h, void *key)
+{
+	ASSERT(h);
+	
+	size_t key_hash = calc_key_hash(h, key);
+	size_t cnt;
+	
+	/* Each successful remove_pred() call removes one matching item. */
+	for (cnt = 0; remove_pred(h, key_hash, h->op->key_equal, key); ++cnt)
+		continue;
+	return cnt;
+}
+
+/** Removes one particular item from the table. 
+ * 
+ * The caller must hold the rcu read lock. 
+ * 
+ * @param item Item presumably present in the table and to be removed.
+ * @return True if this call removed the item; false if it could not be
+ *     found, eg because it had already been deleted. 
+ */
+bool cht_remove_item(cht_t *h, cht_link_t *item)
+{
+	ASSERT(h);
+	ASSERT(item);
+	/* Without the lock a concurrent cht_remove_key might free the item. */
+	ASSERT(rcu_read_locked());
+
+	/* 
+	 * Knowing the node is not enough: it has to be unlinked from the
+	 * correct bucket and from a clean/normal predecessor. Hence the
+	 * node is looked up again from the head of its bucket, matching
+	 * it with same_node_pred.
+	 */
+	return remove_pred(h, calc_node_hash(h, item), same_node_pred, item);
+}
+
+/** Removes one item for which pred(pred_arg, item) holds; false if none is found. */
+static bool remove_pred(cht_t *h, size_t hash, equal_pred_t pred, void *pred_arg)
+{
+	rcu_read_lock();
+	
+	bool resizing = false;
+	bool deleted = false;
+	bool deleted_but_gc = false;
+	
+	cht_buckets_t *b = rcu_access(h->b);
+	size_t idx = calc_bucket_idx(hash, b->order);
+	marked_ptr_t *phead = &b->head[idx];
+	
+	do {
+		walk_mode_t walk_mode = WM_NORMAL;
+		bool join_finishing = false;
+		
+		resizing = resizing || (N_NORMAL != get_mark(*phead));
+		
+		/* The table is resizing. Get the correct bucket head. */
+		if (resizing) {
+			upd_resizing_head(h, hash, &phead, &join_finishing, &walk_mode);
+		}
+		
+		wnd_t wnd = {
+			.ppred = phead,
+			.cur = get_next(*phead),
+			.last = NULL
+		};
+		
+		if (!find_wnd_and_gc_pred(
+			h, hash, walk_mode, pred, pred_arg, &wnd, &resizing)) {
+			/* Could not GC a node; or detected an unexpected resize. */
+			continue;
+		}
+		
+		/* 
+		 * The item lookup is affected by a bucket join but effects of
+		 * the bucket join have not been seen while searching for the item.
+		 */
+		if (join_finishing && !join_completed(h, &wnd)) {
+			/* 
+			 * Bucket was appended at the end of another but the next 
+			 * ptr linking them together was not visible on this cpu. 
+			 * join_completed() makes this appended bucket visible.
+			 */
+			continue;
+		}
+		
+		/* Already deleted, but delete_at() requested one GC pass. */
+		if (deleted_but_gc)
+			break;
+		
+		bool found = (wnd.cur != &sentinel && pred(pred_arg, wnd.cur));
+		
+		if (!found) {
+			rcu_read_unlock();
+			return false;
+		}
+		
+		deleted = delete_at(h, &wnd, walk_mode, &deleted_but_gc, &resizing);		
+	} while (!deleted || deleted_but_gc);
+	
+	rcu_read_unlock();
+	return true;
+}
+
+/** Unlinks wnd.cur from wnd.ppred and schedules a deferred free for the item.
+ * 
+ * Ignores nodes marked N_JOIN if walk mode is WM_LEAVE_JOIN.
+ * 
+ * @param h   CHT to operate on.
+ * @param wnd Points to the item to delete and its N_NORMAL predecessor.
+ * @param walk_mode Bucket chain walk mode.
+ * @param deleted_but_gc Set to true if the item had been logically deleted, 
+ *         but a garbage collecting walk of the bucket is in order for
+ *         it to be fully unlinked.         
+ * @param resizing Set to true if the table is undergoing an unexpected
+ *         resize (ie walk_mode == WM_NORMAL).
+ * @return False if the wnd.ppred changed in the meantime and the whole
+ *         delete operation must be retried.
+ */
+static inline bool delete_at(cht_t *h, wnd_t *wnd, walk_mode_t walk_mode, 
+	bool *deleted_but_gc, bool *resizing)
+{
+	ASSERT(wnd->cur && wnd->cur != &sentinel);
+	
+	*deleted_but_gc = false;
+	
+	if (!mark_deleted(wnd->cur, walk_mode, resizing)) {
+		/* Already deleted, or unexpectedly marked as JOIN/JOIN_FOLLOWS. */
+		return false;
+	}
+	
+	/* Marked deleted. Unlink from the bucket. */
+	
+	/* Never unlink join nodes. */
+	if (walk_mode == WM_LEAVE_JOIN && (N_JOIN & get_mark(wnd->cur->link)))
+		return true;
+	
+	cas_order_barrier();
+	
+	if (unlink_from_pred(wnd, walk_mode, resizing)) {
+		free_later(h, wnd->cur);
+	} else {
+		*deleted_but_gc = true;
+	}
+	
+	return true;
+}
+
+/** Sets N_DELETED on cur->link with one CAS. Returns false to request a retry. */
+static inline bool mark_deleted(cht_link_t *cur, walk_mode_t walk_mode, 
+	bool *resizing)
+{
+	ASSERT(cur && cur != &sentinel);
+	
+	/* 
+	 * Btw, we could loop here if the cas fails but let's not complicate
+	 * things and let's retry from the head of the bucket. 
+	 */
+	
+	cht_link_t *next = get_next(cur->link);
+	
+	if (walk_mode == WM_NORMAL) {
+		/* Only mark clean/normal nodes - JF/JN is used only during resize. */
+		marked_ptr_t ret = cas_link(&cur->link, next, N_NORMAL, next, N_DELETED);
+		
+		if (ret != make_link(next, N_NORMAL)) {
+			*resizing = (N_JOIN | N_JOIN_FOLLOWS) & get_mark(ret);
+			return false;
+		}
+	} else {
+		ASSERT(N_JOIN == N_JOIN_FOLLOWS);
+		
+		/* JN and JF are the same bit (asserted above): keep it, strip N_DELETED. */
+		mark_t cur_mark = get_mark(cur->link) & N_JOIN_FOLLOWS;
+		
+		marked_ptr_t ret = 
+			cas_link(&cur->link, next, cur_mark, next, cur_mark | N_DELETED);
+		
+		if (ret != make_link(next, cur_mark))
+			return false;
+	} 
+	
+	return true;
+}
+
+/** Unlinks the N_DELETED wnd.cur from wnd.ppred. Returns false if the CAS failed. */
+static inline bool unlink_from_pred(wnd_t *wnd, walk_mode_t walk_mode, 
+	bool *resizing)
+{
+	ASSERT(wnd->cur != &sentinel);
+	ASSERT(wnd->cur && (N_DELETED & get_mark(wnd->cur->link)));
+	
+	cht_link_t *next = get_next(wnd->cur->link);
+		
+	if (walk_mode == WM_LEAVE_JOIN) {
+		/* Never try to unlink join nodes. */
+		ASSERT(!(N_JOIN & get_mark(wnd->cur->link)));
+
+		mark_t pred_mark = get_mark(*wnd->ppred);
+		/* Succeed only if the predecessor is clean/normal or a join node. */
+		mark_t exp_pred_mark = (N_JOIN & pred_mark) ? pred_mark : N_NORMAL;
+		
+		marked_ptr_t pred_link = make_link(wnd->cur, exp_pred_mark);
+		marked_ptr_t next_link = make_link(next, exp_pred_mark);
+		
+		if (pred_link != _cas_link(wnd->ppred, pred_link, next_link))
+			return false;
+	} else {
+		ASSERT(walk_mode == WM_MOVE_JOIN_FOLLOWS || walk_mode == WM_NORMAL);
+		/* Move the JF mark if set. Clear DEL mark. */
+		mark_t cur_mark = N_JOIN_FOLLOWS & get_mark(wnd->cur->link);
+		
+		/* The predecessor must be clean/normal. */
+		marked_ptr_t pred_link = make_link(wnd->cur, N_NORMAL);
+		/* Link to cur's successor keeping/copying cur's JF mark. */
+		marked_ptr_t next_link = make_link(next, cur_mark);		
+		
+		marked_ptr_t ret = _cas_link(wnd->ppred, pred_link, next_link);
+		
+		if (pred_link != ret) {
+			/* If we're not resizing the table there are no JF/JN nodes. */
+			*resizing = (walk_mode == WM_NORMAL) 
+				&& (N_JOIN_FOLLOWS & get_mark(ret));
+			return false;
+		}
+	}
+	
+	return true;
+}
+
+/** Finds the first non-deleted item equal to \a pred_arg according to \a pred.
+ * 
+ * The function returns the candidate item in \a wnd. Logically deleted
+ * nodes are garbage collected so the predecessor will most likely not
+ * be marked as deleted. 
+ * 
+ * Unlike find_wnd_and_gc(), this function never returns a node that
+ * is known to have already been marked N_DELETED.
+ *
+ * Any logically deleted nodes (ie those marked N_DELETED) are garbage
+ * collected, ie freed in the background via rcu_call (except for join-nodes
+ * if walk_mode == WM_LEAVE_JOIN).
+ * 
+ * @param h         CHT to operate on.
+ * @param hash      Hash to search for.
+ * @param walk_mode Bucket chain walk mode.
+ * @param pred      Predicate used to find an item equal to pred_arg.
+ * @param pred_arg  Argument to pass to the equality predicate \a pred.
+ * @param[in,out] wnd The search starts with wnd.cur. If the desired
+ *                  item is found wnd.cur will point to it.
+ * @param resizing  Set to true if the table is resizing but it was not
+ *                  expected (ie walk_mode == WM_NORMAL).
+ * @return False if the operation has to be retried. True otherwise 
+ *        (even if an equal item had not been found).
+ */
+static bool find_wnd_and_gc_pred(cht_t *h, size_t hash, walk_mode_t walk_mode, 
+	equal_pred_t pred, void *pred_arg, wnd_t *wnd, bool *resizing)
+{
+	ASSERT(wnd->cur);
+	
+	if (wnd->cur == &sentinel)
+		return true;
+	
+	/* 
+	 * A read barrier is not needed here to bring up the most recent 
+	 * node marks (esp the N_DELETED). At worst we'll try to delete
+	 * an already deleted node; fail in delete_at(); and retry.
+	 */
+	
+	size_t cur_hash;
+
+try_again:	
+	cur_hash = node_hash(h, wnd->cur);
+		
+	while (cur_hash <= hash) {
+		ASSERT(wnd->cur && wnd->cur != &sentinel);
+		
+		/* GC any deleted nodes on the way. */
+		if (N_DELETED & get_mark(wnd->cur->link)) {
+			if (!gc_deleted_node(h, walk_mode, wnd, resizing)) {
+				/* Retry from the head of a bucket. */
+				return false;
+			}
+		} else {
+			/* Is this the node we were looking for? */
+			if (cur_hash == hash && pred(pred_arg, wnd->cur))
+				return true;
+			
+			next_wnd(wnd);
+		}
+		
+		cur_hash = node_hash(h, wnd->cur);
+	}
+	
+	if (cur_hash == h->invalid_hash) {
+		next_wnd(wnd);
+		ASSERT(wnd->cur);
+		goto try_again;
+	}
+	
+	/* The searched for node is not in the current bucket. */
+	return true;
+}
+
+/** Finds the first item (deleted or not) with a hash greater or equal to \a hash.
+ * 
+ * The function returns the first item with a hash that is greater or 
+ * equal to \a hash in \a wnd. Moreover it garbage collects logically
+ * deleted nodes that have not yet been unlinked and freed. Therefore,
+ * the returned node's predecessor will most likely be N_NORMAL.
+ * 
+ * Unlike find_wnd_and_gc_pred(), this function may return a node
+ * that is known to have been marked N_DELETED.
+ *  
+ * @param h         CHT to operate on.
+ * @param hash      Hash of the item to find.
+ * @param walk_mode Bucket chain walk mode.
+ * @param[in,out] wnd wnd.cur denotes the first node of the chain. If
+ *                  the operation is successful, \a wnd points to the desired 
+ *                  item.
+ * @param resizing  Set to true if a table resize was detected but walk_mode
+ *                  suggested the table was not undergoing a resize.
+ * @return False indicates the operation must be retried. True otherwise 
+ *       (even if an item with exactly the same hash was not found).
+ */
+static bool find_wnd_and_gc(cht_t *h, size_t hash, walk_mode_t walk_mode, 
+	wnd_t *wnd, bool *resizing)
+{
+try_again:
+	ASSERT(wnd->cur);
+
+	while (node_hash(h, wnd->cur) < hash) {
+		/* GC any deleted nodes along the way to our desired node. */
+		if (N_DELETED & get_mark(wnd->cur->link)) {
+			if (!gc_deleted_node(h, walk_mode, wnd, resizing)) {
+				/* Failed to remove the garbage node. Retry. */
+				return false;
+			}
+		} else {
+			next_wnd(wnd);
+		}
+		
+		ASSERT(wnd->cur);
+	}
+	
+	if (node_hash(h, wnd->cur) == h->invalid_hash) {
+		next_wnd(wnd);
+		goto try_again;
+	}
+
+	/* wnd->cur is non-NULL (asserted) but may be marked N_DELETED. */
+	return true;
+}
+
+/** Unlinks and frees the N_DELETED node at \a wnd (join nodes are only skipped). */
+static bool gc_deleted_node(cht_t *h, walk_mode_t walk_mode, wnd_t *wnd,
+	bool *resizing)
+{
+	ASSERT(N_DELETED & get_mark(wnd->cur->link));
+
+	/* Skip deleted JOIN nodes. */
+	if (walk_mode == WM_LEAVE_JOIN && (N_JOIN & get_mark(wnd->cur->link))) {
+		next_wnd(wnd);
+	} else {
+		/* Ordinary deleted node or a deleted JOIN_FOLLOWS. */
+		ASSERT(walk_mode != WM_LEAVE_JOIN 
+			|| !((N_JOIN | N_JOIN_FOLLOWS) & get_mark(wnd->cur->link)));
+
+		/* Unlink an ordinary deleted node, move JOIN_FOLLOWS mark. */
+		if (!unlink_from_pred(wnd, walk_mode, resizing)) {
+			/* Retry. The predecessor was deleted, invalid, const, join_follows. */
+			return false;
+		}
+
+		free_later(h, wnd->cur);
+
+		/* Leave ppred as is; only cur advances past the unlinked node. */
+		wnd->last = wnd->cur;
+		wnd->cur = get_next(wnd->cur->link);
+	}
+	
+	return true;
+}
+
+/** Returns true if a bucket join had already completed.
+ * 
+ * May only be called if upd_resizing_head() indicates a bucket join 
+ * may be in progress.
+ * 
+ * If it returns false, the search must be retried in order to guarantee
+ * all items that should have been encountered have been seen.
+ */
+static bool join_completed(cht_t *h, const wnd_t *wnd)
+{
+	/* 
+	 * The table is shrinking and the searched for item is in a bucket 
+	 * appended to another. Check that the link joining these two buckets 
+	 * is visible and if not, make it visible to this cpu.
+	 */
+	
+	/* 
+	 * Resizer ensures h->b->order stays the same for the duration of this 
+	 * func. We got here because there was an alternative head to search.
+	 * The resizer waits for all preexisting readers to finish after it
+	 * ... NOTE(review): this sentence was left unfinished; TODO complete.
+	 */
+	ASSERT(h->b->order > h->new_b->order);
+	ASSERT(wnd->cur);
+	
+	/* Either we did not need the joining link or we have already followed it.*/
+	if (wnd->cur != &sentinel)
+		return true;
+	
+	/* We have reached the end of a bucket. */
+	
+	if (wnd->last != &sentinel) {
+		size_t last_seen_hash = node_hash(h, wnd->last);
+		
+		if (last_seen_hash == h->invalid_hash) {
+			last_seen_hash = calc_node_hash(h, wnd->last);
+		}
+		
+		size_t last_old_idx = calc_bucket_idx(last_seen_hash, h->b->order);
+		size_t move_src_idx = grow_idx(shrink_idx(last_old_idx));
+		
+		/* 
+		 * Last node seen was in the joining bucket - if the searched 
+		 * for node is there we will find it. 
+		 */
+		if (move_src_idx != last_old_idx) 
+			return true;
+	}
+	
+	/* 
+	 * Reached the end of the bucket but no nodes from the joining bucket
+	 * were seen. There should have at least been a JOIN node so we have
+	 * definitely not seen (and followed) the joining link. Make the link
+	 * visible and retry.
+	 */
+	read_barrier();
+	return false;
+}
+
+/** When resizing returns the bucket head to start the search with in \a phead.
+ * 
+ * If a resize had been detected (eg cht_t.b.head[idx] is marked immutable),
+ * upd_resizing_head() moves the bucket for \a hash from the old head
+ * to the new head. Moreover, it splits or joins buckets as necessary.
+ * 
+ * @param h     CHT to operate on.
+ * @param hash  Hash of an item whose chain we would like to traverse.
+ * @param[out] phead Head of the bucket to search for \a hash.
+ * @param[out] join_finishing Written only when shrinking and \a hash falls into
+ *              a bucket being joined: true if the bucket may have to be
+ *              traversed again (see join_completed()). Untouched otherwise.
+ * @param[out] walk_mode Specifies how to interpret node marks.  
+ */
+static void upd_resizing_head(cht_t *h, size_t hash, marked_ptr_t **phead, 
+	bool *join_finishing,  walk_mode_t *walk_mode)
+{
+	cht_buckets_t *b = rcu_access(h->b);
+	size_t old_idx = calc_bucket_idx(hash, b->order);
+	size_t new_idx = calc_bucket_idx(hash, h->new_b->order);
+	
+	marked_ptr_t *pold_head = &b->head[old_idx];
+	marked_ptr_t *pnew_head = &h->new_b->head[new_idx];
+	
+	/* In any case, use the bucket in the new table. */
+	*phead = pnew_head;
+
+	/* Growing the table. */
+	if (b->order < h->new_b->order) {
+		size_t move_dest_idx = grow_idx(old_idx);
+		marked_ptr_t *pmoved_head = &h->new_b->head[move_dest_idx];
+		
+		/* Complete moving the bucket from the old to the new table. */
+		help_head_move(pold_head, pmoved_head);
+		
+		/* The hash belongs to the moved bucket. */
+		if (move_dest_idx == new_idx) {
+			ASSERT(pmoved_head == pnew_head);
+			/* 
+			 * help_head_move() makes the new head of the moved bucket 
+			 * visible. The new head may be marked with a JOIN_FOLLOWS
+			 */
+			ASSERT(!(N_CONST & get_mark(*pmoved_head)));
+			*walk_mode = WM_MOVE_JOIN_FOLLOWS;
+		} else {
+			ASSERT(pmoved_head != pnew_head);
+			/* 
+			 * The hash belongs to the bucket that is the result of splitting 
+			 * the old/moved bucket, ie the bucket that contains the second
+			 * half of the split/old/moved bucket.
+			 */
+			
+			/* The moved bucket has not yet been split. */
+			if (N_NORMAL != get_mark(*pnew_head)) {
+				size_t split_hash = calc_split_hash(new_idx, h->new_b->order);
+				split_bucket(h, pmoved_head, pnew_head, split_hash);
+				/* 
+				 * split_bucket() makes the new head visible. No 
+				 * JOIN_FOLLOWS in this part of split bucket.
+				 */
+				ASSERT(N_NORMAL == get_mark(*pnew_head));
+			}
+			
+			*walk_mode = WM_LEAVE_JOIN;
+		}
+	} else if (h->new_b->order < b->order ) {
+		/* Shrinking the table. */
+		
+		size_t move_src_idx = grow_idx(new_idx);
+		
+		/* 
+		 * Complete moving the bucket from the old to the new table. 
+		 * Makes a valid pnew_head visible if already moved.
+		 */
+		help_head_move(&b->head[move_src_idx], pnew_head);
+		
+		/* Hash belongs to the bucket to be joined with the moved bucket. */
+		if (move_src_idx != old_idx) {
+			/* Bucket join not yet completed. */
+			if (N_INVALID != get_mark(*pold_head)) {
+				size_t split_hash = calc_split_hash(old_idx, b->order);
+				join_buckets(h, pold_head, pnew_head, split_hash);
+			}
+			
+			/* 
+			 * The resizer sets pold_head to &sentinel when all cpus are
+			 * guaranteed to see the bucket join.
+			 */
+			*join_finishing = (&sentinel != get_next(*pold_head));
+		}
+		
+		/* help_head_move() or join_buckets() makes it so or makes the mark visible. */
+		ASSERT(N_INVALID == get_mark(*pold_head));
+		/* help_head_move() makes it visible. No JOIN_FOLLOWS used when shrinking. */
+		ASSERT(N_NORMAL == get_mark(*pnew_head));
+
+		*walk_mode = WM_LEAVE_JOIN;
+	} else {
+		/* 
+		 * Final stage of resize. The resizer is waiting for all 
+		 * readers to notice that the old table had been replaced.
+		 */
+		ASSERT(b == h->new_b);
+		*walk_mode = WM_NORMAL;
+	}
+}
+
+/* Compiled out (#if 0): start_head_move(), a barrier, complete_head_move(). */
+#if 0
+static void move_head(marked_ptr_t *psrc_head, marked_ptr_t *pdest_head)
+{
+	start_head_move(psrc_head);
+	cas_order_barrier();
+	complete_head_move(psrc_head, pdest_head);
+}
+#endif
+
+/** Moves an immutable head \a psrc_head of cht_t.b to \a pdest_head of cht_t.new_b. 
+ * 
+ * The function guarantees the move will be visible on this cpu once
+ * it completes. In particular, *pdest_head will not be N_INVALID.
+ * 
+ * Unlike complete_head_move(), help_head_move() checks if the head had already
+ * been moved and tries to avoid moving the bucket heads if possible.
+ */
+static inline void help_head_move(marked_ptr_t *psrc_head, 
+	marked_ptr_t *pdest_head)
+{
+	/* Head move has to be in progress already when calling this func. */
+	ASSERT(N_CONST & get_mark(*psrc_head));
+	
+	/* Head already moved. */
+	if (N_INVALID == get_mark(*psrc_head)) {
+		/* Effects of the head move have not yet propagated to this cpu. */
+		if (N_INVALID == get_mark(*pdest_head)) {
+			/* Make the move visible on this cpu. */
+			read_barrier();
+		}
+	} else {
+		complete_head_move(psrc_head, pdest_head);
+	}
+	
+	ASSERT(!(N_CONST & get_mark(*pdest_head)));
+}
+
+/** Initiates the move of the old head \a psrc_head.
+ *
+ * Only marks the head immutable; the move may be completed with
+ * help_head_move() or complete_head_move().
+ */
+static void start_head_move(marked_ptr_t *psrc_head)
+{
+	/* Mark src head immutable. */
+	mark_const(psrc_head);
+}
+
+/** Sets N_CONST on a clean (N_NORMAL) head; returns once the head is const. */
+static void mark_const(marked_ptr_t *psrc_head)
+{
+	for (;;) {
+		cht_link_t *succ = get_next(*psrc_head);
+		marked_ptr_t clean = make_link(succ, N_NORMAL);
+
+		/* Try to flip the clean link to const without changing its target. */
+		marked_ptr_t seen = cas_link(psrc_head, succ, N_NORMAL, succ, N_CONST);
+		/* Done if our CAS won or the head already carries N_CONST. */
+		if (seen == clean || (N_CONST & get_mark(seen)))
+			return;
+	}
+}
+
+/** Completes the move of psrc_head to pdest_head begun by start_head_move(). */
+static void complete_head_move(marked_ptr_t *psrc_head, marked_ptr_t *pdest_head)
+{
+	ASSERT(N_JOIN_FOLLOWS != get_mark(*psrc_head));
+	ASSERT(N_CONST & get_mark(*psrc_head));
+	/* The source head is const, so the node it points to is stable. */
+	cht_link_t *next = get_next(*psrc_head);
+	/* Step 1: publish the bucket in the (still invalid) destination head. */
+	DBG(marked_ptr_t ret = )
+		cas_link(pdest_head, &sentinel, N_INVALID, next, N_NORMAL);
+	ASSERT(ret == make_link(&sentinel, N_INVALID) || (N_NORMAL == get_mark(ret)));
+	cas_order_barrier();
+	/* Step 2: mark the source head invalid, ie the move is complete. */
+	DBG(ret = )
+		cas_link(psrc_head, next, N_CONST, next, N_INVALID);
+	ASSERT(ret == make_link(next, N_CONST) || (N_INVALID == get_mark(ret)));
+	cas_order_barrier();
+}
+
+/** Splits the bucket at psrc_head and links to the remainder from pdest_head.
+ * 
+ * Items with hashes greater or equal to \a split_hash are moved to bucket
+ * with head at \a pdest_head. Does nothing if pdest_head is already N_NORMAL.
+ * 
+ * @param h           CHT to operate on.
+ * @param psrc_head   Head of the bucket to split (in cht_t.new_b).
+ * @param pdest_head  Head of the bucket that points to the second part
+ *                    of the split bucket in psrc_head. (in cht_t.new_b)
+ * @param split_hash  Hash of the first possible item in the remainder of 
+ *                    psrc_head, ie the smallest hash pdest_head is allowed
+ *                    to point to.
+ */
+static void split_bucket(cht_t *h, marked_ptr_t *psrc_head, 
+	marked_ptr_t *pdest_head, size_t split_hash)
+{
+	/* Already split. */
+	if (N_NORMAL == get_mark(*pdest_head))
+		return;
+	
+	/*
+	 * L == Last node of the first part of the split bucket. That part
+	 *      remains in the original/src bucket. 
+	 * F == First node of the second part of the split bucket. That part
+	 *      will be referenced from the dest bucket head.
+	 *
+	 * We want to first mark a clean L as JF so that updaters unaware of 
+	 * the split (or table resize):
+	 * - do not insert a new node between L and F
+	 * - do not unlink L (that is why it has to be clean/normal)
+	 * - do not unlink F
+	 *
+	 * Then we can safely mark F as JN even if it has been marked deleted. 
+	 * Once F is marked as JN updaters aware of table resize will not 
+	 * attempt to unlink it (JN will have two predecessors - we cannot
+	 * safely unlink from both at the same time). Updaters unaware of 
+	 * ongoing resize can reach F only via L and that node is already 
+	 * marked JF, so they won't unlink F.
+	 * 
+	 * Last, link the new/dest head to F.
+	 * 
+	 * 
+	 * 0)                           ,-- split_hash, first hash of the dest bucket 
+	 *                              v  
+	 *  [src_head | N] -> .. -> [L] -> [F]
+	 *  [dest_head | Inv]
+	 * 
+	 * 1)                             ,-- split_hash
+	 *                                v  
+	 *  [src_head | N] -> .. -> [JF] -> [F]
+	 *  [dest_head | Inv]
+	 * 
+	 * 2)                             ,-- split_hash
+	 *                                v  
+	 *  [src_head | N] -> .. -> [JF] -> [JN]
+	 *  [dest_head | Inv]
+	 * 
+	 * 3)                             ,-- split_hash
+	 *                                v  
+	 *  [src_head | N] -> .. -> [JF] -> [JN]
+	 *                                   ^
+	 *  [dest_head | N] -----------------'
+	 */
+	wnd_t wnd;
+	
+	rcu_read_lock();
+	
+	/* Mark the last node of the first part of the split bucket as JF. */
+	mark_join_follows(h, psrc_head, split_hash, &wnd);
+	cas_order_barrier();
+	
+	/* There are nodes in the dest bucket, ie the second part of the split. */
+	if (wnd.cur != &sentinel) {
+		/* 
+		 * Mark the first node of the dest bucket as a join node so 
+		 * updaters do not attempt to unlink it if it is deleted. 
+		 */
+		mark_join_node(wnd.cur);
+		cas_order_barrier();
+	} else {
+		/* 
+		 * Second part of the split bucket is empty. There are no nodes
+		 * to mark as JOIN nodes and there never will be.
+		 */
+	}
+	
+	/* Link the dest head to the second part of the split. */
+	DBG(marked_ptr_t ret = )
+		cas_link(pdest_head, &sentinel, N_INVALID, wnd.cur, N_NORMAL);
+	ASSERT(ret == make_link(&sentinel, N_INVALID) || (N_NORMAL == get_mark(ret)));
+	cas_order_barrier();
+	
+	rcu_read_unlock();
+}
+
+/** Finds and marks the last node of psrc_head w/ hash less than split_hash.
+ * 
+ * Finds a node in psrc_head with the greatest hash that is strictly less 
+ * than split_hash and marks it with N_JOIN_FOLLOWS. 
+ * 
+ * Returns a window pointing to that node. The search and the marking are
+ * retried until the mark is in place.
+ * Any logically deleted nodes along the way are 
+ * garbage collected; therefore, the predecessor node (if any) will most 
+ * likely not be marked N_DELETED.
+ * 
+ * @param h          CHT to operate on.
+ * @param psrc_head  Bucket head.
+ * @param split_hash The smallest hash a join node (ie the node following
+ *                   the desired join-follows node) may have.
+ * @param[out] wnd   wnd->ppred is the link marked JF, wnd->cur its successor.
+ */
+static void mark_join_follows(cht_t *h, marked_ptr_t *psrc_head, 
+	size_t split_hash, wnd_t *wnd)
+{
+	/* See comment in split_bucket(). */
+	/* Must start false: a failed GC 'continue's straight to the loop test. */
+	bool done = false;
+	do {
+		bool resizing = false;
+		wnd->ppred = psrc_head;
+		wnd->cur = get_next(*psrc_head);
+		
+		/* 
+		 * Find the split window, ie the last node of the first part of
+		 * the split bucket and its successor - the first node of
+		 * the second part of the split bucket. Retry if GC failed. 
+		 */
+		if (!find_wnd_and_gc(h, split_hash, WM_MOVE_JOIN_FOLLOWS, wnd, &resizing))
+			continue;
+		
+		/* Must not report that the table is resizing if WM_MOVE_JOIN_FOLLOWS.*/
+		ASSERT(!resizing);
+		/* 
+		 * Mark the last node of the first half of the split bucket 
+		 * that a join node follows. It must be clean/normal.
+		 */
+		marked_ptr_t ret
+			= cas_link(wnd->ppred, wnd->cur, N_NORMAL, wnd->cur, N_JOIN_FOLLOWS);
+
+		/* 
+		 * Successfully marked as a JF node or already marked that way (even 
+		 * if also marked deleted - unlinking the node will move the JF mark). 
+		 */
+		done = (ret == make_link(wnd->cur, N_NORMAL))
+			|| (N_JOIN_FOLLOWS & get_mark(ret));
+	} while (!done);
+}
+
+/** Sets the N_JOIN mark on join_node, preserving its other marks. */
+static void mark_join_node(cht_link_t *join_node)
+{
+	/* The protocol is described in split_bucket(). */
+	for (;;) {
+		cht_link_t *succ = get_next(join_node->link);
+		mark_t cur_mark = get_mark(join_node->link);
+		marked_ptr_t expected = make_link(succ, cur_mark);
+
+		/*
+		 * The node may already be marked deleted. It still cannot be
+		 * unlinked because its predecessor carries JOIN_FOLLOWS or CONST.
+		 */
+		marked_ptr_t seen = cas_link(&join_node->link, succ, cur_mark,
+			succ, cur_mark | N_JOIN);
+
+		/* Stop once we set N_JOIN or somebody else already had. */
+		if (seen == expected || (N_JOIN & get_mark(seen)))
+			break;
+		/* The link changed under us - reload it and try again. */
+	}
+}
+
+/** Appends the bucket at psrc_head to the bucket at pdest_head.
+ * 
+ * @param h          CHT to operate on.
+ * @param psrc_head  Bucket to merge with pdest_head.
+ * @param pdest_head Bucket to be joined by psrc_head.
+ * @param split_hash The smallest hash psrc_head may contain.
+ */
+static void join_buckets(cht_t *h, marked_ptr_t *psrc_head, 
+	marked_ptr_t *pdest_head, size_t split_hash)
+{
+	/* Buckets already joined. */
+	if (N_INVALID == get_mark(*psrc_head))
+		return;
+	/*
+	 * F == First node of psrc_head, ie the bucket we want to append 
+	 *      to (ie join with) the bucket starting at pdest_head.
+	 * L == Last node of pdest_head, ie the bucket that psrc_head will
+	 *      be appended to. 
+	 *
+	 * (1) We first mark psrc_head immutable to signal that a join is 
+	 * in progress and so that updaters unaware of the join (or table 
+	 * resize):
+	 * - do not insert new nodes between the head psrc_head and F
+	 * - do not unlink F (it may already be marked deleted)
+	 * 
+	 * (2) Next, F is marked as a join node. Updaters aware of table resize
+	 * will not attempt to unlink it. We cannot safely/atomically unlink 
+	 * the join node because it will be pointed to from two different 
+	 * buckets. Updaters unaware of resize will fail to unlink the join
+	 * node due to the head being marked immutable.
+	 *
+	 * (3) Then the tail of the bucket at pdest_head is linked to the join
+	 * node. From now on, nodes in both buckets can be found via pdest_head.
+	 * 
+	 * (4) Last, mark immutable psrc_head as invalid. It signals updaters
+	 * that the join is complete and they can insert new nodes (originally
+	 * destined for psrc_head) into pdest_head. 
+	 * 
+	 * Note that psrc_head keeps pointing at the join node. This allows
+	 * lookups and updaters to determine if they should see a link between
+	 * the tail L and F when searching for nodes originally in psrc_head
+	 * via pdest_head. If they reach the tail of pdest_head without 
+	 * encountering any nodes of psrc_head, either there were no nodes
+	 * in psrc_head to begin with or the link between L and F did not
+	 * yet propagate to their cpus. If psrc_head was empty, it keeps
+	 * pointing to the sentinel. Otherwise psrc_head points to a join node
+	 * (it will not be unlinked until table resize completes) and
+	 * updaters/lookups
+	 * should issue a read_barrier() to make the link [L]->[JN] visible.
+	 * 
+	 * 0)                           ,-- split_hash, first hash of the src bucket 
+	 *                              v  
+	 *  [dest_head | N]-> .. -> [L]
+	 *  [src_head | N]--> [F] -> .. 
+	 *  ^
+	 *  ` split_hash, first hash of the src bucket
+	 * 
+	 * 1)                            ,-- split_hash
+	 *                               v  
+	 *  [dest_head | N]-> .. -> [L]
+	 *  [src_head | C]--> [F] -> .. 
+	 * 
+	 * 2)                            ,-- split_hash
+	 *                               v  
+	 *  [dest_head | N]-> .. -> [L]
+	 *  [src_head | C]--> [JN] -> .. 
+	 * 
+	 * 3)                            ,-- split_hash
+	 *                               v  
+	 *  [dest_head | N]-> .. -> [L] --+
+	 *                                v
+	 *  [src_head | C]-------------> [JN] -> .. 
+	 * 
+	 * 4)                            ,-- split_hash
+	 *                               v  
+	 *  [dest_head | N]-> .. -> [L] --+
+	 *                                v
+	 *  [src_head | Inv]-----------> [JN] -> .. 
+	 */
+	
+	rcu_read_lock();
+	
+	/* Mark src_head immutable - signals updaters that bucket join started. */
+	mark_const(psrc_head);
+	cas_order_barrier();
+	
+	cht_link_t *join_node = get_next(*psrc_head);
+
+	if (join_node != &sentinel) {
+		mark_join_node(join_node);
+		cas_order_barrier();
+		
+		link_to_join_node(h, pdest_head, join_node, split_hash);
+		cas_order_barrier();
+	} 
+	
+	DBG(marked_ptr_t ret = )
+		cas_link(psrc_head, join_node, N_CONST, join_node, N_INVALID);
+	ASSERT(ret == make_link(join_node, N_CONST) || (N_INVALID == get_mark(ret)));
+	cas_order_barrier();
+	
+	rcu_read_unlock();
+}
+
+/** Links the tail of pdest_head to join_node.
+ * Returns early if the bucket already holds a node with hash >= split_hash.
+ * @param h          CHT to operate on.
+ * @param pdest_head Head of the bucket whose tail is to be linked to join_node.
+ * @param join_node  A node marked N_JOIN with a hash greater or equal to
+ *                   split_hash.
+ * @param split_hash The least hash that is greater than the hash of any items
+ *                   (originally) in pdest_head.
+ */
+static void link_to_join_node(cht_t *h, marked_ptr_t *pdest_head, 
+	cht_link_t *join_node, size_t split_hash)
+{
+	bool done = false; /* 'continue' below jumps straight to the loop test. */
+	do {
+		wnd_t wnd = {
+			.ppred = pdest_head,
+			.cur = get_next(*pdest_head)
+		};
+		
+		bool resizing = false;
+		/* Search for split_hash from the head; start over if GC failed. */
+		if (!find_wnd_and_gc(h, split_hash, WM_LEAVE_JOIN, &wnd, &resizing))
+			continue;
+
+		ASSERT(!resizing);
+		
+		if (wnd.cur != &sentinel) {
+			/* Must be from the new appended bucket. */
+			ASSERT(split_hash <= node_hash(h, wnd.cur) 
+				|| h->invalid_hash == node_hash(h, wnd.cur));
+			return;
+		}
+		
+		/* Reached the tail of pdest_head - link it to the join node. */
+		marked_ptr_t ret = 
+			cas_link(wnd.ppred, &sentinel, N_NORMAL, join_node, N_NORMAL);
+		/* Retry the whole search if the tail link changed meanwhile. */
+		done = (ret == make_link(&sentinel, N_NORMAL));
+	} while (!done);
+}
+
+/** Defers freeing of an unlinked item to RCU and updates the item count.
+ *
+ * RCU invokes op->remove_callback() on the item once all preexisting
+ * references are dropped.
+ */
+static void free_later(cht_t *h, cht_link_t *item)
+{
+	ASSERT(item != &sentinel);
+	/*
+	 * Passing remove_callback off as an rcu_func_t relies on rcu_link
+	 * being the first field of cht_link_t.
+	 */
+	rcu_func_t reclaim = (rcu_func_t) h->op->remove_callback;
+	rcu_call(&item->rcu_link, reclaim);
+	item_removed(h);
+}
+
+/** Notes that an item had been unlinked from the table and shrinks it if needed.
+ * 
+ * A shrink is requested when the item count drops to exactly 1/4 (or, as a
+ * second chance, 1/8) of max_load * bucket count; it runs in the background.
+ */
+static inline void item_removed(cht_t *h)
+{
+	size_t items = (size_t) atomic_predec(&h->item_cnt);
+	size_t bucket_cnt = ((size_t) 1 << h->b->order);
+	/* Shifted in size_t: an int 1 would overflow for large table orders. */
+	bool need_shrink = (items == h->max_load * bucket_cnt / 4);
+	bool missed_shrink = (items == h->max_load * bucket_cnt / 8);
+	/* Never shrink below the table's minimum order. */
+	if ((need_shrink || missed_shrink) && h->b->order > h->min_order) {
+		atomic_count_t resize_reqs = atomic_preinc(&h->resize_reqs);
+		/* The first resize request. Start the resizer. */
+		if (1 == resize_reqs) {
+			workq_global_enqueue_noblock(&h->resize_work, resize_table);
+		}
+	}
+}
+
+/** Notes an item had been inserted and grows the table if needed. 
+ * 
+ * A grow is requested when the count reaches exactly 1x (or 2x) of
+ * max_load * bucket count; the table is resized in the background.
+ */
+static inline void item_inserted(cht_t *h)
+{
+	size_t items = (size_t) atomic_preinc(&h->item_cnt);
+	size_t bucket_cnt = ((size_t) 1 << h->b->order);
+	/* Shifted in size_t: an int 1 would overflow for large table orders. */
+	bool need_grow = (items == h->max_load * bucket_cnt);
+	bool missed_grow = (items == 2 * h->max_load * bucket_cnt);
+	/* Never grow beyond CHT_MAX_ORDER. */
+	if ((need_grow || missed_grow) && h->b->order < CHT_MAX_ORDER) {
+		atomic_count_t resize_reqs = atomic_preinc(&h->resize_reqs);
+		/* The first resize request. Start the resizer. */
+		if (1 == resize_reqs) {
+			workq_global_enqueue_noblock(&h->resize_work, resize_table);
+		}
+	}
+}
+
+/** Work queue handler: resizes the table until all resize requests are consumed. */
+static void resize_table(work_t *arg)
+{
+	cht_t *h = member_to_inst(arg, cht_t, resize_work);
+	
+#ifdef CONFIG_DEBUG
+	ASSERT(h->b);
+	/* Make resize_reqs visible. */
+	read_barrier();
+	ASSERT(0 < atomic_get(&h->resize_reqs));
+#endif
+	/* Must start false: the grow/shrink branches below never assign it. */
+	bool done = false;
+	do {
+		/* Load the most recent  h->item_cnt. */
+		read_barrier();
+		size_t cur_items = (size_t) atomic_get(&h->item_cnt);
+		size_t bucket_cnt = ((size_t) 1 << h->b->order);
+		size_t max_items = h->max_load * bucket_cnt;
+		/* NOTE(review): a failed grow/shrink allocation is retried at once. */
+		if (cur_items >= max_items && h->b->order < CHT_MAX_ORDER) {
+			grow_table(h);
+		} else if (cur_items <= max_items / 4 && h->b->order > h->min_order) {
+			shrink_table(h);
+		} else {
+			/* Table is just the right size. */
+			atomic_count_t reqs = atomic_predec(&h->resize_reqs);
+			done = (reqs == 0);
+		}
+	} while (!done);
+}
+
+/** Increases the number of buckets two-fold. Blocks until done. */
+static void grow_table(cht_t *h)
+{
+	if (h->b->order >= CHT_MAX_ORDER)
+		return;
+	
+	h->new_b = alloc_buckets(h->b->order + 1, true, false);
+
+	/* Failed to alloc a new table - try next time the resizer is run. */
+	if (!h->new_b) 
+		return;
+
+	/* Wait for all readers and updaters to see the initialized new table. */
+	rcu_synchronize();
+	size_t old_bucket_cnt = ((size_t) 1 << h->b->order);
+	/* The count is shifted in size_t so large orders cannot overflow an int. */
+	/* 
+	 * Give updaters a chance to help out with the resize. Do the minimum 
+	 * work needed to announce a resize is in progress, ie start moving heads.
+	 */
+	for (size_t idx = 0; idx < old_bucket_cnt; ++idx) {
+		start_head_move(&h->b->head[idx]);
+	}
+	
+	/* Order start_head_move() wrt complete_head_move(). */
+	cas_order_barrier();
+	
+	/* Complete moving heads and split any buckets not yet split by updaters. */
+	for (size_t old_idx = 0; old_idx < old_bucket_cnt; ++old_idx) {
+		marked_ptr_t *move_dest_head = &h->new_b->head[grow_idx(old_idx)];
+		marked_ptr_t *move_src_head = &h->b->head[old_idx];
+
+		/* Head move not yet completed. */
+		if (N_INVALID != get_mark(*move_src_head)) {
+			complete_head_move(move_src_head, move_dest_head);
+		}
+
+		size_t split_idx = grow_to_split_idx(old_idx);
+		size_t split_hash = calc_split_hash(split_idx, h->new_b->order);
+		marked_ptr_t *split_dest_head = &h->new_b->head[split_idx];
+
+		split_bucket(h, move_dest_head, split_dest_head, split_hash);
+	}
+	
+	/* 
+	 * Wait for all updaters to notice the new heads. Once everyone sees
+	 * the invalid old bucket heads they will know a resize is in progress
+	 * and updaters will modify the correct new buckets. 
+	 */
+	rcu_synchronize();
+	
+	/* Clear the JOIN_FOLLOWS mark and remove the link between the split buckets.*/
+	for (size_t old_idx = 0; old_idx < old_bucket_cnt; ++old_idx) {
+		size_t new_idx = grow_idx(old_idx);
+		
+		cleanup_join_follows(h, &h->new_b->head[new_idx]);
+	}
+
+	/* 
+	 * Wait for everyone to notice that buckets were split, ie link connecting
+	 * the join follows and join node has been cut. 
+	 */
+	rcu_synchronize();
+	
+	/* Clear the JOIN mark and GC any deleted join nodes. */
+	for (size_t old_idx = 0; old_idx < old_bucket_cnt; ++old_idx) {
+		size_t new_idx = grow_to_split_idx(old_idx);
+		
+		cleanup_join_node(h, &h->new_b->head[new_idx]);
+	}
+
+	/* Wait for everyone to see that the table is clear of any resize marks. */
+	rcu_synchronize();
+	
+	cht_buckets_t *old_b = h->b;
+	rcu_assign(h->b, h->new_b);
+
+	/* Wait for everyone to start using the new table. */
+	rcu_synchronize();
+	
+	free(old_b);
+	
+	/* Not needed; just for increased readability. */
+	h->new_b = NULL;
+}
+
+/** Halves the number of buckets. Blocks until done. */
+static void shrink_table(cht_t *h)
+{
+	if (h->b->order <= h->min_order)
+		return;
+	
+	h->new_b = alloc_buckets(h->b->order - 1, true, false);
+
+	/* Failed to alloc a new table - try next time the resizer is run. */
+	if (!h->new_b) 
+		return;
+
+	/* Wait for all readers and updaters to see the initialized new table. */
+	rcu_synchronize();
+	/* Counts are shifted in size_t so large orders cannot overflow an int. */
+	size_t old_bucket_cnt = ((size_t) 1 << h->b->order);
+	
+	/* 
+	 * Give updaters a chance to help out with the resize. Do the minimum 
+	 * work needed to announce a resize is in progress, ie start moving heads.
+	 */
+	for (size_t old_idx = 0; old_idx < old_bucket_cnt; ++old_idx) {
+		size_t new_idx = shrink_idx(old_idx);
+		
+		/* This bucket should be moved. */
+		if (grow_idx(new_idx) == old_idx) {
+			start_head_move(&h->b->head[old_idx]);
+		} else {
+			/* This bucket should join the moved bucket once the move is done.*/
+		}
+	}
+	
+	/* Order start_head_move() wrt complete_head_move(). */
+	cas_order_barrier();
+	
+	/* Complete moving heads and join buckets with the moved buckets. */
+	for (size_t old_idx = 0; old_idx < old_bucket_cnt; ++old_idx) {
+		size_t new_idx = shrink_idx(old_idx);
+		size_t move_src_idx = grow_idx(new_idx);
+		
+		/* This bucket should be moved. */
+		if (move_src_idx == old_idx) {
+			/* Head move not yet completed. */
+			if (N_INVALID != get_mark(h->b->head[old_idx])) {
+				complete_head_move(&h->b->head[old_idx], &h->new_b->head[new_idx]);
+			}
+		} else {
+			/* This bucket should join the moved bucket. */
+			size_t split_hash = calc_split_hash(old_idx, h->b->order);
+			join_buckets(h, &h->b->head[old_idx], &h->new_b->head[new_idx], 
+				split_hash);
+		}
+	}
+	
+	/* 
+	 * Wait for all updaters to notice the new heads. Once everyone sees
+	 * the invalid old bucket heads they will know a resize is in progress
+	 * and updaters will modify the correct new buckets. 
+	 */
+	rcu_synchronize();
+	
+	/* Let everyone know joins are complete and fully visible. */
+	for (size_t old_idx = 0; old_idx < old_bucket_cnt; ++old_idx) {
+		size_t move_src_idx = grow_idx(shrink_idx(old_idx));
+	
+		/* Reset the invalid joinee head so that it points to the sentinel. */
+		if (old_idx != move_src_idx) {
+			ASSERT(N_INVALID == get_mark(h->b->head[old_idx]));
+			
+			if (&sentinel != get_next(h->b->head[old_idx]))
+				h->b->head[old_idx] = make_link(&sentinel, N_INVALID);
+		}
+	}
+	
+	/* Let the reset joinee heads propagate before JOIN marks are cleared (why: TODO). */
+	rcu_synchronize();
+
+	size_t new_bucket_cnt = ((size_t) 1 << h->new_b->order);
+		
+	/* Clear the JOIN mark and GC any deleted join nodes. */
+	for (size_t new_idx = 0; new_idx < new_bucket_cnt; ++new_idx) {
+		cleanup_join_node(h, &h->new_b->head[new_idx]);
+	}
+
+	/* Wait for everyone to see that the table is clear of any resize marks. */
+	rcu_synchronize();
+	
+	cht_buckets_t *old_b = h->b;
+	rcu_assign(h->b, h->new_b);
+	
+	/* Wait for everyone to start using the new table. */
+	rcu_synchronize();
+	
+	free(old_b);
+	
+	/* Not needed; just for increased readability. */
+	h->new_b = NULL;
+}
+
+/** Clears the N_JOIN mark of the first join node found in new_head, if any. */
+static void cleanup_join_node(cht_t *h, marked_ptr_t *new_head)
+{
+	rcu_read_lock();
+
+	/* Walk the bucket and stop at the first node that carries N_JOIN. */
+	for (cht_link_t *node = get_next(*new_head); node != &sentinel;
+		node = get_next(node->link)) {
+		/* A join node may also be marked deleted; clear its JN mark anyway. */
+		if (!(N_JOIN & get_mark(node->link)))
+			continue;
+
+		clear_join_and_gc(h, node, new_head);
+		/* Only the first join node in the bucket is processed. */
+		break;
+	}
+
+	rcu_read_unlock();
+}
+
+/** Clears the join_node's N_JOIN mark and unlinks it if marked N_DELETED as well. */
+static void clear_join_and_gc(cht_t *h, cht_link_t *join_node, 
+	marked_ptr_t *new_head)
+{
+	ASSERT(join_node != &sentinel);
+	ASSERT(join_node && (N_JOIN & get_mark(join_node->link)));
+	
+	bool done;
+	
+	/* Clear the JN mark. */
+	do {
+		marked_ptr_t jn_link = join_node->link;
+		cht_link_t *next = get_next(jn_link);
+		/* Clear the JOIN mark but keep the DEL mark if present. */
+		mark_t cleared_mark = get_mark(jn_link) & N_DELETED;
+
+		marked_ptr_t ret = 
+			_cas_link(&join_node->link, jn_link, make_link(next, cleared_mark));
+
+		/* Done if the mark was cleared. Retry if a new node was inserted. */
+		done = (ret == jn_link);
+		ASSERT(ret == jn_link || (get_mark(ret) & N_JOIN));
+	} while (!done);
+	
+	if (!(N_DELETED & get_mark(join_node->link)))
+		return;
+
+	/* The join node had been marked as deleted - GC it. */
+
+	/* Clear the JOIN mark before trying to unlink the deleted join node.*/
+	cas_order_barrier();
+	
+	size_t jn_hash = node_hash(h, join_node);
+	do {
+		bool resizing = false;
+		
+		wnd_t wnd = {
+			.ppred = new_head,
+			.cur = get_next(*new_head)
+		};
+		/* Search for exactly this node; repeat from the head until done. */
+		done = find_wnd_and_gc_pred(h, jn_hash, WM_NORMAL, same_node_pred, 
+			join_node, &wnd, &resizing);
+		
+		ASSERT(!resizing);
+	} while (!done);
+}
+
+/** Finds a non-deleted node (or head) with N_JOIN_FOLLOWS and clears the mark. */
+static void cleanup_join_follows(cht_t *h, marked_ptr_t *new_head)
+{
+	ASSERT(new_head);
+	
+	rcu_read_lock();
+
+	wnd_t wnd = {
+		.ppred = NULL,
+		.cur = NULL
+	};
+	marked_ptr_t *cur_link = new_head;
+		
+	/*
+	 * Find the non-deleted node with a JF mark and clear the JF mark.
+	 * The JF node may be deleted and/or the mark moved to its neighbors
+	 * at any time. Therefore, we GC deleted nodes until we find the JF 
+	 * node in order to remove stale/deleted JF nodes left behind eg by 
+	 * delayed threads that did not yet get a chance to unlink the deleted 
+	 * JF node and move its mark. 
+	 * 
+	 * Note that the head may be marked JF (but never DELETED).
+	 */
+	while (true) {
+		bool is_jf_node = N_JOIN_FOLLOWS & get_mark(*cur_link);
+		
+		/* GC any deleted nodes on the way - even deleted JOIN_FOLLOWS. */
+		if (N_DELETED & get_mark(*cur_link)) {
+			ASSERT(cur_link != new_head);
+			ASSERT(wnd.ppred && wnd.cur && wnd.cur != &sentinel);
+			ASSERT(cur_link == &wnd.cur->link);
+
+			bool dummy;
+			bool deleted = gc_deleted_node(h, WM_MOVE_JOIN_FOLLOWS, &wnd, &dummy);
+
+			/* Failed to GC or collected a deleted JOIN_FOLLOWS. */
+			if (!deleted || is_jf_node) {
+				/* Retry from the head of the bucket. */
+				cur_link = new_head;
+				continue;
+			}
+		} else {
+			/* Found a non-deleted JF. Clear its JF mark. */
+			if (is_jf_node) {
+				cht_link_t *next = get_next(*cur_link);
+				marked_ptr_t ret = 
+					cas_link(cur_link, next, N_JOIN_FOLLOWS, &sentinel, N_NORMAL);
+				
+				ASSERT(next == &sentinel 
+					|| ((N_JOIN | N_JOIN_FOLLOWS) & get_mark(ret)));
+
+				/* Successfully cleared the JF mark of a non-deleted node. */
+				if (ret == make_link(next, N_JOIN_FOLLOWS)) {
+					break;
+				} else {
+					/* 
+					 * The JF node had been deleted or a new node inserted 
+					 * right after it. Retry from the head.
+					 */
+					cur_link = new_head;
+					continue;
+				}
+			} else {
+				wnd.ppred = cur_link;
+				wnd.cur = get_next(*cur_link);
+			}
+		}
+
+		/* We must encounter a JF node before we reach the end of the bucket. */
+		ASSERT(wnd.cur && wnd.cur != &sentinel);
+		cur_link = &wnd.cur->link;
+	}
+	
+	rcu_read_unlock();
+}
+
+/** Computes the smallest hash that belongs to bucket \a split_idx.
+ *
+ * It is the first hash the remainder of a split bucket may contain.
+ */
+static inline size_t calc_split_hash(size_t split_idx, size_t order)
+{
+	const size_t hash_bits = 8 * sizeof(size_t);
+	ASSERT(1 <= order && order <= hash_bits);
+	return split_idx << (hash_bits - order);
+}
+
+/** Maps \a hash to a bucket index: the top \a order bits of the hash. */
+static inline size_t calc_bucket_idx(size_t hash, size_t order)
+{
+	ASSERT(order >= 1 && order <= sizeof(hash) * 8);
+	return hash >> (sizeof(hash) * 8 - order);
+}
+
+/** Returns the new-table index of the bucket split off from \a old_idx on grow. */
+static inline size_t grow_to_split_idx(size_t old_idx)
+{
+	return 1 | grow_idx(old_idx);
+}
+
+/** Maps a bucket head index to its index in the grown (doubled) table. */
+static inline size_t grow_idx(size_t idx)
+{
+	return 2 * idx;
+}
+
+/** Maps a bucket head index to its index in the shrunk (halved) table. */
+static inline size_t shrink_idx(size_t idx)
+{
+	return idx / 2;
+}
+
+/** Hashes a search key the same way calc_node_hash() hashes items. */
+static inline size_t calc_key_hash(cht_t *h, void *key)
+{
+	size_t mixed = hash_mix(h->op->key_hash(key));
+	return mixed & ~(size_t) 1; /* The lowest bit is always cleared. */
+}
+
+/** Returns the item's memoized hash after asserting that it is plausible. */
+static inline size_t node_hash(cht_t *h, const cht_link_t *item)
+{
+	/* Accepted: the invalid hash, the sentinel's hash or a recomputed hash. */
+	ASSERT(h->invalid_hash == item->hash
+		|| sentinel.hash == item->hash
+		|| calc_node_hash(h, item) == item->hash);
+	return item->hash;
+}
+
+/** Calculates and mixes the hash of the item (never the sentinel). */
+static inline size_t calc_node_hash(cht_t *h, const cht_link_t *item)
+{
+	ASSERT(item != &sentinel);
+
+	size_t mixed = hash_mix(h->op->hash(item));
+
+	/* Drop the lowest bit so the sentinel's hash stays the greatest possible. */
+	return mixed & ~(size_t) 1;
+}
+
+/** Computes the item's hash with calc_node_hash() and stores it in item->hash. */
+static inline void memoize_node_hash(cht_t *h, cht_link_t *item)
+{
+	item->hash = calc_node_hash(h, item);
+}
+
+/** Combines a node address and a mark into one marked pointer. */
+static inline marked_ptr_t make_link(const cht_link_t *next, mark_t mark)
+{
+	const marked_ptr_t addr = (marked_ptr_t) next;
+
+	/* The address must leave the mark bits free; the mark must fit them. */
+	ASSERT(0 == (addr & N_MARK_MASK));
+	ASSERT(0 == ((unsigned) mark & ~N_MARK_MASK));
+	return addr | mark;
+}
+
+/** Returns the next item's address, ie the link with its N_MARK_MASK bits cleared. */
+static inline cht_link_t * get_next(marked_ptr_t link)
+{
+	return (cht_link_t*)(link & ~N_MARK_MASK);
+}
+
+/** Returns the current node's mark, ie the N_MARK_MASK bits of its next link. */
+static inline mark_t get_mark(marked_ptr_t link)
+{
+	return (mark_t)(link & N_MARK_MASK);
+}
+
+/** Advances the window by one item: cur becomes last and its link ppred. */
+static inline void next_wnd(wnd_t *wnd)
+{
+	ASSERT(wnd);
+	ASSERT(wnd->cur);
+	cht_link_t *node = wnd->cur;
+	wnd->last = node;
+	wnd->ppred = &node->link;
+	wnd->cur = get_next(node->link);
+}
+
+/** Predicate that is true only if \a item2 is the very node passed as \a node. */
+static bool same_node_pred(void *node, const cht_link_t *item2)
+{
+	/* Identity comparison - the items' contents are never inspected. */
+	return item2 == (const cht_link_t *) node;
+}
+
+/** CAS of a link given as (next, mark) pairs; returns the value found in *link. */
+static inline marked_ptr_t cas_link(marked_ptr_t *link, const cht_link_t *cur_next, 
+	mark_t cur_mark, const cht_link_t *new_next, mark_t new_mark)
+{
+	marked_ptr_t expected = make_link(cur_next, cur_mark);
+	return _cas_link(link, expected, make_link(new_next, new_mark));
+}
+
+/** Compare-and-swaps a next item link; returns the value found in *link. */
+static inline marked_ptr_t _cas_link(marked_ptr_t *link, marked_ptr_t cur, 
+	marked_ptr_t new)
+{
+	ASSERT(link != &sentinel.link);
+	/*
+	 * cas(x) on the same location x on one cpu must be ordered, but do not
+	 * have to be ordered wrt other cas(y) to a different location y
+	 * on the same cpu.
+	 * 
+	 * cas(x) must act as a write barrier on x, ie if cas(x) succeeds 
+	 * and is observed by another cpu, then all cpus must be able to 
+	 * make the effects of cas(x) visible just by issuing a load barrier.
+	 * For example:
+	 * cpu1         cpu2            cpu3
+	 *                              cas(x, 0 -> 1), succeeds 
+	 *              cas(x, 0 -> 1), fails
+	 *              MB, to order load of x in cas and store to y
+	 *              y = 7
+	 * sees y == 7
+	 * loadMB must be enough to make cas(x) on cpu3 visible to cpu1, ie x == 1.
+	 * 
+	 * If cas() did not work this way:
+	 * a) our head move protocol would not be correct.
+	 * b) freeing an item linked to a moved head after another item was
+	 *   inserted in front of it, would require more than one grace period.
+	 * 
+	 * Ad (a): In the following example, cpu1 starts moving old_head
+	 * to new_head, cpu2 completes the move and cpu3 notices cpu2
+	 * completed the move before cpu1 gets a chance to notice cpu2
+	 * had already completed the move. Our requirements for cas() 
+	 * assume cpu3 will see a valid and mutable value in new_head 
+	 * after issuing a load memory barrier once it has determined 
+	 * the old_head's value had been successfully moved to new_head 
+	 * (because it sees old_head marked invalid).
+	 * 
+	 *  cpu1             cpu2             cpu3
+	 *   cas(old_head, <addr, N>, <addr, Const>), succeeds
+	 *   cas-order-barrier
+	 *   // Move from old_head to new_head started, now the interesting stuff:
+	 *   cas(new_head, <0, Inv>, <addr, N>), succeeds
+	 * 
+	 *                    cas(new_head, <0, Inv>, <addr, N>), but fails
+	 *                    cas-order-barrier
+	 *                    cas(old_head, <addr, Const>, <addr, Inv>), succeeds
+	 *                                     
+	 *                                     Sees old_head marked Inv (by cpu2)
+	 *                                     load-MB
+	 *                                     assert(new_head == <addr, N>)
+	 *   
+	 *   cas-order-barrier
+	 *  
+	 * Even though cpu1 did not yet issue a cas-order-barrier, cpu1's store
+	 * to new_head (successful cas()) must be made visible to cpu3 with
+	 * a load memory barrier if cpu1's store to new_head is visible
+	 * on another cpu (cpu2) and that cpu's (cpu2's) store to old_head
+	 * is already visible to cpu3.
+	 */
+	void *expected = (void*)cur;
+	
+	/* 
+	 * Use the acquire-release model, although we could probably
+	 * get away even with the relaxed memory model due to our use
+	 * of explicit memory barriers. On failure 'expected' is updated
+	 * to the value currently stored in *link.
+	 */
+	__atomic_compare_exchange_n((void**)link, &expected, (void *)new, false,
+		__ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE);
+	
+	return (marked_ptr_t) expected;
+}
+
+/** Orders compare-and-swaps to different memory locations via write_barrier(). */
+static inline void cas_order_barrier(void)
+{
+	/* Make sure CAS to different memory locations are ordered. */
+	write_barrier();
+}
+
+
+/** @}
+ */
Index: kernel/generic/src/adt/list.c
===================================================================
--- kernel/generic/src/adt/list.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/generic/src/adt/list.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -68,24 +68,26 @@
 }
 
-/** Concatenate two lists
- *
- * Concatenate lists @a list1 and @a list2, producing a single
- * list @a list1 containing items from both (in @a list1, @a list2
- * order) and empty list @a list2.
- *
- * @param list1		First list and concatenated output
- * @param list2 	Second list and empty output.
- *
+/** Moves items of one list into another after the specified item.
+ * 
+ * Inserts all items of @a list after item at @a pos in another list. 
+ * Both lists may be empty. 
+ * 
+ * @param list Source list to move after pos. Empty afterwards.
+ * @param pos Source items will be placed after this item.
  */
-void list_concat(list_t *list1, list_t *list2)
+void list_splice(list_t *list, link_t *pos)
 {
-	if (list_empty(list2))
+	if (list_empty(list)) 
 		return;
-
-	list2->head.next->prev = list1->head.prev;
-	list2->head.prev->next = &list1->head;
-	list1->head.prev->next = list2->head.next;
-	list1->head.prev = list2->head.prev;
-	list_initialize(list2);
+	
+	/* Attach list to destination. */
+	list->head.next->prev = pos;
+	list->head.prev->next = pos->next;
+	
+	/* Link destination list to the added list. */
+	pos->next->prev = list->head.prev;
+	pos->next = list->head.next;
+	
+	list_initialize(list);
 }
 
Index: kernel/generic/src/console/chardev.c
===================================================================
--- kernel/generic/src/console/chardev.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/generic/src/console/chardev.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -39,5 +39,5 @@
 #include <print.h>
 #include <func.h>
-#include <arch.h>
+#include <cpu.h>
 
 /** Initialize input character device.
Index: kernel/generic/src/console/cmd.c
===================================================================
--- kernel/generic/src/console/cmd.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/generic/src/console/cmd.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -70,4 +70,6 @@
 #include <sysinfo/sysinfo.h>
 #include <symtab.h>
+#include <synch/workqueue.h>
+#include <synch/rcu.h>
 #include <errno.h>
 
@@ -526,4 +528,22 @@
 };
 
+/* Data and methods for the 'workq' command */
+static int cmd_workq(cmd_arg_t *argv);
+static cmd_info_t workq_info = {
+	.name = "workq",
+	.description = "Show global workq information.",
+	.func = cmd_workq,
+	.argc = 0
+};
+
+/* Data and methods for the 'rcu' command */
+static int cmd_rcu(cmd_arg_t *argv);
+static cmd_info_t rcu_info = {
+	.name = "rcu",
+	.description = "Show RCU run-time statistics.",
+	.func = cmd_rcu,
+	.argc = 0
+};
+
 /* Data and methods for 'ipc' command */
 static int cmd_ipc(cmd_arg_t *argv);
@@ -589,4 +609,5 @@
 	&physmem_info,
 	&reboot_info,
+	&rcu_info,
 	&sched_info,
 	&set4_info,
@@ -599,4 +620,5 @@
 	&uptime_info,
 	&version_info,
+	&workq_info,
 	&zones_info,
 	&zone_info,
@@ -1270,4 +1292,28 @@
 {
 	sched_print_list();
+	return 1;
+}
+
+/** Handler of the 'workq' command; calls workq_global_print_info().
+ *
+ * @param argv Ignored.
+ *
+ * @return Always 1.
+ */
+int cmd_workq(cmd_arg_t *argv)
+{
+	workq_global_print_info();
+	return 1;
+}
+
+/** Handler of the 'rcu' command; calls rcu_print_stat().
+ *
+ * @param argv Ignored.
+ *
+ * @return Always 1.
+ */
+int cmd_rcu(cmd_arg_t *argv)
+{
+	rcu_print_stat();
 	return 1;
 }
Index: kernel/generic/src/console/console.c
===================================================================
--- kernel/generic/src/console/console.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/generic/src/console/console.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -53,4 +53,6 @@
 #include <str.h>
 #include <abi/kio.h>
+#include <mm/frame.h> /* SIZE2FRAMES */
+#include <mm/slab.h>  /* malloc */
 
 #define KIO_PAGES    8
Index: kernel/generic/src/console/kconsole.c
===================================================================
--- kernel/generic/src/console/kconsole.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/generic/src/console/kconsole.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -59,4 +59,5 @@
 #include <putchar.h>
 #include <str.h>
+#include <mm/slab.h>
 
 /** Simple kernel console.
Index: kernel/generic/src/cpu/cpu.c
===================================================================
--- kernel/generic/src/cpu/cpu.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/generic/src/cpu/cpu.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -50,4 +50,5 @@
 #include <sysinfo/sysinfo.h>
 #include <arch/cycle.h>
+#include <synch/rcu.h>
 
 cpu_t *cpus;
@@ -105,4 +106,5 @@
 	cpu_identify();
 	cpu_arch_init();
+	rcu_cpu_init();
 }
 
Index: kernel/generic/src/cpu/cpu_mask.c
===================================================================
--- kernel/generic/src/cpu/cpu_mask.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
+++ kernel/generic/src/cpu/cpu_mask.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup generic
+ * @{
+ */
+
+/**
+ * @file
+ * @brief CPU mask manipulation functions.
+ */
+#include <cpu/cpu_mask.h>
+#include <cpu.h>
+#include <config.h>
+
+static const size_t word_size = sizeof(unsigned int);
+static const size_t word_bit_cnt = 8 * sizeof(unsigned int);
+
+/** Size in bytes of a cpu_mask_t able to hold one bit per detected cpu. */
+size_t cpu_mask_size(void)
+{
+	/* Round the cpu count up to a whole number of mask words. */
+	return word_size * ((config.cpu_count + word_bit_cnt - 1) / word_bit_cnt);
+}
+
+/** Adds the first cpu_cnt cpus to the mask, ie sets bits 0 .. cpu_cnt - 1.
+ *
+ * Fully covered words are overwritten with all ones and a partially covered
+ * last word with just its low bits; words past that are left untouched.
+ */
+static void cpu_mask_count(cpu_mask_t *cpus, size_t cpu_cnt)
+{
+	ASSERT(NULL != cpus);
+	ASSERT(cpu_cnt <= config.cpu_count);
+	
+	size_t full_words = cpu_cnt / word_bit_cnt;
+	for (size_t word = 0; word < full_words; ++word)
+		cpus->mask[word] = -1;
+	
+	size_t remaining_bits = cpu_cnt % word_bit_cnt;
+	/* Unsigned literal: 1 << 31 on a signed int would be undefined. */
+	if (0 < remaining_bits)
+		cpus->mask[full_words] = (1U << remaining_bits) - 1;
+}
+
+/** Makes the mask contain exactly the first config.cpu_active cpus:
+ * all bits are cleared and then the low config.cpu_active bits are set.
+ */
+void cpu_mask_active(cpu_mask_t *cpus)
+{
+	cpu_mask_none(cpus);
+	cpu_mask_count(cpus, config.cpu_active);
+}
+
+/** Sets the first config.cpu_count bits of the mask, ie the bit of every cpu. */
+void cpu_mask_all(cpu_mask_t *cpus)
+{
+	cpu_mask_count(cpus, config.cpu_count);
+}
+
+/** Clears every bit of the mask. */
+void cpu_mask_none(cpu_mask_t *cpus)
+{
+	ASSERT(cpus);
+	
+	/* Number of words backing a mask of cpu_mask_size() bytes. */
+	const size_t total_words = cpu_mask_size() / word_size;
+	size_t idx = 0;
+	while (idx < total_words)
+		cpus->mask[idx++] = 0;
+}
+
+/** Adds cpu cpu_id to the mask by setting its bit. */
+void cpu_mask_set(cpu_mask_t *cpus, unsigned int cpu_id)
+{
+	/* Bit within the word that holds cpu_id. */
+	const unsigned int bit = 1U << (cpu_id % word_bit_cnt);
+	
+	cpus->mask[cpu_id / word_bit_cnt] |= bit;
+}
+
+/** Removes cpu cpu_id from the mask by clearing its bit. */
+void cpu_mask_reset(cpu_mask_t *cpus, unsigned int cpu_id)
+{
+	/* Bit within the word that holds cpu_id. */
+	const unsigned int bit = 1U << (cpu_id % word_bit_cnt);
+	
+	cpus->mask[cpu_id / word_bit_cnt] &= ~bit;
+}
+
+/** Tells whether cpu cpu_id is in the mask, ie whether its bit is set. */
+bool cpu_mask_is_set(cpu_mask_t *cpus, unsigned int cpu_id)
+{
+	/* Bit within the word that holds cpu_id. */
+	const unsigned int bit = 1U << (cpu_id % word_bit_cnt);
+	
+	return (cpus->mask[cpu_id / word_bit_cnt] & bit) != 0;
+}
+
+/** Tells whether the mask is empty, ie none of its words has a bit set. */
+bool cpu_mask_is_none(cpu_mask_t *cpus)
+{
+	const size_t total_words = cpu_mask_size() / word_size;
+	size_t idx = 0;
+	
+	/* Skip over the leading run of empty words. */
+	while (idx < total_words && cpus->mask[idx] == 0)
+		++idx;
+	/* The mask is empty iff the scan ran off its end. */
+	return idx == total_words;
+}
+
+/** @}
+ */
Index: kernel/generic/src/debug/panic.c
===================================================================
--- kernel/generic/src/debug/panic.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/generic/src/debug/panic.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -96,6 +96,6 @@
 	printf("THE=%p: ", THE);
 	if (THE != NULL) {
-		printf("pd=%" PRIun " thread=%p task=%p cpu=%p as=%p"
-		    " magic=%#" PRIx32 "\n", THE->preemption_disabled,
+		printf("pe=%" PRIun " thread=%p task=%p cpu=%p as=%p"
+		    " magic=%#" PRIx32 "\n", THE->preemption,
 		    THE->thread, THE->task, THE->cpu, THE->as, THE->magic);
 		
Index: kernel/generic/src/interrupt/interrupt.c
===================================================================
--- kernel/generic/src/interrupt/interrupt.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/generic/src/interrupt/interrupt.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -112,13 +112,11 @@
 	}
 	
-	/* Account CPU usage if it has waked up from sleep */
-	if (CPU) {
+	/* Account CPU usage if it woke up from sleep */
+	if (CPU && CPU->idle) {
 		irq_spinlock_lock(&CPU->lock, false);
-		if (CPU->idle) {
-			uint64_t now = get_cycle();
-			CPU->idle_cycles += now - CPU->last_cycle;
-			CPU->last_cycle = now;
-			CPU->idle = false;
-		}
+		uint64_t now = get_cycle();
+		CPU->idle_cycles += now - CPU->last_cycle;
+		CPU->last_cycle = now;
+		CPU->idle = false;
 		irq_spinlock_unlock(&CPU->lock, false);
 	}
Index: kernel/generic/src/ipc/kbox.c
===================================================================
--- kernel/generic/src/ipc/kbox.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/generic/src/ipc/kbox.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -44,4 +44,5 @@
 #include <ipc/kbox.h>
 #include <print.h>
+#include <proc/thread.h>
 
 void ipc_kbox_cleanup(void)
Index: kernel/generic/src/lib/str.c
===================================================================
--- kernel/generic/src/lib/str.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/generic/src/lib/str.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -111,4 +111,5 @@
 #include <debug.h>
 #include <macros.h>
+#include <mm/slab.h>
 
 /** Check the condition if wchar_t is signed */
@@ -567,4 +568,5 @@
 	/* There must be space for a null terminator in the buffer. */
 	ASSERT(size > 0);
+	ASSERT(src != NULL);
 	
 	size_t src_off = 0;
Index: kernel/generic/src/log/log.c
===================================================================
--- kernel/generic/src/log/log.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/generic/src/log/log.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -53,4 +53,5 @@
 #include <console/console.h>
 #include <abi/log.h>
+#include <mm/slab.h>
 
 #define LOG_PAGES    8
Index: kernel/generic/src/main/kinit.c
===================================================================
--- kernel/generic/src/main/kinit.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/generic/src/main/kinit.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -79,4 +79,6 @@
 #include <synch/waitq.h>
 #include <synch/spinlock.h>
+#include <synch/workqueue.h>
+#include <synch/rcu.h>
 
 #define ALIVE_CHARS  4
@@ -105,6 +107,14 @@
 	 */
 	thread_detach(THREAD);
-	
+
 	interrupts_disable();
+	
+	/* Start processing RCU callbacks. RCU is fully functional afterwards. */
+	rcu_kinit_init();
+	
+	/*
+	 * Start processing work queue items. Some may have been queued during boot.
+	 */
+	workq_global_worker_init();
 	
 #ifdef CONFIG_SMP
Index: kernel/generic/src/main/main.c
===================================================================
--- kernel/generic/src/main/main.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/generic/src/main/main.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -76,4 +76,6 @@
 #include <synch/waitq.h>
 #include <synch/futex.h>
+#include <synch/workqueue.h>
+#include <smp/smp_call.h>
 #include <arch/arch.h>
 #include <arch.h>
@@ -263,6 +265,9 @@
 	
 	cpu_init();
-	
 	calibrate_delay_loop();
+	arch_post_cpu_init();
+
+	smp_call_init();
+	workq_global_init();
 	clock_counter_init();
 	timeout_init();
@@ -367,4 +372,6 @@
 void main_ap_separated_stack(void)
 {
+	smp_call_init();
+	
 	/*
 	 * Configure timeouts for this cpu.
Index: kernel/generic/src/main/shutdown.c
===================================================================
--- kernel/generic/src/main/shutdown.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/generic/src/main/shutdown.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -37,4 +37,5 @@
 
 #include <arch.h>
+#include <proc/task.h>
 #include <func.h>
 #include <print.h>
Index: kernel/generic/src/mm/frame.c
===================================================================
--- kernel/generic/src/mm/frame.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/generic/src/mm/frame.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -61,4 +61,5 @@
 #include <config.h>
 #include <str.h>
+#include <proc/thread.h> /* THREAD */
 
 zones_t zones;
Index: kernel/generic/src/mm/km.c
===================================================================
--- kernel/generic/src/mm/km.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/generic/src/mm/km.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -49,4 +49,5 @@
 #include <macros.h>
 #include <bitops.h>
+#include <proc/thread.h>
 
 static ra_arena_t *km_ni_arena;
Index: kernel/generic/src/mm/slab.c
===================================================================
--- kernel/generic/src/mm/slab.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/generic/src/mm/slab.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -114,4 +114,5 @@
 #include <bitops.h>
 #include <macros.h>
+#include <cpu.h>
 
 IRQ_SPINLOCK_STATIC_INITIALIZE(slab_cache_lock);
Index: kernel/generic/src/preempt/preemption.c
===================================================================
--- kernel/generic/src/preempt/preemption.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/generic/src/preempt/preemption.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -37,23 +37,5 @@
 
 #include <preemption.h>
-#include <arch.h>
-#include <arch/asm.h>
-#include <arch/barrier.h>
-#include <debug.h>
 
-/** Increment preemption disabled counter. */
-void preemption_disable(void)
-{
-	THE->preemption_disabled++;
-	memory_barrier();
-}
-
-/** Decrement preemption disabled counter. */
-void preemption_enable(void)
-{
-	ASSERT(PREEMPTION_DISABLED);
-	memory_barrier();
-	THE->preemption_disabled--;
-}
 
 /** @}
Index: kernel/generic/src/proc/scheduler.c
===================================================================
--- kernel/generic/src/proc/scheduler.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/generic/src/proc/scheduler.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -52,4 +52,6 @@
 #include <atomic.h>
 #include <synch/spinlock.h>
+#include <synch/workqueue.h>
+#include <synch/rcu.h>
 #include <config.h>
 #include <context.h>
@@ -64,4 +66,5 @@
 #include <debug.h>
 #include <stacktrace.h>
+#include <cpu.h>
 
 static void scheduler_separated_stack(void);
@@ -87,4 +90,5 @@
 {
 	before_thread_runs_arch();
+	rcu_before_thread_runs();
 	
 #ifdef CONFIG_FPU_LAZY
@@ -127,4 +131,6 @@
 static void after_thread_ran(void)
 {
+	workq_after_thread_ran();
+	rcu_after_thread_ran();
 	after_thread_ran_arch();
 }
@@ -219,4 +225,6 @@
 		goto loop;
 	}
+
+	ASSERT(!CPU->idle);
 	
 	unsigned int i;
@@ -398,4 +406,5 @@
 	ASSERT((!THREAD) || (irq_spinlock_locked(&THREAD->lock)));
 	ASSERT(CPU != NULL);
+	ASSERT(interrupts_disabled());
 	
 	/*
@@ -421,4 +430,5 @@
 		
 		case Exiting:
+			rcu_thread_exiting();
 repeat:
 			if (THREAD->detached) {
Index: kernel/generic/src/proc/task.c
===================================================================
--- kernel/generic/src/proc/task.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/generic/src/proc/task.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -41,4 +41,5 @@
 #include <mm/slab.h>
 #include <atomic.h>
+#include <synch/futex.h>
 #include <synch/spinlock.h>
 #include <synch/waitq.h>
@@ -163,5 +164,4 @@
 	
 	irq_spinlock_initialize(&task->lock, "task_t_lock");
-	mutex_initialize(&task->futexes_lock, MUTEX_PASSIVE);
 	
 	list_initialize(&task->threads);
@@ -175,5 +175,5 @@
 	spinlock_initialize(&task->active_calls_lock, "active_calls_lock");
 	list_initialize(&task->active_calls);
-	
+		
 #ifdef CONFIG_UDEBUG
 	/* Init kbox stuff */
@@ -231,5 +231,5 @@
 		(void) ipc_phone_connect(&task->phones[0], ipc_phone_0);
 	
-	btree_create(&task->futexes);
+	futex_task_init(task);
 	
 	/*
@@ -272,5 +272,5 @@
 	 * Free up dynamically allocated state.
 	 */
-	btree_destroy(&task->futexes);
+	futex_task_deinit(task);
 	
 	/*
Index: kernel/generic/src/proc/the.c
===================================================================
--- kernel/generic/src/proc/the.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/generic/src/proc/the.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -43,4 +43,5 @@
 
 #include <arch.h>
+#include <debug.h>
 
 /** Initialize THE structure
@@ -53,5 +54,5 @@
 void the_initialize(the_t *the)
 {
-	the->preemption_disabled = 0;
+	the->preemption = 0;
 	the->cpu = NULL;
 	the->thread = NULL;
@@ -59,4 +60,7 @@
 	the->as = NULL;
 	the->magic = MAGIC;
+#ifdef RCU_PREEMPT_A	
+	the->rcu_nesting = 0;
+#endif
 }
 
Index: kernel/generic/src/proc/thread.c
===================================================================
--- kernel/generic/src/proc/thread.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/generic/src/proc/thread.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -46,4 +46,6 @@
 #include <synch/spinlock.h>
 #include <synch/waitq.h>
+#include <synch/workqueue.h>
+#include <synch/rcu.h>
 #include <cpu.h>
 #include <str.h>
@@ -263,4 +265,11 @@
 }
 
+/** Hook run by thread_ready() before it readies thread; thread->lock is held. */
+static void before_thread_is_ready(thread_t *thread)
+{
+	ASSERT(irq_spinlock_locked(&thread->lock));
+	workq_before_thread_is_ready(thread);
+}
+
 /** Make thread ready
  *
@@ -275,13 +284,20 @@
 	
 	ASSERT(thread->state != Ready);
+
+	before_thread_is_ready(thread);
 	
 	int i = (thread->priority < RQ_COUNT - 1) ?
 	    ++thread->priority : thread->priority;
-	
-	cpu_t *cpu;
-	if (thread->wired || thread->nomigrate || thread->fpu_context_engaged) {
-		ASSERT(thread->cpu != NULL);
-		cpu = thread->cpu;
-	} else
+
+	/* Check that thread->cpu is set whenever it needs to be. */
+	ASSERT(thread->cpu != NULL || 
+		(!thread->wired && !thread->nomigrate && !thread->fpu_context_engaged));
+
+	/* 
+	 * Prefer to run on the same cpu as the last time. Used by wired 
+	 * threads as well as threads with disabled migration.
+	 */
+	cpu_t *cpu = thread->cpu;
+	if (cpu == NULL) 
 		cpu = CPU;
 	
@@ -377,4 +393,6 @@
 	thread->task = task;
 	
+	thread->workq = NULL;
+	
 	thread->fpu_context_exists = false;
 	thread->fpu_context_engaged = false;
@@ -391,4 +409,6 @@
 	/* Might depend on previous initialization */
 	thread_create_arch(thread);
+	
+	rcu_thread_init(thread);
 	
 	if ((flags & THREAD_FLAG_NOATTACH) != THREAD_FLAG_NOATTACH)
@@ -501,5 +521,5 @@
 			 */
 			ipc_cleanup();
-			futex_cleanup();
+			futex_task_cleanup();
 			LOG("Cleanup of task %" PRIu64" completed.", TASK->taskid);
 		}
@@ -521,4 +541,52 @@
 	/* Not reached */
 	while (true);
+}
+
+/** Interrupts an existing thread so that it may exit as soon as possible.
+ * 
+ * Threads that are blocked waiting for a synchronization primitive 
+ * are woken up with a return code of ESYNCH_INTERRUPTED if the
+ * blocking call was interruptible. See waitq_sleep_timeout().
+ * 
+ * The caller must guarantee the thread object is valid during the entire
+ * function, eg by holding the threads_lock lock.
+ * 
+ * Interrupted threads automatically exit when returning back to user space.
+ * 
+ * @param thread A valid thread object. The caller must guarantee it
+ *               will remain valid until thread_interrupt() exits.
+ */
+void thread_interrupt(thread_t *thread)
+{
+	ASSERT(thread != NULL);
+	
+	irq_spinlock_lock(&thread->lock, true);
+	/* Set the flag and sample the state under the thread's lock. */
+	thread->interrupted = true;
+	bool sleeping = (thread->state == Sleeping);
+	
+	irq_spinlock_unlock(&thread->lock, true);
+	/* The thread's lock is released before the sleeper is woken up. */
+	if (sleeping)
+		waitq_interrupt_sleep(thread);
+}
+
+/** Returns true if the thread was interrupted.
+ * 
+ * @param thread A valid thread object. The caller must guarantee it will
+ *               be alive during the entire call.
+ * @return The thread's interrupted flag, ie the flag thread_interrupt() sets.
+ */
+bool thread_interrupted(thread_t *thread)
+{
+	ASSERT(thread != NULL);
+	
+	bool interrupted;
+	/* Read the flag under the thread's lock. */
+	irq_spinlock_lock(&thread->lock, true);
+	interrupted = thread->interrupted;
+	irq_spinlock_unlock(&thread->lock, true);
+	
+	return interrupted;
 }
 
Index: kernel/generic/src/smp/smp_call.c
===================================================================
--- kernel/generic/src/smp/smp_call.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
+++ kernel/generic/src/smp/smp_call.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -0,0 +1,278 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup generic
+ * @{
+ */
+
+/**
+ * @file
+ * @brief Facility to invoke functions on other cpus via IPIs.
+ */
+
+#include <smp/smp_call.h>
+#include <arch/barrier.h>
+#include <arch/asm.h>  /* interrupt_disable */
+#include <arch.h>
+#include <config.h>
+#include <preemption.h>
+#include <debug.h>
+#include <cpu.h>
+
+static void call_start(smp_call_t *call_info, smp_call_func_t func, void *arg);
+static void call_done(smp_call_t *call_info);
+static void call_wait(smp_call_t *call_info);
+
+
+/** Sets up the local cpu's smp call lock and its list of pending calls. */
+void smp_call_init(void)
+{
+	ASSERT(CPU);
+	ASSERT(PREEMPTION_DISABLED || interrupts_disabled());
+	
+	spinlock_initialize(&CPU->smp_calls_lock, "cpu[].smp_calls_lock");
+	list_initialize(&CPU->smp_pending_calls);
+}
+
+/** Invokes a function on a specific cpu and waits for it to complete.
+ * 
+ * Calls @a func on the CPU denoted by its logical id @a cpu_id.
+ * The function will execute with interrupts disabled. It should
+ * be a quick and simple function and must never block.
+ * 
+ * If @a cpu_id is the local CPU, the function will be invoked
+ * directly.
+ * 
+ * All memory accesses made prior to smp_call() will be visible
+ * to @a func on cpu @a cpu_id. Similarly, any changes @a func
+ * makes on cpu @a cpu_id will be visible on this cpu once
+ * smp_call() returns.
+ * 
+ * Invoking @a func on the destination cpu acts as a memory barrier
+ * on that cpu.
+ * 
+ * @param cpu_id Destination CPU's logical id (eg CPU->id).
+ * @param func Function to call.
+ * @param arg Argument to pass to the user supplied function @a func.
+ */
+void smp_call(unsigned int cpu_id, smp_call_func_t func, void *arg)
+{
+	smp_call_t call_info;
+	smp_call_async(cpu_id, func, arg, &call_info);
+	smp_call_wait(&call_info);
+}
+
+/** Invokes a function on a specific cpu asynchronously.
+ * 
+ * Calls @a func on the CPU denoted by its logical id @a cpu_id.
+ * The function will execute with interrupts disabled. It should
+ * be a quick and simple function and must never block.
+ * 
+ * Pass @a call_info to smp_call_wait() in order to wait for
+ * @a func to complete.
+ * 
+ * @a call_info must be valid until/after @a func returns. Use
+ * smp_call_wait() to wait until it is safe to free @a call_info.
+ * 
+ * If @a cpu_id is the local CPU, @a func is invoked directly. If @a cpu_id
+ * is invalid or denotes an inactive cpu, the call is discarded: it is
+ * marked completed right away and @a func is never run.
+ * 
+ * All memory accesses of the caller prior to smp_call_async()
+ * will be made visible to @a func on the other cpu. Similarly,
+ * any changes @a func makes on cpu @a cpu_id will be visible
+ * to this cpu when smp_call_wait() returns.
+ * 
+ * Invoking @a func on the destination cpu acts as a memory barrier
+ * on that cpu.
+ * 
+ * Interrupts must be enabled. Otherwise you run the risk
+ * of a deadlock.
+ * 
+ * @param cpu_id Destination CPU's logical id (eg CPU->id).
+ * @param func Function to call.
+ * @param arg Argument to pass to the user supplied function @a func.
+ * @param call_info Use it to wait for the function to complete. Must
+ *          be valid until the function completes.
+ */
+void smp_call_async(unsigned int cpu_id, smp_call_func_t func, void *arg, 
+	smp_call_t *call_info)
+{
+	/* 
+	 * Interrupts must not be disabled or you run the risk of a deadlock 
+	 * if both the destination and source cpus try to send an IPI to each
+	 * other with interrupts disabled. Because the interrupts are disabled 
+	 * the IPIs cannot be delivered and both cpus will forever busy wait 
+	 * for an acknowledgment of the IPI from the other cpu.
+	 */
+	ASSERT(!interrupts_disabled());
+	ASSERT(call_info != NULL);
+	
+	/* Discard invalid calls: mark them done without ever invoking func. */
+	if (config.cpu_count <= cpu_id || !cpus[cpu_id].active) {
+		call_start(call_info, func, arg);
+		call_done(call_info);
+		return;
+	}
+	
+	/* Protect cpu->id against migration. */
+	preemption_disable();
+
+	call_start(call_info, func, arg);
+	
+	if (cpu_id != CPU->id) {
+#ifdef CONFIG_SMP
+		spinlock_lock(&cpus[cpu_id].smp_calls_lock);
+		list_append(&call_info->calls_link, &cpus[cpu_id].smp_pending_calls);
+		spinlock_unlock(&cpus[cpu_id].smp_calls_lock);
+
+		/*
+		 * If a platform supports SMP it must implement arch_smp_call_ipi().
+		 * It should issue an IPI on cpu_id and invoke smp_call_ipi_recv()
+		 * on cpu_id in turn. 
+		 * 
+		 * Do not implement as just an empty dummy function. Instead
+		 * consider providing a full implementation or at least a version 
+		 * that panics if invoked. Note that smp_call_async() never
+		 * calls arch_smp_call_ipi() on uniprocessors even if CONFIG_SMP.
+		 */
+		arch_smp_call_ipi(cpu_id);
+#endif
+	} else {
+		/* Invoke local smp calls in place, with interrupts disabled. */
+		ipl_t ipl = interrupts_disable();
+		func(arg);
+		interrupts_restore(ipl);
+		
+		call_done(call_info);
+	}
+	
+	preemption_enable();
+}
+
+/** Waits for a function invoked asynchronously on another CPU to complete.
+ * 
+ * Does not sleep but rather spins; see call_wait().
+ * 
+ * Example usage:
+ * @code
+ * void hello(void *p) {
+ *     puts((char*)p);
+ * }
+ * 
+ * smp_call_t call_info;
+ * smp_call_async(cpus[2].id, hello, "hi!\n", &call_info);
+ * // Do some work. In the meantime, hello() is executed on cpu2.
+ * smp_call_wait(&call_info);
+ * @endcode
+ * 
+ * @param call_info Initialized by smp_call_async().
+ */
+void smp_call_wait(smp_call_t *call_info)
+{
+	call_wait(call_info);
+}
+
+#ifdef CONFIG_SMP
+
+/** Generic (architecture independent) handler of the smp call IPI.
+ * 
+ * Must run with interrupts disabled. A spurious invocation merely finds
+ * no pending calls and returns.
+ */
+void smp_call_ipi_recv(void)
+{
+	ASSERT(interrupts_disabled());
+	ASSERT(CPU);
+	
+	list_t pending;
+	list_initialize(&pending);
+	
+	/* 
+	 * Detach all pending calls while holding the lock. Taking the lock
+	 * also makes the changes of the cpus that queued the calls visible here.
+	 */
+	spinlock_lock(&CPU->smp_calls_lock);
+	list_concat(&pending, &CPU->smp_pending_calls);
+	spinlock_unlock(&CPU->smp_calls_lock);
+
+	/* Pop the calls one at a time; unlink each before announcing it done. */
+	while (!list_empty(&pending)) {
+		link_t *first = pending.head.next;
+		smp_call_t *call_info = list_get_instance(first, smp_call_t, calls_link);
+		
+		list_remove(first);
+		call_info->func(call_info->arg);
+		call_done(call_info);
+	}
+}
+
+#endif /* CONFIG_SMP */
+
+/** Fills in call_info and marks it pending before it is handed over. */
+static void call_start(smp_call_t *call_info, smp_call_func_t func, void *arg)
+{
+	link_initialize(&call_info->calls_link);
+	call_info->func = func;
+	call_info->arg = arg;
+	
+	/*
+	 * We can't use standard spinlocks here because we want to lock
+	 * the structure on one cpu and unlock it on another (without
+	 * messing up the preemption count).
+	 */
+	atomic_set(&call_info->pending, 1);
+	/* Let initialization complete before continuing. */
+	memory_barrier();
+}
+
+/** Announces that the call has completed by clearing its pending flag. */
+static void call_done(smp_call_t *call_info)
+{
+	/*
+	 * Keep the called function's accesses ahead of the completion flag.
+	 */
+	memory_barrier();
+	atomic_set(&call_info->pending, 0);
+}
+
+/** Spins until the pending flag of call_info is cleared. */
+static void call_wait(smp_call_t *call_info)
+{
+	do {
+		/* 
+		 * Order the accesses that follow call_wait() after the called
+		 * function's completion; also speeds up loading of the flag.
+		 */
+		memory_barrier();
+	} while (atomic_get(&call_info->pending));
+}
+
+
+/** @}
+ */
Index: kernel/generic/src/synch/condvar.c
===================================================================
--- kernel/generic/src/synch/condvar.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/generic/src/synch/condvar.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -38,4 +38,5 @@
 #include <synch/condvar.h>
 #include <synch/mutex.h>
+#include <synch/spinlock.h>
 #include <synch/waitq.h>
 #include <arch.h>
@@ -90,4 +91,5 @@
 
 	ipl = waitq_sleep_prepare(&cv->wq);
+	/* Unlock only after the waitq is locked so we don't miss a wakeup. */
 	mutex_unlock(mtx);
 
@@ -95,10 +97,93 @@
 	rc = waitq_sleep_timeout_unsafe(&cv->wq, usec, flags);
 
+	waitq_sleep_finish(&cv->wq, rc, ipl);
+	/* Lock only after releasing the waitq to avoid a possible deadlock. */
 	mutex_lock(mtx);
-	waitq_sleep_finish(&cv->wq, rc, ipl);
 
 	return rc;
 }
 
+/** Wait for the condition to become true with a locked spinlock.
+ * 
+ * The function is not aware of irq_spinlock. Therefore do not even
+ * try passing irq_spinlock_t to it. Use _condvar_wait_timeout_irq_spinlock()
+ * instead.
+ *
+ * @param cv		Condition variable.
+ * @param lock		Locked spinlock; unlocked while waiting, locked again on return.
+ * @param usec		Timeout value in microseconds.
+ * @param flags		Select mode of operation.
+ *
+ * For exact description of meaning of possible combinations of usec and flags,
+ * see comment for waitq_sleep_timeout().  Note that when
+ * SYNCH_FLAGS_NON_BLOCKING is specified here, ESYNCH_WOULD_BLOCK is always
+ * returned.
+ *
+ * @return See comment for waitq_sleep_timeout().
+ */
+int _condvar_wait_timeout_spinlock_impl(condvar_t *cv, spinlock_t *lock, 
+	uint32_t usec, int flags)
+{
+	int rc;
+	ipl_t ipl;
+	
+	ipl = waitq_sleep_prepare(&cv->wq);
+
+	/* Unlock only after the waitq is locked so we don't miss a wakeup. */
+	spinlock_unlock(lock);
+
+	cv->wq.missed_wakeups = 0;	/* Enforce blocking. */
+	rc = waitq_sleep_timeout_unsafe(&cv->wq, usec, flags);
+
+	waitq_sleep_finish(&cv->wq, rc, ipl);
+	/* Lock only after releasing the waitq to avoid a possible deadlock. */
+	spinlock_lock(lock);
+	
+	return rc;
+}
+
+/** Wait for the condition to become true with a locked irq spinlock.
+ * 
+ * @param cv		Condition variable.
+ * @param irq_lock	Locked irq spinlock.
+ * @param usec		Timeout value in microseconds.
+ * @param flags		Select mode of operation.
+ *
+ * For exact description of meaning of possible combinations of usec and flags,
+ * see comment for waitq_sleep_timeout().  Note that when
+ * SYNCH_FLAGS_NON_BLOCKING is specified here, ESYNCH_WOULD_BLOCK is always
+ * returned.
+ *
+ * @return See comment for waitq_sleep_timeout().
+ */
+int _condvar_wait_timeout_irq_spinlock(condvar_t *cv, irq_spinlock_t *irq_lock, 
+	uint32_t usec, int flags)
+{
+	int rc;
+	/* Save spinlock's state so we can restore it correctly later on. */
+	ipl_t ipl = irq_lock->ipl;
+	bool guard = irq_lock->guard;
+	
+	irq_lock->guard = false;
+	
+	/* 
+	 * waitq_sleep_prepare() restores interrupts to the current state, 
+	 * i.e. disabled. Therefore, interrupts will remain disabled while 
+	 * it spins waiting for a pending timeout handler to complete. 
+	 * Although it spins with interrupts disabled there can only
+	 * be a pending timeout if we failed to cancel an imminent
+	 * timeout (on another cpu) during a wakeup. As a result the 
+	 * timeout handler is guaranteed to run (it is most likely already 
+	 * running) and there is no danger of a deadlock.
+	 */
+	rc = _condvar_wait_timeout_spinlock(cv, &irq_lock->lock, usec, flags);
+	
+	irq_lock->guard = guard;
+	irq_lock->ipl = ipl;
+	
+	return rc;
+}
+
+
 /** @}
  */
Index: kernel/generic/src/synch/futex.c
===================================================================
--- kernel/generic/src/synch/futex.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/generic/src/synch/futex.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -1,4 +1,5 @@
 /*
  * Copyright (c) 2006 Jakub Jermar
+ * Copyright (c) 2012 Adam Hraska
  * All rights reserved.
  *
@@ -34,4 +35,28 @@
  * @file
  * @brief	Kernel backend for futexes.
+ * 
+ * Kernel futex objects are stored in a global hash table futex_ht 
+ * where the physical address of the futex variable (futex_t.paddr)
+ * is used as the lookup key. As a result multiple address spaces 
+ * may share the same futex variable. 
+ * 
+ * A kernel futex object is created the first time a task accesses
+ * the futex (having a futex variable at a physical address not 
+ * encountered before). Futex object's lifetime is governed by
+ * a reference count that represents the number of all the different
+ * user space virtual addresses from all tasks that map to the
+ * physical address of the futex variable. A futex object is freed
+ * when the last task having accessed the futex exits.
+ * 
+ * Each task keeps track of the futex objects it accessed in a list
+ * of pointers (futex_ptr_t, task->futexes->list) to the different futex 
+ * objects.
+ * 
+ * To speed up translation of futex variables' virtual addresses
+ * to their physical addresses, futex pointers accessed by the
+ * task are furthermore stored in a concurrent hash table (CHT,
+ * task->futexes->ht). A single lookup without locks or accesses
+ * to the page table translates a futex variable's virtual address 
+ * into its futex kernel object. 
  */
 
@@ -39,4 +64,5 @@
 #include <synch/mutex.h>
 #include <synch/spinlock.h>
+#include <synch/rcu.h>
 #include <mm/frame.h>
 #include <mm/page.h>
@@ -46,4 +72,5 @@
 #include <genarch/mm/page_pt.h>
 #include <genarch/mm/page_ht.h>
+#include <adt/cht.h>
 #include <adt/hash_table.h>
 #include <adt/list.h>
@@ -52,26 +79,55 @@
 #include <panic.h>
 #include <errno.h>
-#include <print.h>
 
 #define FUTEX_HT_SIZE	1024	/* keep it a power of 2 */
 
-static void futex_initialize(futex_t *futex);
-
-static futex_t *futex_find(uintptr_t paddr);
+/** Task specific pointer to a global kernel futex object. */
+typedef struct futex_ptr {
+	/** Link in the task's futex cache CHT (task->futexes->ht). */
+	cht_link_t cht_link;
+	/** Link in the list of all futex pointers used by the task (task->futexes->list). */
+	link_t all_link;
+	/** Kernel futex object. */
+	futex_t *futex;
+	/** User space virtual address of the futex variable in the task (the CHT key). */
+	uintptr_t uaddr;
+} futex_ptr_t;
+
+
+static void destroy_task_cache(work_t *work);
+
+static void futex_initialize(futex_t *futex, uintptr_t paddr);
+static void futex_add_ref(futex_t *futex);
+static void futex_release_ref(futex_t *futex);
+static void futex_release_ref_locked(futex_t *futex);
+
+static futex_t *get_futex(uintptr_t uaddr);
+static futex_t *find_cached_futex(uintptr_t uaddr);
+static futex_t *get_and_cache_futex(uintptr_t phys_addr, uintptr_t uaddr);
+static bool find_futex_paddr(uintptr_t uaddr, uintptr_t *phys_addr);
+
 static size_t futex_ht_hash(sysarg_t *key);
 static bool futex_ht_compare(sysarg_t *key, size_t keys, link_t *item);
 static void futex_ht_remove_callback(link_t *item);
 
-/**
- * Mutex protecting global futex hash table.
- * It is also used to serialize access to all futex_t structures.
- * Must be acquired before the task futex B+tree lock.
- */
-static mutex_t futex_ht_lock;
-
-/** Futex hash table. */
+static size_t task_fut_ht_hash(const cht_link_t *link);
+static size_t task_fut_ht_key_hash(void *key);
+static bool task_fut_ht_equal(const cht_link_t *item1, const cht_link_t *item2);
+static bool task_fut_ht_key_equal(void *key, const cht_link_t *item);
+
+
+/** Spinlock protecting the global futex hash table.
+ * 
+ * Acquire task specific TASK->futexes->list_lock before this spinlock.
+ */
+SPINLOCK_STATIC_INITIALIZE_NAME(futex_ht_lock, "futex-ht-lock");
+
+/** Global kernel futex hash table. Lock futex_ht_lock before accessing.
+ * 
+ * Physical address of the futex variable is the lookup key.
+ */
 static hash_table_t futex_ht;
 
-/** Futex hash table operations. */
+/** Global kernel futex hash table operations. */
 static hash_table_operations_t futex_ht_ops = {
 	.hash = futex_ht_hash,
@@ -80,21 +136,251 @@
 };
 
+/** Task futex cache CHT operations. */
+static cht_ops_t task_futex_ht_ops = {
+	.hash = task_fut_ht_hash,
+	.key_hash = task_fut_ht_key_hash,
+	.equal = task_fut_ht_equal,
+	.key_equal = task_fut_ht_key_equal,
+	.remove_callback = NULL
+};
+
 /** Initialize futex subsystem. */
 void futex_init(void)
 {
-	mutex_initialize(&futex_ht_lock, MUTEX_PASSIVE);
 	hash_table_create(&futex_ht, FUTEX_HT_SIZE, 1, &futex_ht_ops);
 }
 
-/** Initialize kernel futex structure.
- *
- * @param futex		Kernel futex structure.
- */
-void futex_initialize(futex_t *futex)
+/** Allocates the new task's futex cache and initializes its CHT, futex pointer list and list lock. */
+void futex_task_init(struct task *task)
+{
+	task->futexes = malloc(sizeof(struct futex_cache), 0);
+	
+	cht_create(&task->futexes->ht, 0, 0, 0, true, &task_futex_ht_ops);
+	
+	list_initialize(&task->futexes->list);
+	spinlock_initialize(&task->futexes->list_lock, "futex-list-lock");
+}
+
+/** Destroys the futex structures for the dying task (via the global work queue if interrupts are disabled). */
+void futex_task_deinit(task_t *task)
+{
+	/* Interrupts are disabled so we must not block (cannot run cht_destroy). */
+	if (interrupts_disabled()) {
+		/* Invoke the blocking cht_destroy in the background. */
+		workq_global_enqueue_noblock(&task->futexes->destroy_work, 
+			destroy_task_cache);
+	} else {
+		/* We can block. Invoke cht_destroy in this thread. */
+		destroy_task_cache(&task->futexes->destroy_work);
+	}
+}
+
+/** Deallocates a task's CHT futex cache (must already be empty). @a work is the cache's destroy_work. */
+static void destroy_task_cache(work_t *work)
+{
+	struct futex_cache *cache = 
+		member_to_inst(work, struct futex_cache, destroy_work);
+	
+	/* 
+	 * Destroy the cache before manually freeing items of the cache in case
+	 * table resize is in progress.
+	 */
+	cht_destroy_unsafe(&cache->ht);
+	
+	/* Manually free the futex_ptr cache items linked in the cache's list. */
+	list_foreach_safe(cache->list, cur_link, next_link) {
+		futex_ptr_t *fut_ptr = member_to_inst(cur_link, futex_ptr_t, all_link);
+
+		list_remove(cur_link);
+		free(fut_ptr);
+	}
+	
+	free(cache);
+}
+
+/** Releases the current task's reference to every futex in its futex pointer list. */
+void futex_task_cleanup(void)
+{
+	struct futex_cache *futexes = TASK->futexes;
+	
+	/* All threads of this task have terminated. This is the last thread. */
+	spinlock_lock(&futexes->list_lock);
+	
+	list_foreach_safe(futexes->list, cur_link, next_link) {
+		futex_ptr_t *fut_ptr = member_to_inst(cur_link, futex_ptr_t, all_link);
+
+		/*
+		 * The function is free to free the futex. All other threads of this
+		 * task have already terminated, so they have also definitely
+		 * exited their CHT futex cache protecting rcu reader sections.
+		 * Moreover futex_release_ref_locked() only frees the futex if this
+		 * is the last task referencing the futex. Therefore, only threads
+		 * of this task may have referenced the futex if it is to be freed.
+		 */
+		futex_release_ref_locked(fut_ptr->futex);
+	}
+	
+	spinlock_unlock(&futexes->list_lock);
+}
+
+
+/** Initialize the kernel futex structure.
+ *
+ * @param futex	Kernel futex structure.
+ * @param paddr Physical address of the futex variable.
+ */
+static void futex_initialize(futex_t *futex, uintptr_t paddr)
 {
 	waitq_initialize(&futex->wq);
 	link_initialize(&futex->ht_link);
-	futex->paddr = 0;
+	futex->paddr = paddr;
 	futex->refcount = 1;
+}
+
+/** Increments the counter of tasks referencing the futex. Caller must hold futex_ht_lock. */
+static void futex_add_ref(futex_t *futex)
+{
+	ASSERT(spinlock_locked(&futex_ht_lock));
+	ASSERT(0 < futex->refcount);
+	++futex->refcount;
+}
+
+/** Decrements the futex's reference count with futex_ht_lock held; removes it from futex_ht at zero (may free it). */
+static void futex_release_ref(futex_t *futex)
+{
+	ASSERT(spinlock_locked(&futex_ht_lock));
+	ASSERT(0 < futex->refcount);
+	
+	--futex->refcount;
+	
+	if (0 == futex->refcount) {
+		hash_table_remove(&futex_ht, &futex->paddr, 1);
+	}
+}
+
+/** Acquires futex_ht_lock and releases one reference to the futex. May free the futex. */
+static void futex_release_ref_locked(futex_t *futex)
+{
+	spinlock_lock(&futex_ht_lock);
+	futex_release_ref(futex);
+	spinlock_unlock(&futex_ht_lock);
+}
+
+/** Returns a futex for the virtual address @a uaddr (or creates one); NULL if its physical address is not found. */
+static futex_t *get_futex(uintptr_t uaddr)
+{
+	futex_t *futex = find_cached_futex(uaddr);
+	
+	if (futex) 
+		return futex;
+
+	uintptr_t paddr;
+
+	if (!find_futex_paddr(uaddr, &paddr)) {
+		return NULL;
+	}
+
+	return get_and_cache_futex(paddr, uaddr);
+}
+
+
+/** Finds the physical address of the futex variable at @a uaddr; false if AS has no valid, present mapping for it. */
+static bool find_futex_paddr(uintptr_t uaddr, uintptr_t *paddr)
+{
+	page_table_lock(AS, false);
+	spinlock_lock(&futex_ht_lock);
+
+	bool found = false;
+	pte_t *t = page_mapping_find(AS, ALIGN_DOWN(uaddr, PAGE_SIZE), true);
+	
+	if (t && PTE_VALID(t) && PTE_PRESENT(t)) {
+		found = true;
+		*paddr = PTE_GET_FRAME(t) + (uaddr - ALIGN_DOWN(uaddr, PAGE_SIZE));
+	}
+	
+	spinlock_unlock(&futex_ht_lock);
+	page_table_unlock(AS, false);
+	
+	return found;
+}
+
+/** Returns the futex cached in this task with the virtual address uaddr, or NULL if there is none. */
+static futex_t *find_cached_futex(uintptr_t uaddr)
+{
+	cht_read_lock();
+	
+	futex_t *futex;
+	cht_link_t *futex_ptr_link = cht_find_lazy(&TASK->futexes->ht, &uaddr);
+
+	if (futex_ptr_link) {
+		futex_ptr_t *futex_ptr 
+			= member_to_inst(futex_ptr_link, futex_ptr_t, cht_link);
+		
+		futex = futex_ptr->futex;
+	} else {
+		futex = NULL;
+	}
+	
+	cht_read_unlock();
+	
+	return futex;
+}
+
+
+/** 
+ * Returns a kernel futex for the physical address @a phys_addr and caches 
+ * it in this task under the virtual address @a uaddr (if not already cached).
+ */
+static futex_t *get_and_cache_futex(uintptr_t phys_addr, uintptr_t uaddr)
+{
+	futex_t *futex = malloc(sizeof(futex_t), 0);
+	
+	/* 
+	 * Find the futex object in the global futex table (or insert the 
+	 * one allocated above if it is not present).
+	 */
+	spinlock_lock(&futex_ht_lock);
+	
+	link_t *fut_link = hash_table_find(&futex_ht, &phys_addr);
+	
+	if (fut_link) {
+		free(futex);
+		futex = member_to_inst(fut_link, futex_t, ht_link);
+		futex_add_ref(futex);
+	} else {
+		futex_initialize(futex, phys_addr);
+		hash_table_insert(&futex_ht, &phys_addr, &futex->ht_link);
+	}
+	
+	spinlock_unlock(&futex_ht_lock);
+	
+	/* 
+	 * Cache the link to the futex object for this task. 
+	 */
+	futex_ptr_t *fut_ptr = malloc(sizeof(futex_ptr_t), 0);
+	cht_link_t *dup_link;
+	
+	fut_ptr->futex = futex;
+	fut_ptr->uaddr = uaddr;
+	
+	cht_read_lock();
+	
+	/* Cache the mapping from the virtual address to the futex for this task. */
+	if (cht_insert_unique(&TASK->futexes->ht, &fut_ptr->cht_link, &dup_link)) {
+		spinlock_lock(&TASK->futexes->list_lock);
+		list_append(&fut_ptr->all_link, &TASK->futexes->list);
+		spinlock_unlock(&TASK->futexes->list_lock);
+	} else {
+		/* Another thread of this task beat us to it. Drop our reference and use that mapping instead. */
+		free(fut_ptr);
+		futex_release_ref_locked(futex);
+		
+		futex_ptr_t *dup = member_to_inst(dup_link, futex_ptr_t, cht_link);
+		futex = dup->futex;		
+	}
+
+	cht_read_unlock();
+	
+	return futex;
 }
 
@@ -109,30 +395,11 @@
 sysarg_t sys_futex_sleep(uintptr_t uaddr)
 {
-	futex_t *futex;
-	uintptr_t paddr;
-	pte_t *t;
-	int rc;
-	
-	/*
-	 * Find physical address of futex counter.
-	 */
-	page_table_lock(AS, true);
-	t = page_mapping_find(AS, ALIGN_DOWN(uaddr, PAGE_SIZE), false);
-	if (!t || !PTE_VALID(t) || !PTE_PRESENT(t)) {
-		page_table_unlock(AS, true);
+	futex_t *futex = get_futex(uaddr);
+	
+	if (!futex) 
 		return (sysarg_t) ENOENT;
-	}
-	paddr = PTE_GET_FRAME(t) + (uaddr - ALIGN_DOWN(uaddr, PAGE_SIZE));
-	page_table_unlock(AS, true);
-	
-	futex = futex_find(paddr);
-
-#ifdef CONFIG_UDEBUG
-	udebug_stoppable_begin();
-#endif
-	rc = waitq_sleep_timeout(&futex->wq, 0, SYNCH_FLAGS_INTERRUPTIBLE); 
-#ifdef CONFIG_UDEBUG
-	udebug_stoppable_end();
-#endif
+
+	int rc = waitq_sleep_timeout(&futex->wq, 0, SYNCH_FLAGS_INTERRUPTIBLE); 
+
 	return (sysarg_t) rc;
 }
@@ -146,84 +413,14 @@
 sysarg_t sys_futex_wakeup(uintptr_t uaddr)
 {
-	futex_t *futex;
-	uintptr_t paddr;
-	pte_t *t;
-	
-	/*
-	 * Find physical address of futex counter.
-	 */
-	page_table_lock(AS, true);
-	t = page_mapping_find(AS, ALIGN_DOWN(uaddr, PAGE_SIZE), false);
-	if (!t || !PTE_VALID(t) || !PTE_PRESENT(t)) {
-		page_table_unlock(AS, true);
+	futex_t *futex = get_futex(uaddr);
+	
+	if (futex) {
+		waitq_wakeup(&futex->wq, WAKEUP_FIRST);
+		return 0;
+	} else {
 		return (sysarg_t) ENOENT;
 	}
-	paddr = PTE_GET_FRAME(t) + (uaddr - ALIGN_DOWN(uaddr, PAGE_SIZE));
-	page_table_unlock(AS, true);
-	
-	futex = futex_find(paddr);
-		
-	waitq_wakeup(&futex->wq, WAKEUP_FIRST);
-	
-	return 0;
-}
-
-/** Find kernel address of the futex structure corresponding to paddr.
- *
- * If the structure does not exist already, a new one is created.
- *
- * @param paddr		Physical address of the userspace futex counter.
- *
- * @return		Address of the kernel futex structure.
- */
-futex_t *futex_find(uintptr_t paddr)
-{
-	link_t *item;
-	futex_t *futex;
-	btree_node_t *leaf;
-	
-	/*
-	 * Find the respective futex structure
-	 * or allocate new one if it does not exist already.
-	 */
-	mutex_lock(&futex_ht_lock);
-	item = hash_table_find(&futex_ht, &paddr);
-	if (item) {
-		futex = hash_table_get_instance(item, futex_t, ht_link);
-
-		/*
-		 * See if the current task knows this futex.
-		 */
-		mutex_lock(&TASK->futexes_lock);
-		if (!btree_search(&TASK->futexes, paddr, &leaf)) {
-			/*
-			 * The futex is new to the current task.
-			 * Upgrade its reference count and put it to the
-			 * current task's B+tree of known futexes.
-			 */
-			futex->refcount++;
-			btree_insert(&TASK->futexes, paddr, futex, leaf);
-		}
-		mutex_unlock(&TASK->futexes_lock);
-	} else {
-		futex = (futex_t *) malloc(sizeof(futex_t), 0);
-		futex_initialize(futex);
-		futex->paddr = paddr;
-		hash_table_insert(&futex_ht, &paddr, &futex->ht_link);
-			
-		/*
-		 * This is the first task referencing the futex.
-		 * It can be directly inserted into its
-		 * B+tree of known futexes.
-		 */
-		mutex_lock(&TASK->futexes_lock);
-		btree_insert(&TASK->futexes, paddr, futex, NULL);
-		mutex_unlock(&TASK->futexes_lock);
-		
-	}
-	mutex_unlock(&futex_ht_lock);
-	
-	return futex;
-}
+}
+
 
 /** Compute hash index into futex hash table.
@@ -268,25 +465,34 @@
 }
 
-/** Remove references from futexes known to the current task. */
-void futex_cleanup(void)
-{
-	mutex_lock(&futex_ht_lock);
-	mutex_lock(&TASK->futexes_lock);
-
-	list_foreach(TASK->futexes.leaf_list, leaf_link, btree_node_t, node) {
-		unsigned int i;
-		
-		for (i = 0; i < node->keys; i++) {
-			futex_t *ftx;
-			uintptr_t paddr = node->key[i];
-			
-			ftx = (futex_t *) node->value[i];
-			if (--ftx->refcount == 0)
-				hash_table_remove(&futex_ht, &paddr, 1);
-		}
-	}
-	
-	mutex_unlock(&TASK->futexes_lock);
-	mutex_unlock(&futex_ht_lock);
+/*
+ * Operations of a task's CHT that caches mappings of futex user space 
+ * virtual addresses to kernel futex objects.
+ */
+
+static size_t task_fut_ht_hash(const cht_link_t *link)
+{
+	const futex_ptr_t *fut_ptr = member_to_inst(link, futex_ptr_t, cht_link);
+	return fut_ptr->uaddr;
+}
+
+static size_t task_fut_ht_key_hash(void *key)
+{
+	return *(uintptr_t*)key;
+}
+
+static bool task_fut_ht_equal(const cht_link_t *item1, const cht_link_t *item2)
+{
+	const futex_ptr_t *fut_ptr1 = member_to_inst(item1, futex_ptr_t, cht_link);
+	const futex_ptr_t *fut_ptr2 = member_to_inst(item2, futex_ptr_t, cht_link);
+	
+	return fut_ptr1->uaddr == fut_ptr2->uaddr;
+}
+
+static bool task_fut_ht_key_equal(void *key, const cht_link_t *item)
+{
+	const futex_ptr_t *fut_ptr = member_to_inst(item, futex_ptr_t, cht_link);
+	uintptr_t uaddr = *(uintptr_t*)key;
+	
+	return fut_ptr->uaddr == uaddr;
 }
 
Index: kernel/generic/src/synch/mutex.c
===================================================================
--- kernel/generic/src/synch/mutex.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/generic/src/synch/mutex.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -41,4 +41,6 @@
 #include <arch.h>
 #include <stacktrace.h>
+#include <cpu.h>
+#include <proc/thread.h>
 
 /** Initialize mutex.
Index: kernel/generic/src/synch/rcu.c
===================================================================
--- kernel/generic/src/synch/rcu.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
+++ kernel/generic/src/synch/rcu.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -0,0 +1,1873 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+ 
+ 
+/** @addtogroup sync
+ * @{
+ */
+
+/**
+ * @file
+ * @brief Preemptible read-copy update. Usable from interrupt handlers.
+ * 
+ * @par Podzimek-preempt-RCU (RCU_PREEMPT_PODZIMEK)
+ * 
+ * Podzimek-preempt-RCU is a preemptible variant of Podzimek's non-preemptible
+ * RCU algorithm [1, 2]. Grace period (GP) detection is centralized into a
+ * single detector thread. The detector requests that each cpu announces
+ * that it passed a quiescent state (QS), ie a state when the cpu is
+ * outside of an rcu reader section (CS). Cpus check for QSs during context
+ * switches and when entering and exiting rcu reader sections. Once all 
+ * cpus announce a QS and if there were no threads preempted in a CS, the 
+ * GP ends.
+ * 
+ * The detector increments the global GP counter, _rcu_cur_gp, in order 
+ * to start a new GP. Readers notice the new GP by comparing the changed 
+ * _rcu_cur_gp to a locally stored value last_seen_gp which denotes
+ * the last GP number for which the cpu noted an explicit QS (and issued
+ * a memory barrier). Readers check for the change in the outer-most
+ * (ie not nested) rcu_read_lock()/unlock() as these functions represent 
+ * a QS. The reader first executes a memory barrier (MB) in order to contain 
+ * memory references within a CS (and to make changes made by writers 
+ * visible in the CS following rcu_read_lock()). Next, the reader notes 
+ * that it reached a QS by updating the cpu local last_seen_gp to the
+ * global GP counter, _rcu_cur_gp. Cache coherency eventually makes
+ * the updated last_seen_gp visible to the detector cpu, much like it
+ * delivered the changed _rcu_cur_gp to all cpus.
+ * 
+ * The detector waits a while after starting a GP and then reads each 
+ * cpu's last_seen_gp to see if it reached a QS. If a cpu did not record 
+ * a QS (might be a long running thread without an RCU reader CS; or cache
+ * coherency has yet to make the most current last_seen_gp visible to
+ * the detector; or the cpu is still in a CS) the cpu is interrupted
+ * via an IPI. If the IPI handler finds the cpu still in a CS, it instructs
+ * the cpu to notify the detector that it had exited the CS via a semaphore
+ * (CPU->rcu.is_delaying_gp). 
+ * The detector then waits on the semaphore for any cpus to exit their
+ * CSs. Lastly, it waits for the last reader preempted in a CS to 
+ * exit its CS if there were any and signals the end of the GP to
+ * separate reclaimer threads wired to each cpu. Reclaimers then
+ * execute the callbacks queued on each of the cpus.
+ * 
+ * 
+ * @par A-RCU algorithm (RCU_PREEMPT_A)
+ * 
+ * A-RCU is based on the user space rcu algorithm in [3] utilizing signals
+ * (urcu) and Podzimek's rcu [1]. Like in Podzimek's rcu, callbacks are 
+ * executed by cpu-bound reclaimer threads. There is however no dedicated 
+ * detector thread and the reclaimers take on the responsibilities of the 
+ * detector when they need to start a new GP. A new GP is again announced 
+ * and acknowledged with _rcu_cur_gp and the cpu local last_seen_gp. Unlike
+ * Podzimek's rcu, cpus check explicitly for QS only during context switches. 
+ * Like in urcu, rcu_read_lock()/unlock() only maintain the nesting count
+ * and never issue any memory barriers. This makes rcu_read_lock()/unlock()
+ * simple and fast.
+ * 
+ * If a new callback is queued for a reclaimer and no GP is in progress,
+ * the reclaimer takes on the role of a detector. The detector increments 
+ * _rcu_cur_gp in order to start a new GP. It waits a while to give cpus 
+ * a chance to switch a context (a natural QS). Then, it examines each
+ * non-idle cpu that has yet to pass a QS via an IPI. The IPI handler
+ * sees the most current _rcu_cur_gp and last_seen_gp and notes a QS
+ * with a memory barrier and an update to last_seen_gp. If the handler
+ * finds the cpu in a CS it does nothing and lets the detector poll/interrupt
+ * the cpu again after a short sleep.
+ * 
+ * @par Caveats
+ * 
+ * last_seen_gp and _rcu_cur_gp are always 64bit variables and they
+ * are read non-atomically on 32bit machines. Reading a clobbered
+ * value of last_seen_gp or _rcu_cur_gp or writing a clobbered value
+ * of _rcu_cur_gp to last_seen_gp will at worst force the detector
+ * to unnecessarily interrupt a cpu. Interrupting a cpu makes the 
+ * correct value of _rcu_cur_gp visible to the cpu and correctly
+ * resets last_seen_gp in both algorithms.
+ * 
+ * 
+ * 
+ * [1] Read-copy-update for opensolaris,
+ *     2010, Podzimek
+ *     https://andrej.podzimek.org/thesis.pdf
+ * 
+ * [2] (podzimek-rcu) implementation file "rcu.patch"
+ *     http://d3s.mff.cuni.cz/projects/operating_systems/rcu/rcu.patch
+ * 
+ * [3] User-level implementations of read-copy update,
+ *     2012, appendix
+ *     http://www.rdrop.com/users/paulmck/RCU/urcu-supp-accepted.2011.08.30a.pdf
+ * 
+ */
+ 
+#include <synch/rcu.h>
+#include <synch/condvar.h>
+#include <synch/semaphore.h>
+#include <synch/spinlock.h>
+#include <synch/mutex.h>
+#include <proc/thread.h>
+#include <cpu/cpu_mask.h>
+#include <cpu.h>
+#include <smp/smp_call.h>
+#include <compiler/barrier.h>
+#include <atomic.h>
+#include <arch.h>
+#include <macros.h>
+
+/* 
+ * Number of milliseconds to give to preexisting readers to finish 
+ * when non-expedited grace period detection is in progress.
+ */
+#define DETECT_SLEEP_MS    10
+/* 
+ * Max number of pending callbacks in the local cpu's queue before 
+ * aggressively expediting the current grace period
+ */
+#define EXPEDITE_THRESHOLD 2000
+/*
+ * Max number of callbacks to execute in one go with preemption
+ * enabled. If there are more callbacks to be executed they will
+ * be run with preemption disabled in order to prolong reclaimer's
+ * time slice and give it a chance to catch up with callback producers.
+ */
+#define CRITICAL_THRESHOLD 30000
+/* Half the number of values a uint32 can hold. */
+#define UINT32_MAX_HALF    2147483648U
+
+/** 
+ * The current grace period number. Increases monotonically. 
+ * Lock rcu.gp_lock or rcu.preempt_lock to get a current value.
+ */
+rcu_gp_t _rcu_cur_gp;
+
+/** Global RCU data. */
+typedef struct rcu_data {
+	/** Detector uses this to signal reclaimers that a grace period ended. */
+	condvar_t gp_ended;
+	/** Reclaimers use this to notify the detector to accelerate GP detection. */
+	condvar_t expedite_now;
+	/** 
+	 * Protects: req_gp_end_cnt, req_expedited_cnt, completed_gp, _rcu_cur_gp
+	 * (RCU_PREEMPT_PODZIMEK); or only: completed_gp, _rcu_cur_gp (RCU_PREEMPT_A).
+	 */
+	SPINLOCK_DECLARE(gp_lock);
+	/**
+	 * The number of the most recently completed grace period. At most 
+	 * one behind _rcu_cur_gp. If equal to _rcu_cur_gp, a grace period 
+	 * detection is not in progress and the detector is idle.
+	 */
+	rcu_gp_t completed_gp;
+	
+	/** Protects the following 3 fields. */
+	IRQ_SPINLOCK_DECLARE(preempt_lock);
+	/** Preexisting readers that have been preempted. */
+	list_t cur_preempted;
+	/** Readers that have been preempted and might delay the next grace period. */
+	list_t next_preempted;
+	/** 
+	 * The detector is waiting for the last preempted reader 
+	 * in cur_preempted to announce that it exited its reader 
+	 * section by up()ing remaining_readers.
+	 */
+	bool preempt_blocking_det;
+	
+#ifdef RCU_PREEMPT_A
+	
+	/** 
+	 * The detector waits on this semaphore for any preempted readers 
+	 * delaying the grace period once all cpus pass a quiescent state.
+	 */
+	semaphore_t remaining_readers;
+
+#elif defined(RCU_PREEMPT_PODZIMEK)
+	
+	/** Reclaimers notify the detector when they request more grace periods. */
+	condvar_t req_gp_changed;
+	/** Number of grace period ends the detector was requested to announce. */
+	size_t req_gp_end_cnt;
+	/** Number of consecutive grace periods to detect quickly and aggressively. */
+	size_t req_expedited_cnt;
+	/** 
+	 * Number of cpus with readers that are delaying the current GP.
+	 * They will up() remaining_readers.
+	 */
+	atomic_t delaying_cpu_cnt;
+	/** 
+	 * The detector waits on this semaphore for any readers delaying the GP.
+	 * 
+	 * Each of the cpus with readers that are delaying the current GP 
+	 * must up() this sema once they reach a quiescent state. If there 
+	 * are any readers in cur_preempted (ie preempted preexisting) and 
+	 * they are already delaying GP detection, the last to unlock its
+	 * reader section must up() this sema once.
+	 */
+	semaphore_t remaining_readers;
+#endif
+	
+	/** Excludes simultaneous rcu_barrier() calls. */
+	mutex_t barrier_mtx;
+	/** Number of cpus that we are waiting for to complete rcu_barrier(). */
+	atomic_t barrier_wait_cnt;
+	/** rcu_barrier() waits for the completion of barrier callbacks on this wq. */
+	waitq_t barrier_wq;
+	
+	/** Interruptible attached detector thread pointer. */
+	thread_t *detector_thr;
+	
+	/* Some statistics. */
+	size_t stat_expedited_cnt;
+	size_t stat_delayed_cnt;
+	size_t stat_preempt_blocking_cnt;
+	/* Does not contain self/local calls. */
+	size_t stat_smp_call_cnt;
+} rcu_data_t;
+
+
+/* The single, file-local instance of the global RCU state. */
+static rcu_data_t rcu;
+
+/* Forward declarations of file-local helpers used by every RCU variant. */
+static void start_reclaimers(void);
+static void synch_complete(rcu_item_t *rcu_item);
+static inline void rcu_call_impl(bool expedite, rcu_item_t *rcu_item, 
+	rcu_func_t func);
+static void add_barrier_cb(void *arg);
+static void barrier_complete(rcu_item_t *barrier_item);
+static bool arriving_cbs_empty(void);
+static bool next_cbs_empty(void);
+static bool cur_cbs_empty(void);
+static bool all_cbs_empty(void);
+static void reclaimer(void *arg);
+static bool wait_for_pending_cbs(void);
+static bool advance_cbs(void);
+static void exec_completed_cbs(rcu_gp_t last_completed_gp);
+static void exec_cbs(rcu_item_t **phead);
+static bool wait_for_cur_cbs_gp_end(bool expedite, rcu_gp_t *last_completed_gp);
+static void upd_missed_gp_in_wait(rcu_gp_t completed_gp);
+
+/* 
+ * Helpers whose presence or signature depends on the selected algorithm.
+ * Note that wait_for_readers() and gp_sleep() take different parameters
+ * in the two variants.
+ */
+#ifdef RCU_PREEMPT_PODZIMEK
+static void start_detector(void);
+static void read_unlock_impl(size_t *pnesting_cnt);
+static void req_detection(size_t req_cnt);
+static bool cv_wait_for_gp(rcu_gp_t wait_on_gp);
+static void detector(void *);
+static bool wait_for_detect_req(void);
+static void end_cur_gp(void);
+static bool wait_for_readers(void);
+static bool gp_sleep(void);
+static void interrupt_delaying_cpus(cpu_mask_t *cpu_mask);
+static bool wait_for_delaying_cpus(void);
+#elif defined(RCU_PREEMPT_A)
+static bool wait_for_readers(bool expedite);
+static bool gp_sleep(bool *expedite);
+#endif
+
+/* Helpers declared regardless of the selected algorithm. */
+static void start_new_gp(void);
+static void rm_quiescent_cpus(cpu_mask_t *cpu_mask);
+static void sample_cpus(cpu_mask_t *reader_cpus, void *arg);
+static void sample_local_cpu(void *);
+static bool wait_for_preempt_reader(void);
+static void note_preempted_reader(void);
+static void rm_preempted_reader(void);
+static void upd_max_cbs_in_slice(size_t arriving_cbs_cnt);
+
+
+
+/** Sets up the global (cpu independent) RCU state.
+ *
+ * Invoked from rcu_cpu_init() while config.cpu_active == 1.
+ */
+void rcu_init(void)
+{
+	/* Grace period bookkeeping: no GP has started or completed yet. */
+	spinlock_initialize(&rcu.gp_lock, "rcu.gp_lock");
+	_rcu_cur_gp = 0;
+	rcu.completed_gp = 0;
+	condvar_initialize(&rcu.gp_ended);
+	condvar_initialize(&rcu.expedite_now);
+
+	/* Tracking of readers that were preempted in a reader section. */
+	irq_spinlock_initialize(&rcu.preempt_lock, "rcu.preempt_lock");
+	list_initialize(&rcu.cur_preempted);
+	list_initialize(&rcu.next_preempted);
+	rcu.preempt_blocking_det = false;
+	semaphore_initialize(&rcu.remaining_readers, 0);
+
+	/* rcu_barrier() support. */
+	mutex_initialize(&rcu.barrier_mtx, MUTEX_PASSIVE);
+	atomic_set(&rcu.barrier_wait_cnt, 0);
+	waitq_initialize(&rcu.barrier_wq);
+
+#ifdef RCU_PREEMPT_PODZIMEK
+	/* Requests posted by reclaimers to the detector thread. */
+	condvar_initialize(&rcu.req_gp_changed);
+	rcu.req_gp_end_cnt = 0;
+	rcu.req_expedited_cnt = 0;
+	atomic_set(&rcu.delaying_cpu_cnt, 0);
+#endif
+
+	/* The detector thread (if any) is created later, see rcu_kinit_init(). */
+	rcu.detector_thr = NULL;
+
+	/* Statistics start from zero. */
+	rcu.stat_expedited_cnt = 0;
+	rcu.stat_delayed_cnt = 0;
+	rcu.stat_preempt_blocking_cnt = 0;
+	rcu.stat_smp_call_cnt = 0;
+}
+
+/** Initializes per-CPU RCU data. If on the boot cpu inits global data too.
+ *
+ * Resets the local cpu's callback lists, grace period numbers, the
+ * reclaimer wake-up semaphore and statistics. While only one cpu is
+ * active (config.cpu_active == 1) it also calls rcu_init().
+ */
+void rcu_cpu_init(void)
+{
+	if (config.cpu_active == 1) {
+		rcu_init();
+	}
+
+	CPU->rcu.last_seen_gp = 0;
+
+#ifdef RCU_PREEMPT_PODZIMEK
+	CPU->rcu.nesting_cnt = 0;
+	CPU->rcu.is_delaying_gp = false;
+	CPU->rcu.signal_unlock = false;
+#endif
+	
+	CPU->rcu.cur_cbs = NULL;
+	CPU->rcu.cur_cbs_cnt = 0;
+	CPU->rcu.next_cbs = NULL;
+	CPU->rcu.next_cbs_cnt = 0;
+	CPU->rcu.arriving_cbs = NULL;
+	/* An empty arriving list has its tail pointing at its own head. */
+	CPU->rcu.parriving_cbs_tail = &CPU->rcu.arriving_cbs;
+	CPU->rcu.arriving_cbs_cnt = 0;
+	/* 
+	 * advance_cbs() reads this flag; reset it explicitly like every other
+	 * callback-list field instead of relying on the cpu structure having
+	 * been zeroed beforehand.
+	 */
+	CPU->rcu.expedite_arriving = false;
+
+	CPU->rcu.cur_cbs_gp = 0;
+	CPU->rcu.next_cbs_gp = 0;
+	
+	semaphore_initialize(&CPU->rcu.arrived_flag, 0);
+
+	/* BSP creates reclaimer threads before AP's rcu_cpu_init() runs. */
+	if (config.cpu_active == 1)
+		CPU->rcu.reclaimer_thr = NULL;
+	
+	CPU->rcu.stat_max_cbs = 0;
+	CPU->rcu.stat_avg_cbs = 0;
+	CPU->rcu.stat_missed_gps = 0;
+	CPU->rcu.stat_missed_gp_in_wait = 0;
+	CPU->rcu.stat_max_slice_cbs = 0;
+	CPU->rcu.last_arriving_cnt = 0;
+}
+
+/** Completes RCU init. Creates and runs the detector and reclaimer threads.
+ *
+ * The detector thread is only started when RCU_PREEMPT_PODZIMEK is
+ * defined; the per-cpu reclaimer threads are started in every
+ * configuration.
+ */
+void rcu_kinit_init(void)
+{
+#ifdef RCU_PREEMPT_PODZIMEK
+	start_detector();
+#endif
+	
+	start_reclaimers();
+}
+
+/** Initializes any per-thread RCU structures.
+ *
+ * @param thread Thread whose RCU bookkeeping is reset: it starts outside
+ *         of any reader section and is not linked into any list of
+ *         preempted readers.
+ */
+void rcu_thread_init(thread_t *thread)
+{
+	thread->rcu.nesting_cnt = 0;
+
+#ifdef RCU_PREEMPT_PODZIMEK
+	thread->rcu.was_preempted = false;
+#endif
+	
+	link_initialize(&thread->rcu.preempt_link);
+}
+
+
+/** Shuts RCU down: stops the reclaimer threads and, if present, the detector.
+ *
+ * Meant to be called when the kernel is shutting down. Callbacks that are
+ * still queued at that point are never invoked; they simply stay queued.
+ */
+void rcu_stop(void)
+{
+	/* Interrupt, join and release the reclaimer of each active cpu. */
+	for (unsigned int cpu_id = 0; cpu_id < config.cpu_active; ++cpu_id) {
+		ASSERT(cpus[cpu_id].rcu.reclaimer_thr != NULL);
+
+		thread_t *reclaimer_thr = cpus[cpu_id].rcu.reclaimer_thr;
+
+		if (reclaimer_thr) {
+			thread_interrupt(reclaimer_thr);
+			thread_join(reclaimer_thr);
+			thread_detach(reclaimer_thr);
+			cpus[cpu_id].rcu.reclaimer_thr = NULL;
+		}
+	}
+
+#ifdef RCU_PREEMPT_PODZIMEK
+	/* Same sequence for the detector thread. */
+	thread_t *detector_thr = rcu.detector_thr;
+
+	if (detector_thr) {
+		thread_interrupt(detector_thr);
+		thread_join(detector_thr);
+		thread_detach(detector_thr);
+		rcu.detector_thr = NULL;
+	}
+#endif
+}
+
+/** Returns the number of elapsed grace periods since boot.
+ *
+ * @return Snapshot of rcu.completed_gp, read while holding rcu.gp_lock.
+ */
+uint64_t rcu_completed_gps(void)
+{
+	spinlock_lock(&rcu.gp_lock);
+	uint64_t completed = rcu.completed_gp;
+	spinlock_unlock(&rcu.gp_lock);
+	
+	return completed;
+}
+
+/** Spawns one reclaimer thread per cpu and wires each to its cpu.
+ *
+ * Panics if any of the threads cannot be created.
+ */
+static void start_reclaimers(void)
+{
+	for (unsigned int cpu_id = 0; cpu_id < config.cpu_count; ++cpu_id) {
+		/* Thread name of the form "rcu-rec/<cpu id>". */
+		char name[THREAD_NAME_BUFLEN] = {0};
+		snprintf(name, THREAD_NAME_BUFLEN - 1, "rcu-rec/%u", cpu_id);
+
+		thread_t *thr =
+			thread_create(reclaimer, NULL, TASK, THREAD_FLAG_NONE, name);
+
+		/* Publish the thread in the cpu's RCU data before it may run. */
+		cpus[cpu_id].rcu.reclaimer_thr = thr;
+
+		if (!thr)
+			panic("Failed to create RCU reclaimer thread on cpu%u.", cpu_id);
+
+		thread_wire(thr, &cpus[cpu_id]);
+		thread_ready(thr);
+	}
+}
+
+#ifdef RCU_PREEMPT_PODZIMEK
+
+/** Starts the detector thread.
+ *
+ * Creates the "rcu-det" thread running detector(), stores it in
+ * rcu.detector_thr and makes it ready. Panics if the thread cannot
+ * be created.
+ */
+static void start_detector(void)
+{
+	rcu.detector_thr = 
+		thread_create(detector, NULL, TASK, THREAD_FLAG_NONE, "rcu-det");
+	
+	if (!rcu.detector_thr) 
+		panic("Failed to create RCU detector thread.");
+	
+	thread_ready(rcu.detector_thr);
+}
+
+/** Returns true if in an rcu reader section.
+ *
+ * The local cpu's nesting count is sampled with preemption disabled
+ * so that CPU refers to the same cpu for the whole read.
+ */
+bool rcu_read_locked(void)
+{
+	preemption_disable();
+	bool locked = 0 < CPU->rcu.nesting_cnt;
+	preemption_enable();
+	
+	return locked;
+}
+
+/** Leaves one level of the local reader section.
+ *
+ * Must be called with preemption or interrupts disabled.
+ *
+ * @param pnesting_cnt Nesting counter to decrement; either
+ *           &CPU->rcu.tmp_nesting_cnt or THREAD->rcu.nesting_cnt.
+ */
+static void read_unlock_impl(size_t *pnesting_cnt)
+{
+	ASSERT(PREEMPTION_DISABLED || interrupts_disabled());
+
+	/* Still nested in an outer reader section - nothing more to do. */
+	if (0 != --(*pnesting_cnt))
+		return;
+
+	/* The outermost reader section ended. */
+	_rcu_record_qs();
+
+	/*
+	 * THREAD may be NULL in scheduler() and not only during boot,
+	 * hence the explicit check before touching thread data.
+	 */
+	bool was_preempted = THREAD && THREAD->rcu.was_preempted;
+
+	/*
+	 * Either this thread was preempted inside its reader section or
+	 * the detector is eagerly waiting for this cpu's reader to finish.
+	 * _rcu_signal_read_unlock() rechecks both conditions.
+	 */
+	if (was_preempted || CPU->rcu.is_delaying_gp) {
+		_rcu_signal_read_unlock();
+	}
+}
+
+/** If necessary, signals the detector that we exited a reader section.
+ *
+ * Must be called with preemption or interrupts disabled. Both flags
+ * (.is_delaying_gp and .was_preempted) are cleared with
+ * local_atomic_exchange() so that each pending notification is
+ * delivered only by whoever observes the flag set.
+ */
+void _rcu_signal_read_unlock(void)
+{
+	ASSERT(PREEMPTION_DISABLED || interrupts_disabled());
+	
+	/*
+	 * If an interrupt occurs here (even a NMI) it may beat us to
+	 * resetting .is_delaying_gp or .was_preempted and up the semaphore
+	 * for us.
+	 */
+	
+	/* 
+	 * If the detector is eagerly waiting for this cpu's reader to unlock,
+	 * notify it that the reader did so.
+	 */
+	if (local_atomic_exchange(&CPU->rcu.is_delaying_gp, false)) {
+		semaphore_up(&rcu.remaining_readers);
+	}
+	
+	/*
+	 * This reader was preempted while in a reader section.
+	 * We might be holding up the current GP. Notify the
+	 * detector if so.
+	 */
+	if (THREAD && local_atomic_exchange(&THREAD->rcu.was_preempted, false)) {
+		/* A preempted reader is expected to be linked in a preempted list. */
+		ASSERT(link_used(&THREAD->rcu.preempt_link));
+
+		rm_preempted_reader();
+	}
+	
+	/* If there was something to signal to the detector we have done so. */
+	CPU->rcu.signal_unlock = false;
+}
+
+#endif /* RCU_PREEMPT_PODZIMEK */
+
+/** Completion record used by _rcu_synchronize() to wait for its callback. */
+typedef struct synch_item {
+	/** The caller of _rcu_synchronize() sleeps here; synch_complete() wakes it. */
+	waitq_t wq;
+	/** Item handed to _rcu_call(); synch_complete() maps it back to this struct. */
+	rcu_item_t rcu_item;
+} synch_item_t;
+
+/** Blocks until all preexisting readers exit their critical sections.
+ *
+ * Non-expedited variant: calls _rcu_synchronize() with expedite == false.
+ */
+void rcu_synchronize(void)
+{
+	_rcu_synchronize(false);
+}
+
+/** Blocks until all preexisting readers exit their critical sections.
+ *
+ * Expedited variant: calls _rcu_synchronize() with expedite == true.
+ */
+void rcu_synchronize_expedite(void)
+{
+	_rcu_synchronize(true);
+}
+
+/** Blocks until all preexisting readers exit their critical sections.
+ *
+ * Queues the synch_complete() callback via _rcu_call() and sleeps until
+ * that callback wakes the caller up.
+ *
+ * @param expedite Passed on to _rcu_call() together with the callback.
+ */
+void _rcu_synchronize(bool expedite)
+{
+	/* Calling from a reader section will deadlock. */
+	ASSERT(!rcu_read_locked());
+	
+	/* Lives on this stack frame; we do not return until woken up. */
+	synch_item_t completion; 
+
+	waitq_initialize(&completion.wq);
+	_rcu_call(expedite, &completion.rcu_item, synch_complete);
+	waitq_sleep(&completion.wq);
+}
+
+/** rcu_synchronize's callback.
+ *
+ * @param rcu_item The rcu_item member of the synch_item_t set up by
+ *         _rcu_synchronize(); the sleeper on its wait queue is woken up.
+ */
+static void synch_complete(rcu_item_t *rcu_item)
+{
+	/* Recover the enclosing completion record from the embedded item. */
+	synch_item_t *completion = member_to_inst(rcu_item, synch_item_t, rcu_item);
+	ASSERT(completion);
+	waitq_wakeup(&completion->wq, WAKEUP_FIRST);
+}
+
+/** Waits for all outstanding rcu calls to complete.
+ *
+ * Queues a barrier callback on every cpu in the active cpu mask and
+ * sleeps until the last of them has run. Concurrent callers are
+ * serialized by rcu.barrier_mtx.
+ */
+void rcu_barrier(void)
+{
+	/* 
+	 * Serialize rcu_barrier() calls so we don't overwrite cpu.barrier_item
+	 * currently in use by rcu_barrier().
+	 */
+	mutex_lock(&rcu.barrier_mtx);
+	
+	/* 
+	 * Ensure we queue a barrier callback on all cpus before the already
+	 * enqueued barrier callbacks start signaling completion.
+	 */
+	atomic_set(&rcu.barrier_wait_cnt, 1);
+
+	DEFINE_CPU_MASK(cpu_mask);
+	cpu_mask_active(cpu_mask);
+	
+	/* add_barrier_cb() increments barrier_wait_cnt once per cpu. */
+	cpu_mask_for_each(*cpu_mask, cpu_id) {
+		smp_call(cpu_id, add_barrier_cb, NULL);
+	}
+	
+	/* 
+	 * Drop the initial count of 1 set above. A non-zero result means
+	 * some barrier callbacks have yet to run; barrier_complete() wakes
+	 * us once the count reaches zero.
+	 */
+	if (0 < atomic_predec(&rcu.barrier_wait_cnt)) {
+		waitq_sleep(&rcu.barrier_wq);
+	}
+	
+	mutex_unlock(&rcu.barrier_mtx);
+}
+
+/** Issues a rcu_barrier() callback on the local cpu. 
+ * 
+ * Invoked through smp_call() from rcu_barrier(). Expected to run with
+ * interrupts or preemption disabled (see the assertion below).
+ *
+ * @param arg Unused.
+ */
+static void add_barrier_cb(void *arg)
+{
+	ASSERT(interrupts_disabled() || PREEMPTION_DISABLED);
+	/* Account for this cpu's callback before it is queued. */
+	atomic_inc(&rcu.barrier_wait_cnt);
+	rcu_call(&CPU->rcu.barrier_item, barrier_complete);
+}
+
+/** Local cpu's rcu_barrier() completion callback.
+ *
+ * @param barrier_item Unused.
+ */
+static void barrier_complete(rcu_item_t *barrier_item)
+{
+	/* Is this the last barrier callback completed? */
+	if (0 == atomic_predec(&rcu.barrier_wait_cnt)) {
+		/* Notify rcu_barrier() that we're done. */
+		waitq_wakeup(&rcu.barrier_wq, WAKEUP_FIRST);
+	}
+}
+
+/** Adds a callback to invoke after all preexisting readers finish. 
+ * 
+ * May be called from within interrupt handlers or RCU reader sections.
+ * The callback is queued without requesting expedited processing
+ * (expedite == false).
+ * 
+ * @param rcu_item Used by RCU to track the call. Must remain
+ *         until the user callback function is entered.
+ * @param func User callback function that will be invoked once a full
+ *         grace period elapsed, ie at a time when all preexisting
+ *         readers have finished. The callback should be short and must
+ *         not block. If you must sleep, enqueue your work in the system
+ *         work queue from the callback (ie workq_global_enqueue()).
+ */
+void rcu_call(rcu_item_t *rcu_item, rcu_func_t func)
+{
+	rcu_call_impl(false, rcu_item, func);
+}
+
+/** rcu_call() implementation. See rcu_call() for comments.
+ *
+ * @param expedite If true, the local cpu's expedite_arriving flag is
+ *         set when the callback is queued (see rcu_call_impl()).
+ * @param rcu_item See rcu_call().
+ * @param func See rcu_call().
+ */
+void _rcu_call(bool expedite, rcu_item_t *rcu_item, rcu_func_t func)
+{
+	rcu_call_impl(expedite, rcu_item, func);
+}
+
+/** rcu_call() inline-able implementation. See rcu_call() for comments.
+ *
+ * Appends rcu_item to the local cpu's arriving_cbs list and, if it is
+ * the first item of that list, wakes the cpu's reclaimer.
+ *
+ * @param expedite If true, marks the arriving batch as to be expedited.
+ * @param rcu_item See rcu_call(). Must not be NULL.
+ * @param func See rcu_call().
+ */
+static inline void rcu_call_impl(bool expedite, rcu_item_t *rcu_item, 
+	rcu_func_t func)
+{
+	ASSERT(rcu_item);
+	
+	rcu_item->func = func;
+	rcu_item->next = NULL;
+	
+	/* Stay on this cpu while working with its callback list. */
+	preemption_disable();
+
+	rcu_cpu_data_t *r = &CPU->rcu;
+
+	/* 
+	 * First claim the tail slot, then link the item through the previous
+	 * tail. NOTE(review): local_atomic_exchange() presumably makes the
+	 * swap atomic with respect to local interrupt handlers - confirm.
+	 */
+	rcu_item_t **prev_tail 
+		= local_atomic_exchange(&r->parriving_cbs_tail, &rcu_item->next);
+	*prev_tail = rcu_item;
+	
+	/* Approximate the number of callbacks present. */
+	++r->arriving_cbs_cnt;
+	
+	if (expedite) {
+		r->expedite_arriving = true;
+	}
+	
+	/* The list was empty if the old tail pointed at the list head. */
+	bool first_cb = (prev_tail == &CPU->rcu.arriving_cbs);
+	
+	/* Added first callback - notify the reclaimer. */
+	if (first_cb && !semaphore_count_get(&r->arrived_flag)) {
+		semaphore_up(&r->arrived_flag);
+	}
+	
+	preemption_enable();
+}
+
+/** Returns true if the local cpu's cur_cbs list is empty.
+ *
+ * Must be called from a thread wired to a cpu.
+ */
+static bool cur_cbs_empty(void)
+{
+	ASSERT(THREAD && THREAD->wired);
+	return NULL == CPU->rcu.cur_cbs;
+}
+
+/** Returns true if the local cpu's next_cbs list is empty.
+ *
+ * Must be called from a thread wired to a cpu.
+ */
+static bool next_cbs_empty(void)
+{
+	ASSERT(THREAD && THREAD->wired);
+	return NULL == CPU->rcu.next_cbs;
+}
+
+/** Returns true if the local cpu's arriving_cbs list is empty.
+ *
+ * Callers that need an up-to-date result must disable interrupts
+ * before calling. Must be called from a thread wired to a cpu.
+ */
+static bool arriving_cbs_empty(void)
+{
+	ASSERT(THREAD && THREAD->wired);
+	/* 
+	 * Accessing with interrupts enabled may at worst lead to 
+	 * a false negative if we race with a local interrupt handler.
+	 */
+	return NULL == CPU->rcu.arriving_cbs;
+}
+
+/** Returns true if none of the local cpu's three callback lists has items. */
+static bool all_cbs_empty(void)
+{
+	return cur_cbs_empty() && next_cbs_empty() && arriving_cbs_empty();
+}
+
+
+/** Reclaimer thread dispatches locally queued callbacks once a GP ends.
+ *
+ * Runs wired to a cpu as that cpu's reclaimer_thr. Each iteration waits
+ * for callbacks, runs those whose grace period already ended, shifts the
+ * callback batches and waits for the grace period of cur_cbs to end.
+ * The loop ends once either wait reports an interruption.
+ *
+ * @param arg Unused.
+ */
+static void reclaimer(void *arg)
+{
+	ASSERT(THREAD && THREAD->wired);
+	ASSERT(THREAD == CPU->rcu.reclaimer_thr);
+
+	/* Most recent completed GP as reported by wait_for_cur_cbs_gp_end(). */
+	rcu_gp_t last_compl_gp = 0;
+	bool ok = true;
+	
+	while (ok && wait_for_pending_cbs()) {
+		ASSERT(CPU->rcu.reclaimer_thr == THREAD);
+		
+		exec_completed_cbs(last_compl_gp);
+
+		bool expedite = advance_cbs();
+		
+		ok = wait_for_cur_cbs_gp_end(expedite, &last_compl_gp);
+	}
+}
+
+/** Blocks until the local cpu has callbacks to process.
+ *
+ * @return false if the sleep was interrupted, true otherwise.
+ */
+static bool wait_for_pending_cbs(void)
+{
+	/* Some batch already holds callbacks - no need to wait. */
+	if (!all_cbs_empty())
+		return true;
+
+	/* Sleep until rcu_call() signals the arrival of the first callback. */
+	while (arriving_cbs_empty()) {
+		if (!semaphore_down_interruptable(&CPU->rcu.arrived_flag))
+			return false;
+	}
+
+	return true;
+}
+
+/** Adds to stat_missed_gps the number of GPs that ended past cur_cbs_gp.
+ *
+ * @param compl Number of the most recently completed grace period.
+ */
+static void upd_stat_missed_gp(rcu_gp_t compl)
+{
+	/* Nothing was missed if cur_cbs_gp is not behind compl. */
+	if (compl <= CPU->rcu.cur_cbs_gp)
+		return;
+
+	CPU->rcu.stat_missed_gps += (size_t)(compl - CPU->rcu.cur_cbs_gp);
+}
+
+/** Runs the callback batches whose grace period has already ended.
+ *
+ * @param last_completed_gp Number of the most recently completed GP.
+ */
+static void exec_completed_cbs(rcu_gp_t last_completed_gp)
+{
+	upd_stat_missed_gp(last_completed_gp);
+
+	/* If next_cbs' GP elapsed then cur_cbs' GP elapsed as well. */
+	bool next_elapsed = (CPU->rcu.next_cbs_gp <= last_completed_gp);
+
+	if (!next_elapsed && !(CPU->rcu.cur_cbs_gp <= last_completed_gp))
+		return;
+
+	size_t exec_cnt = CPU->rcu.cur_cbs_cnt;
+
+	if (next_elapsed) {
+		ASSERT(CPU->rcu.cur_cbs_gp <= CPU->rcu.next_cbs_gp);
+		exec_cnt += CPU->rcu.next_cbs_cnt;
+	}
+
+	/*
+	 * When overwhelmed with callbacks, run them with preemption disabled
+	 * in order to prolong our time slice and catch up with updaters
+	 * posting new callbacks.
+	 */
+	bool overwhelmed = !(exec_cnt < CRITICAL_THRESHOLD);
+
+	if (overwhelmed)
+		preemption_disable();
+
+	exec_cbs(&CPU->rcu.cur_cbs);
+	if (next_elapsed)
+		exec_cbs(&CPU->rcu.next_cbs);
+
+	if (overwhelmed)
+		preemption_enable();
+
+	/* The executed lists are empty now. */
+	CPU->rcu.cur_cbs_cnt = 0;
+	if (next_elapsed)
+		CPU->rcu.next_cbs_cnt = 0;
+}
+
+/** Invokes every callback of a singly linked list and empties the list.
+ *
+ * @param phead Address of the list head; set to NULL on return.
+ */
+static void exec_cbs(rcu_item_t **phead)
+{
+	rcu_item_t *next;
+
+	for (rcu_item_t *item = *phead; item; item = next) {
+		/* The callback may free the item, so read its fields first. */
+		next = item->next;
+		rcu_func_t func = item->func;
+
+		func(item);
+	}
+
+	*phead = NULL;
+}
+
+/** Updates the maximum and the running average of arrived callback counts.
+ *
+ * @param arriving_cnt Number of callbacks in the batch that just arrived.
+ */
+static void upd_stat_cb_cnts(size_t arriving_cnt)
+{
+	if (CPU->rcu.stat_max_cbs < arriving_cnt)
+		CPU->rcu.stat_max_cbs = arriving_cnt;
+
+	/* Empty batches do not contribute to the average. */
+	if (0 == arriving_cnt)
+		return;
+
+	/* Weighted average: 99 parts old value, 1 part new sample. */
+	CPU->rcu.stat_avg_cbs =
+		(99 * CPU->rcu.stat_avg_cbs + arriving_cnt) / 100;
+}
+
+/** Prepares another batch of callbacks to dispatch at the next grace period.
+ * 
+ * Shifts next_cbs into cur_cbs and arriving_cbs into next_cbs (lists,
+ * counts and GP numbers) and determines the grace periods the new
+ * batches have to wait for.
+ * 
+ * @return True if the next batch of callbacks must be expedited quickly.
+ */
+static bool advance_cbs(void)
+{
+	/* Move next_cbs to cur_cbs. */
+	CPU->rcu.cur_cbs = CPU->rcu.next_cbs;
+	CPU->rcu.cur_cbs_cnt = CPU->rcu.next_cbs_cnt;
+	CPU->rcu.cur_cbs_gp = CPU->rcu.next_cbs_gp;
+	
+	/* Move arriving_cbs to next_cbs. */
+	
+	CPU->rcu.next_cbs_cnt = CPU->rcu.arriving_cbs_cnt;
+	CPU->rcu.arriving_cbs_cnt = 0;
+	
+	/* 
+	 * Too many callbacks queued. Better speed up the detection
+	 * or risk exhausting all system memory.
+	 */
+	bool expedite = (EXPEDITE_THRESHOLD < CPU->rcu.next_cbs_cnt)
+		|| CPU->rcu.expedite_arriving;	
+	CPU->rcu.expedite_arriving = false;
+
+	/* Start moving the arriving_cbs list to next_cbs. */
+	CPU->rcu.next_cbs = CPU->rcu.arriving_cbs;
+	
+	/* 
+	 * At least one callback arrived. The tail therefore does not point
+	 * to the head of arriving_cbs and we can safely reset it to NULL.
+	 */
+	if (CPU->rcu.next_cbs) {
+		ASSERT(CPU->rcu.parriving_cbs_tail != &CPU->rcu.arriving_cbs);
+		
+		CPU->rcu.arriving_cbs = NULL;
+		/* Reset arriving_cbs before updating the tail pointer. */
+		compiler_barrier();
+		/* Updating the tail pointer completes the move of arriving_cbs. */
+		ACCESS_ONCE(CPU->rcu.parriving_cbs_tail) = &CPU->rcu.arriving_cbs;
+	} else {
+		/* 
+		 * arriving_cbs was null and parriving_cbs_tail pointed to it 
+		 * so leave it that way. Note that interrupt handlers may have
+		 * added a callback in the meantime so it is not safe to reset
+		 * arriving_cbs or parriving_cbs.
+		 */
+	}
+
+	/* Update statistics of arrived callbacks. */
+	upd_stat_cb_cnts(CPU->rcu.next_cbs_cnt);
+	
+	/* 
+	 * Make changes prior to queuing next_cbs visible to readers. 
+	 * See comment in wait_for_readers().
+	 */
+	memory_barrier(); /* MB A, B */
+
+	/* At the end of next_cbs_gp, exec next_cbs. Determine what GP that is. */
+	
+	if (!next_cbs_empty()) {
+		spinlock_lock(&rcu.gp_lock);
+	
+		/* Exec next_cbs at the end of the next GP. */
+		CPU->rcu.next_cbs_gp = _rcu_cur_gp + 1;
+		
+		/* 
+		 * There are no callbacks to invoke before next_cbs. Instruct
+		 * wait_for_cur_cbs_gp() to notify us of the nearest GP end.
+		 * That could be sooner than next_cbs_gp (if the current GP 
+		 * had not yet completed), so we'll create a shorter batch
+		 * of callbacks next time around.
+		 */
+		if (cur_cbs_empty()) {
+			CPU->rcu.cur_cbs_gp = rcu.completed_gp + 1;
+		} 
+		
+		spinlock_unlock(&rcu.gp_lock);
+	} else {
+		/* Nothing new was queued; next_cbs shares the GP of cur_cbs. */
+		CPU->rcu.next_cbs_gp = CPU->rcu.cur_cbs_gp;
+	}
+	
+	ASSERT(CPU->rcu.cur_cbs_gp <= CPU->rcu.next_cbs_gp);
+	
+	return expedite;	
+}
+
+
+#ifdef RCU_PREEMPT_A
+
+/** Waits for the grace period associated with callbacks cur_cbs to elapse. 
+ * 
+ * In this variant there is no detector thread: the reclaimer that finds
+ * no grace period in progress starts one and waits for the readers
+ * itself, while other reclaimers wait on rcu.gp_ended.
+ * 
+ * @param expedite Instructs the detector to aggressively speed up grace 
+ *            period detection without any delay.
+ * @param completed_gp Returns the most recent completed grace period 
+ *            number.
+ * @return false if the thread was interrupted and should stop.
+ */
+static bool wait_for_cur_cbs_gp_end(bool expedite, rcu_gp_t *completed_gp)
+{
+	spinlock_lock(&rcu.gp_lock);
+
+	ASSERT(CPU->rcu.cur_cbs_gp <= CPU->rcu.next_cbs_gp);
+	ASSERT(CPU->rcu.cur_cbs_gp <= _rcu_cur_gp + 1);
+	
+	while (rcu.completed_gp < CPU->rcu.cur_cbs_gp) {
+		/* GP has not yet started - start a new one. */
+		if (rcu.completed_gp == _rcu_cur_gp) {
+			start_new_gp();
+			spinlock_unlock(&rcu.gp_lock);
+
+			/* gp_lock is not held here, so it is safe to just return. */
+			if (!wait_for_readers(expedite))
+				return false;
+
+			spinlock_lock(&rcu.gp_lock);
+			/* Notify any reclaimers this GP had ended. */
+			rcu.completed_gp = _rcu_cur_gp;
+			condvar_broadcast(&rcu.gp_ended);
+		} else {
+			/* GP detection is in progress.*/ 
+			
+			if (expedite) 
+				condvar_signal(&rcu.expedite_now);
+			
+			/* Wait for the GP to complete. */
+			int ret = _condvar_wait_timeout_spinlock(&rcu.gp_ended, &rcu.gp_lock, 
+				SYNCH_NO_TIMEOUT, SYNCH_FLAGS_INTERRUPTIBLE);
+			
+			if (ret == ESYNCH_INTERRUPTED) {
+				spinlock_unlock(&rcu.gp_lock);
+				return false;			
+			}
+		}
+	}
+	
+	upd_missed_gp_in_wait(rcu.completed_gp);
+	
+	*completed_gp = rcu.completed_gp;
+	spinlock_unlock(&rcu.gp_lock);
+	
+	return true;
+}
+
+/** Waits for readers that may be delaying the current grace period.
+ * 
+ * Repeatedly sleeps and samples the cpus until none of the active cpus
+ * remains in the reader mask, then waits for preempted readers.
+ * 
+ * @param expedite If true, gp_sleep() only yields instead of sleeping.
+ *            gp_sleep() may also turn it on while we are waiting.
+ * @return false if the thread was interrupted and should stop.
+ */
+static bool wait_for_readers(bool expedite)
+{
+	DEFINE_CPU_MASK(reader_cpus);
+	
+	/* Start with all active cpus and drop those already quiescent. */
+	cpu_mask_active(reader_cpus);
+	rm_quiescent_cpus(reader_cpus);
+	
+	while (!cpu_mask_is_none(reader_cpus)) {
+		/* Give cpus a chance to context switch (a QS) and batch callbacks. */
+		if(!gp_sleep(&expedite)) 
+			return false;
+		
+		rm_quiescent_cpus(reader_cpus);
+		sample_cpus(reader_cpus, reader_cpus);
+	}
+	
+	/* Update statistic. */
+	if (expedite) {
+		++rcu.stat_expedited_cnt;
+	}
+	
+	/* 
+	 * All cpus have passed through a QS and see the most recent _rcu_cur_gp.
+	 * As a result newly preempted readers will associate with next_preempted
+	 * and the number of old readers in cur_preempted will monotonically
+	 * decrease. Wait for those old/preexisting readers.
+	 */
+	return wait_for_preempt_reader();
+}
+
+/** Pauses grace period detection to let cpus reach a quiescent state.
+ *
+ * @param expedite In: if true, only yields the cpu via scheduler().
+ *            Out: set to true if rcu.expedite_now was signaled while
+ *            sleeping.
+ * @return false if the sleep was interrupted, true otherwise.
+ */
+static bool gp_sleep(bool *expedite)
+{
+	/* Expedited detection: yield instead of sleeping. */
+	if (*expedite) {
+		scheduler();
+		return true;
+	}
+
+	spinlock_lock(&rcu.gp_lock);
+
+	int ret = _condvar_wait_timeout_spinlock(&rcu.expedite_now, &rcu.gp_lock,
+		DETECT_SLEEP_MS * 1000, SYNCH_FLAGS_INTERRUPTIBLE);
+
+	/* Woken up by a signal on rcu.expedite_now rather than by the timeout. */
+	if (ret == ESYNCH_OK_BLOCKED) {
+		*expedite = true;
+	}
+
+	spinlock_unlock(&rcu.gp_lock);
+
+	return (ret != ESYNCH_INTERRUPTED);
+}
+
+/** Checks whether the local cpu may still be delaying the current GP.
+ *
+ * Runs with interrupts disabled. If the cpu is outside of reader
+ * sections or has already seen the current GP, it is removed from the
+ * mask and records the current GP as seen.
+ *
+ * @param arg The cpu_mask_t of cpus with potential readers.
+ */
+static void sample_local_cpu(void *arg)
+{
+	ASSERT(interrupts_disabled());
+	cpu_mask_t *reader_cpus = (cpu_mask_t *)arg;
+
+	bool in_reader = RCU_CNT_INC <= THE->rcu_nesting;
+	/* smp_call machinery makes the most current _rcu_cur_gp visible. */
+	bool seen_cur_gp = (CPU->rcu.last_seen_gp == _rcu_cur_gp);
+
+	/*
+	 * The cpu is in a reader section and has not passed a quiescent
+	 * state during this grace period. It stays in the mask and will be
+	 * sampled again later.
+	 */
+	if (in_reader && !seen_cur_gp)
+		return;
+
+	/* Either not in a reader section or already passed a QS. */
+	cpu_mask_reset(reader_cpus, CPU->id);
+	/* Contain new reader sections and make prior changes visible to them.*/
+	memory_barrier();
+	CPU->rcu.last_seen_gp = _rcu_cur_gp;
+}
+
+/** Called by the scheduler() when switching away from the current thread.
+ * 
+ * Runs with interrupts disabled. Saves the reader nesting count into the
+ * thread, notes the thread as a preempted reader if it was switched out
+ * inside a reader section for the first time, and announces a quiescent
+ * state for the local cpu if it has not yet seen the current GP.
+ */
+void rcu_after_thread_ran(void)
+{
+	ASSERT(interrupts_disabled());
+
+	/* 
+	 * In order not to worry about NMI seeing rcu_nesting change work 
+	 * with a local copy.
+	 */
+	size_t nesting_cnt = local_atomic_exchange(&THE->rcu_nesting, 0);
+	
+	/* 
+	 * Ensures NMIs see .rcu_nesting without the WAS_PREEMPTED mark and
+	 * do not accidentally call rm_preempted_reader() from unlock().
+	 */
+	compiler_barrier();
+	
+	/* Preempted a reader critical section for the first time. */
+	if (RCU_CNT_INC <= nesting_cnt && !(nesting_cnt & RCU_WAS_PREEMPTED)) {
+		nesting_cnt |= RCU_WAS_PREEMPTED;
+		note_preempted_reader();
+	}
+	
+	/* Save the thread's nesting count when it is not running. */
+	THREAD->rcu.nesting_cnt = nesting_cnt;
+
+	if (CPU->rcu.last_seen_gp != _rcu_cur_gp) {
+		/* 
+		 * Contain any memory accesses of old readers before announcing a QS. 
+		 * Also make changes from the previous GP visible to this cpu.
+		 * Moreover it separates writing to last_seen_gp from 
+		 * note_preempted_reader().
+		 */
+		memory_barrier();
+		/* 
+		 * The preempted reader has been noted globally. There are therefore
+		 * no readers running on this cpu so this is a quiescent state.
+		 * 
+		 * Reading the multiword _rcu_cur_gp non-atomically is benign. 
+		 * At worst, the read value will be different from the actual value.
+		 * As a result, both the detector and this cpu will believe
+		 * this cpu has not yet passed a QS although it really did.
+		 * 
+		 * Reloading _rcu_cur_gp is benign, because it cannot change
+		 * until this cpu acknowledges it passed a QS by writing to
+		 * last_seen_gp. Since interrupts are disabled, only this
+		 * code may do so (IPIs won't get through).
+		 */
+		CPU->rcu.last_seen_gp = _rcu_cur_gp;
+	}
+
+	/* 
+	 * Forcefully associate the reclaimer with the highest priority
+	 * even if preempted due to its time slice running out.
+	 */
+	if (THREAD == CPU->rcu.reclaimer_thr) {
+		THREAD->priority = -1;
+	} 
+	
+	upd_max_cbs_in_slice(CPU->rcu.arriving_cbs_cnt);
+}
+
+/** Called by the scheduler() when switching to a newly scheduled thread.
+ * 
+ * Restores the reader nesting count that rcu_after_thread_ran() saved
+ * in the thread when it was last switched out.
+ */
+void rcu_before_thread_runs(void)
+{
+	ASSERT(!rcu_read_locked());
+	
+	/* Load the thread's saved nesting count from before it was preempted. */
+	THE->rcu_nesting = THREAD->rcu.nesting_cnt;
+}
+
+/** Invoked from scheduler() for a thread that is exiting.
+ *
+ * Preemption or interrupts are disabled and scheduler() has already
+ * switched away from the thread, ie rcu_after_thread_ran() has run.
+ */
+void rcu_thread_exiting(void)
+{
+	ASSERT(THE->rcu_nesting == 0);
+
+	/* The common case: the thread left all of its reader sections. */
+	if (THREAD->rcu.nesting_cnt < RCU_CNT_INC)
+		return;
+
+	/*
+	 * The thread is exiting inside a reader section, which is a bug.
+	 * Instead of allowing the whole system to lock up, leave the reader
+	 * section on the thread's behalf. That is safe since an exiting
+	 * thread no longer holds any references.
+	 */
+
+	/* Emulate _rcu_preempted_unlock() with the proper nesting count. */
+	if (THREAD->rcu.nesting_cnt & RCU_WAS_PREEMPTED) {
+		rm_preempted_reader();
+	}
+
+	printf("Bug: thread (id %" PRIu64 " \"%s\") exited while in RCU read"
+		" section.\n", THREAD->tid, THREAD->name);
+}
+
+/** Returns true if in an rcu reader section.
+ * 
+ * THE->rcu_nesting holds the nesting count in units of RCU_CNT_INC;
+ * values below RCU_CNT_INC mean no reader section is entered.
+ */
+bool rcu_read_locked(void)
+{
+	return RCU_CNT_INC <= THE->rcu_nesting;
+}
+
+/** Invoked when a preempted reader finally exits its reader section.
+ * 
+ * Clears THE->rcu_nesting and, if it held just the RCU_WAS_PREEMPTED
+ * mark, removes the reader from the list of preempted readers.
+ */
+void _rcu_preempted_unlock(void)
+{
+	ASSERT(0 == THE->rcu_nesting || RCU_WAS_PREEMPTED == THE->rcu_nesting);
+	
+	/* Only whoever observes the mark set performs the removal. */
+	size_t prev = local_atomic_exchange(&THE->rcu_nesting, 0);
+	if (prev == RCU_WAS_PREEMPTED) {
+		/* 
+		 * NMI handlers are never preempted but may call rm_preempted_reader()
+		 * if a NMI occurred in _rcu_preempted_unlock() of a preempted thread.
+		 * The only other rcu code that may have been interrupted by the NMI
+		 * in _rcu_preempted_unlock() is: an IPI/sample_local_cpu() and
+		 * the initial part of rcu_after_thread_ran().
+		 * 
+		 * rm_preempted_reader() will not deadlock because none of the locks
+		 * it uses are locked in this case. Neither _rcu_preempted_unlock()
+		 * nor sample_local_cpu() nor the initial part of rcu_after_thread_ran()
+		 * acquire any locks.
+		 */
+		rm_preempted_reader();
+	}
+}
+
+#elif defined(RCU_PREEMPT_PODZIMEK)
+
+/** Waits for the grace period associated with callbacks cur_cbs to elapse. 
+ * 
+ * First tries a lock-free check of rcu.completed_gp. If that is not
+ * conclusive, it rechecks under rcu.gp_lock, posts a detection request
+ * to the detector and sleeps until the grace period ends.
+ * 
+ * @param expedite Instructs the detector to aggressively speed up grace 
+ *            period detection without any delay.
+ * @param completed_gp Returns the most recent completed grace period 
+ *            number.
+ * @return false if the thread was interrupted and should stop.
+ */
+static bool wait_for_cur_cbs_gp_end(bool expedite, rcu_gp_t *completed_gp)
+{
+	/* 
+	 * Use a possibly outdated version of completed_gp to bypass checking
+	 * with the lock.
+	 * 
+	 * Note that loading and storing rcu.completed_gp is not atomic 
+	 * (it is 64bit wide). Reading a clobbered value that is less than 
+	 * rcu.completed_gp is harmless - we'll recheck with a lock. The 
+	 * only way to read a clobbered value that is greater than the actual 
+	 * value is if the detector increases the higher-order word first and 
+	 * then decreases the lower-order word (or we see stores in that order), 
+	 * eg when incrementing from 2^32 - 1 to 2^32. The loaded value 
+	 * suddenly jumps by 2^32. It would take hours for such an increase 
+	 * to occur so it is safe to discard the value. We allow increases 
+	 * of up to half the maximum to generously accommodate for loading an
+	 * outdated lower word.
+	 */
+	rcu_gp_t compl_gp = ACCESS_ONCE(rcu.completed_gp);
+	if (CPU->rcu.cur_cbs_gp <= compl_gp 
+		&& compl_gp <= CPU->rcu.cur_cbs_gp + UINT32_MAX_HALF) {
+		*completed_gp = compl_gp;
+		return true;
+	}
+	
+	spinlock_lock(&rcu.gp_lock);
+	
+	/* Reliable recheck now that the lock is held. */
+	if (CPU->rcu.cur_cbs_gp <= rcu.completed_gp) {
+		*completed_gp = rcu.completed_gp;
+		spinlock_unlock(&rcu.gp_lock);
+		return true;
+	}
+	
+	ASSERT(CPU->rcu.cur_cbs_gp <= CPU->rcu.next_cbs_gp);
+	ASSERT(_rcu_cur_gp <= CPU->rcu.cur_cbs_gp);
+	
+	/* 
+	 * Notify the detector of how many GP ends we intend to wait for, so 
+	 * it can avoid going to sleep unnecessarily. Optimistically assume
+	 * new callbacks will arrive while we're waiting; hence +1.
+	 */
+	size_t remaining_gp_ends = (size_t) (CPU->rcu.next_cbs_gp - _rcu_cur_gp);
+	req_detection(remaining_gp_ends + (arriving_cbs_empty() ? 0 : 1));
+	
+	/* 
+	 * Ask the detector to speed up GP detection if there are too many 
+	 * pending callbacks and other reclaimers have not already done so.
+	 */
+	if (expedite) {
+		if(0 == rcu.req_expedited_cnt) 
+			condvar_signal(&rcu.expedite_now);
+		
+		/* 
+		 * Expedite only cur_cbs. If there really is a surge of callbacks 
+		 * the arriving batch will expedite the GP for the huge number
+		 * of callbacks currently in next_cbs
+		 */
+		rcu.req_expedited_cnt = 1;
+	}
+
+	/* Wait for cur_cbs_gp to end. */
+	bool interrupted = cv_wait_for_gp(CPU->rcu.cur_cbs_gp);
+	
+	*completed_gp = rcu.completed_gp;
+	spinlock_unlock(&rcu.gp_lock);	
+	
+	/* The statistic is only updated when the wait ran to completion. */
+	if (!interrupted)
+		upd_missed_gp_in_wait(*completed_gp);
+	
+	return !interrupted;
+}
+
+/** Sleeps on rcu.gp_ended until the grace period wait_on_gp has completed.
+ *
+ * rcu.gp_lock must be held by the caller.
+ *
+ * @param wait_on_gp Number of the grace period to wait for.
+ * @return true if the wait was interrupted, false if the GP ended.
+ */
+static bool cv_wait_for_gp(rcu_gp_t wait_on_gp)
+{
+	ASSERT(spinlock_locked(&rcu.gp_lock));
+
+	while (rcu.completed_gp < wait_on_gp) {
+		int ret = _condvar_wait_timeout_spinlock(&rcu.gp_ended, &rcu.gp_lock,
+			SYNCH_NO_TIMEOUT, SYNCH_FLAGS_INTERRUPTIBLE);
+
+		if (ret == ESYNCH_INTERRUPTED)
+			return true;
+	}
+
+	return false;
+}
+
+/** Asks the detector to detect at least req_cnt consecutive grace periods.
+ *
+ * @param req_cnt Number of grace period ends the caller needs announced.
+ */
+static void req_detection(size_t req_cnt)
+{
+	/* Enough grace period ends have already been requested. */
+	if (req_cnt <= rcu.req_gp_end_cnt)
+		return;
+
+	bool was_idle = (0 == rcu.req_gp_end_cnt);
+	rcu.req_gp_end_cnt = req_cnt;
+
+	/* A detector with no pending requests has to be woken up. */
+	if (was_idle) {
+		ASSERT(_rcu_cur_gp == rcu.completed_gp);
+		condvar_signal(&rcu.req_gp_changed);
+	}
+}
+
+
+/** Body of the detector thread: detects grace period ends on request.
+ *
+ * Each iteration waits for a detection request, starts a new grace
+ * period, waits for preexisting readers and announces the end of the
+ * grace period to reclaimers. Returns once any wait is interrupted.
+ *
+ * @param arg Unused.
+ */
+static void detector(void *arg)
+{
+	spinlock_lock(&rcu.gp_lock);
+
+	while (wait_for_detect_req()) {
+		/*
+		 * Announce that a new GP started. From now on readers lazily
+		 * acknowledge that they passed a QS.
+		 */
+		start_new_gp();
+
+		spinlock_unlock(&rcu.gp_lock);
+
+		/* Interrupted while the lock is not held - just leave. */
+		if (!wait_for_readers())
+			return;
+
+		spinlock_lock(&rcu.gp_lock);
+
+		/* Reclaimers may now invoke the callbacks queued for this GP. */
+		end_cur_gp();
+	}
+
+	spinlock_unlock(&rcu.gp_lock);
+}
+
+/** Sleeps until a reclaimer requests the detection of a grace period.
+ *
+ * rcu.gp_lock must be held by the caller.
+ *
+ * @return false if the wait was interrupted, true if there is a request.
+ */
+static bool wait_for_detect_req(void)
+{
+	ASSERT(spinlock_locked(&rcu.gp_lock));
+
+	while (0 == rcu.req_gp_end_cnt) {
+		int ret = _condvar_wait_timeout_spinlock(&rcu.req_gp_changed,
+			&rcu.gp_lock, SYNCH_NO_TIMEOUT, SYNCH_FLAGS_INTERRUPTIBLE);
+
+		if (ret == ESYNCH_INTERRUPTED)
+			return false;
+	}
+
+	return true;
+}
+
+/** Marks the current GP as completed and wakes up all waiters of gp_ended. */
+static void end_cur_gp(void)
+{
+	ASSERT(spinlock_locked(&rcu.gp_lock));
+	/* The current GP is over, so one of the requested GP ends is served. */
+	rcu.completed_gp = _rcu_cur_gp;
+	--rcu.req_gp_end_cnt;
+	
+	condvar_broadcast(&rcu.gp_ended);
+}
+
+/** Waits for readers predating the current GP; false if any wait fails. */
+static bool wait_for_readers(void)
+{
+	DEFINE_CPU_MASK(reading_cpus);
+	
+	/* All running cpus have potential readers. */
+	cpu_mask_active(reading_cpus);
+
+	/* 
+	 * Give readers time to pass through a QS. Also, batch arriving 
+	 * callbacks in order to amortize detection overhead.
+	 */
+	if (!gp_sleep())
+		return false;
+	
+	/* Non-intrusively determine which cpus have yet to pass a QS. */
+	rm_quiescent_cpus(reading_cpus);
+	
+	/* Actively interrupt cpus delaying the current GP and demand a QS. */
+	interrupt_delaying_cpus(reading_cpus);
+	
+	/* Wait for the interrupted cpus to notify us that they reached a QS. */
+	if (!wait_for_delaying_cpus())
+		return false;
+	/*
+	 * All cpus recorded a QS or are still idle. Any new readers will be added
+	 * to next_preempted if preempted, ie the number of readers in
+	 * cur_preempted monotonically decreases.
+	 */
+	
+	/* Wait for the last reader in cur_preempted to notify us it is done. */
+	if (!wait_for_preempt_reader())
+		return false;
+	
+	return true;
+}
+
+/** Sleeps unless the current GP is to be expedited; false if interrupted. */
+static bool gp_sleep(void)
+{
+	spinlock_lock(&rcu.gp_lock);
+
+	int ret = 0;
+	while (0 == rcu.req_expedited_cnt && 0 == ret) {
+		/* minor bug: sleeps for the same duration if woken up spuriously. */
+		ret = _condvar_wait_timeout_spinlock(&rcu.expedite_now, &rcu.gp_lock,
+			DETECT_SLEEP_MS * 1000, SYNCH_FLAGS_INTERRUPTIBLE);
+	}
+	/* Consume one pending expedite request, whether or not we slept. */
+	if (0 < rcu.req_expedited_cnt) {
+		--rcu.req_expedited_cnt;
+		/* Update statistic. */
+		++rcu.stat_expedited_cnt;
+	}
+	
+	spinlock_unlock(&rcu.gp_lock);
+	
+	return (ret != ESYNCH_INTERRUPTED);
+}
+
+/** Actively interrupts and checks the offending cpus for quiescent states. */
+static void interrupt_delaying_cpus(cpu_mask_t *cpu_mask)
+{
+	atomic_set(&rcu.delaying_cpu_cnt, 0);
+	/* sample_local_cpu() bumps delaying_cpu_cnt for each cpu in a reader CS. */
+	sample_cpus(cpu_mask, NULL);
+}
+
+/** Invoked via smp_call() on a cpu delaying grace period detection.
+ * 
+ * Records a quiescent state for the cpu or instructs the reader running
+ * on it to notify the detector once it finishes. The arg is not used.
+ */
+static void sample_local_cpu(void *arg)
+{
+	ASSERT(interrupts_disabled());
+	ASSERT(!CPU->rcu.is_delaying_gp);
+	
+	/* Cpu did not pass a quiescent state yet. */
+	if (CPU->rcu.last_seen_gp != _rcu_cur_gp) {
+		/* Interrupted a reader in a reader critical section. */
+		if (0 < CPU->rcu.nesting_cnt) {
+			ASSERT(!CPU->idle);
+			/* 
+			 * Note to notify the detector from rcu_read_unlock(). 
+			 * 
+			 * ACCESS_ONCE ensures the compiler writes to is_delaying_gp
+			 * only after it determines that we are in a reader CS.
+			 */
+			ACCESS_ONCE(CPU->rcu.is_delaying_gp) = true;
+			CPU->rcu.signal_unlock = true;
+			/* wait_for_delaying_cpus() waits for that many notifications. */
+			atomic_inc(&rcu.delaying_cpu_cnt);
+		} else {
+			/* 
+			 * The cpu did not enter any rcu reader sections since 
+			 * the start of the current GP. Record a quiescent state.
+			 * 
+			 * Or, we interrupted rcu_read_unlock_impl() right before
+			 * it recorded a QS. Record a QS for it. The memory barrier 
+			 * contains the reader section's mem accesses before 
+			 * updating last_seen_gp.
+			 * 
+			 * Or, we interrupted rcu_read_lock() right after it recorded
+			 * a QS for the previous GP but before it got a chance to
+			 * increment its nesting count. The memory barrier again
+			 * stops the CS code from spilling out of the CS.
+			 */
+			memory_barrier();
+			CPU->rcu.last_seen_gp = _rcu_cur_gp;
+		}
+	} else {
+		/* 
+		 * This cpu already acknowledged that it had passed through 
+		 * a quiescent state since the start of cur_gp. 
+		 */
+	}
+	
+	/* 
+	 * smp_call() makes sure any changes propagate back to the caller.
+	 * In particular, it makes the most current last_seen_gp visible
+	 * to the detector.
+	 */
+}
+
+/** Waits until every cpu counted in delaying_cpu_cnt has notified us. */
+static bool wait_for_delaying_cpus(void)
+{
+	const int delayed = atomic_get(&rcu.delaying_cpu_cnt);
+
+	/* One semaphore_down() per delaying cpu; give up if a wait fails. */
+	for (int left = delayed; 0 < left; --left) {
+		if (!semaphore_down_interruptable(&rcu.remaining_readers))
+			return false;
+	}
+
+	/* Update statistic. */
+	rcu.stat_delayed_cnt += delayed;
+	return true;
+}
+
+/** Called by the scheduler() when switching away from the current thread. */
+void rcu_after_thread_ran(void)
+{
+	ASSERT(interrupts_disabled());
+
+	/* 
+	 * Prevent NMI handlers from interfering. The detector will be notified
+	 * in this function if CPU->rcu.is_delaying_gp. The current thread is 
+	 * no longer running so there is nothing else to signal to the detector.
+	 */
+	CPU->rcu.signal_unlock = false;
+	/* 
+	 * Separates clearing of .signal_unlock from accesses to 
+	 * THREAD->rcu.was_preempted and CPU->rcu.nesting_cnt.
+	 */
+	compiler_barrier();
+	
+	/* Save the thread's nesting count when it is not running. */
+	THREAD->rcu.nesting_cnt = CPU->rcu.nesting_cnt;
+	
+	/* Preempted a reader critical section for the first time. */
+	if (0 < THREAD->rcu.nesting_cnt && !THREAD->rcu.was_preempted) {
+		THREAD->rcu.was_preempted = true;
+		note_preempted_reader();
+	}
+	
+	/* 
+	 * The preempted reader has been noted globally. There are therefore
+	 * no readers running on this cpu so this is a quiescent state.
+	 */
+	_rcu_record_qs();
+
+	/* 
+	 * Interrupt handlers might use RCU while idle in scheduler(). 
+	 * The preempted reader has been noted globally, so the handlers 
+	 * may now start announcing quiescent states.
+	 */
+	CPU->rcu.nesting_cnt = 0;
+	
+	/* 
+	 * This cpu is holding up the current GP. Let the detector know 
+	 * it has just passed a quiescent state. 
+	 * 
+	 * The detector waits separately for preempted readers, so we have 
+	 * to notify the detector even if we have just preempted a reader.
+	 */
+	if (CPU->rcu.is_delaying_gp) {
+		CPU->rcu.is_delaying_gp = false;
+		semaphore_up(&rcu.remaining_readers);
+	}
+
+	/* 
+	 * Forcefully associate the detector and this cpu's reclaimer with the
+	 * highest priority even if preempted due to a time slice running out.
+	 * 
+	 * todo: Replace with strict scheduler priority classes.
+	 */
+	if (THREAD == rcu.detector_thr) {
+		THREAD->priority = -1;
+	} 
+	else if (THREAD == CPU->rcu.reclaimer_thr) {
+		THREAD->priority = -1;
+	} 
+	/* Update the statistic of callbacks that arrived per time slice. */
+	upd_max_cbs_in_slice(CPU->rcu.arriving_cbs_cnt);
+}
+
+/** Called by the scheduler() when switching to a newly scheduled thread. */
+void rcu_before_thread_runs(void)
+{
+	ASSERT(PREEMPTION_DISABLED || interrupts_disabled());
+	ASSERT(0 == CPU->rcu.nesting_cnt);
+	
+	/* Load the thread's saved nesting count from before it was preempted. */
+	CPU->rcu.nesting_cnt = THREAD->rcu.nesting_cnt;
+	
+	/* 
+	 * Ensures NMIs see the proper nesting count before .signal_unlock.
+	 * Otherwise an NMI may incorrectly signal that a preempted reader
+	 * exited its reader section.
+	 */
+	compiler_barrier();
+	
+	/* 
+	 * In the unlikely event that a NMI occurs between the loading of the 
+	 * variables and setting signal_unlock, the NMI handler may invoke 
+	 * rcu_read_unlock() and clear signal_unlock. In that case we will
+	 * incorrectly overwrite signal_unlock from false to true. This event
+	 * is benign and the next rcu_read_unlock() will at worst 
+	 * needlessly invoke _rcu_signal_unlock().
+	 */
+	CPU->rcu.signal_unlock = THREAD->rcu.was_preempted || CPU->rcu.is_delaying_gp;
+}
+
+/** Called from scheduler() when exiting the current thread. 
+ * 
+ * Preemption or interrupts are disabled and the scheduler() already
+ * switched away from the current thread, calling rcu_after_thread_ran().
+ */
+void rcu_thread_exiting(void)
+{
+	ASSERT(THREAD != NULL);
+	ASSERT(THREAD->state == Exiting);
+	ASSERT(PREEMPTION_DISABLED || interrupts_disabled());
+	
+	/* 
+	 * The thread forgot to exit its reader critical section. It is a bug,
+	 * but rather than letting the entire system lock up, forcefully leave
+	 * the reader section: the nesting count is collapsed to 1 so that a
+	 * single read_unlock_impl() presumably exits it. This is safe since
+	 * the exiting thread is not holding any references anyway.
+	 */
+	if (0 < THREAD->rcu.nesting_cnt) {
+		THREAD->rcu.nesting_cnt = 1;
+		read_unlock_impl(&THREAD->rcu.nesting_cnt);
+
+		printf("Bug: thread (id %" PRIu64 " \"%s\") exited while in RCU read"
+			" section.\n", THREAD->tid, THREAD->name);
+	}
+}
+
+
+#endif /* RCU_PREEMPT_PODZIMEK */
+
+/** Announces the start of a new grace period for preexisting readers to ack. */
+static void start_new_gp(void)
+{
+	ASSERT(spinlock_locked(&rcu.gp_lock));
+	/* preempt_lock is taken while gp_lock is held. */
+	irq_spinlock_lock(&rcu.preempt_lock, true);
+	
+	/* Start a new GP. Announce to readers that a quiescent state is needed. */
+	++_rcu_cur_gp;
+	
+	/* 
+	 * Readers preempted before the start of this GP (next_preempted)
+	 * are preexisting readers now that a GP started and will hold up 
+	 * the current GP until they exit their reader sections.
+	 * 
+	 * Preempted readers from the previous GP have finished so 
+	 * cur_preempted is empty, but see comment in _rcu_record_qs(). 
+	 */
+	list_concat(&rcu.cur_preempted, &rcu.next_preempted);
+	
+	irq_spinlock_unlock(&rcu.preempt_lock, true);
+}
+
+/** Remove those cpus from the mask that have already passed a quiescent
+ * state since the start of the current grace period, or that are idle.
+ */
+static void rm_quiescent_cpus(cpu_mask_t *cpu_mask)
+{
+	/*
+	 * Ensure the announcement of the start of a new GP (ie up-to-date 
+	 * cur_gp) propagates to cpus that are just coming out of idle 
+	 * mode before we sample their idle state flag.
+	 * 
+	 * Cpus guarantee that after they set CPU->idle = true they will not
+	 * execute any RCU reader sections without first setting idle to
+	 * false and issuing a memory barrier. Therefore, if rm_quiescent_cpus()
+	 * later on sees an idle cpu, but the cpu is just exiting its idle mode,
+	 * the cpu must not have yet executed its memory barrier (otherwise
+	 * it would pair up with this mem barrier and we would see idle == false).
+	 * That memory barrier will pair up with the one below and ensure
+	 * that a reader on the now-non-idle cpu will see the most current
+	 * cur_gp. As a result, such a reader will never attempt to semaphore_up(
+	 * remaining_readers) during this GP, which allows the detector to
+	 * ignore that cpu (the detector thinks it is idle). Moreover, any
+	 * changes made by RCU updaters will have propagated to readers
+	 * on the previously idle cpu -- again thanks to issuing a memory
+	 * barrier after returning from idle mode.
+	 * 
+	 * idle -> non-idle cpu      | detector      | reclaimer
+	 * ------------------------------------------------------
+	 * rcu reader 1              |               | rcu_call()
+	 * MB X                      |               |
+	 * idle = true               |               | rcu_call() 
+	 * (no rcu readers allowed ) |               | MB A in advance_cbs() 
+	 * MB Y                      | (...)         | (...)
+	 * (no rcu readers allowed)  |               | MB B in advance_cbs() 
+	 * idle = false              | ++cur_gp      |
+	 * (no rcu readers allowed)  | MB C          |
+	 * MB Z                      | signal gp_end |
+	 * rcu reader 2              |               | exec_cur_cbs()
+	 * 
+	 * 
+	 * MB Y orders visibility of changes to idle for detector's sake.
+	 * 
+	 * MB Z pairs up with MB C. The cpu making a transition from idle 
+	 * will see the most current value of cur_gp and will not attempt
+	 * to notify the detector even if preempted during this GP.
+	 * 
+	 * MB Z pairs up with MB A from the previous batch. Updaters' changes
+	 * are visible to reader 2 even when the detector thinks the cpu is idle 
+	 * but it is not anymore.
+	 * 
+	 * MB X pairs up with MB B. Late mem accesses of reader 1 are contained
+	 * and visible before idling and before any callbacks are executed 
+	 * by reclaimers.
+	 * 
+	 * In summary, the detector does not know of or wait for reader 2, but
+	 * it does not have to since it is a new reader that will not access
+	 * data from previous GPs and will see any changes.
+	 */
+	memory_barrier(); /* MB C */
+	
+	cpu_mask_for_each(*cpu_mask, cpu_id) {
+		/* 
+		 * The cpu already checked for and passed through a quiescent 
+		 * state since the beginning of this GP.
+		 * 
+		 * _rcu_cur_gp is modified by local detector thread only. 
+		 * Therefore, it is up-to-date even without a lock. 
+		 * 
+		 * cpu.last_seen_gp may not be up-to-date. At worst, we will
+		 * unnecessarily sample its last_seen_gp with a smp_call. 
+		 */
+		bool cpu_acked_gp = (cpus[cpu_id].rcu.last_seen_gp == _rcu_cur_gp);
+		
+		/*
+		 * Either the cpu is idle or it is exiting away from idle mode
+		 * and already sees the most current _rcu_cur_gp. See the comment
+		 * preceding MB C at the top of this function.
+		 */
+		bool cpu_idle = cpus[cpu_id].idle;
+		
+		if (cpu_acked_gp || cpu_idle) {
+			cpu_mask_reset(cpu_mask, cpu_id);
+		}
+	}
+}
+
+/** Serially invokes sample_local_cpu(arg) on each cpu of reader_cpus. */
+static void sample_cpus(cpu_mask_t *reader_cpus, void *arg)
+{
+	cpu_mask_for_each(*reader_cpus, cpu_id) {
+		smp_call(cpu_id, sample_local_cpu, arg);
+
+		/* Update statistic; calls targeting the local cpu are not counted. */
+		if (CPU->id != cpu_id)
+			++rcu.stat_smp_call_cnt;
+	}
+}
+
+/** Accounts the GPs completed past this cpu's cur_cbs_gp as missed. */
+static void upd_missed_gp_in_wait(rcu_gp_t completed_gp)
+{
+	ASSERT(CPU->rcu.cur_cbs_gp <= completed_gp);
+	CPU->rcu.stat_missed_gp_in_wait +=
+		(size_t)(completed_gp - CPU->rcu.cur_cbs_gp);
+}
+
+/** Queues the current thread on a global list of preempted readers. */
+static void note_preempted_reader(void)
+{
+	irq_spinlock_lock(&rcu.preempt_lock, false);
+
+	/*
+	 * No QS recorded by this cpu in the current GP: the reader started
+	 * before the GP did and we must wait for it (cur_preempted). Otherwise
+	 * the reader started after the GP started and this cpu already noted
+	 * a quiescent state, so it might block the next GP (next_preempted).
+	 */
+	bool blocks_cur_gp = (CPU->rcu.last_seen_gp != _rcu_cur_gp);
+	list_t *readers = blocks_cur_gp ? &rcu.cur_preempted : &rcu.next_preempted;
+
+	list_append(&THREAD->rcu.preempt_link, readers);
+
+	irq_spinlock_unlock(&rcu.preempt_lock, false);
+}
+
+/** Unlinks the current thread from the global lists of preempted readers. */
+static void rm_preempted_reader(void)
+{
+	irq_spinlock_lock(&rcu.preempt_lock, true);
+
+	ASSERT(link_used(&THREAD->rcu.preempt_link));
+
+	/* Sample cur_preempted before unlinking to detect if we empty it. */
+	bool had_cur_readers = !list_empty(&rcu.cur_preempted);
+	list_remove(&THREAD->rcu.preempt_link);
+
+	/*
+	 * cur_preempted has just become empty, so this thread was the last
+	 * preempted reader of the current GP. If the detector is blocked
+	 * waiting for preempted readers, let it know they are all done.
+	 */
+	if (had_cur_readers && list_empty(&rcu.cur_preempted)) {
+		if (rcu.preempt_blocking_det) {
+			rcu.preempt_blocking_det = false;
+			semaphore_up(&rcu.remaining_readers);
+		}
+	}
+
+	irq_spinlock_unlock(&rcu.preempt_lock, true);
+}
+
+/** Blocks until no preempted reader holds up the current grace period. */
+static bool wait_for_preempt_reader(void)
+{
+	irq_spinlock_lock(&rcu.preempt_lock, true);
+
+	/* Tell rm_preempted_reader() whether it has to wake us up. */
+	bool must_wait = !list_empty(&rcu.cur_preempted);
+	rcu.preempt_blocking_det = must_wait;
+
+	irq_spinlock_unlock(&rcu.preempt_lock, true);
+
+	/* cur_preempted is empty: there is nobody to wait for. */
+	if (!must_wait)
+		return true;
+
+	rcu.stat_preempt_blocking_cnt++;
+
+	return semaphore_down_interruptable(&rcu.remaining_readers);
+}
+
+/** Updates the max count of callbacks that arrived between two calls. */
+static void upd_max_cbs_in_slice(size_t arriving_cbs_cnt)
+{
+	rcu_cpu_data_t *cpu_rcu = &CPU->rcu;
+
+	if (cpu_rcu->last_arriving_cnt < arriving_cbs_cnt) {
+		size_t grown_by = arriving_cbs_cnt - cpu_rcu->last_arriving_cnt;
+		cpu_rcu->stat_max_slice_cbs = max(grown_by, cpu_rcu->stat_max_slice_cbs);
+	}
+	cpu_rcu->last_arriving_cnt = arriving_cbs_cnt;
+}
+
+/** Prints the RCU configuration and run-time statistics using printf(). */
+void rcu_print_stat(void)
+{
+	/*
+	 * No locks are taken here, so the printed values may be out-dated.
+	 * The cpu local counters are updated without locks anyway, ie there
+	 * is nothing to lock in order to read their current values.
+	 */
+	unsigned int cpu;
+
+#ifdef RCU_PREEMPT_PODZIMEK
+	const char *algo = "podzimek-preempt-rcu";
+#elif defined(RCU_PREEMPT_A)
+	const char *algo = "a-preempt-rcu";
+#endif
+
+	/* Configuration constants and the name of the algorithm in use. */
+	printf("Config: expedite_threshold=%d, critical_threshold=%d,"
+		" detect_sleep=%dms, %s\n",
+		EXPEDITE_THRESHOLD, CRITICAL_THRESHOLD, DETECT_SLEEP_MS, algo);
+
+	/* Global counters. */
+	printf("Completed GPs: %" PRIu64 "\n", rcu.completed_gp);
+	printf("Expedited GPs: %zu\n", rcu.stat_expedited_cnt);
+	printf("Delayed GPs:   %zu (cpus w/ still running readers after gp sleep)\n",
+		rcu.stat_delayed_cnt);
+	printf("Preempt blocked GPs: %zu (waited for preempted readers; "
+		"running or not)\n", rcu.stat_preempt_blocking_cnt);
+	printf("Smp calls:     %zu\n", rcu.stat_smp_call_cnt);
+
+	/* Per-cpu counters, one line of space separated values each. */
+	printf("Max arrived callbacks per GP and CPU:\n");
+	for (cpu = 0; cpu < config.cpu_count; cpu++)
+		printf(" %zu", cpus[cpu].rcu.stat_max_cbs);
+
+	printf("\nAvg arrived callbacks per GP and CPU (nonempty batches only):\n");
+	for (cpu = 0; cpu < config.cpu_count; cpu++)
+		printf(" %zu", cpus[cpu].rcu.stat_avg_cbs);
+
+	printf("\nMax arrived callbacks per time slice and CPU:\n");
+	for (cpu = 0; cpu < config.cpu_count; cpu++)
+		printf(" %zu", cpus[cpu].rcu.stat_max_slice_cbs);
+
+	printf("\nMissed GP notifications per CPU:\n");
+	for (cpu = 0; cpu < config.cpu_count; cpu++)
+		printf(" %zu", cpus[cpu].rcu.stat_missed_gps);
+
+	printf("\nMissed GP notifications per CPU while waking up:\n");
+	for (cpu = 0; cpu < config.cpu_count; cpu++)
+		printf(" %zu", cpus[cpu].rcu.stat_missed_gp_in_wait);
+	printf("\n");
+}
+
+/** @}
+ */
Index: kernel/generic/src/synch/smc.c
===================================================================
--- kernel/generic/src/synch/smc.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/generic/src/synch/smc.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -41,4 +41,5 @@
 #include <arch/barrier.h>
 #include <synch/smc.h>
+#include <mm/as.h>
 
 sysarg_t sys_smc_coherence(uintptr_t va, size_t size)
Index: kernel/generic/src/synch/smp_memory_barrier.c
===================================================================
--- kernel/generic/src/synch/smp_memory_barrier.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
+++ kernel/generic/src/synch/smp_memory_barrier.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup generic
+ * @{
+ */
+
+/**
+ * @file
+ * @brief Syscall implementation that issues a memory barrier on all cpus.
+ */
+
+#include <synch/smp_memory_barrier.h>
+#include <smp/smp_call.h>
+#include <config.h>
+
+/** Intentionally empty; serves only as the function passed to smp_call(). */
+static void issue_mem_bar(void *arg)
+{
+	/* smp_call already issues memory barriers on return from this function */
+}
+
+/** Issues a memory barrier on every cpu with an id below config.cpu_active,
+ * ie not only on the cpus running a thread of the current task.
+ * 
+ * @return Always 0; the value carries no information.
+ */
+sysarg_t sys_smp_memory_barrier(void)
+{
+	for (unsigned int cpu_id = 0; cpu_id < config.cpu_active; ++cpu_id) {
+		smp_call(cpu_id, issue_mem_bar, NULL);
+	}
+	
+	return 0;
+}
+
+/** @}
+ */
Index: kernel/generic/src/synch/spinlock.c
===================================================================
--- kernel/generic/src/synch/spinlock.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/generic/src/synch/spinlock.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -45,4 +45,5 @@
 #include <symtab.h>
 #include <stacktrace.h>
+#include <cpu.h>
 
 #ifdef CONFIG_SMP
@@ -198,7 +199,6 @@
  *
  * @param lock    IRQ spinlock to be locked.
- * @param irq_dis If true, interrupts are actually disabled
- *                prior locking the spinlock. If false, interrupts
- *                are expected to be already disabled.
+ * @param irq_dis If true, disables interrupts before locking the spinlock.
+ *                If false, interrupts are expected to be already disabled.
  *
  */
Index: kernel/generic/src/synch/waitq.c
===================================================================
--- kernel/generic/src/synch/waitq.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/generic/src/synch/waitq.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -57,4 +57,6 @@
 
 static void waitq_sleep_timed_out(void *);
+static void waitq_complete_wakeup(waitq_t *);
+
 
 /** Initialize wait queue
@@ -330,4 +332,18 @@
 		break;
 	default:
+		/* 
+		 * Wait for a waitq_wakeup() or waitq_unsleep() to complete
+		 * before returning from waitq_sleep() to the caller. Otherwise
+		 * the caller might expect that the wait queue is no longer used 
+		 * and deallocate it (although the wakeup on another cpu has 
+		 * not yet completed and is using the wait queue). 
+		 * 
+		 * Note that we have to do this for ESYNCH_OK_BLOCKED and
+		 * ESYNCH_INTERRUPTED, but not necessarily for ESYNCH_TIMEOUT
+		 * where the timeout handler stops using the waitq before waking 
+		 * us up. To be on the safe side, ensure the waitq is not in use 
+		 * anymore in this case as well.
+		 */
+		waitq_complete_wakeup(wq);
 		break;
 	}
@@ -357,5 +373,5 @@
 	} else {
 		if (PARAM_NON_BLOCKING(flags, usec)) {
-			/* Return immediatelly instead of going to sleep */
+			/* Return immediately instead of going to sleep */
 			return ESYNCH_WOULD_BLOCK;
 		}
@@ -442,4 +458,48 @@
 	irq_spinlock_unlock(&wq->lock, true);
 }
+
+/** If there is a wakeup in progress actively waits for it to complete.
+ * 
+ * The function returns once the concurrently running waitq_wakeup()
+ * exits. It returns immediately if there are no concurrent wakeups 
+ * at the time.
+ * 
+ * Interrupts must be disabled.
+ * 
+ * Example usage:
+ * @code
+ * void callback(waitq *wq)
+ * {
+ *     // Do something and notify wait_for_completion() that we're done.
+ *     waitq_wakeup(wq);
+ * }
+ * void wait_for_completion(void) 
+ * {
+ *     waitq wq;
+ *     waitq_initialize(&wq);
+ *     // Run callback() in the background, pass it wq.
+ *     do_asynchronously(callback, &wq);
+ *     // Wait for callback() to complete its work.
+ *     waitq_sleep(&wq);
+ *     // callback() completed its work, but it may still be accessing 
+ *     // wq in waitq_wakeup(). Therefore it is not yet safe to return 
+ *     // from waitq_sleep() or it would clobber up our stack (where wq 
+ *     // is stored). waitq_sleep() ensures the wait queue is no longer
+ *     // in use by invoking waitq_complete_wakeup() internally.
+ *     
+ *     // waitq_sleep() returned, it is safe to free wq.
+ * }
+ * @endcode
+ * 
+ * @param wq  Pointer to a wait queue.
+ */
+static void waitq_complete_wakeup(waitq_t *wq)
+{
+	ASSERT(interrupts_disabled());
+	/* Acquiring wq->lock waits out whoever currently holds it. */
+	irq_spinlock_lock(&wq->lock, false);
+	irq_spinlock_unlock(&wq->lock, false);
+}
+
 
 /** Internal SMP- and IRQ-unsafe version of waitq_wakeup()
Index: kernel/generic/src/synch/workqueue.c
===================================================================
--- kernel/generic/src/synch/workqueue.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
+++ kernel/generic/src/synch/workqueue.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -0,0 +1,974 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** @addtogroup generic
+ * @{
+ */
+
+/**
+ * @file
+ * @brief Work queue/thread pool that automatically adjusts its size
+ *        depending on the current load. Queued work functions may sleep.
+ */
+
+#include <synch/workqueue.h>
+#include <synch/spinlock.h>
+#include <synch/condvar.h>
+#include <synch/mutex.h>
+#include <proc/thread.h>
+#include <config.h>
+#include <arch.h>
+#include <cpu.h>
+#include <macros.h>
+
+#define WORKQ_MAGIC      0xf00c1333U
+#define WORK_ITEM_MAGIC  0xfeec1777U
+
+/** A work queue: queued work items plus the worker threads serving them. */
+struct work_queue {
+	/* 
+	 * Protects everything except activate_worker. 
+	 * Must be acquired after any thread->locks.
+	 */
+	IRQ_SPINLOCK_DECLARE(lock);
+	
+	/* Activates a worker if new work arrives or if shutting down the queue. */
+	condvar_t activate_worker;
+	
+	/* Queue of work_items ready to be dispatched. */
+	list_t queue;
+	
+	/* List of worker threads. */
+	list_t workers;
+	
+	/* Number of work items queued. */
+	size_t item_cnt;
+	
+	/* Indicates the work queue is shutting down. */
+	bool stopping;
+	const char *name; /* Names the queue's lock and its worker threads. */
+
+	/* Number of worker threads; counted before a worker is actually created. */
+	size_t cur_worker_cnt;
+	/* Number of workers waiting for work to arrive. */
+	size_t idle_worker_cnt;
+	/* Number of idle workers signaled that have not yet been woken up. */
+	size_t activate_pending;
+	/* Number of blocked workers sleeping in work func() (ie not idle). */
+	size_t blocked_worker_cnt;
+	
+	/* Number of pending signal_worker_op() operations. */
+	size_t pending_op_cnt;
+	/* NOTE(review): presumably links into nonblock_adder.work_queues - confirm. */
+	link_t nb_link;
+	
+#ifdef CONFIG_DEBUG
+	/* Magic cookie for integrity checks. Immutable. Accessed without lock. */
+	uint32_t cookie;
+#endif 
+};
+
+
+/** Min number of idle workers to keep. */
+static size_t min_worker_cnt;
+/** Max total number of workers - be it blocked, idle, or active. */
+static size_t max_worker_cnt;
+/** Max number of concurrently running active workers, ie not blocked nor idle. */
+static size_t max_concurrent_workers;
+/** Max number of work items per active worker before a new worker is activated.*/
+static const size_t max_items_per_worker = 8;
+	
+/** System wide work queue. */
+static struct work_queue g_work_queue;
+/** True until workq_global_worker_init() clears it. */
+static int booting = true;
+
+/** State of the nonblock adder. NOTE(review): presumably a helper thread - confirm. */
+typedef struct {
+	IRQ_SPINLOCK_DECLARE(lock);
+	condvar_t req_cv;
+	thread_t *thread;
+	list_t work_queues;
+} nonblock_adder_t;
+/** The file-wide nonblock adder instance. */
+static nonblock_adder_t nonblock_adder;
+
+
+
+/** Type of a worker thread signaling operation; see signal_worker_logic(). */
+typedef void (*signal_op_t)(struct work_queue *workq);
+
+
+/* Forward declarations of file-local (static) helpers. */
+static void workq_preinit(struct work_queue *workq, const char *name);
+static bool add_worker(struct work_queue *workq);
+static void interrupt_workers(struct work_queue *workq);
+static void wait_for_workers(struct work_queue *workq);
+static int _workq_enqueue(struct work_queue *workq, work_t *work_item, 
+	work_func_t func, bool can_block);
+static void init_work_item(work_t *work_item, work_func_t func);
+static signal_op_t signal_worker_logic(struct work_queue *workq, bool can_block);
+static void worker_thread(void *arg);
+static bool dequeue_work(struct work_queue *workq, work_t **pwork_item);
+static bool worker_unnecessary(struct work_queue *workq);
+static void cv_wait(struct work_queue *workq);
+static void nonblock_init(void);
+/* Integrity checks used by ASSERT()s; declared in debug builds only. */
+#ifdef CONFIG_DEBUG
+static bool workq_corrupted(struct work_queue *workq);
+static bool work_item_corrupted(work_t *work_item);
+#endif
+
+/** Clears booting and creates a worker thread for the system-wide work queue. */
+void workq_global_worker_init(void)
+{
+	/* 
+	 * No need for additional synchronization. Stores to word-sized 
+	 * variables are atomic and the change will eventually propagate.
+	 * Moreover add_worker() includes the necessary memory barriers
+	 * in spinlock lock/unlock().
+	 */
+	booting = false;
+	
+	nonblock_init();
+	/* Failing to create the very first worker is fatal: panic. */
+	if (!add_worker(&g_work_queue))
+		panic("Could not create a single global work queue worker!\n");
+	
+}
+
+/** Initializes the system wide work queue and support for other work queues. */
+void workq_global_init(void)
+{
+	/* Keep idle workers on 1/4-th of cpus, but at least 2 threads. */
+	min_worker_cnt = max(2, config.cpu_count / 4);
+	/* Allow up to 8 workers (eg sleeping work items) per cpu, at least 32. */
+	max_worker_cnt = max(32, 8 * config.cpu_count);
+	/* Maximum concurrency without slowing down the system. */
+	max_concurrent_workers = max(2, config.cpu_count);
+	/* No worker is created here; see workq_global_worker_init(). */
+	workq_preinit(&g_work_queue, "kworkq");
+}
+
+/** Stops the global work queue via workq_stop(); waits for all work items. */
+void workq_global_stop(void)
+{
+	workq_stop(&g_work_queue);
+}
+
+/** Allocates and initializes a work queue. Returns NULL upon failure. */
+struct work_queue * workq_create(const char *name)
+{
+	struct work_queue *workq = malloc(sizeof(*workq), 0);
+
+	if (!workq)
+		return NULL;
+
+	/* Could not initialize the queue; give the memory back. */
+	if (!workq_init(workq, name)) {
+		free(workq);
+		return NULL;
+	}
+	ASSERT(!workq_corrupted(workq));
+	return workq;
+}
+
+/** Frees work queue resources and stops it if it had not been done so already.*/
+void workq_destroy(struct work_queue *workq)
+{
+	ASSERT(!workq_corrupted(workq));
+	/* Snapshot the queue state under its lock. */
+	irq_spinlock_lock(&workq->lock, true);
+	bool stopped = workq->stopping;
+#ifdef CONFIG_DEBUG
+	size_t running_workers = workq->cur_worker_cnt;
+#endif
+	irq_spinlock_unlock(&workq->lock, true);
+	
+	if (!stopped) {
+		workq_stop(workq);
+	} else {
+		ASSERT(0 == running_workers);
+	}
+	/* Clear the magic cookie; presumably trips workq_corrupted() on reuse. */
+#ifdef CONFIG_DEBUG
+	workq->cookie = 0;
+#endif 
+	
+	free(workq);
+}
+
+/** Initializes workq without creating workers; name is stored, not copied. */
+static void workq_preinit(struct work_queue *workq, const char *name)
+{
+#ifdef CONFIG_DEBUG
+	workq->cookie = WORKQ_MAGIC;
+#endif 
+	
+	irq_spinlock_initialize(&workq->lock, name);
+	condvar_initialize(&workq->activate_worker);
+	
+	list_initialize(&workq->queue);
+	list_initialize(&workq->workers);
+	
+	workq->item_cnt = 0;
+	workq->stopping = false;
+	workq->name = name;
+	/* Pre-count the first worker; add_worker() decrements again on failure. */
+	workq->cur_worker_cnt = 1;
+	workq->idle_worker_cnt = 0;
+	workq->activate_pending = 0;
+	workq->blocked_worker_cnt = 0;
+	
+	workq->pending_op_cnt = 0;
+	link_initialize(&workq->nb_link);
+}
+
+/** Initializes a work queue. Returns true if successful.  
+ * 
+ * Before destroying a work queue it must be stopped via
+ * workq_stop().
+ */
+int workq_init(struct work_queue *workq, const char *name)
+{
+	workq_preinit(workq, name);
+	return add_worker(workq);
+}
+
+/** Adds a worker thread; false if creation failed or workq is stopping. */
+static bool add_worker(struct work_queue *workq)
+{
+	ASSERT(!workq_corrupted(workq));
+
+	thread_t *thread = thread_create(worker_thread, workq, TASK, 
+		THREAD_FLAG_NONE, workq->name);
+	
+	if (!thread) {
+		irq_spinlock_lock(&workq->lock, true);
+		
+		/* Undo the count pre-set by workq_preinit()/signal_worker_logic(). */
+		ASSERT(0 < workq->cur_worker_cnt);
+		--workq->cur_worker_cnt;
+		
+		irq_spinlock_unlock(&workq->lock, true);
+		return false;
+	}
+	
+	/* Respect lock ordering: thread->lock is taken before workq->lock. */
+	irq_spinlock_lock(&thread->lock, true);
+	irq_spinlock_lock(&workq->lock, false);
+
+	bool success;
+
+	if (!workq->stopping) {
+		success = true;
+		
+		/* Try to distribute workers among cpus right away. */
+		unsigned int cpu_id = (workq->cur_worker_cnt) % config.cpu_active;
+		
+		if (!cpus[cpu_id].active)
+			cpu_id = CPU->id;
+
+		thread->workq = workq;	
+		thread->cpu = &cpus[cpu_id];
+		thread->workq_blocked = false;
+		thread->workq_idling = false;
+		link_initialize(&thread->workq_link);
+
+		list_append(&thread->workq_link, &workq->workers);
+	} else {
+		/* 
+		 * Work queue is shutting down - we must not add the worker
+		 * and we cannot destroy it without ready-ing it. Mark it
+		 * interrupted so the worker exits right away without even
+		 * touching workq.
+		 */
+		success = false;
+		
+		/* Undo the count pre-set by workq_preinit()/signal_worker_logic(). */
+		ASSERT(0 < workq->cur_worker_cnt);
+		--workq->cur_worker_cnt;
+	}
+	
+	irq_spinlock_unlock(&workq->lock, false);
+	irq_spinlock_unlock(&thread->lock, true);
+
+	if (!success) {
+		thread_interrupt(thread);
+	}
+		
+	thread_ready(thread);
+	
+	return success;
+}
+
+/** Shuts down the work queue. Waits for all pending work items to complete.  
+ *
+ * workq_stop() may only be run once. 
+ */
+void workq_stop(struct work_queue *workq)
+{
+	ASSERT(!workq_corrupted(workq));
+	
+	interrupt_workers(workq);
+	wait_for_workers(workq);
+}
+
+/** Notifies worker threads the work queue is shutting down. */
+static void interrupt_workers(struct work_queue *workq)
+{
+	irq_spinlock_lock(&workq->lock, true);
+
+	/* workq_stop() may only be called once. */
+	ASSERT(!workq->stopping);
+	workq->stopping = true;
+	
+	/* Respect lock ordering - do not hold workq->lock during broadcast. */
+	irq_spinlock_unlock(&workq->lock, true);
+	
+	condvar_broadcast(&workq->activate_worker);
+}
+
+/** Waits for all worker threads to exit. */
+static void wait_for_workers(struct work_queue *workq)
+{
+	ASSERT(!PREEMPTION_DISABLED);
+	
+	irq_spinlock_lock(&workq->lock, true);
+	
+	list_foreach_safe(workq->workers, cur_worker, next_worker) {
+		thread_t *worker = list_get_instance(cur_worker, thread_t, workq_link);
+		list_remove(cur_worker);
+
+		/* Wait without the lock. */
+		irq_spinlock_unlock(&workq->lock, true);
+		
+		thread_join(worker);
+		thread_detach(worker);
+		
+		irq_spinlock_lock(&workq->lock, true);
+	}
+	
+	ASSERT(list_empty(&workq->workers));
+	
+	/* Wait for deferred add_worker_op(), signal_worker_op() to finish. */
+	while (0 < workq->cur_worker_cnt || 0 < workq->pending_op_cnt) {
+		irq_spinlock_unlock(&workq->lock, true);
+		
+		scheduler();
+		
+		irq_spinlock_lock(&workq->lock, true);
+	}
+	
+	irq_spinlock_unlock(&workq->lock, true);
+}
+
+/** Queues a function into the global work queue without blocking.
+ * 
+ * See workq_enqueue_noblock() for more details.
+ */
+int workq_global_enqueue_noblock(work_t *work_item, work_func_t func)
+{
+	return workq_enqueue_noblock(&g_work_queue, work_item, func);
+}
+
+/** Queues a function into the global work queue; may block.
+ * 
+ * See workq_enqueue() for more details.
+ */
+int workq_global_enqueue(work_t *work_item, work_func_t func)
+{
+	return workq_enqueue(&g_work_queue, work_item, func);
+}
+
+/** Adds a function to be invoked in a separate thread without blocking. 
+ * 
+ * workq_enqueue_noblock() is guaranteed not to block. It is safe 
+ * to invoke from interrupt handlers.
+ * 
+ * Consider using workq_enqueue() instead if at all possible. Otherwise,
+ * your work item may have to wait for previously enqueued sleeping 
+ * work items to complete if you are unlucky.
+ * 
+ * @param workq     Work queue where to queue the work item.
+ * @param work_item Work item bookkeeping structure. Must be valid
+ *                  until func() is entered.
+ * @param func      User supplied function to invoke in a worker thread.
+ *
+ * @return false if work queue is shutting down; function is not 
+ *               queued for further processing. 
+ * @return true  Otherwise. func() will be invoked in a separate thread.
+ */
+int workq_enqueue_noblock(struct work_queue *workq, work_t *work_item, 
+	work_func_t func)
+{
+	return _workq_enqueue(workq, work_item, func, false);
+}
+
+/** Adds a function to be invoked in a separate thread; may block. 
+ * 
+ * While the workq_enqueue() is unlikely to block, it may do so if too 
+ * many previous work items blocked sleeping.
+ * 
+ * @param workq     Work queue where to queue the work item.
+ * @param work_item Work item bookkeeping structure. Must be valid
+ *                  until func() is entered.
+ * @param func      User supplied function to invoke in a worker thread.
+ *
+ * @return false if work queue is shutting down; function is not 
+ *               queued for further processing. 
+ * @return true  Otherwise. func() will be invoked in a separate thread.
+ */
+int workq_enqueue(struct work_queue *workq, work_t *work_item, work_func_t func)
+{
+	return _workq_enqueue(workq, work_item, func, true);
+}
+
+/** Adds a work item that will be processed by a separate worker thread.
+ * 
+ * func() will be invoked in another kernel thread and may block. 
+ * 
+ * Prefer to call _workq_enqueue() with can_block set. Otherwise
+ * your work item may have to wait for sleeping work items to complete.
+ * If all worker threads are blocked/sleeping a new worker thread cannot
+ * be created without can_block set because creating a thread might
+ * block due to low memory conditions.
+ * 
+ * @param workq     Work queue where to queue the work item.
+ * @param work_item Work item bookkeeping structure. Must be valid
+ *                  until func() is entered.
+ * @param func      User supplied function to invoke in a worker thread.
+ * @param can_block May adding this work item block?
+ *
+ * @return false if work queue is shutting down; function is not 
+ *               queued for further processing. 
+ * @return true  Otherwise.
+ */
+static int _workq_enqueue(struct work_queue *workq, work_t *work_item, 
+	work_func_t func, bool can_block)
+{
+	ASSERT(!workq_corrupted(workq));
+	
+	bool success = true;
+	signal_op_t signal_op = NULL;
+	
+	irq_spinlock_lock(&workq->lock, true);
+	
+	if (workq->stopping) {
+		success = false;
+	} else {
+		init_work_item(work_item, func);
+		list_append(&work_item->queue_link, &workq->queue);
+		++workq->item_cnt;
+		success = true;
+		
+		if (!booting) {
+			signal_op = signal_worker_logic(workq, can_block);
+		} else {
+			/* 
+			 * During boot there are no workers to signal. Just queue 
+			 * the work and let future workers take care of it.
+			 */
+		}
+	}
+	
+	irq_spinlock_unlock(&workq->lock, true);
+
+	if (signal_op) {
+		signal_op(workq);
+	}
+	
+	return success;
+}
+
+/** Prepare an item to be added to the work item queue. */
+static void init_work_item(work_t *work_item, work_func_t func)
+{
+#ifdef CONFIG_DEBUG
+	work_item->cookie = WORK_ITEM_MAGIC;
+#endif 
+	
+	link_initialize(&work_item->queue_link);
+	work_item->func = func;
+}
+
+/** Returns the number of workers running work func() that are not blocked. */
+static size_t active_workers_now(struct work_queue *workq)
+{
+	ASSERT(irq_spinlock_locked(&workq->lock));
+	
+	/* Workers blocked are sleeping in the work function (ie not idle). */
+	ASSERT(workq->blocked_worker_cnt <= workq->cur_worker_cnt);
+	/* Idle workers are waiting for more work to arrive in condvar_wait. */
+	ASSERT(workq->idle_worker_cnt <= workq->cur_worker_cnt);
+	
+	/* Idle + blocked workers == sleeping worker threads. */
+	size_t sleeping_workers = workq->blocked_worker_cnt + workq->idle_worker_cnt;
+	
+	ASSERT(sleeping_workers	<= workq->cur_worker_cnt);
+	/* Workers pending activation are idle workers not yet given a time slice. */
+	ASSERT(workq->activate_pending <= workq->idle_worker_cnt);
+	
+	/* 
+	 * Workers actively running the work func() this very moment and 
+	 * are neither blocked nor idle. Exclude ->activate_pending workers 
+	 * since they will run their work func() once they get a time slice 
+	 * and are not running it right now.
+	 */
+	return workq->cur_worker_cnt - sleeping_workers;
+}
+
+/** 
+ * Returns the number of workers that are running or are about to run work 
+ * func() and that are not blocked. 
+ */
+static size_t active_workers(struct work_queue *workq)
+{
+	ASSERT(irq_spinlock_locked(&workq->lock));
+	
+	/* 
+	 * Workers actively running the work func() and are neither blocked nor 
+	 * idle. ->activate_pending workers will run their work func() once they
+	 * get a time slice after waking from a condvar wait, so count them
+	 * as well.
+	 */
+	return active_workers_now(workq) + workq->activate_pending;
+}
+
+static void add_worker_noblock_op(struct work_queue *workq)
+{
+	condvar_signal(&nonblock_adder.req_cv);
+}
+
+static void add_worker_op(struct work_queue *workq)
+{
+	add_worker(workq);
+}
+
+static void signal_worker_op(struct work_queue *workq)
+{
+	ASSERT(!workq_corrupted(workq));
+
+	condvar_signal(&workq->activate_worker);
+	
+	irq_spinlock_lock(&workq->lock, true);
+	ASSERT(0 < workq->pending_op_cnt);
+	--workq->pending_op_cnt;
+	irq_spinlock_unlock(&workq->lock, true);
+}
+
+/** Determines how to signal workers if at all.
+ * 
+ * @param workq     Work queue where a new work item was queued.
+ * @param can_block True if we may block while signaling a worker or creating 
+ *                  a new worker.
+ * 
+ * @return Function that will notify workers or NULL if no action is needed.
+ */
+static signal_op_t signal_worker_logic(struct work_queue *workq, bool can_block)
+{
+	ASSERT(!workq_corrupted(workq));
+	ASSERT(irq_spinlock_locked(&workq->lock));
+	
+	/* Only signal workers if really necessary. */
+	signal_op_t signal_op = NULL;
+
+	/* 
+	 * Workers actively running the work func() and neither blocked nor idle. 
+	 * Including ->activate_pending workers that will run their work func() 
+	 * once they get a time slice.
+	 */
+	size_t active = active_workers(workq);
+	/* Max total allowed number of work items queued for active workers. */
+	size_t max_load = active * max_items_per_worker;
+
+	/* Active workers are getting overwhelmed - activate another. */
+	if (max_load < workq->item_cnt) {
+
+		size_t remaining_idle = 
+			workq->idle_worker_cnt - workq->activate_pending;
+
+		/* Idle workers still exist - activate one. */
+		if (remaining_idle > 0) {
+			/* 
+			 * Directly changing idle_worker_cnt here would not allow
+			 * workers to recognize spurious wake-ups. Change 
+			 * activate_pending instead.
+			 */
+			++workq->activate_pending;
+			++workq->pending_op_cnt;
+			signal_op = signal_worker_op;
+		} else {
+			/* No idle workers remain. Request that a new one be created. */
+			bool need_worker = (active < max_concurrent_workers)
+				&& (workq->cur_worker_cnt < max_worker_cnt);
+			
+			if (need_worker && can_block) {
+				signal_op = add_worker_op;
+				/* 
+				 * It may take some time to actually create the worker.
+				 * We don't want to swamp the thread pool with superfluous
+				 * worker creation requests so pretend it was already
+				 * created and proactively increase the worker count.
+				 */
+				++workq->cur_worker_cnt;
+			}
+			
+			/* 
+			 * We cannot create a new worker but we need one desperately
+			 * because all workers are blocked in their work functions.
+			 */
+			if (need_worker && !can_block && 0 == active) {
+				ASSERT(0 == workq->idle_worker_cnt);
+				
+				irq_spinlock_lock(&nonblock_adder.lock, true);
+
+				if (nonblock_adder.thread && !link_used(&workq->nb_link)) {
+					signal_op = add_worker_noblock_op;
+					++workq->cur_worker_cnt;
+					list_append(&workq->nb_link, &nonblock_adder.work_queues);
+				}
+
+				irq_spinlock_unlock(&nonblock_adder.lock, true);
+			}
+		}
+	} else {
+		/* 
+		 * There are enough active/running workers to process the queue. 
+		 * No need to signal/activate any new workers.
+		 */
+		signal_op = NULL;
+	}
+	
+	return signal_op;
+}
+
+/** Executes queued work items. */
+static void worker_thread(void *arg)
+{
+	/* 
+	 * The thread has been created after the work queue was ordered to stop. 
+	 * Do not access the work queue and return immediately. 
+	 */
+	if (thread_interrupted(THREAD)) {
+		thread_detach(THREAD);
+		return;
+	}
+	
+	ASSERT(arg != NULL);
+	
+	struct work_queue *workq = arg;
+	work_t *work_item;
+	
+	while (dequeue_work(workq, &work_item)) {
+		/* Copy the func field so func() can safely free work_item. */
+		work_func_t func = work_item->func;
+
+		func(work_item);
+	}
+}
+
+/** Waits and retrieves a work item. Returns false if the worker should exit. */
+static bool dequeue_work(struct work_queue *workq, work_t **pwork_item)
+{
+	ASSERT(!workq_corrupted(workq));
+	
+	irq_spinlock_lock(&workq->lock, true);
+	
+	/* Check if we should exit if load is low. */
+	if (!workq->stopping && worker_unnecessary(workq)) {
+		/* There are too many workers for this load. Exit. */
+		ASSERT(0 < workq->cur_worker_cnt);
+		--workq->cur_worker_cnt;
+		list_remove(&THREAD->workq_link);
+		irq_spinlock_unlock(&workq->lock, true);
+		
+		thread_detach(THREAD);
+		return false;
+	}
+	
+	bool stop = false;
+	
+	/* Wait for work to arrive. */
+	while (list_empty(&workq->queue) && !workq->stopping) {
+		cv_wait(workq);
+		
+		if (0 < workq->activate_pending)
+			--workq->activate_pending;
+	}
+
+	/* Process remaining work even if requested to stop. */
+	if (!list_empty(&workq->queue)) {
+		link_t *work_link = list_first(&workq->queue);
+		*pwork_item = list_get_instance(work_link, work_t, queue_link);
+		
+#ifdef CONFIG_DEBUG
+		ASSERT(!work_item_corrupted(*pwork_item));
+		(*pwork_item)->cookie = 0;
+#endif
+		list_remove(work_link);
+		--workq->item_cnt;
+		
+		stop = false;
+	} else {
+		/* Requested to stop and no more work queued. */
+		ASSERT(workq->stopping);
+		--workq->cur_worker_cnt;
+		stop = true;
+	}
+	
+	irq_spinlock_unlock(&workq->lock, true);
+	
+	return !stop;
+}
+
+/** Returns true if for the given load there are too many workers. */
+static bool worker_unnecessary(struct work_queue *workq)
+{
+	ASSERT(irq_spinlock_locked(&workq->lock));
+	
+	/* No work is pending. We don't need too many idle threads. */
+	if (list_empty(&workq->queue)) {
+		/* There are too many idle workers. Exit. */
+		return (min_worker_cnt <= workq->idle_worker_cnt);
+	} else {
+		/* 
+		 * There is work but we are swamped with too many active workers
+		 * that were woken up from sleep at around the same time. We
+		 * don't need another worker fighting for cpu time.
+		 */
+		size_t active = active_workers_now(workq);
+		return (max_concurrent_workers < active);
+	}
+}
+
+/** Waits for a signal to activate_worker. Thread marked idle while waiting. */
+static void cv_wait(struct work_queue *workq)
+{
+	++workq->idle_worker_cnt;
+	THREAD->workq_idling = true;
+	
+	/* Ignore lock ordering just here. */
+	ASSERT(irq_spinlock_locked(&workq->lock));
+	
+	_condvar_wait_timeout_irq_spinlock(&workq->activate_worker,
+		&workq->lock, SYNCH_NO_TIMEOUT, SYNCH_FLAGS_NONE);
+
+	ASSERT(!workq_corrupted(workq));
+	ASSERT(irq_spinlock_locked(&workq->lock));
+	
+	THREAD->workq_idling = false;
+	--workq->idle_worker_cnt;
+}
+
+
+/** Invoked from thread_ready() right before the thread is woken up. */
+void workq_before_thread_is_ready(thread_t *thread)
+{
+	ASSERT(thread);
+	ASSERT(irq_spinlock_locked(&thread->lock));
+
+	/* Worker's work func() is about to wake up from sleeping. */
+	if (thread->workq && thread->workq_blocked) {
+		/* Must be blocked in user work func() and not be waiting for work. */
+		ASSERT(!thread->workq_idling);
+		ASSERT(thread->state == Sleeping);
+		ASSERT(THREAD != thread);
+		ASSERT(!workq_corrupted(thread->workq));
+		
+		/* Protected by thread->lock */
+		thread->workq_blocked = false;
+		
+		irq_spinlock_lock(&thread->workq->lock, true);
+		--thread->workq->blocked_worker_cnt;
+		irq_spinlock_unlock(&thread->workq->lock, true);
+	}
+}
+
+/** Invoked from scheduler() before switching away from a thread. */
+void workq_after_thread_ran(void)
+{
+	ASSERT(THREAD);
+	ASSERT(irq_spinlock_locked(&THREAD->lock));
+
+	/* Worker's work func() is about to sleep/block. */
+	if (THREAD->workq && THREAD->state == Sleeping && !THREAD->workq_idling) {
+		ASSERT(!THREAD->workq_blocked);
+		ASSERT(!workq_corrupted(THREAD->workq));
+		
+		THREAD->workq_blocked = true;
+		
+		irq_spinlock_lock(&THREAD->workq->lock, false);
+
+		++THREAD->workq->blocked_worker_cnt;
+		
+		bool can_block = false;
+		signal_op_t op = signal_worker_logic(THREAD->workq, can_block);
+		
+		irq_spinlock_unlock(&THREAD->workq->lock, false);
+		
+		if (op) {
+			ASSERT(add_worker_noblock_op == op || signal_worker_op == op);
+			op(THREAD->workq);
+		}
+	}
+}
+
+/** Prints stats of the work queue to the kernel console. */
+void workq_print_info(struct work_queue *workq)
+{
+	irq_spinlock_lock(&workq->lock, true);
+
+	size_t total = workq->cur_worker_cnt;
+	size_t blocked = workq->blocked_worker_cnt;
+	size_t idle = workq->idle_worker_cnt;
+	size_t active = active_workers(workq);
+	size_t items = workq->item_cnt;
+	bool stopping = workq->stopping;
+	bool worker_surplus = worker_unnecessary(workq);
+	const char *load_str = worker_surplus ? "decreasing" : 
+		(0 < workq->activate_pending) ? "increasing" : "stable";
+	
+	irq_spinlock_unlock(&workq->lock, true);
+	
+	printf(
+		"Configuration: max_worker_cnt=%zu, min_worker_cnt=%zu,\n"
+		" max_concurrent_workers=%zu, max_items_per_worker=%zu\n"
+		"Workers: %zu\n"
+		"Active:  %zu (workers currently processing work)\n"
+		"Blocked: %zu (work functions sleeping/blocked)\n"
+		"Idle:    %zu (idle workers waiting for more work)\n"
+		"Items:   %zu (queued not yet dispatched work)\n"
+		"Stopping: %d\n"
+		"Load: %s\n",
+		max_worker_cnt, min_worker_cnt, 
+		max_concurrent_workers, max_items_per_worker,
+		total,
+		active,
+		blocked,
+		idle,
+		items,
+		stopping,
+		load_str
+	);
+}
+
+/** Prints stats of the global work queue. */
+void workq_global_print_info(void)
+{
+	workq_print_info(&g_work_queue);
+}
+
+
+static bool dequeue_add_req(nonblock_adder_t *info, struct work_queue **pworkq)
+{
+	bool stop = false;
+
+	irq_spinlock_lock(&info->lock, true);
+	
+	while (list_empty(&info->work_queues) && !stop) {
+		int ret = _condvar_wait_timeout_irq_spinlock(&info->req_cv, 
+			&info->lock, SYNCH_NO_TIMEOUT, SYNCH_FLAGS_INTERRUPTIBLE);
+		
+		stop = (ret == ESYNCH_INTERRUPTED);
+	}
+	
+	if (!stop) {
+		*pworkq = list_get_instance(list_first(&info->work_queues), 
+			struct work_queue, nb_link);
+
+		ASSERT(!workq_corrupted(*pworkq));
+		
+		list_remove(&(*pworkq)->nb_link);
+	}
+	
+	irq_spinlock_unlock(&info->lock, true);
+	
+	return !stop;
+}
+
+static void thr_nonblock_add_worker(void *arg)
+{
+	nonblock_adder_t *info = arg;
+	struct work_queue *workq;
+	
+	while (dequeue_add_req(info, &workq)) {
+		add_worker(workq);
+	}
+}
+
+
+static void nonblock_init(void)
+{
+	irq_spinlock_initialize(&nonblock_adder.lock, "kworkq-nb.lock");
+	condvar_initialize(&nonblock_adder.req_cv);
+	list_initialize(&nonblock_adder.work_queues);
+	
+	nonblock_adder.thread = thread_create(thr_nonblock_add_worker, 
+		&nonblock_adder, TASK, THREAD_FLAG_NONE, "kworkq-nb");
+	
+	if (nonblock_adder.thread) {
+		thread_ready(nonblock_adder.thread);
+	} else {
+		/* 
+		 * We won't be able to add workers without blocking if all workers
+		 * sleep, but at least boot the system.
+		 */
+		printf("Failed to create kworkq-nb. Sleeping work may stall the workq.\n");
+	}
+}
+
+#ifdef CONFIG_DEBUG
+/** Returns true if the workq is definitely corrupted; false if not sure. 
+ * 
+ * Can be used outside of any locks.
+ */
+static bool workq_corrupted(struct work_queue *workq)
+{
+	/* 
+	 * Needed to make the most current cookie value set by workq_preinit()
+	 * visible even if we access the workq right after it is created but
+	 * on a different cpu. Otherwise, workq_corrupted() would not work
+	 * outside a lock.
+	 */
+	memory_barrier();
+	return NULL == workq || workq->cookie != WORKQ_MAGIC;
+}
+
+/** Returns true if the work_item is definitely corrupted; false if not sure. 
+ * 
+ * Must be used with the work queue protecting spinlock locked.
+ */
+static bool work_item_corrupted(work_t *work_item)
+{
+	return NULL == work_item || work_item->cookie != WORK_ITEM_MAGIC;
+}
+#endif
+
+/** @}
+ */
Index: kernel/generic/src/syscall/syscall.c
===================================================================
--- kernel/generic/src/syscall/syscall.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/generic/src/syscall/syscall.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -50,4 +50,5 @@
 #include <synch/futex.h>
 #include <synch/smc.h>
+#include <synch/smp_memory_barrier.h>
 #include <ddi/ddi.h>
 #include <ipc/event.h>
@@ -142,4 +143,6 @@
 	(syshandler_t) sys_futex_wakeup,
 	(syshandler_t) sys_smc_coherence,
+	(syshandler_t) sys_smp_memory_barrier,
+	
 	
 	/* Address space related syscalls. */
Index: kernel/generic/src/time/clock.c
===================================================================
--- kernel/generic/src/time/clock.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/generic/src/time/clock.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -212,5 +212,5 @@
 		irq_spinlock_unlock(&THREAD->lock, false);
 		
-		if ((!ticks) && (!PREEMPTION_DISABLED)) {
+		if (ticks == 0 && PREEMPTION_ENABLED) {
 			scheduler();
 #ifdef CONFIG_UDEBUG
Index: kernel/generic/src/udebug/udebug.c
===================================================================
--- kernel/generic/src/udebug/udebug.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/generic/src/udebug/udebug.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -44,4 +44,6 @@
 #include <print.h>
 #include <arch.h>
+#include <proc/task.h>
+#include <proc/thread.h>
 
 /** Initialize udebug part of task structure.
Index: kernel/test/cht/cht1.c
===================================================================
--- kernel/test/cht/cht1.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
+++ kernel/test/cht/cht1.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -0,0 +1,573 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <test.h>
+#include <print.h>
+#include <debug.h>
+#include <adt/cht.h>
+#include <synch/rcu.h>
+
+typedef struct val {
+	/* Place at the top to simplify re-casting. */
+	cht_link_t link;
+	size_t hash;
+	size_t unique_id;
+	bool deleted;
+	bool mark;
+} val_t;
+
+static size_t val_hash(const cht_link_t *item)
+{
+	val_t *v = member_to_inst(item, val_t, link);
+	ASSERT(v->hash == (v->unique_id % 10));
+	return v->hash;
+}
+
+static size_t val_key_hash(void *key)
+{
+	return (uintptr_t)key % 10;
+}
+
+static bool val_equal(const cht_link_t *item1, const cht_link_t *item2)
+{
+	val_t *v1 = member_to_inst(item1, val_t, link);
+	val_t *v2 = member_to_inst(item2, val_t, link);
+	return v1->unique_id == v2->unique_id;
+}
+
+static bool val_key_equal(void *key, const cht_link_t *item2)
+{
+	val_t *v2 = member_to_inst(item2, val_t, link);
+	return (uintptr_t)key == v2->unique_id;
+}
+
+static void val_rm_callback(cht_link_t *item)
+{
+	val_t *v = member_to_inst(item, val_t, link);
+	ASSERT(!v->deleted);
+	v->deleted = true;
+	free(v);
+}
+
+
+static cht_ops_t val_ops = {
+	.hash = val_hash,
+	.key_hash = val_key_hash,
+	.equal = val_equal,
+	.key_equal = val_key_equal,
+	.remove_callback = val_rm_callback,
+};
+
+static void set_val(val_t *v, size_t h, size_t uid)
+{
+	v->hash = h;
+	v->unique_id = uid;
+	v->deleted = false;
+	v->mark = false;
+}
+
+/*-------------------------------------------------------------------*/
+
+
+/** Single-threaded checks of find/insert/remove; returns NULL or an error text. */
+static const char * do_sanity_test(cht_t *h)
+{
+	if (cht_find_lazy(h, (void*)0))
+		return "Found lazy in empty table.";
+	
+	if (cht_find(h, (void*)0))
+		return "Found in empty table.";
+	
+	if (cht_remove_key(h, (void*)0))
+		return "Removed from empty table.";
+	
+	const int val_cnt = 6;
+	val_t *v[6] = { NULL };
+	
+	for (int i = 0; i < val_cnt; ++i)
+		v[i] = malloc(sizeof(val_t), 0);
+	
+	size_t key[] = { 1, 1, 1, 11, 12, 13 };
+	
+	/* First three are identical */
+	for (int i = 0; i < 3; ++i)
+		set_val(v[i], 1, key[i]);
+	
+	/* Same hash, different key.*/
+	set_val(v[3], 1, key[3]);
+	
+	/* Different hashes and keys. */
+	set_val(v[4], 2, key[4]);
+	set_val(v[5], 3, key[5]);
+	
+	cht_link_t *dup;
+	if (!cht_insert_unique(h, &v[0]->link, &dup))
+		return "Duplicates in empty";
+
+	if (cht_insert_unique(h, &v[1]->link, &dup))
+		return "Inserted a duplicate";
+	
+	if (dup != &v[0]->link)
+		return "Returned wrong duplicate";
+
+	if (!cht_insert_unique(h, &v[3]->link, &dup))
+		return "Refused non-equal item but with a hash in table.";
+	
+	cht_insert(h, &v[1]->link);
+	cht_insert(h, &v[2]->link);
+	
+	if (!cht_insert_unique(h, &v[4]->link, &dup) ||
+	    !cht_insert_unique(h, &v[5]->link, &dup))
+		return "Refused unique ins 4, 5.";
+	
+	if (cht_find(h, (void*)0))
+		return "Phantom find.";
+	
+	cht_link_t *item = cht_find(h, (void*)v[5]->unique_id);
+	if (!item || item != &v[5]->link)
+		return "Missing 5.";
+
+	item = cht_find_next(h, &v[5]->link);
+	if (item)
+		return "Found nonexisting duplicate 5";
+	
+	item = cht_find(h, (void*)v[3]->unique_id);
+	if (!item || item != &v[3]->link)
+		return "Missing 3.";
+
+	item = cht_find_next(h, &v[3]->link);
+	if (item)
+		return "Found nonexisting duplicate 3, same hash as others.";
+	
+	item = cht_find(h, (void*)v[0]->unique_id);
+	if (!item)
+		return "Missing 0.";
+	((val_t*)item)->mark = true;
+	
+	for (int k = 1; k < 3; ++k) {
+		item = cht_find_next(h, item);
+		if (!item)
+			return "Did not find an inserted duplicate";
+		
+		val_t *val = ((val_t*)item);
+		
+		if (val->unique_id != v[0]->unique_id)
+			return "Found item with a different key.";
+		if (val->mark) 
+			return "Found twice the same node.";
+		val->mark = true;
+	}
+	
+	for (int i = 0; i < 3; ++i) {
+		if (!v[i]->mark) 
+			return "Did not find all duplicates";
+		
+		v[i]->mark = false;
+	}
+
+	if (cht_find_next(h, item))
+		return "Found non-existing duplicate.";
+
+	item = cht_find_next(h, cht_find(h, (void*)key[0]));
+	if (!item)
+		return "Did not find a duplicate to remove.";
+	((val_t*)item)->mark = true;
+	if (!cht_remove_item(h, item))
+		return "Failed to remove inserted item";
+	
+	item = cht_find(h, (void*)key[0]);
+	if (!item || ((val_t*)item)->mark)
+		return "Did not find proper item.";
+	
+	item = cht_find_next(h, item);
+	if (!item || ((val_t*)item)->mark)
+		return "Did not find proper duplicate.";
+
+	item = cht_find_next(h, item);
+	if (item)
+		return "Found removed duplicate";
+	
+	if (2 != cht_remove_key(h, (void*)key[0]))
+		return "Failed to remove all duplicates";
+	
+	if (cht_find(h, (void*)key[0]))
+		return "Found removed key";
+	
+	if (!cht_find(h, (void*)key[3]))
+		return "Removed incorrect key";
+	
+	for (size_t k = 0; k < sizeof(v) / sizeof(v[0]); ++k) {
+		cht_remove_key(h, (void*)key[k]);
+	}
+	
+	for (size_t k = 0; k < sizeof(v) / sizeof(v[0]); ++k) {
+		if (cht_find(h, (void*)key[k]))
+			return "Found a key in a cleared table";
+	}
+
+	return NULL;
+}
+
+static const char * sanity_test(void)
+{
+	cht_t h;
+	if (!cht_create_simple(&h, &val_ops))
+		return "Could not create the table.";
+	
+	rcu_read_lock();
+	const char *err = do_sanity_test(&h);
+	rcu_read_unlock();
+	
+	cht_destroy(&h);
+
+	return err;
+}
+
+/*-------------------------------------------------------------------*/
+
+static size_t next_rand(size_t seed)
+{
+	return (12345 + seed * 1103515245) & 0x7fffffffU;  /* LCG step kept to 31 bits. */
+}
+
+/*-------------------------------------------------------------------*/
+typedef struct {
+	cht_link_t link;  /* Table link; first member, so a cht_link_t * casts to stress_t *. */
+	size_t key;       /* Key hashed and compared by the stress_* callbacks. */
+	bool free;        /* If set, stress_rm_callback() free()s the item. */
+	bool inserted;    /* op_stresser() bookkeeping: the item should be in the table. */
+	bool deleted;     /* Set by stress_rm_callback() for items that are not freed. */
+} stress_t;
+
+typedef struct {
+	cht_t *h;           /* Table shared by all stress threads. */
+	int *stop;          /* Shared flag; nonzero asks op_stresser() threads to stop. */
+	stress_t *elem;     /* op_stresser() only: this thread's own items. */
+	size_t elem_cnt;    /* op_stresser() only: number of entries in elem. */
+	size_t upd_prob;    /* op_stresser() only: update when (rand % 100) <= upd_prob. */
+	size_t wave_cnt;    /* resize_stresser() only: number of insert/remove waves. */
+	size_t wave_elems;  /* resize_stresser() only: items inserted per wave. */
+	size_t id;          /* Thread index; added to (i << 8) to form this thread's keys. */
+	bool failed;        /* Set by the thread when it detects an error. */
+} stress_work_t;
+
+static size_t stress_hash(const cht_link_t *item)
+{
+	return ((const stress_t *) item)->key >> 8;  /* Low 8 key bits are ignored. */
+}
+static size_t stress_key_hash(void *key)
+{
+	return ((size_t) key) >> 8;  /* Same shift as stress_hash(). */
+}
+static bool stress_equal(const cht_link_t *item1, const cht_link_t *item2)
+{
+	return ((const stress_t *) item2)->key == ((const stress_t *) item1)->key;
+}
+static bool stress_key_equal(void *key, const cht_link_t *item)
+{
+	return ((const stress_t *) item)->key == ((size_t) key);  /* The key is the pointer value. */
+}
+static void stress_rm_callback(cht_link_t *item)
+{
+	if (!((stress_t *) item)->free)
+		((stress_t *) item)->deleted = true;  /* Item lives in do_stress()'s array: only flag it. */
+	else
+		free(item);  /* Allocated by resize_stresser(): release it. */
+}
+
+cht_ops_t stress_ops = {
+	.key_hash = stress_key_hash,
+	.hash = stress_hash,
+	.key_equal = stress_key_equal,
+	.equal = stress_equal,
+	.remove_callback = stress_rm_callback,  /* Frees or flags removed items. */
+};
+
+static void resize_stresser(void *arg)
+{
+	/*
+	 * Repeatedly grows the table with a wave of heap-allocated items and
+	 * then removes them again, wave_cnt times. The shared stop flag is
+	 * raised on every path out of this function.
+	 */
+	stress_work_t *w = (stress_work_t *)arg;
+
+	for (size_t wave = 0; wave < w->wave_cnt; ++wave) {
+		TPRINTF("I{");
+		for (size_t n = 0; n < w->wave_elems; ++n) {
+			stress_t *fresh = malloc(sizeof(stress_t), FRAME_ATOMIC);
+			if (!fresh) {
+				TPRINTF("[out-of-mem]\n");
+				goto out_of_mem;
+			}
+			/* 'free' makes stress_rm_callback() release the item. */
+			fresh->free = true;
+			fresh->key = (n << 8) + w->id;
+			cht_insert(w->h, &fresh->link);
+		}
+		TPRINTF("}");
+
+		thread_sleep(2);
+
+		TPRINTF("R<");
+		for (size_t n = 0; n < w->wave_elems; ++n) {
+			void *key = (void *)((n << 8) + w->id);
+			if (1 != cht_remove_key(w->h, key)) {
+				TPRINTF("Err: Failed to remove inserted item\n");
+				goto failed;
+			}
+		}
+		TPRINTF(">");
+	}
+
+	/* All waves done: request that others stop. */
+	*w->stop = 1;
+	return;
+
+failed:
+	w->failed = true;
+
+out_of_mem:
+	/* Request that others stop. */
+	*w->stop = 1;
+	/* Remove anything we may have inserted. */
+	for (size_t n = 0; n < w->wave_elems; ++n)
+		cht_remove_key(w->h, (void *)((n << 8) + w->id));
+}
+
+static void op_stresser(void *arg)
+{
+	/*
+	 * Randomly inserts, removes and looks up this thread's own items until
+	 * the shared stop flag is raised or an inconsistency is detected. The
+	 * inserted/deleted flags of each item record what the table should hold.
+	 */
+	stress_work_t *w = (stress_work_t *)arg;
+	ASSERT(0 == *w->stop);
+
+	size_t iter = 0;
+	size_t rnd = w->id;
+
+	while (0 == *w->stop && !w->failed) {
+		rnd = next_rand(rnd);
+		bool update = ((rnd % 100) <= w->upd_prob);
+		rnd = next_rand(rnd);
+		stress_t *e = &w->elem[rnd % w->elem_cnt];
+
+		++iter;
+		if (0 == iter % (1024 * 1024)) {
+			/* Make the most current w->stop visible. */
+			read_barrier();
+			TPRINTF("*");
+		}
+
+		if (!update) {
+			/* Lookup: the result must agree with our bookkeeping. */
+			rcu_read_lock();
+			cht_link_t *found = cht_find(w->h, (void *)e->key);
+			rcu_read_unlock();
+
+			if (found) {
+				if (!e->inserted) {
+					TPRINTF("Err: found but not inserted!");
+					w->failed = true;
+				}
+				if (found != &e->link) {
+					TPRINTF("Err: found but incorrect item\n");
+					w->failed = true;
+				}
+			} else if (e->inserted) {
+				TPRINTF("Err: inserted but not found!");
+				w->failed = true;
+			}
+			continue;
+		}
+
+		/* Update: pick between the item-based and the key-based API. */
+		rnd = next_rand(rnd);
+		bool by_item = rnd & 1;
+
+		if (e->inserted) {
+			if (by_item) {
+				rcu_read_lock();
+				cht_remove_item(w->h, &e->link);
+				rcu_read_unlock();
+			} else if (1 != cht_remove_key(w->h, (void *)e->key)) {
+				TPRINTF("Err: did not rm the key\n");
+				w->failed = true;
+			}
+			e->inserted = false;
+		} else if (e->deleted) {
+			/* Only items currently flagged as deleted are (re)inserted. */
+			e->deleted = false;
+
+			if (by_item) {
+				cht_link_t *dup;
+				rcu_read_lock();
+				if (!cht_insert_unique(w->h, &e->link, &dup)) {
+					TPRINTF("Err: already inserted\n");
+					w->failed = true;
+				}
+				rcu_read_unlock();
+			} else {
+				cht_insert(w->h, &e->link);
+			}
+
+			e->inserted = true;
+		}
+	}
+
+	/* Remove anything we may have inserted. */
+	for (size_t i = 0; i < w->elem_cnt; ++i) {
+		cht_remove_key(w->h, (void *)w->elem[i].key);
+	}
+}
+
+static bool do_stress(void)
+{
+	/*
+	 * Runs op_stresser() threads against resize_stresser() threads on one
+	 * shared table. Returns true if no thread reported a failure.
+	 */
+	cht_t h;
+
+	if (!cht_create_simple(&h, &stress_ops)) {
+		TPRINTF("Failed to create the table\n");
+		return false;
+	}
+
+	const size_t wave_cnt = 10;
+	const size_t max_thread_cnt = 8;
+	const size_t resize_thread_cnt = 2;
+	const size_t items_per_thread = 1024;
+	size_t op_thread_cnt = min(max_thread_cnt, 2 * config.cpu_active);
+	size_t total_thr_cnt = op_thread_cnt + resize_thread_cnt;
+	size_t item_cnt = op_thread_cnt * items_per_thread;
+
+	/*
+	 * One allocation holds, in this order: the per-thread work descriptors,
+	 * the op stressers' table items and the shared stop flag.
+	 */
+	size_t size = total_thr_cnt * sizeof(stress_work_t)
+		+ item_cnt * sizeof(stress_t) + sizeof(int);
+
+	TPRINTF("Alloc and init table items. \n");
+	void *p = malloc(size, FRAME_ATOMIC);
+	if (!p) {
+		TPRINTF("Failed to alloc items\n");
+		cht_destroy(&h);
+		return false;
+	}
+
+	stress_work_t *pwork = p;
+	stress_t *pitem = (stress_t *)(pwork + total_thr_cnt);
+	int *pstop = (int *)(pitem + item_cnt);
+	*pstop = 0;
+
+	/* Init op stresser work items together with their table elements. */
+	for (size_t t = 0; t < op_thread_cnt; ++t) {
+		stress_work_t *w = &pwork[t];
+		w->h = &h;
+		w->stop = pstop;
+		w->elem = &pitem[t * items_per_thread];
+		w->upd_prob = (t + 1) * 100 / op_thread_cnt;
+		w->id = t;
+		w->elem_cnt = items_per_thread;
+		w->failed = false;
+
+		for (size_t i = 0; i < items_per_thread; ++i) {
+			w->elem[i].key = (i << 8) + t;
+			w->elem[i].free = false;
+			w->elem[i].inserted = false;
+			w->elem[i].deleted = true;
+		}
+	}
+
+	/* Init resizer work items. */
+	for (size_t t = op_thread_cnt; t < total_thr_cnt; ++t) {
+		stress_work_t *w = &pwork[t];
+		w->h = &h;
+		w->stop = pstop;
+		w->wave_cnt = wave_cnt;
+		w->wave_elems = item_cnt * 4;
+		w->id = t;
+		w->failed = false;
+	}
+
+	TPRINTF("Running %zu ins/del/find stress threads + %zu resizers.\n",
+		op_thread_cnt, resize_thread_cnt);
+
+	/* Create and run threads, spread round-robin over the active cpus. */
+	thread_t *thr[max_thread_cnt + resize_thread_cnt];
+
+	for (size_t t = 0; t < total_thr_cnt; ++t) {
+		bool is_op = (t < op_thread_cnt);
+		thr[t] = thread_create(is_op ? op_stresser : resize_stresser,
+			&pwork[t], TASK, 0, is_op ? "cht-op-stress" : "cht-resize");
+		ASSERT(thr[t]);
+		thread_wire(thr[t], &cpus[t % config.cpu_active]);
+		thread_ready(thr[t]);
+	}
+
+	bool failed = false;
+	/* Resizers raise the stop flag when they finish; join them first. */
+	TPRINTF("Joining resize stressers.\n");
+	for (size_t t = op_thread_cnt; t < total_thr_cnt; ++t) {
+		thread_join(thr[t]);
+		thread_detach(thr[t]);
+		failed = failed || pwork[t].failed;
+	}
+
+	TPRINTF("Joining op stressers.\n");
+	for (size_t left = op_thread_cnt; left-- > 0; ) {
+		TPRINTF("%d threads remain\n", (int)left);
+		thread_join(thr[left]);
+		thread_detach(thr[left]);
+		failed = failed || pwork[left].failed;
+	}
+
+	cht_destroy(&h);
+	free(p);
+	return !failed;
+}
+
+/*-------------------------------------------------------------------*/
+
+
+const char *test_cht1(void)
+{
+	/* Single-threaded sanity checks first, then the multi-threaded stress run. */
+	const char *err = sanity_test();
+	if (err)
+		return err;
+	/* Report through TPRINTF like the rest of this test's output. */
+	TPRINTF("Basic sanity test: ok.\n");
+	if (!do_stress())
+		return "CHT stress test failed.";
+	return NULL;
+}
Index: kernel/test/cht/cht1.def
===================================================================
--- kernel/test/cht/cht1.def	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
+++ kernel/test/cht/cht1.def	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -0,0 +1,6 @@
+{
+	"cht",
+	"Concurrent hash table test",
+	&test_cht1,
+	true
+},
Index: kernel/test/smpcall/smpcall1.c
===================================================================
--- kernel/test/smpcall/smpcall1.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
+++ kernel/test/smpcall/smpcall1.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2012 Adam Hraska 
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <print.h>
+#include <debug.h>
+
+#include <test.h>
+#include <smp/smp_call.h>
+#include <cpu.h>
+#include <macros.h>
+#include <config.h>
+#include <arch.h>
+#include <proc/thread.h>
+
+/* 
+ * Maximum total number of smp_calls in the system is: 
+ *  162000 == 9^2 * 1000 * 2 
+ *  == MAX_CPUS^2 * ITERATIONS * EACH_CPU_INC_PER_ITER
+ */
+#define MAX_CPUS   9
+#define ITERATIONS 1000
+#define EACH_CPU_INC_PER_ITER 2
+
+
+static void inc(void *p)
+{
+	/* The handler is expected to be entered with interrupts disabled. */
+	ASSERT(interrupts_disabled());
+	/*
+	 * Deliberately unsynchronized increment: the test checks that
+	 * smp_call itself makes the change visible to the caller.
+	 */
+	size_t *counter = p;
+	*counter += 1;
+}
+
+
+static void test_thread(void *p)
+{
+	/*
+	 * Each iteration issues one synchronous and one asynchronous smp_call
+	 * to every cpu and accumulates the increments into the counter at p.
+	 */
+	size_t *total = p;
+	smp_call_t pending[MAX_CPUS];
+	unsigned int cpu_count = min(config.cpu_active, MAX_CPUS);
+
+	for (int round = 0; round < ITERATIONS; ++round) {
+		unsigned int cpu;
+		/*
+		 * Synchronous calls: smp_call should make inc()'s change visible
+		 * on this cpu, so the counter is passed without any other
+		 * synchronization.
+		 */
+		for (cpu = 0; cpu < cpu_count; ++cpu)
+			smp_call(cpu, inc, total);
+
+		/*
+		 * Async calls run in parallel on different cpus, so each one
+		 * gets its own counter instead of clobbering a shared one.
+		 */
+		size_t per_cpu[MAX_CPUS] = {0};
+
+		for (cpu = 0; cpu < cpu_count; ++cpu)
+			smp_call_async(cpu, inc, &per_cpu[cpu], &pending[cpu]);
+
+		/* Wait for each async call and fold its result into the total. */
+		for (cpu = 0; cpu < cpu_count; ++cpu) {
+			smp_call_wait(&pending[cpu]);
+			*total += per_cpu[cpu];
+		}
+
+		/* Give other threads a chance to run. */
+		thread_usleep(10000);
+	}
+}
+
+static size_t calc_exp_calls(size_t thread_cnt)
+{
+	return EACH_CPU_INC_PER_ITER * (ITERATIONS * thread_cnt);  /* cnt * iterations * incs per iteration. */
+}
+
+const char *test_smpcall1(void)
+{
+	/* Wires one test_thread() to each cpu and checks the acknowledged call counts. */
+	/* Number of received calls that were sent by cpu[i]. */
+	size_t call_cnt[MAX_CPUS] = {0};
+	thread_t *thread[MAX_CPUS] = { NULL };
+	unsigned int cpu_count = min(config.cpu_active, MAX_CPUS);
+	size_t running_thread_cnt = 0;
+
+	TPRINTF("Spawning threads on %u cpus.\n", cpu_count);
+
+	/* Create a wired thread on each cpu. */
+	for (unsigned int id = 0; id < cpu_count; ++id) {
+		thread[id] = thread_create(test_thread, &call_cnt[id], TASK, 
+			THREAD_FLAG_NONE, "smp-call-test");
+		if (thread[id]) {
+			thread_wire(thread[id], &cpus[id]);
+			++running_thread_cnt;
+		} else {
+			TPRINTF("Failed to create thread on cpu%u.\n", id);
+		}
+	}
+
+	/*
+	 * Every running thread calls each of the cpu_count cpus
+	 * EACH_CPU_INC_PER_ITER times per iteration, so the per-thread count
+	 * depends on cpu_count, not on how many threads could be created.
+	 */
+	size_t exp_calls = calc_exp_calls(cpu_count);
+	size_t exp_calls_sum = exp_calls * running_thread_cnt;
+
+	TPRINTF("Running %zu wired threads. Expecting %zu calls. Be patient.\n", 
+		running_thread_cnt, exp_calls_sum);
+
+	for (unsigned int i = 0; i < cpu_count; ++i) {
+		if (thread[i] != NULL) {
+			thread_ready(thread[i]);
+		}
+	}
+
+	/* Wait for threads to complete. */
+	for (unsigned int i = 0; i < cpu_count; ++i) {
+		if (thread[i] != NULL) {
+			thread_join(thread[i]);
+			thread_detach(thread[i]);
+		}
+	}
+
+	TPRINTF("Threads finished. Checking number of smp_call()s.\n");
+
+	bool ok = true;
+	size_t calls_sum = 0;
+
+	for (size_t i = 0; i < cpu_count; ++i) {
+		/* Counters of threads that failed to start stay at zero. */
+		if (thread[i] != NULL && call_cnt[i] != exp_calls) {
+			ok = false;
+			TPRINTF("Error: %zu instead of %zu cpu%zu's calls were"
+				" acknowledged.\n", call_cnt[i], exp_calls, i);
+		}
+		calls_sum += call_cnt[i];
+	}
+
+	if (calls_sum != exp_calls_sum) {
+		TPRINTF("Error: total acknowledged sum: %zu instead of %zu.\n",
+			calls_sum, exp_calls_sum);
+		ok = false;
+	}
+
+	if (!ok)
+		return "Failed: incorrect acknowledged smp_calls.\n";
+
+	TPRINTF("Success: number of received smp_calls is as expected (%zu).\n",
+		exp_calls_sum);
+	return NULL;
+}
Index: kernel/test/smpcall/smpcall1.def
===================================================================
--- kernel/test/smpcall/smpcall1.def	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
+++ kernel/test/smpcall/smpcall1.def	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -0,0 +1,6 @@
+{
+	"smpcall1",
+	"smp_call() test",
+	&test_smpcall1,
+	true
+},
Index: kernel/test/synch/rcu1.c
===================================================================
--- kernel/test/synch/rcu1.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
+++ kernel/test/synch/rcu1.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -0,0 +1,1052 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <test.h>
+#include <arch.h>
+#include <atomic.h>
+#include <print.h>
+#include <proc/thread.h>
+#include <macros.h>
+#include <str.h>
+#include <errno.h>
+#include <time/delay.h>
+
+#include <synch/rcu.h>
+
+
+#define MAX_THREADS 32
+
+static int one_idx = 0;  /* Next slot used by run_one(); reset by run_all() and join_all(). */
+static thread_t *thread[MAX_THREADS] = { NULL };  /* Started threads; NULL marks an empty slot. */
+
+typedef struct {
+	rcu_item_t rcu;  /* Passed to rcu_call(); first member, so the item casts to exited_t *. */
+	bool exited;     /* Set by the callback once it has been invoked. */
+} exited_t;
+
+/* Callback raced with preexisting readers. */
+#define ERACE   123
+/* Waited for too long for the callback to exit; consider it lost. */
+#define ECBLOST 432
+
+/*-------------------------------------------------------------------*/
+static void wait_for_cb_exit(size_t secs, exited_t *p, int *presult)
+{
+	/*
+	 * Polls p->exited for up to roughly 'secs' seconds. Stores ECBLOST in
+	 * *presult if the flag is still clear afterwards; else leaves it alone.
+	 */
+	const size_t poll_ms = 500;
+	size_t polls_left = (secs * 1000 + poll_ms - 1) / poll_ms;
+
+	for (; polls_left > 0 && !p->exited; --polls_left) {
+		thread_usleep(poll_ms * 1000);
+		TPRINTF(".");
+	}
+
+	if (!p->exited)
+		*presult = ECBLOST;
+}
+
+static size_t get_thread_cnt(void)
+{
+	return min(MAX_THREADS, 4 * config.cpu_active);  /* Four threads per active cpu, capped. */
+}
+
+static void run_thread(size_t k, void (*func)(void*), void *arg)
+{
+	/* Starts func(arg) in slot k; the slot stays NULL if creation fails. */
+	ASSERT(thread[k] == NULL);
+	thread_t *t = thread_create(func, arg, TASK, THREAD_FLAG_NONE,
+		"test-rcu-thread");
+	thread[k] = t;
+	if (!t)
+		return;
+	/* Distribute evenly. */
+	thread_wire(t, &cpus[k % config.cpu_active]);
+	thread_ready(t);
+}
+
+static void run_all(void (*func)(void*))
+{
+	/* Starts func(NULL) in every slot and rewinds the run_one() cursor. */
+	one_idx = 0;
+
+	const size_t cnt = get_thread_cnt();
+	size_t slot = 0;
+	while (slot < cnt)
+		run_thread(slot++, func, NULL);
+}
+
+static void join_all(void)
+{
+	/* Joins and clears every started slot; also rewinds the run_one() cursor. */
+	const size_t cnt = get_thread_cnt();
+
+	one_idx = 0;
+
+	for (size_t slot = 0; slot < cnt; ++slot) {
+		if (!thread[slot])
+			continue;
+		/* Keep retrying until the join no longer times out. */
+		for (;;) {
+			int rc = thread_join_timeout(thread[slot], 5 * 1000 * 1000, 0);
+			if (rc == ESYNCH_OK_BLOCKED) {
+				TPRINTF("%zu threads remain\n", cnt - slot - 1);
+			}
+			if (rc != ESYNCH_TIMEOUT)
+				break;
+		}
+		thread_detach(thread[slot]);
+		thread[slot] = NULL;
+	}
+}
+
+static void run_one(void (*func)(void*), void *arg)
+{
+	/* Start func(arg) in the next slot and advance the cursor. */
+	ASSERT(one_idx < MAX_THREADS);
+	run_thread(one_idx++, func, arg);
+}
+
+
+static void join_one(void)
+{
+	/* Undo the most recent run_one(): step the cursor back and join that slot. */
+	ASSERT(0 < one_idx && one_idx <= MAX_THREADS);
+
+	thread_t *t = thread[--one_idx];
+	if (!t)
+		return;
+	thread_join(t);
+	thread_detach(t);
+	thread[one_idx] = NULL;
+}
+
+/*-------------------------------------------------------------------*/
+
+
+static void nop_reader(void *arg)
+{
+	/* arg carries the number of empty reader sections to enter and leave. */
+	size_t remaining = (size_t)arg;
+
+	TPRINTF("Enter nop-reader\n");
+
+	while (remaining-- > 0) {
+		rcu_read_lock();
+		rcu_read_unlock();
+	}
+	TPRINTF("Exit nop-reader\n");
+}
+
+static void get_seq(size_t from, size_t to, size_t steps, size_t *seq)
+{
+	/* Fills seq[0..steps-1] with evenly spaced values from 'from'; the last is 'to'. */
+	ASSERT(0 < steps && from <= to && 0 < to);
+	/* A single step has no interval to divide; avoid dividing by zero. */
+	size_t inc = (steps > 1) ? (to - from) / (steps - 1) : 0;
+
+	for (size_t i = 0; i < steps - 1; ++i)
+		seq[i] = i * inc + from;
+	seq[steps - 1] = to;
+}
+
+static bool do_nop_readers(void)
+{
+	/* Each thread gets its own iteration count, spread over 100..100000. */
+	const size_t thr_cnt = get_thread_cnt();
+	size_t iters[MAX_THREADS] = {0};
+	get_seq(100, 100000, thr_cnt, iters);
+	TPRINTF("\nRun %zu thr: repeat empty no-op reader sections\n", thr_cnt);
+
+	for (size_t t = 0; t < thr_cnt; ++t)
+		run_one(nop_reader, (void *)iters[t]);
+
+	TPRINTF("\nJoining %zu no-op readers\n", thr_cnt);
+	join_all();
+	return true;
+}
+
+/*-------------------------------------------------------------------*/
+
+
+
+static void long_reader(void *arg)
+{
+	/*
+	 * arg is the busy-loop length of one reader section; the number of
+	 * sections is chosen so that about 100 million spins happen in total.
+	 */
+	const size_t total_spins = 100 * 1000 * 1000;
+	const size_t spins = (size_t)arg;
+	size_t sections = total_spins / spins;
+	TPRINTF("Enter long-reader\n");
+	while (sections-- > 0) {
+		rcu_read_lock();
+		/* nop, but increment the volatile counter */
+		volatile size_t k = 0;
+		while (k < spins)
+			++k;
+		rcu_read_unlock();
+	}
+	TPRINTF("Exit long-reader\n");
+}
+
+static bool do_long_readers(void)
+{
+	/* Section lengths are spread over 10..1000000 spins, one per thread. */
+	const size_t thr_cnt = get_thread_cnt();
+	size_t spins[MAX_THREADS] = {0};
+	get_seq(10, 1000 * 1000, thr_cnt, spins);
+	TPRINTF("\nRun %zu thr: repeat long reader sections, will preempt, no cbs.\n", 
+		thr_cnt);
+
+	for (size_t t = 0; t < thr_cnt; ++t)
+		run_one(long_reader, (void *)spins[t]);
+
+	TPRINTF("\nJoining %zu readers with long reader sections.\n", thr_cnt);
+	join_all();
+	return true;
+}
+
+/*-------------------------------------------------------------------*/
+
+
+static atomic_t nop_callbacks_cnt = {0};  /* Incremented by every count_cb() invocation. */
+/* Callbacks posted per nop_updater() thread. Must be even: they are posted in pairs. */
+static const int nop_updater_iters = 10000;
+
+static void count_cb(rcu_item_t *item)
+{
+	atomic_inc(&nop_callbacks_cnt);
+	free(item);  /* Items are allocated by nop_updater(). */
+}
+
+static void nop_updater(void *arg)
+{
+	/* Posts nop_updater_iters callbacks, two per pass; gives up on out-of-memory. */
+	for (int posted = 0; posted < nop_updater_iters; posted += 2) {
+		rcu_item_t *first = malloc(sizeof(rcu_item_t), FRAME_ATOMIC);
+		rcu_item_t *second = malloc(sizeof(rcu_item_t), FRAME_ATOMIC);
+
+		if (!first || !second) {
+			TPRINTF("[out-of-mem]\n");
+			free(first);
+			free(second);
+			return;
+		}
+		rcu_call(first, count_cb);
+		rcu_call(second, count_cb);
+	}
+}
+
+static bool do_nop_callbacks(void)
+{
+	/* Every thread posts no-op callbacks; all must have run within max_loops polls. */
+	atomic_set(&nop_callbacks_cnt, 0);
+
+	size_t exp_cnt = nop_updater_iters * get_thread_cnt();
+	size_t max_used_mem = sizeof(rcu_item_t) * exp_cnt;
+
+	TPRINTF("\nRun %zu thr: post %zu no-op callbacks (%zu B used), no readers.\n", 
+		get_thread_cnt(), exp_cnt, max_used_mem);
+	run_all(nop_updater);
+	TPRINTF("\nJoining %zu no-op callback threads\n", get_thread_cnt());
+	join_all();
+
+	size_t loop_cnt = 0, max_loops = 15;
+
+	while (exp_cnt != atomic_get(&nop_callbacks_cnt) && loop_cnt < max_loops) {
+		++loop_cnt;
+		TPRINTF(".");
+		thread_sleep(1);
+	}
+	/* Judge by the count itself, so completion during the last sleep still passes. */
+	return exp_cnt == atomic_get(&nop_callbacks_cnt);
+}
+
+/*-------------------------------------------------------------------*/
+
+typedef struct {
+	rcu_item_t rcu_item;  /* First member, so one_cb_done() can cast the item back. */
+	int cookie;           /* Set to magic_cookie by one_cb_reader(). */
+} item_w_cookie_t;
+
+const int magic_cookie = 0x01234567;  /* Value one_cb_done() expects in the item. */
+static int one_cb_is_done = 0;  /* Set to 1 by one_cb_done(); polled by do_one_cb(). */
+
+static void one_cb_done(rcu_item_t *item)
+{
+	ASSERT( ((item_w_cookie_t *)item)->cookie == magic_cookie);  /* Item must be intact. */
+	one_cb_is_done = 1;
+	TPRINTF("Callback()\n");
+	free(item);  /* Allocated by one_cb_reader(). */
+}
+
+static void one_cb_reader(void *arg)
+{
+	/* Posts a single callback from inside a reader section that then sleeps. */
+	TPRINTF("Enter one-cb-reader\n");
+
+	rcu_read_lock();
+	item_w_cookie_t *posted = malloc(sizeof(item_w_cookie_t), FRAME_ATOMIC);
+
+	if (!posted) {
+		TPRINTF("\n[out-of-mem]\n");
+	} else {
+		/* one_cb_done() checks the cookie before freeing the item. */
+		posted->cookie = magic_cookie;
+		rcu_call(&posted->rcu_item, one_cb_done);
+	}
+
+	thread_sleep(1);
+	rcu_read_unlock();
+
+	TPRINTF("Exit one-cb-reader\n");
+}
+
+static bool do_one_cb(void)
+{
+	/*
+	 * Runs one reader that posts a callback; after the reader has been
+	 * joined the callback gets up to 4 x 50 ms (200 ms total) to run.
+	 */
+	one_cb_is_done = 0;
+
+	TPRINTF("\nRun a single reader that posts one callback.\n");
+	run_one(one_cb_reader, NULL);
+	join_one();
+
+	TPRINTF("\nJoined one-cb reader, wait for callback.\n");
+	for (size_t polls_left = 4; !one_cb_is_done && polls_left > 0; --polls_left) {
+		thread_usleep(50 * 1000);
+	}
+
+	return one_cb_is_done;
+}
+
+/*-------------------------------------------------------------------*/
+
+typedef struct {
+	size_t update_cnt;  /* Updater rounds per iteration; each round posts two callbacks. */
+	size_t read_cnt;    /* Reader sections per iteration. */
+	size_t iters;       /* Number of reader/updater alternations in seq_func(). */
+} seq_work_t;
+
+typedef struct {
+	rcu_item_t rcu;             /* Passed to rcu_call(). */
+	atomic_count_t start_time;  /* cur_time value taken just before rcu_call(). */
+} seq_item_t;
+
+
+static int seq_test_result = EOK;  /* EOK, ERACE or ENOMEM; written by seq_func(). */
+
+static atomic_t cur_time = {1};  /* Logical clock advanced by readers and updaters. */
+static atomic_count_t max_upd_done_time = {0};  /* Largest start_time seen by seq_cb(). */
+
+static void seq_cb(rcu_item_t *rcu_item)
+{
+	seq_item_t *item = member_to_inst(rcu_item, seq_item_t, rcu);
+	/* Track the largest start_time among the callbacks invoked so far. */
+	/* Racy but errs to the conservative side, so it is ok. */
+	if (max_upd_done_time < item->start_time) {
+		max_upd_done_time = item->start_time;
+		
+		/* Make updated time visible */
+		memory_barrier();
+	}
+
+	free(item);
+}
+
+static void seq_func(void *arg)
+{
+	seq_work_t *work = (seq_work_t*)arg;
+	/* Thread body of do_seq_check(); stops early once any thread recorded an error. */
+	/* Alternate between reader and updater roles. */
+	for (size_t k = 0; k < work->iters; ++k) {
+		/* Reader */
+		for (size_t i = 0; i < work->read_cnt; ++i) {
+			rcu_read_lock();
+			atomic_count_t start_time = atomic_postinc(&cur_time);
+			/* Busy-wait inside the section; later sections spin longer. */
+			for (volatile size_t d = 0; d < 10 * i; ++d ){
+				/* no-op */
+			}
+			
+			/* Get most recent max_upd_done_time. */
+			memory_barrier();
+			/* A callback stamped after this section began has already been invoked. */
+			if (start_time < max_upd_done_time) {
+				seq_test_result = ERACE;
+			}
+			
+			rcu_read_unlock();
+			
+			if (seq_test_result != EOK) 
+				return;
+		}
+		
+		/* Updater */
+		for (size_t i = 0; i < work->update_cnt; ++i) {
+			seq_item_t *a = malloc(sizeof(seq_item_t), FRAME_ATOMIC);
+			seq_item_t *b = malloc(sizeof(seq_item_t), FRAME_ATOMIC);
+			
+			if (a && b) {
+				a->start_time = atomic_postinc(&cur_time);
+				rcu_call(&a->rcu, seq_cb);
+				
+				b->start_time = atomic_postinc(&cur_time);
+				rcu_call(&b->rcu, seq_cb);
+			} else {
+				TPRINTF("\n[out-of-mem]\n");
+				seq_test_result = ENOMEM;
+				free(a);
+				free(b);
+				return;
+			}
+		}
+		
+	} 
+}
+
+static bool do_seq_check(void)
+{
+	/*
+	 * Threads alternate between reader and updater roles (see seq_func()),
+	 * each with a different read/update mix. Fails if a reader detected a
+	 * race or if memory ran out.
+	 */
+	const size_t iters = 100;
+	const size_t total_cnt = 1000;
+	const size_t thr_cnt = get_thread_cnt();
+	size_t read_cnt[MAX_THREADS] = {0};
+	seq_work_t item[MAX_THREADS];
+	size_t total_cbs = 0;
+
+	/* Reset the state shared with seq_func() and seq_cb(). */
+	seq_test_result = EOK;
+	max_upd_done_time = 0;
+	atomic_set(&cur_time, 1);
+
+	/* Thread i does read_cnt[i] reads and the rest of total_cnt as updates. */
+	get_seq(0, total_cnt, thr_cnt, read_cnt);
+	for (size_t i = 0; i < thr_cnt; ++i) {
+		item[i].iters = iters;
+		item[i].read_cnt = read_cnt[i];
+		item[i].update_cnt = total_cnt - read_cnt[i];
+		/* Each update round posts two callbacks. */
+		total_cbs += 2 * iters * item[i].update_cnt;
+	}
+
+	size_t max_used_mem = total_cbs * sizeof(seq_item_t);
+	const char *mem_suffix;
+	uint64_t mem_units;
+	bin_order_suffix(max_used_mem, &mem_units, &mem_suffix, false);
+
+	TPRINTF("\nRun %zu th: check callback completion time in readers. "
+		"%zu callbacks total (max %" PRIu64 " %s used). Be patient.\n", 
+		thr_cnt, total_cbs, mem_units, mem_suffix);
+
+	for (size_t i = 0; i < thr_cnt; ++i)
+		run_one(seq_func, &item[i]);
+	TPRINTF("\nJoining %zu seq-threads\n", thr_cnt);
+	join_all();
+
+	if (seq_test_result == ENOMEM) {
+		TPRINTF("\nErr: out-of mem\n");
+	} else if (seq_test_result == ERACE) {
+		TPRINTF("\nERROR: race detected!!\n");
+	}
+	return seq_test_result == EOK;
+}
+
+/*-------------------------------------------------------------------*/
+
+
+static void reader_unlocked(rcu_item_t *item)
+{
+	/* The rcu_item_t is the first member of exited_t, hence the plain cast. */
+	((exited_t *)item)->exited = true;
+}
+
+static void reader_exit(void *arg)
+{
+	rcu_read_lock();
+	rcu_read_lock();
+	rcu_read_lock();
+	rcu_read_unlock();
+	/* Three locks and one unlock: two levels of nesting are still open here. */
+	rcu_call((rcu_item_t*)arg, reader_unlocked);
+	/* Open two more levels; none of the four is ever closed. */
+	rcu_read_lock();
+	rcu_read_lock();
+	
+	/* Exit without unlocking the rcu reader section. */
+}
+
+static bool do_reader_exit(void)
+{
+	/*
+	 * A thread exits while still inside nested reader sections; the callback
+	 * it posted must nevertheless be invoked within the 2 second wait.
+	 */
+	TPRINTF("\nReader exits thread with rcu_lock\n");
+
+	exited_t *probe = malloc(sizeof(exited_t), FRAME_ATOMIC);
+	if (!probe) {
+		TPRINTF("[out-of-mem]\n");
+		return false;
+	}
+	probe->exited = false;
+
+	run_one(reader_exit, probe);
+	join_one();
+	int result = EOK;
+	wait_for_cb_exit(2 /* secs */, probe, &result);
+
+	if (result == EOK) {
+		free(probe);
+		return true;
+	}
+	TPRINTF("Err: RCU locked up after exiting from within a reader\n");
+	/* Leak the mem. */
+	return false;
+}
+
+/*-------------------------------------------------------------------*/
+
+/*-------------------------------------------------------------------*/
+
+typedef struct preempt_struct {
+	exited_t e;  /* e.rcu is handed to rcu_call(); e.exited is set by the callback. */
+	int result;  /* EOK, or ERACE/ECBLOST once a problem was detected. */
+} preempt_t;
+
+
+static void preempted_unlocked(rcu_item_t *item)
+{
+	preempt_t *p = member_to_inst(item, preempt_t, e.rcu);  /* Recover the enclosing preempt_t. */
+	p->e.exited = true;  /* Polled by wait_for_cb_exit() and the preempted readers. */
+	TPRINTF("Callback().\n");
+}
+
+static void preempted_reader_prev(void *arg)
+{
+	preempt_t *p = (preempt_t*)arg;
+	ASSERT(!p->e.exited);
+	/* Scenario: the reader section (with a reschedule inside) ends before the callback is posted. */
+	TPRINTF("reader_prev{ ");
+	
+	rcu_read_lock();
+	scheduler();
+	rcu_read_unlock();
+
+	/* 
+	 * Start GP after exiting reader section w/ preemption. 
+	 * Just check that the callback does not lock up and is not lost.
+	 */
+	rcu_call(&p->e.rcu, preempted_unlocked);
+
+	TPRINTF("}reader_prev\n");
+}
+
+static void preempted_reader_inside_cur(void *arg)
+{
+	preempt_t *p = (preempt_t*)arg;
+	ASSERT(!p->e.exited);
+	/* Scenario: the callback is posted first; a short reader section with a sleep follows. */
+	TPRINTF("reader_inside_cur{ ");
+	/* 
+	 * Start a GP and try to finish the reader before 
+	 * the GP ends (including preemption). 
+	 */
+	rcu_call(&p->e.rcu, preempted_unlocked);
+
+	/* Give RCU threads a chance to start up. */
+	scheduler();
+	scheduler();
+
+	rcu_read_lock();
+	/* Come back as soon as possible to complete before GP ends. */
+	thread_usleep(2);
+	rcu_read_unlock();
+
+	TPRINTF("}reader_inside_cur\n");
+}
+
+
+static void preempted_reader_cur(void *arg)
+{
+	preempt_t *p = (preempt_t*)arg;
+	ASSERT(!p->e.exited);
+	/* Scenario: the callback is posted inside a reader section that then sleeps. */
+	TPRINTF("reader_cur{ ");
+	rcu_read_lock();
+
+	/* Start GP. */
+	rcu_call(&p->e.rcu, preempted_unlocked);
+
+	/* Preempt while cur GP detection is running */
+	thread_sleep(1);
+	
+	/* Err: exited before this reader completed. */
+	if (p->e.exited)
+		p->result = ERACE;
+
+	rcu_read_unlock();
+	TPRINTF("}reader_cur\n");
+}
+
+static void preempted_reader_next1(void *arg)
+{
+	preempt_t *p = (preempt_t*)arg;
+	ASSERT(!p->e.exited);
+	/* Scenario: the reader is rescheduled first and only then posts the callback. */
+	TPRINTF("reader_next1{ ");
+	rcu_read_lock();
+
+	/* Preempt before cur GP detection starts. */
+	scheduler();
+	
+	/* Start GP. */
+	rcu_call(&p->e.rcu, preempted_unlocked);
+
+	/* Err: exited before this reader completed. */
+	if (p->e.exited)
+		p->result = ERACE;
+
+	rcu_read_unlock();
+	TPRINTF("}reader_next1\n");
+}
+
+static void preempted_reader_next2(void *arg)
+{
+	preempt_t *p = (preempt_t*)arg;
+	ASSERT(!p->e.exited);
+	/* Scenario: as preempted_reader_next1(), plus two sleeps after posting the callback. */
+	TPRINTF("reader_next2{ ");
+	rcu_read_lock();
+
+	/* Preempt before cur GP detection starts. */
+	scheduler();
+	
+	/* Start GP. */
+	rcu_call(&p->e.rcu, preempted_unlocked);
+
+	/* 
+	 * Preempt twice while GP is running after we've been known 
+	 * to hold up the GP just to make sure multiple preemptions
+	 * are properly tracked if a reader is delaying the cur GP.
+	 */
+	thread_sleep(1);
+	thread_sleep(1);
+
+	/* Err: exited before this reader completed. */
+	if (p->e.exited)
+		p->result = ERACE;
+
+	rcu_read_unlock();
+	TPRINTF("}reader_next2\n");
+}
+
+
+static bool do_one_reader_preempt(void (*f)(void*), const char *err)
+{
+	/* Runs reader f and waits for its callback; prints err and fails on any error. */
+	preempt_t *p = malloc(sizeof(preempt_t), FRAME_ATOMIC);
+	if (!p) {
+		TPRINTF("[out-of-mem]\n");
+		return false;
+	}
+
+	p->e.exited = false;
+	p->result = EOK;
+
+	run_one(f, p);
+	join_one();
+	/* Wait at most 4 secs. */
+	wait_for_cb_exit(4, &p->e, &p->result);
+
+	if (p->result == EOK) {
+		free(p);
+		return true;
+	}
+	/* err is data, not a format: never pass it as the format string. */
+	TPRINTF("%s", err);
+	/* Leak a bit of mem. */
+	return false;
+}
+
+static bool do_reader_preempt(void)
+{
+	/*
+	 * The preemption scenarios, each paired with the message printed when it
+	 * fails. All of them are run even if an earlier one failed.
+	 */
+	static const struct {
+		void (*reader)(void *);
+		const char *err;
+	} scenario[] = {
+		{ preempted_reader_prev, "Err: preempted_reader_prev()\n" },
+		{ preempted_reader_inside_cur, "Err: preempted_reader_inside_cur()\n" },
+		{ preempted_reader_cur, "Err: preempted_reader_cur()\n" },
+		{ preempted_reader_next1, "Err: preempted_reader_next1()\n" },
+		{ preempted_reader_next2, "Err: preempted_reader_next2()\n" },
+	};
+	const size_t scenario_cnt = sizeof(scenario) / sizeof(scenario[0]);
+
+	TPRINTF("\nReaders will be preempted.\n");
+
+	bool success = true;
+
+	for (size_t i = 0; i < scenario_cnt; ++i) {
+		bool ok = do_one_reader_preempt(scenario[i].reader, scenario[i].err);
+		success = success && ok;
+	}
+
+	return success;
+}
+
+/*-------------------------------------------------------------------*/
+typedef struct {	/* state shared between do_synch() and synch_reader() */
+	bool reader_done;	/* set by synch_reader() just before it leaves its reader section */
+	bool reader_running;	/* set by synch_reader() once it is inside its reader section */
+	bool synch_running;	/* set by do_synch() right before it calls rcu_synchronize() */
+} synch_t;
+/* Reader thread body (arg is a synch_t): keeps a reader section open until after do_synch() sets synch_running. */
+static void synch_reader(void *arg)
+{
+	synch_t *synch = (synch_t *) arg;
+	
+	rcu_read_lock();
+
+	/* Order accesses of synch after the reader section begins. */
+	memory_barrier();
+	
+	synch->reader_running = true;
+	
+	while (!synch->synch_running) {
+		/* 0.5 sec */
+		delay(500 * 1000);
+	}
+	
+	/* Run for 1 sec */
+	delay(1000 * 1000);
+	/* thread_join() propagates done to do_synch() */
+	synch->reader_done = true;
+	
+	rcu_read_unlock();
+}
+
+/* Starts a long-running reader, calls rcu_synchronize() while it is inside its section, then checks reader_done. */
+static bool do_synch(void)
+{
+	TPRINTF("\nSynchronize with long reader\n");
+	
+	synch_t *synch = malloc(sizeof(synch_t), FRAME_ATOMIC);
+	
+	if (!synch) {
+		TPRINTF("[out-of-mem]\n");
+		return false;
+	}
+	
+	synch->reader_done = false;
+	synch->reader_running = false;
+	synch->synch_running = false;
+	
+	run_one(synch_reader, synch);	
+	
+	/* Wait for the reader to enter its critical section. */
+	scheduler();
+	while (!synch->reader_running) {
+		thread_usleep(500 * 1000);
+	}
+	
+	synch->synch_running = true;
+	
+	rcu_synchronize();
+	join_one();
+	
+	/* NOTE(review): read only after join_one(); if that waits for the reader to exit, this check cannot fail - confirm. */
+	if (synch->reader_done) {
+		free(synch);
+		return true;
+	} else {
+		TPRINTF("Err: synchronize() exited prematurely \n");
+		/* Leak some mem. */
+		return false;
+	}
+}
+
+/*-------------------------------------------------------------------*/
+typedef struct {	/* state of the do_barrier() subtest */
+	rcu_item_t rcu_item;	/* item handed to rcu_call() by do_barrier() */
+	atomic_t done;	/* 0 initially; set to 1 by barrier_callback() */
+} barrier_t;
+/* rcu callback: marks the barrier_t that embeds item as done. */
+static void barrier_callback(rcu_item_t *item)
+{
+	barrier_t *owner = member_to_inst(item, barrier_t, rcu_item);
+	atomic_set(&owner->done, 1);
+}
+/* Posts one rcu callback, calls rcu_barrier(), and checks that the callback has run by then. */
+static bool do_barrier(void)
+{
+	TPRINTF("\nrcu_barrier: Wait for outstanding rcu callbacks to complete\n");
+
+	barrier_t *b = malloc(sizeof(barrier_t), FRAME_ATOMIC);
+	if (!b) {
+		TPRINTF("[out-of-mem]\n");
+		return false;
+	}
+
+	/* Arm the flag, post the callback, then wait for outstanding callbacks. */
+	atomic_set(&b->done, 0);
+	rcu_call(&b->rcu_item, barrier_callback);
+	rcu_barrier();
+
+	/* The callback is expected to have set the flag by now. */
+	if (atomic_get(&b->done) != 1) {
+		TPRINTF("rcu_barrier() exited prematurely.\n");
+		/* Leak some mem. */
+		return false;
+	}
+
+	free(b);
+	return true;
+}
+
+/*-------------------------------------------------------------------*/
+
+typedef struct {	/* argument of stress_updater() */
+	size_t iters;	/* number of rcu items the updater posts */
+	bool master;	/* when set, the updater prints progress dots */
+} stress_t;
+
+/* Reader thread body (arg points to a bool): enters and leaves empty reader sections until *arg becomes true. */
+static void stress_reader(void *arg)
+{
+	bool *done = (bool*) arg;
+	
+	while (!*done) {
+		rcu_read_lock();
+		rcu_read_unlock();
+		
+		/* 
+		 * Do some work outside of the reader section so we are not always
+		 * preempted in the reader section.
+		 */
+		delay(5);
+	}
+}
+/* rcu callback of the stress test: delays briefly, then frees the item posted by stress_updater(). */
+static void stress_cb(rcu_item_t *item)
+{
+	/* 5 us * 1000 * 1000 iters == 5 sec per updater thread */
+	delay(5);
+	free(item);
+}
+/* Updater thread body (arg is a stress_t): posts s->iters heap-allocated rcu items that stress_cb() frees. */
+static void stress_updater(void *arg)
+{
+	stress_t *s = (stress_t *)arg;
+	/* 1% progress step, clamped to 1 so the modulo below never divides by zero. */
+	size_t step = (s->iters >= 100) ? s->iters / 100 : 1;
+
+	for (size_t i = 0; i < s->iters; ++i) {
+		rcu_item_t *item = malloc(sizeof(rcu_item_t), FRAME_ATOMIC);
+		if (!item) {
+			TPRINTF("[out-of-mem]\n");
+			return;
+		}
+		rcu_call(item, stress_cb);
+
+		/* Print a dot if we make a progress of 1% */
+		if (s->master && 0 == (i % step))
+			TPRINTF(".");
+	}
+}
+/* Stress subtest: runs nop-readers and callback-posting updaters concurrently; always returns true. */
+static bool do_stress(void)
+{
+	size_t cb_per_thread = 1000 * 1000;
+	bool done = false;
+	stress_t master = { .iters = cb_per_thread, .master = true }; 
+	stress_t worker = { .iters = cb_per_thread, .master = false }; 
+	
+	size_t thread_cnt = min(MAX_THREADS / 2, config.cpu_active);
+	/* Each cpu has one reader and one updater. */
+	size_t reader_cnt = thread_cnt;
+	size_t updater_cnt = thread_cnt;
+	
+	size_t exp_upd_calls = updater_cnt * cb_per_thread;
+	size_t max_used_mem = exp_upd_calls * sizeof(rcu_item_t);
+	
+	const char *mem_suffix;
+	uint64_t mem_units;
+	bin_order_suffix(max_used_mem, &mem_units, &mem_suffix, false);
+
+	TPRINTF("\nStress: Run %zu nop-readers and %zu updaters. %zu callbacks"
+		" total (max %" PRIu64 " %s used). Be very patient.\n", 
+		reader_cnt, updater_cnt, exp_upd_calls, mem_units, mem_suffix);
+	
+	for (size_t k = 0; k < reader_cnt; ++k) {
+		run_one(stress_reader, &done);
+	}
+
+	/* Only the first updater (k == 0) gets the progress-printing master config. */
+	for (size_t k = 0; k < updater_cnt; ++k) {
+		run_one(stress_updater, k > 0 ? &worker : &master);
+	}
+	
+	TPRINTF("\nJoining %zu stress updaters.\n", updater_cnt);
+	
+	for (size_t k = 0; k < updater_cnt; ++k) {
+		join_one();
+	}
+	
+	done = true;	/* lets the stress_reader() loops terminate */
+
+	TPRINTF("\nJoining %zu stress nop-readers.\n", reader_cnt);
+	
+	join_all();
+	return true;
+}
+/*-------------------------------------------------------------------*/
+
+typedef struct {	/* state of one run_expedite() sequence */
+	rcu_item_t r;	/* item that expedite_cb() keeps re-posting */
+	size_t total_cnt;	/* initial count; used only to pace the progress output */
+	size_t count_down;	/* callbacks still to run; 0 tells run_expedite() the sequence is over */
+	bool expedite;	/* passed as the first argument of _rcu_call() */
+} expedite_t;
+/* rcu callback: re-posts itself until count_down runs out, then publishes count_down == 0 for run_expedite(). */
+static void expedite_cb(rcu_item_t *arg)
+{
+	expedite_t *e = member_to_inst(arg, expedite_t, r);
+
+	if (1 < e->count_down) {
+		--e->count_down;
+		/* One star per 1% of progress; skipped when 1% rounds to 0 (the modulo would divide by zero). */
+		if (e->total_cnt >= 100 && 0 == (e->count_down % (e->total_cnt/100))) {
+			TPRINTF("*");
+		}
+		/* Chain the next callback using the same item. */
+		_rcu_call(e->expedite, &e->r, expedite_cb);
+	} else {
+		/* Do not touch any of e's mem after we declare we're done with it. */
+		memory_barrier();
+		e->count_down = 0;
+	}
+}
+/* Starts a chain of cnt self-reposting callbacks (exp is forwarded to _rcu_call()) and polls until it ends. */
+static void run_expedite(bool exp, size_t cnt)
+{
+	expedite_t e;	/* lives on this stack frame, so we must not return before the chain ends */
+	e.total_cnt = cnt;
+	e.count_down = cnt;
+	e.expedite = exp;
+	
+	_rcu_call(e.expedite, &e.r, expedite_cb);
+	
+	while (0 < e.count_down) {
+		thread_sleep(1);
+		TPRINTF(".");
+	}
+}
+/* Runs a long expedited callback chain, then a shorter non-expedited one; always returns true. */
+static bool do_expedite(void)
+{
+	size_t exp_cnt = 1000 * 1000;
+	size_t normal_cnt = 1 * 1000;
+	
+	TPRINTF("Expedited: sequence of %zu rcu_calls\n", exp_cnt);
+	run_expedite(true, exp_cnt);
+	TPRINTF("Normal/non-expedited: sequence of %zu rcu_calls\n", normal_cnt);
+	run_expedite(false, normal_cnt);
+	return true;
+}
+/*-------------------------------------------------------------------*/
+
+struct test_func {	/* one row of the subtest table in test_rcu1() */
+	bool include;	/* subtest is run when set, reported as skipped otherwise */
+	bool (*func)(void);	/* subtest body; returns true on success */
+	const char *desc;	/* name printed in the progress messages */
+};
+
+/* Entry point of the rcu1 test: runs the enabled subtests in table order; NULL on success, else an error string. */
+const char *test_rcu1(void)
+{
+	struct test_func test_func[] = {
+		{ 1, do_one_cb, "do_one_cb" },
+		{ 1, do_reader_preempt, "do_reader_preempt" },
+		{ 1, do_synch, "do_synch" },
+		{ 1, do_barrier, "do_barrier" },
+		{ 1, do_reader_exit, "do_reader_exit" },
+		{ 1, do_nop_readers, "do_nop_readers" },
+		{ 1, do_seq_check, "do_seq_check" },
+		{ 0, do_long_readers, "do_long_readers" },
+		{ 1, do_nop_callbacks, "do_nop_callbacks" },
+		{ 0, do_expedite, "do_expedite" },
+		{ 1, do_stress, "do_stress" },
+		{ 0, NULL, NULL }	/* terminator: a NULL func ends the loop below */
+	};
+	
+	bool success = true;
+	bool ok = true;
+	uint64_t completed_gps = rcu_completed_gps();
+	uint64_t delta_gps = 0;
+	
+	for (int i = 0; test_func[i].func; ++i) {
+		if (!test_func[i].include) {
+			TPRINTF("\nSubtest %s() skipped.\n", test_func[i].desc);
+			continue;
+		} else {
+			TPRINTF("\nRunning subtest %s.\n", test_func[i].desc);
+		}
+		
+		ok = test_func[i].func();
+		success = success && ok;
+		
+		/* Change of rcu_completed_gps() since the previous sample; printed as "GPs" below. */
+		delta_gps = rcu_completed_gps() - completed_gps;
+		completed_gps += delta_gps;
+		/* A failed subtest does not stop the run; the remaining subtests still execute. */
+		if (ok) {  
+			TPRINTF("\nSubtest %s() ok (GPs: %" PRIu64 ").\n", 
+				test_func[i].desc, delta_gps);
+		} else {
+			TPRINTF("\nFailed: %s(). Pausing for 5 secs.\n", test_func[i].desc);
+			thread_sleep(5);
+		} 
+	}
+
+	if (success)
+		return NULL;
+	else
+		return "One of the tests failed.";
+}
Index: kernel/test/synch/rcu1.def
===================================================================
--- kernel/test/synch/rcu1.def	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
+++ kernel/test/synch/rcu1.def	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -0,0 +1,6 @@
+{
+	"rcu1",
+	"Basic RCU test",
+	&test_rcu1,
+	true
+},
Index: kernel/test/synch/workq-test-core.h
===================================================================
--- kernel/test/synch/workq-test-core.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
+++ kernel/test/synch/workq-test-core.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -0,0 +1,207 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <test.h>
+#include <arch.h>
+#include <atomic.h>
+#include <print.h>
+#include <proc/thread.h>
+#include <memstr.h>
+#include <synch/workqueue.h>
+
+
+typedef struct test_work {	/* work item processed by reproduce() */
+	work_t work_item;	/* first member: reproduce() casts its work_t pointer back to test_work_t */
+	int master;	/* nonzero for the one item that carries on into the next wave */
+	int wave;	/* index of the current wave; also the index into call_cnt[] */
+	int count_down;	/* remaining re-enqueues of this item in the current wave */
+} test_work_t;
+/* Per-wave count of reproduce() invocations; reset and checked by run_workq_core(). */
+static atomic_t call_cnt[WAVES];
+
+
+/* Fwd decl - implement in your actual test file; reproduce() treats a zero return as "not enqueued". */
+static int core_workq_enqueue(work_t *work_item, work_func_t func);
+
+/* Moves work on to the next wave; false once all WAVES waves have been started. */
+static bool new_wave(test_work_t *work)
+{
+	work->wave += 1;
+	/* Past the last wave: leave count_down alone and report the end. */
+	if (work->wave >= WAVES)
+		return false;
+
+	/* A fresh wave starts with the full countdown. */
+	work->count_down = COUNT;
+	return true;
+}
+
+/* Nonzero iff num, reinterpreted as unsigned, has exactly one bit set. */
+static int is_pow2(int num)
+{
+	unsigned bits = (unsigned)num;
+	return bits != 0 && (bits & (bits - 1)) == 0;
+}
+/* Allocates a non-master work item that inherits the wave and count_down of work; its work_item is left unset. */
+static test_work_t * create_child(test_work_t *work)
+{
+	test_work_t *child = malloc(sizeof(test_work_t), 0);
+	ASSERT(child);
+	if (child) {
+		child->master = false;
+		child->wave = work->wave;
+		child->count_down = work->count_down;
+	}
+	
+	return child;
+}
+/* Overwrites the item with the 0xfa byte pattern and frees it, presumably so stale use is easy to spot. */
+static void free_work(test_work_t *work)
+{
+	memsetb(work, sizeof(test_work_t), 0xfa);
+	free(work);
+}
+/* Work function: counts the call for its wave, re-enqueues itself while count_down > 0 and spawns children on the way. */
+static void reproduce(work_t *work_item)
+{
+	/* Ensure work_item is ours for the taking. */
+	memsetb(work_item, sizeof(work_t), 0xec);
+	
+	test_work_t *work = (test_work_t *)work_item;
+	
+	atomic_inc(&call_cnt[work->wave]);
+	
+	if (0 < work->count_down) {
+		/* Sleep right before creating the last generation. */
+		if (1 == work->count_down) {
+			bool sleeping_wave = ((work->wave % 2) == 1);
+
+			/* Master never sleeps. */
+			if (sleeping_wave && !work->master) {
+				thread_usleep(WAVE_SLEEP_MS * 1000);
+			}
+		}
+		
+		--work->count_down;
+
+		/* 
+		 * Enqueue a child if count_down is power-of-2. 
+		 * Leads to exponential growth. 
+		 */
+		if (is_pow2(work->count_down + 1)) {
+			test_work_t *child = create_child(work);
+			if (child) {
+				if (!core_workq_enqueue(&child->work_item, reproduce))
+					free_work(child);
+			}
+		}
+		
+		if (!core_workq_enqueue(work_item, reproduce)) {
+			if (work->master) 
+				TPRINTF("\nErr: Master work item exiting prematurely!\n");
+
+			free_work(work);
+		}
+	} else {
+		/* We're done with this wave - only the master survives. */
+		
+		if (work->master && new_wave(work)) {
+			if (!core_workq_enqueue(work_item, reproduce)) {
+				TPRINTF("\nErr: Master work could not start a new wave!\n");
+				free_work(work);
+			}
+		} else {
+			if (work->master)
+				TPRINTF("\nMaster work item done.\n");
+				
+			free_work(work);
+		}
+	}
+}
+/* Enqueues the master item, polls until every wave hit its expected call count or time ran out; NULL on success. */
+static const char *run_workq_core(bool end_prematurely)
+{
+	for (int i = 0; i < WAVES; ++i) {
+		atomic_set(&call_cnt[i], 0);
+	}
+
+	test_work_t *work = malloc(sizeof(test_work_t), 0);
+	if (!work)
+		return "Failed to allocate the master work item.\n";
+
+	work->master = true;
+	work->wave = 0;
+	work->count_down = COUNT;
+	
+	/*
+	 * k == COUNT_POW
+	 * 2^k == COUNT + 1
+	 * 
+	 * We have "k" branching points. Therefore:
+	 * exp_call_cnt == k*2^(k-1) + 2^k == (k + 2) * 2^(k-1)
+	 */
+	size_t exp_call_cnt = (COUNT_POW + 2) * (1 << (COUNT_POW - 1));
+	
+	TPRINTF("waves: %d, count_down: %d, total expected calls: %zu\n", 
+		WAVES, COUNT, exp_call_cnt * WAVES);
+
+	/* An item that was never queued is still ours: free it and fail now instead of polling until timeout. */
+	if (!core_workq_enqueue(&work->work_item, reproduce)) {
+		free_work(work);
+		return "Failed to enqueue the master work item.\n";
+	}
+
+	size_t sleep_cnt = 0;
+	/* Poll for at most MAIN_MAX_SLEEP_SEC seconds over all waves (only 2 sec when asked to end prematurely). */
+	size_t max_sleep_secs = end_prematurely ? 2 : MAIN_MAX_SLEEP_SEC;
+	size_t max_sleep_cnt = (max_sleep_secs * 1000) / MAIN_POLL_SLEEP_MS;
+	
+	for (int i = 0; i < WAVES; ++i) {
+		while (atomic_get(&call_cnt[i]) < exp_call_cnt 
+			&& sleep_cnt < max_sleep_cnt) {
+			TPRINTF(".");
+			thread_usleep(MAIN_POLL_SLEEP_MS * 1000);
+			++sleep_cnt;
+		}
+	}
+	
+	bool success = true;
+	
+	for (int i = 0; i < WAVES; ++i) {
+		if (atomic_get(&call_cnt[i]) == exp_call_cnt) {
+			TPRINTF("Ok: %" PRIua " calls in wave %d, as expected.\n",
+				atomic_get(&call_cnt[i]), i);
+		} else {
+			success = false;
+			TPRINTF("Error: %" PRIua " calls in wave %d, but %zu expected.\n",
+				atomic_get(&call_cnt[i]), i, exp_call_cnt);
+		} 
+	}
+
+	return success ? NULL : "Failed to invoke the expected number of calls.\n";
+}
Index: kernel/test/synch/workqueue2.c
===================================================================
--- kernel/test/synch/workqueue2.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
+++ kernel/test/synch/workqueue2.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <test.h>
+#include <arch.h>
+#include <print.h>
+#include <memstr.h>
+#include <synch/workqueue.h>
+
+/* Parameters consumed by workq-test-core.h, which is included below. */
+#define WAVES 10	/* number of waves; also the size of call_cnt[] */
+#define COUNT_POW 12	/* log2(COUNT + 1) */
+#define COUNT ((1 << COUNT_POW) - 1)	/* initial count_down of each wave */
+#define WAVE_SLEEP_MS 100	/* sleep of non-master items in odd waves, in ms */
+#define MAIN_POLL_SLEEP_MS 100	/* polling period of run_workq_core(), in ms */
+#define MAIN_MAX_SLEEP_SEC 40	/* polling budget of a full (non-premature) run, in sec */
+
+/*
+ * Include the test implementation.
+ */
+#include "workq-test-core.h"
+
+
+/*-------------------------------------------------------------------*/
+
+static work_t basic_work;	/* work item enqueued by basic_test() */
+static int basic_done = 0;	/* set to 1 by basic_test_work(); polled by basic_test() */
+/* Work function of the basic test: sets basic_done and prints a marker; work_item is unused. */
+static void basic_test_work(work_t *work_item)
+{
+	basic_done = 1;
+	TPRINTF("basic_test_work()");
+}
+
+/* Enqueues one item on the global work queue and polls once per second until basic_test_work() has run. */
+static void basic_test(void)
+{
+	TPRINTF("Issue a single work item.\n");
+	basic_done = 0;
+	workq_global_enqueue(&basic_work, basic_test_work);
+	
+	while (!basic_done) {
+		TPRINTF(".");
+		thread_sleep(1);
+	}
+
+	TPRINTF("\nBasic test done\n");
+}
+
+/*-------------------------------------------------------------------*/
+
+/* Queue under test: created in test_custom_workq_impl(), used by core_workq_enqueue(). */
+struct work_queue *workq = NULL;
+/* Enqueue hook required by workq-test-core.h: targets the custom queue in workq. */
+static int core_workq_enqueue(work_t *work_item, work_func_t func)
+{
+	return workq_enqueue(workq, work_item, func);
+}
+/*-------------------------------------------------------------------*/
+
+/* Creates a queue named qname, runs the core test on it (stop == end prematurely), then stops and destroys it. */
+static const char *test_custom_workq_impl(bool stop, const char *qname)
+{
+	workq = workq_create(qname);
+	
+	if (!workq) {
+		return "Failed to create a work queue.\n";
+	}
+	
+	const char *ret = run_workq_core(stop);
+	
+	TPRINTF("Stopping work queue...\n");
+	workq_stop(workq);
+	
+	TPRINTF("Destroying work queue...\n");
+	workq_destroy(workq);
+	return ret;
+}
+/* Full-length stress run on a custom queue; returns the result of the core test (NULL on success). */
+static const char *test_custom_workq(void)
+{
+	TPRINTF("Stress testing a custom queue.\n");
+	return test_custom_workq_impl(false, "test-workq");
+}
+
+/* Stress run that stops the queue while work is still pending; the result is ignored, so this always returns NULL. */
+static const char *test_custom_workq_stop(void)
+{
+	TPRINTF("Stress testing a custom queue. Stops prematurely. "
+		"Errors are expected.\n");
+	test_custom_workq_impl(true, "test-workq-stop");
+	/* Errors are expected. */
+	return NULL;
+}
+
+/* Runs all work queue tests in sequence; returns the last error reported (printed as data, not as a format), or NULL. */
+const char *test_workqueue_all(void)
+{
+	const char *err = NULL;
+	const char *res;
+	
+	basic_test();
+	
+	res = test_custom_workq();
+	if (res) {
+		TPRINTF("%s", res);
+		err = res;
+	}
+	
+	res = test_custom_workq_stop();
+	if (res) {
+		TPRINTF("%s", res);
+		err = res;
+	}
+	
+	res = test_workqueue3();
+	if (res) {
+		TPRINTF("%s", res);
+		err = res;
+	}
+
+	return err;
+}
Index: kernel/test/synch/workqueue2.def
===================================================================
--- kernel/test/synch/workqueue2.def	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
+++ kernel/test/synch/workqueue2.def	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -0,0 +1,6 @@
+{
+	"workqueue",
+	"Separate and system work queue stress test",
+	&test_workqueue_all,
+	true
+},
Index: kernel/test/synch/workqueue3.c
===================================================================
--- kernel/test/synch/workqueue3.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
+++ kernel/test/synch/workqueue3.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2012 Adam Hraska
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <test.h>
+#include <arch.h>
+#include <print.h>
+#include <memstr.h>
+#include <synch/workqueue.h>
+
+/* Parameters consumed by workq-test-core.h, which is included below. */
+#define WAVES 10	/* number of waves; also the size of call_cnt[] */
+#define COUNT_POW 12	/* log2(COUNT + 1) */
+#define COUNT ((1 << COUNT_POW) - 1)	/* initial count_down of each wave */
+#define WAVE_SLEEP_MS 100	/* sleep of non-master items in odd waves, in ms */
+#define MAIN_POLL_SLEEP_MS 100	/* polling period of run_workq_core(), in ms */
+#define MAIN_MAX_SLEEP_SEC 40	/* polling budget of a full (non-premature) run, in sec */
+
+/*
+ * Include the test implementation.
+ */
+#include "workq-test-core.h"
+
+/* Enqueue hook required by workq-test-core.h: targets the global work queue. */
+static int core_workq_enqueue(work_t *work_item, work_func_t func)
+{
+	return workq_global_enqueue(work_item, func);
+}
+
+
+/* Runs the core test on the global queue, and a second time if the first run passed; returns the last result. */
+static const char *do_test(bool exit_early)
+{
+	const char *err = NULL;
+	TPRINTF("Stress testing system queue.\n");
+	TPRINTF("First run:\n");
+	err = run_workq_core(exit_early);
+
+	if (!err) {
+		TPRINTF("\nSecond run:\n");
+		err = run_workq_core(exit_early);
+	} 
+
+	TPRINTF("Done.\n");
+	
+	return err;
+}
+/* Global queue stress test that polls for the full time budget; NULL on success. */
+const char *test_workqueue3(void)
+{
+	return do_test(false);
+}
+/* Global queue test that gives up polling early (do_test(true)); returns whatever do_test() reports. */
+const char *test_workqueue3quit(void)
+{
+	return do_test(true);
+}
Index: kernel/test/synch/workqueue3.def
===================================================================
--- kernel/test/synch/workqueue3.def	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
+++ kernel/test/synch/workqueue3.def	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -0,0 +1,6 @@
+{
+	"workqueue3quit",
+	"Global work queue test, exits early",
+	&test_workqueue3quit,
+	true
+},
Index: kernel/test/test.c
===================================================================
--- kernel/test/test.c	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/test/test.c	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -41,4 +41,5 @@
 #include <avltree/avltree1.def>
 #include <btree/btree1.def>
+#include <cht/cht1.def>
 #include <debug/mips1.def>
 #include <fault/fault1.def>
@@ -50,4 +51,7 @@
 #include <synch/semaphore1.def>
 #include <synch/semaphore2.def>
+#include <synch/rcu1.def>
+#include <synch/workqueue2.def>
+#include <synch/workqueue3.def>
 #include <print/print1.def>
 #include <print/print2.def>
@@ -56,4 +60,5 @@
 #include <print/print5.def>
 #include <thread/thread1.def>
+#include <smpcall/smpcall1.def>
 	{
 		.name = NULL,
Index: kernel/test/test.h
===================================================================
--- kernel/test/test.h	(revision f892ed3b1ead6db400671bf1e99fdb0b5af3c724)
+++ kernel/test/test.h	(revision 235d31de52ec23ea2a4cb3b7837814333820b9cd)
@@ -59,4 +59,5 @@
 extern const char *test_avltree1(void);
 extern const char *test_btree1(void);
+extern const char *test_cht1(void);
 extern const char *test_mips1(void);
 extern const char *test_fault1(void);
@@ -75,4 +76,10 @@
 extern const char *test_print5(void);
 extern const char *test_thread1(void);
+extern const char *test_smpcall1(void);
+extern const char *test_workqueue_all(void);
+extern const char *test_workqueue3(void);
+extern const char *test_workqueue3quit(void);
+extern const char *test_rcu1(void);
+
 
 extern test_t tests[];
