Index: kernel/arch/sparc64/include/asm.h
===================================================================
--- kernel/arch/sparc64/include/asm.h	(revision 32fffef00aa36a7913892cc0d7ff6eff828b2e7e)
+++ kernel/arch/sparc64/include/asm.h	(revision 0fa6044030c5c9175f76bf5a500b45f6f66598e6)
@@ -316,4 +316,10 @@
 }
 
+/** Flush all valid register windows to memory using the flushw instruction. */
+static inline void flushw(void)
+{
+	__asm__ volatile ("flushw\n");
+}
+
 void cpu_halt(void);
 void cpu_sleep(void);
Index: kernel/arch/sparc64/include/trap/trap_table.h
===================================================================
--- kernel/arch/sparc64/include/trap/trap_table.h	(revision 32fffef00aa36a7913892cc0d7ff6eff828b2e7e)
+++ kernel/arch/sparc64/include/trap/trap_table.h	(revision 0fa6044030c5c9175f76bf5a500b45f6f66598e6)
@@ -82,8 +82,17 @@
  * definition of the istate structure.
  */
-#define PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE	(STACK_WINDOW_SAVE_AREA_SIZE+(4*8))
+#define PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE	(STACK_WINDOW_SAVE_AREA_SIZE+(12*8))
 #define SAVED_TSTATE	-(1*8)
 #define SAVED_TPC	-(2*8)
-#define SAVED_TNPC	-(3*8)
+#define SAVED_TNPC	-(3*8)		/* <-- istate_t begins here */
+/* alignment gap */
+#define SAVED_I0	-(5*8)
+#define SAVED_I1	-(6*8)
+#define SAVED_I2	-(7*8)
+#define SAVED_I3	-(8*8)
+#define SAVED_I4	-(9*8)
+#define SAVED_I5	-(10*8)
+#define SAVED_I6	-(11*8)
+#define SAVED_I7	-(12*8)
 
 .macro PREEMPTIBLE_HANDLER f
Index: kernel/arch/sparc64/src/proc/scheduler.c
===================================================================
--- kernel/arch/sparc64/src/proc/scheduler.c	(revision 32fffef00aa36a7913892cc0d7ff6eff828b2e7e)
+++ kernel/arch/sparc64/src/proc/scheduler.c	(revision 0fa6044030c5c9175f76bf5a500b45f6f66598e6)
@@ -36,8 +36,10 @@
 #include <proc/thread.h>
 #include <arch.h>
+#include <arch/asm.h>
 #include <arch/mm/tlb.h>
 #include <arch/mm/page.h>
 #include <config.h>
 #include <align.h>
+#include <macros.h>
 
 /** Perform sparc64 specific tasks needed before the new task is run. */
@@ -46,5 +48,9 @@
 }
 
-/** Ensure that thread's kernel stack is locked in TLB. */
+/** Perform sparc64 specific steps before scheduling a thread.
+ *
+ * Ensure that thread's kernel stack, as well as userspace window
+ * buffer for userspace threads, are locked in DTLB.
+ */
 void before_thread_runs_arch(void)
 {
@@ -53,5 +59,5 @@
 	base = ALIGN_DOWN(config.base, 1<<KERNEL_PAGE_WIDTH);
 
-	if ((uintptr_t) THREAD->kstack < base || (uintptr_t) THREAD->kstack > base + (1<<KERNEL_PAGE_WIDTH)) {
+	if (!overlaps((uintptr_t) THREAD->kstack, PAGE_SIZE, base, (1<<KERNEL_PAGE_WIDTH))) {
 		/*
 		 * Kernel stack of this thread is not locked in DTLB.
@@ -59,10 +65,30 @@
 		 * If not, create a locked mapping for it.
 		 */
-		 dtlb_demap(TLB_DEMAP_PAGE, TLB_DEMAP_NUCLEUS, (uintptr_t) THREAD->kstack);
-		 dtlb_insert_mapping((uintptr_t) THREAD->kstack, KA2PA(THREAD->kstack), PAGESIZE_8K, true, true);
-	}	
+		dtlb_demap(TLB_DEMAP_PAGE, TLB_DEMAP_NUCLEUS, (uintptr_t) THREAD->kstack);
+		dtlb_insert_mapping((uintptr_t) THREAD->kstack, KA2PA(THREAD->kstack), PAGESIZE_8K, true, true);
+	}
+	
+	if ((THREAD->flags & THREAD_FLAG_USPACE)) {
+		/*
+		 * If this thread also executes in userspace, we have to lock
+		 * its userspace window buffer into DTLB.
+		 */
+		ASSERT(THREAD->arch.uspace_window_buffer);
+		uintptr_t uw_buf = (uintptr_t) THREAD->arch.uspace_window_buffer;
+		if (!overlaps(uw_buf, PAGE_SIZE, base, 1<<KERNEL_PAGE_WIDTH)) {
+			/*
+			 * The buffer is not covered by the 4M locked kernel DTLB entry.
+			 */
+			dtlb_demap(TLB_DEMAP_PAGE, TLB_DEMAP_NUCLEUS, (uintptr_t) uw_buf);
+			dtlb_insert_mapping(uw_buf, KA2PA(uw_buf), PAGESIZE_8K, true, true);
+		}
+	}
 }
 
-/** Unlock thread's stack from TLB, if necessary. */
+/** Perform sparc64 specific steps before a thread stops running.
+ *
+ * Demap any locked DTLB entries installed by the thread (i.e. kernel stack
+ * and userspace window buffer).
+ */
 void after_thread_ran_arch(void)
 {
@@ -71,5 +97,5 @@
 	base = ALIGN_DOWN(config.base, 1<<KERNEL_PAGE_WIDTH);
 
-	if ((uintptr_t) THREAD->kstack < base || (uintptr_t) THREAD->kstack > base + (1<<KERNEL_PAGE_WIDTH)) {
+	if (!overlaps((uintptr_t) THREAD->kstack, PAGE_SIZE, base, (1<<KERNEL_PAGE_WIDTH))) {
 		/*
 		 * Kernel stack of this thread is locked in DTLB.
@@ -78,4 +104,25 @@
 		dtlb_demap(TLB_DEMAP_PAGE, TLB_DEMAP_NUCLEUS, (uintptr_t) THREAD->kstack);
 	}
+	
+	if ((THREAD->flags & THREAD_FLAG_USPACE)) {
+		/*
+		 * If this thread also executes in userspace, we have to force all
+		 * its still-active userspace windows into the userspace window buffer
+		 * and demap the buffer from DTLB.
+		 */
+		ASSERT(THREAD->arch.uspace_window_buffer);
+		
+		flushw();	/* force all userspace windows into memory */
+		
+		uintptr_t uw_buf = (uintptr_t) THREAD->arch.uspace_window_buffer;
+		if (!overlaps(uw_buf, PAGE_SIZE, base, 1<<KERNEL_PAGE_WIDTH)) {
+			/*
+			 * The buffer is not covered by the 4M locked kernel DTLB entry
+			 * and therefore it was given a dedicated locked DTLB entry.
+			 * Demap it.
+			 */
+			dtlb_demap(TLB_DEMAP_PAGE, TLB_DEMAP_NUCLEUS, (uintptr_t) uw_buf);
+		}
+	}
 }
 
Index: kernel/arch/sparc64/src/trap/trap_table.S
===================================================================
--- kernel/arch/sparc64/src/trap/trap_table.S	(revision 32fffef00aa36a7913892cc0d7ff6eff828b2e7e)
+++ kernel/arch/sparc64/src/trap/trap_table.S	(revision 0fa6044030c5c9175f76bf5a500b45f6f66598e6)
@@ -413,10 +413,31 @@
 	/*
 	 * Fix CWP.
-	 */
-	mov %fp, %g1
+	 * As a reminder, the input registers in the current window
+	 * are the output registers of the window to which we want to
+	 * restore. Because the fill trap fills only input and local
+	 * registers of a window, we need to preserve those output
+	 * registers manually.
+	 */
 	flushw
+	mov %sp, %g1
+	stx %i0, [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I0]
+	stx %i1, [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I1]
+	stx %i2, [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I2]
+	stx %i3, [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I3]
+	stx %i4, [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I4]
+	stx %i5, [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I5]
+	stx %i6, [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I6]
+	stx %i7, [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I7]
 	wrpr %l0, 0, %cwp
-	mov %g1, %fp
-	
+	mov %g1, %sp
+	ldx [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I0], %i0
+	ldx [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I1], %i1
+	ldx [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I2], %i2
+	ldx [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I3], %i3
+	ldx [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I4], %i4
+	ldx [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I5], %i5
+	ldx [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I6], %i6
+	ldx [%sp + PREEMPTIBLE_HANDLER_STACK_FRAME_SIZE + STACK_BIAS + SAVED_I7], %i7
+
 	/*
 	 * OTHERWIN != 0 or fall-through from the OTHERWIN == 0 case.
Index: kernel/generic/include/macros.h
===================================================================
--- kernel/generic/include/macros.h	(revision 32fffef00aa36a7913892cc0d7ff6eff828b2e7e)
+++ kernel/generic/include/macros.h	(revision 0fa6044030c5c9175f76bf5a500b45f6f66598e6)
@@ -33,6 +33,6 @@
  */
 
-#ifndef __MACROS_H__
-#define __MACROS_H__
+#ifndef KERN_MACROS_H_
+#define KERN_MACROS_H_
 
 #include <arch/types.h>
@@ -49,5 +49,11 @@
 #define max(a,b)	((a) > (b) ? (a) : (b))
 
-/** Return true if the interlvals overlap. */
+/** Return true if the intervals overlap.
+ *
+ * @param s1 Start address of the first interval.
+ * @param sz1 Size of the first interval.
+ * @param s2 Start address of the second interval.
+ * @param sz2 Size of the second interval.
+ */
 static inline int overlaps(uintptr_t s1, size_t sz1, uintptr_t s2, size_t sz2)
 {
Index: kernel/generic/src/main/kinit.c
===================================================================
--- kernel/generic/src/main/kinit.c	(revision 32fffef00aa36a7913892cc0d7ff6eff828b2e7e)
+++ kernel/generic/src/main/kinit.c	(revision 0fa6044030c5c9175f76bf5a500b45f6f66598e6)
@@ -155,5 +155,5 @@
 #ifdef CONFIG_TEST
 	test();
-	printf("\nTest finished, please reboot\n");
+	printf("\nTest finished, please reboot.\n");
 #else  /* CONFIG_TEST */
 
