source: mainline/kernel/generic/src/proc/scheduler.c@ 5663872

Last change on this file since 5663872 was 5663872, checked in by Jiří Zárevúcky <zarevucky.jiri@…>, 18 months ago

Move stuff around for thread sleep

Only mark the thread as ready for wakeup after we switch to
another context. This way, soundness of the synchronization
does not depend on thread lock being held across the context
switch, which gives us more freedom.

  • Property mode set to 100644
File size: 16.8 KB
/*
 * Copyright (c) 2010 Jakub Jermar
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 * - Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in the
 *   documentation and/or other materials provided with the distribution.
 * - The name of the author may not be used to endorse or promote products
 *   derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/** @addtogroup kernel_generic_proc
 * @{
 */

/**
 * @file
 * @brief Scheduler and load balancing.
 *
 * This file contains the scheduler and kcpulb kernel thread which
 * performs load-balancing of per-CPU run queues.
 */
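
/*
 * Overview of the control flow below: scheduler() disables interrupts and
 * grabs THREAD->lock, scheduler_locked() saves the outgoing thread's context
 * and jumps onto the CPU's private stack, and scheduler_separated_stack()
 * finishes the job: it requeues, destroys or puts to sleep the old thread,
 * picks the next one via find_best_thread() and restores its context. On SMP
 * builds, the kcpulb() thread periodically steals ready threads from busier
 * CPUs.
 */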

#include <assert.h>
#include <atomic.h>
#include <proc/scheduler.h>
#include <proc/thread.h>
#include <proc/task.h>
#include <mm/frame.h>
#include <mm/page.h>
#include <mm/as.h>
#include <time/timeout.h>
#include <time/delay.h>
#include <arch/asm.h>
#include <arch/faddr.h>
#include <arch/cycle.h>
#include <atomic.h>
#include <synch/spinlock.h>
#include <config.h>
#include <context.h>
#include <fpu_context.h>
#include <halt.h>
#include <arch.h>
#include <adt/list.h>
#include <panic.h>
#include <cpu.h>
#include <stdio.h>
#include <log.h>
#include <stacktrace.h>

static void scheduler_separated_stack(void);

atomic_size_t nrdy; /**< Number of ready threads in the system. */

/** Take actions before new thread runs.
 *
 * Perform actions that need to be
 * taken before the newly selected
 * thread is passed control.
 *
 * THREAD->lock is locked on entry
 *
 */
static void before_thread_runs(void)
{
    before_thread_runs_arch();

#ifdef CONFIG_FPU_LAZY
    /*
     * The only concurrent modification possible for fpu_owner here is
     * another thread changing it from itself to NULL in its destructor.
     */
    thread_t *owner = atomic_load_explicit(&CPU->fpu_owner,
        memory_order_relaxed);

    if (THREAD == owner)
        fpu_enable();
    else
        fpu_disable();
#elif defined CONFIG_FPU
    fpu_enable();
    if (THREAD->fpu_context_exists)
        fpu_context_restore(&THREAD->fpu_context);
    else {
        fpu_init();
        THREAD->fpu_context_exists = true;
    }
#endif

#ifdef CONFIG_UDEBUG
    if (THREAD->btrace) {
        istate_t *istate = THREAD->udebug.uspace_state;
        if (istate != NULL) {
            printf("Thread %" PRIu64 " stack trace:\n", THREAD->tid);
            stack_trace_istate(istate);
        }

        THREAD->btrace = false;
    }
#endif
}

/** Take actions after THREAD has run.
 *
 * Perform actions that need to be
 * taken after the running thread
 * has been preempted by the scheduler.
 *
 * THREAD->lock is locked on entry
 *
 */
static void after_thread_ran(void)
{
    after_thread_ran_arch();
}

#ifdef CONFIG_FPU_LAZY
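/*
 * Lazy FPU context switching: the FPU stays disabled for threads that do not
 * own the current FPU state. The first FPU instruction such a thread executes
 * traps, and the (architecture-specific) trap handler is expected to call
 * this function, which saves the previous owner's FPU context and restores
 * or initializes the one of the current thread.
 */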
void scheduler_fpu_lazy_request(void)
{
    fpu_enable();

    /* We need this lock to ensure synchronization with thread destructor. */
    irq_spinlock_lock(&CPU->fpu_lock, false);

    /* Save old context */
    thread_t *owner = atomic_load_explicit(&CPU->fpu_owner, memory_order_relaxed);
    if (owner != NULL) {
        fpu_context_save(&owner->fpu_context);
        atomic_store_explicit(&CPU->fpu_owner, NULL, memory_order_relaxed);
    }

    irq_spinlock_unlock(&CPU->fpu_lock, false);

    if (THREAD->fpu_context_exists) {
        fpu_context_restore(&THREAD->fpu_context);
    } else {
        fpu_init();
        THREAD->fpu_context_exists = true;
    }

    atomic_store_explicit(&CPU->fpu_owner, THREAD, memory_order_relaxed);
}
#endif /* CONFIG_FPU_LAZY */

/** Initialize scheduler
 *
 * Initialize kernel scheduler.
 *
 */
void scheduler_init(void)
{
}

/** Try to get a thread from the local run queues
 *
 * Attempt to find a ready thread on this CPU
 * according to thread accounting and scheduler
 * policy, without waiting.
 *
 * @return Thread to be scheduled, or NULL if all run queues are empty.
 *
 */
static thread_t *try_find_thread(int *rq_index)
{
    assert(interrupts_disabled());
    assert(CPU != NULL);

    if (atomic_load(&CPU->nrdy) == 0)
        return NULL;

    for (int i = 0; i < RQ_COUNT; i++) {
        irq_spinlock_lock(&(CPU->rq[i].lock), false);
        if (CPU->rq[i].n == 0) {
            /*
             * If this queue is empty, try a lower-priority queue.
             */
            irq_spinlock_unlock(&(CPU->rq[i].lock), false);
            continue;
        }

        atomic_dec(&CPU->nrdy);
        atomic_dec(&nrdy);
        CPU->rq[i].n--;

        /*
         * Take the first thread from the queue.
         */
        thread_t *thread = list_get_instance(
            list_first(&CPU->rq[i].rq), thread_t, rq_link);
        list_remove(&thread->rq_link);

        irq_spinlock_pass(&(CPU->rq[i].lock), &thread->lock);

        thread->cpu = CPU;
        thread->priority = i; /* Correct rq index */

        /* Time allocation in microseconds. */
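        /*
         * Lower-priority queues get proportionally longer quanta:
         * rq[0] (the highest priority) gets 10 ms, rq[1] 20 ms, and so on.
         */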
        uint64_t time_to_run = (i + 1) * 10000;

        /* This is safe because interrupts are disabled. */
        CPU_LOCAL->preempt_deadline =
            CPU_LOCAL->current_clock_tick + us2ticks(time_to_run);

        /*
         * Clear the stolen flag so that the thread can be migrated
         * when the need for load balancing emerges.
         */
        thread->stolen = false;
        irq_spinlock_unlock(&thread->lock, false);

        *rq_index = i;
        return thread;
    }

    return NULL;
}

/** Get thread to be scheduled
 *
 * Get the optimal thread to be scheduled
 * according to thread accounting and scheduler
 * policy. If no thread is ready, idle the CPU
 * and wait until one becomes available.
 *
 * @return Thread to be scheduled.
 *
 */
static thread_t *find_best_thread(int *rq_index)
{
    assert(interrupts_disabled());
    assert(CPU != NULL);

    while (true) {
        thread_t *thread = try_find_thread(rq_index);

        if (thread != NULL)
            return thread;

        /*
         * Since there was nothing to run, the CPU goes to sleep
         * until a hardware interrupt or an IPI arrives.
         * This saves energy and improves the utilization of
         * hyperthreaded siblings.
         */
        CPU_LOCAL->idle = true;

        /*
         * Go to sleep with interrupts enabled.
         * Ideally, this should be atomic, but this is not guaranteed on
         * all platforms yet, so it is possible we will go to sleep when
         * a thread has just become available.
         */
        cpu_interruptible_sleep();
    }
}
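/** Switch to the task of the next thread
 *
 * Switch the address space if the new task uses a different one, and move
 * the TASK reference from the old task to the new one.
 */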
static void switch_task(task_t *task)
{
    /* If the task stays the same, a lot of work is avoided. */
    if (TASK == task)
        return;

    as_t *old_as = AS;
    as_t *new_as = task->as;

    /* It is possible for two tasks to share one address space. */
    if (old_as != new_as)
        as_switch(old_as, new_as);

    if (TASK)
        task_release(TASK);

    TASK = task;

    task_hold(TASK);

    before_task_runs_arch();
}

/** Prevent rq starvation
 *
 * Prevent low-priority threads from starving in the run queues.
 *
 * When the function decides to relink the run queues, it moves each
 * queue below the threshold one level up, so that threads with a
 * priority value greater than 'start' end up in a higher-priority
 * queue.
 *
 * @param start Threshold priority.
 *
 */
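/*
 * For example, with start == 0 a relink appends the old contents of rq[1]
 * to rq[0], moves the old rq[2] into rq[1], and so on, leaving the
 * lowest-priority queue rq[RQ_COUNT - 1] empty.
 */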
static void relink_rq(int start)
{
    if (CPU_LOCAL->current_clock_tick < CPU_LOCAL->relink_deadline)
        return;

    CPU_LOCAL->relink_deadline = CPU_LOCAL->current_clock_tick + NEEDS_RELINK_MAX;

    /* Temporary cache for lists we are moving. */
    list_t list;
    list_initialize(&list);

    size_t n = 0;

    /* Move every list below the 'start' queue (index greater than start) one level up. */
    for (int i = RQ_COUNT - 1; i > start; i--) {
        irq_spinlock_lock(&CPU->rq[i].lock, false);

        /* Swap lists. */
        list_swap(&CPU->rq[i].rq, &list);

        /* Swap number of items. */
        size_t tmpn = CPU->rq[i].n;
        CPU->rq[i].n = n;
        n = tmpn;

        irq_spinlock_unlock(&CPU->rq[i].lock, false);
    }

    /* Append the contents of rq[start + 1] to rq[start]. */
    if (n != 0) {
        irq_spinlock_lock(&CPU->rq[start].lock, false);
        list_concat(&CPU->rq[start].rq, &list);
        CPU->rq[start].n += n;
        irq_spinlock_unlock(&CPU->rq[start].lock, false);
    }
}
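/** Enter the scheduler
 *
 * Disable interrupts, lock THREAD->lock (if a thread is running) and hand
 * over to scheduler_locked(), which performs the actual switch. Halts the
 * CPU instead if the kernel is shutting down (haltstate).
 */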
void scheduler(void)
{
    ipl_t ipl = interrupts_disable();

    if (atomic_load(&haltstate))
        halt();

    if (THREAD) {
        irq_spinlock_lock(&THREAD->lock, false);
    }

    scheduler_locked(ipl);
}

/** The scheduler
 *
 * The thread scheduling procedure.
 * Passes control directly to
 * scheduler_separated_stack().
 *
 */
void scheduler_locked(ipl_t ipl)
{
    assert(CPU != NULL);

    if (THREAD) {
        /* Update thread kernel accounting */
        THREAD->kcycles += get_cycle() - THREAD->last_cycle;

#if (defined CONFIG_FPU) && (!defined CONFIG_FPU_LAZY)
        fpu_context_save(&THREAD->fpu_context);
#endif
        if (!context_save(&THREAD->saved_context)) {
            /*
             * This is the place where threads leave scheduler();
             */

            /* Save current CPU cycle */
            THREAD->last_cycle = get_cycle();

            irq_spinlock_unlock(&THREAD->lock, false);
            interrupts_restore(THREAD->saved_ipl);

            return;
        }

        /*
         * Interrupt priority level of preempted thread is recorded
         * here to facilitate scheduler() invocations from
         * interrupts_disable()'d code (e.g. waitq_sleep_timeout()).
         *
         */
        THREAD->saved_ipl = ipl;
    }

    /*
     * Through the 'CURRENT' structure, we keep track of THREAD, TASK, CPU, AS
     * and preemption counter. At this point CURRENT could be coming either
     * from THREAD's or CPU's stack.
     *
     */
    current_copy(CURRENT, (current_t *) CPU_LOCAL->stack);

    /*
     * We may not keep the old stack.
     * Reason: If we kept the old stack and got blocked, for instance, in
     * find_best_thread(), the old thread could get rescheduled by another
     * CPU and overwrite the part of its own stack that was also used by
     * the scheduler on this CPU.
     *
     * Moreover, we have to bypass the compiler-generated POP sequence
     * which is fooled by SP being set to the very top of the stack.
     * Therefore the scheduler() function continues in
     * scheduler_separated_stack().
     *
     */
    context_t ctx;
    context_save(&ctx);
    context_set(&ctx, FADDR(scheduler_separated_stack),
        (uintptr_t) CPU_LOCAL->stack, STACK_SIZE);
    context_restore(&ctx);

    /* Not reached */
}

/** Scheduler stack switch wrapper
 *
 * Second part of the scheduler() function
 * using the new stack. Handles the actual context
 * switch to a new thread.
 *
 */
void scheduler_separated_stack(void)
{
    assert((!THREAD) || (irq_spinlock_locked(&THREAD->lock)));
    assert(CPU != NULL);
    assert(interrupts_disabled());

    if (THREAD) {
        /* Must be run after the switch to scheduler stack */
        after_thread_ran();

        int expected;

        switch (THREAD->state) {
        case Running:
            irq_spinlock_unlock(&THREAD->lock, false);
            thread_ready(THREAD);
            break;

        case Exiting:
            irq_spinlock_unlock(&THREAD->lock, false);
            waitq_close(&THREAD->join_wq);

            /*
             * Release the reference CPU has for the thread.
             * If there are no other references (e.g. threads calling join),
             * the thread structure is deallocated.
             */
            thread_put(THREAD);
            break;

        case Sleeping:
            expected = SLEEP_INITIAL;

            /* Only set SLEEP_ASLEEP if the sleep state is still in its initial state */
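            /*
             * A failed exchange below means a concurrent wakeup already
             * moved the state to SLEEP_WOKE, in which case the thread is
             * requeued right away instead of being left asleep.
             */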
            if (atomic_compare_exchange_strong_explicit(&THREAD->sleep_state,
                    &expected, SLEEP_ASLEEP,
                    memory_order_acq_rel, memory_order_acquire)) {

                /* Prefer the thread after it's woken up. */
                THREAD->priority = -1;
                irq_spinlock_unlock(&THREAD->lock, false);
            } else {
                assert(expected == SLEEP_WOKE);
                /* The thread has already been woken up, requeue immediately. */
                irq_spinlock_unlock(&THREAD->lock, false);
                thread_ready(THREAD);
            }

            break;

        default:
            /*
             * Entering state is unexpected.
             */
            panic("tid%" PRIu64 ": unexpected state %s.",
                THREAD->tid, thread_states[THREAD->state]);
            break;
        }

        THREAD = NULL;
    }

    int rq_index;
    THREAD = find_best_thread(&rq_index);

    relink_rq(rq_index);

    switch_task(THREAD->task);

    irq_spinlock_lock(&THREAD->lock, false);
    THREAD->state = Running;

#ifdef SCHEDULER_VERBOSE
    log(LF_OTHER, LVL_DEBUG,
        "cpu%u: tid %" PRIu64 " (priority=%d, ticks=%" PRIu64
        ", nrdy=%zu)", CPU->id, THREAD->tid, THREAD->priority,
        THREAD->ticks, atomic_load(&CPU->nrdy));
#endif

    /*
     * Some architectures provide late kernel PA2KA(identity)
     * mapping in a page fault handler. However, the page fault
     * handler uses the kernel stack of the running thread and
     * therefore cannot be used to map it. The kernel stack, if
     * necessary, is to be mapped in before_thread_runs(). This
     * function must be executed before the switch to the new stack.
     */
    before_thread_runs();

    /*
     * Copy the knowledge of CPU, TASK, THREAD and preemption counter to
     * thread's stack.
     */
    current_copy(CURRENT, (current_t *) THREAD->kstack);

    context_restore(&THREAD->saved_context);

    /* Not reached */
}

#ifdef CONFIG_SMP

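/** Try to steal a thread from another CPU's run queue
 *
 * Scan the i-th run queue of @a old_cpu from the back and move the first
 * thread that may be migrated onto the corresponding run queue of the
 * local CPU.
 *
 * @return Stolen thread, or NULL if no suitable thread was found.
 */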
static thread_t *steal_thread_from(cpu_t *old_cpu, int i)
{
    runq_t *old_rq = &old_cpu->rq[i];
    runq_t *new_rq = &CPU->rq[i];

    ipl_t ipl = interrupts_disable();

    irq_spinlock_lock(&old_rq->lock, false);

    /*
     * If fpu_owner is any thread in the list, its store is seen here thanks to
     * the runqueue lock.
     */
    thread_t *fpu_owner = atomic_load_explicit(&old_cpu->fpu_owner,
        memory_order_relaxed);

    /* Search rq from the back */
    list_foreach_rev(old_rq->rq, rq_link, thread_t, thread) {

        irq_spinlock_lock(&thread->lock, false);

        /*
         * Do not steal CPU-wired threads, threads
         * already stolen, threads for which migration
         * was temporarily disabled or threads whose
         * FPU context is still in the CPU.
         */
        if (thread->stolen || thread->nomigrate ||
            thread == fpu_owner) {
            irq_spinlock_unlock(&thread->lock, false);
            continue;
        }

        thread->stolen = true;
        thread->cpu = CPU;

        irq_spinlock_unlock(&thread->lock, false);

        /*
         * Ready thread on local CPU
         */

#ifdef KCPULB_VERBOSE
        log(LF_OTHER, LVL_DEBUG,
            "kcpulb%u: TID %" PRIu64 " -> cpu%u, "
            "nrdy=%ld, avg=%ld", CPU->id, thread->tid,
            CPU->id, atomic_load(&CPU->nrdy),
            atomic_load(&nrdy) / config.cpu_active);
#endif

        /* Remove thread from ready queue. */
        old_rq->n--;
        list_remove(&thread->rq_link);
        irq_spinlock_unlock(&old_rq->lock, false);

        /* Append thread to local queue. */
        irq_spinlock_lock(&new_rq->lock, false);
        list_append(&thread->rq_link, &new_rq->rq);
        new_rq->n++;
        irq_spinlock_unlock(&new_rq->lock, false);

        atomic_dec(&old_cpu->nrdy);
        atomic_inc(&CPU->nrdy);
        interrupts_restore(ipl);
        return thread;
    }

    irq_spinlock_unlock(&old_rq->lock, false);
    interrupts_restore(ipl);
    return NULL;
}

/** Load balancing thread
 *
 * SMP load-balancing thread. It keeps the CPU it is wired to
 * supplied with ready threads by stealing them from busier CPUs.
 *
 * @param arg Generic thread argument (unused).
 *
 */
void kcpulb(void *arg)
{
    size_t average;
    size_t rdy;

loop:
    /*
     * Work in 1s intervals.
     */
    thread_sleep(1);

not_satisfied:
    /*
     * Calculate the number of threads that will be migrated/stolen from
     * other CPUs. Note that the situation may have changed between two
     * passes, so the most up-to-date counts are taken each time.
     *
     */
    average = atomic_load(&nrdy) / config.cpu_active + 1;
    rdy = atomic_load(&CPU->nrdy);

    if (average <= rdy)
        goto satisfied;

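    /*
     * count is how many threads are needed to bring this CPU's ready count
     * up to the target (the per-CPU average plus one).
     */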
    size_t count = average - rdy;

    /*
     * Search the lowest-priority queues on all CPUs first and the
     * highest-priority queues on all CPUs last.
     */
    size_t acpu;
    int rq;

    for (rq = RQ_COUNT - 1; rq >= 0; rq--) {
        for (acpu = 0; acpu < config.cpu_active; acpu++) {
            cpu_t *cpu = &cpus[acpu];

            /*
             * Not interested in ourselves.
             * Disabling interrupts is not required here, because
             * kcpulb has THREAD_FLAG_WIRED and cannot migrate.
             *
             */
            if (CPU == cpu)
                continue;

            if (atomic_load(&cpu->nrdy) <= average)
                continue;

            if (steal_thread_from(cpu, rq) && --count == 0)
                goto satisfied;
        }
    }

    if (atomic_load(&CPU->nrdy)) {
        /*
         * Be a little bit light-weight and let migrated threads run.
         *
         */
        scheduler();
    } else {
        /*
         * We failed to migrate a single thread.
         * Give up this turn.
         *
         */
        goto loop;
    }

    goto not_satisfied;

satisfied:
    goto loop;
}
#endif /* CONFIG_SMP */

/** Print information about threads & scheduler queues
 *
 */
void sched_print_list(void)
{
    size_t cpu;
    for (cpu = 0; cpu < config.cpu_count; cpu++) {
        if (!cpus[cpu].active)
            continue;

        printf("cpu%u: address=%p, nrdy=%zu\n",
            cpus[cpu].id, &cpus[cpu], atomic_load(&cpus[cpu].nrdy));

        unsigned int i;
        for (i = 0; i < RQ_COUNT; i++) {
            irq_spinlock_lock(&(cpus[cpu].rq[i].lock), false);
            if (cpus[cpu].rq[i].n == 0) {
                irq_spinlock_unlock(&(cpus[cpu].rq[i].lock), false);
                continue;
            }

            printf("\trq[%u]: ", i);
            list_foreach(cpus[cpu].rq[i].rq, rq_link, thread_t,
                thread) {
                printf("%" PRIu64 "(%s) ", thread->tid,
                    thread_states[thread->state]);
            }
            printf("\n");

            irq_spinlock_unlock(&(cpus[cpu].rq[i].lock), false);
        }
    }
}

/** @}
 */