source: mainline/kernel/generic/src/proc/scheduler.c@ 169815e

ticket/834-toolchain-update topic/msim-upgrade topic/simplify-dev-export
Last change on this file since 169815e was 169815e, checked in by Jiří Zárevúcky <zarevucky.jiri@…>, 2 years ago

Split cpu_t::lock into fpu_lock and tlb_lock

For all other purposes, locking is unnecessary, since the fields
in question are only accessed locally from the CPU they belong to.

  • Property mode set to 100644
File size: 16.2 KB
Line 
1/*
2 * Copyright (c) 2010 Jakub Jermar
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * - Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * - Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * - The name of the author may not be used to endorse or promote products
15 * derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/** @addtogroup kernel_generic_proc
30 * @{
31 */
32
33/**
34 * @file
35 * @brief Scheduler and load balancing.
36 *
37 * This file contains the scheduler and kcpulb kernel thread which
38 * performs load-balancing of per-CPU run queues.
39 */
40
41#include <assert.h>
42#include <atomic.h>
43#include <proc/scheduler.h>
44#include <proc/thread.h>
45#include <proc/task.h>
46#include <mm/frame.h>
47#include <mm/page.h>
48#include <mm/as.h>
49#include <time/timeout.h>
50#include <time/delay.h>
51#include <arch/asm.h>
52#include <arch/faddr.h>
53#include <arch/cycle.h>
54#include <atomic.h>
55#include <synch/spinlock.h>
56#include <config.h>
57#include <context.h>
58#include <fpu_context.h>
59#include <halt.h>
60#include <arch.h>
61#include <adt/list.h>
62#include <panic.h>
63#include <cpu.h>
64#include <stdio.h>
65#include <log.h>
66#include <stacktrace.h>
67
/*
 * Forward declaration: second half of the scheduler. scheduler_locked()
 * jumps to it on the per-CPU stack via context_set()/context_restore().
 */
static void scheduler_separated_stack(void);

atomic_size_t nrdy;  /**< Number of ready threads in the system. */
71
/** Prepare the CPU for the thread that is about to run.
 *
 * Invoked by the scheduler just before control is handed to the newly
 * selected thread. Runs the architecture-specific hook, sets up the FPU
 * according to the configured FPU policy and, if a backtrace was
 * requested for the thread, prints its userspace stack trace once.
 *
 * THREAD->lock is locked on entry
 *
 */
static void before_thread_runs(void)
{
	before_thread_runs_arch();

#ifdef CONFIG_FPU_LAZY
	/* Lazy switching: the FPU stays enabled only for its current owner. */
	if (CPU->fpu_owner != THREAD) {
		fpu_disable();
	} else {
		fpu_enable();
	}
#elif defined CONFIG_FPU
	/* Eager switching: always load (or create) the thread's FPU context. */
	fpu_enable();
	if (!THREAD->fpu_context_exists) {
		fpu_init();
		THREAD->fpu_context_exists = true;
	} else {
		fpu_context_restore(&THREAD->fpu_context);
	}
#endif

#ifdef CONFIG_UDEBUG
	if (THREAD->btrace) {
		istate_t *uspace_state = THREAD->udebug.uspace_state;

		/* Nothing to print when no userspace state was recorded. */
		if (uspace_state != NULL) {
			printf("Thread %" PRIu64 " stack trace:\n", THREAD->tid);
			stack_trace_istate(uspace_state);
		}

		/* The request is one-shot; clear it even if nothing was printed. */
		THREAD->btrace = false;
	}
#endif
}
112
/** Clean up after the thread that has just stopped running.
 *
 * Invoked by the scheduler for the outgoing THREAD; at present it only
 * delegates to the architecture-specific hook.
 *
 * THREAD->lock is locked on entry
 *
 */
static void after_thread_ran(void)
{
	after_thread_ran_arch();
}
126
#ifdef CONFIG_FPU_LAZY
/** Give this CPU's FPU to the current thread (lazy FPU switching).
 *
 * Saves the FPU context of the previous owner, if there is one, then
 * restores or initializes the FPU context of THREAD and records
 * THREAD as the new FPU owner. The whole hand-over runs under
 * CPU->fpu_lock.
 *
 */
void scheduler_fpu_lazy_request(void)
{
	fpu_enable();
	irq_spinlock_lock(&CPU->fpu_lock, false);

	thread_t *prev_owner = CPU->fpu_owner;

	if (prev_owner != NULL) {
		/* Spill the previous owner's FPU state into its thread structure. */
		irq_spinlock_lock(&prev_owner->lock, false);
		fpu_context_save(&prev_owner->fpu_context);

		/* Don't prevent migration */
		prev_owner->fpu_context_engaged = false;
		irq_spinlock_unlock(&prev_owner->lock, false);
		CPU->fpu_owner = NULL;
	}

	irq_spinlock_lock(&THREAD->lock, false);
	if (!THREAD->fpu_context_exists) {
		/* First FPU use by this thread: start from a fresh FPU state. */
		fpu_init();
		THREAD->fpu_context_exists = true;
	} else {
		fpu_context_restore(&THREAD->fpu_context);
	}

	CPU->fpu_owner = THREAD;
	THREAD->fpu_context_engaged = true;
	irq_spinlock_unlock(&THREAD->lock, false);

	irq_spinlock_unlock(&CPU->fpu_lock, false);
}
#endif /* CONFIG_FPU_LAZY */
159
/** Initialize the kernel scheduler.
 *
 * Currently a no-op: the body is empty.
 *
 */
void scheduler_init(void)
{
}
168
169/** Get thread to be scheduled
170 *
171 * Get the optimal thread to be scheduled
172 * according to thread accounting and scheduler
173 * policy.
174 *
175 * @return Thread to be scheduled.
176 *
177 */
178static thread_t *find_best_thread(void)
179{
180 assert(CPU != NULL);
181
182loop:
183 if (atomic_load(&CPU->nrdy) == 0) {
184 /*
185 * For there was nothing to run, the CPU goes to sleep
186 * until a hardware interrupt or an IPI comes.
187 * This improves energy saving and hyperthreading.
188 */
189 CPU->idle = true;
190
191 /*
192 * Go to sleep with interrupts enabled.
193 * Ideally, this should be atomic, but this is not guaranteed on
194 * all platforms yet, so it is possible we will go sleep when
195 * a thread has just become available.
196 */
197 cpu_interruptible_sleep();
198
199 /* Interrupts are disabled again. */
200 goto loop;
201 }
202
203 assert(!CPU->idle);
204
205 unsigned int i;
206 for (i = 0; i < RQ_COUNT; i++) {
207 irq_spinlock_lock(&(CPU->rq[i].lock), false);
208 if (CPU->rq[i].n == 0) {
209 /*
210 * If this queue is empty, try a lower-priority queue.
211 */
212 irq_spinlock_unlock(&(CPU->rq[i].lock), false);
213 continue;
214 }
215
216 atomic_dec(&CPU->nrdy);
217 atomic_dec(&nrdy);
218 CPU->rq[i].n--;
219
220 /*
221 * Take the first thread from the queue.
222 */
223 thread_t *thread = list_get_instance(
224 list_first(&CPU->rq[i].rq), thread_t, rq_link);
225 list_remove(&thread->rq_link);
226
227 irq_spinlock_pass(&(CPU->rq[i].lock), &thread->lock);
228
229 thread->cpu = CPU;
230 thread->priority = i; /* Correct rq index */
231
232 /* Time allocation in microseconds. */
233 uint64_t time_to_run = (i + 1) * 10000;
234
235 /* This is safe because interrupts are disabled. */
236 CPU->preempt_deadline = CPU->current_clock_tick + us2ticks(time_to_run);
237
238 /*
239 * Clear the stolen flag so that it can be migrated
240 * when load balancing needs emerge.
241 */
242 thread->stolen = false;
243 irq_spinlock_unlock(&thread->lock, false);
244
245 return thread;
246 }
247
248 goto loop;
249}
250
251static void switch_task(task_t *task)
252{
253 /* If the task stays the same, a lot of work is avoided. */
254 if (TASK == task)
255 return;
256
257 as_t *old_as = AS;
258 as_t *new_as = task->as;
259
260 /* It is possible for two tasks to share one address space. */
261 if (old_as != new_as)
262 as_switch(old_as, new_as);
263
264 if (TASK)
265 task_release(TASK);
266
267 TASK = task;
268
269 task_hold(TASK);
270
271 before_task_runs_arch();
272}
273
274/** Prevent rq starvation
275 *
276 * Prevent low priority threads from starving in rq's.
277 *
278 * When the function decides to relink rq's, it reconnects
279 * respective pointers so that in result threads with 'pri'
280 * greater or equal start are moved to a higher-priority queue.
281 *
282 * @param start Threshold priority.
283 *
284 */
285static void relink_rq(int start)
286{
287 if (CPU->current_clock_tick < CPU->relink_deadline)
288 return;
289
290 CPU->relink_deadline = CPU->current_clock_tick + NEEDS_RELINK_MAX;
291
292 /* Temporary cache for lists we are moving. */
293 list_t list;
294 list_initialize(&list);
295
296 size_t n = 0;
297
298 /* Move every list (except the one with highest priority) one level up. */
299 for (int i = RQ_COUNT - 1; i > start; i--) {
300 irq_spinlock_lock(&CPU->rq[i].lock, false);
301
302 /* Swap lists. */
303 list_swap(&CPU->rq[i].rq, &list);
304
305 /* Swap number of items. */
306 size_t tmpn = CPU->rq[i].n;
307 CPU->rq[i].n = n;
308 n = tmpn;
309
310 irq_spinlock_unlock(&CPU->rq[i].lock, false);
311 }
312
313 /* Append the contents of rq[start + 1] to rq[start]. */
314 if (n != 0) {
315 irq_spinlock_lock(&CPU->rq[start].lock, false);
316 list_concat(&CPU->rq[start].rq, &list);
317 CPU->rq[start].n += n;
318 irq_spinlock_unlock(&CPU->rq[start].lock, false);
319 }
320}
321
322void scheduler(void)
323{
324 ipl_t ipl = interrupts_disable();
325
326 if (atomic_load(&haltstate))
327 halt();
328
329 if (THREAD) {
330 irq_spinlock_lock(&THREAD->lock, false);
331 }
332
333 scheduler_locked(ipl);
334}
335
/** The scheduler
 *
 * The thread scheduling procedure. Saves the state of the outgoing
 * thread (if there is one) and passes control directly to
 * scheduler_separated_stack(), which runs on the per-CPU stack.
 *
 * When THREAD is not NULL, THREAD->lock is unlocked on the resume
 * path below, so the function expects to be entered with that lock
 * held (scheduler() takes it before calling here).
 *
 * @param ipl Interrupt priority level recorded in THREAD->saved_ipl;
 *            it is restored when the thread later resumes here.
 *
 */
void scheduler_locked(ipl_t ipl)
{
	assert(CPU != NULL);

	if (THREAD) {
		/* Update thread kernel accounting */
		THREAD->kcycles += get_cycle() - THREAD->last_cycle;

#if (defined CONFIG_FPU) && (!defined CONFIG_FPU_LAZY)
		/* Without lazy FPU switching, the FPU context is saved on every pass. */
		fpu_context_save(&THREAD->fpu_context);
#endif
		if (!context_save(&THREAD->saved_context)) {
			/*
			 * This is the place where threads leave scheduler();
			 * i.e. the branch taken once the saved context has been
			 * restored by scheduler_separated_stack().
			 */

			/* Save current CPU cycle */
			THREAD->last_cycle = get_cycle();

			irq_spinlock_unlock(&THREAD->lock, false);
			interrupts_restore(THREAD->saved_ipl);

			return;
		}

		/*
		 * Interrupt priority level of preempted thread is recorded
		 * here to facilitate scheduler() invocations from
		 * interrupts_disable()'d code (e.g. waitq_sleep_timeout()).
		 *
		 */
		THREAD->saved_ipl = ipl;
	}

	/*
	 * Through the 'CURRENT' structure, we keep track of THREAD, TASK, CPU, AS
	 * and preemption counter. At this point CURRENT could be coming either
	 * from THREAD's or CPU's stack.
	 *
	 */
	current_copy(CURRENT, (current_t *) CPU->stack);

	/*
	 * We may not keep the old stack.
	 * Reason: If we kept the old stack and got blocked, for instance, in
	 * find_best_thread(), the old thread could get rescheduled by another
	 * CPU and overwrite the part of its own stack that was also used by
	 * the scheduler on this CPU.
	 *
	 * Moreover, we have to bypass the compiler-generated POP sequence
	 * which is fooled by SP being set to the very top of the stack.
	 * Therefore the scheduler() function continues in
	 * scheduler_separated_stack().
	 *
	 */
	context_t ctx;
	context_save(&ctx);
	/* Redirect the saved context to scheduler_separated_stack() on CPU->stack. */
	context_set(&ctx, FADDR(scheduler_separated_stack),
	    (uintptr_t) CPU->stack, STACK_SIZE);
	context_restore(&ctx);

	/* Not reached */
}
406
/** Scheduler stack switch wrapper
 *
 * Second part of the scheduler() function
 * using new stack. Handling the actual context
 * switch to a new thread.
 *
 * First disposes of the outgoing THREAD according to its state, then
 * selects the next thread with find_best_thread(), relinks the run
 * queues if due, switches the task and finally restores the saved
 * context of the new thread. Does not return.
 *
 * On entry interrupts are disabled and, if THREAD is not NULL,
 * THREAD->lock is held (both are asserted below).
 *
 */
void scheduler_separated_stack(void)
{
	assert((!THREAD) || (irq_spinlock_locked(&THREAD->lock)));
	assert(CPU != NULL);
	assert(interrupts_disabled());

	if (THREAD) {
		/* Must be run after the switch to scheduler stack */
		after_thread_ran();

		switch (THREAD->state) {
		case Running:
			/* Still runnable: put the thread back on a run queue. */
			irq_spinlock_unlock(&THREAD->lock, false);
			thread_ready(THREAD);
			break;

		case Exiting:
			irq_spinlock_unlock(&THREAD->lock, false);
			waitq_close(&THREAD->join_wq);

			/*
			 * Release the reference CPU has for the thread.
			 * If there are no other references (e.g. threads calling join),
			 * the thread structure is deallocated.
			 */
			thread_put(THREAD);
			break;

		case Sleeping:
			/*
			 * Prefer the thread after it's woken up.
			 */
			THREAD->priority = -1;
			irq_spinlock_unlock(&THREAD->lock, false);
			break;

		default:
			/*
			 * Entering state is unexpected.
			 */
			panic("tid%" PRIu64 ": unexpected state %s.",
			    THREAD->tid, thread_states[THREAD->state]);
			break;
		}

		THREAD = NULL;
	}

	/* May put the CPU to sleep until a thread becomes ready. */
	THREAD = find_best_thread();

	/* Read the priority (run queue index) under the thread's lock. */
	irq_spinlock_lock(&THREAD->lock, false);
	int priority = THREAD->priority;
	irq_spinlock_unlock(&THREAD->lock, false);

	relink_rq(priority);

	switch_task(THREAD->task);

	/*
	 * The lock taken here is not released in this function; it is
	 * released on the resume path in scheduler_locked().
	 */
	irq_spinlock_lock(&THREAD->lock, false);
	THREAD->state = Running;

#ifdef SCHEDULER_VERBOSE
	log(LF_OTHER, LVL_DEBUG,
	    "cpu%u: tid %" PRIu64 " (priority=%d, ticks=%" PRIu64
	    ", nrdy=%zu)", CPU->id, THREAD->tid, THREAD->priority,
	    THREAD->ticks, atomic_load(&CPU->nrdy));
#endif

	/*
	 * Some architectures provide late kernel PA2KA(identity)
	 * mapping in a page fault handler. However, the page fault
	 * handler uses the kernel stack of the running thread and
	 * therefore cannot be used to map it. The kernel stack, if
	 * necessary, is to be mapped in before_thread_runs(). This
	 * function must be executed before the switch to the new stack.
	 */
	before_thread_runs();

	/*
	 * Copy the knowledge of CPU, TASK, THREAD and preemption counter to
	 * thread's stack.
	 */
	current_copy(CURRENT, (current_t *) THREAD->kstack);

	context_restore(&THREAD->saved_context);

	/* Not reached */
}
502
#ifdef CONFIG_SMP
/** Load balancing thread
 *
 * SMP load balancing thread, supervising thread supplies
 * for the CPU it's wired to.
 *
 * Once per second it compares the number of ready threads on the local
 * CPU with the system-wide average. If the local CPU is below average,
 * it steals ready threads from CPUs that are above average, searching
 * the lowest-priority run queues first. The function loops forever.
 *
 * @param arg Generic thread argument (unused).
 *
 */
void kcpulb(void *arg)
{
	size_t average;
	size_t rdy;

loop:
	/*
	 * Work in 1s intervals.
	 */
	thread_sleep(1);

not_satisfied:
	/*
	 * Calculate the number of threads that will be migrated/stolen from
	 * other CPU's. Note that situation can have changed between two
	 * passes. Each time get the most up to date counts.
	 *
	 */
	average = atomic_load(&nrdy) / config.cpu_active + 1;
	rdy = atomic_load(&CPU->nrdy);

	if (average <= rdy)
		goto satisfied;

	/* Number of threads we still want to pull to this CPU. */
	size_t count = average - rdy;

	/*
	 * Searching least priority queues on all CPU's first and most priority
	 * queues on all CPU's last.
	 */
	size_t acpu;
	size_t acpu_bias = 0;
	int rq;

	for (rq = RQ_COUNT - 1; rq >= 0; rq--) {
		for (acpu = 0; acpu < config.cpu_active; acpu++) {
			cpu_t *cpu = &cpus[(acpu + acpu_bias) % config.cpu_active];

			/*
			 * Not interested in ourselves.
			 * Doesn't require interrupt disabling for kcpulb has
			 * THREAD_FLAG_WIRED.
			 *
			 */
			if (CPU == cpu)
				continue;

			/* Only steal from CPUs that are above the average. */
			if (atomic_load(&cpu->nrdy) <= average)
				continue;

			irq_spinlock_lock(&(cpu->rq[rq].lock), true);
			if (cpu->rq[rq].n == 0) {
				irq_spinlock_unlock(&(cpu->rq[rq].lock), true);
				continue;
			}

			thread_t *thread = NULL;

			/* Search rq from the back */
			link_t *link = list_last(&cpu->rq[rq].rq);

			while (link != NULL) {
				thread = list_get_instance(link, thread_t, rq_link);

				/*
				 * Do not steal CPU-wired threads, threads
				 * already stolen, threads for which migration
				 * was temporarily disabled or threads whose
				 * FPU context is still in the CPU.
				 */
				irq_spinlock_lock(&thread->lock, false);

				if ((!thread->stolen) &&
				    (!thread->nomigrate) &&
				    (!thread->fpu_context_engaged)) {
					/*
					 * Remove thread from ready queue.
					 */
					irq_spinlock_unlock(&thread->lock,
					    false);

					atomic_dec(&cpu->nrdy);
					atomic_dec(&nrdy);

					cpu->rq[rq].n--;
					list_remove(&thread->rq_link);

					break;
				}

				irq_spinlock_unlock(&thread->lock, false);

				link = list_prev(link, &cpu->rq[rq].rq);
				/* Candidate rejected; NULL means nothing was found. */
				thread = NULL;
			}

			if (thread) {
				/*
				 * Ready thread on local CPU
				 */

				irq_spinlock_pass(&(cpu->rq[rq].lock),
				    &thread->lock);

#ifdef KCPULB_VERBOSE
				/*
				 * Both counters are size_t values, hence %zu
				 * (as in the other log messages of this file).
				 */
				log(LF_OTHER, LVL_DEBUG,
				    "kcpulb%u: TID %" PRIu64 " -> cpu%u, "
				    "nrdy=%zu, avg=%zu", CPU->id, thread->tid,
				    CPU->id, atomic_load(&CPU->nrdy),
				    atomic_load(&nrdy) / config.cpu_active);
#endif

				thread->stolen = true;
				thread->state = Entering;

				irq_spinlock_unlock(&thread->lock, true);
				thread_ready(thread);

				if (--count == 0)
					goto satisfied;

				/*
				 * We are not satisfied yet, focus on another
				 * CPU next time.
				 *
				 */
				acpu_bias++;

				continue;
			} else {
				irq_spinlock_unlock(&(cpu->rq[rq].lock), true);
			}

		}
	}

	if (atomic_load(&CPU->nrdy)) {
		/*
		 * Be a little bit light-weight and let migrated threads run.
		 *
		 */
		scheduler();
	} else {
		/*
		 * We failed to migrate a single thread.
		 * Give up this turn.
		 *
		 */
		goto loop;
	}

	goto not_satisfied;

satisfied:
	goto loop;
}
#endif /* CONFIG_SMP */
669
670/** Print information about threads & scheduler queues
671 *
672 */
673void sched_print_list(void)
674{
675 size_t cpu;
676 for (cpu = 0; cpu < config.cpu_count; cpu++) {
677 if (!cpus[cpu].active)
678 continue;
679
680 /* Technically a data race, but we don't really care in this case. */
681 int needs_relink = cpus[cpu].relink_deadline - cpus[cpu].current_clock_tick;
682
683 printf("cpu%u: address=%p, nrdy=%zu, needs_relink=%d\n",
684 cpus[cpu].id, &cpus[cpu], atomic_load(&cpus[cpu].nrdy),
685 needs_relink);
686
687 unsigned int i;
688 for (i = 0; i < RQ_COUNT; i++) {
689 irq_spinlock_lock(&(cpus[cpu].rq[i].lock), false);
690 if (cpus[cpu].rq[i].n == 0) {
691 irq_spinlock_unlock(&(cpus[cpu].rq[i].lock), false);
692 continue;
693 }
694
695 printf("\trq[%u]: ", i);
696 list_foreach(cpus[cpu].rq[i].rq, rq_link, thread_t,
697 thread) {
698 printf("%" PRIu64 "(%s) ", thread->tid,
699 thread_states[thread->state]);
700 }
701 printf("\n");
702
703 irq_spinlock_unlock(&(cpus[cpu].rq[i].lock), false);
704 }
705 }
706}
707
708/** @}
709 */
Note: See TracBrowser for help on using the repository browser.