Linux内核设计与实现：进程调度

编程入门行业动态更新时间:2024-10-11 21:30:07

Linux<a href=https://www.elefans.com/category/jswz/34/1769575.html style= 内核设计与实现：进程调度"/>

Linux内核设计与实现：进程调度

进程调度是确保进程能够有效运行的内核子系统

多任务

多任务操作系统是并发地交互执行多个进程的操作系统；可分为抢占式任务和非抢占式任务。

抢占式：

每个进程的可运行的时间是预设好的，叫作时间片；每当进程的时间片走完，进程会被强制挂起，这个动作叫作抢占，这个是由操作系统的调度程序来决定的，这样可以避免其他进程独占系统资源；

非抢占式：

除非进程主动停止，否则会一直运行下去，而进程主动挂起的动作叫让步，优点是给进程足够的时间处理任务，缺点是时间无法预计，并且有可能因为长时间占用系统资源造成系统崩溃；

策略：（进程分为 I/O消耗型和处理器消耗型）

I/O消耗型：此类进程大部分时间用来提交 I/O请求或者是等待 I/O请求，经常处于可运行状态，但是由于经常处理大量的 I/O请求，会导致程序容易阻塞；（不需要太长的时间片）
处理器消耗型： 除非被抢占，否则大部分时间都被用在执行代码上，没有太多的I/O请求；（时间片越长越好）

也有既是I/O消耗型，也是处理器消耗型的进程。

进程优先级：

一般是优先级高的进程比低的先运行，若是相同的优先级，则按照轮转的方式进行调度。
若是优先级高且时间片长，则先运行；

Linux有两种优先级：

nice值：值越大，优先级越低；
实时优先级：值越大，优先级越高；

时间片：

时间片表明程序在被抢占前持续运行的时间；

一个进程是否要立刻运行，取决于进程优先级和时间片；

Linux调度算法

调度器类：以模块方式提供，允许不同类型可以有针对性的选择调度算法，每个调度器都有优先级。

struct sched_class {const struct sched_class *next;#ifdef CONFIG_UCLAMP_TASKint uclamp_enabled;
#endifvoid (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);void (*yield_task)   (struct rq *rq);bool (*yield_to_task)(struct rq *rq, struct task_struct *p, bool preempt);void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);/** Both @prev and @rf are optional and may be NULL, in which case the* caller must already have invoked put_prev_task(rq, prev, rf).** Otherwise it is the responsibility of the pick_next_task() to call* put_prev_task() on the @prev task or something equivalent, IFF it* returns a next task.** In that case (@rf != NULL) it may return RETRY_TASK when it finds a* higher prio class has runnable tasks.*/struct task_struct * (*pick_next_task)(struct rq *rq,struct task_struct *prev,struct rq_flags *rf);void (*put_prev_task)(struct rq *rq, struct task_struct *p);void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first);#ifdef CONFIG_SMPint (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);void (*migrate_task_rq)(struct task_struct *p, int new_cpu);void (*task_woken)(struct rq *this_rq, struct task_struct *task);void (*set_cpus_allowed)(struct task_struct *p,const struct cpumask *newmask);void (*rq_online)(struct rq *rq);void (*rq_offline)(struct rq *rq);
#endifvoid (*task_tick)(struct rq *rq, struct task_struct *p, int queued);void (*task_fork)(struct task_struct *p);void (*task_dead)(struct task_struct *p);/** The switched_from() call is allowed to drop rq->lock, therefore we* cannot assume the switched_from/switched_to pair is serliazed by* rq->lock. They are however serialized by p->pi_lock.*/void (*switched_from)(struct rq *this_rq, struct task_struct *task);void (*switched_to)  (struct rq *this_rq, struct task_struct *task);void (*prio_changed) (struct rq *this_rq, struct task_struct *task,int oldprio);unsigned int (*get_rr_interval)(struct rq *rq,struct task_struct *task);void (*update_curr)(struct rq *rq);#define TASK_SET_GROUP		0
#define TASK_MOVE_GROUP		1#ifdef CONFIG_FAIR_GROUP_SCHEDvoid (*task_change_group)(struct task_struct *p, int type);
#endif
};

CFS：公平调度算法

在多任务情况下趋近于完美的多任务算法，但是并不是完美的公平，降低了调度带来的不公平性。

nice值和时间片的分配通过CFS算法来平衡，例如nice值分别为10和15的两个进程，时间片的
分配为15ms 和 5ms。而觉得这个分配的不是nice的绝对值，而是其相对值和比例。

在进程优先级相同的情况下，如果进程过多甚至趋近于无限，则每个进程获得的处理器使用比和时间比都将趋近于0，这样就毫无意义（考虑到还有切换进程带来的开销代价），所以CFS算法设置了时间片最小粒度，每个进程最少也会分配到1ms的时间片。

调度实现：

调度实现由四个部分组成：

时间记账
进程选择
调度器入口
睡眠和唤醒

其相关代码在<kernel/fair.c>中

1.时间记账

所有调度器都会对进程的时间片进行记账，每当系统时钟的节拍发生时，时间片都会减少一个周期，当时间片耗尽时，就会被其他进程抢占。

CFS使用调度器的实体结构

struct sched_entity {/* For load-balancing: */struct load_weight		load;unsigned long			runnable_weight;struct rb_node			run_node;struct list_head		group_node;unsigned int			on_rq;u64				exec_start;u64				sum_exec_runtime;u64				vruntime;		//存放进程的虚拟允许时间：经过了所有可运行进程总数的标准化，和节拍不再相关，因为是ns为单位u64				prev_sum_exec_runtime;u64				nr_migrations;struct sched_statistics		statistics;#ifdef CONFIG_FAIR_GROUP_SCHEDint				depth;struct sched_entity		*parent;/* rq on which this entity is (to be) queued: */struct cfs_rq			*cfs_rq;/* rq "owned" by this entity/group: */struct cfs_rq			*my_q;
#endif#ifdef CONFIG_SMP/** Per entity load average tracking.** Put into separate cache line so it does not* collide with read-mostly values above.*/struct sched_avg		avg;
#endif
};

该结构体嵌在task_struct 内；
记账功能：

static void update_curr(struct cfs_rq *cfs_rq)
{struct sched_entity *curr = cfs_rq->curr;u64 now = rq_clock_task(rq_of(cfs_rq));u64 delta_exec;		//当前进程允许的时间，然后再传给__update_curr()，然后再根据当前进程运行总数对时间进行加权if (unlikely(!curr))return;/*获得从最后一次修改负载后当前任务所占用的运行时间*/delta_exec = now - curr->exec_start;if (unlikely((s64)delta_exec <= 0))return;curr->exec_start = now;schedstat_set(curr->statistics.exec_max,max(delta_exec, curr->statistics.exec_max));curr->sum_exec_runtime += delta_exec;schedstat_add(cfs_rq->exec_clock, delta_exec);curr->vruntime += calc_delta_fair(delta_exec, curr);update_min_vruntime(cfs_rq);if (entity_is_task(curr)) {struct task_struct *curtask = task_of(curr);trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);cgroup_account_cputime(curtask, delta_exec);account_group_exec_runtime(curtask, delta_exec);}account_cfs_rq_runtime(cfs_rq, delta_exec);
}

2.进程选择（太长，暂时不写）

当CFS选择下一个运行进程时，他会挑选一个具有最小vruntime的进程。（使用红黑树来组织进程队列，沿着根节点往左子树找，可以找到最小vruntime的进程）

static struct sched_entity *__pick_next_entity(struct sched_entity *se)
{struct rb_node *next = rb_next(&se->run_node);if (!next)return NULL;return rb_entry(next, struct sched_entity, run_node);
}

添加进程至CFS:

static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);bool curr = cfs_rq->curr == se;/** If we're the current task, we must renormalise before calling* update_curr().*/if (renorm && curr)se->vruntime += cfs_rq->min_vruntime;// 更新当前任务的运行时统计数据update_curr(cfs_rq);/** Otherwise, renormalise after, such that we're placed at the current* moment in time, instead of some random moment in the past. Being* placed in the past could significantly boost this task to the* fairness detriment of existing tasks.*/if (renorm && !curr)se->vruntime += cfs_rq->min_vruntime;/** When enqueuing a sched_entity, we must:*   - Update loads to have both entity and cfs_rq synced with now.*   - Add its load to cfs_rq->runnable_avg*   - For group_entity, update its weight to reflect the new share of*     its group cfs_rq*   - Add its new weight to cfs_rq->load.weight*/update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);update_cfs_group(se);enqueue_runnable_load_avg(cfs_rq, se);account_entity_enqueue(cfs_rq, se);if (flags & ENQUEUE_WAKEUP)place_entity(cfs_rq, se, 0);check_schedstat_required();update_stats_enqueue(cfs_rq, se, flags);check_spread(cfs_rq, se);if (!curr)__enqueue_entity(cfs_rq, se);se->on_rq = 1;/** When bandwidth control is enabled, cfs might have been removed* because of a parent been throttled but cfs->nr_running > 1. Try to* add it unconditionnally.*/if (cfs_rq->nr_running == 1 || cfs_bandwidth_used())list_add_leaf_cfs_rq(cfs_rq);if (cfs_rq->nr_running == 1)check_enqueue_throttle(cfs_rq);
}/** Enqueue an entity into the rb-tree:*/
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node;struct rb_node *parent = NULL;struct sched_entity *entry;bool leftmost = true;/** Find the right place in the rbtree:*/while (*link) {parent = *link;entry = rb_entry(parent, struct sched_entity, run_node);/** We dont care about collisions. Nodes with* the same key stay together.*/if (entity_before(se, entry)) {link = &parent->rb_left;} else {link = &parent->rb_right;leftmost = false;}}rb_link_node(&se->run_node, parent, link);rb_insert_color_cached(&se->run_node,&cfs_rq->tasks_timeline, leftmost);
}

删除进程：

static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{/** Update run-time statistics of the 'current'.*/update_curr(cfs_rq);/** When dequeuing a sched_entity, we must:*   - Update loads to have both entity and cfs_rq synced with now.*   - Subtract its load from the cfs_rq->runnable_avg.*   - Subtract its previous weight from cfs_rq->load.weight.*   - For group entity, update its weight to reflect the new share*     of its group cfs_rq.*/update_load_avg(cfs_rq, se, UPDATE_TG);dequeue_runnable_load_avg(cfs_rq, se);update_stats_dequeue(cfs_rq, se, flags);clear_buddies(cfs_rq, se);if (se != cfs_rq->curr)__dequeue_entity(cfs_rq, se);se->on_rq = 0;account_entity_dequeue(cfs_rq, se);/** Normalize after update_curr(); which will also have moved* min_vruntime if @se is the one holding it back. But before doing* update_min_vruntime() again, which will discount @se's position and* can move min_vruntime forward still more.*/if (!(flags & DEQUEUE_SLEEP))se->vruntime -= cfs_rq->min_vruntime;/* return excess runtime on last dequeue */return_cfs_rq_runtime(cfs_rq);update_cfs_group(se);/** Now advance min_vruntime if @se was the entity holding it back,* except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be* put back on, and if we advance min_vruntime, we'll be placed back* further than we started -- ie. we'll be penalized.*/if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)update_min_vruntime(cfs_rq);
}static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
}

3.调度器入口

进程调度入口函数是schedule()，（找到最高优先级的调度列）

/** Pick up the highest-prio task:*/
static inline struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{const struct sched_class *class;struct task_struct *p;/** Optimization: we know that if all tasks are in the fair class we can* call that function directly, but only if the @prev task wasn't of a* higher scheduling class, because otherwise those loose the* opportunity to pull in more work from other CPUs.*/if (likely((prev->sched_class == &idle_sched_class ||prev->sched_class == &fair_sched_class) &&rq->nr_running == rq->cfs.h_nr_running)) {p = fair_sched_class.pick_next_task(rq, prev, rf);if (unlikely(p == RETRY_TASK))goto restart;/* Assumes fair_sched_class->next == idle_sched_class */if (unlikely(!p))p = idle_sched_class.pick_next_task(rq, prev, rf);return p;}

4.睡眠和唤醒

休眠的进程处于一个不可执行的状态
进程把自己标记成休眠状态，从可执行红黑树中移除，放入等待队列，然后调用schedule()选择和执行其他进程，唤醒的过程则相反。
休眠有两种状态：TASK_INTERRUPTABLE 和 TASK_UNINTERRUPTABLE,前者会忽略信号量，后者遇到信号量会被提前唤醒。

抢占和上下文切换

上下文切换：从一个可执行进程切换到另外一个可执行进程。函数为context_switch()

static __always_inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,struct task_struct *next, struct rq_flags *rf)
{prepare_task_switch(rq, prev, next);/** For paravirt, this is coupled with an exit in switch_to to* combine the page table reload and the switch backend into* one hypercall.*/arch_start_context_switch(prev);/** kernel -> kernel   lazy + transfer active*   user -> kernel   lazy + mmgrab() active** kernel ->   user   switch + mmdrop() active*   user ->   user   switch*/if (!next->mm) {                                // to kernelenter_lazy_tlb(prev->active_mm, next);next->active_mm = prev->active_mm;if (prev->mm)                           // from usermmgrab(prev->active_mm);elseprev->active_mm = NULL;} else {                                        // to usermembarrier_switch_mm(rq, prev->active_mm, next->mm);/** sys_membarrier() requires an smp_mb() between setting* rq->curr / membarrier_switch_mm() and returning to userspace.** The below provides this either through switch_mm(), or in* case 'prev->active_mm == next->mm' through* finish_task_switch()'s mmdrop().*/switch_mm_irqs_off(prev->active_mm, next->mm, next);	//负责将虚拟内存从上一个进程映射到新的进程中if (!prev->mm) {                        // from kernel/* will mmdrop() in finish_task_switch(). */rq->prev_mm = prev->active_mm;prev->active_mm = NULL;}}rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);prepare_lock_switch(rq, next, rf);/* Here we just switch the register state and the stack. */switch_to(prev, next, prev);	//将上一个进程的处理器状态切换到新进程的处理器状态barrier();return finish_task_switch(prev);
}