admin管理员组

文章数量:1611902

MC层级的group_capacity的更新

这里需要区分调度域是MC层级还是DIE层级,二者的区别在于sd->child是否为NULL:MC层级的sd->child为NULL,DIE层级的sd->child不为NULL。

具体可以看update_cpu_capacity这个函数

/*
 * update_group_capacity() - excerpt showing only the path taken when the
 * domain has no child (the MC level in this walkthrough).
 *
 * Records when the group capacity should next be refreshed and, when
 * sd->child is NULL, delegates the actual computation to
 * update_cpu_capacity(). The rest of the function is elided below.
 */
void update_group_capacity(struct sched_domain *sd, int cpu)
{
	struct sched_domain *child = sd->child;
	struct sched_group *group, *sdg = sd->groups;
	unsigned long capacity, min_capacity, max_capacity;
	unsigned long interval;

	interval = msecs_to_jiffies(sd->balance_interval);
	// Clamp the refresh period to the range [1, max_load_balance_interval],
	// i.e. it is never shorter than 1 jiffy and never longer than
	// max_load_balance_interval.
	interval = clamp(interval, 1UL, max_load_balance_interval);
	sdg->sgc->next_update = jiffies + interval;

	if (!child) {
		// No child domain: the capacity comes straight from the CPU itself.
		update_cpu_capacity(sd, cpu);
		return;
	}
	// (remainder of the function omitted in this excerpt)
......
}

rq->cpu_capacity

本cpu的cfs的计算能力:

rq->cpu_capacity = (rq->cpu_capacity_orig - rq->rt.avg.util_avg) * (rq->cpu_capacity_orig - rq->avg_irq.util_avg) / rq->cpu_capacity_orig

 

sd->group->sgc->capacity

sdg->sgc->capacity = rq->cpu_capacity

 

Sd->group->sgc->max_capacity

sdg->sgc->max_capacity =  rq->cpu_capacity

 

Sd->group->sgc->min_capacity

sdg->sgc->min_capacity =  rq->cpu_capacity

 

Sd->group->sgc->next_update

jiffies + clamp(msecs_to_jiffies(sd->balance_interval), 1UL, max_load_balance_interval)

 

update_cpu_capacity

/*
 * update_cpu_capacity - recompute the capacity of @cpu and publish it.
 * @sd:  sched domain whose first group (sd->groups) receives the result
 * @cpu: CPU whose capacity is being recomputed
 *
 * Steps:
 *  1. cpu_capacity_orig = arch_scale_cpu_capacity() scaled by
 *     arch_scale_max_freq_capacity() and shifted right by
 *     SCHED_CAPACITY_SHIFT.
 *  2. Update the root domain's max_cpu_capacity record under its lock.
 *  3. cpu_capacity = scale_rt_capacity(cpu_capacity_orig), never below 1,
 *     mirrored into the group's capacity / min_capacity / max_capacity.
 */
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{
	struct sched_group *sdg = sd->groups;
	struct max_cpu_capacity *mcc;
	unsigned long capacity;
	unsigned long flags;
	int mcc_changed;

	/* Author's note: this is the value configured in the device tree. */
	capacity = arch_scale_cpu_capacity(sd, cpu);

	/*
	 * Author's note: the max-frequency factor is at most 1024 and
	 * SCHED_CAPACITY_SHIFT is 10, so the shift divides by 1024.
	 */
	capacity = (capacity * arch_scale_max_freq_capacity(sd, cpu)) >>
		   SCHED_CAPACITY_SHIFT;

	cpu_rq(cpu)->cpu_capacity_orig = capacity;

	/* Record of the largest CPU capacity, kept in the root domain. */
	mcc = &cpu_rq(cpu)->rd->max_cpu_capacity;

	raw_spin_lock_irqsave(&mcc->lock, flags);
	/*
	 * Replace the record when this CPU beats it, or when this CPU owns
	 * the record and its capacity has dropped.
	 */
	mcc_changed = mcc->val < capacity ||
		      (mcc->val > capacity && mcc->cpu == cpu);
	if (mcc_changed) {
		mcc->val = capacity;
		mcc->cpu = cpu;
	}
	raw_spin_unlock_irqrestore(&mcc->lock, flags);

#ifdef CONFIG_SCHED_DEBUG
	/* Logged only after the lock has been released. */
	if (mcc_changed)
		printk_deferred(KERN_INFO "CPU%d: update max cpu_capacity %lu\n",
				cpu, capacity);
#endif

	/* Remove the share consumed by RT tasks and IRQs. */
	capacity = scale_rt_capacity(capacity, cpu);
	if (!capacity)
		capacity = 1;

	cpu_rq(cpu)->cpu_capacity = capacity;
	sdg->sgc->capacity = capacity;
	sdg->sgc->min_capacity = capacity;
	sdg->sgc->max_capacity = capacity;
}

scale_rt_capacity

/*
 * scale_rt_capacity - capacity left on @cpu after RT and IRQ utilization.
 * @max_cap: capacity to scale down
 * @cpu:     CPU whose runqueue utilization is consulted
 *
 * Returns 1 when either the IRQ utilization or the RT utilization already
 * reaches @max_cap. Otherwise returns
 *   scale_irq_capacity(max_cap - rt_util, irq_util, max_cap)
 * where rt_util is rq->rt.avg.util_avg and irq_util is cpu_util_irq(rq).
 */
static unsigned long scale_rt_capacity(unsigned long max_cap, int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long irq_util, rt_util;

	irq_util = cpu_util_irq(rq);
	if (unlikely(irq_util >= max_cap))
		return 1;

	rt_util = READ_ONCE(rq->rt.avg.util_avg);
	if (unlikely(rt_util >= max_cap))
		return 1;

	/* First take out the RT share, then scale the rest by the non-IRQ fraction. */
	return scale_irq_capacity(max_cap - rt_util, irq_util, max_cap);
}

以sharkl5为例子,配置了宏:

defined(CONFIG_IRQ_TIME_ACCOUNTING) || \
defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)

 
/* Return the IRQ utilization average recorded on @rq (rq->avg_irq.util_avg). */
static inline unsigned long cpu_util_irq(struct rq *rq)
{
	unsigned long irq_util = rq->avg_irq.util_avg;

	return irq_util;
}
 
/*
 * scale_irq_capacity - scale @util by the fraction of @max not used by IRQs.
 * @util: value to scale
 * @irq:  IRQ utilization
 * @max:  full-scale capacity; must be non-zero (it is the divisor)
 *
 * Returns util * (max - irq) / max using integer arithmetic.
 */
static inline
unsigned long scale_irq_capacity(unsigned long util, unsigned long irq,
				 unsigned long max)
{
	unsigned long non_irq = max - irq;

	return (util * non_irq) / max;
}

总结一下:update_cpu_capacity如下:

rq->cpu_capacity_orig

cpu_rq(cpu)->cpu_capacity_orig = (arch_scale_cpu_capacity(sd, cpu) * arch_scale_max_freq_capacity(sd, cpu)) >> SCHED_CAPACITY_SHIFT

可以粗略为arch_scale_cpu_capacity(sd, cpu),即dts中配置的值

rq->rd->max_cpu_capacity->val

所有cpu中记录的最大计算能力;当本cpu的capacity大于已记录的最大值,或者本cpu就是已记录最大值所属的cpu但其capacity已变小时,才会更新

 

rq->rd->max_cpu_capacity->cpu

 cpu

 

rq->cpu_capacity

本cpu的cfs的计算能力:

rq->cpu_capacity = (rq->cpu_capacity_orig - rq->rt.avg.util_avg) * (rq->cpu_capacity_orig - rq->avg_irq.util_avg) / rq->cpu_capacity_orig

 

sd->group->sgc->capacity

sdg->sgc->capacity = rq->cpu_capacity

 

Sd->group->sgc->max_capacity

sdg->sgc->max_capacity =  rq->cpu_capacity

 

Sd->group->sgc->min_capacity

sdg->sgc->min_capacity =  rq->cpu_capacity

 

DIE层级的group_capacity的更新

/*
 * update_group_capacity - refresh capacity, min_capacity and max_capacity
 * of the first group of @sd (sd->groups).
 * @sd:  sched domain whose group is updated
 * @cpu: CPU passed to update_cpu_capacity() when @sd has no child; it is
 *       reused as the loop cursor in the SD_OVERLAP branch
 *
 * Also sets sgc->next_update to jiffies plus sd->balance_interval
 * (converted to jiffies and clamped to [1, max_load_balance_interval]).
 *
 * With no child domain the work is delegated to update_cpu_capacity().
 * Otherwise the group capacity is the sum over its members, and
 * min_capacity / max_capacity are the smallest / largest member values.
 */
void update_group_capacity(struct sched_domain *sd, int cpu)
{
	struct sched_domain *child = sd->child;
	struct sched_group *group, *sdg = sd->groups;
	unsigned long capacity, min_capacity, max_capacity;
	unsigned long interval;

	/* The refresh period is kept within [1, max_load_balance_interval]. */
	interval = msecs_to_jiffies(sd->balance_interval);
	interval = clamp(interval, 1UL, max_load_balance_interval);
	sdg->sgc->next_update = jiffies + interval;

	if (!child) {
		update_cpu_capacity(sd, cpu);
		return;
	}

	capacity = 0;
	min_capacity = ULONG_MAX;
	max_capacity = 0;

	if (child->flags & SD_OVERLAP) {
		/*
		 * SD_OVERLAP domains cannot assume that child groups
		 * span the current group, so walk the CPUs instead.
		 */

		for_each_cpu(cpu, sched_group_span(sdg)) {
			struct rq *rq = cpu_rq(cpu);
			unsigned long cpu_cap;

			/*
			 * build_sched_domains() -> init_sched_groups_capacity()
			 * gets here before we've attached the domains to the
			 * runqueues.
			 *
			 * Use capacity_of(), which is set irrespective of domains
			 * in update_cpu_capacity().
			 *
			 * This avoids capacity from being 0 and
			 * causing divide-by-zero issues on boot.
			 */
			if (unlikely(!rq->sd))
				cpu_cap = capacity_of(cpu);
			else
				cpu_cap = rq->sd->groups->sgc->capacity;

			capacity += cpu_cap;

			/*
			 * Compare against this CPU's own capacity. Comparing
			 * against the running sum would leave max_capacity
			 * equal to the group total and min_capacity equal to
			 * the first CPU visited.
			 */
			min_capacity = min(cpu_cap, min_capacity);
			max_capacity = max(cpu_cap, max_capacity);
		}
	} else {
		/*
		 * !SD_OVERLAP domains can assume that child groups
		 * span the current group.
		 *
		 * Author's note: when tracing the code, execution appeared
		 * to take this branch.
		 */

		group = child->groups;
		do {
			struct sched_group_capacity *sgc = group->sgc;

			capacity += sgc->capacity;
			min_capacity = min(sgc->min_capacity, min_capacity);
			max_capacity = max(sgc->max_capacity, max_capacity);
			group = group->next;
		} while (group != child->groups);
		/* Author's note: this can be viewed as aggregating one cluster's groups. */
	}

	/* sdg is sd->groups, so these writes update the domain's own group. */
	sdg->sgc->capacity = capacity;
	sdg->sgc->min_capacity = min_capacity;
	sdg->sgc->max_capacity = max_capacity;
}

sd->group->sgc->capacity

+=sgc->capacity

 

Sd->group->sgc->max_capacity

= max(sgc->max_capacity, max_capacity),即对child的各个group取最大值

 

Sd->group->sgc->min_capacity

= min(sgc->min_capacity, min_capacity),即对child的各个group取最小值

 

Sd->group->sgc->next_update

jiffies + clamp(msecs_to_jiffies(sd->balance_interval), 1UL, max_load_balance_interval)

 

 

那最初的那个sgc->capacity、sgc->min_capacity、sgc->max_capacity的值又是多少呢?

实际上是在topology.c文件中进行初始化

        sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg));

        sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;

        sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;

本文标签: 内核负载均衡updategroupcapacity