调度子系统7_负载均衡（四）-阿里云开发者社区

//	寻找sched domain中最忙的group
//	函数参数：
//		sd：待查找的sched domain
//		this_cpu：当前正在对其执行负载均衡的cpu
//		imbalance：为达到平衡需要移动的权重
//		idle：this_cpu当前的状态
//		sd_idle: sd空闲状态
//		cpus：可作为源cpu的集合
//		balance：指示this_cpu是否适合负载均衡
//	返回值：
//		如果存在不均衡，返回最忙的group
//		否则，如果用户建议power-savings balance，返回最不忙的group，
//			通过将其中cpus的进程移动到本group，使其idle
//	函数任务：
//		1.计算sd的负载信息
//		2.根据统计信息，决定是否进行负载均衡
//			2.1 this_cpu不适合在sd中进行均衡，则返回
//			2.2 没有最忙的group，或者最忙group可运行进程数为0，则返回
//			2.3 this_cpu所在group的负载大于最忙group的负载，则返回
//			2.4 计算sched domain的平均负载
//				公式：(SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr
//				2.4.1 如果this_group的负载大于等于平均负载，则返回
//			2.5 this_cpu所在group的负载阈值超过了最忙group的负载阈值，则返回
//				this_cpu所在group的负载阈值计算公式：sd->imbalance_pct * sds.this_load
//			2.6 运行到此处，说明存在失衡，计算失衡的负载量（即需要移动的负载数）
//			2.7 返回最忙的group
//		3.如果sd负载没有失衡，计算是否可以通过负载均衡来省电
//			3.1 返回最不忙的group
1.1 static struct sched_group *find_busiest_group(struct sched_domain *sd, int this_cpu,
		   unsigned long *imbalance, enum cpu_idle_type idle,
		   int *sd_idle, const struct cpumask *cpus, int *balance)
{

	struct sd_lb_stats sds;
	memset(&sds, 0, sizeof(sds));
	//计算sd的负载
	update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
					balance, &sds);
	//this_cpu不适合在sd中进行均衡，则返回
	if (!(*balance))
		goto ret;
	//没有最忙的group，或者最忙group可运行进程数为0，则返回
	if (!sds.busiest || sds.busiest_nr_running == 0)
		goto out_balanced;

	//this_cpu所在group的负载大于最忙group的负载，则返回
	if (sds.this_load >= sds.max_load)
		goto out_balanced;

	//sd的平均权重
	sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
	//this_cpu所在group负载大于sd的平均负载，则返回
	if (sds.this_load >= sds.avg_load)
		goto out_balanced;
	//imbalance_pct，进行负载均衡的阈值
	if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
		goto out_balanced;
	//存在失衡，计算需要均衡的负载量
	calculate_imbalance(&sds, this_cpu, imbalance);
	//返回最忙的group
	return sds.busiest;

out_balanced:
	//没有明显的失衡，检查是否可以进行通过负载均衡省电
	if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
		return sds.busiest;
ret:
	*imbalance = 0;
	return NULL;
}

//	计算sched domain负载均衡统计信息
//	函数参数：
//		sd：待计算负载统计信息的sd
//		this_cpu：当前正在对其执行负载均衡的cpu
//		idle：this_cpu的idle状态
//		sd_idle：sd的idle状态
//		cpu：可作为源cpu的掩码
//		balance：指示是否应该进行负载均衡
//		sds：保存统计信息的变量
//	函数任务：
//		1.遍历sched domain中所有的group
//			1.1 计算当前group的负载信息
//			1.2 如果this_cpu在当前group，并且当前group已经均衡，则退出
//			1.3 更新sched domain的负载统计
//				1.3.1 sds->total_load统计所有group的负载
//				1.3.2 sds->total_pwr统计所有group的cpu power
//			1.4 如果sched domain的子domain设置了SD_PREFER_SIBLING标志
//				1.4.1 说明sched domain的sibling之间移动进程
//				1.4.2 降低本group的group_capacity，之后将所有多余进程移动到其他sibling
//			1.5 如果this_cpu属于当前group，更新sched domain中关于this_cpu所在group的记录信息
//			1.6 如果当前group是sched domain中负载最重的group，记录group的负载信息
//				1.6.1 sds->max_load，记录sched domain内负载最重group的负载量
//				1.6.2 sds->busiest，记录sched domain内负载最重group的编号
//			1.7 更新sched domain power saving的信息
//	注：
//		sched domain下所有sched group组织成环形链表的形式。
1.2 static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
			enum cpu_idle_type idle, int *sd_idle,
			const struct cpumask *cpus, int *balance,
			struct sd_lb_stats *sds)
{
	struct sched_domain *child = sd->child;
	struct sched_group *group = sd->groups;
	struct sg_lb_stats sgs;
	int load_idx, prefer_sibling = 0;

	if (child && child->flags & SD_PREFER_SIBLING)
		prefer_sibling = 1;
	//初始化power saving的信息
	init_sd_power_savings_stats(sd, sds, idle);
	load_idx = get_sd_load_idx(sd, idle);
	//遍历sched domain中所有的group
	do {
		int local_group;
		//判断this_cpu是否当前group中
		local_group = cpumask_test_cpu(this_cpu,
					       sched_group_cpus(group));
		memset(&sgs, 0, sizeof(sgs));
		//计算当前group的负载信息
		update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
				local_group, cpus, balance, &sgs);
		//this_cpu在本group内，并且group已经均衡，则返回
		if (local_group && !(*balance))
			return;
		//更新sched domain的负载统计信息
		sds->total_load += sgs.group_load;
		sds->total_pwr += group->cpu_power;

		//sched domain中的子sched domain设置SD_PREFER_SIBLING，标识sched domain的sibling之间移动进程
		//降低本group的group_capacity，将所有多余进程移动到其他sibling
		if (prefer_sibling)
			sgs.group_capacity = min(sgs.group_capacity, 1UL);
		//this_cpu属于group，更新sched domain中关于this_cpu所在group的记录信息
		if (local_group) {
			//sds->this_load，this_cpu所在group的负载
			sds->this_load = sgs.avg_load;
			//sds->this，this_cpu所在的group
			sds->this = group;
			sds->this_nr_running = sgs.sum_nr_running;
			sds->this_load_per_task = sgs.sum_weighted_load;
		//当前group的平均负载大于sched domain中已遍历group的最大的负载
		//当前group就绪进程的个数大于group的容量，或者group设置了imb标识
		} else if (sgs.avg_load > sds->max_load &&
			   (sgs.sum_nr_running > sgs.group_capacity ||
				sgs.group_imb)) {
			//更新sched domain中用于记录具有最大负载group的信息
			sds->max_load = sgs.avg_load;
			sds->busiest = group;
			sds->busiest_nr_running = sgs.sum_nr_running;
			sds->busiest_group_capacity = sgs.group_capacity;
			sds->busiest_load_per_task = sgs.sum_weighted_load;
			sds->group_imb = sgs.group_imb;
		}
		//更新sched domain power saving的信息
		update_sd_power_savings_stats(group, sds, local_group, &sgs);
		//继续遍历下一个group
		group = group->next;
	} while (group != sd->groups);
}

//	计算sched group的负载信息
//	函数参数：
//		sd，group所在的sched domain
//		group，当前要计算的group
//		this_cpu，当前对其进行负载均衡的cpu
//		idle，this_cpu的idle状态
//		load_idx，Load index of sched_domain of this_cpu for load calc
//		sd_idle，group所在sched domain的idle状态
//		local_group，指示当前group是否包含this_cpu
//		cpus，可选为源cpu的掩码集合
//		balance：指示是否应该进行负载均衡
//		sgs，收集统计信息的变量
//	函数任务：
//		1.遍历group中的候选cpu
//			1.1 如果cpu有进程运行，更新sched domain为非idle状态
//			1.2 获取cpu的历史负载load
//				1.2.1 如果this_cpu在group内，返回max(cpu->cpu_load[load_idx], rq->load.weight)
//				1.2.2 如果this_cpu不在group内，返回min(cpu->cpu_load[load_idx], rq->load.weight)
//			1.3 更新group的统计信息
//				1.3.1 group->group_load, group的历史负载
//				1.3.2 group->sum_nr_running, group中进程总数
//				1.3.3 group->sum_weighted_load, group的当前负载
//		2.更新group的cpu power
//			2.1 sched domain的cpu power保存在其sd->groups->cpu_power中(即domain包含的第一个group的cpu_power字段)
//			2.2 sd->groups->cpu_power等于子domain的cpu power总和
//		3.计算group的平均负载
//			3.1 公式 avg_load = group_load/group->cpu_power
//		4.计算group每进程负载
//			4.1 公式 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running
//		5.如果group内cpu最大、最小负载悬殊，设置标识标识group内不均衡
//			5.1 公式 (max_cpu_load - min_cpu_load) > 2*avg_load_per_task
//		6.更新sched group的group_capacity，即能接纳的进程容量
//	注：
//		cpu power用于表示cpu group的能力，不同层次的cpu group具有不同的计算公式:
//			cpu domain: cpu_power = SCHED_LOAD_SCALE
//			physical domain：SCHED_LOAD_SCALE+SCHED_LOAD_SCALE*(cpus_weight(cpumask)-1)/10
//				其中cpus_weight计算物理cpu里的逻辑核个数（超线程）
//			node domain:在相同node domain下所有physical domain的cpu power的总和
1.3 static inline void update_sg_lb_stats(struct sched_domain *sd,
			struct sched_group *group, int this_cpu,
			enum cpu_idle_type idle, int load_idx, int *sd_idle,
			int local_group, const struct cpumask *cpus,
			int *balance, struct sg_lb_stats *sgs)
{
	unsigned long load, max_cpu_load, min_cpu_load;
	int i;
	unsigned int balance_cpu = -1, first_idle_cpu = 0;
	unsigned long avg_load_per_task = 0;
	//this_cpu在当前group
	if (local_group)
		balance_cpu = group_first_cpu(group);
	//最大、最小负载
	max_cpu_load = 0;
	min_cpu_load = ~0UL;
	//遍历group中的候选cpu
	for_each_cpu_and(i, sched_group_cpus(group), cpus) {
		struct rq *rq = cpu_rq(i);
		//非idle状态，并且有进程
		if (*sd_idle && rq->nr_running)
			*sd_idle = 0;
		//this_cpu在group内
		if (local_group) {
			//当前cpu为idle，并且为发现的第一个idle cpu
			if (idle_cpu(i) && !first_idle_cpu) {
				first_idle_cpu = 1;
				//balance_cpu记录this_cpu所在group内的第一个idle cpu
				balance_cpu = i;
			}
			//返回cpu i当前负载和load_idx历史负载记录两者最大的
			load = target_load(i, load_idx);
		} 
		else //this_cpu不在group内
		{
			//返回cpu i当前负载和load_idx历史负载记录两者最大的
			load = source_load(i, load_idx);
			//max_cpu_load，min_cpu_load记录最大、最小负载
			if (load > max_cpu_load)
				max_cpu_load = load;
			if (min_cpu_load > load)
				min_cpu_load = load;
		}
		//更新group的统计量
		//load_idx，历史负载统计
		sgs->group_load += load;
		//sched group中进程总数
		sgs->sum_nr_running += rq->nr_running;
		//cpu_rq(cpu)->load.weight，当前tick的负载统计
		sgs->sum_weighted_load += weighted_cpuload(i);

	}

	//只有当前domain的first idle cpu和first cpu(busiest)合适做load balance
	//CPU_NEWLY_IDLE类型的load balance将总是被允许
	if (idle != CPU_NEWLY_IDLE && local_group &&
	    balance_cpu != this_cpu) {
		//不需要做负载均衡
		*balance = 0;
		return;
	}
	//更新group的cpu power
	update_group_power(sd, this_cpu);

	//计算group的平均负载
	sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;

	//更新group内进程平均负载
	//	sgs->sum_nr_running记录group内进程总数
	if (sgs->sum_nr_running)
		avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
	//如果group内最大负载、最小负载悬殊，表示组内不均衡
	if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
		sgs->group_imb = 1;
	//更新sched group的group_capacity，即能接纳的进程容量
	sgs->group_capacity =
		DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
}

//参考 负载均衡（二）中的论文

调度子系统7_负载均衡（四）

热门文章

最新文章

相关课程

相关电子书

相关实验场景