ltp（六） IRQ之irqbalance01.c源码分析-摩杜云开发者社区

前言

本篇文章主要对ltp内irq模块的测试用例之一irqbalance01进行源码分析。它是内核中断子系统的测试项之一，其中蕴含的技术知识还是很值得学习一下的。

irqbalance是什么？项目主页上有以下描述：

Irqbalance is a daemon to help balance the cpu load generated by interrupts across all of a systems cpus. Irqbalance identifies the highest volume interrupt sources, and isolates each of them to a single unique cpu, so that load is spread as much as possible over an entire processor set, while minimizing cache miss rates for irq handlers.

简单来说，irqbalance这一服务可以帮助平衡中断在系统所有cpu上产生的负载。Irqbalance会识别出中断量最大的中断源，并将它们各自隔离到一个单独的cpu上，以便负载尽可能均匀地分布在整个处理器集上，同时最大限度地降低irq处理程序的缓存未命中率，这对多核心系统的性能有很大的提升。

1.源码分析

1.1函数调用关系图

ltp（六） IRQ之irqbalance01.c源码分析_irqbalance

1.2源码分析

1.2.1setup()

根据函数调用关系图，我们可以了解到，setup()作为测试的初始化入口，主要调用了如下两个函数：

/*
 * One-time test setup: take an initial snapshot of the interrupt tables,
 * print it, then make sure there is something to test.
 */
static void setup(void)
{
	/* Gather the system's interrupt information. */
	collect_irq_info();
	/* Dump what was gathered. */
	print_irq_info();

	/* The test needs at least one CPU column and one IRQ row. */
	if (!nr_cpus) {
		tst_brk(TBROK, "No CPUs found in /proc/interrupts?");
	}

	if (!nr_irqs) {
		tst_brk(TBROK, "No IRQs found in /proc/interrupts?");
	}
}

1.2.2run()

接下来，我们对irqbalance01.c 的源码主要的测试函数run()进行分析:

// SPDX-License-Identifier: GPL-2.0-or-later
/* Copyright (c) 2021 SUSE LLC <rpalethorpe@suse.com> */
/*\
 * [Description]
 *
 * Check that something (e.g. irqbalance daemon) is performing IRQ
 * load balancing.
 *
 * On many systems userland needs to set /proc/irq/$IRQ/smp_affinity
 * to prevent many IRQs being delivered to the same CPU.
 *
 * Note some drivers and IRQ controllers will distribute IRQs
 * evenly. Some systems will have housekeeping CPUs configured. Some
 * IRQs can not be masked etc. So this test is not appropriate for all
 * scenarios.
 *
 * Furthermore, exactly how IRQs should be distributed is a
 * performance and/or security issue. This is only a generic smoke
 * test. It will hopefully detect misconfigured systems and total
 * balancing failures which are often silent errors.
 *
 * Heuristic: Evidence of Change
 *
 * 1. Find IRQs with a non-zero count
 * 2. Check if they are now disallowed
 *
 * There are two sources of information we need to parse:
 *
 * 1. /proc/interrupts
 * 2. /proc/irq/$IRQ/smp_affinity
 *
 * We get the active IRQs and CPUs from /proc/interrupts. It also
 * contains the per-CPU IRQ counts and info we do not care about.
 *
 * We get the IRQ masks from each active IRQ's smp_affinity file. This
 * is a bitmask written out in hexadecimal format. It shows which CPUs
 * an IRQ may be received by.
 */

#include <stdlib.h>

#include "tst_test.h"
#include "tst_safe_stdio.h"
#include "tst_safe_file_at.h"

/*
 * Whether an IRQ may be delivered to a given CPU according to its
 * smp_affinity mask. The enumerators are character codes so that a value
 * can be printed directly with %c next to the IRQ count.
 */
enum affinity {
	DENY = '-',	/* CPU's bit is clear in the mask */
	ALLOW = '+',	/* CPU's bit is set in the mask */
};

/* Table dimensions: CPU columns and numeric IRQ rows in /proc/interrupts. */
static unsigned int nr_cpus;
static unsigned int nr_irqs;

/* IRQ number of each row, nr_irqs entries. */
static unsigned int *irq_ids;

/*
 * Per-CPU interrupt counts, row-major with nr_cpus columns. There are
 * nr_irqs + 1 rows; the last row holds the per-CPU totals.
 */
static unsigned int *irq_stats;

/* ALLOW/DENY per IRQ and CPU, row-major, nr_irqs x nr_cpus. */
static enum affinity *irq_affinity;

/*
 * Read the whole of @path into a NUL-terminated buffer.
 *
 * The buffer is a function-level static that grows a page at a time and is
 * reused by every call, so the returned pointer is only valid until the
 * next call and must not be freed by the caller.
 *
 * @path:    file to read
 * @len_out: if non-NULL, receives the number of bytes read (excluding the
 *           terminating NUL)
 *
 * Breaks the test with TBROK if the file is empty.
 */
static char *read_proc_file(const char *const path, size_t *const len_out)
{
	static char *data;
	static size_t capacity;
	const size_t page_sz = SAFE_SYSCONF(_SC_PAGESIZE);
	int fd = SAFE_OPEN(path, O_RDONLY);
	size_t filled = 0;

	for (;;) {
		size_t nread;

		/* Always keep one spare byte for the terminating NUL. */
		if (filled + 1 >= capacity) {
			capacity += page_sz;
			data = SAFE_REALLOC(data, capacity);
		}

		nread = SAFE_READ(0, fd, data + filled, capacity - filled - 1);
		if (!nread)
			break;

		filled += nread;
	}

	if (filled == 0)
		tst_brk(TBROK, "Empty %s?", path);

	data[filled] = '\0';

	SAFE_CLOSE(fd);

	if (len_out != NULL)
		*len_out = filled;

	return data;
}

/*
 * Parse /proc/interrupts and every /proc/irq/$IRQ/smp_affinity into the
 * file-level tables.
 *
 * On return:
 *  - nr_cpus is the number of "CPU" columns in the /proc/interrupts header
 *  - nr_irqs is the number of rows whose label is numeric ("<digits>:")
 *  - irq_ids[row] is the IRQ number of each such row
 *  - irq_stats[row * nr_cpus + col] is the count for that IRQ on that CPU;
 *    the extra row at index nr_irqs holds the per-CPU totals
 *  - irq_affinity[row * nr_cpus + col] is ALLOW or DENY according to the
 *    IRQ's smp_affinity mask
 *
 * Breaks the test with TBROK on input it cannot parse.
 */
static void collect_irq_info(void)
{
	char *buf, *c, *first_row;
	char path[PATH_MAX];
	size_t row, col, len;
	long acc;
	unsigned int cpu_total, bit;

	nr_cpus = 0;
	nr_irqs = 0;

	/*
	 * /proc/interrupts holds the interrupt details, e.g.:
	 *
	 *        CPU0   CPU1   CPU2   CPU3
	 *   0:     14      0      0      0  IR-IO-APIC    2-edge      timer
	 *   8:      0      0      0      1  IR-IO-APIC    8-edge      rtc0
	 *   9:      0    132      0      0  IR-IO-APIC    9-fasteoi   acpi
	 *
	 * From left to right: the IRQ number, the number of times the IRQ
	 * fired on each CPU, then descriptive text which is not parsed.
	 */
	buf = read_proc_file("/proc/interrupts", NULL);

	/* Count CPUs, header columns are like /CPU[0-9]+/ */
	for (c = buf; *c != '\0' && *c != '\n'; c++) {
		if (!strncmp(c, "CPU", 3))
			nr_cpus++;
	}

	/* Step over the header's newline, but never past the terminator. */
	if (*c == '\n')
		c++;

	/* Remember where the first data row starts for the second pass. */
	first_row = c;

	/* Count IRQs, real IRQs start with /[0-9]+:/ */
	while (*c != '\0') {
		switch (*c) {
		case ' ':
		case '\t':
		case '\n':
		case '0' ... '9':
			c++;
			break;
		case ':':
			nr_irqs++;
			/* fall-through */
		default:
			/* Nothing else on this line matters for counting. */
			while (*c != '\n' && *c != '\0')
				c++;
		}
	}

	tst_res(TINFO, "Found %u CPUS, %u IRQs", nr_cpus, nr_irqs);

	irq_ids = SAFE_REALLOC(irq_ids, nr_irqs * sizeof(*irq_ids));
	irq_stats = SAFE_REALLOC(irq_stats,
				 nr_cpus * (nr_irqs + 1) * sizeof(*irq_stats));
	irq_affinity = SAFE_REALLOC(irq_affinity,
				    nr_cpus * nr_irqs * sizeof(*irq_affinity));

	/*
	 * realloc() does not clear memory. Zero the counts so that a cell
	 * which no row fills in is not summed as garbage or as a stale value
	 * from a previous call.
	 */
	memset(irq_stats, 0, nr_cpus * (nr_irqs + 1) * sizeof(*irq_stats));

	c = first_row;
	acc = -1;	/* -1 means "no number is being accumulated" */
	row = col = 0;

	/* Parse columns containing IRQ counts and IRQ IDs into acc. Ignore
	 * everything else.
	 */
	while (*c != '\0') {
		switch (*c) {
		case ' ':
		case '\t':
			if (acc >= 0) {
				/*
				 * Storage exists only for the counted rows and
				 * for nr_cpus columns; drop anything outside.
				 */
				if (row < nr_irqs && col < nr_cpus)
					irq_stats[row * nr_cpus + col] = acc;
				acc = -1;
				col++;
			}
			break;
		case '\n':
			if (acc != -1)
				tst_brk(TBROK, "Unexpected EOL");
			col = 0;
			row++;
			break;
		case '0' ... '9':
			if (acc == -1)
				acc = 0;

			acc *= 10;
			acc += *c - '0';
			break;
		case ':':
			if (acc == -1 || col != 0)
				tst_brk(TBROK, "Unexpected ':'");
			/*
			 * row counts every line, nr_irqs only the numeric
			 * ones, so a numeric row after other rows would index
			 * past the end of irq_ids.
			 */
			if (row >= nr_irqs)
				tst_brk(TBROK, "IRQ %ld found after non-IRQ rows", acc);
			irq_ids[row] = acc;
			acc = -1;
			break;
		default:
			/* First non-numeric text: skip the rest of the line. */
			acc = -1;
			while (*c != '\n' && *c != '\0')
				c++;
			continue;
		}

		c++;
	}

	/* Sum each CPU's column into the extra row at index nr_irqs. */
	for (col = 0; col < nr_cpus; col++) {
		cpu_total = 0;

		for (row = 0; row < nr_irqs; row++)
			cpu_total += irq_stats[row * nr_cpus + col];

		irq_stats[row * nr_cpus + col] = cpu_total;
	}

	/* Read the CPU affinity masks for each IRQ. The first CPU is in the
	 * right most (least significant) bit. See bitmap_string() in the kernel
	 * (%*pb)
	 */
	for (row = 0; row < nr_irqs; row++) {
		/*
		 * smp_affinity is a hexadecimal bitmask showing which CPUs
		 * this IRQ may be received by.
		 */
		snprintf(path, sizeof(path), "/proc/irq/%u/smp_affinity",
			 irq_ids[row]);
		buf = read_proc_file(path, &len);
		c = buf + len;
		col = 0;

		/*
		 * Walk the mask from its last character to its first. Each hex
		 * digit covers four CPUs. Digits beyond nr_cpus bits are still
		 * validated but otherwise ignored.
		 */
		while (c > buf) {
			c--;

			switch (*c) {
			case '\n':
			case ' ':
			case ',':
				continue;
			case '0' ... '9':
				acc = *c - '0';
				break;
			case 'a' ... 'f':
				acc = 10 + *c - 'a';
				break;
			default:
				tst_res(TINFO, "%u/smp_affnity: %s", irq_ids[row], buf);
				tst_brk(TBROK, "Wasn't expecting 0x%02x", *c);
			}

			for (bit = 0; bit < 4 && col < nr_cpus; bit++) {
				irq_affinity[row * nr_cpus + col++] =
					(acc & (1 << bit)) ? ALLOW : DENY;
			}
		}

		/* The mask must cover every CPU column that was counted. */
		if (col < nr_cpus) {
			tst_res(TINFO, "%u/smp_affnity: %s", irq_ids[row], buf);
			tst_brk(TBROK, "Only found %zu cpus", col);
		}
	}
}

static void print_irq_info(void)
{
	size_t row, col;
	unsigned int count;
	enum affinity aff;

	// 绘出表头
	tst_printf("  IRQ       ");
	for (col = 0; col < nr_cpus; col++)
		tst_printf("CPU%-8zu", col);

	tst_printf("\n");

	for (row = 0; row < nr_irqs; row++) {
		tst_printf("%5u:", irq_ids[row]); // 打印中断源

		for (col = 0; col < nr_cpus; col++) {
			count = irq_stats[row * nr_cpus + col];
			aff = irq_affinity[row * nr_cpus + col];

			tst_printf("%10u%c", count, aff); // 打印中断次数 && CPU是否可以调度中断
		}

		tst_printf("\n");
	}

	tst_printf("Total:");

	for (col = 0; col < nr_cpus; col++)
		tst_printf("%10u ", irq_stats[row * nr_cpus + col]);

	tst_printf("\n");
}

/*
 * Heuristic check: count the IRQ/CPU pairs where the IRQ has a non-zero
 * count on a CPU that its affinity mask does not allow. Reports TPASS if
 * at least one such pair exists, TFAIL otherwise.
 */
static void evidence_of_change(void)
{
	/* Both tables are row-major with nr_cpus columns, so scan them flat. */
	const size_t cells = (size_t)nr_irqs * nr_cpus;
	size_t idx, disallowed = 0;

	for (idx = 0; idx < cells; idx++) {
		/* Count only IRQs that fired here but are not allowed here. */
		if (irq_stats[idx] && irq_affinity[idx] != ALLOW)
			disallowed++;
	}

	tst_res(disallowed ? TPASS : TFAIL,
		"Heuristic: Detected %zu irq-cpu pairs have been dissallowed",
		disallowed);
}

/*
 * One-time test setup: collect and print an initial snapshot of the
 * interrupt tables, then break the test if no CPUs or no IRQs were found.
 */
static void setup(void)
{
	collect_irq_info();
	print_irq_info();

	if (!nr_cpus) {
		tst_brk(TBROK, "No CPUs found in /proc/interrupts?");
	}

	if (!nr_irqs) {
		tst_brk(TBROK, "No IRQs found in /proc/interrupts?");
	}
}

/* Test body: re-read the interrupt tables, then apply the heuristic. */
static void run(void)
{
	/* Take a fresh snapshot of the counts and affinity masks. */
	collect_irq_info();
	/* Report pass or fail based on that snapshot. */
	evidence_of_change();
}

/*
 * Test descriptor: registers setup() and run() as the setup and test_all
 * callbacks.
 * NOTE(review): min_cpus = 2 presumably makes the framework skip the test
 * on single-CPU systems - confirm against the LTP API.
 */
static struct tst_test test = {
	.setup = setup,
	.test_all = run,
	.min_cpus = 2,
};

1.3测试实例

如下是irqbalance01测试的打印之一，可以对照如下的打印，理解上一小节我对程序的一些注释。此外，大家也可以尝试阅读一下irqbalance01.c作者对其的description，以便于更好地了解本测试的核心思想。总体来说，本测试还要对照irqbalance service的源码进行解读，irqbalance service通过修改各中断源的smp_affinity，达到了irq在系统上的负载均衡，当然，它的实现肯定会比我在此描述的复杂很多，不过在此就不展开了，大家有兴趣可以去了解一下它的源码！

irqbalance01.c:129: TINFO: Found 2 CPUS, 46 IRQs
  IRQ       CPU0       CPU1       
......
   37:      5970+         0+
   38:         0+         0+
   39:         0+         0+

尾言

温故知新，岁岁常新！:) 因博主水平能力有限，如果有大佬在阅读过程中发现其中的谬误，希望可以不吝赐教，3Q。