Linux Kernel：非早期异常处理程序的初始化

本文采用Linux 内核 v3.10 版本 x86_64架构

在以前的文章里，我们顺着内核启动的流程，梳理了异常处理程序的早期初始化。在 Linux Kernel：中断和异常处理程序的早期初始化（续）里，我们介绍到了 setup_arch 函数内部的 early_trap_init 以及 early_trap_pf_init 函数。接下来，让我们回到 start_kernel 函数，继续梳理后续的初始化流程。

回到 start_kernel 函数之后，首个跟异常和中断相关的函数就是 trap_init 。在之前的流程里，我们已经为 3 种异常设置了处理程序，分别是调试异常（Debug Exception，#DB），断点异常（Breakpoint exception，#BP）以及页故障异常（Page-Fault exception，#PF）；在 trap_init 函数里，我们将为其它异常设置处理程序。本文，我们就来介绍 trap_init 函数的实现，该函数定义如下：

// file: arch/x86/kernel/traps.c
void __init trap_init(void)
{
	int i;

#ifdef CONFIG_EISA
	void __iomem *p = early_ioremap(0x0FFFD9, 4);

	if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24))
		EISA_bus = 1;
	early_iounmap(p, 4);
#endif

	set_intr_gate(X86_TRAP_DE, &divide_error);
	set_intr_gate_ist(X86_TRAP_NMI, &nmi, NMI_STACK);
	/* int4 can be called from all */
	set_system_intr_gate(X86_TRAP_OF, &overflow);
	set_intr_gate(X86_TRAP_BR, &bounds);
	set_intr_gate(X86_TRAP_UD, &invalid_op);
	set_intr_gate(X86_TRAP_NM, &device_not_available);
#ifdef CONFIG_X86_32
	set_task_gate(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS);
#else
	set_intr_gate_ist(X86_TRAP_DF, &double_fault, DOUBLEFAULT_STACK);
#endif
	set_intr_gate(X86_TRAP_OLD_MF, &coprocessor_segment_overrun);
	set_intr_gate(X86_TRAP_TS, &invalid_TSS);
	set_intr_gate(X86_TRAP_NP, &segment_not_present);
	set_intr_gate_ist(X86_TRAP_SS, &stack_segment, STACKFAULT_STACK);
	set_intr_gate(X86_TRAP_GP, &general_protection);
	set_intr_gate(X86_TRAP_SPURIOUS, &spurious_interrupt_bug);
	set_intr_gate(X86_TRAP_MF, &coprocessor_error);
	set_intr_gate(X86_TRAP_AC, &alignment_check);
#ifdef CONFIG_X86_MCE
	set_intr_gate_ist(X86_TRAP_MC, &machine_check, MCE_STACK);
#endif
	set_intr_gate(X86_TRAP_XF, &simd_coprocessor_error);

	/* Reserve all the builtin and the syscall vector: */
	for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
		set_bit(i, used_vectors);

#ifdef CONFIG_IA32_EMULATION
	set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
	set_bit(IA32_SYSCALL_VECTOR, used_vectors);
#endif

#ifdef CONFIG_X86_32
	set_system_trap_gate(SYSCALL_VECTOR, &system_call);
	set_bit(SYSCALL_VECTOR, used_vectors);
#endif

	/*
	 * Set the IDT descriptor to a fixed read-only location, so that the
	 * "sidt" instruction will not leak the location of the kernel, and
	 * to defend the IDT against arbitrary memory write vulnerabilities.
	 * It will be reloaded in cpu_init() */
	__set_fixmap(FIX_RO_IDT, __pa_symbol(idt_table), PAGE_KERNEL_RO);
	idt_descr.address = fix_to_virt(FIX_RO_IDT);

	/*
	 * Should be a barrier for any external CPU state:
	 */
	cpu_init();

	x86_init.irqs.trap_init();

#ifdef CONFIG_X86_64
	memcpy(&nmi_idt_table, &idt_table, IDT_ENTRIES * 16);
	set_nmi_gate(X86_TRAP_DB, &debug);
	set_nmi_gate(X86_TRAP_BP, &int3);
#endif
}

一、检查 EISA 总线标识

如果设置了内核配置参数 CONFIG_EISA ，说明内核启用了对 EISA（Extended Industry Standard Architecture）总线的支持，此时会检查 BIOS 中的总线标识，以确认系统是否真的基于 EISA 总线。

#ifdef CONFIG_EISA
	void __iomem *p = early_ioremap(0x0FFFD9, 4);

	if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24))
		EISA_bus = 1;
	early_iounmap(p, 4);
#endif

调用 early_ioremap 函数，将以物理地址 0x0FFFD9 为起始点的 4 个字节映射到虚拟内存，然后调用 readl 函数从映射后的虚拟地址中读取 4 个字节。如果读取到的数据是 “EISA”，说明是 EISA 总线，将变量 EISA_bus 设置为 1。最后，调用 early_iounmap 函数，取消映射。

变量 EISA_bus 的初始值为 0。需要注意的是，只有在配置了 CONFIG_EISA 时它才是一个真正的变量；未配置 CONFIG_EISA 时，它被定义为值为 0 的宏：

//include/linux/eisa.h
# define EISA_bus 0

关于early_ioremap 和early_iounmap函数的具体实现，请参考 Linux Kernel：内存管理之早期 I/O 内存映射(early ioremap) 。

在 PC 发展的早期，由于地址线只有 20 位，最大可寻址 1MB 的物理内存。其中 0x00000 ~ 0x9FFFF （共640KB）是普通内存，由系统支配；而 0xA0000 ~ 0xFFFFF （共 384 KB）是 BIOS 保留区域，该区域内存布局如下（具体可参考 Memory Map (x86)）：

在 32 位及 64 位模式下，为了向前兼容，该区域被保留下来。内核对该区域地址空间定义如下：

// file: arch/x86/include/uapi/asm/e820.h
#define ISA_START_ADDRESS	0xa0000
#define ISA_END_ADDRESS		0x100000

#define BIOS_BEGIN		0x000a0000
#define BIOS_END		0x00100000

地址 0x0FFFD9是主板 BIOS 的总线标识地址，我们从开源 BIOS 软件 coreboot 的源码中，能够看到该地址是如何赋值的：

// file: src/device/oprom/realmode/x86.c
static void setup_rombios(void)
{
	const char date[] = "06/11/99";
	memcpy((void *)0xffff5, &date, 8);

	const char ident[] = "PCI_ISA";
	memcpy((void *)0xfffd9, &ident, 7);

	/* system model: IBM-AT */
	write8((void *)0xffffe, 0xfc);
}

需要说明的是，EISA 总线已经过时了，被 PCI / PCIe 总线所替代，所以其总线标识已经不是 “EISA” 了。

二、安装中断门

接下来，为各种异常安装中断门：

	set_intr_gate(X86_TRAP_DE, &divide_error);
	set_intr_gate_ist(X86_TRAP_NMI, &nmi, NMI_STACK);
	/* int4 can be called from all */
	set_system_intr_gate(X86_TRAP_OF, &overflow);
	set_intr_gate(X86_TRAP_BR, &bounds);
	set_intr_gate(X86_TRAP_UD, &invalid_op);
	set_intr_gate(X86_TRAP_NM, &device_not_available);
#ifdef CONFIG_X86_32
	set_task_gate(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS);
#else
	set_intr_gate_ist(X86_TRAP_DF, &double_fault, DOUBLEFAULT_STACK);
#endif
	set_intr_gate(X86_TRAP_OLD_MF, &coprocessor_segment_overrun);
	set_intr_gate(X86_TRAP_TS, &invalid_TSS);
	set_intr_gate(X86_TRAP_NP, &segment_not_present);
	set_intr_gate_ist(X86_TRAP_SS, &stack_segment, STACKFAULT_STACK);
	set_intr_gate(X86_TRAP_GP, &general_protection);
	set_intr_gate(X86_TRAP_SPURIOUS, &spurious_interrupt_bug);
	set_intr_gate(X86_TRAP_MF, &coprocessor_error);
	set_intr_gate(X86_TRAP_AC, &alignment_check);
#ifdef CONFIG_X86_MCE
	set_intr_gate_ist(X86_TRAP_MC, &machine_check, MCE_STACK);
#endif
	set_intr_gate(X86_TRAP_XF, &simd_coprocessor_error);

异常的向量号定义如下：

// file: arch/x86/include/asm/traps.h
/* Interrupts/Exceptions */
enum {
	X86_TRAP_DE = 0,	/*  0, Divide-by-zero */
	X86_TRAP_DB,		/*  1, Debug */
	X86_TRAP_NMI,		/*  2, Non-maskable Interrupt */
	X86_TRAP_BP,		/*  3, Breakpoint */
	X86_TRAP_OF,		/*  4, Overflow */
	X86_TRAP_BR,		/*  5, Bound Range Exceeded */
	X86_TRAP_UD,		/*  6, Invalid Opcode */
	X86_TRAP_NM,		/*  7, Device Not Available */
	X86_TRAP_DF,		/*  8, Double Fault */
	X86_TRAP_OLD_MF,	/*  9, Coprocessor Segment Overrun */
	X86_TRAP_TS,		/* 10, Invalid TSS */
	X86_TRAP_NP,		/* 11, Segment Not Present */
	X86_TRAP_SS,		/* 12, Stack Segment Fault */
	X86_TRAP_GP,		/* 13, General Protection Fault */
	X86_TRAP_PF,		/* 14, Page Fault */
	X86_TRAP_SPURIOUS,	/* 15, Spurious Interrupt */
	X86_TRAP_MF,		/* 16, x87 Floating-Point Exception */
	X86_TRAP_AC,		/* 17, Alignment Check */
	X86_TRAP_MC,		/* 18, Machine Check */
	X86_TRAP_XF,		/* 19, SIMD Floating-Point Exception */
	X86_TRAP_IRET = 32,	/* 32, IRET Exception */
};

各异常简要说明如下（详细信息可参考 Intel 64 and IA-32 Architectures Software Developer Manuals Volume 3A, Chapter 6.15 EXCEPTION AND INTERRUPT REFERENCE）：

0 - Divide Error（#DE）。

指示 DIV 或 IDIV 指令的除数为 0；或者目标操作数的位数不足以表示计算结果。

1 - Debug Exception (#DB)。

指示检测到一个或多个调试异常条件。#DB 的触发条件有以下几种：

Instruction fetch breakpoint
Data read or write breakpoint
I/O read or write breakpoint
General detect condition (in conjunction with in-circuit emulation)
Single-step（当设置 RFLAGS 中的 TF 标志时，可实现单步执行）
Task-switch
Execution of INT1（硬件厂商使用 INT1 指令进行硬件调试，软件厂商使用 INT3 指令进行断点调试）

2 - 不可屏蔽中断（ Non-Maskable Interrupt， NMI）。

通过处理器 NMI 针脚触发的外部中断或者由 I/O APIC 设置并传送给 Local APIC 的 NMI 请求。

3 - Breakpoint Exception (#BP)。

由 INT3 指令引起的断点异常。 INT3 指令的操作码只有一个字节，调试器可以把要执行指令的第一个字节替换成 INT3 指令的操作码，以达到设置断点的目的。

4 - Overflow Exception (#OF)。

当执行 INTO 指令时，如果 RFLAGS 寄存器中的 OF 标志被置位，则触发 #OF 异常；否则，什么也不会发生。

5 - BOUND Range Exceeded Exception (#BR)

指示索引值是否超出上、下边界。BOUND 指令会检查给定的索引（有符号整数）是否超出内存中指定的上、下边界范围。如果超出边界，则触发 #BR 异常；否则，什么也不会发生。

6 - Invalid Opcode Exception (Undefined Opcode，#UD)

指示处理器执行了无效的或保留的操作码。

7 - Device Not Available Exception (No Math Coprocessor，#NM)

控制寄存器 CR0 的 EM 标志被置位时，处理器执行了 x87 FPU 浮点指令。 CR0 的 EM 标志被置位，意味着处理器没有 x87 FPU 浮点处理单元。
控制寄存器 CR0 的 MP 和 TS 标志被置位时，处理器执行了 WAIT/FWAIT 指令。
控制寄存器 CR0 的 TS 标志被置位，EM 标志被清除时，处理器执行了 x87 FPU、 MMX 或者 SSE/SSE2/SSE3 指令。

8 - Double Fault Exception (#DF)

指示处理器在调用异常处理程序处理第一个异常时，又检测到了第二个异常。通常情况下，当处理器在处理一个异常时又检测到了另一个异常，这两个异常会被串行处理。然而，如果处理器不能串行处理它们，就会触发 #DF 异常。

9 - Coprocessor Segment Overrun

因外部的协处理器引发的问题，仅用于 Intel386 处理器。

10 - Invalid TSS Exception (#TS)

当任务切换或执行指令时，使用的 TSS 信息无效。

11 - Segment Not Present (#NP)

引用了一个不存在的段（段或门描述符的存在位被清除）。

12 - Stack Fault Exception (#SS)

操作超出了段界限；或者引用了不存在的栈段；或者在 64 位模式下，栈指针的地址不符合 canonical 类型。

13 - General Protection Exception (#GP)

违反了处理器的任意一条保护规则。

14 - Page-Fault Exception (#PF)

页目录项或页表项的存在位为 0；或者将页结构项的保留位设置为 1；或者违反了任何一种分页保护机制。

15 - Spurious Interrupt

15 号向量是 Intel 系统的保留向量，被 Linux 用作虚假中断。虚假中断本质上是一种 bug，出现了中断信号却无法确定中断源。可参考 Spurious interrupts。

16 - x87 FPU Floating-Point Error (#MF)

指示 x87 FPU 检测到浮点错误，如数字溢出、被 0 除等。

17 - Alignment Check Exception (#AC)

操作数的地址没有被正确的对齐（比如，一个长整数的地址不是 4 的倍数）。

18 - Machine-Check Exception (#MC)

指示处理器检测到内部机器错误或者总线错误。

19 - SIMD Floating-Point Exception (#XM)

指示处理器检测到了 SSE/SSE2/SSE3 SIMD 浮点异常。

我们注意到，安装中断门时调用了 3 个不同的函数：

set_intr_gate
set_system_intr_gate
set_intr_gate_ist

这三个函数定义如下：

// file: arch/x86/include/asm/desc.h
static inline void set_intr_gate(unsigned int n, void *addr)
{
	BUG_ON((unsigned)n > 0xFF);
	_set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS);
}

/*
 * This routine sets up an interrupt gate at directory privilege level 3.
 */
static inline void set_system_intr_gate(unsigned int n, void *addr)
{
	BUG_ON((unsigned)n > 0xFF);
	_set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS);
}

static inline void set_intr_gate_ist(int n, void *addr, unsigned ist)
{
	BUG_ON((unsigned)n > 0xFF);
	_set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS);
}

这三个函数内部都调用了 _set_gate 函数来安装中断门，只是传递给 _set_gate 函数的参数不同。

_set_gate 函数接收 6 个参数，分别是：

gate - 中断向量。
type - 门类型。一共有四种，分别是：中断门、陷阱门、调用门和任务门。

// file: arch/x86/include/asm/desc_defs.h
enum {
	GATE_INTERRUPT = 0xE,
	GATE_TRAP = 0xF,
	GATE_CALL = 0xC,
	GATE_TASK = 0x5,
};

针对中断及异常处理程序，x86 模式允许使用其中的三种 -- 中断门、陷阱门和任务门； x86_64 模式下，仅保留了中断门和陷阱门。

addr - 中断处理程序地址。
dpl - 门描述符中的特权等级，用于程序保护。
ist - 中断栈表（Interrupt Stack Table）。指示是否使用了中断栈表，中断栈表最多容纳序号 1 ~7 共 7 种栈，0 表示未使用中断栈表。
seg - 段选择子。

// file: arch/x86/include/asm/desc.h
static inline void _set_gate(int gate, unsigned type, void *addr,
			     unsigned dpl, unsigned ist, unsigned seg)
{
	gate_desc s;

	pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg);
	/*
	 * does not need to be atomic because it is only done once at
	 * setup time
	 */
	write_idt_entry(idt_table, gate, &s);
}

_set_gate 函数中又调用了 pack_gate 函数和 write_idt_entry 函数。其中 pack_gate 函数用于把各参数组装成门描述符；write_idt_entry 函数用于把门描述符写入中断描述符表。在 x86_64 架构的 Linux 内核中，所有的异常处理程序使用的都是中断门，陷阱门和调用门没有使用。

64 位中断门描述符格式如下图所示：

使用 set_intr_gate 安装的中断门，其特权级为 0，所以只能在特权级 0 调用；使用 set_system_intr_gate 安装的中断门，其特权级为 3，可以从任何特权级调用；使用 set_intr_gate_ist安装的中断门，只能在特权级 0 调用，且使用的是中断栈表。

关于 set_intr_gate函数的详细解析，请参考 Linux kernel：中断和异常处理程序的早期初始化 2.1.1 set_intr_gate 函数。

关于中断栈表的内容，请参考 Linux Kernel：中断和异常处理程序的早期初始化（续） 4.1.3 中断栈表及 TSS 初始化。

三、中断向量位图 -- used_vectors

在完成各中断门的设置后，系统相关的异常处理程序就都安装好了。从中断向量 32 开始，都是用于用户定义的中断处理程序的。内核使用数组 used_vectors来标识已安装的中断处理程序，used_vectors 本质上是一个位图，每个中断向量对应着 used_vectors中的一个位。

接下来，通过循环，在 used_vectors 中将所有系统相关的中断向量位置位。

	/* Reserve all the builtin and the syscall vector: */
	for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
		set_bit(i, used_vectors);

宏FIRST_EXTERNAL_VECTOR扩展为 0x20，即十进制 32，这是用户自定义中断中最小的中断号。

// file: arch/x86/include/asm/irq_vectors.h
/*
 * IDT vectors usable for external interrupt sources start at 0x20.
 * (0x80 is the syscall vector, 0x30-0x3f are for ISA)
 */
#define FIRST_EXTERNAL_VECTOR		0x20

变量 used_vectors 声明如下：

// file: arch/x86/include/asm/desc.h
/* used_vectors is BITMAP for irq is not managed by percpu vector_irq */
extern unsigned long used_vectors[];

从注释中也能看到，该变量实际上作为位图使用的，其定义如下：

// file: arch/x86/kernel/traps.c
DECLARE_BITMAP(used_vectors, NR_VECTORS);

宏 NR_VECTORS 表示最大中断数量，Intel 处理器最大支持 256 个中断。

// file: arch/x86/include/asm/irq_vectors.h
#define NR_VECTORS           256

宏 DECLARE_BITMAP 用来声明一个位图，其定义如下：

// file: include/linux/types.h
#define DECLARE_BITMAP(name,bits) \
	unsigned long name[BITS_TO_LONGS(bits)]

由于我们使用 unsigned long 数组来表示位图，所以需要将比特位数量转换成数组的元素数量。宏 BITS_TO_LONGS 用来实现对应的转换，其定义如下：

// file: include/linux/bitops.h
#define BITS_TO_LONGS(nr)	DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long))

BITS_TO_LONGS 中又引用了宏 DIV_ROUND_UP 和 BITS_PER_BYTE。

宏 BITS_PER_BYTE扩展为 8，表示每个字节包含 8 个比特位：

// file: include/linux/bitops.h
#define BITS_PER_BYTE		8

宏 DIV_ROUND_UP 用来计算 n 除以 d 并向上取整的结果（即容纳 n 个单位至少需要多少个大小为 d 的块），其实现方式就是把 n 加上 d-1 后再除以 d，该宏定义如下：

// file: include/linux/kernel.h
#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))

最终，used_vectors 实现为一个可以容纳 256 个比特位的 unsigned long 数组。

3.1 set_bit 函数

接下来，我们看下 set_bit 函数的实现：

// file: arch/x86/include/asm/bitops.h
/**
 * set_bit - Atomically set a bit in memory
 * @nr: the bit to set
 * @addr: the address to start counting from
 *
 * This function is atomic and may not be reordered.  See __set_bit()
 * if you do not require the atomic guarantees.
 *
 * Note: there are no guarantees that this function will not be reordered
 * on non x86 architectures, so if you are writing portable code,
 * make sure not to rely on its reordering guarantees.
 *
 * Note that @nr may be almost arbitrarily large; this function is not
 * restricted to acting on a single-word quantity.
 */
static __always_inline void
set_bit(unsigned int nr, volatile unsigned long *addr)
{
	if (IS_IMMEDIATE(nr)) {
		asm volatile(LOCK_PREFIX "orb %1,%0"
			: CONST_MASK_ADDR(nr, addr)
			: "iq" ((u8)CONST_MASK(nr))
			: "memory");
	} else {
		asm volatile(LOCK_PREFIX "bts %1,%0"
			: BITOP_ADDR(addr) : "Ir" (nr) : "memory");
	}
}

该函数接收 2 个参数：

nr：需要设置的比特位位置

addr：位图的起始地址

首先，通过宏 IS_IMMEDIATE检查参数 nr 是否是编译时常量，该宏定义如下：

// file: arch/x86/include/asm/bitops.h
#define IS_IMMEDIATE(nr)		(__builtin_constant_p(nr))

__builtin_constant_p 是 gcc 的内建函数，用来检测一个值是否为编译时常量。如果该值是编译时常量，返回 1；否则，返回 0。__builtin_constant_p 具体用法可参考 gcc 文档。

如果__builtin_constant_p 函数返回 1，会走到第一个分支。该分支使用的是 or 指令，指令后缀 b 指示操作数是字节大小。宏 CONST_MASK_ADDR 和 CONST_MASK 定义如下：

// file: arch/x86/include/asm/bitops.h
#define CONST_MASK_ADDR(nr, addr)	BITOP_ADDR((void *)(addr) + ((nr)>>3))
#define CONST_MASK(nr)			(1 << ((nr) & 7))

addr 是位图的起始地址；(nr)>>3将 nr 右移 3 位，计算出该比特位处于位图的第几个字节；(void *)(addr) + ((nr)>>3) 获取到比特位对应的字节地址。

其中宏 BITOP_ADDR 定义如下：

// file: arch/x86/include/asm/bitops.h
#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1)
/* Technically wrong, but this avoids compilation errors on some gcc
   versions. */
#define BITOP_ADDR(x) "=m" (*(volatile long *) (x))
#else
#define BITOP_ADDR(x) "+m" (*(volatile long *) (x))
#endif

可以看到，如果 gcc 版本小于 4.1，宏 BITOP_ADDR(x) 扩展后的输出操作数类型修饰符为 "=m"，"=" 表示该操作数为只写的，"m" 表示操作数位于内存中；否则，扩展为 "+m"，"+" 表示该操作数为可读写的。另外，我们从注释也能看到，使用修饰符 "=m" 只是为了避免编译器错误，正常应该使用 "+m" 修饰符。

宏 __GNUC__ 和 __GNUC_MINOR__是 gcc 的预定义宏，用来表示 gcc 版本，详见 Common Predefined Macros。

综上，宏 CONST_MASK_ADDR 的作用是获取比特位所在字节的地址。

再来看下宏 CONST_MASK，该宏的作用是生成一个字节大小的掩码，掩码中只有目标比特位为 1。

(nr) & 7 获取比特位在字节内的偏移，1 << ((nr) & 7)将字节内对应的比特位设置为 1。

最后，通过 or 指令，把宏 CONST_MASK_ADDR 得到的字节与宏 CONST_MASK 得到的掩码进行按位或操作，将位图中对应的比特位置位。

如果__builtin_constant_p函数返回 0，会走到 else 分支。该分支实现比较简单，直接使用 bts 指令来设置比特位。bts 指令接收 2 个参数：位偏移以及位串基地址。该指令会将位串中指定偏移的比特位设置为 1，并把原比特位的值保存到EFLAGS 寄存器的 CF 标志位中。具体用法可参考 Intel 64 and IA-32 Architectures Software Developer Manuals Volume 2A， Chapter 3 中的 BTS 指令。

至于为什么当 nr 为编译时常量时，要使用 or 指令而不是 bts 指令：需要注意，以内存为目标操作数的 or 指令同样是 “读-修改-写” 操作，两者的区别并不在于访存次数。当 nr 为编译时常量时，目标字节的地址和位掩码都可以在编译期确定，只需一条带立即数的 orb 指令即可完成置位；而以内存为操作数的 bts 指令需要在运行时根据位偏移定位实际要访问的内存单元，通常比 or 指令更慢。

此处，由于我们在循环中传给 set_bit 函数的第一个参数 i 是一个变量，所以会走到 else 分支，最终会使用 bts 指令来设置比特位。

不管是第一分支还是 else 分支，都使用了宏 LOCK_PREFIX，该宏的实现依赖内核配置参数 CONFIG_SMP。当CONFIG_SMP 为真时，宏 LOCK_PREFIX 扩展为 lock 锁前缀；否则，就是一个空定义。从 set_bit 函数的注释中也能看到，该函数保证位设置是一个原子操作，所以在多处理器（MP）系统下，使用了 lock 锁前缀。

// file: arch/x86/include/asm/alternative.h
/*
 * Alternative inline assembly for SMP.
 *
 * The LOCK_PREFIX macro defined here replaces the LOCK and
 * LOCK_PREFIX macros used everywhere in the source tree.
 *
 * SMP alternatives use the same data structures as the other
 * alternatives and the X86_FEATURE_UP flag to indicate the case of a
 * UP system running a SMP kernel.  The existing apply_alternatives()
 * works fine for patching a SMP kernel for UP.
 *
 * The SMP alternative tables can be kept after boot and contain both
 * UP and SMP versions of the instructions to allow switching back to
 * SMP at runtime, when hotplugging in a new CPU, which is especially
 * useful in virtualized environments.
 *
 * The very common lock prefix is handled as special case in a
 * separate table which is a pure address list without replacement ptr
 * and size information.  That keeps the table sizes small.
 */

#ifdef CONFIG_SMP
#define LOCK_PREFIX_HERE \
		".pushsection .smp_locks,\"a\"\n"	\
		".balign 4\n"				\
		".long 671f - .\n" /* offset */		\
		".popsection\n"				\
		"671:"

#define LOCK_PREFIX LOCK_PREFIX_HERE "\n\tlock; "

#else /* ! CONFIG_SMP */
#define LOCK_PREFIX_HERE ""
#define LOCK_PREFIX ""
#endif

Intel 文档对 lock 前缀指令说明如下：

Causes the processor’s LOCK# signal to be asserted during execution of the accompanying instruction (turns the

instruction into an atomic instruction). In a multiprocessor environment, the LOCK# signal ensures that the

processor has exclusive use of any shared memory while the signal is asserted.

The LOCK prefix is typically used with the BTS instruction to perform a read-modify-write operation on a memory

location in shared memory environment.

四、兼容 32 位系统调用 -- 0x80 中断

接下来，为了兼容 32 位系统调用，为中断向量 0x80 设置了中断门，其中断处理程序为 ia32_syscall。然后在位图 used_vectors中将位 0x80 设置为 1。

#ifdef CONFIG_IA32_EMULATION
	set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
	set_bit(IA32_SYSCALL_VECTOR, used_vectors);
#endif

宏 IA32_SYSCALL_VECTOR 扩展为 0x80，其定义如下：

// file: arch/x86/include/asm/irq_vectors.h
#define IA32_SYSCALL_VECTOR		0x80

参考上文中的 set_bit 函数，由于 IA32_SYSCALL_VECTOR是编译时常量，所以会走到第一个分支，使用 orb 指令来完成位设置。

五、映射中断描述符表到固定映射区

接下来，将中断描述符表映射到索引 FIX_RO_IDT 所对应的固定映射区虚拟地址。

	/*
	 * Set the IDT descriptor to a fixed read-only location, so that the
	 * "sidt" instruction will not leak the location of the kernel, and
	 * to defend the IDT against arbitrary memory write vulnerabilities.
	 * It will be reloaded in cpu_init() */
	__set_fixmap(FIX_RO_IDT, __pa_symbol(idt_table), PAGE_KERNEL_RO);
	idt_descr.address = fix_to_virt(FIX_RO_IDT);

固定映射相关内容，请参考 Linux Kernel：内存管理之固定映射（Fixmap）。

变量 idt_table是中断描述符表的基地址，该表定义在文件 arch/x86/kernel/head_64.S 中：

// file: arch/x86/kernel/head_64.S
ENTRY(idt_table)
	.skip IDT_ENTRIES * 16

其中，宏IDT_ENTRIES（扩展为 256）表示中断描述符的数量，16 指示每个中断描述符的大小。所以，中断描述符表idt_table 最多可以容纳 256 个元素，支持 256 个中断向量。

// file: arch/x86/include/asm/segment.h
#define IDT_ENTRIES 256

宏 __pa_symbol 用于把虚拟地址转换成物理地址：

// file: arch/x86/include/asm/page.h
/* __pa_symbol should be used for C visible symbols.
   This seems to be the official gcc blessed way to do such arithmetic. */
/*
 * We need __phys_reloc_hide() here because gcc may assume that there is no
 * overflow during __pa() calculation and can optimize it unexpectedly.
 * Newer versions of gcc provide -fno-strict-overflow switch to handle this
 * case properly. Once all supported versions of gcc understand it, we can
 * remove this Voodoo magic stuff. (i.e. once gcc3.x is deprecated)
 */
#define __pa_symbol(x) \
	__phys_addr_symbol(__phys_reloc_hide((unsigned long)(x)))

其内部又调用了宏 __phys_addr_symbol 和宏 __phys_reloc_hide，这两个宏位于同一文件中：

// file: arch/x86/include/asm/page_64.h
#define __phys_addr_symbol(x) \
	((unsigned long)(x) - __START_KERNEL_map + phys_base)

#define __phys_reloc_hide(x)	(x)

由于 __phys_reloc_hide 内部对变量未做任何处理，所以真正起作用的是宏 __phys_addr_symbol内的代码。宏 __START_KERNEL_map表示内核代码映射区的起始地址，phys_base是内核代码加载的物理基地址，x - __START_KERNEL_map 得到虚拟地址 x 相对于内核代码映射区起始地址的偏移量，再加上 phys_base就得到该虚拟地址对应的物理地址。

关于物理地址的详细计算过程，请参考 Linux Kernel：内存管理之分页（Paging）第 3.1 节及第八节。

六、 cpu_init 函数

接下来是 cpu_init 函数：

	/*
	 * Should be a barrier for any external CPU state:
	 */
	cpu_init();

我们只关注该函数内与中断和异常相关的代码：

// file: arch/x86/kernel/cpu/common.c
void __cpuinit cpu_init(void)
{
	struct orig_ist *oist;
	struct task_struct *me;
	struct tss_struct *t;
	unsigned long v;
	int cpu;
	int i;
    
    ...
        
    cpu = stack_smp_processor_id();
	t = &per_cpu(init_tss, cpu);
	oist = &per_cpu(orig_ist, cpu);
    
    ...
        
	/*
	 * set up and load the per-CPU TSS
	 */
	if (!oist->ist[0]) {
		char *estacks = per_cpu(exception_stacks, cpu);

		for (v = 0; v < N_EXCEPTION_STACKS; v++) {
			estacks += exception_stack_sizes[v];
			oist->ist[v] = t->x86_tss.ist[v] =
					(unsigned long)estacks;
			if (v == DEBUG_STACK-1)
				per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks;
		}
	}
    
    ...
}

这段代码主要是对中断栈表及任务状态段 TSS 进行初始化。其详细实现过程，请参考 Linux Kernel：中断和异常处理程序的早期初始化（续） 4.1.3 中断栈表及 TSS 初始化。

七、x86 架构特定的 trap_init 函数

完成 TSS 和中断栈表的初始化后，执行 x86 架构特定的初始化代码：

x86_init.irqs.trap_init();

x86_init 是一个 x86_init_ops 类型结构体，表示系统初始化时的各种操作。其内部成员也是结构体，代表各子系统的相关操作。比如，结构体 x86_init_resources 表示资源相关的操作；x86_init_paging 表示分页相关的操作。

// file: arch/x86/include/asm/x86_init.h
/**
 * struct x86_init_ops - functions for platform specific setup
 *
 */
struct x86_init_ops {
	struct x86_init_resources	resources;
	struct x86_init_mpparse		mpparse;
	struct x86_init_irqs		irqs;
	struct x86_init_oem		oem;
	struct x86_init_paging		paging;
	struct x86_init_timers		timers;
	struct x86_init_iommu		iommu;
	struct x86_init_pci		pci;
};

我们用到的中断相关的成员变量 irqs，为 x86_init_irqs 结构体类型，该结构体的成员均为函数指针，其定义如下：

// file: arch/x86/include/asm/x86_init.h
/**
 * struct x86_init_irqs - platform specific interrupt setup
 * @pre_vector_init:		init code to run before interrupt vectors
 *				are set up.
 * @intr_init:			interrupt init code
 * @trap_init:			platform specific trap setup
 */
struct x86_init_irqs {
	void (*pre_vector_init)(void);
	void (*intr_init)(void);
	void (*trap_init)(void);
};

x86_init 是结构体 x86_init_ops 类型变量，其成员 irqs.trap_init 被定义为 x86_init_noop：

// file: arch/x86/kernel/x86_init.c
/*
 * The platform setup functions are preset with the default functions
 * for standard PC hardware.
 */
struct x86_init_ops x86_init __initdata = {
    ...
        
    .irqs = {
		.pre_vector_init	= init_ISA_irqs,
		.intr_init		= native_init_IRQ,
		.trap_init		= x86_init_noop,
	},
    
    ...
}

x86_init_noop 是一个空函数：

// file: arch/x86/kernel/x86_init.c
void __cpuinit x86_init_noop(void) { }

所以，x86_init.irqs.trap_init(); 这行代码实际是个空操作，没有任何影响。

八、两套中断描述符表 -- idt_table 和 nmi_idt_table

接下来我们看下最后几行代码：

#ifdef CONFIG_X86_64
	memcpy(&nmi_idt_table, &idt_table, IDT_ENTRIES * 16);
	set_nmi_gate(X86_TRAP_DB, &debug);
	set_nmi_gate(X86_TRAP_BP, &int3);
#endif

memcpy 函数把中断描述符表 idt_table 中的数据拷贝到 nmi_idt_table 中。nmi_idt_table 与 idt_table 定义在同一文件中，且大小一致；它们都位于 .bss 节，且对齐到 L1 缓存行大小。

// file: arch/x86/kernel/head_64.S
	.section .bss, "aw", @nobits
	.align L1_CACHE_BYTES
ENTRY(idt_table)
	.skip IDT_ENTRIES * 16

	.align L1_CACHE_BYTES
ENTRY(nmi_idt_table)
	.skip IDT_ENTRIES * 16

宏 L1_CACHE_BYTES定义如下，最终扩展为 64 字节：

// file: arch/x86/include/asm/cache.h
/* L1 cache line size */
#define L1_CACHE_BYTES	(1 << L1_CACHE_SHIFT)
#define L1_CACHE_SHIFT	(CONFIG_X86_L1_CACHE_SHIFT)

#define CONFIG_X86_L1_CACHE_SHIFT 6

接下来，调用 set_nmi_gate 函数，把调试异常（Debug exception， #DB ）和断点异常（Breakpoint exception， #BP）的中断门描述符安装到表 nmi_idt_table 中。

set_nmi_gate 函数的实现可参考 Linux kernel：中断和异常处理程序的早期初始化中的 set_intr_gate 函数。两者的差异仅仅是门描述符安装的位置不同，set_intr_gate将中断门安装到 idt_table 中，而 set_nmi_gate 将中断门安装到 nmi_idt_table 中。

// file: arch/x86/include/asm/desc.h
static inline void set_nmi_gate(int gate, void *addr)
{
	gate_desc s;

	pack_gate(&s, GATE_INTERRUPT, (unsigned long)addr, 0, 0, __KERNEL_CS);
	write_idt_entry(nmi_idt_table, gate, &s);
}

在这里，我们看到有两套基本相同的中断描述符表 idt_table 和 nmi_idt_table，它们的区别仅仅是在处理 #DB 和 #BP 时所使用的栈不同。在idt_table 中，这两种异常都使用了中断栈表中的调试栈（DEBUG_STACK）；而在 nmi_idt_table 中未使用调试栈。对比 early_trap_init 函数看一下：

/* Set of traps needed for early debugging. */
void __init early_trap_init(void)
{
	set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);
	/* int3 can be called from all */
	set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
#ifdef CONFIG_X86_32
	set_intr_gate(X86_TRAP_PF, &page_fault);
#endif
	load_idt(&idt_descr);
}

那么，为什么要有两套中断描述符表呢？在回答这个问题之前，我们先来看下 nmi_idt_table 表是如何使用的。

在 NMI 异常的处理程序 do_nmi 中，调用了 nmi_nesting_preprocess 和 nmi_nesting_postprocess 分别进行预处理以及后处理。

// file: arch/x86/kernel/nmi.c
dotraplinkage notrace __kprobes void
do_nmi(struct pt_regs *regs, long error_code)
{
	nmi_nesting_preprocess(regs);

	...

	nmi_nesting_postprocess();
}

在预处理函数 nmi_nesting_preprocess 中，首先会检查被中断程序使用的栈是否为调试栈。如果是，说明被中断的是调试相关的异常（#DB 或 #BP）处理程序，那么就会调用 debug_stack_set_zero 函数，并将 per-cpu 变量 update_debug_stack 的值设置为 1。 update_debug_stack 是一个标志变量，如果为 1，说明切换了 #DB 及 #BP 处理程序的栈；如果为 0，说明未切换。

// file: arch/x86/kernel/nmi.c
static inline void nmi_nesting_preprocess(struct pt_regs *regs)
{
	/*
	 * If we interrupted a breakpoint, it is possible that
	 * the nmi handler will have breakpoints too. We need to
	 * change the IDT such that breakpoints that happen here
	 * continue to use the NMI stack.
	 */
	if (unlikely(is_debug_stack(regs->sp))) {
		debug_stack_set_zero();
		this_cpu_write(update_debug_stack, 1);
	}
}

debug_stack_set_zero 函数内部，会将 per-cpu 变量 debug_stack_use_ctr 的值自增 1；并调用 load_idt 函数将 nmi_idt_descr 地址加载到中断描述符表寄存器中，更换中断描述符表。从函数名称中也能看出来，set_zero 是指将中断栈表索引设置为 0，表示不使用中断栈表。加载完成后，#DB 和 #BP 异常处理程序不再使用中断栈表中的调试栈。

// file: arch/x86/kernel/cpu/common.c
void debug_stack_set_zero(void)
{
	this_cpu_inc(debug_stack_use_ctr);
	load_idt((const struct desc_ptr *)&nmi_idt_descr);
}

当 NMI 异常处理程序完成后，调用后处理函数 nmi_nesting_postprocess。该函数首先根据 per-cpu 变量 update_debug_stack 来判断被 NMI 异常中断的是否是调试处理程序；如果是，则会调用 debug_stack_reset函数，并将 update_debug_stack 恢复为 0。

// file: arch/x86/kernel/nmi.c
static inline void nmi_nesting_postprocess(void)
{
	if (unlikely(this_cpu_read(update_debug_stack))) {
		debug_stack_reset();
		this_cpu_write(update_debug_stack, 0);
	}
}

debug_stack_reset函数内部，会调用 load_idt 函数，将中断描述符表寄存器恢复成原始的 idt_descr 地址。恢复以后，#DB 和 #BP 异常处理程序会使用调试栈。



void debug_stack_reset(void)
{
	if (WARN_ON(!this_cpu_read(debug_stack_use_ctr)))
		return;
	if (this_cpu_dec_return(debug_stack_use_ctr) == 0)
		load_idt((const struct desc_ptr *)&idt_descr);
}

那么，为什么要来回切换中断描述符表呢？

在 x86-64 架构下，有 6 种异常处理程序使用了中断栈表（Interrupt Stack Table，IST），其中就包括调试异常（Debug Exception，#DB）以及断点异常（BreakPoint， #BP ），它们使用了中断栈表中的调试栈。而 NMI 异常和 #DB 异常或 #BP 是可以嵌套的。一个典型的案例，处理器执行普通程序时遇到了断点，转而去执行断点处理程序；在执行过程中，发生了 NMI 异常，处理器又开始执行 NMI 处理程序；在 NMI 处理程序中又遇到了断点，又去执行断点处理程序。在第一个断点处，处理器会进行栈切换，把处理程序使用的栈切换到中断栈表的调试栈，然后把当前上下文保存到调试栈的固定位置；在第二个断点处，如果依然使用中断栈表，处理器又会进行栈切换，并把当前上下文保存到调试栈的同样位置。这样就会导致第一个断点处的上下文被覆盖掉，从而导致程序错误。

为了避免这种情况，在处理 NMI 异常时，就需要先判断出 NMI 是否嵌套在断点处理程序里。如果是，就在实际处理前把 #DB 以及 #BP 的处理程序切换成不使用调试栈的方式，而是使用 NMI 处理程序的栈，这样就能防止当在 NMI 处理程序里再次遇到断点时，第一次断点的上下文被覆盖的情况；然后，当 NMI 异常处理完成时，再把 #DB 以及 #BP 的处理程序恢复成使用中断栈表。

由于无法在内核运行时动态修改异常处理程序代码，所以准备了两套中断描述符表（Interrupt Descriptor Table，IDT），根据需要进行加载。

内核文件中，对此说明如下：

// arch/x86/kernel/nmi.c
/*
 * In x86_64 things are a bit more difficult. This has the same problem
 * where an NMI hitting a breakpoint that calls iret will remove the
 * NMI context, allowing a nested NMI to enter. What makes this more
 * difficult is that both NMIs and breakpoints have their own stack.
 * When a new NMI or breakpoint is executed, the stack is set to a fixed
 * point. If an NMI is nested, it will have its stack set at that same
 * fixed address that the first NMI had, and will start corrupting the
 * stack. This is handled in entry_64.S, but the same problem exists with
 * the breakpoint stack.
 *
 * If a breakpoint is being processed, and the debug stack is being used,
 * if an NMI comes in and also hits a breakpoint, the stack pointer
 * will be set to the same fixed address as the breakpoint that was
 * interrupted, causing that stack to be corrupted. To handle this case,
 * check if the stack that was interrupted is the debug stack, and if
 * so, change the IDT so that new breakpoints will use the current stack
 * and not switch to the fixed address. On return of the NMI, switch back
 * to the original IDT.
 */

九、参考资料

1、Intel 开发者手册：Intel 64 and IA-32 Architectures Software Developer Manuals

2、Extended Industry Standard Architecture

3、 Linux Kernel：内存管理之早期 I/O 内存映射(early ioremap)

4、Memory Map (x86)

5、coreboot

6、Spurious interrupts

7、 Linux kernel：中断和异常处理程序的早期初始化 2.1.1 set_intr_gate 函数。

8、 Linux Kernel：中断和异常处理程序的早期初始化（续） 4.1.3 中断栈表及 TSS 初始化。

9、gcc 文档 -__builtin_constant_p

10、Linux Kernel：内存管理之分页（Paging）第 3.1 节及第八节

11、Linux Kernel：内存管理之固定映射（Fixmap）