从 Java NIO poll 到 Linux 内核 poll:一次系统调用的完整旅程
Java NIO 包中的 Selector 是高性能 I/O 复用的基石,其底层实现依赖于操作系统的多路复用机制,如 Linux 的 poll、epoll 等。本文将跟随一段从 Java 到内核的代码路径,详细剖析一次 poll 系统调用的完整流程。我们将使用 OpenJDK 和 Linux 内核的源码片段作为证据,逐层解析其中的关键逻辑。
1. Java 层:JNI 封装 poll
Java 的 sun.nio.ch.Net.poll 是一个 native 方法,用于在单个文件描述符上执行 poll 操作(通常用于 Selector 的实现中)。对应的 JNI 函数如下:
c
/*
 * JNI entry point for sun.nio.ch.Net.poll: polls a single file
 * descriptor for the requested events with a millisecond timeout.
 * Returns the ready-event mask, 0 when interrupted by a signal,
 * or IOS_THROWN after raising a Java exception for other errors.
 */
JNIEXPORT jint JNICALL
Java_sun_nio_ch_Net_poll(JNIEnv* env, jclass this, jobject fdo, jint events, jlong timeout)
{
struct pollfd pfd;
int rv;
pfd.fd = fdval(env, fdo); // extract the integer fd from the Java FileDescriptor
pfd.events = events; // events of interest (POLLIN/POLLOUT/...)
if (timeout < -1) {
timeout = -1; // any value below -1 means "wait indefinitely"
} else if (timeout > INT_MAX) {
timeout = INT_MAX; // clamp: poll(2) takes an int timeout
}
rv = poll(&pfd, 1, (int)timeout); // issue the poll system call
if (rv >= 0) {
return pfd.revents; // ready-event mask (0 if nothing became ready)
} else if (errno == EINTR) {
return 0; // interrupted by a signal: report no events
} else {
handleSocketError(env, errno); // any other errno becomes a Java exception
return IOS_THROWN;
}
}
关键点:
- 通过 fdval 从 Java 的 FileDescriptor 对象中提取整型文件描述符。
- 构造 struct pollfd,设置 fd 和 events。
- 处理 timeout 参数,确保其合法(小于 -1 时转为 -1 表示无限等待;大于 INT_MAX 时截断)。
- 直接调用 poll 系统调用(注意 glibc 的 poll 是对内核 poll 的封装)。
- 返回值处理:成功时返回 revents;若被信号中断则返回 0;否则抛出套接字错误。
这一层充当了 Java 世界与内核世界的桥梁。
2. 用户态到内核态:系统调用入口
在 x86-64 架构上,应用程序通过 syscall 指令陷入内核。Linux 内核在初始化时通过 syscall_init 设置了相关的 MSR 寄存器:
c
/*
 * Simplified excerpt: programs the MSRs that steer the SYSCALL
 * instruction (the full version appears in the source section below).
 */
void syscall_init(void)
{
wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); // SYSCALL jumps to entry_SYSCALL_64
// ... 32-bit compat-mode setup elided
wrmsrl(MSR_SYSCALL_MASK, ...); // RFLAGS bits cleared on kernel entry
}
MSR_LSTAR 指定了 syscall 指令进入后的入口地址,即 entry_SYSCALL_64。该汇编入口保存用户态寄存器、切换到内核栈、调用 C 函数 do_syscall_64。
entry_SYSCALL_64 的关键步骤(简化):
- 保存用户栈指针,切换到内核栈。
- 在栈上构造 struct pt_regs,包含所有寄存器值。
- 调用 do_syscall_64,参数为 pt_regs 指针和系统调用号。
- 返回后尝试使用 sysret 快速返回用户态。
do_syscall_64 根据系统调用号查找对应的内核函数。对于 poll,其系统调用号 __NR_poll 在 x86-64 上定义为 7(对应系统调用表 syscall_64.tbl 中的 "7 common poll sys_poll" 一行,见下文源码)。内核中通过 SYSCALL_DEFINE 宏定义系统调用函数,poll 的定义展开后最终调用 do_sys_poll。
3. 内核 poll 实现:do_sys_poll
do_sys_poll 是 poll 系统调用的核心实现,负责从用户空间复制数据、执行轮询并将结果写回。
c
/*
 * Simplified excerpt of do_sys_poll (the full version appears in the
 * source section below; the poll_wqueues declaration and the out_fds
 * cleanup label are elided here).
 */
static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
struct timespec64 *end_time)
{
// small argument lists live on the stack to avoid kmalloc traffic
long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
struct poll_list *const head = (struct poll_list *)stack_pps;
struct poll_list *walk = head;
unsigned long todo = nfds;
int len, err, fdcount;
if (nfds > rlimit(RLIMIT_NOFILE)) // enforce the per-process open-file limit
return -EINVAL;
len = min_t(unsigned int, nfds, N_STACK_PPS);
// copy the user-space pollfd array into the kernel, chunk by chunk
for (;;) {
walk->next = NULL;
walk->len = len;
if (!len)
break;
if (copy_from_user(walk->entries, ufds + nfds - todo,
sizeof(struct pollfd) * walk->len))
goto out_fds; // copy from user space failed
todo -= walk->len;
if (!todo)
break;
len = min(todo, POLLFD_PER_PAGE);
walk = walk->next = kmalloc(struct_size(walk, entries, len), GFP_KERNEL);
if (!walk) {
err = -ENOMEM;
goto out_fds;
}
}
poll_initwait(&table); // set up poll_wqueues and its poll_table callback
fdcount = do_poll(head, &table, end_time); // run the polling loop
poll_freewait(&table); // detach from all wait queues
// write the results (revents) back to user space
if (!user_write_access_begin(ufds, nfds * sizeof(*ufds)))
goto out_fds;
for (walk = head; walk; walk = walk->next) {
struct pollfd *fds = walk->entries;
int j;
for (j = walk->len; j; fds++, ufds++, j--)
unsafe_put_user(fds->revents, &ufds->revents, Efault);
}
user_write_access_end();
err = fdcount;
// ... free the kmalloc'd poll_list chunks (see the full source below)
}
关键设计:
- 分层复制:使用 poll_list 链表将用户数据分批复制,小量数据使用栈上内存(stack_pps),大量数据使用 kmalloc 按页分配。这既减少了内存分配开销,又避免了栈溢出。
- poll_initwait:初始化 poll_wqueues 结构,其中包含 poll_table 回调函数。poll_table 用于在轮询时将进程添加到文件描述符的等待队列。
- do_poll:实际轮询逻辑,返回就绪的文件描述符数量。
- 写回结果:使用 unsafe_put_user 将每个文件描述符的 revents 写回用户空间。
4. 轮询核心:do_poll
do_poll 函数实现了一个可能超时或可中断的循环,遍历所有 poll_list 中的文件描述符,调用 do_pollfd 检查事件。
c
/*
 * Simplified excerpt of do_poll (full version in the source section
 * below): scan every pollfd, and if nothing is ready, sleep until an
 * event, a signal, or the timeout.
 */
static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
struct timespec64 *end_time)
{
poll_table* pt = &wait->pt;
ktime_t expire, *to = NULL;
int timed_out = 0, count = 0;
u64 slack = 0;
__poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
unsigned long busy_start = 0;
// zero timeout: poll once without registering on any wait queue
if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
pt->_qproc = NULL;
timed_out = 1;
}
if (end_time && !timed_out)
slack = select_estimate_accuracy(end_time);
for (;;) {
struct poll_list *walk;
bool can_busy_loop = false;
// walk every pollfd in every chunk of the list
for (walk = list; walk != NULL; walk = walk->next) {
struct pollfd *pfd, *pfd_end;
pfd = walk->entries;
pfd_end = pfd + walk->len;
for (; pfd != pfd_end; pfd++) {
if (do_pollfd(pfd, pt, &can_busy_loop, busy_flag)) {
count++;
pt->_qproc = NULL; // event found: no need to register further waiters
busy_flag = 0; // stop busy polling
can_busy_loop = false;
}
}
}
pt->_qproc = NULL; // all waiters registered; don't re-register next iteration
if (!count) {
count = wait->error;
if (signal_pending(current))
count = -ERESTARTNOHAND;
}
if (count || timed_out)
break;
// busy polling: spin for low-latency network sockets instead of sleeping
if (can_busy_loop && !need_resched()) {
if (!busy_start) {
busy_start = busy_loop_current_time();
continue;
}
if (!busy_loop_timeout(busy_start))
continue;
}
busy_flag = 0;
// first pass with a timeout: convert it to a ktime_t expiry
if (end_time && !to) {
expire = timespec64_to_ktime(*end_time);
to = &expire;
}
// sleep until woken by an event, a signal, or the timeout
if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
timed_out = 1;
}
return count;
}
核心机制:
- 事件检测:对每个 pollfd 调用 do_pollfd,若发现事件则计数,并将 pt->_qproc 置空以禁止后续再注册等待队列。
- 等待与超时:若没有事件且未超时,调用 poll_schedule_timeout 让进程睡眠,等待设备唤醒或超时到达。
- busy polling:针对低延迟场景,可以在一定时间内忙轮询而非睡眠,代码中的 busy_flag 和 can_busy_loop 控制这一行为。
5. 文件描述符轮询:do_pollfd
do_pollfd 负责对单个文件描述符进行事件查询:
c
/*
 * Simplified excerpt of do_pollfd (full version in the source section
 * below): query one file descriptor for events matching pollfd->events.
 */
static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait,
bool *can_busy_poll, __poll_t busy_flag)
{
int fd = pollfd->fd;
__poll_t mask = 0, filter;
struct fd f;
if (fd < 0)
goto out; // negative fd: ignored by convention, revents = 0
mask = EPOLLNVAL; // reported if the fd is not an open file
f = fdget(fd);
if (!f.file)
goto out;
filter = demangle_poll(pollfd->events) | EPOLLERR | EPOLLHUP;
pwait->_key = filter | busy_flag;
mask = vfs_poll(f.file, pwait); // dispatch to the file's ->poll implementation
if (mask & busy_flag)
*can_busy_poll = true;
mask &= filter; // drop events the caller did not ask for
fdput(f);
out:
pollfd->revents = mangle_poll(mask); // convert back to the userspace POLL* format
return mask;
}
关键步骤:
- 通过 fdget 获取 struct fd(包含 struct file *)。
- 将用户态的 events 转换为内核事件掩码(demangle_poll)。
- 设置 poll_table 的 _key,指示文件系统 poll 函数需要关注的事件。
- 调用 vfs_poll,它会回调具体文件(如 socket、pipe 等)的 poll 实现,该实现通常将当前进程加入等待队列,并返回当前就绪的事件。
- 结果经过掩码过滤后,通过 mangle_poll 转换回用户态事件格式(如 POLLIN 对应 0x0001),存入 revents。
6. 返回值与错误传递
当 poll 系统调用返回时,do_sys_poll 返回就绪的文件描述符数量(或错误码)。该值通过系统调用路径传递回用户态:
- 在 entry_SYSCALL_64 中,do_syscall_64 的返回值保存在 rax 寄存器中。
- 返回用户态后,glibc 的 poll 包装函数将其作为返回值。
- JNI 层捕获此值,若 rv >= 0,则直接返回 pfd.revents(对于单文件描述符的 poll,就绪数量为 0 或 1,revents 已包含事件)。
- 若为负错误码,则通过 handleSocketError 转换为 Java 异常。
7. 总结
通过以上源码分析,我们完整追踪了一次 Java NIO poll 调用的生命周期:
- Java 层调用 native 方法 Net.poll,JNI 函数构造 pollfd 并执行系统调用。
- 用户态通过 syscall 指令进入内核,entry_SYSCALL_64 保存上下文并调用 do_syscall_64。
- 内核 do_sys_poll 复制用户数据,初始化等待队列,进入 do_poll 循环。
- do_poll 遍历所有文件描述符,通过 do_pollfd 调用具体文件的 poll 方法,必要时使进程睡眠。
- 事件就绪或超时后,结果通过 revents 写回用户空间,逐层返回至 Java 层。
这一过程体现了操作系统设计的精妙:从高级语言到底层硬件,每一层都各司其职,通过清晰的接口协作。理解这些底层细节,有助于我们编写更高效的 Java NIO 程序,并在性能调优时做出明智的决策。
#源码
/*
 * JNI implementation of sun.nio.ch.Net.poll: polls a single file
 * descriptor for the given event mask with a millisecond timeout.
 * Returns the ready-event mask, 0 on signal interruption, or
 * IOS_THROWN after raising a Java exception for other errors.
 */
JNIEXPORT jint JNICALL
Java_sun_nio_ch_Net_poll(JNIEnv* env, jclass this, jobject fdo, jint events, jlong timeout)
{
struct pollfd pfd;
int rv;
pfd.fd = fdval(env, fdo); // integer fd extracted from the Java FileDescriptor
pfd.events = events;
if (timeout < -1) {
timeout = -1; // anything below -1 means wait indefinitely
} else if (timeout > INT_MAX) {
timeout = INT_MAX; // clamp: poll(2) takes an int timeout
}
rv = poll(&pfd, 1, (int)timeout);
if (rv >= 0) {
return pfd.revents; // ready-event mask (0 if the call timed out)
} else if (errno == EINTR) {
// interrupted, no events to return
return 0;
} else {
handleSocketError(env, errno); // maps errno to a Java exception
return IOS_THROWN;
}
}
7 common poll sys_poll
/*
 * Expansion core of the SYSCALL_DEFINEn() macros. For sys_poll this
 * emits:
 *  - the x86-64 and ia32 entry stubs,
 *  - __se_sys##name, which casts/sign-extends the raw register-sized
 *    arguments before delegating, and
 *  - the prototype of __do_sys##name, the actual C body supplied by
 *    the SYSCALL_DEFINEx user.
 * NOTE(review): the trailing backslashes splice all lines into one
 * macro definition, so no comments may be placed inside the #define
 * itself.
 */
#define __SYSCALL_DEFINEx(x, name, ...) \
static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \
static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\
__X64_SYS_STUBx(x, name, __VA_ARGS__) \
__IA32_SYS_STUBx(x, name, __VA_ARGS__) \
static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \
{ \
long ret = __do_sys##name(__MAP(x,__SC_CAST,__VA_ARGS__));\
__MAP(x,__SC_TEST,__VA_ARGS__); \
__PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__)); \
return ret; \
} \
static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))
/*
 * Program the MSRs that control the SYSCALL/SYSENTER fast entry paths
 * on this CPU.
 */
/* May not be marked __init: used by software suspend */
void syscall_init(void)
{
/* MSR_STAR: CS/SS selector bases for syscall entry and sysret exit */
wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
/* MSR_LSTAR: target of the 64-bit SYSCALL instruction */
wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
if (ia32_enabled()) {
wrmsrl_cstar((unsigned long)entry_SYSCALL_compat);
/*
 * This only works on Intel CPUs.
 * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
 * This does not cause SYSENTER to jump to the wrong location, because
 * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
 */
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
wrmsrl_safe(MSR_IA32_SYSENTER_ESP,
(unsigned long)(cpu_entry_stack(smp_processor_id()) + 1));
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
} else {
/* no 32-bit compat: route SYSCALL32/SYSENTER to ignore/invalid targets */
wrmsrl_cstar((unsigned long)entry_SYSCALL32_ignore);
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
}
/*
 * Flags to clear on syscall; clear as much as possible
 * to minimize user space-kernel interference.
 */
wrmsrl(MSR_SYSCALL_MASK,
X86_EFLAGS_CF|X86_EFLAGS_PF|X86_EFLAGS_AF|
X86_EFLAGS_ZF|X86_EFLAGS_SF|X86_EFLAGS_TF|
X86_EFLAGS_IF|X86_EFLAGS_DF|X86_EFLAGS_OF|
X86_EFLAGS_IOPL|X86_EFLAGS_NT|X86_EFLAGS_RF|
X86_EFLAGS_AC|X86_EFLAGS_ID);
}
/*
 * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
 *
 * This is the only entry point used for 64-bit system calls. The
 * hardware interface is reasonably well designed and the register to
 * argument mapping Linux uses fits well with the registers that are
 * available when SYSCALL is used.
 *
 * SYSCALL instructions can be found inlined in libc implementations as
 * well as some other programs and libraries. There are also a handful
 * of SYSCALL instructions in the vDSO used, for example, as a
 * clock_gettimeofday fallback.
 *
 * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
 * then loads new ss, cs, and rip from previously programmed MSRs.
 * rflags gets masked by a value from another MSR (so CLD and CLAC
 * are not needed). SYSCALL does not save anything on the stack
 * and does not change rsp.
 *
 * Registers on entry:
 * rax system call number
 * rcx return address
 * r11 saved rflags (note: r11 is callee-clobbered register in C ABI)
 * rdi arg0
 * rsi arg1
 * rdx arg2
 * r10 arg3 (needs to be moved to rcx to conform to C ABI)
 * r8 arg4
 * r9 arg5
 * (note: r12-r15, rbp, rbx are callee-preserved in C ABI)
 *
 * Only called from user space.
 *
 * When user can change pt_regs->foo always force IRET. That is because
 * it deals with uncanonical addresses better. SYSRET has trouble
 * with them due to bugs in both AMD and Intel CPUs.
 */
SYM_CODE_START(entry_SYSCALL_64)
UNWIND_HINT_ENTRY
ENDBR
swapgs /* switch GSBASE to the kernel per-CPU base */
/* tss.sp2 is scratch space. */
movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp /* PTI: switch to the kernel page tables */
movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp /* switch to the task's kernel stack */
SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
/* Construct struct pt_regs on stack */
pushq $__USER_DS /* pt_regs->ss */
pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2) /* pt_regs->sp */
pushq %r11 /* pt_regs->flags */
pushq $__USER_CS /* pt_regs->cs */
pushq %rcx /* pt_regs->ip */
SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
pushq %rax /* pt_regs->orig_ax */
PUSH_AND_CLEAR_REGS rax=$-ENOSYS
/* IRQs are off. */
movq %rsp, %rdi /* 1st C argument: struct pt_regs * */
/* Sign extend the lower 32bit as syscall numbers are treated as int */
movslq %eax, %rsi /* 2nd C argument: syscall number */
/* clobbers %rax, make sure it is after saving the syscall nr */
IBRS_ENTER
UNTRAIN_RET
CLEAR_BRANCH_HISTORY
call do_syscall_64 /* returns with IRQs disabled */
/*
 * Try to use SYSRET instead of IRET if we're returning to
 * a completely clean 64-bit userspace context. If we're not,
 * go to the slow exit path.
 * In the Xen PV case we must use iret anyway.
 */
ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode", \
"jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
/*
 * We win! This label is here just for ease of understanding
 * perf profiles. Nothing jumps here.
 */
syscall_return_via_sysret:
IBRS_EXIT
POP_REGS pop_rdi=0
/*
 * Now all regs are restored except RSP and RDI.
 * Save old stack pointer and switch to trampoline stack.
 */
movq %rsp, %rdi
movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
UNWIND_HINT_END_OF_STACK
pushq RSP-RDI(%rdi) /* RSP */
pushq (%rdi) /* RDI */
/*
 * We are on the trampoline stack. All regs except RDI are live.
 * We can do future final exit work right here.
 */
STACKLEAK_ERASE_NOCLOBBER
SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
popq %rdi
popq %rsp
SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
swapgs /* restore the user GSBASE */
CLEAR_CPU_BUFFERS
sysretq
SYM_INNER_LABEL(entry_SYSRETQ_end, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
int3
SYM_CODE_END(entry_SYSCALL_64)
/*
 * Core of the poll(2) syscall: copies the user pollfd array into the
 * kernel (chunked into a poll_list), runs do_poll(), then writes each
 * entry's revents back to user space. Returns the number of ready
 * descriptors or a negative errno.
 */
static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
struct timespec64 *end_time)
{
struct poll_wqueues table;
int err = -EFAULT, fdcount, len;
/* Allocate small arguments on the stack to save memory and be
faster - use long to make sure the buffer is aligned properly
on 64 bit archs to avoid unaligned access */
long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
struct poll_list *const head = (struct poll_list *)stack_pps;
struct poll_list *walk = head;
unsigned long todo = nfds;
if (nfds > rlimit(RLIMIT_NOFILE)) /* enforce the per-process fd limit */
return -EINVAL;
len = min_t(unsigned int, nfds, N_STACK_PPS);
/* copy the user pollfd array in, one poll_list chunk at a time */
for (;;) {
walk->next = NULL;
walk->len = len;
if (!len)
break;
if (copy_from_user(walk->entries, ufds + nfds-todo,
sizeof(struct pollfd) * walk->len))
goto out_fds;
todo -= walk->len;
if (!todo)
break;
len = min(todo, POLLFD_PER_PAGE);
walk = walk->next = kmalloc(struct_size(walk, entries, len),
GFP_KERNEL);
if (!walk) {
err = -ENOMEM;
goto out_fds;
}
}
poll_initwait(&table); /* set up poll_wqueues and its poll_table callback */
fdcount = do_poll(head, &table, end_time);
poll_freewait(&table); /* detach from all wait queues */
/* write each entry's revents back to user space */
if (!user_write_access_begin(ufds, nfds * sizeof(*ufds)))
goto out_fds;
for (walk = head; walk; walk = walk->next) {
struct pollfd *fds = walk->entries;
int j;
for (j = walk->len; j; fds++, ufds++, j--)
unsafe_put_user(fds->revents, &ufds->revents, Efault);
}
user_write_access_end();
err = fdcount;
out_fds:
/* free the kmalloc'd chunks; head itself is on the stack */
walk = head->next;
while (walk) {
struct poll_list *pos = walk;
walk = walk->next;
kfree(pos);
}
return err;
Efault:
user_write_access_end();
err = -EFAULT;
goto out_fds;
}
/*
 * Poll loop: scans every pollfd via do_pollfd(); if nothing is ready,
 * sleeps in poll_schedule_timeout() until a wakeup, a signal, or the
 * timeout. Returns the count of ready descriptors, or a negative
 * errno (e.g. -ERESTARTNOHAND when a signal is pending).
 */
static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
struct timespec64 *end_time)
{
poll_table* pt = &wait->pt;
ktime_t expire, *to = NULL;
int timed_out = 0, count = 0;
u64 slack = 0;
__poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
unsigned long busy_start = 0;
/* Optimise the no-wait case */
if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
pt->_qproc = NULL; /* NULL _qproc: do_pollfd won't register any waiter */
timed_out = 1;
}
if (end_time && !timed_out)
slack = select_estimate_accuracy(end_time);
for (;;) {
struct poll_list *walk;
bool can_busy_loop = false;
for (walk = list; walk != NULL; walk = walk->next) {
struct pollfd * pfd, * pfd_end;
pfd = walk->entries;
pfd_end = pfd + walk->len;
for (; pfd != pfd_end; pfd++) {
/*
 * Fish for events. If we found one, record it
 * and kill poll_table->_qproc, so we don't
 * needlessly register any other waiters after
 * this. They'll get immediately deregistered
 * when we break out and return.
 */
if (do_pollfd(pfd, pt, &can_busy_loop,
busy_flag)) {
count++;
pt->_qproc = NULL;
/* found something, stop busy polling */
busy_flag = 0;
can_busy_loop = false;
}
}
}
/*
 * All waiters have already been registered, so don't provide
 * a poll_table->_qproc to them on the next loop iteration.
 */
pt->_qproc = NULL;
if (!count) {
count = wait->error;
if (signal_pending(current))
count = -ERESTARTNOHAND;
}
if (count || timed_out)
break;
/* only if found POLL_BUSY_LOOP sockets && not out of time */
if (can_busy_loop && !need_resched()) {
if (!busy_start) {
busy_start = busy_loop_current_time();
continue;
}
if (!busy_loop_timeout(busy_start))
continue;
}
busy_flag = 0;
/*
 * If this is the first loop and we have a timeout
 * given, then we convert to ktime_t and set the to
 * pointer to the expiry value.
 */
if (end_time && !to) {
expire = timespec64_to_ktime(*end_time);
to = &expire;
}
/* sleep until woken by a device, a signal, or the timeout */
if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
timed_out = 1;
}
return count;
}
/*
 * Fish for pollable events on the pollfd->fd file descriptor. We're only
 * interested in events matching the pollfd->events mask, and the result
 * matching that mask is both recorded in pollfd->revents and returned. The
 * pwait poll_table will be used by the fd-provided poll handler for waiting,
 * if pwait->_qproc is non-NULL.
 */
static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait,
bool *can_busy_poll,
__poll_t busy_flag)
{
int fd = pollfd->fd;
__poll_t mask = 0, filter;
struct fd f;
if (fd < 0)
goto out; /* negative fds are ignored by convention: revents = 0 */
mask = EPOLLNVAL; /* reported if fd does not name an open file */
f = fdget(fd);
if (!f.file)
goto out;
/* userland u16 ->events contains POLL... bitmap */
filter = demangle_poll(pollfd->events) | EPOLLERR | EPOLLHUP;
pwait->_key = filter | busy_flag; /* tell ->poll which events we want */
mask = vfs_poll(f.file, pwait); /* dispatch to the file's ->poll method */
if (mask & busy_flag)
*can_busy_poll = true;
mask &= filter; /* Mask out unneeded events. */
fdput(f);
out:
/* ... and so does ->revents */
pollfd->revents = mangle_poll(mask);
return mask;
}