从 Java NIO poll 到 Linux 内核 poll:一次系统调用的完整旅程
Java NIO 包中的 Selector 是高性能 I/O 复用的基石,其底层实现依赖于操作系统的多路复用机制,如 Linux 的 poll、epoll 等。本文将跟随一段从 Java 到内核的代码路径,详细剖析一次 poll 系统调用的完整流程。我们将使用 OpenJDK 和 Linux 内核的源码片段作为证据,逐层解析其中的关键逻辑。
1. Java 层:JNI 封装 poll
Java 的 sun.nio.ch.Net.poll 是一个 native 方法,用于在单个文件描述符上执行 poll 操作(通常用于 Selector 的实现中)。对应的 JNI 函数如下:
c
/*
 * JNI entry point for sun.nio.ch.Net.poll: polls a single file
 * descriptor for the requested events with a millisecond timeout.
 * Returns the ready-event mask, 0 when interrupted by a signal,
 * or IOS_THROWN after raising a Java exception for other errors.
 */
JNIEXPORT jint JNICALL
Java_sun_nio_ch_Net_poll(JNIEnv* env, jclass this, jobject fdo, jint events, jlong timeout)
{
struct pollfd pfd;
int rv;
pfd.fd = fdval(env, fdo); // extract the integer fd from the Java FileDescriptor
pfd.events = events; // events of interest (POLLIN/POLLOUT/...)
if (timeout < -1) {
timeout = -1; // any value below -1 means "wait indefinitely"
} else if (timeout > INT_MAX) {
timeout = INT_MAX; // clamp: poll(2) takes an int timeout
}
rv = poll(&pfd, 1, (int)timeout); // issue the poll system call
if (rv >= 0) {
return pfd.revents; // ready-event mask (0 if nothing became ready)
} else if (errno == EINTR) {
return 0; // interrupted by a signal: report no events
} else {
handleSocketError(env, errno); // any other errno becomes a Java exception
return IOS_THROWN;
}
}
关键点:
- 通过 fdval 从 Java 的 FileDescriptor 对象中提取整型文件描述符。
- 构造 struct pollfd,设置 fd 和 events。
- 处理 timeout 参数,确保其合法(小于 -1 时转为 -1 表示无限等待;大于 INT_MAX 时截断)。
- 直接调用 poll 系统调用(注意 glibc 的 poll 是对内核 poll 的封装)。
- 返回值处理:成功时返回 revents;若被信号中断则返回 0;否则抛出套接字错误。
这一层充当了 Java 世界与内核世界的桥梁。
2. 用户态到内核态:系统调用入口
在 x86-64 架构上,应用程序通过 syscall 指令陷入内核。Linux 内核在初始化时通过 syscall_init 设置了相关的 MSR 寄存器:
c
/*
 * Simplified excerpt: programs the MSRs that steer the SYSCALL
 * instruction (the full version appears in the source section below).
 */
void syscall_init(void)
{
wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); // SYSCALL jumps to entry_SYSCALL_64
// ... 32-bit compat-mode setup elided
wrmsrl(MSR_SYSCALL_MASK, ...); // RFLAGS bits cleared on kernel entry
}
MSR_LSTAR 指定了 syscall 指令进入后的入口地址,即 entry_SYSCALL_64。该汇编入口保存用户态寄存器、切换到内核栈、调用 C 函数 do_syscall_64。
entry_SYSCALL_64 的关键步骤(简化):
- 保存用户栈指针,切换到内核栈。
- 在栈上构造 struct pt_regs,包含所有寄存器值。
- 调用 do_syscall_64,参数为 pt_regs 指针和系统调用号。
- 返回后尝试使用 sysret 快速返回用户态。
do_syscall_64 根据系统调用号查找对应的内核函数。对于 poll,其系统调用号 __NR_poll 在 x86-64 上定义为 7(对应系统调用表 syscall_64.tbl 中的 "7 common poll sys_poll" 一行,见下文源码)。内核中通过 SYSCALL_DEFINE 宏定义系统调用函数,poll 的定义展开后最终调用 do_sys_poll。
3. 内核 poll 实现:do_sys_poll
do_sys_poll 是 poll 系统调用的核心实现,负责从用户空间复制数据、执行轮询并将结果写回。
c
/*
 * Simplified excerpt of do_sys_poll (the full version appears in the
 * source section below; the poll_wqueues declaration and the out_fds
 * cleanup label are elided here).
 */
static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
struct timespec64 *end_time)
{
// small argument lists live on the stack to avoid kmalloc traffic
long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
struct poll_list *const head = (struct poll_list *)stack_pps;
struct poll_list *walk = head;
unsigned long todo = nfds;
int len, err, fdcount;
if (nfds > rlimit(RLIMIT_NOFILE)) // enforce the per-process open-file limit
return -EINVAL;
len = min_t(unsigned int, nfds, N_STACK_PPS);
// copy the user-space pollfd array into the kernel, chunk by chunk
for (;;) {
walk->next = NULL;
walk->len = len;
if (!len)
break;
if (copy_from_user(walk->entries, ufds + nfds - todo,
sizeof(struct pollfd) * walk->len))
goto out_fds; // copy from user space failed
todo -= walk->len;
if (!todo)
break;
len = min(todo, POLLFD_PER_PAGE);
walk = walk->next = kmalloc(struct_size(walk, entries, len), GFP_KERNEL);
if (!walk) {
err = -ENOMEM;
goto out_fds;
}
}
poll_initwait(&table); // set up poll_wqueues and its poll_table callback
fdcount = do_poll(head, &table, end_time); // run the polling loop
poll_freewait(&table); // detach from all wait queues
// write the results (revents) back to user space
if (!user_write_access_begin(ufds, nfds * sizeof(*ufds)))
goto out_fds;
for (walk = head; walk; walk = walk->next) {
struct pollfd *fds = walk->entries;
int j;
for (j = walk->len; j; fds++, ufds++, j--)
unsafe_put_user(fds->revents, &ufds->revents, Efault);
}
user_write_access_end();
err = fdcount;
// ... free the kmalloc'd poll_list chunks (see the full source below)
}
关键设计:
- 分层复制:使用 poll_list 链表将用户数据分批复制,小量数据使用栈上内存(stack_pps),大量数据使用 kmalloc 按页分配。这既减少了内存分配开销,又避免了栈溢出。
- poll_initwait:初始化 poll_wqueues 结构,其中包含 poll_table 回调函数。poll_table 用于在轮询时将进程添加到文件描述符的等待队列。
- do_poll:实际轮询逻辑,返回就绪的文件描述符数量。
- 写回结果:使用 unsafe_put_user 将每个文件描述符的 revents 写回用户空间。
4. 轮询核心:do_poll
do_poll 函数实现了一个可能超时或可中断的循环,遍历所有 poll_list 中的文件描述符,调用 do_pollfd 检查事件。
c
/*
 * Simplified excerpt of do_poll (full version in the source section
 * below): scan every pollfd, and if nothing is ready, sleep until an
 * event, a signal, or the timeout.
 */
static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
struct timespec64 *end_time)
{
poll_table* pt = &wait->pt;
ktime_t expire, *to = NULL;
int timed_out = 0, count = 0;
u64 slack = 0;
__poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
unsigned long busy_start = 0;
// zero timeout: poll once without registering on any wait queue
if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
pt->_qproc = NULL;
timed_out = 1;
}
if (end_time && !timed_out)
slack = select_estimate_accuracy(end_time);
for (;;) {
struct poll_list *walk;
bool can_busy_loop = false;
// walk every pollfd in every chunk of the list
for (walk = list; walk != NULL; walk = walk->next) {
struct pollfd *pfd, *pfd_end;
pfd = walk->entries;
pfd_end = pfd + walk->len;
for (; pfd != pfd_end; pfd++) {
if (do_pollfd(pfd, pt, &can_busy_loop, busy_flag)) {
count++;
pt->_qproc = NULL; // event found: no need to register further waiters
busy_flag = 0; // stop busy polling
can_busy_loop = false;
}
}
}
pt->_qproc = NULL; // all waiters registered; don't re-register next iteration
if (!count) {
count = wait->error;
if (signal_pending(current))
count = -ERESTARTNOHAND;
}
if (count || timed_out)
break;
// busy polling: spin for low-latency network sockets instead of sleeping
if (can_busy_loop && !need_resched()) {
if (!busy_start) {
busy_start = busy_loop_current_time();
continue;
}
if (!busy_loop_timeout(busy_start))
continue;
}
busy_flag = 0;
// first pass with a timeout: convert it to a ktime_t expiry
if (end_time && !to) {
expire = timespec64_to_ktime(*end_time);
to = &expire;
}
// sleep until woken by an event, a signal, or the timeout
if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
timed_out = 1;
}
return count;
}
核心机制:
- 事件检测:对每个 pollfd 调用 do_pollfd,若发现事件则计数,并将 pt->_qproc 置空以禁止后续再注册等待队列。
- 等待与超时:若没有事件且未超时,调用 poll_schedule_timeout 让进程睡眠,等待设备唤醒或超时到达。
- busy polling:针对低延迟场景,可以在一定时间内忙轮询而非睡眠,代码中的 busy_flag 和 can_busy_loop 控制这一行为。
5. 文件描述符轮询:do_pollfd
do_pollfd 负责对单个文件描述符进行事件查询:
c
/*
 * Simplified excerpt of do_pollfd (full version in the source section
 * below): query one file descriptor for events matching pollfd->events.
 */
static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait,
bool *can_busy_poll, __poll_t busy_flag)
{
int fd = pollfd->fd;
__poll_t mask = 0, filter;
struct fd f;
if (fd < 0)
goto out; // negative fd: ignored by convention, revents = 0
mask = EPOLLNVAL; // reported if the fd is not an open file
f = fdget(fd);
if (!f.file)
goto out;
filter = demangle_poll(pollfd->events) | EPOLLERR | EPOLLHUP;
pwait->_key = filter | busy_flag;
mask = vfs_poll(f.file, pwait); // dispatch to the file's ->poll implementation
if (mask & busy_flag)
*can_busy_poll = true;
mask &= filter; // drop events the caller did not ask for
fdput(f);
out:
pollfd->revents = mangle_poll(mask); // convert back to the userspace POLL* format
return mask;
}
关键步骤:
- 通过 fdget 获取 struct fd(包含 struct file *)。
- 将用户态的 events 转换为内核事件掩码(demangle_poll)。
- 设置 poll_table 的 _key,指示文件系统 poll 函数需要关注的事件。
- 调用 vfs_poll,它会回调具体文件(如 socket、pipe 等)的 poll 实现,该实现通常将当前进程加入等待队列,并返回当前就绪的事件。
- 结果经过掩码过滤后,通过 mangle_poll 转换回用户态事件格式(如 POLLIN 对应 0x0001),存入 revents。
6. 返回值与错误传递
当 poll 系统调用返回时,do_sys_poll 返回就绪的文件描述符数量(或错误码)。该值通过系统调用路径传递回用户态:
- 在 entry_SYSCALL_64 中,do_syscall_64 的返回值保存在 rax 寄存器中。
- 返回用户态后,glibc 的 poll 包装函数将其作为返回值。
- JNI 层捕获此值,若 rv >= 0,则直接返回 pfd.revents(对于单文件描述符的 poll,就绪数量为 0 或 1,revents 已包含事件)。
- 若为负错误码,则通过 handleSocketError 转换为 Java 异常。
7. 总结
通过以上源码分析,我们完整追踪了一次 Java NIO poll 调用的生命周期:
- Java 层调用 native 方法 Net.poll,JNI 函数构造 pollfd 并执行系统调用。
- 用户态通过 syscall 指令进入内核,entry_SYSCALL_64 保存上下文并调用 do_syscall_64。
- 内核 do_sys_poll 复制用户数据,初始化等待队列,进入 do_poll 循环。
- do_poll 遍历所有文件描述符,通过 do_pollfd 调用具体文件的 poll 方法,必要时使进程睡眠。
- 事件就绪或超时后,结果通过 revents 写回用户空间,逐层返回至 Java 层。
这一过程体现了操作系统设计的精妙:从高级语言到底层硬件,每一层都各司其职,通过清晰的接口协作。理解这些底层细节,有助于我们编写更高效的 Java NIO 程序,并在性能调优时做出明智的决策。
#源码
/*
 * JNI implementation of sun.nio.ch.Net.poll: polls a single file
 * descriptor for the given event mask with a millisecond timeout.
 * Returns the ready-event mask, 0 on signal interruption, or
 * IOS_THROWN after raising a Java exception for other errors.
 */
JNIEXPORT jint JNICALL
Java_sun_nio_ch_Net_poll(JNIEnv* env, jclass this, jobject fdo, jint events, jlong timeout)
{
struct pollfd pfd;
int rv;
pfd.fd = fdval(env, fdo); // integer fd extracted from the Java FileDescriptor
pfd.events = events;
if (timeout < -1) {
timeout = -1; // anything below -1 means wait indefinitely
} else if (timeout > INT_MAX) {
timeout = INT_MAX; // clamp: poll(2) takes an int timeout
}
rv = poll(&pfd, 1, (int)timeout);
if (rv >= 0) {
return pfd.revents; // ready-event mask (0 if the call timed out)
} else if (errno == EINTR) {
// interrupted, no events to return
return 0;
} else {
handleSocketError(env, errno); // maps errno to a Java exception
return IOS_THROWN;
}
}
7 common poll sys_poll
/*
 * Expansion core of the SYSCALL_DEFINEn() macros. For sys_poll this
 * emits:
 *  - the x86-64 and ia32 entry stubs,
 *  - __se_sys##name, which casts/sign-extends the raw register-sized
 *    arguments before delegating, and
 *  - the prototype of __do_sys##name, the actual C body supplied by
 *    the SYSCALL_DEFINEx user.
 * NOTE(review): the trailing backslashes splice all lines into one
 * macro definition, so no comments may be placed inside the #define
 * itself.
 */
#define __SYSCALL_DEFINEx(x, name, ...) \
static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \
static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\
__X64_SYS_STUBx(x, name, __VA_ARGS__) \
__IA32_SYS_STUBx(x, name, __VA_ARGS__) \
static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \
{ \
long ret = __do_sys##name(__MAP(x,__SC_CAST,__VA_ARGS__));\
__MAP(x,__SC_TEST,__VA_ARGS__); \
__PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__)); \
return ret; \
} \
static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))
/*
 * Program the MSRs that control the SYSCALL/SYSENTER fast entry paths
 * on this CPU.
 */
/* May not be marked __init: used by software suspend */
void syscall_init(void)
{
/* MSR_STAR: CS/SS selector bases for syscall entry and sysret exit */
wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
/* MSR_LSTAR: target of the 64-bit SYSCALL instruction */
wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
if (ia32_enabled()) {
wrmsrl_cstar((unsigned long)entry_SYSCALL_compat);
/*
 * This only works on Intel CPUs.
 * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
 * This does not cause SYSENTER to jump to the wrong location, because
 * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
 */
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
wrmsrl_safe(MSR_IA32_SYSENTER_ESP,
(unsigned long)(cpu_entry_stack(smp_processor_id()) + 1));
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
} else {
/* no 32-bit compat: route SYSCALL32/SYSENTER to ignore/invalid targets */
wrmsrl_cstar((unsigned long)entry_SYSCALL32_ignore);
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
}
/*
 * Flags to clear on syscall; clear as much as possible
 * to minimize user space-kernel interference.
 */
wrmsrl(MSR_SYSCALL_MASK,
X86_EFLAGS_CF|X86_EFLAGS_PF|X86_EFLAGS_AF|
X86_EFLAGS_ZF|X86_EFLAGS_SF|X86_EFLAGS_TF|
X86_EFLAGS_IF|X86_EFLAGS_DF|X86_EFLAGS_OF|
X86_EFLAGS_IOPL|X86_EFLAGS_NT|X86_EFLAGS_RF|
X86_EFLAGS_AC|X86_EFLAGS_ID);
}
/*
 * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
 *
 * This is the only entry point used for 64-bit system calls. The
 * hardware interface is reasonably well designed and the register to
 * argument mapping Linux uses fits well with the registers that are
 * available when SYSCALL is used.
 *
 * SYSCALL instructions can be found inlined in libc implementations as
 * well as some other programs and libraries. There are also a handful
 * of SYSCALL instructions in the vDSO used, for example, as a
 * clock_gettimeofday fallback.
 *
 * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
 * then loads new ss, cs, and rip from previously programmed MSRs.
 * rflags gets masked by a value from another MSR (so CLD and CLAC
 * are not needed). SYSCALL does not save anything on the stack
 * and does not change rsp.
 *
 * Registers on entry:
 * rax system call number
 * rcx return address
 * r11 saved rflags (note: r11 is callee-clobbered register in C ABI)
 * rdi arg0
 * rsi arg1
 * rdx arg2
 * r10 arg3 (needs to be moved to rcx to conform to C ABI)
 * r8 arg4
 * r9 arg5
 * (note: r12-r15, rbp, rbx are callee-preserved in C ABI)
 *
 * Only called from user space.
 *
 * When user can change pt_regs->foo always force IRET. That is because
 * it deals with uncanonical addresses better. SYSRET has trouble
 * with them due to bugs in both AMD and Intel CPUs.
 */
SYM_CODE_START(entry_SYSCALL_64)
UNWIND_HINT_ENTRY
ENDBR
swapgs /* switch GSBASE to the kernel per-CPU base */
/* tss.sp2 is scratch space. */
movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp /* PTI: switch to the kernel page tables */
movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp /* switch to the task's kernel stack */
SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
/* Construct struct pt_regs on stack */
pushq $__USER_DS /* pt_regs->ss */
pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2) /* pt_regs->sp */
pushq %r11 /* pt_regs->flags */
pushq $__USER_CS /* pt_regs->cs */
pushq %rcx /* pt_regs->ip */
SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
pushq %rax /* pt_regs->orig_ax */
PUSH_AND_CLEAR_REGS rax=$-ENOSYS
/* IRQs are off. */
movq %rsp, %rdi /* 1st C argument: struct pt_regs * */
/* Sign extend the lower 32bit as syscall numbers are treated as int */
movslq %eax, %rsi /* 2nd C argument: syscall number */
/* clobbers %rax, make sure it is after saving the syscall nr */
IBRS_ENTER
UNTRAIN_RET
CLEAR_BRANCH_HISTORY
call do_syscall_64 /* returns with IRQs disabled */
/*
 * Try to use SYSRET instead of IRET if we're returning to
 * a completely clean 64-bit userspace context. If we're not,
 * go to the slow exit path.
 * In the Xen PV case we must use iret anyway.
 */
ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode", \
"jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
/*
 * We win! This label is here just for ease of understanding
 * perf profiles. Nothing jumps here.
 */
syscall_return_via_sysret:
IBRS_EXIT
POP_REGS pop_rdi=0
/*
 * Now all regs are restored except RSP and RDI.
 * Save old stack pointer and switch to trampoline stack.
 */
movq %rsp, %rdi
movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
UNWIND_HINT_END_OF_STACK
pushq RSP-RDI(%rdi) /* RSP */
pushq (%rdi) /* RDI */
/*
 * We are on the trampoline stack. All regs except RDI are live.
 * We can do future final exit work right here.
 */
STACKLEAK_ERASE_NOCLOBBER
SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
popq %rdi
popq %rsp
SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
swapgs /* restore the user GSBASE */
CLEAR_CPU_BUFFERS
sysretq
SYM_INNER_LABEL(entry_SYSRETQ_end, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
int3
SYM_CODE_END(entry_SYSCALL_64)
/*
 * Core of the poll(2) syscall: copies the user pollfd array into the
 * kernel (chunked into a poll_list), runs do_poll(), then writes each
 * entry's revents back to user space. Returns the number of ready
 * descriptors or a negative errno.
 */
static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
struct timespec64 *end_time)
{
struct poll_wqueues table;
int err = -EFAULT, fdcount, len;
/* Allocate small arguments on the stack to save memory and be
faster - use long to make sure the buffer is aligned properly
on 64 bit archs to avoid unaligned access */
long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
struct poll_list *const head = (struct poll_list *)stack_pps;
struct poll_list *walk = head;
unsigned long todo = nfds;
if (nfds > rlimit(RLIMIT_NOFILE)) /* enforce the per-process fd limit */
return -EINVAL;
len = min_t(unsigned int, nfds, N_STACK_PPS);
/* copy the user pollfd array in, one poll_list chunk at a time */
for (;;) {
walk->next = NULL;
walk->len = len;
if (!len)
break;
if (copy_from_user(walk->entries, ufds + nfds-todo,
sizeof(struct pollfd) * walk->len))
goto out_fds;
todo -= walk->len;
if (!todo)
break;
len = min(todo, POLLFD_PER_PAGE);
walk = walk->next = kmalloc(struct_size(walk, entries, len),
GFP_KERNEL);
if (!walk) {
err = -ENOMEM;
goto out_fds;
}
}
poll_initwait(&table); /* set up poll_wqueues and its poll_table callback */
fdcount = do_poll(head, &table, end_time);
poll_freewait(&table); /* detach from all wait queues */
/* write each entry's revents back to user space */
if (!user_write_access_begin(ufds, nfds * sizeof(*ufds)))
goto out_fds;
for (walk = head; walk; walk = walk->next) {
struct pollfd *fds = walk->entries;
int j;
for (j = walk->len; j; fds++, ufds++, j--)
unsafe_put_user(fds->revents, &ufds->revents, Efault);
}
user_write_access_end();
err = fdcount;
out_fds:
/* free the kmalloc'd chunks; head itself is on the stack */
walk = head->next;
while (walk) {
struct poll_list *pos = walk;
walk = walk->next;
kfree(pos);
}
return err;
Efault:
user_write_access_end();
err = -EFAULT;
goto out_fds;
}
/*
 * Poll loop: scans every pollfd via do_pollfd(); if nothing is ready,
 * sleeps in poll_schedule_timeout() until a wakeup, a signal, or the
 * timeout. Returns the count of ready descriptors, or a negative
 * errno (e.g. -ERESTARTNOHAND when a signal is pending).
 */
static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
struct timespec64 *end_time)
{
poll_table* pt = &wait->pt;
ktime_t expire, *to = NULL;
int timed_out = 0, count = 0;
u64 slack = 0;
__poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
unsigned long busy_start = 0;
/* Optimise the no-wait case */
if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
pt->_qproc = NULL; /* NULL _qproc: do_pollfd won't register any waiter */
timed_out = 1;
}
if (end_time && !timed_out)
slack = select_estimate_accuracy(end_time);
for (;;) {
struct poll_list *walk;
bool can_busy_loop = false;
for (walk = list; walk != NULL; walk = walk->next) {
struct pollfd * pfd, * pfd_end;
pfd = walk->entries;
pfd_end = pfd + walk->len;
for (; pfd != pfd_end; pfd++) {
/*
 * Fish for events. If we found one, record it
 * and kill poll_table->_qproc, so we don't
 * needlessly register any other waiters after
 * this. They'll get immediately deregistered
 * when we break out and return.
 */
if (do_pollfd(pfd, pt, &can_busy_loop,
busy_flag)) {
count++;
pt->_qproc = NULL;
/* found something, stop busy polling */
busy_flag = 0;
can_busy_loop = false;
}
}
}
/*
 * All waiters have already been registered, so don't provide
 * a poll_table->_qproc to them on the next loop iteration.
 */
pt->_qproc = NULL;
if (!count) {
count = wait->error;
if (signal_pending(current))
count = -ERESTARTNOHAND;
}
if (count || timed_out)
break;
/* only if found POLL_BUSY_LOOP sockets && not out of time */
if (can_busy_loop && !need_resched()) {
if (!busy_start) {
busy_start = busy_loop_current_time();
continue;
}
if (!busy_loop_timeout(busy_start))
continue;
}
busy_flag = 0;
/*
 * If this is the first loop and we have a timeout
 * given, then we convert to ktime_t and set the to
 * pointer to the expiry value.
 */
if (end_time && !to) {
expire = timespec64_to_ktime(*end_time);
to = &expire;
}
/* sleep until woken by a device, a signal, or the timeout */
if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
timed_out = 1;
}
return count;
}
/*
 * Fish for pollable events on the pollfd->fd file descriptor. We're only
 * interested in events matching the pollfd->events mask, and the result
 * matching that mask is both recorded in pollfd->revents and returned. The
 * pwait poll_table will be used by the fd-provided poll handler for waiting,
 * if pwait->_qproc is non-NULL.
 */
static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait,
bool *can_busy_poll,
__poll_t busy_flag)
{
int fd = pollfd->fd;
__poll_t mask = 0, filter;
struct fd f;
if (fd < 0)
goto out; /* negative fds are ignored by convention: revents = 0 */
mask = EPOLLNVAL; /* reported if fd does not name an open file */
f = fdget(fd);
if (!f.file)
goto out;
/* userland u16 ->events contains POLL... bitmap */
filter = demangle_poll(pollfd->events) | EPOLLERR | EPOLLHUP;
pwait->_key = filter | busy_flag; /* tell ->poll which events we want */
mask = vfs_poll(f.file, pwait); /* dispatch to the file's ->poll method */
if (mask & busy_flag)
*can_busy_poll = true;
mask &= filter; /* Mask out unneeded events. */
fdput(f);
out:
/* ... and so does ->revents */
pollfd->revents = mangle_poll(mask);
return mask;
}