背景

在Linux内核中，percpu_rw_semaphore是一种读写信号量（read-write semaphore）：允许多个读者同时持有锁，但写者必须独占，即只有在没有任何读者、也没有其他写者的情况下才能拿到锁。它的名字来自实现方式——读者计数保存在per-CPU变量（read_count）中，因此读者的快速路径几乎不产生跨CPU的缓存争用；代价是写者拿锁的开销较大。percpu_rw_semaphore特别适用于读多写少、同时又需要保证数据一致性的场景。

源码分析

代码路径：linux-5.10/kernel/locking/percpu-rwsem.c

percpu_rw_semaphore结构体定义

struct percpu_rw_semaphore {
    /*
     * RCU-sync state, not a reader counter. percpu_down_write() calls
     * rcu_sync_enter() on it to make readers take the slow path.
     */
    struct rcu_sync		rss;
    /*
     * Per-CPU reader counter (declared __percpu). Readers increment it;
     * percpu_down_write() waits until all active readers have completed.
     */
    unsigned int __percpu	*read_count;
    /*
     * Wait object the writer sleeps on in percpu_down_write() until
     * readers_active_check() succeeds.
     * NOTE(review): rcuwait presumably just records the waiting task;
     * its definition is not shown here - confirm.
     */
    struct rcuwait		writer;
    /*
     * Wait queue of tasks blocked in percpu_rwsem_wait(); entries are
     * handled by percpu_rwsem_wake_function().
     */
    wait_queue_head_t	waiters;
    /*
     * Set by a writer through the trylock in percpu_down_write(). It
     * provides writer-writer exclusion and makes new readers block.
     */
    atomic_t		block;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
    /*
     * Lockdep map handed to rwsem_acquire() in percpu_down_write(); only
     * present when CONFIG_DEBUG_LOCK_ALLOC is enabled.
     */
    struct lockdep_map	dep_map;
#endif
};

流程分析

首先搞一个demo代码，模拟一下锁竞争场景。

#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/kthread.h>
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/percpu-rwsem.h>

// The single lock that both demo threads contend for.
static struct percpu_rw_semaphore test_pcpu_rwsem_lock;

// Handles of the two demo kthreads; created in prio_deliver_init() and
// stopped in prio_deliver_exit().
static struct task_struct *thread1, *thread2;

/*
 * Demo thread 1: takes the write lock and deliberately never releases it.
 *
 * @data: small integer id passed through the kthread data pointer.
 *
 * The unlock is omitted on purpose so that every later percpu_down_write()
 * on the same lock blocks, including this thread's own next loop iteration.
 * As a consequence the loop may never get back to kthread_should_stop().
 *
 * Return: always 0.
 */
static int thread_fn1(void *data)
{
    /* Cast via long: a direct pointer-to-int cast truncates on 64-bit. */
    int id = (int)(long)data;

    while (!kthread_should_stop()) {
        percpu_down_write(&test_pcpu_rwsem_lock);
        pr_err("%s:id=%d\n", __func__, id);
        /* Intentionally no msleep()/percpu_up_write(): the lock stays held. */
    }
    return 0;
}

/*
 * Demo thread 2: repeatedly takes and releases the write lock.
 *
 * @data: small integer id passed through the kthread data pointer.
 *
 * Because thread_fn1() never releases the same lock, this thread is expected
 * to block inside percpu_down_write() once thread 1 owns it.
 *
 * Return: always 0.
 */
static int thread_fn2(void *data)
{
    /* Cast via long: a direct pointer-to-int cast truncates on 64-bit. */
    int id = (int)(long)data;

    pr_err("%s: enter\n", __func__);
    while (!kthread_should_stop()) {
        percpu_down_write(&test_pcpu_rwsem_lock);
        pr_err("%s:id=%d\n", __func__, id);
        /* An msleep() could be added here to slow down the log output. */
        percpu_up_write(&test_pcpu_rwsem_lock);
    }
    return 0;
}

/*
 * Initialise the shared semaphore.
 *
 * Return: 0 on success, or the error code reported by percpu_init_rwsem()
 * (it allocates the per-CPU reader counter and can therefore fail).
 */
static int test_pcpu_rwsem_lock_init(void)
{
    return percpu_init_rwsem(&test_pcpu_rwsem_lock);
}

/*
 * Module entry point: initialise the lock and start the two demo threads.
 *
 * Return: 0 on success, a negative errno if the lock could not be
 * initialised or a thread could not be created. Everything set up so far is
 * undone on failure.
 */
static int __init prio_deliver_init(void)
{
    int ret;

    pr_err("%s: enter\n", __func__);

    ret = test_pcpu_rwsem_lock_init();
    if (ret)
        return ret;

    thread1 = kthread_run(thread_fn1, (void *)1, "thread1");
    if (IS_ERR(thread1)) {
        ret = PTR_ERR(thread1);
        goto fail_thread1;
    }

    thread2 = kthread_run(thread_fn2, (void *)2, "thread2");
    if (IS_ERR(thread2)) {
        ret = PTR_ERR(thread2);
        goto fail_thread2;
    }

    return 0;

fail_thread2:
    /*
     * NOTE(review): thread1 never unlocks, so it may already be stuck in
     * percpu_down_write() and this call may not return - confirm this is
     * acceptable for the demo.
     */
    kthread_stop(thread1);
fail_thread1:
    /* Release the per-CPU counter allocated by percpu_init_rwsem(). */
    percpu_free_rwsem(&test_pcpu_rwsem_lock);
    return ret;
}

/*
 * Module exit point: stop both demo threads, then release the semaphore's
 * resources.
 *
 * NOTE(review): with thread_fn1() never unlocking, the threads can be blocked
 * inside percpu_down_write() and kthread_stop() may not return; the lock may
 * also still be write-held when it is freed - confirm for the demo.
 */
static void __exit prio_deliver_exit(void)
{
    kthread_stop(thread2);
    kthread_stop(thread1);
    /* Pairs with percpu_init_rwsem() in test_pcpu_rwsem_lock_init(). */
    percpu_free_rwsem(&test_pcpu_rwsem_lock);
}

/* Register the module's load and unload entry points. */
module_init(prio_deliver_init);
module_exit(prio_deliver_exit);

/* License declaration for the module. */
MODULE_LICENSE("GPL");

场景分析：当首次调用thread_fn1函数中的percpu_down_write拿锁时，调用路径如下：

rcu_sync_enter(&sem->rss); //此时sem->rss.gp_state为GP_IDLE，做完一系列操作后返回。
if (!__percpu_down_write_trylock(sem)) //首次拿锁时trylock成功，条件为假，因此不会进入percpu_rwsem_wait。
执行rcuwait_wait_event(&sem->writer, readers_active_check(sem), TASK_UNINTERRUPTIBLE); //demo中没有读者，readers_active_check(sem)成立，线程1拿到写锁。

由于线程1拿到写锁后始终不释放，而线程1和线程2竞争的又是同一把锁，之后任何一次进入percpu_down_write函数（无论是线程2，还是线程1自己的下一轮循环），都会因为__percpu_down_write_trylock失败而进入percpu_rwsem_wait，并一直阻塞在那里。

percpu_rwsem_wait流程分析

/*
 * Acquire @sem for writing. May sleep (see might_sleep()).
 *
 * Steps, in order:
 *   1. annotate the acquisition for lockdep via sem->dep_map;
 *   2. rcu_sync_enter() so that readers take the slow path;
 *   3. set sem->block with the trylock, or queue up in percpu_rwsem_wait()
 *      (as a writer, reader == false) until it can be taken;
 *   4. sleep on sem->writer until readers_active_check() succeeds.
 */
void percpu_down_write(struct percpu_rw_semaphore *sem)
{
	might_sleep();
	rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);

	/* Notify readers to take the slow path. */
	rcu_sync_enter(&sem->rss);

	/*
	 * Try set sem->block; this provides writer-writer exclusion.
	 * Having sem->block set makes new readers block.
	 */
	if (!__percpu_down_write_trylock(sem))
		percpu_rwsem_wait(sem, /* .reader = */ false);

	/* smp_mb() implied by __percpu_down_write_trylock() on success -- D matches A */

	/*
	 * If they don't see our store of sem->block, then we are guaranteed to
	 * see their sem->read_count increment, and therefore will wait for
	 * them.
	 */

	/* Wait for all active readers to complete. */
	/*
	 * The writer sleeps uninterruptibly on sem->writer until
	 * readers_active_check(sem) holds, i.e. until no reader is still
	 * inside its critical section.
	 */
	rcuwait_wait_event(&sem->writer, readers_active_check(sem), TASK_UNINTERRUPTIBLE);
}

/*
 * Wake callback installed on every entry queued by percpu_rwsem_wait().
 *
 * @wq_entry:   the waiter's queue entry; WQ_FLAG_CUSTOM marks a reader.
 * @mode:       unused here.
 * @wake_flags: unused here.
 * @key:        the percpu_rw_semaphore being released.
 *
 * Takes the lock on behalf of the waiter before waking it. If that fails,
 * nothing is woken and 1 is returned.
 *
 * Return: 1 if the lock could not be taken or the woken waiter is a writer,
 * 0 if the woken waiter is a reader.
 */
static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry,
				      unsigned int mode, int wake_flags,
				      void *key)
{
	bool reader = wq_entry->flags & WQ_FLAG_CUSTOM;
	struct percpu_rw_semaphore *sem = key;
	struct task_struct *p;

	/* concurrent against percpu_down_write(), can get stolen */
	if (!__percpu_rwsem_trylock(sem, reader))
		return 1;

	/*
	 * p is the blocked task: .private was set to current by
	 * DEFINE_WAIT_FUNC() when the waiter created the entry. A reference is
	 * held across the wake-up because the waiter leaves its wait loop as
	 * soon as it sees .private == NULL.
	 */
	p = get_task_struct(wq_entry->private);
	list_del_init(&wq_entry->entry);
	smp_store_release(&wq_entry->private, NULL);

	wake_up_process(p);
	put_task_struct(p);

	return !reader; /* wake (readers until) 1 writer */
}

/*
 * Block the current task until the lock has been taken on its behalf.
 *
 * @sem:    semaphore to wait on.
 * @reader: true when waiting as a reader; percpu_down_write() passes false.
 *
 * The lock is retried once under sem->waiters.lock. On failure the task
 * queues itself on sem->waiters and sleeps until
 * percpu_rwsem_wake_function() clears wq_entry.private.
 */
static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader)
{
	/* On-stack wait entry bound to the custom wake callback. */
	DEFINE_WAIT_FUNC(wq_entry, percpu_rwsem_wake_function);
	bool wait;

	spin_lock_irq(&sem->waiters.lock);
	/*
	 * Serialize against the wakeup in percpu_up_write(), if we fail
	 * the trylock, the wakeup must see us on the list.
	 */
	wait = !__percpu_rwsem_trylock(sem, reader);
	if (wait) {
		/* reader is 0 or 1, so WQ_FLAG_CUSTOM is set only for readers. */
		wq_entry.flags |= WQ_FLAG_EXCLUSIVE | reader * WQ_FLAG_CUSTOM;
		/* Append this wait entry to the tail of sem->waiters. */
		__add_wait_queue_entry_tail(&sem->waiters, &wq_entry);
	}
	spin_unlock_irq(&sem->waiters.lock);

	/*
	 * The sleep state is set before .private is re-checked; the waker
	 * clears .private before it calls wake_up_process().
	 */
	while (wait) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (!smp_load_acquire(&wq_entry.private))
			break;
		schedule();
	}
	__set_current_state(TASK_RUNNING);
}

DEFINE_WAIT_FUNC宏定义

 /*
  * Declare a struct wait_queue_entry called @name whose wake callback is
  * @function. .private is initialised to current, i.e. the task that
  * declares the entry, and .entry is initialised as an empty list node.
  *
  * Comments must stay outside the macro body: a line without a trailing
  * backslash ends the definition.
  */
 #define DEFINE_WAIT_FUNC(name, function)					\
 	struct wait_queue_entry name = {					\
 		.private	= current,					\
 		.func		= function,					\
 		.entry		= LIST_HEAD_INIT((name).entry),			\
 	}

优先级传递示例代码(Created by AI)

首先，你需要定义一些数据结构来跟踪任务的优先级和它们持有的资源。

#include <linux/list.h>
#include <linux/sched.h>

/* A lockable resource tracked for the priority-inheritance example. */
struct resource {
    struct list_head list; // node linking this resource into its holder's resource list
    struct task_struct *holder; // task currently holding the resource; NULL when free
    int priority; // priority associated with the resource
};

/*
 * Simplified stand-in for a task, used only by this example.
 * NOTE(review): <linux/sched.h> is included above and presumably already
 * defines struct task_struct, so this sketch would clash with it in a real
 * build - rename before compiling.
 */
struct task_struct {
    // other members omitted...
    struct list_head resources; // list of resources held by the task
    int priority; // current priority of the task
};

// 接下来，定义一个函数来处理资源请求和优先级传递。

/*
 * Request @res on behalf of @task, applying priority inheritance.
 *
 * @task: the requesting task.
 * @res:  the resource being requested.
 *
 * If the resource is free it is assigned to @task and linked into the task's
 * resource list. If another task holds it, the holder's priority is raised
 * to the requester's when the requester's is higher (a larger value counts
 * as higher, as the comparison implies). The resource stays with the holder.
 * Ownership is never stolen, because re-adding res->list while it is still
 * linked in the holder's list would corrupt that list.
 *
 * There is no return value; the caller can check res->holder == task to see
 * whether the resource was granted.
 */
void request_resource(struct task_struct *task, struct resource *res) {
    struct task_struct *owner = res->holder;

    // Already held by the requester: nothing to do, and do not re-link the node.
    if (owner == task)
        return;

    if (owner != NULL) {
        // Held by another task: boost the holder, then leave it in place.
        if (owner->priority < task->priority)
            owner->priority = task->priority;
        return;
    }

    // Free resource: hand it to the requester and record it in its list.
    res->holder = task;
    list_add(&res->list, &task->resources);
}
当资源被释放时，需要恢复持有者的优先级。

void release_resource(struct resource *res) {
    // 从任务的资源列表中移除资源
    list_del(&res->list);
    // 设置资源的持有者为NULL
    res->holder = NULL;

    // 如果资源列表为空，恢复任务的原始优先级
    if (list_empty(&res->holder->resources)) {
        res->holder->priority = original_priority; // original_priority是任务的原始优先级
    }
}
请注意，这个示例非常简化，并没有包含实际内核代码中的复杂性，例如锁的使用、任务的调度和唤醒机制、以及如何处理多个任务同时请求同一资源的情况等。

linux内核percpu_rw_semaphore结构体分析

背景