The write() path from user space:
In user space, a write is issued through the write(2) system call.
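A minimal user-space sketch that exercises this path (plain POSIX, nothing OCFS2-specific; the file name is arbitrary):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char msg[] = "hello ocfs2\n";
	int fd = open("/tmp/demo.txt", O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* write(2) traps into the kernel and lands in SYSCALL_DEFINE3(write, ...) */
	ssize_t n = write(fd, msg, strlen(msg));
	if (n < 0)
		perror("write");
	close(fd);
	return 0;
}

In the kernel, the call enters at: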
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
		size_t, count)
{
	return ksys_write(fd, buf, count);
}
ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
{
	struct fd f = fdget_pos(fd);
	ssize_t ret = -EBADF;

	if (f.file) {
		loff_t pos, *ppos = file_ppos(f.file);
		if (ppos) {
			pos = *ppos;	// snapshot f_pos; ppos is NULL for stream-like files
			ppos = &pos;
		}
		ret = vfs_write(f.file, buf, count, ppos);
		if (ret >= 0 && ppos)
			f.file->f_pos = pos;	// publish the advanced file offset
		fdput_pos(f);
	}
	return ret;
}
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
	ret = rw_verify_area(WRITE, file, pos, count); // validate the request: size limits, file locks, access permissions
	file_start_write(file); // for regular files, take superblock freeze protection so the write cannot race with a filesystem freeze
	if (file->f_op->write) // the filesystem provides ->write(): call it directly
		ret = file->f_op->write(file, buf, count, pos);
	else if (file->f_op->write_iter) // the filesystem provides ->write_iter(): go through new_sync_write()
		ret = new_sync_write(file, buf, count, pos); // new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
		|-struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
		|-struct kiocb kiocb;
		|-init_sync_kiocb(&kiocb, filp);
		|-iov_iter_init(&iter, WRITE, &iov, 1, len);
		|-ret = call_write_iter(filp, &kiocb, &iter);
		|-return file->f_op->write_iter(kio, iter);
}
The OCFS2 file_operations table in the kernel, ocfs2_fops:
const struct file_operations ocfs2_fops = {
.llseek = ocfs2_file_llseek,
.mmap = ocfs2_mmap,
.fsync = ocfs2_sync_file,
.release = ocfs2_file_release,
.open = ocfs2_file_open,
.read_iter = ocfs2_file_read_iter,
.write_iter = ocfs2_file_write_iter,
.unlocked_ioctl = ocfs2_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ocfs2_compat_ioctl,
#endif
.lock = ocfs2_lock,
.flock = ocfs2_flock,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
.fallocate = ocfs2_fallocate,
.remap_file_range = ocfs2_remap_file_range,
};
The OCFS2 write path in the kernel module:
A write takes one of two I/O paths: direct I/O (direct_io) or the page cache (OCFS2_WRITE_BUFFER).
On disk, the data lands in one of two layouts: inline mode (data stored directly in the space left over in the inode block) or extent mode (data organized in a B+ tree).
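From user space the I/O path is chosen by the open(2) flags. A small sketch (the 4096 B alignment for O_DIRECT is an assumption; the actual requirement is typically the device's logical block size):

#define _GNU_SOURCE		/* for O_DIRECT */
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Buffered path: data goes through the page cache (OCFS2_WRITE_BUFFER). */
	int buffered_fd = open("buffered.dat", O_WRONLY | O_CREAT, 0644);

	/* Direct path: O_DIRECT sets IOCB_DIRECT on the kiocb and bypasses the
	 * page cache; buffer, offset and length must all be block aligned. */
	int direct_fd = open("direct.dat", O_WRONLY | O_CREAT | O_DIRECT, 0644);

	void *buf;
	if (posix_memalign(&buf, 4096, 4096))	/* assumed 4096 B alignment */
		return 1;
	memset(buf, 'x', 4096);

	if (buffered_fd >= 0)
		write(buffered_fd, buf, 4096);
	if (direct_fd >= 0)
		write(direct_fd, buf, 4096);

	free(buf);
	if (buffered_fd >= 0)
		close(buffered_fd);
	if (direct_fd >= 0)
		close(direct_fd);
	return 0;
}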
.write_iter = ocfs2_file_write_iter,
static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
|-size_t count = iov_iter_count(from);
|-struct file *file = iocb->ki_filp;
|-struct inode *inode = file_inode(file);
|-struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
|-int append_write = ((iocb->ki_pos + count) >= i_size_read(inode) ? 1 : 0); // does writing count bytes at ki_pos reach past the current i_size, i.e. extend the file?
|-int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
|-int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0;
|-ret = ocfs2_rw_lock(inode, rw_level); // take the cluster lock on the file's ip_rw_lockres
|-if (direct_io && full_coherency) { // direct I/O with full cluster coherency
|-ret = ocfs2_inode_lock(inode, NULL, 1); // take an EX lock on ip_inode_lockres
|-ocfs2_inode_unlock(inode, 1); // drop the EX lock on ip_inode_lockres
|-}
|-ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count, !nowait); // pre-write checks
|-ssize_t written = __generic_file_write_iter(iocb, from); // write the data to the file
|-if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode)) { // the data must be synced
|-ret = filemap_fdatawrite_range(file->f_mapping, iocb->ki_pos - written, iocb->ki_pos - 1);
|-if (!ret) {
|-ret = jbd2_journal_force_commit(osb->journal->j_journal); // force a jbd2 journal commit
|-}
|-if (!ret)
|-ret = filemap_fdatawait_range(file->f_mapping, iocb->ki_pos - written, iocb->ki_pos - 1);
|-}
|-ocfs2_rw_unlock(inode, rw_level); // release the cluster lock on ip_rw_lockres
|-return written;
The function that actually performs the write, where the direct and buffered paths diverge:
ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
|-struct file *file = iocb->ki_filp;
|-struct address_space *mapping = file->f_mapping;
|-struct inode *inode = mapping->host;
|-err = file_update_time(file); // update mtime and ctime
|-if (iocb->ki_flags & IOCB_DIRECT) { // direct write
|-written = generic_file_direct_write(iocb, from); // the direct-write path
|-status = generic_perform_write(file, from, pos = iocb->ki_pos); // fall back to a buffered write for whatever the direct write did not complete
|-} else {
|-written = generic_perform_write(file, from, iocb->ki_pos); // the buffered-write path
|-return written ? written : err;
The direct-write flow:
1. Flush any dirty pages in the target range and wait for the writeback to finish.
2. Invalidate the cached pages covering the region about to be written.
3. Call the filesystem's real direct-write method, direct_IO().
4. Invalidate again afterwards, so that readahead and similar activity cannot leave page-cache contents inconsistent with the disk.
ssize_t generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
|-struct file *file = iocb->ki_filp;
|-struct address_space *mapping = file->f_mapping;
|-struct inode *inode = mapping->host;
|-loff_t pos = iocb->ki_pos;
|-written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1); // flush any dirty pages in the target range and wait for the writeback to finish
|-written = invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end); // invalidate the cached pages covering the region about to be written
|-written = mapping->a_ops->direct_IO(iocb, from); // calls ocfs2_direct_IO(), the actual direct write
|-__blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, get_block, ocfs2_dio_end_io, NULL, 0);
|-return do_blockdev_direct_IO(iocb, inode, bdev, iter, get_block, end_io, submit_io, flags);
|-if (written > 0 && mapping->nrpages && invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end))
dio_warn_stale_pagecache(file); // warn when readahead or similar activity raced in and left the page cache inconsistent with the disk
|-return written;
The buffered-write flow:
1. Allocate disk space and page-cache pages.
2. Copy the data from user space into the kernel pages.
3. Finish the write and balance the dirty page cache.
ssize_t generic_perform_write(struct file *file, struct iov_iter *i, loff_t pos)
|-struct address_space *mapping = file->f_mapping;
|-const struct address_space_operations *a_ops = mapping->a_ops;
|-do {
|-status = a_ops->write_begin(file, mapping, pos, bytes, flags, &page, &fsdata); // calls ocfs2_write_begin()
|-ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_BUFFER, pagep, fsdata, di_bh, NULL); // start the buffered write
|-ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, type, di_bh);
|-copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); // copy the data from user space into the page
|-flush_dcache_page(page);
|-status = a_ops->write_end(file, mapping, pos, bytes, copied, page, fsdata); // calls ocfs2_write_end()
|-ret = ocfs2_write_end_nolock(mapping, pos, len, copied, fsdata);
|-balance_dirty_pages_ratelimited(mapping); // dirty-page balancing: keeps the total dirty page cache under the configured limit so it cannot consume too much system memory
|-} while (iov_iter_count(i));
|-return written ? written : status;
The ocfs2_aops address_space_operations table:
const struct address_space_operations ocfs2_aops = {
.readpage = ocfs2_readpage,
.readahead = ocfs2_readahead,
.writepage = ocfs2_writepage,
.write_begin = ocfs2_write_begin,
.write_end = ocfs2_write_end,
.bmap = ocfs2_bmap,
.direct_IO = ocfs2_direct_IO,
.invalidatepage = block_invalidatepage,
.releasepage = ocfs2_releasepage,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
};
Inline-mode writes:
Inline data lives in the space left in the inode block after the inode metadata. From struct ocfs2_dinode, the inline data area starts at offset 0xC0 and carries an 8 B header, so inline data = 1 block (4096 B) - 0xC0 (192 B) - 0x08 (8 B) = 3896 B. Data smaller than 3896 B is written into the inode's inline data area; anything larger forces a conversion to extent mode.
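The 8 B header comes from the on-disk inline-data structure, abridged here from fs/ocfs2/ocfs2_fs.h:

/* The id2 union of struct ocfs2_dinode begins at offset 0xC0;
 * user data starts 8 bytes into it, at id_data. */
struct ocfs2_inline_data {
	__le16	id_count;	/* max number of bytes that can be held here */
	__le16	id_reserved0;
	__le32	id_reserved1;
	__u8	id_data[];	/* start of the user data */
};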
write_begin path:
int ocfs2_write_begin_nolock(struct address_space *mapping,
			     loff_t pos, unsigned len, ocfs2_write_type_t type,
			     struct page **pagep, void **fsdata,
			     struct buffer_head *di_bh, struct page *mmap_page)
|-ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, type, di_bh);
|-if (ocfs2_supports_inline_data(osb)) {
	ret = ocfs2_try_to_write_inline_data(mapping, inode, pos, len, mmap_page, wc);
	if (ret == 1) { // a return of 1 means the write was handled entirely inline
		ret = 0;
		goto success;
	}
|-}
The inline write itself:
1. Start the journal transaction logic so the data cannot be lost.
2. Allocate a page-cache page.
ocfs2_try_to_write_inline_data:
|-if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) { // the inode holds inline data
if (mmap_page == NULL && ocfs2_size_fits_inline_data(wc->w_di_bh, end)) // the write still fits in the inline space
goto do_inline_write; // write the data inline
|-do_inline_write:
ret = ocfs2_write_begin_inline(mapping, inode, wc);
|-handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); // start a journal transaction
|-page = find_or_create_page(mapping, 0, GFP_NOFS); // get a page-cache page
|-ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE); // register the inode block with the transaction
|-ret = ocfs2_read_inline_data(inode, page, wc->w_di_bh); // read the inline data from disk into the page
|-ocfs2_commit_trans(osb, handle); // commit the journal transaction
write_end path:
Copy the data from the page cache into the inode's inline data area.
int ocfs2_write_end_nolock(struct address_space *mapping,
			   loff_t pos, unsigned len, unsigned copied, void *fsdata)
|-if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
|-ocfs2_write_end_inline(inode, pos, len, &copied, di, wc);
|-kaddr = kmap_atomic(wc->w_target_page);
|-memcpy(di->id2.i_data.id_data + pos, kaddr + pos, *copied); // copy the data from the target page into the inode's inline data area
|-kunmap_atomic(kaddr);
|-goto out_write_size;
That completes the inline-mode write path.
Extent-mode writes:
When an inode's data exceeds the 3896 B of inline space, it is stored in extent mode, which is how most files end up storing their data. Extent mode organizes the data in a B+ tree, a classic on-disk indexing structure.
Inside the struct ocfs2_dinode on-disk structure, offset 0xC0 onwards holds a struct ocfs2_extent_list followed by struct ocfs2_extent_rec entries; together they describe how the inode's data is laid out in extent mode. The extent structures look like this:
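Abridged from fs/ocfs2/ocfs2_fs.h (the field comments follow the kernel sources):

struct ocfs2_extent_rec {			/* 16 B per record */
	__le32	e_cpos;				/* offset into the file, in clusters */
	union {
		__le32	e_int_clusters;		/* interior node: clusters covered by all children */
		struct {
			__le16	e_leaf_clusters; /* leaf node: clusters covered by this extent */
			__u8	e_flags;
			__u8	e_reserved1;
		};
	};
	__le64	e_blkno;			/* physical disk offset, in blocks */
};

struct ocfs2_extent_list {			/* 16 B header */
	__le16	l_tree_depth;			/* 0 means this node is a leaf */
	__le16	l_count;			/* number of extent records */
	__le16	l_next_free_rec;		/* next unused extent slot */
	__le16	l_reserved1;
	__le64	l_reserved2;
	struct ocfs2_extent_rec l_recs[];	/* the extent records */
};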
Here l_tree_depth is the depth of the tree. A value of 0 marks a leaf node, meaning the block at e_blkno holds file data; a non-zero value marks an interior node whose records point at the next level of the tree rather than at data.
l_count is the number of extent records the list can hold: count = (1 block (4096 B) - 0xC0 (192 B) - 0x10 (16 B)) / 16 B = 3888 B / 16 B = 243, since the extent-list header is 16 B and each extent record is 16 B.
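A trivial sanity check of that arithmetic (the 4096 B block size is an assumption; the offsets come from the structures above):

#include <stdio.h>

int main(void)
{
	const int block = 4096;		/* assumed block size */
	const int dinode_hdr = 0xC0;	/* inline/extent data starts here in the dinode */
	const int list_hdr = 0x10;	/* header of struct ocfs2_extent_list */
	const int rec = 16;		/* sizeof(struct ocfs2_extent_rec) */

	printf("l_count = %d\n", (block - dinode_hdr - list_hdr) / rec);	/* prints 243 */
	printf("inline capacity = %d\n", block - dinode_hdr - 8);		/* prints 3896 */
	return 0;
}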
write_begin path:
1. Compute the number of clusters needed.
2. Lock the allocators and reserve the clusters.
3. Grab the corresponding page-cache pages and map them to their disk blocks.
4. Set the allocated clusters' bits in the bitmap.
int ocfs2_write_begin_nolock(struct address_space *mapping,
			     loff_t pos, unsigned len, ocfs2_write_type_t type,
			     struct page **pagep, void **fsdata,
			     struct buffer_head *di_bh, struct page *mmap_page)
|-ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, type, di_bh);
|-ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc, &extents_to_split); // compute the number of clusters needed
|-ret = ocfs2_lock_allocators(inode, &et, clusters_to_alloc, extents_to_split, &data_ac, &meta_ac); // lock the allocators and reserve the clusters
|-handle = ocfs2_start_trans(osb, credits); // start a journal transaction
|-ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE);
|-ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len, cluster_of_pages, mmap_page); // grab the page-cache pages and map them to their disk blocks
|-ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos, len); // set the allocated clusters' bits in the bitmap
|-ocfs2_commit_trans(osb, handle);
write_end path:
Mark the pages dirty so the kernel can later write the page-cache data back to disk.
int ocfs2_write_end_nolock(struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied, void *fsdata)
|-if (wc->w_target_page)
flush_dcache_page(wc->w_target_page);
|-for (i = 0; i < wc->w_num_pages; i++) {
tmppage = wc->w_pages[i];
|-if (page_has_buffers(tmppage)) {
if (handle && ocfs2_should_order_data(inode)) {
loff_t start_byte = ((loff_t)tmppage->index << PAGE_SHIFT) + from;
loff_t length = to - from;
ocfs2_jbd2_inode_add_write(handle, inode, start_byte, length);
}
block_commit_write(tmppage, from, to);
|-}
|-}
out_write_size:
|-pos += copied;
|-i_size_write(inode, pos);
|-inode->i_blocks = ocfs2_inode_sector_count(inode);
|-di->i_size = cpu_to_le64((u64)i_size_read(inode));
|-inode->i_mtime = inode->i_ctime = current_time(inode);
|-di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
|-di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
|-mark_inode_dirty(inode);
|-if (handle)
ocfs2_journal_dirty(handle, wc->w_di_bh);