Linux文件系统之文件的读写(续二)
来源:百度文库 编辑:九乡新闻网 时间:2024/05/13 18:05:02
------------------------------------------ 本文系本站原创,欢迎转载!转载请注明出处:http://ericxiao.cublog.cn/------------------------------------------八:VFS层的I/O操作VFS层是与用户界面直接交互的接口,在这一节里,我们将分为读写两部份来介绍VFS层的操作以及跟上层用用的交互.8.1:文件的读操作在用户空间,读文件操作的常用函数为read()。对应在系统空间的调用入口是sys_read().它的代码如下:asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count){ struct file *file; ssize_t ret = -EBADF; int fput_needed; //根据fd从进程中取出相应的file对象 file = fget_light(fd, &fput_needed); if (file) { loff_t pos = file_pos_read(file); //文件的当前位置 ret = vfs_read(file, buf, count, &pos); //更新当前的文件位置 file_pos_write(file, pos); fput_light(file, fput_needed); } return ret;}从进程中取得文件描述符后和文件当前的操作位置后会调用vfs_read()执行具体的操作过程.它的代码如下:ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos){ struct inode *inode = file->f_dentry->d_inode; ssize_t ret; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read)) return -EINVAL;//检查当前区段是否允许读操作 ret = locks_verify_area(FLOCK_VERIFY_READ, inode, file, *pos, count); if (!ret) { //是否有权限 ret = security_file_permission (file, MAY_READ); if (!ret) { //如果有read 操作,调用之 if (file->f_op->read) ret = file->f_op->read(file, buf, count, pos); else //否则调用aio_read ret = do_sync_read(file, buf, count, pos); //ret: 写入的字节数 if (ret > 0) //产生通告 dnotify_parent(file->f_dentry, DN_ACCESS); } } return ret;}从上面看到,会最终调用file的相关操作完成文件的读操作.曾记得我们在文件的打开一节中分析了文件的打开过程。在打开文件过程中,文件描述符的相关操作会被赋值为inode->f_op.对于ext2文件系统,inode的相关信息如下: inode->i_fop = &ext2_file_operations;struct file_operations ext2_file_operations = { .llseek = generic_file_llseek, .read = generic_file_read, .write = generic_file_write, .aio_read = generic_file_aio_read, .aio_write = generic_file_aio_write, .ioctl = ext2_ioctl, .mmap = generic_file_mmap, .open = generic_file_open, .release = ext2_release_file, .fsync = ext2_sync_file, .readv = generic_file_readv, .writev = generic_file_writev, .sendfile = 
generic_file_sendfile,}相应文件读操作入口为generic_file_read():ssize_tgeneric_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos){ //用户空间的地址和长度 struct iovec local_iov = { .iov_base = buf, .iov_len = count }; //记录完成状态 struct kiocb kiocb; ssize_t ret; //kiocb.ki_key=KIOCB_SYNC_KEY; kiocb.ki_filp=filp;kiocb.ki_obj=current; init_sync_kiocb(&kiocb, filp); //返回读写完成的字节数 ret = __generic_file_aio_read(&kiocb, &local_iov, 1, ppos); //异步操作,需用等待 if (-EIOCBQUEUED == ret) ret = wait_on_sync_kiocb(&kiocb); //返回完成的字节数 return ret;}__generic_file_aio_read()是一个很重要的函数,它是读操作的入口。代码如下:ssize_t__generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t *ppos){ struct file *filp = iocb->ki_filp; ssize_t retval; unsigned long seg; size_t count; count = 0; for (seg = 0; seg < nr_segs; seg++) { const struct iovec *iv = &iov[seg]; /* * If any segment has a negative length, or the cumulative * length ever wraps negative then return -EINVAL. */ count += iv->iov_len; if (unlikely((ssize_t)(count|iv->iov_len) < 0)) return -EINVAL; //检查从 iv->iov_base 开始的iov_len区间的合法性 if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len)) continue; if (seg == 0) return -EFAULT; //nr_seg: 有效的数据段数目 nr_segs = seg; //上一个数据段无效,将其长度减下来 count -= iv->iov_len; /* This segment is no good */ break; } /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ //如果定义了O_DIRECT:直接传送数据`绕过了页高速缓存 if (filp->f_flags & O_DIRECT) { loff_t pos = *ppos, size; struct address_space *mapping; struct inode *inode; mapping = filp->f_mapping; inode = mapping->host; retval = 0; if (!count) goto out; /* skip atime */ size = i_size_read(inode); if (pos < size) { retval = generic_file_direct_IO(READ, iocb, iov, pos, nr_segs); if (retval >= 0 && !is_sync_kiocb(iocb)) retval = -EIOCBQUEUED; if (retval > 0) *ppos = pos + retval; } file_accessed(filp); goto out; } //count:读取文件的长度 retval = 0; if (count) { for (seg = 0; seg < nr_segs; seg++) { //read_descriptor_t:读操作描述符`用来记录读的状态 read_descriptor_t desc; 
desc.written = 0; desc.arg.buf = iov[seg].iov_base; desc.count = iov[seg].iov_len; //如果没有要传输的数据`继续下一个iov if (desc.count == 0) continue; desc.error = 0; //对其中的每一个段调用do_generic_file_read do_generic_file_read(filp,ppos,&desc,file_read_actor,0); //desc.written:写入到用户空间的字节数 //更新retval retval += desc.written; if (!retval) { retval = desc.error; break; } } }out: return retval;}这里有种特殊情况,当文件是用直接I/O模式打开时(文件描述符带有O_DIRECT标志),就会采用直接I/O而跳过了页高速缓区。这样的情况我们在之后再讨论.对于普通模块的情况。将会对每一个段调用do_generic_file_read()来完成I/O操作。这个函数的代码如下:do_generic_file_read()à do_generic_file_read():/* mapping: 页高速缓存区 _ra: filep对应的file_ra_state filep: 打开的文件描述符 ppos: 当前的操作位置 desc: 读操作描述符 actor: 内核空间到用户空间的拷贝函数 nonblock: 如果此变量为1,则需要预读 */void do_generic_mapping_read(struct address_space *mapping, struct file_ra_state *_ra, struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor, int nonblock){ struct inode *inode = mapping->host; unsigned long index, end_index, offset; loff_t isize; struct page *cached_page; int error; struct file_ra_state ra = *_ra; cached_page = NULL; //找到页面的偏移量。即确定是存储在那个存面中 index = *ppos >> PAGE_CACHE_SHIFT; //第一个请求字节在页面的偏移量 //亦即请求的字节在页面中的偏移 offset = *ppos & ~PAGE_CACHE_MASK; //inode对应的文件大小 isize = i_size_read(inode); if (!isize) goto out; //最后的缓存页序号 end_index = (isize - 1) >> PAGE_CACHE_SHIFT; for (;;) { struct page *page; unsigned long nr, ret; /* nr is the maximum number of bytes to copy from this page */ //nr: 缓存页空间大小 nr = PAGE_CACHE_SIZE; if (index >= end_index) { //index > end_indx: 肯定是非法的页面缓存器大小 if (index > end_index) goto out; //执行到这里,肯定有index == end_index //nr转化成了文件在最后一个缓存page中的位置 nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; //offset是当前位置在页中的偏移,nr: 是最后一个块在磁盘中的偏移 //如果nr<=offset说明文件已经操作完了 if (nr <= offset) { goto out; } } //nr-offset: 页面的剩余操作字节数 nr = nr - offset; //检查当前进程是否设置了重新调度标志`如果有`调用schdule()重新调度一次 cond_resched(); //文件预读 if (!nonblock) page_cache_readahead(mapping, &ra, filp, index); find_page: //寻找当前位置对应的缓存页 page = find_get_page(mapping, index); if (unlikely(page == 
NULL)) { //没有找到对应的缓存页,说明在页缓存区中不存在此页面对应的缓存页 if (nonblock) { desc->error = -EWOULDBLOCKIO; break; } handle_ra_miss(mapping, &ra, index); goto no_cached_page; } //在页缓存区中找到了相关的页面 //检查PG_uptodata标志是否被设置`如果这个标志被设置的话,就不需要从设备 //上去读取了 if (!PageUptodate(page)) { //页面没有设置PG_uptodata`页面中的内容无效,所以要从文件系统中把数据读取出来 if (nonblock) { page_cache_release(page); desc->error = -EWOULDBLOCKIO; break; } goto page_not_up_to_date; }page_ok: /* If users can be writing to this page using arbitrary * virtual addresses, take care about potential aliasing * before reading the page on the kernel side. */ if (mapping_writably_mapped(mapping)) flush_dcache_page(page); /* * Mark the page accessed if we read the beginning. */ if (!offset) mark_page_accessed(page); /* * Ok, we have the page, and it's up-to-date, so * now we can copy it to user space... * * The actor routine returns how many bytes were actually used.. * NOTE! This may not be the same as how much of a user buffer * we filled up (we may be padding etc), so we can only update * "pos" here (the actor routine has to update the user buffer * pointers and the remaining count). */ //页面与用户空间的值拷贝.返回拷贝的数据数 ret = actor(desc, page, offset, nr); offset += ret; index += offset >> PAGE_CACHE_SHIFT; offset &= ~PAGE_CACHE_MASK; page_cache_release(page); //如果ret == nr: 拷贝的长度等于在页面中的剩余长度,说明拷贝没有发生错误 if (ret == nr && desc->count) continue; //否则,可以退出了 goto out; page_not_up_to_date: /* Get exclusive access to the page ... */ //要从文件系统中传数据到此页面上。将此页面锁定 lock_page(page); /* Did it get unhashed before we got the lock? */ //有可能在锁页面的时候`有其它的进程将页面移除了页缓存区 //在这种情况下:将page解锁`并减少它的使用计数,重新循环``` //重新进入循环后,在页缓存区找不到对应的page.就会重新分配一个新的page if (!page->mapping) { unlock_page(page); page_cache_release(page); continue; } /* Did somebody else fill it already? */ //在加锁的时候,有其它的进程完成了从文件系统到具体页面的映射? //在这种情况下,返回到page_ok.直接将页面上的内容copy到用户空间即可 if (PageUptodate(page)) { unlock_page(page); goto page_ok; } //读取页面readpage: /* Start the actual read. The read will unlock the page. 
*/ //到这里的话,实际的读取过程开始了 ^_^ error = mapping->a_ops->readpage(filp, page); //读取错误,退出 if (unlikely(error)) goto readpage_error; //如果PG_uptodata标志仍然末设置.就一直等待,一直到page不处于锁定状态 // TODO: 在将文件系统的内容读入page之前,page一直是处理Lock状态的。一直到 //读取完成后,才会将页面解锁. 然后将进程唤醒 if (!PageUptodate(page)) { wait_on_page_locked(page); //如果页面仍然没有PG_uptodata标志.只可能是发生了错误.出错返回 if (!PageUptodate(page)) { error = -EIO; goto readpage_error; } } /* * i_size must be checked after we have done ->readpage. * * Checking i_size after the readpage allows us to calculate * the correct value for "nr", which means the zero-filled * part of the page is not copied back to userspace (unless * another truncate extends the file - this is desired though). */ isize = i_size_read(inode); end_index = (isize - 1) >> PAGE_CACHE_SHIFT; //如果文件大小无效或者当前位置超过了文件大小 if (unlikely(!isize || index > end_index)) { page_cache_release(page); goto out; } /* nr is the maximum number of bytes to copy from this page */ //重新计算nr 即在页面中剩余的要copy的字节数 nr = PAGE_CACHE_SIZE; if (index == end_index) { nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; if (nr <= offset) { page_cache_release(page); goto out; } } nr = nr - offset; goto page_ok; readpage_error: /* UHHUH! A synchronous read error occurred. Report it */ desc->error = error; page_cache_release(page); goto out; no_cached_page: /* * Ok, it wasn't cached, so we need to create a new * page.. 
*/ //在页缓区中没有相关的缓存页 //新分匹一个页面 if (!cached_page) { cached_page = page_cache_alloc_cold(mapping); if (!cached_page) { desc->error = -ENOMEM; goto out; } } //将分得的页加到页缓存区和LRU // TODO:在将新页面插入页缓存区域中,会将页面标志设置为PG_locked error = add_to_page_cache_lru(cached_page, mapping, index, GFP_KERNEL); if (error) { if (error == -EEXIST) goto find_page; desc->error = error; goto out; } page = cached_page; cached_page = NULL; goto readpage; } out: *_ra = ra; //ppos: 最后的读取位置 *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; if (cached_page) page_cache_release(cached_page); if (filp) file_accessed(filp);}如果参数为nonblock为1,则必须预读页面。在这里的调用nonblock为零,不需要考虑预读的情况。关于预读的操作,我们之后再给出分析.在这个操作中,有这样几种可能的情况:1:如果要访问的页面在页高速缓存中,而且已经被更新(含有PG_uptodata标志).只需要直接将其copy到用户空间即可.2:序号对应的页面不在高速缓存中,那就需要在页高速缓存中增加序号对应的页面。然后从文件系统中读取数据到这个页面上.再拷贝到用户空间。3:序号对应的页面在高速缓存中,但数据不是最新的.这就需要缓存页与文件系统进行同步.再将页面拷贝到用户空间.对于2和3。它们有一部份是相同的,即从文件系统中读数据的过程。我们只需要分种对于第2的情况。对应的代码片段如下:void do_generic_mapping_read(struct address_space *mapping, struct file_ra_state *_ra, struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor, int nonblock){ …… page = find_get_page(mapping, index); if (unlikely(page == NULL)) { //没有找到对应的缓存页,说明在页缓存区中不存在此页面对应的缓存页 if (nonblock) { desc->error = -EWOULDBLOCKIO; break; } handle_ra_miss(mapping, &ra, index); goto no_cached_page; } …… …… }Handle_ra_miss()主要对文件的预读进行调整,在这里不进行分析,待分析预读机制的时候再来详细分析.如果页面高速缓存中不存在此页面就会跳转到no_cached_page:no_cached_page: /* * Ok, it wasn't cached, so we need to create a new * page.. 
*/ //在页缓区中没有相关的缓存页 //新分匹一个页面 if (!cached_page) { cached_page = page_cache_alloc_cold(mapping); if (!cached_page) { desc->error = -ENOMEM; goto out; } } //将分得的页加到页缓存区和LRU // TODO:在将新页面插入页缓存区域中,会将页面标志设置为PG_locked error = add_to_page_cache_lru(cached_page, mapping, index, GFP_KERNEL); if (error) { if (error == -EEXIST) goto find_page; desc->error = error; goto out; } page = cached_page; cached_page = NULL; goto readpage;在这里,会首先调用page_cache_alloc_cold()分配一个页面。然后调用add_to_page_cache_lru()将页面插入页高速缓存并加入lru.然后跳转到readpage。这也是第3种情况对应的处理: //读取页面readpage: /* Start the actual read. The read will unlock the page. */ //到这里的话,实际的读取过程开始了 ^_^ error = mapping->a_ops->readpage(filp, page);在这里会看到,最终会调用页高速缓存的readpage方法进行读取操作。 文件页高速缓存的readpage操作同理,还是以ext2文件系统为例来分析。在open的时候,它将页高速缓存对应的各项操作设置如下:inode->i_mapping->a_ops = &ext2_aops;struct address_space_operations ext2_aops = { .readpage = ext2_readpage, .readpages = ext2_readpages, .writepage = ext2_writepage, .sync_page = block_sync_page, .prepare_write = ext2_prepare_write, .commit_write = generic_commit_write, .bmap = ext2_bmap, .direct_IO = ext2_direct_IO, .writepages = ext2_writepages,};对应的入口为ext2_readpage:static int ext2_readpage(struct file *file, struct page *page){ return mpage_readpage(page, ext2_get_block);}这是一个封装的函数,采用一个回调函数做为参数.该回调函数将相对于文件起始的块号转换为文件系统的逻辑块号.Mpage_readpage()的代码如下:int mpage_readpage(struct page *page, get_block_t get_block){ struct bio *bio = NULL; sector_t last_block_in_bio = 0; //转要读的信息转换为bio结构 bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio, get_block); //提交这个bio if (bio) mpage_bio_submit(READ, bio); return 0;}mpage_bio_submit()这个操作中有一部份代码在之前已经分析过了。剩余的代码很简单。这里不做分析.do_mpage_readpage()的代码如下:static struct bio *do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, sector_t *last_block_in_bio, get_block_t get_block){ struct inode *inode = page->mapping->host; const unsigned blkbits = inode->i_blkbits; //计算一个页面中的数据块数目 const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits; //block的大小 
const unsigned blocksize = 1 << blkbits; sector_t block_in_file; sector_t last_block; sector_t blocks[MAX_BUF_PER_PAGE]; unsigned page_block; unsigned first_hole = blocks_per_page; struct block_device *bdev = NULL; struct buffer_head bh; int length; int fully_mapped = 1; //如果页面是一个缓存区页,跳转到confused.直接更新页在中的块缓存区 if (page_has_buffers(page)) goto confused; //页序号*每个页中的块数目 = 页面中的首个块号 block_in_file = page->index << (PAGE_CACHE_SHIFT - blkbits); //文件最后的块: 文件大小/块大小 last_block = (i_size_read(inode) + blocksize - 1) >> blkbits; bh.b_page = page; //遍历页面中的块数 for (page_block = 0; page_block < blocks_per_page; page_block++, block_in_file++) { bh.b_state = 0; if (block_in_file < last_block) { //将文件中的块号转换成bh if (get_block(inode, block_in_file, &bh, 0)) //如果有错误 goto confused; } //bh没有被映射,可能是一个文件空洞 if (!buffer_mapped(&bh)) { fully_mapped = 0; if (first_hole == blocks_per_page) first_hole = page_block; continue; } /* some filesystems will copy data into the page during * the get_block call, in which case we don't want to * read it again. map_buffer_to_page copies the data * we just collected from get_block into the page's buffers * so readpage doesn't have to repeat the get_block call */ //如果块缓存区是最新的,将其数据直接copy到page if (buffer_uptodate(&bh)) { map_buffer_to_page(page, &bh, page_block); goto confused; } if (first_hole != blocks_per_page) goto confused; /* hole -> non-hole */ /* Contiguous blocks? */ //判断请求的块缓存是不是连续的。如果不连续,就跳转到confused if (page_block && blocks[page_block-1] != bh.b_blocknr-1) goto confused; blocks[page_block] = bh.b_blocknr; bdev = bh.b_bdev; } if (first_hole != blocks_per_page) { char *kaddr = kmap_atomic(page, KM_USER0); memset(kaddr + (first_hole << blkbits), 0, PAGE_CACHE_SIZE - (first_hole << blkbits)); flush_dcache_page(page); kunmap_atomic(kaddr, KM_USER0); if (first_hole == 0) { SetPageUptodate(page); unlock_page(page); goto out; } } else if (fully_mapped) { //设置PG_mappedtodisk SetPageMappedToDisk(page); } /* * This page will go to BIO. 
Do we need to send this BIO off first? */ if (bio && (*last_block_in_bio != blocks[0] - 1)) bio = mpage_bio_submit(READ, bio); alloc_new: if (bio == NULL) { //创建一个bio bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), min_t(int, nr_pages, bio_get_nr_vecs(bdev)), GFP_KERNEL); if (bio == NULL) goto confused; } length = first_hole << blkbits; //将page对应的偏移与长度设置到bio 中 if (bio_add_page(bio, page, length, 0) < length) { bio = mpage_bio_submit(READ, bio); goto alloc_new; } if (buffer_boundary(&bh) || (first_hole != blocks_per_page)) bio = mpage_bio_submit(READ, bio); else *last_block_in_bio = blocks[blocks_per_page - 1];out: return bio; confused: if (bio) bio = mpage_bio_submit(READ, bio); if (!PageUptodate(page)) block_read_full_page(page, get_block); else unlock_page(page); goto out;}这段代码实际上做了一个小小的优化。它会判断要提交的块缓存区是不是连续的。如果是连续的就可以将它们放一个bio中。然后提交到通用块设备层。如果不是连续的,对于每一个块缓存区都要提交一次.对于连续条件的bio提交很好理解,代码也很容易.重点分析对于不连续的块的处理。在上面的代码中可以看到,对于不连续块是通过block_read_full_page()来处理的.代码如下:int block_read_full_page(struct page *page, get_block_t *get_block){ struct inode *inode = page->mapping->host; sector_t iblock, lblock; struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; unsigned int blocksize; int nr, i; int fully_mapped = 1; //页面没有被锁定 if (!PageLocked(page)) PAGE_BUG(page); //块大小 blocksize = 1 << inode->i_blkbits; //如果页面中没有块缓存区,则在其中建立空的块缓存区 if (!page_has_buffers(page)) create_empty_buffers(page, blocksize, 0); //块缓存区描述符的首部 head = page_buffers(page); //页中的起始块号 iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); //文件中的最后一个块号 lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits; bh = head; nr = 0; i = 0; do { //已经是最新的了,不需要提交,继续下一个 if (buffer_uptodate(bh)) continue; //如果块缓存区没有被映射 if (!buffer_mapped(bh)) { fully_mapped = 0; if (iblock < lblock) { //将文件块号转换为bh if (get_block(inode, iblock, bh, 0)) SetPageError(page); } //如果这个bh还是没有映射。可能是对应文件的空洞区域 //将这个bh对应的区域置0 if (!buffer_mapped(bh)) { void *kaddr = kmap_atomic(page, KM_USER0); memset(kaddr + i * blocksize, 0, 
blocksize); flush_dcache_page(page); kunmap_atomic(kaddr, KM_USER0); set_buffer_uptodate(bh); continue; } /* * get_block() might have updated the buffer * synchronously */ //如果bh为最新了,不需要提交了 if (buffer_uptodate(bh)) continue; } //提要提交的bh保存到arr数组里 arr[nr++] = bh; } while (i++, iblock++, (bh = bh->b_this_page) != head); //设置PG_mappdtodisk if (fully_mapped) SetPageMappedToDisk(page); //如果没有要提交的 if (!nr) { /* * All buffers are uptodate - we can set the page uptodate * as well. But not if get_block() returned an error. */ if (!PageError(page)) SetPageUptodate(page); unlock_page(page); return 0; } /* Stage two: lock the buffers */ //对每一个提交的bh进行锁定 for (i = 0; i < nr; i++) { bh = arr[i]; lock_buffer(bh); mark_buffer_async_read(bh); } /* * Stage 3: start the IO. Check for uptodateness * inside the buffer lock in case another process reading * the underlying blockdev brought it uptodate (the sct fix). */ //提交每一个bh for (i = 0; i < nr; i++) { bh = arr[i]; if (buffer_uptodate(bh)) end_buffer_async_read(bh, 1); else submit_bh(READ, bh); } return 0;}从上面的代码中看了.对于不连续的读操作,会反复调用submit_bh()来完成. 8.2:文件的写操作在用户空间中,用户的写操作接口为write.对应系统调用的入口为sys_write().代码如下:asmlinkage ssize_t sys_write(unsigned int fd, const char __user * buf, size_t count){ struct file *file; ssize_t ret = -EBADF; int fput_needed; //取得文件描述符对应的file //fget_ligsh():对fget()进行了优化。如果当前file没有被共享的话。那么在取的时候就不必要加锁 file = fget_light(fd, &fput_needed); if (file) { //当前文件指针位置 loff_t pos = file_pos_read(file); ret = vfs_write(file, buf, count, &pos); //更新文件指针 file_pos_write(file, pos); //对共享情况下的解锁 fput_light(file, fput_needed); } return ret;}上面的代码与读操作差不多,都是取文件描述符和当前文件,操作完后,更新文件指针位置.vfs_write()代码如下:ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos){ struct inode *inode = file->f_dentry->d_inode; ssize_t ret; //文件不可写? 
if (!(file->f_mode & FMODE_WRITE)) return -EBADF; //没有操作函数或者是有操作函数但没有写函数。出错返回 if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write)) return -EINVAL; //对写区域所加的强制锁 ret = locks_verify_area(FLOCK_VERIFY_WRITE, inode, file, *pos, count); if (!ret) { ret = security_file_permission (file, MAY_WRITE); if (!ret) { if (file->f_op->write) ret = file->f_op->write(file, buf, count, pos); else ret = do_sync_write(file, buf, count, pos); if (ret > 0) dnotify_parent(file->f_dentry, DN_MODIFY); } } return ret;}对于大部份情况,写操作会由file->f_op->write完成.在ext2文件系统中,此接口对应的函数为:ssize_t generic_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos){ struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; ssize_t ret; struct iovec local_iov = { .iov_base = (void __user *)buf, .iov_len = count }; down(&inode->i_sem); //返回write的有效字节数 ret = generic_file_write_nolock(file, &local_iov, 1, ppos); up(&inode->i_sem); //如果定义了O_SYNC或者inode定义了MS_SYNCHRONOUS标志 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { ssize_t err; //把缓存区上面的东西写回设备 err = sync_page_range(inode, mapping, *ppos - ret, ret); if (err < 0) ret = err; } return ret;}如果打开文件时带有O_SYNC标志,或者文件系统带有SYNC标志,都会将缓存中的数据直接写到文件系统上.转入generic_file_write_nolock():ssize_tgeneric_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t *ppos){ struct file *file = iocb->ki_filp; struct address_space * mapping = file->f_mapping; size_t ocount; /* original count */ size_t count; /* after file limit checks */ struct inode *inode = mapping->host; unsigned long seg; loff_t pos; ssize_t written; ssize_t err; ocount = 0; for (seg = 0; seg < nr_segs; seg++) { const struct iovec *iv = &iov[seg]; /* * If any segment has a negative length, or the cumulative * length ever wraps negative then return -EINVAL. 
*/ ocount += iv->iov_len; if (unlikely((ssize_t)(ocount|iv->iov_len) < 0)) return -EINVAL; //判断用户给的区域是否合法 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len)) continue; if (seg == 0) return -EFAULT; nr_segs = seg; ocount -= iv->iov_len; /* This segment is no good */ break; } //count: 要write的字节总数 count = ocount; //ppos:当前的位置 pos = *ppos; /* We can write back this queue in page reclaim */ //backing_dev_info: 预读信息 current->backing_dev_info = mapping->backing_dev_info; written = 0; //对写操作的详细检查 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) goto out; if (count == 0) goto out; err = remove_suid(file->f_dentry); if (err) goto out; //更新索引结点的时间戳信息 inode_update_time(inode, 1); /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ if (unlikely(file->f_flags & O_DIRECT)) { written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos, count, ocount); if (written < 0 || written == count) goto out; /* * direct-io write to a hole: fall through to buffered I/O * for completing the rest of the request. */ pos += written; count -= written; } written = generic_file_buffered_write(iocb, iov, nr_segs, pos, ppos, count, written);out: current->backing_dev_info = NULL; return written ? 
written : err;}如果文件打开时带有了O_DIRECT标志,则会跳过文件缓存直接将数据写到文件系统中。对于O_DIRECT的操作我们在之后再做总结。对于一般的情况,都会转入到generic_file_buffered_write():ssize_tgeneric_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos, loff_t *ppos, size_t count, ssize_t written){ struct file *file = iocb->ki_filp; struct address_space * mapping = file->f_mapping; struct address_space_operations *a_ops = mapping->a_ops; struct inode *inode = mapping->host; long status = 0; struct page *page; struct page *cached_page = NULL; size_t bytes; struct pagevec lru_pvec; const struct iovec *cur_iov = iov; /* current iovec */ size_t iov_base = 0; /* offset in the current iovec */ char __user *buf; pagevec_init(&lru_pvec, 0); buf = iov->iov_base + written; /* handle partial DIO write */ do { unsigned long index; unsigned long offset; size_t copied; //offset: 页面中的偏移 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ //offset: 页面序号 index = pos >> PAGE_CACHE_SHIFT; //页面中的剩余信息 bytes = PAGE_CACHE_SIZE - offset; //如果bytes > 数据的长度 if (bytes > count) bytes = count; /* * Bring in the user page that we will copy from _first_. * Otherwise there's a nasty deadlock on copying from the * same page as we're writing to, without it being marked * up-to-date. */ fault_in_pages_readable(buf, bytes); //到页高速缓存中寻找index对应的页面。如果不存在,则新建 page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec); if (!page) { status = -ENOMEM; break; } //调用prepare_write。在这里就会涉及到缓存头的概念了 ^_^ status = a_ops->prepare_write(file, page, offset, offset+bytes); if (unlikely(status)) { loff_t isize = i_size_read(inode); /* * prepare_write() may have instantiated a few blocks * outside i_size. Trim these off again. 
*/ unlock_page(page); page_cache_release(page); if (pos + bytes > isize) vmtruncate(inode, isize); break; } //把数据copy到缓冲区 if (likely(nr_segs == 1)) copied = filemap_copy_from_user(page, offset, buf, bytes); else copied = filemap_copy_from_user_iovec(page, offset, cur_iov, iov_base, bytes); flush_dcache_page(page); //调用commit_write。将数据写回设备 status = a_ops->commit_write(file, page, offset, offset+bytes); if (likely(copied > 0)) { if (!status) status = copied; if (status >= 0) { written += status; count -= status; pos += status; buf += status; if (unlikely(nr_segs > 1)) filemap_set_next_iovec(&cur_iov, &iov_base, status); } } if (unlikely(copied != bytes)) if (status >= 0) status = -EFAULT; unlock_page(page); mark_page_accessed(page); page_cache_release(page); if (status < 0) break; balance_dirty_pages_ratelimited(mapping); cond_resched(); } while (count); *ppos = pos; if (cached_page) page_cache_release(cached_page); /* * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC */ if (likely(status >= 0)) { if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) { if (!a_ops->writepage || !is_sync_kiocb(iocb)) status = generic_osync_inode(inode, mapping, OSYNC_METADATA|OSYNC_DATA); } } /* * If we get here for O_DIRECT writes then we must have fallen through * to buffered writes (block instantiation inside i_size). So we sync * the file data here, to try to honour O_DIRECT expectations. */ if (unlikely(file->f_flags & O_DIRECT) && written) status = filemap_write_and_wait(mapping); pagevec_lru_add(&lru_pvec); return written ? 
written : status;}从上面的代码可以看出:对于写操作,会先到高速缓存中取对应的page。然后调用a_ops->prepare_write()。然后将要写的数据拷贝到缓存区页上,接着调用a_ops-> commit_write()。下来我们分别分别这两个操作.8.2.1:页高速缓存的prepare_write()操作Ext2系统对应的入口为:static intext2_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to){ return block_prepare_write(page,from,to,ext2_get_block);}这里是一个封装函数。对于块设备来说,不同的只是后面所带的函数指针,这样的函数结构我们在读操作中也见过。Ext_get_block()函数的操作为,将对应文件的块号转换为文件系统的逻辑块号.转入block_prepare_write():int block_prepare_write(struct page *page, unsigned from, unsigned to, get_block_t *get_block){ struct inode *inode = page->mapping->host; int err = __block_prepare_write(inode, page, from, to, get_block); //如果失败,清除page的uptodate标志 if (err) ClearPageUptodate(page); return err;}__block_prepare_write()的操作为:static int __block_prepare_write(struct inode *inode, struct page *page, unsigned from, unsigned to, get_block_t *get_block){ unsigned block_start, block_end; sector_t block; int err = 0; unsigned blocksize, bbits; struct buffer_head *bh, *head, *wait[2], **wait_bh=wait; BUG_ON(!PageLocked(page)); BUG_ON(from > PAGE_CACHE_SIZE); BUG_ON(to > PAGE_CACHE_SIZE); BUG_ON(from > to); //标大小 blocksize = 1 << inode->i_blkbits; if (!page_has_buffers(page)) create_empty_buffers(page, blocksize, 0); head = page_buffers(page); bbits = inode->i_blkbits; //该页面的起始起号 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); for(bh = head, block_start = 0; bh != head || !block_start; block++, block_start=block_end, bh = bh->b_this_page) { block_end = block_start + blocksize; //对于没有落在from->to这个区间的bh // TODO: 这样做实际上要依赖一个条件: 块大小必须为512的整数倍且须为2的幂大小 if (block_end <= from || block_start >= to) { if (PageUptodate(page)) { if (!buffer_uptodate(bh)) set_buffer_uptodate(bh); } continue; } if (buffer_new(bh)) clear_buffer_new(bh); if (!buffer_mapped(bh)) { //这里可能会进行文件系统大小的扩充. 
err = get_block(inode, block, bh, 1); if (err) goto out; //块缓存区刚被分配,没有被访问就置为BH_NEW //通常是通过get_block()刚刚映射好的,不能访问 if (buffer_new(bh)) { clear_buffer_new(bh); unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); //如果页面uptodate.则设置bh的相应标志 if (PageUptodate(page)) { set_buffer_uptodate(bh); continue; } //如果只是对该块缓存区的部份进行操作,则将不操作的部份置0 if (block_end > to || block_start < from) { void *kaddr; kaddr = kmap_atomic(page, KM_USER0); if (block_end > to) memset(kaddr+to, 0, block_end-to); if (block_start < from) memset(kaddr+block_start, 0, from-block_start); flush_dcache_page(page); kunmap_atomic(kaddr, KM_USER0); } continue; } } if (PageUptodate(page)) { if (!buffer_uptodate(bh)) set_buffer_uptodate(bh); continue; } //如果bh没有uptodata.先将其和文件系统同步 if (!buffer_uptodate(bh) && !buffer_delay(bh) && (block_start < from || block_end > to)) { ll_rw_block(READ, 1, &bh); *wait_bh++=bh; } } /* * If we issued read requests - let them complete. */ //如果有提交的bh.等待其I/O完成 while(wait_bh > wait) { wait_on_buffer(*--wait_bh); if (!buffer_uptodate(*wait_bh)) return -EIO; } return 0;out: /* * Zero out any newly allocated blocks to avoid exposing stale * data. If BH_New is set, we know that the block was newly * allocated in the above loop. 
*/ bh = head; block_start = 0; do { block_end = block_start+blocksize; if (block_end <= from) goto next_bh; if (block_start >= to) break; if (buffer_new(bh)) { void *kaddr; clear_buffer_new(bh); kaddr = kmap_atomic(page, KM_USER0); memset(kaddr+block_start, 0, bh->b_size); kunmap_atomic(kaddr, KM_USER0); set_buffer_uptodate(bh); mark_buffer_dirty(bh); }next_bh: block_start = block_end; bh = bh->b_this_page; } while (bh != head); return err;}对于读操作,写操作可能更加复杂,因为写操作要动态调整文件的大小。文件大小的调整过程是在ext_get_block()这个回调函数中完成的。Prepare_write操作完成了对缓存冲页进行了必要的初始化和文件大小的扩充.直正将数据写到文件系统上是在commit_write()中完成的:int generic_commit_write(struct file *file, struct page *page, unsigned from, unsigned to){ struct inode *inode = page->mapping->host; loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; __block_commit_write(inode,page,from,to); /* * No need to use i_size_read() here, the i_size * cannot change under us because we hold i_sem. */ //如果文件被扩大了.更改inode->i_size if (pos > inode->i_size) { i_size_write(inode, pos); mark_inode_dirty(inode); } return 0;}经过上面的分析,我们知道,在调用commit_write()之前,已经将要写的数据拷贝到了页缓冲区.__block_commit_write()的代码如下:static int __block_commit_write(struct inode *inode, struct page *page, unsigned from, unsigned to){ unsigned block_start, block_end; int partial = 0; unsigned blocksize; struct buffer_head *bh, *head; blocksize = 1 << inode->i_blkbits; //对被修改的部份置为dirty for(bh = head = page_buffers(page), block_start = 0; bh != head || !block_start; block_start=block_end, bh = bh->b_this_page) { block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { if (!buffer_uptodate(bh)) partial = 1; } else { set_buffer_uptodate(bh); mark_buffer_dirty(bh); } } /* * If this is a partial write which happened to make all buffers * uptodate then we can optimize away a bogus readpage() for * the next read(). Here we 'discover' whether the page went * uptodate as a result of this (potentially partial) write. */ //如果整个页面的块缓存区都置为了dirty.则置页面的PG_uptodate标志. 
if (!partial) SetPageUptodate(page); return 0;}在上面的代码中,我们看到,只是把块缓存区置为了“脏”,并没有直正的将数据写到文件系统中,那是什么时候完成这个写的过程的呢?记得我们在分析pdflush线程数的时候,曾经介绍过 “回写陈旧的页面”。没错,就是在那里,旧页面被回写到了文件系统.在那一节,我们遗留下了两个问题。即mapping->a_ops->writepages和mapping->a_ops->writepage的操作。我们在这一节里详细的分析一下. 8.2.1: mapping->a_ops->writepages()操作对于ext2来说,它的mapping各项操作赋值为:struct address_space_operations ext2_aops = {…….writepage = ext2_writepage,.writepages = ext2_writepages,……}相应的,writepages入口为ext2_writepages():static intext2_writepages(struct address_space *mapping, struct writeback_control *wbc){ return mpage_writepages(mapping, wbc, ext2_get_block);}mpage_writepages()就是我们在pdflush线程组中曾经分析过的子函数.在这里不再赘述. 8.2.2: mapping->a_ops->writepage()操作相应的入口为ext2_writepage():static int ext2_writepage(struct page *page, struct writeback_control *wbc){ return block_write_full_page(page, ext2_get_block, wbc);}转入block_write_full_page()static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block, struct writeback_control *wbc){ int err; sector_t block; sector_t last_block; struct buffer_head *bh, *head; int nr_underway = 0; BUG_ON(!PageLocked(page)); //文件中的最后一个块号 last_block = (i_size_read(inode) - 1) >> inode->i_blkbits; //如果不是块缓存页,则在页中建立块缓存区 if (!page_has_buffers(page)) { create_empty_buffers(page, 1 << inode->i_blkbits, (1 << BH_Dirty)|(1 << BH_Uptodate)); } /* * Be very careful. We have no exclusion from __set_page_dirty_buffers * here, and the (potentially unmapped) buffers may become dirty at * any time. If a buffer becomes dirty here after we've inspected it * then we just miss that fact, and the page stays dirty. * * Buffers outside i_size may be dirtied by __set_page_dirty_buffers; * handle that here by just cleaning them. */ //块缓存页中的起始块号 block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); //块缓存区描述符首部 head = page_buffers(page); bh = head; /* * Get all the dirty buffers mapped to disk addresses and * handle any aliases from the underlying blockdev's mapping. 
*/ do { //如果块号超过了文件的最后块号 if (block > last_block) { /* * mapped buffers outside i_size will occur, because * this page can be outside i_size when there is a * truncate in progress. */ /* * The buffer was zeroed by block_write_full_page() */ clear_buffer_dirty(bh); set_buffer_uptodate(bh); } else if (!buffer_mapped(bh) && buffer_dirty(bh)) { //从文件系统中读取文件相对块号对应的bh err = get_block(inode, block, bh, 1); if (err) goto recover; if (buffer_new(bh)) { /* blockdev mappings never come here */ clear_buffer_new(bh); unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); } } bh = bh->b_this_page; block++; } while (bh != head); do { get_bh(bh); //块缓存区没有被映射 if (!buffer_mapped(bh)) continue; /* * If it's a fully non-blocking write attempt and we cannot * lock the buffer then redirty the page. Note that this can * potentially cause a busy-wait loop from pdflush and kswapd * activity, but those code paths have their own higher-level * throttling. */ //在操作之前先锁定块缓存区 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { lock_buffer(bh); } else if (test_set_buffer_locked(bh)) { //如果操作模式为WB_SYNC_NONE或者不允许阻塞。 //在块缓存区已经被锁定时,直接退出 redirty_page_for_writepage(wbc, page); continue; } //如果页面为脏,设置块缓存区为BH_ASYNC_WRITE if (test_clear_buffer_dirty(bh)) { mark_buffer_async_write(bh); } else { unlock_buffer(bh); } } while ((bh = bh->b_this_page) != head); /* * The page and its buffers are protected by PageWriteback(), so we can * drop the bh refcounts early. */ BUG_ON(PageWriteback(page)); //设置页面回写标志 set_page_writeback(page); unlock_page(page); //遍历页中的块缓存区,将BH_ASYNC_WRITE标志的BH回写到文件系统 do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { submit_bh(WRITE, bh); nr_underway++; } put_bh(bh); bh = next; } while (bh != head); err = 0;done: if (nr_underway == 0) { /* * The page was marked dirty, but the buffers were * clean. Someone wrote them back by hand with * ll_rw_block/submit_bh. A rare case. 
*/ int uptodate = 1; do { if (!buffer_uptodate(bh)) { uptodate = 0; break; } bh = bh->b_this_page; } while (bh != head); if (uptodate) SetPageUptodate(page); end_page_writeback(page); /* * The page and buffer_heads can be released at any time from * here on. */ wbc->pages_skipped++; /* We didn't write this page */ } return err; recover: /* * ENOSPC, or some other error. We may already have added some * blocks to the file, so we need to write these out to avoid * exposing stale data. * The page is currently locked and not marked for writeback */ bh = head; /* Recovery: lock and submit the mapped buffers */ do { get_bh(bh); if (buffer_mapped(bh) && buffer_dirty(bh)) { lock_buffer(bh); mark_buffer_async_write(bh); } else { /* * The buffer may have been set dirty during * attachment to a dirty page. */ clear_buffer_dirty(bh); } } while ((bh = bh->b_this_page) != head); SetPageError(page); BUG_ON(PageWriteback(page)); set_page_writeback(page); unlock_page(page); do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { clear_buffer_dirty(bh); submit_bh(WRITE, bh); nr_underway++; } put_bh(bh); bh = next; } while (bh != head); goto done;}该函数会遍历页面中的块缓存区,然后将脏的块缓存区写回文件系统.
Linux文件系统之文件的读写(续二)
Linux文件系统之文件的读写
Linux文件系统之文件的读写(续一)
linux文件系统之文件的打开与关闭
linux文件系统之路径查找与文件系统的挂载
Linux文件系统之目录的建立
Linux文件系统之sysfs
从文件 I/O 看 Linux 的虚拟文件系统
对 proc 文件系统的分析(二) - linux文件系统 - Embedded and ...
linux文件系统
Linux分区和文件系统
嵌入式Linux文件系统简介
linux下Nand Flash的JFFS2文件系统的移植1
通过编程获取Linux文件系统使用的详细信息1
linux文件系统的Inode,硬链接和软链接作用
(原创)我的读写兵法(续二)
linux文件的操作 流程
Linux ext2/ext3文件系统详解
Linux操作系统文件系统基础知识详解
C# 读写Config文件
VBA 读写文件详细参数
MFC文件读写
vb 读写ini文件
linux下的文件结构--各文件夹的作用-Linux