Linux Kernel: VFS

1. Linux Kernel: VFS

1. Linux Kernel: VFS

1.1. Data Structure

1.1.1. file_system_type

struct file_system_type {
    const char *name;
    struct super_block *(*get_sb) (struct file_system_type *, int,
                                   const char *, void *);
    // ...
};

1.1.2. super_block

struct super_block {
    unsigned long                s_blocksize;
    struct super_operations     *s_op;
    struct dentry               *s_root;
    struct list_head             s_inodes;      
    struct list_head             s_dirty;       
    struct block_device         *s_bdev;
    // ...
}

1.1.3. inode

struct inode {
    unsigned long            i_ino;
    umode_t                  i_mode;
    unsigned int             i_nlink;
    uid_t                    i_uid;
    gid_t                    i_gid;
    loff_t                   i_size;
    struct timespec          i_atime;
    struct timespec          i_mtime;
    struct timespec          i_ctime;
    unsigned long            i_blocks;
    struct inode_operations *i_op;
    struct file_operations  *i_fop;     
    struct super_block      *i_sb;
    struct address_space    *i_mapping;
    // ...
};

1.1.4. dentry

struct dentry {
    atomic_t                  d_count;
    struct inode             *d_inode;
    struct dentry            *d_parent;
    struct qstr               d_name;
    struct list_head          d_child;
    struct list_head          d_subdirs;
    struct dentry_operations *d_op;
    int                       d_mounted;
    // ...
};

1.1.5. file

struct file {
    struct dentry          *f_dentry;
    struct vfsmount        *f_vfsmnt;
    struct file_operations *f_op;
    loff_t                  f_pos;
    unsigned int            f_uid, f_gid;
    void                   *private_data;
    struct address_space   *f_mapping;
    // ...
};

1.1.6. vfsmount

struct vfsmount
{
    struct vfsmount    *mnt_parent;     
    struct dentry      *mnt_mountpoint; 
    struct dentry      *mnt_root;
    struct super_block *mnt_sb;
    // ...
};

1.1.7. nameidata

struct nameidata {
    struct dentry   *dentry;
    struct vfsmount *mnt;
    struct qstr      last;
    unsigned         depth;
    char            *saved_names[MAX_NESTED_LINKS + 1];
    // ...
};

1.1.8. task_struct

struct task_struct {
    struct fs_struct *fs;
    struct files_struct *files;
}

1.1.9. address_space

1.2. Operations

1.2.1. super_operations

1.2.1.1. alloc_inode

alloc_inode 分配一个 struct inode, 并做一些简单的初始化,后续的 read_inode 负责真正从磁盘加载 inode 信息.

alloc_inode 典型的调用栈:

xxx_lookup:
  iget(ino)
    inode = ifind_fast(sb, head, ino);
    if not found:
      inode = alloc_inode(sb);
    sb->s_op->read_inode(inode);

具体文件系统一般会实现自己的 alloc_inode:

例如 ext2 的 ext2_alloc_inode:

static struct inode *ext2_alloc_inode(struct super_block *sb):
  struct ext2_inode_info *ei;
  ei = (struct ext2_inode_info *)kmem_cache_alloc(ext2_inode_cachep, SLAB_KERNEL);
  ei->vfs_inode.i_version = 1;
  return &ei->vfs_inode;

主要都是通过一个 ei 来保存和具体文件系统的 inode 相关的信息.

1.2.1.2. read_inode

read_inode 是从具体文件系统加载数据对 alloc_inode 分配的 inode 进行填充. 它典型的调用栈如上面 "alloc_inode 的调用栈" 所示: alloc_inode 后会紧接一个 read_inode.

与 alloc_inode 不同的是, vfs 并没有 read_inode 的 wrapper function, read_inode 的行为完全由文件系统的 s_op->read_inode 决定.

以 ext2_read_inode 为例:

void ext2_read_inode (struct inode * inode):
  struct ext2_inode_info *ei = EXT2_I(inode);
  // read_inode 调用通常来自 lookup, 而 lookup 在调用 read_inode 前会负责
  // 找到 name 对应的 inode->ino
  ino_t ino = inode->i_ino;
  // 从磁盘读取数据
  struct ext2_inode * raw_inode = ext2_get_inode(inode->i_sb, ino, &bh);
  inode->i_mode = le16_to_cpu(raw_inode->i_mode);
  inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
  inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
  // ...

1.2.1.3. write_inode

1.2.1.3.1. fsync 导致的 write_inode

sys_fsync(int fd):
  if (!file->f_op || !file->f_op->fsync):
    /* Why?  We can still call filemap_fdatawrite */
    return
  mapping = file->f_mapping;
  filemap_fdatawrite(mapping);
  file->f_op->fsync(file, file->f_dentry, 0);
    ext2_sync_file
      sync_mapping_buffers
      ext2_sync_inode
        sync_inode
          __writeback_single_inode
            __sync_single_inode
              do_writepages
              write_inode
                inode->i_sb->s_op->write_inode(inode, sync);

1.2.1.3.2. dirty page balance 导致的 write_inode (vm_dirty_ratio)

generic_file_buffered_write:
  balance_dirty_pages
    writeback_inodes
      sync_sb_inodes
        __writeback_single_inode

1.2.2. inode_operations

1.2.2.1. lookup

lookup 是 i_op 中非常重要的一个回调函数, 它负责将路径名转换为相应的 inode, 例如, open, mkdir, stat, rename 等都需要先通过 lookup 获得对应的 inode.

以 open 为例, lookup 的调用链为:

filp_open():
  error = open_namei(filename, namei_flags, mode, &nd);
    path_lookup(pathname, LOOKUP_PARENT|LOOKUP_OPEN|LOOKUP_CREATE, nd);
      retval = link_path_walk(name, nd);
        err = do_lookup(nd, &this, &next);
          dentry = real_lookup(nd->dentry, name, nd);
            // 这里的 dentry 称为 negative dentry, 因为它的 d->d_inode 为空
            // 返回 negative dentry 表示 dentry 对应的 inode 不存在, 但并不是错误:
            // 例如 open(O_CREAT) 
            struct dentry * dentry = d_alloc(parent, name);
            result = dir->i_op->lookup(dir, dentry, nd);
            // result 不为 0, 分两种情况:
            // 1. lookup 成功
            // 2. lookup 失败, result 为负数, 例如 -ENOENT
            if (result):
                return result;
            else:
                // 返回之前的 negative dentry, open_namei 依赖 negative dentry 来判断
                // "dentry 对应的 inode 不存在但有效" 这种情况
                result = dentry;
        // link_path_walk
        dput(next.dentry);
    // open_namei
    // 若之前的 lookup 成功 (无论 inode 存在或不存在)
    dentry = __lookup_hash(&nd->last, nd->dentry, nd);
    if (!dentry->d_inode):
      // negative dentry
      vfs_create(dir->d_inode, dentry, mode, nd);
        inode->i_op->create()
      dput(nd->dentry);

总之, i_op->lookup 有三种返回值:

正常的 dentry 指针
-ENOENT 这种负值, 表示异常情况
NULL, 表示 negative dentry, 表示 dentry 有效但 inode 不存在

1.2.2.2. permission

vfs 对许多行为会进行 permission 检查, 并且会调用到 i_op->permission 函数.

以 open 为例, permission 的调用链为:

filp_open():
  open_namei()
    path_lookup()
      link_path_walk()
        for(;;):
          // 对 lookup 的每一级都进行 permission 检查
          err = permission(inode, MAY_EXEC, nd);
          if (err):
            break;

int permission(struct inode *inode, int mask, struct nameidata *nd):
  // inode 表示要检查的 inode
  // mask 表示需要的权限
  if (inode->i_op && inode->i_op->permission):
    // i_op 中和 fs 相关的实现
    // 由于 i_op->permission 与 generic_permission 是并列的, 所以 inode 可以完全定制自己
    // 的权限, 比如禁止 root 访问
    // 但一般情况下, inode 使用 generic_permission 就可以了,
    // 只要设置好 inode->i_uid, inode->i_gid 及 inode->i_mode 即可
    retval = inode->i_op->permission(inode, submask, nd);
  else:
    // generic 实现
    retval = generic_permission(inode, submask, NULL);

1.2.2.3. getattr / setattr

getattr 和 stat 系统调用有关, 大多数涉及到修改 inode 的系统调用 (例如 chmod, utime 等) 和 setattr 有关.

1.2.2.3.1. getattr

vfs_stat:
  user_path_walk(name, &nd);
  vfs_getattr(nd.mnt, nd.dentry, stat);
    if (inode->i_op->getattr):
      return inode->i_op->getattr(mnt, dentry, stat);
    generic_fillattr(inode, stat);
      stat->dev = inode->i_sb->s_dev;
      stat->ino = inode->i_ino;
      stat->mode = inode->i_mode;
      stat->nlink = inode->i_nlink;
      stat->uid = inode->i_uid;
      stat->gid = inode->i_gid;
      stat->rdev = inode->i_rdev;
      stat->atime = inode->i_atime;
      stat->mtime = inode->i_mtime;
      stat->ctime = inode->i_ctime;
      stat->size = i_size_read(inode);
      stat->blocks = inode->i_blocks;
      stat->blksize = inode->i_blksize;

若 i_op 没有实现 getattr, 则 generic_fillattr 会用 inode 的信息填充返回结果.

关于文件大小

stat 返回的信息中, 有两个和文件大小相关的值: stat->size 和 stat->blocks

其中 stat->size 被称为 apparent size, stat->blocks 称为 disk usage

$> truncate --size +10M a

$> stat a
  File: 'a'
  Size: 31457280        Blocks: 0          IO Block: 4096   regular file
Device: 802h/2050d      Inode: 21500388    Links: 1

$> du -k a
0       a

$> du --apparent-size -k a
30720   a

1.2.2.3.2. setattr

UID

sys_chmod:
  notify_change(nd.dentry, &newattrs);
    if (inode->i_op && inode->i_op->setattr):
      inode->i_op->setattr(dentry, attr);
    else:
      inode_setattr(inode, attr);
        if (ia_valid & ATTR_UID):
          inode->i_uid = attr->ia_uid;
        else if xxx:
          // ...
        mark_inode_dirty(inode);

ATIME / MTIME

读写文件时 atime/mtime 会相应变化, vfs 在 generic_file_read / write 处做了实现, 如果某个 fs 的文件读写不是通过 generic_file_read / write, 需要自己处理 ATIME/MTIME

1.2.3. file_operations

1.2.3.1. read

1.2.3.2. write

1.2.3.3. readdir

1.2.4. address_space_operations

1.2.5. 总结

四种 ops 的区别与联系:

s_op 代表对底层文件系统的真正的读写操作, 包括读写 inode, super_block 等. 以 inode 为例, 当上层修改了 inode 时, inode 被标记为 dirty, 最终在 sync 时通过 s_op 中的 write_inode 写回到设备上.
f_op 需要从 i_op 中分离出来, 可能有两方面原因:
1. file (fd) 不仅仅是文件 (inode) 的抽象, 也是 socket, timer 等的抽象, 它们共同的 "数据流读写" 部分被抽象为 file, 比如 read, write, mmap
2. 由于同一个 inode 可以通过多次打开对应多个 file, 和 file->f_pos 相关的操作必须分离到 f_op 中, 例如 read, write, readdir 等
f_op 可能主要代表上层对数据 (而非元数据) 的操作.
i_op 代表对单个 inode 的元数据的操作, 例如 lookup, create, permission, readlink, getattr …

f_op 中的 open, readdir, create, 以及 f_op 工作时用到的 permission 检查, path_lookup 等需要 i_op 去实现

另外, f_op 之外大量和文件相关的 syscall 例如 unlink, symlink, rename, getattr 等都需要 i_op 去实现.

i_op 中关于 inode 修改的操作都只针对内存中的数据, 真正的读写需要由 s_op 去实现.
a_op 可以看做是 "file 数据流抽象" 的一种通用实现.

若 f_op 中 read, write, mmap 等使用 generic 的实现 (例如 generic_file_read), 则 a_op 中相应的 readpage, writepage 等会被调用.

文件系统要支持 read, write 可以有两种选择:
1. 在 f_op 中实现自己的 read, write.
2. 使用 generic_file_read/write, 在 a_op 中实现 readpage, writepage
一般选择后者, 因为可以利用 pagecache.

1.3. Details

1.3.1. sys_mount

sys_mount 的主要功能是:

kern_mount
1. 通过 fs 注册的 get_sb 生成对应的 super_block
2. super_block 包含 fs 对应的 root dentry
3. 生成 vfsmount, 并把上一步生成的 root dentry 设置为 vfsmount 的 mnt_root
do_add_mount
1. 将 vfsmount 与某个 dentry 关联 (通过 vfsmount->mnt_mountpoint)

kern_mount
  do_kern_mount
    // 生成 vfsmount
    file_system_type *type = get_fs_type(fstype);
    mnt = alloc_vfsmnt(name);
    // 调用 get_sb, 其中 name 是 dev_name, sb->s_bdev 由 device
    // name 决定, 后续具体文件系统的大部分操作都需要通过 block device
    // driver 和 sb->s_bdev 打交道
    sb = type->get_sb(type, flags, name, data);
    mnt->mnt_sb = sb;
    mnt->mnt_root = dget(sb->s_root);
    mnt->mnt_mountpoint = sb->s_root;
    mnt->mnt_parent = mnt;

sys_mount:
  do_mount(dev_name, dir_name, ...)
    path_lookup(dir_name, LOOKUP_FOLLOW, &nd);
    do_new_mount(&nd, type_page, flags, mnt_flags,dev_name)
      // do_kern_mount 时不需要 nd
      do_kern_mount(type, flags, dev_name, data);
      do_add_mount(mnt, nd, mnt_flags, NULL);
        graft_tree(mnt, nd)
          attach_mnt(mnt, nd)
            mnt->mnt_parent = mntget(nd->mnt);
            mnt->mnt_mountpoint = dget(nd->dentry);

1.3.2. mount the root

1.3.2.1. Stage 1: mount rootfs

start_kernel:
  vfs_caches_init
    mnt_init
      init_rootfs();
      init_mount_tree();

init_rootfs:
  register_filesystem(&rootfs_fs_type);

init_mount_tree:
  mnt = do_kern_mount("rootfs", 0, "rootfs", NULL);
    rootfs_get_sb
      ramfs_fill_super
        // 初始的 / 是目录 (S_IFDIR)
        inode = ramfs_get_inode(sb, S_IFDIR | 0755, 0);
        root = d_alloc_root(inode);
        sb->s_root = root;
  namespace->root = mnt;
  // fs->root 设置为 mnt_root, 此后针对 "/" 的 lookup 会找到 mnt_root
  set_fs_pwd(current->fs, namespace->root, namespace->root->mnt_root);
  set_fs_root(current->fs, namespace->root, namespace->root->mnt_root);

1.3.2.2. Stage 2: mount real root

// kernel space `init`
init:
  // 尝试使用 initramfs
  populate_rootfs();
  if (sys_access((const char __user *) "/init", 0) == 0):
    // android 中 /init 的来源
    execute_command = "/init";
  else:
    // 尝试使用物理设备
    prepare_namespace();

1.3.2.2.1. populate_rootfs

populate_rootfs:
  // __initramfs_start 是 initramfs 镜像提前被加载到的内存位置
  // unpack_to_rootfs 需要做两件事:
  // 1. 对 __initramfs_start 位置的内容进行 gunzip
  // 2. 将对应的 cpio 内容加载到 rootfs 中, 所谓的 "加载", 实际就是通过 sys_open, sys_mkdir 等
  // 在 / 下临时生成文件和目录, 并通过 sys_chmod 等修改 owner 和权限
  // 参考: http://ieroot.com/2014/04/21/1534.html or  http://tinyurl.com/gslj9hf
  unpack_to_rootfs(__initramfs_start, __initramfs_end - __initramfs_start, 0);
    // 解析 initramfs 内容, 最后调用 do_name 在初始的 rootfs 中生成相应的文件
    write_buffer
      do_start
      do_header
      do_name
        // cpio 的末尾是 TRAILER!!!
        if (strcmp(collected, "TRAILER!!!") == 0):
          return;
        if (S_ISREG(mode)):
          sys_open(collected, O_WRONLY|O_CREAT, mode);
          sys_fchown(wfd, uid, gid);
          sys_fchmod(wfd, mode);
          do_copy
        else if (S_ISDIR(mode)):
          // xxx

1.3.2.2.2. prepare_namespace

若:

存在 initramfs, 且成功加载到 rootfs
initramfs 中包含 /init

则执行 userspace 的 init, 后者会负责 mount root 等动作, 否则, 执行 prepare_namespace

prepare_namespace:
  sys_mount("devfs", "/dev", "devfs", 0, NULL);
  // root_device_name 是 boot 时指定的 kernel 参数
  ROOT_DEV = name_to_dev_t(root_device_name);
  if (initrd_load()):
    goto out;  
  mount_root()
    create_dev("/dev/root", ROOT_DEV, root_device_name);
    mount_block_root("/dev/root", root_mountflags);
      char *fs_names = __getname();
      get_fs_names(fs_names);
      // 尝试所有支持的 fstype
      for (p = fs_names; *p; p += strlen(p)+1):
        err = do_mount_root(name, p, flags, root_mount_data);
        if error == 0:
          break;
  out:
  sys_mount(".", "/", NULL, MS_MOVE, NULL);
  sys_chroot(".");
    set_fs_root(current->fs, nd.mnt, nd.dentry);

do_mount_root:
  // 把 /dev/root mount 到 /root
  sys_mount(name, "/root", fs, flags, data);
  sys_chdir("/root");

1.3.2.2.3. initrd_load

initrd (initial RAM disk) 与 initramfs 类似, 都是一种镜像文件, 通过将镜像文件加载到内存做为基本的 root fs.

两者的主要区别是:

initramfs 本质是一个 gzip 压缩的 cpio 文件, 创建和读取都比较简单, 而且 kernel 读取 initramfs 时不必通过 page cache
initrd 是一个 ext2 的镜像文件, 需要 kernel 支持 ext2, 而且 ext2 的读取需要通过 page cache, 导致内存浪费.

1.3.2.2.4. 为什么需要 initrd / initramfs

可以提供一个只读的基本 root fs (比如 android 的作法)
将 kernel 中许多功能编译成模块打包到 initrd 中并按需加载, 以减小 kernel 的大小和内存占用
linux 安装时可以通过硬件检测将必要的模块打包到 initrd 中
initramfs 中的 /init 可以接管 prepare_namespace 中关于 mount real fs 的工作, 而 mount real fs 越来越复杂, 放在 userspace 更合理一些.

Quote from: https://www.kernel.org/doc/Documentation/filesystems/ramfs-rootfs-initramfs.txt

The move to early userspace is necessary because finding and mounting the real root device is complex. Root partitions can span multiple devices (raid or separate journal). They can be out on the network (requiring dhcp, setting a specific MAC address, logging into a server, etc). They can live on removable media, with dynamically allocated major/minor numbers and persistent naming issues requiring a full udev implementation to sort out. They can be compressed, encrypted, copy-on-write, loopback mounted, strangely partitioned, and so on.

This kind of complexity (which inevitably includes policy) is rightly handled in userspace.

1.3.2.2.5. 为什么需要先 mount 一个 rootfs

对于需要使用 initramfs 或 initrd 的情形, rootfs 显然是必要的
即使不需要 initramfs, rootfs 也使得后续所有的 mount 相关操作都是一致的, 不需要考虑 "/" 是否存在的情况, 比如 userspace 要 mount 新的 root 在逻辑上会变得非常简单.

1.3.3. path_lookup

1.3.3.1. path_lookup

path_lookup:
  if (*name=='/'):
    nd->mnt = current->fs->rootmnt;
    nd->dentry = current->fs->root;
  else:
    nd->mnt = current->fs->pwdmnt;
    nd->dentry = current->fs->pwd;
  // 记录 symlink 层数, 防止 follow_link 时死循环
  current->total_link_count = 0;
  link_path_walk(name, nd);

1.3.3.2. link_path_walk

link_path_walk:
  // ignore leading '/'
  while (*name=='/'):
    name++;
  // 如果 name 只包含 '/', 直接返回, nd 为 path_lookup 赋予的初值
  if (!*name):
    return
  inode = nd->dentry->d_inode;
  for(;;):
    // 针对当前 path component 的权限检查 (只需要 exec 权限)
    err = exec_permission_lite(inode, nd);
      // 若 inode 实现了自己的 permission, 返回 -EAGAIN, 期望调用者随后调用完整的 permission()
      if (inode->i_op && inode->i_op->permission):
        return -EAGAIN;
      if (current->fsuid == inode->i_uid):
        mode >>= 6;
      else if (in_group_p(inode->i_gid)):
        mode >>= 3;
      // lookup 时只需要 exec 权限
      if (mode & MAY_EXEC):
        goto ok;
    if (err == -EAGAIN):
      err = permission(inode, MAY_EXEC, nd);
    if err:
      return err
    // 从 name 中获得下一个 component name, 并且基于 component name 计算其 hash
    struct qstr this;
    hash = init_name_hash();
    do {
      name++;
      hash = partial_name_hash(c, hash);
      c = *(const unsigned char *)name;
    } while (c && (c != '/'));
    this.len = name - (const char *) this.name;
    this.hash = end_name_hash(hash);
    if (! c):
      // 最后一个 component
      // last_component 的代码和随后的代码基本相同, 只是 lookup 完后会返回, 不会继续循环
      goto last_component  

    // 开始 lookup
    // 对于 . 与 .. 的处理
    if name == ".":
      continue
    if name == "..":
      follow_dotdot(&nd->mnt, &nd->dentry);
      inode = nd->dentry->d_inode;
      continue
    // 真正的 lookup
    do_lookup(nd, &this, &next);
    // next 保存本次 lookup 的结果
    follow_mount(&next.mnt, &next.dentry);
    inode = next.dentry->d_inode;
    if (inode->i_op->follow_link):
      do_follow_link(next.dentry, nd);
      inode = nd->dentry->d_inode;
    else:
      // 针对本次 component 的 lookup 完成, 更新 nd
      nd->mnt = next.mnt;
      nd->dentry = next.dentry;
    if (!inode->i_op->lookup):
      break;

1.3.3.3. follow_dotdot

follow_dotdot: (mnt, dentry)
  while (1):
    // 当前 dentry 是 root fs 的根目录
    if (*dentry == current->fs->root && *mnt == current->fs->rootmnt):
      break                    /* ONE */
    // 当前 dentry 不是其所在的 fs 的根目录
    if (*dentry != (*mnt)->mnt_root):
      *dentry = (*dentry)->d_parent;
      break                     /* TWO */
    parent = (*mnt)->mnt_parent;
    // 当前 dentry 所在 fs 是 root fs
    if (parent == *mnt):
      break                     /* THREE */
    *dentry = (*mnt)->mnt_mountpoint;
    *mnt = parent;

  follow_mount(mnt, dentry);

1. cd /..
   在 ONE 处返回
2. cd /a/b/.., 其中 b 不是 mount point
   在 TWO 处返回
3. cd /a/b/.., 其中 b 是 mount point
   首次进入循环时, mnt 被设置为其 parent, dentry 被设置为 parent->mnt_mountpoint,
   即 parent mnt 中真正的 /a/b. 第二次进入循环时, 在 TWO 返回

1.3.3.3.1. 关于 follow_dotdot 最后的 follow_mount

一般情况下, follow_mount 是无意义的操作: 上面提到的三个例子都不需要使用 follow_mount. 但考虑下面的例子:

$> mkdir -p a/b
$> mkdir c
$> mkdir d
$> sudo mount -o bind c a/b

$> cd a/b
$> stat ..
  File: '..'
  Size: 4096            Blocks: 8          IO Block: 4096   directory
Device: 802h/2050d      Inode: 21717951    Links: 3

$> sudo mount -o bind ~/d ~/a

$> stat ..
~/a/b@dell-work> stat ..
  File: '..'
  Size: 4096            Blocks: 8          IO Block: 4096   directory
Device: 802h/2050d      Inode: 21725293    Links: 2

如果没有上面那句 follow_mount, 最后一次 stat 的结果会是 21717951 (a) 而不是 21725293 (d)

1.3.3.4. do_lookup

do_lookup:
  struct vfsmount *mnt = nd->mnt;
  // 查找 dcache, 其中 dentry+qstr 构成 hash key
  struct dentry *dentry = __d_lookup(nd->dentry, name);
  if (!dentry):
    dentry = real_lookup(nd->dentry, name, nd);
      result = dir->i_op->lookup(dir, dentry, nd);

1.3.3.5. follow_mount

follow_mount:
  // 这里是一个循环, 因为可能有多个 fs 被先后 mount 到同一个目录
  // 通过这个循环, follow_mount 会返回最后一次 mount 对应的 mnt 和 dentry
  while (d_mountpoint(*dentry)):
           return dentry->d_mounted;
    struct vfsmount *mounted = lookup_mnt(*mnt, *dentry);
      // lookup_mnt 从一个全局的 mount_hashtable 查找对应的 vfsmount
      // 其中 hash key 为 mnt+dentry
    // mounted 为找到的 vfsmount, mounted 的 mnt_parent 必定是 mnt, 且
    // mounted 的 mnt_mountpoint 必定是 dentry
    *mnt = mounted;
    *dentry = mounted->mnt_root;

1.3.3.6. do_follow_link

do_follow_link:
  // total_link_count 限制了一个完整的 link_path_walk 最多能处理 40 次 symlink
  if (current->total_link_count >= 40):
    return -ELOOP
  // current->link_count 用来限制 nested symlink, 因为 do_follow_link 会递归地
  // 调用 link_path_walk, 后者可能会再次调用到 do_follow_link
  if (current->link_count >= 5):
    return -ELOOP
  current->link_count++;
  current->total_link_count++;
  // i_op->follow_link 需要将 link 的结果通过 nd_set_link 保存起来,
  // 后续的 nd_get_link 可以读到这个结果
  dentry->d_inode->i_op->follow_link(dentry, nd);
  char *link = nd_get_link(nd);
  // follow_link 拿到对应的 link 后, 递归的调用 link_path_walk
  link_path_walk(link, nd);
  current->link_count--;

1.3.4. open

sys_open:
  fd = get_unused_fd();
  struct file *f = filp_open(tmp, flags, mode);
    open_namei(filename, namei_flags, mode, &nd);
      if (!(flag & O_CREAT)):
        path_lookup(pathname, lookup_flags(flag)|LOOKUP_OPEN, nd);
        goto ok;
      else:
        path_lookup(pathname, LOOKUP_PARENT|LOOKUP_OPEN|LOOKUP_CREATE, nd);
        vfs_create(dir->d_inode, dentry, mode, nd);
          may_create(dir, dentry, nd);
          // i_op->create 需要有定义
          if (!dir->i_op || !dir->i_op->create):
            return -EACCES;
          dir->i_op->create(dir, dentry, mode, nd);
          // for dnotify
          inode_dir_notify(dir, DN_CREATE);
      goto ok:
    ok:
      // check permissions 
      may_open()
        permission()
    dentry_open(nd.dentry, nd.mnt, flags);
      // 根据 dentry 生成 file
      f = get_empty_filp();
      // f->f_mapping 来自 inode->i_mapping
      // f_mapping 的类型是 address_space, f_mapping->a_ops
      // 包括真正的从设备读写数据的函数: 例如 readpage, writepage, direct_IO 等
      // 具体参考 pagecache 及 address_space_operations
      f->f_mapping = inode->i_mapping;
      f->f_dentry = dentry;
      f->f_vfsmnt = mnt;
      f->f_pos = 0;
      // f->f_op 来自 inode->i_fop
      f->f_op = fops_get(inode->i_fop);
      f->f_op->open(inode,f);

  fd_install(fd, f);
    files->fd[fd] = file;

1.3.5. readdir

1.3.6. write

sys_write:
  file = fget_light(fd, &fput_needed);
    file = files->fd[fd];
  loff_t pos = file_pos_read(file);
    return file->f_pos;
  ret = vfs_write(file, buf, count, &pos);
    if (!(file->f_mode & FMODE_WRITE)):
      return -EBADF;
    // f_op->write 需要有定义
    if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write)):
      return -EINVAL;
    ret = security_file_permission (file, MAY_WRITE);
    ret = file->f_op->write(file, buf, count, pos);
  file_pos_write(file, pos);
    file->f_pos = pos;

write 时并没有像 open 那样有类似 permission() 的权限检查. 但会有 security_file_permission 的检查, 例如 selinux

f_op->write 一般情况下为 generic_file_write, 后者会考虑 pagecache, 并最终调用到 a_ops 中的 writepage 等回调函数.

1.3.7. read

read 与 write 几乎完全一样, 只是把 f_op->write 换成 f_op->read

1.3.8. permission

permission()
  if (inode->i_op && inode->i_op->permission):
    inode->i_op->permission()
  else:
    // generic permission check for owner, group, other, ...
    generic_permission()

generic_permission:
  retval = generic_permission(inode, submask, NULL);
  // 666 或 751 等
  umode_t mode = inode->i_mode;
  if (current->fsuid == inode->i_uid):
    // current 进程是文件的 owner, 只取 mode 中和 owner 相关的部分
    mode >>= 6;
  else if (in_group_p(inode->i_gid)):
    mode >>= 3;
  if (((mode & mask & (MAY_READ|MAY_WRITE|MAY_EXEC)) == mask)):
    return 0;
  // 即使前面的检查失败, capability 也可以 override 前面的结果 (比如 root 用户)
  if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
    if (capable(CAP_DAC_OVERRIDE))
      return 0;

  return -EACCES
  // 若 i_op->permission 或 generic_permission 没通过, 则直接报错
  if (retval):
    return retval;  
  // 若上面的检查通过, 进行进一步的检查 (例如 selinux)
  return security_inode_permission(inode, mask, nd);

1.3.9. dentry cache

1.3.9.1. dentry_operations

1.3.9.1.1. d_revalidate

若 dcache 中的某个 dentry 实现了 d_op->d_revalidate, 则每次要重用 dcache 中的该 dentry 之前, 都需要调用 d_revalidate, 以确认该 dentry 是 valid (或 updated)

do_lookup:
  struct dentry *dentry = __d_lookup(nd->dentry, name);
  if (dentry->d_op && dentry->d_op->d_revalidate):
    goto need_revalidate;

need_revalidate:
  // d_revalidate 返回 1 表示 valid, 返回 0 表示 dentry 已经无效, 需要 real_lookup
  if (dentry->d_op->d_revalidate(dentry, nd)):
    goto done;
  d_invalidate(dentry)
  goto need_lookup;

need_lookup:
  dentry = real_lookup(nd->dentry, name, nd);

d_revalidate 的一个例子是 procfs 中的 proc_pid_lookup 返回的对应 pid 的 dentry, 这些 dentry 需要通过 pid_revalidate 进行 revalidate

pid_revalidate:
  struct inode *inode = dentry->d_inode;
  struct task_struct *task = proc_task(inode)
  if (pid_alive(task)):
    return 1
  else:
    return 0

1.3.10. page cache

如果一个文件系统想要使用 page cache, 则必须使用 generic_file_read 做为其 f_op->read.

generic_file_read 是大部分文件系统中默认的 f_op->read 的实现. 这个函数涉及到 IO 的大部分概念, 比如 page cache, specific file system, io scheduler 及 blk device driver

从 vfs 的角度出发, 只讨论 generic_file_read 和 page cache 相关的部分, 其它的主要是 IO 的概念 (比如 page 与 blocknr 的映射, bio, io_scheduler).

1.3.10.1. generic_file_read 如何使用 page cache?

generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos):
  struct iovec local_iov = { .iov_base = buf, .iov_len = count };
  // 1 表示只有一个 iovec
  __generic_file_aio_read(&kiocb, &local_iov, 1, ppos);
  // nr_segs 为 1
  for (seg = 0; seg < nr_segs; seg++):
    read_descriptor_t desc;
    desc.written = 0;
    desc.arg.buf = iov[seg].iov_base;
    desc.count = iov[seg].iov_len;
    // file_read_actor 是一个函数指针, 负责找到数据后将数据复制到 desc.arg.buf
    do_generic_file_read(filp,ppos,&desc,file_read_actor);
      // mapping 中找到 page cache 对应的 radix tree, filp 找到要读的文件, desc 找到目的 buf
      do_generic_mapping_read(struct address_space *mapping,
                        struct file_ra_state *_ra,
                        struct file *filp,
                        loff_t *ppos,
                        read_descriptor_t *desc,
                        read_actor_t actor)
        // 根据 ppos 算出对应的 page 的 index, page cache 是以 page 组织的 (4K)
        index = *ppos >> PAGE_CACHE_SHIFT;
        // 在 address_space 中查找 page
        page = find_get_page(mapping, index);
        if (page):
          // 调用 file_read_actor 复制数据到 desc.arg.buf, 完成此次 read
          actor(desc, page, offset, nr);
          return
        else:
          // page cache miss, 调用 a_ops->readpage, 这个函数是和具体文
          // 件系统相关的. 文件系统的主要功能是找到 page 与 blk device
          // driver 的 local block number 的对应关系 (所谓的 mapping
          // layer), 其它的工作由通用的 mpage_readpage 完成. refers to
          // [[IO.org]]
          mapping->a_ops->readpage(filp, page);

1.3.10.2. radix tree

page cache 使用 radix tree 来组织, 目的是根据 page index 可以很快的找到内存中对应的 page.

这个 radix tree 每级有 64 个 slots (2^6), 初始情况下 radix tree 高度为 1 (只有 64 个 slot), 高度随着 page index 的变大而变大, 32 位的 page index 最多需要一个 6 层的 radix tree (2+6*5)

radix tree 的 leaf node 保存着 page 的指针 (intermediate node 不保存), 所以 1 层的 radix tree 能表示的最大文件大小是 64*4k=256K, 2 层可以表示 256k*64=16M, 以此类推, 最大为 16T

1.4. File Systems

1.4.1. procfs

1.4.1.1. init

proc_root_init:
  register_filesystem(&proc_fs_type);
  kern_mount(&proc_fs_type);
  proc_misc_init();
  proc_mkdir("net", NULL);
  proc_mkdir(xxx)
  ...

1.4.1.1.1. kern_mount

proc_get_sb:
  sb = type->get_sb(type, flags, name, data);
  // sb 的 root inode 是一个对应的 proc_root 这个 proc_dir_entry 的 inode
  // 主要的, 它的 i_op 和 i_fop 直接来自 proc_root
  sb->s_root = proc_get_inode(s, PROC_ROOT_INO, &proc_root)
  // iget 从 inode cache 中查找, 若没找到, 调用 alloc_inode
  inode = iget(sb, ino);
  if (de->proc_iops):
    inode->i_op = de->proc_iops;
  if (de->proc_fops):
    inode->i_fop = de->proc_fops;

1.4.1.1.2. proc_misc_init

proc_misc_init
  create_proc_read_entry("loadavg", 0, NULL, loadavg_read_proc, NULL);
  // ...
  proc_symlink("mounts", NULL, "self/mounts");
  create_seq_entry("cpuinfo", 0, &proc_cpuinfo_operations);
  // ...
  entry = create_proc_entry("kmsg", S_IRUSR, &proc_root);
  entry->proc_fops = &proc_kmsg_operations;

proc_dir_entry

proc_dir_entry 相当于虚拟的 /proc 目录树中的结点

struct proc_dir_entry {
    unsigned int low_ino;
    unsigned short namelen;
    const char *name;
    mode_t mode;
    nlink_t nlink;
    uid_t uid;
    gid_t gid;
    unsigned long size;
    struct inode_operations * proc_iops;
    struct file_operations * proc_fops;
    get_info_t *get_info;
    struct module *owner;
    struct proc_dir_entry *next, *parent, *subdir;
    void *data;
    read_proc_t *read_proc;
    write_proc_t *write_proc;
    atomic_t count;             /* use count */
    int deleted;                /* delete flag */
};

proc_root

proc_root 这个特殊的 proc_dir_entry 对应于 "/proc":

struct proc_dir_entry proc_root = {
    .low_ino    = PROC_ROOT_INO, 
    .namelen    = 5, 
    .name               = "/proc",
    .mode               = S_IFDIR | S_IRUGO | S_IXUGO, 
    .nlink              = 2, 
    .proc_iops  = &proc_root_inode_operations, 
    .proc_fops  = &proc_root_operations,
    .parent             = &proc_root,
};

从上面的 do_kern_mount 可以看到, sb->s_root 这个 root inode 的主要信息都来自 proc_root

create_proc_entry

create_proc_entry, 可以简单理解为: 以 hard code 的形式在内存中建立 /proc 目录树.

create_proc_entry:
  ent = proc_create(&parent,name,mode,nlink);
  if (S_ISDIR(mode)):
    ent->proc_fops = &proc_dir_operations;
    ent->proc_iops = &proc_dir_inode_operations;
  proc_register(parent, ent)
    // inode number 是临时分配的
    i = get_inode_number();
    dp->low_ino = i;
    dp->next = dir->subdir;
    dp->parent = dir;
    dir->subdir = dp;
    if (S_ISDIR(dp->mode)):
      dp->proc_fops = &proc_dir_operations;
      dp->proc_iops = &proc_dir_inode_operations;
      dir->nlink++;
    else if (S_ISLNK(dp->mode)):
      if (dp->proc_iops == NULL):
        dp->proc_iops = &proc_link_inode_operations;
    else if (S_ISREG(dp->mode)):
      if (dp->proc_fops == NULL):
        dp->proc_fops = &proc_file_operations;
      if (dp->proc_iops == NULL):
        dp->proc_iops = &proc_file_inode_operations;

create_proc_read_entry 和 create_seq_entry 都是对 create_proc_entry 的简单封装.

proc_symlink

proc_symlink 在 create_proc_entry 基础上, 将 symlink 的信息写到了 proc_dir_entry 的 data 部分

struct proc_dir_entry *proc_symlink(const char *name, struct proc_dir_entry *parent, const char *dest):
  struct proc_dir_entry *ent;
  ent = proc_create(&parent,name,(S_IFLNK | S_IRUGO | S_IWUGO | S_IXUGO),1);
  ent->data = kmalloc((ent->size=strlen(dest))+1, GFP_KERNEL);
  strcpy((char*)ent->data,dest);
  proc_register(parent, ent)
    if (S_ISLNK(dp->mode)):
      dp->proc_iops = &proc_link_inode_operations;

xlate_proc_name

通过 xlate_proc_name("/xxx/yyy") 可以找到 "/proc/xxx/yyy" 对应的 proc_dir_entry, 在 create_proc_entry 或后面的 proc_mkdir 时, 都会使用 xlate_proc_name

1.4.1.1.3. proc_mkdir

proc_mkdir 与 create_proc_entry 基本类似

1.4.1.1.4. 总结

proc_root_init 所做的:

注册 proc 文件系统
do_kern_mount, 主要是生成 vfsmount 和 mnt_root 相关的信息, 其中 mnt_root 这个 inode 主要是根据 proc_root 中的信息生成的, 注意 do_kern_mount 并没有完成真正的 mount 的动作 (并没有与 mountpoint 关联), 真正的 mount 需要由 init 主动通过 mount 系统调用来完成.
通过 proc_create 之类的函数建立起一个由 proc_dir_entry 构成的虚拟的 /proc 目录树, proc_dir_entry 的数据和 inode 需要填充的数据基本一致.
另外, proc_root_init 时通过 proc_dir_entry 构造的目录树并不是 /proc 的全部数据: 其它模块或 proc 自身可能会动态的生成数据

1.4.1.2. xx_op

1.4.1.2.1. file_system_type

static struct file_system_type proc_fs_type = {
    .name               = "proc",
    .get_sb             = proc_get_sb,
    .kill_sb    = kill_anon_super,
};

1.4.1.2.2. file_system_type.get_sb

proc_get_sb:

super_block *proc_get_sb(struct file_system_type *fs_type,...)
  return get_sb_single(fs_type, flags, data, proc_fill_super);

1.4.1.2.3. fill_super

proc_fill_super:

int proc_fill_super(struct super_block *s, void *data, int silent)
  ...  
  s->s_magic = PROC_SUPER_MAGIC;
  s->s_op = &proc_sops;
  root_inode = proc_get_inode(s, PROC_ROOT_INO, &proc_root);
  s->s_root = d_alloc_root(root_inode);

1.4.1.2.4. super_operations

proc_sops:

static struct super_operations proc_sops = { 
    .alloc_inode        = proc_alloc_inode,
    .destroy_inode      = proc_destroy_inode,
    .read_inode = proc_read_inode,
    .drop_inode = generic_delete_inode,
    .delete_inode       = proc_delete_inode,
    .statfs             = simple_statfs,
    .remount_fs = proc_remount,
};

1.4.1.2.5. i_op for /proc root

static struct inode_operations proc_root_inode_operations = {
        .lookup         = proc_root_lookup,
};

1.4.1.2.6. i_fop for /proc root

static struct file_operations proc_root_operations = {
.read            = generic_read_dir,
    .readdir     = proc_root_readdir,
};

1.4.1.2.7. i_fop for generic proc dir

/* file_operations used for generic proc directories. */
static struct file_operations proc_dir_operations = {
    .read    = generic_read_dir,
    .readdir = proc_readdir,
};

1.4.1.2.8. i_op for generic proc dir

/* inode_operations used for generic proc directories. */
static struct inode_operations proc_dir_inode_operations = {
    .lookup  = proc_lookup,
    .setattr = proc_notify_change,
};

1.4.1.2.9. i_fop for generic proc file

/* file_operations used for generic proc files. */
static struct file_operations proc_file_operations = {
    .llseek = proc_file_lseek,
    .read   = proc_file_read,
    .write  = proc_file_write,
};

1.4.1.2.10. i_op for generic proc file

/* inode_operations used for generic proc files; only setattr is provided. */
static struct inode_operations proc_file_inode_operations = {
    .setattr = proc_notify_change,
};

1.4.1.2.11. 和 /proc/<pid> 相关的 i_op/i_fop

/proc/<pid> 下的许多 inode , 都有其独立的 i_op 和 i_fop

1.4.1.3. super_operations

1.4.1.3.1. alloc_inode

static struct inode *proc_alloc_inode(struct super_block *sb):
  struct proc_inode *ei;
  struct inode *inode;
  ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, SLAB_KERNEL);
  ei->task = NULL;
  ei->type = 0;
  ei->op.proc_get_link = NULL;
  ei->pde = NULL;
  inode = &ei->vfs_inode;
  inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
  return inode;

1.4.1.3.2. read_inode

procfs 的 proc_read_inode 并没有做太多工作, 因为 inode 信息的填充主要在上层的 proc_get_inode 中完成.

1.4.1.3.3. statfs

1.4.1.4. inode_operations

1.4.1.4.1. lookup

对于 proc 来说, /proc 下许多目录都有其独立的 lookup 函数

proc_root_lookup

/proc 根目录下本身的 lookup 函数

proc_root_lookup
  // 首先查找非 <pid> 之类的 inode
  if (!proc_lookup(dir, dentry, nd)):
    return null
  // 若没有找到, 查找 <pid> 类型的 inode
  return proc_pid_lookup(dir, dentry, nd);

proc_lookup

/proc 一级目录下所有非 <pid> 的 inode 都是通过 proc_lookup 查找的. 具体地, 通过遍历之前由 create_proc_entry 创建的虚拟目录树来查找.

proc_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd):
  int error = -ENOENT;
  // dir 是 mnt_root, dentry 包括要查找的名字, nd 用来保存查找结果
  de = PDE(dir);
  for (de = de->subdir; de ; de = de->next):
    if (!memcmp(dentry->d_name.name, de->name, de->namelen)):
      unsigned int ino = de->low_ino;
      // proc_get_inode 基本就是将 proc_dir_entry 转换为相应的 inode
      inode = proc_get_inode(dir->i_sb, ino, de);
      break;
  if (inode):
    // 添加 <dentry, inode> 到 dcache,有了 dcache, 下次 lookup 时将不会再调用 proc_lookup
    d_add(dentry, inode);
    return null;
  // 若 lookup 失败, 返回 -ENOENT, 而不是返回 null
  return error;

因为 proc_lookup 时, 对于不存在的 inode 会返回 -ENOENT, 所以 /proc 根目录无法新建文件.

/proc@dell-work> touch a
touch: cannot touch 'a': No such file or directory

proc_pid_lookup

因为 /proc 下的 <pid> 之类的 inode 是动态变化的, 无法通过 create_proc_entry 添加到虚拟的 /proc 目录树中, 所以需要通过 proc_pid_lookup 动态去查找

proc_pid_lookup
  if (dentry->d_name.len == 4 && !memcmp(dentry->d_name.name,"self",4)):
    // inode 的 ino 为 last_ino++ 
    inode = new_inode(dir->i_sb);
    inode->i_op = &proc_self_inode_operations;
    d_add(dentry, inode);
    return null;
  tgid = name_to_int(dentry);
  task = find_task_by_pid(tgid);
  inode = proc_pid_make_inode(dir->i_sb, task, PROC_TGID_INO);
    // new_inode 是 vfs 通用接口, 但它会调用到 sb 中的 alloc_inode, 具体的
    // proc_alloc_inode 会生成一个 proc_inode, 并返回其中的 vfs_inode,
    // 这也就是下面 PROC_I 的意义: proc_inode 在 inode 基础上附加了一些和
    // proc 自身相关的信息, 例如 task
    inode = new_inode(sb);
    struct proc_inode *ei = PROC_I(inode);
      container_of(inode, struct proc_inode, vfs_inode);
  inode->i_op = &proc_tgid_base_inode_operations;
  inode->i_fop = &proc_tgid_base_operations;
  d_add(dentry, inode);

proc_tgid_base_lookup

/proc/<pid> 目录的 lookup 函数

上一步的 proc_pid_lookup 函数负责将 /proc/<pid> inode 的 i_op 赋值为 proc_tgid_base_inode_operations, 后者的定义为:

/*
 * inode_operations that proc_pid_lookup installs on a /proc/<pid> inode;
 * lookups below /proc/<pid> go through proc_tgid_base_lookup.
 */
static struct inode_operations proc_tgid_base_inode_operations = {
    .lookup = proc_tgid_base_lookup,
};

其中 proc_tgid_base_lookup 是 /proc/<pid> 的 lookup 函数

/*
 * Hard-coded table of the entries found directly under /proc/<pid>.
 * Each E() row carries the entry type, its name and its mode bits.
 * proc_pident_lookup scans the table by name and stops at the first row
 * whose name is NULL, so the elided tail must end with such a sentinel row.
 */
static struct pid_entry tgid_base_stuff[] = {
    E(PROC_TGID_TASK,      "task",    S_IFDIR|S_IRUGO|S_IXUGO),
    E(PROC_TGID_FD,        "fd",      S_IFDIR|S_IRUSR|S_IXUSR),
    E(PROC_TGID_ENVIRON,   "environ", S_IFREG|S_IRUSR),
    E(PROC_TGID_AUXV,      "auxv",    S_IFREG|S_IRUSR),
    E(PROC_TGID_STATUS,    "status",  S_IFREG|S_IRUGO),
    E(PROC_TGID_CMDLINE,   "cmdline", S_IFREG|S_IRUGO),
    E(PROC_TGID_STAT,      "stat",    S_IFREG|S_IRUGO),
    E(PROC_TGID_STATM,     "statm",   S_IFREG|S_IRUGO),
    // ...
};

struct dentry *proc_tgid_base_lookup(inode *dir, dentry *dentry, nameidata *nd):
  // dir 代表 /proc/<pid> 这个 inode, 其对应的 proc_inode 包括对应的 task
  // 可以对应到一个进程
  // dentry 包括要查找的名字
  // nd 返回查找结果
  // tgid_base_stuff 是一个 hard code 的数组  
  proc_pident_lookup(dir, dentry, tgid_base_stuff);
    struct task_struct *task = proc_task(dir);
    // ents 即 tgid_base_stuff
    for (p = ents; p->name; p++):
      if (!memcmp(dentry->d_name.name, p->name, p->len)):
        break;
    inode = proc_pid_make_inode(dir->i_sb, task, p->type);
      inode = new_inode(sb);
      inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
      inode->i_ino = fake_ino(task->pid, ino);
      ei = PROC_I(inode);
      // 这里的 task 赋值很重要, 它标识了这个 inode, 后面针对这个 inode 的回调
      // 一般都需要这个 task, 比如 proc_lookupfd 
      ei->task = task;
      ei->type = ino;
      inode->i_uid = 0;
      inode->i_gid = 0;
    switch(p->type):
      case PROC_TGID_TASK:
        inode->i_op = &proc_task_inode_operations;
        inode->i_fop = &proc_task_operations;
      case PROC_TID_FD:
      case PROC_TGID_FD:
        inode->i_op = &proc_fd_inode_operations;
        inode->i_fop = &proc_fd_operations;
      // ...
    d_add(dentry, inode);

1.4.1.4.2. permission

procfs 提供了自己的 permission 实现: proc_permission

1.4.1.4.3. readlink

/proc 中至少有三个地方用到了 symlink:

/proc/self
/proc/<pid>/fd/<fd>
/proc/mounts

在具体的文件系统中(例如 ext2), symlink 的路径是直接保存在 raw inode 中的, 不需要单独的 data block, proc 处理 symlink 时与 ext2 不太一样.

/proc/self

struct dentry * proc_pid_lookup:
  if (dentry->d_name.len == 4 && !memcmp(dentry->d_name.name,"self",4)):
    inode = new_inode(dir->i_sb);
    inode->i_mode = S_IFLNK|S_IRWXUGO;
    inode->i_op = &proc_self_inode_operations;
    // ...

proc_self_readlink(struct dentry *dentry, char __user *buffer,int buflen):
    char tmp[30];
    sprintf(tmp, "%d", current->tgid);
    return vfs_readlink(dentry,buffer,buflen,tmp);

/proc/<pid>/fd/<fd>

proc_lookupfd:
  // 该 inode 对应 /proc/<pid>/fd/<fd>, 其对应的 fd 被编码到 proc_inode->type 中
  inode = proc_pid_make_inode(dir->i_sb, task, PROC_TID_FD_DIR+fd);
  inode->i_mode = S_IFLNK;
  // 后续 proc_pid_link_inode_operations->readlink 会调用 proc_fd_link 得到 symlink 的数据
  inode->i_op = &proc_pid_link_inode_operations;
  ei->op.proc_get_link = proc_fd_link;

proc_fd_link:
  struct task_struct *task = proc_task(inode);
  int fd = proc_type(inode) - PROC_TID_FD_DIR;
  files = get_files_struct(task);
  file = fcheck_files(files, fd);
    return files->fd[fd];
  *mnt = mntget(file->f_vfsmnt);
  *dentry = dget(file->f_dentry);

/proc/mounts

proc_root_init 时, 通过 proc_symlink 建立了一个 proc_dir_entry , 这个 entry 是 /proc/mounts 到 /proc/self/mounts 的 symlink, 具体参考前面 proc_symlink

proc_follow_link:
  // symlink 的 target 保存在 proc_dir_entry->data 中 (PDE() 返回 inode 对应的 proc_dir_entry)
  nd_set_link(nd, PDE(dentry->d_inode)->data);
    nd->saved_names[nd->depth] = path;

generic_readlink:
  res = dentry->d_inode->i_op->follow_link(dentry, &nd);
  res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd));
    return nd->saved_names[nd->depth];

1.4.1.5. file_operations

1.4.1.5.1. readdir

readdir 的过程和 lookup 的过程类似, 根据不同的 inode 调用不同的 i_fop->readdir, 且 readdir 依据的数据的来源与 lookup 也是类似的:

proc_root_readdir 依赖 pid 列表和 create_proc_entry 构造的目录树
proc_tgid_base_readdir 依赖于 tgid_base_stuff 数组
…

1.4.1.6. 总结

vfs 由一系列的回调函数构成, 由最初的 file_system_type 到 super_operations 再到 inode_operations 和 file_operations
vfs 中最重要的一个回调可能要算 lookup 了, 因为最重要的 inode 结构体是由它生成的.
procfs 实现上使用多个 iop 和 fop 来处理不同的 inode, 这样扩展性好但比较繁琐, 另一种实现方案可能是提供统一的 iop 和 fop, 但这个 iop/fop 内部可能需要大量的 switch-case 来区别不同的 inode

1.4.2. debugfs

1.4.3. ramfs

ramfs 是基于内存的文件系统, 因为 kernel 本身有 dcache 和 page cache, 所以实现 ramfs 并不需要很多代码: ramfs 中大部分 x_op 都是直接操作 dcache 和 pagecache.

1.4.3.1. lookup

ramfs 的 lookup 被定义为 simple_lookup, 而 simple_lookup 只是简单的返回 NULL, 即 negative dentry … 嗯?

由于 ramfs 大量操作是建立在 dcache 基础上, 回顾 do_lookup 的过程:

do_lookup:
  // 查找 dcache, 对于 ramfs 来说, 如果 __d_lookup 查不到结果, 说明之
  // 前没有 add 过这个 dentry (通过 mkdir 或 create, symlink 等), 此时
  // do_lookup 其实已经可以直接返回 negative entry 了, 但由于
  // do_lookup 实现在 vfs 层,而 vfs 会调用 real_lookup 尝试从 backing
  // store 去 lookup 所以 ramfs 偷懒使用了 simple_lookup 直接返回 NULL
  struct dentry *dentry = __d_lookup(nd->dentry, name);
  if (!dentry):
    dentry = real_lookup(nd->dentry, name, nd);
      ramfs->simple_lookup

1.4.3.2. create

ramfs_mknod, 基本就是直接 make inode

1.4.3.3. readdir

dcache_readdir, 基本就是直接遍历 dcache

1.4.4. ext2

refers to

1.5. Appendix

1.5.1. special inode

除了普通文件和目录, 还存在一些特殊的文件:

char device
blk device
pipe
socket

inode->i_mode 是一个 short int, 它的低位保存权限 (rwx), 高位保存着文件的类型, 例如

/* File-type values kept in the high bits of i_mode (octal). */
#define S_IFSOCK 0140000
#define S_IFLNK  0120000
#define S_IFREG  0100000
#define S_IFBLK  0060000
#define S_IFDIR  0040000
#define S_IFCHR  0020000
#define S_IFIFO  0010000
/*
 * Not file types: the set-user-ID, set-group-ID and sticky bits
 * (see POSIX <sys/stat.h>), which sit just above the rwx permission bits.
 */
#define S_ISUID  0004000
#define S_ISGID  0002000
#define S_ISVTX  0001000

1.5.1.1. char device

ext2_mknod():
  inode = ext2_new_inode()
  // rdev 保存着 mknod 传入的设备号
  init_special_inode(inode, inode->i_mode, rdev);
    if (S_ISCHR(mode)):
      inode->i_fop = &def_chr_fops;
      inode->i_rdev = rdev;

/*
 * Default file_operations that init_special_inode installs as i_fop on
 * char-device inodes; chrdev_open later swaps filp->f_op for the driver's ops.
 */
struct file_operations def_chr_fops = {
    .open = chrdev_open,
};

ext2_read_inode (struct inode * inode):
  // ...
  struct ext2_inode * raw_inode = ext2_get_inode(inode->i_sb, ino, &bh);
  inode->i_mode = le16_to_cpu(raw_inode->i_mode);
  if (S_ISREG(inode->i_mode)):
    // ...
  else:
    // i_block 保存着设备号
    init_special_inode(inode, inode->i_mode, old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));


chrdev_open():
  p = inode->i_cdev;
  // 第一次打开
  if (!p):
    // inode->i_rdev 是 mknod 时指定的设备号
    kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx);
    new = container_of(kobj, struct cdev, kobj);
    inode->i_cdev = p = new;
    inode->i_cindex = idx;
    list_add(&inode->i_devices, &p->list);

  // f_op 重置为 cdev->ops (之前为 inode->i_fop, 即 def_chr_fops)
  filp->f_op = fops_get(p->ops);
  if (filp->f_op->open):
    // driver 实现的 open
    ret = filp->f_op->open(inode,filp);

可见, char/blk device 设备文件需要文件系统的支持, 主要是两点:

文件系统需要能区分出这种类型的文件 (能给 inode->i_mode 相应的值)
需要能保存设备号 (能给 inode->i_rdev 相应的值)

1.5.1.2. blk device

1.5.2. inode->i_size

1.5.2.1. ext2

1.5.2.1.1. dir

ext2 目录的大小是随着目录中 entry 的多少变化的.

新建空目录

ext2_mkdir:
  inode = ext2_new_inode (dir, S_IFDIR | mode);
  ext2_make_empty(inode, dir);
    chunk_size = ext2_chunk_size(inode);
      // s_blocksize 与 fs 的配置有关, 一般为 4K
      return inode->i_sb->s_blocksize;
    ext2_commit_chunk(page, 0, chunk_size);
      // from 为 0, to 为 chunk_size (4K)
      page->mapping->a_ops->commit_write(NULL, page, from, to);
        generic_commit_write(file, page, from, to)
          // inode->i_size 初始为 0
          if (pos > inode->i_size):
            i_size_write(inode, pos);
              // I GOT YOU! 新建目录大小为 4K
              inode->i_size = i_size;
            mark_inode_dirty(inode);
  // 修改新建目录的父目录, 早期版本的 linux 称为 ext2_add_entry
  ext2_add_link(dentry, inode);

ext2_create:
  ext2_add_nondir(dentry, inode);
    ext2_add_link(dentry, inode);

ext2_add_link:
  from = (char*)de - (char*)page_address(page);
  // rec_len 和 entry 的名字长度有关
  to = from + rec_len;
  ext2_commit_chunk(page, from, to);

1.5.2.1.2. symlink

symlink 通过 ext2_symlink 生成

ext2_symlink:
  l = strlen(symname)+1;
  if (l > sizeof (EXT2_I(inode)->i_data)):
    struct ext2_inode_info {
      __le32    i_data[15];
      // ...
    }
    // 若 symname 长度 (含结尾的 '\0') 大于 60 (15*4), 即 strlen(symname) >= 60, 则使用 slow symlink
    // 所谓 slow symlink, 是指使用 data block 来保存 symname
    page_symlink(inode, symname, l);
    mapping->a_ops->commit_write(NULL, page, 0, len-1);
  else:
    // fast symlink, symname 被保存在 i_data 中
    memcpy((char*)(EXT2_I(inode)->i_data),symname,l);
    inode->i_size = l-1;

关于 fast/slow symlink 的一个实验:

~@dell-work> ln -s "123456789012345678901234567890123456789012345678901234567890" a

~@dell-work> stat a
  File: 'a' -> '123456789012345678901234567890123456789012345678901234567890'
  Size: 60              Blocks: 8          IO Block: 4096   symbolic link
Device: 802h/2050d      Inode: 21500456    Links: 1

~@dell-work> rm a
rm: remove symbolic link 'a'? y

~@dell-work> ln -s "12345678901234567890123456789012345678901234567890123456789" a

~@dell-work> stat a
  File: 'a' -> '12345678901234567890123456789012345678901234567890123456789'
  Size: 59              Blocks: 0          IO Block: 4096   symbolic link
Device: 802h/2050d      Inode: 21500456    Links: 1

其中, symname 长度为 60 时, Blocks = 8, 表示使用了 8 个 sector: 4096/512 = 8 (sector 是磁盘的物理属性, 一般为 512 字节, 而 block 是 fs 抽象的属性, 在 mkfs 时可以指定)

另外, ext2_read_inode 时如何区别 fast/slow symlink?

ext2_read_inode:
  if (S_ISLNK(inode->i_mode)):
    if (ext2_inode_is_fast_symlink(inode)):
      inode->i_op = &ext2_fast_symlink_inode_operations;
    else:
      inode->i_op = &ext2_symlink_inode_operations;

ext2_inode_is_fast_symlink:
  int ea_blocks = EXT2_I(inode)->i_file_acl ? (inode->i_sb->s_blocksize >> 9) : 0;
  return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);

1.5.2.1.3. regular file

1.5.2.1.4. char/blk device

mknod 时 i_size 没有赋值, 所以大小为 0

1.5.2.1.5. fifo

1.5.2.1.6. socket

1.5.2.2. proc

1.5.3. .(single dot) 与 ..(double dots)

对于 ext2 来说, single dot 与 double dots 是真实存在于磁盘上的, 而并非程序模拟出来的 place holder

ext2_mkdir:
  inode = ext2_new_inode (dir, S_IFDIR | mode);
  ext2_make_empty(inode, parent);
    // inode 是新建的目录, parent 是父目录
    kaddr = kmap_atomic(page, KM_USER0);
    de = (struct ext2_dir_entry_2 *)kaddr;
    de->name_len = 1;
    de->rec_len = cpu_to_le16(EXT2_DIR_REC_LEN(1));
    memcpy (de->name, ".\0\0", 4);
    de->inode = cpu_to_le32(inode->i_ino);

    de = (struct ext2_dir_entry_2 *)(kaddr + EXT2_DIR_REC_LEN(1));
    de->name_len = 2;
    de->rec_len = cpu_to_le16(chunk_size - EXT2_DIR_REC_LEN(1));
    de->inode = cpu_to_le32(parent->i_ino);
    memcpy (de->name, "..\0", 4);

可见, single dot 与 double dots 是包含在 dir entry 中的, 并且它们对应的 inode 在 mkdir 时即被指定.

1.5.3.1. 测试

根据前面关于 path_lookup 的描述, single dot 与 double dots 在 path_walk 时是作为特例被处理的, 比如, 处理到 double dots 时, 会直接 climb up (并不需要根据 double dots 对应的 inode, 并且会考虑 mount), 那么可能存在这种问题:

getdents 或 readdir 返回的 double dots 对应的 linux_dirent 结构体中, d_ino 对应的应该是上面 mkdir 时被指定的 ino, 但如果当前目录被 mount 到另一个地方, 对 double dots 进行 path_lookup 时, 对应 double dots 的 inode 可能是和 mnt_parent 对应的 ino, 两者会不同?

#include <sys/types.h>
#include <dirent.h>
#include <stdio.h>

/*
 * Print the name and inode number of the first entry that readdir()
 * returns for the directory named by argv[1].
 *
 * Returns 0 on success, 1 on a usage error or when the directory
 * cannot be opened.
 */
int main(int argc, char *argv[])
{
    if (argc < 2) {
        fprintf(stderr, "usage: %s <dir>\n", argv[0]);
        return 1;
    }

    DIR *d = opendir(argv[1]);
    if (d == NULL) {
        perror(argv[1]);
        return 1;
    }

    struct dirent *ret = readdir(d);
    if (ret) {
        /* d_ino is an ino_t, not an int: widen it and print it unsigned */
        printf("%s %llu\n", ret->d_name, (unsigned long long)ret->d_ino);
    }

    closedir(d);
    return 0;
}

$> mkdir test

// a.out 底层使用 readdir
$> ./a.out test
.. 21504751

// stat 底层使用 path_lookup
$> stat test/..
  File: 'test/..'
  Size: 12288           Blocks: 24         IO Block: 4096   directory
Device: 802h/2050d      Inode: 21504751    Links: 121

$> stat .
  File: '.'
  Size: 12288           Blocks: 24         IO Block: 4096   directory
Device: 802h/2050d      Inode: 21504751    Links: 121

$> mkdir -p a/b

$> sudo mount -o bind test a/b

$> ./a.out a/b
.. 21504751

$> stat a/b/..
  File: 'a/b/..'
  Size: 4096            Blocks: 8          IO Block: 4096   directory
Device: 802h/2050d      Inode: 21725291    Links: 3

$> stat a
  File: 'a'
  Size: 4096            Blocks: 8          IO Block: 4096   directory
Device: 802h/2050d      Inode: 21725291    Links: 3

不过这种不一致看起来可以用来判断一个 entry 是不是 mount point …

1.5.4. atime / mtime

读写文件时 atime/mtime 会相应变化, vfs 在 generic_file_read / write 处做了实现, 如果某个 fs 的文件读写不是通过 generic_file_read / write, 需要自己处理 atime/mtime

do_generic_mapping_read:
  file_accessed(filp);
    touch_atime(file->f_vfsmnt, file->f_dentry)
      update_atime(dentry->d_inode);
        inode->i_atime = now;

generic_file_write:
  __generic_file_write_nolock
    __generic_file_aio_write_nolock
      inode_update_time
        inode->i_mtime = now;

1.5.5. ramfs, rootfs, initramfs

https://www.kernel.org/doc/Documentation/filesystems/ramfs-rootfs-initramfs.txt

1.5.6. getcwd

getcwd 显然依赖于 fs_struct->pwdmnt, 但 pwdmnt 作为 vfsmount 只包含一个 dentry 信息, 并不包含完整路径, getcwd 是如何获得完整路径的?

__d_path:
  // Excerpt: walk from the starting dentry towards the root, one path
  // component per iteration (end / namelen are set up outside this excerpt).
  for (;;) {
      struct dentry * parent;
      // Reached the root dentry of the current mount.
      if (dentry == vfsmnt->mnt_root ) {
          // A mount that is its own parent has nothing above it: done.
          if (vfsmnt->mnt_parent == vfsmnt) {
              goto global_root;
          }
          // Otherwise cross the mount boundary: continue from the
          // mountpoint dentry inside the parent mount.
          dentry = vfsmnt->mnt_mountpoint;
          vfsmnt = vfsmnt->mnt_parent;
          continue;
      }
      parent = dentry->d_parent;
      // Copy this component's name into the output buffer, then step up.
      memcpy(end, dentry->d_name.name, namelen);
      dentry = parent;
  }

和 lookup 时的 follow_dotdot 差不多.

由于 __d_path 使用 dentry->d_parent 获得完整路径, 所以估计 dcache 在 drop cache 时需要保证 dentry A 的 parent 不能先于 dentry A 被 drop?