linux文件系统解读-摩杜云开发者社区

解读linux的文件系统, 内核版本为 linux-2.1.129

文件系统核心抽象概念:

一. super_block

名字的由来是磁盘特定扇区中的超级块, 该对象记录文件系统本身的特性.

例如用ext4格式化磁盘后, 磁盘的超级块里就保存了ext4文件系统的信息.

/*
 * In-memory superblock: describes one filesystem instance (see the
 * read_super() call in do_mount()).
 *
 * The leading fields are filesystem-independent; the trailing union "u"
 * reserves room for the private data of each supported filesystem type,
 * or an opaque pointer through generic_sbp.
 *
 * Only fields whose role is visible elsewhere in this article are
 * annotated; the others are reproduced as in the kernel source.
 */
struct super_block {
  struct list_head  s_list;    /* Keep this first */
  kdev_t      s_dev;    /* copied into vfsmount.mnt_dev by add_vfsmnt() */
  unsigned long    s_blocksize;
  unsigned char    s_blocksize_bits;
  unsigned char    s_lock;
  unsigned char    s_rd_only;
  unsigned char    s_dirt;
  struct file_system_type  *s_type;
  struct super_operations  *s_op;    /* method table, filled in by the fs read_super() */
  struct dquot_operations  *dq_op;
  unsigned long    s_flags;    /* copied into vfsmount.mnt_flags by add_vfsmnt() */
  unsigned long    s_magic;
  unsigned long    s_time;
  struct dentry    *s_root;    /* root dentry; do_mount() attaches it to the mount point */
  struct wait_queue  *s_wait;

  struct inode    *s_ibasket;
  short int    s_ibasket_count;
  short int    s_ibasket_max;
  struct list_head  s_dirty;  /* dirty inodes */

  /* Per-filesystem private part: one member per filesystem type. */
  union {
    struct minix_sb_info  minix_sb;
    struct ext2_sb_info  ext2_sb;
    struct hpfs_sb_info  hpfs_sb;
    struct ntfs_sb_info     ntfs_sb;
    struct msdos_sb_info  msdos_sb;
    struct isofs_sb_info  isofs_sb;
    struct nfs_sb_info  nfs_sb;
    struct sysv_sb_info  sysv_sb;
    struct affs_sb_info  affs_sb;
    struct ufs_sb_info  ufs_sb;
    struct romfs_sb_info  romfs_sb;
    struct smb_sb_info  smbfs_sb;
    struct hfs_sb_info  hfs_sb;
    struct adfs_sb_info  adfs_sb;
    struct qnx4_sb_info  qnx4_sb;     
    void      *generic_sbp;
  } u;
};

/*
 * Superblock method table, reached through super_block.s_op.
 *
 * Each member is a callback supplied by the filesystem implementation.
 * The first group takes an inode, the second a dentry, the remainder
 * the superblock itself. The exact contract of each callback is not
 * shown in this excerpt -- NOTE(review): confirm against the kernel's
 * VFS documentation before relying on the names alone.
 */
struct super_operations {
  /* inode-level callbacks */
  void (*read_inode) (struct inode *);
  void (*write_inode) (struct inode *);
  void (*put_inode) (struct inode *);
  void (*delete_inode) (struct inode *);
  /* attribute change on a dentry */
  int (*notify_change) (struct dentry *, struct iattr *);
  /* superblock-level callbacks */
  void (*put_super) (struct super_block *);
  void (*write_super) (struct super_block *);
  int (*statfs) (struct super_block *, struct statfs *, int);
  int (*remount_fs) (struct super_block *, int *, char *);
  void (*clear_inode) (struct inode *);
  void (*umount_begin) (struct super_block *);
};

struct file_system_type {
  const char *name;
  int fs_flags;
  struct super_block *(*read_super) (struct super_block *, void *, int);
  struct file_system_type * next;
};

二. inode

记录文件的元信息, 该对象需要保存在磁盘上以实现持久化.

/*
 * In-memory inode: the metadata of one file.
 *
 * Generic fields come first; the trailing union "u" holds the private
 * per-filesystem (or pipe/socket) part, or an opaque pointer through
 * generic_ip.
 *
 * Only fields whose role is visible elsewhere in this article are
 * annotated; the others are reproduced as in the kernel source.
 */
struct inode {
  struct list_head  i_hash;
  struct list_head  i_list;
  struct list_head  i_dentry;

  unsigned long    i_ino;
  unsigned int    i_count;
  kdev_t      i_dev;
  umode_t      i_mode;    /* file type bits tested with S_ISDIR/S_ISLNK/S_ISBLK/... */
  nlink_t      i_nlink;
  uid_t      i_uid;
  gid_t      i_gid;
  kdev_t      i_rdev;    /* device number; sys_mount() reads it from a block-device inode */
  off_t      i_size;
  time_t      i_atime;
  time_t      i_mtime;
  time_t      i_ctime;
  unsigned long    i_blksize;
  unsigned long    i_blocks;
  unsigned long    i_version;
  unsigned long    i_nrpages;
  struct semaphore  i_sem;    /* taken by real_lookup() around the directory lookup */
  struct semaphore  i_atomic_write;
  struct inode_operations  *i_op;    /* method table; may be NULL (callers check) */
  struct super_block  *i_sb;
  struct wait_queue  *i_wait;
  struct file_lock  *i_flock;
  struct vm_area_struct  *i_mmap;
  struct page    *i_pages;
  struct dquot    *i_dquot[MAXQUOTAS];

  unsigned long    i_state;

  unsigned int    i_flags;
  unsigned char    i_pipe;
  unsigned char    i_sock;

  int      i_writecount;
  unsigned int    i_attr_flags;
  /* Private part: one member per filesystem type, plus pipes and sockets. */
  union {
    struct pipe_inode_info    pipe_i;
    struct minix_inode_info    minix_i;
    struct ext2_inode_info    ext2_i;
    struct hpfs_inode_info    hpfs_i;
    struct ntfs_inode_info          ntfs_i;
    struct msdos_inode_info    msdos_i;
    struct umsdos_inode_info  umsdos_i;
    struct iso_inode_info    isofs_i;
    struct nfs_inode_info    nfs_i;
    struct sysv_inode_info    sysv_i;
    struct affs_inode_info    affs_i;
    struct ufs_inode_info    ufs_i;
    struct romfs_inode_info    romfs_i;
    struct coda_inode_info    coda_i;
    struct smb_inode_info    smbfs_i;
    struct hfs_inode_info    hfs_i;
    struct adfs_inode_info    adfs_i;
    struct qnx4_inode_info    qnx4_i;     
    struct socket      socket_i;
    void        *generic_ip;
  } u;
};

/*
 * Inode method table, reached through inode.i_op.
 *
 * Callbacks are supplied by the filesystem. Callers in this article
 * check both the table pointer and the individual member for NULL
 * before calling (see open_namei() and lookup_dentry()).
 */
struct inode_operations {
  /* file method table installed as file.f_op by filp_open() */
  struct file_operations * default_file_ops;
  /* called by open_namei() as create(dir inode, new dentry, mode); 0 on success */
  int (*create) (struct inode *,struct dentry *,int);
  /* called by real_lookup() as lookup(dir inode, new dentry); non-zero is an error */
  int (*lookup) (struct inode *,struct dentry *);
  int (*link) (struct dentry *,struct inode *,struct dentry *);
  int (*unlink) (struct inode *,struct dentry *);
  int (*symlink) (struct inode *,struct dentry *,const char *);
  int (*mkdir) (struct inode *,struct dentry *,int);
  int (*rmdir) (struct inode *,struct dentry *);
  int (*mknod) (struct inode *,struct dentry *,int,int);
  int (*rename) (struct inode *, struct dentry *,
      struct inode *, struct dentry *);
  int (*readlink) (struct dentry *, char *,int);
  struct dentry * (*follow_link) (struct dentry *, struct dentry *, unsigned int);
  int (*readpage) (struct file *, struct page *);
  int (*writepage) (struct file *, struct page *);
  int (*bmap) (struct inode *,int);
  void (*truncate) (struct inode *);
  int (*permission) (struct inode *, int);
  int (*smap) (struct inode *,int);
  int (*updatepage) (struct file *, struct page *, unsigned long, unsigned int, int);
  int (*revalidate) (struct dentry *);
};

三. dentry

目录项, 记录文件名和inode节点间的关系. 按路径查找文件时使用dentry, 并且通过缓存(dcache)加快查找速度.

/*
 * Directory entry: binds a name (d_name) to an inode (d_inode) and links
 * the entry into its parent directory and into the dentry cache lists.
 */
struct dentry {
  int d_count;    /* reference count; d_alloc() starts it at 1 */
  unsigned int d_flags;
  struct inode  * d_inode;  /* Where the name belongs to - NULL is negative */
  struct dentry * d_parent;  /* parent directory (d_alloc() takes a reference via dget) */
  struct dentry * d_mounts;  /* mount information: points to itself until d_mount() sets it to the mounted root */
  struct dentry * d_covers;  /* dentry this root is mounted on; points to itself otherwise */
  struct list_head d_hash;  /* lookup hash list */
  struct list_head d_lru;    /* d_count = 0 LRU list */
  struct list_head d_child;  /* child of parent list */
  struct list_head d_subdirs;  /* our children */
  struct list_head d_alias;  /* inode alias list */
  struct qstr d_name;    /* name pointer, length and hash (copied from the lookup qstr) */
  unsigned long d_time;    /* used by d_revalidate */
  struct dentry_operations  *d_op;    /* optional method table; NULL after d_alloc() */
  struct super_block * d_sb;  /* The root of the dentry tree */
  unsigned long d_reftime;  /* last time referenced */
  void * d_fsdata;    /* fs-specific data */
  unsigned char d_iname[DNAME_INLINE_LEN]; /* small names: d_alloc() stores names shorter than DNAME_INLINE_LEN here */
};

/*
 * Optional dentry method table, reached through dentry.d_op.
 *
 * Only d_hash is exercised in this article: lookup_dentry() calls it as
 * d_hash(base, &name) and aborts the lookup when it returns a negative
 * value. The contracts of the other callbacks are not shown here.
 */
struct dentry_operations {
  int (*d_revalidate)(struct dentry *);
  int (*d_hash) (struct dentry *, struct qstr *);    /* lets the filesystem substitute its own name hash */
  int (*d_compare) (struct dentry *, struct qstr *, struct qstr *);
  void (*d_delete)(struct dentry *);
  void (*d_release)(struct dentry *);
  void (*d_iput)(struct dentry *, struct inode *);
};

文件系统的设计思想:

一. 面向对象

面向对象的一个直观的解释是将操作和数据组合在一起, 形成一个功能单元.

用纯c实现的话, 就是将回调函数(通常以函数指针表的形式, 例如下面的d_op)和数据定义, 放在同一个struct里.

例如dentry:

/*
 * Object-orientation example: the data members of a directory entry
 * live here, and its "methods" are reached through the d_op pointer
 * (a table of function pointers, struct dentry_operations).
 */
struct dentry {
  int d_count;
  unsigned int d_flags;
  struct inode  * d_inode;  /* Where the name belongs to - NULL is negative */
  struct dentry * d_parent;  /* parent directory */
  struct dentry * d_mounts;  /* mount information */
  struct dentry * d_covers;
  struct list_head d_hash;  /* lookup hash list */
  struct list_head d_lru;    /* d_count = 0 LRU list */
  struct list_head d_child;  /* child of parent list */
  struct list_head d_subdirs;  /* our children */
  struct list_head d_alias;  /* inode alias list */
  struct qstr d_name;
  unsigned long d_time;    /* used by d_revalidate */
  struct dentry_operations  *d_op;    /* the "method table" of this object */
  struct super_block * d_sb;  /* The root of the dentry tree */
  unsigned long d_reftime;  /* last time referenced */
  void * d_fsdata;    /* fs-specific data */
  unsigned char d_iname[DNAME_INLINE_LEN]; /* small names */
};

/*
 * The operations half of the dentry "object": a table of callbacks.
 * Every callback receives the dentry it operates on as its first
 * argument, playing the role of an explicit "this" pointer.
 */
struct dentry_operations {
  int (*d_revalidate)(struct dentry *);
  int (*d_hash) (struct dentry *, struct qstr *);
  int (*d_compare) (struct dentry *, struct qstr *, struct qstr *);
  void (*d_delete)(struct dentry *);
  void (*d_release)(struct dentry *);
  void (*d_iput)(struct dentry *, struct inode *);
};

二. VFS

这点非常值得学习, 在各种文件系统之上, 增加了一层抽象(VFS). 类似于k8s中的CSI, 它也是在各种存储后端之上做了一层抽象.

In this section I'll briefly describe how things work, before
launching into the details. I'll start with describing what happens
when user programmes open and manipulate files, and then look from the
other view which is how a filesystem is supported and subsequently
mounted.

注册和挂载文件系统:

Registering and Mounting a Filesystem                              <subsection>
-------------------------------------

If you want to support a new kind of filesystem in the kernel, all you
need to do is call register_filesystem(). You pass a structure
describing the filesystem implementation (struct file_system_type)
which is then added to an internal table of supported filesystems. You
can do:

% cat /proc/filesystems

to see what filesystems are currently available on your system.

When a request is made to mount a block device onto a directory in
your filespace the VFS will call the appropriate method for the
specific filesystem. The dentry for the mount point will then be
updated to point to the root inode for the new filesystem.


struct file_system_type                                               <section>
=======================

This describes the filesystem. As of kernel 2.1.99, the following
members are defined:

/* Filesystem descriptor passed to register_filesystem(); members are explained right below. */
struct file_system_type {
  const char *name;
  int fs_flags;
  struct super_block *(*read_super) (struct super_block *, void *, int);    /* mount-time entry point */
  struct file_system_type * next;
};

  name: the name of the filesystem type, such as "ext2", "iso9660",
  "msdos" and so on

  fs_flags: various flags (i.e. if it is a read-only FS)

  read_super: the method to call when a new instance of this
  filesystem should be mounted

  next: for internal VFS use: you should initialise this to NULL

The read_super() method has the following arguments:

  struct super_block *sb: the superblock structure. This is partially
  initialised by the VFS and the rest must be initialised by the
  read_super() method

  void *data: arbitrary mount options, usually comes as an ASCII
  string

  int silent: whether or not to be silent on error

The read_super() method must determine if the block device specified
in the superblock contains a filesystem of the type the method
supports. On success the method returns the superblock pointer, on
failure it returns NULL.

The most interesting member of the superblock structure that the
read_super() method fills in is the "s_op" field. This is a pointer to
a "struct super_operations" which describes the next level of the
filesystem implementation.

核心操作:

sys_open

/*
 * sys_open - open(2) system call entry point
 *
 * filename: path as passed by the caller (copied in with getname())
 * flags:    open flags, handed unchanged to do_open()
 * mode:     creation mode, handed unchanged to do_open()
 *
 * Reserves a descriptor first, then opens the file into it. Returns the
 * descriptor on success. On failure the reserved descriptor is given
 * back and the negative error code is returned instead. The whole
 * operation runs with the kernel lock held.
 */
asmlinkage int sys_open(const char * filename, int flags, int mode)
{
  char *kname;
  int fd, err;

  lock_kernel();
  fd = get_unused_fd();
  if (fd >= 0) {
    kname = getname(filename);
    if (IS_ERR(kname)) {
      err = PTR_ERR(kname);
    } else {
      err = do_open(kname, flags, mode, fd);
      putname(kname);
    }
    if (err) {
      /* Undo the descriptor reservation and report the error. */
      put_unused_fd(fd);
      fd = err;
    }
  }
  unlock_kernel();
  return fd;
}

/*
 * do_open - open a file and bind it to an already reserved descriptor
 *
 * Returns 0 after installing the new file into slot fd, or the negative
 * error code carried by filp_open()'s error pointer.
 * (Should probably go into sys_open().)
 */
static int do_open(const char * filename, int flags, int mode, int fd)
{
  struct file *filp = filp_open(filename, flags, mode);

  if (!IS_ERR(filp)) {
    fd_install(fd, filp);
    return 0;
  }
  return PTR_ERR(filp);
}

/*
 * filp_open - build a struct file for a path name
 *
 * The low two bits of the sys_open flag value mean
 *   00 read-only, 01 write-only, 10 read-write, 11 special
 * whereas the internal routines (open_namei()/follow_link() etc.) expect
 *   00 no permissions needed, 01 read, 10 write, 11 read-write
 * (00 is used by symlinks). The conversion is done here by adding one
 * whenever an access mode results; O_TRUNC additionally forces the
 * write bit.
 *
 * Returns the new file, or an ERR_PTR: -ENFILE when no file structure
 * is available, otherwise whatever open_namei(), get_write_access() or
 * the file's own open method reported.
 */
struct file *filp_open(const char * filename, int flags, int mode)
{
  struct file *filp;
  struct dentry *de;
  struct inode *ino;
  int namei_flag = flags;
  int err = -ENFILE;

  filp = get_empty_filp();
  if (filp == NULL)
    return ERR_PTR(err);

  filp->f_flags = flags;
  filp->f_mode = (namei_flag + 1) & O_ACCMODE;
  if (filp->f_mode != 0)
    namei_flag += 1;
  if (namei_flag & O_TRUNC)
    namei_flag |= 2;

  de = open_namei(filename, namei_flag, mode);
  if (IS_ERR(de)) {
    err = PTR_ERR(de);
    goto release_filp;
  }

  ino = de->d_inode;
  if (filp->f_mode & FMODE_WRITE) {
    err = get_write_access(ino);
    if (err)
      goto release_dentry;
  }

  filp->f_dentry = de;
  filp->f_pos = 0;
  filp->f_reada = 0;
  filp->f_op = ino->i_op ? ino->i_op->default_file_ops : NULL;
  if (filp->f_op && filp->f_op->open) {
    err = filp->f_op->open(ino, filp);
    if (err) {
      /* Give back the write access taken above before unwinding. */
      if (filp->f_mode & FMODE_WRITE)
        put_write_access(ino);
      goto release_dentry;
    }
  }
  /* These flags only matter at open time; do not keep them on the file. */
  filp->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
  return filp;

release_dentry:
  filp->f_dentry = NULL;
  dput(de);
release_filp:
  put_filp(filp);
  return ERR_PTR(err);
}

/*
 * open_namei()
 *
 * namei for open - this is in fact almost the whole open-routine.
 *
 * Note that the low bits of "flag" aren't the same as in the open
 * system call - they are 00 - no permissions needed
 *       01 - read permission needed
 *       10 - write permission needed
 *       11 - read/write permissions needed
 * which is a lot more logical, and also allows the "no perm" needed
 * for symlinks (where the permissions are checked later).
 *
 * pathname: path to resolve
 * flag:     open flags with the low bits already converted as above
 * mode:     creation mode; masked with the process umask and forced to
 *           a regular file before use
 *
 * Returns the dentry of the opened (and possibly created or truncated)
 * file, or an ERR_PTR. Every failure after the initial lookup releases
 * the dentry through the "exit" label.
 *
 * The function follows the "preset the error, then test and goto exit"
 * pattern throughout: each "error = -Exxx" assignment only takes effect
 * if the test that follows it fails.
 */
struct dentry * open_namei(const char * pathname, int flag, int mode)
{
  int acc_mode, error;
  struct inode *inode;
  struct dentry *dentry;

  mode &= S_IALLUGO & ~current->fs->umask;
  mode |= S_IFREG;

  dentry = lookup_dentry(pathname, NULL, lookup_flags(flag));
  if (IS_ERR(dentry))
    return dentry;

  acc_mode = ACC_MODE(flag);
  if (flag & O_CREAT) {
    struct dentry *dir;

    /* O_EXCL on an already existing file fails without taking the lock. */
    error = -EEXIST;
    if (dentry->d_inode && (flag & O_EXCL))
      goto exit;

    dir = lock_parent(dentry);
    error = PTR_ERR(dir);
    if (IS_ERR(dir))
      goto exit;

    /*
     * Somebody might have created the file while we
     * waited for the directory lock.. So we have to
     * re-do the existence test.
     */
    if (dentry->d_inode) {
      error = 0;
      if (flag & O_EXCL)
        error = -EEXIST;
    } else if (IS_RDONLY(dir->d_inode))
      error = -EROFS;
    else if (!dir->d_inode->i_op || !dir->d_inode->i_op->create)
      error = -EACCES;
    else if ((error = permission(dir->d_inode,MAY_WRITE | MAY_EXEC)) == 0) {
      DQUOT_INIT(dir->d_inode);
      error = dir->d_inode->i_op->create(dir->d_inode, dentry, mode);
      /* Don't check for write permission, don't truncate */
      acc_mode = 0;
      flag &= ~O_TRUNC;
    }
    /* The directory is unlocked on every path, success or failure. */
    unlock_dir(dir);
    if (error)
      goto exit;
  }

  /* A negative dentry (no inode) at this point means the file is missing. */
  error = -ENOENT;
  inode = dentry->d_inode;
  if (!inode)
    goto exit;

  /* A symlink as the final result is reported as -ELOOP. */
  error = -ELOOP;
  if (S_ISLNK(inode->i_mode))
    goto exit;
  
  /* Directories cannot be opened for writing. */
  error = -EISDIR;
  if (S_ISDIR(inode->i_mode) && (flag & FMODE_WRITE))
    goto exit;

  error = permission(inode,acc_mode);
  if (error)
    goto exit;

  /*
   * FIFO's, sockets and device files are special: they don't
   * actually live on the filesystem itself, and as such you
   * can write to them even if the filesystem is read-only.
   */
  if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
        flag &= ~O_TRUNC;
  } else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) {
    error = -EACCES;
    if (IS_NODEV(inode))
      goto exit;

    flag &= ~O_TRUNC;
  } else {
    /* Bit value 2 is the "write permission needed" bit described above. */
    error = -EROFS;
    if (IS_RDONLY(inode) && (flag & 2))
      goto exit;
  }
  /*
   * An append-only file must be opened in append mode for writing.
   */
  error = -EPERM;
  if (IS_APPEND(inode)) {
    if  ((flag & FMODE_WRITE) && !(flag & O_APPEND))
      goto exit;
    if (flag & O_TRUNC)
      goto exit;
  }

  if (flag & O_TRUNC) {
    /* Write access is held only for the duration of the truncate. */
    error = get_write_access(inode);
    if (error)
      goto exit;

    /*
     * Refuse to truncate files with mandatory locks held on them.
     */
    error = locks_verify_locked(inode);
    if (!error) {
      DQUOT_INIT(inode);
      
      error = do_truncate(dentry, 0);
    }
    put_write_access(inode);
    if (error)
      goto exit;
  } else
    if (flag & FMODE_WRITE)
      DQUOT_INIT(inode);

  return dentry;

exit:
  /* Common failure path: drop the dentry reference and report the error. */
  dput(dentry);
  return ERR_PTR(error);
}

/*
 * Name resolution.
 *
 * This is the basic name resolution function, turning a pathname
 * into the final dentry.
 *
 * name:         path to resolve; a leading '/' restarts from the
 *               process root (current->fs->root)
 * base:         dentry to start from for relative paths, or NULL to
 *               start from the current directory (current->fs->pwd).
 *               The reference passed in is consumed: it is dropped for
 *               absolute paths and on every exit that does not return it.
 * lookup_flags: LOOKUP_* flags; only LOOKUP_FOLLOW, LOOKUP_DIRECTORY and
 *               LOOKUP_SLASHOK are kept for the component loop
 *
 * Returns the resulting dentry (which may be negative, i.e. have no
 * inode) or an ERR_PTR.
 *
 * Control-flow note: the "return_base" and "no_inode" labels live inside
 * the for(;;) body. "return_base" is also jumped to from before the loop
 * when the name is empty after stripping leading slashes. Leaving the
 * loop with "break" means "dentry" holds the result (or an error) and
 * the reference on "base" must be dropped.
 */
struct dentry * lookup_dentry(const char * name, struct dentry * base, unsigned int lookup_flags)
{
  struct dentry * dentry;
  struct inode *inode;

  if (*name == '/') {
    if (base)
      dput(base);
    do {
      name++;
    } while (*name == '/');
    /* NOTE(review): __prefix_lookup_dentry is defined elsewhere; its effect is not visible here. */
    __prefix_lookup_dentry(name, lookup_flags);
    base = dget(current->fs->root);
  } else if (!base) {
    base = dget(current->fs->pwd);
  }

  /* Nothing left to resolve (e.g. "/" or ""): the starting point is the answer. */
  if (!*name)
    goto return_base;

  inode = base->d_inode;
  lookup_flags &= LOOKUP_FOLLOW | LOOKUP_DIRECTORY | LOOKUP_SLASHOK;

  /* At this point we know we have a real path component. */
  for(;;) {
    int err;
    unsigned long hash;
    struct qstr this;
    unsigned int flags;
    unsigned int c;

    /* Search (execute) permission is needed on every directory walked through. */
    err = permission(inode, MAY_EXEC);
    dentry = ERR_PTR(err);
    if (err)
      break;

    /* Scan one component up to the next '/' or the end, hashing as we go. */
    this.name = name;
    c = *(const unsigned char *)name;

    hash = init_name_hash();
    do {
      name++;
      hash = partial_name_hash(c, hash);
      c = *(const unsigned char *)name;
    } while (c && (c != '/'));
    this.len = name - (const char *) this.name;
    this.hash = end_name_hash(hash);

    /* remove trailing slashes? */
    flags = lookup_flags;
    if (c) {
      char tmp;

      /* A '/' followed the component, so it has to resolve to a directory. */
      flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
      do {
        tmp = *++name;
      } while (tmp == '/');
      /* More components follow after the slashes. */
      if (tmp)
        flags |= LOOKUP_CONTINUE;
    }

    /*
     * See if the low-level filesystem might want
     * to use its own hash..
     */
    if (base->d_op && base->d_op->d_hash) {
      int error;
      error = base->d_op->d_hash(base, &this);
      if (error < 0) {
        dentry = ERR_PTR(error);
        break;
      }
    }

    /* This does the actual lookups.. */
    /* Tried in order: reserved names, the dentry cache, then the filesystem itself. */
    dentry = reserved_lookup(base, &this);
    if (!dentry) {
      dentry = cached_lookup(base, &this);
      if (!dentry) {
        dentry = real_lookup(base, &this);
        if (IS_ERR(dentry))
          break;
      }
    }

    /* Check mountpoints.. */
    dentry = follow_mount(dentry);

    if (!(flags & LOOKUP_FOLLOW))
      break;

    /* The result of do_follow_link() becomes the base for the next component. */
    base = do_follow_link(base, dentry, flags);
    if (IS_ERR(base))
      goto return_base;

    inode = base->d_inode;
    if (flags & LOOKUP_DIRECTORY) {
      if (!inode)
        goto no_inode;
      /* Something that cannot be searched is not a directory. */
      dentry = ERR_PTR(-ENOTDIR); 
      if (!inode->i_op || !inode->i_op->lookup)
        break;
      if (flags & LOOKUP_CONTINUE)
        continue;
    }
return_base:
    return base;
/*
 * The case of a nonexisting file is special.
 *
 * In the middle of a pathname lookup (ie when
 * LOOKUP_CONTINUE is set), it's an obvious
 * error and returns ENOENT.
 *
 * At the end of a pathname lookup it's legal,
 * and we return a negative dentry. However, we
 * get here only if there were trailing slashes,
 * which is legal only if we know it's supposed
 * to be a directory (ie "mkdir"). Thus the
 * LOOKUP_SLASHOK flag.
 */
no_inode:
    dentry = ERR_PTR(-ENOENT);
    if (flags & LOOKUP_CONTINUE)
      break;
    if (flags & LOOKUP_SLASHOK)
      goto return_base;
    break;
  }
  /* Reached by "break" only: drop the base reference, hand back dentry or the error. */
  dput(base);
  return dentry;
}

/*
 * This is called when everything else fails, and we actually have
 * to go to the low-level filesystem to find out what we should do..
 *
 * We get the directory semaphore, and after getting that we also
 * make sure that nobody added the entry to the dcache in the meantime..
 */
static struct dentry * real_lookup(struct dentry * parent, struct qstr * name)
{
  struct dentry * result;
  struct inode *dir = parent->d_inode;

  down(&dir->i_sem);
  /*
   * First re-do the cached lookup just in case it was created
   * while we waited for the directory semaphore..
   *
   * FIXME! This could use version numbering or similar to
   * avoid unnecessary cache lookups.
   */
  result = cached_lookup(parent, name);
  if (!result) {
    struct dentry * dentry = d_alloc(parent, name);
    result = ERR_PTR(-ENOMEM);
    if (dentry) {
      int error = dir->i_op->lookup(dir, dentry);
      result = dentry;
      if (error) {
        dput(dentry);
        result = ERR_PTR(error);
      }
    }
  }
  up(&dir->i_sem);
  return result;
}

/*
 * d_alloc - allocate and initialise a new dentry
 *
 * parent: directory dentry the new entry hangs under, or NULL
 * name:   name, length and hash to copy into the new dentry
 *
 * Returns the new dentry with d_count set to 1 and no inode attached,
 * or NULL when either the dentry itself or an out-of-line name buffer
 * cannot be allocated.
 */
struct dentry * d_alloc(struct dentry * parent, const struct qstr *name)
{
  struct dentry *de;
  char *namebuf;

  /* Too many unused dentries around: shrink the caches first. */
  if (dentry_stat.nr_unused > 3*(nr_inodes >> 1)) {
#ifdef DCACHE_DEBUG
printk("d_alloc: %d unused, pruning dcache\n", dentry_stat.nr_unused);
#endif
    prune_dcache(8);
    free_inode_memory(8);
  }

  de = kmem_cache_alloc(dentry_cache, GFP_KERNEL);
  if (de == NULL)
    return NULL;

  /* Names that fit together with their terminating NUL live inside the dentry. */
  namebuf = de->d_iname;
  if (name->len > DNAME_INLINE_LEN-1) {
    namebuf = kmalloc(NAME_ALLOC_LEN(name->len), GFP_KERNEL);
    if (namebuf == NULL) {
      kmem_cache_free(dentry_cache, de);
      return NULL;
    }
  }
  memcpy(namebuf, name->name, name->len);
  namebuf[name->len] = 0;

  de->d_name.name = namebuf;
  de->d_name.len = name->len;
  de->d_name.hash = name->hash;

  de->d_count = 1;
  de->d_flags = 0;
  de->d_inode = NULL;
  de->d_op = NULL;
  de->d_fsdata = NULL;

  /* Nothing is mounted here and this is not a mounted root: both point to self. */
  de->d_mounts = de;
  de->d_covers = de;

  if (parent != NULL) {
    /* Pin the parent and link the new entry into its list of children. */
    de->d_parent = dget(parent);
    de->d_sb = parent->d_sb;
    list_add(&de->d_child, &parent->d_subdirs);
  } else {
    de->d_parent = NULL;
    de->d_sb = NULL;
    INIT_LIST_HEAD(&de->d_child);
  }

  INIT_LIST_HEAD(&de->d_hash);
  INIT_LIST_HEAD(&de->d_lru);
  INIT_LIST_HEAD(&de->d_subdirs);
  INIT_LIST_HEAD(&de->d_alias);
  return de;
}

/*
 * Excerpt repeated from real_lookup() above: this is the point where the
 * VFS hands the freshly allocated dentry to the filesystem-specific
 * lookup method of the directory inode (dir->i_op->lookup). A non-zero
 * return drops the dentry again and is propagated as an ERR_PTR.
 */
if (dentry) {
      int error = dir->i_op->lookup(dir, dentry);
      result = dentry;
      if (error) {
        dput(dentry);
        result = ERR_PTR(error);
      }
    }

sys_mount

/*
 * Flags is a 16-bit value that allows up to 16 non-fs dependent flags to
 * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
 *
 * data is a (void *) that can point to any structure up to
 * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
 * information (or be NULL).
 *
 * NOTE! As old versions of mount() didn't use this setup, the flags
 * have to have a special 16-bit magic number in the high word:
 * 0xC0ED. If this magic word isn't present, the flags and data info
 * aren't used, as the syscall assumes we are talking to an older
 * version that didn't understand them.
 *
 * dev_name:  path of the device to mount (only looked up when the
 *            filesystem type has FS_REQUIRES_DEV set)
 * dir_name:  path of the mount point
 * type:      filesystem type name, copied in and resolved with get_fs_type()
 * new_flags: mount flags, including the magic number described above
 * data:      filesystem-dependent options, copied in with copy_mount_options()
 *
 * Returns 0 on success or a negative error code. Requires CAP_SYS_ADMIN
 * and runs with the kernel lock held.
 */
asmlinkage int sys_mount(char * dev_name, char * dir_name, char * type,
  unsigned long new_flags, void * data)
{
  struct file_system_type * fstype;
  struct dentry * dentry = NULL;
  struct inode * inode = NULL;
  kdev_t dev;
  int retval = -EPERM;
  unsigned long flags = 0;
  unsigned long page = 0;
  struct file dummy;  /* allows read-write or read-only flag */

  lock_kernel();
  if (!capable(CAP_SYS_ADMIN))
    goto out;
  /* Remount request: handled entirely by do_remount(), nothing else to set up. */
  if ((new_flags &
       (MS_MGC_MSK | MS_REMOUNT)) == (MS_MGC_VAL | MS_REMOUNT)) {
    retval = copy_mount_options (data, &page);
    if (retval < 0)
      goto out;
    retval = do_remount(dir_name,
            new_flags & ~MS_MGC_MSK & ~MS_REMOUNT,
            (char *) page);
    free_page(page);
    goto out;
  }

  /* Resolve the filesystem type by name; the copied-in page is only needed for that. */
  retval = copy_mount_options (type, &page);
  if (retval < 0)
    goto out;
  fstype = get_fs_type((char *) page);
  free_page(page);
  retval = -ENODEV;
  if (!fstype)    
    goto out;

  /* Zeroed so that dummy.f_op == NULL later identifies the "unnamed device" case. */
  memset(&dummy, 0, sizeof(dummy));
  if (fstype->fs_flags & FS_REQUIRES_DEV) {
    dentry = namei(dev_name);
    retval = PTR_ERR(dentry);
    if (IS_ERR(dentry))
      goto out;

    inode = dentry->d_inode;
    retval = -ENOTBLK;
    if (!S_ISBLK(inode->i_mode))
      goto dput_and_out;

    retval = -EACCES;
    if (IS_NODEV(inode))
      goto dput_and_out;

    dev = inode->i_rdev;
    retval = -ENXIO;
    if (MAJOR(dev) >= MAX_BLKDEV)
      goto dput_and_out;

    retval = -ENOTBLK;
    dummy.f_op = get_blkfops(MAJOR(dev));
    if (!dummy.f_op)
      goto dput_and_out;

    /* Open the block device through a throw-away file: mode 1 for a read-only mount, 3 otherwise. */
    if (dummy.f_op->open) {
      dummy.f_dentry = dentry;
      dummy.f_mode = (new_flags & MS_RDONLY) ? 1 : 3;
      retval = dummy.f_op->open(inode, &dummy);
      if (retval)
        goto dput_and_out;
    }

  } else {
    /* No backing device needed: allocate an unnamed device number instead. */
    retval = -EMFILE;
    if (!(dev = get_unnamed_dev()))
      goto out;
  }

  /* Flags and data are honoured only when the magic number is present. */
  page = 0;
  if ((new_flags & MS_MGC_MSK) == MS_MGC_VAL) {
    flags = new_flags & ~MS_MGC_MSK;
    retval = copy_mount_options(data, &page);
    if (retval < 0)
      goto clean_up;
  }
  retval = do_mount(dev, dev_name, dir_name, fstype->name, flags,
        (void *) page);
  free_page(page);
  if (retval)
    goto clean_up;

  /* Success falls through here as well; dentry is NULL when no device was looked up. */
dput_and_out:
  dput(dentry);
out:
  unlock_kernel();
  return retval;

clean_up:
  /* Undo the device setup: release the opened block device, or return the unnamed device. */
  if (dummy.f_op) {
    if (dummy.f_op->release)
      dummy.f_op->release(inode, NULL);
  } else
    put_unnamed_dev(dev);
  goto dput_and_out;
}

/*
 * do_mount - attach a filesystem to a directory
 *
 * Does the actual mounting once sys_mount() has finished the ugly
 * parameter parsing. When enough time has gone by, and everything uses
 * the new mount() parameters, sys_mount() can then be cleaned up.
 *
 * We cannot mount a filesystem if it has active, used, or dirty inodes
 * (checked through fs_may_mount()). We also have to flush all
 * inode-data for this device, as the new mount might need new info.
 *
 * Historical note from the original header ([21-Mar-97]
 * T.Schoebel-Theuer): a leading "!" before dir_name was meant to allow
 * "stacks" of mounted filesystems, influencing only pathname lookups
 * made after the mount, while already open descriptors and working
 * directories keep using what they had. No handling of such a prefix is
 * visible in this function itself.
 *
 * Returns 0 on success, otherwise:
 *   -EACCES   writable mount requested on a read-only device
 *   -ENOTDIR  the mount point is not a directory
 *   -EBUSY    the mount point is itself a mounted root, or the device
 *             may not be mounted right now
 *   -EINVAL   the superblock could not be read
 *   -ENOMEM   no vfsmount entry could be allocated
 *   or the error from looking up dir_name.
 */

int do_mount(kdev_t dev, const char * dev_name, const char * dir_name, const char * type, int flags, void * data)
{
  struct dentry *mountpoint;
  struct super_block *sb;
  int err;

  if (!(flags & MS_RDONLY) && dev && is_read_only(dev))
    return -EACCES;

  /* Do the lookup first to force automounting. */
  mountpoint = namei(dir_name);
  if (IS_ERR(mountpoint))
    return PTR_ERR(mountpoint);

  down(&mount_sem);
  if (!S_ISDIR(mountpoint->d_inode->i_mode)) {
    err = -ENOTDIR;
  } else if (mountpoint->d_covers != mountpoint) {
    err = -EBUSY;
  } else if ((sb = read_super(dev, type, flags, data, 0)) == NULL) {
    /* Note: if the superblock already exists, read_super just does a get_super(). */
    err = -EINVAL;
  } else if (!fs_may_mount(dev)) {
    /* We may have slept while reading the super block, so re-check afterwards. */
    err = -EBUSY;
  } else if (add_vfsmnt(sb, dev_name, dir_name) == NULL) {
    err = -ENOMEM;
  } else {
    /* The extra reference taken here stays with the mount. */
    d_mount(dget(mountpoint), sb->s_root);
    err = 0;
  }

  dput(mountpoint);
  up(&mount_sem);
  return err;
}

/*
 * Fetch a name with getname() and return a kmalloc'ed copy of it.
 * Returns NULL when uname is NULL, when getname() fails, or when the
 * copy cannot be allocated.
 */
static char *vfsmnt_copy_name(const char *uname)
{
  char *kname, *copy;

  if (!uname)
    return NULL;
  kname = getname(uname);
  if (IS_ERR(kname))
    return NULL;

  copy = (char *) kmalloc(strlen(kname)+1, GFP_KERNEL);
  if (copy)
    strcpy(copy, kname);
  putname(kname);
  return copy;
}

/*
 * add_vfsmnt - record a new mount at the tail of the global mount list
 *
 * sb:       superblock of the filesystem being mounted
 * dev_name: device name to remember, may be NULL
 * dir_name: mount point name to remember, may be NULL
 *
 * Returns the new, zero-initialised vfsmount entry, or NULL when it
 * cannot be allocated. A name that cannot be fetched or copied is
 * silently left unset.
 */
static struct vfsmount *add_vfsmnt(struct super_block *sb,
      const char *dev_name, const char *dir_name)
{
  struct vfsmount *mnt;

  mnt = (struct vfsmount *)kmalloc(sizeof(struct vfsmount), GFP_KERNEL);
  if (mnt == NULL)
    return mnt;
  memset(mnt, 0, sizeof(struct vfsmount));

  mnt->mnt_sb = sb;
  mnt->mnt_dev = sb->s_dev;
  mnt->mnt_flags = sb->s_flags;

  sema_init(&mnt->mnt_dquot.semaphore, 1);
  mnt->mnt_dquot.flags = 0;

  /* N.B. Is it really OK to have a vfsmount without names? */
  mnt->mnt_devname = vfsmnt_copy_name(dev_name);
  mnt->mnt_dirname = vfsmnt_copy_name(dir_name);

  /* Append to the singly linked list headed by vfsmntlist. */
  if (vfsmntlist != (struct vfsmount *)NULL)
    vfsmnttail->mnt_next = mnt;
  else
    vfsmntlist = mnt;
  vfsmnttail = mnt;

  return mnt;
}

/*
 * d_mount - link a mount point and the root of the mounted filesystem
 *
 * covered: dentry of the directory being mounted on
 * dentry:  root dentry of the filesystem being mounted
 *
 * If something is already mounted on "covered", only a message is
 * printed and nothing is changed.
 */
static void d_mount(struct dentry *covered, struct dentry *dentry)
{
  if (covered->d_mounts == covered) {
    covered->d_mounts = dentry;
    dentry->d_covers = covered;
    return;
  }
  printk("VFS: mount - already mounted\n");
}