7.  File and filesystem operations

      With the introduction of the data structures used by the filesystem operations, the complete set of filesystem entry points may now be listed. As noted, they derive mostly from the Sun VFS interface. Lines marked with + are additions to the Sun definitions; lines marked with ! are modified from VFS.

      The structure describing the externally-visible features of a mounted filesystem, vfs, is:

 * Structure per mounted file system.
 * Each mounted file system has an array of
 * operations and an instance record.
 * The file systems are put on a doubly linked list.
struct vfs {
	struct vfs	*vfs_next;		/* next vfs in vfs list */
+	struct vfs	*vfs_prev;		/* prev vfs in vfs list */
	struct vfsops	*vfs_op;		/* operations on vfs */
	struct vnode	*vfs_vnodecovered;	/* vnode we mounted on */
	int	vfs_flag;		/* flags */
!	int	vfs_fsize;		/* fundamental block size */
+	int	vfs_bsize;		/* optimal transfer size */
!	uid_t	vfs_exroot;		/* exported fs uid 0 mapping */
	short	vfs_exflags;		/* exported fs flags */
	caddr_t	vfs_data;		/* private data */
	 * vfs flags.
	 * VFS_MLOCK lock the vfs so that name lookup cannot proceed past the vfs.
	 * This keeps the subtree stable during mounts and unmounts.
	#define	VFS_RDONLY	0x01		/* read only vfs */
+	#define	VFS_NOEXEC	0x02		/* can't exec from filesystem */
	#define	VFS_MLOCK	0x04		/* lock vfs so that subtree is stable */
	#define	VFS_MWAIT	0x08		/* someone is waiting for lock */
	#define	VFS_NOSUID	0x10		/* don't honor setuid bits on vfs */
	#define	VFS_EXPORTED	0x20		/* file system is exported (NFS) */

	 * exported vfs flags.
	#define	EX_RDONLY	0x01		/* exported read only */

The operations supported by the filesystem-specific layer on an individual filesystem are:

 * Operations supported on virtual file system.
struct vfsops {
!	int	(*vfs_mount)(		/* vfs, path, data, datalen */ );
!	int	(*vfs_unmount)(		/* vfs, forcibly */ );
+	int	(*vfs_mountroot)();
	int	(*vfs_root)(		/* vfs, vpp */ );
!	int	(*vfs_statfs)(		/* vfs, vp, sbp */ );
!	int	(*vfs_sync)(		/* vfs, waitfor */ );
+	int	(*vfs_fhtovp)(		/* vfs, fhp, vpp */ );
+	int	(*vfs_vptofh)(		/* vp, fhp */ );

The vfs_statfs entry returns a structure of the form:

 * file system statistics
struct statfs {
!	short	f_type;			/* type of filesystem */
+	short	f_flags;		/* copy of vfs (mount) flags */
!	long	f_fsize;		/* fundamental file system block size */
+	long	f_bsize;		/* optimal transfer block size */
	long	f_blocks;		/* total data blocks in file system */
	long	f_bfree;		/* free blocks in fs */
	long	f_bavail;		/* free blocks avail to non-superuser */
	long	f_files;		/* total file nodes in file system */
	long	f_ffree;		/* free file nodes in fs */
	fsid_t	f_fsid;			/* file system id */
+	char	*f_mntonname;		/* directory on which mounted */
+	char	*f_mntfromname;		/* mounted filesystem */
	long	f_spare[7];		/* spare for later */

typedef long fsid_t[2];			/* file system id type */

The modifications to Sun's interface at this level are minor. Additional arguments are present for the vfs_mount and vfs_unmount entries. vfs_statfs accepts a vnode as well as a filesystem identifier, as the information may not be uniform throughout a filesystem. For example, if a client may mount a file tree that spans multiple physical filesystems on a server, different sections may have different amounts of free space. (NFS does not allow remotely-mounted file trees to span physical filesystems on the server.) The final additions are the entries that support file handles. vfs_vptofh is provided for the use of file servers, which need to obtain an opaque file handle to represent the current vnode for transmission to clients. This file handle may later be used to relocate the vnode using vfs_fhtovp without requiring the vnode to remain in memory.

      Finally, the external form of a filesystem object, the vnode, is:

 * vnode types. VNON means no type.
enum vtype 	{ VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK };

struct vnode {
	u_short	v_flag;			/* vnode flags (see below) */
	u_short	v_count;		/* reference count */
	u_short	v_shlockc;		/* count of shared locks */
	u_short	v_exlockc;		/* count of exclusive locks */
	struct vfs	*v_vfsmountedhere;	/* ptr to vfs mounted here */
	struct vfs	*v_vfsp;		/* ptr to vfs we are in */
	struct vnodeops	*v_op;			/* vnode operations */
+	struct text	*v_text;		/* text/mapped region */
	enum vtype	v_type;			/* vnode type */
	caddr_t	v_data;			/* private data for fs */
 * vnode flags.
#define	VROOT	0x01	/* root of its file system */
#define	VTEXT	0x02	/* vnode is a pure text prototype */
#define	VEXLOCK	0x10	/* exclusive lock */
#define	VSHLOCK	0x20	/* shared lock */
#define	VLWAIT	0x40	/* proc is waiting on shared or excl. lock */

The operations supported by the filesystems on individual vnodes are:

 * Operations on vnodes.
struct vnodeops {
!	int	(*vn_lookup)(		/* ndp */ );
!	int	(*vn_create)(		/* ndp, vap, fflags */ );
+	int	(*vn_mknod)(		/* ndp, vap, fflags */ );
!	int	(*vn_open)(		/* vp, fflags, cred */ );
	int	(*vn_close)(		/* vp, fflags, cred */ );
	int	(*vn_access)(		/* vp, fflags, cred */ );
	int	(*vn_getattr)(		/* vp, vap, cred */ );
	int	(*vn_setattr)(		/* vp, vap, cred */ );

+	int	(*vn_read)(		/* vp, uiop, offp, ioflag, cred */ );
+	int	(*vn_write)(		/* vp, uiop, offp, ioflag, cred */ );
!	int	(*vn_ioctl)(		/* vp, com, data, fflag, cred */ );
	int	(*vn_select)(		/* vp, which, cred */ );
+	int	(*vn_mmap)(		/* vp, ..., cred */ );
	int	(*vn_fsync)(		/* vp, cred */ );
+	int	(*vn_seek)(		/* vp, offp, off, whence */ );

!	int	(*vn_remove)(		/* ndp */ );
!	int	(*vn_link)(		/* vp, ndp */ );
!	int	(*vn_rename)(		/* src ndp, target ndp */ );
!	int	(*vn_mkdir)(		/* ndp, vap */ );
!	int	(*vn_rmdir)(		/* ndp */ );
!	int	(*vn_symlink)(		/* ndp, vap, nm */ );
	int	(*vn_readdir)(		/* vp, uiop, offp, ioflag, cred */ );
	int	(*vn_readlink)(		/* vp, uiop, ioflag, cred */ );

+	int	(*vn_abortop)(		/* ndp */ );
+	int	(*vn_lock)(		/* vp */ );
+	int	(*vn_unlock)(		/* vp */ );
!	int	(*vn_inactive)(		/* vp */ );
 * flags for ioflag
#define	IO_UNIT	0x01		/* do io as atomic unit for VOP_RDWR */
#define	IO_APPEND	0x02		/* append write for VOP_RDWR */
#define	IO_SYNC	0x04		/* sync io for VOP_RDWR */

The argument types listed in the comments following each operation are:

ndp      A pointer to a nameidata structure.
vap      A pointer to a vattr structure (vnode attributes; see below).
fflags   File open flags, possibly including O_APPEND, O_CREAT, O_TRUNC and O_EXCL.
vp       A pointer to a vnode previously obtained with vn_lookup.
cred     A pointer to a ucred credentials structure.
uiop     A pointer to a uio structure.
ioflag   Any of the IO flags defined above.
com      An ioctl command, with type unsigned long.
data     A pointer to a character buffer used to pass data to or from an ioctl.
which    One of FREAD, FWRITE or 0 (select for exceptional conditions).
off      A file offset of type off_t.
offp     A pointer to a file offset of type off_t.
whence   One of L_SET, L_INCR, or L_XTND.
fhp      A pointer to a file handle buffer.

      Several changes have been made to Sun's set of vnode operations. Most obviously, the vn_lookup entry receives a nameidata structure containing its arguments and context as described. If the lookup operation was for CREATE or DELETE, the same structure is then passed to one of the creation or deletion entries to complete the operation, or to the vn_abortop entry if no operation is undertaken. For filesystems that perform no locking between lookup for creation or deletion and the call to implement that action, the final pathname component may be left untranslated by the lookup routine. In any case, the pathname pointer points at the final name component, and the nameidata contains a reference to the vnode of the parent directory. The interface is thus flexible enough to accommodate filesystems that are fully stateful or fully stateless, while avoiding redundant operations whenever possible. One operation remains problematical, the vn_rename call. It is tempting to look up the source of the rename for deletion and the target for creation. However, filesystems that lock directories during such lookups must avoid deadlock if the two paths cross. For that reason, the source is translated for LOOKUP only, with the WANTPARENT flag set; the target is then translated with an operation of CREATE.

      In addition to the changes concerned with the nameidata interface, several other changes were made in the vnode operations. The vn_rdwr entry was split into vn_read and vn_write; frequently, the read/write entry amounts to a routine that checks the direction flag, then calls either a read routine or a write routine. The two entries may be identical for any given filesystem; the direction flag is contained in the uio given as an argument.

      All of the read and write operations use a uio to describe the file offset and buffer locations. All of these fields must be updated before return. In particular, the vn_readdir entry uses this to return a new file offset token for its current location.

      Several new operations have been added. The first, vn_seek, is a concession to record-oriented files such as directories. It allows the filesystem to verify that a seek leaves a file at a sensible offset, or to return a new offset token relative to an earlier one. For most filesystems and files, this operation amounts to performing simple arithmetic. Another new entry point is vn_mmap, for use in mapping device memory into a user process address space. Its semantics are not yet decided. The final additions are the vn_lock and vn_unlock entries. These are used to request that the underlying file be locked against changes for short periods of time if the filesystem implementation allows it. They are used to maintain consistency during internal operations such as exec, and may not be used to construct atomic operations from other filesystem operations.

      The attributes of a vnode are not stored in the vnode, as they might change with time and may need to be read from a remote source. Attributes have the form:

 * Vnode attributes.  A field value of -1
 * represents a field whose value is unavailable
 * (getattr) or which is not to be changed (setattr).
struct vattr {
	enum vtype	va_type;	/* vnode type (for create) */
	u_short	va_mode;	/* files access mode and type */
!	uid_t	va_uid;		/* owner user id */
!	gid_t	va_gid;		/* owner group id */
	long	va_fsid;	/* file system id (dev for now) */
!	long	va_fileid;	/* file id */
	short	va_nlink;	/* number of references to file */
	u_long	va_size;	/* file size in bytes (quad?) */
+	u_long	va_size1;	/* reserved if not quad */
	long	va_blocksize;	/* blocksize preferred for i/o */
	struct timeval	va_atime;	/* time of last access */
	struct timeval	va_mtime;	/* time of last modification */
	struct timeval	va_ctime;	/* time file changed */
	dev_t	va_rdev;	/* device the file represents */
	u_long	va_bytes;	/* bytes of disk space held by file */
+	u_long	va_bytes1;	/* reserved if va_bytes not a quad */