Skip site navigation (1)Skip section navigation (2)
Date:      Thu, 5 Aug 2021 15:23:36 GMT
From:      Ka Ho Ng <khng@FreeBSD.org>
To:        src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-main@FreeBSD.org
Subject:   git: 0dc332bff200 - main - Add fspacectl(2), vn_deallocate(9) and VOP_DEALLOCATE(9).
Message-ID:  <202108051523.175FNaGp086202@gitrepo.freebsd.org>

next in thread | raw e-mail | index | archive | help
The branch main has been updated by khng:

URL: https://cgit.FreeBSD.org/src/commit/?id=0dc332bff200c940edc36c4715b629a2e1e9f9ae

commit 0dc332bff200c940edc36c4715b629a2e1e9f9ae
Author:     Ka Ho Ng <khng@FreeBSD.org>
AuthorDate: 2021-08-05 15:20:42 +0000
Commit:     Ka Ho Ng <khng@FreeBSD.org>
CommitDate: 2021-08-05 15:20:42 +0000

    Add fspacectl(2), vn_deallocate(9) and VOP_DEALLOCATE(9).
    
    fspacectl(2) is a system call to provide space management support to
    userspace applications. VOP_DEALLOCATE(9) is a VOP call to perform the
    deallocation. vn_deallocate(9) is a public KPI for kmods' use.
    
    The purpose of proposing a new system call, a KPI and a VOP call is to
    allow bhyve or other hypervisor monitors to emulate the behavior of SCSI
    UNMAP/NVMe DEALLOCATE on a plain file.
    
    fspacectl(2) takes cmd and flags parameters to specify the
    space management operation to be performed. Currently cmd has to be
    SPACECTL_DEALLOC, and flags has to be 0.
    
    fo_fspacectl is added to fileops.
    VOP_DEALLOCATE(9) is added as a new VOP call. A trivial implementation
    of VOP_DEALLOCATE(9) is provided.
    
    Sponsored by:   The FreeBSD Foundation
    Reviewed by:    kib
    Differential Revision:  https://reviews.freebsd.org/D28347
---
 lib/libc/sys/Makefile.inc             |   1 +
 lib/libc/sys/Symbol.map               |   1 +
 lib/libc/sys/fspacectl.2              | 189 +++++++++++++++++++
 lib/libc/sys/pathconf.2               |   3 +
 share/man/man9/Makefile               |   2 +
 share/man/man9/VOP_DEALLOCATE.9       | 101 ++++++++++
 share/man/man9/vn_deallocate.9        | 103 +++++++++++
 sys/bsm/audit_kevents.h               |   1 +
 sys/compat/freebsd32/freebsd32.h      |   4 +
 sys/compat/freebsd32/freebsd32_misc.c |  34 ++++
 sys/compat/freebsd32/syscalls.master  |   5 +
 sys/kern/capabilities.conf            |   5 +
 sys/kern/sys_generic.c                |  70 +++++++
 sys/kern/syscalls.master              |   9 +
 sys/kern/vfs_default.c                | 122 ++++++++++++
 sys/kern/vfs_vnops.c                  | 110 +++++++++++
 sys/kern/vnode_if.src                 |  11 ++
 sys/security/audit/audit_bsm.c        |  12 ++
 sys/sys/fcntl.h                       |  20 ++
 sys/sys/file.h                        |  15 ++
 sys/sys/syscallsubr.h                 |   3 +
 sys/sys/unistd.h                      |   1 +
 sys/sys/vnode.h                       |   2 +
 tests/sys/file/Makefile               |   1 +
 tests/sys/file/fspacectl_test.c       | 338 ++++++++++++++++++++++++++++++++++
 25 files changed, 1163 insertions(+)

diff --git a/lib/libc/sys/Makefile.inc b/lib/libc/sys/Makefile.inc
index a1eb9567a380..29e914872a8d 100644
--- a/lib/libc/sys/Makefile.inc
+++ b/lib/libc/sys/Makefile.inc
@@ -190,6 +190,7 @@ MAN+=	abort2.2 \
 	fhreadlink.2 \
 	flock.2 \
 	fork.2 \
+	fspacectl.2 \
 	fsync.2 \
 	getdirentries.2 \
 	getdtablesize.2 \
diff --git a/lib/libc/sys/Symbol.map b/lib/libc/sys/Symbol.map
index 80bb2c236191..93fbc947a7e1 100644
--- a/lib/libc/sys/Symbol.map
+++ b/lib/libc/sys/Symbol.map
@@ -419,6 +419,7 @@ FBSD_1.6 {
 
 FBSD_1.7 {
 	 _Fork;
+	fspacectl;
 };
 
 FBSDprivate_1.0 {
diff --git a/lib/libc/sys/fspacectl.2 b/lib/libc/sys/fspacectl.2
new file mode 100644
index 000000000000..2f581d1c1fb8
--- /dev/null
+++ b/lib/libc/sys/fspacectl.2
@@ -0,0 +1,189 @@
+.\"
+.\" SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+.\"
+.\" Copyright (c) 2021 The FreeBSD Foundation
+.\"
+.\" This manual page was written by Ka Ho Ng under sponsorship from
+.\" the FreeBSD Foundation.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\"    notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\"    notice, this list of conditions and the following disclaimer in the
+.\"    documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.Dd August 4, 2021
+.Dt FSPACECTL 2
+.Os
+.Sh NAME
+.Nm fspacectl
+.Nd space management in a file
+.Sh LIBRARY
+.Lb libc
+.Sh SYNOPSIS
+.In fcntl.h
+.Ft int
+.Fo fspacectl
+.Fa "int fd"
+.Fa "int cmd"
+.Fa "const struct spacectl_range *rqsr"
+.Fa "int flags"
+.Fa "struct spacectl_range *rmsr"
+.Fc
+.Sh DESCRIPTION
+.Nm
+is a system call performing space management over a file.
+The
+.Fa fd
+argument specifies the file descriptor to be operated on by the
+.Fa cmd
+argument.
+The
+.Fa rqsr
+argument points to a
+.Fa spacectl_range
+structure that contains the requested operation range.
+The
+.Fa flags
+argument controls the behavior of the operation to take place.
+If the
+.Fa rmsr
+argument is non-NULL, the
+.Fa spacectl_range
+structure it points to is updated to contain the unprocessed operation range
+after the system call returns.
+Both
+.Fa rqsr
+and
+.Fa rmsr
+arguments can point to the same structure.
+.Pp
+The
+.Fa spacectl_range
+structure is defined as:
+.Bd -literal
+struct spacectl_range {
+	off_t r_offset;
+	off_t r_len;
+};
+.Ed
+.Pp
+The operation specified by the
+.Fa cmd
+argument may be one of:
+.Bl -tag -width SPACECTL_DEALLOC
+.It Dv SPACECTL_DEALLOC
+Zero a region in the file specified by the
+.Fa rqsr
+argument.
+The
+.Va "rqsr->r_offset"
+has to be a value greater than or equal to 0, and the
+.Va "rqsr->r_len"
+has to be a value greater than 0.
+.Pp
+If the file system supports hole-punching,
+file system space deallocation may be performed in the given region.
+.El
+.Pp
+The
+.Fa flags
+argument needs to be the value 0 currently.
+.Sh RETURN VALUES
+Upon successful completion, the value 0 is returned;
+otherwise the value -1 is returned and
+.Va errno
+is set to indicate the error.
+.Sh ERRORS
+Possible failure conditions:
+.Bl -tag -width Er
+.It Bq Er EBADF
+The
+.Fa fd
+argument is not a valid file descriptor.
+.It Bq Er EBADF
+The
+.Fa fd
+argument references a file that was opened without write permission.
+.It Bq Er EINTR
+A signal was caught during execution.
+.It Bq Er EINVAL
+The
+.Fa cmd
+argument is not valid.
+.It Bq Er EINVAL
+If the
+.Fa cmd
+argument is
+.Dv SPACECTL_DEALLOC ,
+either the
+.Fa "rqsr->r_offset"
+argument was less than zero, or the
+.Fa "rqsr->r_len"
+argument was less than or equal to zero.
+.It Bq Er EINVAL
+An invalid or unsupported flag is included in
+.Fa flags .
+.It Bq Er EINVAL
+A flag included in
+.Fa flags
+is not supported by the operation specified by the
+.Fa cmd
+argument.
+.It Bq Er EFAULT
+The
+.Fa rqsr
+or a non-NULL
+.Fa rmsr
+argument points outside the process' allocated address space.
+.It Bq Er EIO
+An I/O error occurred while reading from or writing to a file system.
+.It Bq Er EINTEGRITY
+Corrupted data was detected while reading from the file system.
+.It Bq Er ENODEV
+The
+.Fa fd
+argument does not refer to a file that supports
+.Nm .
+.It Bq Er ENOSPC
+There is insufficient free space remaining on the file system storage
+media.
+.It Bq Er ENOTCAPABLE
+The file descriptor
+.Fa fd
+has insufficient rights.
+.It Bq Er ESPIPE
+The
+.Fa fd
+argument is associated with a pipe or FIFO.
+.El
+.Sh SEE ALSO
+.Xr creat 2 ,
+.Xr ftruncate 2 ,
+.Xr open 2 ,
+.Xr unlink 2
+.Sh HISTORY
+The
+.Nm
+system call appeared in
+.Fx 14.0 .
+.Sh AUTHORS
+.Nm
+and this manual page were written by
+.An Ka Ho Ng Aq Mt khng@FreeBSD.org
+under sponsorship from the FreeBSD Foundation.
diff --git a/lib/libc/sys/pathconf.2 b/lib/libc/sys/pathconf.2
index 62ec532705ef..c5a7ba1be3c5 100644
--- a/lib/libc/sys/pathconf.2
+++ b/lib/libc/sys/pathconf.2
@@ -166,6 +166,9 @@ specified file, otherwise 0.
+.It Li _PC_DEALLOC_PRESENT
+Return 1 if the file system supports hole-punching, otherwise 0; see
+.Xr fspacectl 2 .
 .It Li _PC_MIN_HOLE_SIZE
 If a file system supports the reporting of holes (see
 .Xr lseek 2 ) ,
 .Fn pathconf
 and
 .Fn fpathconf
diff --git a/share/man/man9/Makefile b/share/man/man9/Makefile
index d0012301d889..b2f1451a79d7 100644
--- a/share/man/man9/Makefile
+++ b/share/man/man9/Makefile
@@ -404,6 +404,7 @@ MAN=	accept_filter.9 \
 	vm_page_wire.9 \
 	vm_set_page_size.9 \
 	vmem.9 \
+	vn_deallocate.9 \
 	vn_fullpath.9 \
 	vn_isdisk.9 \
 	vnet.9 \
@@ -420,6 +421,7 @@ MAN=	accept_filter.9 \
 	VOP_BWRITE.9 \
 	VOP_COPY_FILE_RANGE.9 \
 	VOP_CREATE.9 \
+	VOP_DEALLOCATE.9 \
 	VOP_FSYNC.9 \
 	VOP_GETACL.9 \
 	VOP_GETEXTATTR.9 \
diff --git a/share/man/man9/VOP_DEALLOCATE.9 b/share/man/man9/VOP_DEALLOCATE.9
new file mode 100644
index 000000000000..1c7f80cfbc6c
--- /dev/null
+++ b/share/man/man9/VOP_DEALLOCATE.9
@@ -0,0 +1,101 @@
+.\"
+.\" SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+.\"
+.\" Copyright (c) 2021 The FreeBSD Foundation
+.\"
+.\" This manual page was written by Ka Ho Ng under sponsorship from
+.\" the FreeBSD Foundation.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\"    notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\"    notice, this list of conditions and the following disclaimer in the
+.\"    documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.Dd May 11, 2021
+.Dt VOP_DEALLOCATE 9
+.Os
+.Sh NAME
+.Nm VOP_DEALLOCATE
+.Nd zero and/or deallocate storage from a file
+.Sh SYNOPSIS
+.In sys/param.h
+.In sys/vnode.h
+.Ft int
+.Fo VOP_DEALLOCATE
+.Fa "struct vnode *vp"
+.Fa "off_t *offset"
+.Fa "off_t *len"
+.Fa "int flags"
+.Fa "struct ucred *cred"
+.Fc
+.Sh DESCRIPTION
+This VOP call zeroes/deallocates storage for an offset range in a file.
+It is used to implement the
+.Xr fspacectl 2
+system call.
+.Pp
+Its arguments are:
+.Bl -tag -width offset
+.It Fa vp
+The vnode of the file.
+.It Fa offset
+The start of the range to deallocate storage in the file.
+.It Fa len
+The length of the range to deallocate storage in the file.
+.It Fa flags
+The flags of this call.
+This should be set to 0 for now.
+.It Fa cred
+The credentials of the caller.
+.El
+.Pp
+.Fa *offset
+and
+.Fa *len
+are updated to reflect the portion of the range that
+still needs to be zeroed/deallocated on return.
+A partial result is considered a successful operation.
+.Sh LOCKS
+The vnode should be locked on entry and will still be locked on exit.
+.Sh RETURN VALUES
+Zero is returned if the call is successful, otherwise an appropriate
+error code is returned.
+.Sh ERRORS
+.Bl -tag -width Er
+.It Bq Er EINVAL
+Invalid
+.Fa offset , len
+or
+.Fa flags
+parameters are passed into this VOP call.
+.It Bq Er ENODEV
+The vnode type is not supported by this VOP call.
+.It Bq Er ENOSPC
+The file system is full.
+.It Bq Er EPERM
+An append-only flag is set on the file, but the caller is attempting to
+zero before the current end of file.
+.El
+.Sh SEE ALSO
+.Xr vnode 9
+.Sh AUTHORS
+.Nm
+and this manual page were written by
+.An Ka Ho Ng Aq Mt khng@FreeBSD.org
+under sponsorship from the FreeBSD Foundation.
diff --git a/share/man/man9/vn_deallocate.9 b/share/man/man9/vn_deallocate.9
new file mode 100644
index 000000000000..415a8941ca68
--- /dev/null
+++ b/share/man/man9/vn_deallocate.9
@@ -0,0 +1,103 @@
+.\"
+.\" SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+.\"
+.\" Copyright (c) 2021 The FreeBSD Foundation
+.\"
+.\" This manual page was written by Ka Ho Ng under sponsorship from
+.\" the FreeBSD Foundation.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\"    notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\"    notice, this list of conditions and the following disclaimer in the
+.\"    documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.Dd July 30, 2021
+.Dt VN_DEALLOCATE 9
+.Os
+.Sh NAME
+.Nm vn_deallocate
+.Nd zero and/or deallocate storage from a file
+.Sh SYNOPSIS
+.In sys/param.h
+.In sys/vnode.h
+.Ft int
+.Fo vn_deallocate
+.Fa "struct vnode *vp"
+.Fa "off_t *offset"
+.Fa "off_t *length"
+.Fa "int flags"
+.Fa "int ioflg"
+.Fa "struct ucred *active_cred"
+.Fa "struct ucred *file_cred"
+.Fc
+.Sh DESCRIPTION
+The
+.Fn vn_deallocate
+function zeros and/or deallocates backing storage space from a file.
+This function only works on vnodes with
+.Dv VREG
+type.
+.Pp
+The arguments are:
+.Bl -tag -width active_cred
+.It Fa vp
+The vnode of the file.
+.It Fa offset
+The starting offset of the operation range.
+.It Fa length
+The length of the operation range.
+This must be greater than 0.
+.It Fa flags
+The control flags of the operation.
+This should be set to 0 for now.
+.It Fa ioflg
+The control flags of vnode locking.
+.It Fa active_cred
+The user credentials of the calling thread.
+.It Fa file_cred
+The credentials installed on the file description pointing to the vnode or NOCRED.
+.El
+.Pp
+The
+.Fa ioflg
+argument may be one or more of the following flags:
+.Bl -tag -width IO_RANGELOCKED
+.It Dv IO_NODELOCKED
+The vnode was locked before the call.
+.It Dv IO_RANGELOCKED
+The caller already owns the rangelock around the call.
+.It Dv IO_NOMACCHECK
+Skip MAC checking in the call.
+.El
+.Pp
+.Fa *offset
+and
+.Fa *length
+are updated to reflect the unprocessed operation range of the call.
+.Sh RETURN VALUES
+Upon successful completion, the value 0 is returned; otherwise the
+appropriate error is returned.
+.Sh SEE ALSO
+.Xr vnode 9 ,
+.Xr VOP_DEALLOCATE 9
+.Sh AUTHORS
+.Nm
+and this manual page were written by
+.An Ka Ho Ng Aq Mt khng@FreeBSD.org
+under sponsorship from the FreeBSD Foundation.
diff --git a/sys/bsm/audit_kevents.h b/sys/bsm/audit_kevents.h
index eeb928ecafdc..0da82de1fbcb 100644
--- a/sys/bsm/audit_kevents.h
+++ b/sys/bsm/audit_kevents.h
@@ -662,6 +662,7 @@
 #define	AUE_SPECIALFD		43266	/* FreeBSD-specific. */
 #define	AUE_AIO_WRITEV		43267	/* FreeBSD-specific. */
 #define	AUE_AIO_READV		43268	/* FreeBSD-specific. */
+#define	AUE_FSPACECTL		43269	/* FreeBSD-specific. */
 
 /*
  * Darwin BSM uses a number of AUE_O_* definitions, which are aliased to the
diff --git a/sys/compat/freebsd32/freebsd32.h b/sys/compat/freebsd32/freebsd32.h
index 2e4f5155cbf4..8a14a42db813 100644
--- a/sys/compat/freebsd32/freebsd32.h
+++ b/sys/compat/freebsd32/freebsd32.h
@@ -435,5 +435,9 @@ struct ptrace_coredump32 {
 	uint32_t	pc_limit1, pc_limit2;
 };
 
+struct spacectl_range32 {
+	uint32_t	r_offset1, r_offset2;
+	uint32_t	r_len1, r_len2;
+};
 
 #endif /* !_COMPAT_FREEBSD32_FREEBSD32_H_ */
diff --git a/sys/compat/freebsd32/freebsd32_misc.c b/sys/compat/freebsd32/freebsd32_misc.c
index 736fd1123d53..c417a64d286a 100644
--- a/sys/compat/freebsd32/freebsd32_misc.c
+++ b/sys/compat/freebsd32/freebsd32_misc.c
@@ -3857,3 +3857,37 @@ freebsd32_ntp_adjtime(struct thread *td, struct freebsd32_ntp_adjtime_args *uap)
 	}
 	return (error);
 }
+
+int
+freebsd32_fspacectl(struct thread *td, struct freebsd32_fspacectl_args *uap)
+{
+	struct spacectl_range rqsr, rmsr;
+	struct spacectl_range32 rqsr32, rmsr32;
+	int error, cerror;
+
+	error = copyin(uap->rqsr, &rqsr32, sizeof(rqsr32));
+	if (error != 0)
+		return (error);
+	rqsr.r_offset = PAIR32TO64(off_t, rqsr32.r_offset);
+	rqsr.r_len = PAIR32TO64(off_t, rqsr32.r_len);
+
+	error = kern_fspacectl(td, uap->fd, uap->cmd, &rqsr, uap->flags,
+	    &rmsr);
+	if (uap->rmsr != NULL) {
+#if BYTE_ORDER == LITTLE_ENDIAN
+		rmsr32.r_offset1 = rmsr.r_offset;
+		rmsr32.r_offset2 = rmsr.r_offset >> 32;
+		rmsr32.r_len1 = rmsr.r_len;
+		rmsr32.r_len2 = rmsr.r_len >> 32;
+#else
+		rmsr32.r_offset1 = rmsr.r_offset >> 32;
+		rmsr32.r_offset2 = rmsr.r_offset;
+		rmsr32.r_len1 = rmsr.r_len >> 32;
+		rmsr32.r_len2 = rmsr.r_len;
+#endif
+		cerror = copyout(&rmsr32, uap->rmsr, sizeof(rmsr32));
+		if (error == 0)
+			error = cerror;
+	}
+	return (error);
+}
diff --git a/sys/compat/freebsd32/syscalls.master b/sys/compat/freebsd32/syscalls.master
index aac788bf3956..3e53de2dc966 100644
--- a/sys/compat/freebsd32/syscalls.master
+++ b/sys/compat/freebsd32/syscalls.master
@@ -1176,5 +1176,10 @@
 				    struct aiocb32 *aiocbp); }
 579	AUE_AIO_READV	STD	{ int freebsd32_aio_readv( \
 				    struct aiocb32 *aiocbp); }
+580	AUE_FSPACECTL	STD	{ int freebsd32_fspacectl(int fd, \
+				    int cmd, \
+				    const struct spacectl_range32 *rqsr, \
+				    int flags, \
+				    struct spacectl_range32 *rmsr); }
 
 ; vim: syntax=off
diff --git a/sys/kern/capabilities.conf b/sys/kern/capabilities.conf
index 602ec7088fc6..f53530eb7fa7 100644
--- a/sys/kern/capabilities.conf
+++ b/sys/kern/capabilities.conf
@@ -228,6 +228,11 @@ freebsd6_mmap
 freebsd6_pread
 freebsd6_pwrite
 
+##
+## Allow I/O-related file operations, subject to capability rights.
+##
+fspacectl
+
 ##
 ## Allow querying file and file system state with fstat(2) and fstatfs(2),
 ## subject to capability rights.
diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c
index f86d494400e2..e6b2cba27a04 100644
--- a/sys/kern/sys_generic.c
+++ b/sys/kern/sys_generic.c
@@ -861,6 +861,76 @@ kern_posix_fallocate(struct thread *td, int fd, off_t offset, off_t len)
 	return (error);
 }
 
+int
+sys_fspacectl(struct thread *td, struct fspacectl_args *uap)
+{
+	struct spacectl_range rqsr, rmsr;
+	int error, cerror;
+
+	error = copyin(uap->rqsr, &rqsr, sizeof(rqsr));
+	if (error != 0)
+		return (error);
+
+	error = kern_fspacectl(td, uap->fd, uap->cmd, &rqsr, uap->flags,
+	    &rmsr);
+	if (uap->rmsr != NULL) {
+		cerror = copyout(&rmsr, uap->rmsr, sizeof(rmsr));
+		if (error == 0)
+			error = cerror;
+	}
+	return (error);
+}
+
+int
+kern_fspacectl(struct thread *td, int fd, int cmd,
+    const struct spacectl_range *rqsr, int flags, struct spacectl_range *rmsrp)
+{
+	struct file *fp;
+	struct spacectl_range rmsr;
+	int error;
+
+	AUDIT_ARG_FD(fd);
+	AUDIT_ARG_CMD(cmd);
+	AUDIT_ARG_FFLAGS(flags);
+
+	if (rqsr == NULL)
+		return (EINVAL);
+	rmsr = *rqsr;
+	if (rmsrp != NULL)
+		*rmsrp = rmsr;
+
+	if (cmd != SPACECTL_DEALLOC ||
+	    rqsr->r_offset < 0 || rqsr->r_len <= 0 ||
+	    rqsr->r_offset > OFF_MAX - rqsr->r_len ||
+	    (flags & ~SPACECTL_F_SUPPORTED) != 0)
+		return (EINVAL);
+
+	error = fget_write(td, fd, &cap_pwrite_rights, &fp);
+	if (error != 0)
+		return (error);
+	AUDIT_ARG_FILE(td->td_proc, fp);
+	if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) {
+		error = ESPIPE;
+		goto out;
+	}
+	if ((fp->f_flag & FWRITE) == 0) {
+		error = EBADF;
+		goto out;
+	}
+
+	error = fo_fspacectl(fp, cmd, &rmsr.r_offset, &rmsr.r_len, flags,
+	    td->td_ucred, td);
+	/* fspacectl is not restarted after signals if the file is modified. */
+	if (rmsr.r_len != rqsr->r_len && (error == ERESTART ||
+	    error == EINTR || error == EWOULDBLOCK))
+		error = 0;
+	if (rmsrp != NULL)
+		*rmsrp = rmsr;
+out:
+	fdrop(fp, td);
+	return (error);
+}
+
 int
 kern_specialfd(struct thread *td, int type, void *arg)
 {
diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master
index af787908451a..11247aed8fd6 100644
--- a/sys/kern/syscalls.master
+++ b/sys/kern/syscalls.master
@@ -3250,6 +3250,15 @@
 		    _Inout_ struct aiocb *aiocbp
 		);
 	}
+580	AUE_FSPACECTL	STD {
+		int fspacectl(
+		    int fd,
+		    int cmd,
+		    _In_ const struct spacectl_range *rqsr,
+		    int flags,
+		    _Out_opt_ struct spacectl_range *rmsr,
+		);
+	}
 
 ; Please copy any additions and changes to the following compatability tables:
 ; sys/compat/freebsd32/syscalls.master
diff --git a/sys/kern/vfs_default.c b/sys/kern/vfs_default.c
index 63bca7810847..c42d5a795935 100644
--- a/sys/kern/vfs_default.c
+++ b/sys/kern/vfs_default.c
@@ -93,6 +93,7 @@ static int vop_stdgetpages_async(struct vop_getpages_async_args *ap);
 static int vop_stdread_pgcache(struct vop_read_pgcache_args *ap);
 static int vop_stdstat(struct vop_stat_args *ap);
 static int vop_stdvput_pair(struct vop_vput_pair_args *ap);
+static int vop_stddeallocate(struct vop_deallocate_args *ap);
 
 /*
  * This vnode table stores what we want to do if the filesystem doesn't
@@ -117,6 +118,7 @@ struct vop_vector default_vnodeops = {
 	.vop_advlockasync =	vop_stdadvlockasync,
 	.vop_advlockpurge =	vop_stdadvlockpurge,
 	.vop_allocate =		vop_stdallocate,
+	.vop_deallocate =	vop_stddeallocate,
 	.vop_bmap =		vop_stdbmap,
 	.vop_close =		VOP_NULL,
 	.vop_fsync =		VOP_NULL,
@@ -518,6 +520,7 @@ vop_stdpathconf(ap)
 		case _PC_ACL_EXTENDED:
 		case _PC_ACL_NFS4:
 		case _PC_CAP_PRESENT:
+		case _PC_DEALLOC_PRESENT:
 		case _PC_INF_PRESENT:
 		case _PC_MAC_PRESENT:
 			*ap->a_retval = 0;
@@ -1069,6 +1072,125 @@ vop_stdallocate(struct vop_allocate_args *ap)
 	return (error);
 }
 
+static int
+vp_zerofill(struct vnode *vp, struct vattr *vap, off_t *offsetp, off_t *lenp,
+    struct ucred *cred)
+{
+	int iosize;
+	int error = 0;
+	struct iovec aiov;
+	struct uio auio;
+	struct thread *td;
+	off_t offset, len;
+
+	iosize = vap->va_blocksize;
+	td = curthread;
+	offset = *offsetp;
+	len = *lenp;
+
+	if (iosize == 0)
+		iosize = BLKDEV_IOSIZE;
+	/* If va_blocksize is 512 bytes, iosize will be 4 kilobytes */
+	iosize = min(iosize * 8, ZERO_REGION_SIZE);
+
+	while (len > 0) {
+		int xfersize = iosize;
+		if (offset % iosize != 0)
+			xfersize -= offset % iosize;
+		if (xfersize > len)
+			xfersize = len;
+
+		aiov.iov_base = __DECONST(void *, zero_region);
+		aiov.iov_len = xfersize;
+		auio.uio_iov = &aiov;
+		auio.uio_iovcnt = 1;
+		auio.uio_offset = offset;
+		auio.uio_resid = xfersize;
+		auio.uio_segflg = UIO_SYSSPACE;
+		auio.uio_rw = UIO_WRITE;
+		auio.uio_td = td;
+
+		error = VOP_WRITE(vp, &auio, 0, cred);
+		if (error != 0) {
+			len -= xfersize - auio.uio_resid;
+			offset += xfersize - auio.uio_resid;
+			break;
+		}
+
+		len -= xfersize;
+		offset += xfersize;
+	}
+
+	*offsetp = offset;
+	*lenp = len;
+	return (error);
+}
+
+static int
+vop_stddeallocate(struct vop_deallocate_args *ap)
+{
+	struct vnode *vp;
+	off_t offset, len;
+	struct ucred *cred;
+	int error;
+	struct vattr va;
+	off_t noff, xfersize, rem;
+
+	vp = ap->a_vp;
+	offset = *ap->a_offset;
+	len = *ap->a_len;
+	cred = ap->a_cred;
+
+	error = VOP_GETATTR(vp, &va, cred);
+	if (error)
+		return (error);
+
+	len = omin(OFF_MAX - offset, *ap->a_len);
+	while (len > 0) {
+		noff = offset;
+		error = vn_bmap_seekhole_locked(vp, FIOSEEKDATA, &noff, cred);
+		if (error) {
+			if (error != ENXIO)
+				/* XXX: Is it okay to fallback further? */
+				goto out;
+
+			/*
+			 * No more data region to be filled
+			 */
+			len = 0;
+			error = 0;
+			break;
+		}
+		KASSERT(noff >= offset, ("FIOSEEKDATA going backward"));
+		if (noff != offset) {
+			xfersize = omin(noff - offset, len);
+			len -= xfersize;
+			offset += xfersize;
+			if (len == 0)
+				break;
+		}
+		error = vn_bmap_seekhole_locked(vp, FIOSEEKHOLE, &noff, cred);
+		if (error)
+			goto out;
+
+		/* Fill zeroes */
+		xfersize = rem = omin(noff - offset, len);
+		error = vp_zerofill(vp, &va, &offset, &rem, cred);
+		if (error) {
+			len -= xfersize - rem;
+			goto out;
+		}
+
+		len -= xfersize;
+		if (should_yield())
+			break;
+	}
+out:
+	*ap->a_offset = offset;
+	*ap->a_len = len;
+	return (error);
+}
+
 int
 vop_stdadvise(struct vop_advise_args *ap)
 {
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
index ccc468d71737..c54f55a99036 100644
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@@ -106,6 +106,7 @@ static fo_kqfilter_t	vn_kqfilter;
 static fo_close_t	vn_closefile;
 static fo_mmap_t	vn_mmap;
 static fo_fallocate_t	vn_fallocate;
+static fo_fspacectl_t	vn_fspacectl;
 
 struct 	fileops vnops = {
 	.fo_read = vn_io_fault,
@@ -123,6 +124,7 @@ struct 	fileops vnops = {
 	.fo_fill_kinfo = vn_fill_kinfo,
 	.fo_mmap = vn_mmap,
 	.fo_fallocate = vn_fallocate,
+	.fo_fspacectl = vn_fspacectl,
 	.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
 };
 
@@ -3439,6 +3441,114 @@ vn_fallocate(struct file *fp, off_t offset, off_t len, struct thread *td)
 	return (error);
 }
 
+static int
+vn_deallocate_impl(struct vnode *vp, off_t *offset, off_t *length, int flags,
+    int ioflg, struct ucred *active_cred, struct ucred *file_cred)
+{
+	struct mount *mp;
+	void *rl_cookie;
+	off_t off, len;
+	int error;
+#ifdef AUDIT
+	bool audited_vnode1 = false;
+#endif
+
+	rl_cookie = NULL;
+	error = 0;
+	mp = NULL;
+	off = *offset;
+	len = *length;
+
+	if ((ioflg & (IO_NODELOCKED|IO_RANGELOCKED)) == 0)
+		rl_cookie = vn_rangelock_wlock(vp, off, off + len);
+	while (len > 0 && error == 0) {
+		/*
+		 * Try to deallocate the longest range in one pass.
+		 * In case a pass takes too long to be executed, it returns a
+		 * partial result. The residue will be processed in the next
+		 * pass.
+		 */
+
+		if ((ioflg & IO_NODELOCKED) == 0) {
+			bwillwrite();
+			if ((error = vn_start_write(vp, &mp,
+			    V_WAIT | PCATCH)) != 0)
+				goto out;
+			vn_lock(vp, vn_lktype_write(mp, vp) | LK_RETRY);
+		}
+#ifdef AUDIT
+		if (!audited_vnode1) {
+			AUDIT_ARG_VNODE1(vp);
+			audited_vnode1 = true;
+		}
+#endif
+
+#ifdef MAC
+		if ((ioflg & IO_NOMACCHECK) == 0)
+			error = mac_vnode_check_write(active_cred, file_cred,
+			    vp);
+#endif
+		if (error == 0)
+			error = VOP_DEALLOCATE(vp, &off, &len, flags,
+			    active_cred);
+
+		if ((ioflg & IO_NODELOCKED) == 0) {
+			VOP_UNLOCK(vp);
+			if (mp != NULL) {
+				vn_finished_write(mp);
+				mp = NULL;
+			}
+		}
+	}
+out:
+	if (rl_cookie != NULL)
+		vn_rangelock_unlock(vp, rl_cookie);
+	*offset = off;
+	*length = len;
+	return (error);
+}
+
+int
+vn_deallocate(struct vnode *vp, off_t *offset, off_t *length, int flags,
+    int ioflg, struct ucred *active_cred, struct ucred *file_cred)
+{
+	if (*offset < 0 || *length <= 0 || *length > OFF_MAX - *offset ||
+	    flags != 0)
+		return (EINVAL);
+	if (vp->v_type != VREG)
+		return (ENODEV);
+
+	return (vn_deallocate_impl(vp, offset, length, flags, ioflg,
+	    active_cred, file_cred));
+}
+
*** 562 LINES SKIPPED ***



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?202108051523.175FNaGp086202>