From owner-svn-src-all@FreeBSD.ORG Wed Sep 11 06:41:16 2013 Return-Path: Delivered-To: svn-src-all@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:1900:2254:206a::19:1]) (using TLSv1 with cipher ADH-AES256-SHA (256/256 bits)) (No client certificate requested) by hub.freebsd.org (Postfix) with ESMTP id C38F5A38; Wed, 11 Sep 2013 06:41:16 +0000 (UTC) (envelope-from kib@FreeBSD.org) Received: from svn.freebsd.org (svn.freebsd.org [IPv6:2001:1900:2254:2068::e6a:0]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by mx1.freebsd.org (Postfix) with ESMTPS id A21CD2839; Wed, 11 Sep 2013 06:41:16 +0000 (UTC) Received: from svn.freebsd.org ([127.0.1.70]) by svn.freebsd.org (8.14.7/8.14.7) with ESMTP id r8B6fGcm018861; Wed, 11 Sep 2013 06:41:16 GMT (envelope-from kib@svn.freebsd.org) Received: (from kib@localhost) by svn.freebsd.org (8.14.7/8.14.5/Submit) id r8B6fGQU018859; Wed, 11 Sep 2013 06:41:16 GMT (envelope-from kib@svn.freebsd.org) Message-Id: <201309110641.r8B6fGQU018859@svn.freebsd.org> From: Konstantin Belousov Date: Wed, 11 Sep 2013 06:41:16 +0000 (UTC) To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org Subject: svn commit: r255467 - head/sys/kern X-SVN-Group: head MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit X-BeenThere: svn-src-all@freebsd.org X-Mailman-Version: 2.1.14 Precedence: list List-Id: "SVN commit messages for the entire src tree \(except for " user" and " projects" \)" List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Wed, 11 Sep 2013 06:41:16 -0000 Author: kib Date: Wed Sep 11 06:41:15 2013 New Revision: 255467 URL: http://svnweb.freebsd.org/changeset/base/255467 Log: Implement sendfile(2) for the posix shared memory segment file descriptor, in addition to the regular files. Requested by: alc Discussed with: emaste Tested by: pho (previous version) Sponsored by: The FreeBSD Foundation Approved by: re (hrs) Modified: head/sys/kern/uipc_shm.c head/sys/kern/uipc_syscalls.c Modified: head/sys/kern/uipc_shm.c ============================================================================== --- head/sys/kern/uipc_shm.c Wed Sep 11 06:16:12 2013 (r255466) +++ head/sys/kern/uipc_shm.c Wed Sep 11 06:41:15 2013 (r255467) @@ -134,7 +134,7 @@ static struct fileops shm_ops = { .fo_close = shm_close, .fo_chmod = shm_chmod, .fo_chown = shm_chown, - .fo_sendfile = invfo_sendfile, + .fo_sendfile = vn_sendfile, .fo_seek = shm_seek, .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE }; Modified: head/sys/kern/uipc_syscalls.c ============================================================================== --- head/sys/kern/uipc_syscalls.c Wed Sep 11 06:16:12 2013 (r255466) +++ head/sys/kern/uipc_syscalls.c Wed Sep 11 06:41:15 2013 (r255467) @@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -57,6 +58,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -86,7 +88,7 @@ __FBSDID("$FreeBSD$"); #include #include #include -#include +#include #include #include @@ -1850,8 +1852,6 @@ getsockaddr(namp, uaddr, len) return (error); } -#include - struct sendfile_sync { struct mtx mtx; struct cv cv; @@ -1917,6 +1917,10 @@ do_sendfile(struct thread *td, struct se cap_rights_t rights; int error; + /* + * File offset must be positive. If it goes beyond EOF + * we send only the header/trailer and no payload data. + */ if (uap->offset < 0) return (EINVAL); @@ -1978,79 +1982,240 @@ freebsd4_sendfile(struct thread *td, str } #endif /* COMPAT_FREEBSD4 */ -int -vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, - struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, - int kflags, struct thread *td) +static int +sendfile_readpage(vm_object_t obj, struct vnode *vp, int nd, + off_t off, int xfsize, int bsize, struct thread *td, vm_page_t *res) { - struct vnode *vp = fp->f_vnode; - struct file *sock_fp; - struct vm_object *obj = NULL; - struct socket *so = NULL; - struct mbuf *m = NULL; - struct sf_buf *sf; - struct vm_page *pg; - struct vattr va; - struct sendfile_sync *sfs = NULL; - cap_rights_t rights; - off_t off, xfsize, fsbytes = 0, sbytes = 0, rem = 0; - int bsize, error, hdrlen = 0, mnw = 0; + vm_page_t m; + vm_pindex_t pindex; + ssize_t resid; + int error, readahead, rv; + + pindex = OFF_TO_IDX(off); + VM_OBJECT_WLOCK(obj); + m = vm_page_grab(obj, pindex, (vp != NULL ? VM_ALLOC_NOBUSY | + VM_ALLOC_IGN_SBUSY : 0) | VM_ALLOC_WIRED | VM_ALLOC_NORMAL); - vn_lock(vp, LK_SHARED | LK_RETRY); - if (vp->v_type == VREG) { - bsize = vp->v_mount->mnt_stat.f_iosize; - if (nbytes == 0) { - error = VOP_GETATTR(vp, &va, td->td_ucred); - if (error != 0) { - VOP_UNLOCK(vp, 0); - obj = NULL; - goto out; + /* + * Check if page is valid for what we need, otherwise initiate I/O. + * + * The non-zero nd argument prevents disk I/O, instead we + * return the caller what he specified in nd. In particular, + * if we already turned some pages into mbufs, nd == EAGAIN + * and the main function send them the pages before we come + * here again and block. + */ + if (m->valid != 0 && vm_page_is_valid(m, off & PAGE_MASK, xfsize)) { + if (vp == NULL) + vm_page_xunbusy(m); + VM_OBJECT_WUNLOCK(obj); + *res = m; + return (0); + } else if (nd != 0) { + if (vp == NULL) + vm_page_xunbusy(m); + error = nd; + goto free_page; + } + + /* + * Get the page from backing store. + */ + error = 0; + if (vp != NULL) { + VM_OBJECT_WUNLOCK(obj); + readahead = sfreadahead * MAXBSIZE; + + /* + * Use vn_rdwr() instead of the pager interface for + * the vnode, to allow the read-ahead. + * + * XXXMAC: Because we don't have fp->f_cred here, we + * pass in NOCRED. This is probably wrong, but is + * consistent with our original implementation. + */ + error = vn_rdwr(UIO_READ, vp, NULL, readahead, trunc_page(off), + UIO_NOCOPY, IO_NODELOCKED | IO_VMIO | ((readahead / + bsize) << IO_SEQSHIFT), td->td_ucred, NOCRED, &resid, td); + SFSTAT_INC(sf_iocnt); + VM_OBJECT_WLOCK(obj); + } else { + if (vm_pager_has_page(obj, pindex, NULL, NULL)) { + rv = vm_pager_get_pages(obj, &m, 1, 0); + SFSTAT_INC(sf_iocnt); + m = vm_page_lookup(obj, pindex); + if (m == NULL) + error = EIO; + else if (rv != VM_PAGER_OK) { + vm_page_lock(m); + vm_page_free(m); + vm_page_unlock(m); + m = NULL; + error = EIO; } - rem = va.va_size; - } else - rem = nbytes; + } else { + pmap_zero_page(m); + m->valid = VM_PAGE_BITS_ALL; + m->dirty = 0; + } + if (m != NULL) + vm_page_xunbusy(m); + } + if (error == 0) { + *res = m; + } else if (m != NULL) { +free_page: + vm_page_lock(m); + vm_page_unwire(m, 0); + + /* + * See if anyone else might know about this page. If + * not and it is not valid, then free it. + */ + if (m->wire_count == 0 && m->valid == 0 && !vm_page_busied(m)) + vm_page_free(m); + vm_page_unlock(m); + } + VM_OBJECT_WUNLOCK(obj); + KASSERT(error != 0 || (m->wire_count > 0 && m->valid == + VM_PAGE_BITS_ALL), + ("wrong page state m %p", m)); + return (error); +} + +static int +sendfile_getobj(struct thread *td, struct file *fp, vm_object_t *obj_res, + struct vnode **vp_res, struct shmfd **shmfd_res, off_t *obj_size, + int *bsize) +{ + struct vattr va; + vm_object_t obj; + struct vnode *vp; + struct shmfd *shmfd; + int error; + + vp = *vp_res = NULL; + obj = NULL; + shmfd = *shmfd_res = NULL; + *bsize = 0; + + /* + * The file descriptor must be a regular file and have a + * backing VM object. + */ + if (fp->f_type == DTYPE_VNODE) { + vp = fp->f_vnode; + vn_lock(vp, LK_SHARED | LK_RETRY); + if (vp->v_type != VREG) { + error = EINVAL; + goto out; + } + *bsize = vp->v_mount->mnt_stat.f_iosize; + error = VOP_GETATTR(vp, &va, td->td_ucred); + if (error != 0) + goto out; + *obj_size = va.va_size; obj = vp->v_object; - if (obj != NULL) { - /* - * Temporarily increase the backing VM - * object's reference count so that a forced - * reclamation of its vnode does not - * immediately destroy it. - */ - VM_OBJECT_WLOCK(obj); - if ((obj->flags & OBJ_DEAD) == 0) { - vm_object_reference_locked(obj); - VM_OBJECT_WUNLOCK(obj); - } else { - VM_OBJECT_WUNLOCK(obj); - obj = NULL; - } + if (obj == NULL) { + error = EINVAL; + goto out; } - } else - bsize = 0; /* silence gcc */ - VOP_UNLOCK(vp, 0); - if (obj == NULL) { + } else if (fp->f_type == DTYPE_SHM) { + shmfd = fp->f_data; + obj = shmfd->shm_object; + *obj_size = shmfd->shm_size; + } else { error = EINVAL; goto out; } + VM_OBJECT_WLOCK(obj); + if ((obj->flags & OBJ_DEAD) != 0) { + VM_OBJECT_WUNLOCK(obj); + error = EBADF; + goto out; + } + + /* + * Temporarily increase the backing VM object's reference + * count so that a forced reclamation of its vnode does not + * immediately destroy it. + */ + vm_object_reference_locked(obj); + VM_OBJECT_WUNLOCK(obj); + *obj_res = obj; + *vp_res = vp; + *shmfd_res = shmfd; + +out: + if (vp != NULL) + VOP_UNLOCK(vp, 0); + return (error); +} + +static int +kern_sendfile_getsock(struct thread *td, int s, struct file **sock_fp, + struct socket **so) +{ + cap_rights_t rights; + int error; + + *sock_fp = NULL; + *so = NULL; + /* * The socket must be a stream socket and connected. - * Remember if it a blocking or non-blocking socket. */ - error = getsock_cap(td->td_proc->p_fd, sockfd, - cap_rights_init(&rights, CAP_SEND), &sock_fp, NULL); + error = getsock_cap(td->td_proc->p_fd, s, cap_rights_init(&rights, + CAP_SEND), sock_fp, NULL); + if (error != 0) + return (error); + *so = (*sock_fp)->f_data; + if ((*so)->so_type != SOCK_STREAM) + return (EINVAL); + if (((*so)->so_state & SS_ISCONNECTED) == 0) + return (ENOTCONN); + return (0); +} + +int +vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, + struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, + int kflags, struct thread *td) +{ + struct file *sock_fp; + struct vnode *vp; + struct vm_object *obj; + struct socket *so; + struct mbuf *m; + struct sf_buf *sf; + struct vm_page *pg; + struct shmfd *shmfd; + struct sendfile_sync *sfs; + struct vattr va; + off_t off, xfsize, fsbytes, sbytes, rem, obj_size; + int error, bsize, nd, hdrlen, mnw; + bool inflight_called; + + obj = NULL; + so = NULL; + m = NULL; + sfs = NULL; + fsbytes = sbytes = 0; + hdrlen = mnw = 0; + rem = nbytes; + inflight_called = false; + + error = sendfile_getobj(td, fp, &obj, &vp, &shmfd, &obj_size, &bsize); + if (error != 0) + return (error); + if (rem == 0) + rem = obj_size; + + error = kern_sendfile_getsock(td, sockfd, &sock_fp, &so); if (error != 0) goto out; - so = sock_fp->f_data; - if (so->so_type != SOCK_STREAM) { - error = EINVAL; - goto out; - } - if ((so->so_state & SS_ISCONNECTED) == 0) { - error = ENOTCONN; - goto out; - } + /* * Do not wait on memory allocations but return ENOMEM for * caller to retry later. @@ -2123,7 +2288,7 @@ vn_sendfile(struct file *fp, int sockfd, int done; if ((nbytes != 0 && nbytes == fsbytes) || - (nbytes == 0 && va.va_size == fsbytes)) + (nbytes == 0 && obj_size == fsbytes)) break; mtail = NULL; @@ -2197,13 +2362,16 @@ retry_space: */ space -= hdrlen; - error = vn_lock(vp, LK_SHARED); - if (error != 0) - goto done; - error = VOP_GETATTR(vp, &va, td->td_ucred); - if (error != 0 || off >= va.va_size) { - VOP_UNLOCK(vp, 0); - goto done; + if (vp != NULL) { + error = vn_lock(vp, LK_SHARED); + if (error != 0) + goto done; + error = VOP_GETATTR(vp, &va, td->td_ucred); + if (error != 0 || off >= va.va_size) { + VOP_UNLOCK(vp, 0); + goto done; + } + obj_size = va.va_size; } /* @@ -2211,7 +2379,6 @@ retry_space: * dumped into socket buffer. */ while (space > loopbytes) { - vm_pindex_t pindex; vm_offset_t pgoff; struct mbuf *m0; @@ -2221,7 +2388,7 @@ retry_space: * or the passed in nbytes. */ pgoff = (vm_offset_t)(off & PAGE_MASK); - rem = va.va_size - offset; + rem = obj_size - offset; if (nbytes != 0) rem = omin(rem, nbytes); rem -= fsbytes + loopbytes; @@ -2236,59 +2403,15 @@ retry_space: * Attempt to look up the page. Allocate * if not found or wait and loop if busy. */ - pindex = OFF_TO_IDX(off); - VM_OBJECT_WLOCK(obj); - pg = vm_page_grab(obj, pindex, VM_ALLOC_NOBUSY | - VM_ALLOC_IGN_SBUSY | VM_ALLOC_NORMAL | - VM_ALLOC_WIRED); - - /* - * Check if page is valid for what we need, - * otherwise initiate I/O. - * If we already turned some pages into mbufs, - * send them off before we come here again and - * block. - */ - if (pg->valid && vm_page_is_valid(pg, pgoff, xfsize)) - VM_OBJECT_WUNLOCK(obj); - else if (m != NULL) - error = EAGAIN; /* send what we already got */ - else if (flags & SF_NODISKIO) - error = EBUSY; - else { - ssize_t resid; - int readahead = sfreadahead * MAXBSIZE; - - VM_OBJECT_WUNLOCK(obj); - - /* - * Get the page from backing store. - * XXXMAC: Because we don't have fp->f_cred - * here, we pass in NOCRED. This is probably - * wrong, but is consistent with our original - * implementation. - */ - error = vn_rdwr(UIO_READ, vp, NULL, readahead, - trunc_page(off), UIO_NOCOPY, IO_NODELOCKED | - IO_VMIO | ((readahead / bsize) << IO_SEQSHIFT), - td->td_ucred, NOCRED, &resid, td); - SFSTAT_INC(sf_iocnt); - if (error != 0) - VM_OBJECT_WLOCK(obj); - } + if (m != NULL) + nd = EAGAIN; /* send what we already got */ + else if ((flags & SF_NODISKIO) != 0) + nd = EBUSY; + else + nd = 0; + error = sendfile_readpage(obj, vp, nd, off, + xfsize, bsize, td, &pg); if (error != 0) { - vm_page_lock(pg); - vm_page_unwire(pg, 0); - /* - * See if anyone else might know about - * this page. If not and it is not valid, - * then free it. - */ - if (pg->wire_count == 0 && pg->valid == 0 && - !vm_page_busied(pg)) - vm_page_free(pg); - vm_page_unlock(pg); - VM_OBJECT_WUNLOCK(obj); if (error == EAGAIN) error = 0; /* not a real error */ break; @@ -2358,7 +2481,8 @@ retry_space: } } - VOP_UNLOCK(vp, 0); + if (vp != NULL) + VOP_UNLOCK(vp, 0); /* Add the buffer chain to the socket buffer. */ if (m != NULL) {