Skip site navigation (1)Skip section navigation (2)
Date:      Thu, 6 Apr 2017 09:34:54 +0000 (UTC)
From:      Hans Petter Selasky <hselasky@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r316562 - in head/sys/compat/linuxkpi/common: include/linux src
Message-ID:  <201704060934.v369Ys50077560@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: hselasky
Date: Thu Apr  6 09:34:54 2017
New Revision: 316562
URL: https://svnweb.freebsd.org/changeset/base/316562

Log:
  Implement proper support for memory map operations in the LinuxKPI,
  like open, close and fault using the character device pager.
  
  Some notes about the implementation:
  
  1) Linux drivers set the vm_ops and vm_private_data fields during an
  mmap() call to indicate that the driver wants to use the LinuxKPI VM
  operations. Otherwise these operations are not used.
  
  2) The vm_private_data pointer is associated with a VM area structure
  and inserted into an internal LinuxKPI list. If a VM area structure
  with the same vm_private_data pointer is already in the list, the
  existing structure is used and the newly allocated one is freed.
  
  3) The LinuxKPI's vm_private_data pointer is used as the callback
  handle for the FreeBSD VM object. The VM subsystem in FreeBSD has a
  similar list to identify equal handles and will only call the
  character device pager's close function once.
  
  4) All LinuxKPI VM operations are serialized through the mmap_sem
  semaphore, which is per process, which prevents simultaneous access
  to the shared VM area structure when receiving page faults.
  
  Obtained from:		kmacy @
  MFC after:		1 week
  Sponsored by:		Mellanox Technologies

Modified:
  head/sys/compat/linuxkpi/common/include/linux/mm.h
  head/sys/compat/linuxkpi/common/include/linux/page.h
  head/sys/compat/linuxkpi/common/src/linux_compat.c

Modified: head/sys/compat/linuxkpi/common/include/linux/mm.h
==============================================================================
--- head/sys/compat/linuxkpi/common/include/linux/mm.h	Thu Apr  6 09:07:01 2017	(r316561)
+++ head/sys/compat/linuxkpi/common/include/linux/mm.h	Thu Apr  6 09:34:54 2017	(r316562)
@@ -38,6 +38,7 @@
 #include <linux/kernel.h>
 #include <linux/mm_types.h>
 #include <linux/pfn.h>
+#include <linux/list.h>
 
 #include <asm/pgtable.h>
 
@@ -89,12 +90,25 @@ CTASSERT((VM_PROT_ALL & -(1 << 8)) == 0)
 typedef int (*pte_fn_t)(pte_t *, pgtable_t, unsigned long addr, void *data);
 
 struct vm_area_struct {
-	vm_offset_t	vm_start;
-	vm_offset_t	vm_end;
-	vm_offset_t	vm_pgoff;
-	vm_paddr_t	vm_pfn;		/* PFN For mmap. */
-	vm_size_t	vm_len;		/* length for mmap. */
-	vm_memattr_t	vm_page_prot;
+	vm_offset_t vm_start;
+	vm_offset_t vm_end;
+	vm_offset_t vm_pgoff;
+	pgprot_t vm_page_prot;
+	unsigned long vm_flags;
+	struct mm_struct *vm_mm;
+	void   *vm_private_data;
+	const struct vm_operations_struct *vm_ops;
+	struct linux_file *vm_file;
+
+	/* internal operation */
+	vm_paddr_t vm_pfn;		/* PFN for memory map */
+	vm_size_t vm_len;		/* length for memory map */
+	vm_pindex_t vm_pfn_first;
+	int	vm_pfn_count;
+	int    *vm_pfn_pcount;
+	vm_object_t vm_obj;
+	vm_map_t vm_cached_map;
+	TAILQ_ENTRY(vm_area_struct) vm_entry;
 };
 
 struct vm_fault {

Modified: head/sys/compat/linuxkpi/common/include/linux/page.h
==============================================================================
--- head/sys/compat/linuxkpi/common/include/linux/page.h	Thu Apr  6 09:07:01 2017	(r316561)
+++ head/sys/compat/linuxkpi/common/include/linux/page.h	Thu Apr  6 09:34:54 2017	(r316562)
@@ -47,6 +47,28 @@ typedef unsigned long pgprot_t;
 
 #define page	vm_page
 
+#define	LINUXKPI_PROT_VALID (1 << 4)
+#define	LINUXKPI_CACHE_MODE_SHIFT 3
+
+static inline pgprot_t
+cachemode2protval(vm_memattr_t attr)
+{
+	return ((attr | LINUXKPI_PROT_VALID) << LINUXKPI_CACHE_MODE_SHIFT);
+}
+
+static inline vm_memattr_t
+pgprot2cachemode(pgprot_t prot)
+{
+	int val;
+
+	val = prot >> LINUXKPI_CACHE_MODE_SHIFT;
+
+	if (val & LINUXKPI_PROT_VALID)
+		return (val & ~LINUXKPI_PROT_VALID);
+	else
+		return (VM_MEMATTR_DEFAULT);
+}
+
 #define	virt_to_page(x)		PHYS_TO_VM_PAGE(vtophys((x)))
 #define	page_to_pfn(pp)		(VM_PAGE_TO_PHYS((pp)) >> PAGE_SHIFT)
 #define	pfn_to_page(pfn)	(PHYS_TO_VM_PAGE((pfn) << PAGE_SHIFT))

Modified: head/sys/compat/linuxkpi/common/src/linux_compat.c
==============================================================================
--- head/sys/compat/linuxkpi/common/src/linux_compat.c	Thu Apr  6 09:07:01 2017	(r316561)
+++ head/sys/compat/linuxkpi/common/src/linux_compat.c	Thu Apr  6 09:34:54 2017	(r316562)
@@ -2,7 +2,7 @@
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
- * Copyright (c) 2013-2016 Mellanox Technologies, Ltd.
+ * Copyright (c) 2013-2017 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -88,6 +88,8 @@ MALLOC_DEFINE(M_KMALLOC, "linux", "Linux
 #undef cdev
 #define	RB_ROOT(head)	(head)->rbh_root
 
+static struct vm_area_struct *linux_cdev_handle_find(void *handle);
+
 struct kobject linux_class_root;
 struct device linux_root_device;
 struct class linux_class_misc;
@@ -394,6 +396,166 @@ linux_file_dtor(void *cdp)
 }
 
 static int
+linux_cdev_pager_populate(vm_object_t vm_obj, vm_pindex_t pidx, int fault_type,
+    vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last)
+{
+	struct vm_area_struct *vmap;
+	struct vm_fault vmf;
+	int err;
+
+	linux_set_current(curthread);
+
+	/* get VM area structure */
+	vmap = linux_cdev_handle_find(vm_obj->handle);
+	MPASS(vmap != NULL);
+	MPASS(vmap->vm_private_data == vm_obj->handle);
+
+	/* fill out VM fault structure */
+	vmf.virtual_address = (void *)(pidx << PAGE_SHIFT);
+	vmf.flags = (fault_type & VM_PROT_WRITE) ? FAULT_FLAG_WRITE : 0;
+	vmf.pgoff = 0;
+	vmf.page = NULL;
+
+	VM_OBJECT_WUNLOCK(vm_obj);
+
+	down_write(&vmap->vm_mm->mmap_sem);
+	if (unlikely(vmap->vm_ops == NULL)) {
+		err = VM_FAULT_SIGBUS;
+	} else {
+		vmap->vm_pfn_count = 0;
+		vmap->vm_pfn_pcount = &vmap->vm_pfn_count;
+		vmap->vm_obj = vm_obj;
+
+		err = vmap->vm_ops->fault(vmap, &vmf);
+
+		while (vmap->vm_pfn_count == 0 && err == VM_FAULT_NOPAGE) {
+			kern_yield(0);
+			err = vmap->vm_ops->fault(vmap, &vmf);
+		}
+	}
+
+	/* translate return code */
+	switch (err) {
+	case VM_FAULT_OOM:
+		err = VM_PAGER_AGAIN;
+		break;
+	case VM_FAULT_SIGBUS:
+		err = VM_PAGER_BAD;
+		break;
+	case VM_FAULT_NOPAGE:
+		/*
+		 * By contract the fault handler will return having
+		 * busied all the pages itself. If pidx is already
+		 * found in the object, it will simply xbusy the first
+		 * page and return with vm_pfn_count set to 1.
+		 */
+		*first = vmap->vm_pfn_first;
+		*last = *first + vmap->vm_pfn_count - 1;
+		err = VM_PAGER_OK;
+		break;
+	default:
+		err = VM_PAGER_ERROR;
+		break;
+	}
+	up_write(&vmap->vm_mm->mmap_sem);
+	VM_OBJECT_WLOCK(vm_obj);
+	return (err);
+}
+
+static struct rwlock linux_vma_lock;
+static TAILQ_HEAD(, vm_area_struct) linux_vma_head =
+    TAILQ_HEAD_INITIALIZER(linux_vma_head);
+
+static struct vm_area_struct *
+linux_cdev_handle_insert(void *handle, struct vm_area_struct *vmap)
+{
+	struct vm_area_struct *ptr;
+
+	rw_wlock(&linux_vma_lock);
+	TAILQ_FOREACH(ptr, &linux_vma_head, vm_entry) {
+		if (ptr->vm_private_data == handle) {
+			rw_wunlock(&linux_vma_lock);
+			kfree(vmap);
+			return (NULL);
+		}
+	}
+	TAILQ_INSERT_TAIL(&linux_vma_head, vmap, vm_entry);
+	rw_wunlock(&linux_vma_lock);
+	return (vmap);
+}
+
+static void
+linux_cdev_handle_remove(struct vm_area_struct *vmap)
+{
+	if (vmap == NULL)
+		return;
+
+	rw_wlock(&linux_vma_lock);
+	TAILQ_REMOVE(&linux_vma_head, vmap, vm_entry);
+	rw_wunlock(&linux_vma_lock);
+	kfree(vmap);
+}
+
+static struct vm_area_struct *
+linux_cdev_handle_find(void *handle)
+{
+	struct vm_area_struct *vmap;
+
+	rw_rlock(&linux_vma_lock);
+	TAILQ_FOREACH(vmap, &linux_vma_head, vm_entry) {
+		if (vmap->vm_private_data == handle)
+			break;
+	}
+	rw_runlock(&linux_vma_lock);
+	return (vmap);
+}
+
+static int
+linux_cdev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot,
+		      vm_ooffset_t foff, struct ucred *cred, u_short *color)
+{
+	const struct vm_operations_struct *vm_ops;
+	struct vm_area_struct *vmap;
+
+	vmap = linux_cdev_handle_find(handle);
+	MPASS(vmap != NULL);
+
+	*color = 0;
+
+	down_write(&vmap->vm_mm->mmap_sem);
+	vm_ops = vmap->vm_ops;
+	if (likely(vm_ops != NULL))
+		vm_ops->open(vmap);
+	up_write(&vmap->vm_mm->mmap_sem);
+
+	return (0);
+}
+
+static void
+linux_cdev_pager_dtor(void *handle)
+{
+	const struct vm_operations_struct *vm_ops;
+	struct vm_area_struct *vmap;
+
+	vmap = linux_cdev_handle_find(handle);
+	MPASS(vmap != NULL);
+
+	down_write(&vmap->vm_mm->mmap_sem);
+	vm_ops = vmap->vm_ops;
+	if (likely(vm_ops != NULL))
+		vm_ops->close(vmap);
+	up_write(&vmap->vm_mm->mmap_sem);
+
+	linux_cdev_handle_remove(vmap);
+}
+
+static struct cdev_pager_ops linux_cdev_pager_ops = {
+	.cdev_pg_populate	= linux_cdev_pager_populate,
+	.cdev_pg_ctor	= linux_cdev_pager_ctor,
+	.cdev_pg_dtor	= linux_cdev_pager_dtor
+};
+
+static int
 linux_dev_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
 {
 	struct linux_cdev *ldev;
@@ -707,10 +869,11 @@ static int
 linux_dev_mmap_single(struct cdev *dev, vm_ooffset_t *offset,
     vm_size_t size, struct vm_object **object, int nprot)
 {
+	struct vm_area_struct *vmap;
 	struct linux_file *filp;
 	struct thread *td;
 	struct file *file;
-	struct vm_area_struct vma;
+	vm_memattr_t attr;
 	int error;
 
 	td = curthread;
@@ -720,39 +883,82 @@ linux_dev_mmap_single(struct cdev *dev, 
 	if ((error = devfs_get_cdevpriv((void **)&filp)) != 0)
 		return (error);
 	filp->f_flags = file->f_flag;
+
+	if (filp->f_op->mmap == NULL)
+		return (ENODEV);
+
 	linux_set_current(td);
-	vma.vm_start = 0;
-	vma.vm_end = size;
-	vma.vm_pgoff = *offset / PAGE_SIZE;
-	vma.vm_pfn = 0;
-	vma.vm_page_prot = VM_MEMATTR_DEFAULT;
-	if (filp->f_op->mmap) {
-		error = -filp->f_op->mmap(filp, &vma);
-		if (error == 0) {
-			struct sglist *sg;
-
-			sg = sglist_alloc(1, M_WAITOK);
-			sglist_append_phys(sg,
-			    (vm_paddr_t)vma.vm_pfn << PAGE_SHIFT, vma.vm_len);
-			*object = vm_pager_allocate(OBJT_SG, sg, vma.vm_len,
-			    nprot, 0, td->td_ucred);
-		        if (*object == NULL) {
-				sglist_free(sg);
-				error = EINVAL;
-				goto done;
-			}
-			*offset = 0;
-			if (vma.vm_page_prot != VM_MEMATTR_DEFAULT) {
-				VM_OBJECT_WLOCK(*object);
-				vm_object_set_memattr(*object,
-				    vma.vm_page_prot);
-				VM_OBJECT_WUNLOCK(*object);
-			}
+
+	vmap = kzalloc(sizeof(*vmap), GFP_KERNEL);
+	vmap->vm_start = 0;
+	vmap->vm_end = size;
+	vmap->vm_pgoff = *offset / PAGE_SIZE;
+	vmap->vm_pfn = 0;
+	vmap->vm_flags = vmap->vm_page_prot = nprot;
+	vmap->vm_ops = NULL;
+	vmap->vm_file = filp;
+	vmap->vm_mm = current->mm;
+
+	if (unlikely(down_write_killable(&vmap->vm_mm->mmap_sem))) {
+		error = EINTR;
+	} else {
+		error = -filp->f_op->mmap(filp, vmap);
+		up_write(&vmap->vm_mm->mmap_sem);
+	}
+
+	if (error != 0) {
+		kfree(vmap);
+		return (error);
+	}
+
+	attr = pgprot2cachemode(vmap->vm_page_prot);
+
+	if (vmap->vm_ops != NULL) {
+		void *vm_private_data;
+
+		if (vmap->vm_ops->fault == NULL ||
+		    vmap->vm_ops->open == NULL ||
+		    vmap->vm_ops->close == NULL ||
+		    vmap->vm_private_data == NULL) {
+			kfree(vmap);
+			return (EINVAL);
 		}
-	} else
-		error = ENODEV;
-done:
-	return (error);
+
+		vm_private_data = vmap->vm_private_data;
+
+		vmap = linux_cdev_handle_insert(vm_private_data, vmap);
+
+		*object = cdev_pager_allocate(vm_private_data, OBJT_MGTDEVICE,
+		    &linux_cdev_pager_ops, size, nprot, *offset, curthread->td_ucred);
+
+		if (*object == NULL) {
+			linux_cdev_handle_remove(vmap);
+			return (EINVAL);
+		}
+	} else {
+		struct sglist *sg;
+
+		sg = sglist_alloc(1, M_WAITOK);
+		sglist_append_phys(sg, (vm_paddr_t)vmap->vm_pfn << PAGE_SHIFT, vmap->vm_len);
+
+		*object = vm_pager_allocate(OBJT_SG, sg, vmap->vm_len,
+		    nprot, 0, curthread->td_ucred);
+
+		kfree(vmap);
+
+		if (*object == NULL) {
+			sglist_free(sg);
+			return (EINVAL);
+		}
+	}
+
+	if (attr != VM_MEMATTR_DEFAULT) {
+		VM_OBJECT_WLOCK(*object);
+		vm_object_set_memattr(*object, attr);
+		VM_OBJECT_WUNLOCK(*object);
+	}
+	*offset = 0;
+	return (0);
 }
 
 struct cdevsw linuxcdevsw = {
@@ -1484,6 +1690,7 @@ linux_compat_init(void *arg)
 #if defined(__i386__) || defined(__amd64__)
 	linux_cpu_has_clflush = (cpu_feature & CPUID_CLFSH);
 #endif
+	rw_init(&linux_vma_lock, "lkpi-vma-lock");
 
 	rootoid = SYSCTL_ADD_ROOT_NODE(NULL,
 	    OID_AUTO, "sys", CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, "sys");
@@ -1514,6 +1721,8 @@ linux_compat_uninit(void *arg)
 	linux_kobject_kfree_name(&linux_class_root);
 	linux_kobject_kfree_name(&linux_root_device.kobj);
 	linux_kobject_kfree_name(&linux_class_misc.kobj);
+
+	rw_destroy(&linux_vma_lock);
 }
 SYSUNINIT(linux_compat, SI_SUB_DRIVERS, SI_ORDER_SECOND, linux_compat_uninit, NULL);
 



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201704060934.v369Ys50077560>