Skip site navigation (1)Skip section navigation (2)
Date:      Thu, 27 Dec 2018 14:37:11 -0800 (PST)
From:      "Rodney W. Grimes" <freebsd@pdx.rh.CN85.dnsmgr.net>
To:        Alexander Motin <mav@freebsd.org>
Cc:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   Re: svn commit: r342557 - head/sys/dev/nvd
Message-ID:  <201812272237.wBRMbB6W047349@pdx.rh.CN85.dnsmgr.net>
In-Reply-To: <201812271828.wBRISJIV054433@repo.freebsd.org>

next in thread | previous in thread | raw e-mail | index | archive | help
> Author: mav
> Date: Thu Dec 27 18:28:19 2018
> New Revision: 342557
> URL: https://svnweb.freebsd.org/changeset/base/342557
> 
> Log:
>   Reimplement nvd(4) detach handling.
>   
>   Previous code typically crashed in case of NVMe device unplug or even clean
>   detach while some I/Os are still in flight.  To fix this the new code calls
>   disk_gone() and waits for confirmation of all references gone before calling
>   disk_destroy(), freeing other resources and allowing controller detach.
>   
>   While there, fix disk lists locking and reimplement unit numbers assignment.
>   
>   MFC after:	1 month
>   Sponsored by:	iXsystems, Inc.
> 
> Modified:
>   head/sys/dev/nvd/nvd.c
> 
> Modified: head/sys/dev/nvd/nvd.c
> ==============================================================================
> --- head/sys/dev/nvd/nvd.c	Thu Dec 27 17:23:01 2018	(r342556)
> +++ head/sys/dev/nvd/nvd.c	Thu Dec 27 18:28:19 2018	(r342557)
> @@ -2,6 +2,7 @@
>   * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
>   *
>   * Copyright (C) 2012-2016 Intel Corporation
> + * Copyright (C) 2018 Alexander Motin <mav@FreeBSD.org>
>   * All rights reserved.

Please exercise great care when placing copyright statements,
you have seperated Intels copyright from its All rights reserved
clause, making it appear as if you are the one exercising
that claim, and that they did not.

(Note, if it better to not put that clause on a seperate line if possible,
for the very problem that occurs here)

Please correct this.

Thanks,
Rod
>   *
>   * Redistribution and use in source and binary forms, with or without
> @@ -34,9 +35,11 @@ __FBSDID("$FreeBSD$");
>  #include <sys/kernel.h>
>  #include <sys/malloc.h>
>  #include <sys/module.h>
> +#include <sys/queue.h>
>  #include <sys/sysctl.h>
>  #include <sys/systm.h>
>  #include <sys/taskqueue.h>
> +#include <machine/atomic.h>
>  
>  #include <geom/geom.h>
>  #include <geom/geom_disk.h>
> @@ -46,15 +49,16 @@ __FBSDID("$FreeBSD$");
>  #define NVD_STR		"nvd"
>  
>  struct nvd_disk;
> +struct nvd_controller;
>  
>  static disk_ioctl_t nvd_ioctl;
>  static disk_strategy_t nvd_strategy;
>  static dumper_t nvd_dump;
>  
>  static void nvd_done(void *arg, const struct nvme_completion *cpl);
> +static void nvd_gone(struct nvd_disk *ndisk);
>  
>  static void *nvd_new_disk(struct nvme_namespace *ns, void *ctrlr);
> -static void destroy_geom_disk(struct nvd_disk *ndisk);
>  
>  static void *nvd_new_controller(struct nvme_controller *ctrlr);
>  static void nvd_controller_fail(void *ctrlr);
> @@ -67,6 +71,7 @@ MALLOC_DEFINE(M_NVD, "nvd", "nvd(4) allocations");
>  struct nvme_consumer *consumer_handle;
>  
>  struct nvd_disk {
> +	struct nvd_controller	*ctrlr;
>  
>  	struct bio_queue_head	bioq;
>  	struct task		bioqtask;
> @@ -78,6 +83,7 @@ struct nvd_disk {
>  
>  	uint32_t		cur_depth;
>  	uint32_t		ordered_in_flight;
> +	u_int			unit;
>  
>  	TAILQ_ENTRY(nvd_disk)	global_tailq;
>  	TAILQ_ENTRY(nvd_disk)	ctrlr_tailq;
> @@ -89,6 +95,7 @@ struct nvd_controller {
>  	TAILQ_HEAD(, nvd_disk)		disk_head;
>  };
>  
> +static struct mtx			nvd_lock;
>  static TAILQ_HEAD(, nvd_controller)	ctrlr_head;
>  static TAILQ_HEAD(disk_list, nvd_disk)	disk_head;
>  
> @@ -139,6 +146,7 @@ nvd_load()
>  	if (!nvme_use_nvd)
>  		return 0;
>  
> +	mtx_init(&nvd_lock, "nvd_lock", NULL, MTX_DEF);
>  	TAILQ_INIT(&ctrlr_head);
>  	TAILQ_INIT(&disk_head);
>  
> @@ -152,25 +160,25 @@ static void
>  nvd_unload()
>  {
>  	struct nvd_controller	*ctrlr;
> -	struct nvd_disk		*disk;
> +	struct nvd_disk		*ndisk;
>  
>  	if (!nvme_use_nvd)
>  		return;
>  
> -	while (!TAILQ_EMPTY(&ctrlr_head)) {
> -		ctrlr = TAILQ_FIRST(&ctrlr_head);
> +	mtx_lock(&nvd_lock);
> +	while ((ctrlr = TAILQ_FIRST(&ctrlr_head)) != NULL) {
>  		TAILQ_REMOVE(&ctrlr_head, ctrlr, tailq);
> +		TAILQ_FOREACH(ndisk, &ctrlr->disk_head, ctrlr_tailq)
> +			nvd_gone(ndisk);
> +		while (!TAILQ_EMPTY(&ctrlr->disk_head))
> +			msleep(&ctrlr->disk_head, &nvd_lock, 0, "nvd_unload",0);
>  		free(ctrlr, M_NVD);
>  	}
> +	mtx_unlock(&nvd_lock);
>  
> -	while (!TAILQ_EMPTY(&disk_head)) {
> -		disk = TAILQ_FIRST(&disk_head);
> -		TAILQ_REMOVE(&disk_head, disk, global_tailq);
> -		destroy_geom_disk(disk);
> -		free(disk, M_NVD);
> -	}
> -
>  	nvme_unregister_consumer(consumer_handle);
> +
> +	mtx_destroy(&nvd_lock);
>  }
>  
>  static int
> @@ -220,6 +228,42 @@ nvd_strategy(struct bio *bp)
>  	taskqueue_enqueue(ndisk->tq, &ndisk->bioqtask);
>  }
>  
> +static void
> +nvd_gone(struct nvd_disk *ndisk)
> +{
> +	struct bio	*bp;
> +
> +	printf(NVD_STR"%u: detached\n", ndisk->unit);
> +	mtx_lock(&ndisk->bioqlock);
> +	disk_gone(ndisk->disk);
> +	while ((bp = bioq_takefirst(&ndisk->bioq)) != NULL) {
> +		if (__predict_false(bp->bio_flags & BIO_ORDERED))
> +			atomic_add_int(&ndisk->ordered_in_flight, -1);
> +		bp->bio_error = ENXIO;
> +		bp->bio_flags |= BIO_ERROR;
> +		bp->bio_resid = bp->bio_bcount;
> +		biodone(bp);
> +	}
> +	mtx_unlock(&ndisk->bioqlock);
> +}
> +
> +static void
> +nvd_gonecb(struct disk *dp)
> +{
> +	struct nvd_disk *ndisk = (struct nvd_disk *)dp->d_drv1;
> +
> +	disk_destroy(ndisk->disk);
> +	mtx_lock(&nvd_lock);
> +	TAILQ_REMOVE(&disk_head, ndisk, global_tailq);
> +	TAILQ_REMOVE(&ndisk->ctrlr->disk_head, ndisk, ctrlr_tailq);
> +	if (TAILQ_EMPTY(&ndisk->ctrlr->disk_head))
> +		wakeup(&ndisk->ctrlr->disk_head);
> +	mtx_unlock(&nvd_lock);
> +	taskqueue_free(ndisk->tq);
> +	mtx_destroy(&ndisk->bioqlock);
> +	free(ndisk, M_NVD);
> +}
> +
>  static int
>  nvd_ioctl(struct disk *ndisk, u_long cmd, void *data, int fflag,
>      struct thread *td)
> @@ -304,7 +348,9 @@ nvd_new_controller(struct nvme_controller *ctrlr)
>  	    M_ZERO | M_WAITOK);
>  
>  	TAILQ_INIT(&nvd_ctrlr->disk_head);
> +	mtx_lock(&nvd_lock);
>  	TAILQ_INSERT_TAIL(&ctrlr_head, nvd_ctrlr, tailq);
> +	mtx_unlock(&nvd_lock);
>  
>  	return (nvd_ctrlr);
>  }
> @@ -313,46 +359,61 @@ static void *
>  nvd_new_disk(struct nvme_namespace *ns, void *ctrlr_arg)
>  {
>  	uint8_t			descr[NVME_MODEL_NUMBER_LENGTH+1];
> -	struct nvd_disk		*ndisk;
> +	struct nvd_disk		*ndisk, *tnd;
>  	struct disk		*disk;
>  	struct nvd_controller	*ctrlr = ctrlr_arg;
> +	int unit;
>  
>  	ndisk = malloc(sizeof(struct nvd_disk), M_NVD, M_ZERO | M_WAITOK);
> +	ndisk->ctrlr = ctrlr;
> +	ndisk->ns = ns;
> +	ndisk->cur_depth = 0;
> +	ndisk->ordered_in_flight = 0;
> +	mtx_init(&ndisk->bioqlock, "nvd bioq lock", NULL, MTX_DEF);
> +	bioq_init(&ndisk->bioq);
> +	TASK_INIT(&ndisk->bioqtask, 0, nvd_bioq_process, ndisk);
>  
> -	disk = disk_alloc();
> +	mtx_lock(&nvd_lock);
> +	unit = 0;
> +	TAILQ_FOREACH(tnd, &disk_head, global_tailq) {
> +		if (tnd->unit > unit)
> +			break;
> +		unit = tnd->unit + 1;
> +	}
> +	ndisk->unit = unit;
> +	if (tnd != NULL)
> +		TAILQ_INSERT_BEFORE(tnd, ndisk, global_tailq);
> +	else
> +		TAILQ_INSERT_TAIL(&disk_head, ndisk, global_tailq);
> +	TAILQ_INSERT_TAIL(&ctrlr->disk_head, ndisk, ctrlr_tailq);
> +	mtx_unlock(&nvd_lock);
> +
> +	ndisk->tq = taskqueue_create("nvd_taskq", M_WAITOK,
> +	    taskqueue_thread_enqueue, &ndisk->tq);
> +	taskqueue_start_threads(&ndisk->tq, 1, PI_DISK, "nvd taskq");
> +
> +	disk = ndisk->disk = disk_alloc();
>  	disk->d_strategy = nvd_strategy;
>  	disk->d_ioctl = nvd_ioctl;
>  	disk->d_dump = nvd_dump;
> +	disk->d_gone = nvd_gonecb;
>  	disk->d_name = NVD_STR;
> +	disk->d_unit = ndisk->unit;
>  	disk->d_drv1 = ndisk;
>  
> -	disk->d_maxsize = nvme_ns_get_max_io_xfer_size(ns);
>  	disk->d_sectorsize = nvme_ns_get_sector_size(ns);
>  	disk->d_mediasize = (off_t)nvme_ns_get_size(ns);
> +	disk->d_maxsize = nvme_ns_get_max_io_xfer_size(ns);
>  	disk->d_delmaxsize = (off_t)nvme_ns_get_size(ns);
>  	if (disk->d_delmaxsize > nvd_delete_max)
>  		disk->d_delmaxsize = nvd_delete_max;
>  	disk->d_stripesize = nvme_ns_get_stripesize(ns);
> -
> -	if (TAILQ_EMPTY(&disk_head))
> -		disk->d_unit = 0;
> -	else
> -		disk->d_unit =
> -		    TAILQ_LAST(&disk_head, disk_list)->disk->d_unit + 1;
> -
> -	disk->d_flags = DISKFLAG_DIRECT_COMPLETION;
> -
> +	disk->d_flags = DISKFLAG_UNMAPPED_BIO | DISKFLAG_DIRECT_COMPLETION;
>  	if (nvme_ns_get_flags(ns) & NVME_NS_DEALLOCATE_SUPPORTED)
>  		disk->d_flags |= DISKFLAG_CANDELETE;
> -
>  	if (nvme_ns_get_flags(ns) & NVME_NS_FLUSH_SUPPORTED)
>  		disk->d_flags |= DISKFLAG_CANFLUSHCACHE;
>  
> -/* ifdef used here to ease porting to stable branches at a later point. */
> -#ifdef DISKFLAG_UNMAPPED_BIO
> -	disk->d_flags |= DISKFLAG_UNMAPPED_BIO;
> -#endif
> -
>  	/*
>  	 * d_ident and d_descr are both far bigger than the length of either
>  	 *  the serial or model number strings.
> @@ -365,22 +426,6 @@ nvd_new_disk(struct nvme_namespace *ns, void *ctrlr_ar
>  
>  	disk->d_rotation_rate = DISK_RR_NON_ROTATING;
>  
> -	ndisk->ns = ns;
> -	ndisk->disk = disk;
> -	ndisk->cur_depth = 0;
> -	ndisk->ordered_in_flight = 0;
> -
> -	mtx_init(&ndisk->bioqlock, "NVD bioq lock", NULL, MTX_DEF);
> -	bioq_init(&ndisk->bioq);
> -
> -	TASK_INIT(&ndisk->bioqtask, 0, nvd_bioq_process, ndisk);
> -	ndisk->tq = taskqueue_create("nvd_taskq", M_WAITOK,
> -	    taskqueue_thread_enqueue, &ndisk->tq);
> -	taskqueue_start_threads(&ndisk->tq, 1, PI_DISK, "nvd taskq");
> -
> -	TAILQ_INSERT_TAIL(&disk_head, ndisk, global_tailq);
> -	TAILQ_INSERT_TAIL(&ctrlr->disk_head, ndisk, ctrlr_tailq);
> -
>  	disk_create(disk, DISK_VERSION);
>  
>  	printf(NVD_STR"%u: <%s> NVMe namespace\n", disk->d_unit, descr);
> @@ -389,58 +434,22 @@ nvd_new_disk(struct nvme_namespace *ns, void *ctrlr_ar
>  		(uintmax_t)disk->d_mediasize / disk->d_sectorsize,
>  		disk->d_sectorsize);
>  
> -	return (NULL);
> +	return (ndisk);
>  }
>  
>  static void
> -destroy_geom_disk(struct nvd_disk *ndisk)
> -{
> -	struct bio	*bp;
> -	struct disk	*disk;
> -	uint32_t	unit;
> -	int		cnt = 0;
> -
> -	disk = ndisk->disk;
> -	unit = disk->d_unit;
> -	taskqueue_free(ndisk->tq);
> -
> -	disk_destroy(ndisk->disk);
> -
> -	mtx_lock(&ndisk->bioqlock);
> -	for (;;) {
> -		bp = bioq_takefirst(&ndisk->bioq);
> -		if (bp == NULL)
> -			break;
> -		bp->bio_error = EIO;
> -		bp->bio_flags |= BIO_ERROR;
> -		bp->bio_resid = bp->bio_bcount;
> -		cnt++;
> -		biodone(bp);
> -	}
> -
> -	printf(NVD_STR"%u: lost device - %d outstanding\n", unit, cnt);
> -	printf(NVD_STR"%u: removing device entry\n", unit);
> -
> -	mtx_unlock(&ndisk->bioqlock);
> -
> -	mtx_destroy(&ndisk->bioqlock);
> -}
> -
> -static void
>  nvd_controller_fail(void *ctrlr_arg)
>  {
>  	struct nvd_controller	*ctrlr = ctrlr_arg;
> -	struct nvd_disk		*disk;
> +	struct nvd_disk		*ndisk;
>  
> -	while (!TAILQ_EMPTY(&ctrlr->disk_head)) {
> -		disk = TAILQ_FIRST(&ctrlr->disk_head);
> -		TAILQ_REMOVE(&disk_head, disk, global_tailq);
> -		TAILQ_REMOVE(&ctrlr->disk_head, disk, ctrlr_tailq);
> -		destroy_geom_disk(disk);
> -		free(disk, M_NVD);
> -	}
> -
> +	mtx_lock(&nvd_lock);
>  	TAILQ_REMOVE(&ctrlr_head, ctrlr, tailq);
> +	TAILQ_FOREACH(ndisk, &ctrlr->disk_head, ctrlr_tailq)
> +		nvd_gone(ndisk);
> +	while (!TAILQ_EMPTY(&ctrlr->disk_head))
> +		msleep(&ctrlr->disk_head, &nvd_lock, 0, "nvd_fail", 0);
> +	mtx_unlock(&nvd_lock);
>  	free(ctrlr, M_NVD);
>  }
>  
> 
> 

-- 
Rod Grimes                                                 rgrimes@freebsd.org



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201812272237.wBRMbB6W047349>