Skip site navigation (1)Skip section navigation (2)
Date:      Wed, 2 Mar 2011 14:53:18 -0700 (MST)
From:      Ian Lepore <freebsd@damnhippie.dyndns.org>
To:        FreeBSD-gnats-submit@FreeBSD.org
Subject:   arm/155214: [patch] MMC/SD IO slow on Atmel ARM with modern large SD cards
Message-ID:  <201103022153.p22LrIhP097307@revolution.hippie.lan>
Resent-Message-ID: <201103022210.p22MABmL078817@freefall.freebsd.org>

next in thread | raw e-mail | index | archive | help

>Number:         155214
>Category:       arm
>Synopsis:       [patch] MMC/SD IO slow on Atmel ARM with modern large SD cards
>Confidential:   no
>Severity:       serious
>Priority:       medium
>Responsible:    freebsd-arm
>State:          open
>Quarter:        
>Keywords:       
>Date-Required:
>Class:          sw-bug
>Submitter-Id:   current-users
>Arrival-Date:   Wed Mar 02 22:10:10 UTC 2011
>Closed-Date:
>Last-Modified:
>Originator:     Ian Lepore <freebsd@damnhippie.dyndns.org>
>Release:        FreeBSD 8.2-RC3 arm
>Organization:
none
>Environment:
FreeBSD dvb 8.2-RC3 FreeBSD 8.2-RC3 #49: Tue Feb 15 22:52:14 UTC 2011     root@revolution.hippie.lan:/usr/obj/arm/usr/src/sys/DVB  arm

Included patch is against -current even though the problem was first seen on
8.2-RC3

The problem was seen on AT91RM9200 hardware, but presumably also affects the
SAM9 series which uses the same driver code.

>Description:
With the latest generation of large-capacity SD cards, write speeds as low as
20 kbytes/sec are seen.  These modern cards have erase-block sizes as large as 
8192K (compared to 32K typical on previous generations).  The at91_mci driver 
does only single-sector IO; apparently this requires the SD card to internally 
perform an expensive read-erase-modify-write cycle for each 512 byte block 
written to the card.

It should be noted that even with older SD cards that have smaller erase-block
sizes, write throughput to the card rarely exceeds about 350 kbytes/sec; still
very slow for modern hardware.

With the patch provided, write throughput improves to approximately 
1500 kbytes/sec.  Read speeds also improve (was 640, now 2400 kbytes/sec).

>How-To-Repeat:
The slow IO can be demonstrated using dd to write directly to the device
(bypassing buffering and caching at higher layers):

dvb# dd if=/dev/zero of=/dev/mmcsd0s2a bs=64k count=100
100+0 records in
100+0 records out
6553600 bytes transferred in 266.252838 secs (24614 bytes/sec)

dvb# dd if=/dev/zero of=/dev/mmcsd0s2a bs=6400k count=1
1+0 records in
1+0 records out
6553600 bytes transferred in 265.004883 secs (24730 bytes/sec)

>Fix:
The following patch adds support for multi-block IO to the at91_mci driver.

The patch is to -current even though the problem was discovered in 8.2-RC3.
The patched driver will also work in 8.2, but requires a few other items in
the arm/at91 directory to be back-ported as well (to support references to
things such as at91_master_clock and at91_is_rm9200)).

--- patch-arm-at91_mci-for-multiblock begins here ---
--- sys/arm/at91/at91_mci.c.cvs_v1.18	2011-02-28 13:43:54.000000000 -0700
+++ sys/arm/at91/at91_mci.c	2011-03-02 07:21:38.000000000 -0700
@@ -67,7 +67,13 @@
 
 #include "opt_at91.h"
 
-#define BBSZ	512
+#ifndef AT91_MCI_MAX_BLOCKS
+#define AT91_MCI_MAX_BLOCKS 64		/* default 32k bounce buffer */
+#endif
+
+#define BBSIZE (AT91_MCI_MAX_BLOCKS * 512)
+
+static int mci_debug;
 
 struct at91_mci_softc {
 	void *intrhand;			/* Interrupt handle */
@@ -75,21 +81,24 @@
 	int sc_cap;
 #define	CAP_HAS_4WIRE		1	/* Has 4 wire bus */
 #define	CAP_NEEDS_BYTESWAP	2	/* broken hardware needing bounce */
-	int flags;
 	int has_4wire;
-#define CMD_STARTED	1
-#define STOP_STARTED	2
+	int flags;
+#define PENDING_CMD	0x01
+#define PENDING_STOP	0x02
+#define PENDING_ERROR	0x04
+#define CMD_MULTIREAD	0x10
+#define CMD_MULTIWRITE	0x20
 	struct resource *irq_res;	/* IRQ resource */
 	struct resource	*mem_res;	/* Memory resource */
 	struct mtx sc_mtx;
 	bus_dma_tag_t dmatag;
 	bus_dmamap_t map;
-	int mapped;
 	struct mmc_host host;
 	int bus_busy;
 	struct mmc_request *req;
 	struct mmc_command *curcmd;
-	char bounce_buffer[BBSZ];
+	char      *bbuf_vaddr;		/* bounce buf in KVA space */
+	bus_addr_t bbuf_paddr;		/* bounce buf mapped into DMA space */
 };
 
 static inline uint32_t
@@ -124,6 +133,14 @@
 #define AT91_MCI_ASSERT_UNLOCKED(_sc) mtx_assert(&_sc->sc_mtx, MA_NOTOWNED);
 
 static void
+at91_mci_getaddr(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
+{
+	if (error != 0)
+		return;
+	*(bus_addr_t *)arg = segs[0].ds_addr;
+}
+
+static void
 at91_mci_pdc_disable(struct at91_mci_softc *sc)
 {
 	WR4(sc, PDC_PTCR, PDC_PTCR_TXTDIS | PDC_PTCR_RXTDIS);
@@ -137,6 +154,48 @@
 	WR4(sc, PDC_TNCR, 0);
 }
 
+/* Reset the controller, then restore most of the current state.
+ *
+ * This is called after detecting an error.  It's also called after stopping a
+ * multi-block write, to un-wedge the device so that it will handle the NOTBUSY
+ * signal correctly.  See comments in at91_mci_stop_done() for more details.
+ */
+static void at91_mci_reset(struct at91_mci_softc *sc)
+{
+	uint32_t mr;
+	uint32_t sdcr;
+	uint32_t dtor;
+	uint32_t imr;
+
+	at91_mci_pdc_disable(sc);
+
+	/* save current state */
+
+	imr  = RD4(sc, MCI_IMR);
+	mr   = RD4(sc, MCI_MR) & 0x7fff;
+	sdcr = RD4(sc, MCI_SDCR);
+	dtor = RD4(sc, MCI_DTOR);
+
+	/* reset the controller */
+
+	WR4(sc, MCI_IDR, 0xffffffff);
+	WR4(sc, MCI_CR, MCI_CR_MCIDIS | MCI_CR_SWRST);
+
+	/* restore state */
+
+	WR4(sc, MCI_CR, MCI_CR_MCIEN);
+	WR4(sc, MCI_MR, mr);
+	WR4(sc, MCI_SDCR, sdcr);
+	WR4(sc, MCI_DTOR, dtor);
+	WR4(sc, MCI_IER, imr);
+
+	/* Make sure sdio interrupts will fire.  Not sure why reading
+	 * SR ensures that, but this is in the linux driver.
+	 */
+
+	RD4(sc, MCI_SR);
+}
+
 static void
 at91_mci_init(device_t dev)
 {
@@ -145,7 +204,7 @@
 	WR4(sc, MCI_CR, MCI_CR_MCIEN);		/* Enable controller */
 	WR4(sc, MCI_IDR, 0xffffffff);		/* Turn off interrupts */
 	WR4(sc, MCI_DTOR, MCI_DTOR_DTOMUL_1M | 1);
-	WR4(sc, MCI_MR, 0x834a);	// XXX GROSS HACK FROM LINUX
+	WR4(sc, MCI_MR, 0x834a); // set PDCMODE, PWSDIV=3, CLKDIV=75
 #ifndef  AT91_MCI_SLOT_B
 	WR4(sc, MCI_SDCR, 0);			/* SLOT A, 1 bit bus */
 #else
@@ -168,7 +227,6 @@
 static int
 at91_mci_probe(device_t dev)
 {
-
 	device_set_desc(dev, "MCI mmc/sd host bridge");
 	return (0);
 }
@@ -193,24 +251,44 @@
 
 	AT91_MCI_LOCK_INIT(sc);
 
-	/*
-	 * Allocate DMA tags and maps
+	at91_mci_fini(dev);
+	at91_mci_init(dev);
+
+	/* Allocate DMA tags and maps and a physically-contiguous bounce buffer.
+	 *
+	 * The parms in the tag_create call cause the dmamem_alloc call to
+	 * create a single contiguous buffer of BBSIZE bytes aligned to a 4096
+	 * byte boundary.
+	 *
+	 * Allocate the bounce buffer using DMA_COHERENT which in effect means
+	 * that the pages are mapped as non-cacheable (making all the
+	 * bus_dmamap_sync() calls become fast no-ops) because there's no value
+	 * in caching the data in the swap/bounce buffers (there would just be
+	 * extra overhead in flushing the caches after the data has been
+	 * accessed exactly once).
+	 *
+	 * After allocating the bounce buffer we load the map for it and leave
+	 * it loaded as long as the driver is active.
 	 */
-	err = bus_dma_tag_create(bus_get_dma_tag(dev), 1, 0,
-	    BUS_SPACE_MAXADDR_32BIT, BUS_SPACE_MAXADDR, NULL, NULL, MAXPHYS, 1,
-	    MAXPHYS, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->dmatag);
+
+	err = bus_dma_tag_create(bus_get_dma_tag(dev), 4096, 0,
+	    BUS_SPACE_MAXADDR_32BIT, BUS_SPACE_MAXADDR, NULL, NULL, 
+	    BBSIZE, 1, MAXPHYS, 0, NULL, NULL, &sc->dmatag);
 	if (err != 0)
 		goto out;
 
-	err = bus_dmamap_create(sc->dmatag, 0,  &sc->map);
+	err = bus_dmamem_alloc(sc->dmatag, (void **)&sc->bbuf_vaddr,
+	    BUS_DMA_NOWAIT|BUS_DMA_COHERENT, &sc->map);
 	if (err != 0)
 		goto out;
 
-	at91_mci_fini(dev);
-	at91_mci_init(dev);
+	 err = bus_dmamap_load(sc->dmatag, sc->map, sc->bbuf_vaddr, BBSIZE,
+	    at91_mci_getaddr, &sc->bbuf_paddr, BUS_DMA_NOWAIT);
+	 if (err != 0)
+		 goto out;
 
 	/*
-	 * Activate the interrupt
+	 * Set up to handle interrupts.
 	 */
 	err = bus_setup_intr(dev, sc->irq_res, INTR_TYPE_MISC | INTR_MPSAFE,
 	    at91_mci_intr, sc, &sc->intrhand);
@@ -230,10 +308,13 @@
 	if (sc->has_4wire)
 		sc->sc_cap |= CAP_HAS_4WIRE;
 
-	sc->host.f_min = at91_master_clock / 512;
+	/* Our real min freq is master_clock/512, but upper driver layers are
+	 * going to set the min speed during card discovery, and the right speed
+	 * for that is 400khz, so advertise a safe value just under that.
+	 */
 	sc->host.f_min = 375000;
 	sc->host.f_max = at91_master_clock / 2;
-	if (sc->host.f_max > 50000000)	
+	if (sc->host.f_max > 50000000)
 		sc->host.f_max = 50000000;	/* Limit to 50MHz */
 
 	sc->host.host_ocr = MMC_OCR_320_330 | MMC_OCR_330_340;
@@ -252,8 +333,15 @@
 static int
 at91_mci_detach(device_t dev)
 {
+	struct at91_mci_softc *sc = device_get_softc(dev);
+
 	at91_mci_fini(dev);
 	at91_mci_deactivate(dev);
+
+	bus_dmamap_unload(sc->dmatag, sc->map);
+	bus_dmamem_free(sc->dmatag, sc->bbuf_vaddr, sc->map);
+	bus_dma_tag_destroy(sc->dmatag);
+
 	return (EBUSY);	/* XXX */
 }
 
@@ -293,7 +381,7 @@
 	sc->intrhand = 0;
 	bus_generic_detach(sc->dev);
 	if (sc->mem_res)
-		bus_release_resource(dev, SYS_RES_IOPORT,
+		bus_release_resource(dev, SYS_RES_MEMORY,
 		    rman_get_rid(sc->mem_res), sc->mem_res);
 	sc->mem_res = 0;
 	if (sc->irq_res)
@@ -303,14 +391,6 @@
 	return;
 }
 
-static void
-at91_mci_getaddr(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
-{
-	if (error != 0)
-		return;
-	*(bus_addr_t *)arg = segs[0].ds_addr;
-}
-
 static int
 at91_mci_update_ios(device_t brdev, device_t reqdev)
 {
@@ -322,7 +402,7 @@
 	sc = device_get_softc(brdev);
 	host = &sc->host;
 	ios = &host->ios;
-	// bus mode?
+
 	if (ios->clock == 0) {
 		WR4(sc, MCI_CR, MCI_CR_MCIDIS);
 		clkdiv = 0;
@@ -346,137 +426,176 @@
 static void
 at91_mci_start_cmd(struct at91_mci_softc *sc, struct mmc_command *cmd)
 {
-	uint32_t cmdr, ier = 0, mr;
+	uint32_t cmdr, mr;
 	uint32_t *src, *dst;
 	int i;
 	struct mmc_data *data;
-	void *vaddr;
-	bus_addr_t paddr;
 
 	sc->curcmd = cmd;
 	data = cmd->data;
-	cmdr = cmd->opcode;
 
 	/* XXX Upper layers don't always set this */
 	cmd->mrq = sc->req;
 
+	/* Begin setting up command register. */
+
+	cmdr = cmd->opcode;
+
+	if (sc->host.ios.bus_mode == opendrain)
+		cmdr |= MCI_CMDR_OPDCMD;
+
+	/* Set up response handling.  Allow max timeout for responses. */
+
 	if (MMC_RSP(cmd->flags) == MMC_RSP_NONE)
 		cmdr |= MCI_CMDR_RSPTYP_NO;
 	else {
-		/* Allow big timeout for responses */
 		cmdr |= MCI_CMDR_MAXLAT;
 		if (cmd->flags & MMC_RSP_136)
 			cmdr |= MCI_CMDR_RSPTYP_136;
 		else
 			cmdr |= MCI_CMDR_RSPTYP_48;
 	}
-	if (cmd->opcode == MMC_STOP_TRANSMISSION)
-		cmdr |= MCI_CMDR_TRCMD_STOP;
-	if (sc->host.ios.bus_mode == opendrain)
-		cmdr |= MCI_CMDR_OPDCMD;
-	if (!data) {
-		// The no data case is fairly simple
+
+	/* If there is no data transfer, just set up the right interrupt mask
+	 * and start the command.
+	 *
+	 * The interrupt mask needs to be CMDRDY plus all non-data-transfer
+	 * errors. It's important to leave the transfer-related errors out, to
+	 * avoid spurious timeout or crc errors on a STOP command following a
+	 * multiblock read.  When a multiblock read is in progress, sending a
+	 * STOP in the middle of a block occasionally triggers such errors, but
+	 * we're totally disinterested in them because we've already gotten all
+	 * the data we wanted without error before sending the STOP command.
+	 */
+
+	if (data == NULL) {
+		uint32_t ier = MCI_SR_CMDRDY | 
+                    MCI_SR_RTOE | MCI_SR_RENDE | 
+		    MCI_SR_RCRCE | MCI_SR_RDIRE | MCI_SR_RINDE;
+
 		at91_mci_pdc_disable(sc);
-//		printf("CMDR %x ARGR %x\n", cmdr, cmd->arg);
+
+		if (cmd->opcode == MMC_STOP_TRANSMISSION)
+			cmdr |= MCI_CMDR_TRCMD_STOP;
+
+		/* Ignore response CRC on CMD2 and ACMD41, per standard. */
+
+		if (cmd->opcode == MMC_SEND_OP_COND ||
+		    cmd->opcode == ACMD_SD_SEND_OP_COND)
+			ier &= ~MCI_SR_RCRCE;
+
+		if (mci_debug)
+			printf("CMDR %x (opcode %d) ARGR %x no data\n", 
+			    cmdr, cmd->opcode, cmd->arg);
+
 		WR4(sc, MCI_ARGR, cmd->arg);
 		WR4(sc, MCI_CMDR, cmdr);
-		WR4(sc, MCI_IER, MCI_SR_ERROR | MCI_SR_CMDRDY);
+		WR4(sc, MCI_IER, ier);
 		return;
 	}
+
+	/* There is data, set up the transfer-related parts of the command. */
+
 	if (data->flags & MMC_DATA_READ)
 		cmdr |= MCI_CMDR_TRDIR;
+
 	if (data->flags & (MMC_DATA_READ | MMC_DATA_WRITE))
 		cmdr |= MCI_CMDR_TRCMD_START;
+
 	if (data->flags & MMC_DATA_STREAM)
 		cmdr |= MCI_CMDR_TRTYP_STREAM;
-	if (data->flags & MMC_DATA_MULTI)
+	else if (data->flags & MMC_DATA_MULTI) {
 		cmdr |= MCI_CMDR_TRTYP_MULTIPLE;
-	// Set block size and turn on PDC mode for dma xfer and disable
-	// PDC until we're ready.
-	mr = RD4(sc, MCI_MR) & ~MCI_MR_BLKLEN;
-	WR4(sc, MCI_MR, mr | (data->len << 16) | MCI_MR_PDCMODE);
-	WR4(sc, PDC_PTCR, PDC_PTCR_RXTDIS | PDC_PTCR_TXTDIS);
-	if (cmdr & MCI_CMDR_TRCMD_START) {
-		if (cmdr & MCI_CMDR_TRDIR)
-			vaddr = cmd->data->data;
-		else {
-			/* Use bounce buffer even if we don't need
-			 * byteswap, since buffer may straddle a page
-			 * boundry, and we don't handle multi-segment
-			 * transfers in hardware.
-			 * (page issues seen from 'bsdlabel -w' which
-			 * uses raw geom access to the volume).
-			 * Greg Ansley (gja (at) ansley.com)
-			 */
-			vaddr = sc->bounce_buffer;
-			src = (uint32_t *)cmd->data->data;
-			dst = (uint32_t *)vaddr;
-			if (sc->sc_cap & CAP_NEEDS_BYTESWAP) {
-				for (i = 0; i < data->len / 4; i++)
-					dst[i] = bswap32(src[i]);
-			} else
-				memcpy(dst, src, data->len);
-		}
-		data->xfer_len = 0;
-		if (bus_dmamap_load(sc->dmatag, sc->map, vaddr, data->len,
-		    at91_mci_getaddr, &paddr, 0) != 0) {
-			cmd->error = MMC_ERR_NO_MEMORY;
-			sc->req = NULL;
-			sc->curcmd = NULL;
-			cmd->mrq->done(cmd->mrq);
-			return;
-		}
-		sc->mapped++;
-		if (cmdr & MCI_CMDR_TRDIR) {
+		sc->flags |= (data->flags & MMC_DATA_READ) ? 
+				CMD_MULTIREAD : CMD_MULTIWRITE;
+	}
+
+	/* Disable PDC until we're ready.
+	 *
+	 * Set block size and turn on PDC mode for dma xfer.
+	 * Note that the block size is the smaller of the amount of data to be
+	 * transferred, or 512 bytes.  The 512 size is fixed by the standard;
+	 * smaller blocks are possible, but never larger.
+	 */
+
+	WR4(sc, PDC_PTCR, PDC_PTCR_RXTDIS | PDC_PTCR_TXTDIS); 
+
+	mr = RD4(sc,MCI_MR) & ~MCI_MR_BLKLEN; 
+	mr |=  min(data->len, 512) << 16; 
+	WR4(sc, MCI_MR, mr | MCI_MR_PDCMODE);
+
+	/* Set up DMA.
+	 *
+	 * Use a bounce buffer even if we don't need to byteswap, because doing
+	 * multi-block IO with a single large DMA buffer is way fast (compared
+	 * to single-block IO), even after incurring the overhead of also
+	 * copying from/to the caller's buffers (which may be in non-contiguous
+	 * physical pages).
+	 *
+	 * In an ideal non-byteswap world we could create a dma tag that allows
+	 * for discontiguous segments and do the IO directly from/to the
+	 * caller's buffer(s), using ENDRX/ENDTX interrupts to chain the
+	 * discontiguous buffers through the PDC. Someday.
+	 *
+	 * XXX what about stream transfers?
+	 */
+
+	if (data->flags & (MMC_DATA_READ | MMC_DATA_WRITE)) {
+		if (data->flags & MMC_DATA_READ) {
 			bus_dmamap_sync(sc->dmatag, sc->map, BUS_DMASYNC_PREREAD);
-			WR4(sc, PDC_RPR, paddr);
+			WR4(sc, PDC_RPR, sc->bbuf_paddr);
 			WR4(sc, PDC_RCR, data->len / 4);
-			ier = MCI_SR_ENDRX;
+			WR4(sc, PDC_PTCR, PDC_PTCR_RXTEN);
 		} else {
+			if (sc->sc_cap & CAP_NEEDS_BYTESWAP) {
+				src = (uint32_t *)data->data;
+				dst = (uint32_t *)sc->bbuf_vaddr;
+				for (i = 0; i < data->len / 4; i++)
+					dst[i] = bswap32(src[i]);
+			} else {
+				bcopy(data->data, sc->bbuf_vaddr, data->len);
+			}
 			bus_dmamap_sync(sc->dmatag, sc->map, BUS_DMASYNC_PREWRITE);
-			WR4(sc, PDC_TPR, paddr);
+			WR4(sc, PDC_TPR, sc->bbuf_paddr);
 			WR4(sc, PDC_TCR, data->len / 4);
-			ier = MCI_SR_TXBUFE;
+			/* do not enable PDC xfer until CMDRDY asserted */
 		}
+		data->xfer_len = 0; /* XXX what's this? appears to be unused. */
 	}
-//	printf("CMDR %x ARGR %x with data\n", cmdr, cmd->arg);
+
+	if (mci_debug)
+		printf("CMDR %x (opcode %d) ARGR %x with data\n", cmdr, cmd->opcode, cmd->arg);
+
 	WR4(sc, MCI_ARGR, cmd->arg);
-	if (cmdr & MCI_CMDR_TRCMD_START) {
-		if (cmdr & MCI_CMDR_TRDIR) {
-			WR4(sc, PDC_PTCR, PDC_PTCR_RXTEN);
-			WR4(sc, MCI_CMDR, cmdr);
-		} else {
-			WR4(sc, MCI_CMDR, cmdr);
-			WR4(sc, PDC_PTCR, PDC_PTCR_TXTEN);
-		}
-	}
-	WR4(sc, MCI_IER, MCI_SR_ERROR | ier);
+	WR4(sc, MCI_CMDR, cmdr);
+	WR4(sc, MCI_IER, MCI_SR_ERROR | MCI_SR_CMDRDY);
 }
 
 static void
-at91_mci_start(struct at91_mci_softc *sc)
+at91_mci_next_operation(struct at91_mci_softc *sc)
 {
 	struct mmc_request *req;
 
 	req = sc->req;
 	if (req == NULL)
 		return;
-	// assert locked
-	if (!(sc->flags & CMD_STARTED)) {
-		sc->flags |= CMD_STARTED;
-//		printf("Starting CMD\n");
-		at91_mci_start_cmd(sc, req->cmd);
-		return;
-	}
-	if (!(sc->flags & STOP_STARTED) && req->stop) {
-//		printf("Starting Stop\n");
-		sc->flags |= STOP_STARTED;
-		at91_mci_start_cmd(sc, req->stop);
-		return;
+
+	if (!(sc->flags & PENDING_ERROR)) {
+		if (sc->flags & PENDING_CMD) {
+			sc->flags &= ~PENDING_CMD;
+			at91_mci_start_cmd(sc, req->cmd);
+			return;
+		} else if (sc->flags & PENDING_STOP) {
+			sc->flags &= ~PENDING_STOP;
+			at91_mci_start_cmd(sc, req->stop);
+			return;
+		}
 	}
-	/* We must be done -- bad idea to do this while locked? */
+
+	WR4(sc, MCI_IDR, 0xffffffff);
 	sc->req = NULL;
 	sc->curcmd = NULL;
+	//printf("req done\n");
 	req->done(req);
 }
 
@@ -486,16 +605,16 @@
 	struct at91_mci_softc *sc = device_get_softc(brdev);
 
 	AT91_MCI_LOCK(sc);
-	// XXX do we want to be able to queue up multiple commands?
-	// XXX sounds like a good idea, but all protocols are sync, so
-	// XXX maybe the idea is naive...
 	if (sc->req != NULL) {
 		AT91_MCI_UNLOCK(sc);
 		return (EBUSY);
 	}
+	//printf("new req\n");
 	sc->req = req;
-	sc->flags = 0;
-	at91_mci_start(sc);
+	sc->flags = PENDING_CMD;
+	if (sc->req->stop)
+		sc->flags |= PENDING_STOP;
+	at91_mci_next_operation(sc);
 	AT91_MCI_UNLOCK(sc);
 	return (0);
 }
@@ -535,118 +654,299 @@
 static void
 at91_mci_read_done(struct at91_mci_softc *sc)
 {
-	uint32_t *walker;
-	struct mmc_command *cmd;
-	int i, len;
+	struct mmc_command *cmd = sc->curcmd;
+
+	/* We arrive here when the entire DMA transfer for a read is done,
+	 * whether it's a single or multi-block read.  Either way the next thing
+	 * to do is move on to the next operation.  For single-block that'll
+	 * mean returning the now-completed request, for multi-block it will
+	 * invoke the stop command sequence.
+	 */
+
+	WR4(sc, PDC_PTCR, PDC_PTCR_RXTDIS | PDC_PTCR_TXTDIS);
 
-	cmd = sc->curcmd;
 	bus_dmamap_sync(sc->dmatag, sc->map, BUS_DMASYNC_POSTREAD);
-	bus_dmamap_unload(sc->dmatag, sc->map);
-	sc->mapped--;
+
 	if (sc->sc_cap & CAP_NEEDS_BYTESWAP) {
-		walker = (uint32_t *)cmd->data->data;
-		len = cmd->data->len / 4;
+		int i;
+		uint32_t *src = (uint32_t *)sc->bbuf_vaddr;
+		uint32_t *dst = (uint32_t *)cmd->data->data;
+		int len = cmd->data->len / 4;
 		for (i = 0; i < len; i++)
-			walker[i] = bswap32(walker[i]);
+			dst[i] = bswap32(src[i]);
+	} else {
+		bcopy(sc->bbuf_vaddr, cmd->data->data, cmd->data->len);
 	}
-	// Finish up the sequence...
-	WR4(sc, MCI_IDR, MCI_SR_ENDRX);
-	WR4(sc, MCI_IER, MCI_SR_RXBUFF);
-	WR4(sc, PDC_PTCR, PDC_PTCR_RXTDIS | PDC_PTCR_TXTDIS);
+
+	cmd->error = MMC_ERR_NONE;
+	at91_mci_next_operation(sc);
 }
 
 static void
-at91_mci_xmit_done(struct at91_mci_softc *sc)
+at91_mci_write_done(struct at91_mci_softc *sc, uint32_t sr)
 {
-	// Finish up the sequence...
+	struct mmc_command *cmd = sc->curcmd;
+
+	/* We arrive here when the entire DMA transfer for a write is done,
+	 * whether it's a single or multi-block write.  If it's multi-block we
+	 * have to immediately move on to the next operation which is to send
+	 * the stop command.  If it's a single-block transfer we need to wait
+	 * for NOTBUSY, but if that's already asserted we can avoid another
+	 * interrupt and just move on to completing the request right away.
+	 */
+
 	WR4(sc, PDC_PTCR, PDC_PTCR_RXTDIS | PDC_PTCR_TXTDIS);
-	WR4(sc, MCI_IDR, MCI_SR_TXBUFE);
-	WR4(sc, MCI_IER, MCI_SR_NOTBUSY);
+
 	bus_dmamap_sync(sc->dmatag, sc->map, BUS_DMASYNC_POSTWRITE);
-	bus_dmamap_unload(sc->dmatag, sc->map);
-	sc->mapped--;
+
+	if ((cmd->data->flags & MMC_DATA_MULTI) || (sr & MCI_SR_NOTBUSY)) {
+                cmd->error = MMC_ERR_NONE;
+                at91_mci_next_operation(sc);
+	} else {
+		WR4(sc, MCI_IER, MCI_SR_ERROR | MCI_SR_NOTBUSY);
+	}
+}
+
+static void
+at91_mci_notbusy(struct at91_mci_softc *sc)
+{
+	struct mmc_command *cmd = sc->curcmd;
+
+	/* We arrive here by either completion of a single-block write, or
+	 * completion of the stop command that ended a multi-block write (and, I
+	 * suppose, after a card-select or erase, but I haven't tested those).
+	 * Anyway, we're done and it's time to move on to the next command.
+	 */
+
+	cmd->error = MMC_ERR_NONE;
+	at91_mci_next_operation(sc);
+}
+
+static void
+at91_mci_stop_done(struct at91_mci_softc *sc, uint32_t sr)
+{
+	struct mmc_command *cmd = sc->curcmd;
+
+	/* We arrive here after receiving CMDRDY for a MMC_STOP_TRANSMISSION
+	 * command.  Depending on the operation being stopped, we may have to do
+	 * some unusual things to work around hardware bugs.
+	 */
+
+	/* This is known to be true of at91rm9200 hardware; it may or may not
+	 * apply to more recent chips: 
+	 *
+	 * After stopping a multi-block write, the NOTBUSY bit in MCI_SR does
+	 * not properly reflect the actual busy state of the card as signaled on
+	 * the DAT0 line; it always claims the card is not-busy.  If we believe
+	 * that and let operations continue, following commands will fail with
+	 * response timeouts (except of course MMC_SEND_STATUS -- it indicates
+	 * the card is busy in the PRG state, which was the smoking gun that
+	 * showed MCI_SR NOTBUSY was not tracking DAT0 correctly).
+	 *
+	 * The atmel docs are emphatic: "This flag [NOTBUSY] must be used only
+	 * for Write Operations."  I guess technically since we sent a stop it's
+	 * not a write operation anymore.  But then just what did they think it
+	 * meant for the stop command to have "...an optional busy signal
+	 * transmitted on the data line" according to the SD spec?
+	 *
+	 * I tried a variety of things to un-wedge the MCI and get the status
+	 * register to reflect NOTBUSY correctly again, but the only thing that
+	 * worked was a full device reset.  It feels like an awfully big hammer,
+	 * but doing a full reset after every multiblock write is still faster
+	 * than doing single-block IO (by almost two orders of magnitude:
+	 * 20KB/sec improves to about 1.8MB/sec best case).
+	 *
+	 * After doing the reset, wait for a NOTBUSY interrupt before continuing
+	 * with the next operation.
+	 */
+
+	if (sc->flags & CMD_MULTIWRITE) {
+		at91_mci_reset(sc);
+		WR4(sc, MCI_IER, MCI_SR_ERROR | MCI_SR_NOTBUSY);
+		return;
+	}
+
+	/* This is known to be true of at91rm9200 hardware; it may or may not
+	 * apply to more recent chips: 
+	 *
+	 * After stopping a multi-block read, loop to read and discard any data
+	 * that coasts in after we sent the stop command.  The docs don't say
+	 * anything about it, but empirical testing shows that 1-3 additional
+	 * words of data get buffered up in some unmentioned internal fifo and
+	 * if we don't read and discard them here they end up on the front of
+	 * the next read DMA transfer we do.
+	 */
+
+	if (sc->flags & CMD_MULTIREAD) {
+		uint32_t sr;
+		int count = 0;
+		do {
+			sr = RD4(sc, MCI_SR);
+			if (sr & MCI_SR_RXRDY) {
+				RD4(sc,  MCI_RDR);
+				++count;
+			}
+		} while (sr & MCI_SR_RXRDY);
+//              if (count != 0)
+//                      printf("Had to soak up %d words after read\n", count);
+	}
+
+	cmd->error = MMC_ERR_NONE;
+	at91_mci_next_operation(sc);
+
+}
+
+static void
+at91_mci_cmdrdy(struct at91_mci_softc *sc, uint32_t sr)
+{
+	struct mmc_command *cmd = sc->curcmd;
+	int i;
+
+	if (cmd == NULL)
+		return;
+
+	/* We get here at the end of EVERY command.  We retrieve the command
+	 * response (if any) then decide what to do next based on the command.
+	 */
+
+	if (cmd->flags & MMC_RSP_PRESENT) {
+		for (i = 0; i < ((cmd->flags & MMC_RSP_136) ? 4 : 1); i++) {
+			cmd->resp[i] = RD4(sc, MCI_RSPR + i * 4);
+			if (mci_debug)
+				printf("RSPR[%d] = %x sr=%x\n", i, cmd->resp[i],  sr);
+		}
+	}
+
+	/* If this was a stop command, go handle the various special
+	 * conditions (read: bugs) that have to be dealt with following a stop.
+	 */
+
+	if (cmd->opcode == MMC_STOP_TRANSMISSION) {
+		at91_mci_stop_done(sc, sr);
+		return;
+	}
+
+	/* If this command can continue to assert BUSY beyond the response then
+	 * we need to wait for NOTBUSY before the command is really done.
+	 *
+	 * Note that this may not work properly on the at91rm9200.  It certainly
+	 * doesn't work for the STOP command that follows a multi-block write,
+	 * so post-stop CMDRDY is handled separately; see the special handling
+	 * in at91_mci_stop_done().
+	 *
+	 * Beside STOP, there are other R1B-type commands that use the busy
+	 * signal after CMDRDY: CMD7 (card select), CMD28-29 (write protect),
+	 * CMD38 (erase). I haven't tested any of them, but I rather expect
+	 * them all to have the same sort of problem with MCI_SR not actually
+	 * reflecting the state of the DAT0-line busy indicator.  So this code
+	 * may need to grow some sort of special handling for them too. (This
+	 * just in: CMD7 isn't a problem right now because dev/mmc.c incorrectly
+	 * sets the response flags to R1 rather than R1B.)
+	 */
+
+	if ((cmd->flags & MMC_RSP_BUSY)) {
+		WR4(sc, MCI_IER, MCI_SR_ERROR | MCI_SR_NOTBUSY);
+		return;
+	}
+
+	/* If there is a data transfer with this command, then...
+	 * - If it's a read, we need to wait for ENDRX.
+	 * - If it's a write, now is the time to enable the PDC and we need to
+	 *   wait for BLKE.
+	 */
+
+	if (cmd->data) {
+		uint32_t ier;
+		if (cmd->data->flags & MMC_DATA_READ) {
+			ier = MCI_SR_ENDRX;
+		} else {
+			ier = MCI_SR_BLKE;
+			WR4(sc, PDC_PTCR, PDC_PTCR_TXTEN);
+		}
+		WR4(sc, MCI_IER, MCI_SR_ERROR | ier);
+		return;
+	}
+
+	/* If we made it to here, we don't need to wait for anything more for
+	 * the current command, move on to the next command (will complete the
+	 * request if there is no next command).
+	 */
+
+	cmd->error = MMC_ERR_NONE;
+	at91_mci_next_operation(sc);
 }
 
 static void
 at91_mci_intr(void *arg)
 {
 	struct at91_mci_softc *sc = (struct at91_mci_softc*)arg;
-	uint32_t sr;
-	int i, done = 0;
-	struct mmc_command *cmd;
+	struct mmc_command *cmd = sc->curcmd;
+	uint32_t sr, isr;
 
 	AT91_MCI_LOCK(sc);
-	sr = RD4(sc, MCI_SR) & RD4(sc, MCI_IMR);
-//	printf("i 0x%x\n", sr);
-	cmd = sc->curcmd;
-	if (sr & MCI_SR_ERROR) {
-		// Ignore CRC errors on CMD2 and ACMD47, per relevant standards
-		if ((sr & MCI_SR_RCRCE) && (cmd->opcode == MMC_SEND_OP_COND ||
-		    cmd->opcode == ACMD_SD_SEND_OP_COND))
-			cmd->error = MMC_ERR_NONE;
-		else if (sr & (MCI_SR_RTOE | MCI_SR_DTOE))
+
+	sr = RD4(sc, MCI_SR);
+	isr = sr & RD4(sc, MCI_IMR);
+
+	if (mci_debug)
+		printf("i 0x%x sr 0x%x\n", isr, sr);
+
+	/* All interrupts are one-shot; disable it now.
+	 * The next operation will re-enable whatever interrupts it wants.
+	 */
+
+	WR4(sc, MCI_IDR, isr);
+
+	if (isr & MCI_SR_ERROR) {
+		if (isr & (MCI_SR_RTOE | MCI_SR_DTOE))
 			cmd->error = MMC_ERR_TIMEOUT;
-		else if (sr & (MCI_SR_RCRCE | MCI_SR_DCRCE))
+		else if (isr & (MCI_SR_RCRCE | MCI_SR_DCRCE))
 			cmd->error = MMC_ERR_BADCRC;
-		else if (sr & (MCI_SR_OVRE | MCI_SR_UNRE))
+		else if (isr & (MCI_SR_OVRE | MCI_SR_UNRE))
 			cmd->error = MMC_ERR_FIFO;
 		else
 			cmd->error = MMC_ERR_FAILED;
-		done = 1;
-		if (sc->mapped && cmd->error) {
-			bus_dmamap_unload(sc->dmatag, sc->map);
-			sc->mapped--;
-		}
+		device_printf(sc->dev, 
+		    "IO error; status MCI_SR = 0x%x cmd opcode = %d\n",  
+		    sr, cmd->opcode);
+		sc->flags |= PENDING_ERROR;
+		at91_mci_reset(sc);
+		at91_mci_next_operation(sc);
 	} else {
-		if (sr & MCI_SR_TXBUFE) {
+		if (isr & MCI_SR_TXBUFE) {
 //			printf("TXBUFE\n");
-			at91_mci_xmit_done(sc);
 		}
-		if (sr & MCI_SR_RXBUFF) {
+		if (isr & MCI_SR_RXBUFF) {
 //			printf("RXBUFF\n");
-			WR4(sc, MCI_IDR, MCI_SR_RXBUFF);
-			WR4(sc, MCI_IER, MCI_SR_CMDRDY);
 		}
-		if (sr & MCI_SR_ENDTX) {
+		if (isr & MCI_SR_ENDTX) {
 //			printf("ENDTX\n");
 		}
-		if (sr & MCI_SR_ENDRX) {
+		if (isr & MCI_SR_ENDRX) {
 //			printf("ENDRX\n");
 			at91_mci_read_done(sc);
 		}
-		if (sr & MCI_SR_NOTBUSY) {
+		if (isr & MCI_SR_NOTBUSY) {
 //			printf("NOTBUSY\n");
-			WR4(sc, MCI_IDR, MCI_SR_NOTBUSY);
-			WR4(sc, MCI_IER, MCI_SR_CMDRDY);
+			at91_mci_notbusy(sc);
 		}
-		if (sr & MCI_SR_DTIP) {
+		if (isr & MCI_SR_DTIP) {
 //			printf("Data transfer in progress\n");
 		}
-		if (sr & MCI_SR_BLKE) {
+		if (isr & MCI_SR_BLKE) {
 //			printf("Block transfer end\n");
+			at91_mci_write_done(sc, sr);
 		}
-		if (sr & MCI_SR_TXRDY) {
+		if (isr & MCI_SR_TXRDY) {
 //			printf("Ready to transmit\n");
 		}
-		if (sr & MCI_SR_RXRDY) {
+		if (isr & MCI_SR_RXRDY) {
 //			printf("Ready to receive\n");
 		}
-		if (sr & MCI_SR_CMDRDY) {
+		if (isr & MCI_SR_CMDRDY) {
 //			printf("Command ready\n");
-			done = 1;
-			cmd->error = MMC_ERR_NONE;
-		}
-	}
-	if (done) {
-		WR4(sc, MCI_IDR, 0xffffffff);
-		if (cmd != NULL && (cmd->flags & MMC_RSP_PRESENT)) {
-			for (i = 0; i < ((cmd->flags & MMC_RSP_136) ? 4 : 1);
-			     i++) {
-				cmd->resp[i] = RD4(sc, MCI_RSPR + i * 4);
-//				printf("RSPR[%d] = %x\n", i, cmd->resp[i]);
-			}
+			at91_mci_cmdrdy(sc, sr);
 		}
-		at91_mci_start(sc);
 	}
 	AT91_MCI_UNLOCK(sc);
 }
@@ -703,7 +1003,7 @@
 		*(int *)result = sc->host.caps;
 		break;
 	case MMCBR_IVAR_MAX_DATA:
-		*(int *)result = 1;
+		*(int *)result = AT91_MCI_MAX_BLOCKS;
 		break;
 	}
 	return (0);
--- patch-arm-at91_mci-for-multiblock ends here ---

>Release-Note:
>Audit-Trail:
>Unformatted:



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201103022153.p22LrIhP097307>