Skip site navigation (1)Skip section navigation (2)
Date:      Tue, 9 Oct 2007 15:01:49 +0200
From:      Bjorn Gronvall <bg@sics.se>
To:        freebsd-fs@freebsd.org
Subject:   NFS server does not cluster writes
Message-ID:  <20071009150149.337279ce@ibook.sics.se>

next in thread | raw e-mail | index | archive | help
Hi,

The current NFS server clusters only reads, never writes, which in
turn leads to poor sequential-write performance. The attached patch
makes the following changes:

1/ Rearrange the code so that the same code can be used to detect both
   sequential reads and writes.

2/ Merge in updates from vfs_vnops.c::sequential_heuristic.

3/ Use double hashing in order to avoid hash-clustering in the
   nfsheur table.

4/ Pack nfsheur table more efficiently.

5/ Tolerate a small amount of RPC reordering (initially suggested
   by Ellard and Seltzer).

6/ Back off from sequential access rather than immediately switching to
   random access.

These changes have been tested on a low-performance ATA disk (with
write caching disabled) and sped up large sequential writes by a
factor of four. I would be interested in getting numbers from more
typical server configurations if somebody has the time to try it out.

Cheers,
/b

-- 
  _     _                                           ,_______________.
Bjorn Gronvall (Björn Grönvall)                    /_______________/|
Swedish Institute of Computer Science              |               ||
PO Box 1263, S-164 29 Kista, Sweden                | Schroedingers ||
Email: bg@sics.se, Phone +46 -8 633 15 25          |      Cat      |/
Cellular +46 -70 768 06 35, Fax +46 -8 751 72 30   '---------------'








--- nfs_serv.c.orig	2007-10-09 12:03:00.000000000 +0200
+++ nfs_serv.c	2007-10-09 13:50:02.000000000 +0200
@@ -106,18 +106,98 @@
 
 #define MAX_COMMIT_COUNT	(1024 * 1024)
 
-#define NUM_HEURISTIC		1017
+#define NUM_HEURISTIC		1031 /* Must be prime! */
+#define HASH_MAXSTEP		0x3ff
 #define NHUSE_INIT		64
 #define NHUSE_INC		16
 #define NHUSE_MAX		2048
+CTASSERT(NUM_HEURISTIC > (HASH_MAXSTEP + 1));
 
 static struct nfsheur {
+	off_t nh_nextoff;	/* next offset for sequential detection */
 	struct vnode *nh_vp;	/* vp to match (unreferenced pointer) */
-	off_t nh_nextr;		/* next offset for sequential detection */
-	int nh_use;		/* use count for selection */
-	int nh_seqcount;	/* heuristic */
+	uint16_t nh_use;	/* use count for selection */
+	uint16_t nh_seqcount;	/* in units of BKVASIZE bytes */
 } nfsheur[NUM_HEURISTIC];
 
+/*
+ * Sequential heuristic - detect sequential operation
+ */
+static
+struct nfsheur *
+sequential_heuristic(const struct uio *uio, struct vnode *vp)
+{
+	struct nfsheur *nh;
+	unsigned hi, step;	/* Double hashing */
+	int try = 32;		/* A bit large? */
+	int nblocks;
+
+	/*
+	 * Locate best candidate
+	 */
+
+	hi =   ((unsigned)vp / sizeof(struct vnode)) % NUM_HEURISTIC;
+	step = ((unsigned)vp / sizeof(struct vnode)) & HASH_MAXSTEP;
+	step++;			/* Step must not be zero. */
+	nh = &nfsheur[hi];
+
+	while (try--) {
+		if (nfsheur[hi].nh_vp == vp) {
+			nh = &nfsheur[hi];
+			break;
+		}
+		if (nfsheur[hi].nh_use > 0)
+			--nfsheur[hi].nh_use;
+		hi = hi + step;
+		if (hi >= NUM_HEURISTIC)
+			hi -= NUM_HEURISTIC;
+		if (nfsheur[hi].nh_use < nh->nh_use)
+			nh = &nfsheur[hi];
+	}
+
+	if (nh->nh_vp != vp) {
+		nh->nh_vp = vp;
+		nh->nh_nextoff = uio->uio_offset;
+		nh->nh_use = NHUSE_INIT;
+		if (uio->uio_offset == 0)
+			nh->nh_seqcount = 4;
+		else
+			nh->nh_seqcount = 1;
+	}
+
+	nh->nh_use += NHUSE_INC;
+	if (nh->nh_use > NHUSE_MAX)
+		nh->nh_use = NHUSE_MAX;
+
+	/*
+	 * Calculate heuristic
+	 */
+
+	/*
+	 * XXX we assume that the filesystem block size is
+	 * the default.  Not true, but still gives us a pretty
+	 * good indicator of how sequential the read operations
+	 * are.
+	 */
+	nblocks = (uio->uio_resid + BKVASIZE - 1) / BKVASIZE;
+	if ((uio->uio_offset == 0 && nh->nh_seqcount > 0) ||
+	    uio->uio_offset == nh->nh_nextoff) {
+		nh->nh_seqcount += nblocks;
+		if (nh->nh_seqcount > IO_SEQMAX)
+			nh->nh_seqcount = IO_SEQMAX;
+	} else if (qabs(uio->uio_offset - nh->nh_nextoff) <=
+		   4*imax(BKVASIZE, uio->uio_resid)) {
+		/* Probably reordered RPC, do nothing. */
+	} else {
+		nh->nh_seqcount /= 4;
+		/* RPCs larger than 1 block should cluster IO. */
+		if (nblocks > 1 && nh->nh_seqcount < nblocks)
+			nh->nh_seqcount = nblocks;
+	}
+
+	return (nh);
+}
+
 /* Global vars */
 
 int nfsrvw_procrastinate = NFS_GATHERDELAY * 1000;
@@ -855,61 +935,6 @@
 	else
 		cnt = reqlen;
 
-	/*
-	 * Calculate seqcount for heuristic
-	 */
-
-	{
-		int hi;
-		int try = 32;
-
-		/*
-		 * Locate best candidate
-		 */
-
-		hi = ((int)(vm_offset_t)vp / sizeof(struct vnode)) % NUM_HEURISTIC;
-		nh = &nfsheur[hi];
-
-		while (try--) {
-			if (nfsheur[hi].nh_vp == vp) {
-				nh = &nfsheur[hi];
-				break;
-			}
-			if (nfsheur[hi].nh_use > 0)
-				--nfsheur[hi].nh_use;
-			hi = (hi + 1) % NUM_HEURISTIC;
-			if (nfsheur[hi].nh_use < nh->nh_use)
-				nh = &nfsheur[hi];
-		}
-
-		if (nh->nh_vp != vp) {
-			nh->nh_vp = vp;
-			nh->nh_nextr = off;
-			nh->nh_use = NHUSE_INIT;
-			if (off == 0)
-				nh->nh_seqcount = 4;
-			else
-				nh->nh_seqcount = 1;
-		}
-
-		/*
-		 * Calculate heuristic
-		 */
-
-		if ((off == 0 && nh->nh_seqcount > 0) || off == nh->nh_nextr) {
-			if (++nh->nh_seqcount > IO_SEQMAX)
-				nh->nh_seqcount = IO_SEQMAX;
-		} else if (nh->nh_seqcount > 1) {
-			nh->nh_seqcount = 1;
-		} else {
-			nh->nh_seqcount = 0;
-		}
-		nh->nh_use += NHUSE_INC;
-		if (nh->nh_use > NHUSE_MAX)
-			nh->nh_use = NHUSE_MAX;
-		ioflag |= nh->nh_seqcount << IO_SEQSHIFT;
-        }
-
 	nfsm_reply(NFSX_POSTOPORFATTR(v3) + 3 * NFSX_UNSIGNED+nfsm_rndup(cnt));
 	if (v3) {
 		tl = nfsm_build(u_int32_t *, NFSX_V3FATTR + 4 * NFSX_UNSIGNED);
@@ -967,9 +992,11 @@
 		uiop->uio_resid = len;
 		uiop->uio_rw = UIO_READ;
 		uiop->uio_segflg = UIO_SYSSPACE;
+		nh = sequential_heuristic(uiop, vp);
+		ioflag |= nh->nh_seqcount << IO_SEQSHIFT;
 		error = VOP_READ(vp, uiop, IO_NODELOCKED | ioflag, cred);
 		off = uiop->uio_offset;
-		nh->nh_nextr = off;
+		nh->nh_nextoff = off;
 		FREE((caddr_t)iv2, M_TEMP);
 		if (error || (getret = VOP_GETATTR(vp, vap, cred, td))) {
 			if (!error)
@@ -1037,12 +1064,14 @@
 	nfsfh_t nfh;
 	fhandle_t *fhp;
 	struct uio io, *uiop = &io;
+	struct nfsheur *nh;
 	off_t off;
 	struct mount *mntp = NULL;
 	int tvfslocked;
 	int vfslocked;
 
 	nfsdbprintf(("%s %d\n", __FILE__, __LINE__));
+	bwillwrite();
 	vfslocked = 0;
 	if (mrep == NULL) {
 		*mrq = NULL;
@@ -1175,9 +1204,12 @@
 	    uiop->uio_segflg = UIO_SYSSPACE;
 	    uiop->uio_td = NULL;
 	    uiop->uio_offset = off;
+	    nh = sequential_heuristic(uiop, vp);
+	    ioflags |= nh->nh_seqcount << IO_SEQSHIFT;
 	    error = VOP_WRITE(vp, uiop, ioflags, cred);
 	    /* XXXRW: unlocked write. */
 	    nfsrvstats.srvvop_writes++;
+	    nh->nh_nextoff = uiop->uio_offset;
 	    FREE((caddr_t)iv, M_TEMP);
 	}
 	aftat_ret = VOP_GETATTR(vp, vap, cred, td);



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20071009150149.337279ce>