Skip site navigation (1)Skip section navigation (2)
Date:      Tue, 26 Jan 2010 06:36:10 +0000 (UTC)
From:      Jeff Roberson <jeff@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-projects@freebsd.org
Subject:   svn commit: r203012 - in projects/suj/head: lib/libufs sbin/fsck_ffs sbin/mount sbin/tunefs sys/sys sys/ufs/ffs sys/ufs/ufs
Message-ID:  <201001260636.o0Q6aAwh005669@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: jeff
Date: Tue Jan 26 06:36:10 2010
New Revision: 203012
URL: http://svn.freebsd.org/changeset/base/203012

Log:
   - Move the softdep journal inode into the namespace at /.sujournal.  This
     requires quite a lot of code as tunefs needs to be able to create
     directory entries in ROOTINO.  However, this is much cleaner from a
     compatibility standpoint.  The inode is marked IMMUTABLE and is only
     readable by root.  Eventually the kernel will prevent clearing of the
     IMMUTABLE bit.
   - Fix a nasty link count bug involving changedirectory_offset().  When
     a link may exist at more than one location, depending on when the
     directory block was written, we create duplicate addref records.  When
     an add and a remove are detected at the same offset, the remove is
     discarded based on the assumption that it cancels the link in the add.
     A legitimate remove may collide with one of these alternate offset adds
     that are created by fsck and be discarded even though it removed a real
     link.  To resolve this, the lineage of the addref must be established
     to determine whether the remove refers to an alternate address or not.
     Any offset which is not up-to-date with respect to the offset in the
     move record is considered alternate and will not discard a remove.
   - Use clear_remove() when we begin to exhaust dependencies to prevent
     excessive looping in request_cleanup().  This should probably
     also be done in softdep_fsync().  Only workloads which delete
     incredible numbers of files within the same directory would be
     affected.  stress2 can generate over 100,000 outstanding removes on
     my test machine.

Modified:
  projects/suj/head/lib/libufs/cgroup.c
  projects/suj/head/lib/libufs/libufs.h
  projects/suj/head/sbin/fsck_ffs/pass4.c
  projects/suj/head/sbin/fsck_ffs/suj.c
  projects/suj/head/sbin/mount/mount.c
  projects/suj/head/sbin/tunefs/tunefs.c
  projects/suj/head/sys/sys/mount.h
  projects/suj/head/sys/ufs/ffs/ffs_alloc.c
  projects/suj/head/sys/ufs/ffs/ffs_softdep.c
  projects/suj/head/sys/ufs/ffs/ffs_vfsops.c
  projects/suj/head/sys/ufs/ffs/fs.h
  projects/suj/head/sys/ufs/ufs/inode.h

Modified: projects/suj/head/lib/libufs/cgroup.c
==============================================================================
--- projects/suj/head/lib/libufs/cgroup.c	Tue Jan 26 05:17:03 2010	(r203011)
+++ projects/suj/head/lib/libufs/cgroup.c	Tue Jan 26 06:36:10 2010	(r203012)
@@ -71,6 +71,67 @@ gotit:
 	return (cgbase(fs, cgp->cg_cgx) + blkstofrags(fs, bno));
 }
 
+int
+cgbfree(struct uufsd *disk, ufs2_daddr_t bno, long size)
+{
+	u_int8_t *blksfree;
+	struct fs *fs;
+	struct cg *cgp;
+	ufs1_daddr_t fragno, cgbno;
+	int i, cg, blk, frags, bbase;
+
+	fs = &disk->d_fs;
+	cg = dtog(fs, bno);
+	if (cgread1(disk, cg) != 1)
+		return (-1);
+	cgp = &disk->d_cg;
+	cgbno = dtogd(fs, bno);
+	blksfree = cg_blksfree(cgp);
+	if (size == fs->fs_bsize) {
+		fragno = fragstoblks(fs, cgbno);
+		ffs_setblock(fs, blksfree, fragno);
+		ffs_clusteracct(fs, cgp, fragno, 1);
+		cgp->cg_cs.cs_nbfree++;
+		fs->fs_cstotal.cs_nbfree++;
+		fs->fs_cs(fs, cg).cs_nbfree++;
+	} else {
+		bbase = cgbno - fragnum(fs, cgbno);
+		/*
+		 * decrement the counts associated with the old frags
+		 */
+		blk = blkmap(fs, blksfree, bbase);
+		ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
+		/*
+		 * deallocate the fragment
+		 */
+		frags = numfrags(fs, size);
+		for (i = 0; i < frags; i++)
+			setbit(blksfree, cgbno + i);
+		cgp->cg_cs.cs_nffree += i;
+		fs->fs_cstotal.cs_nffree += i;
+		fs->fs_cs(fs, cg).cs_nffree += i;
+		/*
+		 * add back in counts associated with the new frags
+		 */
+		blk = blkmap(fs, blksfree, bbase);
+		ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
+		/*
+		 * if a complete block has been reassembled, account for it
+		 */
+		fragno = fragstoblks(fs, bbase);
+		if (ffs_isblock(fs, blksfree, fragno)) {
+			cgp->cg_cs.cs_nffree -= fs->fs_frag;
+			fs->fs_cstotal.cs_nffree -= fs->fs_frag;
+			fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
+			ffs_clusteracct(fs, cgp, fragno, 1);
+			cgp->cg_cs.cs_nbfree++;
+			fs->fs_cstotal.cs_nbfree++;
+			fs->fs_cs(fs, cg).cs_nbfree++;
+		}
+	}
+	return cgwrite(disk);
+}
+
 ino_t
 cgialloc(struct uufsd *disk)
 {

Modified: projects/suj/head/lib/libufs/libufs.h
==============================================================================
--- projects/suj/head/lib/libufs/libufs.h	Tue Jan 26 05:17:03 2010	(r203011)
+++ projects/suj/head/lib/libufs/libufs.h	Tue Jan 26 06:36:10 2010	(r203012)
@@ -111,6 +111,7 @@ int berase(struct uufsd *, ufs2_daddr_t,
  * cgroup.c
  */
 ufs2_daddr_t cgballoc(struct uufsd *);
+int cgbfree(struct uufsd *, ufs2_daddr_t, long);
 ino_t cgialloc(struct uufsd *);
 int cgread(struct uufsd *);
 int cgread1(struct uufsd *, int);

Modified: projects/suj/head/sbin/fsck_ffs/pass4.c
==============================================================================
--- projects/suj/head/sbin/fsck_ffs/pass4.c	Tue Jan 26 05:17:03 2010	(r203011)
+++ projects/suj/head/sbin/fsck_ffs/pass4.c	Tue Jan 26 06:36:10 2010	(r203012)
@@ -72,9 +72,6 @@ pass4(void)
 		for (i = 0; i < inostathead[cg].il_numalloced; i++, inumber++) {
 			if (inumber < ROOTINO)
 				continue;
-			if (sblock.fs_flags & FS_SUJ &&
-			    inumber == sblock.fs_sujournal)
-				continue;
 			idesc.id_number = inumber;
 			switch (inoinfo(inumber)->ino_state) {
 

Modified: projects/suj/head/sbin/fsck_ffs/suj.c
==============================================================================
--- projects/suj/head/sbin/fsck_ffs/suj.c	Tue Jan 26 05:17:03 2010	(r203011)
+++ projects/suj/head/sbin/fsck_ffs/suj.c	Tue Jan 26 06:36:10 2010	(r203012)
@@ -41,6 +41,7 @@ __FBSDID("$FreeBSD$");
 #include <stdlib.h>
 #include <stdint.h>
 #include <libufs.h>
+#include <string.h>
 #include <strings.h>
 #include <err.h>
 #include <assert.h>
@@ -63,6 +64,7 @@ struct suj_seg {
 struct suj_rec {
 	TAILQ_ENTRY(suj_rec) sr_next;
 	union jrec	*sr_rec;
+	int		sr_alt;	/* Is alternate address? */
 };
 TAILQ_HEAD(srechd, suj_rec);
 
@@ -127,6 +129,7 @@ TAILQ_HEAD(seghd, suj_seg) allsegs;
 uint64_t oldseq;
 static struct uufsd *disk = NULL;
 static struct fs *fs = NULL;
+ino_t sujino;
 
 /*
  * Summary statistics.
@@ -191,8 +194,7 @@ closedisk(const char *devnam)
 		fs->fs_cstotal.cs_nifree += cgsum->cs_nifree;
 		fs->fs_cstotal.cs_ndir += cgsum->cs_ndir;
 	}
-	/* XXX Don't set clean for now, we don't trust the journal. */
-	/* fs->fs_clean = 1; */
+	fs->fs_clean = 1;
 	fs->fs_time = time(NULL);
 	fs->fs_mtime = time(NULL);
 	if (sbwrite(disk, 0) == -1)
@@ -1823,6 +1825,7 @@ ino_append(union jrec *rec)
 	sino->si_hasrecs = 1;
 	srec = errmalloc(sizeof(*srec));
 	srec->sr_rec = rec;
+	srec->sr_alt = 0;
 	TAILQ_INSERT_TAIL(&sino->si_newrecs, srec, sr_next);
 }
 
@@ -1844,9 +1847,10 @@ ino_build_ref(struct suj_ino *sino, stru
 
 	refrec = (struct jrefrec *)srec->sr_rec;
 	if (debug)
-		printf("ino_build: op %d, ino %d, nlink %d, parent %d, diroff %jd\n", 
-		    refrec->jr_op, refrec->jr_ino, refrec->jr_nlink, refrec->jr_parent,
-		    refrec->jr_diroff);
+		printf("ino_build: op %d, ino %d, nlink %d, "
+		    "parent %d, diroff %jd\n", 
+		    refrec->jr_op, refrec->jr_ino, refrec->jr_nlink,
+		    refrec->jr_parent, refrec->jr_diroff);
 
 	/*
 	 * Search for a mvrec that matches this offset.  Whether it's an add
@@ -1871,16 +1875,19 @@ ino_build_ref(struct suj_ino *sino, stru
 				rrn = errmalloc(sizeof(*refrec));
 				*rrn = *refrec;
 				rrn->jr_op = JOP_ADDREF;
+				rrn->jr_diroff = mvrec->jm_oldoff;
 				srn = errmalloc(sizeof(*srec));
+				srn->sr_alt = 1;
 				srn->sr_rec = (union jrec *)rrn;
 				ino_build_ref(sino, srn);
-				refrec->jr_diroff = mvrec->jm_oldoff;
 			}
 		}
 	}
 	/*
 	 * We walk backwards so that adds and removes are evaluated in the
-	 * correct order.
+	 * correct order.  If a primary record conflicts with an alt keep
+	 * the primary and discard the alt.  We must track this to keep
+	 * the correct number of removes in the list.
 	 */
 	for (srn = TAILQ_LAST(&sino->si_recs, srechd); srn;
 	    srn = TAILQ_PREV(srn, srechd, sr_next)) {
@@ -1890,7 +1897,17 @@ ino_build_ref(struct suj_ino *sino, stru
 			continue;
 		if (debug)
 			printf("Discarding dup.\n");
-		rrn->jr_mode = refrec->jr_mode;
+		if (srn->sr_alt == 0) {
+			rrn->jr_mode = refrec->jr_mode;
+			return;
+		}
+		/*
+		 * Replace the record in place with the old nlink in case
+		 * we replace the head of the list.  Abandon srec as a dup.
+		 */
+		refrec->jr_nlink = rrn->jr_nlink;
+		srn->sr_rec = srec->sr_rec;
+		srn->sr_alt = srec->sr_alt;
 		return;
 	}
 	TAILQ_INSERT_TAIL(&sino->si_recs, srec, sr_next);
@@ -1930,9 +1947,12 @@ ino_move_ref(struct suj_ino *sino, struc
 		/*
 		 * When an entry is moved we don't know whether the write
 		 * to move has completed yet.  To resolve this we create
-		 * a new add dependency in the new location as if it were added
-		 * twice.  Only one will succeed.
+		 * a new add dependency in the new location as if it were
+		 * added twice.  Only one will succeed.  Consider the
+		 * new offset the primary location for the inode and the
+		 * old offset the alt.
 		 */
+		srn->sr_alt = 1;
 		refrec = errmalloc(sizeof(*refrec));
 		refrec->jr_op = JOP_ADDREF;
 		refrec->jr_ino = mvrec->jm_ino;
@@ -1941,12 +1961,14 @@ ino_move_ref(struct suj_ino *sino, struc
 		refrec->jr_mode = rrn->jr_mode;
 		refrec->jr_nlink = rrn->jr_nlink;
 		srn = errmalloc(sizeof(*srn));
+		srn->sr_alt = 0;
 		srn->sr_rec = (union jrec *)refrec;
 		ino_build_ref(sino, srn);
 		break;
 	}
 	/*
-	 * Add this mvrec to the queue of pending mvs.
+	 * Add this mvrec to the queue of pending mvs, possibly collapsing
+	 * it with a prior move for the same inode and offset.
 	 */
 	for (srn = TAILQ_LAST(&sino->si_movs, srechd); srn;
 	    srn = TAILQ_PREV(srn, srechd, sr_next)) {
@@ -2195,19 +2217,25 @@ suj_verifyino(union dinode *ip)
 
 	if (DIP(ip, di_nlink) != 1) {
 		printf("Invalid link count %d for journal inode %d\n",
-		    DIP(ip, di_nlink), fs->fs_sujournal);
+		    DIP(ip, di_nlink), sujino);
+		return (-1);
+	}
+
+	if (DIP(ip, di_flags) != (SF_IMMUTABLE | SF_NOUNLINK)) {
+		printf("Invalid flags 0x%X for journal inode %d\n",
+		    DIP(ip, di_flags), sujino);
 		return (-1);
 	}
 
-	if (DIP(ip, di_mode) != IFREG) {
-		printf("Invalid mode %d for journal inode %d\n",
-		    DIP(ip, di_mode), fs->fs_sujournal);
+	if (DIP(ip, di_mode) != (IFREG | IREAD)) {
+		printf("Invalid mode %o for journal inode %d\n",
+		    DIP(ip, di_mode), sujino);
 		return (-1);
 	}
 
 	if (DIP(ip, di_size) < SUJ_MIN || DIP(ip, di_size) > SUJ_MAX) {
 		printf("Invalid size %jd for journal inode %d\n",
-		    DIP(ip, di_size), fs->fs_sujournal);
+		    DIP(ip, di_size), sujino);
 		return (-1);
 	}
 
@@ -2447,20 +2475,60 @@ restart:
 }
 
 /*
+ * Search a directory block for the SUJ_FILE.
+ */
+static void
+suj_find(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags)
+{
+	char block[MAXBSIZE];
+	struct direct *dp;
+	int bytes;
+	int off;
+
+	if (sujino)
+		return;
+	bytes = lfragtosize(fs, frags);
+	if (bread(disk, fsbtodb(fs, blk), block, bytes) <= 0)
+		err(1, "Failed to read ROOTINO directory block %jd", blk);
+	for (off = 0; off < bytes; off += dp->d_reclen) {
+		dp = (struct direct *)&block[off];
+		if (dp->d_reclen == 0)
+			break;
+		if (dp->d_ino == 0)
+			continue;
+		if (dp->d_namlen != strlen(SUJ_FILE))
+			continue;
+		if (bcmp(dp->d_name, SUJ_FILE, dp->d_namlen) != 0)
+			continue;
+		sujino = dp->d_ino;
+		return;
+	}
+}
+
+/*
  * Orchestrate the verification of a filesystem via the softupdates journal.
  */
 int
 suj_check(const char *filesys)
 {
 	union dinode *jip;
+	union dinode *ip;
 	uint64_t blocks;
 
 	opendisk(filesys);
 	TAILQ_INIT(&allsegs);
 	/*
+	 * Find the journal inode.
+	 */
+	ip = ino_read(ROOTINO);
+	sujino = 0;
+	ino_visit(ip, ROOTINO, suj_find, 0);
+	if (sujino == 0)
+		errx(1, "Journal inode removed.  Use tunefs to re-create.");
+	/*
 	 * Fetch the journal inode and verify it.
 	 */
-	jip = ino_read(fs->fs_sujournal);
+	jip = ino_read(sujino);
 	printf("** SU+J Recovering %s\n", filesys);
 	if (suj_verifyino(jip) != 0)
 		return (-1);
@@ -2469,11 +2537,11 @@ suj_check(const char *filesys)
 	 * available journal blocks in with suj_read().
 	 */
 	printf("** Reading %jd byte journal from inode %d.\n",
-	    DIP(jip, di_size), fs->fs_sujournal);
+	    DIP(jip, di_size), sujino);
 	suj_jblocks = jblocks_create();
-	blocks = ino_visit(jip, fs->fs_sujournal, suj_add_block, 0);
+	blocks = ino_visit(jip, sujino, suj_add_block, 0);
 	if (blocks != numfrags(fs, DIP(jip, di_size)))
-		errx(1, "Sparse journal inode %d.\n", fs->fs_sujournal);
+		errx(1, "Sparse journal inode %d.\n", sujino);
 	suj_read();
 	jblocks_destroy(suj_jblocks);
 	suj_jblocks = NULL;

Modified: projects/suj/head/sbin/mount/mount.c
==============================================================================
--- projects/suj/head/sbin/mount/mount.c	Tue Jan 26 05:17:03 2010	(r203011)
+++ projects/suj/head/sbin/mount/mount.c	Tue Jan 26 06:36:10 2010	(r203012)
@@ -113,7 +113,6 @@ static struct opt {
 	{ MNT_ACLS,		"acls" },
 	{ MNT_NFS4ACLS,		"nfsv4acls" },
 	{ MNT_GJOURNAL,		"gjournal" },
-	{ MNT_SUJ,		"journal" }, /* always soft-updates, journal */
 	{ 0, NULL }
 };
 

Modified: projects/suj/head/sbin/tunefs/tunefs.c
==============================================================================
--- projects/suj/head/sbin/tunefs/tunefs.c	Tue Jan 26 05:17:03 2010	(r203011)
+++ projects/suj/head/sbin/tunefs/tunefs.c	Tue Jan 26 06:36:10 2010	(r203012)
@@ -52,6 +52,7 @@ __FBSDID("$FreeBSD$");
 #include <ufs/ufs/ufsmount.h>
 #include <ufs/ufs/dinode.h>
 #include <ufs/ffs/fs.h>
+#include <ufs/ufs/dir.h>
 
 #include <ctype.h>
 #include <err.h>
@@ -74,6 +75,7 @@ struct uufsd disk;
 void usage(void);
 void printfs(void);
 int journal_alloc(int64_t size);
+void journal_clear(void);
 void sbdirty(void);
 
 int
@@ -355,11 +357,11 @@ main(int argc, char *argv[])
 			if ((~sblock.fs_flags & FS_SUJ) == FS_SUJ) {
 				warnx("%s remains unchanged as disabled", name);
 			} else {
-				sbdirty();
+				journal_clear();
  				sblock.fs_flags &= ~(FS_DOSOFTDEP | FS_SUJ);
-				sblock.fs_sujournal = 0;
 				sblock.fs_sujfree = 0;
- 				warnx("%s cleared", name);
+ 				warnx("%s cleared, "
+				    "remove .sujournal to reclaim space", name);
 			}
  		}
 	}
@@ -523,11 +525,9 @@ journal_balloc(void)
 {
 	ufs2_daddr_t blk;
 	struct cg *cgp;
-	struct fs *fs;
 	int valid;
 
 	cgp = &disk.d_cg;
-	fs = &disk.d_fs;
 	for (;;) {
 		blk = cgballoc(&disk);
 		if (blk > 0)
@@ -553,13 +553,231 @@ journal_balloc(void)
 		warnx("Failed to find sufficient free blocks for the journal");
 		return -1;
 	}
-	if (bwrite(&disk, fsbtodb(fs, blk), clrbuf, fs->fs_bsize) <= 0) {
+	if (bwrite(&disk, fsbtodb(&sblock, blk), clrbuf,
+	    sblock.fs_bsize) <= 0) {
 		warn("Failed to initialize new block");
 		return -1;
 	}
 	return (blk);
 }
 
+/*
+ * Search a directory block for the SUJ_FILE.
+ */
+static ino_t
+dir_search(ufs2_daddr_t blk, int bytes)
+{
+	char block[MAXBSIZE];
+	struct direct *dp;
+	int off;
+
+	if (bread(&disk, fsbtodb(&sblock, blk), block, bytes) <= 0) {
+		warn("Failed to read dir block");
+		return (-1);
+	}
+	for (off = 0; off < bytes; off += dp->d_reclen) {
+		dp = (struct direct *)&block[off];
+		if (dp->d_reclen == 0)
+			break;
+		if (dp->d_ino == 0)
+			continue;
+		if (dp->d_namlen != strlen(SUJ_FILE))
+			continue;
+		if (bcmp(dp->d_name, SUJ_FILE, dp->d_namlen) != 0)
+			continue;
+		return (dp->d_ino);
+	}
+
+	return (0);
+}
+
+/*
+ * Search in the ROOTINO for the SUJ_FILE.  If it exists we can not enable
+ * journaling.
+ */
+static ino_t
+journal_findfile(void)
+{
+	struct ufs1_dinode *dp1;
+	struct ufs2_dinode *dp2;
+	int mode;
+	void *ip;
+	int i;
+
+	if (getino(&disk, &ip, ROOTINO, &mode) != 0) {
+		warn("Failed to get root inode");
+		return (-1);
+	}
+	dp2 = ip;
+	dp1 = ip;
+	if (sblock.fs_magic == FS_UFS1_MAGIC) {
+		if ((off_t)dp1->di_size >= lblktosize(&sblock, NDADDR)) {
+			warnx("ROOTINO extends beyond direct blocks.");
+			return (-1);
+		}
+		for (i = 0; i < NDADDR; i++) {
+			if (dp1->di_db[i] == 0)
+				break;
+			if (dir_search(dp1->di_db[i],
+			    sblksize(&sblock, (off_t)dp1->di_size, i)) != 0)
+				return (-1);
+		}
+	} else {
+		if ((off_t)dp1->di_size >= lblktosize(&sblock, NDADDR)) {
+			warnx("ROOTINO extends beyond direct blocks.");
+			return (-1);
+		}
+		for (i = 0; i < NDADDR; i++) {
+			if (dp2->di_db[i] == 0)
+				break;
+			if (dir_search(dp2->di_db[i],
+			    sblksize(&sblock, (off_t)dp2->di_size, i)) != 0)
+				return (-1);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Insert the journal at inode 'ino' into directory blk 'blk' at the first
+ * free offset of 'off'.  DIRBLKSIZ blocks after off are initialized as
+ * empty.
+ */
+static int
+dir_insert(ufs2_daddr_t blk, off_t off, ino_t ino)
+{
+	struct direct *dp;
+	char block[MAXBSIZE];
+
+	if (bread(&disk, fsbtodb(&sblock, blk), block, sblock.fs_bsize) <= 0) {
+		warn("Failed to read dir block");
+		return (-1);
+	}
+	bzero(&block[off], sblock.fs_bsize - off);
+	dp = (struct direct *)&block[off];
+	dp->d_ino = ino;
+	dp->d_reclen = DIRBLKSIZ;
+	dp->d_type = DT_REG;
+	dp->d_namlen = strlen(SUJ_FILE);
+	bcopy(SUJ_FILE, &dp->d_name, strlen(SUJ_FILE));
+	off += DIRBLKSIZ;
+	for (; off < sblock.fs_bsize; off += DIRBLKSIZ) {
+		dp = (struct direct *)&block[off];
+		dp->d_ino = 0;
+		dp->d_reclen = DIRBLKSIZ;
+		dp->d_type = DT_UNKNOWN;
+	}
+	if (bwrite(&disk, fsbtodb(&sblock, blk), block, sblock.fs_bsize) <= 0) {
+		warn("Failed to write dir block");
+		return (-1);
+	}
+	return (0);
+}
+
+/*
+ * Extend a directory block in 'blk' by copying it to a full size block
+ * and inserting the .sujournal entry for the new journal inode.
+ */
+static int
+dir_extend(ufs2_daddr_t blk, ufs2_daddr_t nblk, off_t size, ino_t ino)
+{
+	char block[MAXBSIZE];
+
+	if (bread(&disk, fsbtodb(&sblock, blk), block, size) <= 0) {
+		warn("Failed to read dir block");
+		return (-1);
+	}
+	if (bwrite(&disk, fsbtodb(&sblock, nblk), block, size) <= 0) {
+		warn("Failed to write dir block");
+		return (-1);
+	}
+
+	return dir_insert(nblk, size, ino);
+}
+
+/*
+ * Insert the journal file into the ROOTINO directory.  We always extend the
+ * last frag into a newly allocated full-size block.
+ */
+static int
+journal_insertfile(ino_t ino)
+{
+	struct ufs1_dinode *dp1;
+	struct ufs2_dinode *dp2;
+	void *ip;
+	ufs2_daddr_t nblk;
+	ufs2_daddr_t blk;
+	ufs_lbn_t lbn;
+	int size;
+	int mode;
+	int off;
+
+	if (getino(&disk, &ip, ROOTINO, &mode) != 0) {
+		warn("Failed to get root inode");
+		sbdirty();
+		return (-1);
+	}
+	dp2 = ip;
+	dp1 = ip;
+	blk = 0;
+	size = 0;
+	nblk = journal_balloc();
+	if (nblk <= 0)
+		return (-1);
+	/*
+	 * For simplicity's sake we always extend the ROOTINO into a new
+	 * directory block rather than searching for space and inserting
+	 * into an existing block.  However, if the rootino has frags we
+	 * have to free them and extend the block.
+	 */
+	if (sblock.fs_magic == FS_UFS1_MAGIC) {
+		lbn = lblkno(&sblock, dp1->di_size);
+		off = blkoff(&sblock, dp1->di_size);
+		blk = dp1->di_db[lbn];
+		size = sblksize(&sblock, (off_t)dp1->di_size, lbn);
+	} else {
+		lbn = lblkno(&sblock, dp2->di_size);
+		off = blkoff(&sblock, dp2->di_size);
+		blk = dp2->di_db[lbn];
+		size = sblksize(&sblock, (off_t)dp2->di_size, lbn);
+	}
+	if (off != 0) {
+		if (dir_extend(blk, nblk, off, ino) == -1)
+			return (-1);
+	} else {
+		blk = 0;
+		if (dir_insert(nblk, 0, ino) == -1)
+			return (-1);
+	}
+	if (sblock.fs_magic == FS_UFS1_MAGIC) {
+		dp1->di_blocks += (sblock.fs_bsize - size) / DEV_BSIZE;
+		dp1->di_db[lbn] = nblk;
+		dp1->di_size = lblktosize(&sblock, lbn+1);
+	} else {
+		dp2->di_blocks += (sblock.fs_bsize - size) / DEV_BSIZE;
+		dp2->di_db[lbn] = nblk;
+		dp2->di_size = lblktosize(&sblock, lbn+1);
+	}
+	if (putino(&disk) < 0) {
+		warn("Failed to write root inode");
+		return (-1);
+	}
+	if (cgwrite(&disk) < 0) {
+		warn("Failed to write updated cg");
+		sbdirty();
+		return (-1);
+	}
+	if (blk) {
+		if (cgbfree(&disk, blk, size) < 0) {
+			warn("Failed to write cg");
+			return (-1);
+		}
+	}
+
+	return (0);
+}
+
 static int
 indir_fill(ufs2_daddr_t blk, int level, int *resid)
 {
@@ -567,22 +785,20 @@ indir_fill(ufs2_daddr_t blk, int level, 
 	ufs1_daddr_t *bap1;
 	ufs2_daddr_t *bap2;
 	ufs2_daddr_t nblk;
-	struct fs *fs;
 	int ncnt;
 	int cnt;
 	int i;
 
-	fs = &disk.d_fs;
 	bzero(indirbuf, sizeof(indirbuf));
 	bap1 = (ufs1_daddr_t *)indirbuf;
 	bap2 = (void *)bap1;
 	cnt = 0;
-	for (i = 0; i < NINDIR(fs) && *resid != 0; i++) {
+	for (i = 0; i < NINDIR(&sblock) && *resid != 0; i++) {
 		nblk = journal_balloc();
 		if (nblk <= 0)
 			return (-1);
 		cnt++;
-		if (fs->fs_magic == FS_UFS1_MAGIC)
+		if (sblock.fs_magic == FS_UFS1_MAGIC)
 			*bap1++ = nblk;
 		else
 			*bap2++ = nblk;
@@ -594,13 +810,47 @@ indir_fill(ufs2_daddr_t blk, int level, 
 		} else 
 			(*resid)--;
 	}
-	if (bwrite(&disk, fsbtodb(fs, blk), indirbuf, fs->fs_bsize) <= 0) {
+	if (bwrite(&disk, fsbtodb(&sblock, blk), indirbuf,
+	    sblock.fs_bsize) <= 0) {
 		warn("Failed to write indirect");
 		return (-1);
 	}
 	return (cnt);
 }
 
+/*
+ * Clear the flag bits so the journal can be removed.
+ */
+void
+journal_clear(void)
+{
+	struct ufs1_dinode *dp1;
+	struct ufs2_dinode *dp2;
+	ino_t ino;
+	int mode;
+	void *ip;
+
+	ino = journal_findfile();
+	if (ino <= 0) {
+		warnx("Journal file does not exist");
+		return;
+	}
+	if (getino(&disk, &ip, ino, &mode) != 0) {
+		warn("Failed to get journal inode");
+		return;
+	}
+	dp2 = ip;
+	dp1 = ip;
+	if (sblock.fs_magic == FS_UFS1_MAGIC)
+		dp1->di_flags = 0;
+	else
+		dp2->di_flags = 0;
+	if (putino(&disk) < 0) {
+		warn("Failed to write journal inode");
+		return;
+	}
+}
+
 int
 journal_alloc(int64_t size)
 {
@@ -609,32 +859,39 @@ journal_alloc(int64_t size)
 	ufs2_daddr_t blk;
 	void *ip;
 	struct cg *cgp;
-	struct fs *fs;
 	int resid;
 	ino_t ino;
 	int blks;
 	int mode;
 	int i;
 
-	fs = &disk.d_fs;
 	cgp = &disk.d_cg;
 	ino = 0;
 
 	/*
+	 * If the journal file exists we can't allocate it.
+	 */
+	ino = journal_findfile();
+	if (ino > 0)
+		warnx("Journal file %s already exists, please remove.",
+		    SUJ_FILE);
+	if (ino != 0)
+		return (-1);
+	/*
 	 * If the user didn't supply a size pick one based on the filesystem
 	 * size constrained with hardcoded MIN and MAX values.  We opt for
 	 * 1/1024th of the filesystem up to MAX but not exceeding one CG and
 	 * not less than the MIN.
 	 */
 	if (size == 0) {
-		size = (fs->fs_size * fs->fs_bsize) / 1024;
+		size = (sblock.fs_size * sblock.fs_bsize) / 1024;
 		size = MIN(SUJ_MAX, size);
-		if (size / fs->fs_fsize > fs->fs_fpg)
-			size = fs->fs_fpg * fs->fs_fsize;
+		if (size / sblock.fs_fsize > sblock.fs_fpg)
+			size = sblock.fs_fpg * sblock.fs_fsize;
 		size = MAX(SUJ_MIN, size);
 	}
-	resid = blocks = size / fs->fs_bsize;
-	if (fs->fs_cstotal.cs_nbfree < blocks) {
+	resid = blocks = size / sblock.fs_bsize;
+	if (sblock.fs_cstotal.cs_nbfree < blocks) {
 		warn("Insufficient free space for %jd byte journal", size);
 		return (-1);
 	}
@@ -647,9 +904,9 @@ journal_alloc(int64_t size)
 			continue;
 		/*
 		 * Try to minimize fragmentation by requiring at least a
-		 * 1/8th of the blocks be present in each cg we use.
+		 * 1/16th of the blocks be present in each cg we use.
 		 */
-		if (cgp->cg_cs.cs_nbfree < blocks / 8)
+		if (cgp->cg_cs.cs_nbfree < blocks / 16)
 			continue;
 		ino = cgialloc(&disk);
 		if (ino <= 0)
@@ -668,22 +925,24 @@ journal_alloc(int64_t size)
 		 */
 		dp2 = ip;
 		dp1 = ip;
-		if (fs->fs_magic == FS_UFS1_MAGIC) {
+		if (sblock.fs_magic == FS_UFS1_MAGIC) {
 			bzero(dp1, sizeof(*dp1));
 			dp1->di_size = size;
-			dp1->di_mode = IFREG;
+			dp1->di_mode = IFREG | IREAD;
 			dp1->di_nlink = 1;
+			dp1->di_flags = SF_IMMUTABLE | SF_NOUNLINK;
 		} else {
 			bzero(dp2, sizeof(*dp2));
 			dp2->di_size = size;
-			dp2->di_mode = IFREG;
+			dp2->di_mode = IFREG | IREAD;
 			dp2->di_nlink = 1;
+			dp2->di_flags = SF_IMMUTABLE | SF_NOUNLINK;
 		}
 		for (i = 0; i < NDADDR && resid; i++, resid--) {
 			blk = journal_balloc();
 			if (blk <= 0)
 				goto out;
-			if (fs->fs_magic == FS_UFS1_MAGIC) {
+			if (sblock.fs_magic == FS_UFS1_MAGIC) {
 				dp1->di_db[i] = blk;
 				dp1->di_blocks++;
 			} else {
@@ -700,7 +959,7 @@ journal_alloc(int64_t size)
 				sbdirty();
 				goto out;
 			}
-			if (fs->fs_magic == FS_UFS1_MAGIC) {
+			if (sblock.fs_magic == FS_UFS1_MAGIC) {
 				dp1->di_ib[i] = blk;
 				dp1->di_blocks += blks;
 			} else {
@@ -708,10 +967,10 @@ journal_alloc(int64_t size)
 				dp2->di_blocks += blks;
 			}
 		}
-		if (fs->fs_magic == FS_UFS1_MAGIC)
-			dp1->di_blocks *= fs->fs_bsize / disk.d_bsize;
+		if (sblock.fs_magic == FS_UFS1_MAGIC)
+			dp1->di_blocks *= sblock.fs_bsize / disk.d_bsize;
 		else
-			dp2->di_blocks *= fs->fs_bsize / disk.d_bsize;
+			dp2->di_blocks *= sblock.fs_bsize / disk.d_bsize;
 		if (putino(&disk) < 0) {
 			warn("Failed to write inode");
 			sbdirty();
@@ -722,8 +981,11 @@ journal_alloc(int64_t size)
 			sbdirty();
 			return (-1);
 		}
-		fs->fs_sujournal = ino;
-		fs->fs_sujfree = 0;
+		if (journal_insertfile(ino) < 0) {
+			sbdirty();
+			return (-1);
+		}
+		sblock.fs_sujfree = 0;
 		return (0);
 	}
 	warnx("Insufficient contiguous free space for the journal.");

Modified: projects/suj/head/sys/sys/mount.h
==============================================================================
--- projects/suj/head/sys/sys/mount.h	Tue Jan 26 05:17:03 2010	(r203011)
+++ projects/suj/head/sys/sys/mount.h	Tue Jan 26 06:36:10 2010	(r203012)
@@ -240,7 +240,6 @@ void          __mnt_vnode_markerfree(str
 #define	MNT_NOCLUSTERR	0x40000000	/* disable cluster read */
 #define	MNT_NOCLUSTERW	0x80000000	/* disable cluster write */
 #define	MNT_NFS4ACLS	0x00000010
-#define	MNT_SUJ		0x00000080	/* softdep journaling */
 
 /*
  * NFS export related mount flags.
@@ -277,7 +276,7 @@ void          __mnt_vnode_markerfree(str
 			MNT_NOCLUSTERW	| MNT_SUIDDIR	| MNT_SOFTDEP	| \
 			MNT_IGNORE	| MNT_EXPUBLIC	| MNT_NOSYMFOLLOW | \
 			MNT_GJOURNAL	| MNT_MULTILABEL | MNT_ACLS	| \
-			MNT_NFS4ACLS	| MNT_SUJ)
+			MNT_NFS4ACLS)
 
 /* Mask of flags that can be updated. */
 #define	MNT_UPDATEMASK (MNT_NOSUID	| MNT_NOEXEC	| \
@@ -326,6 +325,7 @@ void          __mnt_vnode_markerfree(str
 #define	MNTK_REFEXPIRE	0x00000020	/* refcount expiring is happening */
 #define MNTK_EXTENDED_SHARED	0x00000040 /* Allow shared locking for more ops */
 #define	MNTK_SHARED_WRITES	0x00000080 /* Allow shared locking for writes */
+#define	MNTK_SUJ	0x00000100	/* Softdep journaling enabled */
 #define MNTK_UNMOUNT	0x01000000	/* unmount in progress */
 #define	MNTK_MWAIT	0x02000000	/* waiting for unmount to finish */
 #define	MNTK_SUSPEND	0x08000000	/* request write suspension */

Modified: projects/suj/head/sys/ufs/ffs/ffs_alloc.c
==============================================================================
--- projects/suj/head/sys/ufs/ffs/ffs_alloc.c	Tue Jan 26 05:17:03 2010	(r203011)
+++ projects/suj/head/sys/ufs/ffs/ffs_alloc.c	Tue Jan 26 06:36:10 2010	(r203012)
@@ -1851,6 +1851,7 @@ ffs_blkfree(ump, fs, devvp, bno, size, i
 	ino_t inum;
 	struct workhead *dephd;
 {
+	struct mount *mp;
 	struct cg *cgp;
 	struct buf *bp;
 	ufs1_daddr_t fragno, cgbno;
@@ -1965,7 +1966,8 @@ ffs_blkfree(ump, fs, devvp, bno, size, i
 	fs->fs_fmod = 1;
 	ACTIVECLEAR(fs, cg);
 	UFS_UNLOCK(ump);
-	if (UFSTOVFS(ump)->mnt_flag & MNT_SOFTDEP)
+	mp = UFSTOVFS(ump);
+	if (mp->mnt_flag & MNT_SOFTDEP)
 		softdep_setup_blkfree(UFSTOVFS(ump), bp, bno,
 		    numfrags(fs, size), dephd);
 	bdwrite(bp);

Modified: projects/suj/head/sys/ufs/ffs/ffs_softdep.c
==============================================================================
--- projects/suj/head/sys/ufs/ffs/ffs_softdep.c	Tue Jan 26 05:17:03 2010	(r203011)
+++ projects/suj/head/sys/ufs/ffs/ffs_softdep.c	Tue Jan 26 06:36:10 2010	(r203012)
@@ -1902,7 +1902,7 @@ softdep_unmount(mp)
 	struct mount *mp;
 {
 
-	if (mp->mnt_flag & MNT_SUJ)
+	if (mp->mnt_kern_flag & MNTK_SUJ)
 		journal_unmount(mp);
 }
 
@@ -2044,16 +2044,36 @@ journal_mount(mp, fs, cred)
 	struct fs *fs;
 	struct ucred *cred;
 {
+	struct componentname cnp;
 	struct jblocks *jblocks;
+	struct vnode *dvp;
 	struct vnode *vp;
 	struct inode *ip;
 	ufs2_daddr_t blkno;
+	ino_t sujournal;
 	int bcount;
 	int error;
 	int i;
 
-	mp->mnt_flag |= MNT_SUJ;
-	error = VFS_VGET(mp, fs->fs_sujournal, LK_EXCLUSIVE, &vp);
+	mp->mnt_kern_flag |= MNTK_SUJ;
+	error = VFS_VGET(mp, ROOTINO, LK_EXCLUSIVE, &dvp);
+	if (error)
+		return (error);
+	bzero(&cnp, sizeof(cnp));
+	cnp.cn_nameiop = LOOKUP;
+	cnp.cn_flags = ISLASTCN;
+	cnp.cn_thread = curthread;
+	cnp.cn_cred = curthread->td_ucred;
+	cnp.cn_pnbuf = SUJ_FILE;
+	cnp.cn_nameptr = SUJ_FILE;
+	cnp.cn_namelen = strlen(SUJ_FILE);
+	error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal);
+	vput(dvp);
+	if (error != 0) {
+		printf("Failed to find journal.  Use tunefs to create one\n");
+		return (error);
+	}
+	error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, &vp);
 	if (error)
 		return (error);
 	ip = VTOI(vp);
@@ -2075,9 +2095,18 @@ journal_mount(mp, fs, cred)
 	}
 	jblocks->jb_low = jblocks->jb_free / 3;	/* Reserve 33%. */
 	jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */
-	DIP_SET(ip, i_modrev, fs->fs_mtime);
-	ip->i_flags |= IN_MODIFIED;
-	ffs_update(vp, 1);
+	/*
+	 * Only validate the journal contents if the filesystem is clean,
+	 * otherwise we write the logs but they'll never be used.  If the
+	 * filesystem was still dirty when we mounted it the journal is
+	 * invalid and a new journal can only be valid if it starts from a
+	 * clean mount.
+	 */
+	if (fs->fs_clean) {
+		DIP_SET(ip, i_modrev, fs->fs_mtime);
+		ip->i_flags |= IN_MODIFIED;
+		ffs_update(vp, 1);
+	}
 	VFSTOUFS(mp)->softdep_jblocks = jblocks;
 out:
 	vput(vp);
@@ -2159,6 +2188,11 @@ remove_from_journal(wk)
 	ump->softdep_on_journal -= 1;
 }
 
+/*
+ * Check for journal space as well as dependency limits so the prelink
+ * code can throttle both journaled and non-journaled filesystems.
+ * Threshold is 0 for low and 1 for min.
+ */
 static int
 journal_space(ump, thresh)
 	struct ufsmount *ump;
@@ -2167,7 +2201,20 @@ journal_space(ump, thresh)
 	struct jblocks *jblocks;
 	int avail;
 
+	/*
+	 * We use a tighter restriction here to prevent request_cleanup()
+	 * running in threads from running into locks we currently hold.
+	 */
+	if (num_inodedep > (max_softdeps / 10) * 9)
+		return (0);
+
 	jblocks = ump->softdep_jblocks;
+	if (jblocks == NULL)
+		return (1);
+	if (thresh)
+		thresh = jblocks->jb_min;
+	else
+		thresh = jblocks->jb_low;
 	avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE;
 	avail = jblocks->jb_free - avail;
 
@@ -2210,15 +2257,13 @@ softdep_prealloc(vp, waitok)
 	struct vnode *vp;
 	int waitok;
 {
-	struct jblocks *jblocks;
 	struct ufsmount *ump;
 
 	if (DOINGSUJ(vp) == 0)
 		return (0);
 	ump = VFSTOUFS(vp->v_mount);
-	jblocks = ump->softdep_jblocks;
 	ACQUIRE_LOCK(&lk);
-	if (journal_space(ump, jblocks->jb_low)) {
+	if (journal_space(ump, 0)) {
 		FREE_LOCK(&lk);
 		return (0);
 	}
@@ -2233,9 +2278,9 @@ softdep_prealloc(vp, waitok)
 	ffs_syncvnode(vp, waitok);
 	ACQUIRE_LOCK(&lk);
 	process_removes(vp);
-	if (journal_space(ump, jblocks->jb_low) == 0) {
+	if (journal_space(ump, 0) == 0) {
 		softdep_speedup();
-		if (journal_space(ump, jblocks->jb_min) == 0)
+		if (journal_space(ump, 1) == 0)
 			journal_suspend(ump);
 	}
 	FREE_LOCK(&lk);
@@ -2243,18 +2288,22 @@ softdep_prealloc(vp, waitok)
 	return (0);

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201001260636.o0Q6aAwh005669>