Date:      Sun, 25 Aug 2013 20:56:17 +0300
From:      Mikolaj Golub <trociny@FreeBSD.org>
To:        Yamagi Burmeister <lists@yamagi.org>
Cc:        freebsd-fs@freebsd.org
Subject:   Re: 9.2-RC1: LORs / Deadlock with SU+J on HAST in "memsync" mode
Message-ID:  <20130825175616.GA3472@gmail.com>
In-Reply-To: <20130822121341.0f27cb5e372d12bab8725654@yamagi.org>
References:  <20130819115101.ae9c0cf788f881dc4de464c5@yamagi.org> <20130822121341.0f27cb5e372d12bab8725654@yamagi.org>


--uAKRQypu60I7Lcqm
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

On Thu, Aug 22, 2013 at 12:13:41PM +0200, Yamagi Burmeister wrote:

> After upgrading some systems to FreeBSD 9.2-RC1/RC2 and switching
> HAST to the new "memsync" mode, I've seen processes getting stuck
> when accessing files on UFS filesystems with SU+J enabled. Testing
> showed that this only seems to happen (though since I couldn't
> reproduce it in other combinations I'm not quite sure it's really
> the case) when HAST is running in "memsync" mode and the UFS
> filesystem on HAST has SU+J enabled. It can be reproduced easily
> with the instructions below.

I think I found (and reproduced) a scenario in which the primary might
leak a HAST I/O request (hio), resulting in I/O getting stuck.

This may happen when the secondary disconnects while there are pending
WRITE requests on the primary's hio_recv list.

In the primary's remote_recv_thread():
  * hast_proto_recv_hdr() returns "Unable to receive reply header";
  * continue (restart loop);
  * memsyncack = false;
  * take hio from hio_recv list;
  * goto done_queue;
  * the hio has:
      hio_replication == MEMSYNC
      hio_countdown == 2 (local write complete, not in the local_send
      queue)
    thus,
  * refcnt_release(&hio->hio_countdown) => hio_countdown == 1;
  * !memsyncack => continue;

As a result, the hio is not put on any queue and the request is leaked.
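
To make the countdown arithmetic easier to follow, here is a toy,
self-contained model of that control flow. It is not hastd code: the
toy_* types and helpers are invented for illustration and only mimic
the dispatch described above.

#include <stdbool.h>
#include <stdio.h>

/* Toy stand-in for struct hio; only the reference count matters here. */
struct toy_hio {
	unsigned int countdown;		/* models hio_countdown */
};

/* Models refcnt_release(): drop one reference, return the new count. */
static unsigned int
toy_refcnt_release(unsigned int *count)
{

	return (--*count);
}

/*
 * Models the memsync branch of done_queue in remote_recv_thread().
 * Returns the name of the queue the toy request ends up on, or
 * "nowhere (leaked)" if it ends up on no queue at all.
 */
static const char *
toy_done_queue(struct toy_hio *hio, bool memsyncack)
{

	switch (toy_refcnt_release(&hio->countdown)) {
	case 0:
		/* Last reference dropped: request goes to the done queue. */
		return ("done queue");
	case 1:
		if (memsyncack) {
			/* Memsync ack processed; wait for the final reply. */
			return ("recv queue");
		}
		/*
		 * No memsync ack.  Nothing is queued on this path; after a
		 * disconnect no further reply will arrive and the local
		 * write has already finished, so the last reference is
		 * never dropped.
		 */
		return ("nowhere (leaked)");
	default:
		/* Count still >= 2: the request keeps waiting. */
		return ("recv queue");
	}
}

int
main(void)
{
	/* The scenario above: local write done, secondary disconnected. */
	struct toy_hio hio = { .countdown = 2 };

	printf("countdown == 2, no memsync ack -> %s\n",
	    toy_done_queue(&hio, false));
	return (0);
}

Compiled and run, it prints "nowhere (leaked)" for the scenario above:
nobody is left to drop the last reference.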

I am attaching a patch intended to fix this. In done_queue, it checks
whether the request was interrupted by a disconnection, and if it was
and the request has already completed locally, it is put on the done
queue. To disambiguate requests I had to add a flag to hio that
records whether the memsync ack from the secondary has already been
received.
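
Condensed into the same kind of toy model (again invented types, not
the real code; "disconnected" stands in for hio_errors[ncomp] ==
ENOTCONN and TOY_MEMSYNC_ACK for HIOF_MEMSYNC_ACK), the patched
dispatch looks roughly like this:

#include <stdbool.h>
#include <stdio.h>

#define	TOY_MEMSYNC_ACK	0x01		/* models HIOF_MEMSYNC_ACK */

struct toy_hio {
	unsigned int	countdown;	/* models hio_countdown */
	int		flags;		/* models hio_flags */
	bool		disconnected;	/* models hio_errors[ncomp] == ENOTCONN */
};

static unsigned int
toy_refcnt_release(unsigned int *count)
{

	return (--*count);
}

/* Patched dispatch: disconnected requests always reach the done queue. */
static const char *
toy_done_queue_patched(struct toy_hio *hio, bool memsyncack)
{

	switch (toy_refcnt_release(&hio->countdown)) {
	case 0:
		return ("done queue");
	case 1:
		if (hio->disconnected) {
			/*
			 * Drop the reference for the reply that will never
			 * arrive, unless the memsync ack was already
			 * counted, and let the request fall through to the
			 * done queue.
			 */
			if ((hio->flags & TOY_MEMSYNC_ACK) == 0)
				toy_refcnt_release(&hio->countdown);
			return ("done queue");
		}
		return (memsyncack ? "recv queue" : "nowhere (old leak)");
	case 2:
		if (hio->disconnected) {
			toy_refcnt_release(&hio->countdown);
			return ("done queue");
		}
		return ("recv queue");
	default:
		return ("recv queue");
	}
}

int
main(void)
{
	/* The leak scenario from above, with the patched handling. */
	struct toy_hio hio = { .countdown = 2, .flags = 0,
	    .disconnected = true };

	printf("countdown == 2, disconnected -> %s\n",
	    toy_done_queue_patched(&hio, false));
	return (0);
}

With the same input as before it now prints "done queue" instead of
leaking the request.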

Yamagi, I don't know if this is your case, but could you try the
patch?

If it does not help then please, after the hang, get core images of
the worker processes (both primary and secondary) using gcore(1) and
provide them together with the hastd binary and the libraries it is
linked with (from the `ldd /sbin/hastd' output). Note that core files
might expose sensitive information from your host; if this worries
you, you can send them to me privately.

Also, the output of `grep hastd /var/log/all.log' from both the
primary and the secondary might be useful.

-- 
Mikolaj Golub

--uAKRQypu60I7Lcqm
Content-Type: text/x-diff; charset=us-ascii
Content-Disposition: inline; filename="primary.c.memsync.hio_leack.1.patch"

Index: sbin/hastd/primary.c
===================================================================
--- sbin/hastd/primary.c	(revision 254760)
+++ sbin/hastd/primary.c	(working copy)
@@ -90,9 +90,9 @@ struct hio {
 	 */
 	struct g_gate_ctl_io	 hio_ggio;
 	/*
-	 * Request was already confirmed to GEOM Gate.
+	 * HIOF_* flags.
 	 */
-	bool			 hio_done;
+	int			 hio_flags;
 	/*
 	 * Remember replication from the time the request was initiated,
 	 * so we won't get confused when replication changes on reload.
@@ -104,6 +104,12 @@ struct hio {
 #define	hio_done_next	hio_next[0]
 
 /*
+ * Flags kept in hio_flags.
+ */
+#define	HIOF_DONE		0x00000001 /* Confirmed to GEOM Gate. */
+#define	HIOF_MEMSYNC_ACK	0x00000002 /* Memsync acknowledged. */
+
+/*
  * Free list holds unused structures. When free list is empty, we have to wait
  * until some in-progress requests are freed.
  */
@@ -1119,7 +1125,7 @@ write_complete(struct hast_resource *res, struct h
 	struct g_gate_ctl_io *ggio;
 	unsigned int ncomp;
 
-	PJDLOG_ASSERT(!hio->hio_done);
+	PJDLOG_ASSERT((hio->hio_flags & HIOF_DONE) == 0);
 
 	ggio = &hio->hio_ggio;
 	PJDLOG_ASSERT(ggio->gctl_cmd == BIO_WRITE);
@@ -1143,7 +1149,7 @@ write_complete(struct hast_resource *res, struct h
 	rw_unlock(&hio_remote_lock[ncomp]);
 	if (ioctl(res->hr_ggatefd, G_GATE_CMD_DONE, ggio) == -1)
 		primary_exit(EX_OSERR, "G_GATE_CMD_DONE failed");
-	hio->hio_done = true;
+	hio->hio_flags |= HIOF_DONE;
 }
 
 /*
@@ -1174,7 +1180,7 @@ ggate_recv_thread(void *arg)
 		ggio->gctl_unit = res->hr_ggateunit;
 		ggio->gctl_length = MAXPHYS;
 		ggio->gctl_error = 0;
-		hio->hio_done = false;
+		hio->hio_flags = 0;
 		hio->hio_replication = res->hr_replication;
 		pjdlog_debug(2,
 		    "ggate_recv: (%p) Waiting for request from the kernel.",
@@ -1710,6 +1716,7 @@ remote_recv_thread(void *arg)
 			TAILQ_REMOVE(&hio_recv_list[ncomp], hio,
 			    hio_next[ncomp]);
 			mtx_unlock(&hio_recv_list_lock[ncomp]);
+			hio->hio_errors[ncomp] = ENOTCONN;
 			goto done_queue;
 		}
 		if (hast_proto_recv_hdr(res->hr_remotein, &nv) == -1) {
@@ -1811,6 +1818,13 @@ done_queue:
 				PJDLOG_ASSERT(!memsyncack);
 				break;
 			case 1:
+				if (hio->hio_errors[ncomp] == ENOTCONN) {
+					if ((hio->hio_flags & HIOF_MEMSYNC_ACK)
+					    == 0) {
+						refcnt_release(&hio->hio_countdown);
+					}
+					break;
+				}
 				if (memsyncack) {
 					/*
 					 * Local request already finished, so we
@@ -1822,6 +1836,7 @@ done_queue:
 					 * We still need to wait for final
 					 * remote reply.
 					 */
+					hio->hio_flags |= HIOF_MEMSYNC_ACK;
 					pjdlog_debug(2,
 					    "remote_recv: (%p) Moving request back to the recv queue.",
 					    hio);
@@ -1838,6 +1853,10 @@ done_queue:
 				}
 				continue;
 			case 2:
+				if (hio->hio_errors[ncomp] == ENOTCONN) {
+					refcnt_release(&hio->hio_countdown);
+					break;
+				}
 				/*
 				 * We received remote memsync reply even before
 				 * local write finished.
@@ -1847,6 +1866,7 @@ done_queue:
 				pjdlog_debug(2,
 				    "remote_recv: (%p) Moving request back to the recv queue.",
 				    hio);
+				hio->hio_flags |= HIOF_MEMSYNC_ACK;
 				mtx_lock(&hio_recv_list_lock[ncomp]);
 				TAILQ_INSERT_TAIL(&hio_recv_list[ncomp], hio,
 				    hio_next[ncomp]);
@@ -1931,7 +1951,7 @@ ggate_send_thread(void *arg)
 			if (range_sync_wait)
 				cv_signal(&range_sync_cond);
 			mtx_unlock(&range_lock);
-			if (!hio->hio_done)
+			if ((hio->hio_flags & HIOF_DONE) == 0)
 				write_complete(res, hio);
 		} else {
 			if (ioctl(res->hr_ggatefd, G_GATE_CMD_DONE, ggio) == -1) {
@@ -2114,7 +2134,7 @@ sync_thread(void *arg __unused)
 		ggio->gctl_offset = offset;
 		ggio->gctl_length = length;
 		ggio->gctl_error = 0;
-		hio->hio_done = false;
+		hio->hio_flags = 0;
 		hio->hio_replication = res->hr_replication;
 		for (ii = 0; ii < ncomps; ii++)
 			hio->hio_errors[ii] = EINVAL;

--uAKRQypu60I7Lcqm--


