Skip site navigation (1)Skip section navigation (2)
Date:      Thu, 4 Oct 2018 05:54:57 +0000 (UTC)
From:      Pawel Jakub Dawidek <pjd@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r339177 - head/contrib/openbsm/bin/auditdistd
Message-ID:  <201810040554.w945svpV002413@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: pjd
Date: Thu Oct  4 05:54:57 2018
New Revision: 339177
URL: https://svnweb.freebsd.org/changeset/base/339177

Log:
  When the adist_free list is empty and we lose connection to the receiver we
  move all elements from the adist_send and adist_recv lists back onto the
  adist_free list, but we don't wake consumers waitings for the adist_free list
  to become non-empty. This can lead to the sender process stopping audit trail
  files distribution and waiting forever.
  
  Fix the problem by adding the missing wakeup.
  
  While here slow down spinning on CPU in case of a short race in
  sender_disconnect() and add an explaination when it can occur.
  
  PR:		201953
  Reported by:	peter
  Approved by:	re (kib)

Modified:
  head/contrib/openbsm/bin/auditdistd/auditdistd.h
  head/contrib/openbsm/bin/auditdistd/sender.c

Modified: head/contrib/openbsm/bin/auditdistd/auditdistd.h
==============================================================================
--- head/contrib/openbsm/bin/auditdistd/auditdistd.h	Thu Oct  4 05:48:09 2018	(r339176)
+++ head/contrib/openbsm/bin/auditdistd/auditdistd.h	Thu Oct  4 05:54:57 2018	(r339177)
@@ -248,6 +248,21 @@ struct adrep {
 	if (_wakeup)							\
 		cv_signal(list##_cond);					\
 } while (0)
+#define	QUEUE_CONCAT2(tolist, fromlist1, fromlist2)	do {		\
+	bool _wakeup;							\
+									\
+	mtx_lock(tolist##_lock);					\
+	_wakeup = TAILQ_EMPTY(tolist);					\
+	mtx_lock(fromlist1##_lock);					\
+	TAILQ_CONCAT((tolist), (fromlist1), adr_next);			\
+	mtx_unlock(fromlist1##_lock);					\
+	mtx_lock(fromlist2##_lock);					\
+	TAILQ_CONCAT((tolist), (fromlist2), adr_next);			\
+	mtx_unlock(fromlist2##_lock);					\
+	mtx_unlock(tolist##_lock);					\
+	if (_wakeup)							\
+		cv_signal(tolist##_cond);				\
+} while (0)
 #define	QUEUE_WAIT(list)	do {					\
 	mtx_lock(list##_lock);						\
 	while (TAILQ_EMPTY(list))					\

Modified: head/contrib/openbsm/bin/auditdistd/sender.c
==============================================================================
--- head/contrib/openbsm/bin/auditdistd/sender.c	Thu Oct  4 05:48:09 2018	(r339176)
+++ head/contrib/openbsm/bin/auditdistd/sender.c	Thu Oct  4 05:54:57 2018	(r339177)
@@ -342,14 +342,7 @@ sender_disconnect(void)
 	pjdlog_warning("Disconnected from %s.", adhost->adh_remoteaddr);
 
 	/* Move all in-flight requests back onto free list. */
-	mtx_lock(&adist_free_list_lock);
-	mtx_lock(&adist_send_list_lock);
-	TAILQ_CONCAT(&adist_free_list, &adist_send_list, adr_next);
-	mtx_unlock(&adist_send_list_lock);
-	mtx_lock(&adist_recv_list_lock);
-	TAILQ_CONCAT(&adist_free_list, &adist_recv_list, adr_next);
-	mtx_unlock(&adist_recv_list_lock);
-	mtx_unlock(&adist_free_list_lock);
+	QUEUE_CONCAT2(&adist_free_list, &adist_send_list, &adist_recv_list);
 }
 
 static void
@@ -609,9 +602,13 @@ recv_thread(void *arg __unused)
 		if (adhost->adh_remote == NULL) {
 			/*
 			 * Connection is dead.
-			 * XXX: We shouldn't be here.
+			 * There is a short race in sender_disconnect() between
+			 * setting adh_remote to NULL and removing entries from
+			 * the recv list, which can result in us being here.
+			 * To avoid just spinning, wait for 0.1s.
 			 */
 			rw_unlock(&adist_remote_lock);
+			usleep(100000);
 			continue;
 		}
 		if (proto_recv(adhost->adh_remote, &adrep,



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201810040554.w945svpV002413>