Date:      Sat, 22 Feb 2020 03:44:10 +0000 (UTC)
From:      Jeff Roberson <jeff@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r358236 - in head/sys: kern sys vm
Message-ID:  <202002220344.01M3iAqI013978@repo.freebsd.org>

Author: jeff
Date: Sat Feb 22 03:44:10 2020
New Revision: 358236
URL: https://svnweb.freebsd.org/changeset/base/358236

Log:
  Add an atomic-free tick moderated lazy update variant of SMR.
  
  This enables very cheap read sections with free-to-use latencies and memory
  overhead similar to epoch.  On a recent AMD platform a read section cost
  1ns vs 5ns for the default SMR.  On Xeon the numbers should be more like
  1ns vs 11ns.  The memory consumption should be proportional to the product
  of the free rate and 2*1/hz while normal SMR consumption is proportional
  to the product of free rate and maximum read section time.
  
  While here refactor the code to make future additions more
  straightforward.
  
  Name the overall technique Global Unbounded Sequences (GUS) and adjust some
  comments accordingly.  This helps distinguish discussions of the general
  technique (SMR) vs this specific implementation (GUS).
  
  Discussed with:	rlibby, markj

Modified:
  head/sys/kern/subr_smr.c
  head/sys/sys/_smr.h
  head/sys/sys/smr.h
  head/sys/vm/uma_core.c
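
To put the new interface in context, here is a minimal consumer sketch of
the lazy variant.  Only smr_create(), SMR_LAZY, smr_lazy_enter(),
smr_lazy_exit(), smr_advance() and smr_poll() below come from this
revision; the table, the entry layout and the helper routines are
hypothetical.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/smr.h>

struct entry {
	struct entry	*e_next;	/* illustrative lockless list */
	uint32_t	 e_key;
	uint32_t	 e_val;
};

static void	entry_free(struct entry *);			/* placeholder */
static void	table_defer_free(struct entry *, smr_seq_t);	/* placeholder */

static struct entry *table_head;	/* modified under a writer lock */
static smr_t table_smr;

static void
table_init(void)
{
	/* 'limit' is only consulted for SMR_DEFERRED; pass 0 here. */
	table_smr = smr_create("table", 0, SMR_LAZY);
}

static bool
table_lookup(uint32_t key, uint32_t *valp)
{
	struct entry *e;
	bool found;

	found = false;
	smr_lazy_enter(table_smr);	/* no atomics in the read section */
	/* A full consumer would load these via smr_entered_load(). */
	for (e = table_head; e != NULL; e = e->e_next) {
		if (e->e_key == key) {
			*valp = e->e_val;	/* copy out inside the section */
			found = true;
			break;
		}
	}
	smr_lazy_exit(table_smr);
	return (found);
}

static void
table_remove_finish(struct entry *e)
{
	smr_seq_t goal;

	/* The caller holds the writer lock and has already unlinked 'e'. */
	goal = smr_advance(table_smr);
	/* Blocking polls are not allowed on lazy SMRs; defer if unexpired. */
	if (smr_poll(table_smr, goal, false))
		entry_free(e);
	else
		table_defer_free(e, goal);	/* poll again from a callout */
}

At hz=1000 the log's estimate of free rate times 2*1/hz means a workload
freeing a million entries per second keeps on the order of two thousand
entries awaiting reclamation at any moment.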

Modified: head/sys/kern/subr_smr.c
==============================================================================
--- head/sys/kern/subr_smr.c	Sat Feb 22 03:14:05 2020	(r358235)
+++ head/sys/kern/subr_smr.c	Sat Feb 22 03:44:10 2020	(r358236)
@@ -41,6 +41,8 @@ __FBSDID("$FreeBSD$");
 #include <vm/uma.h>
 
 /*
+ * Global Unbounded Sequences (GUS)
+ *
  * This is a novel safe memory reclamation technique inspired by
  * epoch based reclamation from Samy Al Bahra's concurrency kit which
  * in turn was based on work described in:
@@ -53,7 +55,8 @@ __FBSDID("$FreeBSD$");
  * This is not an implementation of hazard pointers or related
  * techniques.  The term safe memory reclamation is used as a
  * generic descriptor for algorithms that defer frees to avoid
- * use-after-free errors with lockless datastructures.
+ * use-after-free errors with lockless datastructures or as
+ * a mechanism to detect quiescence for writer synchronization.
  *
  * The basic approach is to maintain a monotonic write sequence
  * number that is updated on some application defined granularity.
@@ -67,7 +70,7 @@ __FBSDID("$FreeBSD$");
  * a global write clock that is used to mark memory on free.
  *
  * The write and read sequence numbers can be thought of as a two
- * handed clock with readers always advancing towards writers.  SMR
+ * handed clock with readers always advancing towards writers.  GUS 
  * maintains the invariant that all readers can safely access memory
  * that was visible at the time they loaded their copy of the sequence
  * number.  Periodically the read sequence or hand is polled and
@@ -80,9 +83,12 @@ __FBSDID("$FreeBSD$");
  * A stored sequence number that falls outside of this range has expired
  * and needs no scan to reclaim.
  *
- * A notable distinction between this SMR and Epoch, qsbr, rcu, etc. is
+ * A notable distinction between GUS and Epoch, qsbr, rcu, etc. is
  * that advancing the sequence number is decoupled from detecting its
- * observation.  This results in a more granular assignment of sequence
+ * observation.  That is to say, the delta between read and write
+ * sequence numbers is not bounded.  This can be thought of as a more
+ * generalized form of epoch, which requires them to be at most one step
+ * apart.  This results in a more granular assignment of sequence
  * numbers even as read latencies prohibit all or some expiration.
  * It also allows writers to advance the sequence number and save the
  * poll for expiration until a later time when it is likely to
@@ -164,60 +170,145 @@ static uma_zone_t smr_zone;
 #define	SMR_SEQ_MAX_ADVANCE	SMR_SEQ_MAX_DELTA / 2
 #endif
 
+/*
+ * The grace period for lazy (tick based) SMR.
+ *
+ * Hardclock is responsible for advancing ticks on a single CPU while every
+ * CPU receives a regular clock interrupt.  The clock interrupts are flushing
+ * the store buffers and any speculative loads that may violate our invariants.
+ * Because these interrupts are not synchronized we must wait one additional
+ * tick in the future to be certain that all processors have had their state
+ * synchronized by an interrupt.
+ *
+ * This assumes that the clock interrupt will only be delayed by other causes
+ * that will flush the store buffer or prevent access to the section protected
+ * data.  For example, an idle processor, a system management interrupt,
+ * or a vm exit.
+ *
+ * We must wait one additional tick if we are around the wrap condition
+ * because the write seq will move forward by two with one interrupt.
+ */
+#define	SMR_LAZY_GRACE		2
+#define	SMR_LAZY_GRACE_MAX	(SMR_LAZY_GRACE + 1)
+
+/*
+ * The maximum sequence number ahead of wr_seq that may still be valid.  The
+ * sequence may not be advanced on write for lazy or deferred SMRs.  In this
+ * case poll needs to attempt to forward the sequence number if the goal is
+ * within wr_seq + SMR_SEQ_ADVANCE.
+ */
+#define	SMR_SEQ_ADVANCE		MAX(SMR_SEQ_INCR, SMR_LAZY_GRACE_MAX)
+
 static SYSCTL_NODE(_debug, OID_AUTO, smr, CTLFLAG_RW, NULL, "SMR Stats");
 static counter_u64_t advance = EARLY_COUNTER;
-SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, advance, CTLFLAG_RD, &advance, "");
+SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, advance, CTLFLAG_RW, &advance, "");
 static counter_u64_t advance_wait = EARLY_COUNTER;
-SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, advance_wait, CTLFLAG_RD, &advance_wait, "");
+SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, advance_wait, CTLFLAG_RW, &advance_wait, "");
 static counter_u64_t poll = EARLY_COUNTER;
-SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, poll, CTLFLAG_RD, &poll, "");
+SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, poll, CTLFLAG_RW, &poll, "");
 static counter_u64_t poll_scan = EARLY_COUNTER;
-SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, poll_scan, CTLFLAG_RD, &poll_scan, "");
+SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, poll_scan, CTLFLAG_RW, &poll_scan, "");
+static counter_u64_t poll_fail = EARLY_COUNTER;
+SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, poll_fail, CTLFLAG_RW, &poll_fail, "");
 
-
 /*
- * Advance the write sequence and return the new value for use as the
- * wait goal.  This guarantees that any changes made by the calling
- * thread prior to this call will be visible to all threads after
- * rd_seq meets or exceeds the return value.
+ * Advance a lazy write sequence number.  These move forward at the rate of
+ * ticks.  Grace is two ticks in the future.  Lazy write sequence numbers can
+ * be even but not SMR_SEQ_INVALID, so we pause time for a tick when we wrap.
  *
- * This function may busy loop if the readers are roughly 1 billion
- * sequence numbers behind the writers.
+ * This returns the _current_ write sequence number.  The lazy goal sequence
+ * number is SMR_LAZY_GRACE ticks ahead.
  */
-smr_seq_t
-smr_advance(smr_t smr)
+static smr_seq_t
+smr_lazy_advance(smr_t smr, smr_shared_t s)
 {
-	smr_shared_t s;
-	smr_seq_t goal, s_rd_seq;
+	smr_seq_t s_rd_seq, s_wr_seq, goal;
+	int t;
 
+	CRITICAL_ASSERT(curthread);
+
 	/*
-	 * It is illegal to enter while in an smr section.
+	 * Load s_wr_seq prior to ticks to ensure that the thread that
+	 * observes the largest value wins.
 	 */
-	SMR_ASSERT_NOT_ENTERED(smr);
+	s_wr_seq = atomic_load_acq_int(&s->s_wr_seq);
 
 	/*
-	 * Modifications not done in a smr section need to be visible
-	 * before advancing the seq.
+	 * We must not allow a zero tick value.  We go back in time one tick
+	 * and advance the grace period forward one tick around zero.
 	 */
-	atomic_thread_fence_rel();
+	t = ticks;
+	if (t == SMR_SEQ_INVALID)
+		t--;
 
 	/*
-	 * Load the current read seq before incrementing the goal so
-	 * we are guaranteed it is always < goal.
+	 * The most probable condition is that the update already took place.
 	 */
-	s = zpcpu_get(smr)->c_shared;
-	s_rd_seq = atomic_load_acq_int(&s->s_rd_seq);
+	if (__predict_true(t == s_wr_seq))
+		goto out;
 
 	/*
-	 * Increment the shared write sequence by 2.  Since it is
-	 * initialized to 1 this means the only valid values are
-	 * odd and an observed value of 0 in a particular CPU means
-	 * it is not currently in a read section.
+	 * After long idle periods the read sequence may fall too far
+	 * behind write.  Prevent poll from ever seeing this condition
+	 * by updating the stale rd_seq.  This assumes that there can
+	 * be no valid section 2bn ticks old.  The rd_seq update must
+	 * be visible before wr_seq to avoid races with other advance
+	 * callers.
 	 */
-	goal = atomic_fetchadd_int(&s->s_wr_seq, SMR_SEQ_INCR) + SMR_SEQ_INCR;
+	s_rd_seq = atomic_load_int(&s->s_rd_seq);
+	if (SMR_SEQ_GT(s_rd_seq, t))
+		atomic_cmpset_rel_int(&s->s_rd_seq, s_rd_seq, t);
+
+	/*
+	 * Release to synchronize with the wr_seq load above.  Ignore
+	 * cmpset failures from simultaneous updates.
+	 */
+	atomic_cmpset_rel_int(&s->s_wr_seq, s_wr_seq, t);
 	counter_u64_add(advance, 1);
+	/* If we lost either update race another thread did it. */
+	s_wr_seq = t;
+out:
+	goal = s_wr_seq + SMR_LAZY_GRACE;
+	/* Skip over the SMR_SEQ_INVALID tick. */
+	if (goal < SMR_LAZY_GRACE)
+		goal++;
+	return (goal);
+}
 
+/*
+ * Increment the shared write sequence by 2.  Since it is initialized
+ * to 1 this means the only valid values are odd and an observed value
+ * of 0 in a particular CPU means it is not currently in a read section.
+ */
+static smr_seq_t
+smr_shared_advance(smr_shared_t s)
+{
+
+	return (atomic_fetchadd_int(&s->s_wr_seq, SMR_SEQ_INCR) + SMR_SEQ_INCR);
+}
+
+/*
+ * Advance the write sequence number for a normal smr section.  If the
+ * read sequence falls too far behind the write sequence we have to poll
+ * to advance rd_seq and prevent undetectable wraps.
+ */
+static smr_seq_t
+smr_default_advance(smr_t smr, smr_shared_t s)
+{
+	smr_seq_t goal, s_rd_seq;
+
+	CRITICAL_ASSERT(curthread);
+	KASSERT((zpcpu_get(smr)->c_flags & SMR_LAZY) == 0,
+	    ("smr_default_advance: called with lazy smr."));
+
 	/*
+	 * Load the current read seq before incrementing the goal so
+	 * we are guaranteed it is always < goal.
+	 */
+	s_rd_seq = atomic_load_acq_int(&s->s_rd_seq);
+	goal = smr_shared_advance(s);
+
+	/*
 	 * Force a synchronization here if the goal is getting too
 	 * far ahead of the read sequence number.  This keeps the
 	 * wrap detecting arithmetic working in pathological cases.
@@ -226,33 +317,175 @@ smr_advance(smr_t smr)
 		counter_u64_add(advance_wait, 1);
 		smr_wait(smr, goal - SMR_SEQ_MAX_ADVANCE);
 	}
+	counter_u64_add(advance, 1);
 
 	return (goal);
 }
 
+/*
+ * Deferred SMRs conditionally update s_wr_seq based on a
+ * cpu-local interval count.
+ */
+static smr_seq_t
+smr_deferred_advance(smr_t smr, smr_shared_t s, smr_t self)
+{
+
+	if (++self->c_deferred < self->c_limit)
+		return (smr_shared_current(s) + SMR_SEQ_INCR);
+	self->c_deferred = 0;
+	return (smr_default_advance(smr, s));
+}
+
+/*
+ * Advance the write sequence and return the value for use as the
+ * wait goal.  This guarantees that any changes made by the calling
+ * thread prior to this call will be visible to all threads after
+ * rd_seq meets or exceeds the return value.
+ *
+ * This function may busy loop if the readers are roughly 1 billion
+ * sequence numbers behind the writers.
+ *
+ * Lazy SMRs will not busy loop and the wrap happens every 49.6 days
+ * at 1kHz and 119 hours at 10kHz.  Readers can block for no longer
+ * than half of this for SMR_SEQ_ macros to continue working.
+ */
 smr_seq_t
-smr_advance_deferred(smr_t smr, int limit)
+smr_advance(smr_t smr)
 {
+	smr_t self;
+	smr_shared_t s;
 	smr_seq_t goal;
-	smr_t csmr;
+	int flags;
 
+	/*
+	 * It is illegal to enter while in an smr section.
+	 */
 	SMR_ASSERT_NOT_ENTERED(smr);
 
+	/*
+	 * Modifications not done in a smr section need to be visible
+	 * before advancing the seq.
+	 */
+	atomic_thread_fence_rel();
+
 	critical_enter();
-	csmr = zpcpu_get(smr);
-	if (++csmr->c_deferred >= limit) {
-		goal = SMR_SEQ_INVALID;
-		csmr->c_deferred = 0;
-	} else
-		goal = smr_shared_current(csmr->c_shared) + SMR_SEQ_INCR;
+	/* Try to touch the line once. */
+	self = zpcpu_get(smr);
+	s = self->c_shared;
+	flags = self->c_flags;
+	goal = SMR_SEQ_INVALID;
+	if ((flags & (SMR_LAZY | SMR_DEFERRED)) == 0)
+		goal = smr_default_advance(smr, s);
+	else if ((flags & SMR_LAZY) != 0)
+		goal = smr_lazy_advance(smr, s);
+	else if ((flags & SMR_DEFERRED) != 0)
+		goal = smr_deferred_advance(smr, s, self);
 	critical_exit();
-	if (goal != SMR_SEQ_INVALID)
-		return (goal);
 
-	return (smr_advance(smr));
+	return (goal);
 }
 
 /*
+ * Poll to determine the currently observed sequence number on a cpu
+ * and spinwait if the 'wait' argument is true.
+ */
+static smr_seq_t
+smr_poll_cpu(smr_t c, smr_seq_t s_rd_seq, smr_seq_t goal, bool wait)
+{
+	smr_seq_t c_seq;
+
+	c_seq = SMR_SEQ_INVALID;
+	for (;;) {
+		c_seq = atomic_load_int(&c->c_seq);
+		if (c_seq == SMR_SEQ_INVALID)
+			break;
+
+		/*
+		 * There is a race described in smr.h:smr_enter that
+		 * can lead to a stale seq value but not stale data
+		 * access.  If we find a value out of range here we
+		 * pin it to the current min to prevent it from
+		 * advancing until that stale section has expired.
+		 *
+		 * The race is created when a cpu loads the s_wr_seq
+		 * value in a local register and then another thread
+		 * advances s_wr_seq and calls smr_poll() which will
+		 * observe no value yet in c_seq and advance s_rd_seq
+		 * up to s_wr_seq, which is beyond the register
+		 * cached value.  This is only likely to happen on
+		 * a hypervisor or with a system management interrupt.
+		 */
+		if (SMR_SEQ_LT(c_seq, s_rd_seq))
+			c_seq = s_rd_seq;
+
+		/*
+		 * If the sequence number meets the goal we are done
+		 * with this cpu.
+		 */
+		if (SMR_SEQ_LEQ(goal, c_seq))
+			break;
+
+		if (!wait)
+			break;
+		cpu_spinwait();
+	}
+
+	return (c_seq);
+}
+
+/*
+ * Loop until all cores have observed the goal sequence or have
+ * gone inactive.  Returns the oldest sequence currently active.
+ *
+ * This function assumes a snapshot of sequence values has
+ * been obtained and validated by smr_poll().
+ */
+static smr_seq_t
+smr_poll_scan(smr_t smr, smr_shared_t s, smr_seq_t s_rd_seq,
+    smr_seq_t s_wr_seq, smr_seq_t goal, bool wait)
+{
+	smr_seq_t rd_seq, c_seq;
+	int i;
+
+	CRITICAL_ASSERT(curthread);
+	counter_u64_add_protected(poll_scan, 1);
+
+	/*
+	 * The read sequence can be no larger than the write sequence at
+	 * the start of the poll.
+	 */
+	rd_seq = s_wr_seq;
+	CPU_FOREACH(i) {
+		/*
+		 * Query the active sequence on this cpu.  If we're not
+		 * waiting and we don't meet the goal we will still scan
+		 * the rest of the cpus to update s_rd_seq before returning
+		 * failure.
+		 */
+		c_seq = smr_poll_cpu(zpcpu_get_cpu(smr, i), s_rd_seq, goal,
+		    wait);
+
+		/*
+		 * Limit the minimum observed rd_seq whether we met the goal
+		 * or not.
+		 */
+		if (c_seq != SMR_SEQ_INVALID)
+			rd_seq = SMR_SEQ_MIN(rd_seq, c_seq);
+	}
+
+	/*
+	 * Advance the rd_seq as long as we observed a more recent value.
+	 */
+	s_rd_seq = atomic_load_int(&s->s_rd_seq);
+	if (SMR_SEQ_GEQ(rd_seq, s_rd_seq)) {
+		atomic_cmpset_int(&s->s_rd_seq, s_rd_seq, rd_seq);
+		s_rd_seq = rd_seq;
+	}
+
+	return (s_rd_seq);
+}
+
+/*
  * Poll to determine whether all readers have observed the 'goal' write
  * sequence number.
  *
@@ -268,9 +501,10 @@ bool
 smr_poll(smr_t smr, smr_seq_t goal, bool wait)
 {
 	smr_shared_t s;
-	smr_t c;
-	smr_seq_t s_wr_seq, s_rd_seq, rd_seq, c_seq;
-	int i;
+	smr_t self;
+	smr_seq_t s_wr_seq, s_rd_seq;
+	smr_delta_t delta;
+	int flags;
 	bool success;
 
 	/*
@@ -278,6 +512,8 @@ smr_poll(smr_t smr, smr_seq_t goal, bool wait)
 	 */
 	KASSERT(!wait || !SMR_ENTERED(smr),
 	    ("smr_poll: Blocking not allowed in a SMR section."));
+	KASSERT(!wait || (zpcpu_get(smr)->c_flags & SMR_LAZY) == 0,
+	    ("smr_poll: Blocking not allowed on lazy smrs."));
 
 	/*
 	 * Use a critical section so that we can avoid ABA races
@@ -285,116 +521,79 @@ smr_poll(smr_t smr, smr_seq_t goal, bool wait)
 	 */
 	success = true;
 	critical_enter();
-	s = zpcpu_get(smr)->c_shared;
+	/* Attempt to load from self only once. */
+	self = zpcpu_get(smr);
+	s = self->c_shared;
+	flags = self->c_flags;
 	counter_u64_add_protected(poll, 1);
 
 	/*
+	 * Conditionally advance the lazy write clock on any writer
+	 * activity.  This may reset s_rd_seq.
+	 */
+	if ((flags & SMR_LAZY) != 0)
+		smr_lazy_advance(smr, s);
+
+	/*
 	 * Acquire barrier loads s_wr_seq after s_rd_seq so that we can not
 	 * observe an updated read sequence that is larger than write.
 	 */
 	s_rd_seq = atomic_load_acq_int(&s->s_rd_seq);
 
 	/*
-	 * wr_seq must be loaded prior to any c_seq value so that a stale
-	 * c_seq can only reference time after this wr_seq.
+	 * If we have already observed the sequence number we can immediately
+	 * return success.  Most polls should meet this criterion.
 	 */
+	if (SMR_SEQ_LEQ(goal, s_rd_seq))
+		goto out;
+
+	/*
+	 * wr_seq must be loaded prior to any c_seq value so that a
+	 * stale c_seq can only reference time after this wr_seq.
+	 */
 	s_wr_seq = atomic_load_acq_int(&s->s_wr_seq);
 
 	/*
-	 * This may have come from a deferred advance.  Consider one
-	 * increment past the current wr_seq valid and make sure we
-	 * have advanced far enough to succeed.  We simply add to avoid
-	 * an additional fence.
+	 * This is the distance from s_wr_seq to goal.  Positive values
+	 * are in the future.
 	 */
-	if (goal == s_wr_seq + SMR_SEQ_INCR) {
-		atomic_add_int(&s->s_wr_seq, SMR_SEQ_INCR);
-		s_wr_seq = goal;
+	delta = SMR_SEQ_DELTA(goal, s_wr_seq);
+
+	/*
+	 * Detect a stale wr_seq.
+	 *
+	 * This goal may have come from a deferred advance or a lazy
+	 * smr.  If we are not blocking we can not succeed but the
+	 * sequence number is valid.
+	 */
+	if (delta > 0 && delta <= SMR_SEQ_MAX_ADVANCE &&
+	    (flags & (SMR_LAZY | SMR_DEFERRED)) != 0) {
+		if (!wait) {
+			success = false;
+			goto out;
+		}
+		/* LAZY is always !wait. */
+		s_wr_seq = smr_shared_advance(s);
+		delta = 0;
 	}
 
 	/*
-	 * Detect whether the goal is valid and has already been observed.
+	 * Detect an invalid goal.
 	 *
 	 * The goal must be in the range of s_wr_seq >= goal >= s_rd_seq for
 	 * it to be valid.  If it is not then the caller held on to it and
 	 * the integer wrapped.  If we wrapped back within range the caller
 	 * will harmlessly scan.
-	 *
-	 * A valid goal must be greater than s_rd_seq or we have not verified
-	 * that it has been observed and must fall through to polling.
 	 */
-	if (SMR_SEQ_GEQ(s_rd_seq, goal) || SMR_SEQ_LT(s_wr_seq, goal))
+	if (delta > 0)
 		goto out;
 
-	/*
-	 * Loop until all cores have observed the goal sequence or have
-	 * gone inactive.  Keep track of the oldest sequence currently
-	 * active as rd_seq.
-	 */
-	counter_u64_add_protected(poll_scan, 1);
-	rd_seq = s_wr_seq;
-	CPU_FOREACH(i) {
-		c = zpcpu_get_cpu(smr, i);
-		c_seq = SMR_SEQ_INVALID;
-		for (;;) {
-			c_seq = atomic_load_int(&c->c_seq);
-			if (c_seq == SMR_SEQ_INVALID)
-				break;
-
-			/*
-			 * There is a race described in smr.h:smr_enter that
-			 * can lead to a stale seq value but not stale data
-			 * access.  If we find a value out of range here we
-			 * pin it to the current min to prevent it from
-			 * advancing until that stale section has expired.
-			 *
-			 * The race is created when a cpu loads the s_wr_seq
-			 * value in a local register and then another thread
-			 * advances s_wr_seq and calls smr_poll() which will 
-			 * oberve no value yet in c_seq and advance s_rd_seq
-			 * up to s_wr_seq which is beyond the register
-			 * cached value.  This is only likely to happen on
-			 * hypervisor or with a system management interrupt.
-			 */
-			if (SMR_SEQ_LT(c_seq, s_rd_seq))
-				c_seq = s_rd_seq;
-
-			/*
-			 * If the sequence number meets the goal we are
-			 * done with this cpu.
-			 */
-			if (SMR_SEQ_GEQ(c_seq, goal))
-				break;
-
-			/*
-			 * If we're not waiting we will still scan the rest
-			 * of the cpus and update s_rd_seq before returning
-			 * an error.
-			 */
-			if (!wait) {
-				success = false;
-				break;
-			}
-			cpu_spinwait();
-		}
-
-		/*
-		 * Limit the minimum observed rd_seq whether we met the goal
-		 * or not.
-		 */
-		if (c_seq != SMR_SEQ_INVALID && SMR_SEQ_GT(rd_seq, c_seq))
-			rd_seq = c_seq;
-	}
-
-	/*
-	 * Advance the rd_seq as long as we observed the most recent one.
-	 */
-	s_rd_seq = atomic_load_int(&s->s_rd_seq);
-	do {
-		if (SMR_SEQ_LEQ(rd_seq, s_rd_seq))
-			goto out;
-	} while (atomic_fcmpset_int(&s->s_rd_seq, &s_rd_seq, rd_seq) == 0);
-
+	/* Determine the lowest visible sequence number. */
+	s_rd_seq = smr_poll_scan(smr, s, s_rd_seq, s_wr_seq, goal, wait);
+	success = SMR_SEQ_LEQ(goal, s_rd_seq);
 out:
+	if (!success)
+		counter_u64_add_protected(poll_fail, 1);
 	critical_exit();
 
 	/*
@@ -407,7 +606,7 @@ out:
 }
 
 smr_t
-smr_create(const char *name)
+smr_create(const char *name, int limit, int flags)
 {
 	smr_t smr, c;
 	smr_shared_t s;
@@ -417,13 +616,19 @@ smr_create(const char *name)
 	smr = uma_zalloc_pcpu(smr_zone, M_WAITOK);
 
 	s->s_name = name;
-	s->s_rd_seq = s->s_wr_seq = SMR_SEQ_INIT;
+	if ((flags & SMR_LAZY) == 0)
+		s->s_rd_seq = s->s_wr_seq = SMR_SEQ_INIT;
+	else
+		s->s_rd_seq = s->s_wr_seq = ticks;
 
 	/* Initialize all CPUS, not just those running. */
 	for (i = 0; i <= mp_maxid; i++) {
 		c = zpcpu_get_cpu(smr, i);
 		c->c_seq = SMR_SEQ_INVALID;
 		c->c_shared = s;
+		c->c_deferred = 0;
+		c->c_limit = limit;
+		c->c_flags = flags;
 	}
 	atomic_thread_fence_seq_cst();
 
@@ -460,5 +665,6 @@ smr_init_counters(void *unused)
 	advance_wait = counter_u64_alloc(M_WAITOK);
 	poll = counter_u64_alloc(M_WAITOK);
 	poll_scan = counter_u64_alloc(M_WAITOK);
+	poll_fail = counter_u64_alloc(M_WAITOK);
 }
 SYSINIT(smr_counters, SI_SUB_CPU, SI_ORDER_ANY, smr_init_counters, NULL);
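
The deferred path above replaces the old smr_advance_deferred(smr, limit)
interface; the batching limit now lives in the per-cpu smr at create time.
A sketch of how a write-heavy consumer might use it follows.  Only
smr_create(), SMR_DEFERRED, smr_advance() and smr_wait() come from this
file and sys/smr.h; the object type and obj_free() are hypothetical.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/smr.h>

struct obj {
	smr_seq_t	 o_goal;	/* goal recorded at retire time */
	/* ... payload ... */
};

static void	obj_free(struct obj *);		/* placeholder */

static smr_t obj_smr;

static void
obj_smr_init(void)
{
	/* Bump the shared wr_seq only once per 100 advances per cpu. */
	obj_smr = smr_create("obj", 100, SMR_DEFERRED);
}

static void
obj_retire(struct obj *o)
{
	/*
	 * The returned goal may be one increment ahead of the shared
	 * wr_seq; smr_poll()/smr_wait() forward the sequence themselves
	 * when asked to block on such a goal.
	 */
	o->o_goal = smr_advance(obj_smr);
}

static void
obj_reclaim(struct obj *o)
{
	/* Blocking is permitted for non-lazy SMRs. */
	smr_wait(obj_smr, o->o_goal);
	obj_free(o);
}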

Modified: head/sys/sys/_smr.h
==============================================================================
--- head/sys/sys/_smr.h	Sat Feb 22 03:14:05 2020	(r358235)
+++ head/sys/sys/_smr.h	Sat Feb 22 03:44:10 2020	(r358236)
@@ -32,6 +32,7 @@
 #define	_SYS__SMR_H_
 
 typedef uint32_t	smr_seq_t;
+typedef int32_t		smr_delta_t;
 typedef struct smr 	*smr_t;
 
 #endif	/* __SYS_SMR_H_ */
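
The new signed smr_delta_t is the type the wrap-safe comparison macros in
sys/smr.h (below) cast to.  A small illustrative check of the arithmetic,
not part of the change, intended for an INVARIANTS kernel:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/smr.h>

static void
smr_seq_wrap_demo(void)
{
	smr_seq_t old_seq = 0xfffffff0u;	/* shortly before the 32-bit wrap */
	smr_seq_t new_seq = 0x00000010u;	/* shortly after the wrap */

	/* Unsigned subtraction wraps; the signed cast recovers the order. */
	KASSERT(SMR_SEQ_DELTA(new_seq, old_seq) == 0x20,
	    ("expected a small positive delta across the wrap"));
	KASSERT(SMR_SEQ_GT(new_seq, old_seq),
	    ("the post-wrap value must compare as newer"));
	KASSERT(SMR_SEQ_MIN(new_seq, old_seq) == old_seq,
	    ("MIN must pick the older value across the wrap"));
}

The comparisons stay meaningful only while the two values are within 2^31
of each other, which is why the lazy variant bounds how long readers may
block to half of the wrap period.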

Modified: head/sys/sys/smr.h
==============================================================================
--- head/sys/sys/smr.h	Sat Feb 22 03:14:05 2020	(r358235)
+++ head/sys/sys/smr.h	Sat Feb 22 03:44:10 2020	(r358236)
@@ -45,11 +45,13 @@
  * Modular arithmetic for comparing sequence numbers that have
  * potentially wrapped.  Copied from tcp_seq.h.
  */
-#define	SMR_SEQ_LT(a, b)	((int32_t)((a)-(b)) < 0)
-#define	SMR_SEQ_LEQ(a, b)	((int32_t)((a)-(b)) <= 0)
-#define	SMR_SEQ_GT(a, b)	((int32_t)((a)-(b)) > 0)
-#define	SMR_SEQ_GEQ(a, b)	((int32_t)((a)-(b)) >= 0)
-#define	SMR_SEQ_DELTA(a, b)	((int32_t)((a)-(b)))
+#define	SMR_SEQ_LT(a, b)	((smr_delta_t)((a)-(b)) < 0)
+#define	SMR_SEQ_LEQ(a, b)	((smr_delta_t)((a)-(b)) <= 0)
+#define	SMR_SEQ_GT(a, b)	((smr_delta_t)((a)-(b)) > 0)
+#define	SMR_SEQ_GEQ(a, b)	((smr_delta_t)((a)-(b)) >= 0)
+#define	SMR_SEQ_DELTA(a, b)	((smr_delta_t)((a)-(b)))
+#define	SMR_SEQ_MIN(a, b)	(SMR_SEQ_LT((a), (b)) ? (a) : (b))
+#define	SMR_SEQ_MAX(a, b)	(SMR_SEQ_GT((a), (b)) ? (a) : (b))
 
 #define	SMR_SEQ_INVALID		0
 
@@ -66,8 +68,13 @@ struct smr {
 	smr_seq_t	c_seq;		/* Current observed sequence. */
 	smr_shared_t	c_shared;	/* Shared SMR state. */
 	int		c_deferred;	/* Deferred advance counter. */
+	int		c_limit;	/* Deferred advance limit. */
+	int		c_flags;	/* SMR Configuration */
 };
 
+#define	SMR_LAZY	0x0001		/* Higher latency write, fast read. */
+#define	SMR_DEFERRED	0x0002		/* Aggregate updates to wr_seq. */
+
 #define	SMR_ENTERED(smr)						\
     (curthread->td_critnest != 0 && zpcpu_get((smr))->c_seq != SMR_SEQ_INVALID)
 
@@ -94,7 +101,7 @@ struct smr {
  * All acceses include a parameter for an assert to verify the required
  * synchronization.  For example, a writer might use:
  *
- * smr_serilized_store(pointer, value, mtx_owned(&writelock));
+ * smr_serialized_store(pointer, value, mtx_owned(&writelock));
  *
  * These are only enabled in INVARIANTS kernels.
  */
@@ -127,6 +134,9 @@ typedef struct {							\
  * Store 'v' to an SMR protected pointer while serialized by an
  * external mechanism.  'ex' should contain an assert that the
  * external mechanism is held.  i.e. mtx_owned()
+ *
+ * Writers that are serialized with mutual exclusion or on a single
+ * thread should use smr_serialized_store() rather than swap.
  */
 #define	smr_serialized_store(p, v, ex) do {				\
 	SMR_ASSERT(ex, "smr_serialized_store");				\
@@ -138,6 +148,8 @@ typedef struct {							\
  * swap 'v' with an SMR protected pointer and return the old value
  * while serialized by an external mechanism.  'ex' should contain
  * an assert that the external mechanism is provided.  i.e. mtx_owned()
+ *
+ * Swap permits multiple writers to update a pointer concurrently.
  */
 #define	smr_serialized_swap(p, v, ex) ({				\
 	SMR_ASSERT(ex, "smr_serialized_swap");				\
@@ -170,7 +182,8 @@ typedef struct {							\
 } while (0)
 
 /*
- * Return the current write sequence number.
+ * Return the current write sequence number.  This is not the same as the
+ * current goal which may be in the future.
  */
 static inline smr_seq_t
 smr_shared_current(smr_shared_t s)
@@ -195,6 +208,8 @@ smr_enter(smr_t smr)
 
 	critical_enter();
 	smr = zpcpu_get(smr);
+	KASSERT((smr->c_flags & SMR_LAZY) == 0,
+	    ("smr_enter(%s) lazy smr.", smr->c_shared->s_name));
 	KASSERT(smr->c_seq == 0,
 	    ("smr_enter(%s) does not support recursion.",
 	    smr->c_shared->s_name));
@@ -228,6 +243,8 @@ smr_exit(smr_t smr)
 
 	smr = zpcpu_get(smr);
 	CRITICAL_ASSERT(curthread);
+	KASSERT((smr->c_flags & SMR_LAZY) == 0,
+	    ("smr_exit(%s) lazy smr.", smr->c_shared->s_name));
 	KASSERT(smr->c_seq != SMR_SEQ_INVALID,
 	    ("smr_exit(%s) not in a smr section.", smr->c_shared->s_name));
 
@@ -243,26 +260,72 @@ smr_exit(smr_t smr)
 }
 
 /*
- * Advances the write sequence number.  Returns the sequence number
- * required to ensure that all modifications are visible to readers.
+ * Enter a lazy smr section.  This is used for read-mostly state that
+ * can tolerate a high free latency.
  */
-smr_seq_t smr_advance(smr_t smr);
+static inline void
+smr_lazy_enter(smr_t smr)
+{
 
+	critical_enter();
+	smr = zpcpu_get(smr);
+	KASSERT((smr->c_flags & SMR_LAZY) != 0,
+	    ("smr_lazy_enter(%s) non-lazy smr.", smr->c_shared->s_name));
+	KASSERT(smr->c_seq == 0,
+	    ("smr_lazy_enter(%s) does not support recursion.",
+	    smr->c_shared->s_name));
+
+	/*
+	 * This needs no serialization.  If an interrupt occurs before we
+	 * assign wr_seq to c_seq, any speculative loads will be discarded.
+	 * If we assign a stale wr_seq value due to interrupt we use the
+	 * same algorithm that renders smr_enter() safe.
+	 */
+	smr->c_seq = smr_shared_current(smr->c_shared);
+}
+
 /*
- * Advances the write sequence number only after N calls.  Returns
- * the correct goal for a wr_seq that has not yet occurred.  Used to
- * minimize shared cacheline invalidations for frequent writers.
+ * Exit a lazy smr section.  This is used for read-mostly state that
+ * can tolerate a high free latency.
  */
-smr_seq_t smr_advance_deferred(smr_t smr, int limit);
+static inline void
+smr_lazy_exit(smr_t smr)
+{
 
+	smr = zpcpu_get(smr);
+	CRITICAL_ASSERT(curthread);
+	KASSERT((smr->c_flags & SMR_LAZY) != 0,
+	    ("smr_lazy_exit(%s) non-lazy smr.", smr->c_shared->s_name));
+	KASSERT(smr->c_seq != SMR_SEQ_INVALID,
+	    ("smr_lazy_exit(%s) not in a smr section.", smr->c_shared->s_name));
+
+	/*
+	 * All loads/stores must be retired before the sequence becomes
+	 * visible.  The fence compiles away on amd64.  Another
+	 * alternative would be to omit the fence but store the exit
+	 * time and wait 1 tick longer.
+	 */
+	atomic_thread_fence_rel();
+	smr->c_seq = SMR_SEQ_INVALID;
+	critical_exit();
+}
+
 /*
+ * Advances the write sequence number.  Returns the sequence number
+ * required to ensure that all modifications are visible to readers.
+ */
+smr_seq_t smr_advance(smr_t smr);
+
+/*
  * Returns true if a goal sequence has been reached.  If
  * wait is true this will busy loop until success.
  */
 bool smr_poll(smr_t smr, smr_seq_t goal, bool wait);
 
 /* Create a new SMR context. */
-smr_t smr_create(const char *name);
+smr_t smr_create(const char *name, int limit, int flags);
+
+/* Destroy the context. */
 void smr_destroy(smr_t smr);
 
 /*

Modified: head/sys/vm/uma_core.c
==============================================================================
--- head/sys/vm/uma_core.c	Sat Feb 22 03:14:05 2020	(r358235)
+++ head/sys/vm/uma_core.c	Sat Feb 22 03:44:10 2020	(r358236)
@@ -1140,7 +1140,6 @@ hash_free(struct uma_hash *hash)
  * Returns:
  *	Nothing
  */
-
 static void
 bucket_drain(uma_zone_t zone, uma_bucket_t bucket)
 {
@@ -1200,7 +1199,7 @@ cache_drain(uma_zone_t zone)
 	 */
 	seq = SMR_SEQ_INVALID;
 	if ((zone->uz_flags & UMA_ZONE_SMR) != 0)
-		seq = smr_current(zone->uz_smr);
+		seq = smr_advance(zone->uz_smr);
 	CPU_FOREACH(cpu) {
 		cache = &zone->uz_cpu[cpu];
 		bucket = cache_bucket_unload_alloc(cache);
@@ -1329,7 +1328,7 @@ bucket_cache_reclaim(uma_zone_t zone, bool drain)
 		 * the item count.  Reclaim it individually here.
 		 */
 		zdom = ZDOM_GET(zone, i);
-		if ((zone->uz_flags & UMA_ZONE_SMR) == 0) {
+		if ((zone->uz_flags & UMA_ZONE_SMR) == 0 || drain) {
 			ZONE_CROSS_LOCK(zone);
 			bucket = zdom->uzd_cross;
 			zdom->uzd_cross = NULL;
@@ -2679,7 +2678,7 @@ out:
 
 	/* Caller requests a private SMR context. */
 	if ((zone->uz_flags & UMA_ZONE_SMR) != 0)
-		zone->uz_smr = smr_create(zone->uz_name);
+		zone->uz_smr = smr_create(zone->uz_name, 0, 0);
 
 	KASSERT((arg->flags & (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET)) !=
 	    (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET),
@@ -4137,22 +4136,21 @@ zone_free_cross(uma_zone_t zone, uma_bucket_t bucket, 
 	    "uma_zfree: zone %s(%p) draining cross bucket %p",
 	    zone->uz_name, zone, bucket);
 
-	STAILQ_INIT(&fullbuckets);
+	/*
+	 * It is possible for buckets to arrive here out of order so we fetch
+	 * the current smr seq rather than accepting the bucket's.
+	 */
+	seq = SMR_SEQ_INVALID;
+	if ((zone->uz_flags & UMA_ZONE_SMR) != 0)
+		seq = smr_advance(zone->uz_smr);
 
 	/*
 	 * To avoid having ndomain * ndomain buckets for sorting we have a
 	 * lock on the current crossfree bucket.  A full matrix with
 	 * per-domain locking could be used if necessary.
 	 */
+	STAILQ_INIT(&fullbuckets);
 	ZONE_CROSS_LOCK(zone);
-
-	/*
-	 * It is possible for buckets to arrive here out of order so we fetch
-	 * the current smr seq rather than accepting the bucket's.
-	 */
-	seq = SMR_SEQ_INVALID;
-	if ((zone->uz_flags & UMA_ZONE_SMR) != 0)
-		seq = smr_current(zone->uz_smr);
 	while (bucket->ub_cnt > 0) {
 		item = bucket->ub_bucket[bucket->ub_cnt - 1];
 		domain = _vm_phys_domain(pmap_kextract((vm_offset_t)item));
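
The uma changes above stamp each batch of cross-domain frees with a freshly
advanced sequence instead of trusting a per-bucket value that may arrive
out of order.  Reduced to its essentials the pattern looks roughly like the
following; the batch structure and recycle_items() are placeholders, not
UMA internals.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/smr.h>

struct batch {
	smr_seq_t	 b_goal;	/* goal covering every item in the batch */
	/* ... items ... */
};

static void	recycle_items(struct batch *);	/* placeholder */

static void
batch_retire(smr_t smr, struct batch *b)
{
	/*
	 * One advance covers the whole batch; items queued earlier with
	 * older goals are covered by the newer goal as well.
	 */
	b->b_goal = smr_advance(smr);
}

static void
batch_reuse(smr_t smr, struct batch *b)
{
	/* A blocking wait is fine here because uma uses a default SMR. */
	smr_wait(smr, b->b_goal);
	recycle_items(b);
}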


