Date:      Sun, 28 Jun 2015 05:04:09 +0000 (UTC)
From:      Konstantin Belousov <kib@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r284901 - in head/sys: amd64/amd64 amd64/include i386/i386 i386/include
Message-ID:  <201506280504.t5S549Cv042100@svn.freebsd.org>

Author: kib
Date: Sun Jun 28 05:04:08 2015
New Revision: 284901
URL: https://svnweb.freebsd.org/changeset/base/284901

Log:
  Remove the unneeded data dependency, currently imposed by
  atomic_load_acq(9), on its source, for x86.
  
  Right now, atomic_load_acq() on x86 is sequentially consistent with
  the other atomics; the code ensures this by issuing a store/load
  barrier, implemented as a locked nop on the source.  Provide a
  separate primitive, __storeload_barrier(), implemented as a locked
  nop on a cpu-private variable, and place it before the load.  This
  keeps the seq_cst semantics while avoiding a false dependency on the
  source remaining unmodified for its later use.
  
  Note that the seq_cst property of the x86 atomic_load_acq() is
  neither documented nor provided by the atomics implementations on
  other architectures, although some kernel code relies on the
  behaviour.  This commit does not intend to change that.
  
  Reviewed by:	alc
  Discussed with:	bde
  Tested by:	pho
  Sponsored by:	The FreeBSD Foundation
  MFC after:	2 weeks
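
A minimal userspace sketch of the idea, assuming x86-64 and the
red-zone variant added in the diff below (the names
storeload_barrier() and load_acq_u32() are illustrative, not the
kernel API): the locked no-op targets a scratch word unrelated to the
loaded address, so the load carries no artificial dependency on its
source.

#include <stdint.h>

static inline void
storeload_barrier(void)
{

        /* Locked no-op add on a red-zone word, not on the source. */
        __asm__ __volatile__("lock; addl $0,-8(%%rsp)" : : : "memory", "cc");
}

static inline uint32_t
load_acq_u32(volatile uint32_t *p)
{
        uint32_t v;

        storeload_barrier();    /* drain the store buffer first */
        v = *p;                 /* a plain load then suffices on x86 */
        __asm__ __volatile__("" : : : "memory");        /* compiler barrier */
        return (v);
}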

Modified:
  head/sys/amd64/amd64/atomic.c
  head/sys/amd64/amd64/vm_machdep.c
  head/sys/amd64/include/atomic.h
  head/sys/i386/i386/vm_machdep.c
  head/sys/i386/include/atomic.h

Modified: head/sys/amd64/amd64/atomic.c
==============================================================================
--- head/sys/amd64/amd64/atomic.c	Sun Jun 28 03:22:26 2015	(r284900)
+++ head/sys/amd64/amd64/atomic.c	Sun Jun 28 05:04:08 2015	(r284901)
@@ -41,6 +41,7 @@ __FBSDID("$FreeBSD$");
 #undef ATOMIC_ASM
 
 /* Make atomic.h generate public functions */
+static __inline void __storeload_barrier(void);
 #define WANT_FUNCTIONS
 #define static
 #undef __inline

Modified: head/sys/amd64/amd64/vm_machdep.c
==============================================================================
--- head/sys/amd64/amd64/vm_machdep.c	Sun Jun 28 03:22:26 2015	(r284900)
+++ head/sys/amd64/amd64/vm_machdep.c	Sun Jun 28 05:04:08 2015	(r284901)
@@ -93,6 +93,8 @@ _Static_assert(OFFSETOF_CURTHREAD == off
     "OFFSETOF_CURTHREAD does not correspond with offset of pc_curthread.");
 _Static_assert(OFFSETOF_CURPCB == offsetof(struct pcpu, pc_curpcb),
     "OFFSETOF_CURPCB does not correspond with offset of pc_curpcb.");
+_Static_assert(OFFSETOF_MONITORBUF == offsetof(struct pcpu, pc_monitorbuf),
+    "OFFSETOF_MONINORBUF does not correspond with offset of pc_monitorbuf.");
 
 struct savefpu *
 get_pcb_user_save_td(struct thread *td)

Modified: head/sys/amd64/include/atomic.h
==============================================================================
--- head/sys/amd64/include/atomic.h	Sun Jun 28 03:22:26 2015	(r284900)
+++ head/sys/amd64/include/atomic.h	Sun Jun 28 05:04:08 2015	(r284901)
@@ -85,7 +85,7 @@ u_long	atomic_fetchadd_long(volatile u_l
 int	atomic_testandset_int(volatile u_int *p, u_int v);
 int	atomic_testandset_long(volatile u_long *p, u_int v);
 
-#define	ATOMIC_LOAD(TYPE, LOP)					\
+#define	ATOMIC_LOAD(TYPE)					\
 u_##TYPE	atomic_load_acq_##TYPE(volatile u_##TYPE *p)
 #define	ATOMIC_STORE(TYPE)					\
 void		atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)
@@ -245,53 +245,79 @@ atomic_testandset_long(volatile u_long *
  * We assume that a = b will do atomic loads and stores.  Due to the
  * IA32 memory model, a simple store guarantees release semantics.
  *
- * However, loads may pass stores, so for atomic_load_acq we have to
- * ensure a Store/Load barrier to do the load in SMP kernels.  We use
- * "lock cmpxchg" as recommended by the AMD Software Optimization
- * Guide, and not mfence.  For UP kernels, however, the cache of the
- * single processor is always consistent, so we only need to take care
- * of the compiler.
+ * However, a load may pass a store if they are performed on distinct
+ * addresses, so for atomic_load_acq we introduce a Store/Load barrier
+ * before the load in SMP kernels.  We use "lock addl $0,mem", as
+ * recommended by the AMD Software Optimization Guide, and not mfence.
+ * In the kernel, we use a private per-cpu cache line as the target
+ * for the locked addition, to avoid introducing false data
+ * dependencies.  In userspace, a word in the red zone on the stack
+ * (-8(%rsp)) is utilized.
+ *
+ * For UP kernels, however, the memory of the single processor is
+ * always consistent, so we only need to stop the compiler from
+ * reordering accesses in a way that violates the semantics of acquire
+ * and release.
  */
-#define	ATOMIC_STORE(TYPE)				\
-static __inline void					\
-atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\
-{							\
-	__compiler_membar();				\
-	*p = v;						\
-}							\
-struct __hack
 
-#if defined(_KERNEL) && !defined(SMP)
+#if defined(_KERNEL)
 
-#define	ATOMIC_LOAD(TYPE, LOP)				\
-static __inline u_##TYPE				\
-atomic_load_acq_##TYPE(volatile u_##TYPE *p)		\
-{							\
-	u_##TYPE tmp;					\
-							\
-	tmp = *p;					\
-	__compiler_membar();				\
-	return (tmp);					\
-}							\
-struct __hack
+/*
+ * OFFSETOF_MONITORBUF == __pcpu_offset(pc_monitorbuf).
+ *
+ * The open-coded number is used instead of the symbolic expression to
+ * avoid a dependency on sys/pcpu.h in machine/atomic.h consumers.
+ * An assertion in amd64/vm_machdep.c ensures that the value is correct.
+ */
+#define	OFFSETOF_MONITORBUF	0x180
 
-#else /* !(_KERNEL && !SMP) */
+#if defined(SMP)
+static __inline void
+__storeload_barrier(void)
+{
 
-#define	ATOMIC_LOAD(TYPE, LOP)				\
-static __inline u_##TYPE				\
-atomic_load_acq_##TYPE(volatile u_##TYPE *p)		\
-{							\
-	u_##TYPE res;					\
-							\
-	__asm __volatile(MPLOCKED LOP			\
-	: "=a" (res),			/* 0 */		\
-	  "+m" (*p)			/* 1 */		\
-	: : "memory", "cc");				\
-	return (res);					\
-}							\
+	__asm __volatile("lock; addl $0,%%gs:%0"
+	    : "+m" (*(u_int *)OFFSETOF_MONITORBUF) : : "memory", "cc");
+}
+#else /* _KERNEL && UP */
+static __inline void
+__storeload_barrier(void)
+{
+
+	__compiler_membar();
+}
+#endif /* SMP */
+#else /* !_KERNEL */
+static __inline void
+__storeload_barrier(void)
+{
+
+	__asm __volatile("lock; addl $0,-8(%%rsp)" : : : "memory", "cc");
+}
+#endif /* _KERNEL */
+
+#define	ATOMIC_LOAD(TYPE)					\
+static __inline u_##TYPE					\
+atomic_load_acq_##TYPE(volatile u_##TYPE *p)			\
+{								\
+	u_##TYPE res;						\
+								\
+	__storeload_barrier();					\
+	res = *p;						\
+	__compiler_membar();					\
+	return (res);						\
+}								\
 struct __hack
 
-#endif /* _KERNEL && !SMP */
+#define	ATOMIC_STORE(TYPE)					\
+static __inline void						\
+atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)	\
+{								\
+								\
+	__compiler_membar();					\
+	*p = v;							\
+}								\
+struct __hack
 
 #endif /* KLD_MODULE || !__GNUCLIKE_ASM */
 
@@ -315,20 +341,19 @@ ATOMIC_ASM(clear,    long,  "andq %1,%0"
 ATOMIC_ASM(add,	     long,  "addq %1,%0",  "ir",  v);
 ATOMIC_ASM(subtract, long,  "subq %1,%0",  "ir",  v);
 
-ATOMIC_LOAD(char,  "cmpxchgb %b0,%1");
-ATOMIC_LOAD(short, "cmpxchgw %w0,%1");
-ATOMIC_LOAD(int,   "cmpxchgl %0,%1");
-ATOMIC_LOAD(long,  "cmpxchgq %0,%1");
-
-ATOMIC_STORE(char);
-ATOMIC_STORE(short);
-ATOMIC_STORE(int);
-ATOMIC_STORE(long);
+#define	ATOMIC_LOADSTORE(TYPE)					\
+	ATOMIC_LOAD(TYPE);					\
+	ATOMIC_STORE(TYPE)
+
+ATOMIC_LOADSTORE(char);
+ATOMIC_LOADSTORE(short);
+ATOMIC_LOADSTORE(int);
+ATOMIC_LOADSTORE(long);
 
 #undef ATOMIC_ASM
 #undef ATOMIC_LOAD
 #undef ATOMIC_STORE
-
+#undef ATOMIC_LOADSTORE
 #ifndef WANT_FUNCTIONS
 
 /* Read the current value and store a new value in the destination. */
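
The comment block in the hunk above describes the underlying hazard: on
x86 a store may still sit in the local store buffer when a later load
from a different address executes, so two CPUs can each miss the
other's store.  A runnable userspace sketch of that store-buffering
case (assuming x86-64, GCC or Clang, and POSIX threads; the harness and
its names are illustrative, not kernel code):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static volatile unsigned int x, y;
static unsigned int r1, r2;
static atomic_int go;

static inline void
storeload_barrier(void)
{

        /* Same idea as __storeload_barrier(): locked no-op on private memory. */
        __asm__ __volatile__("lock; addl $0,-8(%%rsp)" : : : "memory", "cc");
}

static void *
thread_a(void *arg)
{

        (void)arg;
        while (atomic_load(&go) == 0)
                ;                       /* wait for the start signal */
        x = 1;                          /* plain store: release is free on x86 */
        storeload_barrier();            /* remove this to allow r1 == r2 == 0 */
        r1 = y;
        return (NULL);
}

static void *
thread_b(void *arg)
{

        (void)arg;
        while (atomic_load(&go) == 0)
                ;
        y = 1;
        storeload_barrier();
        r2 = x;
        return (NULL);
}

int
main(void)
{
        pthread_t ta, tb;
        int i, both_zero;

        both_zero = 0;
        for (i = 0; i < 100000; i++) {
                x = y = 0;
                atomic_store(&go, 0);
                pthread_create(&ta, NULL, thread_a, NULL);
                pthread_create(&tb, NULL, thread_b, NULL);
                atomic_store(&go, 1);
                pthread_join(ta, NULL);
                pthread_join(tb, NULL);
                if (r1 == 0 && r2 == 0)
                        both_zero++;
        }
        printf("r1 == r2 == 0 observed %d times in %d runs\n", both_zero, i);
        return (0);
}

With the barrier in place the r1 == r2 == 0 outcome is forbidden, which
is the sequentially consistent behaviour the log message says some
kernel code relies on; without it, that outcome can show up on real
hardware after a modest number of iterations.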

Modified: head/sys/i386/i386/vm_machdep.c
==============================================================================
--- head/sys/i386/i386/vm_machdep.c	Sun Jun 28 03:22:26 2015	(r284900)
+++ head/sys/i386/i386/vm_machdep.c	Sun Jun 28 05:04:08 2015	(r284901)
@@ -111,6 +111,8 @@ _Static_assert(OFFSETOF_CURTHREAD == off
     "OFFSETOF_CURTHREAD does not correspond with offset of pc_curthread.");
 _Static_assert(OFFSETOF_CURPCB == offsetof(struct pcpu, pc_curpcb),
     "OFFSETOF_CURPCB does not correspond with offset of pc_curpcb.");
+_Static_assert(OFFSETOF_MONITORBUF == offsetof(struct pcpu, pc_monitorbuf),
+    "OFFSETOF_MONINORBUF does not correspond with offset of pc_monitorbuf.");
 
 static void	cpu_reset_real(void);
 #ifdef SMP

Modified: head/sys/i386/include/atomic.h
==============================================================================
--- head/sys/i386/include/atomic.h	Sun Jun 28 03:22:26 2015	(r284900)
+++ head/sys/i386/include/atomic.h	Sun Jun 28 05:04:08 2015	(r284901)
@@ -87,7 +87,7 @@ int	atomic_cmpset_int(volatile u_int *ds
 u_int	atomic_fetchadd_int(volatile u_int *p, u_int v);
 int	atomic_testandset_int(volatile u_int *p, u_int v);
 
-#define	ATOMIC_LOAD(TYPE, LOP)					\
+#define	ATOMIC_LOAD(TYPE)					\
 u_##TYPE	atomic_load_acq_##TYPE(volatile u_##TYPE *p)
 #define	ATOMIC_STORE(TYPE)					\
 void		atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)
@@ -228,53 +228,78 @@ atomic_testandset_int(volatile u_int *p,
  * We assume that a = b will do atomic loads and stores.  Due to the
  * IA32 memory model, a simple store guarantees release semantics.
  *
- * However, loads may pass stores, so for atomic_load_acq we have to
- * ensure a Store/Load barrier to do the load in SMP kernels.  We use
- * "lock cmpxchg" as recommended by the AMD Software Optimization
- * Guide, and not mfence.  For UP kernels, however, the cache of the
- * single processor is always consistent, so we only need to take care
- * of the compiler.
+ * However, a load may pass a store if they are performed on distinct
+ * addresses, so for atomic_load_acq we introduce a Store/Load barrier
+ * before the load in SMP kernels.  We use "lock addl $0,mem", as
+ * recommended by the AMD Software Optimization Guide, and not mfence.
+ * In the kernel, we use a private per-cpu cache line as the target
+ * for the locked addition, to avoid introducing false data
+ * dependencies.  In userspace, a word at the top of the stack is
+ * utilized.
+ *
+ * For UP kernels, however, the memory of the single processor is
+ * always consistent, so we only need to stop the compiler from
+ * reordering accesses in a way that violates the semantics of acquire
+ * and release.
  */
-#define	ATOMIC_STORE(TYPE)				\
-static __inline void					\
-atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\
-{							\
-	__compiler_membar();				\
-	*p = v;						\
-}							\
-struct __hack
+#if defined(_KERNEL)
 
-#if defined(_KERNEL) && !defined(SMP)
+/*
+ * OFFSETOF_MONITORBUF == __pcpu_offset(pc_monitorbuf).
+ *
+ * The open-coded number is used instead of the symbolic expression to
+ * avoid a dependency on sys/pcpu.h in machine/atomic.h consumers.
+ * An assertion in i386/vm_machdep.c ensures that the value is correct.
+ */
+#define	OFFSETOF_MONITORBUF	0x180
 
-#define	ATOMIC_LOAD(TYPE, LOP)				\
-static __inline u_##TYPE				\
-atomic_load_acq_##TYPE(volatile u_##TYPE *p)		\
-{							\
-	u_##TYPE tmp;					\
-							\
-	tmp = *p;					\
-	__compiler_membar();				\
-	return (tmp);					\
-}							\
-struct __hack
+#if defined(SMP)
+static __inline void
+__storeload_barrier(void)
+{
 
-#else /* !(_KERNEL && !SMP) */
+	__asm __volatile("lock; addl $0,%%fs:%0"
+	    : "+m" (*(u_int *)OFFSETOF_MONITORBUF) : : "memory", "cc");
+}
+#else /* _KERNEL && UP */
+static __inline void
+__storeload_barrier(void)
+{
 
-#define	ATOMIC_LOAD(TYPE, LOP)				\
-static __inline u_##TYPE				\
-atomic_load_acq_##TYPE(volatile u_##TYPE *p)		\
-{							\
-	u_##TYPE res;					\
-							\
-	__asm __volatile(MPLOCKED LOP			\
-	: "=a" (res),			/* 0 */		\
-	  "+m" (*p)			/* 1 */		\
-	: : "memory", "cc");				\
-	return (res);					\
-}							\
+	__compiler_membar();
+}
+#endif /* SMP */
+#else /* !_KERNEL */
+static __inline void
+__storeload_barrier(void)
+{
+
+	__asm __volatile("lock; addl $0,(%%esp)" : : : "memory", "cc");
+}
+#endif /* _KERNEL */
+
+#define	ATOMIC_LOAD(TYPE)					\
+static __inline u_##TYPE					\
+atomic_load_acq_##TYPE(volatile u_##TYPE *p)			\
+{								\
+	u_##TYPE res;						\
+								\
+	__storeload_barrier();					\
+	res = *p;						\
+	__compiler_membar();					\
+	return (res);						\
+}								\
 struct __hack
 
-#endif /* _KERNEL && !SMP */
+#define	ATOMIC_STORE(TYPE)					\
+static __inline void						\
+atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)	\
+{								\
+								\
+	__compiler_membar();					\
+	*p = v;							\
+}								\
+struct __hack
 
 #ifdef _KERNEL
 
@@ -511,19 +536,19 @@ ATOMIC_ASM(clear,    long,  "andl %1,%0"
 ATOMIC_ASM(add,	     long,  "addl %1,%0",  "ir",  v);
 ATOMIC_ASM(subtract, long,  "subl %1,%0",  "ir",  v);
 
-ATOMIC_LOAD(char,  "cmpxchgb %b0,%1");
-ATOMIC_LOAD(short, "cmpxchgw %w0,%1");
-ATOMIC_LOAD(int,   "cmpxchgl %0,%1");
-ATOMIC_LOAD(long,  "cmpxchgl %0,%1");
-
-ATOMIC_STORE(char);
-ATOMIC_STORE(short);
-ATOMIC_STORE(int);
-ATOMIC_STORE(long);
+#define	ATOMIC_LOADSTORE(TYPE)				\
+	ATOMIC_LOAD(TYPE);				\
+	ATOMIC_STORE(TYPE)
+
+ATOMIC_LOADSTORE(char);
+ATOMIC_LOADSTORE(short);
+ATOMIC_LOADSTORE(int);
+ATOMIC_LOADSTORE(long);
 
 #undef ATOMIC_ASM
 #undef ATOMIC_LOAD
 #undef ATOMIC_STORE
+#undef ATOMIC_LOADSTORE
 
 #ifndef WANT_FUNCTIONS
 


