Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 21 Jul 2003 17:32:28 -0700 (PDT)
From:      Peter Wemm <peter@FreeBSD.org>
To:        Perforce Change Reviews <perforce@freebsd.org>
Subject:   PERFORCE change 34799 for review
Message-ID:  <200307220032.h6M0WSSJ023138@repoman.freebsd.org>

next in thread | raw e-mail | index | archive | help
http://perforce.freebsd.org/chv.cgi?CH=34799

Change 34799 by peter@peter_hammer on 2003/07/21 17:31:45

	initial shot at fleshing out SSE support in the fp*() API

Affected files ...

.. //depot/projects/hammer/sys/amd64/include/ieeefp.h#2 edit

Differences ...

==== //depot/projects/hammer/sys/amd64/include/ieeefp.h#2 (text+ko) ====

@@ -90,6 +90,13 @@
 #define FP_RND_FLD	0xc00	/* round control field */
 #define FP_STKY_FLD	0x3f	/* sticky flags field */
 
+#define	SSE_STKY_FLD	0x3f	/* exception flags */
+#define	SSE_DAZ_FLD	0x40	/* Denormals are zero */
+#define	SSE_MSKS_FLD	0x1f80	/* exception masks field */
+#define	SSE_RND_FLD	0x6000	/* rounding control */
+#define	SSE_FZ_FLD	0x8000	/* flush to zero on underflow */
+
+
 /*
  * FP register bit field offsets
  */
@@ -98,84 +105,164 @@
 #define FP_RND_OFF	10	/* round control offset */
 #define FP_STKY_OFF	0	/* sticky flags offset */
 
+#define	SSE_STKY_OFF	0	/* exception flags offset */
+#define	SSE_DAZ_OFF	6	/* denormals are zero offset */
+#define	SSE_MSKS_OFF	7	/* exception masks offset */
+#define	SSE_RND_OFF	13	/* rounding control offset */
+#define	SSE_FZ_OFF	15	/* flush to zero offset */
+
 #ifdef __GNUC__
 
 #define	__fldenv(addr)	__asm __volatile("fldenv %0" : : "m" (*(addr)))
 #define	__fnstenv(addr)	__asm __volatile("fnstenv %0" : "=m" (*(addr)))
+#define	__fldcw(addr)	__asm __volatile("fldcw %0" : : "m" (*(addr)))
 #define	__fnstcw(addr)	__asm __volatile("fnstcw %0" : "=m" (*(addr)))
 #define	__fnstsw(addr)	__asm __volatile("fnstsw %0" : "=m" (*(addr)))
+#define	__ldmxcsr(addr)	__asm __volatile("ldmxcsr %0" : : "m" (*(addr)))
+#define	__stmxcsr(addr)	__asm __volatile("stmxcsr %0" : "=m" (*(addr)))
 
 /*
- * return the contents of a FP register
+ * General notes about conflicting SSE vs FP status bits.
+ * This code assumes that software will not fiddle with the control
+ * bits of the SSE and x87 in such a way to get them out of sync and
+ * still expect this to work.  Break this at your peril.
+ * Because I based this on the i386 port, the x87 state is used for
+ * the fpget*() functions, and is shadowed into the SSE state for
+ * the fpset*() functions.  For dual source fpget*() functions, I
+ * merge the two together.  I think.
+ */
+
+/* Rounding control: fpgetround() reads it, fpsetround() sets it */
+static __inline__ fp_rnd_t
+fpgetround(void)
+{
+	unsigned short _cw;
+
+	__fnstcw(&_cw);		/* only the x87 control word is read */
+	return ((_cw & FP_RND_FLD) >> FP_RND_OFF);
+}
+
+static __inline__ fp_rnd_t
+fpsetround(fp_rnd_t _m)
+{
+	unsigned short _cw;
+	unsigned int _mxcsr;
+	fp_rnd_t _p;
+
+	__fnstcw(&_cw);
+	_p = (_cw & FP_RND_FLD) >> FP_RND_OFF;	/* previous x87 mode */
+	_cw &= ~FP_RND_FLD;
+	_cw |= (_m << FP_RND_OFF) & FP_RND_FLD;
+	__fldcw(&_cw);
+	__stmxcsr(&_mxcsr);	/* shadow the same mode into MXCSR */
+	_mxcsr &= ~SSE_RND_FLD;
+	_mxcsr |= (_m << SSE_RND_OFF) & SSE_RND_FLD;
+	__ldmxcsr(&_mxcsr);
+	return (_p);		/* old mode as seen by the x87 */
+}
+
+/*
+ * Get/set precision for fadd/fsub/fsqrt etc x87 instructions.
+ * There is no equivalent SSE mode or control.  SSE precision is
+ * chosen per instruction (single or double), not by a mode field.
  */
-static __inline__ int
-__fpgetreg(int _reg)
+static __inline__ fp_prec_t
+fpgetprec(void)
+{
+	unsigned short _cw;
+
+	__fnstcw(&_cw);		/* precision is x87-only state */
+	return ((_cw & FP_PRC_FLD) >> FP_PRC_OFF);
+}
+
+static __inline__ fp_prec_t
+fpsetprec(fp_prec_t _m)
 {
-	unsigned short _mem;
+	unsigned short _cw;
+	fp_prec_t _p;
 
-	/*-
-	 * This is more efficient than it looks.  The switch gets optimized
-	 * away if _reg is constant.
-	 *
-	 * The default case only supports _reg == 0.  We could handle more
-	 * registers (e.g., tags) using fnstenv, but the interface doesn't
-	 * support more.
-	 */
-	switch(_reg) {
-	default:
-		__fnstcw(&_mem);
-		break;
-	case FP_STKY_REG:
-		__fnstsw(&_mem);
-		break;
-	}
-	return _mem;
+	__fnstcw(&_cw);
+	_p = (_cw & FP_PRC_FLD) >> FP_PRC_OFF;	/* previous precision */
+	_cw &= ~FP_PRC_FLD;
+	_cw |= (_m << FP_PRC_OFF) & FP_PRC_FLD;
+	__fldcw(&_cw);		/* x87 only; nothing to shadow in SSE */
+	return (_p);
 }
 
 /*
- * set a FP mode; return previous mode
+ * Look at the exception masks
+ * Note that x87 masks are inverse of the fp*() functions
+ * API.  ie: mask = 1 means disable for x87 and SSE, but
+ * for the fp*() api, mask = 1 means enabled.
  */
-static __inline__ int
-__fpsetreg(int _m, int _reg, int _fld, int _off)
+static __inline__ fp_except_t
+fpgetmask(void)
+{
+	unsigned short _cw;
+
+	__fnstcw(&_cw);
+	return ((~_cw) & FP_MSKS_FLD);	/* invert: hw 1 = masked, API 1 = on */
+}
+
+static __inline__ fp_except_t
+fpsetmask(fp_except_t _m)
+{
+	unsigned short _cw;
+	unsigned int _mxcsr;
+	fp_except_t _p;
+
+	__fnstcw(&_cw);
+	_p = (~_cw) & FP_MSKS_FLD;	/* previous mask, in API sense */
+	_cw &= ~FP_MSKS_FLD;
+	_cw |= (~_m) & FP_MSKS_FLD;	/* hw sense: 1 = exception masked */
+	__fldcw(&_cw);
+	__stmxcsr(&_mxcsr);	/* shadow the same masks into MXCSR */
+	/* XXX should clear non-ieee SSE_DAZ_FLD and SSE_FZ_FLD */
+	_mxcsr &= ~SSE_MSKS_FLD;
+	_mxcsr |= ((~_m) << SSE_MSKS_OFF) & SSE_MSKS_FLD;
+	__ldmxcsr(&_mxcsr);
+	return (_p);
+}
+
+/* Sticky exceptions: fpgetsticky() reports, fpresetsticky() clears */
+static __inline__ fp_except_t
+fpgetsticky(void)
+{
+	unsigned short _sw;
+	unsigned int _mxcsr;
+	fp_except_t _ex;
+
+	__fnstsw(&_sw);
+	_ex = _sw & FP_STKY_FLD;	/* x87 status word flags */
+	__stmxcsr(&_mxcsr);
+	_ex |= _mxcsr & SSE_STKY_FLD;	/* merged with the SSE flags */
+	return (_ex);
+}
+
+static __inline__ fp_except_t
+fpresetsticky(fp_except_t _m)
 {
 	unsigned _env[7];
-	unsigned _p;
+	unsigned int _mxcsr;
+	fp_except_t _p;
 
-	/*
-	 * _reg == 0 could be handled better using fnstcw/fldcw.
-	 */
 	__fnstenv(_env);
-	_p =  (_env[_reg] & _fld) >> _off;
-	_env[_reg] = (_env[_reg] & ~_fld) | (_m << _off & _fld);
+	_p = _env[FP_STKY_REG] & _m;
+	__stmxcsr(&_mxcsr);
+	_p |= _mxcsr & _m & SSE_STKY_FLD;	/* only the flags asked for */
+	_env[FP_STKY_REG] &= ~_m;
 	__fldenv(_env);
-	return _p;
+	_mxcsr &= ~(_m & SSE_STKY_FLD);	/* never touch MXCSR control bits */
+	__ldmxcsr(&_mxcsr);
+	return (_p);
 }
 
-#endif /* __GNUC__ */
-
-/*
- * SysV/386 FP control interface
- */
-#define	fpgetround()	((fp_rnd_t)					\
-	((__fpgetreg(FP_RND_REG) & FP_RND_FLD) >> FP_RND_OFF))
-#define	fpsetround(m)	((fp_rnd_t)					\
-	__fpsetreg((m), FP_RND_REG, FP_RND_FLD, FP_RND_OFF))
-#define	fpgetprec()	((fp_prec_t)					\
-	((__fpgetreg(FP_PRC_REG) & FP_PRC_FLD) >> FP_PRC_OFF))
-#define	fpsetprec(m)	((fp_prec_t)					\
-	__fpsetreg((m), FP_PRC_REG, FP_PRC_FLD, FP_PRC_OFF))
-#define	fpgetmask()	((fp_except_t)					\
-	((~__fpgetreg(FP_MSKS_REG) & FP_MSKS_FLD) >> FP_MSKS_OFF))
-#define	fpsetmask(m)	((fp_except_t)					\
-	(~__fpsetreg(~(m), FP_MSKS_REG, FP_MSKS_FLD, FP_MSKS_OFF)) &	\
-	    (FP_MSKS_FLD >> FP_MSKS_OFF))
-#define	fpgetsticky()	((fp_except_t)					\
-	((__fpgetreg(FP_STKY_REG) & FP_STKY_FLD) >> FP_STKY_OFF))
-#define	fpresetsticky(m) ((fp_except_t)					\
-	__fpsetreg(0, FP_STKY_REG, (m), FP_STKY_OFF))
+/* It is called fpsetsticky(), but is really a reset function */
 #define	fpsetsticky(m)	fpresetsticky(m)
 
 /* Suppress prototypes in the MI header. */
 #define	_IEEEFP_INLINED_	1
 
+#endif /* __GNUC__ */
+
 #endif /* !_MACHINE_IEEEFP_H_ */



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200307220032.h6M0WSSJ023138>