Date:      Fri, 11 Mar 2011 14:40:36 -0800
From:      Tim Bird <tim.bird@am.sony.com>
To:        <freebsd-amd64@freebsd.org>
Subject:   [PATCH] Add support for AVX
Message-ID:  <4D7AA4E4.3010508@am.sony.com>

Hello all,

I am representing a group in Sony that wishes to submit the patch
below, which adds AVX support to FreeBSD.  I know this is somewhat
unconventional, but I am not the original author of the patch, nor
do I have a build or run environment for BSD.  However, I have
been asked to act as a proxy for our BSD developer and to help
shepherd this patch into FreeBSD.  If you have questions about the
patch, or issues with it, I will relay them to the original author
and try to resolve any problems.

As I am new to FreeBSD, I'm not sure which individual or group to
direct this patch to, but hopefully someone can provide me some
pointers.  I am starting with freebsd-amd64, but perhaps this
should be sent to freebsd-hackers?

Thanks for any assistance you can provide.
 -- Tim Bird, Principal Software Engineer
    Sony Network Entertainment

---------------------------
This patch provides support for AVX in FreeBSD.  Once it is applied to
FreeBSD 9.0-CURRENT, user programs can use AVX fully, and the kernel may
use it by calling fpu_kern_enter().  AVX is the newest SIMD extension for
x86 CPUs, introduced with Intel's Sandy Bridge and also planned for AMD's
next-generation processor, Bulldozer.

The patch applies to FreeBSD 9.0-CURRENT r217297 (a Subversion revision
in the repository http://svn.freebsd.org/base).  A kernel including the
patch can be built with clang as well as gcc, and it runs on amd64
processors whether or not they have AVX, with no extra configuration.

The strategy of the patch is the same as for FPU(MMX)/SSE: the code that
initializes FPU(MMX)/SSE also initializes AVX, and whenever the
FPU(MMX)/SSE state is saved or restored, the AVX state is saved or
restored along with it, provided the processor has AVX.  If the processor
doesn't have AVX, the patch does nothing; it adds only a test instruction
and a conditional branch.
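In C terms, the save path reduces to the sketch below (the helper name
save_fpu_state is hypothetical; the patch folds this test into the
xsave()/xrstor() wrappers in fpu.c and into inline assembly in
cpu_switch.S):

	/* Sketch only: dispatch between xsave and legacy fxsave. */
	static void
	save_fpu_state(struct savefpu *addr)
	{
		if (cpu_feature2 & CPUID2_XSAVE)
			xsave(addr);	/* saves x87/SSE plus AVX state */
		else
			fxsave(addr);	/* legacy x87/SSE state only */
	}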

Specifically, it does the following:
- When a processor boots, enable the use of AVX if the processor
  supports it (condensed in the sketch after this list).
  at sys/amd64/amd64/initcpu.c
- Save the AVX context when a thread switch occurs.
  at sys/amd64/amd64/cpu_switch.S
- Change fxsave()/fxrstor() to xsave()/xrstor() so that the AVX state is
  saved/restored whenever the FPU(MMX)/SSE state is.
  at sys/amd64/amd64/fpu.c
- Initialize AVX.
  fpuinit() at sys/amd64/amd64/fpu.c
- Adjust the alignment of the xsave/xrstor target address to avoid a #GP
  exception due to misalignment.
  The xsave/xrstor instructions require the target address to be aligned
  on a 64-byte boundary, but the stack is only 32-byte aligned, so
  fpugetregs()/fpusetregs() adjust it via a bounce buffer before issuing
  the xsave/xrstor instruction.
  fpugetregs()/fpusetregs() at sys/amd64/amd64/fpu.c
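For example, the boot-time enablement in the first item condenses to the
following (taken from the initcpu.c hunk below; the value 7 written to
XCR0 enables the x87, SSE and AVX state components):

	if (cpu_feature2 & CPUID2_XSAVE) {
		load_cr4(rcr4() | CR4_XSAVE);	/* set CR4.OSXSAVE */
		xsetbv(0, 7);		/* XCR0 = x87 | SSE | AVX */
	}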

There are two issues with the patch:
1. Machine code is embedded.
The default assembler used to build the kernel (binutils, as of 2.15)
does not know the instructions added for AVX, so the patch embeds the
machine code for the xsetbv/xsave/xrstor instructions.
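For reference, the embedded byte sequences decode to the following
instructions (these are the mnemonics a newer assembler would accept):

	.byte	0x0f, 0x01, 0xd1		/* xsetbv */
	.byte	0x0f, 0xae, 0x27		/* xsave  (%rdi) */
	.byte	0x0f, 0xae, 0x2f		/* xrstor (%rdi) */
	.byte	0x41, 0x0f, 0xae, 0x20		/* xsave  (%r8) */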

2. Extra data copying.
If a stack address is aligned to less than 64 bytes, its contents are
copied to/from a bounce buffer in order to avoid a #GP exception.  If the
callers of fpugetregs()/fpusetregs() could be changed, this extra copying
could be removed; but because those callers are MI code, I hesitated to
change them.
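The bounce-buffer idiom, condensed from the fpugetregs() hunk below
(sketch only; dst stands for the possibly misaligned destination):

	char tmp[sizeof(struct savefpu) + 63];
	/* Round up to the next 64-byte boundary inside tmp. */
	struct savefpu *p =
	    (struct savefpu *)((unsigned long)(tmp + 63) & ~0x3f);

	bzero(p->xsv_hd, 64);		/* clean XSAVE header */
	xsave(p);
	bcopy(p, dst, sizeof(struct savefpu));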

Thanks,

Index: sys/amd64/include/ucontext.h
===================================================================
--- sys/amd64/include/ucontext.h	(revision 217297)
+++ sys/amd64/include/ucontext.h	(working copy)
@@ -87,7 +87,7 @@
 	/*
 	 * See <machine/fpu.h> for the internals of mc_fpstate[].
 	 */
-	long	mc_fpstate[64] __aligned(16);
+	long	mc_fpstate[104] __aligned(64);

 	__register_t	mc_fsbase;
 	__register_t	mc_gsbase;
Index: sys/amd64/include/fpu.h
===================================================================
--- sys/amd64/include/fpu.h	(revision 217297)
+++ sys/amd64/include/fpu.h	(working copy)
@@ -51,6 +51,11 @@
 	u_char	xmm_bytes[16];
 };

+/* Contents of each AVX extended accumulator */
+struct  ymmacc {
+	u_char  ymm_bytes[16];
+};
+
 struct  envxmm {
 	u_int16_t	en_cw;		/* control word (16bits) */
 	u_int16_t	en_sw;		/* status word (16bits) */
@@ -71,7 +76,9 @@
 	} sv_fp[8];
 	struct xmmacc	sv_xmm[16];
 	u_char sv_pad[96];
-} __aligned(16);
+	u_char xsv_hd[64];
+	struct ymmacc   sv_ymm[16];
+} __aligned(64);

 #ifdef _KERNEL
 struct fpu_kern_ctx {
Index: sys/amd64/include/signal.h
===================================================================
--- sys/amd64/include/signal.h	(revision 217297)
+++ sys/amd64/include/signal.h	(working copy)
@@ -98,7 +98,7 @@
 	 */
 	long	sc_fpformat;
 	long	sc_ownedfp;
-	long	sc_fpstate[64] __aligned(16);
+	long	sc_fpstate[104] __aligned(64);

 	long	sc_fsbase;
 	long	sc_gsbase;
Index: sys/amd64/include/specialreg.h
===================================================================
--- sys/amd64/include/specialreg.h	(revision 217297)
+++ sys/amd64/include/specialreg.h	(working copy)
@@ -66,6 +66,7 @@
 #define	CR4_PCE	0x00000100	/* Performance monitoring counter enable */
 #define	CR4_FXSR 0x00000200	/* Fast FPU save/restore used by OS */
 #define	CR4_XMM	0x00000400	/* enable SIMD/MMX2 to use except 16 */
+#define CR4_XSAVE 0x00040000    /* enable XSETBV/XGETBV */

 /*
  * Bits in AMD64 special registers.  EFER is 64 bits wide.
@@ -134,6 +135,9 @@
 #define	CPUID2_MOVBE	0x00400000
 #define	CPUID2_POPCNT	0x00800000
 #define	CPUID2_AESNI	0x02000000
+#define	CPUID2_XSAVE	0x04000000
+#define	CPUID2_OSXSAVE	0x08000000
+#define	CPUID2_AVX	0x10000000

 /*
  * Important bits in the Thermal and Power Management flags
Index: sys/amd64/amd64/initcpu.c
===================================================================
--- sys/amd64/amd64/initcpu.c	(revision 217297)
+++ sys/amd64/amd64/initcpu.c	(working copy)
@@ -142,6 +142,19 @@
 	}
 }

+
+static void
+xsetbv(unsigned int index, unsigned long value)
+{
+        register_t eax = value;
+        register_t edx = value >> 32;
+
+	/* xsetbv */
+        __asm __volatile(".byte 0x0f,0x01,0xd1" ::
+			 "a"(eax), "d"(edx), "c"(index));
+}
+
+
 /*
  * Initialize CPU control registers
  */
@@ -154,6 +167,11 @@
 		load_cr4(rcr4() | CR4_FXSR | CR4_XMM);
 		cpu_fxsr = hw_instruction_sse = 1;
 	}
+	if (cpu_feature2 & CPUID2_XSAVE) {
+		load_cr4(rcr4() | CR4_XSAVE);
+		/* enable use of xsave/xrstor */
+		xsetbv(0, 7);
+	}
 	if ((amd_feature & AMDID_NX) != 0) {
 		msr = rdmsr(MSR_EFER) | EFER_NXE;
 		wrmsr(MSR_EFER, msr);
Index: sys/amd64/amd64/cpu_switch.S
===================================================================
--- sys/amd64/amd64/cpu_switch.S	(revision 217297)
+++ sys/amd64/amd64/cpu_switch.S	(working copy)
@@ -115,7 +115,20 @@
 	jne	1f
 	movq	PCB_SAVEFPU(%r8),%r8
 	clts
+
+	/* if the cpu has AVX, use xsave instead of fxsave */
+	testl	$CPUID2_XSAVE,cpu_feature2
+	jne	2f
 	fxsave	(%r8)
+	jmp	3f
+2:
+	mov	$0x7, %eax
+	mov	%rdx, %rbx
+	mov	$0x0, %edx
+	/* xsave (%r8) fix me when xsave is available */
+	.byte	0x41, 0x0f, 0xae, 0x20
+	mov	%rbx, %rdx
+3:
 	smsw	%ax
 	orb	$CR0_TS,%al
 	lmsw	%ax
@@ -302,8 +315,80 @@
  * Update pcb, saving current processor state.
  */
 ENTRY(savectx)
+	/* Fetch PCB. */
+	movq	%rdi,%rcx
+
 	/* Save caller's return address. */
 	movq	(%rsp),%rax
+	movq	%rax,PCB_RIP(%rcx)
+
+	movq	%cr3,%rax
+	movq	%rax,PCB_CR3(%rcx)
+
+	movq	%rbx,PCB_RBX(%rcx)
+	movq	%rsp,PCB_RSP(%rcx)
+	movq	%rbp,PCB_RBP(%rcx)
+	movq	%r12,PCB_R12(%rcx)
+	movq	%r13,PCB_R13(%rcx)
+	movq	%r14,PCB_R14(%rcx)
+	movq	%r15,PCB_R15(%rcx)
+
+	/*
+	 * If fpcurthread == NULL, then the fpu h/w state is irrelevant and the
+	 * state had better already be in the pcb.  This is true for forks
+	 * but not for dumps (the old book-keeping with FP flags in the pcb
+	 * always lost for dumps because the dump pcb has 0 flags).
+	 *
+	 * If fpcurthread != NULL, then we have to save the fpu h/w state to
+	 * fpcurthread's pcb and copy it to the requested pcb, or save to the
+	 * requested pcb and reload.  Copying is easier because we would
+	 * have to handle h/w bugs for reloading.  We used to lose the
+	 * parent's fpu state for forks by forgetting to reload.
+	 */
+	pushfq
+	cli
+	movq	PCPU(FPCURTHREAD),%rax
+	testq	%rax,%rax
+	je	1f
+
+	movq	TD_PCB(%rax),%rdi
+	leaq	PCB_SAVEFPU(%rdi),%rdi
+	clts
+
+	/* if the cpu has AVX, use xsave instead of fxsave */
+	testl	$CPUID2_XSAVE,cpu_feature2
+	jne	2f
+	fxsave	(%rdi)
+	jmp	3f
+2:
+	mov	$0x7, %eax
+	mov	$0x0, %edx
+	/* xsave (%rdi) fix me when xsave is available */
+	.byte	0x0f, 0xae, 0x27
+3:
+	smsw	%ax
+	orb	$CR0_TS,%al
+	lmsw	%ax
+
+	movq	$PCB_SAVEFPU_SIZE,%rdx	/* arg 3 */
+	leaq	PCB_SAVEFPU(%rcx),%rsi	/* arg 2 */
+	/* arg 1 (%rdi) already loaded */
+	call	bcopy
+1:
+	popfq
+
+	ret
+END(savectx)
+
+/*
+ * savectx2(xpcb)
+ * Update xpcb, saving current processor state.
+ */
+ENTRY(savectx2)
+	/* Fetch XPCB. */
+	movq	%rdi,%r8
+	/* Save caller's return address. */
+	movq	(%rsp),%rax
 	movq	%rax,PCB_RIP(%rdi)

 	movq	%rbx,PCB_RBX(%rdi)
@@ -355,7 +440,17 @@
 	str	PCB_TR(%rdi)

 	clts
+	/* if the cpu has AVX, use xsave instead of fxsave */
+	testl	$CPUID2_XSAVE,cpu_feature2
+	jne	1f
 	fxsave	PCB_USERFPU(%rdi)
+	jmp	2f
+1:
+	mov	$0x7, %eax
+	mov	$0x0, %edx
+	/* xsave PCB_USERFPU(%rdi) fix me when xsave is available */
+	.byte   0x0f, 0xae, 0xa7, 0x00, 0x01, 0x00, 0x00
+2:	
 	movq	%rsi,%cr0	/* The previous %cr0 is saved in %rsi. */

 	movl	$1,%eax
Index: sys/amd64/amd64/fpu.c
===================================================================
--- sys/amd64/amd64/fpu.c	(revision 217297)
+++ sys/amd64/amd64/fpu.c	(working copy)
@@ -105,6 +105,40 @@

 static	struct savefpu		fpu_initialstate;

+
+static void
+xsave(struct savefpu *addr)
+{
+	if (cpu_feature2 & CPUID2_XSAVE) {
+		/* __asm("xsave %0" : "=m" (*(addr))) -> xsave (%rdi) */
+		__asm __volatile("mov $0x7, %%eax\n"
+				 "mov $0x0, %%edx\n"
+				 ".byte 0x0f, 0xae, 0x27"
+				 :: "D"(addr)
+				 : "%eax", "%edx", "memory");
+	} else {
+		fxsave(addr);
+	}
+}
+
+
+static void
+xrstor(struct savefpu *addr)
+{
+	if (cpu_feature2 & CPUID2_XSAVE) {
+		/* __asm("xrstor %0" : : "m" (*(addr))) -> xrstor (%rdi) */
+		__asm __volatile("mov $0x7, %%eax\n"
+				 "mov $0x0, %%edx\n"
+				 ".byte 0x0f, 0xae, 0x2f"
+				 :: "D"(addr)
+				 : "%eax", "%edx");
+	} else {
+		fxrstor(addr);
+	}
+}
+
+
+
 /*
  * Initialize the floating point unit.  On the boot CPU we generate a
  * clean state that is used to initialize the floating point unit when
@@ -128,13 +162,14 @@
 	mxcsr = __INITIAL_MXCSR__;
 	ldmxcsr(mxcsr);
 	if (PCPU_GET(cpuid) == 0) {
-		fxsave(&fpu_initialstate);
+		xsave(&fpu_initialstate);
 		if (fpu_initialstate.sv_env.en_mxcsr_mask)
 			cpu_mxcsr_mask = fpu_initialstate.sv_env.en_mxcsr_mask;
 		else
 			cpu_mxcsr_mask = 0xFFBF;
 		bzero(fpu_initialstate.sv_fp, sizeof(fpu_initialstate.sv_fp));
 		bzero(fpu_initialstate.sv_xmm, sizeof(fpu_initialstate.sv_xmm));
+		bzero(fpu_initialstate.sv_ymm, sizeof(fpu_initialstate.sv_ymm));
 	}
 	start_emulating();
 	intr_restore(saveintr);
@@ -150,7 +185,7 @@
 	critical_enter();
 	if (curthread == PCPU_GET(fpcurthread)) {
 		stop_emulating();
-		fxsave(PCPU_GET(curpcb)->pcb_save);
+		xsave(PCPU_GET(curpcb)->pcb_save);
 		start_emulating();
 		PCPU_SET(fpcurthread, 0);
 	}
@@ -423,7 +458,7 @@
 		 * the PCB doesn't contain a clean FPU state.  Explicitly
 		 * load an initial state.
 		 */
-		fxrstor(&fpu_initialstate);
+		xrstor(&fpu_initialstate);
 		if (pcb->pcb_initial_fpucw != __INITIAL_FPUCW__)
 			fldcw(pcb->pcb_initial_fpucw);
 		if (PCB_USER_FPU(pcb))
@@ -432,7 +467,7 @@
 		else
 			set_pcb_flags(pcb, PCB_FPUINITDONE);
 	} else
-		fxrstor(pcb->pcb_save);
+		xrstor(pcb->pcb_save);
 	critical_exit();
 }

@@ -469,7 +504,26 @@
 	}
 	critical_enter();
 	if (td == PCPU_GET(fpcurthread) && PCB_USER_FPU(pcb)) {
-		fxsave(&pcb->pcb_user_save);
+		/*
+		 * FIXME:
+		 * xsave requires its target address to be aligned on a
+		 * 64-byte boundary, and struct savefpu is declared
+		 * __aligned(64).  But gcc does not guarantee 64-byte
+		 * alignment for variables on the stack.  So, if a
+		 * misaligned address is passed, do the xsave safely into
+		 * a bounce buffer and copy the result to the target.
+		 */
+		if ((unsigned long)&pcb->pcb_user_save % 64) {
+			char tmp[sizeof(struct savefpu) + 63];
+			void *p = (void *)((unsigned long)(tmp + 63) & ~0x3f);
+
+			bzero(((struct savefpu*)p)->xsv_hd, 64);
+
+			xsave((struct savefpu*)p);
+			bcopy(p, &pcb->pcb_user_save, sizeof(struct savefpu));
+		} else {
+			bzero(pcb->pcb_user_save.xsv_hd, 64);
+			xsave(&pcb->pcb_user_save);
+		}
 		critical_exit();
 		return (_MC_FPOWNED_FPU);
 	} else {
@@ -502,7 +556,24 @@
 	pcb = td->td_pcb;
 	critical_enter();
 	if (td == PCPU_GET(fpcurthread) && PCB_USER_FPU(pcb)) {
-		fxrstor(addr);
+
+		/* In order to avoid #GP, initialize some of XSAVE.HEADER. */
+		u_long *header = (u_long*)(addr->xsv_hd);
+		header[0] = 7;
+		header[1] = 0;
+		header[2] = 0;
+
+		/*
+		 * FIXME: See fpugetregs()
+		 */
+		if ((unsigned long)addr % 64) {
+			char tmp[sizeof(struct savefpu) + 63];
+			void *p = (void *)((unsigned long)(tmp + 63) & ~0x3f);
+			bcopy(addr, p, sizeof(struct savefpu));
+			xrstor((struct savefpu*)p);
+		} else {
+			xrstor(addr);
+		}
 		critical_exit();
 		set_pcb_flags(pcb, PCB_FPUINITDONE | PCB_USERFPUINITDONE);
 	} else {
Index: sys/amd64/amd64/identcpu.c
===================================================================
--- sys/amd64/amd64/identcpu.c	(revision 217297)
+++ sys/amd64/amd64/identcpu.c	(working copy)
@@ -275,7 +275,7 @@
 				"\012SSSE3"	/* SSSE3 */
 				"\013CNXT-ID"	/* L1 context ID available */
 				"\014<b11>"
-				"\015<b12>"
+				"\015FMA"
 				"\016CX16"	/* CMPXCHG16B Instruction */
 				"\017xTPR"	/* Send Task Priority Messages*/
 				"\020PDCM"	/* Perf/Debug Capability MSR */
@@ -291,7 +291,7 @@
 				"\032AESNI"	/* AES Crypto*/
 				"\033XSAVE"
 				"\034OSXSAVE"
-				"\035<b28>"
+				"\035AVX"
 				"\036<b29>"
 				"\037<b30>"
 				"\040<b31>"





