Skip site navigation (1)Skip section navigation (2)
Date:      Sat, 9 Sep 2017 17:41:10 +1000 (EST)
From:      Bruce Evans <brde@optusnet.com.au>
To:        Mateusz Guzik <mjg@freebsd.org>
Cc:        src-committers@freebsd.org, svn-src-all@freebsd.org,  svn-src-head@freebsd.org
Subject:   Re: svn commit: r323346 - in head/sys: powerpc/powerpc riscv/riscv
Message-ID:  <20170909165558.L17783@besplex.bde.org>
In-Reply-To: <201709090556.v895u4jT047914@repo.freebsd.org>
References:  <201709090556.v895u4jT047914@repo.freebsd.org>

next in thread | previous in thread | raw e-mail | index | archive | help
On Sat, 9 Sep 2017, Mateusz Guzik wrote:

> Log:
>  Fix riscv and powerpc compilation after r323329.
>
>  On these archs bzero is a C function, which triggers a compilation error
>  as the compiler tries to expand the macro.

bzero() is a C function on all arches.  It is just written in C on these
archs.

> Modified: head/sys/powerpc/powerpc/machdep.c
> ==============================================================================
> --- head/sys/powerpc/powerpc/machdep.c	Sat Sep  9 05:50:47 2017	(r323345)
> +++ head/sys/powerpc/powerpc/machdep.c	Sat Sep  9 05:56:04 2017	(r323346)
> @@ -417,43 +417,6 @@ powerpc_init(vm_offset_t fdt, vm_offset_t toc, vm_offs
> 	    (sizeof(struct callframe) - 3*sizeof(register_t))) & ~15UL);
> }
>
> -void
> -bzero(void *buf, size_t len)
> [... 100+ more lines of patches]

Ugh.  All you have to do here is declare bzero correctly:

void
(bzero)(void *buf, size_t len)

Since bzero() is now a macro, the old declaration is incorrect.

> ...
> @@ -555,3 +518,41 @@ DB_SHOW_COMMAND(spr, db_show_spr)
> 	    (unsigned long)spr);
> }
> #endif
> +
> +#undef bzero
> +void
> +bzero(void *buf, size_t len)

The undef should work too.  Why make the patch unreadable by moving
everything?

> Modified: head/sys/riscv/riscv/machdep.c
> ==============================================================================
> --- head/sys/riscv/riscv/machdep.c	Sat Sep  9 05:50:47 2017	(r323345)
> +++ head/sys/riscv/riscv/machdep.c	Sat Sep  9 05:56:04 2017	(r323346)
> @@ -151,16 +151,6 @@ cpu_idle_wakeup(int cpu)
> 	return (0);
> }
>
> -void
> -bzero(void *buf, size_t len)
> -{
> -	uint8_t *p;
> -
> -	p = buf;
> -	while(len-- > 0)
> -		*p++ = 0;
> -}
> -

This is an underengineered C version, while the powerpc version is
misoverengineered.  The correct overengineering is to copy the libc
version (at worst the MI version which is a bit like the powerpc
kernel version) to libkern.

> int
> fill_regs(struct thread *td, struct reg *regs)
> {
> @@ -890,4 +880,15 @@ initriscv(struct riscv_bootparams *rvbp)
> 	riscv_init_interrupts();
>
> 	early_boot = 0;
> +}
> +
> +#undef bzero
> +void
> +bzero(void *buf, size_t len)
> +{
> +	uint8_t *p;
> +
> +	p = buf;
> +	while(len-- > 0)
> +		*p++ = 0;
> }

I guess moving it to the end is to keep the macro alive for the rest of
the file.  So just declare it correctly and don't use #undef or move
anything.

My old version of this has related complications:

X Index: systm.h
X ===================================================================
X RCS file: /home/ncvs/src/sys/sys/systm.h,v
X retrieving revision 1.208
X diff -u -2 -r1.208 systm.h
X --- systm.h	17 Jun 2004 17:16:52 -0000	1.208
X +++ systm.h	7 Feb 2007 23:20:06 -0000
X @@ -140,45 +142,79 @@
X  void	backtrace(void);
X  void	cpu_boot(int);
X -void	cpu_rootconf(void);
X -extern uint32_t crc32_tab[];
X +void	cpu_halt(void) __dead2;
X +void	cpu_reset(void) __dead2;
X  uint32_t crc32(const void *buf, size_t size);
X +extern uint32_t crc32_tab[];
X  void	critical_enter(void);
X  void	critical_exit(void);
X +void	hexdump(void *ptr, int length, const char *hdr, int flags);
X +#define	HD_COLUMN_MASK	0xff
X +#define	HD_DELIM_MASK	0xff00
X +#define	HD_OMIT_COUNT	(1 << 16)
X +#define	HD_OMIT_HEX	(1 << 17)
X +#define	HD_OMIT_CHARS	(1 << 18)
X  void	init_param1(void);
X  void	init_param2(long physpages);
X  void	init_param3(long kmempages);
X  void	tablefull(const char *);
X -int	kvprintf(char const *, void (*)(int, void*), void *, int,
X -	    __va_list) __printflike(1, 0);
X +int	kvprintf(char const *, void (*)(int, void*), void *, int, __va_list)
X +	    __printflike(1, 0);
X  void	log(int, const char *, ...) __printflike(2, 3);
X  void	log_console(struct uio *);
X  int	printf(const char *, ...) __printflike(1, 2);
X  int	snprintf(char *, size_t, const char *, ...) __printflike(3, 4);
X -int	sprintf(char *buf, const char *, ...) __printflike(2, 3);
X +int	sprintf(char *, const char *, ...) __printflike(2, 3);
X +int	sscanf(const char *, char const *, ...) __scanflike(2, 3);
X +long	strtol(const char *, char **, int) __nonnull(1);
X +quad_t	strtoq(const char *, char **, int) __nonnull(1);
X +u_long	strtoul(const char *, char **, int) __nonnull(1);
X +u_quad_t strtouq(const char *, char **, int) __nonnull(1);
X +void	tprintf(struct proc *p, int pri, const char *, ...)
X +	    __printflike(3, 4);
X  int	uprintf(const char *, ...) __printflike(1, 2);
X  int	vprintf(const char *, __va_list) __printflike(1, 0);
X  int	vsnprintf(char *, size_t, const char *, __va_list) __printflike(3, 0);
X -int	vsnrprintf(char *, size_t, int, const char *, __va_list) __printflike(4, 0); 
X -int	vsprintf(char *buf, const char *, __va_list) __printflike(2, 0);
X +int	vsnrprintf(char *, size_t, int, const char *, __va_list)
X +	    __printflike(4, 0); 
X +int	vsprintf(char *, const char *, __va_list) __printflike(2, 0);
X +int	vsscanf(const char *, char const *, __va_list) __scanflike(2, 0);
X  int	ttyprintf(struct tty *, const char *, ...) __printflike(2, 3);
X -int	sscanf(const char *, char const *, ...) __nonnull(1) __nonnull(2);
X -int	vsscanf(const char *, char const *, __va_list) __nonnull(1) __nonnull(2);
X -long	strtol(const char *, char **, int) __nonnull(1);
X -u_long	strtoul(const char *, char **, int) __nonnull(1);
X -quad_t	strtoq(const char *, char **, int) __nonnull(1);
X -u_quad_t strtouq(const char *, char **, int) __nonnull(1);
X -void	tprintf(struct proc *p, int pri, const char *, ...) __printflike(3, 4);
X -void	hexdump(void *ptr, int length, const char *hdr, int flags);
X -#define	HD_COLUMN_MASK	0xff
X -#define	HD_DELIM_MASK	0xff00
X -#define	HD_OMIT_COUNT	(1 << 16)
X -#define	HD_OMIT_HEX	(1 << 17)
X -#define	HD_OMIT_CHARS	(1 << 18)

Lots of unrelated style fixes in the same hunk.

X 
X -#define ovbcopy(f, t, l) bcopy((f), (t), (l))
X +#define	ovbcopy(f, t, l) bcopy((f), (t), (l))

Related style fix.

X  void	bcopy(const void *from, void *to, size_t len) __nonnull(1) __nonnull(2);
X +#ifndef _BZERO_DECLARED
X  void	bzero(void *buf, size_t len) __nonnull(1);
X +#define	_BZERO_DECLARED
X +#endif

Allow alternative implementations (perhaps as a pure macro).

X +
X +#if 1
X +#define	bzero(p, n) ({						\
X +	if (__builtin_constant_p(n) && (n) <= 32)		\
X +		__builtin_memset((p), 0, (n));			\
X +	else							\
X +		(bzero)((p), (n));				\
X +})
X +#endif

Hard-coding the threshold is wrong.  32 is for gcc-3 on at least i386.
gcc-4 generates inline code up to 64 on both i386 and 32.  The clang
code for length 64 is bad on i386, but the exten bzero() is worse.
clang's inline code then consists of 16 5-10 byte instructions to move 4
bytes each .  gcc prefers string instructions above about 8 btes for
gcc-3 and 16 bytes for gcc-4.  "rep stosl" takes too long to start up,
but gcc-3 prefers it.  gcc-4 prefers several "stosl"s.  This is still
slower than ordinary store instructions provided the instruction fetcher
can keep up with the latter.  bzero() on i386 uses "rep stos*" in all
cases, with extra pessimizations for the end condition.

It seems best to change the 64 to 8 * sizeof(long).  __builtin_memset()
actually degrades to memset() above the threshold.  memset() is slightly
worse than bzero().  (It has silly optimizations done in libkern
instead of here.)  The magic 64 is basically the x86 threshold for gcc-4
and clang on certain (perhaps all) -march configurations.  ("rep stosl"
works better on newer arches, but it still takes too long to start up
for counts below a few hundred bytes, and AVX methods are probably better
starting at more like 64 bytes).  Compilers know that that don't know
what is best so I think they use a threshold of 64 for most cases.  But
this is very x86-dependent.  We could also know that the riscv bzero()
is especially slow and use a threshold of ininity so that it is never
called (memset() would be called instead, and we know that it reduces
to extern bzero() so actually reaches the slow bzero() slightly slower).

X 
X -void	*memcpy(void *to, const void *from, size_t len) __nonnull(1) __nonnull(2);
X +#if 0
X +static __inline void *
X +memset(void *b, int c, size_t len)
X +{
X +	char *bb;
X +
X +	if (c == 0)
X +		bzero(b, len);
X +	else
X +		for (bb = b; len--; )
X +			*bb++ = c;
X +	return (b);
X +}
X +#endif

Without the '#if 0', this would move the inline memset() from libkern.h
to here.  It is arguably a style bug for it to be in libkern.h, and it
must be here for the bzero() in it to use the macro.

-current has much worse messes for memset().  It still has the inline
version in libkern.  It also has an extern memset() in libkern/memset.c.
The inline function is misnamed memset(), so it cannot be used to
implement the extern memset(), so the code is duplicated and ifdefs
tangles are needed to prevent duplicate definitions.

X +
X +void	*memcpy(void *to, const void *from, size_t len) __nonnull(1)
X +	    __nonnull(2);
X +#if 1
X +#define	memcpy(to, from, n)	__builtin_memcpy((to), (from), (n))
X +#endif

I also optimize memcpy().  This is simpler because the builtin falls back
to an extern function with essentially the same name and no messes in its
implementation (the x86 memcpy() is just intentionally suboptimal, since
it was supposed to be only used for this fallback back when memcpy() gave
the builtin).

X 
X  int	copystr(const void * __restrict kfaddr, void * __restrict kdaddr,

Bruce



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20170909165558.L17783>