Skip site navigation (1)Skip section navigation (2)
Date:      Sun, 15 Feb 2015 00:38:18 -0800
From:      Mark Millard <markmi@dsl-only.net>
To:        FreeBSD PowerPC ML <freebsd-ppc@freebsd.org>
Subject:   Re: PowerMac G5 powerpc64: new context where repeatedly booting varies between failing and working
Message-ID:  <5FE82152-BBF7-4C6D-932D-AEE70546CACA@dsl-only.net>
In-Reply-To: <C355D814-D486-4644-B9C6-92992092FD55@dsl-only.net>
References:  <7CA43EE3-8C11-4FBD-9F8A-42DF08B82362@dsl-only.net> <ABDD60F1-72C0-41E0-8DFB-4CFDCA9ACA82@dsl-only.net> <C355D814-D486-4644-B9C6-92992092FD55@dsl-only.net>

next in thread | previous in thread | raw e-mail | index | archive | help
I changed the mi_startup code to report the specific SI_SUB_TUNABLES =
item that ends up with the address showing as changed afterwards.

mp_setmaxid

is the only culprit reported so far. Further printf work lead to seeing =
the calculation working fine before the following OF_peer(0) but when =
there is a boot failure the bad value first shows up at this level just =
after the OF_peer(0) call returned:

static int
powermac_smp_first_cpu(platform_t plat, struct cpuref *cpuref)
{
        char buf[8];
        phandle_t cpu, dev, root;
        int res;

        root =3D OF_peer(0);

        dev =3D OF_child(root);
        while (dev !=3D 0) {
                res =3D OF_getprop(dev, "name", buf, sizeof(buf));
                if (res > 0 && strcmp(buf, "cpus") =3D=3D 0)
                        break;
                dev =3D OF_peer(dev);
        }
        if (dev =3D=3D 0) {
                /*
                 * psim doesn't have a name property on the /cpus node,
                 * but it can be found directly
                 */
                dev =3D OF_finddevice("/cpus");
                if (dev =3D=3D -1)
                        return (ENOENT);
        }
       =20
        cpu =3D OF_child(dev);
       =20
        while (cpu !=3D 0) {
                res =3D OF_getprop(cpu, "device_type", buf, =
sizeof(buf));
                if (res > 0 && strcmp(buf, "cpu") =3D=3D 0)
                        break;
                cpu =3D OF_peer(cpu);
        }
        if (cpu =3D=3D 0)
                return (ENOENT);

        return (powermac_smp_fill_cpuref(cpuref, cpu));
}=20


=3D=3D=3D
Mark Millard
markmi at dsl-only.net

On 2015-Feb-14, at 06:13 PM, Mark Millard <markmi at dsl-only.net> =
wrote:

I found a time frame before the TOC corruption. The corruption happens =
during SI_SUB_TUNABLES processing, not before and not after in any of my =
examples so far.

With my current extra displays of the address calculations a normal boot =
now starts out (dmesg -a prefix) as below:

> powerpc_init end: &authnone_private: 0xe313a8
> mi_startup start &authnone_private: 0xe313a8
>=20
>=20
>=20
> sysinit: 0xbd9c00 *sysinit: 0xc3c8a8
> &authnone_private: 0xe313a8
>=20
> &authnone_private before subsystem: 0xe313a8
> subsystem 700000
> &authnone_private before subsystem: 0xe313a8
> subsystem 800001
> Copyright (c) 1992-2015 The FreeBSD Project.
> ...


But when the boots fail the before "subsystem 700000" &authnone_private =
figure above is unchanged but after that things look like (picking an =
example bad value that has occurred and manually typing it):

> &authnone_private before subsystem: 0x2400004200e313a8
> subsystem 800001
> Copyright (c) 1992-2015 The FreeBSD Project.
> ...

and all later displays of the calculation agree with the displayed bad =
value until it crashes. I've never seen the value change at any other =
stage so far.


The code for mi_startup displaying the values as above is:


> root@FBSDG5M1:/usr/src # svnlite diff sys/kern/init_main.c
> Index: sys/kern/init_main.c
> =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
> --- sys/kern/init_main.c	(revision 278443)
> +++ sys/kern/init_main.c	(working copy)
> @@ -91,6 +91,11 @@
> #include <ddb/ddb.h>
> #include <ddb/db_sym.h>
>=20
> +#if defined(VERBOSE_SYSINIT)
> +/* HACK! */
> +extern void* authnone_create(void);
> +#endif
> +
> void mi_startup(void);				/* Should be =
elsewhere */
>=20
> /* Components of the first process -- never freed. */
> @@ -207,6 +212,8 @@
> 	int verbose;
> #endif
>=20
> +printf("mi_startup start &authnone_private: %p\n\n", =
authnone_create());
> +
> 	if (boothowto & RB_VERBOSE)
> 		bootverbose++;
>=20
> @@ -215,7 +222,12 @@
> 		sysinit_end =3D SET_LIMIT(sysinit_set);
> 	}
>=20
> +
> restart:
> +
> +printf("\n\nsysinit: %p *sysinit: %p\n", sysinit, *sysinit);
> +printf("&authnone_private: %p\n\n", authnone_create());
> +
> 	/*
> 	 * Perform a bubble sort of the system initialization objects by
> 	 * their subsystem (primary key) and order (secondary key).
> @@ -234,6 +246,8 @@
>=20
> #if defined(VERBOSE_SYSINIT)
> 	last =3D SI_SUB_COPYRIGHT;
> +/* HACK */
> +	last =3D SI_SUB_DUMMY;
> 	verbose =3D 0;
> #if !defined(DDB)
> 	printf("VERBOSE_SYSINIT: DDB not enabled, symbol lookups =
disabled.\n");
> @@ -254,7 +268,11 @@
>=20
> #if defined(VERBOSE_SYSINIT)
> 		if ((*sipp)->subsystem > last) {
> +printf("&authnone_private before subsystem: %p\n ", =
authnone_create());
> +
> 			verbose =3D 1;
> +/* HACK */
> +verbose =3D 0;
> 			last =3D (*sipp)->subsystem;
> 			printf("subsystem %x\n", last);
> 		}


I have also observed a new wildly different bad value: 0 instead of =
0x2400004200e313a8.

The kernel runs much farther in this case but eventually dies for =
another large bad address. But the 0 means that some stomping on low =
memory occurred, such as 24(r29) indicating address 24 (0x18 hex) in the =
instruction that fails for r29=3D0x2400004200e313a8.





=3D=3D=3D
Mark Millard
markmi at dsl-only.net

On 2015-Feb-14, at 02:21 PM, Mark Millard <markmi at dsl-only.net> =
wrote:

I've been able to show that the TOC entry that authnone_init accesses is =
messed up and it is so from very early on.

I took advantage of sys/rpc/auth_none.c exposing the static variable's =
address calculation result, in fact the same one that the crash happened =
for:

AUTH *
authnone_create()
{
      struct authnone_private *ap =3D &authnone_private;

      return (&ap->no_client);
}

The no_client even happens to be the first field of the struct pointed =
to by ap.

So I put calls of that routine where it would periodically monitor the =
calculation during the early part of booting:

root@FBSDG5M1:/usr/src # svnlite diff sys/kern/init_main.c=20
Index: sys/kern/init_main.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
--- sys/kern/init_main.c	(revision 278443)
+++ sys/kern/init_main.c	(working copy)
@@ -91,6 +91,11 @@
#include <ddb/ddb.h>
#include <ddb/db_sym.h>

+#if defined(VERBOSE_SYSINIT)
+/* HACK! */
+extern void* authnone_create(void);
+#endif
+
void mi_startup(void);				/* Should be elsewhere =
*/

/* Components of the first process -- never freed. */
@@ -282,7 +287,9 @@

#if defined(VERBOSE_SYSINIT)
		if (verbose)
+{printf(" authnone_private address generation check: %p ", =
authnone_create());
			printf("done.\n");
+}
#endif

So when it boots successfully it reports messages like:

 malloc_init(&M_JFREEFRAG)...  authnone_private address generation =
check: 0xe313a8 done.

When the boots fails the very first such message of that form shows the =
0x2400002200e313a8 value, as do all the later ones. When the boot works =
it always shows 0xe313a8.

[I have since shortened the text with:
printf(" &authnone_private: %p ", authnone_create());]

It would appear that the TOC entry generation/update is the source of =
the variations in value that are observed that can lead to a crash.



=3D=3D=3D
Mark Millard
markmi at dsl-only.net

On 2015-Feb-14, at 01:53 AM, Mark Millard <markmi at dsl-only.net> =
wrote:

FreeBSD context: PowerMac G5 Quad-Core with 16GB RAM.

root@FBSDG5M1:/usr/src # freebsd-version -ku ; uname -a
10.1-STABLE
10.1-STABLE
FreeBSD FBSDG5M1 10.1-STABLE FreeBSD 10.1-STABLE #10 r278443M: Fri Feb =
13 03:26:27 PST 2015     =
root@FBSDG5M1:/usr/obj/usr/src/sys/GENERIC64vtsc  powerpc

Other configuration/context details for /boot/kernel10.1S/kernel are =
given late in this message. But I will here mention the use of:

options         DDB
options         GDB
options         VERBOSE_SYSINIT
options         BOOTVERBOSE=3D1
options         BOOTHOWTO=3DRB_VERBOSE



I've got a new context where repeatedly booting via power-off then =
power-on varies between failing and working, always failing in the same =
way/place when it does.


Here are 3 addresses (&...: ...) from a successful boot of kernel10.1S, =
where the first one will be  different for the failing boots (this is =
from dmesg -a):

authnone_init(0)...  &authnone_private: 0xe313a8, &_null_auth: 0xe31608, =
&authnone_ops: 0xc31f80 done.

where the extra output is from the added printf in:

static void
authnone_init(void *dummy)
{
     struct authnone_private *ap =3D &authnone_private;
     XDR xdrs;

     ap->no_client.ah_cred =3D ap->no_client.ah_verf =3D _null_auth;
     ap->no_client.ah_ops =3D &authnone_ops;
printf(" &authnone_private: %p, &_null_auth: %p, &authnone_ops: %p ", =
ap, &_null_auth, &authnone_ops);
     xdrmem_create(&xdrs, ap->mclient, MAX_MARSHAL_SIZE, XDR_ENCODE);
     xdr_opaque_auth(&xdrs, &ap->no_client.ah_cred);
     xdr_opaque_auth(&xdrs, &ap->no_client.ah_verf);
     ap->mcnt =3D XDR_GETPOS(&xdrs);
     XDR_DESTROY(&xdrs);
}
SYSINIT(authnone_init, SI_SUB_KMEM, SI_ORDER_ANY, authnone_init, NULL);

The authnone_init code for through its first store into ap->... is:

00000000007a3ea4 <.authnone_init> mflr    r0
00000000007a3ea8 <.authnone_init+0x4> std     r29,-24(r1)
00000000007a3eac <.authnone_init+0x8> std     r30,-16(r1)
00000000007a3eb0 <.authnone_init+0xc> std     r31,-8(r1)
00000000007a3eb4 <.authnone_init+0x10> std     r0,16(r1)
00000000007a3eb8 <.authnone_init+0x14> stdu    r1,-192(r1)
00000000007a3ebc <.authnone_init+0x18> mr      r31,r1
00000000007a3ec0 <.authnone_init+0x1c> ld      r29,304(r2)
00000000007a3ec4 <.authnone_init+0x20> ld      r9,312(r2)
00000000007a3ec8 <.authnone_init+0x24> ld      r0,0(r9)
00000000007a3ecc <.authnone_init+0x28> ld      r11,8(r9)
00000000007a3ed0 <.authnone_init+0x2c> ld      r9,16(r9)
00000000007a3ed4 <.authnone_init+0x30> std     r0,24(r29)

When the boots fail they fail on that last std: std r0,24(r29) , doing =
so based on

r2: 0xd2da20
r29: 0x2400002200e313a8
bad virtual address: 0x2400002200e313c0

(These are from a boot-crash time register display, so hand copied off =
screen as it is too soon for interaction with DDB. I've got a default =
ddb script in place that does the display.)

When it boots okay r29 =3D 0x00e313a8 and the address accessed is =
0x00e313c0 instead: see the first address that I started with above (for =
&authnone_private).

In other words: The difference is the upper half of r29. I've no =
evidence that r2 is corrupt for failing boots for this code.

So either 304(r2) accesses different values from one time to the next =
for booting or the r29 register is corrupted somehow between

00000000007a3ec0 <.authnone_init+0x1c> ld      r29,304(r2)
and
00000000007a3ed4 <.authnone_init+0x30> std     r0,24(r29)

(such as an interrupt not restoring the 64bit-ABI's register value =
fully).

At this point I've no clue if variability in the TOC contents that =
304(r2) references makes any sense or not. I've yet to figure out how it =
is established.


More FreeBSD configuration details:

10.1-STABLE's buildworld kernel and installworld were all done from the =
PowerMac G5 itself.

root@FBSDG5M1:/usr/src # more sys/powerpc/conf/GENERIC64
GENERIC64      GENERIC64vtsc =20
root@FBSDG5M1:/usr/src # more sys/powerpc/conf/GENERIC64vtsc=20
include GENERIC64
ident   GENERIC64vtsc

nooptions       PS3                     #Sony Playstation 3              =
 HACK!!! to allow sc

options         DDB                     # HACK!!! to dump early crash =
info (but 11.0-CURRENT already has it)
options         GDB                     # HACK!!! ...
options         VERBOSE_SYSINIT
options         BOOTVERBOSE=3D1
options         BOOTHOWTO=3DRB_VERBOSE
#options        KTR
#options        KTR_MASK=3DKTR_TRAP
#options        KTR_CPUMASK=3D0xF
#options        KTR_VERBOSE

# HACK!!! to allow sc for 2560x1440 display on Radeon X1950 that vt =
historically mishandled during booting
device          sc
#device          kbdmux         # HACK: already listed by vt
options         SC_OFWFB        # OFW frame buffer
options         SC_DFLT_FONT    # compile font in
makeoptions     SC_DFLT_FONT=3Dcp437


# Disable extra checking typically used for FreeBSD 11.0-CURRENT:
nooptions       DEADLKRES               #Enable the deadlock resolver
nooptions       INVARIANTS              #Enable calls of extra sanity =
checking
nooptions       INVARIANT_SUPPORT       #Extra sanity checks of internal =
structures, required by INVARIANTS
nooptions       WITNESS                 #Enable checks to detect =
deadlocks and cycles
nooptions       WITNESS_SKIPSPIN        #Don't run witness on spinlocks =
for speed
nooptions       MALLOC_DEBUG_MAXZONES   # Separate malloc(9) zones

root@FBSDG5M1:/usr/src # svnlite status
?       .snap
M       sys/ddb/db_main.c
M       sys/ddb/db_script.c
M      sys/powerpc/conf
?       sys/powerpc/conf/GENERIC64vtsc
M       sys/powerpc/ofw/ofw_machdep.c
M       sys/powerpc/ofw/ofwcall64.S
M       sys/powerpc/powermac/powermac_thermal.c
M       sys/rpc/auth_none.c

root@FBSDG5M1:/usr/src # more /boot/loader.conf=20
#kernel=3D"kernel"
#kernel=3D"kernel10.1RE"
kernel=3D"kernel10.1S"
#kernel=3D"kernel11C"
verbose_loading=3D"YES"
kern.vty=3Dvt

root@FBSDG5M1:/usr/src # more /etc/make.conf=20
WRKDIRPREFIX=3D/usr/obj/portswork
WITH_DEBUG=3D
MALLOC_PRODUCTION=3D

root@FBSDG5M1:/usr/src # more /etc/src.conf
CFLAGS+=3D-DELF_VERBOSE
#WITH_DEBUG_FILES=3D
#WITHOUT_CLANG=3D

Other than powermac_thermal.c (from Justin H.) the source changes are =
for investigation of various early-boot problems for PowerMac G5's. The =
PowerMac G5 that powermac_thermal.c was put in place to experiment with =
is no longer around but I've not yet removed the powermac_thermal.c =
update from my environment.



=3D=3D=3D
Mark Millard
markmi at dsl-only.net







Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?5FE82152-BBF7-4C6D-932D-AEE70546CACA>