From owner-freebsd-hackers@freebsd.org  Tue Oct 13 13:28:08 2015
Return-Path: <owner-freebsd-hackers@freebsd.org>
Delivered-To: freebsd-hackers@mailman.ysv.freebsd.org
Received: from mx1.freebsd.org (mx1.freebsd.org
 [IPv6:2001:1900:2254:206a::19:1])
 by mailman.ysv.freebsd.org (Postfix) with ESMTP id 534F4A12167
 for <freebsd-hackers@mailman.ysv.freebsd.org>;
 Tue, 13 Oct 2015 13:28:08 +0000 (UTC)
 (envelope-from pjalaber@gmail.com)
Received: from mail-oi0-x236.google.com (mail-oi0-x236.google.com
 [IPv6:2607:f8b0:4003:c06::236])
 (using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
 (Client CN "smtp.gmail.com",
 Issuer "Google Internet Authority G2" (verified OK))
 by mx1.freebsd.org (Postfix) with ESMTPS id 19453ABD;
 Tue, 13 Oct 2015 13:28:08 +0000 (UTC)
 (envelope-from pjalaber@gmail.com)
Received: by oihr205 with SMTP id r205so9309521oih.3;
 Tue, 13 Oct 2015 06:28:07 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20120113;
 h=mime-version:in-reply-to:references:date:message-id:subject:from:to
 :cc:content-type;
 bh=8dTNnEE/VRPy+pyWMRb0HTs5hVCrtzFVwfxf2dUZH6Q=;
 b=i3jFseIbGNBKI8a9DkuGF/zTP4Sk+kl4oSewlpkQ+xNjJo4nWFjqcOC/Isq2ETeJqZ
 xHzAgr53evo4shOD+o4s/ZR+mw/fqqfIrQTrBitbRnjOFUOT5BcQCuaimudhOaTeN43L
 q5ByEBceKVDaN4YIchuIYcwQf6T8FmGGxtXnBWsg2EZ5LHlnInDvQ08Ac85uqwKpNIj2
 fb4PgyIHylCtuCAvq7uT5xxqgUSZIjK7lhIhu1N+36Jl3ICY/4KJRWrkb7w3erfLZUSQ
 aKFy92yPqvmyOLeU6afNJZmJtjpaS6Tn1oyOdG5sa1rlcvmkBE/nYSfYvWNK3DlQ7ER4
 SIOw==
MIME-Version: 1.0
X-Received: by 10.202.60.137 with SMTP id j131mr8072602oia.12.1444742887245;
 Tue, 13 Oct 2015 06:28:07 -0700 (PDT)
Received: by 10.202.87.88 with HTTP; Tue, 13 Oct 2015 06:28:07 -0700 (PDT)
In-Reply-To: <CA+i3ByLROuP=nU_gXGBcAuEoC653QNiC9Rth4_U+T5WpEUbfXA@mail.gmail.com>
References: <CA+i3ByK8TLb6cRCw3dJgGYCb81ENE=HrgsDX+MM-=yVn8P1hgg@mail.gmail.com>
 <2768515.JZVZhYiQVE@ralph.baldwin.cx>
 <CA+i3ByKHU33sMN3oM73-H5pCuj3qqNY_VUCRkWWyJ8_X9PvUew@mail.gmail.com>
 <1902697.ny7xAkAVI4@ralph.baldwin.cx>
 <CA+i3ByLROuP=nU_gXGBcAuEoC653QNiC9Rth4_U+T5WpEUbfXA@mail.gmail.com>
Date: Tue, 13 Oct 2015 15:28:07 +0200
Message-ID: <CA+i3By+PDEwp1VVvKu435hU7FMBQYh3gyp5_cGCcAKgA43N8tw@mail.gmail.com>
Subject: Re: adaptive rwlock deadlock
From: Philippe Jalaber <pjalaber@gmail.com>
To: John Baldwin <jhb@freebsd.org>
Cc: freebsd-hackers@freebsd.org
Content-Type: text/plain; charset=UTF-8
X-Content-Filtered-By: Mailman/MimeDel 2.1.20
X-BeenThere: freebsd-hackers@freebsd.org
X-Mailman-Version: 2.1.20
Precedence: list
List-Id: Technical Discussions relating to FreeBSD
 <freebsd-hackers.freebsd.org>
List-Unsubscribe: <https://lists.freebsd.org/mailman/options/freebsd-hackers>, 
 <mailto:freebsd-hackers-request@freebsd.org?subject=unsubscribe>
List-Archive: <http://lists.freebsd.org/pipermail/freebsd-hackers/>
List-Post: <mailto:freebsd-hackers@freebsd.org>
List-Help: <mailto:freebsd-hackers-request@freebsd.org?subject=help>
List-Subscribe: <https://lists.freebsd.org/mailman/listinfo/freebsd-hackers>, 
 <mailto:freebsd-hackers-request@freebsd.org?subject=subscribe>
X-List-Received-Date: Tue, 13 Oct 2015 13:28:08 -0000

I am able to reproduce the freeze with a simple kernel module modcrash.c.

It freezes with a Freebsd 9.3 and 10.0 VM, but not with a Freebsd 10.1,
10.2 or 11.0 VM.
It also freezes on a VIA appliance running Freebsd 9.3 and 10.2.
All freezes can be reproduced with one or more cpus.

Here the modcrash.c source code. Just comment the #define CRASHME (which
moves a pause call) and it does not freeze any more.
Please note that you can't klunload the module, it crashes the system.

Philippe

#include <sys/param.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/libkern.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mbuf.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/kthread.h>

static void modcrash_attach(void);
static void modcrash_detach(void);

static void modcrash_thread1(void *);
static void modcrash_thread2(void *);
static void modcrash_thread3(void *);

struct thread *thread1_td, *thread2_td, *thread3_td;

static struct rwlock inplock;
RW_SYSINIT(inplock, &inplock, "inplock");

static struct rwlock glock;
RW_SYSINIT(glock, &glock, "glock");

static int volatile global_busy = 0;

#define MILLISECONDS(ms) ((hz/1000)*(ms))

static void
modcrash_attach(void)
{
    if ( kthread_add(modcrash_thread1, NULL, NULL, &thread1_td, 0, 0,
"modcrash1") != 0 )
        printf("error\n");

    if ( kthread_add(modcrash_thread2, NULL, NULL, &thread2_td, 0, 0,
"modcrash2") != 0 )
        printf("error\n");

    if ( kthread_add(modcrash_thread3, NULL, NULL, &thread3_td, 0, 0,
"modcrash3") != 0 )
        printf("error\n");
}

static void
modcrash_detach(void)
{
}

static void
modcrash_thread1(void *arg)
{
    int busy;

    printf("thread 1 running\n");

    while ( 1 )
    {
        rw_wlock(&inplock);

        do
        {
            rw_rlock(&glock);
            busy = global_busy;
            if ( busy )
            {
                rw_runlock(&glock);
            }
        } while ( busy );

        rw_runlock(&glock);

        rw_wunlock(&inplock);

        pause("thread 1", MILLISECONDS(10));
    }

    kthread_exit();
}


static void
modcrash_thread2(void *arg)
{
    int busy;

    printf("thread 2 running\n");

    while ( 1 )
    {
        rw_wlock(&glock);
        if ( (random() & 1) == 0 )
        {
            global_busy = 1;
            busy = 1;
        }
        else
        {
            busy = 0;
        }
        rw_wunlock(&glock);

#define CRASHME

#ifdef CRASHME
        pause("thread 2", MILLISECONDS(10));
#endif

        if ( busy )
        {
            rw_wlock(&glock);
            global_busy = 0;
            rw_wunlock(&glock);
        }

#ifndef CRASHME
        pause("thread 2", MILLISECONDS(10));
#endif
    }

    kthread_exit();
}

static void
modcrash_thread3(void *context)
{
    printf("thread 3 running\n");

    while ( 1 )
    {
        rw_wlock(&inplock);
        rw_wunlock(&inplock);
        pause("thread 3", MILLISECONDS(10));
    }

    kthread_exit();
}

static int
modcrashmodevent(module_t mod, int type, void *data)
{

    switch (type) {
    case MOD_LOAD:
        modcrash_attach();
        break;
    case MOD_UNLOAD:
        modcrash_detach();
        break;
    default:
        return EOPNOTSUPP;
    }
    return 0;
}

static moduledata_t modcrash_mod = {
    "modcrash",
    modcrashmodevent,
    0
};

DECLARE_MODULE(modcrash, modcrash_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
MODULE_VERSION(modcrash, 1);


2015-08-06 10:09 GMT+02:00 Philippe Jalaber <pjalaber@gmail.com>:

> 2015-08-05 17:41 GMT+02:00 John Baldwin <jhb@freebsd.org>:
>
>> On Wednesday, August 05, 2015 04:27:53 PM Philippe Jalaber wrote:
>> > 2015-08-04 22:10 GMT+02:00 John Baldwin <jhb@freebsd.org>:
>> >
>> > > On Tuesday, July 07, 2015 12:10:19 PM Philippe Jalaber wrote:
>> > > > Hi,
>> > > >
>> > > > I am facing a strange problem using the network stack and adaptive
>> > > rwlocks
>> > > > running Freebsd 9.3.
>> > > > Basically I can reproduce the problem with 3 threads:
>> > > >
>> > > > 1) thread 1 has taken the rwlock of structure inpcb in exclusive
>> mode in
>> > > > tcp_input.c. This thread also runs my own code and repeatedly takes
>> a
>> > > > rwlock (called g_rwlock) in shared mode and releases it, until a
>> shared
>> > > > object is marked not "busy" any more:
>> > > >
>> > > > rwlock(inp_lock);
>> > > > ....
>> > > > do { // thread is active waiting in the loop
>> > > >     rlock(g_rwlock);
>> > > >     o = find();
>> > > >     if ( o == NULL )
>> > > >         break;
>> > > >     busy = o.busy;
>> > > >     if (o != NULL && busy)
>> > > >         runlock(g_rwlock);
>> > > > } while ( busy );
>> > > >
>> > > > if ( o != NULL )
>> > > > {
>> > > >     // do something with o
>> > > >     ....
>> > > > }
>> > > > runlock(g_rwlock);
>> > > > ....
>> > > >
>> > > > 2) thread 2 wants to set the shared object as "ready". So it tries
>> to
>> > > take
>> > > > g_rwlock in exclusive mode and is blocked in
>> _rw_wlock_hard@kern_rwlock.c
>> > > :815
>> > > > "turnstile_wait(ts, rw_owner(rw), TS_EXCLUSIVE_QUEUE)" because
>> thread 1
>> > > has
>> > > > already taken it in shared mode:
>> > > >
>> > > > wlock(g_rwlock);
>> > > > o = find();
>> > > > if ( o != NULL )
>> > > >     o.busy = 1;
>> > > > wunlock(g_rwlock);
>> > > >
>> > > > // o is busy so work on it without any lock
>> > > > ....
>> > > >
>> > > > wlock(g_rwlock); // thread is blocked here
>> > > > o.busy = 0;
>> > > > maybe_delete(o);
>> > > > wunlock(g_rwlock);
>> > > >
>> > > > 3) thread 3 spins on the same inpcb rwlock than thread 1 in
>> > > > _rw_wlock_hard@kern_rwlock.c:721 "while ((struct
>> > > > thread*)RW_OWNER(rw->rw_lock) == owner && TD_IS_RUNNING(owner)) "
>> > > >
>> > > >
>> > > > My target machine has two cpus.
>> > > > Thread 1 is pinned to cpu 0.
>> > > > Thread 2 and Thread 3 are pinned to cpu 1.
>> > > > Thread 1 and Thread 2 have a priority of 28.
>> > > > Thread 3 has a priority of 127
>> > > >
>> > > > Now what seems to happen is that when thread 1 calls
>> runlock(g_rwlock),
>> > > it
>> > > > calls turnstile_broadcast@kern_rwlock.c:650, but thread 2 never
>> regains
>> > > > control because thread 3 is spinning on the inpcb rwlock. Also the
>> > > > condition TD_IS_RUNNING(owner) is always true because thread 1 is
>> active
>> > > > waiting in a loop. So the 3 threads deadlock.
>> > > > Note that if I compile the kernel without adaptive rwlocks it works
>> > > without
>> > > > any problem.
>> > > > A workaround is to add a call to "sched_relinquish(curthread)" in
>> thread
>> > > 1
>> > > > in the loop just after the call to runlock.
>> > >
>> > > It sounds like we are not forcing a preemption on CPU 1 in this case
>> via
>> > > sched_add().
>> > >
>> > > For SCHED_4BSD you could try the 'FULL_PREEMPTION' kernel option.
>> > > For ULE you can adjust 'preempt_thresh' on the fly, though I think the
>> > > default setting should actually still work.
>> > >
>> > > Can you use KTR or some such to determine if IPI_PREEMPT is being
>> sent by
>> > > CPU 0 to CPU 1 in this case?
>> > >
>> > > > I am also wondering about the code in _rw_runlock after
>> > > > "turnstile_broadcast(ts, queue)". Isn't the flag
>> RW_LOCK_WRITE_WAITERS
>> > > > definitely lost if the other thread which is blocked in
>> turnstile_wait
>> > > > never regains control ?
>> > >
>> > > All the write waiters are awakened by a broadcast (as opposed to a
>> signal
>> > > operation).  They are on the run queue, not on the turnstile queue
>> anymore,
>> > > so there aren't any write waiters left (the bit only tracks if there
>> are
>> > > waiters on the turnstile).
>> > >
>> > > --
>> > > John Baldwin
>> > >
>> >
>> > I Use ULE scheduler.
>> > Here's the KTR output using ktrdump on a vmcore after watchdog.
>> >
>> >   75447 ipi_selected: cpu: 1 ipi: fc
>> >   75446 stop_cpus() with 252 type
>> >   75445 ipi_cpu: cpu: 1 ipi: 2
>> >   75444 ipi_cpu: cpu: 1 ipi: 2
>> >   75443 ipi_cpu: cpu: 1 ipi: 2
>> >   75442 ipi_cpu: cpu: 1 ipi: 2
>> >   75441 ipi_cpu: cpu: 1 ipi: 2
>> >   ....
>> >   3862 ipi_cpu: cpu: 1 ipi: 2
>> >   3861 ipi_cpu: cpu: 1 ipi: 2
>> >   3860 ipi_cpu: cpu: 1 ipi: 2
>> >   3859 ipi_cpu: cpu: 1 ipi: 2
>> >   3858 ipi_cpu: cpu: 1 ipi: 2
>> >   3857 ipi_selected: cpu: 1 ipi: f3
>> >   3856 ipi_cpu: cpu: 1 ipi: 2
>> >   3855 ipi_cpu: cpu: 1 ipi: 2
>> >   3854 ipi_cpu: cpu: 1 ipi: 2
>> >   3853 ipi_selected: cpu: 0 ipi: f3
>> >   3852 ipi_cpu: cpu: 1 ipi: 2
>> >   3851 ipi_selected: cpu: 1 ipi: f3
>> >   3850 ipi_cpu: cpu: 1 ipi: 2
>> >   3849 ipi_cpu: cpu: 1 ipi: 2
>> >   3848 ipi_selected: cpu: 0 ipi: f3
>> >   3847 ipi_cpu: cpu: 1 ipi: 2
>> >   3846 ipi_cpu: cpu: 1 ipi: 2
>> >   3845 ipi_cpu: cpu: 1 ipi: 2
>> >   3844 ipi_cpu: cpu: 1 ipi: 2
>> >   3843 ipi_cpu: cpu: 1 ipi: 2
>> >   3842 ipi_cpu: cpu: 1 ipi: 2
>> >   3841 ipi_cpu: cpu: 1 ipi: 2
>> >   3840 ipi_cpu: cpu: 1 ipi: 2
>> >   3839 ipi_cpu: cpu: 1 ipi: 2
>> >   3838 ipi_cpu: cpu: 1 ipi: 2
>> >   3837 ipi_cpu: cpu: 1 ipi: 2
>> >   3836 ipi_cpu: cpu: 1 ipi: 2
>> >   3835 ipi_cpu: cpu: 0 ipi: 1
>> >   3834 ipi_cpu: cpu: 0 ipi: 1
>> >   3833 ipi_cpu: cpu: 0 ipi: 1
>> >   3832 ipi_cpu: cpu: 0 ipi: 1
>> >   3831 ipi_cpu: cpu: 0 ipi: 1
>> >   3830 ipi_cpu: cpu: 0 ipi: 1
>>
>> Unfortunately this has a lot of other noise.  Can you add some
>> traces specifically in sched_ule in tdq_notify to note that it
>> is deciding to notify a CPU due to scheduling a thread?
>>
>> --
>> John Baldwin
>>
>
> OK. Here's the patch I have used in tdq_notify on Freebsd 9.3:
>
> --- kern/sched_ule.c    2015-08-06 09:03:49.000000000 +0200
> +++ kern/sched_ule.c    2015-08-06 09:03:38.000000000 +0200
> @@ -1020,6 +1020,7 @@
>                         return;
>         }
>         tdq->tdq_ipipending = 1;
> +       CTR3(KTR_SMP, "%s: cpu: %d IPI_PREEMPT=%x", __func__, cpu,
> IPI_PREEMPT);
>         ipi_cpu(cpu, IPI_PREEMPT);
>  }
>
> Please find the output of ktrdump in attachment.
> The last IPI_PREEMPT sent is at index #9424. Hope it helps.
>
>
>