Skip site navigation (1)Skip section navigation (2)
Date:      Tue, 8 Aug 2017 10:51:35 +0200
From:      Ben RUBSON <ben.rubson@gmail.com>
To:        FreeBSD Net <freebsd-net@freebsd.org>
Cc:        jch <jch@FreeBSD.org>, hiren <hiren@strugglingcoder.info>, Slawa Olhovchenkov <slw@zxy.spb.ru>, FreeBSD Stable <freebsd-stable@freebsd.org>
Subject:   Re: mlx4en, timer irq @100%... (11.0 stuck on high network load ???)
Message-ID:  <18031E23-711F-4E74-9D59-15A3C1DB113F@gmail.com>
In-Reply-To: <c05c2b1c-b5a8-c39c-6dff-e6cc0d8642bf@selasky.org>
References:  <BF3A3E47-A726-49FB-B83F-666EFC5C0FF1@gmail.com> <c3fdf0bb-833d-915f-c246-9459819cd825@selasky.org> <4DF74CB8-23D2-4CCF-B699-5B86DAEA65E5@gmail.com> <40602CEA-D417-4E5B-8C68-916958D49A0B@gmail.com> <9c306f10-7c05-d28d-e551-a930603aaafa@selasky.org> <896dd782-cb2c-0259-65d1-b00daae452de@FreeBSD.org> <0DB9F6FF-8BC9-48F5-B359-AC1905B9EB06@gmail.com> <7f14c95d-1ef8-bf82-c469-e6566c3aba66@selasky.org> <76A5EE7E-1D2E-46B4-86F1-F219C3DCE6EA@gmail.com> <e6f9df1c-8b55-8a3b-9f44-e67c26561543@selasky.org> <4C91C6E5-0725-42E7-9813-1F3ACF3DDD6E@gmail.com> <5840c25e-7472-3276-6df9-1ed4183078ad@selasky.org> <2ADA8C57-2C2D-4F97-9F0B-82D53EDDC649@gmail.com> <061cdf72-6285-8239-5380-58d9d19a1ef7@selasky.org> <92BEE83D-498F-47D5-A53C-39DCDC00A0FD@gmail.com> <5d8960d8-e1ff-8719-320f-d3ae84054714@selasky.org> <6B4A35F7-5694-4945-9575-19ADB678F9FA@gmail.com> <297a784a-3d80-b1a6-652e-a78621fe5a8b@selasky.org> <3ECCFBF1-18D9-4E33-8F39-0C366C3BB8B4@gmail.com> <c05c2b1c-b5a8-c39c-6dff-e6cc0d8642bf@selasky.org>

next in thread | previous in thread | raw e-mail | index | archive | help

> On 08 Aug 2017, at 10:31, Hans Petter Selasky <hps@selasky.org> wrote:
>=20
> On 08/08/17 10:06, Ben RUBSON wrote:
>>> On 08 Aug 2017, at 10:02, Hans Petter Selasky <hps@selasky.org> =
wrote:
>>>=20
>>> On 08/08/17 10:00, Ben RUBSON wrote:
>>>> kgdb) print *twq_2msl.tqh_first
>>>> $2 =3D {
>>>>   tw_inpcb =3D 0xfffff8031c570740,
>>>=20
>>> print *twq_2msl.tqh_first->tw_inpcb
>> (kgdb) print *twq_2msl.tqh_first->tw_inpcb
>> $3 =3D {
>>   inp_hash =3D {
>>     le_next =3D 0x0,
>>     le_prev =3D 0xfffffe000f78adb8
>>   },
>>   inp_pcbgrouphash =3D {
>>     le_next =3D 0x0,
>>     le_prev =3D 0x0
>>   },
>>   inp_list =3D {
>>     le_next =3D 0xfffff80c2a07f570,
>>     le_prev =3D 0xffffffff81e15e20
>>   },
>>   inp_ppcb =3D 0xfffff80d1bf12210,
>>   inp_pcbinfo =3D 0xffffffff81e15e28,
>>   inp_pcbgroup =3D 0x0,
>>   inp_pcbgroup_wild =3D {
>>     le_next =3D 0x0,
>>     le_prev =3D 0x0
>>   },
>>   inp_socket =3D 0x0,
>>   inp_cred =3D 0xfffff804ae6ca400,
>>   inp_flow =3D 0,
>>   inp_flags =3D 92274688,
>>   inp_flags2 =3D 16,
>>   inp_vflag =3D 0 '\0',
>>   inp_ip_ttl =3D 64 '@',
>>   inp_ip_p =3D 0 '\0',
>>   inp_ip_minttl =3D 0 '\0',
>>   inp_flowid =3D 946611505,
>>   inp_refcount =3D 2,
>>   inp_pspare =3D 0xfffff8031c5707c0,
>>   inp_flowtype =3D 191,
>>   inp_rss_listen_bucket =3D 0,
>>   inp_ispare =3D 0xfffff8031c5707f0,
>>   inp_inc =3D {
>>     inc_flags =3D 0 '\0',
>>     inc_len =3D 0 '\0',
>>     inc_fibnum =3D 0,
>>     inc_ie =3D {
>>       ie_fport =3D 53987,
>>       ie_lport =3D 47873,
>>       ie_dependfaddr =3D {
>>         ie46_foreign =3D {
>>           ia46_pad32 =3D 0xfffff8031c570808,
>>           ia46_addr4 =3D {
>>             s_addr =3D 3011802202
>>           }
>>         },
>>         ie6_foreign =3D {
>>           __u6_addr =3D {
>>             __u6_addr8 =3D 0xfffff8031c570808 "",
>>             __u6_addr16 =3D 0xfffff8031c570808,
>>             __u6_addr32 =3D 0xfffff8031c570808
>>           }
>>         }
>>       },
>>       ie_dependladdr =3D {
>>         ie46_local =3D {
>>           ia46_pad32 =3D 0xfffff8031c570818,
>>           ia46_addr4 =3D {
>>             s_addr =3D 4068705883
>>           }
>>         },
>>         ie6_local =3D {
>>           __u6_addr =3D {
>>             __u6_addr8 =3D 0xfffff8031c570818 "",
>>             __u6_addr16 =3D 0xfffff8031c570818,
>>             __u6_addr32 =3D 0xfffff8031c570818
>>           }
>>         }
>>       },
>>       ie6_zoneid =3D 0
>>     }
>>   },
>>   inp_label =3D 0x0,
>>   inp_sp =3D 0x0,
>>   inp_depend4 =3D {
>>     inp4_ip_tos =3D 0 '\0',
>>     inp4_options =3D 0x0,
>>     inp4_moptions =3D 0x0
>>   },
>>   inp_depend6 =3D {
>>     inp6_options =3D 0x0,
>>     inp6_outputopts =3D 0x0,
>>     inp6_moptions =3D 0x0,
>>     inp6_icmp6filt =3D 0x0,
>>     inp6_cksum =3D 0,
>>     inp6_hops =3D 0
>>   },
>>   inp_portlist =3D {
>>     le_next =3D 0xfffff80274298ae0,
>>     le_prev =3D 0xfffff800454999b0
>>   },
>>   inp_phd =3D 0xfffff800454999a0,
>>   inp_gencnt =3D 2119756,
>>   inp_lle =3D 0x0,
>>   inp_lock =3D {
>>     lock_object =3D {
>>       lo_name =3D 0xffffffff814e6940 "tcpinp",
>>       lo_flags =3D 90898432,
>>       lo_data =3D 0,
>>       lo_witness =3D 0x0
>>     },
>>     rw_lock =3D 18446735277871559936
>>   },
>>   inp_rt_cookie =3D 10,
>>   inp_rtu =3D {
>>     inpu_route =3D {
>>       ro_rt =3D 0x0,
>>       ro_lle =3D 0x0,
>>       ro_prepend =3D 0x0,
>>       ro_plen =3D 0,
>>       ro_flags =3D 384,
>>       ro_mtu =3D 0,
>>       spare =3D 0,
>>       ro_dst =3D {
>>         sa_len =3D 16 '\020',
>>         sa_family =3D 2 '\002',
>>         sa_data =3D 0xfffff8031c5708f2 ""
>>       }
>>     },
>>     inpu_route6 =3D {
>>       ro_rt =3D 0x0,
>>       ro_lle =3D 0x0,
>>       ro_prepend =3D 0x0,
>>       ro_plen =3D 0,
>>       ro_flags =3D 384,
>>       ro_mtu =3D 0,
>>       spare =3D 0,
>>       ro_dst =3D {
>>         sin6_len =3D 16 '\020',
>>         sin6_family =3D 2 '\002',
>>         sin6_port =3D 0,
>>         sin6_flowinfo =3D 3011802202,
>>         sin6_addr =3D {
>>           __u6_addr =3D {
>>             __u6_addr8 =3D 0xfffff8031c5708f8 "",
>>             __u6_addr16 =3D 0xfffff8031c5708f8,
>>             __u6_addr32 =3D 0xfffff8031c5708f8
>>           }
>>         },
>>         sin6_scope_id =3D 0
>>       }
>>     }
>>   }
>> }
>> (kgdb)
>=20
> Hi,
>=20
> Here is the conclusion:
>=20
> The following code is going in an infinite loop:
>=20
>=20
>>        for (;;) {
>>                TW_RLOCK(V_tw_lock);
>>                tw =3D TAILQ_FIRST(&V_twq_2msl);
>>                if (tw =3D=3D NULL || (!reuse && (tw->tw_time - ticks) =
> 0)) {
>>                        TW_RUNLOCK(V_tw_lock);
>>                        break;
>>                }
>>                KASSERT(tw->tw_inpcb !=3D NULL, ("%s: tw->tw_inpcb =3D=3D=
 NULL",
>>                    __func__));
>>                inp =3D tw->tw_inpcb;
>>                in_pcbref(inp);
>>                TW_RUNLOCK(V_tw_lock);
>>                if (INP_INFO_TRY_RLOCK(&V_tcbinfo)) {
>>                        INP_WLOCK(inp);
>>                        tw =3D intotw(inp);
>>                        if (in_pcbrele_wlocked(inp)) {
>=20
> in_pcbrele_wlocked() returns (1) because INP_FREED (16) is set in =
inp->inp_flags2. I guess you have invariants disabled, because the =
KASSERT() below should have caused a panic.

You're right, INVARIANTS is not in GENERIC :

# grep -i INVARIANTS /usr/src/sys/amd64/conf/GENERIC
# grep -i INVARIANTS /usr/src/sys/amd64/conf/*
/usr/src/sys/amd64/conf/MINIMAL:options 	INVARIANTS		=
# Enable calls of extra sanity checking
/usr/src/sys/amd64/conf/MINIMAL:options 	INVARIANT_SUPPORT	=
# Extra sanity checks of internal structures, required by INVARIANTS

>>                                KASSERT(tw =3D=3D NULL, ("%s: held =
last inp "
>>                                    "reference but tw not NULL", =
__func__));
>>                                INP_INFO_RUNLOCK(&V_tcbinfo);
>>                                continue;
>>                        }
>=20
> This is a regression issue after:
>=20
>> commit 5630210a7f1dbbd903b77b2aef939cd47c63da58
>> Author: jch <jch@FreeBSD.org>
>> Date:   Thu Oct 30 08:53:56 2014 +0000
>>    Fix a race condition in TCP timewait between tcp_tw_2msl_reuse() =
and
>>    tcp_tw_2msl_scan().  This race condition drives unplanned timewait
>>    timeout cancellation.  Also simplify implementation by holding =
inpcb
>>    reference and removing tcptw reference counting.
>=20
> Suggested fix attached.

Thank you very much for your investigation HPS and for the patch !
Not sure however how to test it as I don't know a way to reproduce the =
issue.

Certainly your patch will reach 11.0 & 11.1 soon.

Thank you again, and glad to have helped to track this down.

Ben




Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?18031E23-711F-4E74-9D59-15A3C1DB113F>