Skip site navigation (1) | Skip section navigation (2)
Date:      Tue, 08 Jul 2014 09:14:53 -0400
From:      Bob Healey <healer@rpi.edu>
To:        freebsd-stable@freebsd.org
Subject:   Re: Interactions with mxge, pf, nfsd, and the kernel
Message-ID:  <53BBEECD.6000709@rpi.edu>
In-Reply-To: <53B42139.302@rpi.edu>
References:  <53B42139.302@rpi.edu>

next in thread | previous in thread | raw e-mail | index | archive | help
I've been running one of these machines without pf, and it has ceased 
responding on all interfaces (mxge and bce).  The console still works 
fine, and a reboot will clear the problems for now.  I'm running out of 
ideas.
root@helo:~ # netstat -i
Name    Mtu Network       Address              Ipkts Ierrs Idrop    
Opkts Oerrs  Coll
mxge0  9000 <Link#1>      00:60:dd:44:d2:07 44838061 164399     0 
31944144     0     0
mxge0  9000 fe80::260:ddf fe80::260:ddff:fe        0     - -        
3     -     -
bce0   1500 <Link#2>      08:9e:01:50:a3:08    97018 0     0        
0     0     0
bce0   1500 fe80::a9e:1ff fe80::a9e:1ff:fe5        0     - -        
3     -     -
bce1   1500 <Link#3>      08:9e:01:50:a3:09 889442915 1791     0 
557044449     0     0
bce1   1500 128.113.12.0  helo              888129846     -     - 
676300451     -     -
bce1   1500 fe80::a9e:1ff fe80::a9e:1ff:fe5        0     - -        
4     -     -
lo0   16384 <Link#4>                           28448 0     0    
28448     0     0
lo0   16384 localhost     ::1                     59     - -       
59     -     -
lo0   16384 fe80::1%lo0   fe80::1                  0     - -        
0     -     -
lo0   16384 your-net      localhost            28389     - -    
28389     -     -
vlan2  9000 <Link#5>      00:60:dd:44:d2:07 28107520 0     0 
19859118     0     0
vlan2  9000 10.2.3.0      helo.galactica.lo 28088754     -     - 
24433917     -     -
vlan2  9000 fe80::260:ddf fe80::260:ddff:fe        0     - -        
3     -     -
vlan2  9000 <Link#6>      00:60:dd:44:d2:07 16730541 0     0 
12084894     0     0
vlan2  9000 10.2.4.0      helo.enterprise.l 16724370     -     - 
12924742     -     -
vlan2  9000 fe80::260:ddf fe80::260:ddff:fe        0     - -        
3     -     -
root@helo:~ # netstat -m
7632/6798/14430 mbufs in use (current/cache/total)
4186/2886/7072/1018944 mbuf clusters in use (current/cache/total/max)
4080/1420 mbuf+clusters out of packet secondary zone in use (current/cache)
0/6/6/509472 4k (page size) jumbo clusters in use (current/cache/total/max)
593/25/618/150954 9k jumbo clusters in use (current/cache/total/max)
0/0/0/84912 16k jumbo clusters in use (current/cache/total/max)
15617K/7720K/23337K bytes allocated to network (current/cache/total)
3/72461/0 requests for mbufs denied (mbufs/clusters/mbuf+clusters)
0/0/0 requests for mbufs delayed (mbufs/clusters/mbuf+clusters)
0/0/0 requests for jumbo clusters delayed (4k/9k/16k)
122/391912/0 requests for jumbo clusters denied (4k/9k/16k)
0 requests for sfbufs denied
0 requests for sfbufs delayed
0 requests for I/O initiated by sendfile
root@helo:~ # uptime
  9:07AM  up 12 days,  8:15, 1 user, load averages: 0.19, 0.19, 0.20
root@helo:~ # ifconfig
mxge0: flags=8843<UP,BROADCAST,RUNNING,SIMPLEX,MULTICAST> metric 0 mtu 9000
options=6c03bb<RXCSUM,TXCSUM,VLAN_MTU,VLAN_HWTAGGING,JUMBO_MTU,VLAN_HWCSUM,TSO4,TSO6,VLAN_HWTSO,LINKSTATE,RXCSUM_IPV6,TXCSUM_IPV6>
         ether 00:60:dd:44:d2:07
         inet6 fe80::260:ddff:fe44:d207%mxge0 prefixlen 64 scopeid 0x1
         nd6 options=29<PERFORMNUD,IFDISABLED,AUTO_LINKLOCAL>
         media: Ethernet 10Gbase-CX4 <full-duplex>
         status: active
bce0: flags=8843<UP,BROADCAST,RUNNING,SIMPLEX,MULTICAST> metric 0 mtu 1500
options=c01bb<RXCSUM,TXCSUM,VLAN_MTU,VLAN_HWTAGGING,JUMBO_MTU,VLAN_HWCSUM,TSO4,VLAN_HWTSO,LINKSTATE>
         ether 08:9e:01:50:a3:08
         inet6 fe80::a9e:1ff:fe50:a308%bce0 prefixlen 64 scopeid 0x2
         nd6 options=29<PERFORMNUD,IFDISABLED,AUTO_LINKLOCAL>
         media: Ethernet autoselect (1000baseT <full-duplex>)
         status: active
bce1: flags=8843<UP,BROADCAST,RUNNING,SIMPLEX,MULTICAST> metric 0 mtu 1500
options=c01bb<RXCSUM,TXCSUM,VLAN_MTU,VLAN_HWTAGGING,JUMBO_MTU,VLAN_HWCSUM,TSO4,VLAN_HWTSO,LINKSTATE>
         ether 08:9e:01:50:a3:09
         inet 128.113.12.134 netmask 0xffffff00 broadcast 128.113.12.255
         inet6 fe80::a9e:1ff:fe50:a309%bce1 prefixlen 64 scopeid 0x3
         nd6 options=29<PERFORMNUD,IFDISABLED,AUTO_LINKLOCAL>
         media: Ethernet autoselect (1000baseT <full-duplex,master>)
         status: active
lo0: flags=8049<UP,LOOPBACK,RUNNING,MULTICAST> metric 0 mtu 16384
options=600003<RXCSUM,TXCSUM,RXCSUM_IPV6,TXCSUM_IPV6>
         inet6 ::1 prefixlen 128
         inet6 fe80::1%lo0 prefixlen 64 scopeid 0x4
         inet 127.0.0.1 netmask 0xff000000
         nd6 options=21<PERFORMNUD,AUTO_LINKLOCAL>
vlan23: flags=8843<UP,BROADCAST,RUNNING,SIMPLEX,MULTICAST> metric 0 mtu 9000
         options=303<RXCSUM,TXCSUM,TSO4,TSO6>
         ether 00:60:dd:44:d2:07
         inet 10.2.3.244 netmask 0xffffff00 broadcast 10.2.3.255
         inet6 fe80::260:ddff:fe44:d207%vlan23 prefixlen 64 scopeid 0x5
         nd6 options=29<PERFORMNUD,IFDISABLED,AUTO_LINKLOCAL>
         media: Ethernet 10Gbase-CX4 <full-duplex>
         status: active
         vlan: 23 parent interface: mxge0
vlan24: flags=8843<UP,BROADCAST,RUNNING,SIMPLEX,MULTICAST> metric 0 mtu 9000
         options=303<RXCSUM,TXCSUM,TSO4,TSO6>
         ether 00:60:dd:44:d2:07
         inet 10.2.4.244 netmask 0xffffff00 broadcast 10.2.4.255
         inet6 fe80::260:ddff:fe44:d207%vlan24 prefixlen 64 scopeid 0x6
         nd6 options=29<PERFORMNUD,IFDISABLED,AUTO_LINKLOCAL>
         media: Ethernet 10Gbase-CX4 <full-duplex>
         status: active
         vlan: 24 parent interface: mxge0
rc.conf:
hostname="helo.bio.rpi.edu"
ifconfig_bce1=" inet 128.113.12.134 netmask 0xffffff00"
ifconfig_mxge0="up mtu 9000"
ifconfig_bce0="up"
cloned_interfaces="vlan23 vlan24"
ifconfig_vlan23="inet 10.2.3.244 netmask 255.255.255.0 vlan 23 vlandev mxge0"
ifconfig_vlan24="inet 10.2.4.244 netmask 255.255.255.0 vlan 24 vlandev mxge0"
defaultrouter="128.113.12.254"
sshd_enable="YES"
ntpd_enable="YES"
powerd_enable="YES"
# Set dumpdev to "AUTO" to enable crash dumps, "NO" to disable
dumpdev="NO"
zfs_enable="YES"
nisdomainname="GALACTICA.BIO.RPI.EDU"
ntpdate_enable="YES"
ntpdate_hosts="ntp.rpi.edu"
rpc_lockd_enable="YES"
rpc_statd_enable="YES"
rpcbind_enable="YES"
nis_client_enable="YES"
nis_client_flags="-m -S GALACTICA.BIO.RPI.EDU,adama.galactica.local"
nfs_server_enable="YES"
mountd_enable="YES"
nfsd_enable="YES"
apcupsd_enable="YES"
#pf_enable="YES"
netwait_enable="YES"
netwait_ip="128.113.12.254"
netwait_if="mxge0"
static_routes="management"
route_management="-net 10.1.1.0/24 10.2.3.254"
amd_enable="YES"                 # Run amd service with $amd_flags (or NO).
amd_flags="-a /.amd_mnt -l syslog /home amd.home"
amd_map_program="NO"            # Can be set to "ypcat -k amd.master"
root@helo:~ # uname -a
FreeBSD helo.bio.rpi.edu 10.0-RELEASE-p4 FreeBSD 10.0-RELEASE-p4 #0: Tue Jun  3 13:14:57 UTC 2014 root@amd64-builder.daemonology.net:/usr/obj/usr/src/sys/GENERIC amd64

Bob Healey
Systems Administrator
Biocomputation and Bioinformatics Constellation
and Molecularium
healer@rpi.edu
(518) 276-4407

On 7/2/2014 11:11 AM, Bob Healey wrote:
> Hello.
>
> I've been wrestling with this on and off for a few months now.  I have 
> an assortment of systems (some Dell PowerEdge R515, R610, and IBM 
> x3630M3) with 10 gig Myricom ethernet cards acting as nfs servers to 
> Linux HPC compute clusters (12-36 nodes, 384 - 480 cores) connected 
> via gigabit ethernet.  They are also connected to the outside world 
> via onboard bce (Dell) or igb (IBM).  After a variable length of time, 
> I will lose all network access to a host. Connecting via console, the 
> machine tends to be fully responsive. A reboot clears the problem, but 
> I have yet to figure out any sysctls/loader.conf tunables to clear the 
> problem and make it stay away.  PF is in use to restrict access to the 
> host to a pair of public /24's, and to 10/8.  If there is a way in 
> zfs's sharenfs property to make that restriction, I'd be happy to 
> change, but I really don't like leaving nfs open to the university's 
> quartet of /16's, so PF it is.  The vlan2 interface has mxge0 as its 
> parent.
>
> Thanks for any help.
>
> This host is getting ready to crash soon, based on netstat.
> root@husker:~ # netstat -i
> Name    Mtu Network       Address              Ipkts Ierrs Idrop Opkts 
> Oerrs  Coll
> mxge0  9000 <Link#1>      00:60:dd:44:d2:0a  6358280   262 0  
> 4061637     0     0
> mxge0  9000 fe80::260:ddf fe80::260:ddff:fe        0     - -        
> 2     -     -
> bce0   1500 <Link#2>      08:9e:01:50:a1:ac   276391     0 0        
> 0     0     0
> bce0   1500 fe80::a9e:1ff fe80::a9e:1ff:fe5        0     - -        
> 3     -     -
> bce1   1500 <Link#3>      08:9e:01:50:a1:ad 2229709391 16921     0 
> 1182942116     0     0
> bce1   1500 128.113.12.0  husker            2226254093     -     - 
> 1183962005     -     -
> bce1   1500 fe80::a9e:1ff fe80::a9e:1ff:fe5        0     - -        
> 3     -     -
> lo0   16384 <Link#4>                            2030     0 0     
> 2030     0     0
> lo0   16384 localhost     ::1                      4     - -        
> 4     -     -
> lo0   16384 fe80::1%lo0   fe80::1                  0     - -        
> 0     -     -
> lo0   16384 your-net      localhost             2026     -     - 
> 2026     -     -
> vlan2  9000 <Link#5>      00:60:dd:44:d2:0a  4387250     0 0  
> 3060586     0     0
> vlan2  9000 10.2.3.0      husker.galactica.  4370309     -     - 
> 3963931     -     -
> vlan2  9000 fe80::260:ddf fe80::260:ddff:fe        0     - -        
> 2     -     -
> vlan2  9000 <Link#6>      00:60:dd:44:d2:0a  1971034     0 0  
> 1001061     0     0
> vlan2  9000 10.2.4.0      husker.enterprise  1700742     -     - 
> 1961891     -     -
> vlan2  9000 fe80::260:ddf fe80::260:ddff:fe        0     - -        
> 4     -     -
> root@husker:~ # netstat -im
> 6157/3233/9390 mbufs in use (current/cache/total)
> 4081/1883/5964/1018800 mbuf clusters in use (current/cache/total/max)
> 4080/795 mbuf+clusters out of packet secondary zone in use 
> (current/cache)
> 0/5/5/509399 4k (page size) jumbo clusters in use 
> (current/cache/total/max)
> 512/23/535/150933 9k jumbo clusters in use (current/cache/total/max)
> 0/0/0/84899 16k jumbo clusters in use (current/cache/total/max)
> 14309K/4801K/19110K bytes allocated to network (current/cache/total)
> 10/1883/0 requests for mbufs denied (mbufs/clusters/mbuf+clusters)
> 0/0/0 requests for mbufs delayed (mbufs/clusters/mbuf+clusters)
> 0/0/0 requests for jumbo clusters delayed (4k/9k/16k)
> 2/1736/0 requests for jumbo clusters denied (4k/9k/16k)
> 0 requests for sfbufs denied
> 0 requests for sfbufs delayed
> 0 requests for I/O initiated by sendfile
> root@husker:~ # uptime
> 11:07AM  up 23 days, 19:27, 1 user, load averages: 0.14, 0.17, 0.13
> root@husker:~ # sysctl -a | grep nmb
> kern.ipc.nmbclusters: 1018800
> kern.ipc.nmbjumbop: 509399
> kern.ipc.nmbjumbo9: 452799
> kern.ipc.nmbjumbo16: 339596
> kern.ipc.nmbufs: 6520320
> root@husker:~ # cat /boot/loader.conf
> zfs_load="YES"
> amdtemp_load="YES"
> if_mxge_load="YES"
> mxge_ethp_z8e_load="YES"
> mxge_eth_z8e_load="YES"
> mxge_rss_ethp_z8e_load="YES"
> mxge_rss_eth_z8e_load="YES"
> vfs.zfs.arc_max="12288M"
> root@husker:~ # cat /var/run/dmesg.boot | head -16
> Copyright (c) 1992-2014 The FreeBSD Project.
> Copyright (c) 1979, 1980, 1983, 1986, 1988, 1989, 1991, 1992, 1993, 1994
>         The Regents of the University of California. All rights reserved.
> FreeBSD is a registered trademark of The FreeBSD Foundation.
> FreeBSD 10.0-RELEASE-p4 #0: Tue Jun  3 13:14:57 UTC 2014
> root@amd64-builder.daemonology.net:/usr/obj/usr/src/sys/GENERIC amd64
> FreeBSD clang version 3.3 (tags/RELEASE_33/final 183502) 20130610
> CPU: AMD Opteron(tm) Processor 4122 (2200.07-MHz K8-class CPU)
>   Origin = "AuthenticAMD"  Id = 0x100f80  Family = 0x10  Model = 0x8  
> Stepping = 0
> Features=0x178bfbff<FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CLFLUSH,MMX,FXSR,SSE,SSE2,HTT> 
>
>   Features2=0x802009<SSE3,MON,CX16,POPCNT>
>   AMD 
> Features=0xee500800<SYSCALL,NX,MMX+,FFXSR,Page1GB,RDTSCP,LM,3DNow!+,3DNow!>
>   AMD 
> Features2=0x837ff<LAHF,CMP,SVM,ExtAPIC,CR8,ABM,SSE4A,MAS,Prefetch,OSVW,IBS,SKINIT,WDT,NodeId>
>   TSC: P-state invariant
> real memory  = 17179869184 (16384 MB)
> avail memory = 16588054528 (15819 MB)
>
>




Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?53BBEECD.6000709>