Date:      Wed, 1 Apr 2015 14:34:57 -0700
From:      Jim Harris <jim.harris@gmail.com>
To:        Konstantin Belousov <kostikbel@gmail.com>
Cc:        "freebsd-hackers@freebsd.org" <freebsd-hackers@freebsd.org>, Tobias Oberstein <tobias.oberstein@gmail.com>, Michael Fuckner <michael@fuckner.net>, Alan Somers <asomers@freebsd.org>
Subject:   Re: NVMe performance 4x slower than expected
Message-ID:  <CAJP=Hc87FMYCrQYGfAtefQ8PLT3WtnvPfPSppp3zRF-0noQR9Q@mail.gmail.com>
In-Reply-To: <20150401212303.GB2379@kib.kiev.ua>
References:  <551BC57D.5070101@gmail.com> <CAOtMX2jVwMHSnQfphAF%2Ba2%2Bo7eLp62nHmUo4t%2BEahrXLWReaFQ@mail.gmail.com> <CAJP=Hc-RNVuhPePg7bnpmT4ByzyXs_CNvAs7Oy7ntXjqhZYhCQ@mail.gmail.com> <551C5A82.2090306@gmail.com> <20150401212303.GB2379@kib.kiev.ua>

On Wed, Apr 1, 2015 at 2:23 PM, Konstantin Belousov <kostikbel@gmail.com>
wrote:

> On Wed, Apr 01, 2015 at 10:52:18PM +0200, Tobias Oberstein wrote:
> > >     > FreeBSD 11 Current with patches (DMAR and ZFS patches; otherwise
> > >     > the box doesn't boot at all, because of the 3 TB RAM and the
> > >     > amount of peripherals).
> > >
> > >     Do you still have WITNESS and INVARIANTS turned on in your kernel
> > >     config?  They're turned on by default for Current, but they do have
> > >     some performance impact.  To turn them off, just build a
> > >     GENERIC-NODEBUG kernel.
> >
> > WITNESS is off, INVARIANTS is still on.
> INVARIANTS is costly.
>
> >
> > Here is the complete config:
> >
> > https://github.com/oberstet/scratchbox/blob/master/freebsd/cruncher/results/freebsd_kernel_conf.md
> >
> > This is the aggregated patch (the work was done by Konstantin - thanks
> > again btw!):
> >
> > https://github.com/oberstet/scratchbox/blob/master/freebsd/cruncher/results/freebsd_patch.md
> >
> > > Could you also post full dmesg output as well as vmstat -i?
> >
> > dmesg:
> >
> > https://github.com/oberstet/scratchbox/blob/master/freebsd/cruncher/results/freebsd_dmesg.md
> >
> > vmstat:
> >
> > https://github.com/oberstet/scratchbox/blob/master/freebsd/cruncher/results/freebsd_vmstat.md
> >
> > ===
> >
> > Here are the results from FIO under FreeBSD:
> >
> > https://github.com/oberstet/scratchbox/blob/master/freebsd/cruncher/results/freebsd.md
> >
> > Here are the results using the _same_ FIO control file under Linux:
> >
> > https://github.com/oberstet/scratchbox/blob/master/freebsd/cruncher/results/linux.md
>
> Is this vmstat output from after the test?
> It is somewhat funny that nvme does not use MSI(X).
>

Yes - this is exactly the problem.

nvme does use MSI-X if it can allocate the vectors (one per core). With 48
cores, I suspect we are quickly running out of vectors, so NVMe is reverting
to INTx.
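
For context, this fallback follows the usual FreeBSD PCI driver pattern - a
minimal sketch (illustrative only, not the actual nvme(4) code; the helper
name is made up):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/smp.h>

#include <dev/pci/pcivar.h>

/*
 * Request one MSI-X vector per core; tell the caller to revert to
 * INTx when that is not possible.
 */
static int
nvme_try_per_cpu_msix(device_t dev)
{
	int count;

	count = mp_ncpus;		/* want one vector per core */
	if (pci_msix_count(dev) < count)
		return (ENXIO);		/* device offers too few vectors */
	if (pci_alloc_msix(dev, &count) != 0)
		return (ENXIO);		/* e.g. no free interrupt slots */
	if (count < mp_ncpus) {
		/* Allocated fewer than requested: release and fall back. */
		pci_release_msi(dev);
		return (ENXIO);
	}
	return (0);			/* caller sets up per-core handlers */
}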

Could you send vmstat -ia output (I left off the 'a' previously), just so we
can see all allocated interrupt vectors?
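
For example, to pick out just the nvme entries (a sketch; the exact device
naming in the output may differ):

  # vmstat -ia | grep nvme

A controller that got MSI-X should show one irq line per vector; a controller
that reverted should show just a single INTx line.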

As an experiment, can you try disabling hyperthreading? This will reduce the
number of cores and should let you get MSI-X vectors allocated for at least
the first couple of NVMe controllers. Then please re-run your performance
test on one of those controllers.
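
If a BIOS visit is inconvenient, hyperthreading can also be disabled from the
loader - e.g. by adding this stock tunable to /boot/loader.conf and rebooting:

  machdep.hyperthreading_allowed="0"

That halves the logical core count, which should roughly halve the number of
vectors each controller asks for.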

sys/x86/x86/local_apic.c defines APIC_NUM_IOINTS as 191 - it looks like this
is the actual limit for MSI-X vectors, even though NUM_MSI_INTS is set to 512.
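
Back-of-the-envelope, to illustrate (my numbers): at one vector per core, each
controller wants 48 vectors, so two controllers already need 96, and four
would need 192 - past the 191 I/O interrupt slots before counting any other
device in the system.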

>
> I have had the following patch for a long time; it increased pps in
> iperf and similar tests when DMAR is enabled. In your case it could
> reduce the rate of DMAR interrupts.
>
> diff --git a/sys/x86/iommu/intel_ctx.c b/sys/x86/iommu/intel_ctx.c
> index a18adcf..b23a4c1 100644
> --- a/sys/x86/iommu/intel_ctx.c
> +++ b/sys/x86/iommu/intel_ctx.c
> @@ -586,6 +586,18 @@ dmar_ctx_unload_entry(struct dmar_map_entry *entry, bool free)
>         }
>  }
>
> +static struct dmar_qi_genseq *
> +dmar_ctx_unload_gseq(struct dmar_ctx *ctx, struct dmar_map_entry *entry,
> +    struct dmar_qi_genseq *gseq)
> +{
> +
> +       if (TAILQ_NEXT(entry, dmamap_link) != NULL)
> +               return (NULL);
> +       if (ctx->batch_no++ % dmar_batch_coalesce != 0)
> +               return (NULL);
> +       return (gseq);
> +}
> +
>  void
>  dmar_ctx_unload(struct dmar_ctx *ctx, struct dmar_map_entries_tailq *entries,
>      bool cansleep)
> @@ -619,8 +631,7 @@ dmar_ctx_unload(struct dmar_ctx *ctx, struct dmar_map_entries_tailq *entries,
>                 entry->gseq.gen = 0;
>                 entry->gseq.seq = 0;
>                 dmar_qi_invalidate_locked(ctx, entry->start, entry->end -
> -                   entry->start, TAILQ_NEXT(entry, dmamap_link) == NULL ?
> -                   &gseq : NULL);
> +                   entry->start, dmar_ctx_unload_gseq(ctx, entry, &gseq));
>         }
>         TAILQ_FOREACH_SAFE(entry, entries, dmamap_link, entry1) {
>                 entry->gseq = gseq;
> diff --git a/sys/x86/iommu/intel_dmar.h b/sys/x86/iommu/intel_dmar.h
> index 2865ab5..6e0ab7f 100644
> --- a/sys/x86/iommu/intel_dmar.h
> +++ b/sys/x86/iommu/intel_dmar.h
> @@ -93,6 +93,7 @@ struct dmar_ctx {
>         u_int entries_cnt;
>         u_long loads;
>         u_long unloads;
> +       u_int batch_no;
>         struct dmar_gas_entries_tree rb_root;
>         struct dmar_map_entries_tailq unload_entries; /* Entries to unload */
>         struct dmar_map_entry *first_place, *last_place;
> @@ -339,6 +340,7 @@ extern dmar_haddr_t dmar_high;
>  extern int haw;
>  extern int dmar_tbl_pagecnt;
>  extern int dmar_match_verbose;
> +extern int dmar_batch_coalesce;
>  extern int dmar_check_free;
>
>  static inline uint32_t
> diff --git a/sys/x86/iommu/intel_drv.c b/sys/x86/iommu/intel_drv.c
> index c239579..e7dc3f9 100644
> --- a/sys/x86/iommu/intel_drv.c
> +++ b/sys/x86/iommu/intel_drv.c
> @@ -153,7 +153,7 @@ dmar_count_iter(ACPI_DMAR_HEADER *dmarh, void *arg)
>         return (1);
>  }
>
> -static int dmar_enable = 0;
> +static int dmar_enable = 1;
>  static void
>  dmar_identify(driver_t *driver, device_t parent)
>  {
> diff --git a/sys/x86/iommu/intel_utils.c b/sys/x86/iommu/intel_utils.c
> index f696f9d..d3c3267 100644
> --- a/sys/x86/iommu/intel_utils.c
> +++ b/sys/x86/iommu/intel_utils.c
> @@ -624,6 +624,7 @@ dmar_barrier_exit(struct dmar_unit *dmar, u_int barrier_id)
>  }
>
>  int dmar_match_verbose;
> +int dmar_batch_coalesce = 100;
>
>  static SYSCTL_NODE(_hw, OID_AUTO, dmar, CTLFLAG_RD, NULL, "");
>  SYSCTL_INT(_hw_dmar, OID_AUTO, tbl_pagecnt, CTLFLAG_RD,
> @@ -632,6 +633,9 @@ SYSCTL_INT(_hw_dmar, OID_AUTO, tbl_pagecnt, CTLFLAG_RD,
>  SYSCTL_INT(_hw_dmar, OID_AUTO, match_verbose, CTLFLAG_RWTUN,
>      &dmar_match_verbose, 0,
>      "Verbose matching of the PCI devices to DMAR paths");
> +SYSCTL_INT(_hw_dmar, OID_AUTO, batch_coalesce, CTLFLAG_RW | CTLFLAG_TUN,
> +    &dmar_batch_coalesce, 0,
> +    "Number of qi batches between interrupt");
>  #ifdef INVARIANTS
>  int dmar_check_free;
>  SYSCTL_INT(_hw_dmar, OID_AUTO, check_free, CTLFLAG_RWTUN,
>
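
(For completeness: with the patch above applied, the new knob is a plain
read-write sysctl, so the coalescing factor can be experimented with at
runtime, e.g.:

  # sysctl hw.dmar.batch_coalesce=200

The name follows the SYSCTL_INT(_hw_dmar, OID_AUTO, batch_coalesce, ...)
declaration in the patch; 200 is just an example value.)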
