Skip site navigation (1)Skip section navigation (2)
Date:      Sun, 24 Jun 2012 17:30:11 GMT
From:      Fabian Keil <fk@fabiankeil.de>
To:        freebsd-gnats-submit@FreeBSD.org
Subject:   amd64/169379: [PATCH] dtrace's timestamp variable is unreliable when the cpu goes into C3 state
Message-ID:  <201206241730.q5OHUBNm048260@red.freebsd.org>
Resent-Message-ID: <201206241740.q5OHe6Hr086840@freefall.freebsd.org>

next in thread | raw e-mail | index | archive | help

>Number:         169379
>Category:       amd64
>Synopsis:       [PATCH] dtrace's timestamp variable is unreliable when the cpu goes into C3 state
>Confidential:   no
>Severity:       non-critical
>Priority:       low
>Responsible:    freebsd-amd64
>State:          open
>Quarter:        
>Keywords:       
>Date-Required:
>Class:          sw-bug
>Submitter-Id:   current-users
>Arrival-Date:   Sun Jun 24 17:40:06 UTC 2012
>Closed-Date:
>Last-Modified:
>Originator:     Fabian Keil
>Release:        HEAD
>Organization:
>Environment:
FreeBSD r500.local 10.0-CURRENT FreeBSD 10.0-CURRENT #444 r+46680b7: Sat Jun 23 18:58:48 CEST 2012     fk@r500.local:/usr/obj/usr/src/sys/ZOEY  amd64

>Description:
On my amd64 system dtrace's timestamp variable is unreliable
if the cpu is allowed to go into C3 state and the best
kern.timecounter.hardware available (HPET) is used.

The timestamp counter seems to tick about ten times slower than
expected, causing things that are timed with timestamp deltas to
appear ten times faster than they are.

This does not happen if TSC is used as timecounter,
but it's rated as -1000 and using it is undesirable.

Looking at the code, I suspect the same issue exists on i386
but I didn't test this.
>How-To-Repeat:
Set hw.acpi.cpu.cx_lowest to C3 and execute the dtrace
script below while calling date in a loop with:

while date; do sleep 1; done

I get averages around 90 ms instead of 1000 ms + overhead as expected.

-------

#!/usr/sbin/dtrace -s

#pragma D option quiet

/*
 * Repro script: measures the interval between successive date(1)
 * executions using dtrace's `timestamp` variable.  With a correct
 * timestamp the reported average should be ~1000 ms (the sleep in the
 * driver loop plus overhead); a much smaller value demonstrates the
 * C3-state timestamp bug described in this report.
 */
dtrace:::BEGIN
{
        /* Timestamp of the previously seen date execution; 0 = none yet. */
        cached_timestamp = (int64_t)0;
}

/*
 * Fires on every exec*() return for date(1).  The cached_timestamp
 * term in the predicate skips the very first execution, so a delta is
 * only computed once a previous timestamp exists.
 */
syscall::exec*:return
/(execname == "date") && cached_timestamp/
{
        /* Nanosecond delta converted to milliseconds. */
        self->elapsed = (timestamp - cached_timestamp) / 1000000;
        printf("ms since the last date execution: %d\n", self->elapsed);
        @elapsed_time["elapsed"] = lquantize(self->elapsed, 0, 1200, 10);
        @elapsed_avg["elapsed avg"] = avg(self->elapsed);
        self->elapsed = 0;
}

/*
 * Second clause for the same probe: D clauses fire in program order,
 * so this runs after the delta above has been taken, then records the
 * timestamp for the next iteration.  Do not reorder these clauses.
 */
syscall::exec*:return
/execname == "date"/
{
        cached_timestamp = timestamp;
        printf("Date execution detected at %d\n", cached_timestamp);
}

/* Stop the script after one minute of collection. */
tick-60sec
{
        exit(1);
}
>Fix:
The attached patch replaces the kernel timestamp code on
amd64 and i386 with the much simpler code already used on
mips.

This fixes the issue for me on amd64; I now reliably
get an average of 1007 ms, which seems reasonable to me.

Patch attached with submission follows:

>From 802f15e0c4a8ae946fc96370e484eabf2dfda091 Mon Sep 17 00:00:00 2001
From: Fabian Keil <fk@fabiankeil.de>
Date: Sun, 24 Jun 2012 15:50:28 +0200
Subject: [PATCH 2/2] dtrace: Fix timestamp unreliability on x86 when the cpu
 goes into C3 state

This replaces the code on amd64 and i386 with the much
simpler one used on mips.
---
 sys/cddl/dev/dtrace/amd64/dtrace_subr.c |   94 ++-----------------------------
 sys/cddl/dev/dtrace/i386/dtrace_subr.c  |   94 ++-----------------------------
 2 files changed, 8 insertions(+), 180 deletions(-)

diff --git a/sys/cddl/dev/dtrace/amd64/dtrace_subr.c b/sys/cddl/dev/dtrace/amd64/dtrace_subr.c
index 2d42ce2..0557f1e 100644
--- a/sys/cddl/dev/dtrace/amd64/dtrace_subr.c
+++ b/sys/cddl/dev/dtrace/amd64/dtrace_subr.c
@@ -355,82 +355,6 @@ dtrace_safe_defer_signal(void)
 }
 #endif
 
-static int64_t	tgt_cpu_tsc;
-static int64_t	hst_cpu_tsc;
-static int64_t	tsc_skew[MAXCPU];
-static uint64_t	nsec_scale;
-
-/* See below for the explanation of this macro. */
-#define SCALE_SHIFT	28
-
-static void
-dtrace_gethrtime_init_cpu(void *arg)
-{
-	uintptr_t cpu = (uintptr_t) arg;
-
-	if (cpu == curcpu)
-		tgt_cpu_tsc = rdtsc();
-	else
-		hst_cpu_tsc = rdtsc();
-}
-
-static void
-dtrace_gethrtime_init(void *arg)
-{
-	struct pcpu *pc;
-	uint64_t tsc_f;
-	cpuset_t map;
-	int i;
-
-	/*
-	 * Get TSC frequency known at this moment.
-	 * This should be constant if TSC is invariant.
-	 * Otherwise tick->time conversion will be inaccurate, but
-	 * will preserve monotonic property of TSC.
-	 */
-	tsc_f = atomic_load_acq_64(&tsc_freq);
-
-	/*
-	 * The following line checks that nsec_scale calculated below
-	 * doesn't overflow 32-bit unsigned integer, so that it can multiply
-	 * another 32-bit integer without overflowing 64-bit.
-	 * Thus minimum supported TSC frequency is 62.5MHz.
-	 */
-	KASSERT(tsc_f > (NANOSEC >> (32 - SCALE_SHIFT)), ("TSC frequency is too low"));
-
-	/*
-	 * We scale up NANOSEC/tsc_f ratio to preserve as much precision
-	 * as possible.
-	 * 2^28 factor was chosen quite arbitrarily from practical
-	 * considerations:
-	 * - it supports TSC frequencies as low as 62.5MHz (see above);
-	 * - it provides quite good precision (e < 0.01%) up to THz
-	 *   (terahertz) values;
-	 */
-	nsec_scale = ((uint64_t)NANOSEC << SCALE_SHIFT) / tsc_f;
-
-	/* The current CPU is the reference one. */
-	sched_pin();
-	tsc_skew[curcpu] = 0;
-	CPU_FOREACH(i) {
-		if (i == curcpu)
-			continue;
-
-		pc = pcpu_find(i);
-		CPU_SETOF(PCPU_GET(cpuid), &map);
-		CPU_SET(pc->pc_cpuid, &map);
-
-		smp_rendezvous_cpus(map, NULL,
-		    dtrace_gethrtime_init_cpu,
-		    smp_no_rendevous_barrier, (void *)(uintptr_t) i);
-
-		tsc_skew[i] = tgt_cpu_tsc - hst_cpu_tsc;
-	}
-	sched_unpin();
-}
-
-SYSINIT(dtrace_gethrtime_init, SI_SUB_SMP, SI_ORDER_ANY, dtrace_gethrtime_init, NULL);
-
 /*
  * DTrace needs a high resolution time function which can
  * be called from a probe context and guaranteed not to have
@@ -441,21 +365,11 @@ SYSINIT(dtrace_gethrtime_init, SI_SUB_SMP, SI_ORDER_ANY, dtrace_gethrtime_init,
 uint64_t
 dtrace_gethrtime()
 {
-	uint64_t tsc;
-	uint32_t lo;
-	uint32_t hi;
+	struct      timespec curtime;
 
-	/*
-	 * We split TSC value into lower and higher 32-bit halves and separately
-	 * scale them with nsec_scale, then we scale them down by 2^28
-	 * (see nsec_scale calculations) taking into account 32-bit shift of
-	 * the higher half and finally add.
-	 */
-	tsc = rdtsc() - tsc_skew[curcpu];
-	lo = tsc;
-	hi = tsc >> 32;
-	return (((lo * nsec_scale) >> SCALE_SHIFT) +
-	    ((hi * nsec_scale) << (32 - SCALE_SHIFT)));
+	nanouptime(&curtime);
+
+	return (curtime.tv_sec * 1000000000UL + curtime.tv_nsec);
 }
 
 uint64_t
diff --git a/sys/cddl/dev/dtrace/i386/dtrace_subr.c b/sys/cddl/dev/dtrace/i386/dtrace_subr.c
index 3a1677a..4b6ae24 100644
--- a/sys/cddl/dev/dtrace/i386/dtrace_subr.c
+++ b/sys/cddl/dev/dtrace/i386/dtrace_subr.c
@@ -356,82 +356,6 @@ dtrace_safe_defer_signal(void)
 }
 #endif
 
-static int64_t	tgt_cpu_tsc;
-static int64_t	hst_cpu_tsc;
-static int64_t	tsc_skew[MAXCPU];
-static uint64_t	nsec_scale;
-
-/* See below for the explanation of this macro. */
-#define SCALE_SHIFT	28
-
-static void
-dtrace_gethrtime_init_cpu(void *arg)
-{
-	uintptr_t cpu = (uintptr_t) arg;
-
-	if (cpu == curcpu)
-		tgt_cpu_tsc = rdtsc();
-	else
-		hst_cpu_tsc = rdtsc();
-}
-
-static void
-dtrace_gethrtime_init(void *arg)
-{
-	cpuset_t map;
-	struct pcpu *pc;
-	uint64_t tsc_f;
-	int i;
-
-	/*
-	 * Get TSC frequency known at this moment.
-	 * This should be constant if TSC is invariant.
-	 * Otherwise tick->time conversion will be inaccurate, but
-	 * will preserve monotonic property of TSC.
-	 */
-	tsc_f = atomic_load_acq_64(&tsc_freq);
-
-	/*
-	 * The following line checks that nsec_scale calculated below
-	 * doesn't overflow 32-bit unsigned integer, so that it can multiply
-	 * another 32-bit integer without overflowing 64-bit.
-	 * Thus minimum supported TSC frequency is 62.5MHz.
-	 */
-	KASSERT(tsc_f > (NANOSEC >> (32 - SCALE_SHIFT)), ("TSC frequency is too low"));
-
-	/*
-	 * We scale up NANOSEC/tsc_f ratio to preserve as much precision
-	 * as possible.
-	 * 2^28 factor was chosen quite arbitrarily from practical
-	 * considerations:
-	 * - it supports TSC frequencies as low as 62.5MHz (see above);
-	 * - it provides quite good precision (e < 0.01%) up to THz
-	 *   (terahertz) values;
-	 */
-	nsec_scale = ((uint64_t)NANOSEC << SCALE_SHIFT) / tsc_f;
-
-	/* The current CPU is the reference one. */
-	sched_pin();
-	tsc_skew[curcpu] = 0;
-	CPU_FOREACH(i) {
-		if (i == curcpu)
-			continue;
-
-		pc = pcpu_find(i);
-		CPU_SETOF(PCPU_GET(cpuid), &map);
-		CPU_SET(pc->pc_cpuid, &map);
-
-		smp_rendezvous_cpus(map, NULL,
-		    dtrace_gethrtime_init_cpu,
-		    smp_no_rendevous_barrier, (void *)(uintptr_t) i);
-
-		tsc_skew[i] = tgt_cpu_tsc - hst_cpu_tsc;
-	}
-	sched_unpin();
-}
-
-SYSINIT(dtrace_gethrtime_init, SI_SUB_SMP, SI_ORDER_ANY, dtrace_gethrtime_init, NULL);
-
 /*
  * DTrace needs a high resolution time function which can
  * be called from a probe context and guaranteed not to have
@@ -442,21 +366,11 @@ SYSINIT(dtrace_gethrtime_init, SI_SUB_SMP, SI_ORDER_ANY, dtrace_gethrtime_init,
 uint64_t
 dtrace_gethrtime()
 {
-	uint64_t tsc;
-	uint32_t lo;
-	uint32_t hi;
+	struct      timespec curtime;
 
-	/*
-	 * We split TSC value into lower and higher 32-bit halves and separately
-	 * scale them with nsec_scale, then we scale them down by 2^28
-	 * (see nsec_scale calculations) taking into account 32-bit shift of
-	 * the higher half and finally add.
-	 */
-	tsc = rdtsc() - tsc_skew[curcpu];
-	lo = tsc;
-	hi = tsc >> 32;
-	return (((lo * nsec_scale) >> SCALE_SHIFT) +
-	    ((hi * nsec_scale) << (32 - SCALE_SHIFT)));
+	nanouptime(&curtime);
+
+	return (curtime.tv_sec * 1000000000UL + curtime.tv_nsec);
 }
 
 uint64_t
-- 
1.7.10.3



>Release-Note:
>Audit-Trail:
>Unformatted:



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201206241730.q5OHUBNm048260>