Skip site navigation (1)Skip section navigation (2)
Date:      Wed, 10 Nov 2004 10:25:10 +0000 (GMT)
From:      Robert Watson <rwatson@freebsd.org>
To:        Zoltan Frombach <tssajo@hotmail.com>
Cc:        =?iso-8859-1?Q?S=F8ren_Schmidt?= <sos@DeepCore.dk>
Subject:   Re: Re: 5.3-RELEASE: WARNING - WRITE_DMA interrupt timout - what does it	mean?
Message-ID:  <Pine.NEB.3.96L.1041110102232.60848X-100000@fledge.watson.org>
In-Reply-To: <BAY2-DAV1tDEJmGzrLC0000b117@hotmail.com>

next in thread | previous in thread | raw e-mail | index | archive | help

On Wed, 10 Nov 2004, Zoltan Frombach wrote:

> I'm on FreeBSD 5.3-RELEASE. I get this when I try to patch the file:
<snip>
> Is your patch were produce for HEAD? Or what is wrong?

Indeed, where the contents of struct task have changed slightly to allow
for safe draining while shutting down a task.  Try this variation; the
timeout is also modified to intentionally be 5/100 of a second per Andrea
Campi's keen bug searchign eyes :-).

Index: kern/subr_taskqueue.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
RCS file: /home/ncvs/src/sys/kern/subr_taskqueue.c,v
retrieving revision 1.24
diff -u -r1.24 subr_taskqueue.c
--- kern/subr_taskqueue.c=098 Aug 2004 02:37:21 -0000=091.24
+++ kern/subr_taskqueue.c=0910 Nov 2004 10:22:41 -0000
@@ -36,9 +36,16 @@
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
+#include <sys/sysctl.h>
 #include <sys/taskqueue.h>
+#include <sys/time.h>
 #include <sys/unistd.h>
=20
+int tq_in;
+SYSCTL_INT(_kern, OID_AUTO, tq_in, CTLFLAG_RD, &tq_in, 0, "");
+int tq_out;
+SYSCTL_INT(_kern, OID_AUTO, tq_out, CTLFLAG_RD, &tq_out, 0, "");
+
 static MALLOC_DEFINE(M_TASKQUEUE, "taskqueue", "Task Queues");
 static void=09*taskqueue_giant_ih;
 static void=09*taskqueue_ih;
@@ -140,6 +147,9 @@
 =09=09return 0;
 =09}
=20
+=09getnanotime(&task->ta_queuetime);
+=09tq_in++;
+
 =09/*
 =09 * Optimise the case when all tasks have the same priority.
 =09 */
@@ -172,6 +182,7 @@
 taskqueue_run(struct taskqueue *queue)
 {
 =09struct task *task;
+=09struct timespec tv;
 =09int owned, pending;
=20
 =09owned =3D mtx_owned(&queue->tq_mutex);
@@ -186,8 +197,16 @@
 =09=09STAILQ_REMOVE_HEAD(&queue->tq_queue, ta_link);
 =09=09pending =3D task->ta_pending;
 =09=09task->ta_pending =3D 0;
+=09=09tq_out++;
 =09=09mtx_unlock(&queue->tq_mutex);
=20
+=09=09getnanotime(&tv);
+=09=09timespecsub(&tv, &task->ta_queuetime);
+=09=09if (tv.tv_nsec >=3D 50000000) {
+=09=09=09printf("taskqueue_run: warning, queue time of %d.%09ld "
+=09=09=09    "for context %p\n", tv.tv_sec, tv.tv_nsec,
+=09=09=09    task->ta_func);
+=09=09}
 =09=09task->ta_func(task->ta_context, pending);
=20
 =09=09mtx_lock(&queue->tq_mutex);
Index: sys/_task.h
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
RCS file: /home/ncvs/src/sys/sys/_task.h,v
retrieving revision 1.2
diff -u -r1.2 _task.h
--- sys/_task.h=0928 Jul 2004 09:12:54 -0000=091.2
+++ sys/_task.h=0910 Nov 2004 10:22:19 -0000
@@ -45,6 +45,7 @@
 =09int=09ta_priority;=09=09/* priority of task in queue */
 =09task_fn_t *ta_func;=09=09/* task handler */
 =09void=09*ta_context;=09=09/* argument for handler */
+=09struct timespec=09ta_queuetime;=09/* time enqueued */
 };
=20
 #endif /* !_SYS__TASK_H_ */



>=20
> Zoltan
>=20
> On Wed, 10 Nov 2004, S=F8ren Schmidt wrote:
>=20
> > Zoltan Frombach wrote:
> > > I just upgraded to 5.3-RELEASE a few days ago. This morning this line
> > > got into my system log file:
> > > Nov  9 06:14:03 www kernel: ad0: WARNING - WRITE_DMA interrupt was se=
en
> > > but timeout fired LBA=3D2491143
> > >
> > > I've never seen this message before. Can someone please explain what =
it
> > > means? With Thanks,
> >
> > It means that the disk has processed the write request (interrupt seen)=
,
> > but that the system (the bio_taskqueue) hasn't been able to get the
> > result returned to the kernel.
> >
> > Your disk is not involved in this problem since it has done its part,
> > but the rest of the system is either busy with something else, or there
> > are bugs lurking that prohibits the bio_taskqueue from running.
> >
> > Either way its a WARNING not a FAILURE :)
>=20
> I'm still a bit skeptical that the task queue is at fault -- I run my
> notebook with continuous measurement of the latency to schedule tasks,
> generating a warning for any latency > .5 seconds, and the only time I
> ever see that sort of latency is during the boot process when ACPI has
> scheduled a task to run, but the task queue thread has not yet been
> allowed to run:
>=20
> ipfw2 initialized, divert loadable, rule-based forwarding disabled,
> default to deny, logging disabled
> taskqueue_run: warning, queue time of 0.900011314 for context 0xc045638c
> ad0: 19077MB <FUJITSU MHS2020AT E/8307> [38760/16/63] at ata0-master
> UDMA33
>=20
> It would be quite interesting to have someone run with this timing code o=
n
> a system reporting the warning to see what they find.  What is the
> threshold to generate the ATA timeout warning?  My impression was that it
> was several seconds, which is quite a long time.  The attached patch
> checks for .5 second or above latency.  If the task queue is really
> getting stalled for several seconds, we should figure out what task it is=
=2E
> There's KTR tracing in the taskqueue code that we can use to dump a trace
> stream to see what task is taking so long.
>=20
> Robert N M Watson             FreeBSD Core Team, TrustedBSD Projects
> robert@fledge.watson.org      Principal Research Scientist, McAfee Resear=
ch
>=20
> Index: kern/subr_taskqueue.c
> =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
> RCS file: /home/ncvs/src/sys/kern/subr_taskqueue.c,v
> retrieving revision 1.25
> diff -u -r1.25 subr_taskqueue.c
> --- kern/subr_taskqueue.c 5 Oct 2004 04:16:00 -0000 1.25
> +++ kern/subr_taskqueue.c 23 Oct 2004 19:18:26 -0000
> @@ -36,9 +36,16 @@
>  #include <sys/lock.h>
>  #include <sys/malloc.h>
>  #include <sys/mutex.h>
> +#include <sys/sysctl.h>
>  #include <sys/taskqueue.h>
> +#include <sys/time.h>
>  #include <sys/unistd.h>
>=20
> +int tq_in;
> +SYSCTL_INT(_kern, OID_AUTO, tq_in, CTLFLAG_RD, &tq_in, 0, "");
> +int tq_out;
> +SYSCTL_INT(_kern, OID_AUTO, tq_out, CTLFLAG_RD, &tq_out, 0, "");
> +
>  static MALLOC_DEFINE(M_TASKQUEUE, "taskqueue", "Task Queues");
>  static void *taskqueue_giant_ih;
>  static void *taskqueue_ih;
> @@ -140,6 +147,9 @@
>   return 0;
>   }
>=20
> + getnanotime(&task->ta_queuetime);
> + tq_in++;
> +
>   /*
>   * Optimise the case when all tasks have the same priority.
>   */
> @@ -172,6 +182,7 @@
>  taskqueue_run(struct taskqueue *queue)
>  {
>   struct task *task;
> + struct timespec tv;
>   int owned, pending;
>=20
>   owned =3D mtx_owned(&queue->tq_mutex);
> @@ -187,8 +198,16 @@
>   pending =3D task->ta_pending;
>   task->ta_pending =3D 0;
>   task->ta_flags |=3D TAF_PENDING;
> + tq_out++;
>   mtx_unlock(&queue->tq_mutex);
>=20
> + getnanotime(&tv);
> + timespecsub(&tv, &task->ta_queuetime);
> + if (tv.tv_nsec >=3D 0500000000) {
> + printf("taskqueue_run: warning, queue time of %d.%09ld "
> +     "for context %p\n", tv.tv_sec, tv.tv_nsec,
> +     task->ta_func);
> + }
>   task->ta_func(task->ta_context, pending);
>=20
>   mtx_lock(&queue->tq_mutex);
> Index: sys/_task.h
> =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
> RCS file: /home/ncvs/src/sys/sys/_task.h,v
> retrieving revision 1.3
> diff -u -r1.3 _task.h
> --- sys/_task.h 5 Oct 2004 04:16:01 -0000 1.3
> +++ sys/_task.h 23 Oct 2004 19:38:16 -0000
> @@ -46,6 +46,7 @@
>   task_fn_t *ta_func; /* task handler */
>   void *ta_context; /* argument for handler */
>   int ta_flags; /* Flags */
> + struct timespec ta_queuetime; /* time enqueued */
>  };
>=20
>  #define TAF_PENDING 0x1 /* Task is being run now */=20
>=20



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?Pine.NEB.3.96L.1041110102232.60848X-100000>