Skip site navigation (1)Skip section navigation (2)
Date:      Thu, 22 Aug 2002 15:10:11 -0700 (PDT)
From:      Nate Lawson <nate@root.org>
To:        freebsd-bugs@FreeBSD.org
Subject:   Re: kern/37043: Latest stable causes SCSI bus freeze on sym0 when running SMP
Message-ID:  <200208222210.g7MMABwT084798@freefall.freebsd.org>

next in thread | raw e-mail | index | archive | help
The following reply was made to PR kern/37043; it has been noted by GNATS.

From: Nate Lawson <nate@root.org>
To: freebsd-gnats-submit@freebsd.org
Cc:  
Subject: Re: kern/37043: Latest stable causes SCSI bus freeze on sym0 when
 running SMP
Date: Thu, 22 Aug 2002 15:06:14 -0700 (PDT)

 Info provided by Gerard.
 
 ---------- Forwarded message ----------
 Date: Mon, 19 Aug 2002 01:18:22 +0200 (CEST)
 From: "[ISO-8859-1] Gérard Roudier" <groudier@free.fr>
 To: Pete French <pfrench@firstcallgroup.co.uk>
 Cc: freebsd-bugs@FreeBSD.ORG, njl@FreeBSD.ORG, webadmin@firstcallgroup.co.uk
 Subject: Re: kern/37043: Latest stable causes SCSI bus freeze on sym0 when
     running SMP
 
 
 On Sat, 17 Aug 2002, Pete French wrote:
 > > Synopsis: Latest stable causes SCSI bus freeze on sym0 when running SMP
 > > State-Changed-From-To: feedback->closed
 > > State-Changed-By: njl
 > > State-Changed-When: Fri Aug 16 13:14:57 PDT 2002
 > > State-Changed-Why:
 >
 > > Workaround is to not share interrupts between ATA and SCSI controllers.
 > > This is not a complete fix so we should revisit this if others have the
 > > same trouble in the future.
 >
 > It's not specifically ATA controllers - everyone else who had the problem
 > was sharing interrupts with Ethernet adapters as I recall. But the fix
 > does work.
 
 It is indeed not a fix, but a last-chance workaround.
 (The patch against sym is at the end of this email)
 
 Basically, the code tries to detect an interrupt stall and if such seems
 to happen, it installs the work-around that just polls the interrupt
 status of the chip 100 times per second.
 
 Btw, the ncr had this just hardcoded since day one, but I disliked it for
 the reason it can hide severe hardware or software flaws.
 
 As PCI interrupt triggering relies on level sensitive logic, an interrupt
 stall should never happen. The risk is rather an interrupt storm if any
 interrupt condition is not properly handled by software.
 
 IMO, if an interrupt stall happens in PCI, then the cause can be either a
 flawed/misconfigured piece of hardware that doesn't implement the correct
 triggering or a software bug that leaves the interrupt masked somewhere.
 
 (IIRC, the problem didn't show up with IO/APIC but only happened using
 the legacy interrupt controller.)
 
 Maybe users that get their system fixed by this work-around in sym
 should also report a description of their system hardware and software.
 This may help find out where the flaw actually is.
 
 
 Regards,
   Gérard.
 
 PS: The first line of the patch, i.e.:
 
 +#define SYM_CONF_HANDLE_INTR_STALL
 
 should be removed if it turns out to be worthwhile to commit this
 code, so that the workaround can be compiled conditionally.
 
 --------------------- PATCH --------------------------
 
 --- sym_hipd.c.orig	Sun Jun  9 18:37:50 2002
 +++ sym_hipd.c	Sun Jun  9 16:36:07 2002
 @@ -1,3 +1,7 @@
 +#define SYM_CONF_HANDLE_INTR_STALL
 +#if 0
 +#define DEBUG_INTR_STALL
 +#endif
  /*
   *  Device driver optimized for the Symbios/LSI 53C896/53C895A/53C1010
   *  PCI-SCSI controllers.
 @@ -1922,6 +1926,17 @@
  	struct sym_tblmove abrt_tbl;	/* Table for the MOV of it 	*/
  	struct sym_tblsel  abrt_sel;	/* Sync params for selection	*/
  	u_char		istat_sem;	/* Tells the chip to stop (SEM)	*/
 +
 +#ifdef SYM_CONF_HANDLE_INTR_STALL
 +	int stall_state;	/* State of the algorithm */
 +	int stall_count;	/* Number of intr stall observed */
 +	u_long intr_count;	/* Real interrupt counter */
 +	u_long intr_prevc;	/* Previous counter seen from clock hanlder */
 +	u_long clock_curr;	/* Our clock in ticks */
 +	u_long clock_stall;	/* Clock value at a possible stall */
 +	struct callout_handle clock_ch;/* Kernel timer alchemy :) */
 +#define SYM_CLOCK_TICK	((hz+99)/100)
 +#endif
  };
 
  #define HCB_BA(np, lbl)	    (np->hcb_ba      + offsetof(struct sym_hcb, lbl))
 @@ -2513,6 +2528,10 @@
  static void sym_nvram_setup_target (hcb_p np, int targ, struct sym_nvram *nvp);
  static int sym_read_nvram (hcb_p np, struct sym_nvram *nvp);
 
 +#ifdef SYM_CONF_HANDLE_INTR_STALL
 +static void sym_clock_handler(void *arg);
 +#endif
 +
  /*
   *  Print something which allows to retrieve the controler type,
   *  unit, target, lun concerned by a kernel message.
 @@ -4216,6 +4235,9 @@
  static void sym_intr(void *arg)
  {
  	if (DEBUG_FLAGS & DEBUG_TINY) printf ("[");
 +#ifdef SYM_CONF_HANDLE_INTR_STALL
 +	++((hcb_p)arg)->intr_count;
 +#endif
  	sym_intr1((hcb_p) arg);
  	if (DEBUG_FLAGS & DEBUG_TINY) printf ("]");
  	return;
 @@ -9509,6 +9531,13 @@
  		goto attach_failed;
 
  	/*
 +	 * No comments for this one. :)
 +	 */
 +#ifdef SYM_CONF_HANDLE_INTR_STALL
 +	np->clock_ch = timeout(sym_clock_handler, (caddr_t)np, SYM_CLOCK_TICK);
 +#endif
 +
 +	/*
  	 *  Sigh! we are done.
  	 */
  	return 0;
 @@ -10410,3 +10439,91 @@
  }
 
  #endif	/* SYM_CONF_NVRAM_SUPPORT */
 +
 +#ifdef SYM_CONF_HANDLE_INTR_STALL
 +/*
 + * The below code tries to detect interrupt stalls.
 + *
 + * It assumes that an interrupt condition raised
 + * in the chip interrupt status that is not serviced
 + * for 0.2 second is a possible stall.
 + *
 + * If such happens 5 times, it installs a work-around
 + * that forces interrupt service each time an interrupt
 + * condition is present in the chip interrupt status.
 + */
 +
 +static void sym_clock_handler(void *arg)
 +{
 +	int s;
 +	hcb_p np;
 +	u_char istat;
 +	int intr_prevc;
 +
 +	np = arg;
 +	if (!np)
 +		return;
 +
 +	s = splcam();
 +
 +	/*
 +	 * Update our clock and interrupt counter copy.
 +	 */
 +	intr_prevc = np->intr_prevc;
 +	np->intr_prevc = np->intr_count;
 +	np->clock_curr += SYM_CLOCK_TICK;
 +
 +	/*
 +	 * Read the chip interrupt status.
 +	 */
 +	istat = INB (nc_istat) & (INTF|SIP|DIP);
 +
 +	/*
 +	 * Try to detect interrupt stalls.
 +	 */
 +	switch(np->stall_state) {
 +	default:
 +	case 0:	/* Wait for the first unserviced interrupt condition */
 +		np->stall_count = 0;
 +
 +	case 2:	/* Wait for subsequent ones */
 +		if (istat) {
 +			np->clock_stall = np->clock_curr;
 +			np->stall_state = 1;
 +		}
 +		break;
 +
 +	case 1:	/* Detect a possible interrupt stall */
 +#ifndef DEBUG_INTR_STALL
 +		if (intr_prevc != np->intr_count || !istat) {
 +			np->stall_state = 2;
 +			break;
 +		}
 +#endif
 +		if (((int)(np->clock_curr - np->clock_stall)) < (hz+4)/5)
 +			break;
 +
 +		++np->stall_count;
 +		if (np->stall_count < 5) {
 +			np->stall_state = 2;
 +			printf("%s: interrupt stall, forcing service.\n",
 +			       sym_name(np));
 +		}
 +		else {
 +			np->stall_state = 3;
 +			printf("%s: interrupt stall, installing workaround.\n",
 +			       sym_name(np));
 +		}
 +		sym_intr1(np);
 +		break;
 +
 +	case 3:	/* Force service if interrupt condition is pending */
 +		if (istat)
 +			sym_intr1(np);
 +		break;
 +	}
 +
 +	np->clock_ch = timeout(sym_clock_handler, (caddr_t)np, SYM_CLOCK_TICK);
 +	splx(s);
 +}
 +#endif /* SYM_CONF_HANDLE_INTR_STALL */
 

To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-bugs" in the body of the message




Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200208222210.g7MMABwT084798>