From owner-svn-src-all@FreeBSD.ORG Sat Sep 29 16:47:57 2012 Return-Path: Delivered-To: svn-src-all@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [69.147.83.52]) by hub.freebsd.org (Postfix) with ESMTP id A4DE71065670; Sat, 29 Sep 2012 16:47:57 +0000 (UTC) (envelope-from ae@FreeBSD.org) Received: from svn.freebsd.org (svn.freebsd.org [IPv6:2001:4f8:fff6::2c]) by mx1.freebsd.org (Postfix) with ESMTP id 5EA538FC12; Sat, 29 Sep 2012 16:47:57 +0000 (UTC) Received: from svn.freebsd.org (localhost [127.0.0.1]) by svn.freebsd.org (8.14.4/8.14.4) with ESMTP id q8TGlvom058843; Sat, 29 Sep 2012 16:47:57 GMT (envelope-from ae@svn.freebsd.org) Received: (from ae@localhost) by svn.freebsd.org (8.14.4/8.14.4/Submit) id q8TGlvbr058837; Sat, 29 Sep 2012 16:47:57 GMT (envelope-from ae@svn.freebsd.org) Message-Id: <201209291647.q8TGlvbr058837@svn.freebsd.org> From: "Andrey V. Elsukov" Date: Sat, 29 Sep 2012 16:47:57 +0000 (UTC) To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org X-SVN-Group: head MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cc: Subject: svn commit: r241053 - in head/sys/boot: common i386/libi386 uboot/lib userboot/userboot X-BeenThere: svn-src-all@freebsd.org X-Mailman-Version: 2.1.5 Precedence: list List-Id: "SVN commit messages for the entire src tree \(except for " user" and " projects" \)" List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Sat, 29 Sep 2012 16:47:57 -0000 Author: ae Date: Sat Sep 29 16:47:56 2012 New Revision: 241053 URL: http://svn.freebsd.org/changeset/base/241053 Log: Almost each time when loader opens a file, this leads to calling disk_open(). Very often this is called several times for one file. This leads to reading partition table metadata for each call. To reduce the number of disk I/O we have a simple block cache, but it is very dumb and more than half of I/O operations related to reading metadata, misses this cache. Introduce new cache layer to resolve this problem. It is independent and doesn't need initialization like bcache, and will work by default for all loaders which use the new DISK API. A successful disk_open() call to each new disk or partition produces new entry in the cache. Even more, when disk was already open, now opening of any nested partitions does not require reading top level partition table. So, if without this cache, partition table metadata was read around 20-50 times during boot, now it reads only once. This affects the booting from GPT and MBR from the UFS. Modified: head/sys/boot/common/disk.c head/sys/boot/common/disk.h head/sys/boot/i386/libi386/biosdisk.c head/sys/boot/uboot/lib/disk.c head/sys/boot/userboot/userboot/userboot_disk.c Modified: head/sys/boot/common/disk.c ============================================================================== --- head/sys/boot/common/disk.c Sat Sep 29 16:42:01 2012 (r241052) +++ head/sys/boot/common/disk.c Sat Sep 29 16:47:56 2012 (r241053) @@ -29,6 +29,7 @@ __FBSDID("$FreeBSD$"); #include +#include #include #include #include @@ -46,6 +47,7 @@ struct open_disk { struct ptable *table; off_t mediasize; u_int sectorsize; + int rcnt; }; struct print_args { @@ -54,6 +56,96 @@ struct print_args { int verbose; }; +struct dentry { + const struct devsw *d_dev; + int d_unit; + int d_slice; + int d_partition; + + struct open_disk *od; + off_t d_offset; + STAILQ_ENTRY(dentry) entry; +#ifdef DISK_DEBUG + uint32_t count; +#endif +}; + +static STAILQ_HEAD(, dentry) opened_disks = + STAILQ_HEAD_INITIALIZER(opened_disks); + +static int +disk_lookup(struct disk_devdesc *dev) +{ + struct dentry *entry; + int rc; + + rc = ENOENT; + STAILQ_FOREACH(entry, &opened_disks, entry) { + if (entry->d_dev != dev->d_dev || + entry->d_unit != dev->d_unit) + continue; + dev->d_opendata = entry->od; + if (entry->d_slice == dev->d_slice && + entry->d_partition == dev->d_partition) { + dev->d_offset = entry->d_offset; + DEBUG("%s offset %lld", disk_fmtdev(dev), + dev->d_offset); +#ifdef DISK_DEBUG + entry->count++; +#endif + return (0); + } + rc = EAGAIN; + } + return (rc); +} + +static void +disk_insert(struct disk_devdesc *dev) +{ + struct dentry *entry; + + entry = (struct dentry *)malloc(sizeof(struct dentry)); + if (entry == NULL) { + DEBUG("no memory"); + return; + } + entry->d_dev = dev->d_dev; + entry->d_unit = dev->d_unit; + entry->d_slice = dev->d_slice; + entry->d_partition = dev->d_partition; + entry->od = (struct open_disk *)dev->d_opendata; + entry->od->rcnt++; + entry->d_offset = dev->d_offset; +#ifdef DISK_DEBUG + entry->count = 1; +#endif + STAILQ_INSERT_TAIL(&opened_disks, entry, entry); + DEBUG("%s cached", disk_fmtdev(dev)); +} + +#ifdef DISK_DEBUG +COMMAND_SET(dcachestat, "dcachestat", "get disk cache stats", + command_dcachestat); + +static int +command_dcachestat(int argc, char *argv[]) +{ + struct disk_devdesc dev; + struct dentry *entry; + + STAILQ_FOREACH(entry, &opened_disks, entry) { + dev.d_dev = (struct devsw *)entry->d_dev; + dev.d_unit = entry->d_unit; + dev.d_slice = entry->d_slice; + dev.d_partition = entry->d_partition; + printf("%s %d => %p [%d]\n", disk_fmtdev(&dev), entry->count, + entry->od, entry->od->rcnt); + } + return (CMD_OK); +} +#endif /* DISK_DEBUG */ + /* Convert size to a human-readable number. */ static char * display_size(uint64_t size, u_int sectorsize) @@ -145,25 +237,43 @@ disk_open(struct disk_devdesc *dev, off_ struct open_disk *od; struct ptable *table; struct ptable_entry part; - int rc; + int rc, slice, partition; - od = (struct open_disk *)malloc(sizeof(struct open_disk)); - if (od == NULL) { - DEBUG("no memory"); - return (ENOMEM); - } + rc = disk_lookup(dev); + if (rc == 0) + return (0); /* * While we are reading disk metadata, make sure we do it relative * to the start of the disk */ - rc = 0; - table = NULL; dev->d_offset = 0; + table = NULL; + slice = dev->d_slice; + partition = dev->d_partition; + if (rc == EAGAIN) { + /* + * This entire disk was already opened and there is no + * need to allocate new open_disk structure and open the + * main partition table. + */ + od = (struct open_disk *)dev->d_opendata; + DEBUG("%s unit %d, slice %d, partition %d => %p (cached)", + disk_fmtdev(dev), dev->d_unit, dev->d_slice, + dev->d_partition, od); + goto opened; + } else { + od = (struct open_disk *)malloc(sizeof(struct open_disk)); + if (od == NULL) { + DEBUG("no memory"); + return (ENOMEM); + } + } dev->d_opendata = od; od->mediasize = mediasize; od->sectorsize = sectorsize; - DEBUG("%s unit %d, slice %d, partition %d", - disk_fmtdev(dev), dev->d_unit, dev->d_slice, dev->d_partition); + od->rcnt = 0; + DEBUG("%s unit %d, slice %d, partition %d => %p", + disk_fmtdev(dev), dev->d_unit, dev->d_slice, dev->d_partition, od); /* Determine disk layout. */ od->table = ptable_open(dev, mediasize / sectorsize, sectorsize, @@ -173,35 +283,34 @@ disk_open(struct disk_devdesc *dev, off_ rc = ENXIO; goto out; } +opened: + rc = 0; if (ptable_gettype(od->table) == PTABLE_BSD && - dev->d_partition >= 0) { + partition >= 0) { /* It doesn't matter what value has d_slice */ - rc = ptable_getpart(od->table, &part, dev->d_partition); + rc = ptable_getpart(od->table, &part, partition); if (rc == 0) dev->d_offset = part.start; - } else if (dev->d_slice >= 0) { + } else if (slice >= 0) { /* Try to get information about partition */ - if (dev->d_slice == 0) + if (slice == 0) rc = ptable_getbestpart(od->table, &part); else - rc = ptable_getpart(od->table, &part, dev->d_slice); + rc = ptable_getpart(od->table, &part, slice); if (rc != 0) /* Partition doesn't exist */ goto out; dev->d_offset = part.start; - if (dev->d_slice == 0) { - /* Save the slice number of best partition to dev */ - dev->d_slice = part.index; - if (ptable_gettype(od->table) == PTABLE_GPT) - dev->d_partition = 255; - } - if (dev->d_partition == 255) + slice = part.index; + if (ptable_gettype(od->table) == PTABLE_GPT) { + partition = 255; goto out; /* Nothing more to do */ + } /* * If d_partition < 0 and we are looking at a BSD slice, * then try to read BSD label, otherwise return the * whole MBR slice. */ - if (dev->d_partition == -1 && + if (partition == -1 && part.type != PART_FREEBSD) goto out; /* Try to read BSD label */ @@ -217,12 +326,12 @@ disk_open(struct disk_devdesc *dev, off_ * assume the 'a' partition. Otherwise just return the * whole MBR slice, because it can contain ZFS. */ - if (dev->d_partition < 0) { + if (partition < 0) { if (ptable_gettype(table) != PTABLE_BSD) goto out; - dev->d_partition = 0; + partition = 0; } - rc = ptable_getpart(table, &part, dev->d_partition); + rc = ptable_getpart(table, &part, partition); if (rc != 0) goto out; dev->d_offset += part.start; @@ -232,12 +341,19 @@ out: ptable_close(table); if (rc != 0) { - if (od->table != NULL) - ptable_close(od->table); - free(od); + if (od->rcnt < 1) { + if (od->table != NULL) + ptable_close(od->table); + free(od); + } DEBUG("%s could not open", disk_fmtdev(dev)); } else { - DEBUG("%s offset %lld", disk_fmtdev(dev), dev->d_offset); + disk_insert(dev); + /* Save the slice and partition number to the dev */ + dev->d_slice = slice; + dev->d_partition = partition; + DEBUG("%s offset %lld => %p", disk_fmtdev(dev), + dev->d_offset, od); } return (rc); } @@ -245,15 +361,43 @@ out: int disk_close(struct disk_devdesc *dev) { +#if DISK_DEBUG struct open_disk *od; od = (struct open_disk *)dev->d_opendata; - DEBUG("%s closed", disk_fmtdev(dev)); - ptable_close(od->table); - free(od); + DEBUG("%s closed => %p [%d]", disk_fmtdev(dev), od, od->rcnt); +#endif return (0); } +void +disk_cleanup(const struct devsw *d_dev) +{ + struct disk_devdesc dev; + struct dentry *entry, *tmp; + + STAILQ_FOREACH_SAFE(entry, &opened_disks, entry, tmp) { + if (entry->d_dev != d_dev) + continue; + entry->od->rcnt--; +#ifdef DISK_DEBUG + dev.d_dev = (struct devsw *)entry->d_dev; + dev.d_unit = entry->d_unit; + dev.d_slice = entry->d_slice; + dev.d_partition = entry->d_partition; + STAILQ_REMOVE(&opened_disks, entry, dentry, entry); + DEBUG("%s was freed => %p [%d]", disk_fmtdev(&dev), + entry->od, entry->od->rcnt); +#endif + if (entry->od->rcnt < 1) { + if (entry->od->table != NULL) + ptable_close(entry->od->table); + free(entry->od); + } + free(entry); + } +} + char* disk_fmtdev(struct disk_devdesc *dev) { @@ -261,7 +405,7 @@ disk_fmtdev(struct disk_devdesc *dev) char *cp; cp = buf + sprintf(buf, "%s%d", dev->d_dev->dv_name, dev->d_unit); - if (dev->d_slice > 0) { + if (dev->d_slice >= 0) { #ifdef LOADER_GPT_SUPPORT if (dev->d_partition == 255) { sprintf(cp, "p%d:", dev->d_slice); Modified: head/sys/boot/common/disk.h ============================================================================== --- head/sys/boot/common/disk.h Sat Sep 29 16:42:01 2012 (r241052) +++ head/sys/boot/common/disk.h Sat Sep 29 16:47:56 2012 (r241053) @@ -95,6 +95,7 @@ struct disk_devdesc extern int disk_open(struct disk_devdesc *dev, off_t mediasize, u_int sectorsize); extern int disk_close(struct disk_devdesc *dev); +extern void disk_cleanup(const struct devsw *d_dev); /* * Print information about slices on a disk. Modified: head/sys/boot/i386/libi386/biosdisk.c ============================================================================== --- head/sys/boot/i386/libi386/biosdisk.c Sat Sep 29 16:42:01 2012 (r241052) +++ head/sys/boot/i386/libi386/biosdisk.c Sat Sep 29 16:47:56 2012 (r241053) @@ -106,6 +106,7 @@ static int bd_open(struct open_file *f, static int bd_close(struct open_file *f); static int bd_ioctl(struct open_file *f, u_long cmd, void *data); static void bd_print(int verbose); +static void bd_cleanup(void); struct devsw biosdisk = { "disk", @@ -116,7 +117,7 @@ struct devsw biosdisk = { bd_close, bd_ioctl, bd_print, - NULL + bd_cleanup }; /* @@ -181,6 +182,13 @@ bd_init(void) return(0); } +static void +bd_cleanup(void) +{ + + disk_cleanup(&biosdisk); +} + /* * Try to detect a device supported by the legacy int13 BIOS */ Modified: head/sys/boot/uboot/lib/disk.c ============================================================================== --- head/sys/boot/uboot/lib/disk.c Sat Sep 29 16:42:01 2012 (r241052) +++ head/sys/boot/uboot/lib/disk.c Sat Sep 29 16:47:56 2012 (r241053) @@ -142,6 +142,7 @@ stor_cleanup(void) for (i = 0; i < stor_info_no; i++) if (stor_info[i].opened > 0) ub_dev_close(stor_info[i].handle); + disk_cleanup(&uboot_storage); } static int Modified: head/sys/boot/userboot/userboot/userboot_disk.c ============================================================================== --- head/sys/boot/userboot/userboot/userboot_disk.c Sat Sep 29 16:42:01 2012 (r241052) +++ head/sys/boot/userboot/userboot/userboot_disk.c Sat Sep 29 16:47:56 2012 (r241053) @@ -104,6 +104,7 @@ userdisk_cleanup(void) if (userdisk_maxunit > 0) free(ud_info); + disk_cleanup(&userboot_disk); } /*