From owner-svn-src-stable-10@freebsd.org Sat Oct 3 07:50:16 2015
From: Alexander Motin <mav@FreeBSD.org>
Date: Sat, 3 Oct 2015 07:50:15 +0000 (UTC)
To: src-committers@freebsd.org, svn-src-all@freebsd.org,
	svn-src-stable@freebsd.org, svn-src-stable-10@freebsd.org
Subject: svn commit: r288562 - stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs
Message-Id: <201510030750.t937oFXZ074564@repo.freebsd.org>
X-SVN-Group: stable-10

Author: mav
Date: Sat Oct 3 07:50:15 2015
New Revision: 288562
URL: https://svnweb.freebsd.org/changeset/base/288562

Log:
  MFC r286625: 5376 arc_kmem_reap_now() should not result in clearing arc_no_grow

  Reviewed by:	Christopher Siden
  Reviewed by:	George Wilson
  Reviewed by:	Steven Hartland
  Reviewed by:	Richard Elling
  Approved by:	Dan McDonald
  Author:	Matthew Ahrens

  illumos/illumos-gate@2ec99e3e987d8aa273f1e9ba2b983557d058198c

Modified:
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c

Directory Properties:
  stable/10/   (props changed)

Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	Sat Oct 3 07:49:16 2015	(r288561)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	Sat Oct 3 07:50:15 2015	(r288562)
@@ -153,13 +153,7 @@ static kmutex_t arc_reclaim_thr_lock;
 static kcondvar_t arc_reclaim_thr_cv;	/* used to signal reclaim thr */
 static uint8_t arc_thread_exit;
 
-#define ARC_REDUCE_DNLC_PERCENT 3
-uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
-
-typedef enum arc_reclaim_strategy {
-	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
-	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
-} arc_reclaim_strategy_t;
+uint_t arc_reduce_dnlc_percent = 3;
 
 /*
  * The number of iterations through arc_evict_*() before we
@@ -174,7 +168,19 @@ static int		arc_grow_retry = 60;
 static int		arc_p_min_shift = 4;
 
 /* log2(fraction of arc to reclaim) */
-static int		arc_shrink_shift = 5;
+static int		arc_shrink_shift = 7;
+
+/*
+ * log2(fraction of ARC which must be free to allow growing).
+ * I.e. if there is less than arc_c >> arc_no_grow_shift free memory,
+ * when reading a new block into the ARC, we will evict an equal-sized block
+ * from the ARC.
+ *
+ * This must be less than arc_shrink_shift, so that when we shrink the ARC,
+ * we will still not allow it to grow.
+ */
+int arc_no_grow_shift = 5;
+
 
 /*
  * minimum lifespan of a prefetch block in clock ticks
@@ -3058,13 +3064,10 @@ arc_flush(spa_t *spa)
 }
 
 void
-arc_shrink(void)
+arc_shrink(int64_t to_free)
 {
 	if (arc_c > arc_c_min) {
-		uint64_t to_free;
-
-		to_free = arc_c >> arc_shrink_shift;
 		DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t,
 			arc_c_min, uint64_t, arc_p, uint64_t, to_free);
 
 		if (arc_c > arc_c_min + to_free)
@@ -3092,44 +3095,76 @@ arc_shrink(void)
 	}
 }
 
-static int needfree = 0;
+static long needfree = 0;
 
-static int
-arc_reclaim_needed(void)
+typedef enum free_memory_reason_t {
+	FMR_UNKNOWN,
+	FMR_NEEDFREE,
+	FMR_LOTSFREE,
+	FMR_SWAPFS_MINFREE,
+	FMR_PAGES_PP_MAXIMUM,
+	FMR_HEAP_ARENA,
+	FMR_ZIO_ARENA,
+	FMR_ZIO_FRAG,
+} free_memory_reason_t;
+
+int64_t last_free_memory;
+free_memory_reason_t last_free_reason;
+
+/*
+ * Additional reserve of pages for pp_reserve.
+ */
+int64_t arc_pages_pp_reserve = 64;
+
+/*
+ * Additional reserve of pages for swapfs.
+ */
+int64_t arc_swapfs_reserve = 64;
+
+/*
+ * Return the amount of memory that can be consumed before reclaim will be
+ * needed.  Positive if there is sufficient free memory, negative indicates
+ * the amount of memory that needs to be freed up.
+ */
+static int64_t
+arc_available_memory(void)
 {
+	int64_t lowest = INT64_MAX;
+	int64_t n;
+	free_memory_reason_t r = FMR_UNKNOWN;
+
 #ifdef _KERNEL
-
-	if (needfree) {
-		DTRACE_PROBE(arc__reclaim_needfree);
-		return (1);
+	if (needfree > 0) {
+		n = PAGESIZE * (-needfree);
+		if (n < lowest) {
+			lowest = n;
+			r = FMR_NEEDFREE;
+		}
 	}
 
 	/*
	 * Cooperate with pagedaemon when it's time for it to scan
	 * and reclaim some pages.
	 */
-	if (freemem < zfs_arc_free_target) {
-		DTRACE_PROBE2(arc__reclaim_freemem, uint64_t,
-		    freemem, uint64_t, zfs_arc_free_target);
-		return (1);
+	n = PAGESIZE * (int64_t)(freemem - zfs_arc_free_target);
+	if (n < lowest) {
+		lowest = n;
+		r = FMR_LOTSFREE;
 	}
 
 #ifdef sun
 	/*
-	 * take 'desfree' extra pages, so we reclaim sooner, rather than later
-	 */
-	extra = desfree;
-
-	/*
	 * check that we're out of range of the pageout scanner.  It starts to
	 * schedule paging if freemem is less than lotsfree and needfree.
	 * lotsfree is the high-water mark for pageout, and needfree is the
	 * number of needed free pages.  We add extra pages here to make sure
	 * the scanner doesn't start up while we're freeing memory.
	 */
-	if (freemem < lotsfree + needfree + extra)
-		return (1);
+	n = PAGESIZE * (freemem - lotsfree - needfree - desfree);
+	if (n < lowest) {
+		lowest = n;
+		r = FMR_LOTSFREE;
+	}
 
 	/*
	 * check to make sure that swapfs has enough space so that anon
@@ -3138,8 +3173,13 @@ arc_reclaim_needed(void)
	 * swap pages.  We also add a bit of extra here just to prevent
	 * circumstances from getting really dire.
	 */
-	if (availrmem < swapfs_minfree + swapfs_reserve + extra)
-		return (1);
+	n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve -
+	    desfree - arc_swapfs_reserve);
+	if (n < lowest) {
+		lowest = n;
+		r = FMR_SWAPFS_MINFREE;
+	}
+
 
 	/*
	 * Check that we have enough availrmem that memory locking (e.g., via
@@ -3148,8 +3188,12 @@ arc_reclaim_needed(void)
	 * drops below pages_pp_maximum, page locking mechanisms such as
	 * page_pp_lock() will fail.)
	 */
-	if (availrmem <= pages_pp_maximum)
-		return (1);
+	n = PAGESIZE * (availrmem - pages_pp_maximum -
+	    arc_pages_pp_reserve);
+	if (n < lowest) {
+		lowest = n;
+		r = FMR_PAGES_PP_MAXIMUM;
+	}
 #endif	/* sun */
 
 #if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
@@ -3164,12 +3208,11 @@ arc_reclaim_needed(void)
	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
	 * free)
	 */
-	if (vmem_size(heap_arena, VMEM_FREE) <
-	    (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2)) {
-		DTRACE_PROBE2(arc__reclaim_used, uint64_t,
-		    vmem_size(heap_arena, VMEM_FREE), uint64_t,
-		    (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2);
-		return (1);
+	n = vmem_size(heap_arena, VMEM_FREE) -
+	    (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2);
+	if (n < lowest) {
+		lowest = n;
+		r = FMR_HEAP_ARENA;
 	}
 #define	zio_arena	NULL
 #else
@@ -3185,29 +3228,50 @@ arc_reclaim_needed(void)
	 * to aggressively evict memory from the arc in order to avoid
	 * memory fragmentation issues.
	 */
-	if (zio_arena != NULL &&
-	    vmem_size(zio_arena, VMEM_FREE) <
-	    (vmem_size(zio_arena, VMEM_ALLOC) >> 4))
-		return (1);
+	if (zio_arena != NULL) {
+		n = vmem_size(zio_arena, VMEM_FREE) -
+		    (vmem_size(zio_arena, VMEM_ALLOC) >> 4);
+		if (n < lowest) {
+			lowest = n;
+			r = FMR_ZIO_ARENA;
+		}
+	}
 
 	/*
	 * Above limits know nothing about real level of KVA fragmentation.
	 * Start aggressive reclamation if too little sequential KVA left.
	 */
-	if (vmem_size(heap_arena, VMEM_MAXFREE) < zfs_max_recordsize) {
-		DTRACE_PROBE2(arc__reclaim_maxfree, uint64_t,
-		    vmem_size(heap_arena, VMEM_MAXFREE),
-		    uint64_t, zfs_max_recordsize);
-		return (1);
+	if (lowest > 0) {
+		n = (vmem_size(heap_arena, VMEM_MAXFREE) < zfs_max_recordsize) ?
+		    -(vmem_size(heap_arena, VMEM_ALLOC) >> 4) : INT64_MAX;
+		if (n < lowest) {
+			lowest = n;
+			r = FMR_ZIO_FRAG;
+		}
 	}
 #else	/* _KERNEL */
+	/* Every 100 calls, free a small amount */
 	if (spa_get_random(100) == 0)
-		return (1);
+		lowest = -1024;
 #endif	/* _KERNEL */
 
-	DTRACE_PROBE(arc__reclaim_no);
-	return (0);
+	last_free_memory = lowest;
+	last_free_reason = r;
+	DTRACE_PROBE2(arc__available_memory, int64_t, lowest, int, r);
+	return (lowest);
+}
+
+
+/*
+ * Determine if the system is under memory pressure and is asking
+ * to reclaim memory. A return value of TRUE indicates that the system
+ * is under memory pressure and that the arc should adjust accordingly.
+ */
+static boolean_t
+arc_reclaim_needed(void)
+{
+	return (arc_available_memory() < 0);
 }
 
 extern kmem_cache_t	*zio_buf_cache[];
@@ -3215,7 +3279,7 @@ extern kmem_cache_t *zio_data_buf_cache[
 extern kmem_cache_t	*range_seg_cache;
 
 static __noinline void
-arc_kmem_reap_now(arc_reclaim_strategy_t strat)
+arc_kmem_reap_now(void)
 {
 	size_t			i;
 	kmem_cache_t		*prev_cache = NULL;
@@ -3238,13 +3302,6 @@ arc_kmem_reap_now(arc_reclaim_strategy_t
 #endif
 #endif
 
-	/*
-	 * An aggressive reclamation will shrink the cache size as well as
-	 * reap free buffers from the arc kmem caches.
-	 */
-	if (strat == ARC_RECLAIM_AGGR)
-		arc_shrink();
-
 	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
 		if (zio_buf_cache[i] != prev_cache) {
 			prev_cache = zio_buf_cache[i];
@@ -3261,12 +3318,13 @@ arc_kmem_reap_now(arc_reclaim_strategy_t
 	kmem_cache_reap_now(range_seg_cache);
 
 #ifdef sun
-	/*
-	 * Ask the vmem arena to reclaim unused memory from its
-	 * quantum caches.
-	 */
-	if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR)
+	if (zio_arena != NULL) {
+		/*
+		 * Ask the vmem arena to reclaim unused memory from its
+		 * quantum caches.
+		 */
 		vmem_qcache_reap(zio_arena);
+	}
 #endif
 	DTRACE_PROBE(arc__kmem_reap_end);
 }
@@ -3275,46 +3333,44 @@ static void
 arc_reclaim_thread(void *dummy __unused)
 {
 	clock_t			growtime = 0;
-	arc_reclaim_strategy_t	last_reclaim = ARC_RECLAIM_CONS;
 	callb_cpr_t		cpr;
 
 	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
 
 	mutex_enter(&arc_reclaim_thr_lock);
 	while (arc_thread_exit == 0) {
-		if (arc_reclaim_needed()) {
+		int64_t free_memory = arc_available_memory();
+		if (free_memory < 0) {
 
-			if (arc_no_grow) {
-				if (last_reclaim == ARC_RECLAIM_CONS) {
-					DTRACE_PROBE(arc__reclaim_aggr_no_grow);
-					last_reclaim = ARC_RECLAIM_AGGR;
-				} else {
-					last_reclaim = ARC_RECLAIM_CONS;
-				}
-			} else {
-				arc_no_grow = TRUE;
-				last_reclaim = ARC_RECLAIM_AGGR;
-				DTRACE_PROBE(arc__reclaim_aggr);
-				membar_producer();
-			}
+			arc_no_grow = B_TRUE;
+			arc_warm = B_TRUE;
 
-			/* reset the growth delay for every reclaim */
+			/*
+			 * Wait at least zfs_grow_retry (default 60) seconds
+			 * before considering growing.
+			 */
 			growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
 
-			if (needfree && last_reclaim == ARC_RECLAIM_CONS) {
-				/*
-				 * If needfree is TRUE our vm_lowmem hook
-				 * was called and in that case we must free some
-				 * memory, so switch to aggressive mode.
-				 */
-				arc_no_grow = TRUE;
-				last_reclaim = ARC_RECLAIM_AGGR;
-			}
-			arc_kmem_reap_now(last_reclaim);
-			arc_warm = B_TRUE;
+			arc_kmem_reap_now();
 
-		} else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
-			arc_no_grow = FALSE;
+			/*
			 * If we are still low on memory, shrink the ARC
			 * so that we have arc_shrink_min free space.
			 */
+			free_memory = arc_available_memory();
+
+			int64_t to_free =
+			    (arc_c >> arc_shrink_shift) - free_memory;
+			if (to_free > 0) {
+#ifdef _KERNEL
+				to_free = MAX(to_free, ptob(needfree));
+#endif
+				arc_shrink(to_free);
+			}
+		} else if (free_memory < arc_c >> arc_no_grow_shift) {
+			arc_no_grow = B_TRUE;
+		} else if (ddi_get_lbolt() >= growtime) {
+			arc_no_grow = B_FALSE;
 		}
 
 		arc_adjust();
@@ -4784,7 +4840,8 @@ arc_lowmem(void *arg __unused, int howto
 {
 	mutex_enter(&arc_reclaim_thr_lock);
-	needfree = 1;
+	/* XXX: Memory deficit should be passed as argument. */
+	needfree = btoc(arc_c >> arc_shrink_shift);
 	DTRACE_PROBE(arc__needfree);
 	cv_signal(&arc_reclaim_thr_cv);
@@ -4868,6 +4925,12 @@ arc_init(void)
 	if (zfs_arc_shrink_shift > 0)
 		arc_shrink_shift = zfs_arc_shrink_shift;
 
+	/*
	 * Ensure that arc_no_grow_shift is less than arc_shrink_shift.
	 */
+	if (arc_no_grow_shift >= arc_shrink_shift)
+		arc_no_grow_shift = arc_shrink_shift - 1;
+
 	if (zfs_arc_p_min_shift > 0)
 		arc_p_min_shift = zfs_arc_p_min_shift;
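
For readers tracing the change: the core of this commit replaces the boolean
arc_reclaim_needed() with arc_available_memory(), which returns signed headroom
in bytes, taking the minimum across all memory constraints and recording which
constraint was binding. Below is a minimal, self-contained userspace sketch of
that "lowest headroom wins" pattern, not the kernel code: the state variables
(needfree, freemem, free_target, heap_free, heap_size) are made-up stand-ins
for the kernel's VM and vmem statistics, and only three of the eight
constraints are shown.

	/*
	 * Sketch of the "lowest headroom wins" logic behind
	 * arc_available_memory().  All state below is invented for
	 * illustration; the kernel reads it from the VM layer.
	 */
	#include <stdio.h>
	#include <stdint.h>

	#define PAGESIZE	4096

	typedef enum {
		FMR_UNKNOWN,
		FMR_NEEDFREE,
		FMR_LOTSFREE,
		FMR_HEAP_ARENA
	} free_memory_reason_t;

	static const char *fmr_names[] = {
		"unknown", "needfree", "lotsfree", "heap-arena"
	};

	/* Hypothetical system state (stand-ins for freemem, etc.). */
	static int64_t needfree = 0;		/* pages asked for by lowmem hook */
	static int64_t freemem = 20000;		/* free pages */
	static int64_t free_target = 25000;	/* pagedaemon wakeup threshold */
	static int64_t heap_free = 1LL << 28;	/* free bytes in heap arena */
	static int64_t heap_size = 1LL << 30;	/* total bytes in heap arena */

	/*
	 * Return the signed number of bytes that may still be consumed
	 * before reclaim is needed: the minimum over all constraints,
	 * negative once any constraint is violated.  *why reports the
	 * binding constraint.
	 */
	static int64_t
	available_memory(free_memory_reason_t *why)
	{
		int64_t lowest = INT64_MAX;
		int64_t n;
		free_memory_reason_t r = FMR_UNKNOWN;

		if (needfree > 0) {
			/* An explicit lowmem request is always a deficit. */
			n = -needfree * PAGESIZE;
			if (n < lowest) {
				lowest = n;
				r = FMR_NEEDFREE;
			}
		}

		/* Cooperate with the pagedaemon's free-page target. */
		n = (freemem - free_target) * PAGESIZE;
		if (n < lowest) {
			lowest = n;
			r = FMR_LOTSFREE;
		}

		/* Keep at least 1/4 of the kernel heap arena free. */
		n = heap_free - (heap_size >> 2);
		if (n < lowest) {
			lowest = n;
			r = FMR_HEAP_ARENA;
		}

		*why = r;
		return (lowest);
	}

	int
	main(void)
	{
		free_memory_reason_t why;
		int64_t avail = available_memory(&why);

		if (avail < 0)
			printf("reclaim %jd bytes (binding constraint: %s)\n",
			    (intmax_t)-avail, fmr_names[why]);
		else
			printf("%jd bytes of headroom, no reclaim needed\n",
			    (intmax_t)avail);
		return (0);
	}

With the sample numbers above, the pagedaemon target is the binding constraint
(20000 free pages against a 25000-page target), so the program reports a
deficit of 20480000 bytes tagged "lotsfree". In the kernel, a negative return
drives arc_reclaim_thread() to reap the kmem caches first and only then, if
the deficit persists, shrink by (arc_c >> arc_shrink_shift) - free_memory,
which is exactly how this commit keeps a single reap from flipping
arc_no_grow back off.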