struct lruvec { struct list_head lists[NR_LRU_LISTS];//heads of the five LRU doubly linked lists struct zone_reclaim_stat reclaim_stat; //reclaim-related statistics /* Evictions & activations on the inactive file list */ atomic_long_t inactive_age; /* Refaults at the time of last reclaim cycle */ unsigned long refaults;//refault count recorded at the last reclaim cycle #ifdef CONFIG_MEMCG struct pglist_data *pgdat;//the memory node (pglist_data) this lruvec belongs to #endif };
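The five lists are indexed by enum lru_list from include/linux/mmzone.h. As a minimal sketch (illustrative only, locking omitted, count_inactive_file is a made-up helper, not kernel code), the indices and a walk over one list of a lruvec look like this:

enum lru_list {
	LRU_INACTIVE_ANON = LRU_BASE,
	LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE,
	LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,
	LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE,
	LRU_UNEVICTABLE,
	NR_LRU_LISTS
};

/* illustrative helper (not kernel code): count the pages on the inactive file LRU */
static unsigned long count_inactive_file(struct lruvec *lruvec)
{
	struct page *page;
	unsigned long n = 0;

	list_for_each_entry(page, &lruvec->lists[LRU_INACTIVE_FILE], lru)
		n++;	/* page->lru links each page into its LRU list */
	return n;
}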
static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac) { bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER; struct page *page = NULL; unsigned int alloc_flags; unsigned long did_some_progress; enum compact_priority compact_priority; enum compact_result compact_result; int compaction_retries; int no_progress_loops; unsigned int cpuset_mems_cookie; int reserve_flags;
/* * We also sanity check to catch abuse of atomic reserves being used by * callers that are not in atomic context. */ if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) == (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) gfp_mask &= ~__GFP_ATOMIC;
/* * The fast path uses conservative alloc_flags to succeed only until * kswapd needs to be woken up, and to avoid the cost of setting up * alloc_flags precisely. So we do that now. */ //convert the GFP flags into the internal ALLOC_* flags, starting from the minimum watermark alloc_flags = gfp_to_alloc_flags(gfp_mask);
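For reference, gfp_to_alloc_flags() maps the GFP bits onto the internal ALLOC_* flags. A simplified sketch (CMA handling omitted, details differ between kernel versions):

static inline unsigned int gfp_to_alloc_flags(gfp_t gfp_mask)
{
	/* start from the minimum watermark and obey cpuset limits */
	unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;

	/* __GFP_HIGH maps directly to ALLOC_HIGH */
	alloc_flags |= (__force int)(gfp_mask & __GFP_HIGH);

	if (gfp_mask & __GFP_ATOMIC) {
		/* atomic callers may try harder, unless __GFP_NOMEMALLOC forbids it */
		if (!(gfp_mask & __GFP_NOMEMALLOC))
			alloc_flags |= ALLOC_HARDER;
		/* atomic allocations cannot honour cpuset restrictions */
		alloc_flags &= ~ALLOC_CPUSET;
	} else if (unlikely(rt_task(current)) && !in_interrupt())
		alloc_flags |= ALLOC_HARDER;

	return alloc_flags;
}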
/* * We need to recalculate the starting point for the zonelist iterator * because we might have used different nodemask in the fast path, or * there was a cpuset modification and we are retrying - otherwise we * could end up iterating over non-eligible zones endlessly. */ //recompute the preferred zone: the fast path may have used a different nodemask, and we must not keep iterating over ineligible zones ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, ac->high_zoneidx, ac->nodemask); if (!ac->preferred_zoneref->zone) goto nopage;
//asynchronous reclaim: wake the kswapd kernel threads to reclaim pages in the background if (gfp_mask & __GFP_KSWAPD_RECLAIM) wake_all_kswapds(order, gfp_mask, ac);
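wake_all_kswapds() simply walks the zonelist and wakes the kswapd of each node once; roughly (a sketch assuming the 4.x wakeup_kswapd() signature):

static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
			     const struct alloc_context *ac)
{
	struct zoneref *z;
	struct zone *zone;
	pg_data_t *last_pgdat = NULL;

	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
					ac->high_zoneidx, ac->nodemask) {
		/* wake each node's kswapd only once */
		if (last_pgdat != zone->zone_pgdat)
			wakeup_kswapd(zone, gfp_mask, order, ac->high_zoneidx);
		last_pgdat = zone->zone_pgdat;
	}
}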
/* * The adjusted alloc_flags might result in immediate success, so try * that first */ //the adjusted alloc_flags may already be enough for the allocation to succeed, so try the freelist first page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); if (page) goto got_pg;
/* * For costly allocations, try direct compaction first, as it's likely * that we have enough base pages and don't need to reclaim. For non- * movable high-order allocations, do that as well, as compaction will * try prevent permanent fragmentation by migrating from blocks of the * same migratetype. * Don't try this for allocations that are allowed to ignore * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen. */ //direct reclaim allowed, a costly order or a non-movable order > 0 request, and the caller may not ignore watermarks if (can_direct_reclaim && (costly_order || (order > 0 && ac->migratetype != MIGRATE_MOVABLE)) && !gfp_pfmemalloc_allowed(gfp_mask)) { //try memory compaction (page migration) first, then attempt the allocation page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, INIT_COMPACT_PRIORITY, &compact_result); if (page) goto got_pg;
/* * Checks for costly allocations with __GFP_NORETRY, which * includes THP page fault allocations */ if (costly_order && (gfp_mask & __GFP_NORETRY)) { /* * If compaction is deferred for high-order allocations, * it is because sync compaction recently failed. If * this is the case and the caller requested a THP * allocation, we do not want to heavily disrupt the * system, so we fail the allocation instead of entering * direct reclaim. */ if (compact_result == COMPACT_DEFERRED) goto nopage;
/* * Looks like reclaim/compaction is worth trying, but * sync compaction could be very expensive, so keep * using async compaction. */ //sync compaction is very expensive, so keep using async compaction compact_priority = INIT_COMPACT_PRIORITY; } }
retry: /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */ //wake kswapd again in case it went back to sleep while we were looping if (gfp_mask & __GFP_KSWAPD_RECLAIM) wake_all_kswapds(order, gfp_mask, ac);
//if the caller is entitled to the emergency reserves, ignore the watermarks and allocate without them reserve_flags = __gfp_pfmemalloc_flags(gfp_mask); if (reserve_flags) alloc_flags = reserve_flags;
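__gfp_pfmemalloc_flags() decides whether the caller is entitled to the reserves; approximately (a sketch, exact conditions vary by kernel version):

static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask)
{
	if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
		return 0;			/* reserves explicitly forbidden */
	if (gfp_mask & __GFP_MEMALLOC)
		return ALLOC_NO_WATERMARKS;	/* caller promises to free memory soon */
	if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
		return ALLOC_NO_WATERMARKS;
	if (!in_interrupt()) {
		if (current->flags & PF_MEMALLOC)
			return ALLOC_NO_WATERMARKS;	/* we are already in reclaim */
		else if (oom_reserves_allowed(current))
			return ALLOC_OOM;		/* OOM victims get a smaller reserve */
	}
	return 0;
}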
/* * Reset the nodemask and zonelist iterators if memory policies can be * ignored. These allocations are high priority and system rather than * user oriented. */ //if memory policies can be ignored, reset the nodemask and the zonelist iterator if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) { ac->nodemask = NULL; ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, ac->high_zoneidx, ac->nodemask); }
/* Attempt with potentially adjusted zonelist and alloc_flags */ //retry with the possibly adjusted zonelist and allocation flags page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); if (page) goto got_pg;
/* Caller is not willing to reclaim, we can't balance anything */ //if direct reclaim is not allowed, the allocation fails if (!can_direct_reclaim) goto nopage;
/* Avoid recursion of direct reclaim */ if (current->flags & PF_MEMALLOC) goto nopage;
/* Try direct reclaim and then allocating */ //direct page reclaim, then try the allocation again page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, &did_some_progress); if (page) goto got_pg;
/* Try direct compaction and then allocating */ //direct compaction, then try the allocation again page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, compact_priority, &compact_result); if (page) goto got_pg;
/* Do not loop if specifically requested */ //if the caller asked us not to retry, give up if (gfp_mask & __GFP_NORETRY) goto nopage;
/* * Do not retry costly high order allocations unless they are * __GFP_RETRY_MAYFAIL */ //do not retry costly high-order allocations unless __GFP_RETRY_MAYFAIL was passed if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL)) goto nopage;
/* * It doesn't make any sense to retry for the compaction if the order-0 * reclaim is not able to make any progress because the current * implementation of the compaction depends on the sufficient amount * of free memory (see __compaction_suitable) */ //if order-0 reclaim made progress, decide whether compaction is worth retrying if (did_some_progress > 0 && should_compact_retry(ac, order, alloc_flags, compact_result, &compact_priority, &compaction_retries)) goto retry;
/* Deal with possible cpuset update races before we start OOM killing */ //if the cpuset was changed concurrently, retry with the updated set of memory nodes if (check_retry_cpuset(cpuset_mems_cookie, ac)) goto retry_cpuset;
/* Reclaim has failed us, start killing things */ //invoke the OOM killer to pick and kill a process page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress); if (page) goto got_pg;
/* Avoid allocations with no watermarks from looping endlessly */ //if the current task is the OOM victim and watermarks were ignored, give up if (tsk_is_oom_victim(current) && (alloc_flags == ALLOC_OOM || (gfp_mask & __GFP_NOMEMALLOC))) goto nopage;
/* Retry as long as the OOM killer is making progress */ //if the OOM killer made progress, try once more if (did_some_progress) { no_progress_loops = 0; goto retry; }
nopage: /* Deal with possible cpuset update races before we fail */ if (check_retry_cpuset(cpuset_mems_cookie, ac)) goto retry_cpuset;
/* * Make sure that __GFP_NOFAIL request doesn't leak out and make sure * we always retry */ if (gfp_mask & __GFP_NOFAIL) { /* * All existing users of the __GFP_NOFAIL are blockable, so warn * of any new users that actually require GFP_NOWAIT */ if (WARN_ON_ONCE(!can_direct_reclaim)) goto fail;
/* * PF_MEMALLOC request from this context is rather bizarre * because we cannot reclaim anything and only can loop waiting * for somebody to do a work for us */ WARN_ON_ONCE(current->flags & PF_MEMALLOC);
/* * non failing costly orders are a hard requirement which we * are not prepared for much so let's warn about these users * so that we can identify them and convert them to something * else. */ WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER);
/* * Help non-failing allocations by giving them access to memory * reserves but do not use ALLOC_NO_WATERMARKS because this * could deplete whole memory reserves which would just make * the situation worse */ //give the allocation access to part of the memory reserves page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac); if (page) goto got_pg;
/* * If an allocation failed after direct reclaim, it could be because * pages are pinned on the per-cpu lists or in high alloc reserves. * Shrink them and try again */ //an allocation failure right after direct reclaim may mean pages are pinned on the per-CPU lists or held in the highatomic reserves if (!page && !drained) { unreserve_highatomic_pageblock(ac, false);//release the highatomic pageblock reserves drain_all_pages(NULL);//flush the pages pinned on the per-CPU free lists drained = true; goto retry;//retry the freelist allocation once more }
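For context, the fragment above lives inside __alloc_pages_direct_reclaim(), whose overall shape is roughly the following sketch (details elided):

static inline struct page *
__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
		unsigned int alloc_flags, const struct alloc_context *ac,
		unsigned long *did_some_progress)
{
	struct page *page = NULL;
	bool drained = false;

	/* run synchronous reclaim; the result feeds the retry logic in the slowpath */
	*did_some_progress = __perform_reclaim(gfp_mask, order, ac);
	if (unlikely(!(*did_some_progress)))
		return NULL;
retry:
	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);

	/* ... the "!page && !drained" block quoted above goes here ... */

	return page;
}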
static int __perform_reclaim(gfp_t gfp_mask, unsigned int order, const struct alloc_context *ac) { struct reclaim_state reclaim_state; int progress; unsigned int noreclaim_flag;
cond_resched();//voluntarily yield the CPU if rescheduling is needed
/* We now go into synchronous reclaim */ cpuset_memory_pressure_bump();//account cpuset memory pressure fs_reclaim_acquire(gfp_mask); noreclaim_flag = memalloc_noreclaim_save();//save the current memalloc flags reclaim_state.reclaimed_slab = 0; current->reclaim_state = &reclaim_state;
/* * scan_control uses s8 fields for order, priority, and reclaim_idx. * Confirm they are large enough for max values. */ BUILD_BUG_ON(MAX_ORDER > S8_MAX); BUILD_BUG_ON(DEF_PRIORITY > S8_MAX); BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX);
/* * Do not enter reclaim if fatal signal was delivered while throttled. * 1 is returned so that the page allocator does not OOM kill at this * point. */ //if a fatal signal was delivered while throttled, do not enter reclaim; return 1 so the allocator does not OOM kill here if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask)) return 1;
static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) { struct zoneref *z; struct zone *zone; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; gfp_t orig_mask; pg_data_t *last_pgdat = NULL;
/* * If the number of buffer_heads in the machine exceeds the maximum * allowed level, force direct reclaim to scan the highmem zone as * highmem pages could be pinning lowmem pages storing buffer_heads */ orig_mask = sc->gfp_mask; if (buffer_heads_over_limit) { sc->gfp_mask |= __GFP_HIGHMEM; sc->reclaim_idx = gfp_zone(sc->gfp_mask); }
for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx, sc->nodemask) { /* * Take care memory controller reclaiming has small influence * to global LRU. */ if (global_reclaim(sc)) {//if this is global (not memcg) reclaim if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL)) continue;
/* * If we already have plenty of memory free for * compaction in this zone, don't free any more. * Even though compaction is invoked for any * non-zero order, only frequent costly order * reclamation is disruptive enough to become a * noticeable problem, like transparent huge * page allocations. */ //if compaction is enabled and this zone already has enough free memory for it, skip reclaiming here if (IS_ENABLED(CONFIG_COMPACTION) && sc->order > PAGE_ALLOC_COSTLY_ORDER && compaction_ready(zone, sc)) { sc->compaction_ready = true; continue; }
/* * Shrink each node in the zonelist once. If the * zonelist is ordered by zone (not the default) then a * node may be shrunk multiple times but in that case * the user prefers lower zones being preserved. */ if (zone->zone_pgdat == last_pgdat) continue;
/* * This steals pages from memory cgroups over softlimit * and returns the number of reclaimed pages and * scanned pages. This works for global memory pressure * and balancing, not for a memcg's limit. */ nr_soft_scanned = 0; nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat, sc->order, sc->gfp_mask, &nr_soft_scanned); sc->nr_reclaimed += nr_soft_reclaimed; sc->nr_scanned += nr_soft_scanned; /* need some check for avoid more shrink_zone() */ }
/* See comment about same check for global reclaim above */ if (zone->zone_pgdat == last_pgdat) continue; last_pgdat = zone->zone_pgdat; shrink_node(zone->zone_pgdat, sc); }
/* * Restore to original mask to avoid the impact on the caller if we * promoted it to __GFP_HIGHMEM. */ sc->gfp_mask = orig_mask;//restore the original gfp mask }
memcg = mem_cgroup_iter(root, NULL, &reclaim); do { unsigned long lru_pages; unsigned long reclaimed; unsigned long scanned;
switch (mem_cgroup_protected(root, memcg)) { case MEMCG_PROT_MIN: /* * Hard protection. * If there is no reclaimable memory, OOM. */ continue; case MEMCG_PROT_LOW: /* * Soft protection. * Respect the protection only as long as * there is an unprotected supply * of reclaimable memory from other cgroups. */ if (!sc->memcg_low_reclaim) { sc->memcg_low_skipped = 1; continue; } memcg_memory_event(memcg, MEMCG_LOW); break; case MEMCG_PROT_NONE: break; }
/* Record the group's reclaim efficiency */ //measure virtual memory pressure to record this group's reclaim efficiency vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned, sc->nr_reclaimed - reclaimed);
/* * Direct reclaim and kswapd have to scan all memory * cgroups to fulfill the overall scan target for the * node. * * Limit reclaim, on the other hand, only cares about * nr_to_reclaim pages to be reclaimed and it will * retry with decreasing priority if one round over the * whole hierarchy is not sufficient. */ if (!global_reclaim(sc) && sc->nr_reclaimed >= sc->nr_to_reclaim) { mem_cgroup_iter_break(root, memcg); break; } } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
if (reclaim_state) { sc->nr_reclaimed += reclaim_state->reclaimed_slab; reclaim_state->reclaimed_slab = 0; }
/* Record the subtree's reclaim efficiency */ //measure virtual memory pressure to record the subtree's reclaim efficiency vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, sc->nr_scanned - nr_scanned, sc->nr_reclaimed - nr_reclaimed);
if (sc->nr_reclaimed - nr_reclaimed) reclaimable = true;
if (current_is_kswapd()) { /* * If reclaim is isolating dirty pages under writeback, * it implies that the long-lived page allocation rate * is exceeding the page laundering rate. Either the * global limits are not being effective at throttling * processes due to the page distribution throughout * zones or there is heavy usage of a slow backing * device. The only option is to throttle from reclaim * context which is not ideal as there is no guarantee * the dirtying process is throttled in the same way * balance_dirty_pages() manages. * * Once a node is flagged PGDAT_WRITEBACK, kswapd will * count the number of pages under pages flagged for * immediate reclaim and stall if any are encountered * in the nr_immediate check below. */ if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken) set_bit(PGDAT_WRITEBACK, &pgdat->flags);
/* * Tag a node as congested if all the dirty pages * scanned were backed by a congested BDI and * wait_iff_congested will stall. */ if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested) set_bit(PGDAT_CONGESTED, &pgdat->flags);
/* Allow kswapd to start writing pages during reclaim.*/ if (sc->nr.unqueued_dirty == sc->nr.file_taken) set_bit(PGDAT_DIRTY, &pgdat->flags);
/* * If kswapd scans pages marked marked for immediate * reclaim and under writeback (nr_immediate), it * implies that pages are cycling through the LRU * faster than they are written so also forcibly stall. */ if (sc->nr.immediate) congestion_wait(BLK_RW_ASYNC, HZ/10); }
/* * Legacy memcg will stall in page writeback so avoid forcibly * stalling in wait_iff_congested(). */ if (!global_reclaim(sc) && sane_reclaim(sc) && sc->nr.dirty && sc->nr.dirty == sc->nr.congested) set_memcg_congestion(pgdat, root, true);
/* * Stall direct reclaim for IO completions if underlying BDIs * and node is congested. Allow kswapd to continue until it * starts encountering unqueued dirty pages or cycling through * the LRU too quickly. */ if (!sc->hibernation_mode && !current_is_kswapd() && current_may_throttle() && pgdat_memcg_congested(pgdat, root)) wait_iff_congested(BLK_RW_ASYNC, HZ/10);
} while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, sc->nr_scanned - nr_scanned, sc));
/* * Kswapd gives up on balancing particular nodes after too * many failures to reclaim anything from them and goes to * sleep. On reclaim progress, reset the failure counter. A * successful direct reclaim run will revive a dormant kswapd. */ if (reclaimable) pgdat->kswapd_failures = 0;
return reclaimable; }
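To keep the overall structure in mind, the direct-reclaim call chain covered here is roughly the following (assumed ~4.19 layout):

/*
 * __alloc_pages_slowpath()
 *   __alloc_pages_direct_reclaim()
 *     __perform_reclaim()
 *       try_to_free_pages()
 *         do_try_to_free_pages()
 *           shrink_zones()
 *             shrink_node()
 *               shrink_node_memcg()        -- per-memcg LRU scanning
 *                 shrink_list()
 *                   shrink_active_list()   -- rebalance active/inactive
 *                   shrink_inactive_list()
 *                     shrink_page_list()   -- reclaim of individual pages
 *                       try_to_unmap()
 */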
shrink_node calls shrink_node_memcg:
static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg, struct scan_control *sc, unsigned long *lru_pages) { struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg); unsigned long nr[NR_LRU_LISTS]; unsigned long targets[NR_LRU_LISTS]; unsigned long nr_to_scan; enum lru_list lru; unsigned long nr_reclaimed = 0; unsigned long nr_to_reclaim = sc->nr_to_reclaim; struct blk_plug plug; bool scan_adjusted;
/* Record the original scan target for proportional adjustments later */ memcpy(targets, nr, sizeof(nr));
/* * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal * event that can occur when there is little memory pressure e.g. * multiple streaming readers/writers. Hence, we do not abort scanning * when the requested number of pages are reclaimed when scanning at * DEF_PRIORITY on the assumption that the fact we are direct * reclaiming implies that kswapd is not keeping up and it is best to * do a batch of work at once. For memcg reclaim one check is made to * abort proportional reclaim if either the file or anon lru has already * dropped to zero at the first pass. */ scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() && sc->priority == DEF_PRIORITY);
blk_start_plug(&plug);// set up block plugging and track it in task_struct; the queued I/O is flushed once the task blocks while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { unsigned long nr_anon, nr_file, percentage; unsigned long nr_scanned;
if (nr_reclaimed < nr_to_reclaim || scan_adjusted) continue;
/* * For kswapd and memcg, reclaim at least the number of pages * requested. Ensure that the anon and file LRUs are scanned * proportionally what was requested by get_scan_count(). We * stop reclaiming one LRU and reduce the amount scanning * proportional to the original scan target. */ nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
/* * It's just vindictive to attack the larger once the smaller * has gone to zero. And given the way we stop scanning the * smaller below, this makes sure that we only make one nudge * towards proportionality once we've got nr_to_reclaim. */ if (!nr_file || !nr_anon) break;
/* * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. */ //if the anon active/inactive lists are out of balance if (inactive_list_is_low(lruvec, false, memcg, sc, true)) //move pages from the active to the inactive anon list to rebalance the ratio shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); }
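inactive_list_is_low() implements the balance check used above: the tolerated active/inactive ratio grows with the size of the LRU. A simplified sketch (refault/workingset handling omitted, details vary by version):

static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
				 struct mem_cgroup *memcg,
				 struct scan_control *sc, bool actual_reclaim)
{
	enum lru_list inactive_lru = file ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON;
	unsigned long inactive, active, gb, inactive_ratio;

	inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx);
	active = lruvec_lru_size(lruvec, inactive_lru + LRU_ACTIVE, sc->reclaim_idx);

	/* the bigger the LRU, the larger an active list we tolerate */
	gb = (inactive + active) >> (30 - PAGE_SHIFT);	/* total size in GiB */
	if (gb)
		inactive_ratio = int_sqrt(10 * gb);
	else
		inactive_ratio = 1;

	/* "low" means the active list outweighs the inactive list by more than the ratio */
	return inactive * inactive_ratio < active;
}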
shrink_node_memcg mainly calls shrink_list:
static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, struct lruvec *lruvec, struct mem_cgroup *memcg, struct scan_control *sc) { if (is_active_lru(lru)) { //if the active/inactive lists are out of balance if (inactive_list_is_low(lruvec, is_file_lru(lru), memcg, sc, true)) shrink_active_list(nr_to_scan, lruvec, sc, lru);//move pages from the active to the inactive list return 0; }
/* * If dirty pages are scanned that are not queued for IO, it * implies that flushers are not doing their job. This can * happen when memory pressure pushes dirty pages to the end of * the LRU before the dirty limits are breached and the dirty * data has expired. It can also happen when the proportion of * dirty pages grows not through writes but through memory * pressure reclaiming all the clean cache. And in some cases, * the flushers simply cannot keep up with the allocation * rate. Nudge the flusher threads in case they are asleep. */ //if all the dirty pages we scanned are not queued for IO, wake the flusher threads to write them back to disk if (stat.nr_unqueued_dirty == nr_taken) wakeup_flusher_threads(WB_REASON_VMSCAN);
/* * The number of dirty pages determines if a node is marked * reclaim_congested which affects wait_iff_congested. kswapd * will stall and start writing pages if the tail of the LRU * is all dirty unqueued pages. */ //check whether the page is dirty or under writeback page_check_dirty_writeback(page, &dirty, &writeback); if (dirty || writeback) nr_dirty++;//count pages that are dirty or under writeback
if (dirty && !writeback) nr_unqueued_dirty++;//count dirty pages not yet queued for writeback
/* * Treat this page as congested if the underlying BDI is or if * pages are cycling through the LRU so quickly that the * pages marked for immediate reclaim are making it to the * end of the LRU a second time. */ //page_mapping() returns the address_space: the file mapping for file pages, the swap address_space for swap-cache pages, and NULL for other anonymous pages mapping = page_mapping(page); //count the page as congested in two cases: 1. it is dirty or under writeback and its inode's backing device is congested, 2. it is under writeback and already marked PageReclaim if (((dirty || writeback) && mapping && inode_write_congested(mapping->host)) || (writeback && PageReclaim(page))) nr_congested++;
/* * If a page at the tail of the LRU is under writeback, there * are three cases to consider. * * 1) If reclaim is encountering an excessive number of pages * under writeback and this page is both under writeback and * PageReclaim then it indicates that pages are being queued * for IO but are being recycled through the LRU before the * IO can complete. Waiting on the page itself risks an * indefinite stall if it is impossible to writeback the * page due to IO error or disconnected storage so instead * note that the LRU is being scanned too quickly and the * caller can stall after page list has been processed. * * 2) Global or new memcg reclaim encounters a page that is * not marked for immediate reclaim, or the caller does not * have __GFP_FS (or __GFP_IO if it's simply going to swap, * not to fs). In this case mark the page for immediate * reclaim and continue scanning. * * Require may_enter_fs because we would wait on fs, which * may not have submitted IO yet. And the loop driver might * enter reclaim, and deadlock if it waits on a page for * which it is needed to do the write (loop masks off * __GFP_IO|__GFP_FS for this reason); but more thought * would probably show more reasons. * * 3) Legacy memcg encounters a page that is already marked * PageReclaim. memcg does not have any dirty pages * throttling so we could easily OOM just because too many * pages are in writeback and there is nothing else to * reclaim. Wait for the writeback to complete. * * In cases 1) and 2) we activate the pages to get them out of * the way while we continue scanning for clean pages on the * inactive list and refilling from the active list. The * observation here is that waiting for disk writes is more * expensive than potentially causing reloads down the line. * Since they're marked for immediate reclaim, they won't put * memory pressure on the cache working set any longer than it * takes to write them to disk. */ //the page is under writeback if (PageWriteback(page)) { /* Case 1 above */ //case 1: we are kswapd, the page is marked for immediate reclaim, and the node is flagged PGDAT_WRITEBACK if (current_is_kswapd() && PageReclaim(page) && test_bit(PGDAT_WRITEBACK, &pgdat->flags)) { nr_immediate++; goto activate_locked;
/* Case 2 above */ //case 2: sane (global or cgroup v2) reclaim, or the page is not yet marked for immediate reclaim, or FS operations are not allowed } else if (sane_reclaim(sc) || !PageReclaim(page) || !may_enter_fs) { /* * This is slightly racy - end_page_writeback() * might have just cleared PageReclaim, then * setting PageReclaim here end up interpreted * as PageReadahead - but that does not matter * enough to care. What we do want is for this * page to have PageReclaim set next time memcg * reclaim reaches the tests above, so it will * then wait_on_page_writeback() to avoid OOM; * and it's also appropriate in global reclaim. */ SetPageReclaim(page); nr_writeback++; goto activate_locked;
/* Case 3 above */ //case 3: legacy memcg reclaim on a page already marked PageReclaim: wait for the writeback to finish } else { unlock_page(page); wait_on_page_writeback(page); /* then go back and try same page again */ list_add_tail(&page->lru, page_list); continue; } }
if (!force_reclaim)//if force_reclaim is not set, check how recently the page was referenced references = page_check_references(page, sc);
switch (references) { case PAGEREF_ACTIVATE: goto activate_locked;//recently referenced: mark it active and put it back on ret_pages case PAGEREF_KEEP: nr_ref_keep++; goto keep_locked; case PAGEREF_RECLAIM: case PAGEREF_RECLAIM_CLEAN: ; /* try to reclaim the page below */ }
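page_check_references() is what produces these PAGEREF_* decisions; its logic is roughly the following simplified sketch (assuming the 4.x helpers page_referenced() and TestClearPageReferenced()):

static enum page_references page_check_references(struct page *page,
						  struct scan_control *sc)
{
	int referenced_ptes, referenced_page;
	unsigned long vm_flags;

	/* how many ptes referenced the page since the last scan; clear PG_referenced */
	referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup, &vm_flags);
	referenced_page = TestClearPageReferenced(page);

	if (vm_flags & VM_LOCKED)
		return PAGEREF_RECLAIM;		/* mlocked vma: let the unmap path handle it */

	if (referenced_ptes) {
		if (PageSwapBacked(page))
			return PAGEREF_ACTIVATE;	/* referenced anon pages stay active */
		SetPageReferenced(page);
		if (referenced_page || referenced_ptes > 1)
			return PAGEREF_ACTIVATE;	/* referenced repeatedly: activate */
		if (vm_flags & VM_EXEC)
			return PAGEREF_ACTIVATE;	/* protect executable file pages */
		return PAGEREF_KEEP;
	}

	if (referenced_page && !PageSwapBacked(page))
		return PAGEREF_RECLAIM_CLEAN;		/* reclaim only if it is clean */

	return PAGEREF_RECLAIM;
}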
/* * Anonymous process memory has backing store? * Try to allocate it some swap space here. * Lazyfree page could be freed directly */ //an anonymous page that is backed by swap if (PageAnon(page) && PageSwapBacked(page)) { if (!PageSwapCache(page)) {//not yet in the swap cache if (!(sc->gfp_mask & __GFP_IO)) goto keep_locked; if (PageTransHuge(page)) { /* cannot split THP, skip it */ if (!can_split_huge_page(page, NULL)) goto activate_locked; /* * Split pages without a PMD map right * away. Chances are some or all of the * tail pages can be freed without IO. */ if (!compound_mapcount(page) && split_huge_page_to_list(page, page_list)) goto activate_locked; } if (!add_to_swap(page)) { if (!PageTransHuge(page)) goto activate_locked; /* Fallback to swap normal pages */ if (split_huge_page_to_list(page, page_list)) goto activate_locked; #ifdef CONFIG_TRANSPARENT_HUGEPAGE count_vm_event(THP_SWPOUT_FALLBACK); #endif if (!add_to_swap(page)) goto activate_locked; }
may_enter_fs = 1;
/* Adding to swap updated mapping */ mapping = page_mapping(page);//the page is now in the swap cache, so refresh its mapping } } else if (unlikely(PageTransHuge(page))) { /* Split file THP */ if (split_huge_page_to_list(page, page_list)) goto keep_locked; }
/* * The page is mapped into the page tables of one or more * processes. Try to unmap it here. */ //the page is mapped into at least one process, try to unmap it if (page_mapped(page)) { enum ttu_flags flags = ttu_flags | TTU_BATCH_FLUSH;
if (unlikely(PageTransHuge(page))) flags |= TTU_SPLIT_HUGE_PMD; if (!try_to_unmap(page, flags)) {//try to remove all page-table mappings of the page nr_unmap_fail++; goto activate_locked; } }
if (PageDirty(page)) {//the page is dirty /* * Only kswapd can writeback filesystem pages * to avoid risk of stack overflow. But avoid * injecting inefficient single-page IO into * flusher writeback as much as possible: only * write pages when we've encountered many * dirty pages, and when we've already scanned * the rest of the LRU for clean pages and see * the same dirty pages again (PageReclaim). */ /* For a file page, writing it out from here additionally requires all of the following: 1. we are kswapd (only kswapd may write back filesystem pages, to avoid stack overflow), 2. the page is already marked PageReclaim, 3. the node is flagged PGDAT_DIRTY, i.e. we keep running into dirty pages and batched writeback is worthwhile instead of inefficient single-page IO. If any condition fails, fall through to activate_locked and keep the page active. */ if (page_is_file_cache(page) && (!current_is_kswapd() || !PageReclaim(page) || !test_bit(PGDAT_DIRTY, &pgdat->flags))) { /* * Immediately reclaim when written back. * Similar in principle to deactivate_page() * except we already have the page isolated * and know it's dirty */ inc_node_page_state(page, NR_VMSCAN_IMMEDIATE); SetPageReclaim(page);
goto activate_locked; }
if (references == PAGEREF_RECLAIM_CLEAN)//the page should only be reclaimed if it is clean, so keep it on ret_pages goto keep_locked; if (!may_enter_fs)//filesystem operations are not allowed, keep it on ret_pages goto keep_locked; if (!sc->may_writepage)//writeback is not allowed, keep it on ret_pages goto keep_locked;
/* * Page is dirty. Flush the TLB if a writable entry * potentially exists to avoid CPU writes after IO * starts and then write it out here. */ //the page is dirty: flush the TLB if a writable entry may still exist, so the CPU cannot dirty it again after IO starts, then write it out try_to_unmap_flush_dirty(); switch (pageout(page, mapping, sc)) {//write the page out case PAGE_KEEP: goto keep_locked; case PAGE_ACTIVATE: goto activate_locked; case PAGE_SUCCESS: if (PageWriteback(page)) goto keep; if (PageDirty(page)) goto keep;
/* * A synchronous write - probably a ramdisk. Go * ahead and try to reclaim the page. */ if (!trylock_page(page)) goto keep; if (PageDirty(page) || PageWriteback(page)) goto keep_locked; mapping = page_mapping(page); case PAGE_CLEAN: ; /* try to free the page below */ } }
/* * If the page has buffers, try to free the buffer mappings * associated with this page. If we succeed we try to free * the page as well. * * We do this even if the page is PageDirty(). * try_to_release_page() does not perform I/O, but it is * possible for a page to have PageDirty set, but it is actually * clean (all its buffers are clean). This happens if the * buffers were written out directly, with submit_bh(). ext3 * will do this, as well as the blockdev mapping. * try_to_release_page() will discover that cleanness and will * drop the buffers and mark the page clean - it can be freed. * * Rarely, pages can have buffers and no ->mapping. These are * the pages which were not successfully invalidated in * truncate_complete_page(). We try to drop those buffers here * and if that worked, and the page is no longer mapped into * process address space (page_count == 1) it can be freed. * Otherwise, leave the page on the LRU so it is swappable. */ if (page_has_private(page)) { if (!try_to_release_page(page, sc->gfp_mask)) goto activate_locked; if (!mapping && page_count(page) == 1) { unlock_page(page); if (put_page_testzero(page)) goto free_it; else { /* * rare race with speculative reference. * the speculative reference will free * this page shortly, so we may * increment nr_reclaimed here (and * leave it off the LRU). */ nr_reclaimed++; continue; } } }
if (PageAnon(page) && !PageSwapBacked(page)) { /* follow __remove_mapping for reference */ if (!page_ref_freeze(page, 1)) goto keep_locked; if (PageDirty(page)) { page_ref_unfreeze(page, 1); goto keep_locked; }
count_vm_event(PGLAZYFREED); count_memcg_page_event(page, PGLAZYFREED); } else if (!mapping || !__remove_mapping(mapping, page, true)) goto keep_locked; /* * At this point, we have no other references and there is * no way to pick any more up (removed from LRU, removed * from pagecache). Can use non-atomic bitops now (and * we obviously don't have to worry about waking up a process * waiting on the page lock, because there are no references. */ __ClearPageLocked(page); free_it: nr_reclaimed++;
/* * Is there need to periodically free_page_list? It would * appear not as the counts should be low */ if (unlikely(PageTransHuge(page))) { mem_cgroup_uncharge(page); (*get_compound_page_dtor(page))(page); } else list_add(&page->lru, &free_pages);//put the page on free_pages so it can be freed later continue;
activate_locked: /* Not a candidate for swapping, so reclaim swap space. */ if (PageSwapCache(page) && (mem_cgroup_swap_full(page) || PageMlocked(page))) try_to_free_swap(page); VM_BUG_ON_PAGE(PageActive(page), page); if (!PageMlocked(page)) { SetPageActive(page); pgactivate++; count_memcg_page_event(page, PGACTIVATE); } keep_locked: unlock_page(page); keep: list_add(&page->lru, &ret_pages); VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page); }
static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) { struct anon_vma *anon_vma; struct vm_area_struct *vma; unsigned int mlocked = 0; int ret = SWAP_AGAIN; int unlock = TTU_ACTION(flags) == TTU_MUNLOCK;
if (MLOCK_PAGES && unlikely(unlock)) ret = SWAP_SUCCESS; /* default for try_to_munlock() */ /*for an anonymous page, look up and lock the anon_vma that describes its mappings*/ anon_vma = page_lock_anon_vma(page); if (!anon_vma) return ret; /*each vma is linked into the anon_vma list through its anon_vma_node field, with anon_vma->head as the list head*/ /*walk the anon_vma list of vma (linear region) descriptors*/ list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { if (MLOCK_PAGES && unlikely(unlock)) { if (!((vma->vm_flags & VM_LOCKED) && page_mapped_in_vma(page, vma))) continue; /* must visit all unlocked vmas */ ret = SWAP_MLOCK; /* saw at least one mlocked vma */ } else { /*call try_to_unmap_one() for every vma on the anon_vma list*/ ret = try_to_unmap_one(page, vma, flags); if (ret == SWAP_FAIL || !page_mapped(page)) break; } if (ret == SWAP_MLOCK) { mlocked = try_to_mlock_page(page, vma); if (mlocked) break; /* stop if actually mlocked page */ } }
page_unlock_anon_vma(anon_vma);
if (mlocked) ret = SWAP_MLOCK; /* actually mlocked the page */ else if (ret == SWAP_MLOCK) ret = SWAP_AGAIN; /* saw VM_LOCKED vma */
return ret; }
/* * Subfunctions of try_to_unmap: try_to_unmap_one called * repeatedly from either try_to_unmap_anon or try_to_unmap_file. */ /** * page is a pointer to the target page descriptor; * vma points to the vma (linear region) descriptor */ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, enum ttu_flags flags) { struct mm_struct *mm = vma->vm_mm; unsigned long address; pte_t *pte; pte_t pteval; spinlock_t *ptl; int ret = SWAP_AGAIN; /*compute the virtual address of the page to be reclaimed inside this vma*/ address = vma_address(page, vma); if (address == -EFAULT) goto out; /*look up the page-table entry for that virtual address*/ pte = page_check_address(page, mm, address, &ptl, 0); if (!pte) goto out;
/* * If the page is mlock()d, we cannot swap it out. * If it's recently referenced (perhaps page_referenced * skipped over this mm) then we should reactivate it. */ /*the checks below decide whether the page may be reclaimed*/ if (!(flags & TTU_IGNORE_MLOCK)) { if (vma->vm_flags & VM_LOCKED) { ret = SWAP_MLOCK; goto out_unmap; } } if (!(flags & TTU_IGNORE_ACCESS)) { if (ptep_clear_flush_young_notify(vma, address, pte)) { ret = SWAP_FAIL; goto out_unmap; } }
if (PageSwapCache(page)) { /* * Store the swap location in the pte. * See handle_pte_fault() ... */ /*remember the swap-out location*/ swap_duplicate(entry); if (list_empty(&mm->mmlist)) { spin_lock(&mmlist_lock); if (list_empty(&mm->mmlist)) /*add this mm to init_mm's mmlist; mm->mmlist links mm_structs that have swapped-out pages*/ list_add(&mm->mmlist, &init_mm.mmlist); spin_unlock(&mmlist_lock); } dec_mm_counter(mm, anon_rss); } else if (PAGE_MIGRATION) { /* * Store the pfn of the page in a special migration * pte. do_swap_page() will wait until the migration * pte is removed and then restart fault handling. */ BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION); entry = make_migration_entry(page, pte_write(pteval)); } set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); BUG_ON(pte_file(*pte)); } else if (PAGE_MIGRATION && (TTU_ACTION(flags) == TTU_MIGRATION)) { /* Establish migration entry for a file page */ swp_entry_t entry; entry = make_migration_entry(page, pte_write(pteval)); set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); } else dec_mm_counter(mm, file_rss);