A Linux Kernel Miracle Tour



Memory Reclaim

Why memory reclaim is needed

Linux is designed to trade memory for performance, so it will use as much memory as it can get. When you read a file, the kernel caches it in memory, so subsequent accesses operate on memory rather than on the actual storage device. As the system runs, more and more memory ends up serving as cache to speed things up, and when you later try to allocate memory you may find there is no longer enough free memory available. Memory reclaim exists to solve exactly this problem.

Which memory does reclaim target

Memory reclaim mainly targets user-space anonymous pages (heap / stack / private mmap), file-mapped pages (mmap'd files), and part of the kernel's caches. Anonymous pages are swapped out to swap space; dirty file-mapped pages are written back to the storage device and then dropped; clean, read-only file mappings are simply dropped to free their pages; the inode and dentry caches left behind after files are closed are reclaimed, as are slab caches, and so on.

From the point of view of which list a page sits on: generally, the pages that get reclaimed or swapped live on the inactive LRU lists, large-scale reclaim being the exception.

From the page-flag point of view, almost everything except pages marked PG_unevictable can be reclaimed or swapped.

When memory reclaim runs

When reclaim is triggered

  • Periodically, by scanning memory state in the background, which also makes page-fault handling more efficient.
  • Actively, at allocation time, so that under severe memory pressure urgent processes can still be given enough memory.

Choosing pages to reclaim

Imagine the following situation. A page fault occurs and the kernel allocates a page for us; we add it to the active LRU list and set its page flags to the active state. As time goes by, newly allocated pages are added at the head of the list, so the page we allocated first drifts toward the tail. Normally, since the tail page is the least recently used, it should be the one we pick for reclaim or swap. But that page may in fact have been accessed just a moment ago, and reclaiming it at that point would be the wrong choice. The kernel therefore introduces the second-chance algorithm to improve the selection.

The kernel uses LRU plus the second-chance algorithm to find the most suitable pages to reclaim.

The second-chance algorithm

At its core it uses the page flag PG_referenced to record whether the page has been referenced a second time. On architectures such as x86 the hardware already tracks whether a page has been accessed, so why is a software flag still needed? For compatibility: not every architecture provides that hardware support.

In essence, the second-chance algorithm works like this: when selecting a victim from the inactive LRU list, check this flag; if it is set, give the page a second chance and move on to the next candidate.

The LRU algorithm

The kernel's LRU implementation is very simple: new elements are always added at the head of the list, and victims are picked by following the prev pointer from the tail.

activate_page

The kernel uses activate_page to mark a page active and migrate it to the corresponding list.

void activate_page(struct page *page)
{
	page = compound_head(page);
	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
		struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);

		get_page(page);
		if (!pagevec_add(pvec, page) || PageCompound(page))
			pagevec_lru_move_fn(pvec, __activate_page, NULL);
		put_cpu_var(activate_page_pvecs);
	}
}

mark_page_accessed

The code is not traced here; its main job is to advance the page's flag state:

  State before call         State after call
  inactive, unreferenced    inactive, referenced
  inactive, referenced      active, unreferenced
  active, unreferenced      active, referenced

page_referenced

Checks how many PTEs reference the current page. It calls page_referenced_one, which in effect inspects the accessed flag kept in the low 12 bits of the PTE, and clears that flag at the same time.

page_check_references

The core function of the second-chance algorithm, called from shrink_page_list. It first uses page_referenced to get the number of PTEs referencing the page, then uses TestClearPageReferenced to find out (and clear) whether the page had been marked referenced.

  • PAGEREF_ACTIVATE
    • anonymous swap-backed pages
    • the second-access flag is already set, or more than one PTE references the page
    • executable file mappings
  • PAGEREF_KEEP - if the page is referenced by at least one PTE but none of the conditions above applies, it stays on the inactive LRU list for now.
  • PAGEREF_RECLAIM - the page is suitable for reclaim.
  • PAGEREF_RECLAIM_CLEAN - reclaim only if the page is clean; a dirty page in this state is kept and deferred to writeback.
static enum page_references page_check_references(struct page *page,
						  struct scan_control *sc)
{
	int referenced_ptes, referenced_page;
	unsigned long vm_flags;

	referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
					  &vm_flags);
	referenced_page = TestClearPageReferenced(page);

	if (vm_flags & VM_LOCKED)
		return PAGEREF_RECLAIM;

	if (referenced_ptes) {
		if (PageSwapBacked(page))
			return PAGEREF_ACTIVATE;
		SetPageReferenced(page);
		if (referenced_page || referenced_ptes > 1)
			return PAGEREF_ACTIVATE;
		/*
		 * Activate file-backed executable pages after first usage.
		 */
		if (vm_flags & VM_EXEC)
			return PAGEREF_ACTIVATE;
		return PAGEREF_KEEP;
	}

	/* Reclaim if clean, defer dirty pages to writeback */
	if (referenced_page && !PageSwapBacked(page))
		return PAGEREF_RECLAIM_CLEAN;

	return PAGEREF_RECLAIM;
}

pagevec

To make operations on the LRU lists more efficient, the kernel batches them: pages are first staged in a pagevec, and only when it fills up are they flushed to the corresponding LRU list in one go.

struct pagevec {
	unsigned char nr;
	bool percpu_pvec_drained;
	struct page *pages[PAGEVEC_SIZE];
};

The pagevec API is as follows:

  • pagevec_init - initialize a pagevec
  • pagevec_reinit - reset a pagevec
  • pagevec_count - number of pages currently held in the pagevec
  • pagevec_space - remaining free slots in the pagevec
  • pagevec_add - add a page to the pagevec
  • pagevec_release - decrement each page's _refcount; pages whose count drops to 0 are returned to the buddy system
static inline void pagevec_init(struct pagevec *pvec)
{
	pvec->nr = 0;
	pvec->percpu_pvec_drained = false;
}

static inline void pagevec_reinit(struct pagevec *pvec)
{
	pvec->nr = 0;
}

static inline unsigned pagevec_count(struct pagevec *pvec)
{
	return pvec->nr;
}

static inline unsigned pagevec_space(struct pagevec *pvec)
{
	return PAGEVEC_SIZE - pvec->nr;
}

/*
 * Add a page to a pagevec.  Returns the number of slots still available.
 */
static inline unsigned pagevec_add(struct pagevec *pvec, struct page *page)
{
	pvec->pages[pvec->nr++] = page;
	return pagevec_space(pvec);
}

static inline void pagevec_release(struct pagevec *pvec)
{
	if (pagevec_count(pvec))
		__pagevec_release(pvec);
}

Let us look more closely at the release path. pagevec_release calls release_pages, which first obtains the lruvec from locked_pgdat, removes each page from its LRU list, and adds it to the temporary pages_to_free list; once every page in the pagevec has been walked, free_unref_page_list (implemented in page_alloc.c) releases them together back to the buddy system. This is also how memory reclaim frees pages once it has screened out suitable victims: in batches, not one page at a time.

void release_pages(struct page **pages, int nr)
{
	int i;
	LIST_HEAD(pages_to_free);
	struct pglist_data *locked_pgdat = NULL;
	struct lruvec *lruvec;
	unsigned long uninitialized_var(flags);
	unsigned int uninitialized_var(lock_batch);

	for (i = 0; i < nr; i++) {
		struct page *page = pages[i];
		...
		if (PageLRU(page)) {
			struct pglist_data *pgdat = page_pgdat(page);
			...
			lruvec = mem_cgroup_page_lruvec(page, locked_pgdat);
			__ClearPageLRU(page);
			del_page_from_lru_list(page, lruvec, page_off_lru(page));
		}

		/* Clear Active bit in case of parallel mark_page_accessed */
		__ClearPageActive(page);
		__ClearPageWaiters(page);

		list_add(&page->lru, &pages_to_free);
	}
	...
	free_unref_page_list(&pages_to_free);
}

The LRU lists

As kernel versions have evolved, the LRU lists have changed from the original two global lists, inactive_lru and active_lru, to multiple LRU lists embedded in the pglist_data node. As the code below shows, there are five LRU lists, one for each kind of page:

  • LRU_INACTIVE_ANON
  • LRU_ACTIVE_ANON
  • LRU_INACTIVE_FILE
  • LRU_ACTIVE_FILE
  • LRU_UNEVICTABLE
enum lru_list {
	LRU_INACTIVE_ANON = LRU_BASE,
	LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE,
	LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,
	LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE,
	LRU_UNEVICTABLE,
	NR_LRU_LISTS
};

struct lruvec {
	struct list_head		lists[NR_LRU_LISTS];
	struct zone_reclaim_stat	reclaim_stat;
	/* Evictions & activations on the inactive file list */
	atomic_long_t			inactive_age;
	/* Refaults at the time of last reclaim cycle */
	unsigned long			refaults;
#ifdef CONFIG_MEMCG
	struct pglist_data *pgdat;
#endif
};

typedef struct pglist_data {
	/* Fields commonly accessed by the page reclaim scanner */
	struct lruvec		lruvec;
} pg_data_t;

Operations on the LRU lists

Each of the functions below takes an enum lru_list lru parameter, which selects the list to operate on.

static __always_inline void add_page_to_lru_list(struct page *page,
				struct lruvec *lruvec, enum lru_list lru)
{
	update_lru_size(lruvec, lru, page_zonenum(page), hpage_nr_pages(page));
	list_add(&page->lru, &lruvec->lists[lru]);
}

static __always_inline void add_page_to_lru_list_tail(struct page *page,
				struct lruvec *lruvec, enum lru_list lru)
{
	update_lru_size(lruvec, lru, page_zonenum(page), hpage_nr_pages(page));
	list_add_tail(&page->lru, &lruvec->lists[lru]);
}

static __always_inline void del_page_from_lru_list(struct page *page,
				struct lruvec *lruvec, enum lru_list lru)
{
	list_del(&page->lru);
	update_lru_size(lruvec, lru, page_zonenum(page), -hpage_nr_pages(page));
}

The LRU caches

The kernel also provides the following per-CPU LRU pagevec caches to speed up LRU operations:


static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs);

lru_cache_add

This is the interface for adding a page to an LRU list. As the code below shows, the real work happens in __lru_cache_add: the page is first added to the lru_add_pvec pagevec, and if that pagevec turns out to be full, its pages are flushed to the five LRU lists of the corresponding memory node; the flush itself is implemented in __pagevec_lru_add_fn. In page_lru you can see how the page's flags are turned into an index into the lruvec's array of lists, so that different kinds of pages end up on different lists.

static __always_inline enum lru_list page_lru(struct page *page)
{
	enum lru_list lru;

	if (PageUnevictable(page))
		lru = LRU_UNEVICTABLE;
	else {
		lru = page_lru_base_type(page);
		if (PageActive(page))
			lru += LRU_ACTIVE;
	}
	return lru;
}

void __pagevec_lru_add(struct pagevec *pvec)
{
	pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL);
}

static void __lru_cache_add(struct page *page)
{
	struct pagevec *pvec = &get_cpu_var(lru_add_pvec);

	get_page(page);
	if (!pagevec_add(pvec, page) || PageCompound(page))
		__pagevec_lru_add(pvec);
	put_cpu_var(lru_add_pvec);
}

void lru_cache_add(struct page *page)
{
	VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
	VM_BUG_ON_PAGE(PageLRU(page), page);
	__lru_cache_add(page);
}

lru_add_drain

Moves all pages held in the lru_add_pvec, lru_rotate_pvecs, lru_deactivate_file_pvecs, lru_lazyfree_pvecs, and activate_page_pvecs caches onto the memory node's LRU lists. It is called at the start of the reclaim path, at the top of shrink_active_list and shrink_inactive_list.

Isolating LRU pages

The kernel provides isolate_lru_pages to isolate pages from an LRU list. Why isolate? Efficiency. The reclaim path inevitably walks the LRU lists in various ways, but the system keeps running in the meantime and touches those lists frequently too. To avoid contending on the LRU lock for the whole duration, the kernel detaches the candidate pages from the LRU list and does the screening for reclaim on a private list.

static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
		struct lruvec *lruvec, struct list_head *dst,
		unsigned long *nr_scanned, struct scan_control *sc,
		isolate_mode_t mode, enum lru_list lru)
{
	struct list_head *src = &lruvec->lists[lru];
	unsigned long nr_taken = 0;
	unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
	unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
	unsigned long skipped = 0;
	unsigned long scan, total_scan, nr_pages;
	LIST_HEAD(pages_skipped);

	scan = 0;
	for (total_scan = 0;
	     scan < nr_to_scan && nr_taken < nr_to_scan && !list_empty(src);
	     total_scan++) {
		struct page *page;

		page = lru_to_page(src);
		prefetchw_prev_lru_page(page, src, flags);

		VM_BUG_ON_PAGE(!PageLRU(page), page);

		if (page_zonenum(page) > sc->reclaim_idx) {
			list_move(&page->lru, &pages_skipped);
			nr_skipped[page_zonenum(page)]++;
			continue;
		}

		scan++;
		switch (__isolate_lru_page(page, mode)) {
		case 0:
			nr_pages = hpage_nr_pages(page);
			nr_taken += nr_pages;
			nr_zone_taken[page_zonenum(page)] += nr_pages;
			list_move(&page->lru, dst);
			break;

		case -EBUSY:
			/* else it is being freed elsewhere */
			list_move(&page->lru, src);
			continue;

		default:
			BUG();
		}
	}

	if (!list_empty(&pages_skipped)) {
		int zid;

		list_splice(&pages_skipped, src);
		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
			if (!nr_skipped[zid])
				continue;

			__count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
			skipped += nr_skipped[zid];
		}
	}
	*nr_scanned = total_scan;
	trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
				    total_scan, skipped, nr_taken, mode, lru);
	update_lru_sizes(lruvec, lru, nr_zone_taken);
	return nr_taken;
}

Memory watermarks

Other important functions in page reclaim

shrink_active_list

First lru_add_drain flushes the pagevec caches onto their LRU lists, then isolate_lru_pages detaches nr_to_scan pages from the tail of the chosen LRU list onto the temporary l_hold list. l_hold is then walked from the tail: a page that has not been referenced has its active flag cleared and is added at the head of the temporary l_inactive list; a page that has been referenced, is file cache, and has execute permission is added at the head of the temporary l_active list. Next, move_active_pages_to_lru moves pages whose _refcount is 1 onto the temporary l_hold list (nothing in the kernel references them any more, so they can be freed) and puts pages whose _refcount is greater than 1 back onto the corresponding LRU lists. Finally the pages left on l_hold are released.

static void shrink_active_list(unsigned long nr_to_scan,
			       struct lruvec *lruvec,
			       struct scan_control *sc,
			       enum lru_list lru)
{
	unsigned long nr_taken;
	unsigned long nr_scanned;
	unsigned long vm_flags;
	LIST_HEAD(l_hold);	/* The pages which were snipped off */
	LIST_HEAD(l_active);
	LIST_HEAD(l_inactive);
	struct page *page;
	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
	unsigned nr_deactivate, nr_activate;
	unsigned nr_rotated = 0;
	isolate_mode_t isolate_mode = 0;
	int file = is_file_lru(lru);
	struct pglist_data *pgdat = lruvec_pgdat(lruvec);

	lru_add_drain();

	if (!sc->may_unmap)
		isolate_mode |= ISOLATE_UNMAPPED;

	spin_lock_irq(&pgdat->lru_lock);

	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
				     &nr_scanned, sc, isolate_mode, lru);

	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
	reclaim_stat->recent_scanned[file] += nr_taken;

	__count_vm_events(PGREFILL, nr_scanned);
	count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);

	spin_unlock_irq(&pgdat->lru_lock);

	while (!list_empty(&l_hold)) {
		cond_resched();
		page = lru_to_page(&l_hold);
		list_del(&page->lru);

		if (unlikely(!page_evictable(page))) {
			putback_lru_page(page);
			continue;
		}

		if (unlikely(buffer_heads_over_limit)) {
			if (page_has_private(page) && trylock_page(page)) {
				if (page_has_private(page))
					try_to_release_page(page, 0);
				unlock_page(page);
			}
		}

		if (page_referenced(page, 0, sc->target_mem_cgroup,
				    &vm_flags)) {
			nr_rotated += hpage_nr_pages(page);
			if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
				list_add(&page->lru, &l_active);
				continue;
			}
		}

		ClearPageActive(page);	/* we are de-activating */
		list_add(&page->lru, &l_inactive);
	}

	spin_lock_irq(&pgdat->lru_lock);
	reclaim_stat->recent_rotated[file] += nr_rotated;

	nr_activate = move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
	nr_deactivate = move_active_pages_to_lru(lruvec, &l_inactive, &l_hold,
						 lru - LRU_ACTIVE);
	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
	spin_unlock_irq(&pgdat->lru_lock);

	mem_cgroup_uncharge_list(&l_hold);
	free_unref_page_list(&l_hold);
	trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
			nr_deactivate, nr_rotated, sc->priority, file);
}

shrink_inactive_list

static noinline_for_stack unsigned long
shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
		     struct scan_control *sc, enum lru_list lru)
{
	LIST_HEAD(page_list);
	unsigned long nr_scanned;
	unsigned long nr_reclaimed = 0;
	unsigned long nr_taken;
	struct reclaim_stat stat = {};
	isolate_mode_t isolate_mode = 0;
	int file = is_file_lru(lru);
	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
	bool stalled = false;

	while (unlikely(too_many_isolated(pgdat, file, sc))) {
		if (stalled)
			return 0;

		/* wait a bit for the reclaimer. */
		msleep(100);
		stalled = true;

		/* We are about to die and free our memory. Return now. */
		if (fatal_signal_pending(current))
			return SWAP_CLUSTER_MAX;
	}

	lru_add_drain();

	if (!sc->may_unmap)
		isolate_mode |= ISOLATE_UNMAPPED;

	spin_lock_irq(&pgdat->lru_lock);

	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
				     &nr_scanned, sc, isolate_mode, lru);

	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
	reclaim_stat->recent_scanned[file] += nr_taken;

	if (current_is_kswapd()) {
		if (global_reclaim(sc))
			__count_vm_events(PGSCAN_KSWAPD, nr_scanned);
		count_memcg_events(lruvec_memcg(lruvec), PGSCAN_KSWAPD,
				   nr_scanned);
	} else {
		if (global_reclaim(sc))
			__count_vm_events(PGSCAN_DIRECT, nr_scanned);
		count_memcg_events(lruvec_memcg(lruvec), PGSCAN_DIRECT,
				   nr_scanned);
	}
	spin_unlock_irq(&pgdat->lru_lock);

	if (nr_taken == 0)
		return 0;

	nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, 0,
					&stat, false);

	spin_lock_irq(&pgdat->lru_lock);

	if (current_is_kswapd()) {
		if (global_reclaim(sc))
			__count_vm_events(PGSTEAL_KSWAPD, nr_reclaimed);
		count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_KSWAPD,
				   nr_reclaimed);
	} else {
		if (global_reclaim(sc))
			__count_vm_events(PGSTEAL_DIRECT, nr_reclaimed);
		count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_DIRECT,
				   nr_reclaimed);
	}

	putback_inactive_pages(lruvec, &page_list);

	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);

	spin_unlock_irq(&pgdat->lru_lock);

	mem_cgroup_uncharge_list(&page_list);
	free_unref_page_list(&page_list);

	if (stat.nr_writeback && stat.nr_writeback == nr_taken)
		set_bit(PGDAT_WRITEBACK, &pgdat->flags);

	if (stat.nr_unqueued_dirty == nr_taken)
		wakeup_flusher_threads(WB_REASON_VMSCAN);

	/*
	 * Legacy memcg will stall in page writeback so avoid forcibly
	 * stalling here.
	 */
	if (sane_reclaim(sc)) {
		if (stat.nr_dirty && stat.nr_dirty == stat.nr_congested)
			set_bit(PGDAT_CONGESTED, &pgdat->flags);

		/* Allow kswapd to start writing pages during reclaim. */
		if (stat.nr_unqueued_dirty == nr_taken)
			set_bit(PGDAT_DIRTY, &pgdat->flags);

		if (stat.nr_immediate && current_may_throttle())
			congestion_wait(BLK_RW_ASYNC, HZ/10);
	}

	if (!sc->hibernation_mode && !current_is_kswapd() &&
	    current_may_throttle())
		wait_iff_congested(pgdat, BLK_RW_ASYNC, HZ/10);

	trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
			nr_scanned, nr_reclaimed,
			stat.nr_dirty,  stat.nr_writeback,
			stat.nr_congested, stat.nr_immediate,
			stat.nr_activate, stat.nr_ref_keep,
			stat.nr_unmap_fail,
			sc->priority, file);
	return nr_reclaimed;
}

move_active_pages_to_lru

Walks the list passed in:

  • pages whose _refcount is 1 are added to the pages_to_free list
  • pages whose _refcount is greater than 1 are moved onto the LRU list selected by lru
static unsigned move_active_pages_to_lru(struct lruvec *lruvec,
					 struct list_head *list,
					 struct list_head *pages_to_free,
					 enum lru_list lru)
{
	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
	struct page *page;
	int nr_pages;
	int nr_moved = 0;

	while (!list_empty(list)) {
		page = lru_to_page(list);
		lruvec = mem_cgroup_page_lruvec(page, pgdat);

		VM_BUG_ON_PAGE(PageLRU(page), page);
		SetPageLRU(page);

		nr_pages = hpage_nr_pages(page);
		update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
		list_move(&page->lru, &lruvec->lists[lru]);

		if (put_page_testzero(page)) {
			__ClearPageLRU(page);
			__ClearPageActive(page);
			del_page_from_lru_list(page, lruvec, lru);

			if (unlikely(PageCompound(page))) {
				spin_unlock_irq(&pgdat->lru_lock);
				mem_cgroup_uncharge(page);
				(*get_compound_page_dtor(page))(page);
				spin_lock_irq(&pgdat->lru_lock);
			} else
				list_add(&page->lru, pages_to_free);
		} else {
			nr_moved += nr_pages;
		}
	}

	if (!is_active_lru(lru)) {
		__count_vm_events(PGDEACTIVATE, nr_moved);
		count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
				   nr_moved);
	}

	return nr_moved;
}

The page reclaim flow

Okay. So far we have seen how to tell whether a page has been accessed, how the LRU lists are updated, how the LRU caches are used, and how to judge whether a memory node is balanced. Now let's walk through the actual page reclaim flow.

The kernel creates one kswapd thread per memory node for memory reclaim. It does essentially two things: if the node's memory is in a balanced state, it goes to sleep; if not, it does its best to bring the node back into balance so that allocation requests can be satisfied. As the names suggest, kswapd_try_to_sleep is what puts kswapd to sleep, and balance_pgdat is what reclaims memory. Before entering balance_pgdat, two parameters have to be settled: alloc_order, which sets the size target for this round of reclaim, and classzone_idx, which decides where to reclaim, that is, from which zones of the current memory node. Going back to wakeup_kswapd, these two variables are assigned before the kswapd thread is woken: the former is max(pgdat->kswapd_order, the order requested by alloc_page), and the latter is max(pgdat->kswapd_classzone_idx, the classzone_idx of the alloc_page allocation context ac).

static int kswapd(void *p)
{
	...
	for ( ; ; ) {
		bool ret;

		alloc_order = reclaim_order = pgdat->kswapd_order;
		classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);

kswapd_try_sleep:
		kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
					classzone_idx);

		/* Read the new order and classzone_idx */
		alloc_order = reclaim_order = pgdat->kswapd_order;
		classzone_idx = kswapd_classzone_idx(pgdat, 0);
		...
		reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx);
		if (reclaim_order < alloc_order)
			goto kswapd_try_sleep;
	}

	tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
	current->reclaim_state = NULL;

	return 0;
}

Now let's see how it actually reclaims.

static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
{
	int i;
	unsigned long nr_soft_reclaimed;
	unsigned long nr_soft_scanned;
	struct zone *zone;
	struct scan_control sc = {
		.gfp_mask = GFP_KERNEL,
		.order = order,
		.priority = DEF_PRIORITY,
		.may_writepage = !laptop_mode,
		.may_unmap = 1,
		.may_swap = 1,
	};
	count_vm_event(PAGEOUTRUN);

	do {
		unsigned long nr_reclaimed = sc.nr_reclaimed;
		bool raise_priority = true;

		sc.reclaim_idx = classzone_idx;
		...
		/* If the node is already balanced for this allocation, we are done */
		if (pgdat_balanced(pgdat, sc.order, classzone_idx))
			goto out;
		...
		/*
		 * The kernel maintains an inactive ratio = active / inactive.
		 * If the ratio is not met, move 32 pages from the active anon
		 * LRU to the inactive anon LRU.
		 */
		age_active_anon(pgdat, &sc);

		/* Once the scan priority has risen to 9 or below, writeback is allowed */
		if (sc.priority < DEF_PRIORITY - 2)
			sc.may_writepage = 1;

		/* Parameters for memcg soft-limit reclaim */
		sc.nr_scanned = 0;
		nr_soft_scanned = 0;
		nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order,
						sc.gfp_mask, &nr_soft_scanned);
		sc.nr_reclaimed += nr_soft_reclaimed;

		/* The node reclaim function; on success the scan priority need not be raised */
		if (kswapd_shrink_node(pgdat, &sc))
			raise_priority = false;
		...
		/* Decide whether we need to scan harder */
		nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
		if (raise_priority || !nr_reclaimed)
			sc.priority--;
	} while (sc.priority >= 1);

	if (!sc.nr_reclaimed)
		pgdat->kswapd_failures++;

out:
	snapshot_refaults(NULL, pgdat);
	return sc.order;
}

shrink_page_list

kswapd_shrink_node -> shrink_node_memcg -> shrink_list -> shrink_inactive_list -> shrink_page_list

Skipping the intermediate calls, let's go straight to the core function, which decides which pages actually get reclaimed. It is rather long, so parts that would distract from the main steps have been removed.

static unsigned long shrink_page_list(struct list_head *page_list,
				      struct pglist_data *pgdat,
				      struct scan_control *sc,
				      enum ttu_flags ttu_flags,
				      struct reclaim_stat *stat,
				      bool force_reclaim)
{
	...
	while (!list_empty(page_list)) {
		struct address_space *mapping;
		struct page *page;
		int may_enter_fs;
		enum page_references references = PAGEREF_RECLAIM_CLEAN;
		bool dirty, writeback;

		page = lru_to_page(page_list);
		list_del(&page->lru);

		/* First check whether the page got a second reference */
		if (!force_reclaim)
			references = page_check_references(page, sc);

		switch (references) {
		case PAGEREF_ACTIVATE:
			goto activate_locked;
		case PAGEREF_KEEP:
			nr_ref_keep++;
			goto keep_locked;
		case PAGEREF_RECLAIM:
		case PAGEREF_RECLAIM_CLEAN:
			; /* try to reclaim the page below */
		}

		/* From here on we are actually reclaiming the page */
		if (PageAnon(page) && PageSwapBacked(page)) {
			/* Anonymous page without a swap slot yet: allocate one via add_to_swap */
			if (!PageSwapCache(page)) {
				if (!(sc->gfp_mask & __GFP_IO))
					goto keep_locked;
				...
				if (!add_to_swap(page)) {
					if (!PageTransHuge(page))
						goto activate_locked;
					/* Fallback to swap normal pages */
					if (split_huge_page_to_list(page,
								    page_list))
						goto activate_locked;
					if (!add_to_swap(page))
						goto activate_locked;
				}

				may_enter_fs = 1;

				/* Update the address_space to the swap space address */
				mapping = page_mapping(page);
			}
		} else if (unlikely(PageTransHuge(page))) {
			/* Split file THP */
			if (split_huge_page_to_list(page, page_list))
				goto keep_locked;
		}

		/* Check _mapcount: the page may be mapped by multiple users */
		if (page_mapped(page)) {
			enum ttu_flags flags = ttu_flags | TTU_BATCH_FLUSH;

			if (unlikely(PageTransHuge(page)))
				flags |= TTU_SPLIT_HUGE_PMD;
			/* Tear down those users' PTE mappings */
			if (!try_to_unmap(page, flags)) {
				nr_unmap_fail++;
				goto activate_locked;
			}
		}

		/* If the page is dirty */
		if (PageDirty(page)) {
			if (page_is_file_cache(page) &&
			    (!current_is_kswapd() || !PageReclaim(page) ||
			     !test_bit(PGDAT_DIRTY, &pgdat->flags))) {
				inc_node_page_state(page, NR_VMSCAN_IMMEDIATE);
				SetPageReclaim(page);

				goto activate_locked;
			}

			if (references == PAGEREF_RECLAIM_CLEAN)
				goto keep_locked;
			if (!may_enter_fs)
				goto keep_locked;
			if (!sc->may_writepage)
				goto keep_locked;

			try_to_unmap_flush_dirty();
			/* Write the page out to swap or the filesystem */
			switch (pageout(page, mapping, sc)) {
			case PAGE_KEEP:
				goto keep_locked;
			case PAGE_ACTIVATE:
				goto activate_locked;
			case PAGE_SUCCESS:
				if (PageWriteback(page))
					goto keep;
				if (PageDirty(page))
					goto keep;

				if (!trylock_page(page))
					goto keep;
				if (PageDirty(page) || PageWriteback(page))
					goto keep_locked;
				mapping = page_mapping(page);
			case PAGE_CLEAN:
				; /* try to free the page below */
			}
		}

		/* If the page is a block device's buffer cache */
		if (page_has_private(page)) {
			if (!try_to_release_page(page, sc->gfp_mask))
				goto activate_locked;
			if (!mapping && page_count(page) == 1) {
				unlock_page(page);
				if (put_page_testzero(page))
					goto free_it;
				else {
					nr_reclaimed++;
					continue;
				}
			}
		}

		/* From here on: final bookkeeping before freeing the page */
		if (PageAnon(page) && !PageSwapBacked(page)) {
			/* follow __remove_mapping for reference */
			if (!page_ref_freeze(page, 1))
				goto keep_locked;
			if (PageDirty(page)) {
				page_ref_unfreeze(page, 1);
				goto keep_locked;
			}

			count_vm_event(PGLAZYFREED);
			count_memcg_page_event(page, PGLAZYFREED);
		} else if (!mapping || !__remove_mapping(mapping, page, true))
			goto keep_locked;

		__ClearPageLocked(page);
free_it:
		nr_reclaimed++;

		if (unlikely(PageTransHuge(page))) {
			mem_cgroup_uncharge(page);
			(*get_compound_page_dtor(page))(page);
		} else
			/* Add to the temporary free_pages list; the caller frees them in one batch */
			list_add(&page->lru, &free_pages);

		continue;
	}
	...
}

Finally, a flow chart of the whole process.

Published: 2024-03-09 23:57:06
Original link: https://www.elefans.com/category/jswz/34/1726489.html