在Linux内存管理中,系统启动时,函数start_kernel()会调用mm_init()对内存相关的模块进行初始化,而mm_init()又会调用mem_init()。这里我们关注mem_init()函数的实现,它是跟体系架构相关的,不同体系架构的实现并不相同,但处理过程大致类似,即把内存释放到伙伴系统,并设置一些内存方面的全局变量。下面我们看不同体系架构下的实现:
ARM:
/*
 * mem_init() - ARM (32-bit) variant.
 *
 * In order, this function:
 *   1. records the highest mem_map index via set_max_mapnr();
 *   2. calls free_unused_memmap(), free_all_bootmem() and free_highpages();
 *   3. prints the memory summary (mem_init_print_info()) and the virtual
 *      kernel memory layout table;
 *   4. checks address-space boundaries both at build time and at run time;
 *   5. forces OVERCOMMIT_ALWAYS on very small machines.
 *
 * Every preprocessor directive is kept on its own line: a '#' that does
 * not start a line is not treated as a directive by the preprocessor.
 */
void __init mem_init(void)
{
#ifdef CONFIG_HAVE_TCM
	/* These pointers are filled in on TCM detection */
	extern u32 dtcm_end;
	extern u32 itcm_end;
#endif

	set_max_mapnr(pfn_to_page(max_pfn) - mem_map);

	/* this will put all unused low memory onto the freelists */
	free_unused_memmap();
	free_all_bootmem();

#ifdef CONFIG_SA1111
	/* now that our DMA memory is actually so designated, we can free it */
	free_reserved_area(__va(PHYS_OFFSET), swapper_pg_dir, -1, NULL);
#endif

	free_highpages();

	mem_init_print_info(NULL);

	/*
	 * Helper macros: each expands to three printf arguments
	 * (base, top, size), with the size in kB (MLK), MB (MLM) or
	 * kB rounded up (MLK_ROUNDUP).
	 */
#define MLK(b, t) b, t, ((t) - (b)) >> 10
#define MLM(b, t) b, t, ((t) - (b)) >> 20
#define MLK_ROUNDUP(b, t) b, t, DIV_ROUND_UP(((t) - (b)), SZ_1K)

	pr_notice("Virtual kernel memory layout:\n"
			" vector : 0x%08lx - 0x%08lx (%4ld kB)\n"
#ifdef CONFIG_HAVE_TCM
			" DTCM : 0x%08lx - 0x%08lx (%4ld kB)\n"
			" ITCM : 0x%08lx - 0x%08lx (%4ld kB)\n"
#endif
			" fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
			" vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n"
			" lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n"
#ifdef CONFIG_HIGHMEM
			" pkmap : 0x%08lx - 0x%08lx (%4ld MB)\n"
#endif
#ifdef CONFIG_MODULES
			" modules : 0x%08lx - 0x%08lx (%4ld MB)\n"
#endif
			" .text : 0x%p" " - 0x%p" " (%4td kB)\n"
			" .init : 0x%p" " - 0x%p" " (%4td kB)\n"
			" .data : 0x%p" " - 0x%p" " (%4td kB)\n"
			" .bss : 0x%p" " - 0x%p" " (%4td kB)\n",

			MLK(UL(CONFIG_VECTORS_BASE), UL(CONFIG_VECTORS_BASE) +
				(PAGE_SIZE)),
#ifdef CONFIG_HAVE_TCM
			MLK(DTCM_OFFSET, (unsigned long) dtcm_end),
			MLK(ITCM_OFFSET, (unsigned long) itcm_end),
#endif
			MLK(FIXADDR_START, FIXADDR_END),
			MLM(VMALLOC_START, VMALLOC_END),
			MLM(PAGE_OFFSET, (unsigned long)high_memory),
#ifdef CONFIG_HIGHMEM
			MLM(PKMAP_BASE, (PKMAP_BASE) + (LAST_PKMAP) *
				(PAGE_SIZE)),
#endif
#ifdef CONFIG_MODULES
			MLM(MODULES_VADDR, MODULES_END),
#endif

			MLK_ROUNDUP(_text, _etext),
			MLK_ROUNDUP(__init_begin, __init_end),
			MLK_ROUNDUP(_sdata, _edata),
			MLK_ROUNDUP(__bss_start, __bss_stop));

#undef MLK
#undef MLM
#undef MLK_ROUNDUP

	/*
	 * Check boundaries twice: Some fundamental inconsistencies can
	 * be detected at build time already.
	 */
#ifdef CONFIG_MMU
	BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR);
	BUG_ON(TASK_SIZE > MODULES_VADDR);
#endif

#ifdef CONFIG_HIGHMEM
	BUILD_BUG_ON(PKMAP_BASE + LAST_PKMAP * PAGE_SIZE > PAGE_OFFSET);
	BUG_ON(PKMAP_BASE + LAST_PKMAP * PAGE_SIZE > PAGE_OFFSET);
#endif

	if (PAGE_SIZE >= 16384 && get_num_physpages() <= 128) {
		extern int sysctl_overcommit_memory;
		/*
		 * On a machine this small we won't get
		 * anywhere without overcommit, so turn
		 * it on by default.
		 */
		sysctl_overcommit_memory = OVERCOMMIT_ALWAYS;
	}
}
ARM64:
/*
 * mem_init() - ARM64 variant.
 *
 * In order, this function:
 *   1. initialises swiotlb when it is forced or when max_pfn lies above
 *      the arm64 DMA physical limit;
 *   2. records the highest mem_map index via set_max_mapnr();
 *   3. calls free_unused_memmap() (only without CONFIG_SPARSEMEM_VMEMMAP)
 *      and free_all_bootmem();
 *   4. prints the memory summary and the virtual kernel memory layout;
 *   5. performs build-time sanity checks;
 *   6. forces OVERCOMMIT_ALWAYS on very small machines.
 *
 * Every preprocessor directive is kept on its own line: a '#' that does
 * not start a line is not treated as a directive by the preprocessor.
 */
void __init mem_init(void)
{
	if (swiotlb_force || max_pfn > (arm64_dma_phys_limit >> PAGE_SHIFT))
		swiotlb_init(1);

	set_max_mapnr(pfn_to_page(max_pfn) - mem_map);

#ifndef CONFIG_SPARSEMEM_VMEMMAP
	free_unused_memmap();
#endif
	/* this will put all unused low memory onto the freelists */
	free_all_bootmem();

	mem_init_print_info(NULL);

	/*
	 * Helper macros: each expands to three printf arguments
	 * (base, top, size), with the size in kB (MLK), MB (MLM),
	 * GB (MLG) or kB rounded up (MLK_ROUNDUP).
	 */
#define MLK(b, t) b, t, ((t) - (b)) >> 10
#define MLM(b, t) b, t, ((t) - (b)) >> 20
#define MLG(b, t) b, t, ((t) - (b)) >> 30
#define MLK_ROUNDUP(b, t) b, t, DIV_ROUND_UP(((t) - (b)), SZ_1K)

	pr_notice("Virtual kernel memory layout:\n");
#ifdef CONFIG_KASAN
	pr_notice(" kasan : 0x%16lx - 0x%16lx (%6ld GB)\n",
		  MLG(KASAN_SHADOW_START, KASAN_SHADOW_END));
#endif
	pr_notice(" modules : 0x%16lx - 0x%16lx (%6ld MB)\n",
		  MLM(MODULES_VADDR, MODULES_END));
	pr_notice(" vmalloc : 0x%16lx - 0x%16lx (%6ld GB)\n",
		  MLG(VMALLOC_START, VMALLOC_END));
	pr_notice(" .text : 0x%p" " - 0x%p" " (%6ld KB)\n",
		  MLK_ROUNDUP(_text, _etext));
	pr_notice(" .rodata : 0x%p" " - 0x%p" " (%6ld KB)\n",
		  MLK_ROUNDUP(__start_rodata, __init_begin));
	pr_notice(" .init : 0x%p" " - 0x%p" " (%6ld KB)\n",
		  MLK_ROUNDUP(__init_begin, __init_end));
	pr_notice(" .data : 0x%p" " - 0x%p" " (%6ld KB)\n",
		  MLK_ROUNDUP(_sdata, _edata));
	pr_notice(" .bss : 0x%p" " - 0x%p" " (%6ld KB)\n",
		  MLK_ROUNDUP(__bss_start, __bss_stop));
	pr_notice(" fixed : 0x%16lx - 0x%16lx (%6ld KB)\n",
		  MLK(FIXADDR_START, FIXADDR_TOP));
	pr_notice(" PCI I/O : 0x%16lx - 0x%16lx (%6ld MB)\n",
		  MLM(PCI_IO_START, PCI_IO_END));
#ifdef CONFIG_SPARSEMEM_VMEMMAP
	pr_notice(" vmemmap : 0x%16lx - 0x%16lx (%6ld GB maximum)\n",
		  MLG(VMEMMAP_START, VMEMMAP_START + VMEMMAP_SIZE));
	pr_notice(" 0x%16lx - 0x%16lx (%6ld MB actual)\n",
		  MLM((unsigned long)phys_to_page(memblock_start_of_DRAM()),
		      (unsigned long)virt_to_page(high_memory)));
#endif
	pr_notice(" memory : 0x%16lx - 0x%16lx (%6ld MB)\n",
		  MLM(__phys_to_virt(memblock_start_of_DRAM()),
		      (unsigned long)high_memory));

	/* MLG is undefined together with its siblings so it does not leak. */
#undef MLK
#undef MLM
#undef MLG
#undef MLK_ROUNDUP

#ifdef CONFIG_COMPAT
	BUILD_BUG_ON(TASK_SIZE_32 > TASK_SIZE_64);
#endif

	BUILD_BUG_ON(sizeof(struct page) > (1 << STRUCT_PAGE_MAX_SHIFT));

	/* Very small machine: enable overcommit by default. */
	if (PAGE_SIZE >= 16384 && get_num_physpages() <= 128) {
		extern int sysctl_overcommit_memory;
		sysctl_overcommit_memory = OVERCOMMIT_ALWAYS;
	}
}
x86_32:
/*
 * mem_init() - x86 32-bit variant.
 *
 * In order, this function:
 *   1. calls pci_iommu_alloc();
 *   2. initialises highmem pages, then calls free_all_bootmem() and sets
 *      after_bootmem;
 *   3. prints the memory summary and the virtual kernel memory layout;
 *   4. checks address-space boundaries both at build time and at run time;
 *   5. runs test_wp_bit() when boot_cpu_data.wp_works_ok is negative.
 *
 * Every preprocessor directive is kept on its own line: a '#' that does
 * not start a line is not treated as a directive by the preprocessor.
 */
void __init mem_init(void)
{
	pci_iommu_alloc();

#ifdef CONFIG_FLATMEM
	BUG_ON(!mem_map);
#endif
	/*
	 * With CONFIG_DEBUG_PAGEALLOC initialization of highmem pages has to
	 * be done before free_all_bootmem(). Memblock use free low memory for
	 * temporary data (see find_range_array()) and for this purpose can use
	 * pages that was already passed to the buddy allocator, hence marked as
	 * not accessible in the page tables when compiled with
	 * CONFIG_DEBUG_PAGEALLOC. Otherwise order of initialization is not
	 * important here.
	 */
	set_highmem_pages_init();

	/* this will put all low memory onto the freelists */
	free_all_bootmem();

	after_bootmem = 1;

	mem_init_print_info(NULL);
	printk(KERN_INFO "virtual kernel memory layout:\n"
		" fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
#ifdef CONFIG_HIGHMEM
		" pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
#endif
		" vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n"
		" lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n"
		" .init : 0x%08lx - 0x%08lx (%4ld kB)\n"
		" .data : 0x%08lx - 0x%08lx (%4ld kB)\n"
		" .text : 0x%08lx - 0x%08lx (%4ld kB)\n",
		FIXADDR_START, FIXADDR_TOP,
		(FIXADDR_TOP - FIXADDR_START) >> 10,

#ifdef CONFIG_HIGHMEM
		PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
		(LAST_PKMAP*PAGE_SIZE) >> 10,
#endif

		VMALLOC_START, VMALLOC_END,
		(VMALLOC_END - VMALLOC_START) >> 20,

		(unsigned long)__va(0), (unsigned long)high_memory,
		((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,

		(unsigned long)&__init_begin, (unsigned long)&__init_end,
		((unsigned long)&__init_end -
		 (unsigned long)&__init_begin) >> 10,

		(unsigned long)&_etext, (unsigned long)&_edata,
		((unsigned long)&_edata - (unsigned long)&_etext) >> 10,

		(unsigned long)&_text, (unsigned long)&_etext,
		((unsigned long)&_etext - (unsigned long)&_text) >> 10);

	/*
	 * Check boundaries twice: Some fundamental inconsistencies can
	 * be detected at build time already.
	 */
	/*
	 * NOTE(review): __FIXADDR_TOP and high_memory are temporarily
	 * redefined as constants here, presumably so the BUILD_BUG_ON()
	 * expressions below can be evaluated at compile time - confirm.
	 */
#define __FIXADDR_TOP (-PAGE_SIZE)
#ifdef CONFIG_HIGHMEM
	BUILD_BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
	BUILD_BUG_ON(VMALLOC_END > PKMAP_BASE);
#endif
#define high_memory (-128UL << 20)
	BUILD_BUG_ON(VMALLOC_START >= VMALLOC_END);
#undef high_memory
#undef __FIXADDR_TOP

	/* Same checks again at run time, using the real values. */
#ifdef CONFIG_HIGHMEM
	BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
	BUG_ON(VMALLOC_END > PKMAP_BASE);
#endif
	BUG_ON(VMALLOC_START >= VMALLOC_END);
	BUG_ON((unsigned long)high_memory > VMALLOC_START);

	if (boot_cpu_data.wp_works_ok < 0)
		test_wp_bit();
}
X86_64:
/*
 * mem_init() - x86 64-bit variant.
 *
 * Calls pci_iommu_alloc() and register_page_bootmem_info(), then
 * free_all_bootmem(), sets after_bootmem, registers the vsyscall page
 * with kclist_add() and finally prints the memory summary.
 */
void __init mem_init(void)
{
	pci_iommu_alloc();

	/* clear_bss() already clear the empty_zero_page */

	register_page_bootmem_info();

	/* this will put all memory onto the freelists */
	free_all_bootmem();
	after_bootmem = 1;

	/* Register memory areas for /proc/kcore */
	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR,
		   PAGE_SIZE, KCORE_OTHER);

	mem_init_print_info(NULL);
}
MIPS:
/*
 * mem_init() - MIPS variant.
 *
 * Sets max_mapnr and high_memory from the low/high PFN limits, calls
 * maar_init(), free_all_bootmem(), setup_zero_pages() and
 * mem_init_free_highmem(), prints the memory summary and, on 64-bit
 * builds whose _text lies above CKSEG0, registers the CKSEG0 range
 * with kclist_add().
 *
 * Every preprocessor directive is kept on its own line: a '#' that does
 * not start a line is not treated as a directive by the preprocessor.
 */
void __init mem_init(void)
{
#ifdef CONFIG_HIGHMEM
#ifdef CONFIG_DISCONTIGMEM
#error "CONFIG_HIGHMEM and CONFIG_DISCONTIGMEM dont work together yet"
#endif
	max_mapnr = highend_pfn ? highend_pfn : max_low_pfn;
#else
	max_mapnr = max_low_pfn;
#endif
	high_memory = (void *) __va(max_low_pfn << PAGE_SHIFT);

	maar_init();
	free_all_bootmem();
	setup_zero_pages();	/* Setup zeroed pages. */
	mem_init_free_highmem();
	mem_init_print_info(NULL);

#ifdef CONFIG_64BIT
	if ((unsigned long) &_text > (unsigned long) CKSEG0)
		/* The -4 is a hack so that user tools don't have to handle
		   the overflow. */
		kclist_add(&kcore_kseg0, (void *) CKSEG0,
				0x80000000 - 4, KCORE_TEXT);
#endif
}
上面这些函数尽管不同体系架构实现不同,甚至同一架构下不同位宽的实现也不同。我们这里以ARM64为例,说明这些函数的主要处理:
首先我们需要明白mem_map的作用,它是描述所有物理内存的struct page结构数组的基指针。比如说,对于4GB的内存来说,如果一个页定义为4KB,即2^12字节,那么可想而知,这个mem_map数组总共有2^20个元素(2^32 / 2^12 = 2^20)。注意我们这里都以flat型内存描述为主,即平坦型内存模型。而这些页都有一个具体的页帧号与之对应。页帧号一般用pfn来表示,那么由于每个页都有一个页帧号,那最小的页帧号和最大的页帧号为多少呢?需要特别注意的是,页帧号也是与mem_map数组的index相对应。我们一般认为pfn_min为0,而最大pfn_max为mem_map数组下标的最大值,这个最大值也就是max_pfn,这个值跟内核的max_mapnr相对应。
函数set_max_mapnr()就是用于计算max_mapnr。我们可能会想,这个max_pfn是什么时候设置的呢?这个是在setup_arch的paging_init()中调用bootmem_init()来进行的。在成功设置max_mapnr后,我们要把启动过程中所有的空闲内存释放到伙伴系统,这里需要注意三点:
一. bootmem内存管理或者nobootmem管理
二. memblock内存管理
三. 伙伴系统
显然,启动时,不存在伙伴系统,在Linux内核启动的早期,BSP相关的代码需要把内核能使用的内存块大小告知内核,要么通过bootloader传递参数给出DDR大小,要么通过命令行形式给出DDR大小,或者通过FDT等形式对DTS分析得出DDR大小。不管什么样的方法,内核需要了解这些信息,我们这里以DTS形式给出内存大小,这些内存块会以memblock_add形式添加到memblock内存管理块中。这些添加到内核中的内存块被标记为memory类型,另外一种类型为reserve类型。BSP可以通过不同方式添加到内核,但是在我们内核使用内存之前,必须先添加一块,否则我们使用的内存哪里来呢,使用的内存被标记为reserve。这样,通过这种简单的管理,memblock把所有的内存块维护起来,之后内核慢慢地从这些内存块中获取内存。我们一般称memblock是逻辑内存块管理。
对于bootmem来说,它是物理内存管理。我们这里不详细介绍,后面会有篇章分析。
函数free_unused_memmap()和free_all_bootmem()都是把空闲内存释放到伙伴系统,前者释放memblock中空闲内存,后者释放bootmem中内存。
函数mem_init_print_info()是把内核映像的各个段地址打印出来。我们这里看下这个信息:
[ 0.000000] Memory: 832MB 2080MB = 2912MB total
[ 0.000000] <0>I{0}[0:swapper]Memory: 2852320k/2852320k available, 129568k reserved
[ 0.000000] <0>I{0}[0:swapper]Virtual kernel memory layout:
[ 0.000000] <0> vmalloc : 0xffffff8000000000 - 0xffffffbbffff0000 (245759 MB)
[ 0.000000] <0> vmemmap : 0xffffffbc001dc000 - 0xffffffbc029c8000 ( 39 MB)
[ 0.000000] <0> modules : 0xffffffbffc000000 - 0xffffffc000000000 ( 64 MB)
[ 0.000000] <0> memory : 0xffffffc000000000 - 0xffffffc0b6800000 ( 2920 MB)
[ 0.000000] <0> .init : 0xffffffc000d29000 - 0xffffffc000f881c0 ( 2429 kB)
[ 0.000000] <0> .text : 0xffffffc000080000 - 0xffffffc000d28da4 ( 12964 kB)
[ 0.000000] <0> .data : 0xffffffc000f89000 - 0xffffffc001088e78 ( 1024 kB)
上面的832MB和2080MB是说明有两个memblock,第一个memblock大小为832MB,第二个为2080MB,所以,总共内存大小为2912MB。
对于Memory:后面的几个数字来说,2852320k是当前系统空闲的内存,这说明总共有713080个空闲页。需要注意2852320k/2852320k中,前面这个是当前系统空闲页数,它是个动态变化的值。它是由函数nr_free_pages()得到,可以看到,其采用global_page_state(NR_FREE_PAGES)方式获取的空闲内存,这个值会动态变化,每次申请内存时都会减少。而后面这个值是恒定的,它是从页的page_count(page)得到,因为对于内核来说,如果BSP申请内存一定,这个值就应该恒定,它是每个memblock中除了reserve之后的内存,所以,在BSP开发期间,可以通过这个值来了解内核可用内存是否减少。
从上面给出的log可以看到,这两个值相等,这说明在memblock完成reserve之后并没有再动态申请内存,否则前面这个值应该减小。
最后这个129568k是reserve的内存,那么什么是reserve的内存呢?其实这个reserve内存是相对free内存来说的。因为其无法再分配供其他程序使用,这部分内存一般是内核一些模块申请,如vfs_caches_init_early()分配的目录项和索引节点hash,这些模块需要连续的物理内存,在系统启动时可以方便获取,这样我们在系统早期分配后,对其标记为reserve,再比如内核代码段,数据段等,这些都被标记。这样看来,上面129568KB的reserve内存,即32392个页被标记reserve。
空闲free页 + reserve页 = 总页数 = 713080 + 32392 = 745472页。
总内存2912MB = 745472个页。
对于下面的内核镜像内存布局则很容易理解了:
因为我们是64位系统,虚拟地址空间采用48位。
上面把物理内存完全映射到memory : 0xffffffc000000000 - 0xffffffc0b6800000 ( 2920 MB) 这个里面。
这里把0x0000000000000000 - 0xffffff8000000000-1 映射到用户地址空间,0xffffffc000000000 - 0xffffffffffffffff 映射到内核空间。
可以看到,内核空间已经足够大,所以不需要高端内存,内核空间已经足够囊括3GB的物理内存了。整个物理内存映射到0xffffffc000000000 - 0xffffffc0b6800000 范围,这里之所以有8MB的洞,是保护非连续区内存管理使用的。
可以看到,内核代码段,init段,数据段大小。对于vmemmap和module段很容易理解其大小。这些都跟内核定义的一些宏或者常数相关。
|