1. kmalloc分配内存的大小

本文基于linux-5.15分析,linux-6.6已经删除slob,后续也会将slab移除。

kmalloc会根据申请的内存大小来决定使用块分配器(slab/slub/slob)还是页分配器进行内存分配。控制kmalloc分配行为的主要有如下三个宏。

macro desc
KMALLOC_MAX_SIZE kmalloc可以分配的最大内存,超过此大小时返回NULL
KMALLOC_MAX_CACHE_SIZE kmalloc使用slab分配器分配的最大内存,超过此大小后会通过伙伴系统分配页
KMALLOC_MIN_SIZE kmalloc可以分配的最小内存,小于此大小时,kmalloc内部会按此大小分配

1.1. KMALLOC_MAX_SIZE:kmalloc可以分配的最大内存

KMALLOC_MAX_SIZE与块分配器类型(slab/slub/slob)和页面大小以及MAX_ORDER有关,相关定义在include/linux/slab.h中。一般最大为2 ^ (MAX_ORDER + PAGE_SHIFT - 1),也就是2 ^ (MAX_ORDER - 1)个页面,就是伙伴系统所管理的最大内存块。通常MAX_ORDER为11,页面大小为4K,相应的,kmalloc最大可以分配1024个页面,也就是4M。

分配器类型 KMALLOC_MAX_SIZE
slab 2 ^ (MAX_ORDER + PAGE_SHIFT - 1),但不得超过32M(2^25)
slub 2 ^ (MAX_ORDER + PAGE_SHIFT - 1)
slob 2 ^ (MAX_ORDER + PAGE_SHIFT - 1)

早期的slab是可以支持分配64M的,在5.13版本时,改为了32M,具体可以参考commit 588c7fa022d7b2361500ead5660d9a1a2ecd9b7d

linux-6.0修改了kmalloc_info和kmalloc_index,最大分配大小改为2M,详见commit d6a71648dbc0ca5520cba16a8fdce8d37ae74218

1.2. KMALLOC_MAX_CACHE_SIZE:kmalloc使用slab分配器分配的最大内存

kmalloc申请的内存大小小于或等于KMALLOC_MAX_CACHE_SIZE时,使用块分配器(slab/slub/slob)进行分配。

kmalloc申请的内存大小超过KMALLOC_MAX_CACHE_SIZE时,使用页分配器(伙伴系统)进行分配。

块分配器类型使用slab时,KMALLOC_MAX_CACHE_SIZE定义为KMALLOC_MAX_SIZE,此时kmalloc总是使用slab分配器。

分配器类型 KMALLOC_MAX_CACHE_SIZE
slab 2 ^ (MAX_ORDER + PAGE_SHIFT - 1),最大限制为32M(2^25)
slub 2 ^ (PAGE_SHIFT + 1),2个页面大小
slob 2 ^ PAGE_SHIFT,1个页面大小

1.3. KMALLOC_MIN_SIZE:kmalloc可以分配的最小内存

KMALLOC_MIN_SIZE与ARCH_DMA_MINALIGN的大小有关。

 1/*
 2 * Some archs want to perform DMA into kmalloc caches and need a guaranteed
 3 * alignment larger than the alignment of a 64-bit integer.
 4 * Setting ARCH_KMALLOC_MINALIGN in arch headers allows that.
 5 */
 6#if defined(ARCH_DMA_MINALIGN) && ARCH_DMA_MINALIGN > 8
 7#define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN
 8#define KMALLOC_MIN_SIZE ARCH_DMA_MINALIGN
 9#define KMALLOC_SHIFT_LOW ilog2(ARCH_DMA_MINALIGN)
10#else
11#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
12#endif
13
14
15#ifndef KMALLOC_MIN_SIZE
16#define KMALLOC_MIN_SIZE (1 << KMALLOC_SHIFT_LOW)
17#endif

在没有定义ARCH_DMA_MINALIGN的情况下,KMALLOC_MIN_SIZE的默认值如下

分配器类型 默认KMALLOC_MIN_SIZE
slab 2 ^ 5
slub 2 ^ 3
slob 2 ^ 3

接下来看一下ARCH_DMA_MINALIGN,linux-5.15的arch目录下共24个目录,其中14个架构对ARCH_DMA_MINALIGN进行了定义(mips下有5处定义,合计18处),且很多都将ARCH_DMA_MINALIGN定义为L1 cache line的大小(L1_CACHE_BYTES),这样可以提高性能。

 1arch/arc/include/asm/cache.h:52:#define ARCH_DMA_MINALIGN       SMP_CACHE_BYTES
 2arch/arm/include/asm/cache.h:18:#define ARCH_DMA_MINALIGN       L1_CACHE_BYTES
 3arch/arm64/include/asm/cache.h:50:#define ARCH_DMA_MINALIGN     (128)
 4arch/csky/include/asm/cache.h:11:#define ARCH_DMA_MINALIGN      L1_CACHE_BYTES
 5arch/hexagon/include/asm/cache.h:15:#define ARCH_DMA_MINALIGN   L1_CACHE_BYTES
 6arch/m68k/include/asm/cache.h:12:#define ARCH_DMA_MINALIGN      L1_CACHE_BYTES
 7arch/microblaze/include/asm/page.h:34:#define ARCH_DMA_MINALIGN L1_CACHE_BYTES
 8arch/mips/include/asm/mach-generic/kmalloc.h:10:#define ARCH_DMA_MINALIGN       128
 9arch/mips/include/asm/mach-ip32/kmalloc.h:7:#define ARCH_DMA_MINALIGN   32
10arch/mips/include/asm/mach-ip32/kmalloc.h:9:#define ARCH_DMA_MINALIGN   128
11arch/mips/include/asm/mach-n64/kmalloc.h:6:#define ARCH_DMA_MINALIGN L1_CACHE_BYTES
12arch/mips/include/asm/mach-tx49xx/kmalloc.h:5:#define ARCH_DMA_MINALIGN L1_CACHE_BYTES
13arch/nds32/include/asm/cache.h:10:#define ARCH_DMA_MINALIGN   L1_CACHE_BYTES
14arch/nios2/include/asm/cache.h:21:#define ARCH_DMA_MINALIGN     L1_CACHE_BYTES
15arch/parisc/include/asm/cache.h:23:#define ARCH_DMA_MINALIGN    L1_CACHE_BYTES
16arch/powerpc/include/asm/page_32.h:16:#define ARCH_DMA_MINALIGN L1_CACHE_BYTES
17arch/sh/include/asm/page.h:184:#define ARCH_DMA_MINALIGN        L1_CACHE_BYTES
18arch/xtensa/include/asm/cache.h:32:#define ARCH_DMA_MINALIGN    L1_CACHE_BYTES

1.4. 关于KMALLOC_MIN_SIZE的进一步说明

KMALLOC_MIN_SIZE的效果会跟块分配器的类型有关。

1.4.1. slab/slub

KMALLOC_MIN_SIZE必须是2的整数次幂,且不能超过256。具体可以看下边的setup_kmalloc_cache_index_table。

 1/*
 2 * Patch up the size_index table if we have strange large alignment
 3 * requirements for the kmalloc array. This is only the case for
 4 * MIPS it seems. The standard arches will not generate any code here.
 5 *
 6 * Largest permitted alignment is 256 bytes due to the way we
 7 * handle the index determination for the smaller caches.
 8 *
 9 * Make sure that nothing crazy happens if someone starts tinkering
10 * around with ARCH_KMALLOC_MINALIGN
11 */
12void __init setup_kmalloc_cache_index_table(void)
13{
14    unsigned int i;
15
16    BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
17        (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
18
19    ...

1.4.2. slob

slob分配器中,实际起作用的是SLAB_OBJ_MIN_SIZE,而SLAB_OBJ_MIN_SIZE被限制为最大16字节。

 1/*
 2 * This restriction comes from byte sized index implementation.
 3 * Page size is normally 2^12 bytes and, in this case, if we want to use
 4 * byte sized index which can represent 2^8 entries, the size of the object
 5 * should be equal or greater to 2^12 / 2^8 = 2^4 = 16.
 6 * If minimum size of kmalloc is less than 16, we use it as minimum object
 7 * size and give up to use byte sized index.
 8 */
 9#define SLAB_OBJ_MIN_SIZE      (KMALLOC_MIN_SIZE < 16 ? \
10                               (KMALLOC_MIN_SIZE) : 16)

1.5. size为0时的处理

kmalloc允许传入值为0,此时kmalloc返回ZERO_SIZE_PTR。可以借用此值来判断是内存不足,还是传入参数为0。

如下代码可以在kmalloc和__kmalloc(slab/slub/slob)的路径中找到。

1    if (!size)
2        return ZERO_SIZE_PTR;
 1/*
 2 * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests.
 3 *
 4 * Dereferencing ZERO_SIZE_PTR will lead to a distinct access fault.
 5 *
 6 * ZERO_SIZE_PTR can be passed to kfree though in the same way that NULL can.
 7 * Both make kfree a no-op.
 8 */
 9#define ZERO_SIZE_PTR ((void *)16)
10
11#define ZERO_OR_NULL_PTR(x) ((unsigned long)(x) <= \
12                (unsigned long)ZERO_SIZE_PTR)

ZERO_SIZE_PTR传给kfree也是可以的。

1void kfree(const void *x)
2{
3    ...
4    if (unlikely(ZERO_OR_NULL_PTR(x)))
5        return;
6    ...
7}

2. kmalloc内存分配大小计算

kmalloc只能分配几个固定大小的内存,申请值不在这些固定值之内时,会按向上对齐的原则,分配最接近申请值的内存块。

2.2. kmalloc_info

kmalloc_index与mm/slab_common.c中的kmalloc_info[]是对应的,kmalloc_index返回的就是所需大小在kmalloc_info[]数组中对应的索引。

 1#define INIT_KMALLOC_INFO(__size, __short_size)            \
 2{                                \
 3    .name[KMALLOC_NORMAL]  = "kmalloc-" #__short_size,    \
 4    .name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #__short_size,    \
 5    KMALLOC_CGROUP_NAME(__short_size)            \
 6    KMALLOC_DMA_NAME(__short_size)                \
 7    .size = __size,                        \
 8}
 9
10/*
11 * kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time.
12 * kmalloc_index() supports up to 2^25=32MB, so the final entry of the table is
13 * kmalloc-32M.
14 */
15const struct kmalloc_info_struct kmalloc_info[] __initconst = {
16    INIT_KMALLOC_INFO(0, 0),
17    INIT_KMALLOC_INFO(96, 96),
18    INIT_KMALLOC_INFO(192, 192),
19    INIT_KMALLOC_INFO(8, 8),
20    INIT_KMALLOC_INFO(16, 16),
21    INIT_KMALLOC_INFO(32, 32),
22    INIT_KMALLOC_INFO(64, 64),
23    INIT_KMALLOC_INFO(128, 128),
24    INIT_KMALLOC_INFO(256, 256),
25    INIT_KMALLOC_INFO(512, 512),
26    INIT_KMALLOC_INFO(1024, 1k),
27    INIT_KMALLOC_INFO(2048, 2k),
28    INIT_KMALLOC_INFO(4096, 4k),
29    INIT_KMALLOC_INFO(8192, 8k),
30    INIT_KMALLOC_INFO(16384, 16k),
31    INIT_KMALLOC_INFO(32768, 32k),
32    INIT_KMALLOC_INFO(65536, 64k),
33    INIT_KMALLOC_INFO(131072, 128k),
34    INIT_KMALLOC_INFO(262144, 256k),
35    INIT_KMALLOC_INFO(524288, 512k),
36    INIT_KMALLOC_INFO(1048576, 1M),
37    INIT_KMALLOC_INFO(2097152, 2M),
38    INIT_KMALLOC_INFO(4194304, 4M),
39    INIT_KMALLOC_INFO(8388608, 8M),
40    INIT_KMALLOC_INFO(16777216, 16M),
41    INIT_KMALLOC_INFO(33554432, 32M)
42};

内核从KMALLOC_MIN_SIZE开始,按2的整数次幂依次创建cache。此外,当KMALLOC_MIN_SIZE <= 32时,会额外创建大小为96和192字节的cache;当32 < KMALLOC_MIN_SIZE <= 64时,只额外创建大小为192字节的cache;当KMALLOC_MIN_SIZE > 64时,不会额外创建96和192字节的cache。

 1/*
 2 * Create the kmalloc array. Some of the regular kmalloc arrays
 3 * may already have been created because they were needed to
 4 * enable allocations for slab creation.
 5 */
 6void __init create_kmalloc_caches(slab_flags_t flags)
 7{
 8    int i;
 9    enum kmalloc_cache_type type;
10
11    /*
12     * Including KMALLOC_CGROUP if CONFIG_MEMCG_KMEM defined
13     */
14    for (type = KMALLOC_NORMAL; type <= KMALLOC_RECLAIM; type++) {
15        for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
16            if (!kmalloc_caches[type][i])
17                new_kmalloc_cache(i, type, flags);
18
19            /*
20             * Caches that are not of the two-to-the-power-of size.
21             * These have to be created immediately after the
22             * earlier power of two caches
23             */
24            if (KMALLOC_MIN_SIZE <= 32 && i == 6 &&
25                    !kmalloc_caches[type][1])
26                new_kmalloc_cache(1, type, flags);
27            if (KMALLOC_MIN_SIZE <= 64 && i == 7 &&
28                    !kmalloc_caches[type][2])
29                new_kmalloc_cache(2, type, flags);
30        }
31    }
32    ...

2.1. kmalloc_index

kmalloc_index定义为#define kmalloc_index(s) __kmalloc_index(s, true)

kmalloc_index和__kmalloc_index的定义在include/linux/slab.h中。

 1/*
 2 * Figure out which kmalloc slab an allocation of a certain size
 3 * belongs to.
 4 * 0 = zero alloc
 5 * 1 =  65 .. 96 bytes
 6 * 2 = 129 .. 192 bytes
 7 * n = 2^(n-1)+1 .. 2^n
 8 *
 9 * Note: __kmalloc_index() is compile-time optimized, and not runtime optimized;
10 * typical usage is via kmalloc_index() and therefore evaluated at compile-time.
11 * Callers where !size_is_constant should only be test modules, where runtime
12 * overheads of __kmalloc_index() can be tolerated.  Also see kmalloc_slab().
13 */
14static __always_inline unsigned int __kmalloc_index(size_t size,
15                            bool size_is_constant)
16{
17    if (!size)
18        return 0;
19
20    if (size <= KMALLOC_MIN_SIZE)
21        return KMALLOC_SHIFT_LOW;
22
23    if (KMALLOC_MIN_SIZE <= 32 && size > 64 && size <= 96)
24        return 1;
25    if (KMALLOC_MIN_SIZE <= 64 && size > 128 && size <= 192)
26        return 2;
27    if (size <=          8) return 3;
28    if (size <=         16) return 4;
29    if (size <=         32) return 5;
30    if (size <=         64) return 6;
31    if (size <=        128) return 7;
32    if (size <=        256) return 8;
33    if (size <=        512) return 9;
34    if (size <=       1024) return 10;
35    if (size <=   2 * 1024) return 11;
36    if (size <=   4 * 1024) return 12;
37    if (size <=   8 * 1024) return 13;
38    if (size <=  16 * 1024) return 14;
39    if (size <=  32 * 1024) return 15;
40    if (size <=  64 * 1024) return 16;
41    if (size <= 128 * 1024) return 17;
42    if (size <= 256 * 1024) return 18;
43    if (size <= 512 * 1024) return 19;
44    if (size <= 1024 * 1024) return 20;
45    if (size <=  2 * 1024 * 1024) return 21;
46    if (size <=  4 * 1024 * 1024) return 22;
47    if (size <=  8 * 1024 * 1024) return 23;
48    if (size <=  16 * 1024 * 1024) return 24;
49    if (size <=  32 * 1024 * 1024) return 25;
50
51    if ((IS_ENABLED(CONFIG_CC_IS_GCC) || CONFIG_CLANG_VERSION >= 110000)
52        && !IS_ENABLED(CONFIG_PROFILE_ALL_BRANCHES) && size_is_constant)
53        BUILD_BUG_ON_MSG(1, "unexpected size in kmalloc_index()");
54    else
55        BUG();
56
57    /* Will never be reached. Needed because the compiler may complain */
58    return -1;
59}

可以看到kmalloc_index对(64, 96]和(128, 192]这两个区间做了特殊处理。96和192不是2的整数次幂,但内核中很多地方需要申请与96或192字节大小相近的内存,因此内核舍弃了2字节(index=1)和4字节(index=2)的cache,将这两个index留给了96和192。内核会根据KMALLOC_MIN_SIZE的大小来决定是否创建对应的cache,以此来减少内存浪费。

2.3. kmalloc_index编译时优化

对于slab和slub,__kmalloc_index可以实现编译时优化。这种优化体现在kmalloc和kmalloc_node中,当传入参数size在编译时为定值时,会在编译时优化分配路径。

slab/slub/slob分别实现了各自的__kmalloc和__kmalloc_node。对于slob,或者size在编译时不为定值时,kmalloc和kmalloc_node会分别走到分配器对应的__kmalloc和__kmalloc_node。

 1static __always_inline void *kmalloc(size_t size, gfp_t flags)
 2{
 3    if (__builtin_constant_p(size)) {
 4#ifndef CONFIG_SLOB
 5        unsigned int index;
 6#endif
 7        if (size > KMALLOC_MAX_CACHE_SIZE)
 8            return kmalloc_large(size, flags);
 9#ifndef CONFIG_SLOB
10        index = kmalloc_index(size);
11
12        if (!index)
13            return ZERO_SIZE_PTR;
14
15        return kmem_cache_alloc_trace(
16                kmalloc_caches[kmalloc_type(flags)][index],
17                flags, size);
18#endif
19    }
20    return __kmalloc(size, flags);
21}
22
23static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
24{
25#ifndef CONFIG_SLOB
26    if (__builtin_constant_p(size) &&
27        size <= KMALLOC_MAX_CACHE_SIZE) {
28        unsigned int i = kmalloc_index(size);
29
30        if (!i)
31            return ZERO_SIZE_PTR;
32
33        return kmem_cache_alloc_node_trace(
34                kmalloc_caches[kmalloc_type(flags)][i],
35                        flags, node, size);
36    }
37#endif
38    return __kmalloc_node(size, flags, node);
39}

2.4. kmalloc的运行时路径

前边KMALLOC_MAX_SIZE、KMALLOC_MAX_CACHE_SIZE、KMALLOC_MIN_SIZE三个宏直接起作用主要是在编译时,运行时的大小计算不一定会直接使用它们,但其实在运行时,分配内存的限制也会与这三个宏保持一致。

先说slab,__do_kmalloc中会直接判断申请内存大小是否超过KMALLOC_MAX_CACHE_SIZE,如果是,则直接返回NULL。注意,对于slab,KMALLOC_MAX_CACHE_SIZE等于KMALLOC_MAX_SIZE。

再看slub和slob,在申请大块内存时,最终都会调到alloc_pages,这个是通过伙伴系统来分配页,最大为2 ^ (MAX_ORDER + PAGE_SHIFT - 1)字节,也就是2 ^ (MAX_ORDER - 1)个页面,与KMALLOC_MAX_SIZE一致。

KMALLOC_MIN_SIZE的情况相对复杂,暂不详述。