[PATCH] slab: updates for per-arch alignments

From: Manfred Spraul <manfred@colorfullife.com> Description: Right now kmem_cache_create automatically decides about the alignment of allocated objects. The automatic decisions are sometimes wrong: - for some objects, it's better to keep them as small as possible to reduce the memory usage. Ingo already added a parameter to kmem_cache_create for the sigqueue cache, but it wasn't implemented. - for s390, normal kmalloc must be 8-byte aligned. With debugging enabled, the default allocation was 4-bytes. This means that s390 cannot enable slab debugging. - arm26 needs 1 kB aligned objects. Previously this was impossible to generate, therefore arm has its own allocator in arm26/machine/small_page.c - most objects should be cache line aligned, to avoid false sharing. But the cache line size was set at compile time, often to 128 bytes for generic kernels. This wastes memory. The new code uses the runtime determined cache line size instead. - some caches want an explicit alignment. One example are the pte_chain objects: they must find the start of the object with addr&mask. Right now pte_chain objects are scaled to the cache line size, because that was the only alignment that could be generated reliably. The implementation reuses the "offset" parameter of kmem_cache_create and now uses it to pass in the requested alignment. offset was ignored by the current implementation, and the only user I found is sigqueue, which intended to set the alignment. In the long run, it might be interesting for the main tree: due to the 128 byte alignment, only 7 inodes fit into one page, with 64-byte alignment, 9 inodes - 20% memory recovered for Athlon systems. For generic kernels running on P6 cpus (i.e. 32 byte cachelines), it means Number of objects per page: ext2_inode_cache: 8 instead of 7 ext3_inode_cache: 8 instead of 7 fat_inode_cache: 9 instead of 7 rpc_tasks: 24 instead of 15 tcp_tw_bucket: 40 instead of 30 arp_cache: 40 instead of 30 nfs_write_data: 9 instead of 7

[PATCH] slab: updates for per-arch alignments
From: Manfred Spraul <manfred@colorfullife.com> Description: Right now kmem_cache_create automatically decides about the alignment of allocated objects. The automatic decisions are sometimes wrong: - for some objects, it's better to keep them as small as possible to reduce the memory usage. Ingo already added a parameter to kmem_cache_create for the sigqueue cache, but it wasn't implemented. - for s390, normal kmalloc must be 8-byte aligned. With debugging enabled, the default allocation was 4-bytes. This means that s390 cannot enable slab debugging. - arm26 needs 1 kB aligned objects. Previously this was impossible to generate, therefore arm has its own allocator in arm26/machine/small_page.c - most objects should be cache line aligned, to avoid false sharing. But the cache line size was set at compile time, often to 128 bytes for generic kernels. This wastes memory. The new code uses the runtime determined cache line size instead. - some caches want an explicit alignment. One example are the pte_chain objects: they must find the start of the object with addr&mask. Right now pte_chain objects are scaled to the cache line size, because that was the only alignment that could be generated reliably. The implementation reuses the "offset" parameter of kmem_cache_create and now uses it to pass in the requested alignment. offset was ignored by the current implementation, and the only user I found is sigqueue, which intended to set the alignment. In the long run, it might be interesting for the main tree: due to the 128 byte alignment, only 7 inodes fit into one page, with 64-byte alignment, 9 inodes - 20% memory recovered for Athlon systems. For generic kernels running on P6 cpus (i.e. 32 byte cachelines), it means Number of objects per page: ext2_inode_cache: 8 instead of 7 ext3_inode_cache: 8 instead of 7 fat_inode_cache: 9 instead of 7 rpc_tasks: 24 instead of 15 tcp_tw_bucket: 40 instead of 30 arp_cache: 40 instead of 30 nfs_write_data: 9 instead of 7
b9e55f3d · Andrew Morton · Linus Torvalds · 1aa6c0d1 · b9e55f3d · b9e55f3d
Commit b9e55f3d authored Apr 11, 2004 by Andrew Morton Committed by Linus Torvalds Apr 11, 2004
Showing with 89 additions and 61 deletions

arch/i386/mm/init.c arch/i386/mm/init.c +2 -2

include/asm-i386/processor.h include/asm-i386/processor.h +2 -0

kernel/fork.c kernel/fork.c +5 -2

mm/rmap.c mm/rmap.c +1 -1

mm/slab.c mm/slab.c +79 -56

No files found.
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -530,18 +530,18 @@ void __init pgtable_cache_init(void)
 {
 	if (PTRS_PER_PMD > 1) {
 		pmd_cache = kmem_cache_create("pmd",
+					PTRS_PER_PMD*sizeof(pmd_t),
 					PTRS_PER_PMD*sizeof(pmd_t),
 					0,
-					SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN,
 					pmd_ctor,
 					NULL);
 		if (!pmd_cache)
 			panic("pgtable_cache_init(): cannot create pmd cache");
 	}
 	pgd_cache = kmem_cache_create("pgd",
+				PTRS_PER_PGD*sizeof(pgd_t),
 				PTRS_PER_PGD*sizeof(pgd_t),
 				0,
-				SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN,
 				pgd_ctor,
 				PTRS_PER_PMD == 1 ? pgd_dtor : NULL);
 	if (!pgd_cache)

--- a/include/asm-i386/processor.h
+++ b/include/asm-i386/processor.h
@@ -403,6 +403,8 @@ struct tss_struct {
 	unsigned long stack[64];
 } __attribute__((packed));

+#define ARCH_MIN_TASKALIGN	16
+
 struct thread_struct {
 /* cached TLS descriptors. */
 	struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];

--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -207,11 +207,14 @@ EXPORT_SYMBOL(autoremove_wake_function);
 void __init fork_init(unsigned long mempages)
 {
 #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
+#ifndef ARCH_MIN_TASKALIGN
+#define ARCH_MIN_TASKALIGN	0
+#endif
 	/* create a slab on which task_structs can be allocated */
 	task_struct_cachep =
 		kmem_cache_create("task_struct",
-				  sizeof(struct task_struct),0,
-				  SLAB_MUST_HWCACHE_ALIGN, NULL, NULL);
+				  sizeof(struct task_struct),ARCH_MIN_TASKALIGN,
+				  0, NULL, NULL);
 	if (!task_struct_cachep)
 		panic("fork_init(): cannot create task_struct SLAB cache");
 #endif

--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -522,9 +522,9 @@ struct pte_chain *pte_chain_alloc(int gfp_flags)
 void __init pte_chain_init(void)
 {
 	pte_chain_cache = kmem_cache_create(	"pte_chain",
+						sizeof(struct pte_chain),
 						sizeof(struct pte_chain),
 						0,
-						SLAB_MUST_HWCACHE_ALIGN,
 						pte_chain_ctor,
 						NULL);


--- a/mm/slab.c
+++ b/mm/slab.c