the IRQ balancing feature is based on the following requirements:

- irq handlers should be cache-affine to a large degree, without the explicit use of /proc/irq/*/smp_affinity.
- idle CPUs should be preferred over busy CPUs when directing IRQs towards them.
- the distribution of IRQs should be random, to avoid all IRQs going to the same CPU, and to keep 'heavy' IRQs from loading certain CPUs unfairly over CPUs that handle 'light' IRQs. The IRQ system has no knowledge about how 'heavy' an IRQ handler is in terms of CPU cycles.

here is the design and implementation:

- we make per-irq decisions about where the IRQ will go to next. Right now it's a fastpath and a slowpath, the real stuff happens in the slow path. The fastpath is very lightweight.
- [ i decided not to measure IRQ handler overhead via RDTSC - it ends up being very messy, and if we want to be 100% fair then we also need to measure softirq overhead, and since there is no 1:1 relationship between softirq load and hardirq load, it's impossible to do correctly. So the IRQ balancer achieves fairness via randomness. ]
- we stay affine in the micro timescale, and we are loading the CPUs fairly in the macro timescale. The IO-APIC's lowest priority distribution method rotated IRQs between CPUs once per IRQ, which was the worst possible solution for good cache-affinity.
- to achieve fairness and to avoid lock-step situations some real randomness is needed. The IRQs will wander in the allowed CPU group randomly, in a Brownian motion fashion. This is what the 'move()' function accomplishes. The IRQ moves one step forward or one step backwards in the allowed CPU mask. [ Note that this achieves a level of NUMA affinity as well, nearby CPUs are more likely to be NUMA-affine. ]
- the irq balancer has some knowledge about 'how idle' a single CPU is. The idle task updates the idle_timestamp. Since this update is in the idle-to-be codepath, it does not increase the latency of idle-wakeup, the overhead should be zero in all cases that matter.
  The idle-balancing happens the following way: when searching for the next target CPU after an 'IRQ tick' has expired, we first search 'idle enough' CPUs in the allowed set. If this does not succeed then we search all CPUs.
- the patch is fully compatible with the /proc/irq/*/smp_affinity interface as well, everything works as expected.

note that the current implementation can be expressed equivalently in terms of timer-interrupt-driven IRQ redirection. But i wanted to get some real feedback before removing the possibility to do finer grained decisions - and the per-IRQ overhead is very small anyway.

the IRQ balancing feature is based on the following requirements:
- irq handlers should be cache-affine to a large degree, without the explicit use of /proc/irq/*/smp_affinity.
- idle CPUs should be preferred over busy CPUs when directing IRQs towards them.
- the distribution of IRQs should be random, to avoid all IRQs going to the same CPU, and to keep 'heavy' IRQs from loading certain CPUs unfairly over CPUs that handle 'light' IRQs. The IRQ system has no knowledge about how 'heavy' an IRQ handler is in terms of CPU cycles.

here is the design and implementation:

- we make per-irq decisions about where the IRQ will go to next. Right now it's a fastpath and a slowpath, the real stuff happens in the slow path. The fastpath is very lightweight.
- [ i decided not to measure IRQ handler overhead via RDTSC - it ends up being very messy, and if we want to be 100% fair then we also need to measure softirq overhead, and since there is no 1:1 relationship between softirq load and hardirq load, it's impossible to do correctly. So the IRQ balancer achieves fairness via randomness. ]
- we stay affine in the micro timescale, and we are loading the CPUs fairly in the macro timescale. The IO-APIC's lowest priority distribution method rotated IRQs between CPUs once per IRQ, which was the worst possible solution for good cache-affinity.
- to achieve fairness and to avoid lock-step situations some real randomness is needed. The IRQs will wander in the allowed CPU group randomly, in a Brownian motion fashion. This is what the 'move()' function accomplishes. The IRQ moves one step forward or one step backwards in the allowed CPU mask. [ Note that this achieves a level of NUMA affinity as well, nearby CPUs are more likely to be NUMA-affine. ]
- the irq balancer has some knowledge about 'how idle' a single CPU is. The idle task updates the idle_timestamp. Since this update is in the idle-to-be codepath, it does not increase the latency of idle-wakeup, the overhead should be zero in all cases that matter.
  The idle-balancing happens the following way: when searching for the next target CPU after an 'IRQ tick' has expired, we first search 'idle enough' CPUs in the allowed set. If this does not succeed then we search all CPUs.
- the patch is fully compatible with the /proc/irq/*/smp_affinity interface as well, everything works as expected.

note that the current implementation can be expressed equivalently in terms of timer-interrupt-driven IRQ redirection. But i wanted to get some real feedback before removing the possibility to do finer grained decisions - and the per-IRQ overhead is very small anyway.
cf6f7853 · Ingo Molnar · 92ebc650 · cf6f7853 · cf6f7853 · cf6f7853
Commit cf6f7853 authored Apr 15, 2002 by Ingo Molnar
4 changed files
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -28,6 +28,7 @@
 #include <linux/config.h>
 #include <linux/smp_lock.h>
 #include <linux/mc146818rtc.h>
+#include <linux/compiler.h>

 #include <asm/io.h>
 #include <asm/smp.h>
@@ -183,6 +184,86 @@ static void clear_IO_APIC (void)
 			clear_IO_APIC_pin(apic, pin);
 }

+static void set_ioapic_affinity (unsigned int irq, unsigned long mask)	/* apply CPU bitmask 'mask' (bit n = CPU n) as the affinity of 'irq' */
+{
+	unsigned long flags;	/* interrupt state saved and restored around ioapic_lock */
+
+	/*
+	 * Only the low 8 bits of 'mask' are valid; the shift below moves them to bits 24-31.
+	 */
+	mask = mask << 24;
+	spin_lock_irqsave(&ioapic_lock, flags);
+	__DO_ACTION(1, = mask, )	/* macro defined outside this hunk; presumably stores the shifted mask for 'irq' - TODO confirm */
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+#if CONFIG_SMP
+
+typedef struct {
+	unsigned int cpu;		/* CPU this irq is currently directed to; input to the next move() */
+	unsigned long timestamp;	/* jiffies value at which this irq was last rebalanced */
+} ____cacheline_aligned irq_balance_t;
+
+static irq_balance_t irq_balance[NR_IRQS] __cacheline_aligned	/* start on CPU 0: a valid index even when smp_num_cpus is 1 */
+			= { [ 0 ... NR_IRQS-1 ] = { 0, 0 } };
+
+extern unsigned long irq_affinity [NR_IRQS];	/* defined in irq.c: per-irq mask of CPUs the irq may be routed to */
+
+#endif
+
+#define IDLE_ENOUGH(cpu,now) /* idle_cpu() holds and cpu's idle_timestamp is more than one jiffy older than 'now' */ \
+		(idle_cpu(cpu) && ((now) - irq_stat[(cpu)].idle_timestamp > 1))
+
+#define IRQ_ALLOWED(cpu,allowed_mask) /* nonzero iff bit 'cpu' is set in allowed_mask; arguments may be arbitrary expressions */ \
+		((1UL << (cpu)) & (allowed_mask))
+
+/* Choose the next target CPU: step from curr_cpu (up if direction == 1, else down), wrapping within 0..smp_num_cpus-1.
+ * Returns the first CPU set in allowed_mask, preferring IDLE_ENOUGH ones; returns curr_cpu if no CPU is allowed. */
+static unsigned long move(int curr_cpu, unsigned long allowed_mask, unsigned long now, int direction)
+{
+	int search_idle, steps, cpu;
+
+	/* Pass 1 accepts only idle-enough CPUs; pass 2 accepts any allowed CPU. */
+	for (search_idle = 1; search_idle >= 0; search_idle--) {
+		cpu = curr_cpu;
+		for (steps = 0; steps < smp_num_cpus; steps++) {	/* one lap per pass, so the walk always terminates */
+			if (direction == 1) {
+				if (++cpu >= smp_num_cpus)
+					cpu = 0;
+			} else {
+				if (--cpu < 0 || cpu >= smp_num_cpus)	/* also pulls an out-of-range start back in */
+					cpu = smp_num_cpus-1;
+			}
+			if (!IRQ_ALLOWED(cpu,allowed_mask))
+				continue;
+			if (!search_idle || IDLE_ENOUGH(cpu,now))
+				return cpu;
+		}
+	}
+	return curr_cpu;	/* empty allowed set: leave the irq where it is */
+}
+
+static inline void balance_irq(int irq)	/* per-irq rebalancing hook; body is empty unless CONFIG_SMP is set */
+{
+#if CONFIG_SMP
+	unsigned long tick = jiffies;
+	irq_balance_t *slot = &irq_balance[irq];
+
+	/* Slow path: taken only when jiffies changed since this irq was last rebalanced. */
+	if (unlikely(slot->timestamp != tick)) {
+		unsigned long candidates = cpu_online_map & irq_affinity[irq];
+		int coin;
+
+		/* The lowest bit of the value read by rdtscl() picks the direction for move(). */
+		rdtscl(coin);
+		coin &= 1;
+		slot->timestamp = tick;
+		slot->cpu = move(slot->cpu, candidates, tick, coin);
+		set_ioapic_affinity(irq, 1 << slot->cpu);
+	}
+#endif
+}
+
 /*
 * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
 * specific CPU-side IRQs.
@@ -672,8 +753,7 @@ void __init setup_IO_APIC_irqs(void)
 }

 /*
- * Set up the 8259A-master output pin as broadcast to all
- * CPUs.
+ * Set up the 8259A-master output pin:
 */
 void __init setup_ExtINT_IRQ0_pin(unsigned int pin, int vector)
 {
@@ -1193,6 +1273,7 @@ static unsigned int startup_edge_ioapic_irq(unsigned int irq)
 */
 static void ack_edge_ioapic_irq(unsigned int irq)
 {
+	balance_irq(irq);
 	if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
 					== (IRQ_PENDING | IRQ_DISABLED))
 		mask_IO_APIC_irq(irq);
@@ -1232,6 +1313,7 @@ static void end_level_ioapic_irq (unsigned int irq)
 	unsigned long v;
 	int i;

+	balance_irq(irq);
 /*
 * It appears there is an erratum which affects at least version 0x11
 * of I/O APIC (that's the 82093AA and cores integrated into various
@@ -1288,19 +1370,6 @@ static void end_level_ioapic_irq (unsigned int irq)

 static void mask_and_ack_level_ioapic_irq (unsigned int irq) { /* nothing */ }

-static void set_ioapic_affinity (unsigned int irq, unsigned long mask)
-{
-	unsigned long flags;
-	/*
-	 * Only the first 8 bits are valid.
-	 */
-	mask = mask << 24;
-
-	spin_lock_irqsave(&ioapic_lock, flags);
-	__DO_ACTION(1, = mask, )
-	spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
 /*
 * Level and edge triggered IO-APIC interrupts need different handling,
 * so we use two separate IRQ descriptors. Edge triggered IRQs can be

--- a/arch/i386/kernel/irq.c
+++ b/arch/i386/kernel/irq.c
@@ -1073,7 +1073,7 @@ static unsigned int parse_hex_value (const char *buffer,

 static struct proc_dir_entry * smp_affinity_entry [NR_IRQS];

-static unsigned long irq_affinity [NR_IRQS] = { [0 ... NR_IRQS-1] = ~0UL };
+unsigned long irq_affinity [NR_IRQS] = { [0 ... NR_IRQS-1] = ~0UL };	/* non-static: balance_irq() in io_apic.c reads it; default allows every CPU */
 static int irq_affinity_read_proc (char *page, char **start, off_t off,
 			int count, int *eof, void *data)
 {

--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -142,6 +142,7 @@ void cpu_idle (void)
 		void (*idle)(void) = pm_idle;
 		if (!idle)
 			idle = default_idle;
+		irq_stat[smp_processor_id()].idle_timestamp = jiffies;
 		while (!need_resched())
 			idle();
 		schedule();

--- a/include/asm-i386/hardirq.h
+++ b/include/asm-i386/hardirq.h
@@ -12,6 +12,7 @@ typedef struct {
 	unsigned int __local_bh_count;
 	unsigned int __syscall_count;
 	struct task_struct * __ksoftirqd_task; /* waitqueue is too large */
+	unsigned long idle_timestamp;
 	unsigned int __nmi_count;	/* arch dependent */
 } ____cacheline_aligned irq_cpustat_t;