Commit 8526eb7f, authored by Srinivas Pandruvada, committed by Rafael J. Wysocki

thermal: intel: powerclamp: Use powercap idle-inject feature

There are two idle injection implementations in the Linux kernel: one
via intel_powerclamp and the other using powercap/idle_inject. Both
implementations end up calling the play_idle* functions from a FIFO
priority thread, and they can't be used at the same time.

It is better to use a single idle injection framework for
maintainability. That way, there is only one caller of play_idle.

Here powercap/idle_inject can be used for both per-core and system-wide
idle injection. This framework has a well-defined interface which
allows registration either per-core or for all CPUs (system wide).

This reduces code complexity in the intel powerclamp driver as all the
per CPU kthreads, delayed work and calls to play_idle can be removed.

The changes include:
 - Remove unneeded include files
 - Remove per CPU kthread workers: balancing_work and idle_injection_work.
 - Reuse the compensation related code by moving from previous worker
   thread to idle_injection callback.
 - Adjust the idle_duration and runtime by using powercap/idle_inject
   interface.
 - Remove all variables, which are not required once powercap/idle_inject
   is used.
 - Add a mutex to avoid a race between the removal of idle injection
   during module unload and a user action changing the idle inject
   percent. The mutex also protects the dynamic adjustment of run and
   idle time from the update() callback.
 - Remove online/offline callbacks to designate control CPU
 - Use cpu_present_mask global variable for CPU mask
 - Remove hot plug locks
Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
parent acbc6610
...@@ -3,6 +3,9 @@ config INTEL_POWERCLAMP ...@@ -3,6 +3,9 @@ config INTEL_POWERCLAMP
tristate "Intel PowerClamp idle injection driver" tristate "Intel PowerClamp idle injection driver"
depends on X86 depends on X86
depends on CPU_SUP_INTEL depends on CPU_SUP_INTEL
depends on CPU_IDLE
select POWERCAP
select IDLE_INJECT
help help
Enable this to enable Intel PowerClamp idle injection driver. This Enable this to enable Intel PowerClamp idle injection driver. This
enforce idle time which results in more package C-state residency. The enforce idle time which results in more package C-state residency. The
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
/* /*
* intel_powerclamp.c - package c-state idle injection * intel_powerclamp.c - package c-state idle injection
* *
* Copyright (c) 2012, Intel Corporation. * Copyright (c) 2012-2023, Intel Corporation.
* *
* Authors: * Authors:
* Arjan van de Ven <arjan@linux.intel.com> * Arjan van de Ven <arjan@linux.intel.com>
...@@ -27,21 +27,15 @@ ...@@ -27,21 +27,15 @@
#include <linux/module.h> #include <linux/module.h>
#include <linux/kernel.h> #include <linux/kernel.h>
#include <linux/delay.h> #include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/cpu.h> #include <linux/cpu.h>
#include <linux/thermal.h> #include <linux/thermal.h>
#include <linux/slab.h>
#include <linux/tick.h>
#include <linux/debugfs.h> #include <linux/debugfs.h>
#include <linux/seq_file.h> #include <linux/seq_file.h>
#include <linux/sched/rt.h> #include <linux/idle_inject.h>
#include <uapi/linux/sched/types.h>
#include <asm/nmi.h>
#include <asm/msr.h> #include <asm/msr.h>
#include <asm/mwait.h> #include <asm/mwait.h>
#include <asm/cpu_device_id.h> #include <asm/cpu_device_id.h>
#include <asm/hardirq.h>
#define MAX_TARGET_RATIO (50U) #define MAX_TARGET_RATIO (50U)
/* For each undisturbed clamping period (no extra wake ups during idle time), /* For each undisturbed clamping period (no extra wake ups during idle time),
...@@ -59,35 +53,26 @@ static unsigned int target_mwait; ...@@ -59,35 +53,26 @@ static unsigned int target_mwait;
static struct dentry *debug_dir; static struct dentry *debug_dir;
static bool poll_pkg_cstate_enable; static bool poll_pkg_cstate_enable;
/* user selected target */ /* Idle ratio observed using package C-state counters */
static unsigned int set_target_ratio;
static unsigned int current_ratio; static unsigned int current_ratio;
static bool should_skip;
static unsigned int control_cpu; /* The cpu assigned to collect stat and update /* Skip the idle injection till set to true */
* control parameters. default to BSP but BSP static bool should_skip;
* can be offlined.
*/
static bool clamping;
struct powerclamp_worker_data { struct powerclamp_data {
struct kthread_worker *worker;
struct kthread_work balancing_work;
struct kthread_delayed_work idle_injection_work;
unsigned int cpu; unsigned int cpu;
unsigned int count; unsigned int count;
unsigned int guard; unsigned int guard;
unsigned int window_size_now; unsigned int window_size_now;
unsigned int target_ratio; unsigned int target_ratio;
unsigned int duration_jiffies;
bool clamping; bool clamping;
}; };
static struct powerclamp_worker_data __percpu *worker_data; static struct powerclamp_data powerclamp_data;
static struct thermal_cooling_device *cooling_dev; static struct thermal_cooling_device *cooling_dev;
static unsigned long *cpu_clamping_mask; /* bit map for tracking per cpu
* clamping kthread worker static DEFINE_MUTEX(powerclamp_lock);
*/
static unsigned int duration; static unsigned int duration;
static unsigned int pkg_cstate_ratio_cur; static unsigned int pkg_cstate_ratio_cur;
...@@ -306,7 +291,7 @@ static void adjust_compensation(int target_ratio, unsigned int win) ...@@ -306,7 +291,7 @@ static void adjust_compensation(int target_ratio, unsigned int win)
if (d->confidence >= CONFIDENCE_OK) if (d->confidence >= CONFIDENCE_OK)
return; return;
delta = set_target_ratio - current_ratio; delta = powerclamp_data.target_ratio - current_ratio;
/* filter out bad data */ /* filter out bad data */
if (delta >= 0 && delta <= (1+target_ratio/10)) { if (delta >= 0 && delta <= (1+target_ratio/10)) {
if (d->steady_comp) if (d->steady_comp)
...@@ -345,82 +330,39 @@ static bool powerclamp_adjust_controls(unsigned int target_ratio, ...@@ -345,82 +330,39 @@ static bool powerclamp_adjust_controls(unsigned int target_ratio,
adjust_compensation(target_ratio, win); adjust_compensation(target_ratio, win);
/* if we are above target+guard, skip */ /* if we are above target+guard, skip */
return set_target_ratio + guard <= current_ratio; return powerclamp_data.target_ratio + guard <= current_ratio;
} }
static void clamp_balancing_func(struct kthread_work *work) /*
* This function calculates runtime from the current target ratio.
* This function gets called under powerclamp_lock.
*/
static unsigned int get_run_time(void)
{ {
struct powerclamp_worker_data *w_data;
int sleeptime;
unsigned long target_jiffies;
unsigned int compensated_ratio; unsigned int compensated_ratio;
int interval; /* jiffies to sleep for each attempt */ unsigned int runtime;
w_data = container_of(work, struct powerclamp_worker_data,
balancing_work);
/* /*
* make sure user selected ratio does not take effect until * make sure user selected ratio does not take effect until
* the next round. adjust target_ratio if user has changed * the next round. adjust target_ratio if user has changed
* target such that we can converge quickly. * target such that we can converge quickly.
*/ */
w_data->target_ratio = READ_ONCE(set_target_ratio); powerclamp_data.guard = 1 + powerclamp_data.target_ratio / 20;
w_data->guard = 1 + w_data->target_ratio / 20; powerclamp_data.window_size_now = window_size;
w_data->window_size_now = window_size;
w_data->duration_jiffies = msecs_to_jiffies(duration);
w_data->count++;
/* /*
* systems may have different ability to enter package level * systems may have different ability to enter package level
* c-states, thus we need to compensate the injected idle ratio * c-states, thus we need to compensate the injected idle ratio
* to achieve the actual target reported by the HW. * to achieve the actual target reported by the HW.
*/ */
compensated_ratio = w_data->target_ratio + compensated_ratio = powerclamp_data.target_ratio +
get_compensation(w_data->target_ratio); get_compensation(powerclamp_data.target_ratio);
if (compensated_ratio <= 0) if (compensated_ratio <= 0)
compensated_ratio = 1; compensated_ratio = 1;
interval = w_data->duration_jiffies * 100 / compensated_ratio;
/* align idle time */
target_jiffies = roundup(jiffies, interval);
sleeptime = target_jiffies - jiffies;
if (sleeptime <= 0)
sleeptime = 1;
if (clamping && w_data->clamping && cpu_online(w_data->cpu))
kthread_queue_delayed_work(w_data->worker,
&w_data->idle_injection_work,
sleeptime);
}
static void clamp_idle_injection_func(struct kthread_work *work) runtime = duration * 100 / compensated_ratio - duration;
{
struct powerclamp_worker_data *w_data;
w_data = container_of(work, struct powerclamp_worker_data,
idle_injection_work.work);
/* return runtime;
* only elected controlling cpu can collect stats and update
* control parameters.
*/
if (w_data->cpu == control_cpu &&
!(w_data->count % w_data->window_size_now)) {
should_skip =
powerclamp_adjust_controls(w_data->target_ratio,
w_data->guard,
w_data->window_size_now);
smp_mb();
}
if (should_skip)
goto balance;
play_idle(jiffies_to_usecs(w_data->duration_jiffies));
balance:
if (clamping && w_data->clamping && cpu_online(w_data->cpu))
kthread_queue_work(w_data->worker, &w_data->balancing_work);
} }
/* /*
...@@ -456,127 +398,135 @@ static void poll_pkg_cstate(struct work_struct *dummy) ...@@ -456,127 +398,135 @@ static void poll_pkg_cstate(struct work_struct *dummy)
msr_last = msr_now; msr_last = msr_now;
tsc_last = tsc_now; tsc_last = tsc_now;
if (true == clamping) mutex_lock(&powerclamp_lock);
if (powerclamp_data.clamping)
schedule_delayed_work(&poll_pkg_cstate_work, HZ); schedule_delayed_work(&poll_pkg_cstate_work, HZ);
mutex_unlock(&powerclamp_lock);
} }
static void start_power_clamp_worker(unsigned long cpu) static struct idle_inject_device *ii_dev;
/*
* This function is called from idle injection core on timer expiry
* for the run duration. This allows powerclamp to readjust or skip
* injecting idle for this cycle.
*/
static bool idle_inject_update(void)
{ {
struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu); bool update = false;
struct kthread_worker *worker;
worker = kthread_create_worker_on_cpu(cpu, 0, "kidle_inj/%ld", cpu); /* We can't sleep in this callback */
if (IS_ERR(worker)) if (!mutex_trylock(&powerclamp_lock))
return; return true;
w_data->worker = worker; if (!(powerclamp_data.count % powerclamp_data.window_size_now)) {
w_data->count = 0;
w_data->cpu = cpu; should_skip = powerclamp_adjust_controls(powerclamp_data.target_ratio,
w_data->clamping = true; powerclamp_data.guard,
set_bit(cpu, cpu_clamping_mask); powerclamp_data.window_size_now);
sched_set_fifo(worker->task); update = true;
kthread_init_work(&w_data->balancing_work, clamp_balancing_func); }
kthread_init_delayed_work(&w_data->idle_injection_work,
clamp_idle_injection_func); if (update) {
kthread_queue_work(w_data->worker, &w_data->balancing_work); unsigned int runtime = get_run_time();
idle_inject_set_duration(ii_dev, runtime, duration);
}
powerclamp_data.count++;
mutex_unlock(&powerclamp_lock);
if (should_skip)
return false;
return true;
} }
static void stop_power_clamp_worker(unsigned long cpu) /* This function starts idle injection by calling idle_inject_start() */
static void trigger_idle_injection(void)
{ {
struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu); unsigned int runtime = get_run_time();
if (!w_data->worker) idle_inject_set_duration(ii_dev, runtime, duration);
return; idle_inject_start(ii_dev);
powerclamp_data.clamping = true;
}
w_data->clamping = false; /*
/* * This function is called from start_power_clamp() to register
* Make sure that all works that get queued after this point see * CPUS with powercap idle injection register and set default
* the clamping disabled. The counter part is not needed because * idle duration and latency.
* there is an implicit memory barrier when the queued work
* is proceed.
*/ */
smp_wmb(); static int powerclamp_idle_injection_register(void)
kthread_cancel_work_sync(&w_data->balancing_work); {
kthread_cancel_delayed_work_sync(&w_data->idle_injection_work);
/* /*
* The balancing work still might be queued here because * The idle inject core will only inject for online CPUs,
* the handling of the "clapming" variable, cancel, and queue * So we can register for all present CPUs. In this way
* operations are not synchronized via a lock. But it is not * if some CPU goes online/offline while idle inject
* a big deal. The balancing work is fast and destroy kthread * is registered, nothing additional calls are required.
* will wait for it. * The same runtime and idle time is applicable for
* newly onlined CPUs if any.
*
* Here cpu_present_mask can be used as is.
* cast to (struct cpumask *) is required as the
* cpu_present_mask is const struct cpumask *, otherwise
* there will be compiler warnings.
*/ */
clear_bit(w_data->cpu, cpu_clamping_mask); ii_dev = idle_inject_register_full((struct cpumask *)cpu_present_mask,
kthread_destroy_worker(w_data->worker); idle_inject_update);
if (!ii_dev) {
pr_err("powerclamp: idle_inject_register failed\n");
return -EAGAIN;
}
idle_inject_set_duration(ii_dev, TICK_USEC, duration);
idle_inject_set_latency(ii_dev, UINT_MAX);
w_data->worker = NULL; return 0;
} }
static int start_power_clamp(void) /*
* This function is called from end_power_clamp() to stop idle injection
* and unregister CPUS from powercap idle injection core.
*/
static void remove_idle_injection(void)
{ {
unsigned long cpu; if (!powerclamp_data.clamping)
return;
set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1); powerclamp_data.clamping = false;
/* prevent cpu hotplug */ idle_inject_stop(ii_dev);
cpus_read_lock(); }
/* prefer BSP */ /*
control_cpu = cpumask_first(cpu_online_mask); * This function is called when user change the cooling device
* state from zero to some other value.
*/
static int start_power_clamp(void)
{
int ret;
clamping = true; ret = powerclamp_idle_injection_register();
if (!ret) {
trigger_idle_injection();
if (poll_pkg_cstate_enable) if (poll_pkg_cstate_enable)
schedule_delayed_work(&poll_pkg_cstate_work, 0); schedule_delayed_work(&poll_pkg_cstate_work, 0);
/* start one kthread worker per online cpu */
for_each_online_cpu(cpu) {
start_power_clamp_worker(cpu);
} }
cpus_read_unlock();
return 0; return ret;
} }
static void end_power_clamp(void) /*
{ * This function is called when user change the cooling device
int i; * state from non zero value zero.
/*
* Block requeuing in all the kthread workers. They will flush and
* stop faster.
*/ */
clamping = false; static void end_power_clamp(void)
for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
pr_debug("clamping worker for cpu %d alive, destroy\n", i);
stop_power_clamp_worker(i);
}
}
static int powerclamp_cpu_online(unsigned int cpu)
{ {
if (clamping == false) if (powerclamp_data.clamping) {
return 0; remove_idle_injection();
start_power_clamp_worker(cpu); idle_inject_unregister(ii_dev);
/* prefer BSP as controlling CPU */
if (cpu == 0) {
control_cpu = 0;
smp_mb();
} }
return 0;
}
static int powerclamp_cpu_predown(unsigned int cpu)
{
if (clamping == false)
return 0;
stop_power_clamp_worker(cpu);
if (cpu != control_cpu)
return 0;
control_cpu = cpumask_first(cpu_online_mask);
if (control_cpu == cpu)
control_cpu = cpumask_next(cpu, cpu_online_mask);
smp_mb();
return 0;
} }
static int powerclamp_get_max_state(struct thermal_cooling_device *cdev, static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
...@@ -590,16 +540,20 @@ static int powerclamp_get_max_state(struct thermal_cooling_device *cdev, ...@@ -590,16 +540,20 @@ static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev, static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
unsigned long *state) unsigned long *state)
{ {
if (clamping) { mutex_lock(&powerclamp_lock);
if (powerclamp_data.clamping) {
if (poll_pkg_cstate_enable) if (poll_pkg_cstate_enable)
*state = pkg_cstate_ratio_cur; *state = pkg_cstate_ratio_cur;
else else
*state = set_target_ratio; *state = powerclamp_data.target_ratio;
} else { } else {
/* to save power, do not poll idle ratio while not clamping */ /* to save power, do not poll idle ratio while not clamping */
*state = -1; /* indicates invalid state */ *state = -1; /* indicates invalid state */
} }
mutex_unlock(&powerclamp_lock);
return 0; return 0;
} }
...@@ -608,24 +562,32 @@ static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev, ...@@ -608,24 +562,32 @@ static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
{ {
int ret = 0; int ret = 0;
mutex_lock(&powerclamp_lock);
new_target_ratio = clamp(new_target_ratio, 0UL, new_target_ratio = clamp(new_target_ratio, 0UL,
(unsigned long) (MAX_TARGET_RATIO-1)); (unsigned long) (MAX_TARGET_RATIO - 1));
if (set_target_ratio == 0 && new_target_ratio > 0) { if (!powerclamp_data.target_ratio && new_target_ratio > 0) {
pr_info("Start idle injection to reduce power\n"); pr_info("Start idle injection to reduce power\n");
set_target_ratio = new_target_ratio; powerclamp_data.target_ratio = new_target_ratio;
ret = start_power_clamp(); ret = start_power_clamp();
if (ret)
powerclamp_data.target_ratio = 0;
goto exit_set; goto exit_set;
} else if (set_target_ratio > 0 && new_target_ratio == 0) { } else if (powerclamp_data.target_ratio > 0 && new_target_ratio == 0) {
pr_info("Stop forced idle injection\n"); pr_info("Stop forced idle injection\n");
end_power_clamp(); end_power_clamp();
set_target_ratio = 0; powerclamp_data.target_ratio = 0;
} else /* adjust currently running */ { } else /* adjust currently running */ {
set_target_ratio = new_target_ratio; unsigned int runtime;
/* make new set_target_ratio visible to other cpus */
smp_mb(); powerclamp_data.target_ratio = new_target_ratio;
runtime = get_run_time();
idle_inject_set_duration(ii_dev, runtime, duration);
} }
exit_set: exit_set:
mutex_unlock(&powerclamp_lock);
return ret; return ret;
} }
...@@ -666,7 +628,6 @@ static int powerclamp_debug_show(struct seq_file *m, void *unused) ...@@ -666,7 +628,6 @@ static int powerclamp_debug_show(struct seq_file *m, void *unused)
{ {
int i = 0; int i = 0;
seq_printf(m, "controlling cpu: %d\n", control_cpu);
seq_printf(m, "pct confidence steady dynamic (compensation)\n"); seq_printf(m, "pct confidence steady dynamic (compensation)\n");
for (i = 0; i < MAX_TARGET_RATIO; i++) { for (i = 0; i < MAX_TARGET_RATIO; i++) {
seq_printf(m, "%d\t%lu\t%lu\t%lu\n", seq_printf(m, "%d\t%lu\t%lu\t%lu\n",
...@@ -689,78 +650,50 @@ static inline void powerclamp_create_debug_files(void) ...@@ -689,78 +650,50 @@ static inline void powerclamp_create_debug_files(void)
&powerclamp_debug_fops); &powerclamp_debug_fops);
} }
static enum cpuhp_state hp_state;
static int __init powerclamp_init(void) static int __init powerclamp_init(void)
{ {
int retval; int retval;
cpu_clamping_mask = bitmap_zalloc(num_possible_cpus(), GFP_KERNEL);
if (!cpu_clamping_mask)
return -ENOMEM;
/* probe cpu features and ids here */ /* probe cpu features and ids here */
retval = powerclamp_probe(); retval = powerclamp_probe();
if (retval) if (retval)
goto exit_free; return retval;
/* set default limit, maybe adjusted during runtime based on feedback */ /* set default limit, maybe adjusted during runtime based on feedback */
window_size = 2; window_size = 2;
retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
"thermal/intel_powerclamp:online",
powerclamp_cpu_online,
powerclamp_cpu_predown);
if (retval < 0)
goto exit_free;
hp_state = retval;
worker_data = alloc_percpu(struct powerclamp_worker_data);
if (!worker_data) {
retval = -ENOMEM;
goto exit_unregister;
}
if (topology_max_packages() == 1 && topology_max_die_per_package() == 1) if (topology_max_packages() == 1 && topology_max_die_per_package() == 1)
poll_pkg_cstate_enable = true; poll_pkg_cstate_enable = true;
cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL, cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
&powerclamp_cooling_ops); &powerclamp_cooling_ops);
if (IS_ERR(cooling_dev)) { if (IS_ERR(cooling_dev))
retval = -ENODEV; return -ENODEV;
goto exit_free_thread;
}
if (!duration) if (!duration)
duration = jiffies_to_msecs(DEFAULT_DURATION_JIFFIES); duration = jiffies_to_usecs(DEFAULT_DURATION_JIFFIES);
powerclamp_create_debug_files(); powerclamp_create_debug_files();
return 0; return 0;
exit_free_thread:
free_percpu(worker_data);
exit_unregister:
cpuhp_remove_state_nocalls(hp_state);
exit_free:
bitmap_free(cpu_clamping_mask);
return retval;
} }
module_init(powerclamp_init); module_init(powerclamp_init);
static void __exit powerclamp_exit(void) static void __exit powerclamp_exit(void)
{ {
mutex_lock(&powerclamp_lock);
end_power_clamp(); end_power_clamp();
cpuhp_remove_state_nocalls(hp_state); mutex_unlock(&powerclamp_lock);
free_percpu(worker_data);
thermal_cooling_device_unregister(cooling_dev); thermal_cooling_device_unregister(cooling_dev);
bitmap_free(cpu_clamping_mask);
cancel_delayed_work_sync(&poll_pkg_cstate_work); cancel_delayed_work_sync(&poll_pkg_cstate_work);
debugfs_remove_recursive(debug_dir); debugfs_remove_recursive(debug_dir);
} }
module_exit(powerclamp_exit); module_exit(powerclamp_exit);
MODULE_IMPORT_NS(IDLE_INJECT);
MODULE_LICENSE("GPL"); MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>"); MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>"); MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment