From 3cedbc5a6d7f7c5539e139f89ec9f6e1ed668418 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Mon, 1 May 2017 23:06:08 -0400 Subject: intel_pstate: use updated msr-index.h HWP.EPP values intel_pstate exports sysfs attributes for setting and observing HWP.EPP. These attributes use strings to describe 4 operating states, and inside the driver, these strings are mapped to numerical register values. The authorative mapping between the strings and numerical HWP.EPP values are now globally defined in msr-index.h, replacing the out-dated mapping that were open-coded into intel_pstate.c new old string --- --- ------ 0 0 performance 128 64 balance_performance 192 128 balance_power 255 192 power Note that the HW and BIOS default value on most system is 128, which intel_pstate will now call "balance_performance" while it used to call it "balance_power". Signed-off-by: Len Brown --- drivers/cpufreq/intel_pstate.c | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) (limited to 'drivers/cpufreq/intel_pstate.c') diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 50bd6d987fc3..ab8ebaeb3621 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -716,6 +716,12 @@ static const char * const energy_perf_strings[] = { "power", NULL }; +static const unsigned int epp_values[] = { + HWP_EPP_PERFORMANCE, + HWP_EPP_BALANCE_PERFORMANCE, + HWP_EPP_BALANCE_POWERSAVE, + HWP_EPP_POWERSAVE +}; static int intel_pstate_get_energy_pref_index(struct cpudata *cpu_data) { @@ -727,17 +733,14 @@ static int intel_pstate_get_energy_pref_index(struct cpudata *cpu_data) return epp; if (static_cpu_has(X86_FEATURE_HWP_EPP)) { - /* - * Range: - * 0x00-0x3F : Performance - * 0x40-0x7F : Balance performance - * 0x80-0xBF : Balance power - * 0xC0-0xFF : Power - * The EPP is a 8 bit value, but our ranges restrict the - * value which can be set. Here only using top two bits - * effectively. - */ - index = (epp >> 6) + 1; + if (epp == HWP_EPP_PERFORMANCE) + return 1; + if (epp <= HWP_EPP_BALANCE_PERFORMANCE) + return 2; + if (epp <= HWP_EPP_BALANCE_POWERSAVE) + return 3; + else + return 4; } else if (static_cpu_has(X86_FEATURE_EPB)) { /* * Range: @@ -775,15 +778,8 @@ static int intel_pstate_set_energy_pref_index(struct cpudata *cpu_data, value &= ~GENMASK_ULL(31, 24); - /* - * If epp is not default, convert from index into - * energy_perf_strings to epp value, by shifting 6 - * bits left to use only top two bits in epp. - * The resultant epp need to shifted by 24 bits to - * epp position in MSR_HWP_REQUEST. - */ if (epp == -EINVAL) - epp = (pref_index - 1) << 6; + epp = epp_values[pref_index - 1]; value |= (u64)epp << 24; ret = wrmsrl_on_cpu(cpu_data->cpu, MSR_HWP_REQUEST, value); -- cgit v1.2.3 From 1a4fe38add8be45eb3873ad756561b9baf6bcaef Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Mon, 12 Jun 2017 16:30:27 -0700 Subject: cpufreq: intel_pstate: Remove max/min fractions to limit performance In the current model the max/min perf limits are a fraction of current user space limits to the allowed max_freq or 100% for global limits. This results in wrong ratio limits calculation because of rounding issues for some user space limits. Initially we tried to solve this issue by issue by having more shift bits to increase precision. Still there are isolated cases where we still have error. This can be avoided by using ratios all together. Since the way we get cpuinfo.max_freq is by multiplying scaling factor to max ratio, we can easily keep the max/min ratios in terms of ratios and not fractions. For example: if the max ratio = 36 cpuinfo.max_freq = 36 * 100000 = 3600000 Suppose user space sets a limit of 1200000, then we can calculate max ratio limit as = 36 * 1200000 / 3600000 = 12 This will be correct for any user limits. The other advantage is that, we don't need to do any calculation in the fast path as ratio limit is already calculated via set_policy() callback. Signed-off-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 114 +++++++++++++++++++++++------------------ 1 file changed, 63 insertions(+), 51 deletions(-) (limited to 'drivers/cpufreq/intel_pstate.c') diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index eb1158532de3..be0290a1a5e2 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -231,10 +231,8 @@ struct global_params { * @prev_cummulative_iowait: IO Wait time difference from last and * current sample * @sample: Storage for storing last Sample data - * @min_perf: Minimum capacity limit as a fraction of the maximum - * turbo P-state capacity. - * @max_perf: Maximum capacity limit as a fraction of the maximum - * turbo P-state capacity. + * @min_perf_ratio: Minimum capacity in terms of PERF or HWP ratios + * @max_perf_ratio: Maximum capacity in terms of PERF or HWP ratios * @acpi_perf_data: Stores ACPI perf information read from _PSS * @valid_pss_table: Set to true for valid ACPI _PSS entries found * @epp_powersave: Last saved HWP energy performance preference @@ -266,8 +264,8 @@ struct cpudata { u64 prev_tsc; u64 prev_cummulative_iowait; struct sample sample; - int32_t min_perf; - int32_t max_perf; + int32_t min_perf_ratio; + int32_t max_perf_ratio; #ifdef CONFIG_ACPI struct acpi_processor_performance acpi_perf_data; bool valid_pss_table; @@ -794,25 +792,32 @@ static struct freq_attr *hwp_cpufreq_attrs[] = { NULL, }; -static void intel_pstate_hwp_set(unsigned int cpu) +static void intel_pstate_get_hwp_max(unsigned int cpu, int *phy_max, + int *current_max) { - struct cpudata *cpu_data = all_cpu_data[cpu]; - int min, hw_min, max, hw_max; - u64 value, cap; - s16 epp; + u64 cap; rdmsrl_on_cpu(cpu, MSR_HWP_CAPABILITIES, &cap); - hw_min = HWP_LOWEST_PERF(cap); if (global.no_turbo) - hw_max = HWP_GUARANTEED_PERF(cap); + *current_max = HWP_GUARANTEED_PERF(cap); else - hw_max = HWP_HIGHEST_PERF(cap); + *current_max = HWP_HIGHEST_PERF(cap); + + *phy_max = HWP_HIGHEST_PERF(cap); +} + +static void intel_pstate_hwp_set(unsigned int cpu) +{ + struct cpudata *cpu_data = all_cpu_data[cpu]; + int max, min; + u64 value; + s16 epp; + + max = cpu_data->max_perf_ratio; + min = cpu_data->min_perf_ratio; - max = fp_ext_toint(hw_max * cpu_data->max_perf); if (cpu_data->policy == CPUFREQ_POLICY_PERFORMANCE) min = max; - else - min = fp_ext_toint(hw_max * cpu_data->min_perf); rdmsrl_on_cpu(cpu, MSR_HWP_REQUEST, &value); @@ -1528,8 +1533,7 @@ static void intel_pstate_max_within_limits(struct cpudata *cpu) update_turbo_state(); pstate = intel_pstate_get_base_pstate(cpu); - pstate = max(cpu->pstate.min_pstate, - fp_ext_toint(pstate * cpu->max_perf)); + pstate = max(cpu->pstate.min_pstate, cpu->max_perf_ratio); intel_pstate_set_pstate(cpu, pstate); } @@ -1695,9 +1699,8 @@ static int intel_pstate_prepare_request(struct cpudata *cpu, int pstate) int max_pstate = intel_pstate_get_base_pstate(cpu); int min_pstate; - min_pstate = max(cpu->pstate.min_pstate, - fp_ext_toint(max_pstate * cpu->min_perf)); - max_pstate = max(min_pstate, fp_ext_toint(max_pstate * cpu->max_perf)); + min_pstate = max(cpu->pstate.min_pstate, cpu->min_perf_ratio); + max_pstate = max(min_pstate, cpu->max_perf_ratio); return clamp_t(int, pstate, min_pstate, max_pstate); } @@ -1967,52 +1970,61 @@ static void intel_pstate_update_perf_limits(struct cpufreq_policy *policy, { int max_freq = intel_pstate_get_max_freq(cpu); int32_t max_policy_perf, min_policy_perf; + int max_state, turbo_max; - max_policy_perf = div_ext_fp(policy->max, max_freq); - max_policy_perf = clamp_t(int32_t, max_policy_perf, 0, int_ext_tofp(1)); + /* + * HWP needs some special consideration, because on BDX the + * HWP_REQUEST uses abstract value to represent performance + * rather than pure ratios. + */ + if (hwp_active) { + intel_pstate_get_hwp_max(cpu->cpu, &turbo_max, &max_state); + } else { + max_state = intel_pstate_get_base_pstate(cpu); + turbo_max = cpu->pstate.turbo_pstate; + } + + max_policy_perf = max_state * policy->max / max_freq; if (policy->max == policy->min) { min_policy_perf = max_policy_perf; } else { - min_policy_perf = div_ext_fp(policy->min, max_freq); + min_policy_perf = max_state * policy->min / max_freq; min_policy_perf = clamp_t(int32_t, min_policy_perf, 0, max_policy_perf); } + pr_debug("cpu:%d max_state %d min_policy_perf:%d max_policy_perf:%d\n", + policy->cpu, max_state, + min_policy_perf, max_policy_perf); + /* Normalize user input to [min_perf, max_perf] */ if (per_cpu_limits) { - cpu->min_perf = min_policy_perf; - cpu->max_perf = max_policy_perf; + cpu->min_perf_ratio = min_policy_perf; + cpu->max_perf_ratio = max_policy_perf; } else { int32_t global_min, global_max; /* Global limits are in percent of the maximum turbo P-state. */ - global_max = percent_ext_fp(global.max_perf_pct); - global_min = percent_ext_fp(global.min_perf_pct); - if (max_freq != cpu->pstate.turbo_freq) { - int32_t turbo_factor; - - turbo_factor = div_ext_fp(cpu->pstate.turbo_pstate, - cpu->pstate.max_pstate); - global_min = mul_ext_fp(global_min, turbo_factor); - global_max = mul_ext_fp(global_max, turbo_factor); - } + global_max = DIV_ROUND_UP(turbo_max * global.max_perf_pct, 100); + global_min = DIV_ROUND_UP(turbo_max * global.min_perf_pct, 100); global_min = clamp_t(int32_t, global_min, 0, global_max); - cpu->min_perf = max(min_policy_perf, global_min); - cpu->min_perf = min(cpu->min_perf, max_policy_perf); - cpu->max_perf = min(max_policy_perf, global_max); - cpu->max_perf = max(min_policy_perf, cpu->max_perf); + pr_debug("cpu:%d global_min:%d global_max:%d\n", policy->cpu, + global_min, global_max); - /* Make sure min_perf <= max_perf */ - cpu->min_perf = min(cpu->min_perf, cpu->max_perf); - } + cpu->min_perf_ratio = max(min_policy_perf, global_min); + cpu->min_perf_ratio = min(cpu->min_perf_ratio, max_policy_perf); + cpu->max_perf_ratio = min(max_policy_perf, global_max); + cpu->max_perf_ratio = max(min_policy_perf, cpu->max_perf_ratio); - cpu->max_perf = round_up(cpu->max_perf, EXT_FRAC_BITS); - cpu->min_perf = round_up(cpu->min_perf, EXT_FRAC_BITS); + /* Make sure min_perf <= max_perf */ + cpu->min_perf_ratio = min(cpu->min_perf_ratio, + cpu->max_perf_ratio); - pr_debug("cpu:%d max_perf_pct:%d min_perf_pct:%d\n", policy->cpu, - fp_ext_toint(cpu->max_perf * 100), - fp_ext_toint(cpu->min_perf * 100)); + } + pr_debug("cpu:%d max_perf_ratio:%d min_perf_ratio:%d\n", policy->cpu, + cpu->max_perf_ratio, + cpu->min_perf_ratio); } static int intel_pstate_set_policy(struct cpufreq_policy *policy) @@ -2115,8 +2127,8 @@ static int __intel_pstate_cpu_init(struct cpufreq_policy *policy) cpu = all_cpu_data[policy->cpu]; - cpu->max_perf = int_ext_tofp(1); - cpu->min_perf = 0; + cpu->max_perf_ratio = 0xFF; + cpu->min_perf_ratio = 0; policy->min = cpu->pstate.min_pstate * cpu->pstate.scaling; policy->max = cpu->pstate.turbo_pstate * cpu->pstate.scaling; -- cgit v1.2.3 From 62611cb912f7cb08ef7815780bcc20a09abedd20 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 23 Jun 2017 22:11:53 -0700 Subject: intel_pstate: delete scheduler hook in HWP mode The cpufreq/scaling_cur_freq sysfs attribute is now provided by shared x86 cpufreq code on modern x86 systems, including all systems supported by the intel_pstate driver. In HWP mode, maintaining that value was the sole purpose of the scheduler hook, intel_pstate_update_util_hwp(), so it can now be removed. Signed-off-by: Len Brown Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'drivers/cpufreq/intel_pstate.c') diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index be0290a1a5e2..b00a45595c89 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1736,16 +1736,6 @@ static void intel_pstate_adjust_pstate(struct cpudata *cpu, int target_pstate) fp_toint(cpu->iowait_boost * 100)); } -static void intel_pstate_update_util_hwp(struct update_util_data *data, - u64 time, unsigned int flags) -{ - struct cpudata *cpu = container_of(data, struct cpudata, update_util); - u64 delta_ns = time - cpu->sample.time; - - if ((s64)delta_ns >= INTEL_PSTATE_HWP_SAMPLING_INTERVAL) - intel_pstate_sample(cpu, time); -} - static void intel_pstate_update_util_pid(struct update_util_data *data, u64 time, unsigned int flags) { @@ -1937,6 +1927,9 @@ static void intel_pstate_set_update_util_hook(unsigned int cpu_num) { struct cpudata *cpu = all_cpu_data[cpu_num]; + if (hwp_active) + return; + if (cpu->update_util_set) return; @@ -2570,7 +2563,6 @@ static int __init intel_pstate_init(void) } else { hwp_active++; intel_pstate.attr = hwp_cpufreq_attrs; - pstate_funcs.update_util = intel_pstate_update_util_hwp; goto hwp_cpu_matched; } } else { -- cgit v1.2.3 From 82b4e03e01bc8def546b52707962809f04ae5c7a Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 23 Jun 2017 22:11:54 -0700 Subject: intel_pstate: skip scheduler hook when in "performance" mode When the governor is set to "performance", intel_pstate does not need the scheduler hook for doing any calculations. Under these conditions, its only purpose is to continue to maintain cpufreq/scaling_cur_freq. The cpufreq/scaling_cur_freq sysfs attribute is now provided by shared x86 cpufreq code on modern x86 systems, including all systems supported by the intel_pstate driver. So in "performance" governor mode, the scheduler hook can be skipped. This applies to both in Software and Hardware P-state control modes. Suggested-by: Srinivas Pandruvada Signed-off-by: Len Brown Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/cpufreq/intel_pstate.c') diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index b00a45595c89..1772d309bea9 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -2044,10 +2044,10 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy) */ intel_pstate_clear_update_util_hook(policy->cpu); intel_pstate_max_within_limits(cpu); + } else { + intel_pstate_set_update_util_hook(policy->cpu); } - intel_pstate_set_update_util_hook(policy->cpu); - if (hwp_active) intel_pstate_hwp_set(policy->cpu); -- cgit v1.2.3 From fab24dcc395637557a7988a867e7b3a5823917a9 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 29 Jun 2017 01:47:56 +0200 Subject: cpufreq: intel_pstate: Clean up after performance governor changes After commit 82b4e03e01bc (intel_pstate: skip scheduler hook when in "performance" mode) get_target_pstate_use_performance() and get_target_pstate_use_cpu_load() are never called if scaling_governor is "performance", so drop the CPUFREQ_POLICY_PERFORMANCE checks from them as they will never trigger anyway. Moreover, the documentation needs to be updated to reflect the change made by the above commit, so do that too. Signed-off-by: Rafael J. Wysocki Acked-by: Srinivas Pandruvada --- drivers/cpufreq/intel_pstate.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'drivers/cpufreq/intel_pstate.c') diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 1772d309bea9..756483251832 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1620,9 +1620,6 @@ static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu) int32_t busy_frac, boost; int target, avg_pstate; - if (cpu->policy == CPUFREQ_POLICY_PERFORMANCE) - return cpu->pstate.turbo_pstate; - busy_frac = div_fp(sample->mperf, sample->tsc); boost = cpu->iowait_boost; @@ -1659,9 +1656,6 @@ static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu) int32_t perf_scaled, max_pstate, current_pstate, sample_ratio; u64 duration_ns; - if (cpu->policy == CPUFREQ_POLICY_PERFORMANCE) - return cpu->pstate.turbo_pstate; - /* * perf_scaled is the ratio of the average P-state during the last * sampling period to the P-state requested last time (in percent). -- cgit v1.2.3