aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--arch/x86/include/asm/e820/api.h3
-rw-r--r--arch/x86/include/asm/e820/types.h2
-rw-r--r--arch/x86/include/asm/uv/bios.h2
-rw-r--r--arch/x86/kernel/acpi/boot.c12
-rw-r--r--arch/x86/kernel/cpu/topology.c15
-rw-r--r--arch/x86/kernel/e820.c479
-rw-r--r--arch/x86/kernel/setup.c10
-rw-r--r--arch/x86/platform/efi/efi.c3
-rw-r--r--include/linux/compiler_types.h19
-rw-r--r--include/linux/sched.h13
-rw-r--r--kernel/sched/core.c45
-rw-r--r--kernel/sched/deadline.c14
-rw-r--r--kernel/sched/ext.c5
-rw-r--r--kernel/sched/fair.c361
-rw-r--r--kernel/sched/idle.c3
-rw-r--r--kernel/sched/rt.c9
-rw-r--r--kernel/sched/sched.h59
-rw-r--r--kernel/sched/stop_task.c3
18 files changed, 550 insertions, 507 deletions
diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h
index c83645d5b2a86d..bbe0c8de976c0b 100644
--- a/arch/x86/include/asm/e820/api.h
+++ b/arch/x86/include/asm/e820/api.h
@@ -16,10 +16,9 @@ extern bool e820__mapped_all(u64 start, u64 end, enum e820_type type);
extern void e820__range_add (u64 start, u64 size, enum e820_type type);
extern u64 e820__range_update(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type);
-extern u64 e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type);
+extern void e820__range_remove(u64 start, u64 size, enum e820_type filter_type);
extern u64 e820__range_update_table(struct e820_table *t, u64 start, u64 size, enum e820_type old_type, enum e820_type new_type);
-extern void e820__print_table(char *who);
extern int e820__update_table(struct e820_table *table);
extern void e820__update_table_print(void);
diff --git a/arch/x86/include/asm/e820/types.h b/arch/x86/include/asm/e820/types.h
index 80c4a7266629c9..df12f7ee75d369 100644
--- a/arch/x86/include/asm/e820/types.h
+++ b/arch/x86/include/asm/e820/types.h
@@ -83,7 +83,7 @@ struct e820_entry {
* The whole array of E820 entries:
*/
struct e820_table {
- __u32 nr_entries;
+ u32 nr_entries;
struct e820_entry entries[E820_MAX_ENTRIES];
};
diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h
index 6989b824fd321b..d0b62e2552902c 100644
--- a/arch/x86/include/asm/uv/bios.h
+++ b/arch/x86/include/asm/uv/bios.h
@@ -122,7 +122,7 @@ struct uv_systab {
struct {
u32 type:8; /* type of entry */
u32 offset:24; /* byte offset from struct start to entry */
- } entry[1]; /* additional entries follow */
+ } entry[]; /* additional entries follow */
};
extern struct uv_systab *uv_systab;
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 9fa321a95eb33f..d6138b2b633a31 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -35,6 +35,7 @@
#include <asm/smp.h>
#include <asm/i8259.h>
#include <asm/setup.h>
+#include <asm/hypervisor.h>
#include "sleep.h" /* To include x86_acpi_suspend_lowlevel */
static int __initdata acpi_force = 0;
@@ -164,11 +165,14 @@ static bool __init acpi_is_processor_usable(u32 lapic_flags)
if (lapic_flags & ACPI_MADT_ENABLED)
return true;
- if (!acpi_support_online_capable ||
- (lapic_flags & ACPI_MADT_ONLINE_CAPABLE))
- return true;
+ if (acpi_support_online_capable)
+ return lapic_flags & ACPI_MADT_ONLINE_CAPABLE;
- return false;
+ /*
+ * QEMU expects legacy "Enabled=0" LAPIC entries to be counted as usable
+ * in order to support CPU hotplug in guests.
+ */
+ return !hypervisor_is_type(X86_HYPER_NATIVE);
}
static int __init
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
index f55ea3cdbf88ef..23190a786d3104 100644
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -27,7 +27,6 @@
#include <xen/xen.h>
#include <asm/apic.h>
-#include <asm/hypervisor.h>
#include <asm/io_apic.h>
#include <asm/mpspec.h>
#include <asm/msr.h>
@@ -236,20 +235,6 @@ static __init void topo_register_apic(u32 apic_id, u32 acpi_id, bool present)
cpuid_to_apicid[cpu] = apic_id;
topo_set_cpuids(cpu, apic_id, acpi_id);
} else {
- u32 pkgid = topo_apicid(apic_id, TOPO_PKG_DOMAIN);
-
- /*
- * Check for present APICs in the same package when running
- * on bare metal. Allow the bogosity in a guest.
- */
- if (hypervisor_is_type(X86_HYPER_NATIVE) &&
- topo_unit_count(pkgid, TOPO_PKG_DOMAIN, phys_cpu_present_map)) {
- pr_info_once("Ignoring hot-pluggable APIC ID %x in present package.\n",
- apic_id);
- topo_info.nr_rejected_cpus++;
- return;
- }
-
topo_info.nr_disabled_cpus++;
}
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index b15b97d3cb52dc..97b54bd0f4822b 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -9,13 +9,11 @@
* quirks and other tweaks, and feeds that into the generic Linux memory
* allocation code routines via a platform independent interface (memblock, etc.).
*/
-#include <linux/crash_dump.h>
#include <linux/memblock.h>
#include <linux/suspend.h>
#include <linux/acpi.h>
#include <linux/firmware-map.h>
#include <linux/sort.h>
-#include <linux/memory_hotplug.h>
#include <linux/kvm_types.h>
#include <asm/e820/api.h>
@@ -57,13 +55,13 @@
* re-propagated. So its main role is a temporary bootstrap storage of firmware
* specific memory layout data during early bootup.
*/
-static struct e820_table e820_table_init __initdata;
-static struct e820_table e820_table_kexec_init __initdata;
-static struct e820_table e820_table_firmware_init __initdata;
+__initdata static struct e820_table e820_table_init;
+__initdata static struct e820_table e820_table_kexec_init;
+__initdata static struct e820_table e820_table_firmware_init;
-struct e820_table *e820_table __refdata = &e820_table_init;
-struct e820_table *e820_table_kexec __refdata = &e820_table_kexec_init;
-struct e820_table *e820_table_firmware __refdata = &e820_table_firmware_init;
+__refdata struct e820_table *e820_table = &e820_table_init;
+__refdata struct e820_table *e820_table_kexec = &e820_table_kexec_init;
+__refdata struct e820_table *e820_table_firmware = &e820_table_firmware_init;
/* For PCI or other memory-mapped resources */
unsigned long pci_mem_start = 0xaeedbabe;
@@ -78,10 +76,10 @@ EXPORT_SYMBOL(pci_mem_start);
static bool _e820__mapped_any(struct e820_table *table,
u64 start, u64 end, enum e820_type type)
{
- int i;
+ u32 idx;
- for (i = 0; i < table->nr_entries; i++) {
- struct e820_entry *entry = &table->entries[i];
+ for (idx = 0; idx < table->nr_entries; idx++) {
+ struct e820_entry *entry = &table->entries[idx];
if (type && entry->type != type)
continue;
@@ -113,10 +111,10 @@ EXPORT_SYMBOL_GPL(e820__mapped_any);
static struct e820_entry *__e820__mapped_all(u64 start, u64 end,
enum e820_type type)
{
- int i;
+ u32 idx;
- for (i = 0; i < e820_table->nr_entries; i++) {
- struct e820_entry *entry = &e820_table->entries[i];
+ for (idx = 0; idx < e820_table->nr_entries; idx++) {
+ struct e820_entry *entry = &e820_table->entries[idx];
if (type && entry->type != type)
continue;
@@ -146,7 +144,7 @@ static struct e820_entry *__e820__mapped_all(u64 start, u64 end,
/*
* This function checks if the entire range <start,end> is mapped with type.
*/
-bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type)
+__init bool e820__mapped_all(u64 start, u64 end, enum e820_type type)
{
return __e820__mapped_all(start, end, type);
}
@@ -164,54 +162,74 @@ int e820__get_entry_type(u64 start, u64 end)
/*
* Add a memory region to the kernel E820 map.
*/
-static void __init __e820__range_add(struct e820_table *table, u64 start, u64 size, enum e820_type type)
+__init static void __e820__range_add(struct e820_table *table, u64 start, u64 size, enum e820_type type)
{
- int x = table->nr_entries;
+ u32 idx = table->nr_entries;
+ struct e820_entry *entry_new;
- if (x >= ARRAY_SIZE(table->entries)) {
- pr_err("too many entries; ignoring [mem %#010llx-%#010llx]\n",
- start, start + size - 1);
+ if (idx >= ARRAY_SIZE(table->entries)) {
+ pr_err("E820 table full; ignoring [mem %#010llx-%#010llx]\n",
+ start, start + size-1);
return;
}
- table->entries[x].addr = start;
- table->entries[x].size = size;
- table->entries[x].type = type;
+ entry_new = table->entries + idx;
+
+ entry_new->addr = start;
+ entry_new->size = size;
+ entry_new->type = type;
+
table->nr_entries++;
}
-void __init e820__range_add(u64 start, u64 size, enum e820_type type)
+__init void e820__range_add(u64 start, u64 size, enum e820_type type)
{
__e820__range_add(e820_table, start, size, type);
}
-static void __init e820_print_type(enum e820_type type)
+__init static void e820_print_type(enum e820_type type)
{
switch (type) {
- case E820_TYPE_RAM: pr_cont("usable"); break;
- case E820_TYPE_RESERVED: pr_cont("reserved"); break;
- case E820_TYPE_SOFT_RESERVED: pr_cont("soft reserved"); break;
- case E820_TYPE_ACPI: pr_cont("ACPI data"); break;
- case E820_TYPE_NVS: pr_cont("ACPI NVS"); break;
- case E820_TYPE_UNUSABLE: pr_cont("unusable"); break;
+ case E820_TYPE_RAM: pr_cont(" System RAM"); break;
+ case E820_TYPE_RESERVED: pr_cont(" device reserved"); break;
+ case E820_TYPE_SOFT_RESERVED: pr_cont(" soft reserved"); break;
+ case E820_TYPE_ACPI: pr_cont(" ACPI data"); break;
+ case E820_TYPE_NVS: pr_cont(" ACPI NVS"); break;
+ case E820_TYPE_UNUSABLE: pr_cont(" unusable"); break;
case E820_TYPE_PMEM: /* Fall through: */
- case E820_TYPE_PRAM: pr_cont("persistent (type %u)", type); break;
- default: pr_cont("type %u", type); break;
+ case E820_TYPE_PRAM: pr_cont(" persistent RAM (type %u)", type); break;
+ default: pr_cont(" type %u", type); break;
}
}
-void __init e820__print_table(char *who)
+__init static void e820__print_table(const char *who)
{
- int i;
+ u64 range_end_prev = 0;
+ u32 idx;
+
+ for (idx = 0; idx < e820_table->nr_entries; idx++) {
+ struct e820_entry *entry = e820_table->entries + idx;
+ u64 range_start, range_end;
+
+ range_start = entry->addr;
+ range_end = entry->addr + entry->size;
- for (i = 0; i < e820_table->nr_entries; i++) {
- pr_info("%s: [mem %#018Lx-%#018Lx] ",
- who,
- e820_table->entries[i].addr,
- e820_table->entries[i].addr + e820_table->entries[i].size - 1);
+ /* Out of order E820 maps should not happen: */
+ if (range_start < range_end_prev)
+ pr_info(FW_BUG "out of order E820 entry!\n");
- e820_print_type(e820_table->entries[i].type);
+ if (range_start > range_end_prev) {
+ pr_info("%s: [gap %#018Lx-%#018Lx]\n",
+ who,
+ range_end_prev,
+ range_start-1);
+ }
+
+ pr_info("%s: [mem %#018Lx-%#018Lx] ", who, range_start, range_end-1);
+ e820_print_type(entry->type);
pr_cont("\n");
+
+ range_end_prev = range_end;
}
}
@@ -280,15 +298,15 @@ struct change_member {
/* Pointer to the original entry: */
struct e820_entry *entry;
/* Address for this change point: */
- unsigned long long addr;
+ u64 addr;
};
-static struct change_member change_point_list[2*E820_MAX_ENTRIES] __initdata;
-static struct change_member *change_point[2*E820_MAX_ENTRIES] __initdata;
-static struct e820_entry *overlap_list[E820_MAX_ENTRIES] __initdata;
-static struct e820_entry new_entries[E820_MAX_ENTRIES] __initdata;
+__initdata static struct change_member change_point_list[2*E820_MAX_ENTRIES];
+__initdata static struct change_member *change_point[2*E820_MAX_ENTRIES];
+__initdata static struct e820_entry *overlap_list[E820_MAX_ENTRIES];
+__initdata static struct e820_entry new_entries[E820_MAX_ENTRIES];
-static int __init cpcompare(const void *a, const void *b)
+__init static int cpcompare(const void *a, const void *b)
{
struct change_member * const *app = a, * const *bpp = b;
const struct change_member *ap = *app, *bp = *bpp;
@@ -305,28 +323,32 @@ static int __init cpcompare(const void *a, const void *b)
return (ap->addr != ap->entry->addr) - (bp->addr != bp->entry->addr);
}
-static bool e820_nomerge(enum e820_type type)
+/*
+ * Can two consecutive E820 entries of this same E820 type be merged?
+ */
+static bool e820_type_mergeable(enum e820_type type)
{
/*
* These types may indicate distinct platform ranges aligned to
- * numa node, protection domain, performance domain, or other
+ * NUMA node, protection domain, performance domain, or other
* boundaries. Do not merge them.
*/
if (type == E820_TYPE_PRAM)
- return true;
+ return false;
if (type == E820_TYPE_SOFT_RESERVED)
- return true;
- return false;
+ return false;
+
+ return true;
}
-int __init e820__update_table(struct e820_table *table)
+__init int e820__update_table(struct e820_table *table)
{
struct e820_entry *entries = table->entries;
u32 max_nr_entries = ARRAY_SIZE(table->entries);
enum e820_type current_type, last_type;
- unsigned long long last_addr;
+ u64 last_addr;
u32 new_nr_entries, overlap_entries;
- u32 i, chg_idx, chg_nr;
+ u32 idx, chg_idx, chg_nr;
/* If there's only one memory region, don't bother: */
if (table->nr_entries < 2)
@@ -335,26 +357,26 @@ int __init e820__update_table(struct e820_table *table)
BUG_ON(table->nr_entries > max_nr_entries);
/* Bail out if we find any unreasonable addresses in the map: */
- for (i = 0; i < table->nr_entries; i++) {
- if (entries[i].addr + entries[i].size < entries[i].addr)
+ for (idx = 0; idx < table->nr_entries; idx++) {
+ if (entries[idx].addr + entries[idx].size < entries[idx].addr)
return -1;
}
/* Create pointers for initial change-point information (for sorting): */
- for (i = 0; i < 2 * table->nr_entries; i++)
- change_point[i] = &change_point_list[i];
+ for (idx = 0; idx < 2 * table->nr_entries; idx++)
+ change_point[idx] = &change_point_list[idx];
/*
* Record all known change-points (starting and ending addresses),
* omitting empty memory regions:
*/
chg_idx = 0;
- for (i = 0; i < table->nr_entries; i++) {
- if (entries[i].size != 0) {
- change_point[chg_idx]->addr = entries[i].addr;
- change_point[chg_idx++]->entry = &entries[i];
- change_point[chg_idx]->addr = entries[i].addr + entries[i].size;
- change_point[chg_idx++]->entry = &entries[i];
+ for (idx = 0; idx < table->nr_entries; idx++) {
+ if (entries[idx].size != 0) {
+ change_point[chg_idx]->addr = entries[idx].addr;
+ change_point[chg_idx++]->entry = &entries[idx];
+ change_point[chg_idx]->addr = entries[idx].addr + entries[idx].size;
+ change_point[chg_idx++]->entry = &entries[idx];
}
}
chg_nr = chg_idx;
@@ -376,9 +398,9 @@ int __init e820__update_table(struct e820_table *table)
overlap_list[overlap_entries++] = change_point[chg_idx]->entry;
} else {
/* Remove entry from list (order independent, so swap with last): */
- for (i = 0; i < overlap_entries; i++) {
- if (overlap_list[i] == change_point[chg_idx]->entry)
- overlap_list[i] = overlap_list[overlap_entries-1];
+ for (idx = 0; idx < overlap_entries; idx++) {
+ if (overlap_list[idx] == change_point[chg_idx]->entry)
+ overlap_list[idx] = overlap_list[overlap_entries-1];
}
overlap_entries--;
}
@@ -388,13 +410,13 @@ int __init e820__update_table(struct e820_table *table)
* 1=usable, 2,3,4,4+=unusable)
*/
current_type = 0;
- for (i = 0; i < overlap_entries; i++) {
- if (overlap_list[i]->type > current_type)
- current_type = overlap_list[i]->type;
+ for (idx = 0; idx < overlap_entries; idx++) {
+ if (overlap_list[idx]->type > current_type)
+ current_type = overlap_list[idx]->type;
}
/* Continue building up new map based on this information: */
- if (current_type != last_type || e820_nomerge(current_type)) {
+ if (current_type != last_type || !e820_type_mergeable(current_type)) {
if (last_type) {
new_entries[new_nr_entries].size = change_point[chg_idx]->addr - last_addr;
/* Move forward only if the new size was non-zero: */
@@ -419,17 +441,22 @@ int __init e820__update_table(struct e820_table *table)
return 0;
}
-static int __init __append_e820_table(struct boot_e820_entry *entries, u32 nr_entries)
+/*
+ * Copy the BIOS E820 map into the kernel's e820_table.
+ *
+ * Sanity-check it while we're at it..
+ */
+__init static int append_e820_table(struct boot_e820_entry *entries, u32 nr_entries)
{
struct boot_e820_entry *entry = entries;
while (nr_entries) {
u64 start = entry->addr;
- u64 size = entry->size;
- u64 end = start + size - 1;
- u32 type = entry->type;
+ u64 size = entry->size;
+ u64 end = start + size-1;
+ u32 type = entry->type;
- /* Ignore the entry on 64-bit overflow: */
+ /* Ignore the remaining entries on 64-bit overflow: */
if (start > end && likely(size))
return -1;
@@ -441,29 +468,11 @@ static int __init __append_e820_table(struct boot_e820_entry *entries, u32 nr_en
return 0;
}
-/*
- * Copy the BIOS E820 map into a safe place.
- *
- * Sanity-check it while we're at it..
- *
- * If we're lucky and live on a modern system, the setup code
- * will have given us a memory map that we can use to properly
- * set up memory. If we aren't, we'll fake a memory map.
- */
-static int __init append_e820_table(struct boot_e820_entry *entries, u32 nr_entries)
-{
- /* Only one memory region (or negative)? Ignore it */
- if (nr_entries < 2)
- return -1;
-
- return __append_e820_table(entries, nr_entries);
-}
-
-static u64 __init
+__init static u64
__e820__range_update(struct e820_table *table, u64 start, u64 size, enum e820_type old_type, enum e820_type new_type)
{
u64 end;
- unsigned int i;
+ u32 idx;
u64 real_updated_size = 0;
BUG_ON(old_type == new_type);
@@ -472,14 +481,14 @@ __e820__range_update(struct e820_table *table, u64 start, u64 size, enum e820_ty
size = ULLONG_MAX - start;
end = start + size;
- printk(KERN_DEBUG "e820: update [mem %#010Lx-%#010Lx] ", start, end - 1);
+ printk(KERN_DEBUG "e820: update [mem %#010Lx-%#010Lx]", start, end - 1);
e820_print_type(old_type);
- pr_cont(" ==> ");
+ pr_cont(" ==>");
e820_print_type(new_type);
pr_cont("\n");
- for (i = 0; i < table->nr_entries; i++) {
- struct e820_entry *entry = &table->entries[i];
+ for (idx = 0; idx < table->nr_entries; idx++) {
+ struct e820_entry *entry = &table->entries[idx];
u64 final_start, final_end;
u64 entry_end;
@@ -527,46 +536,44 @@ __e820__range_update(struct e820_table *table, u64 start, u64 size, enum e820_ty
return real_updated_size;
}
-u64 __init e820__range_update(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type)
+__init u64 e820__range_update(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type)
{
return __e820__range_update(e820_table, start, size, old_type, new_type);
}
-u64 __init e820__range_update_table(struct e820_table *t, u64 start, u64 size,
+__init u64 e820__range_update_table(struct e820_table *t, u64 start, u64 size,
enum e820_type old_type, enum e820_type new_type)
{
return __e820__range_update(t, start, size, old_type, new_type);
}
/* Remove a range of memory from the E820 table: */
-u64 __init e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type)
+__init void e820__range_remove(u64 start, u64 size, enum e820_type filter_type)
{
- int i;
+ u32 idx;
u64 end;
- u64 real_removed_size = 0;
if (size > (ULLONG_MAX - start))
size = ULLONG_MAX - start;
end = start + size;
- printk(KERN_DEBUG "e820: remove [mem %#010Lx-%#010Lx] ", start, end - 1);
- if (check_type)
- e820_print_type(old_type);
+ printk(KERN_DEBUG "e820: remove [mem %#010Lx-%#010Lx]", start, end - 1);
+ if (filter_type)
+ e820_print_type(filter_type);
pr_cont("\n");
- for (i = 0; i < e820_table->nr_entries; i++) {
- struct e820_entry *entry = &e820_table->entries[i];
+ for (idx = 0; idx < e820_table->nr_entries; idx++) {
+ struct e820_entry *entry = &e820_table->entries[idx];
u64 final_start, final_end;
u64 entry_end;
- if (check_type && entry->type != old_type)
+ if (filter_type && entry->type != filter_type)
continue;
entry_end = entry->addr + entry->size;
/* Completely covered? */
if (entry->addr >= start && entry_end <= end) {
- real_removed_size += entry->size;
memset(entry, 0, sizeof(*entry));
continue;
}
@@ -575,7 +582,6 @@ u64 __init e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool
if (entry->addr < start && entry_end > end) {
e820__range_add(end, entry_end - end, entry->type);
entry->size = start - entry->addr;
- real_removed_size += size;
continue;
}
@@ -585,8 +591,6 @@ u64 __init e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool
if (final_start >= final_end)
continue;
- real_removed_size += final_end - final_start;
-
/*
* Left range could be head or tail, so need to update
* the size first:
@@ -597,10 +601,9 @@ u64 __init e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool
entry->addr = final_end;
}
- return real_removed_size;
}
-void __init e820__update_table_print(void)
+__init void e820__update_table_print(void)
{
if (e820__update_table(e820_table))
return;
@@ -609,42 +612,64 @@ void __init e820__update_table_print(void)
e820__print_table("modified");
}
-static void __init e820__update_table_kexec(void)
+__init static void e820__update_table_kexec(void)
{
e820__update_table(e820_table_kexec);
}
-#define MAX_GAP_END 0x100000000ull
+#define MAX_GAP_END SZ_4G
/*
* Search for a gap in the E820 memory space from 0 to MAX_GAP_END (4GB).
*/
-static int __init e820_search_gap(unsigned long *gapstart, unsigned long *gapsize)
+__init static int e820_search_gap(unsigned long *max_gap_start, unsigned long *max_gap_size)
{
- unsigned long long last = MAX_GAP_END;
- int i = e820_table->nr_entries;
+ struct e820_entry *entry;
+ u64 range_end_prev = 0;
int found = 0;
+ u32 idx;
+
+ for (idx = 0; idx < e820_table->nr_entries; idx++) {
+ u64 range_start, range_end;
+
+ entry = e820_table->entries + idx;
+ range_start = entry->addr;
+ range_end = entry->addr + entry->size;
+
+ /* Process any gap before this entry: */
+ if (range_start > range_end_prev) {
+ u64 gap_start = range_end_prev;
+ u64 gap_end = range_start;
+ u64 gap_size;
+
+ if (gap_start < MAX_GAP_END) {
+ /* Make sure the entirety of the gap is below MAX_GAP_END: */
+ gap_end = min(gap_end, MAX_GAP_END);
+ gap_size = gap_end-gap_start;
+
+ if (gap_size >= *max_gap_size) {
+ *max_gap_start = gap_start;
+ *max_gap_size = gap_size;
+ found = 1;
+ }
+ }
+ }
- while (--i >= 0) {
- unsigned long long start = e820_table->entries[i].addr;
- unsigned long long end = start + e820_table->entries[i].size;
+ range_end_prev = range_end;
+ }
- /*
- * Since "last" is at most 4GB, we know we'll
- * fit in 32 bits if this condition is true:
- */
- if (last > end) {
- unsigned long gap = last - end;
+ /* Is there a usable gap beyond the last entry: */
+ if (entry->addr + entry->size < MAX_GAP_END) {
+ u64 gap_start = entry->addr + entry->size;
+ u64 gap_size = MAX_GAP_END-gap_start;
- if (gap >= *gapsize) {
- *gapsize = gap;
- *gapstart = end;
- found = 1;
- }
+ if (gap_size >= *max_gap_size) {
+ *max_gap_start = gap_start;
+ *max_gap_size = gap_size;
+ found = 1;
}
- if (start < last)
- last = start;
}
+
return found;
}
@@ -658,29 +683,30 @@ static int __init e820_search_gap(unsigned long *gapstart, unsigned long *gapsiz
*/
__init void e820__setup_pci_gap(void)
{
- unsigned long gapstart, gapsize;
+ unsigned long max_gap_start, max_gap_size;
int found;
- gapsize = 0x400000;
- found = e820_search_gap(&gapstart, &gapsize);
+ /* The minimum eligible gap size is 4MB: */
+ max_gap_size = SZ_4M;
+ found = e820_search_gap(&max_gap_start, &max_gap_size);
if (!found) {
#ifdef CONFIG_X86_64
- gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024;
+ max_gap_start = (max_pfn << PAGE_SHIFT) + SZ_1M;
pr_err("Cannot find an available gap in the 32-bit address range\n");
pr_err("PCI devices with unassigned 32-bit BARs may not work!\n");
#else
- gapstart = 0x10000000;
+ max_gap_start = SZ_256M;
#endif
}
/*
* e820__reserve_resources_late() protects stolen RAM already:
*/
- pci_mem_start = gapstart;
+ pci_mem_start = max_gap_start;
- pr_info("[mem %#010lx-%#010lx] available for PCI devices\n",
- gapstart, gapstart + gapsize - 1);
+ pr_info("[gap %#010lx-%#010lx] available for PCI devices\n",
+ max_gap_start, max_gap_start + max_gap_size-1);
}
/*
@@ -722,7 +748,7 @@ __init void e820__reallocate_tables(void)
* the remaining (if any) entries are passed via the SETUP_E820_EXT node of
* struct setup_data, which is parsed here.
*/
-void __init e820__memory_setup_extended(u64 phys_addr, u32 data_len)
+__init void e820__memory_setup_extended(u64 phys_addr, u32 data_len)
{
int entries;
struct boot_e820_entry *extmap;
@@ -732,7 +758,7 @@ void __init e820__memory_setup_extended(u64 phys_addr, u32 data_len)
entries = sdata->len / sizeof(*extmap);
extmap = (struct boot_e820_entry *)(sdata->data);
- __append_e820_table(extmap, entries);
+ append_e820_table(extmap, entries);
e820__update_table(e820_table);
memcpy(e820_table_kexec, e820_table, sizeof(*e820_table_kexec));
@@ -751,13 +777,13 @@ void __init e820__memory_setup_extended(u64 phys_addr, u32 data_len)
* This function requires the E820 map to be sorted and without any
* overlapping entries.
*/
-void __init e820__register_nosave_regions(unsigned long limit_pfn)
+__init void e820__register_nosave_regions(unsigned long limit_pfn)
{
- int i;
+ u32 idx;
u64 last_addr = 0;
- for (i = 0; i < e820_table->nr_entries; i++) {
- struct e820_entry *entry = &e820_table->entries[i];
+ for (idx = 0; idx < e820_table->nr_entries; idx++) {
+ struct e820_entry *entry = &e820_table->entries[idx];
if (entry->type != E820_TYPE_RAM)
continue;
@@ -776,12 +802,12 @@ void __init e820__register_nosave_regions(unsigned long limit_pfn)
* Register ACPI NVS memory regions, so that we can save/restore them during
* hibernation and the subsequent resume:
*/
-static int __init e820__register_nvs_regions(void)
+__init static int e820__register_nvs_regions(void)
{
- int i;
+ u32 idx;
- for (i = 0; i < e820_table->nr_entries; i++) {
- struct e820_entry *entry = &e820_table->entries[i];
+ for (idx = 0; idx < e820_table->nr_entries; idx++) {
+ struct e820_entry *entry = &e820_table->entries[idx];
if (entry->type == E820_TYPE_NVS)
acpi_nvs_register(entry->addr, entry->size);
@@ -800,7 +826,7 @@ core_initcall(e820__register_nvs_regions);
* This allows kexec to fake a new mptable, as if it came from the real
* system.
*/
-u64 __init e820__memblock_alloc_reserved(u64 size, u64 align)
+__init u64 e820__memblock_alloc_reserved(u64 size, u64 align)
{
u64 addr;
@@ -827,14 +853,14 @@ u64 __init e820__memblock_alloc_reserved(u64 size, u64 align)
/*
* Find the highest page frame number we have available
*/
-static unsigned long __init e820__end_ram_pfn(unsigned long limit_pfn)
+__init static unsigned long e820__end_ram_pfn(unsigned long limit_pfn)
{
- int i;
+ u32 idx;
unsigned long last_pfn = 0;
unsigned long max_arch_pfn = MAX_ARCH_PFN;
- for (i = 0; i < e820_table->nr_entries; i++) {
- struct e820_entry *entry = &e820_table->entries[i];
+ for (idx = 0; idx < e820_table->nr_entries; idx++) {
+ struct e820_entry *entry = &e820_table->entries[idx];
unsigned long start_pfn;
unsigned long end_pfn;
@@ -863,26 +889,20 @@ static unsigned long __init e820__end_ram_pfn(unsigned long limit_pfn)
return last_pfn;
}
-unsigned long __init e820__end_of_ram_pfn(void)
+__init unsigned long e820__end_of_ram_pfn(void)
{
return e820__end_ram_pfn(MAX_ARCH_PFN);
}
-unsigned long __init e820__end_of_low_ram_pfn(void)
+__init unsigned long e820__end_of_low_ram_pfn(void)
{
return e820__end_ram_pfn(1UL << (32 - PAGE_SHIFT));
}
-static void __init early_panic(char *msg)
-{
- early_printk(msg);
- panic(msg);
-}
-
-static int userdef __initdata;
+__initdata static int userdef;
/* The "mem=nopentium" boot option disables 4MB page tables on 32-bit kernels: */
-static int __init parse_memopt(char *p)
+__init static int parse_memopt(char *p)
{
u64 mem_size;
@@ -906,7 +926,7 @@ static int __init parse_memopt(char *p)
if (mem_size == 0)
return -EINVAL;
- e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1);
+ e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM);
#ifdef CONFIG_MEMORY_HOTPLUG
max_mem_size = mem_size;
@@ -916,7 +936,7 @@ static int __init parse_memopt(char *p)
}
early_param("mem", parse_memopt);
-static int __init parse_memmap_one(char *p)
+__init static int parse_memmap_one(char *p)
{
char *oldp;
u64 start_at, mem_size;
@@ -962,18 +982,16 @@ static int __init parse_memmap_one(char *p)
e820__range_update(start_at, mem_size, from, to);
else if (to)
e820__range_add(start_at, mem_size, to);
- else if (from)
- e820__range_remove(start_at, mem_size, from, 1);
else
- e820__range_remove(start_at, mem_size, 0, 0);
+ e820__range_remove(start_at, mem_size, from);
} else {
- e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1);
+ e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM);
}
return *p == '\0' ? 0 : -EINVAL;
}
-static int __init parse_memmap_opt(char *str)
+__init static int parse_memmap_opt(char *str)
{
while (str) {
char *k = strchr(str, ',');
@@ -994,18 +1012,18 @@ early_param("memmap", parse_memmap_opt);
* have been processed, in which case we already have an E820 table filled in
* via the parameter callback function(s), but it's not sorted and printed yet:
*/
-void __init e820__finish_early_params(void)
+__init void e820__finish_early_params(void)
{
if (userdef) {
if (e820__update_table(e820_table) < 0)
- early_panic("Invalid user supplied memory map");
+ panic("Invalid user supplied memory map");
pr_info("user-defined physical RAM map:\n");
e820__print_table("user");
}
}
-static const char *__init e820_type_to_string(struct e820_entry *entry)
+__init static const char * e820_type_to_string(struct e820_entry *entry)
{
switch (entry->type) {
case E820_TYPE_RAM: return "System RAM";
@@ -1020,7 +1038,7 @@ static const char *__init e820_type_to_string(struct e820_entry *entry)
}
}
-static unsigned long __init e820_type_to_iomem_type(struct e820_entry *entry)
+__init static unsigned long e820_type_to_iomem_type(struct e820_entry *entry)
{
switch (entry->type) {
case E820_TYPE_RAM: return IORESOURCE_SYSTEM_RAM;
@@ -1035,7 +1053,7 @@ static unsigned long __init e820_type_to_iomem_type(struct e820_entry *entry)
}
}
-static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry)
+__init static unsigned long e820_type_to_iores_desc(struct e820_entry *entry)
{
switch (entry->type) {
case E820_TYPE_ACPI: return IORES_DESC_ACPI_TABLES;
@@ -1050,40 +1068,47 @@ static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry)
}
}
-static bool __init do_mark_busy(enum e820_type type, struct resource *res)
+/*
+ * We assign one resource entry for each E820 map entry:
+ */
+__initdata static struct resource *e820_res;
+
+/*
+ * Is this a device address region that should not be marked busy?
+ * (Versus system address regions that we register & lock early.)
+ */
+__init static bool e820_device_region(enum e820_type type, struct resource *res)
{
- /* this is the legacy bios/dos rom-shadow + mmio region */
- if (res->start < (1ULL<<20))
- return true;
+ /* This is the legacy BIOS/DOS ROM-shadow + MMIO region: */
+ if (res->start < SZ_1M)
+ return false;
/*
* Treat persistent memory and other special memory ranges like
- * device memory, i.e. reserve it for exclusive use of a driver
+ * device memory, i.e. keep it available for exclusive use of a
+ * driver:
*/
switch (type) {
case E820_TYPE_RESERVED:
case E820_TYPE_SOFT_RESERVED:
case E820_TYPE_PRAM:
case E820_TYPE_PMEM:
- return false;
+ return true;
case E820_TYPE_RAM:
case E820_TYPE_ACPI:
case E820_TYPE_NVS:
case E820_TYPE_UNUSABLE:
default:
- return true;
+ return false;
}
}
/*
- * Mark E820 reserved areas as busy for the resource manager:
+ * Mark E820 system regions as busy for the resource manager:
*/
-
-static struct resource __initdata *e820_res;
-
-void __init e820__reserve_resources(void)
+__init void e820__reserve_resources(void)
{
- int i;
+ u32 idx;
struct resource *res;
u64 end;
@@ -1091,8 +1116,8 @@ void __init e820__reserve_resources(void)
SMP_CACHE_BYTES);
e820_res = res;
- for (i = 0; i < e820_table->nr_entries; i++) {
- struct e820_entry *entry = e820_table->entries + i;
+ for (idx = 0; idx < e820_table->nr_entries; idx++) {
+ struct e820_entry *entry = e820_table->entries + idx;
end = entry->addr + entry->size - 1;
if (end != (resource_size_t)end) {
@@ -1106,20 +1131,20 @@ void __init e820__reserve_resources(void)
res->desc = e820_type_to_iores_desc(entry);
/*
- * Don't register the region that could be conflicted with
- * PCI device BAR resources and insert them later in
- * pcibios_resource_survey():
+ * Skip and don't register device regions that could be conflicted
+ * with PCI device BAR resources. They get inserted later in
+ * pcibios_resource_survey() -> e820__reserve_resources_late():
*/
- if (do_mark_busy(entry->type, res)) {
+ if (!e820_device_region(entry->type, res)) {
res->flags |= IORESOURCE_BUSY;
insert_resource(&iomem_resource, res);
}
res++;
}
- /* Expose the kexec e820 table to the sysfs. */
- for (i = 0; i < e820_table_kexec->nr_entries; i++) {
- struct e820_entry *entry = e820_table_kexec->entries + i;
+ /* Expose the kexec e820 table to sysfs: */
+ for (idx = 0; idx < e820_table_kexec->nr_entries; idx++) {
+ struct e820_entry *entry = e820_table_kexec->entries + idx;
firmware_map_add_early(entry->addr, entry->addr + entry->size, e820_type_to_string(entry));
}
@@ -1128,7 +1153,7 @@ void __init e820__reserve_resources(void)
/*
* How much should we pad the end of RAM, depending on where it is?
*/
-static unsigned long __init ram_alignment(resource_size_t pos)
+__init static unsigned long ram_alignment(resource_size_t pos)
{
unsigned long mb = pos >> 20;
@@ -1146,24 +1171,36 @@ static unsigned long __init ram_alignment(resource_size_t pos)
#define MAX_RESOURCE_SIZE ((resource_size_t)-1)
-void __init e820__reserve_resources_late(void)
+__init void e820__reserve_resources_late(void)
{
- int i;
+ u32 idx;
struct resource *res;
+ /*
+ * Register device address regions listed in the E820 map,
+ * these can be claimed by device drivers later on:
+ */
res = e820_res;
- for (i = 0; i < e820_table->nr_entries; i++) {
+ for (idx = 0; idx < e820_table->nr_entries; idx++) {
if (!res->parent && res->end)
insert_resource_expand_to_fit(&iomem_resource, res);
res++;
}
/*
- * Try to bump up RAM regions to reasonable boundaries, to
- * avoid stolen RAM:
+ * Create additional 'gaps' at the end of RAM regions,
+ * rounding them up to 64k/1MB/64MB boundaries, should
+ * they be weirdly sized, and register extra, locked
+ * resource regions for them, to make sure drivers
+ * won't claim those addresses.
+ *
+ * These are basically blind guesses and heuristics to
+ * avoid resource conflicts with broken firmware that
+ * doesn't properly list 'stolen RAM' as a system region
+ * in the E820 map.
*/
- for (i = 0; i < e820_table->nr_entries; i++) {
- struct e820_entry *entry = &e820_table->entries[i];
+ for (idx = 0; idx < e820_table->nr_entries; idx++) {
+ struct e820_entry *entry = &e820_table->entries[idx];
u64 start, end;
if (entry->type != E820_TYPE_RAM)
@@ -1176,7 +1213,7 @@ void __init e820__reserve_resources_late(void)
if (start >= end)
continue;
- printk(KERN_DEBUG "e820: reserve RAM buffer [mem %#010llx-%#010llx]\n", start, end);
+ pr_info("e820: register RAM buffer resource [mem %#010llx-%#010llx]\n", start, end);
reserve_region_with_split(&iomem_resource, start, end, "RAM buffer");
}
}
@@ -1184,7 +1221,7 @@ void __init e820__reserve_resources_late(void)
/*
* Pass the firmware (bootloader) E820 map to the kernel and process it:
*/
-char *__init e820__memory_setup_default(void)
+__init char * e820__memory_setup_default(void)
{
char *who = "BIOS-e820";
@@ -1222,7 +1259,7 @@ char *__init e820__memory_setup_default(void)
* E820 map - with an optional platform quirk available for virtual platforms
* to override this method of boot environment processing:
*/
-void __init e820__memory_setup(void)
+__init void e820__memory_setup(void)
{
char *who;
@@ -1238,9 +1275,9 @@ void __init e820__memory_setup(void)
e820__print_table(who);
}
-void __init e820__memblock_setup(void)
+__init void e820__memblock_setup(void)
{
- int i;
+ u32 idx;
u64 end;
#ifdef CONFIG_MEMORY_HOTPLUG
@@ -1284,8 +1321,8 @@ void __init e820__memblock_setup(void)
*/
memblock_allow_resize();
- for (i = 0; i < e820_table->nr_entries; i++) {
- struct e820_entry *entry = &e820_table->entries[i];
+ for (idx = 0; idx < e820_table->nr_entries; idx++) {
+ struct e820_entry *entry = &e820_table->entries[idx];
end = entry->addr + entry->size;
if (end != (resource_size_t)end)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 1b2edd07a3e176..ffbd04ee0f6853 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -763,7 +763,7 @@ static void __init trim_bios_range(void)
* area (640Kb -> 1Mb) as RAM even though it is not.
* take them out.
*/
- e820__range_remove(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_TYPE_RAM, 1);
+ e820__range_remove(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_TYPE_RAM);
e820__update_table(e820_table);
}
@@ -785,7 +785,7 @@ static void __init e820_add_kernel_range(void)
return;
pr_warn(".text .data .bss are not marked as E820_TYPE_RAM!\n");
- e820__range_remove(start, size, E820_TYPE_RAM, 0);
+ e820__range_remove(start, size, 0);
e820__range_add(start, size, E820_TYPE_RAM);
}
@@ -1015,11 +1015,9 @@ void __init setup_arch(char **cmdline_p)
trim_bios_range();
#ifdef CONFIG_X86_32
if (ppro_with_ram_bug()) {
- e820__range_update(0x70000000ULL, 0x40000ULL, E820_TYPE_RAM,
- E820_TYPE_RESERVED);
+ pr_info("Applying PPro RAM bug workaround: punching 256 kB hole at 1.75 GB physical.\n");
+ e820__range_update(0x70000000ULL, SZ_256K, E820_TYPE_RAM, E820_TYPE_RESERVED);
e820__update_table(e820_table);
- printk(KERN_INFO "fixed physical RAM map:\n");
- e820__print_table("bad_ppro");
}
#else
early_gart_iommu_check();
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 463b784499a8f5..d00c6de7f3b733 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -333,8 +333,7 @@ static void __init efi_remove_e820_mmio(void)
if (size >= 256*1024) {
pr_info("Remove mem%02u: MMIO range=[0x%08llx-0x%08llx] (%lluMB) from e820 map\n",
i, start, end, size >> 20);
- e820__range_remove(start, size,
- E820_TYPE_RESERVED, 1);
+ e820__range_remove(start, size, E820_TYPE_RESERVED);
} else {
pr_info("Not removing mem%02u: MMIO range=[0x%08llx-0x%08llx] (%lluKB) from e820 map\n",
i, start, end, size >> 10);
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 1280693766b9dd..280b4ac0990ff3 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -586,6 +586,25 @@ struct ftrace_likely_data {
__scalar_type_to_expr_cases(long long), \
default: (x)))
+/*
+ * __signed_scalar_typeof(x) - Declare a signed scalar type, leaving
+ * non-scalar types unchanged.
+ */
+
+#define __scalar_type_to_signed_cases(type) \
+ unsigned type: (signed type)0, \
+ signed type: (signed type)0
+
+#define __signed_scalar_typeof(x) typeof( \
+ _Generic((x), \
+ char: (signed char)0, \
+ __scalar_type_to_signed_cases(char), \
+ __scalar_type_to_signed_cases(short), \
+ __scalar_type_to_signed_cases(int), \
+ __scalar_type_to_signed_cases(long), \
+ __scalar_type_to_signed_cases(long long), \
+ default: (x)))
+
/* Is this type a native word size -- useful for atomic operations */
#define __native_word(t) \
(sizeof(t) == sizeof(char) || sizeof(t) == sizeof(short) || \
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d395f2810facbe..bf96a7d595e226 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -586,15 +586,10 @@ struct sched_entity {
u64 sum_exec_runtime;
u64 prev_sum_exec_runtime;
u64 vruntime;
- union {
- /*
- * When !@on_rq this field is vlag.
- * When cfs_rq->curr == se (which implies @on_rq)
- * this field is vprot. See protect_slice().
- */
- s64 vlag;
- u64 vprot;
- };
+ /* Approximated virtual lag: */
+ s64 vlag;
+ /* 'Protected' deadline, to give out minimum quantums: */
+ u64 vprot;
u64 slice;
u64 nr_migrations;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 41ba0be1691174..7d0a862a8c75ca 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2090,7 +2090,6 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
*/
uclamp_rq_inc(rq, p, flags);
- rq->queue_mask |= p->sched_class->queue_mask;
p->sched_class->enqueue_task(rq, p, flags);
psi_enqueue(p, flags);
@@ -2123,7 +2122,6 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
* and mark the task ->sched_delayed.
*/
uclamp_rq_dec(rq, p);
- rq->queue_mask |= p->sched_class->queue_mask;
return p->sched_class->dequeue_task(rq, p, flags);
}
@@ -2174,10 +2172,14 @@ void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags)
{
struct task_struct *donor = rq->donor;
- if (p->sched_class == donor->sched_class)
- donor->sched_class->wakeup_preempt(rq, p, flags);
- else if (sched_class_above(p->sched_class, donor->sched_class))
+ if (p->sched_class == rq->next_class) {
+ rq->next_class->wakeup_preempt(rq, p, flags);
+
+ } else if (sched_class_above(p->sched_class, rq->next_class)) {
+ rq->next_class->wakeup_preempt(rq, p, flags);
resched_curr(rq);
+ rq->next_class = p->sched_class;
+ }
/*
* A queue event has occurred, and we're going to schedule. In
@@ -6804,6 +6806,7 @@ static void __sched notrace __schedule(int sched_mode)
pick_again:
next = pick_next_task(rq, rq->donor, &rf);
rq_set_donor(rq, next);
+ rq->next_class = next->sched_class;
if (unlikely(task_is_blocked(next))) {
next = find_proxy_task(rq, next, &rf);
if (!next)
@@ -8650,6 +8653,8 @@ void __init sched_init(void)
rq->rt.rt_runtime = global_rt_runtime();
init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
#endif
+ rq->next_class = &idle_sched_class;
+
rq->sd = NULL;
rq->rd = NULL;
rq->cpu_capacity = SCHED_CAPACITY_SCALE;
@@ -10775,13 +10780,12 @@ struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int
flags |= DEQUEUE_NOCLOCK;
}
- if (flags & DEQUEUE_CLASS) {
- if (p->sched_class->switching_from)
- p->sched_class->switching_from(rq, p);
- }
+ if ((flags & DEQUEUE_CLASS) && p->sched_class->switching_from)
+ p->sched_class->switching_from(rq, p);
*ctx = (struct sched_change_ctx){
.p = p,
+ .class = p->sched_class,
.flags = flags,
.queued = task_on_rq_queued(p),
.running = task_current_donor(rq, p),
@@ -10812,6 +10816,11 @@ void sched_change_end(struct sched_change_ctx *ctx)
lockdep_assert_rq_held(rq);
+ /*
+ * Changing class without *QUEUE_CLASS is bad.
+ */
+ WARN_ON_ONCE(p->sched_class != ctx->class && !(ctx->flags & ENQUEUE_CLASS));
+
if ((ctx->flags & ENQUEUE_CLASS) && p->sched_class->switching_to)
p->sched_class->switching_to(rq, p);
@@ -10823,6 +10832,24 @@ void sched_change_end(struct sched_change_ctx *ctx)
if (ctx->flags & ENQUEUE_CLASS) {
if (p->sched_class->switched_to)
p->sched_class->switched_to(rq, p);
+
+ /*
+ * If this was a class promotion; let the old class know it
+ * got preempted. Note that none of the switch*_from() methods
+ * know the new class and none of the switch*_to() methods
+ * know the old class.
+ */
+ if (ctx->running && sched_class_above(p->sched_class, ctx->class)) {
+ rq->next_class->wakeup_preempt(rq, p, 0);
+ rq->next_class = p->sched_class;
+ }
+
+ /*
+ * If this was a degradation in class someone should have set
+ * need_resched by now.
+ */
+ WARN_ON_ONCE(sched_class_above(ctx->class, p->sched_class) &&
+ !test_tsk_need_resched(p));
} else {
p->sched_class->prio_changed(rq, p, ctx->prio);
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 319439fe187026..80c9559a3e30ea 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2499,9 +2499,16 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
* Only called when both the current and waking task are -deadline
* tasks.
*/
-static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p,
- int flags)
+static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags)
{
+ /*
+ * Can only get preempted by stop-class, and those should be
+ * few and short lived, doesn't really make sense to push
+ * anything away for that.
+ */
+ if (p->sched_class != &dl_sched_class)
+ return;
+
if (dl_entity_preempt(&p->dl, &rq->donor->dl)) {
resched_curr(rq);
return;
@@ -3346,9 +3353,6 @@ static int task_is_throttled_dl(struct task_struct *p, int cpu)
#endif
DEFINE_SCHED_CLASS(dl) = {
-
- .queue_mask = 8,
-
.enqueue_task = enqueue_task_dl,
.dequeue_task = dequeue_task_dl,
.yield_task = yield_task_dl,
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 05f5a49e9649a9..8015ab6eb332e9 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3075,7 +3075,8 @@ static void switched_from_scx(struct rq *rq, struct task_struct *p)
scx_disable_task(p);
}
-static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {}
+static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_flags) {}
+
static void switched_to_scx(struct rq *rq, struct task_struct *p) {}
int scx_check_setscheduler(struct task_struct *p, int policy)
@@ -3336,8 +3337,6 @@ static void scx_cgroup_unlock(void) {}
* their current sched_class. Call them directly from sched core instead.
*/
DEFINE_SCHED_CLASS(ext) = {
- .queue_mask = 1,
-
.enqueue_task = enqueue_task_scx,
.dequeue_task = dequeue_task_scx,
.yield_task = yield_task_scx,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index da46c316453783..76f5e4b78b3069 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -524,10 +524,48 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
* Scheduling class tree data structure manipulation methods:
*/
+extern void __BUILD_BUG_vruntime_cmp(void);
+
+/* Use __builtin_strcmp() because of __HAVE_ARCH_STRCMP: */
+
+#define vruntime_cmp(A, CMP_STR, B) ({ \
+ int __res = 0; \
+ \
+ if (!__builtin_strcmp(CMP_STR, "<")) { \
+ __res = ((s64)((A)-(B)) < 0); \
+ } else if (!__builtin_strcmp(CMP_STR, "<=")) { \
+ __res = ((s64)((A)-(B)) <= 0); \
+ } else if (!__builtin_strcmp(CMP_STR, ">")) { \
+ __res = ((s64)((A)-(B)) > 0); \
+ } else if (!__builtin_strcmp(CMP_STR, ">=")) { \
+ __res = ((s64)((A)-(B)) >= 0); \
+ } else { \
+ /* Unknown operator throws linker error: */ \
+ __BUILD_BUG_vruntime_cmp(); \
+ } \
+ \
+ __res; \
+})
+
+extern void __BUILD_BUG_vruntime_op(void);
+
+#define vruntime_op(A, OP_STR, B) ({ \
+ s64 __res = 0; \
+ \
+ if (!__builtin_strcmp(OP_STR, "-")) { \
+ __res = (s64)((A)-(B)); \
+ } else { \
+ /* Unknown operator throws linker error: */ \
+ __BUILD_BUG_vruntime_op(); \
+ } \
+ \
+ __res; \
+})
+
+
static inline __maybe_unused u64 max_vruntime(u64 max_vruntime, u64 vruntime)
{
- s64 delta = (s64)(vruntime - max_vruntime);
- if (delta > 0)
+ if (vruntime_cmp(vruntime, ">", max_vruntime))
max_vruntime = vruntime;
return max_vruntime;
@@ -535,8 +573,7 @@ static inline __maybe_unused u64 max_vruntime(u64 max_vruntime, u64 vruntime)
static inline __maybe_unused u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
- s64 delta = (s64)(vruntime - min_vruntime);
- if (delta < 0)
+ if (vruntime_cmp(vruntime, "<", min_vruntime))
min_vruntime = vruntime;
return min_vruntime;
@@ -549,12 +586,12 @@ static inline bool entity_before(const struct sched_entity *a,
* Tiebreak on vruntime seems unnecessary since it can
* hardly happen.
*/
- return (s64)(a->deadline - b->deadline) < 0;
+ return vruntime_cmp(a->deadline, "<", b->deadline);
}
static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- return (s64)(se->vruntime - cfs_rq->zero_vruntime);
+ return vruntime_op(se->vruntime, "-", cfs_rq->zero_vruntime);
}
#define __node_2_se(node) \
@@ -607,8 +644,8 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Which we track using:
*
* v0 := cfs_rq->zero_vruntime
- * \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime
- * \Sum w_i := cfs_rq->avg_load
+ * \Sum (v_i - v0) * w_i := cfs_rq->sum_w_vruntime
+ * \Sum w_i := cfs_rq->sum_weight
*
* Since zero_vruntime closely tracks the per-task service, these
* deltas: (v_i - v), will be in the order of the maximal (virtual) lag
@@ -619,32 +656,32 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
* As measured, the max (key * weight) value was ~44 bits for a kernel build.
*/
static void
-avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
unsigned long weight = scale_load_down(se->load.weight);
s64 key = entity_key(cfs_rq, se);
- cfs_rq->avg_vruntime += key * weight;
- cfs_rq->avg_load += weight;
+ cfs_rq->sum_w_vruntime += key * weight;
+ cfs_rq->sum_weight += weight;
}
static void
-avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
+sum_w_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
unsigned long weight = scale_load_down(se->load.weight);
s64 key = entity_key(cfs_rq, se);
- cfs_rq->avg_vruntime -= key * weight;
- cfs_rq->avg_load -= weight;
+ cfs_rq->sum_w_vruntime -= key * weight;
+ cfs_rq->sum_weight -= weight;
}
static inline
-void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
+void sum_w_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
{
/*
- * v' = v + d ==> avg_vruntime' = avg_runtime - d*avg_load
+ * v' = v + d ==> sum_w_vruntime' = sum_runtime - d*sum_weight
*/
- cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta;
+ cfs_rq->sum_w_vruntime -= cfs_rq->sum_weight * delta;
}
/*
@@ -654,8 +691,8 @@ void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
u64 avg_vruntime(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
- s64 avg = cfs_rq->avg_vruntime;
- long load = cfs_rq->avg_load;
+ s64 avg = cfs_rq->sum_w_vruntime;
+ long load = cfs_rq->sum_weight;
if (curr && curr->on_rq) {
unsigned long weight = scale_load_down(curr->load.weight);
@@ -722,8 +759,8 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
{
struct sched_entity *curr = cfs_rq->curr;
- s64 avg = cfs_rq->avg_vruntime;
- long load = cfs_rq->avg_load;
+ s64 avg = cfs_rq->sum_w_vruntime;
+ long load = cfs_rq->sum_weight;
if (curr && curr->on_rq) {
unsigned long weight = scale_load_down(curr->load.weight);
@@ -732,7 +769,7 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
load += weight;
}
- return avg >= (s64)(vruntime - cfs_rq->zero_vruntime) * load;
+ return avg >= vruntime_op(vruntime, "-", cfs_rq->zero_vruntime) * load;
}
int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -743,9 +780,9 @@ int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
static void update_zero_vruntime(struct cfs_rq *cfs_rq)
{
u64 vruntime = avg_vruntime(cfs_rq);
- s64 delta = (s64)(vruntime - cfs_rq->zero_vruntime);
+ s64 delta = vruntime_op(vruntime, "-", cfs_rq->zero_vruntime);
- avg_vruntime_update(cfs_rq, delta);
+ sum_w_vruntime_update(cfs_rq, delta);
cfs_rq->zero_vruntime = vruntime;
}
@@ -770,13 +807,12 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
return entity_before(__node_2_se(a), __node_2_se(b));
}
-#define vruntime_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; })
-
static inline void __min_vruntime_update(struct sched_entity *se, struct rb_node *node)
{
if (node) {
struct sched_entity *rse = __node_2_se(node);
- if (vruntime_gt(min_vruntime, se, rse))
+
+ if (vruntime_cmp(se->min_vruntime, ">", rse->min_vruntime))
se->min_vruntime = rse->min_vruntime;
}
}
@@ -819,7 +855,7 @@ RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity,
*/
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- avg_vruntime_add(cfs_rq, se);
+ sum_w_vruntime_add(cfs_rq, se);
update_zero_vruntime(cfs_rq);
se->min_vruntime = se->vruntime;
se->min_slice = se->slice;
@@ -831,7 +867,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
&min_vruntime_cb);
- avg_vruntime_sub(cfs_rq, se);
+ sum_w_vruntime_sub(cfs_rq, se);
update_zero_vruntime(cfs_rq);
}
@@ -887,7 +923,7 @@ static inline void update_protect_slice(struct cfs_rq *cfs_rq, struct sched_enti
static inline bool protect_slice(struct sched_entity *se)
{
- return ((s64)(se->vprot - se->vruntime) > 0);
+ return vruntime_cmp(se->vruntime, "<", se->vprot);
}
static inline void cancel_protect_slice(struct sched_entity *se)
@@ -1024,7 +1060,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
*/
static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- if ((s64)(se->vruntime - se->deadline) < 0)
+ if (vruntime_cmp(se->vruntime, "<", se->deadline))
return false;
/*
@@ -1513,7 +1549,7 @@ static unsigned int task_scan_start(struct task_struct *p)
/* Scale the maximum scan period with the amount of shared memory. */
rcu_read_lock();
- ng = rcu_dereference(p->numa_group);
+ ng = rcu_dereference_all(p->numa_group);
if (ng) {
unsigned long shared = group_faults_shared(ng);
unsigned long private = group_faults_priv(ng);
@@ -1580,7 +1616,7 @@ pid_t task_numa_group_id(struct task_struct *p)
pid_t gid = 0;
rcu_read_lock();
- ng = rcu_dereference(p->numa_group);
+ ng = rcu_dereference_all(p->numa_group);
if (ng)
gid = ng->gid;
rcu_read_unlock();
@@ -2239,7 +2275,7 @@ static bool task_numa_compare(struct task_numa_env *env,
return false;
rcu_read_lock();
- cur = rcu_dereference(dst_rq->curr);
+ cur = rcu_dereference_all(dst_rq->curr);
if (cur && ((cur->flags & (PF_EXITING | PF_KTHREAD)) ||
!cur->mm))
cur = NULL;
@@ -2284,7 +2320,7 @@ static bool task_numa_compare(struct task_numa_env *env,
* If dst and source tasks are in the same NUMA group, or not
* in any group then look only at task weights.
*/
- cur_ng = rcu_dereference(cur->numa_group);
+ cur_ng = rcu_dereference_all(cur->numa_group);
if (cur_ng == p_ng) {
/*
* Do not swap within a group or between tasks that have
@@ -2499,7 +2535,7 @@ static int task_numa_migrate(struct task_struct *p)
* to satisfy here.
*/
rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
+ sd = rcu_dereference_all(per_cpu(sd_numa, env.src_cpu));
if (sd) {
env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
env.imb_numa_nr = sd->imb_numa_nr;
@@ -3022,7 +3058,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
if (!cpupid_match_pid(tsk, cpupid))
goto no_join;
- grp = rcu_dereference(tsk->numa_group);
+ grp = rcu_dereference_all(tsk->numa_group);
if (!grp)
goto no_join;
@@ -3693,7 +3729,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
*/
#define add_positive(_ptr, _val) do { \
typeof(_ptr) ptr = (_ptr); \
- typeof(_val) val = (_val); \
+ __signed_scalar_typeof(*ptr) val = (_val); \
typeof(*ptr) res, var = READ_ONCE(*ptr); \
\
res = var + val; \
@@ -3705,23 +3741,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
} while (0)
/*
- * Unsigned subtract and clamp on underflow.
- *
- * Explicitly do a load-store to ensure the intermediate value never hits
- * memory. This allows lockless observations without ever seeing the negative
- * values.
- */
-#define sub_positive(_ptr, _val) do { \
- typeof(_ptr) ptr = (_ptr); \
- typeof(*ptr) val = (_val); \
- typeof(*ptr) res, var = READ_ONCE(*ptr); \
- res = var - val; \
- if (res > var) \
- res = 0; \
- WRITE_ONCE(*ptr, res); \
-} while (0)
-
-/*
* Remove and clamp on negative, from a local variable.
*
* A variant of sub_positive(), which does not use explicit load-store
@@ -3732,21 +3751,37 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
*ptr -= min_t(typeof(*ptr), *ptr, _val); \
} while (0)
+
+/*
+ * Because of rounding, se->util_sum might ends up being +1 more than
+ * cfs->util_sum. Although this is not a problem by itself, detaching
+ * a lot of tasks with the rounding problem between 2 updates of
+ * util_avg (~1ms) can make cfs->util_sum becoming null whereas
+ * cfs_util_avg is not.
+ *
+ * Check that util_sum is still above its lower bound for the new
+ * util_avg. Given that period_contrib might have moved since the last
+ * sync, we are only sure that util_sum must be above or equal to
+ * util_avg * minimum possible divider
+ */
+#define __update_sa(sa, name, delta_avg, delta_sum) do { \
+ add_positive(&(sa)->name##_avg, delta_avg); \
+ add_positive(&(sa)->name##_sum, delta_sum); \
+ (sa)->name##_sum = max_t(typeof((sa)->name##_sum), \
+ (sa)->name##_sum, \
+ (sa)->name##_avg * PELT_MIN_DIVIDER); \
+} while (0)
+
static inline void
enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- cfs_rq->avg.load_avg += se->avg.load_avg;
- cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
+ __update_sa(&cfs_rq->avg, load, se->avg.load_avg, se->avg.load_sum);
}
static inline void
dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
- sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
- /* See update_cfs_rq_load_avg() */
- cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
- cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
+ __update_sa(&cfs_rq->avg, load, -se->avg.load_avg, -se->avg.load_sum);
}
static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags);
@@ -4242,7 +4277,6 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
*/
divider = get_pelt_divider(&cfs_rq->avg);
-
/* Set new sched_entity's utilization */
se->avg.util_avg = gcfs_rq->avg.util_avg;
new_sum = se->avg.util_avg * divider;
@@ -4250,12 +4284,7 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
se->avg.util_sum = new_sum;
/* Update parent cfs_rq utilization */
- add_positive(&cfs_rq->avg.util_avg, delta_avg);
- add_positive(&cfs_rq->avg.util_sum, delta_sum);
-
- /* See update_cfs_rq_load_avg() */
- cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
- cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
+ __update_sa(&cfs_rq->avg, util, delta_avg, delta_sum);
}
static inline void
@@ -4281,11 +4310,7 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf
se->avg.runnable_sum = new_sum;
/* Update parent cfs_rq runnable */
- add_positive(&cfs_rq->avg.runnable_avg, delta_avg);
- add_positive(&cfs_rq->avg.runnable_sum, delta_sum);
- /* See update_cfs_rq_load_avg() */
- cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
- cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
+ __update_sa(&cfs_rq->avg, runnable, delta_avg, delta_sum);
}
static inline void
@@ -4349,11 +4374,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
se->avg.load_sum = runnable_sum;
se->avg.load_avg = load_avg;
- add_positive(&cfs_rq->avg.load_avg, delta_avg);
- add_positive(&cfs_rq->avg.load_sum, delta_sum);
- /* See update_cfs_rq_load_avg() */
- cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
- cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
+ __update_sa(&cfs_rq->avg, load, delta_avg, delta_sum);
}
static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
@@ -4450,7 +4471,7 @@ static inline void migrate_se_pelt_lag(struct sched_entity *se)
rq = rq_of(cfs_rq);
rcu_read_lock();
- is_idle = is_idle_task(rcu_dereference(rq->curr));
+ is_idle = is_idle_task(rcu_dereference_all(rq->curr));
rcu_read_unlock();
/*
@@ -4552,33 +4573,13 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
raw_spin_unlock(&cfs_rq->removed.lock);
r = removed_load;
- sub_positive(&sa->load_avg, r);
- sub_positive(&sa->load_sum, r * divider);
- /* See sa->util_sum below */
- sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER);
+ __update_sa(sa, load, -r, -r*divider);
r = removed_util;
- sub_positive(&sa->util_avg, r);
- sub_positive(&sa->util_sum, r * divider);
- /*
- * Because of rounding, se->util_sum might ends up being +1 more than
- * cfs->util_sum. Although this is not a problem by itself, detaching
- * a lot of tasks with the rounding problem between 2 updates of
- * util_avg (~1ms) can make cfs->util_sum becoming null whereas
- * cfs_util_avg is not.
- * Check that util_sum is still above its lower bound for the new
- * util_avg. Given that period_contrib might have moved since the last
- * sync, we are only sure that util_sum must be above or equal to
- * util_avg * minimum possible divider
- */
- sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER);
+ __update_sa(sa, util, -r, -r*divider);
r = removed_runnable;
- sub_positive(&sa->runnable_avg, r);
- sub_positive(&sa->runnable_sum, r * divider);
- /* See sa->util_sum above */
- sa->runnable_sum = max_t(u32, sa->runnable_sum,
- sa->runnable_avg * PELT_MIN_DIVIDER);
+ __update_sa(sa, runnable, -r, -r*divider);
/*
* removed_runnable is the unweighted version of removed_load so we
@@ -4663,17 +4664,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
dequeue_load_avg(cfs_rq, se);
- sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
- sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
- /* See update_cfs_rq_load_avg() */
- cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
- cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
-
- sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
- sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum);
- /* See update_cfs_rq_load_avg() */
- cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
- cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
+ __update_sa(&cfs_rq->avg, util, -se->avg.util_avg, -se->avg.util_sum);
+ __update_sa(&cfs_rq->avg, runnable, -se->avg.runnable_avg, -se->avg.runnable_sum);
add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
@@ -5175,7 +5167,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
*
* vl_i = (W + w_i)*vl'_i / W
*/
- load = cfs_rq->avg_load;
+ load = cfs_rq->sum_weight;
if (curr && curr->on_rq)
load += scale_load_down(curr->load.weight);
@@ -7148,7 +7140,7 @@ static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
static struct {
cpumask_var_t idle_cpus_mask;
atomic_t nr_cpus;
- int has_blocked; /* Idle CPUS has blocked load */
+ int has_blocked_load; /* Idle CPUS has blocked load */
int needs_update; /* Newly idle CPUs need their next_balance collated */
unsigned long next_balance; /* in jiffy units */
unsigned long next_blocked; /* Next update of blocked load in jiffies */
@@ -7506,7 +7498,7 @@ static inline void set_idle_cores(int cpu, int val)
{
struct sched_domain_shared *sds;
- sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu));
if (sds)
WRITE_ONCE(sds->has_idle_cores, val);
}
@@ -7515,7 +7507,7 @@ static inline bool test_idle_cores(int cpu)
{
struct sched_domain_shared *sds;
- sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu));
if (sds)
return READ_ONCE(sds->has_idle_cores);
@@ -7644,7 +7636,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
if (sched_feat(SIS_UTIL)) {
- sd_share = rcu_dereference(per_cpu(sd_llc_shared, target));
+ sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, target));
if (sd_share) {
/* because !--nr is the condition to stop scan */
nr = READ_ONCE(sd_share->nr_idle_scan) + 1;
@@ -7850,7 +7842,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
* sd_asym_cpucapacity rather than sd_llc.
*/
if (sched_asym_cpucap_active()) {
- sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
+ sd = rcu_dereference_all(per_cpu(sd_asym_cpucapacity, target));
/*
* On an asymmetric CPU capacity system where an exclusive
* cpuset defines a symmetric island (i.e. one unique
@@ -7865,7 +7857,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
}
}
- sd = rcu_dereference(per_cpu(sd_llc, target));
+ sd = rcu_dereference_all(per_cpu(sd_llc, target));
if (!sd)
return target;
@@ -8334,7 +8326,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
struct energy_env eenv;
rcu_read_lock();
- pd = rcu_dereference(rd->pd);
+ pd = rcu_dereference_all(rd->pd);
if (!pd)
goto unlock;
@@ -8342,7 +8334,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
* Energy-aware wake-up happens on the lowest sched_domain starting
* from sd_asym_cpucapacity spanning over this_cpu and prev_cpu.
*/
- sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity));
+ sd = rcu_dereference_all(*this_cpu_ptr(&sd_asym_cpucapacity));
while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
sd = sd->parent;
if (!sd)
@@ -8744,7 +8736,7 @@ preempt_sync(struct rq *rq, int wake_flags,
/*
* Preempt the current task with a newly woken task if needed:
*/
-static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags)
+static void wakeup_preempt_fair(struct rq *rq, struct task_struct *p, int wake_flags)
{
enum preempt_wakeup_action preempt_action = PREEMPT_WAKEUP_PICK;
struct task_struct *donor = rq->donor;
@@ -8752,6 +8744,12 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
struct cfs_rq *cfs_rq = task_cfs_rq(donor);
int cse_is_idle, pse_is_idle;
+ /*
+ * XXX Getting preempted by higher class, try and find idle CPU?
+ */
+ if (p->sched_class != &fair_sched_class)
+ return;
+
if (unlikely(se == pse))
return;
@@ -9349,7 +9347,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
*/
static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
{
- struct numa_group *numa_group = rcu_dereference(p->numa_group);
+ struct numa_group *numa_group = rcu_dereference_all(p->numa_group);
unsigned long src_weight, dst_weight;
int src_nid, dst_nid, dist;
@@ -9778,7 +9776,7 @@ static void attach_tasks(struct lb_env *env)
}
#ifdef CONFIG_NO_HZ_COMMON
-static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
+static inline bool cfs_rq_has_blocked_load(struct cfs_rq *cfs_rq)
{
if (cfs_rq->avg.load_avg)
return true;
@@ -9811,16 +9809,16 @@ static inline void update_blocked_load_tick(struct rq *rq)
WRITE_ONCE(rq->last_blocked_load_update_tick, jiffies);
}
-static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
+static inline void update_has_blocked_load_status(struct rq *rq, bool has_blocked_load)
{
- if (!has_blocked)
+ if (!has_blocked_load)
rq->has_blocked_load = 0;
}
#else /* !CONFIG_NO_HZ_COMMON: */
-static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
+static inline bool cfs_rq_has_blocked_load(struct cfs_rq *cfs_rq) { return false; }
static inline bool others_have_blocked(struct rq *rq) { return false; }
static inline void update_blocked_load_tick(struct rq *rq) {}
-static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
+static inline void update_has_blocked_load_status(struct rq *rq, bool has_blocked_load) {}
#endif /* !CONFIG_NO_HZ_COMMON */
static bool __update_blocked_others(struct rq *rq, bool *done)
@@ -9877,7 +9875,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done)
list_del_leaf_cfs_rq(cfs_rq);
/* Don't need periodic decay once load/util_avg are null */
- if (cfs_rq_has_blocked(cfs_rq))
+ if (cfs_rq_has_blocked_load(cfs_rq))
*done = false;
}
@@ -9937,7 +9935,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done)
bool decayed;
decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
- if (cfs_rq_has_blocked(cfs_rq))
+ if (cfs_rq_has_blocked_load(cfs_rq))
*done = false;
return decayed;
@@ -9949,23 +9947,27 @@ static unsigned long task_h_load(struct task_struct *p)
}
#endif /* !CONFIG_FAIR_GROUP_SCHED */
-static void sched_balance_update_blocked_averages(int cpu)
+static void __sched_balance_update_blocked_averages(struct rq *rq)
{
bool decayed = false, done = true;
- struct rq *rq = cpu_rq(cpu);
- struct rq_flags rf;
- rq_lock_irqsave(rq, &rf);
update_blocked_load_tick(rq);
- update_rq_clock(rq);
decayed |= __update_blocked_others(rq, &done);
decayed |= __update_blocked_fair(rq, &done);
- update_blocked_load_status(rq, !done);
+ update_has_blocked_load_status(rq, !done);
if (decayed)
cpufreq_update_util(rq, 0);
- rq_unlock_irqrestore(rq, &rf);
+}
+
+static void sched_balance_update_blocked_averages(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ guard(rq_lock_irqsave)(rq);
+ update_rq_clock(rq);
+ __sched_balance_update_blocked_averages(rq);
}
/********** Helpers for sched_balance_find_src_group ************************/
@@ -11025,7 +11027,7 @@ static void update_idle_cpu_scan(struct lb_env *env,
if (env->sd->span_weight != llc_weight)
return;
- sd_share = rcu_dereference(per_cpu(sd_llc_shared, env->dst_cpu));
+ sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, env->dst_cpu));
if (!sd_share)
return;
@@ -11375,7 +11377,7 @@ static struct sched_group *sched_balance_find_src_group(struct lb_env *env)
goto force_balance;
if (!is_rd_overutilized(env->dst_rq->rd) &&
- rcu_dereference(env->dst_rq->rd->pd))
+ rcu_dereference_all(env->dst_rq->rd->pd))
goto out_balanced;
/* ASYM feature bypasses nice load balance check */
@@ -12450,7 +12452,7 @@ static void nohz_balancer_kick(struct rq *rq)
if (likely(!atomic_read(&nohz.nr_cpus)))
return;
- if (READ_ONCE(nohz.has_blocked) &&
+ if (READ_ONCE(nohz.has_blocked_load) &&
time_after(now, READ_ONCE(nohz.next_blocked)))
flags = NOHZ_STATS_KICK;
@@ -12464,7 +12466,7 @@ static void nohz_balancer_kick(struct rq *rq)
rcu_read_lock();
- sd = rcu_dereference(rq->sd);
+ sd = rcu_dereference_all(rq->sd);
if (sd) {
/*
* If there's a runnable CFS task and the current CPU has reduced
@@ -12476,7 +12478,7 @@ static void nohz_balancer_kick(struct rq *rq)
}
}
- sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
+ sd = rcu_dereference_all(per_cpu(sd_asym_packing, cpu));
if (sd) {
/*
* When ASYM_PACKING; see if there's a more preferred CPU
@@ -12494,7 +12496,7 @@ static void nohz_balancer_kick(struct rq *rq)
}
}
- sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
+ sd = rcu_dereference_all(per_cpu(sd_asym_cpucapacity, cpu));
if (sd) {
/*
* When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
@@ -12515,7 +12517,7 @@ static void nohz_balancer_kick(struct rq *rq)
goto unlock;
}
- sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu));
if (sds) {
/*
* If there is an imbalance between LLC domains (IOW we could
@@ -12547,7 +12549,7 @@ static void set_cpu_sd_state_busy(int cpu)
struct sched_domain *sd;
rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_llc, cpu));
+ sd = rcu_dereference_all(per_cpu(sd_llc, cpu));
if (!sd || !sd->nohz_idle)
goto unlock;
@@ -12577,7 +12579,7 @@ static void set_cpu_sd_state_idle(int cpu)
struct sched_domain *sd;
rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_llc, cpu));
+ sd = rcu_dereference_all(per_cpu(sd_llc, cpu));
if (!sd || sd->nohz_idle)
goto unlock;
@@ -12611,9 +12613,9 @@ void nohz_balance_enter_idle(int cpu)
/*
* The tick is still stopped but load could have been added in the
- * meantime. We set the nohz.has_blocked flag to trig a check of the
+ * meantime. We set the nohz.has_blocked_load flag to trig a check of the
* *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear
- * of nohz.has_blocked can only happen after checking the new load
+ * of nohz.has_blocked_load can only happen after checking the new load
*/
if (rq->nohz_tick_stopped)
goto out;
@@ -12629,7 +12631,7 @@ void nohz_balance_enter_idle(int cpu)
/*
* Ensures that if nohz_idle_balance() fails to observe our
- * @idle_cpus_mask store, it must observe the @has_blocked
+ * @idle_cpus_mask store, it must observe the @has_blocked_load
* and @needs_update stores.
*/
smp_mb__after_atomic();
@@ -12642,7 +12644,7 @@ out:
* Each time a cpu enter idle, we assume that it has blocked load and
* enable the periodic update of the load of idle CPUs
*/
- WRITE_ONCE(nohz.has_blocked, 1);
+ WRITE_ONCE(nohz.has_blocked_load, 1);
}
static bool update_nohz_stats(struct rq *rq)
@@ -12683,8 +12685,8 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags)
/*
* We assume there will be no idle load after this update and clear
- * the has_blocked flag. If a cpu enters idle in the mean time, it will
- * set the has_blocked flag and trigger another update of idle load.
+ * the has_blocked_load flag. If a cpu enters idle in the mean time, it will
+ * set the has_blocked_load flag and trigger another update of idle load.
* Because a cpu that becomes idle, is added to idle_cpus_mask before
* setting the flag, we are sure to not clear the state and not
* check the load of an idle cpu.
@@ -12692,12 +12694,12 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags)
* Same applies to idle_cpus_mask vs needs_update.
*/
if (flags & NOHZ_STATS_KICK)
- WRITE_ONCE(nohz.has_blocked, 0);
+ WRITE_ONCE(nohz.has_blocked_load, 0);
if (flags & NOHZ_NEXT_KICK)
WRITE_ONCE(nohz.needs_update, 0);
/*
- * Ensures that if we miss the CPU, we must see the has_blocked
+ * Ensures that if we miss the CPU, we must see the has_blocked_load
* store from nohz_balance_enter_idle().
*/
smp_mb();
@@ -12764,7 +12766,7 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags)
abort:
/* There is still blocked load, enable periodic update */
if (has_blocked_load)
- WRITE_ONCE(nohz.has_blocked, 1);
+ WRITE_ONCE(nohz.has_blocked_load, 1);
}
/*
@@ -12826,7 +12828,7 @@ static void nohz_newidle_balance(struct rq *this_rq)
return;
/* Don't need to update blocked load of idle CPUs*/
- if (!READ_ONCE(nohz.has_blocked) ||
+ if (!READ_ONCE(nohz.has_blocked_load) ||
time_before(jiffies, READ_ONCE(nohz.next_blocked)))
return;
@@ -12896,29 +12898,28 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
*/
rq_unpin_lock(this_rq, rf);
- rcu_read_lock();
- sd = rcu_dereference_check_sched_domain(this_rq->sd);
- if (!sd) {
- rcu_read_unlock();
+ sd = rcu_dereference_sched_domain(this_rq->sd);
+ if (!sd)
goto out;
- }
if (!get_rd_overloaded(this_rq->rd) ||
this_rq->avg_idle < sd->max_newidle_lb_cost) {
update_next_balance(sd, &next_balance);
- rcu_read_unlock();
goto out;
}
- rcu_read_unlock();
-
- rq_modified_clear(this_rq);
- raw_spin_rq_unlock(this_rq);
+ /*
+ * Include sched_balance_update_blocked_averages() in the cost
+ * calculation because it can be quite costly -- this ensures we skip
+ * it when avg_idle gets to be very low.
+ */
t0 = sched_clock_cpu(this_cpu);
- sched_balance_update_blocked_averages(this_cpu);
+ __sched_balance_update_blocked_averages(this_rq);
+
+ this_rq->next_class = &fair_sched_class;
+ raw_spin_rq_unlock(this_rq);
- rcu_read_lock();
for_each_domain(this_cpu, sd) {
u64 domain_cost;
@@ -12968,7 +12969,6 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
if (pulled_task || !continue_balancing)
break;
}
- rcu_read_unlock();
raw_spin_rq_lock(this_rq);
@@ -12984,7 +12984,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
pulled_task = 1;
/* If a higher prio class was modified, restart the pick */
- if (rq_modified_above(this_rq, &fair_sched_class))
+ if (sched_class_above(this_rq->next_class, &fair_sched_class))
pulled_task = -1;
out:
@@ -13335,8 +13335,8 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
* zero_vruntime_fi, which would have been updated in prior calls
* to se_fi_update().
*/
- delta = (s64)(sea->vruntime - seb->vruntime) +
- (s64)(cfs_rqb->zero_vruntime_fi - cfs_rqa->zero_vruntime_fi);
+ delta = vruntime_op(sea->vruntime, "-", seb->vruntime) +
+ vruntime_op(cfs_rqb->zero_vruntime_fi, "-", cfs_rqa->zero_vruntime_fi);
return delta > 0;
}
@@ -13374,6 +13374,12 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
entity_tick(cfs_rq, se, queued);
}
+ if (queued) {
+ if (!need_resched())
+ hrtick_start_fair(rq, curr);
+ return;
+ }
+
if (static_branch_unlikely(&sched_numa_balancing))
task_tick_numa(rq, curr);
@@ -13882,15 +13888,12 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
* All the scheduling class methods:
*/
DEFINE_SCHED_CLASS(fair) = {
-
- .queue_mask = 2,
-
.enqueue_task = enqueue_task_fair,
.dequeue_task = dequeue_task_fair,
.yield_task = yield_task_fair,
.yield_to_task = yield_to_task_fair,
- .wakeup_preempt = check_preempt_wakeup_fair,
+ .wakeup_preempt = wakeup_preempt_fair,
.pick_task = pick_task_fair,
.pick_next_task = pick_next_task_fair,
@@ -13950,7 +13953,7 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m)
struct numa_group *ng;
rcu_read_lock();
- ng = rcu_dereference(p->numa_group);
+ ng = rcu_dereference_all(p->numa_group);
for_each_online_node(node) {
if (p->numa_faults) {
tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index c174afe1dd177a..65eb8f8c1a5d3a 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -536,9 +536,6 @@ static void update_curr_idle(struct rq *rq)
* Simple, special scheduling class for the per-CPU idle tasks:
*/
DEFINE_SCHED_CLASS(idle) = {
-
- .queue_mask = 0,
-
/* no enqueue/yield_task for idle tasks */
/* dequeue is not valid, we print a debug message there: */
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index f1867fe8e5c535..0a9b2cd6da7208 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1615,6 +1615,12 @@ static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags)
{
struct task_struct *donor = rq->donor;
+ /*
+ * XXX If we're preempted by DL, queue a push?
+ */
+ if (p->sched_class != &rt_sched_class)
+ return;
+
if (p->prio < donor->prio) {
resched_curr(rq);
return;
@@ -2568,9 +2574,6 @@ static int task_is_throttled_rt(struct task_struct *p, int cpu)
#endif /* CONFIG_SCHED_CORE */
DEFINE_SCHED_CLASS(rt) = {
-
- .queue_mask = 4,
-
.enqueue_task = enqueue_task_rt,
.dequeue_task = dequeue_task_rt,
.yield_task = yield_task_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d30cca6870f5f9..bdb1e748fe5c97 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -670,16 +670,16 @@ struct balance_callback {
void (*func)(struct rq *rq);
};
-/* CFS-related fields in a runqueue */
+/* Fair scheduling SCHED_{NORMAL,BATCH,IDLE} related fields in a runqueue: */
struct cfs_rq {
struct load_weight load;
unsigned int nr_queued;
- unsigned int h_nr_queued; /* SCHED_{NORMAL,BATCH,IDLE} */
- unsigned int h_nr_runnable; /* SCHED_{NORMAL,BATCH,IDLE} */
- unsigned int h_nr_idle; /* SCHED_IDLE */
+ unsigned int h_nr_queued; /* SCHED_{NORMAL,BATCH,IDLE} */
+ unsigned int h_nr_runnable; /* SCHED_{NORMAL,BATCH,IDLE} */
+ unsigned int h_nr_idle; /* SCHED_IDLE */
- s64 avg_vruntime;
- u64 avg_load;
+ s64 sum_w_vruntime;
+ u64 sum_weight;
u64 zero_vruntime;
#ifdef CONFIG_SCHED_CORE
@@ -690,7 +690,7 @@ struct cfs_rq {
struct rb_root_cached tasks_timeline;
/*
- * 'curr' points to currently running entity on this cfs_rq.
+ * 'curr' points to the currently running entity on this cfs_rq.
* It is set to NULL otherwise (i.e when none are currently running).
*/
struct sched_entity *curr;
@@ -726,9 +726,7 @@ struct cfs_rq {
unsigned long h_load;
u64 last_h_load_update;
struct sched_entity *h_load_next;
-#endif /* CONFIG_FAIR_GROUP_SCHED */
-#ifdef CONFIG_FAIR_GROUP_SCHED
struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */
/*
@@ -741,19 +739,19 @@ struct cfs_rq {
*/
int on_list;
struct list_head leaf_cfs_rq_list;
- struct task_group *tg; /* group that "owns" this runqueue */
+ struct task_group *tg; /* Group that "owns" this runqueue */
/* Locally cached copy of our task_group's idle value */
int idle;
-#ifdef CONFIG_CFS_BANDWIDTH
+# ifdef CONFIG_CFS_BANDWIDTH
int runtime_enabled;
s64 runtime_remaining;
u64 throttled_pelt_idle;
-#ifndef CONFIG_64BIT
+# ifndef CONFIG_64BIT
u64 throttled_pelt_idle_copy;
-#endif
+# endif
u64 throttled_clock;
u64 throttled_clock_pelt;
u64 throttled_clock_pelt_time;
@@ -765,7 +763,7 @@ struct cfs_rq {
struct list_head throttled_list;
struct list_head throttled_csd_list;
struct list_head throttled_limbo_list;
-#endif /* CONFIG_CFS_BANDWIDTH */
+# endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};
@@ -1121,7 +1119,6 @@ struct rq {
raw_spinlock_t __lock;
/* Per class runqueue modification mask; bits in class order. */
- unsigned int queue_mask;
unsigned int nr_running;
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
@@ -1181,6 +1178,7 @@ struct rq {
struct sched_dl_entity *dl_server;
struct task_struct *idle;
struct task_struct *stop;
+ const struct sched_class *next_class;
unsigned long next_balance;
struct mm_struct *prev_mm;
@@ -2010,8 +2008,8 @@ queue_balance_callback(struct rq *rq,
rq->balance_callback = head;
}
-#define rcu_dereference_check_sched_domain(p) \
- rcu_dereference_check((p), lockdep_is_held(&sched_domains_mutex))
+#define rcu_dereference_sched_domain(p) \
+ rcu_dereference_all_check((p), lockdep_is_held(&sched_domains_mutex))
/*
* The domain tree (rq->sd) is protected by RCU's quiescent state transition.
@@ -2021,7 +2019,7 @@ queue_balance_callback(struct rq *rq,
* preempt-disabled sections.
*/
#define for_each_domain(cpu, __sd) \
- for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
+ for (__sd = rcu_dereference_sched_domain(cpu_rq(cpu)->sd); \
__sd; __sd = __sd->parent)
/* A mask of all the SD flags that have the SDF_SHARED_CHILD metaflag */
@@ -2428,15 +2426,6 @@ struct sched_class {
#ifdef CONFIG_UCLAMP_TASK
int uclamp_enabled;
#endif
- /*
- * idle: 0
- * ext: 1
- * fair: 2
- * rt: 4
- * dl: 8
- * stop: 16
- */
- unsigned int queue_mask;
/*
* move_queued_task/activate_task/enqueue_task: rq->lock
@@ -2595,20 +2584,6 @@ struct sched_class {
#endif
};
-/*
- * Does not nest; only used around sched_class::pick_task() rq-lock-breaks.
- */
-static inline void rq_modified_clear(struct rq *rq)
-{
- rq->queue_mask = 0;
-}
-
-static inline bool rq_modified_above(struct rq *rq, const struct sched_class * class)
-{
- unsigned int mask = class->queue_mask;
- return rq->queue_mask & ~((mask << 1) - 1);
-}
-
static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
{
WARN_ON_ONCE(rq->donor != prev);
@@ -3901,6 +3876,7 @@ void move_queued_task_locked(struct rq *src_rq, struct rq *dst_rq, struct task_s
deactivate_task(src_rq, task, 0);
set_task_cpu(task, dst_rq->cpu);
activate_task(dst_rq, task, 0);
+ wakeup_preempt(dst_rq, task, 0);
}
static inline
@@ -3968,6 +3944,7 @@ extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
struct sched_change_ctx {
u64 prio;
struct task_struct *p;
+ const struct sched_class *class;
int flags;
bool queued;
bool running;
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 4f9192be4b5b0a..f95798baddebbd 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -97,9 +97,6 @@ static void update_curr_stop(struct rq *rq)
* Simple, special scheduling class for the per-CPU stop tasks:
*/
DEFINE_SCHED_CLASS(stop) = {
-
- .queue_mask = 16,
-
.enqueue_task = enqueue_task_stop,
.dequeue_task = dequeue_task_stop,
.yield_task = yield_task_stop,