 arch/x86/include/asm/e820/api.h    |   3
 arch/x86/include/asm/e820/types.h  |   2
 arch/x86/include/asm/uv/bios.h     |   2
 arch/x86/kernel/acpi/boot.c        |  12
 arch/x86/kernel/cpu/topology.c     |  15
 arch/x86/kernel/e820.c             | 479
 arch/x86/kernel/setup.c            |  10
 arch/x86/platform/efi/efi.c        |   3
 include/linux/compiler_types.h     |  19
 include/linux/sched.h              |  13
 kernel/sched/core.c                |  45
 kernel/sched/deadline.c            |  14
 kernel/sched/ext.c                 |   5
 kernel/sched/fair.c                | 361
 kernel/sched/idle.c                |   3
 kernel/sched/rt.c                  |   9
 kernel/sched/sched.h               |  59
 kernel/sched/stop_task.c           |   3
 18 files changed, 550 insertions(+), 507 deletions(-)
diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h index c83645d5b2a86d..bbe0c8de976c0b 100644 --- a/arch/x86/include/asm/e820/api.h +++ b/arch/x86/include/asm/e820/api.h @@ -16,10 +16,9 @@ extern bool e820__mapped_all(u64 start, u64 end, enum e820_type type); extern void e820__range_add (u64 start, u64 size, enum e820_type type); extern u64 e820__range_update(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type); -extern u64 e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type); +extern void e820__range_remove(u64 start, u64 size, enum e820_type filter_type); extern u64 e820__range_update_table(struct e820_table *t, u64 start, u64 size, enum e820_type old_type, enum e820_type new_type); -extern void e820__print_table(char *who); extern int e820__update_table(struct e820_table *table); extern void e820__update_table_print(void); diff --git a/arch/x86/include/asm/e820/types.h b/arch/x86/include/asm/e820/types.h index 80c4a7266629c9..df12f7ee75d369 100644 --- a/arch/x86/include/asm/e820/types.h +++ b/arch/x86/include/asm/e820/types.h @@ -83,7 +83,7 @@ struct e820_entry { * The whole array of E820 entries: */ struct e820_table { - __u32 nr_entries; + u32 nr_entries; struct e820_entry entries[E820_MAX_ENTRIES]; }; diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h index 6989b824fd321b..d0b62e2552902c 100644 --- a/arch/x86/include/asm/uv/bios.h +++ b/arch/x86/include/asm/uv/bios.h @@ -122,7 +122,7 @@ struct uv_systab { struct { u32 type:8; /* type of entry */ u32 offset:24; /* byte offset from struct start to entry */ - } entry[1]; /* additional entries follow */ + } entry[]; /* additional entries follow */ }; extern struct uv_systab *uv_systab; diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 9fa321a95eb33f..d6138b2b633a31 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -35,6 +35,7 @@ #include <asm/smp.h> #include <asm/i8259.h> #include <asm/setup.h> +#include <asm/hypervisor.h> #include "sleep.h" /* To include x86_acpi_suspend_lowlevel */ static int __initdata acpi_force = 0; @@ -164,11 +165,14 @@ static bool __init acpi_is_processor_usable(u32 lapic_flags) if (lapic_flags & ACPI_MADT_ENABLED) return true; - if (!acpi_support_online_capable || - (lapic_flags & ACPI_MADT_ONLINE_CAPABLE)) - return true; + if (acpi_support_online_capable) + return lapic_flags & ACPI_MADT_ONLINE_CAPABLE; - return false; + /* + * QEMU expects legacy "Enabled=0" LAPIC entries to be counted as usable + * in order to support CPU hotplug in guests. + */ + return !hypervisor_is_type(X86_HYPER_NATIVE); } static int __init diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index f55ea3cdbf88ef..23190a786d3104 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -27,7 +27,6 @@ #include <xen/xen.h> #include <asm/apic.h> -#include <asm/hypervisor.h> #include <asm/io_apic.h> #include <asm/mpspec.h> #include <asm/msr.h> @@ -236,20 +235,6 @@ static __init void topo_register_apic(u32 apic_id, u32 acpi_id, bool present) cpuid_to_apicid[cpu] = apic_id; topo_set_cpuids(cpu, apic_id, acpi_id); } else { - u32 pkgid = topo_apicid(apic_id, TOPO_PKG_DOMAIN); - - /* - * Check for present APICs in the same package when running - * on bare metal. Allow the bogosity in a guest. 
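For reference, the usability decision implemented by the reworked acpi_is_processor_usable() above can be modelled as a small stand-alone predicate. This is an illustrative sketch only: the flag bit positions and the hypervisor check are simplified stand-ins, not the kernel definitions.

#include <stdbool.h>

#define MADT_ENABLED            (1u << 0)       /* assumed bit positions, illustration only */
#define MADT_ONLINE_CAPABLE     (1u << 1)

/*
 * Enabled=1                           -> usable
 * Enabled=0, FW knows Online-Capable  -> usable iff Online-Capable is set
 * Enabled=0, legacy firmware          -> usable only in a guest (QEMU CPU-hotplug quirk)
 */
bool lapic_usable(unsigned int lapic_flags, bool fw_has_online_capable, bool running_as_guest)
{
        if (lapic_flags & MADT_ENABLED)
                return true;

        if (fw_has_online_capable)
                return lapic_flags & MADT_ONLINE_CAPABLE;

        return running_as_guest;
}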
- */ - if (hypervisor_is_type(X86_HYPER_NATIVE) && - topo_unit_count(pkgid, TOPO_PKG_DOMAIN, phys_cpu_present_map)) { - pr_info_once("Ignoring hot-pluggable APIC ID %x in present package.\n", - apic_id); - topo_info.nr_rejected_cpus++; - return; - } - topo_info.nr_disabled_cpus++; } diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index b15b97d3cb52dc..97b54bd0f4822b 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -9,13 +9,11 @@ * quirks and other tweaks, and feeds that into the generic Linux memory * allocation code routines via a platform independent interface (memblock, etc.). */ -#include <linux/crash_dump.h> #include <linux/memblock.h> #include <linux/suspend.h> #include <linux/acpi.h> #include <linux/firmware-map.h> #include <linux/sort.h> -#include <linux/memory_hotplug.h> #include <linux/kvm_types.h> #include <asm/e820/api.h> @@ -57,13 +55,13 @@ * re-propagated. So its main role is a temporary bootstrap storage of firmware * specific memory layout data during early bootup. */ -static struct e820_table e820_table_init __initdata; -static struct e820_table e820_table_kexec_init __initdata; -static struct e820_table e820_table_firmware_init __initdata; +__initdata static struct e820_table e820_table_init; +__initdata static struct e820_table e820_table_kexec_init; +__initdata static struct e820_table e820_table_firmware_init; -struct e820_table *e820_table __refdata = &e820_table_init; -struct e820_table *e820_table_kexec __refdata = &e820_table_kexec_init; -struct e820_table *e820_table_firmware __refdata = &e820_table_firmware_init; +__refdata struct e820_table *e820_table = &e820_table_init; +__refdata struct e820_table *e820_table_kexec = &e820_table_kexec_init; +__refdata struct e820_table *e820_table_firmware = &e820_table_firmware_init; /* For PCI or other memory-mapped resources */ unsigned long pci_mem_start = 0xaeedbabe; @@ -78,10 +76,10 @@ EXPORT_SYMBOL(pci_mem_start); static bool _e820__mapped_any(struct e820_table *table, u64 start, u64 end, enum e820_type type) { - int i; + u32 idx; - for (i = 0; i < table->nr_entries; i++) { - struct e820_entry *entry = &table->entries[i]; + for (idx = 0; idx < table->nr_entries; idx++) { + struct e820_entry *entry = &table->entries[idx]; if (type && entry->type != type) continue; @@ -113,10 +111,10 @@ EXPORT_SYMBOL_GPL(e820__mapped_any); static struct e820_entry *__e820__mapped_all(u64 start, u64 end, enum e820_type type) { - int i; + u32 idx; - for (i = 0; i < e820_table->nr_entries; i++) { - struct e820_entry *entry = &e820_table->entries[i]; + for (idx = 0; idx < e820_table->nr_entries; idx++) { + struct e820_entry *entry = &e820_table->entries[idx]; if (type && entry->type != type) continue; @@ -146,7 +144,7 @@ static struct e820_entry *__e820__mapped_all(u64 start, u64 end, /* * This function checks if the entire range <start,end> is mapped with type. */ -bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type) +__init bool e820__mapped_all(u64 start, u64 end, enum e820_type type) { return __e820__mapped_all(start, end, type); } @@ -164,54 +162,74 @@ int e820__get_entry_type(u64 start, u64 end) /* * Add a memory region to the kernel E820 map. 
*/ -static void __init __e820__range_add(struct e820_table *table, u64 start, u64 size, enum e820_type type) +__init static void __e820__range_add(struct e820_table *table, u64 start, u64 size, enum e820_type type) { - int x = table->nr_entries; + u32 idx = table->nr_entries; + struct e820_entry *entry_new; - if (x >= ARRAY_SIZE(table->entries)) { - pr_err("too many entries; ignoring [mem %#010llx-%#010llx]\n", - start, start + size - 1); + if (idx >= ARRAY_SIZE(table->entries)) { + pr_err("E820 table full; ignoring [mem %#010llx-%#010llx]\n", + start, start + size-1); return; } - table->entries[x].addr = start; - table->entries[x].size = size; - table->entries[x].type = type; + entry_new = table->entries + idx; + + entry_new->addr = start; + entry_new->size = size; + entry_new->type = type; + table->nr_entries++; } -void __init e820__range_add(u64 start, u64 size, enum e820_type type) +__init void e820__range_add(u64 start, u64 size, enum e820_type type) { __e820__range_add(e820_table, start, size, type); } -static void __init e820_print_type(enum e820_type type) +__init static void e820_print_type(enum e820_type type) { switch (type) { - case E820_TYPE_RAM: pr_cont("usable"); break; - case E820_TYPE_RESERVED: pr_cont("reserved"); break; - case E820_TYPE_SOFT_RESERVED: pr_cont("soft reserved"); break; - case E820_TYPE_ACPI: pr_cont("ACPI data"); break; - case E820_TYPE_NVS: pr_cont("ACPI NVS"); break; - case E820_TYPE_UNUSABLE: pr_cont("unusable"); break; + case E820_TYPE_RAM: pr_cont(" System RAM"); break; + case E820_TYPE_RESERVED: pr_cont(" device reserved"); break; + case E820_TYPE_SOFT_RESERVED: pr_cont(" soft reserved"); break; + case E820_TYPE_ACPI: pr_cont(" ACPI data"); break; + case E820_TYPE_NVS: pr_cont(" ACPI NVS"); break; + case E820_TYPE_UNUSABLE: pr_cont(" unusable"); break; case E820_TYPE_PMEM: /* Fall through: */ - case E820_TYPE_PRAM: pr_cont("persistent (type %u)", type); break; - default: pr_cont("type %u", type); break; + case E820_TYPE_PRAM: pr_cont(" persistent RAM (type %u)", type); break; + default: pr_cont(" type %u", type); break; } } -void __init e820__print_table(char *who) +__init static void e820__print_table(const char *who) { - int i; + u64 range_end_prev = 0; + u32 idx; + + for (idx = 0; idx < e820_table->nr_entries; idx++) { + struct e820_entry *entry = e820_table->entries + idx; + u64 range_start, range_end; + + range_start = entry->addr; + range_end = entry->addr + entry->size; - for (i = 0; i < e820_table->nr_entries; i++) { - pr_info("%s: [mem %#018Lx-%#018Lx] ", - who, - e820_table->entries[i].addr, - e820_table->entries[i].addr + e820_table->entries[i].size - 1); + /* Out of order E820 maps should not happen: */ + if (range_start < range_end_prev) + pr_info(FW_BUG "out of order E820 entry!\n"); - e820_print_type(e820_table->entries[i].type); + if (range_start > range_end_prev) { + pr_info("%s: [gap %#018Lx-%#018Lx]\n", + who, + range_end_prev, + range_start-1); + } + + pr_info("%s: [mem %#018Lx-%#018Lx] ", who, range_start, range_end-1); + e820_print_type(entry->type); pr_cont("\n"); + + range_end_prev = range_end; } } @@ -280,15 +298,15 @@ struct change_member { /* Pointer to the original entry: */ struct e820_entry *entry; /* Address for this change point: */ - unsigned long long addr; + u64 addr; }; -static struct change_member change_point_list[2*E820_MAX_ENTRIES] __initdata; -static struct change_member *change_point[2*E820_MAX_ENTRIES] __initdata; -static struct e820_entry *overlap_list[E820_MAX_ENTRIES] __initdata; -static struct e820_entry 
new_entries[E820_MAX_ENTRIES] __initdata; +__initdata static struct change_member change_point_list[2*E820_MAX_ENTRIES]; +__initdata static struct change_member *change_point[2*E820_MAX_ENTRIES]; +__initdata static struct e820_entry *overlap_list[E820_MAX_ENTRIES]; +__initdata static struct e820_entry new_entries[E820_MAX_ENTRIES]; -static int __init cpcompare(const void *a, const void *b) +__init static int cpcompare(const void *a, const void *b) { struct change_member * const *app = a, * const *bpp = b; const struct change_member *ap = *app, *bp = *bpp; @@ -305,28 +323,32 @@ static int __init cpcompare(const void *a, const void *b) return (ap->addr != ap->entry->addr) - (bp->addr != bp->entry->addr); } -static bool e820_nomerge(enum e820_type type) +/* + * Can two consecutive E820 entries of this same E820 type be merged? + */ +static bool e820_type_mergeable(enum e820_type type) { /* * These types may indicate distinct platform ranges aligned to - * numa node, protection domain, performance domain, or other + * NUMA node, protection domain, performance domain, or other * boundaries. Do not merge them. */ if (type == E820_TYPE_PRAM) - return true; + return false; if (type == E820_TYPE_SOFT_RESERVED) - return true; - return false; + return false; + + return true; } -int __init e820__update_table(struct e820_table *table) +__init int e820__update_table(struct e820_table *table) { struct e820_entry *entries = table->entries; u32 max_nr_entries = ARRAY_SIZE(table->entries); enum e820_type current_type, last_type; - unsigned long long last_addr; + u64 last_addr; u32 new_nr_entries, overlap_entries; - u32 i, chg_idx, chg_nr; + u32 idx, chg_idx, chg_nr; /* If there's only one memory region, don't bother: */ if (table->nr_entries < 2) @@ -335,26 +357,26 @@ int __init e820__update_table(struct e820_table *table) BUG_ON(table->nr_entries > max_nr_entries); /* Bail out if we find any unreasonable addresses in the map: */ - for (i = 0; i < table->nr_entries; i++) { - if (entries[i].addr + entries[i].size < entries[i].addr) + for (idx = 0; idx < table->nr_entries; idx++) { + if (entries[idx].addr + entries[idx].size < entries[idx].addr) return -1; } /* Create pointers for initial change-point information (for sorting): */ - for (i = 0; i < 2 * table->nr_entries; i++) - change_point[i] = &change_point_list[i]; + for (idx = 0; idx < 2 * table->nr_entries; idx++) + change_point[idx] = &change_point_list[idx]; /* * Record all known change-points (starting and ending addresses), * omitting empty memory regions: */ chg_idx = 0; - for (i = 0; i < table->nr_entries; i++) { - if (entries[i].size != 0) { - change_point[chg_idx]->addr = entries[i].addr; - change_point[chg_idx++]->entry = &entries[i]; - change_point[chg_idx]->addr = entries[i].addr + entries[i].size; - change_point[chg_idx++]->entry = &entries[i]; + for (idx = 0; idx < table->nr_entries; idx++) { + if (entries[idx].size != 0) { + change_point[chg_idx]->addr = entries[idx].addr; + change_point[chg_idx++]->entry = &entries[idx]; + change_point[chg_idx]->addr = entries[idx].addr + entries[idx].size; + change_point[chg_idx++]->entry = &entries[idx]; } } chg_nr = chg_idx; @@ -376,9 +398,9 @@ int __init e820__update_table(struct e820_table *table) overlap_list[overlap_entries++] = change_point[chg_idx]->entry; } else { /* Remove entry from list (order independent, so swap with last): */ - for (i = 0; i < overlap_entries; i++) { - if (overlap_list[i] == change_point[chg_idx]->entry) - overlap_list[i] = overlap_list[overlap_entries-1]; + for (idx = 0; 
idx < overlap_entries; idx++) { + if (overlap_list[idx] == change_point[chg_idx]->entry) + overlap_list[idx] = overlap_list[overlap_entries-1]; } overlap_entries--; } @@ -388,13 +410,13 @@ int __init e820__update_table(struct e820_table *table) * 1=usable, 2,3,4,4+=unusable) */ current_type = 0; - for (i = 0; i < overlap_entries; i++) { - if (overlap_list[i]->type > current_type) - current_type = overlap_list[i]->type; + for (idx = 0; idx < overlap_entries; idx++) { + if (overlap_list[idx]->type > current_type) + current_type = overlap_list[idx]->type; } /* Continue building up new map based on this information: */ - if (current_type != last_type || e820_nomerge(current_type)) { + if (current_type != last_type || !e820_type_mergeable(current_type)) { if (last_type) { new_entries[new_nr_entries].size = change_point[chg_idx]->addr - last_addr; /* Move forward only if the new size was non-zero: */ @@ -419,17 +441,22 @@ int __init e820__update_table(struct e820_table *table) return 0; } -static int __init __append_e820_table(struct boot_e820_entry *entries, u32 nr_entries) +/* + * Copy the BIOS E820 map into the kernel's e820_table. + * + * Sanity-check it while we're at it.. + */ +__init static int append_e820_table(struct boot_e820_entry *entries, u32 nr_entries) { struct boot_e820_entry *entry = entries; while (nr_entries) { u64 start = entry->addr; - u64 size = entry->size; - u64 end = start + size - 1; - u32 type = entry->type; + u64 size = entry->size; + u64 end = start + size-1; + u32 type = entry->type; - /* Ignore the entry on 64-bit overflow: */ + /* Ignore the remaining entries on 64-bit overflow: */ if (start > end && likely(size)) return -1; @@ -441,29 +468,11 @@ static int __init __append_e820_table(struct boot_e820_entry *entries, u32 nr_en return 0; } -/* - * Copy the BIOS E820 map into a safe place. - * - * Sanity-check it while we're at it.. - * - * If we're lucky and live on a modern system, the setup code - * will have given us a memory map that we can use to properly - * set up memory. If we aren't, we'll fake a memory map. - */ -static int __init append_e820_table(struct boot_e820_entry *entries, u32 nr_entries) -{ - /* Only one memory region (or negative)? 
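As a cross-check of the overlap handling above, here is a brute-force per-address reference model (illustration only, not the change-point implementation): for any address covered by several E820 entries, the sanitized table keeps the highest type value, since higher-numbered types are more restrictive (1 = usable RAM, 2+ = reserved/unusable). The type values and ranges below are made up for the example.

#include <stdio.h>

struct range_demo { unsigned long start, end; int type; };      /* [start, end) */

static int type_at(const struct range_demo *tbl, int nr, unsigned long addr)
{
        int winner = 0;

        for (int i = 0; i < nr; i++) {
                if (addr >= tbl[i].start && addr < tbl[i].end && tbl[i].type > winner)
                        winner = tbl[i].type;
        }
        return winner;
}

int main(void)
{
        /* Usable RAM [0,100) with an overlapping reserved hole [40,60): */
        struct range_demo tbl[] = {
                { 0, 100, 1 },          /* E820_TYPE_RAM */
                { 40, 60, 2 },          /* E820_TYPE_RESERVED */
        };

        /* Prints "1 2 1": the sanitized table is [0,40) RAM, [40,60) reserved, [60,100) RAM. */
        printf("%d %d %d\n", type_at(tbl, 2, 20), type_at(tbl, 2, 50), type_at(tbl, 2, 80));
        return 0;
}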
Ignore it */ - if (nr_entries < 2) - return -1; - - return __append_e820_table(entries, nr_entries); -} - -static u64 __init +__init static u64 __e820__range_update(struct e820_table *table, u64 start, u64 size, enum e820_type old_type, enum e820_type new_type) { u64 end; - unsigned int i; + u32 idx; u64 real_updated_size = 0; BUG_ON(old_type == new_type); @@ -472,14 +481,14 @@ __e820__range_update(struct e820_table *table, u64 start, u64 size, enum e820_ty size = ULLONG_MAX - start; end = start + size; - printk(KERN_DEBUG "e820: update [mem %#010Lx-%#010Lx] ", start, end - 1); + printk(KERN_DEBUG "e820: update [mem %#010Lx-%#010Lx]", start, end - 1); e820_print_type(old_type); - pr_cont(" ==> "); + pr_cont(" ==>"); e820_print_type(new_type); pr_cont("\n"); - for (i = 0; i < table->nr_entries; i++) { - struct e820_entry *entry = &table->entries[i]; + for (idx = 0; idx < table->nr_entries; idx++) { + struct e820_entry *entry = &table->entries[idx]; u64 final_start, final_end; u64 entry_end; @@ -527,46 +536,44 @@ __e820__range_update(struct e820_table *table, u64 start, u64 size, enum e820_ty return real_updated_size; } -u64 __init e820__range_update(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type) +__init u64 e820__range_update(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type) { return __e820__range_update(e820_table, start, size, old_type, new_type); } -u64 __init e820__range_update_table(struct e820_table *t, u64 start, u64 size, +__init u64 e820__range_update_table(struct e820_table *t, u64 start, u64 size, enum e820_type old_type, enum e820_type new_type) { return __e820__range_update(t, start, size, old_type, new_type); } /* Remove a range of memory from the E820 table: */ -u64 __init e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type) +__init void e820__range_remove(u64 start, u64 size, enum e820_type filter_type) { - int i; + u32 idx; u64 end; - u64 real_removed_size = 0; if (size > (ULLONG_MAX - start)) size = ULLONG_MAX - start; end = start + size; - printk(KERN_DEBUG "e820: remove [mem %#010Lx-%#010Lx] ", start, end - 1); - if (check_type) - e820_print_type(old_type); + printk(KERN_DEBUG "e820: remove [mem %#010Lx-%#010Lx]", start, end - 1); + if (filter_type) + e820_print_type(filter_type); pr_cont("\n"); - for (i = 0; i < e820_table->nr_entries; i++) { - struct e820_entry *entry = &e820_table->entries[i]; + for (idx = 0; idx < e820_table->nr_entries; idx++) { + struct e820_entry *entry = &e820_table->entries[idx]; u64 final_start, final_end; u64 entry_end; - if (check_type && entry->type != old_type) + if (filter_type && entry->type != filter_type) continue; entry_end = entry->addr + entry->size; /* Completely covered? 
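Usage note on the new e820__range_remove() prototype: a zero filter_type removes the range regardless of the type of the affected entries, while a non-zero type only trims entries of that type. Both forms appear verbatim in later hunks of this patch, for example:

        e820__range_remove(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_TYPE_RAM);  /* only RAM entries */
        e820__range_remove(start, size, 0);                                    /* entries of any type */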
*/ if (entry->addr >= start && entry_end <= end) { - real_removed_size += entry->size; memset(entry, 0, sizeof(*entry)); continue; } @@ -575,7 +582,6 @@ u64 __init e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool if (entry->addr < start && entry_end > end) { e820__range_add(end, entry_end - end, entry->type); entry->size = start - entry->addr; - real_removed_size += size; continue; } @@ -585,8 +591,6 @@ u64 __init e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool if (final_start >= final_end) continue; - real_removed_size += final_end - final_start; - /* * Left range could be head or tail, so need to update * the size first: @@ -597,10 +601,9 @@ u64 __init e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool entry->addr = final_end; } - return real_removed_size; } -void __init e820__update_table_print(void) +__init void e820__update_table_print(void) { if (e820__update_table(e820_table)) return; @@ -609,42 +612,64 @@ void __init e820__update_table_print(void) e820__print_table("modified"); } -static void __init e820__update_table_kexec(void) +__init static void e820__update_table_kexec(void) { e820__update_table(e820_table_kexec); } -#define MAX_GAP_END 0x100000000ull +#define MAX_GAP_END SZ_4G /* * Search for a gap in the E820 memory space from 0 to MAX_GAP_END (4GB). */ -static int __init e820_search_gap(unsigned long *gapstart, unsigned long *gapsize) +__init static int e820_search_gap(unsigned long *max_gap_start, unsigned long *max_gap_size) { - unsigned long long last = MAX_GAP_END; - int i = e820_table->nr_entries; + struct e820_entry *entry; + u64 range_end_prev = 0; int found = 0; + u32 idx; + + for (idx = 0; idx < e820_table->nr_entries; idx++) { + u64 range_start, range_end; + + entry = e820_table->entries + idx; + range_start = entry->addr; + range_end = entry->addr + entry->size; + + /* Process any gap before this entry: */ + if (range_start > range_end_prev) { + u64 gap_start = range_end_prev; + u64 gap_end = range_start; + u64 gap_size; + + if (gap_start < MAX_GAP_END) { + /* Make sure the entirety of the gap is below MAX_GAP_END: */ + gap_end = min(gap_end, MAX_GAP_END); + gap_size = gap_end-gap_start; + + if (gap_size >= *max_gap_size) { + *max_gap_start = gap_start; + *max_gap_size = gap_size; + found = 1; + } + } + } - while (--i >= 0) { - unsigned long long start = e820_table->entries[i].addr; - unsigned long long end = start + e820_table->entries[i].size; + range_end_prev = range_end; + } - /* - * Since "last" is at most 4GB, we know we'll - * fit in 32 bits if this condition is true: - */ - if (last > end) { - unsigned long gap = last - end; + /* Is there a usable gap beyond the last entry: */ + if (entry->addr + entry->size < MAX_GAP_END) { + u64 gap_start = entry->addr + entry->size; + u64 gap_size = MAX_GAP_END-gap_start; - if (gap >= *gapsize) { - *gapsize = gap; - *gapstart = end; - found = 1; - } + if (gap_size >= *max_gap_size) { + *max_gap_start = gap_start; + *max_gap_size = gap_size; + found = 1; } - if (start < last) - last = start; } + return found; } @@ -658,29 +683,30 @@ static int __init e820_search_gap(unsigned long *gapstart, unsigned long *gapsiz */ __init void e820__setup_pci_gap(void) { - unsigned long gapstart, gapsize; + unsigned long max_gap_start, max_gap_size; int found; - gapsize = 0x400000; - found = e820_search_gap(&gapstart, &gapsize); + /* The minimum eligible gap size is 4MB: */ + max_gap_size = SZ_4M; + found = e820_search_gap(&max_gap_start, &max_gap_size); if (!found) { #ifdef 
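A minimal user-space sketch of the rewritten gap search above (illustration only, with made-up range data): walk a sorted, non-overlapping range list once, remember where the previous range ended, and keep the largest hole that lies entirely below the 4 GB limit.

#include <stdint.h>
#include <stdio.h>

struct range_demo { uint64_t start, end; };     /* [start, end), sorted by start */

static int largest_gap(const struct range_demo *r, int nr, uint64_t limit,
                       uint64_t *gap_start, uint64_t *gap_size)
{
        uint64_t prev_end = 0;
        int found = 0;

        for (int i = 0; i <= nr; i++) {
                /* Treat the space after the last range as a final candidate gap: */
                uint64_t next_start = (i < nr) ? r[i].start : limit;

                if (next_start > prev_end && prev_end < limit) {
                        uint64_t end = next_start < limit ? next_start : limit;

                        if (end - prev_end >= *gap_size) {
                                *gap_start = prev_end;
                                *gap_size = end - prev_end;
                                found = 1;
                        }
                }
                if (i < nr && r[i].end > prev_end)
                        prev_end = r[i].end;
        }
        return found;
}

int main(void)
{
        struct range_demo map[] = { { 0, 0xa0000000ULL }, { 0xc0000000ULL, 0x140000000ULL } };
        uint64_t start = 0, size = 0x400000;    /* require at least 4 MB, as the caller does */

        /* Prints "gap [0xa0000000-0xc0000000)": */
        if (largest_gap(map, 2, 0x100000000ULL, &start, &size))
                printf("gap [%#llx-%#llx)\n",
                       (unsigned long long)start, (unsigned long long)(start + size));
        return 0;
}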
CONFIG_X86_64 - gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024; + max_gap_start = (max_pfn << PAGE_SHIFT) + SZ_1M; pr_err("Cannot find an available gap in the 32-bit address range\n"); pr_err("PCI devices with unassigned 32-bit BARs may not work!\n"); #else - gapstart = 0x10000000; + max_gap_start = SZ_256M; #endif } /* * e820__reserve_resources_late() protects stolen RAM already: */ - pci_mem_start = gapstart; + pci_mem_start = max_gap_start; - pr_info("[mem %#010lx-%#010lx] available for PCI devices\n", - gapstart, gapstart + gapsize - 1); + pr_info("[gap %#010lx-%#010lx] available for PCI devices\n", + max_gap_start, max_gap_start + max_gap_size-1); } /* @@ -722,7 +748,7 @@ __init void e820__reallocate_tables(void) * the remaining (if any) entries are passed via the SETUP_E820_EXT node of * struct setup_data, which is parsed here. */ -void __init e820__memory_setup_extended(u64 phys_addr, u32 data_len) +__init void e820__memory_setup_extended(u64 phys_addr, u32 data_len) { int entries; struct boot_e820_entry *extmap; @@ -732,7 +758,7 @@ void __init e820__memory_setup_extended(u64 phys_addr, u32 data_len) entries = sdata->len / sizeof(*extmap); extmap = (struct boot_e820_entry *)(sdata->data); - __append_e820_table(extmap, entries); + append_e820_table(extmap, entries); e820__update_table(e820_table); memcpy(e820_table_kexec, e820_table, sizeof(*e820_table_kexec)); @@ -751,13 +777,13 @@ void __init e820__memory_setup_extended(u64 phys_addr, u32 data_len) * This function requires the E820 map to be sorted and without any * overlapping entries. */ -void __init e820__register_nosave_regions(unsigned long limit_pfn) +__init void e820__register_nosave_regions(unsigned long limit_pfn) { - int i; + u32 idx; u64 last_addr = 0; - for (i = 0; i < e820_table->nr_entries; i++) { - struct e820_entry *entry = &e820_table->entries[i]; + for (idx = 0; idx < e820_table->nr_entries; idx++) { + struct e820_entry *entry = &e820_table->entries[idx]; if (entry->type != E820_TYPE_RAM) continue; @@ -776,12 +802,12 @@ void __init e820__register_nosave_regions(unsigned long limit_pfn) * Register ACPI NVS memory regions, so that we can save/restore them during * hibernation and the subsequent resume: */ -static int __init e820__register_nvs_regions(void) +__init static int e820__register_nvs_regions(void) { - int i; + u32 idx; - for (i = 0; i < e820_table->nr_entries; i++) { - struct e820_entry *entry = &e820_table->entries[i]; + for (idx = 0; idx < e820_table->nr_entries; idx++) { + struct e820_entry *entry = &e820_table->entries[idx]; if (entry->type == E820_TYPE_NVS) acpi_nvs_register(entry->addr, entry->size); @@ -800,7 +826,7 @@ core_initcall(e820__register_nvs_regions); * This allows kexec to fake a new mptable, as if it came from the real * system. 
*/ -u64 __init e820__memblock_alloc_reserved(u64 size, u64 align) +__init u64 e820__memblock_alloc_reserved(u64 size, u64 align) { u64 addr; @@ -827,14 +853,14 @@ u64 __init e820__memblock_alloc_reserved(u64 size, u64 align) /* * Find the highest page frame number we have available */ -static unsigned long __init e820__end_ram_pfn(unsigned long limit_pfn) +__init static unsigned long e820__end_ram_pfn(unsigned long limit_pfn) { - int i; + u32 idx; unsigned long last_pfn = 0; unsigned long max_arch_pfn = MAX_ARCH_PFN; - for (i = 0; i < e820_table->nr_entries; i++) { - struct e820_entry *entry = &e820_table->entries[i]; + for (idx = 0; idx < e820_table->nr_entries; idx++) { + struct e820_entry *entry = &e820_table->entries[idx]; unsigned long start_pfn; unsigned long end_pfn; @@ -863,26 +889,20 @@ static unsigned long __init e820__end_ram_pfn(unsigned long limit_pfn) return last_pfn; } -unsigned long __init e820__end_of_ram_pfn(void) +__init unsigned long e820__end_of_ram_pfn(void) { return e820__end_ram_pfn(MAX_ARCH_PFN); } -unsigned long __init e820__end_of_low_ram_pfn(void) +__init unsigned long e820__end_of_low_ram_pfn(void) { return e820__end_ram_pfn(1UL << (32 - PAGE_SHIFT)); } -static void __init early_panic(char *msg) -{ - early_printk(msg); - panic(msg); -} - -static int userdef __initdata; +__initdata static int userdef; /* The "mem=nopentium" boot option disables 4MB page tables on 32-bit kernels: */ -static int __init parse_memopt(char *p) +__init static int parse_memopt(char *p) { u64 mem_size; @@ -906,7 +926,7 @@ static int __init parse_memopt(char *p) if (mem_size == 0) return -EINVAL; - e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1); + e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM); #ifdef CONFIG_MEMORY_HOTPLUG max_mem_size = mem_size; @@ -916,7 +936,7 @@ static int __init parse_memopt(char *p) } early_param("mem", parse_memopt); -static int __init parse_memmap_one(char *p) +__init static int parse_memmap_one(char *p) { char *oldp; u64 start_at, mem_size; @@ -962,18 +982,16 @@ static int __init parse_memmap_one(char *p) e820__range_update(start_at, mem_size, from, to); else if (to) e820__range_add(start_at, mem_size, to); - else if (from) - e820__range_remove(start_at, mem_size, from, 1); else - e820__range_remove(start_at, mem_size, 0, 0); + e820__range_remove(start_at, mem_size, from); } else { - e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1); + e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM); } return *p == '\0' ? 
0 : -EINVAL; } -static int __init parse_memmap_opt(char *str) +__init static int parse_memmap_opt(char *str) { while (str) { char *k = strchr(str, ','); @@ -994,18 +1012,18 @@ early_param("memmap", parse_memmap_opt); * have been processed, in which case we already have an E820 table filled in * via the parameter callback function(s), but it's not sorted and printed yet: */ -void __init e820__finish_early_params(void) +__init void e820__finish_early_params(void) { if (userdef) { if (e820__update_table(e820_table) < 0) - early_panic("Invalid user supplied memory map"); + panic("Invalid user supplied memory map"); pr_info("user-defined physical RAM map:\n"); e820__print_table("user"); } } -static const char *__init e820_type_to_string(struct e820_entry *entry) +__init static const char * e820_type_to_string(struct e820_entry *entry) { switch (entry->type) { case E820_TYPE_RAM: return "System RAM"; @@ -1020,7 +1038,7 @@ static const char *__init e820_type_to_string(struct e820_entry *entry) } } -static unsigned long __init e820_type_to_iomem_type(struct e820_entry *entry) +__init static unsigned long e820_type_to_iomem_type(struct e820_entry *entry) { switch (entry->type) { case E820_TYPE_RAM: return IORESOURCE_SYSTEM_RAM; @@ -1035,7 +1053,7 @@ static unsigned long __init e820_type_to_iomem_type(struct e820_entry *entry) } } -static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry) +__init static unsigned long e820_type_to_iores_desc(struct e820_entry *entry) { switch (entry->type) { case E820_TYPE_ACPI: return IORES_DESC_ACPI_TABLES; @@ -1050,40 +1068,47 @@ static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry) } } -static bool __init do_mark_busy(enum e820_type type, struct resource *res) +/* + * We assign one resource entry for each E820 map entry: + */ +__initdata static struct resource *e820_res; + +/* + * Is this a device address region that should not be marked busy? + * (Versus system address regions that we register & lock early.) + */ +__init static bool e820_device_region(enum e820_type type, struct resource *res) { - /* this is the legacy bios/dos rom-shadow + mmio region */ - if (res->start < (1ULL<<20)) - return true; + /* This is the legacy BIOS/DOS ROM-shadow + MMIO region: */ + if (res->start < SZ_1M) + return false; /* * Treat persistent memory and other special memory ranges like - * device memory, i.e. reserve it for exclusive use of a driver + * device memory, i.e. 
keep it available for exclusive use of a + * driver: */ switch (type) { case E820_TYPE_RESERVED: case E820_TYPE_SOFT_RESERVED: case E820_TYPE_PRAM: case E820_TYPE_PMEM: - return false; + return true; case E820_TYPE_RAM: case E820_TYPE_ACPI: case E820_TYPE_NVS: case E820_TYPE_UNUSABLE: default: - return true; + return false; } } /* - * Mark E820 reserved areas as busy for the resource manager: + * Mark E820 system regions as busy for the resource manager: */ - -static struct resource __initdata *e820_res; - -void __init e820__reserve_resources(void) +__init void e820__reserve_resources(void) { - int i; + u32 idx; struct resource *res; u64 end; @@ -1091,8 +1116,8 @@ void __init e820__reserve_resources(void) SMP_CACHE_BYTES); e820_res = res; - for (i = 0; i < e820_table->nr_entries; i++) { - struct e820_entry *entry = e820_table->entries + i; + for (idx = 0; idx < e820_table->nr_entries; idx++) { + struct e820_entry *entry = e820_table->entries + idx; end = entry->addr + entry->size - 1; if (end != (resource_size_t)end) { @@ -1106,20 +1131,20 @@ void __init e820__reserve_resources(void) res->desc = e820_type_to_iores_desc(entry); /* - * Don't register the region that could be conflicted with - * PCI device BAR resources and insert them later in - * pcibios_resource_survey(): + * Skip and don't register device regions that could be conflicted + * with PCI device BAR resources. They get inserted later in + * pcibios_resource_survey() -> e820__reserve_resources_late(): */ - if (do_mark_busy(entry->type, res)) { + if (!e820_device_region(entry->type, res)) { res->flags |= IORESOURCE_BUSY; insert_resource(&iomem_resource, res); } res++; } - /* Expose the kexec e820 table to the sysfs. */ - for (i = 0; i < e820_table_kexec->nr_entries; i++) { - struct e820_entry *entry = e820_table_kexec->entries + i; + /* Expose the kexec e820 table to sysfs: */ + for (idx = 0; idx < e820_table_kexec->nr_entries; idx++) { + struct e820_entry *entry = e820_table_kexec->entries + idx; firmware_map_add_early(entry->addr, entry->addr + entry->size, e820_type_to_string(entry)); } @@ -1128,7 +1153,7 @@ void __init e820__reserve_resources(void) /* * How much should we pad the end of RAM, depending on where it is? */ -static unsigned long __init ram_alignment(resource_size_t pos) +__init static unsigned long ram_alignment(resource_size_t pos) { unsigned long mb = pos >> 20; @@ -1146,24 +1171,36 @@ static unsigned long __init ram_alignment(resource_size_t pos) #define MAX_RESOURCE_SIZE ((resource_size_t)-1) -void __init e820__reserve_resources_late(void) +__init void e820__reserve_resources_late(void) { - int i; + u32 idx; struct resource *res; + /* + * Register device address regions listed in the E820 map, + * these can be claimed by device drivers later on: + */ res = e820_res; - for (i = 0; i < e820_table->nr_entries; i++) { + for (idx = 0; idx < e820_table->nr_entries; idx++) { if (!res->parent && res->end) insert_resource_expand_to_fit(&iomem_resource, res); res++; } /* - * Try to bump up RAM regions to reasonable boundaries, to - * avoid stolen RAM: + * Create additional 'gaps' at the end of RAM regions, + * rounding them up to 64k/1MB/64MB boundaries, should + * they be weirdly sized, and register extra, locked + * resource regions for them, to make sure drivers + * won't claim those addresses. + * + * These are basically blind guesses and heuristics to + * avoid resource conflicts with broken firmware that + * doesn't properly list 'stolen RAM' as a system region + * in the E820 map. 
*/ - for (i = 0; i < e820_table->nr_entries; i++) { - struct e820_entry *entry = &e820_table->entries[i]; + for (idx = 0; idx < e820_table->nr_entries; idx++) { + struct e820_entry *entry = &e820_table->entries[idx]; u64 start, end; if (entry->type != E820_TYPE_RAM) @@ -1176,7 +1213,7 @@ void __init e820__reserve_resources_late(void) if (start >= end) continue; - printk(KERN_DEBUG "e820: reserve RAM buffer [mem %#010llx-%#010llx]\n", start, end); + pr_info("e820: register RAM buffer resource [mem %#010llx-%#010llx]\n", start, end); reserve_region_with_split(&iomem_resource, start, end, "RAM buffer"); } } @@ -1184,7 +1221,7 @@ void __init e820__reserve_resources_late(void) /* * Pass the firmware (bootloader) E820 map to the kernel and process it: */ -char *__init e820__memory_setup_default(void) +__init char * e820__memory_setup_default(void) { char *who = "BIOS-e820"; @@ -1222,7 +1259,7 @@ char *__init e820__memory_setup_default(void) * E820 map - with an optional platform quirk available for virtual platforms * to override this method of boot environment processing: */ -void __init e820__memory_setup(void) +__init void e820__memory_setup(void) { char *who; @@ -1238,9 +1275,9 @@ void __init e820__memory_setup(void) e820__print_table(who); } -void __init e820__memblock_setup(void) +__init void e820__memblock_setup(void) { - int i; + u32 idx; u64 end; #ifdef CONFIG_MEMORY_HOTPLUG @@ -1284,8 +1321,8 @@ void __init e820__memblock_setup(void) */ memblock_allow_resize(); - for (i = 0; i < e820_table->nr_entries; i++) { - struct e820_entry *entry = &e820_table->entries[i]; + for (idx = 0; idx < e820_table->nr_entries; idx++) { + struct e820_entry *entry = &e820_table->entries[idx]; end = entry->addr + entry->size; if (end != (resource_size_t)end) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 1b2edd07a3e176..ffbd04ee0f6853 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -763,7 +763,7 @@ static void __init trim_bios_range(void) * area (640Kb -> 1Mb) as RAM even though it is not. * take them out. 
*/ - e820__range_remove(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_TYPE_RAM, 1); + e820__range_remove(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_TYPE_RAM); e820__update_table(e820_table); } @@ -785,7 +785,7 @@ static void __init e820_add_kernel_range(void) return; pr_warn(".text .data .bss are not marked as E820_TYPE_RAM!\n"); - e820__range_remove(start, size, E820_TYPE_RAM, 0); + e820__range_remove(start, size, 0); e820__range_add(start, size, E820_TYPE_RAM); } @@ -1015,11 +1015,9 @@ void __init setup_arch(char **cmdline_p) trim_bios_range(); #ifdef CONFIG_X86_32 if (ppro_with_ram_bug()) { - e820__range_update(0x70000000ULL, 0x40000ULL, E820_TYPE_RAM, - E820_TYPE_RESERVED); + pr_info("Applying PPro RAM bug workaround: punching 256 kB hole at 1.75 GB physical.\n"); + e820__range_update(0x70000000ULL, SZ_256K, E820_TYPE_RAM, E820_TYPE_RESERVED); e820__update_table(e820_table); - printk(KERN_INFO "fixed physical RAM map:\n"); - e820__print_table("bad_ppro"); } #else early_gart_iommu_check(); diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 463b784499a8f5..d00c6de7f3b733 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -333,8 +333,7 @@ static void __init efi_remove_e820_mmio(void) if (size >= 256*1024) { pr_info("Remove mem%02u: MMIO range=[0x%08llx-0x%08llx] (%lluMB) from e820 map\n", i, start, end, size >> 20); - e820__range_remove(start, size, - E820_TYPE_RESERVED, 1); + e820__range_remove(start, size, E820_TYPE_RESERVED); } else { pr_info("Not removing mem%02u: MMIO range=[0x%08llx-0x%08llx] (%lluKB) from e820 map\n", i, start, end, size >> 10); diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 1280693766b9dd..280b4ac0990ff3 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -586,6 +586,25 @@ struct ftrace_likely_data { __scalar_type_to_expr_cases(long long), \ default: (x))) +/* + * __signed_scalar_typeof(x) - Declare a signed scalar type, leaving + * non-scalar types unchanged. + */ + +#define __scalar_type_to_signed_cases(type) \ + unsigned type: (signed type)0, \ + signed type: (signed type)0 + +#define __signed_scalar_typeof(x) typeof( \ + _Generic((x), \ + char: (signed char)0, \ + __scalar_type_to_signed_cases(char), \ + __scalar_type_to_signed_cases(short), \ + __scalar_type_to_signed_cases(int), \ + __scalar_type_to_signed_cases(long), \ + __scalar_type_to_signed_cases(long long), \ + default: (x))) + /* Is this type a native word size -- useful for atomic operations */ #define __native_word(t) \ (sizeof(t) == sizeof(char) || sizeof(t) == sizeof(short) || \ diff --git a/include/linux/sched.h b/include/linux/sched.h index d395f2810facbe..bf96a7d595e226 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -586,15 +586,10 @@ struct sched_entity { u64 sum_exec_runtime; u64 prev_sum_exec_runtime; u64 vruntime; - union { - /* - * When !@on_rq this field is vlag. - * When cfs_rq->curr == se (which implies @on_rq) - * this field is vprot. See protect_slice(). 
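The __signed_scalar_typeof() addition above exists so that add_positive(), changed later in this patch, can negate the delta in a signed type even when the target field is unsigned. It can be exercised with a small stand-alone test; the macro body below is copied from the hunk, while the test in main() is an illustrative assumption.

#include <assert.h>

#define __scalar_type_to_signed_cases(type)                             \
        unsigned type: (signed type)0,                                  \
        signed type: (signed type)0

#define __signed_scalar_typeof(x) typeof(                               \
        _Generic((x),                                                   \
                 char: (signed char)0,                                  \
                 __scalar_type_to_signed_cases(char),                   \
                 __scalar_type_to_signed_cases(short),                  \
                 __scalar_type_to_signed_cases(int),                    \
                 __scalar_type_to_signed_cases(long),                   \
                 __scalar_type_to_signed_cases(long long),              \
                 default: (x)))

int main(void)
{
        unsigned long ul = 0;
        __signed_scalar_typeof(ul) s = -1;      /* resolves to 'signed long' */

        _Static_assert(sizeof(s) == sizeof(ul), "same width, signed variant");
        assert(s < 0);  /* would fail if 's' had stayed unsigned */
        return 0;
}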
- */ - s64 vlag; - u64 vprot; - }; + /* Approximated virtual lag: */ + s64 vlag; + /* 'Protected' deadline, to give out minimum quantums: */ + u64 vprot; u64 slice; u64 nr_migrations; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 41ba0be1691174..7d0a862a8c75ca 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2090,7 +2090,6 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags) */ uclamp_rq_inc(rq, p, flags); - rq->queue_mask |= p->sched_class->queue_mask; p->sched_class->enqueue_task(rq, p, flags); psi_enqueue(p, flags); @@ -2123,7 +2122,6 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags) * and mark the task ->sched_delayed. */ uclamp_rq_dec(rq, p); - rq->queue_mask |= p->sched_class->queue_mask; return p->sched_class->dequeue_task(rq, p, flags); } @@ -2174,10 +2172,14 @@ void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags) { struct task_struct *donor = rq->donor; - if (p->sched_class == donor->sched_class) - donor->sched_class->wakeup_preempt(rq, p, flags); - else if (sched_class_above(p->sched_class, donor->sched_class)) + if (p->sched_class == rq->next_class) { + rq->next_class->wakeup_preempt(rq, p, flags); + + } else if (sched_class_above(p->sched_class, rq->next_class)) { + rq->next_class->wakeup_preempt(rq, p, flags); resched_curr(rq); + rq->next_class = p->sched_class; + } /* * A queue event has occurred, and we're going to schedule. In @@ -6804,6 +6806,7 @@ static void __sched notrace __schedule(int sched_mode) pick_again: next = pick_next_task(rq, rq->donor, &rf); rq_set_donor(rq, next); + rq->next_class = next->sched_class; if (unlikely(task_is_blocked(next))) { next = find_proxy_task(rq, next, &rf); if (!next) @@ -8650,6 +8653,8 @@ void __init sched_init(void) rq->rt.rt_runtime = global_rt_runtime(); init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); #endif + rq->next_class = &idle_sched_class; + rq->sd = NULL; rq->rd = NULL; rq->cpu_capacity = SCHED_CAPACITY_SCALE; @@ -10775,13 +10780,12 @@ struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags |= DEQUEUE_NOCLOCK; } - if (flags & DEQUEUE_CLASS) { - if (p->sched_class->switching_from) - p->sched_class->switching_from(rq, p); - } + if ((flags & DEQUEUE_CLASS) && p->sched_class->switching_from) + p->sched_class->switching_from(rq, p); *ctx = (struct sched_change_ctx){ .p = p, + .class = p->sched_class, .flags = flags, .queued = task_on_rq_queued(p), .running = task_current_donor(rq, p), @@ -10812,6 +10816,11 @@ void sched_change_end(struct sched_change_ctx *ctx) lockdep_assert_rq_held(rq); + /* + * Changing class without *QUEUE_CLASS is bad. + */ + WARN_ON_ONCE(p->sched_class != ctx->class && !(ctx->flags & ENQUEUE_CLASS)); + if ((ctx->flags & ENQUEUE_CLASS) && p->sched_class->switching_to) p->sched_class->switching_to(rq, p); @@ -10823,6 +10832,24 @@ void sched_change_end(struct sched_change_ctx *ctx) if (ctx->flags & ENQUEUE_CLASS) { if (p->sched_class->switched_to) p->sched_class->switched_to(rq, p); + + /* + * If this was a class promotion; let the old class know it + * got preempted. Note that none of the switch*_from() methods + * know the new class and none of the switch*_to() methods + * know the old class. + */ + if (ctx->running && sched_class_above(p->sched_class, ctx->class)) { + rq->next_class->wakeup_preempt(rq, p, 0); + rq->next_class = p->sched_class; + } + + /* + * If this was a degradation in class someone should have set + * need_resched by now. 
+ */ + WARN_ON_ONCE(sched_class_above(ctx->class, p->sched_class) && + !test_tsk_need_resched(p)); } else { p->sched_class->prio_changed(rq, p, ctx->prio); } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 319439fe187026..80c9559a3e30ea 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2499,9 +2499,16 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf) * Only called when both the current and waking task are -deadline * tasks. */ -static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, - int flags) +static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags) { + /* + * Can only get preempted by stop-class, and those should be + * few and short lived, doesn't really make sense to push + * anything away for that. + */ + if (p->sched_class != &dl_sched_class) + return; + if (dl_entity_preempt(&p->dl, &rq->donor->dl)) { resched_curr(rq); return; @@ -3346,9 +3353,6 @@ static int task_is_throttled_dl(struct task_struct *p, int cpu) #endif DEFINE_SCHED_CLASS(dl) = { - - .queue_mask = 8, - .enqueue_task = enqueue_task_dl, .dequeue_task = dequeue_task_dl, .yield_task = yield_task_dl, diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 05f5a49e9649a9..8015ab6eb332e9 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3075,7 +3075,8 @@ static void switched_from_scx(struct rq *rq, struct task_struct *p) scx_disable_task(p); } -static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {} +static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_flags) {} + static void switched_to_scx(struct rq *rq, struct task_struct *p) {} int scx_check_setscheduler(struct task_struct *p, int policy) @@ -3336,8 +3337,6 @@ static void scx_cgroup_unlock(void) {} * their current sched_class. Call them directly from sched core instead. 
*/ DEFINE_SCHED_CLASS(ext) = { - .queue_mask = 1, - .enqueue_task = enqueue_task_scx, .dequeue_task = dequeue_task_scx, .yield_task = yield_task_scx, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index da46c316453783..76f5e4b78b3069 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -524,10 +524,48 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec); * Scheduling class tree data structure manipulation methods: */ +extern void __BUILD_BUG_vruntime_cmp(void); + +/* Use __builtin_strcmp() because of __HAVE_ARCH_STRCMP: */ + +#define vruntime_cmp(A, CMP_STR, B) ({ \ + int __res = 0; \ + \ + if (!__builtin_strcmp(CMP_STR, "<")) { \ + __res = ((s64)((A)-(B)) < 0); \ + } else if (!__builtin_strcmp(CMP_STR, "<=")) { \ + __res = ((s64)((A)-(B)) <= 0); \ + } else if (!__builtin_strcmp(CMP_STR, ">")) { \ + __res = ((s64)((A)-(B)) > 0); \ + } else if (!__builtin_strcmp(CMP_STR, ">=")) { \ + __res = ((s64)((A)-(B)) >= 0); \ + } else { \ + /* Unknown operator throws linker error: */ \ + __BUILD_BUG_vruntime_cmp(); \ + } \ + \ + __res; \ +}) + +extern void __BUILD_BUG_vruntime_op(void); + +#define vruntime_op(A, OP_STR, B) ({ \ + s64 __res = 0; \ + \ + if (!__builtin_strcmp(OP_STR, "-")) { \ + __res = (s64)((A)-(B)); \ + } else { \ + /* Unknown operator throws linker error: */ \ + __BUILD_BUG_vruntime_op(); \ + } \ + \ + __res; \ +}) + + static inline __maybe_unused u64 max_vruntime(u64 max_vruntime, u64 vruntime) { - s64 delta = (s64)(vruntime - max_vruntime); - if (delta > 0) + if (vruntime_cmp(vruntime, ">", max_vruntime)) max_vruntime = vruntime; return max_vruntime; @@ -535,8 +573,7 @@ static inline __maybe_unused u64 max_vruntime(u64 max_vruntime, u64 vruntime) static inline __maybe_unused u64 min_vruntime(u64 min_vruntime, u64 vruntime) { - s64 delta = (s64)(vruntime - min_vruntime); - if (delta < 0) + if (vruntime_cmp(vruntime, "<", min_vruntime)) min_vruntime = vruntime; return min_vruntime; @@ -549,12 +586,12 @@ static inline bool entity_before(const struct sched_entity *a, * Tiebreak on vruntime seems unnecessary since it can * hardly happen. */ - return (s64)(a->deadline - b->deadline) < 0; + return vruntime_cmp(a->deadline, "<", b->deadline); } static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) { - return (s64)(se->vruntime - cfs_rq->zero_vruntime); + return vruntime_op(se->vruntime, "-", cfs_rq->zero_vruntime); } #define __node_2_se(node) \ @@ -607,8 +644,8 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) * Which we track using: * * v0 := cfs_rq->zero_vruntime - * \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime - * \Sum w_i := cfs_rq->avg_load + * \Sum (v_i - v0) * w_i := cfs_rq->sum_w_vruntime + * \Sum w_i := cfs_rq->sum_weight * * Since zero_vruntime closely tracks the per-task service, these * deltas: (v_i - v), will be in the order of the maximal (virtual) lag @@ -619,32 +656,32 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) * As measured, the max (key * weight) value was ~44 bits for a kernel build. 
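The vruntime_cmp()/vruntime_op() helpers above encode the usual wraparound-safe ordering: compare two u64 timestamps via a signed difference rather than a plain unsigned '<'. A minimal user-space sketch (not kernel code, valid as long as the two values are within 2^63 of each other):

#include <stdint.h>
#include <stdio.h>

/* Same trick as vruntime_cmp(A, "<", B): the delta is evaluated as signed. */
static int vruntime_before(uint64_t a, uint64_t b)
{
        return (int64_t)(a - b) < 0;
}

int main(void)
{
        uint64_t old = UINT64_MAX - 10; /* just before the u64 counter wraps */
        uint64_t new = 5;               /* just after the wrap */

        printf("%d\n", old < new);                 /* 0: plain unsigned '<' gets the order wrong */
        printf("%d\n", vruntime_before(old, new)); /* 1: the signed delta keeps 'old' before 'new' */
        return 0;
}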
*/ static void -avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) +sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) { unsigned long weight = scale_load_down(se->load.weight); s64 key = entity_key(cfs_rq, se); - cfs_rq->avg_vruntime += key * weight; - cfs_rq->avg_load += weight; + cfs_rq->sum_w_vruntime += key * weight; + cfs_rq->sum_weight += weight; } static void -avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se) +sum_w_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se) { unsigned long weight = scale_load_down(se->load.weight); s64 key = entity_key(cfs_rq, se); - cfs_rq->avg_vruntime -= key * weight; - cfs_rq->avg_load -= weight; + cfs_rq->sum_w_vruntime -= key * weight; + cfs_rq->sum_weight -= weight; } static inline -void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta) +void sum_w_vruntime_update(struct cfs_rq *cfs_rq, s64 delta) { /* - * v' = v + d ==> avg_vruntime' = avg_runtime - d*avg_load + * v' = v + d ==> sum_w_vruntime' = sum_runtime - d*sum_weight */ - cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta; + cfs_rq->sum_w_vruntime -= cfs_rq->sum_weight * delta; } /* @@ -654,8 +691,8 @@ void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta) u64 avg_vruntime(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; - s64 avg = cfs_rq->avg_vruntime; - long load = cfs_rq->avg_load; + s64 avg = cfs_rq->sum_w_vruntime; + long load = cfs_rq->sum_weight; if (curr && curr->on_rq) { unsigned long weight = scale_load_down(curr->load.weight); @@ -722,8 +759,8 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime) { struct sched_entity *curr = cfs_rq->curr; - s64 avg = cfs_rq->avg_vruntime; - long load = cfs_rq->avg_load; + s64 avg = cfs_rq->sum_w_vruntime; + long load = cfs_rq->sum_weight; if (curr && curr->on_rq) { unsigned long weight = scale_load_down(curr->load.weight); @@ -732,7 +769,7 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime) load += weight; } - return avg >= (s64)(vruntime - cfs_rq->zero_vruntime) * load; + return avg >= vruntime_op(vruntime, "-", cfs_rq->zero_vruntime) * load; } int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -743,9 +780,9 @@ int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) static void update_zero_vruntime(struct cfs_rq *cfs_rq) { u64 vruntime = avg_vruntime(cfs_rq); - s64 delta = (s64)(vruntime - cfs_rq->zero_vruntime); + s64 delta = vruntime_op(vruntime, "-", cfs_rq->zero_vruntime); - avg_vruntime_update(cfs_rq, delta); + sum_w_vruntime_update(cfs_rq, delta); cfs_rq->zero_vruntime = vruntime; } @@ -770,13 +807,12 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) return entity_before(__node_2_se(a), __node_2_se(b)); } -#define vruntime_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; }) - static inline void __min_vruntime_update(struct sched_entity *se, struct rb_node *node) { if (node) { struct sched_entity *rse = __node_2_se(node); - if (vruntime_gt(min_vruntime, se, rse)) + + if (vruntime_cmp(se->min_vruntime, ">", rse->min_vruntime)) se->min_vruntime = rse->min_vruntime; } } @@ -819,7 +855,7 @@ RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity, */ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - avg_vruntime_add(cfs_rq, se); + sum_w_vruntime_add(cfs_rq, se); update_zero_vruntime(cfs_rq); se->min_vruntime = se->vruntime; se->min_slice = 
se->slice; @@ -831,7 +867,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, &min_vruntime_cb); - avg_vruntime_sub(cfs_rq, se); + sum_w_vruntime_sub(cfs_rq, se); update_zero_vruntime(cfs_rq); } @@ -887,7 +923,7 @@ static inline void update_protect_slice(struct cfs_rq *cfs_rq, struct sched_enti static inline bool protect_slice(struct sched_entity *se) { - return ((s64)(se->vprot - se->vruntime) > 0); + return vruntime_cmp(se->vruntime, "<", se->vprot); } static inline void cancel_protect_slice(struct sched_entity *se) @@ -1024,7 +1060,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); */ static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) { - if ((s64)(se->vruntime - se->deadline) < 0) + if (vruntime_cmp(se->vruntime, "<", se->deadline)) return false; /* @@ -1513,7 +1549,7 @@ static unsigned int task_scan_start(struct task_struct *p) /* Scale the maximum scan period with the amount of shared memory. */ rcu_read_lock(); - ng = rcu_dereference(p->numa_group); + ng = rcu_dereference_all(p->numa_group); if (ng) { unsigned long shared = group_faults_shared(ng); unsigned long private = group_faults_priv(ng); @@ -1580,7 +1616,7 @@ pid_t task_numa_group_id(struct task_struct *p) pid_t gid = 0; rcu_read_lock(); - ng = rcu_dereference(p->numa_group); + ng = rcu_dereference_all(p->numa_group); if (ng) gid = ng->gid; rcu_read_unlock(); @@ -2239,7 +2275,7 @@ static bool task_numa_compare(struct task_numa_env *env, return false; rcu_read_lock(); - cur = rcu_dereference(dst_rq->curr); + cur = rcu_dereference_all(dst_rq->curr); if (cur && ((cur->flags & (PF_EXITING | PF_KTHREAD)) || !cur->mm)) cur = NULL; @@ -2284,7 +2320,7 @@ static bool task_numa_compare(struct task_numa_env *env, * If dst and source tasks are in the same NUMA group, or not * in any group then look only at task weights. */ - cur_ng = rcu_dereference(cur->numa_group); + cur_ng = rcu_dereference_all(cur->numa_group); if (cur_ng == p_ng) { /* * Do not swap within a group or between tasks that have @@ -2499,7 +2535,7 @@ static int task_numa_migrate(struct task_struct *p) * to satisfy here. */ rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu)); + sd = rcu_dereference_all(per_cpu(sd_numa, env.src_cpu)); if (sd) { env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2; env.imb_numa_nr = sd->imb_numa_nr; @@ -3022,7 +3058,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, if (!cpupid_match_pid(tsk, cpupid)) goto no_join; - grp = rcu_dereference(tsk->numa_group); + grp = rcu_dereference_all(tsk->numa_group); if (!grp) goto no_join; @@ -3693,7 +3729,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) */ #define add_positive(_ptr, _val) do { \ typeof(_ptr) ptr = (_ptr); \ - typeof(_val) val = (_val); \ + __signed_scalar_typeof(*ptr) val = (_val); \ typeof(*ptr) res, var = READ_ONCE(*ptr); \ \ res = var + val; \ @@ -3705,23 +3741,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) } while (0) /* - * Unsigned subtract and clamp on underflow. - * - * Explicitly do a load-store to ensure the intermediate value never hits - * memory. This allows lockless observations without ever seeing the negative - * values. 
- */ -#define sub_positive(_ptr, _val) do { \ - typeof(_ptr) ptr = (_ptr); \ - typeof(*ptr) val = (_val); \ - typeof(*ptr) res, var = READ_ONCE(*ptr); \ - res = var - val; \ - if (res > var) \ - res = 0; \ - WRITE_ONCE(*ptr, res); \ -} while (0) - -/* * Remove and clamp on negative, from a local variable. * * A variant of sub_positive(), which does not use explicit load-store @@ -3732,21 +3751,37 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) *ptr -= min_t(typeof(*ptr), *ptr, _val); \ } while (0) + +/* + * Because of rounding, se->util_sum might ends up being +1 more than + * cfs->util_sum. Although this is not a problem by itself, detaching + * a lot of tasks with the rounding problem between 2 updates of + * util_avg (~1ms) can make cfs->util_sum becoming null whereas + * cfs_util_avg is not. + * + * Check that util_sum is still above its lower bound for the new + * util_avg. Given that period_contrib might have moved since the last + * sync, we are only sure that util_sum must be above or equal to + * util_avg * minimum possible divider + */ +#define __update_sa(sa, name, delta_avg, delta_sum) do { \ + add_positive(&(sa)->name##_avg, delta_avg); \ + add_positive(&(sa)->name##_sum, delta_sum); \ + (sa)->name##_sum = max_t(typeof((sa)->name##_sum), \ + (sa)->name##_sum, \ + (sa)->name##_avg * PELT_MIN_DIVIDER); \ +} while (0) + static inline void enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - cfs_rq->avg.load_avg += se->avg.load_avg; - cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum; + __update_sa(&cfs_rq->avg, load, se->avg.load_avg, se->avg.load_sum); } static inline void dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); - sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum); - /* See update_cfs_rq_load_avg() */ - cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum, - cfs_rq->avg.load_avg * PELT_MIN_DIVIDER); + __update_sa(&cfs_rq->avg, load, -se->avg.load_avg, -se->avg.load_sum); } static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags); @@ -4242,7 +4277,6 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq */ divider = get_pelt_divider(&cfs_rq->avg); - /* Set new sched_entity's utilization */ se->avg.util_avg = gcfs_rq->avg.util_avg; new_sum = se->avg.util_avg * divider; @@ -4250,12 +4284,7 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq se->avg.util_sum = new_sum; /* Update parent cfs_rq utilization */ - add_positive(&cfs_rq->avg.util_avg, delta_avg); - add_positive(&cfs_rq->avg.util_sum, delta_sum); - - /* See update_cfs_rq_load_avg() */ - cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum, - cfs_rq->avg.util_avg * PELT_MIN_DIVIDER); + __update_sa(&cfs_rq->avg, util, delta_avg, delta_sum); } static inline void @@ -4281,11 +4310,7 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf se->avg.runnable_sum = new_sum; /* Update parent cfs_rq runnable */ - add_positive(&cfs_rq->avg.runnable_avg, delta_avg); - add_positive(&cfs_rq->avg.runnable_sum, delta_sum); - /* See update_cfs_rq_load_avg() */ - cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum, - cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER); + __update_sa(&cfs_rq->avg, runnable, delta_avg, delta_sum); } static inline void @@ -4349,11 +4374,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq se->avg.load_sum = 
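The __update_sa() helper introduced above folds the add_positive()-plus-clamp pattern into one place. A simplified user-space model follows; the PELT_MIN_DIVIDER value, the struct, and the omission of the READ_ONCE/WRITE_ONCE accessors are stand-ins for illustration only.

#include <stdio.h>

#define PELT_MIN_DIVIDER 1000UL                 /* stand-in value, illustration only */

struct sched_avg_demo {                         /* simplified stand-in for struct sched_avg */
        unsigned long util_avg;
        unsigned long util_sum;
};

/* add_positive() without the READ_ONCE/WRITE_ONCE accessors: clamp underflow to 0. */
#define add_positive(ptr, val) do {                                     \
        long __v = (val);                                               \
        unsigned long __res = *(ptr) + __v;                             \
        if (__v < 0 && __res > *(ptr))                                  \
                __res = 0;                                              \
        *(ptr) = __res;                                                 \
} while (0)

/* Apply signed deltas, then keep the _sum member above _avg * PELT_MIN_DIVIDER: */
#define __update_sa(sa, name, delta_avg, delta_sum) do {                \
        add_positive(&(sa)->name##_avg, delta_avg);                     \
        add_positive(&(sa)->name##_sum, delta_sum);                     \
        if ((sa)->name##_sum < (sa)->name##_avg * PELT_MIN_DIVIDER)     \
                (sa)->name##_sum = (sa)->name##_avg * PELT_MIN_DIVIDER; \
} while (0)

int main(void)
{
        struct sched_avg_demo sa = { .util_avg = 100, .util_sum = 100 * PELT_MIN_DIVIDER + 50 };

        /* Detach a contribution whose rounding would drag util_sum below the floor: */
        __update_sa(&sa, util, -40, -40200L);

        printf("avg=%lu sum=%lu floor=%lu\n",
               sa.util_avg, sa.util_sum, sa.util_avg * PELT_MIN_DIVIDER);
        /* Prints: avg=60 sum=60000 floor=60000 -- the clamp kicked in. */
        return 0;
}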
runnable_sum; se->avg.load_avg = load_avg; - add_positive(&cfs_rq->avg.load_avg, delta_avg); - add_positive(&cfs_rq->avg.load_sum, delta_sum); - /* See update_cfs_rq_load_avg() */ - cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum, - cfs_rq->avg.load_avg * PELT_MIN_DIVIDER); + __update_sa(&cfs_rq->avg, load, delta_avg, delta_sum); } static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) @@ -4450,7 +4471,7 @@ static inline void migrate_se_pelt_lag(struct sched_entity *se) rq = rq_of(cfs_rq); rcu_read_lock(); - is_idle = is_idle_task(rcu_dereference(rq->curr)); + is_idle = is_idle_task(rcu_dereference_all(rq->curr)); rcu_read_unlock(); /* @@ -4552,33 +4573,13 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) raw_spin_unlock(&cfs_rq->removed.lock); r = removed_load; - sub_positive(&sa->load_avg, r); - sub_positive(&sa->load_sum, r * divider); - /* See sa->util_sum below */ - sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER); + __update_sa(sa, load, -r, -r*divider); r = removed_util; - sub_positive(&sa->util_avg, r); - sub_positive(&sa->util_sum, r * divider); - /* - * Because of rounding, se->util_sum might ends up being +1 more than - * cfs->util_sum. Although this is not a problem by itself, detaching - * a lot of tasks with the rounding problem between 2 updates of - * util_avg (~1ms) can make cfs->util_sum becoming null whereas - * cfs_util_avg is not. - * Check that util_sum is still above its lower bound for the new - * util_avg. Given that period_contrib might have moved since the last - * sync, we are only sure that util_sum must be above or equal to - * util_avg * minimum possible divider - */ - sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER); + __update_sa(sa, util, -r, -r*divider); r = removed_runnable; - sub_positive(&sa->runnable_avg, r); - sub_positive(&sa->runnable_sum, r * divider); - /* See sa->util_sum above */ - sa->runnable_sum = max_t(u32, sa->runnable_sum, - sa->runnable_avg * PELT_MIN_DIVIDER); + __update_sa(sa, runnable, -r, -r*divider); /* * removed_runnable is the unweighted version of removed_load so we @@ -4663,17 +4664,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { dequeue_load_avg(cfs_rq, se); - sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); - sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); - /* See update_cfs_rq_load_avg() */ - cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum, - cfs_rq->avg.util_avg * PELT_MIN_DIVIDER); - - sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg); - sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum); - /* See update_cfs_rq_load_avg() */ - cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum, - cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER); + __update_sa(&cfs_rq->avg, util, -se->avg.util_avg, -se->avg.util_sum); + __update_sa(&cfs_rq->avg, runnable, -se->avg.runnable_avg, -se->avg.runnable_sum); add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); @@ -5175,7 +5167,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * * vl_i = (W + w_i)*vl'_i / W */ - load = cfs_rq->avg_load; + load = cfs_rq->sum_weight; if (curr && curr->on_rq) load += scale_load_down(curr->load.weight); @@ -7148,7 +7140,7 @@ static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask); static struct { cpumask_var_t idle_cpus_mask; atomic_t nr_cpus; - int has_blocked; /* Idle CPUS has 
blocked load */ + int has_blocked_load; /* Idle CPUS has blocked load */ int needs_update; /* Newly idle CPUs need their next_balance collated */ unsigned long next_balance; /* in jiffy units */ unsigned long next_blocked; /* Next update of blocked load in jiffies */ @@ -7506,7 +7498,7 @@ static inline void set_idle_cores(int cpu, int val) { struct sched_domain_shared *sds; - sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); + sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu)); if (sds) WRITE_ONCE(sds->has_idle_cores, val); } @@ -7515,7 +7507,7 @@ static inline bool test_idle_cores(int cpu) { struct sched_domain_shared *sds; - sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); + sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu)); if (sds) return READ_ONCE(sds->has_idle_cores); @@ -7644,7 +7636,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); if (sched_feat(SIS_UTIL)) { - sd_share = rcu_dereference(per_cpu(sd_llc_shared, target)); + sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, target)); if (sd_share) { /* because !--nr is the condition to stop scan */ nr = READ_ONCE(sd_share->nr_idle_scan) + 1; @@ -7850,7 +7842,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) * sd_asym_cpucapacity rather than sd_llc. */ if (sched_asym_cpucap_active()) { - sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target)); + sd = rcu_dereference_all(per_cpu(sd_asym_cpucapacity, target)); /* * On an asymmetric CPU capacity system where an exclusive * cpuset defines a symmetric island (i.e. one unique @@ -7865,7 +7857,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) } } - sd = rcu_dereference(per_cpu(sd_llc, target)); + sd = rcu_dereference_all(per_cpu(sd_llc, target)); if (!sd) return target; @@ -8334,7 +8326,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) struct energy_env eenv; rcu_read_lock(); - pd = rcu_dereference(rd->pd); + pd = rcu_dereference_all(rd->pd); if (!pd) goto unlock; @@ -8342,7 +8334,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) * Energy-aware wake-up happens on the lowest sched_domain starting * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu. */ - sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity)); + sd = rcu_dereference_all(*this_cpu_ptr(&sd_asym_cpucapacity)); while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) sd = sd->parent; if (!sd) @@ -8744,7 +8736,7 @@ preempt_sync(struct rq *rq, int wake_flags, /* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags) +static void wakeup_preempt_fair(struct rq *rq, struct task_struct *p, int wake_flags) { enum preempt_wakeup_action preempt_action = PREEMPT_WAKEUP_PICK; struct task_struct *donor = rq->donor; @@ -8752,6 +8744,12 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int struct cfs_rq *cfs_rq = task_cfs_rq(donor); int cse_is_idle, pse_is_idle; + /* + * XXX Getting preempted by higher class, try and find idle CPU? 
+ */ + if (p->sched_class != &fair_sched_class) + return; + if (unlikely(se == pse)) return; @@ -9349,7 +9347,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env) */ static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { - struct numa_group *numa_group = rcu_dereference(p->numa_group); + struct numa_group *numa_group = rcu_dereference_all(p->numa_group); unsigned long src_weight, dst_weight; int src_nid, dst_nid, dist; @@ -9778,7 +9776,7 @@ static void attach_tasks(struct lb_env *env) } #ifdef CONFIG_NO_HZ_COMMON -static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) +static inline bool cfs_rq_has_blocked_load(struct cfs_rq *cfs_rq) { if (cfs_rq->avg.load_avg) return true; @@ -9811,16 +9809,16 @@ static inline void update_blocked_load_tick(struct rq *rq) WRITE_ONCE(rq->last_blocked_load_update_tick, jiffies); } -static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) +static inline void update_has_blocked_load_status(struct rq *rq, bool has_blocked_load) { - if (!has_blocked) + if (!has_blocked_load) rq->has_blocked_load = 0; } #else /* !CONFIG_NO_HZ_COMMON: */ -static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; } +static inline bool cfs_rq_has_blocked_load(struct cfs_rq *cfs_rq) { return false; } static inline bool others_have_blocked(struct rq *rq) { return false; } static inline void update_blocked_load_tick(struct rq *rq) {} -static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {} +static inline void update_has_blocked_load_status(struct rq *rq, bool has_blocked_load) {} #endif /* !CONFIG_NO_HZ_COMMON */ static bool __update_blocked_others(struct rq *rq, bool *done) @@ -9877,7 +9875,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done) list_del_leaf_cfs_rq(cfs_rq); /* Don't need periodic decay once load/util_avg are null */ - if (cfs_rq_has_blocked(cfs_rq)) + if (cfs_rq_has_blocked_load(cfs_rq)) *done = false; } @@ -9937,7 +9935,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done) bool decayed; decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq); - if (cfs_rq_has_blocked(cfs_rq)) + if (cfs_rq_has_blocked_load(cfs_rq)) *done = false; return decayed; @@ -9949,23 +9947,27 @@ static unsigned long task_h_load(struct task_struct *p) } #endif /* !CONFIG_FAIR_GROUP_SCHED */ -static void sched_balance_update_blocked_averages(int cpu) +static void __sched_balance_update_blocked_averages(struct rq *rq) { bool decayed = false, done = true; - struct rq *rq = cpu_rq(cpu); - struct rq_flags rf; - rq_lock_irqsave(rq, &rf); update_blocked_load_tick(rq); - update_rq_clock(rq); decayed |= __update_blocked_others(rq, &done); decayed |= __update_blocked_fair(rq, &done); - update_blocked_load_status(rq, !done); + update_has_blocked_load_status(rq, !done); if (decayed) cpufreq_update_util(rq, 0); - rq_unlock_irqrestore(rq, &rf); +} + +static void sched_balance_update_blocked_averages(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + guard(rq_lock_irqsave)(rq); + update_rq_clock(rq); + __sched_balance_update_blocked_averages(rq); } /********** Helpers for sched_balance_find_src_group ************************/ @@ -11025,7 +11027,7 @@ static void update_idle_cpu_scan(struct lb_env *env, if (env->sd->span_weight != llc_weight) return; - sd_share = rcu_dereference(per_cpu(sd_llc_shared, env->dst_cpu)); + sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, env->dst_cpu)); if (!sd_share) return; @@ -11375,7 +11377,7 @@ static struct sched_group 
*sched_balance_find_src_group(struct lb_env *env) goto force_balance; if (!is_rd_overutilized(env->dst_rq->rd) && - rcu_dereference(env->dst_rq->rd->pd)) + rcu_dereference_all(env->dst_rq->rd->pd)) goto out_balanced; /* ASYM feature bypasses nice load balance check */ @@ -12450,7 +12452,7 @@ static void nohz_balancer_kick(struct rq *rq) if (likely(!atomic_read(&nohz.nr_cpus))) return; - if (READ_ONCE(nohz.has_blocked) && + if (READ_ONCE(nohz.has_blocked_load) && time_after(now, READ_ONCE(nohz.next_blocked))) flags = NOHZ_STATS_KICK; @@ -12464,7 +12466,7 @@ static void nohz_balancer_kick(struct rq *rq) rcu_read_lock(); - sd = rcu_dereference(rq->sd); + sd = rcu_dereference_all(rq->sd); if (sd) { /* * If there's a runnable CFS task and the current CPU has reduced @@ -12476,7 +12478,7 @@ static void nohz_balancer_kick(struct rq *rq) } } - sd = rcu_dereference(per_cpu(sd_asym_packing, cpu)); + sd = rcu_dereference_all(per_cpu(sd_asym_packing, cpu)); if (sd) { /* * When ASYM_PACKING; see if there's a more preferred CPU @@ -12494,7 +12496,7 @@ static void nohz_balancer_kick(struct rq *rq) } } - sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu)); + sd = rcu_dereference_all(per_cpu(sd_asym_cpucapacity, cpu)); if (sd) { /* * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU @@ -12515,7 +12517,7 @@ static void nohz_balancer_kick(struct rq *rq) goto unlock; } - sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); + sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu)); if (sds) { /* * If there is an imbalance between LLC domains (IOW we could @@ -12547,7 +12549,7 @@ static void set_cpu_sd_state_busy(int cpu) struct sched_domain *sd; rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_llc, cpu)); + sd = rcu_dereference_all(per_cpu(sd_llc, cpu)); if (!sd || !sd->nohz_idle) goto unlock; @@ -12577,7 +12579,7 @@ static void set_cpu_sd_state_idle(int cpu) struct sched_domain *sd; rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_llc, cpu)); + sd = rcu_dereference_all(per_cpu(sd_llc, cpu)); if (!sd || sd->nohz_idle) goto unlock; @@ -12611,9 +12613,9 @@ void nohz_balance_enter_idle(int cpu) /* * The tick is still stopped but load could have been added in the - * meantime. We set the nohz.has_blocked flag to trig a check of the + * meantime. We set the nohz.has_blocked_load flag to trig a check of the * *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear - * of nohz.has_blocked can only happen after checking the new load + * of nohz.has_blocked_load can only happen after checking the new load */ if (rq->nohz_tick_stopped) goto out; @@ -12629,7 +12631,7 @@ void nohz_balance_enter_idle(int cpu) /* * Ensures that if nohz_idle_balance() fails to observe our - * @idle_cpus_mask store, it must observe the @has_blocked + * @idle_cpus_mask store, it must observe the @has_blocked_load * and @needs_update stores. */ smp_mb__after_atomic(); @@ -12642,7 +12644,7 @@ out: * Each time a cpu enter idle, we assume that it has blocked load and * enable the periodic update of the load of idle CPUs */ - WRITE_ONCE(nohz.has_blocked, 1); + WRITE_ONCE(nohz.has_blocked_load, 1); } static bool update_nohz_stats(struct rq *rq) @@ -12683,8 +12685,8 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags) /* * We assume there will be no idle load after this update and clear - * the has_blocked flag. If a cpu enters idle in the mean time, it will - * set the has_blocked flag and trigger another update of idle load. + * the has_blocked_load flag. 
If a cpu enters idle in the mean time, it will + * set the has_blocked_load flag and trigger another update of idle load. * Because a cpu that becomes idle, is added to idle_cpus_mask before * setting the flag, we are sure to not clear the state and not * check the load of an idle cpu. @@ -12692,12 +12694,12 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags) * Same applies to idle_cpus_mask vs needs_update. */ if (flags & NOHZ_STATS_KICK) - WRITE_ONCE(nohz.has_blocked, 0); + WRITE_ONCE(nohz.has_blocked_load, 0); if (flags & NOHZ_NEXT_KICK) WRITE_ONCE(nohz.needs_update, 0); /* - * Ensures that if we miss the CPU, we must see the has_blocked + * Ensures that if we miss the CPU, we must see the has_blocked_load * store from nohz_balance_enter_idle(). */ smp_mb(); @@ -12764,7 +12766,7 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags) abort: /* There is still blocked load, enable periodic update */ if (has_blocked_load) - WRITE_ONCE(nohz.has_blocked, 1); + WRITE_ONCE(nohz.has_blocked_load, 1); } /* @@ -12826,7 +12828,7 @@ static void nohz_newidle_balance(struct rq *this_rq) return; /* Don't need to update blocked load of idle CPUs*/ - if (!READ_ONCE(nohz.has_blocked) || + if (!READ_ONCE(nohz.has_blocked_load) || time_before(jiffies, READ_ONCE(nohz.next_blocked))) return; @@ -12896,29 +12898,28 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) */ rq_unpin_lock(this_rq, rf); - rcu_read_lock(); - sd = rcu_dereference_check_sched_domain(this_rq->sd); - if (!sd) { - rcu_read_unlock(); + sd = rcu_dereference_sched_domain(this_rq->sd); + if (!sd) goto out; - } if (!get_rd_overloaded(this_rq->rd) || this_rq->avg_idle < sd->max_newidle_lb_cost) { update_next_balance(sd, &next_balance); - rcu_read_unlock(); goto out; } - rcu_read_unlock(); - - rq_modified_clear(this_rq); - raw_spin_rq_unlock(this_rq); + /* + * Include sched_balance_update_blocked_averages() in the cost + * calculation because it can be quite costly -- this ensures we skip + * it when avg_idle gets to be very low. + */ t0 = sched_clock_cpu(this_cpu); - sched_balance_update_blocked_averages(this_cpu); + __sched_balance_update_blocked_averages(this_rq); + + this_rq->next_class = &fair_sched_class; + raw_spin_rq_unlock(this_rq); - rcu_read_lock(); for_each_domain(this_cpu, sd) { u64 domain_cost; @@ -12968,7 +12969,6 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) if (pulled_task || !continue_balancing) break; } - rcu_read_unlock(); raw_spin_rq_lock(this_rq); @@ -12984,7 +12984,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) pulled_task = 1; /* If a higher prio class was modified, restart the pick */ - if (rq_modified_above(this_rq, &fair_sched_class)) + if (sched_class_above(this_rq->next_class, &fair_sched_class)) pulled_task = -1; out: @@ -13335,8 +13335,8 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b, * zero_vruntime_fi, which would have been updated in prior calls * to se_fi_update(). 
*/ - delta = (s64)(sea->vruntime - seb->vruntime) + - (s64)(cfs_rqb->zero_vruntime_fi - cfs_rqa->zero_vruntime_fi); + delta = vruntime_op(sea->vruntime, "-", seb->vruntime) + + vruntime_op(cfs_rqb->zero_vruntime_fi, "-", cfs_rqa->zero_vruntime_fi); return delta > 0; } @@ -13374,6 +13374,12 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) entity_tick(cfs_rq, se, queued); } + if (queued) { + if (!need_resched()) + hrtick_start_fair(rq, curr); + return; + } + if (static_branch_unlikely(&sched_numa_balancing)) task_tick_numa(rq, curr); @@ -13882,15 +13888,12 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task * All the scheduling class methods: */ DEFINE_SCHED_CLASS(fair) = { - - .queue_mask = 2, - .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, .yield_to_task = yield_to_task_fair, - .wakeup_preempt = check_preempt_wakeup_fair, + .wakeup_preempt = wakeup_preempt_fair, .pick_task = pick_task_fair, .pick_next_task = pick_next_task_fair, @@ -13950,7 +13953,7 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) struct numa_group *ng; rcu_read_lock(); - ng = rcu_dereference(p->numa_group); + ng = rcu_dereference_all(p->numa_group); for_each_online_node(node) { if (p->numa_faults) { tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)]; diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index c174afe1dd177a..65eb8f8c1a5d3a 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -536,9 +536,6 @@ static void update_curr_idle(struct rq *rq) * Simple, special scheduling class for the per-CPU idle tasks: */ DEFINE_SCHED_CLASS(idle) = { - - .queue_mask = 0, - /* no enqueue/yield_task for idle tasks */ /* dequeue is not valid, we print a debug message there: */ diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index f1867fe8e5c535..0a9b2cd6da7208 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1615,6 +1615,12 @@ static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags) { struct task_struct *donor = rq->donor; + /* + * XXX If we're preempted by DL, queue a push? + */ + if (p->sched_class != &rt_sched_class) + return; + if (p->prio < donor->prio) { resched_curr(rq); return; @@ -2568,9 +2574,6 @@ static int task_is_throttled_rt(struct task_struct *p, int cpu) #endif /* CONFIG_SCHED_CORE */ DEFINE_SCHED_CLASS(rt) = { - - .queue_mask = 4, - .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, .yield_task = yield_task_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d30cca6870f5f9..bdb1e748fe5c97 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -670,16 +670,16 @@ struct balance_callback { void (*func)(struct rq *rq); }; -/* CFS-related fields in a runqueue */ +/* Fair scheduling SCHED_{NORMAL,BATCH,IDLE} related fields in a runqueue: */ struct cfs_rq { struct load_weight load; unsigned int nr_queued; - unsigned int h_nr_queued; /* SCHED_{NORMAL,BATCH,IDLE} */ - unsigned int h_nr_runnable; /* SCHED_{NORMAL,BATCH,IDLE} */ - unsigned int h_nr_idle; /* SCHED_IDLE */ + unsigned int h_nr_queued; /* SCHED_{NORMAL,BATCH,IDLE} */ + unsigned int h_nr_runnable; /* SCHED_{NORMAL,BATCH,IDLE} */ + unsigned int h_nr_idle; /* SCHED_IDLE */ - s64 avg_vruntime; - u64 avg_load; + s64 sum_w_vruntime; + u64 sum_weight; u64 zero_vruntime; #ifdef CONFIG_SCHED_CORE @@ -690,7 +690,7 @@ struct cfs_rq { struct rb_root_cached tasks_timeline; /* - * 'curr' points to currently running entity on this cfs_rq. 
+ * 'curr' points to the currently running entity on this cfs_rq. * It is set to NULL otherwise (i.e when none are currently running). */ struct sched_entity *curr; @@ -726,9 +726,7 @@ struct cfs_rq { unsigned long h_load; u64 last_h_load_update; struct sched_entity *h_load_next; -#endif /* CONFIG_FAIR_GROUP_SCHED */ -#ifdef CONFIG_FAIR_GROUP_SCHED struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */ /* @@ -741,19 +739,19 @@ struct cfs_rq { */ int on_list; struct list_head leaf_cfs_rq_list; - struct task_group *tg; /* group that "owns" this runqueue */ + struct task_group *tg; /* Group that "owns" this runqueue */ /* Locally cached copy of our task_group's idle value */ int idle; -#ifdef CONFIG_CFS_BANDWIDTH +# ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; s64 runtime_remaining; u64 throttled_pelt_idle; -#ifndef CONFIG_64BIT +# ifndef CONFIG_64BIT u64 throttled_pelt_idle_copy; -#endif +# endif u64 throttled_clock; u64 throttled_clock_pelt; u64 throttled_clock_pelt_time; @@ -765,7 +763,7 @@ struct cfs_rq { struct list_head throttled_list; struct list_head throttled_csd_list; struct list_head throttled_limbo_list; -#endif /* CONFIG_CFS_BANDWIDTH */ +# endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ }; @@ -1121,7 +1119,6 @@ struct rq { raw_spinlock_t __lock; /* Per class runqueue modification mask; bits in class order. */ - unsigned int queue_mask; unsigned int nr_running; #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; @@ -1181,6 +1178,7 @@ struct rq { struct sched_dl_entity *dl_server; struct task_struct *idle; struct task_struct *stop; + const struct sched_class *next_class; unsigned long next_balance; struct mm_struct *prev_mm; @@ -2010,8 +2008,8 @@ queue_balance_callback(struct rq *rq, rq->balance_callback = head; } -#define rcu_dereference_check_sched_domain(p) \ - rcu_dereference_check((p), lockdep_is_held(&sched_domains_mutex)) +#define rcu_dereference_sched_domain(p) \ + rcu_dereference_all_check((p), lockdep_is_held(&sched_domains_mutex)) /* * The domain tree (rq->sd) is protected by RCU's quiescent state transition. @@ -2021,7 +2019,7 @@ queue_balance_callback(struct rq *rq, * preempt-disabled sections. */ #define for_each_domain(cpu, __sd) \ - for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ + for (__sd = rcu_dereference_sched_domain(cpu_rq(cpu)->sd); \ __sd; __sd = __sd->parent) /* A mask of all the SD flags that have the SDF_SHARED_CHILD metaflag */ @@ -2428,15 +2426,6 @@ struct sched_class { #ifdef CONFIG_UCLAMP_TASK int uclamp_enabled; #endif - /* - * idle: 0 - * ext: 1 - * fair: 2 - * rt: 4 - * dl: 8 - * stop: 16 - */ - unsigned int queue_mask; /* * move_queued_task/activate_task/enqueue_task: rq->lock @@ -2595,20 +2584,6 @@ struct sched_class { #endif }; -/* - * Does not nest; only used around sched_class::pick_task() rq-lock-breaks. 
- */ -static inline void rq_modified_clear(struct rq *rq) -{ - rq->queue_mask = 0; -} - -static inline bool rq_modified_above(struct rq *rq, const struct sched_class * class) -{ - unsigned int mask = class->queue_mask; - return rq->queue_mask & ~((mask << 1) - 1); -} - static inline void put_prev_task(struct rq *rq, struct task_struct *prev) { WARN_ON_ONCE(rq->donor != prev); @@ -3901,6 +3876,7 @@ void move_queued_task_locked(struct rq *src_rq, struct rq *dst_rq, struct task_s deactivate_task(src_rq, task, 0); set_task_cpu(task, dst_rq->cpu); activate_task(dst_rq, task, 0); + wakeup_preempt(dst_rq, task, 0); } static inline @@ -3968,6 +3944,7 @@ extern void balance_callbacks(struct rq *rq, struct balance_callback *head); struct sched_change_ctx { u64 prio; struct task_struct *p; + const struct sched_class *class; int flags; bool queued; bool running; diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 4f9192be4b5b0a..f95798baddebbd 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -97,9 +97,6 @@ static void update_curr_stop(struct rq *rq) * Simple, special scheduling class for the per-CPU stop tasks: */ DEFINE_SCHED_CLASS(stop) = { - - .queue_mask = 16, - .enqueue_task = enqueue_task_stop, .dequeue_task = dequeue_task_stop, .yield_task = yield_task_stop, |
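The fair.c hunks above repeatedly collapse an add_positive()/sub_positive() pair followed by a PELT_MIN_DIVIDER clamp into the new __update_sa() helper. The standalone sketch below illustrates that add-and-clamp pattern in isolation, assuming nothing beyond what the hunks show; struct sa, MIN_DIVIDER, add_positive() and update_sa() here are illustrative stand-ins, not the kernel's definitions.

#include <stdio.h>

#define MIN_DIVIDER 1024	/* stand-in for PELT_MIN_DIVIDER */

struct sa {			/* stand-in for one avg/sum pair in struct sched_avg */
	unsigned long avg;
	unsigned long sum;
};

/* Add a signed delta to an unsigned counter, clamping at 0 on underflow. */
static void add_positive(unsigned long *ptr, long delta)
{
	unsigned long old = *ptr, res = old + (unsigned long)delta;

	if (delta < 0 && res > old)	/* wrapped below zero */
		res = 0;
	*ptr = res;
}

/*
 * The pattern __update_sa() factors out: apply the avg and sum deltas
 * together, then keep sum >= avg * minimum divider so rounding can never
 * leave sum at zero while avg is still non-zero.
 */
static void update_sa(struct sa *sa, long delta_avg, long delta_sum)
{
	add_positive(&sa->avg, delta_avg);
	add_positive(&sa->sum, delta_sum);
	if (sa->sum < sa->avg * MIN_DIVIDER)
		sa->sum = sa->avg * MIN_DIVIDER;
}

int main(void)
{
	struct sa sa = { .avg = 7, .sum = 2 };		/* artificially low sum to show the clamp */

	update_sa(&sa, -3, -3 * MIN_DIVIDER);
	printf("avg=%lu sum=%lu\n", sa.avg, sa.sum);	/* avg=4 sum=4096 */
	return 0;
}

The clamp is what keeps the *_sum side from hitting zero through rounding while the matching *_avg is still non-zero, which is the situation described by the comment block moved above __update_sa().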
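Several hunks also replace open-coded (s64)(a - b) tests on virtual times with vruntime_cmp()/vruntime_op(). Those helpers are introduced earlier in the patch and are not visible in this excerpt, so the sketch below only shows the wrap-safe comparison idiom they are assumed to wrap; vtime_sub() and vtime_before() are illustrative names.

#include <stdint.h>
#include <stdio.h>

/*
 * Signed difference of two u64 virtual times. The result is meaningful as
 * long as the two timestamps stay within 2^63 of each other, even after the
 * unsigned counters wrap around.
 */
static int64_t vtime_sub(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b);
}

static int vtime_before(uint64_t a, uint64_t b)	/* "a < b" in virtual time */
{
	return vtime_sub(a, b) < 0;
}

int main(void)
{
	uint64_t near_wrap = UINT64_MAX - 5, wrapped = 10;

	/* A plain '<' gets the wrapped case wrong; the signed difference does not. */
	printf("%d %d\n", near_wrap < wrapped, vtime_before(near_wrap, wrapped));	/* 0 1 */
	return 0;
}

Wrapping the idiom in a named helper, as the patch does, mostly buys readability: the relational operator appears literally at the call site instead of being encoded in the sign convention of a subtraction.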
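sched_balance_update_blocked_averages() is likewise converted from explicit rq_lock_irqsave()/rq_unlock_irqrestore() calls to guard(rq_lock_irqsave)(rq), the kernel's scope-based lock guard from <linux/cleanup.h>. The userspace sketch below shows the underlying mechanism, a cleanup attribute that releases the lock when the guard variable goes out of scope; scoped_guard_mutex(), unlock_cleanup() and my_lock are illustrative names, not kernel API.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t my_lock = PTHREAD_MUTEX_INITIALIZER;
static int counter;

static void unlock_cleanup(pthread_mutex_t **l)
{
	pthread_mutex_unlock(*l);
}

/* Take the lock now; the cleanup attribute drops it when the scope ends. */
#define scoped_guard_mutex(l)						\
	pthread_mutex_t *guard_var					\
	__attribute__((cleanup(unlock_cleanup), unused)) =		\
		(pthread_mutex_lock(l), (l))

static void bump(void)
{
	scoped_guard_mutex(&my_lock);	/* unlocked automatically on every return path */
	counter++;
}

int main(void)
{
	bump();
	printf("counter=%d\n", counter);
	return 0;
}

The scope-based form removes the manual unlock bookkeeping of the old code, which pairs naturally with splitting the body out as __sched_balance_update_blocked_averages() so sched_balance_newidle() can call it with the rq lock already held.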
