-rw-r--r--   Documentation/arch/x86/boot.rst    198
-rw-r--r--   arch/x86/include/asm/uv/bios.h       2
-rw-r--r--   arch/x86/kernel/cpu/amd.c           40
-rw-r--r--   arch/x86/kernel/cpu/centaur.c        6
-rw-r--r--   arch/x86/kernel/cpu/cyrix.c          6
-rw-r--r--   arch/x86/kernel/cpu/hygon.c          6
-rw-r--r--   arch/x86/kernel/cpu/sgx/ioctl.c      2
-rw-r--r--   arch/x86/kernel/fpu/xstate.c         4
-rw-r--r--   include/linux/compiler_types.h      19
-rw-r--r--   include/linux/mm_types.h             1
-rw-r--r--   include/linux/sched.h               13
-rw-r--r--   include/trace/events/tlb.h           5
-rw-r--r--   kernel/irq/manage.c                  2
-rw-r--r--   kernel/sched/core.c                 45
-rw-r--r--   kernel/sched/deadline.c             14
-rw-r--r--   kernel/sched/ext.c                   7
-rw-r--r--   kernel/sched/fair.c                361
-rw-r--r--   kernel/sched/idle.c                  3
-rw-r--r--   kernel/sched/rt.c                    9
-rw-r--r--   kernel/sched/sched.h                59
-rw-r--r--   kernel/sched/stop_task.c             3
21 files changed, 395 insertions(+), 410 deletions(-)
diff --git a/Documentation/arch/x86/boot.rst b/Documentation/arch/x86/boot.rst
index 6d36ce86fd8ec0..dca3875a24351e 100644
--- a/Documentation/arch/x86/boot.rst
+++ b/Documentation/arch/x86/boot.rst
@@ -95,26 +95,26 @@ Memory Layout
The traditional memory map for the kernel loader, used for Image or
zImage kernels, typically looks like::
- | |
+ | |
0A0000 +------------------------+
- | Reserved for BIOS | Do not use. Reserved for BIOS EBDA.
+ | Reserved for BIOS | Do not use. Reserved for BIOS EBDA.
09A000 +------------------------+
- | Command line |
- | Stack/heap | For use by the kernel real-mode code.
+ | Command line |
+ | Stack/heap | For use by the kernel real-mode code.
098000 +------------------------+
- | Kernel setup | The kernel real-mode code.
+ | Kernel setup | The kernel real-mode code.
090200 +------------------------+
- | Kernel boot sector | The kernel legacy boot sector.
+ | Kernel boot sector | The kernel legacy boot sector.
090000 +------------------------+
- | Protected-mode kernel | The bulk of the kernel image.
+ | Protected-mode kernel | The bulk of the kernel image.
010000 +------------------------+
- | Boot loader | <- Boot sector entry point 0000:7C00
+ | Boot loader | <- Boot sector entry point 0000:7C00
001000 +------------------------+
- | Reserved for MBR/BIOS |
+ | Reserved for MBR/BIOS |
000800 +------------------------+
- | Typically used by MBR |
+ | Typically used by MBR |
000600 +------------------------+
- | BIOS use only |
+ | BIOS use only |
000000 +------------------------+
When using bzImage, the protected-mode kernel was relocated to
@@ -142,27 +142,27 @@ above the 0x9A000 point; too many BIOSes will break above that point.
For a modern bzImage kernel with boot protocol version >= 2.02, a
memory layout like the following is suggested::
- ~ ~
- | Protected-mode kernel |
+ ~ ~
+ | Protected-mode kernel |
100000 +------------------------+
- | I/O memory hole |
+ | I/O memory hole |
0A0000 +------------------------+
- | Reserved for BIOS | Leave as much as possible unused
- ~ ~
- | Command line | (Can also be below the X+10000 mark)
+ | Reserved for BIOS | Leave as much as possible unused
+ ~ ~
+ | Command line | (Can also be below the X+10000 mark)
X+10000 +------------------------+
- | Stack/heap | For use by the kernel real-mode code.
+ | Stack/heap | For use by the kernel real-mode code.
X+08000 +------------------------+
- | Kernel setup | The kernel real-mode code.
- | Kernel boot sector | The kernel legacy boot sector.
+ | Kernel setup | The kernel real-mode code.
+ | Kernel boot sector | The kernel legacy boot sector.
X +------------------------+
- | Boot loader | <- Boot sector entry point 0000:7C00
+ | Boot loader | <- Boot sector entry point 0000:7C00
001000 +------------------------+
- | Reserved for MBR/BIOS |
+ | Reserved for MBR/BIOS |
000800 +------------------------+
- | Typically used by MBR |
+ | Typically used by MBR |
000600 +------------------------+
- | BIOS use only |
+ | BIOS use only |
000000 +------------------------+
... where the address X is as low as the design of the boot loader permits.
@@ -433,7 +433,7 @@ Protocol: 2.00+
Assigned boot loader IDs:
- == =======================================
+ ==== =======================================
0x0 LILO
(0x00 reserved for pre-2.00 bootloader)
0x1 Loadlin
@@ -456,7 +456,7 @@ Protocol: 2.00+
<http://sebastian-plotz.blogspot.de>
0x12 OVMF UEFI virtualization stack
0x13 barebox
- == =======================================
+ ==== =======================================
Please contact <hpa@zytor.com> if you need a bootloader ID value assigned.
@@ -809,12 +809,12 @@ Protocol: 2.09+
as follow::
struct setup_data {
- __u64 next;
- __u32 type;
- __u32 len;
- __u8 data[];
+ __u64 next;
+ __u32 type;
+ __u32 len;
+ __u8 data[];
}
-
+
Where, the next is a 64-bit physical pointer to the next node of
linked list, the next field of the last node is 0; the type is used
to identify the contents of data; the len is the length of data
@@ -835,10 +835,10 @@ Protocol: 2.09+
protocol 2.15::
struct setup_indirect {
- __u32 type;
- __u32 reserved; /* Reserved, must be set to zero. */
- __u64 len;
- __u64 addr;
+ __u32 type;
+ __u32 reserved; /* Reserved, must be set to zero. */
+ __u64 len;
+ __u64 addr;
};
The type member is a SETUP_INDIRECT | SETUP_* type. However, it cannot be
@@ -850,15 +850,15 @@ Protocol: 2.09+
In this case setup_data and setup_indirect will look like this::
struct setup_data {
- .next = 0, /* or <addr_of_next_setup_data_struct> */
- .type = SETUP_INDIRECT,
- .len = sizeof(setup_indirect),
- .data[sizeof(setup_indirect)] = (struct setup_indirect) {
- .type = SETUP_INDIRECT | SETUP_E820_EXT,
- .reserved = 0,
- .len = <len_of_SETUP_E820_EXT_data>,
- .addr = <addr_of_SETUP_E820_EXT_data>,
- },
+ .next = 0, /* or <addr_of_next_setup_data_struct> */
+ .type = SETUP_INDIRECT,
+ .len = sizeof(setup_indirect),
+ .data[sizeof(setup_indirect)] = (struct setup_indirect) {
+ .type = SETUP_INDIRECT | SETUP_E820_EXT,
+ .reserved = 0,
+ .len = <len_of_SETUP_E820_EXT_data>,
+ .addr = <addr_of_SETUP_E820_EXT_data>,
+ },
}
.. note::
@@ -897,11 +897,11 @@ Offset/size: 0x260/4
The kernel runtime start address is determined by the following algorithm::
if (relocatable_kernel) {
- if (load_address < pref_address)
- load_address = pref_address;
- runtime_start = align_up(load_address, kernel_alignment);
+ if (load_address < pref_address)
+ load_address = pref_address;
+ runtime_start = align_up(load_address, kernel_alignment);
} else {
- runtime_start = pref_address;
+ runtime_start = pref_address;
}
Hence the necessary memory window location and size can be estimated by
@@ -975,22 +975,22 @@ after kernel_info_var_len_data label. Each chunk of variable size data has to
be prefixed with header/magic and its size, e.g.::
kernel_info:
- .ascii "LToP" /* Header, Linux top (structure). */
- .long kernel_info_var_len_data - kernel_info
- .long kernel_info_end - kernel_info
- .long 0x01234567 /* Some fixed size data for the bootloaders. */
+ .ascii "LToP" /* Header, Linux top (structure). */
+ .long kernel_info_var_len_data - kernel_info
+ .long kernel_info_end - kernel_info
+ .long 0x01234567 /* Some fixed size data for the bootloaders. */
kernel_info_var_len_data:
example_struct: /* Some variable size data for the bootloaders. */
- .ascii "0123" /* Header/Magic. */
- .long example_struct_end - example_struct
- .ascii "Struct"
- .long 0x89012345
+ .ascii "0123" /* Header/Magic. */
+ .long example_struct_end - example_struct
+ .ascii "Struct"
+ .long 0x89012345
example_struct_end:
example_strings: /* Some variable size data for the bootloaders. */
- .ascii "ABCD" /* Header/Magic. */
- .long example_strings_end - example_strings
- .asciz "String_0"
- .asciz "String_1"
+ .ascii "ABCD" /* Header/Magic. */
+ .long example_strings_end - example_strings
+ .asciz "String_0"
+ .asciz "String_1"
example_strings_end:
kernel_info_end:
@@ -1132,53 +1132,53 @@ Such a boot loader should enter the following fields in the header::
unsigned long base_ptr; /* base address for real-mode segment */
if (setup_sects == 0)
- setup_sects = 4;
+ setup_sects = 4;
if (protocol >= 0x0200) {
- type_of_loader = <type code>;
- if (loading_initrd) {
- ramdisk_image = <initrd_address>;
- ramdisk_size = <initrd_size>;
- }
-
- if (protocol >= 0x0202 && loadflags & 0x01)
- heap_end = 0xe000;
- else
- heap_end = 0x9800;
-
- if (protocol >= 0x0201) {
- heap_end_ptr = heap_end - 0x200;
- loadflags |= 0x80; /* CAN_USE_HEAP */
- }
-
- if (protocol >= 0x0202) {
- cmd_line_ptr = base_ptr + heap_end;
- strcpy(cmd_line_ptr, cmdline);
- } else {
- cmd_line_magic = 0xA33F;
- cmd_line_offset = heap_end;
- setup_move_size = heap_end + strlen(cmdline) + 1;
- strcpy(base_ptr + cmd_line_offset, cmdline);
- }
+ type_of_loader = <type code>;
+ if (loading_initrd) {
+ ramdisk_image = <initrd_address>;
+ ramdisk_size = <initrd_size>;
+ }
+
+ if (protocol >= 0x0202 && loadflags & 0x01)
+ heap_end = 0xe000;
+ else
+ heap_end = 0x9800;
+
+ if (protocol >= 0x0201) {
+ heap_end_ptr = heap_end - 0x200;
+ loadflags |= 0x80; /* CAN_USE_HEAP */
+ }
+
+ if (protocol >= 0x0202) {
+ cmd_line_ptr = base_ptr + heap_end;
+ strcpy(cmd_line_ptr, cmdline);
+ } else {
+ cmd_line_magic = 0xA33F;
+ cmd_line_offset = heap_end;
+ setup_move_size = heap_end + strlen(cmdline) + 1;
+ strcpy(base_ptr + cmd_line_offset, cmdline);
+ }
} else {
- /* Very old kernel */
+ /* Very old kernel */
- heap_end = 0x9800;
+ heap_end = 0x9800;
- cmd_line_magic = 0xA33F;
- cmd_line_offset = heap_end;
+ cmd_line_magic = 0xA33F;
+ cmd_line_offset = heap_end;
- /* A very old kernel MUST have its real-mode code loaded at 0x90000 */
- if (base_ptr != 0x90000) {
- /* Copy the real-mode kernel */
- memcpy(0x90000, base_ptr, (setup_sects + 1) * 512);
- base_ptr = 0x90000; /* Relocated */
- }
+ /* A very old kernel MUST have its real-mode code loaded at 0x90000 */
+ if (base_ptr != 0x90000) {
+ /* Copy the real-mode kernel */
+ memcpy(0x90000, base_ptr, (setup_sects + 1) * 512);
+ base_ptr = 0x90000; /* Relocated */
+ }
- strcpy(0x90000 + cmd_line_offset, cmdline);
+ strcpy(0x90000 + cmd_line_offset, cmdline);
- /* It is recommended to clear memory up to the 32K mark */
- memset(0x90000 + (setup_sects + 1) * 512, 0, (64 - (setup_sects + 1)) * 512);
+ /* It is recommended to clear memory up to the 32K mark */
+ memset(0x90000 + (setup_sects + 1) * 512, 0, (64 - (setup_sects + 1)) * 512);
}
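
As an illustration (not part of the patch): the setup_data text above describes a singly linked list of physically addressed nodes, terminated by next == 0. Below is a minimal, free-standing C sketch of a consumer walking such a list; phys_to_virt_example() is a hypothetical placeholder for whatever physical-to-virtual mapping the environment provides:

#include <stdint.h>

struct setup_data {
	uint64_t next;	/* physical address of the next node; 0 ends the list */
	uint32_t type;	/* SETUP_* value identifying the payload in data[] */
	uint32_t len;	/* length of data[] in bytes */
	uint8_t  data[];
};

/* Placeholder: a real consumer would properly map the physical address. */
static void *phys_to_virt_example(uint64_t phys)
{
	return (void *)(uintptr_t)phys;
}

/* Visit every node and hand its payload to a callback. */
static void walk_setup_data(uint64_t first_phys,
			    void (*visit)(uint32_t type, const uint8_t *data, uint32_t len))
{
	uint64_t phys = first_phys;

	while (phys) {
		const struct setup_data *sd = phys_to_virt_example(phys);

		visit(sd->type, sd->data, sd->len);
		phys = sd->next;
	}
}
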
diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h
index 6989b824fd321b..d0b62e2552902c 100644
--- a/arch/x86/include/asm/uv/bios.h
+++ b/arch/x86/include/asm/uv/bios.h
@@ -122,7 +122,7 @@ struct uv_systab {
struct {
u32 type:8; /* type of entry */
u32 offset:24; /* byte offset from struct start to entry */
- } entry[1]; /* additional entries follow */
+ } entry[]; /* additional entries follow */
};
extern struct uv_systab *uv_systab;
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index bc94ff1e250ad2..c792c2afd84998 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -873,8 +873,8 @@ static void init_amd_bd(struct cpuinfo_x86 *c)
}
static const struct x86_cpu_id erratum_1386_microcode[] = {
- X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, 0x17, 0x01), 0x2, 0x2, 0x0800126e),
- X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, 0x17, 0x31), 0x0, 0x0, 0x08301052),
+ ZEN_MODEL_STEP_UCODE(0x17, 0x01, 0x2, 0x0800126e),
+ ZEN_MODEL_STEP_UCODE(0x17, 0x31, 0x0, 0x08301052),
{}
};
@@ -951,26 +951,14 @@ static void init_amd_zen1(struct cpuinfo_x86 *c)
}
}
-static bool cpu_has_zenbleed_microcode(void)
-{
- u32 good_rev = 0;
-
- switch (boot_cpu_data.x86_model) {
- case 0x30 ... 0x3f: good_rev = 0x0830107b; break;
- case 0x60 ... 0x67: good_rev = 0x0860010c; break;
- case 0x68 ... 0x6f: good_rev = 0x08608107; break;
- case 0x70 ... 0x7f: good_rev = 0x08701033; break;
- case 0xa0 ... 0xaf: good_rev = 0x08a00009; break;
-
- default:
- return false;
- }
-
- if (boot_cpu_data.microcode < good_rev)
- return false;
-
- return true;
-}
+static const struct x86_cpu_id amd_zenbleed_microcode[] = {
+ ZEN_MODEL_STEP_UCODE(0x17, 0x31, 0x0, 0x0830107b),
+ ZEN_MODEL_STEP_UCODE(0x17, 0x60, 0x1, 0x0860010c),
+ ZEN_MODEL_STEP_UCODE(0x17, 0x68, 0x1, 0x08608107),
+ ZEN_MODEL_STEP_UCODE(0x17, 0x71, 0x0, 0x08701033),
+ ZEN_MODEL_STEP_UCODE(0x17, 0xa0, 0x0, 0x08a00009),
+ {}
+};
static void zen2_zenbleed_check(struct cpuinfo_x86 *c)
{
@@ -980,7 +968,7 @@ static void zen2_zenbleed_check(struct cpuinfo_x86 *c)
if (!cpu_has(c, X86_FEATURE_AVX))
return;
- if (!cpu_has_zenbleed_microcode()) {
+ if (!x86_match_min_microcode_rev(amd_zenbleed_microcode)) {
pr_notice_once("Zenbleed: please update your microcode for the most optimal fix\n");
msr_set_bit(MSR_AMD64_DE_CFG, MSR_AMD64_DE_CFG_ZEN2_FP_BACKUP_FIX_BIT);
} else {
@@ -1063,12 +1051,6 @@ static void init_amd(struct cpuinfo_x86 *c)
early_init_amd(c);
- /*
- * Bit 31 in normal CPUID used for nonstandard 3DNow ID;
- * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
- */
- clear_cpu_cap(c, 0*32+31);
-
if (c->x86 >= 0x10)
set_cpu_cap(c, X86_FEATURE_REP_GOOD);
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index a3b55db35c9612..c8398940b97526 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -119,12 +119,6 @@ static void init_centaur(struct cpuinfo_x86 *c)
u32 fcr_clr = 0;
u32 lo, hi, newlo;
u32 aa, bb, cc, dd;
-
- /*
- * Bit 31 in normal CPUID used for nonstandard 3DNow ID;
- * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
- */
- clear_cpu_cap(c, 0*32+31);
#endif
early_init_centaur(c);
init_intel_cacheinfo(c);
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index dfec2c61e3547d..8f22085c4dd23a 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -195,12 +195,6 @@ static void init_cyrix(struct cpuinfo_x86 *c)
char *buf = c->x86_model_id;
const char *p = NULL;
- /*
- * Bit 31 in normal CPUID used for nonstandard 3DNow ID;
- * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
- */
- clear_cpu_cap(c, 0*32+31);
-
/* Cyrix used bit 24 in extended (AMD) CPUID for Cyrix MMX extensions */
if (test_cpu_cap(c, 1*32+24)) {
clear_cpu_cap(c, 1*32+24);
diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c
index 1fda6c3a2b65a7..7f95a74e4c6576 100644
--- a/arch/x86/kernel/cpu/hygon.c
+++ b/arch/x86/kernel/cpu/hygon.c
@@ -174,12 +174,6 @@ static void init_hygon(struct cpuinfo_x86 *c)
early_init_hygon(c);
- /*
- * Bit 31 in normal CPUID used for nonstandard 3DNow ID;
- * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
- */
- clear_cpu_cap(c, 0*32+31);
-
set_cpu_cap(c, X86_FEATURE_REP_GOOD);
/*
diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
index 66f1efa16fbb72..9322a9287dc7f5 100644
--- a/arch/x86/kernel/cpu/sgx/ioctl.c
+++ b/arch/x86/kernel/cpu/sgx/ioctl.c
@@ -242,7 +242,7 @@ static int __sgx_encl_add_page(struct sgx_encl *encl,
/*
* If the caller requires measurement of the page as a proof for the content,
* use EEXTEND to add a measurement for 256 bytes of the page. Repeat this
- * operation until the entire page is measured."
+ * operation until the entire page is measured.
*/
static int __sgx_encl_extend(struct sgx_encl *encl,
struct sgx_epc_page *epc_page)
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 48113c5193aa3c..76153dfb58c9d0 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -1946,7 +1946,7 @@ static int dump_xsave_layout_desc(struct coredump_params *cprm)
};
if (!dump_emit(cprm, &xc, sizeof(xc)))
- return 0;
+ return -1;
num_records++;
}
@@ -1984,7 +1984,7 @@ int elf_coredump_extra_notes_write(struct coredump_params *cprm)
return 1;
num_records = dump_xsave_layout_desc(cprm);
- if (!num_records)
+ if (num_records < 0)
return 1;
/* Total size should be equal to the number of records */
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 1280693766b9dd..280b4ac0990ff3 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -586,6 +586,25 @@ struct ftrace_likely_data {
__scalar_type_to_expr_cases(long long), \
default: (x)))
+/*
+ * __signed_scalar_typeof(x) - Declare a signed scalar type, leaving
+ * non-scalar types unchanged.
+ */
+
+#define __scalar_type_to_signed_cases(type) \
+ unsigned type: (signed type)0, \
+ signed type: (signed type)0
+
+#define __signed_scalar_typeof(x) typeof( \
+ _Generic((x), \
+ char: (signed char)0, \
+ __scalar_type_to_signed_cases(char), \
+ __scalar_type_to_signed_cases(short), \
+ __scalar_type_to_signed_cases(int), \
+ __scalar_type_to_signed_cases(long), \
+ __scalar_type_to_signed_cases(long long), \
+ default: (x)))
+
/* Is this type a native word size -- useful for atomic operations */
#define __native_word(t) \
(sizeof(t) == sizeof(char) || sizeof(t) == sizeof(short) || \
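
For illustration only (not part of the patch): __signed_scalar_typeof() maps an unsigned scalar type to its signed counterpart and leaves non-scalar types untouched. A minimal user-space sketch of the same _Generic() construction, buildable with GCC or Clang:

#include <stdio.h>

#define __scalar_type_to_signed_cases(type) \
	unsigned type: (signed type)0,      \
	signed type: (signed type)0

#define __signed_scalar_typeof(x) __typeof__(            \
	_Generic((x),                                    \
		char: (signed char)0,                    \
		__scalar_type_to_signed_cases(char),     \
		__scalar_type_to_signed_cases(short),    \
		__scalar_type_to_signed_cases(int),      \
		__scalar_type_to_signed_cases(long),     \
		__scalar_type_to_signed_cases(long long),\
		default: (x)))

int main(void)
{
	unsigned long u = 5;
	__signed_scalar_typeof(u) s = -3;	/* 's' has type 'long', so it can hold -3 */

	printf("%ld\n", (long)s);		/* prints -3 */
	return 0;
}
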
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 9f6de068295d30..42af2292951d4f 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1631,7 +1631,6 @@ enum tlb_flush_reason {
TLB_LOCAL_MM_SHOOTDOWN,
TLB_REMOTE_SEND_IPI,
TLB_REMOTE_WRONG_CPU,
- NR_TLB_FLUSH_REASONS,
};
/**
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d395f2810facbe..bf96a7d595e226 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -586,15 +586,10 @@ struct sched_entity {
u64 sum_exec_runtime;
u64 prev_sum_exec_runtime;
u64 vruntime;
- union {
- /*
- * When !@on_rq this field is vlag.
- * When cfs_rq->curr == se (which implies @on_rq)
- * this field is vprot. See protect_slice().
- */
- s64 vlag;
- u64 vprot;
- };
+ /* Approximated virtual lag: */
+ s64 vlag;
+ /* 'Protected' deadline, to give out minimum quantums: */
+ u64 vprot;
u64 slice;
u64 nr_migrations;
diff --git a/include/trace/events/tlb.h b/include/trace/events/tlb.h
index b4d8e7dc38f880..fb836951168564 100644
--- a/include/trace/events/tlb.h
+++ b/include/trace/events/tlb.h
@@ -12,8 +12,9 @@
EM( TLB_FLUSH_ON_TASK_SWITCH, "flush on task switch" ) \
EM( TLB_REMOTE_SHOOTDOWN, "remote shootdown" ) \
EM( TLB_LOCAL_SHOOTDOWN, "local shootdown" ) \
- EM( TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" ) \
- EMe( TLB_REMOTE_SEND_IPI, "remote ipi send" )
+ EM( TLB_LOCAL_MM_SHOOTDOWN, "local MM shootdown" ) \
+ EM( TLB_REMOTE_SEND_IPI, "remote IPI send" ) \
+ EMe( TLB_REMOTE_WRONG_CPU, "remote wrong CPU" )
/*
* First define the enums in TLB_FLUSH_REASON to be exported to userspace
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 8b1b4c8a4f54c5..349ae7979da0e3 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1414,7 +1414,7 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary)
* Ensure the thread adjusts the affinity once it reaches the
* thread function.
*/
- new->thread_flags = BIT(IRQTF_AFFINITY);
+ set_bit(IRQTF_AFFINITY, &new->thread_flags);
return 0;
}
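
Aside (not part of the patch): a plausible reason for preferring set_bit() over the removed plain assignment is that a read-modify-write only adds the requested bit (atomically, in the kernel's case), whereas a direct store wipes out any thread_flags bits that may already be set. A tiny, non-atomic user-space illustration of the difference:

#include <stdio.h>

#define BIT(n)	(1UL << (n))

int main(void)
{
	unsigned long flags = BIT(0);			/* some flag bit already set */

	unsigned long plain_store = BIT(3);		/* 'flags = BIT(3)': bit 0 is lost */
	unsigned long read_modify = flags | BIT(3);	/* set_bit() style: bit 0 survives */

	printf("plain store:       %#lx\n", plain_store);
	printf("read-modify-write: %#lx\n", read_modify);
	return 0;
}
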
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 41ba0be1691174..7d0a862a8c75ca 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2090,7 +2090,6 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
*/
uclamp_rq_inc(rq, p, flags);
- rq->queue_mask |= p->sched_class->queue_mask;
p->sched_class->enqueue_task(rq, p, flags);
psi_enqueue(p, flags);
@@ -2123,7 +2122,6 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
* and mark the task ->sched_delayed.
*/
uclamp_rq_dec(rq, p);
- rq->queue_mask |= p->sched_class->queue_mask;
return p->sched_class->dequeue_task(rq, p, flags);
}
@@ -2174,10 +2172,14 @@ void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags)
{
struct task_struct *donor = rq->donor;
- if (p->sched_class == donor->sched_class)
- donor->sched_class->wakeup_preempt(rq, p, flags);
- else if (sched_class_above(p->sched_class, donor->sched_class))
+ if (p->sched_class == rq->next_class) {
+ rq->next_class->wakeup_preempt(rq, p, flags);
+
+ } else if (sched_class_above(p->sched_class, rq->next_class)) {
+ rq->next_class->wakeup_preempt(rq, p, flags);
resched_curr(rq);
+ rq->next_class = p->sched_class;
+ }
/*
* A queue event has occurred, and we're going to schedule. In
@@ -6804,6 +6806,7 @@ static void __sched notrace __schedule(int sched_mode)
pick_again:
next = pick_next_task(rq, rq->donor, &rf);
rq_set_donor(rq, next);
+ rq->next_class = next->sched_class;
if (unlikely(task_is_blocked(next))) {
next = find_proxy_task(rq, next, &rf);
if (!next)
@@ -8650,6 +8653,8 @@ void __init sched_init(void)
rq->rt.rt_runtime = global_rt_runtime();
init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
#endif
+ rq->next_class = &idle_sched_class;
+
rq->sd = NULL;
rq->rd = NULL;
rq->cpu_capacity = SCHED_CAPACITY_SCALE;
@@ -10775,13 +10780,12 @@ struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int
flags |= DEQUEUE_NOCLOCK;
}
- if (flags & DEQUEUE_CLASS) {
- if (p->sched_class->switching_from)
- p->sched_class->switching_from(rq, p);
- }
+ if ((flags & DEQUEUE_CLASS) && p->sched_class->switching_from)
+ p->sched_class->switching_from(rq, p);
*ctx = (struct sched_change_ctx){
.p = p,
+ .class = p->sched_class,
.flags = flags,
.queued = task_on_rq_queued(p),
.running = task_current_donor(rq, p),
@@ -10812,6 +10816,11 @@ void sched_change_end(struct sched_change_ctx *ctx)
lockdep_assert_rq_held(rq);
+ /*
+ * Changing class without *QUEUE_CLASS is bad.
+ */
+ WARN_ON_ONCE(p->sched_class != ctx->class && !(ctx->flags & ENQUEUE_CLASS));
+
if ((ctx->flags & ENQUEUE_CLASS) && p->sched_class->switching_to)
p->sched_class->switching_to(rq, p);
@@ -10823,6 +10832,24 @@ void sched_change_end(struct sched_change_ctx *ctx)
if (ctx->flags & ENQUEUE_CLASS) {
if (p->sched_class->switched_to)
p->sched_class->switched_to(rq, p);
+
+ /*
+ * If this was a class promotion; let the old class know it
+ * got preempted. Note that none of the switch*_from() methods
+ * know the new class and none of the switch*_to() methods
+ * know the old class.
+ */
+ if (ctx->running && sched_class_above(p->sched_class, ctx->class)) {
+ rq->next_class->wakeup_preempt(rq, p, 0);
+ rq->next_class = p->sched_class;
+ }
+
+ /*
+ * If this was a degradation in class, someone should have set
+ * need_resched by now.
+ */
+ WARN_ON_ONCE(sched_class_above(ctx->class, p->sched_class) &&
+ !test_tsk_need_resched(p));
} else {
p->sched_class->prio_changed(rq, p, ctx->prio);
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 319439fe187026..80c9559a3e30ea 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2499,9 +2499,16 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
* Only called when both the current and waking task are -deadline
* tasks.
*/
-static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p,
- int flags)
+static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags)
{
+ /*
+ * Can only get preempted by stop-class, and those should be
+ * few and short lived, doesn't really make sense to push
+ * anything away for that.
+ */
+ if (p->sched_class != &dl_sched_class)
+ return;
+
if (dl_entity_preempt(&p->dl, &rq->donor->dl)) {
resched_curr(rq);
return;
@@ -3346,9 +3353,6 @@ static int task_is_throttled_dl(struct task_struct *p, int cpu)
#endif
DEFINE_SCHED_CLASS(dl) = {
-
- .queue_mask = 8,
-
.enqueue_task = enqueue_task_dl,
.dequeue_task = dequeue_task_dl,
.yield_task = yield_task_dl,
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 05f5a49e9649a9..305877745e331c 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2431,7 +2431,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
/* see kick_cpus_irq_workfn() */
smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
- rq_modified_clear(rq);
+ rq->next_class = &fair_sched_class;
rq_unpin_lock(rq, rf);
balance_one(rq, prev);
@@ -3075,7 +3075,8 @@ static void switched_from_scx(struct rq *rq, struct task_struct *p)
scx_disable_task(p);
}
-static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {}
+static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_flags) {}
+
static void switched_to_scx(struct rq *rq, struct task_struct *p) {}
int scx_check_setscheduler(struct task_struct *p, int policy)
@@ -3336,8 +3337,6 @@ static void scx_cgroup_unlock(void) {}
* their current sched_class. Call them directly from sched core instead.
*/
DEFINE_SCHED_CLASS(ext) = {
- .queue_mask = 1,
-
.enqueue_task = enqueue_task_scx,
.dequeue_task = dequeue_task_scx,
.yield_task = yield_task_scx,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index da46c316453783..76f5e4b78b3069 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -524,10 +524,48 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
* Scheduling class tree data structure manipulation methods:
*/
+extern void __BUILD_BUG_vruntime_cmp(void);
+
+/* Use __builtin_strcmp() because of __HAVE_ARCH_STRCMP: */
+
+#define vruntime_cmp(A, CMP_STR, B) ({ \
+ int __res = 0; \
+ \
+ if (!__builtin_strcmp(CMP_STR, "<")) { \
+ __res = ((s64)((A)-(B)) < 0); \
+ } else if (!__builtin_strcmp(CMP_STR, "<=")) { \
+ __res = ((s64)((A)-(B)) <= 0); \
+ } else if (!__builtin_strcmp(CMP_STR, ">")) { \
+ __res = ((s64)((A)-(B)) > 0); \
+ } else if (!__builtin_strcmp(CMP_STR, ">=")) { \
+ __res = ((s64)((A)-(B)) >= 0); \
+ } else { \
+ /* Unknown operator throws linker error: */ \
+ __BUILD_BUG_vruntime_cmp(); \
+ } \
+ \
+ __res; \
+})
+
+extern void __BUILD_BUG_vruntime_op(void);
+
+#define vruntime_op(A, OP_STR, B) ({ \
+ s64 __res = 0; \
+ \
+ if (!__builtin_strcmp(OP_STR, "-")) { \
+ __res = (s64)((A)-(B)); \
+ } else { \
+ /* Unknown operator throws linker error: */ \
+ __BUILD_BUG_vruntime_op(); \
+ } \
+ \
+ __res; \
+})
+
+
static inline __maybe_unused u64 max_vruntime(u64 max_vruntime, u64 vruntime)
{
- s64 delta = (s64)(vruntime - max_vruntime);
- if (delta > 0)
+ if (vruntime_cmp(vruntime, ">", max_vruntime))
max_vruntime = vruntime;
return max_vruntime;
@@ -535,8 +573,7 @@ static inline __maybe_unused u64 max_vruntime(u64 max_vruntime, u64 vruntime)
static inline __maybe_unused u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
- s64 delta = (s64)(vruntime - min_vruntime);
- if (delta < 0)
+ if (vruntime_cmp(vruntime, "<", min_vruntime))
min_vruntime = vruntime;
return min_vruntime;
@@ -549,12 +586,12 @@ static inline bool entity_before(const struct sched_entity *a,
* Tiebreak on vruntime seems unnecessary since it can
* hardly happen.
*/
- return (s64)(a->deadline - b->deadline) < 0;
+ return vruntime_cmp(a->deadline, "<", b->deadline);
}
static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- return (s64)(se->vruntime - cfs_rq->zero_vruntime);
+ return vruntime_op(se->vruntime, "-", cfs_rq->zero_vruntime);
}
#define __node_2_se(node) \
@@ -607,8 +644,8 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Which we track using:
*
* v0 := cfs_rq->zero_vruntime
- * \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime
- * \Sum w_i := cfs_rq->avg_load
+ * \Sum (v_i - v0) * w_i := cfs_rq->sum_w_vruntime
+ * \Sum w_i := cfs_rq->sum_weight
*
* Since zero_vruntime closely tracks the per-task service, these
* deltas: (v_i - v), will be in the order of the maximal (virtual) lag
@@ -619,32 +656,32 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
* As measured, the max (key * weight) value was ~44 bits for a kernel build.
*/
static void
-avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
unsigned long weight = scale_load_down(se->load.weight);
s64 key = entity_key(cfs_rq, se);
- cfs_rq->avg_vruntime += key * weight;
- cfs_rq->avg_load += weight;
+ cfs_rq->sum_w_vruntime += key * weight;
+ cfs_rq->sum_weight += weight;
}
static void
-avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
+sum_w_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
unsigned long weight = scale_load_down(se->load.weight);
s64 key = entity_key(cfs_rq, se);
- cfs_rq->avg_vruntime -= key * weight;
- cfs_rq->avg_load -= weight;
+ cfs_rq->sum_w_vruntime -= key * weight;
+ cfs_rq->sum_weight -= weight;
}
static inline
-void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
+void sum_w_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
{
/*
- * v' = v + d ==> avg_vruntime' = avg_runtime - d*avg_load
+ * v' = v + d ==> sum_w_vruntime' = sum_w_vruntime - d*sum_weight
*/
- cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta;
+ cfs_rq->sum_w_vruntime -= cfs_rq->sum_weight * delta;
}
/*
@@ -654,8 +691,8 @@ void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
u64 avg_vruntime(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
- s64 avg = cfs_rq->avg_vruntime;
- long load = cfs_rq->avg_load;
+ s64 avg = cfs_rq->sum_w_vruntime;
+ long load = cfs_rq->sum_weight;
if (curr && curr->on_rq) {
unsigned long weight = scale_load_down(curr->load.weight);
@@ -722,8 +759,8 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
{
struct sched_entity *curr = cfs_rq->curr;
- s64 avg = cfs_rq->avg_vruntime;
- long load = cfs_rq->avg_load;
+ s64 avg = cfs_rq->sum_w_vruntime;
+ long load = cfs_rq->sum_weight;
if (curr && curr->on_rq) {
unsigned long weight = scale_load_down(curr->load.weight);
@@ -732,7 +769,7 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
load += weight;
}
- return avg >= (s64)(vruntime - cfs_rq->zero_vruntime) * load;
+ return avg >= vruntime_op(vruntime, "-", cfs_rq->zero_vruntime) * load;
}
int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -743,9 +780,9 @@ int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
static void update_zero_vruntime(struct cfs_rq *cfs_rq)
{
u64 vruntime = avg_vruntime(cfs_rq);
- s64 delta = (s64)(vruntime - cfs_rq->zero_vruntime);
+ s64 delta = vruntime_op(vruntime, "-", cfs_rq->zero_vruntime);
- avg_vruntime_update(cfs_rq, delta);
+ sum_w_vruntime_update(cfs_rq, delta);
cfs_rq->zero_vruntime = vruntime;
}
@@ -770,13 +807,12 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
return entity_before(__node_2_se(a), __node_2_se(b));
}
-#define vruntime_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; })
-
static inline void __min_vruntime_update(struct sched_entity *se, struct rb_node *node)
{
if (node) {
struct sched_entity *rse = __node_2_se(node);
- if (vruntime_gt(min_vruntime, se, rse))
+
+ if (vruntime_cmp(se->min_vruntime, ">", rse->min_vruntime))
se->min_vruntime = rse->min_vruntime;
}
}
@@ -819,7 +855,7 @@ RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity,
*/
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- avg_vruntime_add(cfs_rq, se);
+ sum_w_vruntime_add(cfs_rq, se);
update_zero_vruntime(cfs_rq);
se->min_vruntime = se->vruntime;
se->min_slice = se->slice;
@@ -831,7 +867,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
&min_vruntime_cb);
- avg_vruntime_sub(cfs_rq, se);
+ sum_w_vruntime_sub(cfs_rq, se);
update_zero_vruntime(cfs_rq);
}
@@ -887,7 +923,7 @@ static inline void update_protect_slice(struct cfs_rq *cfs_rq, struct sched_enti
static inline bool protect_slice(struct sched_entity *se)
{
- return ((s64)(se->vprot - se->vruntime) > 0);
+ return vruntime_cmp(se->vruntime, "<", se->vprot);
}
static inline void cancel_protect_slice(struct sched_entity *se)
@@ -1024,7 +1060,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
*/
static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- if ((s64)(se->vruntime - se->deadline) < 0)
+ if (vruntime_cmp(se->vruntime, "<", se->deadline))
return false;
/*
@@ -1513,7 +1549,7 @@ static unsigned int task_scan_start(struct task_struct *p)
/* Scale the maximum scan period with the amount of shared memory. */
rcu_read_lock();
- ng = rcu_dereference(p->numa_group);
+ ng = rcu_dereference_all(p->numa_group);
if (ng) {
unsigned long shared = group_faults_shared(ng);
unsigned long private = group_faults_priv(ng);
@@ -1580,7 +1616,7 @@ pid_t task_numa_group_id(struct task_struct *p)
pid_t gid = 0;
rcu_read_lock();
- ng = rcu_dereference(p->numa_group);
+ ng = rcu_dereference_all(p->numa_group);
if (ng)
gid = ng->gid;
rcu_read_unlock();
@@ -2239,7 +2275,7 @@ static bool task_numa_compare(struct task_numa_env *env,
return false;
rcu_read_lock();
- cur = rcu_dereference(dst_rq->curr);
+ cur = rcu_dereference_all(dst_rq->curr);
if (cur && ((cur->flags & (PF_EXITING | PF_KTHREAD)) ||
!cur->mm))
cur = NULL;
@@ -2284,7 +2320,7 @@ static bool task_numa_compare(struct task_numa_env *env,
* If dst and source tasks are in the same NUMA group, or not
* in any group then look only at task weights.
*/
- cur_ng = rcu_dereference(cur->numa_group);
+ cur_ng = rcu_dereference_all(cur->numa_group);
if (cur_ng == p_ng) {
/*
* Do not swap within a group or between tasks that have
@@ -2499,7 +2535,7 @@ static int task_numa_migrate(struct task_struct *p)
* to satisfy here.
*/
rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
+ sd = rcu_dereference_all(per_cpu(sd_numa, env.src_cpu));
if (sd) {
env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
env.imb_numa_nr = sd->imb_numa_nr;
@@ -3022,7 +3058,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
if (!cpupid_match_pid(tsk, cpupid))
goto no_join;
- grp = rcu_dereference(tsk->numa_group);
+ grp = rcu_dereference_all(tsk->numa_group);
if (!grp)
goto no_join;
@@ -3693,7 +3729,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
*/
#define add_positive(_ptr, _val) do { \
typeof(_ptr) ptr = (_ptr); \
- typeof(_val) val = (_val); \
+ __signed_scalar_typeof(*ptr) val = (_val); \
typeof(*ptr) res, var = READ_ONCE(*ptr); \
\
res = var + val; \
@@ -3705,23 +3741,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
} while (0)
/*
- * Unsigned subtract and clamp on underflow.
- *
- * Explicitly do a load-store to ensure the intermediate value never hits
- * memory. This allows lockless observations without ever seeing the negative
- * values.
- */
-#define sub_positive(_ptr, _val) do { \
- typeof(_ptr) ptr = (_ptr); \
- typeof(*ptr) val = (_val); \
- typeof(*ptr) res, var = READ_ONCE(*ptr); \
- res = var - val; \
- if (res > var) \
- res = 0; \
- WRITE_ONCE(*ptr, res); \
-} while (0)
-
-/*
* Remove and clamp on negative, from a local variable.
*
* A variant of sub_positive(), which does not use explicit load-store
@@ -3732,21 +3751,37 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
*ptr -= min_t(typeof(*ptr), *ptr, _val); \
} while (0)
+
+/*
+ * Because of rounding, se->util_sum might end up being +1 more than
+ * cfs->util_sum. Although this is not a problem by itself, detaching
+ * a lot of tasks with the rounding problem between 2 updates of
+ * util_avg (~1ms) can make cfs->util_sum become zero whereas
+ * cfs->util_avg is not.
+ *
+ * Check that util_sum is still above its lower bound for the new
+ * util_avg. Given that period_contrib might have moved since the last
+ * sync, we are only sure that util_sum must be above or equal to
+ * util_avg * minimum possible divider
+ */
+#define __update_sa(sa, name, delta_avg, delta_sum) do { \
+ add_positive(&(sa)->name##_avg, delta_avg); \
+ add_positive(&(sa)->name##_sum, delta_sum); \
+ (sa)->name##_sum = max_t(typeof((sa)->name##_sum), \
+ (sa)->name##_sum, \
+ (sa)->name##_avg * PELT_MIN_DIVIDER); \
+} while (0)
+
static inline void
enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- cfs_rq->avg.load_avg += se->avg.load_avg;
- cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
+ __update_sa(&cfs_rq->avg, load, se->avg.load_avg, se->avg.load_sum);
}
static inline void
dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
- sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
- /* See update_cfs_rq_load_avg() */
- cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
- cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
+ __update_sa(&cfs_rq->avg, load, -se->avg.load_avg, -se->avg.load_sum);
}
static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags);
@@ -4242,7 +4277,6 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
*/
divider = get_pelt_divider(&cfs_rq->avg);
-
/* Set new sched_entity's utilization */
se->avg.util_avg = gcfs_rq->avg.util_avg;
new_sum = se->avg.util_avg * divider;
@@ -4250,12 +4284,7 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
se->avg.util_sum = new_sum;
/* Update parent cfs_rq utilization */
- add_positive(&cfs_rq->avg.util_avg, delta_avg);
- add_positive(&cfs_rq->avg.util_sum, delta_sum);
-
- /* See update_cfs_rq_load_avg() */
- cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
- cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
+ __update_sa(&cfs_rq->avg, util, delta_avg, delta_sum);
}
static inline void
@@ -4281,11 +4310,7 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf
se->avg.runnable_sum = new_sum;
/* Update parent cfs_rq runnable */
- add_positive(&cfs_rq->avg.runnable_avg, delta_avg);
- add_positive(&cfs_rq->avg.runnable_sum, delta_sum);
- /* See update_cfs_rq_load_avg() */
- cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
- cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
+ __update_sa(&cfs_rq->avg, runnable, delta_avg, delta_sum);
}
static inline void
@@ -4349,11 +4374,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
se->avg.load_sum = runnable_sum;
se->avg.load_avg = load_avg;
- add_positive(&cfs_rq->avg.load_avg, delta_avg);
- add_positive(&cfs_rq->avg.load_sum, delta_sum);
- /* See update_cfs_rq_load_avg() */
- cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
- cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
+ __update_sa(&cfs_rq->avg, load, delta_avg, delta_sum);
}
static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
@@ -4450,7 +4471,7 @@ static inline void migrate_se_pelt_lag(struct sched_entity *se)
rq = rq_of(cfs_rq);
rcu_read_lock();
- is_idle = is_idle_task(rcu_dereference(rq->curr));
+ is_idle = is_idle_task(rcu_dereference_all(rq->curr));
rcu_read_unlock();
/*
@@ -4552,33 +4573,13 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
raw_spin_unlock(&cfs_rq->removed.lock);
r = removed_load;
- sub_positive(&sa->load_avg, r);
- sub_positive(&sa->load_sum, r * divider);
- /* See sa->util_sum below */
- sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER);
+ __update_sa(sa, load, -r, -r * divider);
r = removed_util;
- sub_positive(&sa->util_avg, r);
- sub_positive(&sa->util_sum, r * divider);
- /*
- * Because of rounding, se->util_sum might ends up being +1 more than
- * cfs->util_sum. Although this is not a problem by itself, detaching
- * a lot of tasks with the rounding problem between 2 updates of
- * util_avg (~1ms) can make cfs->util_sum becoming null whereas
- * cfs_util_avg is not.
- * Check that util_sum is still above its lower bound for the new
- * util_avg. Given that period_contrib might have moved since the last
- * sync, we are only sure that util_sum must be above or equal to
- * util_avg * minimum possible divider
- */
- sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER);
+ __update_sa(sa, util, -r, -r * divider);
r = removed_runnable;
- sub_positive(&sa->runnable_avg, r);
- sub_positive(&sa->runnable_sum, r * divider);
- /* See sa->util_sum above */
- sa->runnable_sum = max_t(u32, sa->runnable_sum,
- sa->runnable_avg * PELT_MIN_DIVIDER);
+ __update_sa(sa, runnable, -r, -r * divider);
/*
* removed_runnable is the unweighted version of removed_load so we
@@ -4663,17 +4664,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
dequeue_load_avg(cfs_rq, se);
- sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
- sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
- /* See update_cfs_rq_load_avg() */
- cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
- cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
-
- sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
- sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum);
- /* See update_cfs_rq_load_avg() */
- cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
- cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
+ __update_sa(&cfs_rq->avg, util, -se->avg.util_avg, -se->avg.util_sum);
+ __update_sa(&cfs_rq->avg, runnable, -se->avg.runnable_avg, -se->avg.runnable_sum);
add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
@@ -5175,7 +5167,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
*
* vl_i = (W + w_i)*vl'_i / W
*/
- load = cfs_rq->avg_load;
+ load = cfs_rq->sum_weight;
if (curr && curr->on_rq)
load += scale_load_down(curr->load.weight);
@@ -7148,7 +7140,7 @@ static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
static struct {
cpumask_var_t idle_cpus_mask;
atomic_t nr_cpus;
- int has_blocked; /* Idle CPUS has blocked load */
+ int has_blocked_load; /* Idle CPUs have blocked load */
int needs_update; /* Newly idle CPUs need their next_balance collated */
unsigned long next_balance; /* in jiffy units */
unsigned long next_blocked; /* Next update of blocked load in jiffies */
@@ -7506,7 +7498,7 @@ static inline void set_idle_cores(int cpu, int val)
{
struct sched_domain_shared *sds;
- sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu));
if (sds)
WRITE_ONCE(sds->has_idle_cores, val);
}
@@ -7515,7 +7507,7 @@ static inline bool test_idle_cores(int cpu)
{
struct sched_domain_shared *sds;
- sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu));
if (sds)
return READ_ONCE(sds->has_idle_cores);
@@ -7644,7 +7636,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
if (sched_feat(SIS_UTIL)) {
- sd_share = rcu_dereference(per_cpu(sd_llc_shared, target));
+ sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, target));
if (sd_share) {
/* because !--nr is the condition to stop scan */
nr = READ_ONCE(sd_share->nr_idle_scan) + 1;
@@ -7850,7 +7842,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
* sd_asym_cpucapacity rather than sd_llc.
*/
if (sched_asym_cpucap_active()) {
- sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
+ sd = rcu_dereference_all(per_cpu(sd_asym_cpucapacity, target));
/*
* On an asymmetric CPU capacity system where an exclusive
* cpuset defines a symmetric island (i.e. one unique
@@ -7865,7 +7857,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
}
}
- sd = rcu_dereference(per_cpu(sd_llc, target));
+ sd = rcu_dereference_all(per_cpu(sd_llc, target));
if (!sd)
return target;
@@ -8334,7 +8326,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
struct energy_env eenv;
rcu_read_lock();
- pd = rcu_dereference(rd->pd);
+ pd = rcu_dereference_all(rd->pd);
if (!pd)
goto unlock;
@@ -8342,7 +8334,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
* Energy-aware wake-up happens on the lowest sched_domain starting
* from sd_asym_cpucapacity spanning over this_cpu and prev_cpu.
*/
- sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity));
+ sd = rcu_dereference_all(*this_cpu_ptr(&sd_asym_cpucapacity));
while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
sd = sd->parent;
if (!sd)
@@ -8744,7 +8736,7 @@ preempt_sync(struct rq *rq, int wake_flags,
/*
* Preempt the current task with a newly woken task if needed:
*/
-static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags)
+static void wakeup_preempt_fair(struct rq *rq, struct task_struct *p, int wake_flags)
{
enum preempt_wakeup_action preempt_action = PREEMPT_WAKEUP_PICK;
struct task_struct *donor = rq->donor;
@@ -8752,6 +8744,12 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
struct cfs_rq *cfs_rq = task_cfs_rq(donor);
int cse_is_idle, pse_is_idle;
+ /*
+ * XXX Getting preempted by higher class, try and find idle CPU?
+ */
+ if (p->sched_class != &fair_sched_class)
+ return;
+
if (unlikely(se == pse))
return;
@@ -9349,7 +9347,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
*/
static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
{
- struct numa_group *numa_group = rcu_dereference(p->numa_group);
+ struct numa_group *numa_group = rcu_dereference_all(p->numa_group);
unsigned long src_weight, dst_weight;
int src_nid, dst_nid, dist;
@@ -9778,7 +9776,7 @@ static void attach_tasks(struct lb_env *env)
}
#ifdef CONFIG_NO_HZ_COMMON
-static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
+static inline bool cfs_rq_has_blocked_load(struct cfs_rq *cfs_rq)
{
if (cfs_rq->avg.load_avg)
return true;
@@ -9811,16 +9809,16 @@ static inline void update_blocked_load_tick(struct rq *rq)
WRITE_ONCE(rq->last_blocked_load_update_tick, jiffies);
}
-static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
+static inline void update_has_blocked_load_status(struct rq *rq, bool has_blocked_load)
{
- if (!has_blocked)
+ if (!has_blocked_load)
rq->has_blocked_load = 0;
}
#else /* !CONFIG_NO_HZ_COMMON: */
-static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
+static inline bool cfs_rq_has_blocked_load(struct cfs_rq *cfs_rq) { return false; }
static inline bool others_have_blocked(struct rq *rq) { return false; }
static inline void update_blocked_load_tick(struct rq *rq) {}
-static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
+static inline void update_has_blocked_load_status(struct rq *rq, bool has_blocked_load) {}
#endif /* !CONFIG_NO_HZ_COMMON */
static bool __update_blocked_others(struct rq *rq, bool *done)
@@ -9877,7 +9875,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done)
list_del_leaf_cfs_rq(cfs_rq);
/* Don't need periodic decay once load/util_avg are null */
- if (cfs_rq_has_blocked(cfs_rq))
+ if (cfs_rq_has_blocked_load(cfs_rq))
*done = false;
}
@@ -9937,7 +9935,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done)
bool decayed;
decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
- if (cfs_rq_has_blocked(cfs_rq))
+ if (cfs_rq_has_blocked_load(cfs_rq))
*done = false;
return decayed;
@@ -9949,23 +9947,27 @@ static unsigned long task_h_load(struct task_struct *p)
}
#endif /* !CONFIG_FAIR_GROUP_SCHED */
-static void sched_balance_update_blocked_averages(int cpu)
+static void __sched_balance_update_blocked_averages(struct rq *rq)
{
bool decayed = false, done = true;
- struct rq *rq = cpu_rq(cpu);
- struct rq_flags rf;
- rq_lock_irqsave(rq, &rf);
update_blocked_load_tick(rq);
- update_rq_clock(rq);
decayed |= __update_blocked_others(rq, &done);
decayed |= __update_blocked_fair(rq, &done);
- update_blocked_load_status(rq, !done);
+ update_has_blocked_load_status(rq, !done);
if (decayed)
cpufreq_update_util(rq, 0);
- rq_unlock_irqrestore(rq, &rf);
+}
+
+static void sched_balance_update_blocked_averages(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ guard(rq_lock_irqsave)(rq);
+ update_rq_clock(rq);
+ __sched_balance_update_blocked_averages(rq);
}
/********** Helpers for sched_balance_find_src_group ************************/
@@ -11025,7 +11027,7 @@ static void update_idle_cpu_scan(struct lb_env *env,
if (env->sd->span_weight != llc_weight)
return;
- sd_share = rcu_dereference(per_cpu(sd_llc_shared, env->dst_cpu));
+ sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, env->dst_cpu));
if (!sd_share)
return;
@@ -11375,7 +11377,7 @@ static struct sched_group *sched_balance_find_src_group(struct lb_env *env)
goto force_balance;
if (!is_rd_overutilized(env->dst_rq->rd) &&
- rcu_dereference(env->dst_rq->rd->pd))
+ rcu_dereference_all(env->dst_rq->rd->pd))
goto out_balanced;
/* ASYM feature bypasses nice load balance check */
@@ -12450,7 +12452,7 @@ static void nohz_balancer_kick(struct rq *rq)
if (likely(!atomic_read(&nohz.nr_cpus)))
return;
- if (READ_ONCE(nohz.has_blocked) &&
+ if (READ_ONCE(nohz.has_blocked_load) &&
time_after(now, READ_ONCE(nohz.next_blocked)))
flags = NOHZ_STATS_KICK;
@@ -12464,7 +12466,7 @@ static void nohz_balancer_kick(struct rq *rq)
rcu_read_lock();
- sd = rcu_dereference(rq->sd);
+ sd = rcu_dereference_all(rq->sd);
if (sd) {
/*
* If there's a runnable CFS task and the current CPU has reduced
@@ -12476,7 +12478,7 @@ static void nohz_balancer_kick(struct rq *rq)
}
}
- sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
+ sd = rcu_dereference_all(per_cpu(sd_asym_packing, cpu));
if (sd) {
/*
* When ASYM_PACKING; see if there's a more preferred CPU
@@ -12494,7 +12496,7 @@ static void nohz_balancer_kick(struct rq *rq)
}
}
- sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
+ sd = rcu_dereference_all(per_cpu(sd_asym_cpucapacity, cpu));
if (sd) {
/*
* When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
@@ -12515,7 +12517,7 @@ static void nohz_balancer_kick(struct rq *rq)
goto unlock;
}
- sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu));
if (sds) {
/*
* If there is an imbalance between LLC domains (IOW we could
@@ -12547,7 +12549,7 @@ static void set_cpu_sd_state_busy(int cpu)
struct sched_domain *sd;
rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_llc, cpu));
+ sd = rcu_dereference_all(per_cpu(sd_llc, cpu));
if (!sd || !sd->nohz_idle)
goto unlock;
@@ -12577,7 +12579,7 @@ static void set_cpu_sd_state_idle(int cpu)
struct sched_domain *sd;
rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_llc, cpu));
+ sd = rcu_dereference_all(per_cpu(sd_llc, cpu));
if (!sd || sd->nohz_idle)
goto unlock;
@@ -12611,9 +12613,9 @@ void nohz_balance_enter_idle(int cpu)
/*
* The tick is still stopped but load could have been added in the
- * meantime. We set the nohz.has_blocked flag to trig a check of the
+ * meantime. We set the nohz.has_blocked_load flag to trigger a check of the
* *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear
- * of nohz.has_blocked can only happen after checking the new load
+ * of nohz.has_blocked_load can only happen after checking the new load
*/
if (rq->nohz_tick_stopped)
goto out;
@@ -12629,7 +12631,7 @@ void nohz_balance_enter_idle(int cpu)
/*
* Ensures that if nohz_idle_balance() fails to observe our
- * @idle_cpus_mask store, it must observe the @has_blocked
+ * @idle_cpus_mask store, it must observe the @has_blocked_load
* and @needs_update stores.
*/
smp_mb__after_atomic();
@@ -12642,7 +12644,7 @@ out:
* Each time a cpu enter idle, we assume that it has blocked load and
* enable the periodic update of the load of idle CPUs
*/
- WRITE_ONCE(nohz.has_blocked, 1);
+ WRITE_ONCE(nohz.has_blocked_load, 1);
}
static bool update_nohz_stats(struct rq *rq)
@@ -12683,8 +12685,8 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags)
/*
* We assume there will be no idle load after this update and clear
- * the has_blocked flag. If a cpu enters idle in the mean time, it will
- * set the has_blocked flag and trigger another update of idle load.
+ * the has_blocked_load flag. If a cpu enters idle in the meantime, it will
+ * set the has_blocked_load flag and trigger another update of idle load.
* Because a cpu that becomes idle, is added to idle_cpus_mask before
* setting the flag, we are sure to not clear the state and not
* check the load of an idle cpu.
@@ -12692,12 +12694,12 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags)
* Same applies to idle_cpus_mask vs needs_update.
*/
if (flags & NOHZ_STATS_KICK)
- WRITE_ONCE(nohz.has_blocked, 0);
+ WRITE_ONCE(nohz.has_blocked_load, 0);
if (flags & NOHZ_NEXT_KICK)
WRITE_ONCE(nohz.needs_update, 0);
/*
- * Ensures that if we miss the CPU, we must see the has_blocked
+ * Ensures that if we miss the CPU, we must see the has_blocked_load
* store from nohz_balance_enter_idle().
*/
smp_mb();
@@ -12764,7 +12766,7 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags)
abort:
/* There is still blocked load, enable periodic update */
if (has_blocked_load)
- WRITE_ONCE(nohz.has_blocked, 1);
+ WRITE_ONCE(nohz.has_blocked_load, 1);
}
/*
@@ -12826,7 +12828,7 @@ static void nohz_newidle_balance(struct rq *this_rq)
return;
/* Don't need to update blocked load of idle CPUs*/
- if (!READ_ONCE(nohz.has_blocked) ||
+ if (!READ_ONCE(nohz.has_blocked_load) ||
time_before(jiffies, READ_ONCE(nohz.next_blocked)))
return;
@@ -12896,29 +12898,28 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
*/
rq_unpin_lock(this_rq, rf);
- rcu_read_lock();
- sd = rcu_dereference_check_sched_domain(this_rq->sd);
- if (!sd) {
- rcu_read_unlock();
+ sd = rcu_dereference_sched_domain(this_rq->sd);
+ if (!sd)
goto out;
- }
if (!get_rd_overloaded(this_rq->rd) ||
this_rq->avg_idle < sd->max_newidle_lb_cost) {
update_next_balance(sd, &next_balance);
- rcu_read_unlock();
goto out;
}
- rcu_read_unlock();
-
- rq_modified_clear(this_rq);
- raw_spin_rq_unlock(this_rq);
+ /*
+ * Include sched_balance_update_blocked_averages() in the cost
+ * calculation because it can be quite costly -- this ensures we skip
+ * it when avg_idle gets to be very low.
+ */
t0 = sched_clock_cpu(this_cpu);
- sched_balance_update_blocked_averages(this_cpu);
+ __sched_balance_update_blocked_averages(this_rq);
+
+ this_rq->next_class = &fair_sched_class;
+ raw_spin_rq_unlock(this_rq);
- rcu_read_lock();
for_each_domain(this_cpu, sd) {
u64 domain_cost;
@@ -12968,7 +12969,6 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
if (pulled_task || !continue_balancing)
break;
}
- rcu_read_unlock();
raw_spin_rq_lock(this_rq);
@@ -12984,7 +12984,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
pulled_task = 1;
/* If a higher prio class was modified, restart the pick */
- if (rq_modified_above(this_rq, &fair_sched_class))
+ if (sched_class_above(this_rq->next_class, &fair_sched_class))
pulled_task = -1;
out:
@@ -13335,8 +13335,8 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
* zero_vruntime_fi, which would have been updated in prior calls
* to se_fi_update().
*/
- delta = (s64)(sea->vruntime - seb->vruntime) +
- (s64)(cfs_rqb->zero_vruntime_fi - cfs_rqa->zero_vruntime_fi);
+ delta = vruntime_op(sea->vruntime, "-", seb->vruntime) +
+ vruntime_op(cfs_rqb->zero_vruntime_fi, "-", cfs_rqa->zero_vruntime_fi);
return delta > 0;
}
@@ -13374,6 +13374,12 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
entity_tick(cfs_rq, se, queued);
}
+ if (queued) {
+ if (!need_resched())
+ hrtick_start_fair(rq, curr);
+ return;
+ }
+
if (static_branch_unlikely(&sched_numa_balancing))
task_tick_numa(rq, curr);
@@ -13882,15 +13888,12 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
* All the scheduling class methods:
*/
DEFINE_SCHED_CLASS(fair) = {
-
- .queue_mask = 2,
-
.enqueue_task = enqueue_task_fair,
.dequeue_task = dequeue_task_fair,
.yield_task = yield_task_fair,
.yield_to_task = yield_to_task_fair,
- .wakeup_preempt = check_preempt_wakeup_fair,
+ .wakeup_preempt = wakeup_preempt_fair,
.pick_task = pick_task_fair,
.pick_next_task = pick_next_task_fair,
@@ -13950,7 +13953,7 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m)
struct numa_group *ng;
rcu_read_lock();
- ng = rcu_dereference(p->numa_group);
+ ng = rcu_dereference_all(p->numa_group);
for_each_online_node(node) {
if (p->numa_faults) {
tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
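
For background (not part of the patch): the vruntime_cmp()/vruntime_op() helpers introduced at the top of this fair.c diff express comparisons as a signed 64-bit difference, which keeps the ordering correct even when the unsigned virtual times wrap around. A stand-alone sketch of that idea:

#include <stdint.h>
#include <stdio.h>

/* Wrap-safe "a < b" for 64-bit virtual time, mirroring vruntime_cmp(a, "<", b). */
static int vtime_before(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;
}

int main(void)
{
	uint64_t old_vt = UINT64_MAX - 10;	/* just before the counter wraps */
	uint64_t new_vt = 5;			/* just after the counter wraps */

	/* A naive comparison gets the ordering wrong across the wrap... */
	printf("naive:     %d\n", old_vt < new_vt);
	/* ...while the signed-difference form still sees old_vt as earlier. */
	printf("wrap-safe: %d\n", vtime_before(old_vt, new_vt));
	return 0;
}
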
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index c174afe1dd177a..65eb8f8c1a5d3a 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -536,9 +536,6 @@ static void update_curr_idle(struct rq *rq)
* Simple, special scheduling class for the per-CPU idle tasks:
*/
DEFINE_SCHED_CLASS(idle) = {
-
- .queue_mask = 0,
-
/* no enqueue/yield_task for idle tasks */
/* dequeue is not valid, we print a debug message there: */
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index f1867fe8e5c535..0a9b2cd6da7208 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1615,6 +1615,12 @@ static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags)
{
struct task_struct *donor = rq->donor;
+ /*
+ * XXX If we're preempted by DL, queue a push?
+ */
+ if (p->sched_class != &rt_sched_class)
+ return;
+
if (p->prio < donor->prio) {
resched_curr(rq);
return;
@@ -2568,9 +2574,6 @@ static int task_is_throttled_rt(struct task_struct *p, int cpu)
#endif /* CONFIG_SCHED_CORE */
DEFINE_SCHED_CLASS(rt) = {
-
- .queue_mask = 4,
-
.enqueue_task = enqueue_task_rt,
.dequeue_task = dequeue_task_rt,
.yield_task = yield_task_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d30cca6870f5f9..bdb1e748fe5c97 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -670,16 +670,16 @@ struct balance_callback {
void (*func)(struct rq *rq);
};
-/* CFS-related fields in a runqueue */
+/* Fair scheduling SCHED_{NORMAL,BATCH,IDLE} related fields in a runqueue: */
struct cfs_rq {
struct load_weight load;
unsigned int nr_queued;
- unsigned int h_nr_queued; /* SCHED_{NORMAL,BATCH,IDLE} */
- unsigned int h_nr_runnable; /* SCHED_{NORMAL,BATCH,IDLE} */
- unsigned int h_nr_idle; /* SCHED_IDLE */
+ unsigned int h_nr_queued; /* SCHED_{NORMAL,BATCH,IDLE} */
+ unsigned int h_nr_runnable; /* SCHED_{NORMAL,BATCH,IDLE} */
+ unsigned int h_nr_idle; /* SCHED_IDLE */
- s64 avg_vruntime;
- u64 avg_load;
+ s64 sum_w_vruntime;
+ u64 sum_weight;
u64 zero_vruntime;
#ifdef CONFIG_SCHED_CORE
@@ -690,7 +690,7 @@ struct cfs_rq {
struct rb_root_cached tasks_timeline;
/*
- * 'curr' points to currently running entity on this cfs_rq.
+ * 'curr' points to the currently running entity on this cfs_rq.
* It is set to NULL otherwise (i.e when none are currently running).
*/
struct sched_entity *curr;
@@ -726,9 +726,7 @@ struct cfs_rq {
unsigned long h_load;
u64 last_h_load_update;
struct sched_entity *h_load_next;
-#endif /* CONFIG_FAIR_GROUP_SCHED */
-#ifdef CONFIG_FAIR_GROUP_SCHED
struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */
/*
@@ -741,19 +739,19 @@ struct cfs_rq {
*/
int on_list;
struct list_head leaf_cfs_rq_list;
- struct task_group *tg; /* group that "owns" this runqueue */
+ struct task_group *tg; /* Group that "owns" this runqueue */
/* Locally cached copy of our task_group's idle value */
int idle;
-#ifdef CONFIG_CFS_BANDWIDTH
+# ifdef CONFIG_CFS_BANDWIDTH
int runtime_enabled;
s64 runtime_remaining;
u64 throttled_pelt_idle;
-#ifndef CONFIG_64BIT
+# ifndef CONFIG_64BIT
u64 throttled_pelt_idle_copy;
-#endif
+# endif
u64 throttled_clock;
u64 throttled_clock_pelt;
u64 throttled_clock_pelt_time;
@@ -765,7 +763,7 @@ struct cfs_rq {
struct list_head throttled_list;
struct list_head throttled_csd_list;
struct list_head throttled_limbo_list;
-#endif /* CONFIG_CFS_BANDWIDTH */
+# endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};
@@ -1121,7 +1119,5 @@ struct rq {
raw_spinlock_t __lock;
- /* Per class runqueue modification mask; bits in class order. */
- unsigned int queue_mask;
unsigned int nr_running;
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
@@ -1181,6 +1178,7 @@ struct rq {
struct sched_dl_entity *dl_server;
struct task_struct *idle;
struct task_struct *stop;
+ const struct sched_class *next_class;
unsigned long next_balance;
struct mm_struct *prev_mm;
@@ -2010,8 +2008,8 @@ queue_balance_callback(struct rq *rq,
rq->balance_callback = head;
}
-#define rcu_dereference_check_sched_domain(p) \
- rcu_dereference_check((p), lockdep_is_held(&sched_domains_mutex))
+#define rcu_dereference_sched_domain(p) \
+ rcu_dereference_all_check((p), lockdep_is_held(&sched_domains_mutex))
/*
* The domain tree (rq->sd) is protected by RCU's quiescent state transition.
@@ -2021,7 +2019,7 @@ queue_balance_callback(struct rq *rq,
* preempt-disabled sections.
*/
#define for_each_domain(cpu, __sd) \
- for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
+ for (__sd = rcu_dereference_sched_domain(cpu_rq(cpu)->sd); \
__sd; __sd = __sd->parent)
/* A mask of all the SD flags that have the SDF_SHARED_CHILD metaflag */
@@ -2428,15 +2426,6 @@ struct sched_class {
#ifdef CONFIG_UCLAMP_TASK
int uclamp_enabled;
#endif
- /*
- * idle: 0
- * ext: 1
- * fair: 2
- * rt: 4
- * dl: 8
- * stop: 16
- */
- unsigned int queue_mask;
/*
* move_queued_task/activate_task/enqueue_task: rq->lock
@@ -2595,20 +2584,6 @@ struct sched_class {
#endif
};
-/*
- * Does not nest; only used around sched_class::pick_task() rq-lock-breaks.
- */
-static inline void rq_modified_clear(struct rq *rq)
-{
- rq->queue_mask = 0;
-}
-
-static inline bool rq_modified_above(struct rq *rq, const struct sched_class * class)
-{
- unsigned int mask = class->queue_mask;
- return rq->queue_mask & ~((mask << 1) - 1);
-}
-
static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
{
WARN_ON_ONCE(rq->donor != prev);
@@ -3901,6 +3876,7 @@ void move_queued_task_locked(struct rq *src_rq, struct rq *dst_rq, struct task_s
deactivate_task(src_rq, task, 0);
set_task_cpu(task, dst_rq->cpu);
activate_task(dst_rq, task, 0);
+ wakeup_preempt(dst_rq, task, 0);
}
static inline
@@ -3968,6 +3944,7 @@ extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
struct sched_change_ctx {
u64 prio;
struct task_struct *p;
+ const struct sched_class *class;
int flags;
bool queued;
bool running;
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 4f9192be4b5b0a..f95798baddebbd 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -97,9 +97,6 @@ static void update_curr_stop(struct rq *rq)
* Simple, special scheduling class for the per-CPU stop tasks:
*/
DEFINE_SCHED_CLASS(stop) = {
-
- .queue_mask = 16,
-
.enqueue_task = enqueue_task_stop,
.dequeue_task = dequeue_task_stop,
.yield_task = yield_task_stop,