 Documentation/arch/x86/boot.rst | 198
 arch/x86/include/asm/uv/bios.h  |   2
 arch/x86/kernel/cpu/amd.c       |  40
 arch/x86/kernel/cpu/centaur.c   |   6
 arch/x86/kernel/cpu/cyrix.c     |   6
 arch/x86/kernel/cpu/hygon.c     |   6
 arch/x86/kernel/cpu/sgx/ioctl.c |   2
 arch/x86/kernel/fpu/xstate.c    |   4
 include/linux/compiler_types.h  |  19
 include/linux/mm_types.h        |   1
 include/linux/sched.h           |  13
 include/trace/events/tlb.h      |   5
 kernel/irq/manage.c             |   2
 kernel/sched/core.c             |  45
 kernel/sched/deadline.c         |  14
 kernel/sched/ext.c              |   7
 kernel/sched/fair.c             | 361
 kernel/sched/idle.c             |   3
 kernel/sched/rt.c               |   9
 kernel/sched/sched.h            |  59
 kernel/sched/stop_task.c        |   3
 21 files changed, 395 insertions(+), 410 deletions(-)
diff --git a/Documentation/arch/x86/boot.rst b/Documentation/arch/x86/boot.rst index 6d36ce86fd8ec0..dca3875a24351e 100644 --- a/Documentation/arch/x86/boot.rst +++ b/Documentation/arch/x86/boot.rst @@ -95,26 +95,26 @@ Memory Layout The traditional memory map for the kernel loader, used for Image or zImage kernels, typically looks like:: - | | + | | 0A0000 +------------------------+ - | Reserved for BIOS | Do not use. Reserved for BIOS EBDA. + | Reserved for BIOS | Do not use. Reserved for BIOS EBDA. 09A000 +------------------------+ - | Command line | - | Stack/heap | For use by the kernel real-mode code. + | Command line | + | Stack/heap | For use by the kernel real-mode code. 098000 +------------------------+ - | Kernel setup | The kernel real-mode code. + | Kernel setup | The kernel real-mode code. 090200 +------------------------+ - | Kernel boot sector | The kernel legacy boot sector. + | Kernel boot sector | The kernel legacy boot sector. 090000 +------------------------+ - | Protected-mode kernel | The bulk of the kernel image. + | Protected-mode kernel | The bulk of the kernel image. 010000 +------------------------+ - | Boot loader | <- Boot sector entry point 0000:7C00 + | Boot loader | <- Boot sector entry point 0000:7C00 001000 +------------------------+ - | Reserved for MBR/BIOS | + | Reserved for MBR/BIOS | 000800 +------------------------+ - | Typically used by MBR | + | Typically used by MBR | 000600 +------------------------+ - | BIOS use only | + | BIOS use only | 000000 +------------------------+ When using bzImage, the protected-mode kernel was relocated to @@ -142,27 +142,27 @@ above the 0x9A000 point; too many BIOSes will break above that point. For a modern bzImage kernel with boot protocol version >= 2.02, a memory layout like the following is suggested:: - ~ ~ - | Protected-mode kernel | + ~ ~ + | Protected-mode kernel | 100000 +------------------------+ - | I/O memory hole | + | I/O memory hole | 0A0000 +------------------------+ - | Reserved for BIOS | Leave as much as possible unused - ~ ~ - | Command line | (Can also be below the X+10000 mark) + | Reserved for BIOS | Leave as much as possible unused + ~ ~ + | Command line | (Can also be below the X+10000 mark) X+10000 +------------------------+ - | Stack/heap | For use by the kernel real-mode code. + | Stack/heap | For use by the kernel real-mode code. X+08000 +------------------------+ - | Kernel setup | The kernel real-mode code. - | Kernel boot sector | The kernel legacy boot sector. + | Kernel setup | The kernel real-mode code. + | Kernel boot sector | The kernel legacy boot sector. X +------------------------+ - | Boot loader | <- Boot sector entry point 0000:7C00 + | Boot loader | <- Boot sector entry point 0000:7C00 001000 +------------------------+ - | Reserved for MBR/BIOS | + | Reserved for MBR/BIOS | 000800 +------------------------+ - | Typically used by MBR | + | Typically used by MBR | 000600 +------------------------+ - | BIOS use only | + | BIOS use only | 000000 +------------------------+ ... where the address X is as low as the design of the boot loader permits. 
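As a rough illustration of the suggested bzImage layout above (a sketch, not part of this patch; the structure and field names are invented for the example), a boot loader that has chosen a real-mode base X could derive the remaining addresses like this::

    /* Sketch only: encode the suggested real-mode layout relative to X. */
    #include <stdint.h>

    struct realmode_plan {
            uint32_t setup_base;    /* X: kernel boot sector + setup code      */
            uint32_t heap_start;    /* X + 0x08000: real-mode stack/heap       */
            uint32_t cmdline;       /* X + 0x10000 (may also be placed lower)  */
            uint32_t prot_kernel;   /* protected-mode kernel at the 1 MB mark  */
    };

    static struct realmode_plan plan_layout(uint32_t X)
    {
            return (struct realmode_plan){
                    .setup_base  = X,
                    .heap_start  = X + 0x08000,
                    .cmdline     = X + 0x10000,
                    .prot_kernel = 0x100000,
            };
    }
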
@@ -433,7 +433,7 @@ Protocol: 2.00+ Assigned boot loader IDs: - == ======================================= + ==== ======================================= 0x0 LILO (0x00 reserved for pre-2.00 bootloader) 0x1 Loadlin @@ -456,7 +456,7 @@ Protocol: 2.00+ <http://sebastian-plotz.blogspot.de> 0x12 OVMF UEFI virtualization stack 0x13 barebox - == ======================================= + ==== ======================================= Please contact <hpa@zytor.com> if you need a bootloader ID value assigned. @@ -809,12 +809,12 @@ Protocol: 2.09+ as follow:: struct setup_data { - __u64 next; - __u32 type; - __u32 len; - __u8 data[]; + __u64 next; + __u32 type; + __u32 len; + __u8 data[]; } - + Where, the next is a 64-bit physical pointer to the next node of linked list, the next field of the last node is 0; the type is used to identify the contents of data; the len is the length of data @@ -835,10 +835,10 @@ Protocol: 2.09+ protocol 2.15:: struct setup_indirect { - __u32 type; - __u32 reserved; /* Reserved, must be set to zero. */ - __u64 len; - __u64 addr; + __u32 type; + __u32 reserved; /* Reserved, must be set to zero. */ + __u64 len; + __u64 addr; }; The type member is a SETUP_INDIRECT | SETUP_* type. However, it cannot be @@ -850,15 +850,15 @@ Protocol: 2.09+ In this case setup_data and setup_indirect will look like this:: struct setup_data { - .next = 0, /* or <addr_of_next_setup_data_struct> */ - .type = SETUP_INDIRECT, - .len = sizeof(setup_indirect), - .data[sizeof(setup_indirect)] = (struct setup_indirect) { - .type = SETUP_INDIRECT | SETUP_E820_EXT, - .reserved = 0, - .len = <len_of_SETUP_E820_EXT_data>, - .addr = <addr_of_SETUP_E820_EXT_data>, - }, + .next = 0, /* or <addr_of_next_setup_data_struct> */ + .type = SETUP_INDIRECT, + .len = sizeof(setup_indirect), + .data[sizeof(setup_indirect)] = (struct setup_indirect) { + .type = SETUP_INDIRECT | SETUP_E820_EXT, + .reserved = 0, + .len = <len_of_SETUP_E820_EXT_data>, + .addr = <addr_of_SETUP_E820_EXT_data>, + }, } .. note:: @@ -897,11 +897,11 @@ Offset/size: 0x260/4 The kernel runtime start address is determined by the following algorithm:: if (relocatable_kernel) { - if (load_address < pref_address) - load_address = pref_address; - runtime_start = align_up(load_address, kernel_alignment); + if (load_address < pref_address) + load_address = pref_address; + runtime_start = align_up(load_address, kernel_alignment); } else { - runtime_start = pref_address; + runtime_start = pref_address; } Hence the necessary memory window location and size can be estimated by @@ -975,22 +975,22 @@ after kernel_info_var_len_data label. Each chunk of variable size data has to be prefixed with header/magic and its size, e.g.:: kernel_info: - .ascii "LToP" /* Header, Linux top (structure). */ - .long kernel_info_var_len_data - kernel_info - .long kernel_info_end - kernel_info - .long 0x01234567 /* Some fixed size data for the bootloaders. */ + .ascii "LToP" /* Header, Linux top (structure). */ + .long kernel_info_var_len_data - kernel_info + .long kernel_info_end - kernel_info + .long 0x01234567 /* Some fixed size data for the bootloaders. */ kernel_info_var_len_data: example_struct: /* Some variable size data for the bootloaders. */ - .ascii "0123" /* Header/Magic. */ - .long example_struct_end - example_struct - .ascii "Struct" - .long 0x89012345 + .ascii "0123" /* Header/Magic. */ + .long example_struct_end - example_struct + .ascii "Struct" + .long 0x89012345 example_struct_end: example_strings: /* Some variable size data for the bootloaders. 
*/ - .ascii "ABCD" /* Header/Magic. */ - .long example_strings_end - example_strings - .asciz "String_0" - .asciz "String_1" + .ascii "ABCD" /* Header/Magic. */ + .long example_strings_end - example_strings + .asciz "String_0" + .asciz "String_1" example_strings_end: kernel_info_end: @@ -1132,53 +1132,53 @@ Such a boot loader should enter the following fields in the header:: unsigned long base_ptr; /* base address for real-mode segment */ if (setup_sects == 0) - setup_sects = 4; + setup_sects = 4; if (protocol >= 0x0200) { - type_of_loader = <type code>; - if (loading_initrd) { - ramdisk_image = <initrd_address>; - ramdisk_size = <initrd_size>; - } - - if (protocol >= 0x0202 && loadflags & 0x01) - heap_end = 0xe000; - else - heap_end = 0x9800; - - if (protocol >= 0x0201) { - heap_end_ptr = heap_end - 0x200; - loadflags |= 0x80; /* CAN_USE_HEAP */ - } - - if (protocol >= 0x0202) { - cmd_line_ptr = base_ptr + heap_end; - strcpy(cmd_line_ptr, cmdline); - } else { - cmd_line_magic = 0xA33F; - cmd_line_offset = heap_end; - setup_move_size = heap_end + strlen(cmdline) + 1; - strcpy(base_ptr + cmd_line_offset, cmdline); - } + type_of_loader = <type code>; + if (loading_initrd) { + ramdisk_image = <initrd_address>; + ramdisk_size = <initrd_size>; + } + + if (protocol >= 0x0202 && loadflags & 0x01) + heap_end = 0xe000; + else + heap_end = 0x9800; + + if (protocol >= 0x0201) { + heap_end_ptr = heap_end - 0x200; + loadflags |= 0x80; /* CAN_USE_HEAP */ + } + + if (protocol >= 0x0202) { + cmd_line_ptr = base_ptr + heap_end; + strcpy(cmd_line_ptr, cmdline); + } else { + cmd_line_magic = 0xA33F; + cmd_line_offset = heap_end; + setup_move_size = heap_end + strlen(cmdline) + 1; + strcpy(base_ptr + cmd_line_offset, cmdline); + } } else { - /* Very old kernel */ + /* Very old kernel */ - heap_end = 0x9800; + heap_end = 0x9800; - cmd_line_magic = 0xA33F; - cmd_line_offset = heap_end; + cmd_line_magic = 0xA33F; + cmd_line_offset = heap_end; - /* A very old kernel MUST have its real-mode code loaded at 0x90000 */ - if (base_ptr != 0x90000) { - /* Copy the real-mode kernel */ - memcpy(0x90000, base_ptr, (setup_sects + 1) * 512); - base_ptr = 0x90000; /* Relocated */ - } + /* A very old kernel MUST have its real-mode code loaded at 0x90000 */ + if (base_ptr != 0x90000) { + /* Copy the real-mode kernel */ + memcpy(0x90000, base_ptr, (setup_sects + 1) * 512); + base_ptr = 0x90000; /* Relocated */ + } - strcpy(0x90000 + cmd_line_offset, cmdline); + strcpy(0x90000 + cmd_line_offset, cmdline); - /* It is recommended to clear memory up to the 32K mark */ - memset(0x90000 + (setup_sects + 1) * 512, 0, (64 - (setup_sects + 1)) * 512); + /* It is recommended to clear memory up to the 32K mark */ + memset(0x90000 + (setup_sects + 1) * 512, 0, (64 - (setup_sects + 1)) * 512); } diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h index 6989b824fd321b..d0b62e2552902c 100644 --- a/arch/x86/include/asm/uv/bios.h +++ b/arch/x86/include/asm/uv/bios.h @@ -122,7 +122,7 @@ struct uv_systab { struct { u32 type:8; /* type of entry */ u32 offset:24; /* byte offset from struct start to entry */ - } entry[1]; /* additional entries follow */ + } entry[]; /* additional entries follow */ }; extern struct uv_systab *uv_systab; diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index bc94ff1e250ad2..c792c2afd84998 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -873,8 +873,8 @@ static void init_amd_bd(struct cpuinfo_x86 *c) } static const struct x86_cpu_id 
erratum_1386_microcode[] = { - X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, 0x17, 0x01), 0x2, 0x2, 0x0800126e), - X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, 0x17, 0x31), 0x0, 0x0, 0x08301052), + ZEN_MODEL_STEP_UCODE(0x17, 0x01, 0x2, 0x0800126e), + ZEN_MODEL_STEP_UCODE(0x17, 0x31, 0x0, 0x08301052), {} }; @@ -951,26 +951,14 @@ static void init_amd_zen1(struct cpuinfo_x86 *c) } } -static bool cpu_has_zenbleed_microcode(void) -{ - u32 good_rev = 0; - - switch (boot_cpu_data.x86_model) { - case 0x30 ... 0x3f: good_rev = 0x0830107b; break; - case 0x60 ... 0x67: good_rev = 0x0860010c; break; - case 0x68 ... 0x6f: good_rev = 0x08608107; break; - case 0x70 ... 0x7f: good_rev = 0x08701033; break; - case 0xa0 ... 0xaf: good_rev = 0x08a00009; break; - - default: - return false; - } - - if (boot_cpu_data.microcode < good_rev) - return false; - - return true; -} +static const struct x86_cpu_id amd_zenbleed_microcode[] = { + ZEN_MODEL_STEP_UCODE(0x17, 0x31, 0x0, 0x0830107b), + ZEN_MODEL_STEP_UCODE(0x17, 0x60, 0x1, 0x0860010c), + ZEN_MODEL_STEP_UCODE(0x17, 0x68, 0x1, 0x08608107), + ZEN_MODEL_STEP_UCODE(0x17, 0x71, 0x0, 0x08701033), + ZEN_MODEL_STEP_UCODE(0x17, 0xa0, 0x0, 0x08a00009), + {} +}; static void zen2_zenbleed_check(struct cpuinfo_x86 *c) { @@ -980,7 +968,7 @@ static void zen2_zenbleed_check(struct cpuinfo_x86 *c) if (!cpu_has(c, X86_FEATURE_AVX)) return; - if (!cpu_has_zenbleed_microcode()) { + if (!x86_match_min_microcode_rev(amd_zenbleed_microcode)) { pr_notice_once("Zenbleed: please update your microcode for the most optimal fix\n"); msr_set_bit(MSR_AMD64_DE_CFG, MSR_AMD64_DE_CFG_ZEN2_FP_BACKUP_FIX_BIT); } else { @@ -1063,12 +1051,6 @@ static void init_amd(struct cpuinfo_x86 *c) early_init_amd(c); - /* - * Bit 31 in normal CPUID used for nonstandard 3DNow ID; - * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway - */ - clear_cpu_cap(c, 0*32+31); - if (c->x86 >= 0x10) set_cpu_cap(c, X86_FEATURE_REP_GOOD); diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index a3b55db35c9612..c8398940b97526 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -119,12 +119,6 @@ static void init_centaur(struct cpuinfo_x86 *c) u32 fcr_clr = 0; u32 lo, hi, newlo; u32 aa, bb, cc, dd; - - /* - * Bit 31 in normal CPUID used for nonstandard 3DNow ID; - * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway - */ - clear_cpu_cap(c, 0*32+31); #endif early_init_centaur(c); init_intel_cacheinfo(c); diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index dfec2c61e3547d..8f22085c4dd23a 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -195,12 +195,6 @@ static void init_cyrix(struct cpuinfo_x86 *c) char *buf = c->x86_model_id; const char *p = NULL; - /* - * Bit 31 in normal CPUID used for nonstandard 3DNow ID; - * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway - */ - clear_cpu_cap(c, 0*32+31); - /* Cyrix used bit 24 in extended (AMD) CPUID for Cyrix MMX extensions */ if (test_cpu_cap(c, 1*32+24)) { clear_cpu_cap(c, 1*32+24); diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index 1fda6c3a2b65a7..7f95a74e4c6576 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -174,12 +174,6 @@ static void init_hygon(struct cpuinfo_x86 *c) early_init_hygon(c); - /* - * Bit 31 in normal CPUID used for nonstandard 3DNow ID; - * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway - */ - clear_cpu_cap(c, 0*32+31); - set_cpu_cap(c, X86_FEATURE_REP_GOOD); /* diff --git 
a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index 66f1efa16fbb72..9322a9287dc7f5 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -242,7 +242,7 @@ static int __sgx_encl_add_page(struct sgx_encl *encl, /* * If the caller requires measurement of the page as a proof for the content, * use EEXTEND to add a measurement for 256 bytes of the page. Repeat this - * operation until the entire page is measured." + * operation until the entire page is measured. */ static int __sgx_encl_extend(struct sgx_encl *encl, struct sgx_epc_page *epc_page) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 48113c5193aa3c..76153dfb58c9d0 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1946,7 +1946,7 @@ static int dump_xsave_layout_desc(struct coredump_params *cprm) }; if (!dump_emit(cprm, &xc, sizeof(xc))) - return 0; + return -1; num_records++; } @@ -1984,7 +1984,7 @@ int elf_coredump_extra_notes_write(struct coredump_params *cprm) return 1; num_records = dump_xsave_layout_desc(cprm); - if (!num_records) + if (num_records < 0) return 1; /* Total size should be equal to the number of records */ diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 1280693766b9dd..280b4ac0990ff3 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -586,6 +586,25 @@ struct ftrace_likely_data { __scalar_type_to_expr_cases(long long), \ default: (x))) +/* + * __signed_scalar_typeof(x) - Declare a signed scalar type, leaving + * non-scalar types unchanged. + */ + +#define __scalar_type_to_signed_cases(type) \ + unsigned type: (signed type)0, \ + signed type: (signed type)0 + +#define __signed_scalar_typeof(x) typeof( \ + _Generic((x), \ + char: (signed char)0, \ + __scalar_type_to_signed_cases(char), \ + __scalar_type_to_signed_cases(short), \ + __scalar_type_to_signed_cases(int), \ + __scalar_type_to_signed_cases(long), \ + __scalar_type_to_signed_cases(long long), \ + default: (x))) + /* Is this type a native word size -- useful for atomic operations */ #define __native_word(t) \ (sizeof(t) == sizeof(char) || sizeof(t) == sizeof(short) || \ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 9f6de068295d30..42af2292951d4f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1631,7 +1631,6 @@ enum tlb_flush_reason { TLB_LOCAL_MM_SHOOTDOWN, TLB_REMOTE_SEND_IPI, TLB_REMOTE_WRONG_CPU, - NR_TLB_FLUSH_REASONS, }; /** diff --git a/include/linux/sched.h b/include/linux/sched.h index d395f2810facbe..bf96a7d595e226 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -586,15 +586,10 @@ struct sched_entity { u64 sum_exec_runtime; u64 prev_sum_exec_runtime; u64 vruntime; - union { - /* - * When !@on_rq this field is vlag. - * When cfs_rq->curr == se (which implies @on_rq) - * this field is vprot. See protect_slice(). 
- */ - s64 vlag; - u64 vprot; - }; + /* Approximated virtual lag: */ + s64 vlag; + /* 'Protected' deadline, to give out minimum quantums: */ + u64 vprot; u64 slice; u64 nr_migrations; diff --git a/include/trace/events/tlb.h b/include/trace/events/tlb.h index b4d8e7dc38f880..fb836951168564 100644 --- a/include/trace/events/tlb.h +++ b/include/trace/events/tlb.h @@ -12,8 +12,9 @@ EM( TLB_FLUSH_ON_TASK_SWITCH, "flush on task switch" ) \ EM( TLB_REMOTE_SHOOTDOWN, "remote shootdown" ) \ EM( TLB_LOCAL_SHOOTDOWN, "local shootdown" ) \ - EM( TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" ) \ - EMe( TLB_REMOTE_SEND_IPI, "remote ipi send" ) + EM( TLB_LOCAL_MM_SHOOTDOWN, "local MM shootdown" ) \ + EM( TLB_REMOTE_SEND_IPI, "remote IPI send" ) \ + EMe( TLB_REMOTE_WRONG_CPU, "remote wrong CPU" ) /* * First define the enums in TLB_FLUSH_REASON to be exported to userspace diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 8b1b4c8a4f54c5..349ae7979da0e3 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1414,7 +1414,7 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) * Ensure the thread adjusts the affinity once it reaches the * thread function. */ - new->thread_flags = BIT(IRQTF_AFFINITY); + set_bit(IRQTF_AFFINITY, &new->thread_flags); return 0; } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 41ba0be1691174..7d0a862a8c75ca 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2090,7 +2090,6 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags) */ uclamp_rq_inc(rq, p, flags); - rq->queue_mask |= p->sched_class->queue_mask; p->sched_class->enqueue_task(rq, p, flags); psi_enqueue(p, flags); @@ -2123,7 +2122,6 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags) * and mark the task ->sched_delayed. */ uclamp_rq_dec(rq, p); - rq->queue_mask |= p->sched_class->queue_mask; return p->sched_class->dequeue_task(rq, p, flags); } @@ -2174,10 +2172,14 @@ void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags) { struct task_struct *donor = rq->donor; - if (p->sched_class == donor->sched_class) - donor->sched_class->wakeup_preempt(rq, p, flags); - else if (sched_class_above(p->sched_class, donor->sched_class)) + if (p->sched_class == rq->next_class) { + rq->next_class->wakeup_preempt(rq, p, flags); + + } else if (sched_class_above(p->sched_class, rq->next_class)) { + rq->next_class->wakeup_preempt(rq, p, flags); resched_curr(rq); + rq->next_class = p->sched_class; + } /* * A queue event has occurred, and we're going to schedule. 
In @@ -6804,6 +6806,7 @@ static void __sched notrace __schedule(int sched_mode) pick_again: next = pick_next_task(rq, rq->donor, &rf); rq_set_donor(rq, next); + rq->next_class = next->sched_class; if (unlikely(task_is_blocked(next))) { next = find_proxy_task(rq, next, &rf); if (!next) @@ -8650,6 +8653,8 @@ void __init sched_init(void) rq->rt.rt_runtime = global_rt_runtime(); init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); #endif + rq->next_class = &idle_sched_class; + rq->sd = NULL; rq->rd = NULL; rq->cpu_capacity = SCHED_CAPACITY_SCALE; @@ -10775,13 +10780,12 @@ struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags |= DEQUEUE_NOCLOCK; } - if (flags & DEQUEUE_CLASS) { - if (p->sched_class->switching_from) - p->sched_class->switching_from(rq, p); - } + if ((flags & DEQUEUE_CLASS) && p->sched_class->switching_from) + p->sched_class->switching_from(rq, p); *ctx = (struct sched_change_ctx){ .p = p, + .class = p->sched_class, .flags = flags, .queued = task_on_rq_queued(p), .running = task_current_donor(rq, p), @@ -10812,6 +10816,11 @@ void sched_change_end(struct sched_change_ctx *ctx) lockdep_assert_rq_held(rq); + /* + * Changing class without *QUEUE_CLASS is bad. + */ + WARN_ON_ONCE(p->sched_class != ctx->class && !(ctx->flags & ENQUEUE_CLASS)); + if ((ctx->flags & ENQUEUE_CLASS) && p->sched_class->switching_to) p->sched_class->switching_to(rq, p); @@ -10823,6 +10832,24 @@ void sched_change_end(struct sched_change_ctx *ctx) if (ctx->flags & ENQUEUE_CLASS) { if (p->sched_class->switched_to) p->sched_class->switched_to(rq, p); + + /* + * If this was a class promotion; let the old class know it + * got preempted. Note that none of the switch*_from() methods + * know the new class and none of the switch*_to() methods + * know the old class. + */ + if (ctx->running && sched_class_above(p->sched_class, ctx->class)) { + rq->next_class->wakeup_preempt(rq, p, 0); + rq->next_class = p->sched_class; + } + + /* + * If this was a degradation in class someone should have set + * need_resched by now. + */ + WARN_ON_ONCE(sched_class_above(ctx->class, p->sched_class) && + !test_tsk_need_resched(p)); } else { p->sched_class->prio_changed(rq, p, ctx->prio); } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 319439fe187026..80c9559a3e30ea 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2499,9 +2499,16 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf) * Only called when both the current and waking task are -deadline * tasks. */ -static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, - int flags) +static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags) { + /* + * Can only get preempted by stop-class, and those should be + * few and short lived, doesn't really make sense to push + * anything away for that. 
+ */ + if (p->sched_class != &dl_sched_class) + return; + if (dl_entity_preempt(&p->dl, &rq->donor->dl)) { resched_curr(rq); return; @@ -3346,9 +3353,6 @@ static int task_is_throttled_dl(struct task_struct *p, int cpu) #endif DEFINE_SCHED_CLASS(dl) = { - - .queue_mask = 8, - .enqueue_task = enqueue_task_dl, .dequeue_task = dequeue_task_dl, .yield_task = yield_task_dl, diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 05f5a49e9649a9..305877745e331c 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -2431,7 +2431,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx) /* see kick_cpus_irq_workfn() */ smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); - rq_modified_clear(rq); + rq->next_class = &fair_sched_class; rq_unpin_lock(rq, rf); balance_one(rq, prev); @@ -3075,7 +3075,8 @@ static void switched_from_scx(struct rq *rq, struct task_struct *p) scx_disable_task(p); } -static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {} +static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_flags) {} + static void switched_to_scx(struct rq *rq, struct task_struct *p) {} int scx_check_setscheduler(struct task_struct *p, int policy) @@ -3336,8 +3337,6 @@ static void scx_cgroup_unlock(void) {} * their current sched_class. Call them directly from sched core instead. */ DEFINE_SCHED_CLASS(ext) = { - .queue_mask = 1, - .enqueue_task = enqueue_task_scx, .dequeue_task = dequeue_task_scx, .yield_task = yield_task_scx, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index da46c316453783..76f5e4b78b3069 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -524,10 +524,48 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec); * Scheduling class tree data structure manipulation methods: */ +extern void __BUILD_BUG_vruntime_cmp(void); + +/* Use __builtin_strcmp() because of __HAVE_ARCH_STRCMP: */ + +#define vruntime_cmp(A, CMP_STR, B) ({ \ + int __res = 0; \ + \ + if (!__builtin_strcmp(CMP_STR, "<")) { \ + __res = ((s64)((A)-(B)) < 0); \ + } else if (!__builtin_strcmp(CMP_STR, "<=")) { \ + __res = ((s64)((A)-(B)) <= 0); \ + } else if (!__builtin_strcmp(CMP_STR, ">")) { \ + __res = ((s64)((A)-(B)) > 0); \ + } else if (!__builtin_strcmp(CMP_STR, ">=")) { \ + __res = ((s64)((A)-(B)) >= 0); \ + } else { \ + /* Unknown operator throws linker error: */ \ + __BUILD_BUG_vruntime_cmp(); \ + } \ + \ + __res; \ +}) + +extern void __BUILD_BUG_vruntime_op(void); + +#define vruntime_op(A, OP_STR, B) ({ \ + s64 __res = 0; \ + \ + if (!__builtin_strcmp(OP_STR, "-")) { \ + __res = (s64)((A)-(B)); \ + } else { \ + /* Unknown operator throws linker error: */ \ + __BUILD_BUG_vruntime_op(); \ + } \ + \ + __res; \ +}) + + static inline __maybe_unused u64 max_vruntime(u64 max_vruntime, u64 vruntime) { - s64 delta = (s64)(vruntime - max_vruntime); - if (delta > 0) + if (vruntime_cmp(vruntime, ">", max_vruntime)) max_vruntime = vruntime; return max_vruntime; @@ -535,8 +573,7 @@ static inline __maybe_unused u64 max_vruntime(u64 max_vruntime, u64 vruntime) static inline __maybe_unused u64 min_vruntime(u64 min_vruntime, u64 vruntime) { - s64 delta = (s64)(vruntime - min_vruntime); - if (delta < 0) + if (vruntime_cmp(vruntime, "<", min_vruntime)) min_vruntime = vruntime; return min_vruntime; @@ -549,12 +586,12 @@ static inline bool entity_before(const struct sched_entity *a, * Tiebreak on vruntime seems unnecessary since it can * hardly happen. 
*/ - return (s64)(a->deadline - b->deadline) < 0; + return vruntime_cmp(a->deadline, "<", b->deadline); } static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) { - return (s64)(se->vruntime - cfs_rq->zero_vruntime); + return vruntime_op(se->vruntime, "-", cfs_rq->zero_vruntime); } #define __node_2_se(node) \ @@ -607,8 +644,8 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) * Which we track using: * * v0 := cfs_rq->zero_vruntime - * \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime - * \Sum w_i := cfs_rq->avg_load + * \Sum (v_i - v0) * w_i := cfs_rq->sum_w_vruntime + * \Sum w_i := cfs_rq->sum_weight * * Since zero_vruntime closely tracks the per-task service, these * deltas: (v_i - v), will be in the order of the maximal (virtual) lag @@ -619,32 +656,32 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) * As measured, the max (key * weight) value was ~44 bits for a kernel build. */ static void -avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) +sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) { unsigned long weight = scale_load_down(se->load.weight); s64 key = entity_key(cfs_rq, se); - cfs_rq->avg_vruntime += key * weight; - cfs_rq->avg_load += weight; + cfs_rq->sum_w_vruntime += key * weight; + cfs_rq->sum_weight += weight; } static void -avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se) +sum_w_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se) { unsigned long weight = scale_load_down(se->load.weight); s64 key = entity_key(cfs_rq, se); - cfs_rq->avg_vruntime -= key * weight; - cfs_rq->avg_load -= weight; + cfs_rq->sum_w_vruntime -= key * weight; + cfs_rq->sum_weight -= weight; } static inline -void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta) +void sum_w_vruntime_update(struct cfs_rq *cfs_rq, s64 delta) { /* - * v' = v + d ==> avg_vruntime' = avg_runtime - d*avg_load + * v' = v + d ==> sum_w_vruntime' = sum_runtime - d*sum_weight */ - cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta; + cfs_rq->sum_w_vruntime -= cfs_rq->sum_weight * delta; } /* @@ -654,8 +691,8 @@ void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta) u64 avg_vruntime(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; - s64 avg = cfs_rq->avg_vruntime; - long load = cfs_rq->avg_load; + s64 avg = cfs_rq->sum_w_vruntime; + long load = cfs_rq->sum_weight; if (curr && curr->on_rq) { unsigned long weight = scale_load_down(curr->load.weight); @@ -722,8 +759,8 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime) { struct sched_entity *curr = cfs_rq->curr; - s64 avg = cfs_rq->avg_vruntime; - long load = cfs_rq->avg_load; + s64 avg = cfs_rq->sum_w_vruntime; + long load = cfs_rq->sum_weight; if (curr && curr->on_rq) { unsigned long weight = scale_load_down(curr->load.weight); @@ -732,7 +769,7 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime) load += weight; } - return avg >= (s64)(vruntime - cfs_rq->zero_vruntime) * load; + return avg >= vruntime_op(vruntime, "-", cfs_rq->zero_vruntime) * load; } int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -743,9 +780,9 @@ int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) static void update_zero_vruntime(struct cfs_rq *cfs_rq) { u64 vruntime = avg_vruntime(cfs_rq); - s64 delta = (s64)(vruntime - cfs_rq->zero_vruntime); + s64 delta = vruntime_op(vruntime, "-", cfs_rq->zero_vruntime); 
- avg_vruntime_update(cfs_rq, delta); + sum_w_vruntime_update(cfs_rq, delta); cfs_rq->zero_vruntime = vruntime; } @@ -770,13 +807,12 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) return entity_before(__node_2_se(a), __node_2_se(b)); } -#define vruntime_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; }) - static inline void __min_vruntime_update(struct sched_entity *se, struct rb_node *node) { if (node) { struct sched_entity *rse = __node_2_se(node); - if (vruntime_gt(min_vruntime, se, rse)) + + if (vruntime_cmp(se->min_vruntime, ">", rse->min_vruntime)) se->min_vruntime = rse->min_vruntime; } } @@ -819,7 +855,7 @@ RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity, */ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - avg_vruntime_add(cfs_rq, se); + sum_w_vruntime_add(cfs_rq, se); update_zero_vruntime(cfs_rq); se->min_vruntime = se->vruntime; se->min_slice = se->slice; @@ -831,7 +867,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, &min_vruntime_cb); - avg_vruntime_sub(cfs_rq, se); + sum_w_vruntime_sub(cfs_rq, se); update_zero_vruntime(cfs_rq); } @@ -887,7 +923,7 @@ static inline void update_protect_slice(struct cfs_rq *cfs_rq, struct sched_enti static inline bool protect_slice(struct sched_entity *se) { - return ((s64)(se->vprot - se->vruntime) > 0); + return vruntime_cmp(se->vruntime, "<", se->vprot); } static inline void cancel_protect_slice(struct sched_entity *se) @@ -1024,7 +1060,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); */ static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) { - if ((s64)(se->vruntime - se->deadline) < 0) + if (vruntime_cmp(se->vruntime, "<", se->deadline)) return false; /* @@ -1513,7 +1549,7 @@ static unsigned int task_scan_start(struct task_struct *p) /* Scale the maximum scan period with the amount of shared memory. */ rcu_read_lock(); - ng = rcu_dereference(p->numa_group); + ng = rcu_dereference_all(p->numa_group); if (ng) { unsigned long shared = group_faults_shared(ng); unsigned long private = group_faults_priv(ng); @@ -1580,7 +1616,7 @@ pid_t task_numa_group_id(struct task_struct *p) pid_t gid = 0; rcu_read_lock(); - ng = rcu_dereference(p->numa_group); + ng = rcu_dereference_all(p->numa_group); if (ng) gid = ng->gid; rcu_read_unlock(); @@ -2239,7 +2275,7 @@ static bool task_numa_compare(struct task_numa_env *env, return false; rcu_read_lock(); - cur = rcu_dereference(dst_rq->curr); + cur = rcu_dereference_all(dst_rq->curr); if (cur && ((cur->flags & (PF_EXITING | PF_KTHREAD)) || !cur->mm)) cur = NULL; @@ -2284,7 +2320,7 @@ static bool task_numa_compare(struct task_numa_env *env, * If dst and source tasks are in the same NUMA group, or not * in any group then look only at task weights. */ - cur_ng = rcu_dereference(cur->numa_group); + cur_ng = rcu_dereference_all(cur->numa_group); if (cur_ng == p_ng) { /* * Do not swap within a group or between tasks that have @@ -2499,7 +2535,7 @@ static int task_numa_migrate(struct task_struct *p) * to satisfy here. 
*/ rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu)); + sd = rcu_dereference_all(per_cpu(sd_numa, env.src_cpu)); if (sd) { env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2; env.imb_numa_nr = sd->imb_numa_nr; @@ -3022,7 +3058,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, if (!cpupid_match_pid(tsk, cpupid)) goto no_join; - grp = rcu_dereference(tsk->numa_group); + grp = rcu_dereference_all(tsk->numa_group); if (!grp) goto no_join; @@ -3693,7 +3729,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) */ #define add_positive(_ptr, _val) do { \ typeof(_ptr) ptr = (_ptr); \ - typeof(_val) val = (_val); \ + __signed_scalar_typeof(*ptr) val = (_val); \ typeof(*ptr) res, var = READ_ONCE(*ptr); \ \ res = var + val; \ @@ -3705,23 +3741,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) } while (0) /* - * Unsigned subtract and clamp on underflow. - * - * Explicitly do a load-store to ensure the intermediate value never hits - * memory. This allows lockless observations without ever seeing the negative - * values. - */ -#define sub_positive(_ptr, _val) do { \ - typeof(_ptr) ptr = (_ptr); \ - typeof(*ptr) val = (_val); \ - typeof(*ptr) res, var = READ_ONCE(*ptr); \ - res = var - val; \ - if (res > var) \ - res = 0; \ - WRITE_ONCE(*ptr, res); \ -} while (0) - -/* * Remove and clamp on negative, from a local variable. * * A variant of sub_positive(), which does not use explicit load-store @@ -3732,21 +3751,37 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) *ptr -= min_t(typeof(*ptr), *ptr, _val); \ } while (0) + +/* + * Because of rounding, se->util_sum might ends up being +1 more than + * cfs->util_sum. Although this is not a problem by itself, detaching + * a lot of tasks with the rounding problem between 2 updates of + * util_avg (~1ms) can make cfs->util_sum becoming null whereas + * cfs_util_avg is not. + * + * Check that util_sum is still above its lower bound for the new + * util_avg. 
Given that period_contrib might have moved since the last + * sync, we are only sure that util_sum must be above or equal to + * util_avg * minimum possible divider + */ +#define __update_sa(sa, name, delta_avg, delta_sum) do { \ + add_positive(&(sa)->name##_avg, delta_avg); \ + add_positive(&(sa)->name##_sum, delta_sum); \ + (sa)->name##_sum = max_t(typeof((sa)->name##_sum), \ + (sa)->name##_sum, \ + (sa)->name##_avg * PELT_MIN_DIVIDER); \ +} while (0) + static inline void enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - cfs_rq->avg.load_avg += se->avg.load_avg; - cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum; + __update_sa(&cfs_rq->avg, load, se->avg.load_avg, se->avg.load_sum); } static inline void dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); - sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum); - /* See update_cfs_rq_load_avg() */ - cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum, - cfs_rq->avg.load_avg * PELT_MIN_DIVIDER); + __update_sa(&cfs_rq->avg, load, -se->avg.load_avg, -se->avg.load_sum); } static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags); @@ -4242,7 +4277,6 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq */ divider = get_pelt_divider(&cfs_rq->avg); - /* Set new sched_entity's utilization */ se->avg.util_avg = gcfs_rq->avg.util_avg; new_sum = se->avg.util_avg * divider; @@ -4250,12 +4284,7 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq se->avg.util_sum = new_sum; /* Update parent cfs_rq utilization */ - add_positive(&cfs_rq->avg.util_avg, delta_avg); - add_positive(&cfs_rq->avg.util_sum, delta_sum); - - /* See update_cfs_rq_load_avg() */ - cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum, - cfs_rq->avg.util_avg * PELT_MIN_DIVIDER); + __update_sa(&cfs_rq->avg, util, delta_avg, delta_sum); } static inline void @@ -4281,11 +4310,7 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf se->avg.runnable_sum = new_sum; /* Update parent cfs_rq runnable */ - add_positive(&cfs_rq->avg.runnable_avg, delta_avg); - add_positive(&cfs_rq->avg.runnable_sum, delta_sum); - /* See update_cfs_rq_load_avg() */ - cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum, - cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER); + __update_sa(&cfs_rq->avg, runnable, delta_avg, delta_sum); } static inline void @@ -4349,11 +4374,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq se->avg.load_sum = runnable_sum; se->avg.load_avg = load_avg; - add_positive(&cfs_rq->avg.load_avg, delta_avg); - add_positive(&cfs_rq->avg.load_sum, delta_sum); - /* See update_cfs_rq_load_avg() */ - cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum, - cfs_rq->avg.load_avg * PELT_MIN_DIVIDER); + __update_sa(&cfs_rq->avg, load, delta_avg, delta_sum); } static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) @@ -4450,7 +4471,7 @@ static inline void migrate_se_pelt_lag(struct sched_entity *se) rq = rq_of(cfs_rq); rcu_read_lock(); - is_idle = is_idle_task(rcu_dereference(rq->curr)); + is_idle = is_idle_task(rcu_dereference_all(rq->curr)); rcu_read_unlock(); /* @@ -4552,33 +4573,13 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) raw_spin_unlock(&cfs_rq->removed.lock); r = removed_load; - sub_positive(&sa->load_avg, r); - sub_positive(&sa->load_sum, r * divider); - /* See sa->util_sum 
below */ - sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER); + __update_sa(sa, load, -r, -r*divider); r = removed_util; - sub_positive(&sa->util_avg, r); - sub_positive(&sa->util_sum, r * divider); - /* - * Because of rounding, se->util_sum might ends up being +1 more than - * cfs->util_sum. Although this is not a problem by itself, detaching - * a lot of tasks with the rounding problem between 2 updates of - * util_avg (~1ms) can make cfs->util_sum becoming null whereas - * cfs_util_avg is not. - * Check that util_sum is still above its lower bound for the new - * util_avg. Given that period_contrib might have moved since the last - * sync, we are only sure that util_sum must be above or equal to - * util_avg * minimum possible divider - */ - sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER); + __update_sa(sa, util, -r, -r*divider); r = removed_runnable; - sub_positive(&sa->runnable_avg, r); - sub_positive(&sa->runnable_sum, r * divider); - /* See sa->util_sum above */ - sa->runnable_sum = max_t(u32, sa->runnable_sum, - sa->runnable_avg * PELT_MIN_DIVIDER); + __update_sa(sa, runnable, -r, -r*divider); /* * removed_runnable is the unweighted version of removed_load so we @@ -4663,17 +4664,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { dequeue_load_avg(cfs_rq, se); - sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); - sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); - /* See update_cfs_rq_load_avg() */ - cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum, - cfs_rq->avg.util_avg * PELT_MIN_DIVIDER); - - sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg); - sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum); - /* See update_cfs_rq_load_avg() */ - cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum, - cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER); + __update_sa(&cfs_rq->avg, util, -se->avg.util_avg, -se->avg.util_sum); + __update_sa(&cfs_rq->avg, runnable, -se->avg.runnable_avg, -se->avg.runnable_sum); add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); @@ -5175,7 +5167,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * * vl_i = (W + w_i)*vl'_i / W */ - load = cfs_rq->avg_load; + load = cfs_rq->sum_weight; if (curr && curr->on_rq) load += scale_load_down(curr->load.weight); @@ -7148,7 +7140,7 @@ static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask); static struct { cpumask_var_t idle_cpus_mask; atomic_t nr_cpus; - int has_blocked; /* Idle CPUS has blocked load */ + int has_blocked_load; /* Idle CPUS has blocked load */ int needs_update; /* Newly idle CPUs need their next_balance collated */ unsigned long next_balance; /* in jiffy units */ unsigned long next_blocked; /* Next update of blocked load in jiffies */ @@ -7506,7 +7498,7 @@ static inline void set_idle_cores(int cpu, int val) { struct sched_domain_shared *sds; - sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); + sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu)); if (sds) WRITE_ONCE(sds->has_idle_cores, val); } @@ -7515,7 +7507,7 @@ static inline bool test_idle_cores(int cpu) { struct sched_domain_shared *sds; - sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); + sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu)); if (sds) return READ_ONCE(sds->has_idle_cores); @@ -7644,7 +7636,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool 
cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); if (sched_feat(SIS_UTIL)) { - sd_share = rcu_dereference(per_cpu(sd_llc_shared, target)); + sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, target)); if (sd_share) { /* because !--nr is the condition to stop scan */ nr = READ_ONCE(sd_share->nr_idle_scan) + 1; @@ -7850,7 +7842,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) * sd_asym_cpucapacity rather than sd_llc. */ if (sched_asym_cpucap_active()) { - sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target)); + sd = rcu_dereference_all(per_cpu(sd_asym_cpucapacity, target)); /* * On an asymmetric CPU capacity system where an exclusive * cpuset defines a symmetric island (i.e. one unique @@ -7865,7 +7857,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) } } - sd = rcu_dereference(per_cpu(sd_llc, target)); + sd = rcu_dereference_all(per_cpu(sd_llc, target)); if (!sd) return target; @@ -8334,7 +8326,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) struct energy_env eenv; rcu_read_lock(); - pd = rcu_dereference(rd->pd); + pd = rcu_dereference_all(rd->pd); if (!pd) goto unlock; @@ -8342,7 +8334,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) * Energy-aware wake-up happens on the lowest sched_domain starting * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu. */ - sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity)); + sd = rcu_dereference_all(*this_cpu_ptr(&sd_asym_cpucapacity)); while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) sd = sd->parent; if (!sd) @@ -8744,7 +8736,7 @@ preempt_sync(struct rq *rq, int wake_flags, /* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags) +static void wakeup_preempt_fair(struct rq *rq, struct task_struct *p, int wake_flags) { enum preempt_wakeup_action preempt_action = PREEMPT_WAKEUP_PICK; struct task_struct *donor = rq->donor; @@ -8752,6 +8744,12 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int struct cfs_rq *cfs_rq = task_cfs_rq(donor); int cse_is_idle, pse_is_idle; + /* + * XXX Getting preempted by higher class, try and find idle CPU? 
+ */ + if (p->sched_class != &fair_sched_class) + return; + if (unlikely(se == pse)) return; @@ -9349,7 +9347,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env) */ static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { - struct numa_group *numa_group = rcu_dereference(p->numa_group); + struct numa_group *numa_group = rcu_dereference_all(p->numa_group); unsigned long src_weight, dst_weight; int src_nid, dst_nid, dist; @@ -9778,7 +9776,7 @@ static void attach_tasks(struct lb_env *env) } #ifdef CONFIG_NO_HZ_COMMON -static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) +static inline bool cfs_rq_has_blocked_load(struct cfs_rq *cfs_rq) { if (cfs_rq->avg.load_avg) return true; @@ -9811,16 +9809,16 @@ static inline void update_blocked_load_tick(struct rq *rq) WRITE_ONCE(rq->last_blocked_load_update_tick, jiffies); } -static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) +static inline void update_has_blocked_load_status(struct rq *rq, bool has_blocked_load) { - if (!has_blocked) + if (!has_blocked_load) rq->has_blocked_load = 0; } #else /* !CONFIG_NO_HZ_COMMON: */ -static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; } +static inline bool cfs_rq_has_blocked_load(struct cfs_rq *cfs_rq) { return false; } static inline bool others_have_blocked(struct rq *rq) { return false; } static inline void update_blocked_load_tick(struct rq *rq) {} -static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {} +static inline void update_has_blocked_load_status(struct rq *rq, bool has_blocked_load) {} #endif /* !CONFIG_NO_HZ_COMMON */ static bool __update_blocked_others(struct rq *rq, bool *done) @@ -9877,7 +9875,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done) list_del_leaf_cfs_rq(cfs_rq); /* Don't need periodic decay once load/util_avg are null */ - if (cfs_rq_has_blocked(cfs_rq)) + if (cfs_rq_has_blocked_load(cfs_rq)) *done = false; } @@ -9937,7 +9935,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done) bool decayed; decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq); - if (cfs_rq_has_blocked(cfs_rq)) + if (cfs_rq_has_blocked_load(cfs_rq)) *done = false; return decayed; @@ -9949,23 +9947,27 @@ static unsigned long task_h_load(struct task_struct *p) } #endif /* !CONFIG_FAIR_GROUP_SCHED */ -static void sched_balance_update_blocked_averages(int cpu) +static void __sched_balance_update_blocked_averages(struct rq *rq) { bool decayed = false, done = true; - struct rq *rq = cpu_rq(cpu); - struct rq_flags rf; - rq_lock_irqsave(rq, &rf); update_blocked_load_tick(rq); - update_rq_clock(rq); decayed |= __update_blocked_others(rq, &done); decayed |= __update_blocked_fair(rq, &done); - update_blocked_load_status(rq, !done); + update_has_blocked_load_status(rq, !done); if (decayed) cpufreq_update_util(rq, 0); - rq_unlock_irqrestore(rq, &rf); +} + +static void sched_balance_update_blocked_averages(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + guard(rq_lock_irqsave)(rq); + update_rq_clock(rq); + __sched_balance_update_blocked_averages(rq); } /********** Helpers for sched_balance_find_src_group ************************/ @@ -11025,7 +11027,7 @@ static void update_idle_cpu_scan(struct lb_env *env, if (env->sd->span_weight != llc_weight) return; - sd_share = rcu_dereference(per_cpu(sd_llc_shared, env->dst_cpu)); + sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, env->dst_cpu)); if (!sd_share) return; @@ -11375,7 +11377,7 @@ static struct sched_group 
*sched_balance_find_src_group(struct lb_env *env) goto force_balance; if (!is_rd_overutilized(env->dst_rq->rd) && - rcu_dereference(env->dst_rq->rd->pd)) + rcu_dereference_all(env->dst_rq->rd->pd)) goto out_balanced; /* ASYM feature bypasses nice load balance check */ @@ -12450,7 +12452,7 @@ static void nohz_balancer_kick(struct rq *rq) if (likely(!atomic_read(&nohz.nr_cpus))) return; - if (READ_ONCE(nohz.has_blocked) && + if (READ_ONCE(nohz.has_blocked_load) && time_after(now, READ_ONCE(nohz.next_blocked))) flags = NOHZ_STATS_KICK; @@ -12464,7 +12466,7 @@ static void nohz_balancer_kick(struct rq *rq) rcu_read_lock(); - sd = rcu_dereference(rq->sd); + sd = rcu_dereference_all(rq->sd); if (sd) { /* * If there's a runnable CFS task and the current CPU has reduced @@ -12476,7 +12478,7 @@ static void nohz_balancer_kick(struct rq *rq) } } - sd = rcu_dereference(per_cpu(sd_asym_packing, cpu)); + sd = rcu_dereference_all(per_cpu(sd_asym_packing, cpu)); if (sd) { /* * When ASYM_PACKING; see if there's a more preferred CPU @@ -12494,7 +12496,7 @@ static void nohz_balancer_kick(struct rq *rq) } } - sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu)); + sd = rcu_dereference_all(per_cpu(sd_asym_cpucapacity, cpu)); if (sd) { /* * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU @@ -12515,7 +12517,7 @@ static void nohz_balancer_kick(struct rq *rq) goto unlock; } - sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); + sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu)); if (sds) { /* * If there is an imbalance between LLC domains (IOW we could @@ -12547,7 +12549,7 @@ static void set_cpu_sd_state_busy(int cpu) struct sched_domain *sd; rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_llc, cpu)); + sd = rcu_dereference_all(per_cpu(sd_llc, cpu)); if (!sd || !sd->nohz_idle) goto unlock; @@ -12577,7 +12579,7 @@ static void set_cpu_sd_state_idle(int cpu) struct sched_domain *sd; rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_llc, cpu)); + sd = rcu_dereference_all(per_cpu(sd_llc, cpu)); if (!sd || sd->nohz_idle) goto unlock; @@ -12611,9 +12613,9 @@ void nohz_balance_enter_idle(int cpu) /* * The tick is still stopped but load could have been added in the - * meantime. We set the nohz.has_blocked flag to trig a check of the + * meantime. We set the nohz.has_blocked_load flag to trig a check of the * *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear - * of nohz.has_blocked can only happen after checking the new load + * of nohz.has_blocked_load can only happen after checking the new load */ if (rq->nohz_tick_stopped) goto out; @@ -12629,7 +12631,7 @@ void nohz_balance_enter_idle(int cpu) /* * Ensures that if nohz_idle_balance() fails to observe our - * @idle_cpus_mask store, it must observe the @has_blocked + * @idle_cpus_mask store, it must observe the @has_blocked_load * and @needs_update stores. */ smp_mb__after_atomic(); @@ -12642,7 +12644,7 @@ out: * Each time a cpu enter idle, we assume that it has blocked load and * enable the periodic update of the load of idle CPUs */ - WRITE_ONCE(nohz.has_blocked, 1); + WRITE_ONCE(nohz.has_blocked_load, 1); } static bool update_nohz_stats(struct rq *rq) @@ -12683,8 +12685,8 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags) /* * We assume there will be no idle load after this update and clear - * the has_blocked flag. If a cpu enters idle in the mean time, it will - * set the has_blocked flag and trigger another update of idle load. + * the has_blocked_load flag. 
If a cpu enters idle in the mean time, it will + * set the has_blocked_load flag and trigger another update of idle load. * Because a cpu that becomes idle, is added to idle_cpus_mask before * setting the flag, we are sure to not clear the state and not * check the load of an idle cpu. @@ -12692,12 +12694,12 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags) * Same applies to idle_cpus_mask vs needs_update. */ if (flags & NOHZ_STATS_KICK) - WRITE_ONCE(nohz.has_blocked, 0); + WRITE_ONCE(nohz.has_blocked_load, 0); if (flags & NOHZ_NEXT_KICK) WRITE_ONCE(nohz.needs_update, 0); /* - * Ensures that if we miss the CPU, we must see the has_blocked + * Ensures that if we miss the CPU, we must see the has_blocked_load * store from nohz_balance_enter_idle(). */ smp_mb(); @@ -12764,7 +12766,7 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags) abort: /* There is still blocked load, enable periodic update */ if (has_blocked_load) - WRITE_ONCE(nohz.has_blocked, 1); + WRITE_ONCE(nohz.has_blocked_load, 1); } /* @@ -12826,7 +12828,7 @@ static void nohz_newidle_balance(struct rq *this_rq) return; /* Don't need to update blocked load of idle CPUs*/ - if (!READ_ONCE(nohz.has_blocked) || + if (!READ_ONCE(nohz.has_blocked_load) || time_before(jiffies, READ_ONCE(nohz.next_blocked))) return; @@ -12896,29 +12898,28 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) */ rq_unpin_lock(this_rq, rf); - rcu_read_lock(); - sd = rcu_dereference_check_sched_domain(this_rq->sd); - if (!sd) { - rcu_read_unlock(); + sd = rcu_dereference_sched_domain(this_rq->sd); + if (!sd) goto out; - } if (!get_rd_overloaded(this_rq->rd) || this_rq->avg_idle < sd->max_newidle_lb_cost) { update_next_balance(sd, &next_balance); - rcu_read_unlock(); goto out; } - rcu_read_unlock(); - - rq_modified_clear(this_rq); - raw_spin_rq_unlock(this_rq); + /* + * Include sched_balance_update_blocked_averages() in the cost + * calculation because it can be quite costly -- this ensures we skip + * it when avg_idle gets to be very low. + */ t0 = sched_clock_cpu(this_cpu); - sched_balance_update_blocked_averages(this_cpu); + __sched_balance_update_blocked_averages(this_rq); + + this_rq->next_class = &fair_sched_class; + raw_spin_rq_unlock(this_rq); - rcu_read_lock(); for_each_domain(this_cpu, sd) { u64 domain_cost; @@ -12968,7 +12969,6 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) if (pulled_task || !continue_balancing) break; } - rcu_read_unlock(); raw_spin_rq_lock(this_rq); @@ -12984,7 +12984,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) pulled_task = 1; /* If a higher prio class was modified, restart the pick */ - if (rq_modified_above(this_rq, &fair_sched_class)) + if (sched_class_above(this_rq->next_class, &fair_sched_class)) pulled_task = -1; out: @@ -13335,8 +13335,8 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b, * zero_vruntime_fi, which would have been updated in prior calls * to se_fi_update(). 
*/ - delta = (s64)(sea->vruntime - seb->vruntime) + - (s64)(cfs_rqb->zero_vruntime_fi - cfs_rqa->zero_vruntime_fi); + delta = vruntime_op(sea->vruntime, "-", seb->vruntime) + + vruntime_op(cfs_rqb->zero_vruntime_fi, "-", cfs_rqa->zero_vruntime_fi); return delta > 0; } @@ -13374,6 +13374,12 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) entity_tick(cfs_rq, se, queued); } + if (queued) { + if (!need_resched()) + hrtick_start_fair(rq, curr); + return; + } + if (static_branch_unlikely(&sched_numa_balancing)) task_tick_numa(rq, curr); @@ -13882,15 +13888,12 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task * All the scheduling class methods: */ DEFINE_SCHED_CLASS(fair) = { - - .queue_mask = 2, - .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, .yield_to_task = yield_to_task_fair, - .wakeup_preempt = check_preempt_wakeup_fair, + .wakeup_preempt = wakeup_preempt_fair, .pick_task = pick_task_fair, .pick_next_task = pick_next_task_fair, @@ -13950,7 +13953,7 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) struct numa_group *ng; rcu_read_lock(); - ng = rcu_dereference(p->numa_group); + ng = rcu_dereference_all(p->numa_group); for_each_online_node(node) { if (p->numa_faults) { tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)]; diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index c174afe1dd177a..65eb8f8c1a5d3a 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -536,9 +536,6 @@ static void update_curr_idle(struct rq *rq) * Simple, special scheduling class for the per-CPU idle tasks: */ DEFINE_SCHED_CLASS(idle) = { - - .queue_mask = 0, - /* no enqueue/yield_task for idle tasks */ /* dequeue is not valid, we print a debug message there: */ diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index f1867fe8e5c535..0a9b2cd6da7208 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1615,6 +1615,12 @@ static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags) { struct task_struct *donor = rq->donor; + /* + * XXX If we're preempted by DL, queue a push? + */ + if (p->sched_class != &rt_sched_class) + return; + if (p->prio < donor->prio) { resched_curr(rq); return; @@ -2568,9 +2574,6 @@ static int task_is_throttled_rt(struct task_struct *p, int cpu) #endif /* CONFIG_SCHED_CORE */ DEFINE_SCHED_CLASS(rt) = { - - .queue_mask = 4, - .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, .yield_task = yield_task_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d30cca6870f5f9..bdb1e748fe5c97 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -670,16 +670,16 @@ struct balance_callback { void (*func)(struct rq *rq); }; -/* CFS-related fields in a runqueue */ +/* Fair scheduling SCHED_{NORMAL,BATCH,IDLE} related fields in a runqueue: */ struct cfs_rq { struct load_weight load; unsigned int nr_queued; - unsigned int h_nr_queued; /* SCHED_{NORMAL,BATCH,IDLE} */ - unsigned int h_nr_runnable; /* SCHED_{NORMAL,BATCH,IDLE} */ - unsigned int h_nr_idle; /* SCHED_IDLE */ + unsigned int h_nr_queued; /* SCHED_{NORMAL,BATCH,IDLE} */ + unsigned int h_nr_runnable; /* SCHED_{NORMAL,BATCH,IDLE} */ + unsigned int h_nr_idle; /* SCHED_IDLE */ - s64 avg_vruntime; - u64 avg_load; + s64 sum_w_vruntime; + u64 sum_weight; u64 zero_vruntime; #ifdef CONFIG_SCHED_CORE @@ -690,7 +690,7 @@ struct cfs_rq { struct rb_root_cached tasks_timeline; /* - * 'curr' points to currently running entity on this cfs_rq. 
+ * 'curr' points to the currently running entity on this cfs_rq. * It is set to NULL otherwise (i.e when none are currently running). */ struct sched_entity *curr; @@ -726,9 +726,7 @@ struct cfs_rq { unsigned long h_load; u64 last_h_load_update; struct sched_entity *h_load_next; -#endif /* CONFIG_FAIR_GROUP_SCHED */ -#ifdef CONFIG_FAIR_GROUP_SCHED struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */ /* @@ -741,19 +739,19 @@ struct cfs_rq { */ int on_list; struct list_head leaf_cfs_rq_list; - struct task_group *tg; /* group that "owns" this runqueue */ + struct task_group *tg; /* Group that "owns" this runqueue */ /* Locally cached copy of our task_group's idle value */ int idle; -#ifdef CONFIG_CFS_BANDWIDTH +# ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; s64 runtime_remaining; u64 throttled_pelt_idle; -#ifndef CONFIG_64BIT +# ifndef CONFIG_64BIT u64 throttled_pelt_idle_copy; -#endif +# endif u64 throttled_clock; u64 throttled_clock_pelt; u64 throttled_clock_pelt_time; @@ -765,7 +763,7 @@ struct cfs_rq { struct list_head throttled_list; struct list_head throttled_csd_list; struct list_head throttled_limbo_list; -#endif /* CONFIG_CFS_BANDWIDTH */ +# endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ }; @@ -1121,7 +1119,6 @@ struct rq { raw_spinlock_t __lock; /* Per class runqueue modification mask; bits in class order. */ - unsigned int queue_mask; unsigned int nr_running; #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; @@ -1181,6 +1178,7 @@ struct rq { struct sched_dl_entity *dl_server; struct task_struct *idle; struct task_struct *stop; + const struct sched_class *next_class; unsigned long next_balance; struct mm_struct *prev_mm; @@ -2010,8 +2008,8 @@ queue_balance_callback(struct rq *rq, rq->balance_callback = head; } -#define rcu_dereference_check_sched_domain(p) \ - rcu_dereference_check((p), lockdep_is_held(&sched_domains_mutex)) +#define rcu_dereference_sched_domain(p) \ + rcu_dereference_all_check((p), lockdep_is_held(&sched_domains_mutex)) /* * The domain tree (rq->sd) is protected by RCU's quiescent state transition. @@ -2021,7 +2019,7 @@ queue_balance_callback(struct rq *rq, * preempt-disabled sections. */ #define for_each_domain(cpu, __sd) \ - for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ + for (__sd = rcu_dereference_sched_domain(cpu_rq(cpu)->sd); \ __sd; __sd = __sd->parent) /* A mask of all the SD flags that have the SDF_SHARED_CHILD metaflag */ @@ -2428,15 +2426,6 @@ struct sched_class { #ifdef CONFIG_UCLAMP_TASK int uclamp_enabled; #endif - /* - * idle: 0 - * ext: 1 - * fair: 2 - * rt: 4 - * dl: 8 - * stop: 16 - */ - unsigned int queue_mask; /* * move_queued_task/activate_task/enqueue_task: rq->lock @@ -2595,20 +2584,6 @@ struct sched_class { #endif }; -/* - * Does not nest; only used around sched_class::pick_task() rq-lock-breaks. 
- */ -static inline void rq_modified_clear(struct rq *rq) -{ - rq->queue_mask = 0; -} - -static inline bool rq_modified_above(struct rq *rq, const struct sched_class * class) -{ - unsigned int mask = class->queue_mask; - return rq->queue_mask & ~((mask << 1) - 1); -} - static inline void put_prev_task(struct rq *rq, struct task_struct *prev) { WARN_ON_ONCE(rq->donor != prev); @@ -3901,6 +3876,7 @@ void move_queued_task_locked(struct rq *src_rq, struct rq *dst_rq, struct task_s deactivate_task(src_rq, task, 0); set_task_cpu(task, dst_rq->cpu); activate_task(dst_rq, task, 0); + wakeup_preempt(dst_rq, task, 0); } static inline @@ -3968,6 +3944,7 @@ extern void balance_callbacks(struct rq *rq, struct balance_callback *head); struct sched_change_ctx { u64 prio; struct task_struct *p; + const struct sched_class *class; int flags; bool queued; bool running; diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 4f9192be4b5b0a..f95798baddebbd 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -97,9 +97,6 @@ static void update_curr_stop(struct rq *rq) * Simple, special scheduling class for the per-CPU stop tasks: */ DEFINE_SCHED_CLASS(stop) = { - - .queue_mask = 16, - .enqueue_task = enqueue_task_stop, .dequeue_task = dequeue_task_stop, .yield_task = yield_task_stop, |
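The vruntime_cmp()/vruntime_op() helpers introduced in kernel/sched/fair.c above compare 64-bit virtual times through a signed difference, so the ordering stays correct even after the counters wrap. A minimal standalone sketch of that idea (illustrative only, not the kernel macros):

    /* Sketch: wrap-safe ordering of monotonically increasing u64 timestamps. */
    #include <stdbool.h>
    #include <stdint.h>

    static inline bool vtime_before(uint64_t a, uint64_t b)
    {
            /* The signed difference stays valid across u64 wrap-around. */
            return (int64_t)(a - b) < 0;
    }

    static inline bool vtime_after_eq(uint64_t a, uint64_t b)
    {
            return (int64_t)(a - b) >= 0;
    }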
