Update Linux to v5.4.2
Change-Id: Idf6911045d9d382da2cfe01b1edff026404ac8fd
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 352e70c..515c0ce 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -172,21 +172,6 @@
.endif
.endm
-/*
- * This is a sneaky trick to help the unwinder find pt_regs on the stack. The
- * frame pointer is replaced with an encoded pointer to pt_regs. The encoding
- * is just setting the LSB, which makes it an invalid stack address and is also
- * a signal to the unwinder that it's a pt_regs pointer in disguise.
- *
- * NOTE: This macro must be used *after* PUSH_AND_CLEAR_REGS because it corrupts
- * the original rbp.
- */
-.macro ENCODE_FRAME_POINTER ptregs_offset=0
-#ifdef CONFIG_FRAME_POINTER
- leaq 1+\ptregs_offset(%rsp), %rbp
-#endif
-.endm
-
#ifdef CONFIG_PAGE_TABLE_ISOLATION
/*
@@ -329,18 +314,55 @@
#endif
+/*
+ * Mitigate Spectre v1 for conditional swapgs code paths.
+ *
+ * FENCE_SWAPGS_USER_ENTRY is used in the user entry swapgs code path, to
+ * prevent a speculative swapgs when coming from kernel space.
+ *
+ * FENCE_SWAPGS_KERNEL_ENTRY is used in the kernel entry non-swapgs code path,
+ * to prevent the swapgs from getting speculatively skipped when coming from
+ * user space.
+ */
+.macro FENCE_SWAPGS_USER_ENTRY
+ ALTERNATIVE "", "lfence", X86_FEATURE_FENCE_SWAPGS_USER
+.endm
+.macro FENCE_SWAPGS_KERNEL_ENTRY
+ ALTERNATIVE "", "lfence", X86_FEATURE_FENCE_SWAPGS_KERNEL
+.endm
+
+.macro STACKLEAK_ERASE_NOCLOBBER
+#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+ PUSH_AND_CLEAR_REGS
+ call stackleak_erase
+ POP_REGS
+#endif
+.endm
+
#endif /* CONFIG_X86_64 */
+.macro STACKLEAK_ERASE
+#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+ call stackleak_erase
+#endif
+.endm
+
/*
* This does 'call enter_from_user_mode' unless we can avoid it based on
* kernel config or using the static jump infrastructure.
*/
.macro CALL_enter_from_user_mode
#ifdef CONFIG_CONTEXT_TRACKING
-#ifdef HAVE_JUMP_LABEL
+#ifdef CONFIG_JUMP_LABEL
STATIC_JUMP_IF_FALSE .Lafter_call_\@, context_tracking_enabled, def=0
#endif
call enter_from_user_mode
.Lafter_call_\@:
#endif
.endm
+
+#ifdef CONFIG_PARAVIRT_XXL
+#define GET_CR2_INTO(reg) GET_CR2_INTO_AX ; _ASM_MOV %_ASM_AX, reg
+#else
+#define GET_CR2_INTO(reg) _ASM_MOV %cr2, reg
+#endif
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 3b2490b..3f8e226 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -1,7 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* common.c - C code for kernel entry and exit
* Copyright (c) 2015 Andrew Lutomirski
- * GPL v2
*
* Based on asm and ptrace code by many authors. The code here originated
* in ptrace.c and signal.c.
@@ -25,12 +25,14 @@
#include <linux/uprobes.h>
#include <linux/livepatch.h>
#include <linux/syscalls.h>
+#include <linux/uaccess.h>
#include <asm/desc.h>
#include <asm/traps.h>
#include <asm/vdso.h>
-#include <linux/uaccess.h>
#include <asm/cpufeature.h>
+#include <asm/fpu/api.h>
+#include <asm/nospec-branch.h>
#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>
@@ -70,23 +72,18 @@
struct thread_info *ti = current_thread_info();
unsigned long ret = 0;
- bool emulated = false;
u32 work;
if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
BUG_ON(regs != task_pt_regs(current));
- work = READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY;
+ work = READ_ONCE(ti->flags);
- if (unlikely(work & _TIF_SYSCALL_EMU))
- emulated = true;
-
- if ((emulated || (work & _TIF_SYSCALL_TRACE)) &&
- tracehook_report_syscall_entry(regs))
- return -1L;
-
- if (emulated)
- return -1L;
+ if (work & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU)) {
+ ret = tracehook_report_syscall_entry(regs);
+ if (ret || (work & _TIF_SYSCALL_EMU))
+ return -1L;
+ }
#ifdef CONFIG_SECCOMP
/*
@@ -140,7 +137,7 @@
/*
* In order to return to user mode, we need to have IRQs off with
* none of EXIT_TO_USERMODE_LOOP_FLAGS set. Several of these flags
- * can be set at any time on preemptable kernels if we have IRQs on,
+ * can be set at any time on preemptible kernels if we have IRQs on,
* so we need to loop. Disabling preemption wouldn't help: doing the
* work to clear some of the flags can sleep.
*/
@@ -196,6 +193,13 @@
if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
exit_to_usermode_loop(regs, cached_flags);
+ /* Reload ti->flags; we may have rescheduled above. */
+ cached_flags = READ_ONCE(ti->flags);
+
+ fpregs_assert_state_consistent();
+ if (unlikely(cached_flags & _TIF_NEED_FPU_LOAD))
+ switch_fpu_return();
+
#ifdef CONFIG_COMPAT
/*
* Compat syscalls set TS_COMPAT. Make sure we clear it before
@@ -212,6 +216,8 @@
#endif
user_enter_irqoff();
+
+ mds_user_clear_cpu_buffers();
}
#define SYSCALL_EXIT_WORK_FLAGS \
@@ -279,15 +285,16 @@
if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
nr = syscall_trace_enter(regs);
- /*
- * NB: Native and x32 syscalls are dispatched from the same
- * table. The only functional difference is the x32 bit in
- * regs->orig_ax, which changes the behavior of some syscalls.
- */
- nr &= __SYSCALL_MASK;
if (likely(nr < NR_syscalls)) {
nr = array_index_nospec(nr, NR_syscalls);
regs->ax = sys_call_table[nr](regs);
+#ifdef CONFIG_X86_X32_ABI
+ } else if (likely((nr & __X32_SYSCALL_BIT) &&
+ (nr & ~__X32_SYSCALL_BIT) < X32_NR_syscalls)) {
+ nr = array_index_nospec(nr & ~__X32_SYSCALL_BIT,
+ X32_NR_syscalls);
+ regs->ax = x32_sys_call_table[nr](regs);
+#endif
}
syscall_return_slowpath(regs);
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index fbbf1ba..f07baf0 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -46,6 +46,8 @@
#include <asm/frame.h>
#include <asm/nospec-branch.h>
+#include "calling.h"
+
.section .entry.text, "ax"
/*
@@ -61,11 +63,10 @@
* enough to patch inline, increasing performance.
*/
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
# define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
#else
# define preempt_stop(clobbers)
-# define resume_kernel restore_all_kernel
#endif
.macro TRACE_IRQS_IRET
@@ -171,7 +172,7 @@
ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
.if \no_user_check == 0
/* coming from usermode? */
- testl $SEGMENT_RPL_MASK, PT_CS(%esp)
+ testl $USER_SEGMENT_RPL_MASK, PT_CS(%esp)
jz .Lend_\@
.endif
/* On user-cr3? */
@@ -201,10 +202,126 @@
.Lend_\@:
.endm
-.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0
+#define CS_FROM_ENTRY_STACK (1 << 31)
+#define CS_FROM_USER_CR3 (1 << 30)
+#define CS_FROM_KERNEL (1 << 29)
+#define CS_FROM_ESPFIX (1 << 28)
+
+.macro FIXUP_FRAME
+ /*
+ * The high bits of the CS dword (__csh) are used for CS_FROM_*.
+ * Clear them in case hardware didn't do this for us.
+ */
+ andl $0x0000ffff, 4*4(%esp)
+
+#ifdef CONFIG_VM86
+ testl $X86_EFLAGS_VM, 5*4(%esp)
+ jnz .Lfrom_usermode_no_fixup_\@
+#endif
+ testl $USER_SEGMENT_RPL_MASK, 4*4(%esp)
+ jnz .Lfrom_usermode_no_fixup_\@
+
+ orl $CS_FROM_KERNEL, 4*4(%esp)
+
+ /*
+ * When we're here from kernel mode; the (exception) stack looks like:
+ *
+ * 6*4(%esp) - <previous context>
+ * 5*4(%esp) - flags
+ * 4*4(%esp) - cs
+ * 3*4(%esp) - ip
+ * 2*4(%esp) - orig_eax
+ * 1*4(%esp) - gs / function
+ * 0*4(%esp) - fs
+ *
+ * Lets build a 5 entry IRET frame after that, such that struct pt_regs
+ * is complete and in particular regs->sp is correct. This gives us
+ * the original 6 enties as gap:
+ *
+ * 14*4(%esp) - <previous context>
+ * 13*4(%esp) - gap / flags
+ * 12*4(%esp) - gap / cs
+ * 11*4(%esp) - gap / ip
+ * 10*4(%esp) - gap / orig_eax
+ * 9*4(%esp) - gap / gs / function
+ * 8*4(%esp) - gap / fs
+ * 7*4(%esp) - ss
+ * 6*4(%esp) - sp
+ * 5*4(%esp) - flags
+ * 4*4(%esp) - cs
+ * 3*4(%esp) - ip
+ * 2*4(%esp) - orig_eax
+ * 1*4(%esp) - gs / function
+ * 0*4(%esp) - fs
+ */
+
+ pushl %ss # ss
+ pushl %esp # sp (points at ss)
+ addl $7*4, (%esp) # point sp back at the previous context
+ pushl 7*4(%esp) # flags
+ pushl 7*4(%esp) # cs
+ pushl 7*4(%esp) # ip
+ pushl 7*4(%esp) # orig_eax
+ pushl 7*4(%esp) # gs / function
+ pushl 7*4(%esp) # fs
+.Lfrom_usermode_no_fixup_\@:
+.endm
+
+.macro IRET_FRAME
+ /*
+ * We're called with %ds, %es, %fs, and %gs from the interrupted
+ * frame, so we shouldn't use them. Also, we may be in ESPFIX
+ * mode and therefore have a nonzero SS base and an offset ESP,
+ * so any attempt to access the stack needs to use SS. (except for
+ * accesses through %esp, which automatically use SS.)
+ */
+ testl $CS_FROM_KERNEL, 1*4(%esp)
+ jz .Lfinished_frame_\@
+
+ /*
+ * Reconstruct the 3 entry IRET frame right after the (modified)
+ * regs->sp without lowering %esp in between, such that an NMI in the
+ * middle doesn't scribble our stack.
+ */
+ pushl %eax
+ pushl %ecx
+ movl 5*4(%esp), %eax # (modified) regs->sp
+
+ movl 4*4(%esp), %ecx # flags
+ movl %ecx, %ss:-1*4(%eax)
+
+ movl 3*4(%esp), %ecx # cs
+ andl $0x0000ffff, %ecx
+ movl %ecx, %ss:-2*4(%eax)
+
+ movl 2*4(%esp), %ecx # ip
+ movl %ecx, %ss:-3*4(%eax)
+
+ movl 1*4(%esp), %ecx # eax
+ movl %ecx, %ss:-4*4(%eax)
+
+ popl %ecx
+ lea -4*4(%eax), %esp
+ popl %eax
+.Lfinished_frame_\@:
+.endm
+
+.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 skip_gs=0 unwind_espfix=0
cld
+.if \skip_gs == 0
PUSH_GS
+.endif
pushl %fs
+
+ pushl %eax
+ movl $(__KERNEL_PERCPU), %eax
+ movl %eax, %fs
+.if \unwind_espfix > 0
+ UNWIND_ESPFIX_STACK
+.endif
+ popl %eax
+
+ FIXUP_FRAME
pushl %es
pushl %ds
pushl \pt_regs_ax
@@ -217,19 +334,17 @@
movl $(__USER_DS), %edx
movl %edx, %ds
movl %edx, %es
- movl $(__KERNEL_PERCPU), %edx
- movl %edx, %fs
+.if \skip_gs == 0
SET_KERNEL_GS %edx
-
+.endif
/* Switch to kernel stack if necessary */
.if \switch_stacks > 0
SWITCH_TO_KERNEL_STACK
.endif
-
.endm
-.macro SAVE_ALL_NMI cr3_reg:req
- SAVE_ALL
+.macro SAVE_ALL_NMI cr3_reg:req unwind_espfix=0
+ SAVE_ALL unwind_espfix=\unwind_espfix
BUG_IF_WRONG_CR3
@@ -245,22 +360,6 @@
.Lend_\@:
.endm
-/*
- * This is a sneaky trick to help the unwinder find pt_regs on the stack. The
- * frame pointer is replaced with an encoded pointer to pt_regs. The encoding
- * is just clearing the MSB, which makes it an invalid stack address and is also
- * a signal to the unwinder that it's a pt_regs pointer in disguise.
- *
- * NOTE: This macro must be used *after* SAVE_ALL because it corrupts the
- * original rbp.
- */
-.macro ENCODE_FRAME_POINTER
-#ifdef CONFIG_FRAME_POINTER
- mov %esp, %ebp
- andl $0x7fffffff, %ebp
-#endif
-.endm
-
.macro RESTORE_INT_REGS
popl %ebx
popl %ecx
@@ -277,6 +376,7 @@
2: popl %es
3: popl %fs
POP_GS \pop
+ IRET_FRAME
.pushsection .fixup, "ax"
4: movl $0, (%esp)
jmp 1b
@@ -315,7 +415,8 @@
.macro CHECK_AND_APPLY_ESPFIX
#ifdef CONFIG_X86_ESPFIX32
-#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
+#define GDT_ESPFIX_OFFSET (GDT_ENTRY_ESPFIX_SS * 8)
+#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + GDT_ESPFIX_OFFSET
ALTERNATIVE "jmp .Lend_\@", "", X86_BUG_ESPFIX
@@ -373,9 +474,6 @@
* switch to it before we do any copying.
*/
-#define CS_FROM_ENTRY_STACK (1 << 31)
-#define CS_FROM_USER_CR3 (1 << 30)
-
.macro SWITCH_TO_KERNEL_STACK
ALTERNATIVE "", "jmp .Lend_\@", X86_FEATURE_XENPV
@@ -389,13 +487,6 @@
* that register for the time this macro runs
*/
- /*
- * The high bits of the CS dword (__csh) are used for
- * CS_FROM_ENTRY_STACK and CS_FROM_USER_CR3. Clear them in case
- * hardware didn't do this for us.
- */
- andl $(0x0000ffff), PT_CS(%esp)
-
/* Are we on the entry stack? Bail out if not! */
movl PER_CPU_VAR(cpu_entry_area), %ecx
addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
@@ -648,6 +739,7 @@
pushl %ebx
pushl %edi
pushl %esi
+ pushfl
/* switch stack */
movl %esp, TASK_threadsp(%eax)
@@ -670,6 +762,7 @@
#endif
/* restore callee-saved registers */
+ popfl
popl %esi
popl %edi
popl %ebx
@@ -712,6 +805,7 @@
/* When we fork, we trace the syscall return in the child, too. */
movl %esp, %eax
call syscall_return_slowpath
+ STACKLEAK_ERASE
jmp restore_all
/* kernel thread */
@@ -750,7 +844,7 @@
andl $SEGMENT_RPL_MASK, %eax
#endif
cmpl $USER_RPL, %eax
- jb resume_kernel # not returning to v8086 or userspace
+ jb restore_all_kernel # not returning to v8086 or userspace
ENTRY(resume_userspace)
DISABLE_INTERRUPTS(CLBR_ANY)
@@ -760,19 +854,6 @@
jmp restore_all
END(ret_from_exception)
-#ifdef CONFIG_PREEMPT
-ENTRY(resume_kernel)
- DISABLE_INTERRUPTS(CLBR_ANY)
-.Lneed_resched:
- cmpl $0, PER_CPU_VAR(__preempt_count)
- jnz restore_all_kernel
- testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ?
- jz restore_all_kernel
- call preempt_schedule_irq
- jmp .Lneed_resched
-END(resume_kernel)
-#endif
-
GLOBAL(__begin_SYSENTER_singlestep_region)
/*
* All code from here through __end_SYSENTER_singlestep_region is subject
@@ -783,7 +864,7 @@
* will ignore all of the single-step traps generated in this range.
*/
-#ifdef CONFIG_XEN
+#ifdef CONFIG_XEN_PV
/*
* Xen doesn't set %esp to be precisely what the normal SYSENTER
* entry point expects, so fix it up before using the normal path.
@@ -886,6 +967,8 @@
ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
"jmp .Lsyscall_32_done", X86_FEATURE_XENPV
+ STACKLEAK_ERASE
+
/* Opportunistic SYSEXIT */
TRACE_IRQS_ON /* User mode traces as IRQs on. */
@@ -997,6 +1080,8 @@
call do_int80_syscall_32
.Lsyscall_32_done:
+ STACKLEAK_ERASE
+
restore_all:
TRACE_IRQS_IRET
SWITCH_TO_ENTRY_STACK
@@ -1019,6 +1104,15 @@
INTERRUPT_RETURN
restore_all_kernel:
+#ifdef CONFIG_PREEMPTION
+ DISABLE_INTERRUPTS(CLBR_ANY)
+ cmpl $0, PER_CPU_VAR(__preempt_count)
+ jnz .Lno_preempt
+ testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ?
+ jz .Lno_preempt
+ call preempt_schedule_irq
+.Lno_preempt:
+#endif
TRACE_IRQS_IRET
PARANOID_EXIT_TO_KERNEL_MODE
BUG_IF_WRONG_CR3
@@ -1054,30 +1148,43 @@
* We can't call C functions using the ESPFIX stack. This code reads
* the high word of the segment base from the GDT and swiches to the
* normal stack and adjusts ESP with the matching offset.
+ *
+ * We might be on user CR3 here, so percpu data is not mapped and we can't
+ * access the GDT through the percpu segment. Instead, use SGDT to find
+ * the cpu_entry_area alias of the GDT.
*/
#ifdef CONFIG_X86_ESPFIX32
/* fixup the stack */
- mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */
- mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
+ pushl %ecx
+ subl $2*4, %esp
+ sgdt (%esp)
+ movl 2(%esp), %ecx /* GDT address */
+ /*
+ * Careful: ECX is a linear pointer, so we need to force base
+ * zero. %cs is the only known-linear segment we have right now.
+ */
+ mov %cs:GDT_ESPFIX_OFFSET + 4(%ecx), %al /* bits 16..23 */
+ mov %cs:GDT_ESPFIX_OFFSET + 7(%ecx), %ah /* bits 24..31 */
shl $16, %eax
+ addl $2*4, %esp
+ popl %ecx
addl %esp, %eax /* the adjusted stack pointer */
pushl $__KERNEL_DS
pushl %eax
lss (%esp), %esp /* switch to the normal stack segment */
#endif
.endm
+
.macro UNWIND_ESPFIX_STACK
+ /* It's safe to clobber %eax, all other regs need to be preserved */
#ifdef CONFIG_X86_ESPFIX32
movl %ss, %eax
/* see if on espfix stack */
cmpw $__ESPFIX_SS, %ax
- jne 27f
- movl $__KERNEL_DS, %eax
- movl %eax, %ds
- movl %eax, %es
+ jne .Lno_fixup_\@
/* switch to normal stack */
FIXUP_ESPFIX_STACK
-27:
+.Lno_fixup_\@:
#endif
.endm
@@ -1096,6 +1203,30 @@
.endr
END(irq_entries_start)
+#ifdef CONFIG_X86_LOCAL_APIC
+ .align 8
+ENTRY(spurious_entries_start)
+ vector=FIRST_SYSTEM_VECTOR
+ .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR)
+ pushl $(~vector+0x80) /* Note: always in signed byte range */
+ vector=vector+1
+ jmp common_spurious
+ .align 8
+ .endr
+END(spurious_entries_start)
+
+common_spurious:
+ ASM_CLAC
+ addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */
+ SAVE_ALL switch_stacks=1
+ ENCODE_FRAME_POINTER
+ TRACE_IRQS_OFF
+ movl %esp, %eax
+ call smp_spurious_interrupt
+ jmp ret_from_intr
+ENDPROC(common_spurious)
+#endif
+
/*
* the CPU automatically disables interrupts when executing an IRQ vector,
* so IRQ-flags tracing has to follow that:
@@ -1241,13 +1372,8 @@
jmp common_exception
END(spurious_interrupt_bug)
-#ifdef CONFIG_XEN
+#ifdef CONFIG_XEN_PV
ENTRY(xen_hypervisor_callback)
- pushl $-1 /* orig_ax = -1 => not a system call */
- SAVE_ALL
- ENCODE_FRAME_POINTER
- TRACE_IRQS_OFF
-
/*
* Check to see if we got the event in the critical
* region in xen_iret_direct, after we've reenabled
@@ -1255,18 +1381,19 @@
* iret instruction's behaviour where it delivers a
* pending interrupt when enabling interrupts:
*/
- movl PT_EIP(%esp), %eax
- cmpl $xen_iret_start_crit, %eax
+ cmpl $xen_iret_start_crit, (%esp)
jb 1f
- cmpl $xen_iret_end_crit, %eax
+ cmpl $xen_iret_end_crit, (%esp)
jae 1f
-
- jmp xen_iret_crit_fixup
-
-ENTRY(xen_do_upcall)
-1: mov %esp, %eax
+ call xen_iret_crit_fixup
+1:
+ pushl $-1 /* orig_ax = -1 => not a system call */
+ SAVE_ALL
+ ENCODE_FRAME_POINTER
+ TRACE_IRQS_OFF
+ mov %esp, %eax
call xen_evtchn_do_upcall
-#ifndef CONFIG_PREEMPT
+#ifndef CONFIG_PREEMPTION
call xen_maybe_preempt_hcall
#endif
jmp ret_from_intr
@@ -1322,11 +1449,13 @@
_ASM_EXTABLE(3b, 8b)
_ASM_EXTABLE(4b, 9b)
ENDPROC(xen_failsafe_callback)
+#endif /* CONFIG_XEN_PV */
+#ifdef CONFIG_XEN_PVHVM
BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
xen_evtchn_do_upcall)
+#endif
-#endif /* CONFIG_XEN */
#if IS_ENABLED(CONFIG_HYPERV)
@@ -1344,37 +1473,48 @@
ENTRY(page_fault)
ASM_CLAC
pushl $do_page_fault
- ALIGN
- jmp common_exception
+ jmp common_exception_read_cr2
END(page_fault)
+common_exception_read_cr2:
+ /* the function address is in %gs's slot on the stack */
+ SAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1
+
+ ENCODE_FRAME_POINTER
+
+ /* fixup %gs */
+ GS_TO_REG %ecx
+ movl PT_GS(%esp), %edi
+ REG_TO_PTGS %ecx
+ SET_KERNEL_GS %ecx
+
+ GET_CR2_INTO(%ecx) # might clobber %eax
+
+ /* fixup orig %eax */
+ movl PT_ORIG_EAX(%esp), %edx # get the error code
+ movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
+
+ TRACE_IRQS_OFF
+ movl %esp, %eax # pt_regs pointer
+ CALL_NOSPEC %edi
+ jmp ret_from_exception
+END(common_exception_read_cr2)
+
common_exception:
/* the function address is in %gs's slot on the stack */
- pushl %fs
- pushl %es
- pushl %ds
- pushl %eax
- movl $(__USER_DS), %eax
- movl %eax, %ds
- movl %eax, %es
- movl $(__KERNEL_PERCPU), %eax
- movl %eax, %fs
- pushl %ebp
- pushl %edi
- pushl %esi
- pushl %edx
- pushl %ecx
- pushl %ebx
- SWITCH_TO_KERNEL_STACK
+ SAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1
ENCODE_FRAME_POINTER
- cld
- UNWIND_ESPFIX_STACK
+
+ /* fixup %gs */
GS_TO_REG %ecx
movl PT_GS(%esp), %edi # get the function address
- movl PT_ORIG_EAX(%esp), %edx # get the error code
- movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
REG_TO_PTGS %ecx
SET_KERNEL_GS %ecx
+
+ /* fixup orig %eax */
+ movl PT_ORIG_EAX(%esp), %edx # get the error code
+ movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
+
TRACE_IRQS_OFF
movl %esp, %eax # pt_regs pointer
CALL_NOSPEC %edi
@@ -1402,6 +1542,10 @@
ASM_CLAC
#ifdef CONFIG_X86_ESPFIX32
+ /*
+ * ESPFIX_SS is only ever set on the return to user path
+ * after we've switched to the entry stack.
+ */
pushl %eax
movl %ss, %eax
cmpw $__ESPFIX_SS, %ax
@@ -1437,6 +1581,11 @@
movl %ebx, %esp
.Lnmi_return:
+#ifdef CONFIG_X86_ESPFIX32
+ testl $CS_FROM_ESPFIX, PT_CS(%esp)
+ jnz .Lnmi_from_espfix
+#endif
+
CHECK_AND_APPLY_ESPFIX
RESTORE_ALL_NMI cr3_reg=%edi pop=4
jmp .Lirq_return
@@ -1444,23 +1593,42 @@
#ifdef CONFIG_X86_ESPFIX32
.Lnmi_espfix_stack:
/*
- * create the pointer to lss back
+ * Create the pointer to LSS back
*/
pushl %ss
pushl %esp
addl $4, (%esp)
- /* copy the iret frame of 12 bytes */
- .rept 3
- pushl 16(%esp)
- .endr
- pushl %eax
- SAVE_ALL_NMI cr3_reg=%edi
+
+ /* Copy the (short) IRET frame */
+ pushl 4*4(%esp) # flags
+ pushl 4*4(%esp) # cs
+ pushl 4*4(%esp) # ip
+
+ pushl %eax # orig_ax
+
+ SAVE_ALL_NMI cr3_reg=%edi unwind_espfix=1
ENCODE_FRAME_POINTER
- FIXUP_ESPFIX_STACK # %eax == %esp
+
+ /* clear CS_FROM_KERNEL, set CS_FROM_ESPFIX */
+ xorl $(CS_FROM_ESPFIX | CS_FROM_KERNEL), PT_CS(%esp)
+
xorl %edx, %edx # zero error code
- call do_nmi
+ movl %esp, %eax # pt_regs pointer
+ jmp .Lnmi_from_sysenter_stack
+
+.Lnmi_from_espfix:
RESTORE_ALL_NMI cr3_reg=%edi
- lss 12+4(%esp), %esp # back to espfix stack
+ /*
+ * Because we cleared CS_FROM_KERNEL, IRET_FRAME 'forgot' to
+ * fix up the gap and long frame:
+ *
+ * 3 - original frame (exception)
+ * 2 - ESPFIX block (above)
+ * 6 - gap (FIXUP_FRAME)
+ * 5 - long frame (FIXUP_FRAME)
+ * 1 - orig_ax
+ */
+ lss (1+5+6)*4(%esp), %esp # back to espfix stack
jmp .Lirq_return
#endif
END(nmi)
@@ -1487,7 +1655,7 @@
ENTRY(async_page_fault)
ASM_CLAC
pushl $do_async_page_fault
- jmp common_exception
+ jmp common_exception_read_cr2
END(async_page_fault)
#endif
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index f95dcb2..b7c3ea4 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -8,7 +8,7 @@
*
* entry.S contains the system-call and fault low-level handling routines.
*
- * Some of this is documented in Documentation/x86/entry_64.txt
+ * Some of this is documented in Documentation/x86/entry_64.rst
*
* A note on terminology:
* - iret frame: Architecture defined interrupt frame from SS to RIP
@@ -142,67 +142,6 @@
* with them due to bugs in both AMD and Intel CPUs.
*/
- .pushsection .entry_trampoline, "ax"
-
-/*
- * The code in here gets remapped into cpu_entry_area's trampoline. This means
- * that the assembler and linker have the wrong idea as to where this code
- * lives (and, in fact, it's mapped more than once, so it's not even at a
- * fixed address). So we can't reference any symbols outside the entry
- * trampoline and expect it to work.
- *
- * Instead, we carefully abuse %rip-relative addressing.
- * _entry_trampoline(%rip) refers to the start of the remapped) entry
- * trampoline. We can thus find cpu_entry_area with this macro:
- */
-
-#define CPU_ENTRY_AREA \
- _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)
-
-/* The top word of the SYSENTER stack is hot and is usable as scratch space. */
-#define RSP_SCRATCH CPU_ENTRY_AREA_entry_stack + \
- SIZEOF_entry_stack - 8 + CPU_ENTRY_AREA
-
-ENTRY(entry_SYSCALL_64_trampoline)
- UNWIND_HINT_EMPTY
- swapgs
-
- /* Stash the user RSP. */
- movq %rsp, RSP_SCRATCH
-
- /* Note: using %rsp as a scratch reg. */
- SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
-
- /* Load the top of the task stack into RSP */
- movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
-
- /* Start building the simulated IRET frame. */
- pushq $__USER_DS /* pt_regs->ss */
- pushq RSP_SCRATCH /* pt_regs->sp */
- pushq %r11 /* pt_regs->flags */
- pushq $__USER_CS /* pt_regs->cs */
- pushq %rcx /* pt_regs->ip */
-
- /*
- * x86 lacks a near absolute jump, and we can't jump to the real
- * entry text with a relative jump. We could push the target
- * address and then use retq, but this destroys the pipeline on
- * many CPUs (wasting over 20 cycles on Sandy Bridge). Instead,
- * spill RDI and restore it in a second-stage trampoline.
- */
- pushq %rdi
- movq $entry_SYSCALL_64_stage2, %rdi
- JMP_NOSPEC %rdi
-END(entry_SYSCALL_64_trampoline)
-
- .popsection
-
-ENTRY(entry_SYSCALL_64_stage2)
- UNWIND_HINT_EMPTY
- popq %rdi
- jmp entry_SYSCALL_64_after_hwframe
-END(entry_SYSCALL_64_stage2)
-
ENTRY(entry_SYSCALL_64)
UNWIND_HINT_EMPTY
/*
@@ -212,21 +151,19 @@
*/
swapgs
- /*
- * This path is only taken when PAGE_TABLE_ISOLATION is disabled so it
- * is not required to switch CR3.
- */
- movq %rsp, PER_CPU_VAR(rsp_scratch)
+ /* tss.sp2 is scratch space. */
+ movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
/* Construct struct pt_regs on stack */
- pushq $__USER_DS /* pt_regs->ss */
- pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */
- pushq %r11 /* pt_regs->flags */
- pushq $__USER_CS /* pt_regs->cs */
- pushq %rcx /* pt_regs->ip */
+ pushq $__USER_DS /* pt_regs->ss */
+ pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2) /* pt_regs->sp */
+ pushq %r11 /* pt_regs->flags */
+ pushq $__USER_CS /* pt_regs->cs */
+ pushq %rcx /* pt_regs->ip */
GLOBAL(entry_SYSCALL_64_after_hwframe)
- pushq %rax /* pt_regs->orig_ax */
+ pushq %rax /* pt_regs->orig_ax */
PUSH_AND_CLEAR_REGS rax=$-ENOSYS
@@ -329,6 +266,8 @@
* We are on the trampoline stack. All regs except RDI are live.
* We can do future final exit work right here.
*/
+ STACKLEAK_ERASE_NOCLOBBER
+
SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
popq %rdi
@@ -359,7 +298,7 @@
#ifdef CONFIG_STACKPROTECTOR
movq TASK_stack_canary(%rsi), %rbx
- movq %rbx, PER_CPU_VAR(irq_stack_union)+stack_canary_offset
+ movq %rbx, PER_CPU_VAR(fixed_percpu_data) + stack_canary_offset
#endif
#ifdef CONFIG_RETPOLINE
@@ -436,6 +375,18 @@
.endr
END(irq_entries_start)
+ .align 8
+ENTRY(spurious_entries_start)
+ vector=FIRST_SYSTEM_VECTOR
+ .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR)
+ UNWIND_HINT_IRET_REGS
+ pushq $(~vector+0x80) /* Note: always in signed byte range */
+ jmp common_spurious
+ .align 8
+ vector=vector+1
+ .endr
+END(spurious_entries_start)
+
.macro DEBUG_ENTRY_ASSERT_IRQS_OFF
#ifdef CONFIG_DEBUG_ENTRY
pushq %rax
@@ -491,8 +442,8 @@
* it before we actually move ourselves to the IRQ stack.
*/
- movq \old_rsp, PER_CPU_VAR(irq_stack_union + IRQ_STACK_SIZE - 8)
- movq PER_CPU_VAR(irq_stack_ptr), %rsp
+ movq \old_rsp, PER_CPU_VAR(irq_stack_backing_store + IRQ_STACK_SIZE - 8)
+ movq PER_CPU_VAR(hardirq_stack_ptr), %rsp
#ifdef CONFIG_DEBUG_ENTRY
/*
@@ -568,7 +519,7 @@
testb $3, CS-ORIG_RAX+8(%rsp)
jz 1f
SWAPGS
-
+ FENCE_SWAPGS_USER_ENTRY
/*
* Switch to the thread stack. The IRET frame and orig_ax are
* on the stack, as well as the return address. RDI..R12 are
@@ -598,8 +549,10 @@
UNWIND_HINT_FUNC
movq (%rdi), %rdi
+ jmp 2f
1:
-
+ FENCE_SWAPGS_KERNEL_ENTRY
+2:
PUSH_AND_CLEAR_REGS save_ret=1
ENCODE_FRAME_POINTER 8
@@ -627,14 +580,25 @@
ret
END(interrupt_entry)
+_ASM_NOKPROBE(interrupt_entry)
/* Interrupt entry/exit. */
- /*
- * The interrupt stubs push (~vector+0x80) onto the stack and
- * then jump to common_interrupt.
- */
+/*
+ * The interrupt stubs push (~vector+0x80) onto the stack and
+ * then jump to common_spurious/interrupt.
+ */
+common_spurious:
+ addq $-0x80, (%rsp) /* Adjust vector to [-256, -1] range */
+ call interrupt_entry
+ UNWIND_HINT_REGS indirect=1
+ call smp_spurious_interrupt /* rdi points to pt_regs */
+ jmp ret_from_intr
+END(common_spurious)
+_ASM_NOKPROBE(common_spurious)
+
+/* common_interrupt is a hotpath. Align it */
.p2align CONFIG_X86_L1_CACHE_SHIFT
common_interrupt:
addq $-0x80, (%rsp) /* Adjust vector to [-256, -1] range */
@@ -688,6 +652,7 @@
* We are on the trampoline stack. All regs except RDI are live.
* We can do future final exit work right here.
*/
+ STACKLEAK_ERASE_NOCLOBBER
SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
@@ -699,15 +664,14 @@
/* Returning to kernel space */
retint_kernel:
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
/* Interrupts are off */
/* Check if we need preemption */
btl $9, EFLAGS(%rsp) /* were interrupts off? */
jnc 1f
-0: cmpl $0, PER_CPU_VAR(__preempt_count)
+ cmpl $0, PER_CPU_VAR(__preempt_count)
jnz 1f
call preempt_schedule_irq
- jmp 0b
1:
#endif
/*
@@ -826,6 +790,7 @@
jmp native_irq_return_iret
#endif
END(common_interrupt)
+_ASM_NOKPROBE(common_interrupt)
/*
* APIC interrupts.
@@ -840,6 +805,7 @@
call \do_sym /* rdi points to pt_regs */
jmp ret_from_intr
END(\sym)
+_ASM_NOKPROBE(\sym)
.endm
/* Make sure APIC interrupt handlers end up in the irqentry section: */
@@ -898,17 +864,123 @@
/*
* Exception entry points.
*/
-#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8)
+#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + (x) * 8)
-.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
+.macro idtentry_part do_sym, has_error_code:req, read_cr2:req, paranoid:req, shift_ist=-1, ist_offset=0
+
+ .if \paranoid
+ call paranoid_entry
+ /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
+ .else
+ call error_entry
+ .endif
+ UNWIND_HINT_REGS
+
+ .if \read_cr2
+ /*
+ * Store CR2 early so subsequent faults cannot clobber it. Use R12 as
+ * intermediate storage as RDX can be clobbered in enter_from_user_mode().
+ * GET_CR2_INTO can clobber RAX.
+ */
+ GET_CR2_INTO(%r12);
+ .endif
+
+ .if \shift_ist != -1
+ TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */
+ .else
+ TRACE_IRQS_OFF
+ .endif
+
+ .if \paranoid == 0
+ testb $3, CS(%rsp)
+ jz .Lfrom_kernel_no_context_tracking_\@
+ CALL_enter_from_user_mode
+.Lfrom_kernel_no_context_tracking_\@:
+ .endif
+
+ movq %rsp, %rdi /* pt_regs pointer */
+
+ .if \has_error_code
+ movq ORIG_RAX(%rsp), %rsi /* get error code */
+ movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
+ .else
+ xorl %esi, %esi /* no error code */
+ .endif
+
+ .if \shift_ist != -1
+ subq $\ist_offset, CPU_TSS_IST(\shift_ist)
+ .endif
+
+ .if \read_cr2
+ movq %r12, %rdx /* Move CR2 into 3rd argument */
+ .endif
+
+ call \do_sym
+
+ .if \shift_ist != -1
+ addq $\ist_offset, CPU_TSS_IST(\shift_ist)
+ .endif
+
+ .if \paranoid
+ /* this procedure expect "no swapgs" flag in ebx */
+ jmp paranoid_exit
+ .else
+ jmp error_exit
+ .endif
+
+.endm
+
+/**
+ * idtentry - Generate an IDT entry stub
+ * @sym: Name of the generated entry point
+ * @do_sym: C function to be called
+ * @has_error_code: True if this IDT vector has an error code on the stack
+ * @paranoid: non-zero means that this vector may be invoked from
+ * kernel mode with user GSBASE and/or user CR3.
+ * 2 is special -- see below.
+ * @shift_ist: Set to an IST index if entries from kernel mode should
+ * decrement the IST stack so that nested entries get a
+ * fresh stack. (This is for #DB, which has a nasty habit
+ * of recursing.)
+ * @create_gap: create a 6-word stack gap when coming from kernel mode.
+ * @read_cr2: load CR2 into the 3rd argument; done before calling any C code
+ *
+ * idtentry generates an IDT stub that sets up a usable kernel context,
+ * creates struct pt_regs, and calls @do_sym. The stub has the following
+ * special behaviors:
+ *
+ * On an entry from user mode, the stub switches from the trampoline or
+ * IST stack to the normal thread stack. On an exit to user mode, the
+ * normal exit-to-usermode path is invoked.
+ *
+ * On an exit to kernel mode, if @paranoid == 0, we check for preemption,
+ * whereas we omit the preemption check if @paranoid != 0. This is purely
+ * because the implementation is simpler this way. The kernel only needs
+ * to check for asynchronous kernel preemption when IRQ handlers return.
+ *
+ * If @paranoid == 0, then the stub will handle IRET faults by pretending
+ * that the fault came from user mode. It will handle gs_change faults by
+ * pretending that the fault happened with kernel GSBASE. Since this handling
+ * is omitted for @paranoid != 0, the #GP, #SS, and #NP stubs must have
+ * @paranoid == 0. This special handling will do the wrong thing for
+ * espfix-induced #DF on IRET, so #DF must not use @paranoid == 0.
+ *
+ * @paranoid == 2 is special: the stub will never switch stacks. This is for
+ * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS.
+ */
+.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0 create_gap=0 read_cr2=0
ENTRY(\sym)
UNWIND_HINT_IRET_REGS offset=\has_error_code*8
/* Sanity check */
- .if \shift_ist != -1 && \paranoid == 0
+ .if \shift_ist != -1 && \paranoid != 1
.error "using shift_ist requires paranoid=1"
.endif
+ .if \create_gap && \paranoid
+ .error "using create_gap requires paranoid=0"
+ .endif
+
ASM_CLAC
.if \has_error_code == 0
@@ -920,47 +992,21 @@
jnz .Lfrom_usermode_switch_stack_\@
.endif
- .if \paranoid
- call paranoid_entry
- .else
- call error_entry
- .endif
- UNWIND_HINT_REGS
- /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
-
- .if \paranoid
- .if \shift_ist != -1
- TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */
- .else
- TRACE_IRQS_OFF
- .endif
+ .if \create_gap == 1
+ /*
+ * If coming from kernel space, create a 6-word gap to allow the
+ * int3 handler to emulate a call instruction.
+ */
+ testb $3, CS-ORIG_RAX(%rsp)
+ jnz .Lfrom_usermode_no_gap_\@
+ .rept 6
+ pushq 5*8(%rsp)
+ .endr
+ UNWIND_HINT_IRET_REGS offset=8
+.Lfrom_usermode_no_gap_\@:
.endif
- movq %rsp, %rdi /* pt_regs pointer */
-
- .if \has_error_code
- movq ORIG_RAX(%rsp), %rsi /* get error code */
- movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
- .else
- xorl %esi, %esi /* no error code */
- .endif
-
- .if \shift_ist != -1
- subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
- .endif
-
- call \do_sym
-
- .if \shift_ist != -1
- addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
- .endif
-
- /* these procedures expect "no swapgs" flag in ebx */
- .if \paranoid
- jmp paranoid_exit
- .else
- jmp error_exit
- .endif
+ idtentry_part \do_sym, \has_error_code, \read_cr2, \paranoid, \shift_ist, \ist_offset
.if \paranoid == 1
/*
@@ -969,21 +1015,10 @@
* run in real process context if user_mode(regs).
*/
.Lfrom_usermode_switch_stack_\@:
- call error_entry
-
- movq %rsp, %rdi /* pt_regs pointer */
-
- .if \has_error_code
- movq ORIG_RAX(%rsp), %rsi /* get error code */
- movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
- .else
- xorl %esi, %esi /* no error code */
+ idtentry_part \do_sym, \has_error_code, \read_cr2, paranoid=0
.endif
- call \do_sym
-
- jmp error_exit
- .endif
+_ASM_NOKPROBE(\sym)
END(\sym)
.endm
@@ -992,7 +1027,7 @@
idtentry bounds do_bounds has_error_code=0
idtentry invalid_op do_invalid_op has_error_code=0
idtentry device_not_available do_device_not_available has_error_code=0
-idtentry double_fault do_double_fault has_error_code=1 paranoid=2
+idtentry double_fault do_double_fault has_error_code=1 paranoid=2 read_cr2=1
idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0
idtentry invalid_TSS do_invalid_TSS has_error_code=1
idtentry segment_not_present do_segment_not_present has_error_code=1
@@ -1023,10 +1058,10 @@
ENDPROC(native_load_gs_index)
EXPORT_SYMBOL(native_load_gs_index)
- _ASM_EXTABLE(.Lgs_change, bad_gs)
+ _ASM_EXTABLE(.Lgs_change, .Lbad_gs)
.section .fixup, "ax"
/* running with kernelgs */
-bad_gs:
+.Lbad_gs:
SWAPGS /* switch back to user gs */
.macro ZAP_GS
/* This can't be a string because the preprocessor needs to see it. */
@@ -1050,7 +1085,7 @@
ret
ENDPROC(do_softirq_own_stack)
-#ifdef CONFIG_XEN
+#ifdef CONFIG_XEN_PV
idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0
/*
@@ -1080,7 +1115,7 @@
call xen_evtchn_do_upcall
LEAVE_IRQ_STACK
-#ifndef CONFIG_PREEMPT
+#ifndef CONFIG_PREEMPTION
call xen_maybe_preempt_hcall
#endif
jmp error_exit
@@ -1130,11 +1165,13 @@
ENCODE_FRAME_POINTER
jmp error_exit
END(xen_failsafe_callback)
+#endif /* CONFIG_XEN_PV */
+#ifdef CONFIG_XEN_PVHVM
apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
xen_hvm_callback_vector xen_evtchn_do_upcall
+#endif
-#endif /* CONFIG_XEN */
#if IS_ENABLED(CONFIG_HYPERV)
apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
@@ -1147,21 +1184,25 @@
hv_stimer0_callback_vector hv_stimer0_vector_handler
#endif /* CONFIG_HYPERV */
-idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
-idtentry int3 do_int3 has_error_code=0
+#if IS_ENABLED(CONFIG_ACRN_GUEST)
+apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
+ acrn_hv_callback_vector acrn_hv_vector_handler
+#endif
+
+idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=IST_INDEX_DB ist_offset=DB_STACK_OFFSET
+idtentry int3 do_int3 has_error_code=0 create_gap=1
idtentry stack_segment do_stack_segment has_error_code=1
-#ifdef CONFIG_XEN
+#ifdef CONFIG_XEN_PV
idtentry xennmi do_nmi has_error_code=0
idtentry xendebug do_debug has_error_code=0
-idtentry xenint3 do_int3 has_error_code=0
#endif
idtentry general_protection do_general_protection has_error_code=1
-idtentry page_fault do_page_fault has_error_code=1
+idtentry page_fault do_page_fault has_error_code=1 read_cr2=1
#ifdef CONFIG_KVM_GUEST
-idtentry async_page_fault do_async_page_fault has_error_code=1
+idtentry async_page_fault do_async_page_fault has_error_code=1 read_cr2=1
#endif
#ifdef CONFIG_X86_MCE
@@ -1199,6 +1240,13 @@
*/
SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
+ /*
+ * The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an
+ * unconditional CR3 write, even in the PTI case. So do an lfence
+ * to prevent GS speculation, regardless of whether PTI is enabled.
+ */
+ FENCE_SWAPGS_KERNEL_ENTRY
+
ret
END(paranoid_entry)
@@ -1249,6 +1297,7 @@
* from user mode due to an IRET fault.
*/
SWAPGS
+ FENCE_SWAPGS_USER_ENTRY
/* We have user CR3. Change to kernel CR3. */
SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
@@ -1260,18 +1309,11 @@
movq %rax, %rsp /* switch stack */
ENCODE_FRAME_POINTER
pushq %r12
-
- /*
- * We need to tell lockdep that IRQs are off. We can't do this until
- * we fix gsbase, and we should do it before enter_from_user_mode
- * (which can take locks).
- */
- TRACE_IRQS_OFF
- CALL_enter_from_user_mode
ret
+.Lerror_entry_done_lfence:
+ FENCE_SWAPGS_KERNEL_ENTRY
.Lerror_entry_done:
- TRACE_IRQS_OFF
ret
/*
@@ -1288,7 +1330,7 @@
cmpq %rax, RIP+8(%rsp)
je .Lbstep_iret
cmpq $.Lgs_change, RIP+8(%rsp)
- jne .Lerror_entry_done
+ jne .Lerror_entry_done_lfence
/*
* hack: .Lgs_change can fail with user gsbase. If this happens, fix up
@@ -1296,6 +1338,7 @@
* .Lgs_change's error handler with kernel gsbase.
*/
SWAPGS
+ FENCE_SWAPGS_USER_ENTRY
SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
jmp .Lerror_entry_done
@@ -1310,6 +1353,7 @@
* gsbase and CR3. Switch to kernel gsbase and CR3:
*/
SWAPGS
+ FENCE_SWAPGS_USER_ENTRY
SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
/*
@@ -1401,6 +1445,7 @@
swapgs
cld
+ FENCE_SWAPGS_USER_ENTRY
SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
movq %rsp, %rdx
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
@@ -1675,11 +1720,17 @@
iretq
END(nmi)
+#ifndef CONFIG_IA32_EMULATION
+/*
+ * This handles SYSCALL from 32-bit code. There is no way to program
+ * MSRs to fully disable 32-bit SYSCALL.
+ */
ENTRY(ignore_sysret)
UNWIND_HINT_EMPTY
mov $-ENOSYS, %eax
sysret
END(ignore_sysret)
+#endif
ENTRY(rewind_stack_do_exit)
UNWIND_HINT_FUNC
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 7d0df78..3991377 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -261,6 +261,11 @@
/* Opportunistic SYSRET */
sysret32_from_system_call:
+ /*
+ * We are not going to return to userspace from the trampoline
+ * stack. So let's erase the thread stack right now.
+ */
+ STACKLEAK_ERASE
TRACE_IRQS_ON /* User mode traces as IRQs on. */
movq RBX(%rsp), %rbx /* pt_regs->rbx */
movq RBP(%rsp), %rbp /* pt_regs->rbp */
@@ -356,7 +361,8 @@
/* Need to switch before accessing the thread stack. */
SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
- movq %rsp, %rdi
+ /* In the Xen PV case we already run on the thread stack. */
+ ALTERNATIVE "movq %rsp, %rdi", "jmp .Lint80_keep_stack", X86_FEATURE_XENPV
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
pushq 6*8(%rdi) /* regs->ss */
@@ -365,8 +371,9 @@
pushq 3*8(%rdi) /* regs->cs */
pushq 2*8(%rdi) /* regs->ip */
pushq 1*8(%rdi) /* regs->orig_ax */
-
pushq (%rdi) /* pt_regs->di */
+.Lint80_keep_stack:
+
pushq %rsi /* pt_regs->si */
xorl %esi, %esi /* nospec si */
pushq %rdx /* pt_regs->dx */
diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
index d5252bc..b1bf317 100644
--- a/arch/x86/entry/syscall_64.c
+++ b/arch/x86/entry/syscall_64.c
@@ -10,10 +10,13 @@
/* this is a lie, but it does not hurt as sys_ni_syscall just returns -EINVAL */
extern asmlinkage long sys_ni_syscall(const struct pt_regs *);
#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(const struct pt_regs *);
+#define __SYSCALL_X32(nr, sym, qual) __SYSCALL_64(nr, sym, qual)
#include <asm/syscalls_64.h>
#undef __SYSCALL_64
+#undef __SYSCALL_X32
#define __SYSCALL_64(nr, sym, qual) [nr] = sym,
+#define __SYSCALL_X32(nr, sym, qual)
asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
/*
@@ -23,3 +26,25 @@
[0 ... __NR_syscall_max] = &sys_ni_syscall,
#include <asm/syscalls_64.h>
};
+
+#undef __SYSCALL_64
+#undef __SYSCALL_X32
+
+#ifdef CONFIG_X86_X32_ABI
+
+#define __SYSCALL_64(nr, sym, qual)
+#define __SYSCALL_X32(nr, sym, qual) [nr] = sym,
+
+asmlinkage const sys_call_ptr_t x32_sys_call_table[__NR_syscall_x32_max+1] = {
+ /*
+ * Smells like a compiler bug -- it doesn't work
+ * when the & below is removed.
+ */
+ [0 ... __NR_syscall_x32_max] = &sys_ni_syscall,
+#include <asm/syscalls_64.h>
+};
+
+#undef __SYSCALL_64
+#undef __SYSCALL_X32
+
+#endif
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 3cf7b53..3fe0254 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -24,7 +24,7 @@
10 i386 unlink sys_unlink __ia32_sys_unlink
11 i386 execve sys_execve __ia32_compat_sys_execve
12 i386 chdir sys_chdir __ia32_sys_chdir
-13 i386 time sys_time __ia32_compat_sys_time
+13 i386 time sys_time32 __ia32_sys_time32
14 i386 mknod sys_mknod __ia32_sys_mknod
15 i386 chmod sys_chmod __ia32_sys_chmod
16 i386 lchown sys_lchown16 __ia32_sys_lchown16
@@ -36,12 +36,12 @@
22 i386 umount sys_oldumount __ia32_sys_oldumount
23 i386 setuid sys_setuid16 __ia32_sys_setuid16
24 i386 getuid sys_getuid16 __ia32_sys_getuid16
-25 i386 stime sys_stime __ia32_compat_sys_stime
+25 i386 stime sys_stime32 __ia32_sys_stime32
26 i386 ptrace sys_ptrace __ia32_compat_sys_ptrace
27 i386 alarm sys_alarm __ia32_sys_alarm
28 i386 oldfstat sys_fstat __ia32_sys_fstat
29 i386 pause sys_pause __ia32_sys_pause
-30 i386 utime sys_utime __ia32_compat_sys_utime
+30 i386 utime sys_utime32 __ia32_sys_utime32
31 i386 stty
32 i386 gtty
33 i386 access sys_access __ia32_sys_access
@@ -135,7 +135,7 @@
121 i386 setdomainname sys_setdomainname __ia32_sys_setdomainname
122 i386 uname sys_newuname __ia32_sys_newuname
123 i386 modify_ldt sys_modify_ldt __ia32_sys_modify_ldt
-124 i386 adjtimex sys_adjtimex __ia32_compat_sys_adjtimex
+124 i386 adjtimex sys_adjtimex_time32 __ia32_sys_adjtimex_time32
125 i386 mprotect sys_mprotect __ia32_sys_mprotect
126 i386 sigprocmask sys_sigprocmask __ia32_compat_sys_sigprocmask
127 i386 create_module
@@ -172,8 +172,8 @@
158 i386 sched_yield sys_sched_yield __ia32_sys_sched_yield
159 i386 sched_get_priority_max sys_sched_get_priority_max __ia32_sys_sched_get_priority_max
160 i386 sched_get_priority_min sys_sched_get_priority_min __ia32_sys_sched_get_priority_min
-161 i386 sched_rr_get_interval sys_sched_rr_get_interval __ia32_compat_sys_sched_rr_get_interval
-162 i386 nanosleep sys_nanosleep __ia32_compat_sys_nanosleep
+161 i386 sched_rr_get_interval sys_sched_rr_get_interval_time32 __ia32_sys_sched_rr_get_interval_time32
+162 i386 nanosleep sys_nanosleep_time32 __ia32_sys_nanosleep_time32
163 i386 mremap sys_mremap __ia32_sys_mremap
164 i386 setresuid sys_setresuid16 __ia32_sys_setresuid16
165 i386 getresuid sys_getresuid16 __ia32_sys_getresuid16
@@ -186,11 +186,11 @@
172 i386 prctl sys_prctl __ia32_sys_prctl
173 i386 rt_sigreturn sys_rt_sigreturn sys32_rt_sigreturn
174 i386 rt_sigaction sys_rt_sigaction __ia32_compat_sys_rt_sigaction
-175 i386 rt_sigprocmask sys_rt_sigprocmask __ia32_sys_rt_sigprocmask
+175 i386 rt_sigprocmask sys_rt_sigprocmask __ia32_compat_sys_rt_sigprocmask
176 i386 rt_sigpending sys_rt_sigpending __ia32_compat_sys_rt_sigpending
-177 i386 rt_sigtimedwait sys_rt_sigtimedwait __ia32_compat_sys_rt_sigtimedwait
+177 i386 rt_sigtimedwait sys_rt_sigtimedwait_time32 __ia32_compat_sys_rt_sigtimedwait_time32
178 i386 rt_sigqueueinfo sys_rt_sigqueueinfo __ia32_compat_sys_rt_sigqueueinfo
-179 i386 rt_sigsuspend sys_rt_sigsuspend __ia32_sys_rt_sigsuspend
+179 i386 rt_sigsuspend sys_rt_sigsuspend __ia32_compat_sys_rt_sigsuspend
180 i386 pread64 sys_pread64 __ia32_compat_sys_x86_pread
181 i386 pwrite64 sys_pwrite64 __ia32_compat_sys_x86_pwrite
182 i386 chown sys_chown16 __ia32_sys_chown16
@@ -251,14 +251,14 @@
237 i386 fremovexattr sys_fremovexattr __ia32_sys_fremovexattr
238 i386 tkill sys_tkill __ia32_sys_tkill
239 i386 sendfile64 sys_sendfile64 __ia32_sys_sendfile64
-240 i386 futex sys_futex __ia32_compat_sys_futex
+240 i386 futex sys_futex_time32 __ia32_sys_futex_time32
241 i386 sched_setaffinity sys_sched_setaffinity __ia32_compat_sys_sched_setaffinity
242 i386 sched_getaffinity sys_sched_getaffinity __ia32_compat_sys_sched_getaffinity
243 i386 set_thread_area sys_set_thread_area __ia32_sys_set_thread_area
244 i386 get_thread_area sys_get_thread_area __ia32_sys_get_thread_area
245 i386 io_setup sys_io_setup __ia32_compat_sys_io_setup
246 i386 io_destroy sys_io_destroy __ia32_sys_io_destroy
-247 i386 io_getevents sys_io_getevents __ia32_compat_sys_io_getevents
+247 i386 io_getevents sys_io_getevents_time32 __ia32_sys_io_getevents_time32
248 i386 io_submit sys_io_submit __ia32_compat_sys_io_submit
249 i386 io_cancel sys_io_cancel __ia32_sys_io_cancel
250 i386 fadvise64 sys_fadvise64 __ia32_compat_sys_x86_fadvise64
@@ -271,18 +271,18 @@
257 i386 remap_file_pages sys_remap_file_pages __ia32_sys_remap_file_pages
258 i386 set_tid_address sys_set_tid_address __ia32_sys_set_tid_address
259 i386 timer_create sys_timer_create __ia32_compat_sys_timer_create
-260 i386 timer_settime sys_timer_settime __ia32_compat_sys_timer_settime
-261 i386 timer_gettime sys_timer_gettime __ia32_compat_sys_timer_gettime
+260 i386 timer_settime sys_timer_settime32 __ia32_sys_timer_settime32
+261 i386 timer_gettime sys_timer_gettime32 __ia32_sys_timer_gettime32
262 i386 timer_getoverrun sys_timer_getoverrun __ia32_sys_timer_getoverrun
263 i386 timer_delete sys_timer_delete __ia32_sys_timer_delete
-264 i386 clock_settime sys_clock_settime __ia32_compat_sys_clock_settime
-265 i386 clock_gettime sys_clock_gettime __ia32_compat_sys_clock_gettime
-266 i386 clock_getres sys_clock_getres __ia32_compat_sys_clock_getres
-267 i386 clock_nanosleep sys_clock_nanosleep __ia32_compat_sys_clock_nanosleep
+264 i386 clock_settime sys_clock_settime32 __ia32_sys_clock_settime32
+265 i386 clock_gettime sys_clock_gettime32 __ia32_sys_clock_gettime32
+266 i386 clock_getres sys_clock_getres_time32 __ia32_sys_clock_getres_time32
+267 i386 clock_nanosleep sys_clock_nanosleep_time32 __ia32_sys_clock_nanosleep_time32
268 i386 statfs64 sys_statfs64 __ia32_compat_sys_statfs64
269 i386 fstatfs64 sys_fstatfs64 __ia32_compat_sys_fstatfs64
270 i386 tgkill sys_tgkill __ia32_sys_tgkill
-271 i386 utimes sys_utimes __ia32_compat_sys_utimes
+271 i386 utimes sys_utimes_time32 __ia32_sys_utimes_time32
272 i386 fadvise64_64 sys_fadvise64_64 __ia32_compat_sys_x86_fadvise64_64
273 i386 vserver
274 i386 mbind sys_mbind __ia32_sys_mbind
@@ -290,8 +290,8 @@
276 i386 set_mempolicy sys_set_mempolicy __ia32_sys_set_mempolicy
277 i386 mq_open sys_mq_open __ia32_compat_sys_mq_open
278 i386 mq_unlink sys_mq_unlink __ia32_sys_mq_unlink
-279 i386 mq_timedsend sys_mq_timedsend __ia32_compat_sys_mq_timedsend
-280 i386 mq_timedreceive sys_mq_timedreceive __ia32_compat_sys_mq_timedreceive
+279 i386 mq_timedsend sys_mq_timedsend_time32 __ia32_sys_mq_timedsend_time32
+280 i386 mq_timedreceive sys_mq_timedreceive_time32 __ia32_sys_mq_timedreceive_time32
281 i386 mq_notify sys_mq_notify __ia32_compat_sys_mq_notify
282 i386 mq_getsetattr sys_mq_getsetattr __ia32_compat_sys_mq_getsetattr
283 i386 kexec_load sys_kexec_load __ia32_compat_sys_kexec_load
@@ -310,7 +310,7 @@
296 i386 mkdirat sys_mkdirat __ia32_sys_mkdirat
297 i386 mknodat sys_mknodat __ia32_sys_mknodat
298 i386 fchownat sys_fchownat __ia32_sys_fchownat
-299 i386 futimesat sys_futimesat __ia32_compat_sys_futimesat
+299 i386 futimesat sys_futimesat_time32 __ia32_sys_futimesat_time32
300 i386 fstatat64 sys_fstatat64 __ia32_compat_sys_x86_fstatat
301 i386 unlinkat sys_unlinkat __ia32_sys_unlinkat
302 i386 renameat sys_renameat __ia32_sys_renameat
@@ -319,8 +319,8 @@
305 i386 readlinkat sys_readlinkat __ia32_sys_readlinkat
306 i386 fchmodat sys_fchmodat __ia32_sys_fchmodat
307 i386 faccessat sys_faccessat __ia32_sys_faccessat
-308 i386 pselect6 sys_pselect6 __ia32_compat_sys_pselect6
-309 i386 ppoll sys_ppoll __ia32_compat_sys_ppoll
+308 i386 pselect6 sys_pselect6_time32 __ia32_compat_sys_pselect6_time32
+309 i386 ppoll sys_ppoll_time32 __ia32_compat_sys_ppoll_time32
310 i386 unshare sys_unshare __ia32_sys_unshare
311 i386 set_robust_list sys_set_robust_list __ia32_compat_sys_set_robust_list
312 i386 get_robust_list sys_get_robust_list __ia32_compat_sys_get_robust_list
@@ -331,13 +331,13 @@
317 i386 move_pages sys_move_pages __ia32_compat_sys_move_pages
318 i386 getcpu sys_getcpu __ia32_sys_getcpu
319 i386 epoll_pwait sys_epoll_pwait __ia32_sys_epoll_pwait
-320 i386 utimensat sys_utimensat __ia32_compat_sys_utimensat
+320 i386 utimensat sys_utimensat_time32 __ia32_sys_utimensat_time32
321 i386 signalfd sys_signalfd __ia32_compat_sys_signalfd
322 i386 timerfd_create sys_timerfd_create __ia32_sys_timerfd_create
323 i386 eventfd sys_eventfd __ia32_sys_eventfd
324 i386 fallocate sys_fallocate __ia32_compat_sys_x86_fallocate
-325 i386 timerfd_settime sys_timerfd_settime __ia32_compat_sys_timerfd_settime
-326 i386 timerfd_gettime sys_timerfd_gettime __ia32_compat_sys_timerfd_gettime
+325 i386 timerfd_settime sys_timerfd_settime32 __ia32_sys_timerfd_settime32
+326 i386 timerfd_gettime sys_timerfd_gettime32 __ia32_sys_timerfd_gettime32
327 i386 signalfd4 sys_signalfd4 __ia32_compat_sys_signalfd4
328 i386 eventfd2 sys_eventfd2 __ia32_sys_eventfd2
329 i386 epoll_create1 sys_epoll_create1 __ia32_sys_epoll_create1
@@ -348,13 +348,13 @@
334 i386 pwritev sys_pwritev __ia32_compat_sys_pwritev
335 i386 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo __ia32_compat_sys_rt_tgsigqueueinfo
336 i386 perf_event_open sys_perf_event_open __ia32_sys_perf_event_open
-337 i386 recvmmsg sys_recvmmsg __ia32_compat_sys_recvmmsg
+337 i386 recvmmsg sys_recvmmsg_time32 __ia32_compat_sys_recvmmsg_time32
338 i386 fanotify_init sys_fanotify_init __ia32_sys_fanotify_init
339 i386 fanotify_mark sys_fanotify_mark __ia32_compat_sys_fanotify_mark
340 i386 prlimit64 sys_prlimit64 __ia32_sys_prlimit64
341 i386 name_to_handle_at sys_name_to_handle_at __ia32_sys_name_to_handle_at
342 i386 open_by_handle_at sys_open_by_handle_at __ia32_compat_sys_open_by_handle_at
-343 i386 clock_adjtime sys_clock_adjtime __ia32_compat_sys_clock_adjtime
+343 i386 clock_adjtime sys_clock_adjtime32 __ia32_sys_clock_adjtime32
344 i386 syncfs sys_syncfs __ia32_sys_syncfs
345 i386 sendmmsg sys_sendmmsg __ia32_compat_sys_sendmmsg
346 i386 setns sys_setns __ia32_sys_setns
@@ -396,5 +396,47 @@
382 i386 pkey_free sys_pkey_free __ia32_sys_pkey_free
383 i386 statx sys_statx __ia32_sys_statx
384 i386 arch_prctl sys_arch_prctl __ia32_compat_sys_arch_prctl
-385 i386 io_pgetevents sys_io_pgetevents __ia32_compat_sys_io_pgetevents
+385 i386 io_pgetevents sys_io_pgetevents_time32 __ia32_compat_sys_io_pgetevents
386 i386 rseq sys_rseq __ia32_sys_rseq
+393 i386 semget sys_semget __ia32_sys_semget
+394 i386 semctl sys_semctl __ia32_compat_sys_semctl
+395 i386 shmget sys_shmget __ia32_sys_shmget
+396 i386 shmctl sys_shmctl __ia32_compat_sys_shmctl
+397 i386 shmat sys_shmat __ia32_compat_sys_shmat
+398 i386 shmdt sys_shmdt __ia32_sys_shmdt
+399 i386 msgget sys_msgget __ia32_sys_msgget
+400 i386 msgsnd sys_msgsnd __ia32_compat_sys_msgsnd
+401 i386 msgrcv sys_msgrcv __ia32_compat_sys_msgrcv
+402 i386 msgctl sys_msgctl __ia32_compat_sys_msgctl
+403 i386 clock_gettime64 sys_clock_gettime __ia32_sys_clock_gettime
+404 i386 clock_settime64 sys_clock_settime __ia32_sys_clock_settime
+405 i386 clock_adjtime64 sys_clock_adjtime __ia32_sys_clock_adjtime
+406 i386 clock_getres_time64 sys_clock_getres __ia32_sys_clock_getres
+407 i386 clock_nanosleep_time64 sys_clock_nanosleep __ia32_sys_clock_nanosleep
+408 i386 timer_gettime64 sys_timer_gettime __ia32_sys_timer_gettime
+409 i386 timer_settime64 sys_timer_settime __ia32_sys_timer_settime
+410 i386 timerfd_gettime64 sys_timerfd_gettime __ia32_sys_timerfd_gettime
+411 i386 timerfd_settime64 sys_timerfd_settime __ia32_sys_timerfd_settime
+412 i386 utimensat_time64 sys_utimensat __ia32_sys_utimensat
+413 i386 pselect6_time64 sys_pselect6 __ia32_compat_sys_pselect6_time64
+414 i386 ppoll_time64 sys_ppoll __ia32_compat_sys_ppoll_time64
+416 i386 io_pgetevents_time64 sys_io_pgetevents __ia32_sys_io_pgetevents
+417 i386 recvmmsg_time64 sys_recvmmsg __ia32_compat_sys_recvmmsg_time64
+418 i386 mq_timedsend_time64 sys_mq_timedsend __ia32_sys_mq_timedsend
+419 i386 mq_timedreceive_time64 sys_mq_timedreceive __ia32_sys_mq_timedreceive
+420 i386 semtimedop_time64 sys_semtimedop __ia32_sys_semtimedop
+421 i386 rt_sigtimedwait_time64 sys_rt_sigtimedwait __ia32_compat_sys_rt_sigtimedwait_time64
+422 i386 futex_time64 sys_futex __ia32_sys_futex
+423 i386 sched_rr_get_interval_time64 sys_sched_rr_get_interval __ia32_sys_sched_rr_get_interval
+424 i386 pidfd_send_signal sys_pidfd_send_signal __ia32_sys_pidfd_send_signal
+425 i386 io_uring_setup sys_io_uring_setup __ia32_sys_io_uring_setup
+426 i386 io_uring_enter sys_io_uring_enter __ia32_sys_io_uring_enter
+427 i386 io_uring_register sys_io_uring_register __ia32_sys_io_uring_register
+428 i386 open_tree sys_open_tree __ia32_sys_open_tree
+429 i386 move_mount sys_move_mount __ia32_sys_move_mount
+430 i386 fsopen sys_fsopen __ia32_sys_fsopen
+431 i386 fsconfig sys_fsconfig __ia32_sys_fsconfig
+432 i386 fsmount sys_fsmount __ia32_sys_fsmount
+433 i386 fspick sys_fspick __ia32_sys_fspick
+434 i386 pidfd_open sys_pidfd_open __ia32_sys_pidfd_open
+435 i386 clone3 sys_clone3 __ia32_sys_clone3
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index f0b1709..c29976e 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -343,6 +343,20 @@
332 common statx __x64_sys_statx
333 common io_pgetevents __x64_sys_io_pgetevents
334 common rseq __x64_sys_rseq
+# don't use numbers 387 through 423, add new calls after the last
+# 'common' entry
+424 common pidfd_send_signal __x64_sys_pidfd_send_signal
+425 common io_uring_setup __x64_sys_io_uring_setup
+426 common io_uring_enter __x64_sys_io_uring_enter
+427 common io_uring_register __x64_sys_io_uring_register
+428 common open_tree __x64_sys_open_tree
+429 common move_mount __x64_sys_move_mount
+430 common fsopen __x64_sys_fsopen
+431 common fsconfig __x64_sys_fsconfig
+432 common fsmount __x64_sys_fsmount
+433 common fspick __x64_sys_fspick
+434 common pidfd_open __x64_sys_pidfd_open
+435 common clone3 __x64_sys_clone3/ptregs
#
# x32-specific system call numbers start at 512 to avoid cache impact
@@ -361,7 +375,7 @@
520 x32 execve __x32_compat_sys_execve/ptregs
521 x32 ptrace __x32_compat_sys_ptrace
522 x32 rt_sigpending __x32_compat_sys_rt_sigpending
-523 x32 rt_sigtimedwait __x32_compat_sys_rt_sigtimedwait
+523 x32 rt_sigtimedwait __x32_compat_sys_rt_sigtimedwait_time64
524 x32 rt_sigqueueinfo __x32_compat_sys_rt_sigqueueinfo
525 x32 sigaltstack __x32_compat_sys_sigaltstack
526 x32 timer_create __x32_compat_sys_timer_create
@@ -375,7 +389,7 @@
534 x32 preadv __x32_compat_sys_preadv64
535 x32 pwritev __x32_compat_sys_pwritev64
536 x32 rt_tgsigqueueinfo __x32_compat_sys_rt_tgsigqueueinfo
-537 x32 recvmmsg __x32_compat_sys_recvmmsg
+537 x32 recvmmsg __x32_compat_sys_recvmmsg_time64
538 x32 sendmmsg __x32_compat_sys_sendmmsg
539 x32 process_vm_readv __x32_compat_sys_process_vm_readv
540 x32 process_vm_writev __x32_compat_sys_process_vm_writev
diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh
index 94fcd19..1af2be3 100644
--- a/arch/x86/entry/syscalls/syscalltbl.sh
+++ b/arch/x86/entry/syscalls/syscalltbl.sh
@@ -1,13 +1,13 @@
-#!/bin/sh
+#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
in="$1"
out="$2"
syscall_macro() {
- abi="$1"
- nr="$2"
- entry="$3"
+ local abi="$1"
+ local nr="$2"
+ local entry="$3"
# Entry can be either just a function name or "function/qualifier"
real_entry="${entry%%/*}"
@@ -21,14 +21,14 @@
}
emit() {
- abi="$1"
- nr="$2"
- entry="$3"
- compat="$4"
- umlentry=""
+ local abi="$1"
+ local nr="$2"
+ local entry="$3"
+ local compat="$4"
+ local umlentry=""
- if [ "$abi" = "64" -a -n "$compat" ]; then
- echo "a compat entry for a 64-bit syscall makes no sense" >&2
+ if [ "$abi" != "I386" -a -n "$compat" ]; then
+ echo "a compat entry ($abi: $compat) for a 64-bit syscall makes no sense" >&2
exit 1
fi
@@ -62,14 +62,17 @@
while read nr abi name entry compat; do
abi=`echo "$abi" | tr '[a-z]' '[A-Z]'`
if [ "$abi" = "COMMON" -o "$abi" = "64" ]; then
- # COMMON is the same as 64, except that we don't expect X32
- # programs to use it. Our expectation has nothing to do with
- # any generated code, so treat them the same.
emit 64 "$nr" "$entry" "$compat"
+ if [ "$abi" = "COMMON" ]; then
+ # COMMON means that this syscall exists in the same form for
+ # 64-bit and X32.
+ echo "#ifdef CONFIG_X86_X32_ABI"
+ emit X32 "$nr" "$entry" "$compat"
+ echo "#endif"
+ fi
elif [ "$abi" = "X32" ]; then
- # X32 is equivalent to 64 on an X32-compatible kernel.
echo "#ifdef CONFIG_X86_X32_ABI"
- emit 64 "$nr" "$entry" "$compat"
+ emit X32 "$nr" "$entry" "$compat"
echo "#endif"
elif [ "$abi" = "I386" ]; then
emit "$abi" "$nr" "$entry" "$compat"
diff --git a/arch/x86/entry/thunk_32.S b/arch/x86/entry/thunk_32.S
index fee6bc7..2713490 100644
--- a/arch/x86/entry/thunk_32.S
+++ b/arch/x86/entry/thunk_32.S
@@ -1,8 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Trampoline to trace irqs off. (otherwise CALLER_ADDR1 might crash)
* Copyright 2008 by Steven Rostedt, Red Hat, Inc
* (inspired by Andi Kleen's thunk_64.S)
- * Subject to the GNU public license, v.2. No warranty of any kind.
*/
#include <linux/linkage.h>
#include <asm/asm.h>
@@ -34,7 +34,7 @@
THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1
#endif
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
THUNK ___preempt_schedule, preempt_schedule
THUNK ___preempt_schedule_notrace, preempt_schedule_notrace
EXPORT_SYMBOL(___preempt_schedule)
diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S
index be36bf4..ea5c416 100644
--- a/arch/x86/entry/thunk_64.S
+++ b/arch/x86/entry/thunk_64.S
@@ -1,9 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Save registers before calling assembly functions. This avoids
* disturbance of register allocation in some inline assembly constructs.
* Copyright 2001,2002 by Andi Kleen, SuSE Labs.
* Added trace_hardirqs callers - Copyright 2007 Steven Rostedt, Red Hat, Inc.
- * Subject to the GNU public license, v.2. No warranty of any kind.
*/
#include <linux/linkage.h>
#include "calling.h"
@@ -12,9 +12,7 @@
/* rdi: arg1 ... normal C conventions. rax is saved/restored. */
.macro THUNK name, func, put_ret_addr_in_rdi=0
- .globl \name
- .type \name, @function
-\name:
+ ENTRY(\name)
pushq %rbp
movq %rsp, %rbp
@@ -35,6 +33,7 @@
call \func
jmp .L_restore
+ ENDPROC(\name)
_ASM_NOKPROBE(\name)
.endm
@@ -47,7 +46,7 @@
THUNK lockdep_sys_exit_thunk,lockdep_sys_exit
#endif
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
THUNK ___preempt_schedule, preempt_schedule
THUNK ___preempt_schedule_notrace, preempt_schedule_notrace
EXPORT_SYMBOL(___preempt_schedule)
@@ -56,7 +55,7 @@
#if defined(CONFIG_TRACE_IRQFLAGS) \
|| defined(CONFIG_DEBUG_LOCK_ALLOC) \
- || defined(CONFIG_PREEMPT)
+ || defined(CONFIG_PREEMPTION)
.L_restore:
popq %r11
popq %r10
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index c3d7ccd..0f21541 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -3,6 +3,12 @@
# Building vDSO images for x86.
#
+# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before
+# the inclusion of generic Makefile.
+ARCH_REL_TYPE_ABS := R_X86_64_JUMP_SLOT|R_X86_64_GLOB_DAT|R_X86_64_RELATIVE|
+ARCH_REL_TYPE_ABS += R_386_GLOB_DAT|R_386_JMP_SLOT|R_386_RELATIVE
+include $(srctree)/lib/vdso/Makefile
+
KBUILD_CFLAGS += $(DISABLE_LTO)
KASAN_SANITIZE := n
UBSAN_SANITIZE := n
@@ -47,10 +53,10 @@
CPPFLAGS_vdso.lds += -P -C
VDSO_LDFLAGS_vdso.lds = -m elf_x86_64 -soname linux-vdso.so.1 --no-undefined \
- -z max-page-size=4096 -z common-page-size=4096
+ -z max-page-size=4096
$(obj)/vdso64.so.dbg: $(obj)/vdso.lds $(vobjs) FORCE
- $(call if_changed,vdso)
+ $(call if_changed,vdso_and_check)
HOST_EXTRACFLAGS += -I$(srctree)/tools/include -I$(srctree)/include/uapi -I$(srctree)/arch/$(SUBARCH)/include/uapi
hostprogs-y += vdso2c
@@ -83,6 +89,7 @@
#
CFLAGS_REMOVE_vdso-note.o = -pg
CFLAGS_REMOVE_vclock_gettime.o = -pg
+CFLAGS_REMOVE_vdso32/vclock_gettime.o = -pg
CFLAGS_REMOVE_vgetcpu.o = -pg
CFLAGS_REMOVE_vvar.o = -pg
@@ -98,7 +105,7 @@
CPPFLAGS_vdsox32.lds = $(CPPFLAGS_vdso.lds)
VDSO_LDFLAGS_vdsox32.lds = -m elf32_x86_64 -soname linux-vdso.so.1 \
- -z max-page-size=4096 -z common-page-size=4096
+ -z max-page-size=4096
# x32-rebranded versions
vobjx32s-y := $(vobjs-y:.o=-x32.o)
@@ -116,13 +123,13 @@
targets += vdsox32.lds $(vobjx32s-y)
$(obj)/%.so: OBJCOPYFLAGS := -S
-$(obj)/%.so: $(obj)/%.so.dbg
+$(obj)/%.so: $(obj)/%.so.dbg FORCE
$(call if_changed,objcopy)
$(obj)/vdsox32.so.dbg: $(obj)/vdsox32.lds $(vobjx32s) FORCE
- $(call if_changed,vdso)
+ $(call if_changed,vdso_and_check)
-CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds)
+CPPFLAGS_vdso32/vdso32.lds = $(CPPFLAGS_vdso.lds)
VDSO_LDFLAGS_vdso32.lds = -m elf_i386 -soname linux-gate.so.1
targets += vdso32/vdso32.lds
@@ -159,7 +166,7 @@
$(obj)/vdso32/note.o \
$(obj)/vdso32/system_call.o \
$(obj)/vdso32/sigreturn.o
- $(call if_changed,vdso)
+ $(call if_changed,vdso_and_check)
#
# The DSO images are built using a special linker script.
@@ -170,11 +177,13 @@
-T $(filter %.lds,$^) $(filter %.o,$^) && \
sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@'
-VDSO_LDFLAGS = -shared $(call ld-option, --hash-style=both) \
- $(call ld-option, --build-id) $(call ld-option, --eh-frame-hdr) \
- -Bsymbolic
+VDSO_LDFLAGS = -shared --hash-style=both --build-id \
+ $(call ld-option, --eh-frame-hdr) -Bsymbolic
GCOV_PROFILE := n
+quiet_cmd_vdso_and_check = VDSO $@
+ cmd_vdso_and_check = $(cmd_vdso); $(cmd_vdso_check)
+
#
# Install the unstripped copies of vdso*.so. If our toolchain supports
# build-id, install .build-id links as well.
diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index e48ca3a..d9ff616 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -1,330 +1,85 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
- * Copyright 2006 Andi Kleen, SUSE Labs.
- * Subject to the GNU Public License, v.2
- *
* Fast user context implementation of clock_gettime, gettimeofday, and time.
*
+ * Copyright 2006 Andi Kleen, SUSE Labs.
+ * Copyright 2019 ARM Limited
+ *
* 32 Bit compat layer by Stefani Seibold <stefani@seibold.net>
* sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany
- *
- * The code should have no internal unresolved relocations.
- * Check with readelf after changing.
*/
-
-#include <uapi/linux/time.h>
-#include <asm/vgtod.h>
-#include <asm/vvar.h>
-#include <asm/unistd.h>
-#include <asm/msr.h>
-#include <asm/pvclock.h>
-#include <asm/mshyperv.h>
-#include <linux/math64.h>
#include <linux/time.h>
#include <linux/kernel.h>
+#include <linux/types.h>
-#define gtod (&VVAR(vsyscall_gtod_data))
+#include "../../../../lib/vdso/gettimeofday.c"
-extern int __vdso_clock_gettime(clockid_t clock, struct timespec *ts);
-extern int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz);
+extern int __vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz);
extern time_t __vdso_time(time_t *t);
-#ifdef CONFIG_PARAVIRT_CLOCK
-extern u8 pvclock_page
- __attribute__((visibility("hidden")));
-#endif
-
-#ifdef CONFIG_HYPERV_TSCPAGE
-extern u8 hvclock_page
- __attribute__((visibility("hidden")));
-#endif
-
-#ifndef BUILD_VDSO32
-
-notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
+int __vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz)
{
- long ret;
- asm ("syscall" : "=a" (ret), "=m" (*ts) :
- "0" (__NR_clock_gettime), "D" (clock), "S" (ts) :
- "memory", "rcx", "r11");
- return ret;
+ return __cvdso_gettimeofday(tv, tz);
}
-notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
-{
- long ret;
-
- asm ("syscall" : "=a" (ret), "=m" (*tv), "=m" (*tz) :
- "0" (__NR_gettimeofday), "D" (tv), "S" (tz) :
- "memory", "rcx", "r11");
- return ret;
-}
-
-
-#else
-
-notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
-{
- long ret;
-
- asm (
- "mov %%ebx, %%edx \n"
- "mov %[clock], %%ebx \n"
- "call __kernel_vsyscall \n"
- "mov %%edx, %%ebx \n"
- : "=a" (ret), "=m" (*ts)
- : "0" (__NR_clock_gettime), [clock] "g" (clock), "c" (ts)
- : "memory", "edx");
- return ret;
-}
-
-notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
-{
- long ret;
-
- asm (
- "mov %%ebx, %%edx \n"
- "mov %[tv], %%ebx \n"
- "call __kernel_vsyscall \n"
- "mov %%edx, %%ebx \n"
- : "=a" (ret), "=m" (*tv), "=m" (*tz)
- : "0" (__NR_gettimeofday), [tv] "g" (tv), "c" (tz)
- : "memory", "edx");
- return ret;
-}
-
-#endif
-
-#ifdef CONFIG_PARAVIRT_CLOCK
-static notrace const struct pvclock_vsyscall_time_info *get_pvti0(void)
-{
- return (const struct pvclock_vsyscall_time_info *)&pvclock_page;
-}
-
-static notrace u64 vread_pvclock(int *mode)
-{
- const struct pvclock_vcpu_time_info *pvti = &get_pvti0()->pvti;
- u64 ret;
- u64 last;
- u32 version;
-
- /*
- * Note: The kernel and hypervisor must guarantee that cpu ID
- * number maps 1:1 to per-CPU pvclock time info.
- *
- * Because the hypervisor is entirely unaware of guest userspace
- * preemption, it cannot guarantee that per-CPU pvclock time
- * info is updated if the underlying CPU changes or that that
- * version is increased whenever underlying CPU changes.
- *
- * On KVM, we are guaranteed that pvti updates for any vCPU are
- * atomic as seen by *all* vCPUs. This is an even stronger
- * guarantee than we get with a normal seqlock.
- *
- * On Xen, we don't appear to have that guarantee, but Xen still
- * supplies a valid seqlock using the version field.
- *
- * We only do pvclock vdso timing at all if
- * PVCLOCK_TSC_STABLE_BIT is set, and we interpret that bit to
- * mean that all vCPUs have matching pvti and that the TSC is
- * synced, so we can just look at vCPU 0's pvti.
- */
-
- do {
- version = pvclock_read_begin(pvti);
-
- if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) {
- *mode = VCLOCK_NONE;
- return 0;
- }
-
- ret = __pvclock_read_cycles(pvti, rdtsc_ordered());
- } while (pvclock_read_retry(pvti, version));
-
- /* refer to vread_tsc() comment for rationale */
- last = gtod->cycle_last;
-
- if (likely(ret >= last))
- return ret;
-
- return last;
-}
-#endif
-#ifdef CONFIG_HYPERV_TSCPAGE
-static notrace u64 vread_hvclock(int *mode)
-{
- const struct ms_hyperv_tsc_page *tsc_pg =
- (const struct ms_hyperv_tsc_page *)&hvclock_page;
- u64 current_tick = hv_read_tsc_page(tsc_pg);
-
- if (current_tick != U64_MAX)
- return current_tick;
-
- *mode = VCLOCK_NONE;
- return 0;
-}
-#endif
-
-notrace static u64 vread_tsc(void)
-{
- u64 ret = (u64)rdtsc_ordered();
- u64 last = gtod->cycle_last;
-
- if (likely(ret >= last))
- return ret;
-
- /*
- * GCC likes to generate cmov here, but this branch is extremely
- * predictable (it's just a function of time and the likely is
- * very likely) and there's a data dependence, so force GCC
- * to generate a branch instead. I don't barrier() because
- * we don't actually need a barrier, and if this function
- * ever gets inlined it will generate worse code.
- */
- asm volatile ("");
- return last;
-}
-
-notrace static inline u64 vgetsns(int *mode)
-{
- u64 v;
- cycles_t cycles;
-
- if (gtod->vclock_mode == VCLOCK_TSC)
- cycles = vread_tsc();
-#ifdef CONFIG_PARAVIRT_CLOCK
- else if (gtod->vclock_mode == VCLOCK_PVCLOCK)
- cycles = vread_pvclock(mode);
-#endif
-#ifdef CONFIG_HYPERV_TSCPAGE
- else if (gtod->vclock_mode == VCLOCK_HVCLOCK)
- cycles = vread_hvclock(mode);
-#endif
- else
- return 0;
- v = (cycles - gtod->cycle_last) & gtod->mask;
- return v * gtod->mult;
-}
-
-/* Code size doesn't matter (vdso is 4k anyway) and this is faster. */
-notrace static int __always_inline do_realtime(struct timespec *ts)
-{
- unsigned long seq;
- u64 ns;
- int mode;
-
- do {
- seq = gtod_read_begin(gtod);
- mode = gtod->vclock_mode;
- ts->tv_sec = gtod->wall_time_sec;
- ns = gtod->wall_time_snsec;
- ns += vgetsns(&mode);
- ns >>= gtod->shift;
- } while (unlikely(gtod_read_retry(gtod, seq)));
-
- ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
- ts->tv_nsec = ns;
-
- return mode;
-}
-
-notrace static int __always_inline do_monotonic(struct timespec *ts)
-{
- unsigned long seq;
- u64 ns;
- int mode;
-
- do {
- seq = gtod_read_begin(gtod);
- mode = gtod->vclock_mode;
- ts->tv_sec = gtod->monotonic_time_sec;
- ns = gtod->monotonic_time_snsec;
- ns += vgetsns(&mode);
- ns >>= gtod->shift;
- } while (unlikely(gtod_read_retry(gtod, seq)));
-
- ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
- ts->tv_nsec = ns;
-
- return mode;
-}
-
-notrace static void do_realtime_coarse(struct timespec *ts)
-{
- unsigned long seq;
- do {
- seq = gtod_read_begin(gtod);
- ts->tv_sec = gtod->wall_time_coarse_sec;
- ts->tv_nsec = gtod->wall_time_coarse_nsec;
- } while (unlikely(gtod_read_retry(gtod, seq)));
-}
-
-notrace static void do_monotonic_coarse(struct timespec *ts)
-{
- unsigned long seq;
- do {
- seq = gtod_read_begin(gtod);
- ts->tv_sec = gtod->monotonic_time_coarse_sec;
- ts->tv_nsec = gtod->monotonic_time_coarse_nsec;
- } while (unlikely(gtod_read_retry(gtod, seq)));
-}
-
-notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
-{
- switch (clock) {
- case CLOCK_REALTIME:
- if (do_realtime(ts) == VCLOCK_NONE)
- goto fallback;
- break;
- case CLOCK_MONOTONIC:
- if (do_monotonic(ts) == VCLOCK_NONE)
- goto fallback;
- break;
- case CLOCK_REALTIME_COARSE:
- do_realtime_coarse(ts);
- break;
- case CLOCK_MONOTONIC_COARSE:
- do_monotonic_coarse(ts);
- break;
- default:
- goto fallback;
- }
-
- return 0;
-fallback:
- return vdso_fallback_gettime(clock, ts);
-}
-int clock_gettime(clockid_t, struct timespec *)
- __attribute__((weak, alias("__vdso_clock_gettime")));
-
-notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
-{
- if (likely(tv != NULL)) {
- if (unlikely(do_realtime((struct timespec *)tv) == VCLOCK_NONE))
- return vdso_fallback_gtod(tv, tz);
- tv->tv_usec /= 1000;
- }
- if (unlikely(tz != NULL)) {
- tz->tz_minuteswest = gtod->tz_minuteswest;
- tz->tz_dsttime = gtod->tz_dsttime;
- }
-
- return 0;
-}
-int gettimeofday(struct timeval *, struct timezone *)
+int gettimeofday(struct __kernel_old_timeval *, struct timezone *)
__attribute__((weak, alias("__vdso_gettimeofday")));
-/*
- * This will break when the xtime seconds get inaccurate, but that is
- * unlikely
- */
-notrace time_t __vdso_time(time_t *t)
+time_t __vdso_time(time_t *t)
{
- /* This is atomic on x86 so we don't need any locks. */
- time_t result = READ_ONCE(gtod->wall_time_sec);
-
- if (t)
- *t = result;
- return result;
+ return __cvdso_time(t);
}
-time_t time(time_t *t)
- __attribute__((weak, alias("__vdso_time")));
+
+time_t time(time_t *t) __attribute__((weak, alias("__vdso_time")));
+
+
+#if defined(CONFIG_X86_64) && !defined(BUILD_VDSO32_64)
+/* both 64-bit and x32 use these */
+extern int __vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts);
+extern int __vdso_clock_getres(clockid_t clock, struct __kernel_timespec *res);
+
+int __vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts)
+{
+ return __cvdso_clock_gettime(clock, ts);
+}
+
+int clock_gettime(clockid_t, struct __kernel_timespec *)
+ __attribute__((weak, alias("__vdso_clock_gettime")));
+
+int __vdso_clock_getres(clockid_t clock,
+ struct __kernel_timespec *res)
+{
+ return __cvdso_clock_getres(clock, res);
+}
+int clock_getres(clockid_t, struct __kernel_timespec *)
+ __attribute__((weak, alias("__vdso_clock_getres")));
+
+#else
+/* i386 only */
+extern int __vdso_clock_gettime(clockid_t clock, struct old_timespec32 *ts);
+extern int __vdso_clock_getres(clockid_t clock, struct old_timespec32 *res);
+
+int __vdso_clock_gettime(clockid_t clock, struct old_timespec32 *ts)
+{
+ return __cvdso_clock_gettime32(clock, ts);
+}
+
+int clock_gettime(clockid_t, struct old_timespec32 *)
+ __attribute__((weak, alias("__vdso_clock_gettime")));
+
+int __vdso_clock_gettime64(clockid_t clock, struct __kernel_timespec *ts)
+{
+ return __cvdso_clock_gettime(clock, ts);
+}
+
+int clock_gettime64(clockid_t, struct __kernel_timespec *)
+ __attribute__((weak, alias("__vdso_clock_gettime64")));
+
+int __vdso_clock_getres(clockid_t clock, struct old_timespec32 *res)
+{
+ return __cvdso_clock_getres_time32(clock, res);
+}
+
+int clock_getres(clockid_t, struct old_timespec32 *)
+ __attribute__((weak, alias("__vdso_clock_getres")));
+#endif
diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S
index acfd5ba..93c6dc7 100644
--- a/arch/x86/entry/vdso/vdso-layout.lds.S
+++ b/arch/x86/entry/vdso/vdso-layout.lds.S
@@ -7,16 +7,6 @@
* This script controls its layout.
*/
-#if defined(BUILD_VDSO64)
-# define SHDR_SIZE 64
-#elif defined(BUILD_VDSO32) || defined(BUILD_VDSOX32)
-# define SHDR_SIZE 40
-#else
-# error unknown VDSO target
-#endif
-
-#define NUM_FAKE_SHDRS 13
-
SECTIONS
{
/*
@@ -60,20 +50,8 @@
*(.bss*)
*(.dynbss*)
*(.gnu.linkonce.b.*)
-
- /*
- * Ideally this would live in a C file, but that won't
- * work cleanly for x32 until we start building the x32
- * C code using an x32 toolchain.
- */
- VDSO_FAKE_SECTION_TABLE_START = .;
- . = . + NUM_FAKE_SHDRS * SHDR_SIZE;
- VDSO_FAKE_SECTION_TABLE_END = .;
} :text
- .fake_shstrtab : { *(.fake_shstrtab) } :text
-
-
.note : { *(.note.*) } :text :note
.eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr
@@ -87,11 +65,6 @@
.text : { *(.text*) } :text =0x90909090,
- /*
- * At the end so that eu-elflint stays happy when vdso2c strips
- * these. A better implementation would avoid allocating space
- * for these.
- */
.altinstructions : { *(.altinstructions) } :text
.altinstr_replacement : { *(.altinstr_replacement) } :text
diff --git a/arch/x86/entry/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S
index d3a2dce..36b644e 100644
--- a/arch/x86/entry/vdso/vdso.lds.S
+++ b/arch/x86/entry/vdso/vdso.lds.S
@@ -25,6 +25,8 @@
__vdso_getcpu;
time;
__vdso_time;
+ clock_getres;
+ __vdso_clock_getres;
local: *;
};
}
diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c
index 4674f58..3a4d8d4 100644
--- a/arch/x86/entry/vdso/vdso2c.c
+++ b/arch/x86/entry/vdso/vdso2c.c
@@ -73,16 +73,12 @@
enum {
sym_vvar_start,
sym_vvar_page,
- sym_hpet_page,
sym_pvclock_page,
sym_hvclock_page,
- sym_VDSO_FAKE_SECTION_TABLE_START,
- sym_VDSO_FAKE_SECTION_TABLE_END,
};
const int special_pages[] = {
sym_vvar_page,
- sym_hpet_page,
sym_pvclock_page,
sym_hvclock_page,
};
@@ -95,15 +91,8 @@
struct vdso_sym required_syms[] = {
[sym_vvar_start] = {"vvar_start", true},
[sym_vvar_page] = {"vvar_page", true},
- [sym_hpet_page] = {"hpet_page", true},
[sym_pvclock_page] = {"pvclock_page", true},
[sym_hvclock_page] = {"hvclock_page", true},
- [sym_VDSO_FAKE_SECTION_TABLE_START] = {
- "VDSO_FAKE_SECTION_TABLE_START", false
- },
- [sym_VDSO_FAKE_SECTION_TABLE_END] = {
- "VDSO_FAKE_SECTION_TABLE_END", false
- },
{"VDSO32_NOTE_MASK", true},
{"__kernel_vsyscall", true},
{"__kernel_sigreturn", true},
diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h
index fa847a6..a20b134 100644
--- a/arch/x86/entry/vdso/vdso2c.h
+++ b/arch/x86/entry/vdso/vdso2c.h
@@ -7,7 +7,7 @@
static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
void *stripped_addr, size_t stripped_len,
- FILE *outfile, const char *name)
+ FILE *outfile, const char *image_name)
{
int found_load = 0;
unsigned long load_size = -1; /* Work around bogus warning */
@@ -93,11 +93,12 @@
int k;
ELF(Sym) *sym = raw_addr + GET_LE(&symtab_hdr->sh_offset) +
GET_LE(&symtab_hdr->sh_entsize) * i;
- const char *name = raw_addr + GET_LE(&strtab_hdr->sh_offset) +
- GET_LE(&sym->st_name);
+ const char *sym_name = raw_addr +
+ GET_LE(&strtab_hdr->sh_offset) +
+ GET_LE(&sym->st_name);
for (k = 0; k < NSYMS; k++) {
- if (!strcmp(name, required_syms[k].name)) {
+ if (!strcmp(sym_name, required_syms[k].name)) {
if (syms[k]) {
fail("duplicate symbol %s\n",
required_syms[k].name);
@@ -134,7 +135,7 @@
if (syms[sym_vvar_start] % 4096)
fail("vvar_begin must be a multiple of 4096\n");
- if (!name) {
+ if (!image_name) {
fwrite(stripped_addr, stripped_len, 1, outfile);
return;
}
@@ -157,7 +158,7 @@
}
fprintf(outfile, "\n};\n\n");
- fprintf(outfile, "const struct vdso_image %s = {\n", name);
+ fprintf(outfile, "const struct vdso_image %s = {\n", image_name);
fprintf(outfile, "\t.data = raw_data,\n");
fprintf(outfile, "\t.size = %lu,\n", mapping_size);
if (alt_sec) {
diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c
index 42d4c89..240626e 100644
--- a/arch/x86/entry/vdso/vdso32-setup.c
+++ b/arch/x86/entry/vdso/vdso32-setup.c
@@ -65,9 +65,6 @@
/* Register vsyscall32 into the ABI table */
#include <linux/sysctl.h>
-static const int zero;
-static const int one = 1;
-
static struct ctl_table abi_table2[] = {
{
.procname = "vsyscall32",
@@ -75,8 +72,8 @@
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = (int *)&zero,
- .extra2 = (int *)&one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
{}
};
diff --git a/arch/x86/entry/vdso/vdso32/vdso32.lds.S b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
index 422764a..c772099 100644
--- a/arch/x86/entry/vdso/vdso32/vdso32.lds.S
+++ b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
@@ -26,6 +26,8 @@
__vdso_clock_gettime;
__vdso_gettimeofday;
__vdso_time;
+ __vdso_clock_getres;
+ __vdso_clock_gettime64;
};
LINUX_2.5 {
diff --git a/arch/x86/entry/vdso/vdsox32.lds.S b/arch/x86/entry/vdso/vdsox32.lds.S
index 05cd1c5..16a8050 100644
--- a/arch/x86/entry/vdso/vdsox32.lds.S
+++ b/arch/x86/entry/vdso/vdsox32.lds.S
@@ -21,6 +21,7 @@
__vdso_gettimeofday;
__vdso_getcpu;
__vdso_time;
+ __vdso_clock_getres;
local: *;
};
}
diff --git a/arch/x86/entry/vdso/vgetcpu.c b/arch/x86/entry/vdso/vgetcpu.c
index 8ec3d1f..b88a82b 100644
--- a/arch/x86/entry/vdso/vgetcpu.c
+++ b/arch/x86/entry/vdso/vgetcpu.c
@@ -1,6 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright 2006 Andi Kleen, SUSE Labs.
- * Subject to the GNU Public License, v.2
*
* Fast user context implementation of getcpu()
*/
@@ -13,14 +13,8 @@
notrace long
__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
{
- unsigned int p;
+ vdso_read_cpunode(cpu, node);
- p = __getcpu();
-
- if (cpu)
- *cpu = p & VGETCPU_CPU_MASK;
- if (node)
- *node = p >> 12;
return 0;
}
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 5b8b556..f593774 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -1,6 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright 2007 Andi Kleen, SUSE Labs.
- * Subject to the GPL, v.2
*
* This contains most of the x86 vDSO kernel-side code.
*/
@@ -22,7 +22,7 @@
#include <asm/page.h>
#include <asm/desc.h>
#include <asm/cpufeature.h>
-#include <asm/mshyperv.h>
+#include <clocksource/hyperv_timer.h>
#if defined(CONFIG_X86_64)
unsigned int __read_mostly vdso64_enabled = 1;
@@ -39,7 +39,7 @@
struct linux_binprm;
-static int vdso_fault(const struct vm_special_mapping *sm,
+static vm_fault_t vdso_fault(const struct vm_special_mapping *sm,
struct vm_area_struct *vma, struct vm_fault *vmf)
{
const struct vdso_image *image = vma->vm_mm->context.vdso_image;
@@ -84,12 +84,11 @@
return 0;
}
-static int vvar_fault(const struct vm_special_mapping *sm,
+static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
struct vm_area_struct *vma, struct vm_fault *vmf)
{
const struct vdso_image *image = vma->vm_mm->context.vdso_image;
long sym_offset;
- int ret = -EFAULT;
if (!image)
return VM_FAULT_SIGBUS;
@@ -108,29 +107,24 @@
return VM_FAULT_SIGBUS;
if (sym_offset == image->sym_vvar_page) {
- ret = vm_insert_pfn(vma, vmf->address,
- __pa_symbol(&__vvar_page) >> PAGE_SHIFT);
+ return vmf_insert_pfn(vma, vmf->address,
+ __pa_symbol(&__vvar_page) >> PAGE_SHIFT);
} else if (sym_offset == image->sym_pvclock_page) {
struct pvclock_vsyscall_time_info *pvti =
pvclock_get_pvti_cpu0_va();
if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) {
- ret = vm_insert_pfn_prot(
- vma,
- vmf->address,
- __pa(pvti) >> PAGE_SHIFT,
- pgprot_decrypted(vma->vm_page_prot));
+ return vmf_insert_pfn_prot(vma, vmf->address,
+ __pa(pvti) >> PAGE_SHIFT,
+ pgprot_decrypted(vma->vm_page_prot));
}
} else if (sym_offset == image->sym_hvclock_page) {
struct ms_hyperv_tsc_page *tsc_pg = hv_get_tsc_page();
if (tsc_pg && vclock_was_used(VCLOCK_HVCLOCK))
- ret = vm_insert_pfn(vma, vmf->address,
- vmalloc_to_pfn(tsc_pg));
+ return vmf_insert_pfn(vma, vmf->address,
+ virt_to_phys(tsc_pg) >> PAGE_SHIFT);
}
- if (ret == 0 || ret == -EBUSY)
- return VM_FAULT_NOPAGE;
-
return VM_FAULT_SIGBUS;
}
@@ -267,7 +261,7 @@
* abusing from userspace install_speciall_mapping, which may
* not do accounting and rlimit right.
* We could search vma near context.vdso, but it's a slowpath,
- * so let's explicitely check all VMAs to be completely sure.
+ * so let's explicitly check all VMAs to be completely sure.
*/
for (vma = mm->mmap; vma; vma = vma->vm_next) {
if (vma_is_special_mapping(vma, &vdso_mapping) ||
@@ -332,40 +326,6 @@
return 0;
}
__setup("vdso=", vdso_setup);
-#endif
-
-#ifdef CONFIG_X86_64
-static void vgetcpu_cpu_init(void *arg)
-{
- int cpu = smp_processor_id();
- struct desc_struct d = { };
- unsigned long node = 0;
-#ifdef CONFIG_NUMA
- node = cpu_to_node(cpu);
-#endif
- if (static_cpu_has(X86_FEATURE_RDTSCP))
- write_rdtscp_aux((node << 12) | cpu);
-
- /*
- * Store cpu number in limit so that it can be loaded
- * quickly in user space in vgetcpu. (12 bits for the CPU
- * and 8 bits for the node)
- */
- d.limit0 = cpu | ((node & 0xf) << 12);
- d.limit1 = node >> 4;
- d.type = 5; /* RO data, expand down, accessed */
- d.dpl = 3; /* Visible to user code */
- d.s = 1; /* Not a system segment */
- d.p = 1; /* Present */
- d.d = 1; /* 32-bit */
-
- write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
-}
-
-static int vgetcpu_online(unsigned int cpu)
-{
- return smp_call_function_single(cpu, vgetcpu_cpu_init, NULL, 1);
-}
static int __init init_vdso(void)
{
@@ -375,9 +335,7 @@
init_vdso_image(&vdso_image_x32);
#endif
- /* notifier priority > KVM */
- return cpuhp_setup_state(CPUHP_AP_X86_VDSO_VMA_ONLINE,
- "x86/vdso/vma:online", vgetcpu_online, NULL);
+ return 0;
}
subsys_initcall(init_vdso);
#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/entry/vsyscall/Makefile b/arch/x86/entry/vsyscall/Makefile
index a9f4856..93c1b3e 100644
--- a/arch/x86/entry/vsyscall/Makefile
+++ b/arch/x86/entry/vsyscall/Makefile
@@ -1,7 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Makefile for the x86 low level vsyscall code
#
-obj-y := vsyscall_gtod.o
-
obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index 82ed001..e7c596d 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -42,9 +42,11 @@
#define CREATE_TRACE_POINTS
#include "vsyscall_trace.h"
-static enum { EMULATE, NONE } vsyscall_mode =
+static enum { EMULATE, XONLY, NONE } vsyscall_mode __ro_after_init =
#ifdef CONFIG_LEGACY_VSYSCALL_NONE
NONE;
+#elif defined(CONFIG_LEGACY_VSYSCALL_XONLY)
+ XONLY;
#else
EMULATE;
#endif
@@ -54,6 +56,8 @@
if (str) {
if (!strcmp("emulate", str))
vsyscall_mode = EMULATE;
+ else if (!strcmp("xonly", str))
+ vsyscall_mode = XONLY;
else if (!strcmp("none", str))
vsyscall_mode = NONE;
else
@@ -99,28 +103,22 @@
* sig_on_uaccess_err, this could go away.
*/
- if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) {
- siginfo_t info;
+ if (!access_ok((void __user *)ptr, size)) {
struct thread_struct *thread = ¤t->thread;
- thread->error_code = 6; /* user fault, no page, write */
+ thread->error_code = X86_PF_USER | X86_PF_WRITE;
thread->cr2 = ptr;
thread->trap_nr = X86_TRAP_PF;
- clear_siginfo(&info);
- info.si_signo = SIGSEGV;
- info.si_errno = 0;
- info.si_code = SEGV_MAPERR;
- info.si_addr = (void __user *)ptr;
-
- force_sig_info(SIGSEGV, &info, current);
+ force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)ptr);
return false;
} else {
return true;
}
}
-bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
+bool emulate_vsyscall(unsigned long error_code,
+ struct pt_regs *regs, unsigned long address)
{
struct task_struct *tsk;
unsigned long caller;
@@ -129,6 +127,22 @@
long ret;
unsigned long orig_dx;
+ /* Write faults or kernel-privilege faults never get fixed up. */
+ if ((error_code & (X86_PF_WRITE | X86_PF_USER)) != X86_PF_USER)
+ return false;
+
+ if (!(error_code & X86_PF_INSTR)) {
+ /* Failed vsyscall read */
+ if (vsyscall_mode == EMULATE)
+ return false;
+
+ /*
+ * User code tried and failed to read the vsyscall page.
+ */
+ warn_bad_vsyscall(KERN_INFO, regs, "vsyscall read attempt denied -- look up the vsyscall kernel parameter if you need a workaround");
+ return false;
+ }
+
/*
* No point in checking CS -- the only way to get here is a user mode
* trap to a high address, which means that we're in 64-bit user code.
@@ -275,7 +289,7 @@
return true;
sigsegv:
- force_sig(SIGSEGV, current);
+ force_sig(SIGSEGV);
return true;
}
@@ -291,7 +305,7 @@
static const struct vm_operations_struct gate_vma_ops = {
.name = gate_vma_name,
};
-static struct vm_area_struct gate_vma = {
+static struct vm_area_struct gate_vma __ro_after_init = {
.vm_start = VSYSCALL_ADDR,
.vm_end = VSYSCALL_ADDR + PAGE_SIZE,
.vm_page_prot = PAGE_READONLY_EXEC,
@@ -364,12 +378,20 @@
extern char __vsyscall_page;
unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);
- if (vsyscall_mode != NONE) {
+ /*
+ * For full emulation, the page needs to exist for real. In
+ * execute-only mode, there is no PTE at all backing the vsyscall
+ * page.
+ */
+ if (vsyscall_mode == EMULATE) {
__set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
PAGE_KERNEL_VVAR);
set_vsyscall_pgtable_user_bits(swapper_pg_dir);
}
+ if (vsyscall_mode == XONLY)
+ gate_vma.vm_flags = VM_EXEC;
+
BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
(unsigned long)VSYSCALL_ADDR);
}
diff --git a/arch/x86/entry/vsyscall/vsyscall_emu_64.S b/arch/x86/entry/vsyscall/vsyscall_emu_64.S
index c9596a9..2e203f3 100644
--- a/arch/x86/entry/vsyscall/vsyscall_emu_64.S
+++ b/arch/x86/entry/vsyscall/vsyscall_emu_64.S
@@ -1,9 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* vsyscall_emu_64.S: Vsyscall emulation page
*
* Copyright (c) 2011 Andy Lutomirski
- *
- * Subject to the GNU General Public License, version 2
*/
#include <linux/linkage.h>
diff --git a/arch/x86/entry/vsyscall/vsyscall_gtod.c b/arch/x86/entry/vsyscall/vsyscall_gtod.c
deleted file mode 100644
index e1216dd..0000000
--- a/arch/x86/entry/vsyscall/vsyscall_gtod.c
+++ /dev/null
@@ -1,78 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
- * Copyright 2003 Andi Kleen, SuSE Labs.
- *
- * Modified for x86 32 bit architecture by
- * Stefani Seibold <stefani@seibold.net>
- * sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany
- *
- * Thanks to hpa@transmeta.com for some useful hint.
- * Special thanks to Ingo Molnar for his early experience with
- * a different vsyscall implementation for Linux/IA32 and for the name.
- *
- */
-
-#include <linux/timekeeper_internal.h>
-#include <asm/vgtod.h>
-#include <asm/vvar.h>
-
-int vclocks_used __read_mostly;
-
-DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data);
-
-void update_vsyscall_tz(void)
-{
- vsyscall_gtod_data.tz_minuteswest = sys_tz.tz_minuteswest;
- vsyscall_gtod_data.tz_dsttime = sys_tz.tz_dsttime;
-}
-
-void update_vsyscall(struct timekeeper *tk)
-{
- int vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode;
- struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data;
-
- /* Mark the new vclock used. */
- BUILD_BUG_ON(VCLOCK_MAX >= 32);
- WRITE_ONCE(vclocks_used, READ_ONCE(vclocks_used) | (1 << vclock_mode));
-
- gtod_write_begin(vdata);
-
- /* copy vsyscall data */
- vdata->vclock_mode = vclock_mode;
- vdata->cycle_last = tk->tkr_mono.cycle_last;
- vdata->mask = tk->tkr_mono.mask;
- vdata->mult = tk->tkr_mono.mult;
- vdata->shift = tk->tkr_mono.shift;
-
- vdata->wall_time_sec = tk->xtime_sec;
- vdata->wall_time_snsec = tk->tkr_mono.xtime_nsec;
-
- vdata->monotonic_time_sec = tk->xtime_sec
- + tk->wall_to_monotonic.tv_sec;
- vdata->monotonic_time_snsec = tk->tkr_mono.xtime_nsec
- + ((u64)tk->wall_to_monotonic.tv_nsec
- << tk->tkr_mono.shift);
- while (vdata->monotonic_time_snsec >=
- (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) {
- vdata->monotonic_time_snsec -=
- ((u64)NSEC_PER_SEC) << tk->tkr_mono.shift;
- vdata->monotonic_time_sec++;
- }
-
- vdata->wall_time_coarse_sec = tk->xtime_sec;
- vdata->wall_time_coarse_nsec = (long)(tk->tkr_mono.xtime_nsec >>
- tk->tkr_mono.shift);
-
- vdata->monotonic_time_coarse_sec =
- vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec;
- vdata->monotonic_time_coarse_nsec =
- vdata->wall_time_coarse_nsec + tk->wall_to_monotonic.tv_nsec;
-
- while (vdata->monotonic_time_coarse_nsec >= NSEC_PER_SEC) {
- vdata->monotonic_time_coarse_nsec -= NSEC_PER_SEC;
- vdata->monotonic_time_coarse_sec++;
- }
-
- gtod_write_end(vdata);
-}