KVM - Guest Enter & Guest Exit

Linux 4.14实现:

KVM Guest Exit 实现原理:

  1. 进入Guest OS之前,__guest_enter会保存Host调用它时的上下文(context),以便Guest退出后能够返回该调用点继续执行
  2. Guest退出并Trap到EL2时,通过__guest_exit恢复Host的上下文并执行ret,返回到当初调用__guest_enter的位置继续执行
    例如:Guest OS访问GICD触发Trap时,调用路径为 el1_sync -> el1_trap -> __guest_exit
EL2异常向量表 __kvm_hyp_vector(arch/arm64/kvm/hyp/hyp-entry.S):
/*
 * __kvm_hyp_vector - exception vector table with 16 entries, laid out as
 * four groups of four (Synchronous, IRQ, FIQ, Error). Per the per-entry
 * comments the groups are, in order: EL2t, EL2h, 64-bit EL1, 32-bit EL1.
 *
 * The 64-bit and 32-bit EL1 groups point at the same handlers (el1_sync,
 * el1_irq, el1_fiq_invalid, el1_error). The only EL2-originated slot that
 * is not routed to an *_invalid handler is "Error EL2h" (el2_error).
 *
 * NOTE(review): ventry and the *_invalid handlers are defined outside this
 * excerpt; ventry presumably emits one aligned vector slot that branches to
 * the named handler - confirm against the macro definition.
 */
ENTRY(__kvm_hyp_vector)
    // Group 1: exceptions taken from EL2t (entry names say "invalid")
    ventry  el2t_sync_invalid       // Synchronous EL2t
    ventry  el2t_irq_invalid        // IRQ EL2t
    ventry  el2t_fiq_invalid        // FIQ EL2t
    ventry  el2t_error_invalid      // Error EL2t

    // Group 2: exceptions taken from EL2h; only the Error slot has a
    // real handler
    ventry  el2h_sync_invalid       // Synchronous EL2h
    ventry  el2h_irq_invalid        // IRQ EL2h
    ventry  el2h_fiq_invalid        // FIQ EL2h
    ventry  el2_error           // Error EL2h

    // Group 3: exceptions taken from 64-bit EL1
    ventry  el1_sync            // Synchronous 64-bit EL1
    ventry  el1_irq             // IRQ 64-bit EL1
    ventry  el1_fiq_invalid         // FIQ 64-bit EL1
    ventry  el1_error           // Error 64-bit EL1

    // Group 4: exceptions taken from 32-bit EL1 (same handlers as group 3)
    ventry  el1_sync            // Synchronous 32-bit EL1
    ventry  el1_irq             // IRQ 32-bit EL1
    ventry  el1_fiq_invalid         // FIQ 32-bit EL1
    ventry  el1_error           // Error 32-bit EL1
ENDPROC(__kvm_hyp_vector)
各向量入口的处理代码 el1_sync / el1_hvc_guest / el1_trap / el1_irq / el1_error / el2_error(arch/arm64/kvm/hyp/hyp-entry.S):
/*
 * el1_sync - handler installed in the "Synchronous 64-bit EL1" and
 * "Synchronous 32-bit EL1" slots of __kvm_hyp_vector.
 *
 * Pushes x0/x1, then classifies the exception using the exception class
 * (EC) field of ESR_EL2:
 *   - EC is neither HVC64 nor HVC32      -> el1_trap
 *   - HVC and VTTBR_EL2 != 0 (guest HVC) -> el1_hvc_guest
 *   - HVC and VTTBR_EL2 == 0 (host HVC)  -> handled inline below
 *
 * On the two branch-away paths the saved x0/x1 are still on the stack.
 * On the host path they are popped, and x0 then selects between a stub
 * hypercall (x0 < HVC_STUB_HCALL_NR) and an EL2 function call.
 */
el1_sync:               // Synchronous exception from EL1 taken to EL2
    stp x0, x1, [sp, #-16]!     // save x0/x1 so they can be used as scratch

    mrs x0, esr_el2
    lsr x0, x0, #ESR_ELx_EC_SHIFT   // x0 = ESR_EL2 shifted down to the EC field
    // cmp/ccmp pair: if EC == HVC64 the ccmp forces NZCV = #4 (Z set);
    // otherwise it compares against HVC32. Z is therefore clear only when
    // EC is neither of the two HVC classes.
    cmp x0, #ESR_ELx_EC_HVC64
    ccmp    x0, #ESR_ELx_EC_HVC32, #4, ne
    b.ne    el1_trap

    mrs x1, vttbr_el2       // If vttbr is valid, the guest
    cbnz    x1, el1_hvc_guest   // called HVC

    /* Here, we're pretty sure the host called HVC. */
    ldp x0, x1, [sp], #16       // restore the caller's x0/x1 and pop the frame

    /* Check for a stub HVC call */
    cmp x0, #HVC_STUB_HCALL_NR
    b.hs    1f          // unsigned x0 >= HVC_STUB_HCALL_NR: not a stub call

    /*
     * Compute the idmap address of __kvm_handle_stub_hvc and
     * jump there. Since we use kimage_voffset, do not use the
     * HYP VA for __kvm_handle_stub_hvc, but the kernel VA instead
     * (by loading it from the constant pool).
     *
     * Preserve x0-x4, which may contain stub parameters.
     */
    ldr x5, =__kvm_handle_stub_hvc
    ldr_l   x6, kimage_voffset

    /* x5 = __pa(x5) */
    sub x5, x5, x6
    br  x5

1:
    /*
     * Perform the EL2 call
     *
     * NOTE(review): kern_hyp_va and do_el2_call are macros defined outside
     * this excerpt; presumably x0 is a kernel VA of a function that is
     * converted to a hyp VA and then called - confirm against the macros.
     */
    kern_hyp_va x0
    do_el2_call

    eret                // return to the HVC caller

/*
 * el1_hvc_guest - fast path for HVCs issued by a guest. Reached from
 * el1_sync with the guest's x0/x1 still saved on the stack
 * ([sp] = x0, [sp, #8] = x1).
 *
 * The guest's w0 is matched against two function IDs:
 *   - ARM_SMCCC_ARCH_WORKAROUND_1: return immediately via wa_epilogue.
 *   - ARM_SMCCC_ARCH_WORKAROUND_2: handled below when CONFIG_ARM64_SSBD is
 *     set, then falls into wa_epilogue.
 *   - anything else: el1_trap (full guest exit).
 *
 * wa_epilogue returns to the guest with x0 = 0 and drops the 16-byte
 * x0/x1 save area. On the WORKAROUND_2 path x1 and x2 (and x3 after the
 * SMC) are zeroed as well.
 */
el1_hvc_guest:
    /*
     * Fastest possible path for ARM_SMCCC_ARCH_WORKAROUND_1.
     * The workaround has already been applied on the host,
     * so let's quickly get back to the guest. We don't bother
     * restoring x1, as it can be clobbered anyway.
     */
    ldr x1, [sp]                // Guest's x0
    eor w1, w1, #ARM_SMCCC_ARCH_WORKAROUND_1   // w1 == 0 iff guest w0 == WORKAROUND_1
    cbz w1, wa_epilogue

    /* ARM_SMCCC_ARCH_WORKAROUND_2 handling */
    // XOR-ing with (WA_1 ^ WA_2) turns the previous result into
    // (guest w0 ^ WA_2), so w1 == 0 iff guest w0 == WORKAROUND_2.
    eor w1, w1, #(ARM_SMCCC_ARCH_WORKAROUND_1 ^ \
              ARM_SMCCC_ARCH_WORKAROUND_2)
    cbnz    w1, el1_trap

#ifdef CONFIG_ARM64_SSBD
// NOTE(review): alternative_cb and arm64_enable_wa2_handling are defined
// outside this excerpt; presumably the callback decides at patch time
// whether this branch stays (skipping the handling below) - confirm.
alternative_cb  arm64_enable_wa2_handling
    b   wa2_end
alternative_cb_end
    get_vcpu_ptr    x2, x0
    ldr x0, [x2, #VCPU_WORKAROUND_FLAGS]    // x0 = current workaround flags

    // Sanitize the argument and update the guest flags
    ldr x1, [sp, #8]            // Guest's x1
    clz w1, w1              // Murphy's device:
    lsr w1, w1, #5          // w1 = !!w1 without using
    eor w1, w1, #1          // the flags...
    // Insert the sanitized 0/1 value into the flags word at
    // VCPU_WORKAROUND_2_FLAG_SHIFT and write the flags back.
    bfi x0, x1, #VCPU_WORKAROUND_2_FLAG_SHIFT, #1
    str x0, [x2, #VCPU_WORKAROUND_FLAGS]

    /* Check that we actually need to perform the call */
    hyp_ldr_this_cpu x0, arm64_ssbd_callback_required, x2
    cbz x0, wa2_end

    // w1 still holds the sanitized 0/1 value computed above when the
    // SMC is issued.
    mov w0, #ARM_SMCCC_ARCH_WORKAROUND_2
    smc #0

    /* Don't leak data from the SMC call */
    mov x3, xzr
wa2_end:
    mov x2, xzr
    mov x1, xzr
#endif

wa_epilogue:
    mov x0, xzr             // value returned to the guest in x0 is 0
    add sp, sp, #16         // discard the x0/x1 pushed by el1_sync
    eret

/*
 * el1_trap - generic synchronous trap path. Reached from el1_sync (EC is
 * not an HVC) and from el1_hvc_guest (unrecognised guest HVC), in both
 * cases with the guest's x0/x1 still on the stack.
 *
 * Unless the FP/SIMD check diverts to __fpsimd_guest_restore, branches to
 * __guest_exit with x0 = ARM_EXCEPTION_TRAP and x1 = vcpu pointer.
 */
el1_trap:
    // NOTE(review): get_vcpu_ptr is defined outside this excerpt; per the
    // register comment below it leaves the vcpu pointer in x1 (x0 is
    // presumably a scratch register) - confirm.
    get_vcpu_ptr    x1, x0

    mrs     x0, esr_el2
    lsr     x0, x0, #ESR_ELx_EC_SHIFT
    /*
     * x0: ESR_EC
     * x1: vcpu pointer
     */

    /*
     * We trap the first access to the FP/SIMD to save the host context
     * and restore the guest context lazily.
     * If FP/SIMD is not implemented, handle the trap and inject an
     * undefined instruction exception to the guest.
     */
alternative_if_not ARM64_HAS_NO_FPSIMD
    cmp x0, #ESR_ELx_EC_FP_ASIMD
    b.eq    __fpsimd_guest_restore
alternative_else_nop_endif

    mov x0, #ARM_EXCEPTION_TRAP     // exit code handed to __guest_exit
    b   __guest_exit

/*
 * el1_irq - handler for the "IRQ 64-bit EL1" and "IRQ 32-bit EL1" vector
 * slots. Sets up the register state __guest_exit documents on entry
 * (x0 = return code, x1 = vcpu, vcpu x0-x1 on the stack) and exits the
 * guest with ARM_EXCEPTION_IRQ.
 */
el1_irq:
    stp     x0, x1, [sp, #-16]!     // push guest x0/x1 for __guest_exit
    get_vcpu_ptr    x1, x0          // x1 = vcpu pointer (macro defined outside this excerpt)
    mov x0, #ARM_EXCEPTION_IRQ
    b   __guest_exit

/*
 * el1_error - handler for the "Error 64-bit EL1" and "Error 32-bit EL1"
 * vector slots. Same shape as el1_irq: push guest x0/x1, load the vcpu
 * pointer into x1, and exit the guest with ARM_EXCEPTION_EL1_SERROR.
 */
el1_error:
    stp     x0, x1, [sp, #-16]!     // push guest x0/x1 for __guest_exit
    get_vcpu_ptr    x1, x0          // x1 = vcpu pointer (macro defined outside this excerpt)
    mov x0, #ARM_EXCEPTION_EL1_SERROR
    b   __guest_exit

/*
 * el2_error - handler for the "Error EL2h" vector slot.
 *
 * Accepts the exception only if ELR_EL2 equals abort_guest_exit_start or
 * abort_guest_exit_end (the labels that bracket the isb in __guest_exit);
 * any other return address branches to __hyp_panic. On the accepted path
 * x0 is overwritten with (1 << ARM_EXIT_WITH_SERROR_BIT) and execution
 * resumes at ELR_EL2 via eret.
 */
el2_error:
    /*
     * Only two possibilities:
     * 1) Either we come from the exit path, having just unmasked
     *    PSTATE.A: change the return code to an EL2 fault, and
     *    carry on, as we're already in a sane state to handle it.
     * 2) Or we come from anywhere else, and that's a bug: we panic.
     *
     * For (1), x0 contains the original return code and x1 doesn't
     * contain anything meaningful at that stage. We can reuse them
     * as temp registers.
     * For (2), who cares?
     */
    mrs x0, elr_el2                 // x0 = address the exception was taken from
    adr x1, abort_guest_exit_start
    cmp x0, x1
    adr x1, abort_guest_exit_end
    // If ELR matched the start label, ccmp forces NZCV = #4 (Z set);
    // otherwise it compares against the end label. Z is clear only when
    // ELR matches neither label.
    ccmp    x0, x1, #4, ne
    b.ne    __hyp_panic
    mov x0, #(1 << ARM_EXIT_WITH_SERROR_BIT)   // flag tested by __guest_exit after the window
    eret
__guest_enter(arch/arm64/kvm/hyp/entry.S):
/*
 * u64 __guest_enter(struct kvm_vcpu *vcpu,
 *           struct kvm_cpu_context *host_ctxt);
 *
 * Switches from the host to the guest: saves the host's callee-saved
 * registers into host_ctxt, loads the guest's general-purpose registers
 * from the vcpu's context (vcpu + VCPU_CONTEXT), and erets.
 *
 * There is no ret in this routine. The saved host registers are restored
 * by __guest_exit, whose ret resumes the host at the point where
 * __guest_enter was called, with the u64 exit code in x0.
 */
ENTRY(__guest_enter)
    // x0: vcpu
    // x1: host context
    // x2-x17: clobbered by macros
    // x18: guest context

    // Store the host regs
    save_callee_saved_regs x1

    add x18, x0, #VCPU_CONTEXT      // x18 = base of the guest register context

    // Restore guest regs x0-x17
    // (x0, the vcpu pointer, is overwritten here; x18 is the only base
    // register used from now on)
    ldp x0, x1,   [x18, #CPU_XREG_OFFSET(0)]
    ldp x2, x3,   [x18, #CPU_XREG_OFFSET(2)]
    ldp x4, x5,   [x18, #CPU_XREG_OFFSET(4)]
    ldp x6, x7,   [x18, #CPU_XREG_OFFSET(6)]
    ldp x8, x9,   [x18, #CPU_XREG_OFFSET(8)]
    ldp x10, x11, [x18, #CPU_XREG_OFFSET(10)]
    ldp x12, x13, [x18, #CPU_XREG_OFFSET(12)]
    ldp x14, x15, [x18, #CPU_XREG_OFFSET(14)]
    ldp x16, x17, [x18, #CPU_XREG_OFFSET(16)]

    // Restore guest regs x19-x29, lr
    restore_callee_saved_regs x18

    // Restore guest reg x18
    // (done last because x18 is the base pointer for every load above)
    ldr x18,      [x18, #CPU_XREG_OFFSET(18)]

    // Do not touch any register after this!
    eret
ENDPROC(__guest_enter)

__guest_exit(arch/arm64/kvm/hyp/entry.S):
/*
 * __guest_exit - common guest-to-host exit path, branched to by el1_trap,
 * el1_irq and el1_error.
 *
 * Saves the guest's x0-x18, x19-x29 and lr into the vcpu's context,
 * restores the host's callee-saved registers, opens a one-instruction
 * window for a pending SError, and finally rets. Because the host's
 * callee-saved registers (including lr) have been restored, that ret
 * resumes the host where it called __guest_enter, with the exit code in
 * x0.
 */
ENTRY(__guest_exit)
    // x0: return code
    // x1: vcpu
    // x2-x29,lr: vcpu regs
    // vcpu x0-x1 on the stack

    add x1, x1, #VCPU_CONTEXT      // x1 = base of the guest register context

    // NOTE(review): ALTERNATIVE is defined outside this excerpt;
    // presumably the nop is patched to SET_PSTATE_PAN(1) on CPUs with
    // ARM64_HAS_PAN - confirm.
    ALTERNATIVE(nop, SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN)

    // Store the guest regs x2 and x3
    // (stored first so that x2/x3 can be reused as scratch below)
    stp x2, x3,   [x1, #CPU_XREG_OFFSET(2)]

    // Retrieve the guest regs x0-x1 from the stack
    ldp x2, x3, [sp], #16   // x0, x1

    // Store the guest regs x0-x1 and x4-x18
    stp x2, x3,   [x1, #CPU_XREG_OFFSET(0)]
    stp x4, x5,   [x1, #CPU_XREG_OFFSET(4)]
    stp x6, x7,   [x1, #CPU_XREG_OFFSET(6)]
    stp x8, x9,   [x1, #CPU_XREG_OFFSET(8)]
    stp x10, x11, [x1, #CPU_XREG_OFFSET(10)]
    stp x12, x13, [x1, #CPU_XREG_OFFSET(12)]
    stp x14, x15, [x1, #CPU_XREG_OFFSET(14)]
    stp x16, x17, [x1, #CPU_XREG_OFFSET(16)]
    str x18,      [x1, #CPU_XREG_OFFSET(18)]

    // Store the guest regs x19-x29, lr
    save_callee_saved_regs x1

    // NOTE(review): get_host_ctxt is defined outside this excerpt; it
    // presumably leaves the host context pointer in x2 (x3 as scratch),
    // since x2 is what the restore below consumes - confirm.
    get_host_ctxt   x2, x3

    // Now restore the host regs
    restore_callee_saved_regs x2

    // If we have a pending asynchronous abort, now is the
    // time to find out. From your VAXorcist book, page 666:
    // "Threaten me not, oh Evil one!  For I speak with
    // the power of DEC, and I command thee to show thyself!"
    // Snapshot the exception context and the exit code first: el2_error
    // overwrites x0, and the values saved here are written back below
    // when the SError bit is found set.
    mrs x2, elr_el2
    mrs x3, esr_el2
    mrs x4, spsr_el2
    mov x5, x0

    dsb sy      // Synchronize against in-flight ld/st
    msr daifclr, #4 // Unmask aborts

    // This is our single instruction exception window. A pending
    // SError is guaranteed to occur at the earliest when we unmask
    // it, and at the latest just after the ISB.
    // el2_error compares ELR_EL2 against these two labels.
    .global abort_guest_exit_start
abort_guest_exit_start:

    isb

    .global abort_guest_exit_end
abort_guest_exit_end:

    // If the exception took place, restore the EL1 exception
    // context so that we can report some information.
    // Merge the exception code with the SError pending bit.
    tbz x0, #ARM_EXIT_WITH_SERROR_BIT, 1f   // bit clear: no SError was taken
    msr elr_el2, x2
    msr esr_el2, x3
    msr spsr_el2, x4
    orr x0, x0, x5          // x0 = SError bit | original exit code
1:  ret // Return to the host call site that originally entered the guest
ENDPROC(__guest_exit)

Linux 5.6实现:

__guest_enter / __guest_exit(arch/arm64/kvm/hyp/entry.S):
/*
 * u64 __guest_enter(struct kvm_vcpu *vcpu,
 *           struct kvm_cpu_context *host_ctxt);
 *
 * Host-to-guest entry and, via the inner global label __guest_exit, the
 * matching guest-to-host exit. Both live inside one SYM_FUNC_START /
 * SYM_FUNC_END pair.
 *
 * Entry: saves the host's callee-saved registers into host_ctxt. If
 * ISR_EL1 is non-zero it returns straight to the caller with
 * x0 = ARM_EXCEPTION_IRQ without entering the guest. Otherwise it loads
 * the guest's registers from vcpu + VCPU_CONTEXT and erets.
 *
 * Exit (__guest_exit): saves the guest's registers, restores the host's
 * callee-saved registers, checks for a pending SError, and rets to the
 * host code that called __guest_enter, with the exit code in x0.
 */
SYM_FUNC_START(__guest_enter)
    // x0: vcpu
    // x1: host context
    // x2-x17: clobbered by macros
    // x29: guest context

    // Store the host regs
    /* Before entering the guest, save the host state at the call site so that execution can resume from there when the guest exits. */
    save_callee_saved_regs x1

    // Now the host state is stored if we have a pending RAS SError it must
    // affect the host. If any asynchronous exception is pending we defer
    // the guest entry. The DSB isn't necessary before v8.2 as any SError
    // would be fatal.
alternative_if ARM64_HAS_RAS_EXTN
    dsb nshst
    isb
alternative_else_nop_endif
    mrs x1, isr_el1 // x1 = ISR_EL1 (Interrupt Status Register)
    cbz x1,  1f  // x1 == 0: no IRQ, FIQ or SError pending, go enter the guest
    mov x0, #ARM_EXCEPTION_IRQ  // report the deferred entry as an IRQ exit
    ret // something is pending: return to the host without entering the guest

1:
    add x29, x0, #VCPU_CONTEXT // x29 = vcpu + VCPU_CONTEXT (base of the guest context)

    // Macro ptrauth_switch_to_guest format:
    //  ptrauth_switch_to_guest(guest cxt, tmp1, tmp2, tmp3)
    // The below macro to restore guest keys is not implemented in C code
    // as it may cause Pointer Authentication key signing mismatch errors
    // when this feature is enabled for kernel code.
    ptrauth_switch_to_guest x29, x0, x1, x2

    // Restore guest regs x0-x17
    ldp x0, x1,   [x29, #CPU_XREG_OFFSET(0)]
    ldp x2, x3,   [x29, #CPU_XREG_OFFSET(2)]
    ldp x4, x5,   [x29, #CPU_XREG_OFFSET(4)]
    ldp x6, x7,   [x29, #CPU_XREG_OFFSET(6)]
    ldp x8, x9,   [x29, #CPU_XREG_OFFSET(8)]
    ldp x10, x11, [x29, #CPU_XREG_OFFSET(10)]
    ldp x12, x13, [x29, #CPU_XREG_OFFSET(12)]
    ldp x14, x15, [x29, #CPU_XREG_OFFSET(14)]
    ldp x16, x17, [x29, #CPU_XREG_OFFSET(16)]

    // Restore guest regs x18-x29, lr
    restore_callee_saved_regs x29

    // Do not touch any register after this!
    // The guest context is now fully restored; eret drops into the guest at EL1.
    eret
    // NOTE(review): sb after eret is presumably a speculation barrier that
    // stops straight-line speculation past the eret - confirm.
    sb

SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL)
    // x0: return code
    // x1: vcpu
    // x2-x29,lr: vcpu regs
    // vcpu x0-x1 on the stack

    add x1, x1, #VCPU_CONTEXT      // x1 = base of the guest register context

    ALTERNATIVE(nop, SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN)

    // Store the guest regs x2 and x3
    // (stored first so that x2/x3 can be reused as scratch below)
    stp x2, x3,   [x1, #CPU_XREG_OFFSET(2)]

    // Retrieve the guest regs x0-x1 from the stack
    ldp x2, x3, [sp], #16   // x0, x1

    // Store the guest regs x0-x1 and x4-x17
    stp x2, x3,   [x1, #CPU_XREG_OFFSET(0)]
    stp x4, x5,   [x1, #CPU_XREG_OFFSET(4)]
    stp x6, x7,   [x1, #CPU_XREG_OFFSET(6)]
    stp x8, x9,   [x1, #CPU_XREG_OFFSET(8)]
    stp x10, x11, [x1, #CPU_XREG_OFFSET(10)]
    stp x12, x13, [x1, #CPU_XREG_OFFSET(12)]
    stp x14, x15, [x1, #CPU_XREG_OFFSET(14)]
    stp x16, x17, [x1, #CPU_XREG_OFFSET(16)]

    // Store the guest regs x18-x29, lr
    save_callee_saved_regs x1

    // NOTE(review): get_host_ctxt is defined outside this excerpt; it
    // presumably leaves the host context pointer in x2 (x3 as scratch),
    // since x2 is passed as the host context below - confirm.
    get_host_ctxt   x2, x3

    // Macro ptrauth_switch_to_host format:
    //  ptrauth_switch_to_host(guest cxt, host cxt, tmp1, tmp2, tmp3)
    // The below macro to save/restore keys is not implemented in C code
    // as it may cause Pointer Authentication key signing mismatch errors
    // when this feature is enabled for kernel code.
    ptrauth_switch_to_host x1, x2, x3, x4, x5

    // Now restore the host regs
    restore_callee_saved_regs x2

alternative_if ARM64_HAS_RAS_EXTN
    // If we have the RAS extensions we can consume a pending error
    // without an unmask-SError and isb. The ESB-instruction consumed any
    // pending guest error when we took the exception from the guest.
    mrs_s   x2, SYS_DISR_EL1
    // x1 already points at vcpu + VCPU_CONTEXT, so this offset lands on
    // vcpu + VCPU_FAULT_DISR.
    str x2, [x1, #(VCPU_FAULT_DISR - VCPU_CONTEXT)]
    cbz x2, 1f                  // DISR_EL1 == 0: nothing deferred, plain return
    msr_s   SYS_DISR_EL1, xzr   // clear the deferred error record
    orr x0, x0, #(1<<ARM_EXIT_WITH_SERROR_BIT)
1:  ret
alternative_else
    dsb sy      // Synchronize against in-flight ld/st
    isb         // Prevent an early read of side-effect free ISR
    mrs x2, isr_el1
    tbnz    x2, #8, 2f  // ISR_EL1.A
    ret                 // no SError pending: return to the host caller
    // NOTE(review): this nop presumably pads the sequence to the same
    // length as the RAS sequence above - confirm.
    nop
2:
alternative_endif
    // We know we have a pending asynchronous abort, now is the
    // time to flush it out. From your VAXorcist book, page 666:
    // "Threaten me not, oh Evil one!  For I speak with
    // the power of DEC, and I command thee to show thyself!"
    // Snapshot the exception context and the exit code first; they are
    // written back below when the SError bit is found set in x0.
    mrs x2, elr_el2
    mrs x3, esr_el2
    mrs x4, spsr_el2
    mov x5, x0

    msr daifclr, #4 // Unmask aborts

    // This is our single instruction exception window. A pending
    // SError is guaranteed to occur at the earliest when we unmask
    // it, and at the latest just after the ISB.
    .global abort_guest_exit_start
abort_guest_exit_start:

    isb

    .global abort_guest_exit_end
abort_guest_exit_end:

    msr daifset, #4 // Mask aborts

    // If the exception took place, restore the EL1 exception
    // context so that we can report some information.
    // Merge the exception code with the SError pending bit.
    tbz x0, #ARM_EXIT_WITH_SERROR_BIT, 1f   // bit clear: no SError was taken
    msr elr_el2, x2
    msr esr_el2, x3
    msr spsr_el2, x4
    orr x0, x0, x5          // x0 = SError bit | original exit code
1:  ret                     // return to the host code that called __guest_enter
SYM_FUNC_END(__guest_enter)
1