[PATCH v2] panic, kexec: Make __crash_kexec() NMI safe

Thu Jun 23 18:30:24 PDT 2022

On 06/20/22 at 12:15pm, Valentin Schneider wrote:
> Attempting to get a crash dump out of a debug PREEMPT_RT kernel via an NMI
> panic() doesn't work. The cause of that lies in the PREEMPT_RT definition
> of mutex_trylock():
> 
> 	if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task()))
> 		return 0;
> 
> This prevents an NMI panic() from executing the main body of
> __crash_kexec() which does the actual kexec into the kdump kernel.
> The warning and return are explained by:
> 
>   6ce47fd961fa ("rtmutex: Warn if trylock is called from hard/softirq context")
>   [...]
>   The reasons for this are:
> 
>       1) There is a potential deadlock in the slowpath
> 
>       2) Another cpu which blocks on the rtmutex will boost the task
> 	 which allegedly locked the rtmutex, but that cannot work
> 	 because the hard/softirq context borrows the task context.
> 
> Furthermore, grabbing the lock isn't NMI safe, so do away with it and
> use an atomic variable to serialize reads vs writes of
> kexec_crash_image.
> 
> Tested by triggering NMI panics via:
> 
>   $ echo 1 > /proc/sys/kernel/panic_on_unrecovered_nmi
>   $ echo 1 > /proc/sys/kernel/unknown_nmi_panic
>   $ echo 1 > /proc/sys/kernel/panic
> 
>   $ ipmitool power diag
> 
> Fixes: 6ce47fd961fa ("rtmutex: Warn if trylock is called from hard/softirq context")
> Signed-off-by: Valentin Schneider <vschneid at redhat.com>
> ---
> v1 -> v2
> ++++++++
> 
> o Changed from Peterson-like synchronization to simpler atomic_cmpxchg
>   (Petr)
> o Slightly reworded changelog
> o Added Fixes: tag. Technically should be up to since kexec can happen
>   in an NMI, but that isn't such a clear target
> ---
>  include/linux/kexec.h |  1 +
>  kernel/kexec.c        | 16 ++++++++++++----
>  kernel/kexec_core.c   | 36 +++++++++++++++++++-----------------
>  kernel/kexec_file.c   | 11 +++++++++++
>  4 files changed, 43 insertions(+), 21 deletions(-)
> 
> diff --git a/include/linux/kexec.h b/include/linux/kexec.h
> index ce6536f1d269..5849a15ae3dd 100644
> --- a/include/linux/kexec.h
> +++ b/include/linux/kexec.h
> @@ -369,6 +369,7 @@ extern int kimage_crash_copy_vmcoreinfo(struct kimage *image);
>  
>  extern struct kimage *kexec_image;
>  extern struct kimage *kexec_crash_image;
> +extern atomic_t crash_kexec_lock;
>  extern int kexec_load_disabled;
>  
>  #ifndef kexec_flush_icache_page
> diff --git a/kernel/kexec.c b/kernel/kexec.c
> index b5e40f069768..73e0df2c608f 100644
> --- a/kernel/kexec.c
> +++ b/kernel/kexec.c
> @@ -94,14 +94,20 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
>  	/*
>  	 * Because we write directly to the reserved memory region when loading
>  	 * crash kernels we need a mutex here to prevent multiple crash kernels
> -	 * from attempting to load simultaneously, and to prevent a crash kernel
> -	 * from loading over the top of a in use crash kernel.
> -	 *
> -	 * KISS: always take the mutex.
> +	 * from attempting to load simultaneously.
>  	 */
>  	if (!mutex_trylock(&kexec_mutex))
>  		return -EBUSY;

So kexec_mutex is degenerated to only avoid simultaneous loading,
should we rename to reflect that?, e.g kexec_load_mutex.

>  
> +	/*
> +	 * Prevent loading a new crash kernel while one is in use.
> +	 * See associated comment in __crash_kexec().
> +	 */
> +	if (atomic_cmpxchg_acquire(&crash_kexec_lock, 0, 1)) {
> +		ret = -EBUSY;
> +		goto out_unlock_mutex;
> +	}
> +
>  	if (flags & KEXEC_ON_CRASH) {
>  		dest_image = &kexec_crash_image;
>  		if (kexec_crash_image)
> @@ -165,6 +171,8 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
>  
>  	kimage_free(image);
>  out_unlock:
> +	atomic_set_release(&crash_kexec_lock, 0);
> +out_unlock_mutex:
>  	mutex_unlock(&kexec_mutex);
>  	return ret;
>  }
> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
> index 4d34c78334ce..f957109a266c 100644
> --- a/kernel/kexec_core.c
> +++ b/kernel/kexec_core.c
> @@ -933,6 +933,7 @@ int kimage_load_segment(struct kimage *image,
>  
>  struct kimage *kexec_image;
>  struct kimage *kexec_crash_image;
> +atomic_t crash_kexec_lock = ATOMIC_INIT(0);
>  int kexec_load_disabled;
>  #ifdef CONFIG_SYSCTL
>  static struct ctl_table kexec_core_sysctls[] = {
> @@ -964,25 +965,26 @@ late_initcall(kexec_core_sysctl_init);
>   */
>  void __noclone __crash_kexec(struct pt_regs *regs)
>  {
> -	/* Take the kexec_mutex here to prevent sys_kexec_load
> -	 * running on one cpu from replacing the crash kernel
> -	 * we are using after a panic on a different cpu.
> -	 *
> -	 * If the crash kernel was not located in a fixed area
> -	 * of memory the xchg(&kexec_crash_image) would be
> -	 * sufficient.  But since I reuse the memory...
> +	/*
> +	 * This should be taking kexec_mutex before doing anything with the
> +	 * kexec_crash_image, but this code can be run in NMI context which
> +	 * means we can't even trylock. This is circumvented by using an
> +	 * atomic variable that is *also* used by the codepaths that take
> +	 * the mutex to modify kexec_crash_image.
>  	 */
> -	if (mutex_trylock(&kexec_mutex)) {
> -		if (kexec_crash_image) {
> -			struct pt_regs fixed_regs;
> -
> -			crash_setup_regs(&fixed_regs, regs);
> -			crash_save_vmcoreinfo();
> -			machine_crash_shutdown(&fixed_regs);
> -			machine_kexec(kexec_crash_image);
> -		}
> -		mutex_unlock(&kexec_mutex);
> +	if (atomic_cmpxchg_acquire(&crash_kexec_lock, 0, 1))
> +		return;
> +
> +	if (kexec_crash_image) {
> +		struct pt_regs fixed_regs;
> +
> +		crash_setup_regs(&fixed_regs, regs);
> +		crash_save_vmcoreinfo();
> +		machine_crash_shutdown(&fixed_regs);
> +		machine_kexec(kexec_crash_image);
>  	}
> +
> +	atomic_set_release(&crash_kexec_lock, 0);
>  }
>  STACK_FRAME_NON_STANDARD(__crash_kexec);
>  
> diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
> index 145321a5e798..3faec031cfc9 100644
> --- a/kernel/kexec_file.c
> +++ b/kernel/kexec_file.c
> @@ -337,6 +337,15 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
>  	if (!mutex_trylock(&kexec_mutex))
>  		return -EBUSY;
>  
> +	/*
> +	 * Prevent loading a new crash kernel while one is in use.
> +	 * See associated comment in __crash_kexec().
> +	 */
> +	if (atomic_cmpxchg_acquire(&crash_kexec_lock, 0, 1)) {
> +		ret = -EBUSY;
> +		goto out_mutex_unlock;
> +	}
> +
>  	dest_image = &kexec_image;
>  	if (flags & KEXEC_FILE_ON_CRASH) {
>  		dest_image = &kexec_crash_image;
> @@ -406,6 +415,8 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
>  	if ((flags & KEXEC_FILE_ON_CRASH) && kexec_crash_image)
>  		arch_kexec_protect_crashkres();
>  
> +	atomic_set_release(&crash_kexec_lock, 0);
> +out_mutex_unlock:
>  	mutex_unlock(&kexec_mutex);
>  	kimage_free(image);
>  	return ret;
> -- 
> 2.31.1
>