[PATCH v2] arm64: implement support for static call trampolines

Ard Biesheuvel ardb at kernel.org
Wed Oct 28 14:41:14 EDT 2020


Implement arm64 support for the 'unoptimized' static call variety,
which routes all calls through a single trampoline that is patched
to perform a tail call to the selected function.

Since static call targets may be located in modules loaded out of
direct branching range, we need to be able to fall back to emitting
an ADRP/ADD pair that loads the branch target into X16, followed by
a BR instruction. As this involves patching more than a single B or
NOP instruction (for which the architecture makes special provisions
in terms of the synchronization needed), we may need to run the full
blown instruction patching logic that uses stop_machine(). It also
means that once we've patched in an ADRP/ADD pair, we are quite
restricted in the patching we can do subsequently, and we may end up
using an indirect call after all (note that this is not a terminal
degraded state; see the comment in static_call.c below for details).
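
For reference, client code uses the generic static call API from
<linux/static_call.h>; the trampoline defined below is what gets patched
when the call target changes. A minimal, hypothetical sketch (the
my_pm_notify/slow_notify/fast_notify names are made up for illustration
and are not part of this patch):

  #include <linux/static_call.h>

  static int slow_notify(int event)
  {
  	return event;			/* default implementation */
  }

  /* defines the key and emits the trampoline via ARCH_DEFINE_STATIC_CALL_TRAMP */
  DEFINE_STATIC_CALL(my_pm_notify, slow_notify);

  int handle_event(int event)
  {
  	/* compiles to a direct branch-and-link to the trampoline */
  	return static_call(my_pm_notify)(event);
  }

  /*
   * Retargeting the call later goes through arch_static_call_transform():
   *
   *	static_call_update(my_pm_notify, fast_notify);
   */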

Cc: Peter Zijlstra (Intel) <peterz at infradead.org>
Signed-off-by: Ard Biesheuvel <ardb at kernel.org>
---
v2:
This turned nasty really quickly when I realized that any sleeping task
could have been interrupted right in the middle of the ADRP/ADD pair
that we emit for static call targets that are out of immediate branching
range.
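
To illustrate the hazard (hypothetical old_fn/new_fn targets, offsets as in
the trampoline layout documented in static_call.c):

  0x08:	ADRP	x16, old_fn		<- a sleeping task may have just executed this
  0x0c:	ADD	x16, x16, :lo12:old_fn
  0x10:	BR	x16

Blindly rewriting 0x8/0xc for a new far target would let such a task resume
at 0xc and combine the old ADRP page with the new ADD offset, branching to a
bogus address. Hence the rules below: the ADRP/ADD pair is only ever written
once (unless the new ADD opcode happens to be identical), and otherwise we
fall back to loading the target from the static_call_key.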

 arch/arm64/Kconfig                   |   1 +
 arch/arm64/include/asm/static_call.h |  32 +++++
 arch/arm64/kernel/Makefile           |   2 +-
 arch/arm64/kernel/static_call.c      | 129 ++++++++++++++++++++
 arch/arm64/kernel/vmlinux.lds.S      |   1 +
 5 files changed, 164 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index f858c352f72a..9530e85f4f6a 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -173,6 +173,7 @@ config ARM64
 	select HAVE_PERF_REGS
 	select HAVE_PERF_USER_STACK_DUMP
 	select HAVE_REGS_AND_STACK_ACCESS_API
+	select HAVE_STATIC_CALL
 	select HAVE_FUNCTION_ARG_ACCESS_API
 	select HAVE_FUTEX_CMPXCHG if FUTEX
 	select MMU_GATHER_RCU_TABLE_FREE
diff --git a/arch/arm64/include/asm/static_call.h b/arch/arm64/include/asm/static_call.h
new file mode 100644
index 000000000000..7ddf939d57f5
--- /dev/null
+++ b/arch/arm64/include/asm/static_call.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_STATIC_CALL_H
+#define _ASM_STATIC_CALL_H
+
+/*
+ * We have to account for the possibility that the static call site may
+ * be updated to refer to a target that is out of range for an ordinary
+ * 'B' branch instruction, and so we need to pre-allocate some space for
+ * an ADRP/ADD/BR sequence.
+ */
+#define __ARCH_DEFINE_STATIC_CALL_TRAMP(name, insn)			    \
+	asm(".pushsection	.static_call.text, \"ax\"		\n" \
+	    ".align		5					\n" \
+	    ".globl		" STATIC_CALL_TRAMP_STR(name) "		\n" \
+	    STATIC_CALL_TRAMP_STR(name) ":				\n" \
+	    "hint 	34	/* BTI C */				\n" \
+	    insn "							\n" \
+	    "ret							\n" \
+	    "nop							\n" \
+	    "nop							\n" \
+	    "adrp	x16, " STATIC_CALL_KEY(name) "			\n" \
+	    "ldr	x16, [x16, :lo12:" STATIC_CALL_KEY(name) "]	\n" \
+	    "br		x16						\n" \
+	    ".popsection						\n")
+
+#define ARCH_DEFINE_STATIC_CALL_TRAMP(name, func)			\
+	__ARCH_DEFINE_STATIC_CALL_TRAMP(name, "b " #func)
+
+#define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name)			\
+	__ARCH_DEFINE_STATIC_CALL_TRAMP(name, "nop")
+
+#endif /* _ASM_STATIC_CALL_H */
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index bbaf0bc4ad60..f579800eb860 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -17,7 +17,7 @@ obj-y			:= debug-monitors.o entry.o irq.o fpsimd.o		\
 			   return_address.o cpuinfo.o cpu_errata.o		\
 			   cpufeature.o alternative.o cacheinfo.o		\
 			   smp.o smp_spin_table.o topology.o smccc-call.o	\
-			   syscall.o proton-pack.o
+			   syscall.o proton-pack.o static_call.o
 
 targets			+= efi-entry.o
 
diff --git a/arch/arm64/kernel/static_call.c b/arch/arm64/kernel/static_call.c
new file mode 100644
index 000000000000..a97dfc4a1619
--- /dev/null
+++ b/arch/arm64/kernel/static_call.c
@@ -0,0 +1,129 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/static_call.h>
+#include <linux/memory.h>
+#include <asm/debug-monitors.h>
+#include <asm/insn.h>
+
+/*
+ * The static call trampoline consists of one of the following sequences:
+ *
+ *      (A)           (B)           (C)           (D)           (E)
+ * 00: BTI  C        BTI  C        BTI  C        BTI  C        BTI  C
+ * 04: B    fn       NOP           NOP           NOP           NOP
+ * 08: RET           RET           ADRP X16, fn  ADRP X16, fn  ADRP X16, fn
+ * 0c: NOP           NOP           ADD  X16, fn  ADD  X16, fn  ADD  X16, fn
+ * 10:                             BR   X16      RET           NOP
+ * 14:                                                         ADRP X16, &fn
+ * 18:                                                         LDR  X16, [X16, &fn]
+ * 1c:                                                         BR   X16
+ *
+ * The architecture permits us to patch B instructions into NOPs or vice versa
+ * directly, but patching any other instruction sequence requires careful
+ * synchronization. Since branch targets may be out of range for ordinary
+ * immediate branch instructions, we may have to fall back to ADRP/ADD/BR
+ * sequences in some cases, which complicates things considerably; since any
+ * sleeping tasks may have been preempted right in the middle of any of these
+ * sequences, we have to carefully transform one into the other, and ensure
+ * that it is safe to resume execution at any point in the sequence for tasks
+ * that have already executed part of it.
+ *
+ * So the rules are:
+ * - we start out with (A) or (B)
+ * - a branch within immediate range can always be patched in at offset 0x4;
+ * - sequence (A) can be turned into (B) for NULL branch targets;
+ * - a branch outside of immediate range can be patched using (C), but only if
+ *   . the sequence being updated is (A) or (B), or
+ *   . the branch target address modulo 4k results in the same ADD opcode
+ *     (which could occur when patching the same far target a second time)
+ * - once we have patched in (C) we cannot go back to (A) or (B), so patching
+ *   in a NULL target now requires sequence (D);
+ * - if we cannot patch a far target using (C), we fall back to sequence (E),
+ *   which loads the function pointer from memory.
+ *
+ * If we abide by these rules, then the following must hold for tasks that were
+ * interrupted halfway through execution of the trampoline:
+ * - when resuming at offset 0x8, we can only encounter a RET if (B) or (D)
+ *   was patched in at any point, and therefore a NULL target is valid;
+ * - when resuming at offset 0xc, we are executing the ADD opcode that is only
+ *   reachable via the preceding ADRP, and which is only ever patched once,
+ *   and is therefore guaranteed to be consistent with the ADRP target;
+ * - when resuming at offset 0x10, X16 must refer to a valid target, since it
+ *   is only reachable via an ADRP/ADD pair that is guaranteed to be consistent.
+ *
+ * Note that sequence (E) is only used when switching between multiple far
+ * targets, and that it is not a terminal degraded state.
+ */
+void arch_static_call_transform(void *site, void *tramp, void *func, bool tail)
+{
+	unsigned long pc = (unsigned long)tramp + 4;
+	unsigned long dst = (unsigned long)func;
+	void *addrs[] = {
+		(void *)pc, (void *)(pc + 4), (void *)(pc + 8), (void *)(pc + 12)
+	};
+	u32 slot4 = le32_to_cpup(addrs[2]);
+	u32 insn[4];
+
+	/* ensure the ADRP/LDR pair grabs the right value */
+	BUILD_BUG_ON(offsetof(struct static_call_key, func) > 0);
+
+	insn[0] = func ? aarch64_insn_gen_branch_imm(pc, dst,
+						     AARCH64_INSN_BRANCH_NOLINK)
+		       : AARCH64_INSN_HINT_NOP;
+
+	if (insn[0] != AARCH64_BREAK_FAULT) {
+		if (func || slot4 == AARCH64_INSN_HINT_NOP) {
+			/*
+			 * We can patch an immediate branch into the first slot
+			 * of any of the sequences above without any special
+			 * synchronization. We can also patch (A) into (B)
+			 * directly.
+			 */
+			aarch64_insn_patch_text_nosync(addrs[0], insn[0]);
+			return;
+		}
+
+		/*
+		 * We are converting (C), (D) or (E) into (D), and so we should
+		 * take care not to touch the ADRP/ADD opcodes, as we cannot be
+		 * sure that a sleeping task will not resume from there.
+		 */
+		addrs[1] = addrs[3];
+		insn[1] = aarch64_insn_gen_branch_reg(AARCH64_INSN_REG_30,
+						      AARCH64_INSN_BRANCH_RETURN);
+		aarch64_insn_patch_text(addrs, insn, 2);
+		return;
+	}
+
+	/* assume we are emitting sequence (C) */
+	insn[0] = AARCH64_INSN_HINT_NOP;
+	insn[1] = aarch64_insn_gen_adr(pc, dst, AARCH64_INSN_REG_16,
+				       AARCH64_INSN_ADR_TYPE_ADRP);
+	insn[2] = aarch64_insn_gen_add_sub_imm(AARCH64_INSN_REG_16,
+					       AARCH64_INSN_REG_16,
+					       dst % SZ_4K,
+					       AARCH64_INSN_VARIANT_64BIT,
+					       AARCH64_INSN_ADSB_ADD);
+	insn[3] = aarch64_insn_gen_branch_reg(AARCH64_INSN_REG_16,
+					      AARCH64_INSN_BRANCH_NOLINK);
+
+	if (WARN_ON(insn[1] == AARCH64_BREAK_FAULT))
+		return;
+
+	if (slot4 != AARCH64_INSN_HINT_NOP && slot4 != insn[2]) {
+		/*
+		 * We are modifying sequence (C), (D) or (E), but the ADD
+		 * opcode we generated is different. This means that we cannot
+		 * patch in sequence (C), because that would overwrite the ADD
+		 * instruction with one that is out of sync with the ADRP
+		 * instruction that sleeping tasks may just have executed. So
+		 * the only option is to switch to sequence (E), and use the
+		 * function pointer variable directly.
+		 */
+		addrs[1] = addrs[3];
+		insn[1] = AARCH64_INSN_HINT_NOP;
+		aarch64_insn_patch_text(addrs, insn, 2);
+		return;
+	}
+	aarch64_insn_patch_text(addrs, insn, ARRAY_SIZE(insn));
+}
+EXPORT_SYMBOL_GPL(arch_static_call_transform);
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index 6d78c041fdf6..f8049757142f 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -135,6 +135,7 @@ SECTIONS
 			IDMAP_TEXT
 			HIBERNATE_TEXT
 			TRAMP_TEXT
+			STATIC_CALL_TEXT
 			*(.fixup)
 			*(.gnu.warning)
 		. = ALIGN(16);
-- 
2.17.1



