[openwrt/openwrt] realtek: add rt-loader (runtime loader)

Sat Jun 28 07:14:59 PDT 2025

robimarko pushed a commit to openwrt/openwrt.git, branch main:
https://git.openwrt.org/ccbff8bbdd1aaca3ad76fc129b8d53effb161183

commit ccbff8bbdd1aaca3ad76fc129b8d53effb161183
Author: Markus Stockhausen <markus.stockhausen at gmx.de>
AuthorDate: Sun Jun 8 12:11:37 2025 -0400

    realtek: add rt-loader (runtime loader)
    
    The bootloader of many Realtek switches only supports gzipped kernel images.
    With limited flash space this might become critical in future versions. For
    better compression, add support for LZMA compressed images. For this a new
    loader was developed. Several ideas have been taken over from the existing
    lzma loader, but it has been enhanced to make integration simpler. What is new:
    
    - Loader is position independent. No need to define load addresses
    - Loader identifies device memory on its own
    - Loader uses "official" upstream kernel lzma uncompress
      https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/lib/decompress_unlzma.c
    - Loader uses "official" UNMODIFIED nanoprintf that is used by several
      bare metal projects. https://github.com/charlesnicholson/nanoprintf
    
    Compiled, the loader is just under 12KiB and during boot it will show:
    
    rt-loader
    Found RTL8380M (chip id 6275C) with 256MB
    Relocate 2924240 bytes from 0x80100000 to 0x8fce0000
    Extract kernel with 2900144 bytes from 0x8fce521c to 0x80100000...
    Extracted kernel size is 9814907 bytes
    Booting kernel from 0x80100000 ...
    
    [    0.000000] Linux version 6.12.33 ...
    [    0.000000] RTL838X model is 83806800
    ...
    
    Signed-off-by: Markus Stockhausen <markus.stockhausen at gmx.de>
    Link: https://github.com/openwrt/openwrt/pull/18397
    Signed-off-by: Robert Marko <robimarko at gmail.com>
---
 target/linux/realtek/image/rt-loader/Makefile      |   98 ++
 .../linux/realtek/image/rt-loader/include/board.h  |   14 +
 .../realtek/image/rt-loader/include/globals.h      |   17 +
 .../linux/realtek/image/rt-loader/include/memory.h |   30 +
 .../realtek/image/rt-loader/include/nanoprintf.h   | 1203 ++++++++++++++++++++
 .../linux/realtek/image/rt-loader/linker/linker.ld |   41 +
 target/linux/realtek/image/rt-loader/src/board.c   |  110 ++
 target/linux/realtek/image/rt-loader/src/main.c    |  123 ++
 target/linux/realtek/image/rt-loader/src/memory.c  |  122 ++
 target/linux/realtek/image/rt-loader/src/startup.S |  182 +++
 target/linux/realtek/image/rt-loader/src/unlzma.c  |  663 +++++++++++
 11 files changed, 2603 insertions(+)

diff --git a/target/linux/realtek/image/rt-loader/Makefile b/target/linux/realtek/image/rt-loader/Makefile
new file mode 100644
index 0000000000..6479db705c
--- /dev/null
+++ b/target/linux/realtek/image/rt-loader/Makefile
@@ -0,0 +1,98 @@
+# rt-loader make file
+# (c) 2025 Markus Stockhausen
+#
+# This is the make file for the rt-loader (aka runtime or realtek loader). It tries to
+# avoid copying files around where possible. Therefore it is controlled by the following
+# input parameters
+#
+# KERNEL_IMG_IN:	The filename of an LZMA compressed kernel image. This is required
+# KERNEL_IMG_OUT:	The filename of the kernel image with the rt-loader prepended.
+#			If not given it will be created as image.bin into the BUILD_DIR.
+# BUILD_DIR: 		The temporary build dir. If not given it will be set to "build".
+#
+# To add it into the OpenWrt toolchain just create two new build commands
+#
+# define Build/rt-loader
+#   $(MAKE) all clean -C rt-loader CROSS_COMPILE="$(TARGET_CROSS)" \
+#	    KERNEL_IMG_IN="$@" KERNEL_IMG_OUT="$@.new" BUILD_DIR="$@.build"
+#   mv "$@.new" "$@"
+# endef
+#
+# define Build/rt-compress
+#   $(STAGING_DIR_HOST)/bin/xz --format=lzma -9 --stdout "$@" > "$@.new"
+#   mv "$@.new" "$@"
+# endef
+#
+# Use them in a new kernel build recipe
+#
+# define Device/uimage-rt-loader
+#   KERNEL/rt-loader := kernel-bin | append-dtb | rt-compress | rt-loader
+#   KERNEL := $$(KERNEL/rt-loader) | uImage none
+#   KERNEL_INITRAMFS := $$(KERNEL/rt-loader) | uImage none
+# endef
+#
+# And finally add it to the target device. E.g.
+#
+# define Device/linksys_lgs310c
+#   $(Device/uimage-rt-loader)
+#   ...
+# endef
+
+CC		:= $(CROSS_COMPILE)gcc
+LD		:= $(CROSS_COMPILE)ld
+OBJCOPY		:= $(CROSS_COMPILE)objcopy
+OBJDUMP		:= $(CROSS_COMPILE)objdump
+
+CFLAGS		= -fpic -mabicalls -O2 -fno-builtin-printf -Iinclude
+
+ASFLAGS		= -fpic -msoft-float -Iinclude
+
+LDFLAGS		= -static -nostdlib -T linker/linker.ld --no-warn-mismatch
+
+O_FORMAT 	= $(shell $(OBJDUMP) -i | head -2 | grep elf32)
+
+SOURCES		= src/startup.S src/main.c src/board.c src/memory.c src/unlzma.c
+
+BUILD_DIR	?= build
+
+IMAGE_OBJ	:= $(BUILD_DIR)/image.o
+IMAGE_ELF     	:= $(BUILD_DIR)/image.elf
+
+KERNEL_IMG_OUT	?= $(BUILD_DIR)/image.bin
+
+OBJECTS_C	= $(filter %.c,$(SOURCES))
+OBJECTS_S	= $(filter %.S,$(SOURCES))
+
+OBJECTS		:= $(OBJECTS_S:.S=.o) $(OBJECTS_C:.c=.o)
+OBJECTS		:= $(patsubst %.o, $(BUILD_DIR)/%.o, $(OBJECTS)) $(IMAGE_OBJ)
+
+ifneq ($(MAKECMDGOALS),clean)
+ifndef KERNEL_IMG_IN
+$(error Compressed kernel image not given via KERNEL_IMG_IN)
+endif
+endif
+
+all: $(KERNEL_IMG_OUT)
+
+install:
+
+$(BUILD_DIR)/%.o : %.c
+	@mkdir -p $(dir $@)
+	$(CC) $(CFLAGS) -c -o $@ $<
+
+$(BUILD_DIR)/%.o : %.S
+	@mkdir -p $(dir $@)
+	$(CC) $(ASFLAGS) -c -o $@ $<
+
+$(IMAGE_OBJ): $(KERNEL_IMG_IN)
+	$(OBJCOPY) -I binary -O $(O_FORMAT) --rename-section .data=.kernel $< $@
+
+$(IMAGE_ELF): $(OBJECTS)
+	$(LD) $(LDFLAGS) -o $@ $(OBJECTS)
+
+$(KERNEL_IMG_OUT): $(IMAGE_ELF)
+	$(OBJCOPY) -O binary $< $@
+
+clean:
+	rm -rf $(BUILD_DIR)/
+
diff --git a/target/linux/realtek/image/rt-loader/include/board.h b/target/linux/realtek/image/rt-loader/include/board.h
new file mode 100644
index 0000000000..b0d0945890
--- /dev/null
+++ b/target/linux/realtek/image/rt-loader/include/board.h
@@ -0,0 +1,14 @@
+/*
+ * rt-loader header
+ * (c) 2025 Markus Stockhausen
+ */
+
+#ifndef _BOARD_H_
+#define _BOARD_H_
+
+unsigned int board_get_memory(void);
+void board_get_system(char *buffer, int len);
+void board_panic(void);
+void board_putchar(int ch, void *ctx);	/* matches nanoprintf's npf_putc callback type */
+
+#endif  // _BOARD_H_
diff --git a/target/linux/realtek/image/rt-loader/include/globals.h b/target/linux/realtek/image/rt-loader/include/globals.h
new file mode 100644
index 0000000000..49052b8155
--- /dev/null
+++ b/target/linux/realtek/image/rt-loader/include/globals.h
@@ -0,0 +1,17 @@
+/*
+ * rt-loader header
+ * (c) 2025 Markus Stockhausen
+ */
+
+#ifndef _GLOBALS_H_
+#define _GLOBALS_H_
+
+#define KSEG0			0x80000000
+#define STACK_SIZE		0x10000
+#define HEAP_SIZE		0x40000
+#define MEMORY_ALIGNMENT	32
+
+#define printf(fmt, ...)	npf_pprintf(board_putchar, NULL, fmt, ##__VA_ARGS__)
+#define snprintf		npf_snprintf
+
+#endif  // _GLOBALS_H_
diff --git a/target/linux/realtek/image/rt-loader/include/memory.h b/target/linux/realtek/image/rt-loader/include/memory.h
new file mode 100644
index 0000000000..80d0f8a283
--- /dev/null
+++ b/target/linux/realtek/image/rt-loader/include/memory.h
@@ -0,0 +1,30 @@
+/*
+ * rt-loader header
+ * (c) 2025 Markus Stockhausen
+ */
+
+#ifndef _MEMORY_H_
+#define _MEMORY_H_
+
+#include <stddef.h>
+#include "globals.h"
+
+#define CACHE_HIT_INVALIDATE_I		0x10
+#define CACHE_HIT_WRITEBACK_INV_D	0x15
+
+#define ioread32(reg)			(*(volatile int *)(reg))
+#define iowrite32(val, reg)		(*(volatile int *)(reg) = val)
+
+void flush_cache(void *start_addr, unsigned long size);
+void free(void *ptr);
+void *malloc(size_t size);
+int memcmp(const void *s1, const void *s2, size_t count);
+void *memmove(void *dst, const void *src, size_t count);
+void *memcpy(void *dst, const void *src, size_t count);
+void *memset(void *dst, int value, size_t count);
+size_t strlen(const char *s);
+
+extern void *_heap_addr;
+extern void *_heap_addr_max;
+
+#endif  // _MEMORY_H_
diff --git a/target/linux/realtek/image/rt-loader/include/nanoprintf.h b/target/linux/realtek/image/rt-loader/include/nanoprintf.h
new file mode 100644
index 0000000000..a415ad9f0d
--- /dev/null
+++ b/target/linux/realtek/image/rt-loader/include/nanoprintf.h
@@ -0,0 +1,1203 @@
+/* nanoprintf v0.5.5: a tiny embeddable printf replacement written in C.
+   https://github.com/charlesnicholson/nanoprintf
+   charles.nicholson+nanoprintf at gmail.com
+   dual-licensed under 0bsd and unlicense, take your pick. see eof for details. */
+
+#ifndef NPF_H_INCLUDED
+#define NPF_H_INCLUDED
+
+#include <stdarg.h>
+#include <stddef.h>
+
+// Define this to fully sandbox nanoprintf inside of a translation unit.
+#ifdef NANOPRINTF_VISIBILITY_STATIC
+  #define NPF_VISIBILITY static
+#else
+  #define NPF_VISIBILITY extern
+#endif
+
+#if defined(__clang__) || defined(__GNUC__) || defined(__GNUG__)
+  #define NPF_PRINTF_ATTR(FORMAT_INDEX, VARGS_INDEX) \
+    __attribute__((format(printf, FORMAT_INDEX, VARGS_INDEX)))
+#else
+  #define NPF_PRINTF_ATTR(FORMAT_INDEX, VARGS_INDEX)
+#endif
+
+// Public API
+
+#ifdef __cplusplus
+#define NPF_RESTRICT
+extern "C" {
+#else
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
+#define NPF_RESTRICT restrict
+#else
+#define NPF_RESTRICT
+#endif
+#endif
+
+// The npf_ functions all return the number of bytes required to express the
+// fully-formatted string, not including the null terminator character.
+// The npf_ functions do not return negative values, since the lack of 'l' length
+// modifier support makes encoding errors impossible.
+
+NPF_VISIBILITY int npf_snprintf(char * NPF_RESTRICT buffer,
+                                size_t bufsz,
+                                const char * NPF_RESTRICT format,
+                                ...) NPF_PRINTF_ATTR(3, 4);
+
+NPF_VISIBILITY int npf_vsnprintf(char * NPF_RESTRICT buffer,
+                                 size_t bufsz,
+                                 char const * NPF_RESTRICT format,
+                                 va_list vlist)   NPF_PRINTF_ATTR(3, 0);
+
+typedef void (*npf_putc)(int c, void *ctx);
+NPF_VISIBILITY int npf_pprintf(npf_putc pc,
+                               void * NPF_RESTRICT pc_ctx,
+                               char const * NPF_RESTRICT format,
+                               ...) NPF_PRINTF_ATTR(3, 4);
+
+NPF_VISIBILITY int npf_vpprintf(npf_putc pc,
+                                void * NPF_RESTRICT pc_ctx,
+                                char const * NPF_RESTRICT format,
+                                va_list vlist) NPF_PRINTF_ATTR(3, 0);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // NPF_H_INCLUDED
+
+/* The implementation of nanoprintf begins here, to be compiled only if
+   NANOPRINTF_IMPLEMENTATION is defined. In a multi-file library what follows would
+   be nanoprintf.c. */
+
+#ifdef NANOPRINTF_IMPLEMENTATION
+
+#ifndef NPF_IMPLEMENTATION_INCLUDED
+#define NPF_IMPLEMENTATION_INCLUDED
+
+#include <limits.h>
+#include <stdint.h>
+
+// The conversion buffer must fit at least UINT64_MAX in octal format with the leading '0'.
+#ifndef NANOPRINTF_CONVERSION_BUFFER_SIZE
+  #define NANOPRINTF_CONVERSION_BUFFER_SIZE    23
+#endif
+#if NANOPRINTF_CONVERSION_BUFFER_SIZE < 23
+  #error The size of the conversion buffer must be at least 23 bytes.
+#endif
+
+// Pick reasonable defaults if nothing's been configured.
+#if !defined(NANOPRINTF_USE_FIELD_WIDTH_FORMAT_SPECIFIERS) && \
+    !defined(NANOPRINTF_USE_PRECISION_FORMAT_SPECIFIERS) && \
+    !defined(NANOPRINTF_USE_FLOAT_FORMAT_SPECIFIERS) && \
+    !defined(NANOPRINTF_USE_LARGE_FORMAT_SPECIFIERS) && \
+    !defined(NANOPRINTF_USE_SMALL_FORMAT_SPECIFIERS) && \
+    !defined(NANOPRINTF_USE_BINARY_FORMAT_SPECIFIERS) && \
+    !defined(NANOPRINTF_USE_WRITEBACK_FORMAT_SPECIFIERS) && \
+    !defined(NANOPRINTF_USE_ALT_FORM_FLAG)
+  #define NANOPRINTF_USE_FIELD_WIDTH_FORMAT_SPECIFIERS 1
+  #define NANOPRINTF_USE_PRECISION_FORMAT_SPECIFIERS 1
+  #define NANOPRINTF_USE_FLOAT_FORMAT_SPECIFIERS 1
+  #define NANOPRINTF_USE_LARGE_FORMAT_SPECIFIERS 0
+  #define NANOPRINTF_USE_SMALL_FORMAT_SPECIFIERS 1
+  #define NANOPRINTF_USE_BINARY_FORMAT_SPECIFIERS 0
+  #define NANOPRINTF_USE_WRITEBACK_FORMAT_SPECIFIERS 0
+  #define NANOPRINTF_USE_ALT_FORM_FLAG 1
+#endif
+
+// If anything's been configured, everything must be configured.
+#ifndef NANOPRINTF_USE_FIELD_WIDTH_FORMAT_SPECIFIERS
+  #error NANOPRINTF_USE_FIELD_WIDTH_FORMAT_SPECIFIERS must be #defined to 0 or 1
+#endif
+#ifndef NANOPRINTF_USE_PRECISION_FORMAT_SPECIFIERS
+  #error NANOPRINTF_USE_PRECISION_FORMAT_SPECIFIERS must be #defined to 0 or 1
+#endif
+#ifndef NANOPRINTF_USE_FLOAT_FORMAT_SPECIFIERS
+  #error NANOPRINTF_USE_FLOAT_FORMAT_SPECIFIERS must be #defined to 0 or 1
+#endif
+#ifndef NANOPRINTF_USE_LARGE_FORMAT_SPECIFIERS
+  #error NANOPRINTF_USE_LARGE_FORMAT_SPECIFIERS must be #defined to 0 or 1
+#endif
+#ifndef NANOPRINTF_USE_SMALL_FORMAT_SPECIFIERS
+  #error NANOPRINTF_USE_SMALL_FORMAT_SPECIFIERS must be #defined to 0 or 1
+#endif
+#ifndef NANOPRINTF_USE_BINARY_FORMAT_SPECIFIERS
+  #error NANOPRINTF_USE_BINARY_FORMAT_SPECIFIERS must be #defined to 0 or 1
+#endif
+#ifndef NANOPRINTF_USE_WRITEBACK_FORMAT_SPECIFIERS
+  #error NANOPRINTF_USE_WRITEBACK_FORMAT_SPECIFIERS must be #defined to 0 or 1
+#endif
+
+// Ensure flags are compatible.
+#if (NANOPRINTF_USE_FLOAT_FORMAT_SPECIFIERS == 1) && \
+    (NANOPRINTF_USE_PRECISION_FORMAT_SPECIFIERS == 0)
+  #error Precision format specifiers must be enabled if float support is enabled.
+#endif
+
+// intmax_t / uintmax_t require stdint from c99 / c++11
+#if NANOPRINTF_USE_LARGE_FORMAT_SPECIFIERS == 1
+  #ifndef _MSC_VER
+    #ifdef __cplusplus
+      #if __cplusplus < 201103L
+        #error large format specifier support requires C++11 or later.
+      #endif
+    #else
+      #if __STDC_VERSION__ < 199409L
+        #error nanoprintf requires C99 or later.
+      #endif
+    #endif
+  #endif
+#endif
+
+// Figure out if we can disable warnings with pragmas.
+#ifdef __clang__
+  #define NPF_CLANG 1
+  #define NPF_GCC_PAST_4_6 0
+#else
+  #define NPF_CLANG 0
+  #if defined(__GNUC__) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)))
+    #define NPF_GCC_PAST_4_6 1
+  #else
+    #define NPF_GCC_PAST_4_6 0
+  #endif
+#endif
+
+#if NPF_CLANG || NPF_GCC_PAST_4_6
+  #define NPF_HAVE_GCC_WARNING_PRAGMAS 1
+#else
+  #define NPF_HAVE_GCC_WARNING_PRAGMAS 0
+#endif
+
+#if NPF_HAVE_GCC_WARNING_PRAGMAS
+  #pragma GCC diagnostic push
+  #pragma GCC diagnostic ignored "-Wunused-function"
+  #pragma GCC diagnostic ignored "-Wimplicit-fallthrough"
+  #ifdef __cplusplus
+    #pragma GCC diagnostic ignored "-Wold-style-cast"
+  #endif
+  #pragma GCC diagnostic ignored "-Wpadded"
+  #pragma GCC diagnostic ignored "-Wfloat-equal"
+  #if NPF_CLANG
+    #pragma GCC diagnostic ignored "-Wc++98-compat-pedantic"
+    #pragma GCC diagnostic ignored "-Wcovered-switch-default"
+    #pragma GCC diagnostic ignored "-Wdeclaration-after-statement"
+    #pragma GCC diagnostic ignored "-Wzero-as-null-pointer-constant"
+    #ifndef __APPLE__
+      #pragma GCC diagnostic ignored "-Wunsafe-buffer-usage"
+    #endif
+  #elif NPF_GCC_PAST_4_6
+    #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+  #endif
+#endif
+
+#ifdef _MSC_VER
+  #pragma warning(push)
+  #pragma warning(disable:4619) // there is no warning number 'number'
+  // C4619 has to be disabled first!
+  #pragma warning(disable:4127) // conditional expression is constant
+  #pragma warning(disable:4505) // unreferenced local function has been removed
+  #pragma warning(disable:4514) // unreferenced inline function has been removed
+  #pragma warning(disable:4701) // potentially uninitialized local variable used
+  #pragma warning(disable:4706) // assignment within conditional expression
+  #pragma warning(disable:4710) // function not inlined
+  #pragma warning(disable:4711) // function selected for inline expansion
+  #pragma warning(disable:4820) // padding added after struct member
+  #pragma warning(disable:5039) // potentially throwing function passed to extern C function
+  #pragma warning(disable:5045) // compiler will insert Spectre mitigation for memory load
+  #pragma warning(disable:5262) // implicit switch fall-through
+  #pragma warning(disable:26812) // enum type is unscoped
+#endif
+
+#if defined(__clang__) || defined(__GNUC__) || defined(__GNUG__)
+  #define NPF_NOINLINE __attribute__((noinline))
+  #define NPF_FORCE_INLINE inline __attribute__((always_inline))
+#elif defined(_MSC_VER)
+  #define NPF_NOINLINE __declspec(noinline)
+  #define NPF_FORCE_INLINE inline __forceinline
+#else
+  #define NPF_NOINLINE
+  #define NPF_FORCE_INLINE
+#endif
+
+#if (NANOPRINTF_USE_FIELD_WIDTH_FORMAT_SPECIFIERS == 1) || \
+    (NANOPRINTF_USE_PRECISION_FORMAT_SPECIFIERS == 1)
+enum {
+  NPF_FMT_SPEC_OPT_NONE,
+  NPF_FMT_SPEC_OPT_LITERAL,
+  NPF_FMT_SPEC_OPT_STAR,
+};
+#endif
+
+enum {
+  NPF_FMT_SPEC_LEN_MOD_NONE,
+#if NANOPRINTF_USE_SMALL_FORMAT_SPECIFIERS == 1
+  NPF_FMT_SPEC_LEN_MOD_SHORT,       // 'h'
+  NPF_FMT_SPEC_LEN_MOD_CHAR,        // 'hh'
+#endif
+  NPF_FMT_SPEC_LEN_MOD_LONG,        // 'l'
+  NPF_FMT_SPEC_LEN_MOD_LONG_DOUBLE, // 'L'
+#if NANOPRINTF_USE_LARGE_FORMAT_SPECIFIERS == 1
+  NPF_FMT_SPEC_LEN_MOD_LARGE_LONG_LONG, // 'll'
+  NPF_FMT_SPEC_LEN_MOD_LARGE_INTMAX,    // 'j'
+  NPF_FMT_SPEC_LEN_MOD_LARGE_SIZET,     // 'z'
+  NPF_FMT_SPEC_LEN_MOD_LARGE_PTRDIFFT,  // 't'
+#endif
+};
+
+enum {
+  NPF_FMT_SPEC_CONV_NONE,
+  NPF_FMT_SPEC_CONV_PERCENT,      // '%'
+  NPF_FMT_SPEC_CONV_CHAR,         // 'c'
+  NPF_FMT_SPEC_CONV_STRING,       // 's'
+  NPF_FMT_SPEC_CONV_SIGNED_INT,   // 'i', 'd'
+#if NANOPRINTF_USE_BINARY_FORMAT_SPECIFIERS == 1
+  NPF_FMT_SPEC_CONV_BINARY,       // 'b'
+#endif
+  NPF_FMT_SPEC_CONV_OCTAL,        // 'o'
+  NPF_FMT_SPEC_CONV_HEX_INT,      // 'x', 'X'
+  NPF_FMT_SPEC_CONV_UNSIGNED_INT, // 'u'
+  NPF_FMT_SPEC_CONV_POINTER,      // 'p'
+#if NANOPRINTF_USE_WRITEBACK_FORMAT_SPECIFIERS == 1
+  NPF_FMT_SPEC_CONV_WRITEBACK,    // 'n'
+#endif
+#if NANOPRINTF_USE_FLOAT_FORMAT_SPECIFIERS == 1
+  NPF_FMT_SPEC_CONV_FLOAT_DEC,      // 'f', 'F'
+  NPF_FMT_SPEC_CONV_FLOAT_SCI,      // 'e', 'E'
+  NPF_FMT_SPEC_CONV_FLOAT_SHORTEST, // 'g', 'G'
+  NPF_FMT_SPEC_CONV_FLOAT_HEX,      // 'a', 'A'
+#endif
+};
+
+typedef struct npf_format_spec {
+#if NANOPRINTF_USE_FIELD_WIDTH_FORMAT_SPECIFIERS == 1
+  int field_width;
+#endif
+#if NANOPRINTF_USE_PRECISION_FORMAT_SPECIFIERS == 1
+  int prec;
+  uint8_t prec_opt;
+#endif
+#if NANOPRINTF_USE_FIELD_WIDTH_FORMAT_SPECIFIERS == 1
+  uint8_t field_width_opt;
+  char left_justified;   // '-'
+  char leading_zero_pad; // '0'
+#endif
+  char prepend;          // ' ' or '+'
+#if NANOPRINTF_USE_ALT_FORM_FLAG == 1
+  char alt_form;         // '#'
+#endif
+  char case_adjust;      // 'a' - 'A' , or 0 (must be non-negative to work)
+  uint8_t length_modifier;
+  uint8_t conv_spec;
+} npf_format_spec_t;
+
+#if NANOPRINTF_USE_LARGE_FORMAT_SPECIFIERS == 0
+  typedef long npf_int_t;
+  typedef unsigned long npf_uint_t;
+#else
+  typedef intmax_t npf_int_t;
+  typedef uintmax_t npf_uint_t;
+#endif
+
+typedef struct npf_bufputc_ctx {
+  char *dst;
+  size_t len;
+  size_t cur;
+} npf_bufputc_ctx_t;
+
+#if NANOPRINTF_USE_LARGE_FORMAT_SPECIFIERS == 1
+  typedef char npf_size_is_ptrdiff[(sizeof(size_t) == sizeof(ptrdiff_t)) ? 1 : -1];
+  typedef ptrdiff_t npf_ssize_t;
+  typedef size_t npf_uptrdiff_t;
+#endif
+
+#ifdef _MSC_VER
+  #include <intrin.h>
+#endif
+
+#define NPF_MIN(x, y)    ((x) <= (y) ? (x) : (y))
+#define NPF_MAX(x, y)    ((x) >= (y) ? (x) : (y))
+
+static int npf_parse_format_spec(char const *format, npf_format_spec_t *out_spec) {
+  char const *cur = format;
+
+#if NANOPRINTF_USE_FIELD_WIDTH_FORMAT_SPECIFIERS == 1
+  out_spec->left_justified = 0;
+  out_spec->leading_zero_pad = 0;
+#endif
+  out_spec->case_adjust = 'a' - 'A'; // lowercase
+  out_spec->prepend = 0;
+#if NANOPRINTF_USE_ALT_FORM_FLAG == 1
+  out_spec->alt_form = 0;
+#endif
+
+  while (*++cur) { // cur points at the leading '%' character
+    switch (*cur) { // Optional flags
+#if NANOPRINTF_USE_FIELD_WIDTH_FORMAT_SPECIFIERS == 1
+      case '-': out_spec->left_justified = '-'; out_spec->leading_zero_pad = 0; continue;
+      case '0': out_spec->leading_zero_pad = !out_spec->left_justified; continue;
+#endif
+      case '+': out_spec->prepend = '+'; continue;
+      case ' ': if (out_spec->prepend == 0) { out_spec->prepend = ' '; } continue;
+#if NANOPRINTF_USE_ALT_FORM_FLAG == 1
+      case '#': out_spec->alt_form = '#'; continue;
+#endif
+      default: break;
+    }
+    break;
+  }
+
+#if NANOPRINTF_USE_FIELD_WIDTH_FORMAT_SPECIFIERS == 1
+  out_spec->field_width = 0;
+  out_spec->field_width_opt = NPF_FMT_SPEC_OPT_NONE;
+  if (*cur == '*') {
+    out_spec->field_width_opt = NPF_FMT_SPEC_OPT_STAR;
+    ++cur;
+  } else {
+    while ((*cur >= '0') && (*cur <= '9')) {
+      out_spec->field_width_opt = NPF_FMT_SPEC_OPT_LITERAL;
+      out_spec->field_width = (out_spec->field_width * 10) + (*cur++ - '0');
+    }
+  }
+#endif
+
+#if NANOPRINTF_USE_PRECISION_FORMAT_SPECIFIERS == 1
+  out_spec->prec = 0;
+  out_spec->prec_opt = NPF_FMT_SPEC_OPT_NONE;
+  if (*cur == '.') {
+    ++cur;
+    if (*cur == '*') {
+      out_spec->prec_opt = NPF_FMT_SPEC_OPT_STAR;
+      ++cur;
+    } else {
+      if (*cur == '-') {
+        ++cur;
+      } else {
+        out_spec->prec_opt = NPF_FMT_SPEC_OPT_LITERAL;
+      }
+      while ((*cur >= '0') && (*cur <= '9')) {
+        out_spec->prec = (out_spec->prec * 10) + (*cur++ - '0');
+      }
+    }
+  }
+#endif
+
+  uint_fast8_t tmp_conv = NPF_FMT_SPEC_CONV_NONE;
+  out_spec->length_modifier = NPF_FMT_SPEC_LEN_MOD_NONE;
+  switch (*cur++) { // Length modifier
+#if NANOPRINTF_USE_SMALL_FORMAT_SPECIFIERS == 1
+    case 'h':
+      out_spec->length_modifier = NPF_FMT_SPEC_LEN_MOD_SHORT;
+      if (*cur == 'h') {
+        out_spec->length_modifier = NPF_FMT_SPEC_LEN_MOD_CHAR;
+        ++cur;
+      }
+      break;
+#endif
+    case 'l':
+      out_spec->length_modifier = NPF_FMT_SPEC_LEN_MOD_LONG;
+#if NANOPRINTF_USE_LARGE_FORMAT_SPECIFIERS == 1
+      if (*cur == 'l') {
+        out_spec->length_modifier = NPF_FMT_SPEC_LEN_MOD_LARGE_LONG_LONG;
+        ++cur;
+      }
+#endif
+      break;
+#if NANOPRINTF_USE_FLOAT_FORMAT_SPECIFIERS == 1
+    case 'L': out_spec->length_modifier = NPF_FMT_SPEC_LEN_MOD_LONG_DOUBLE; break;
+#endif
+#if NANOPRINTF_USE_LARGE_FORMAT_SPECIFIERS == 1
+    case 'j': out_spec->length_modifier = NPF_FMT_SPEC_LEN_MOD_LARGE_INTMAX; break;
+    case 'z': out_spec->length_modifier = NPF_FMT_SPEC_LEN_MOD_LARGE_SIZET; break;
+    case 't': out_spec->length_modifier = NPF_FMT_SPEC_LEN_MOD_LARGE_PTRDIFFT; break;
+#endif
+    default: --cur; break;
+  }
+
+  switch (*cur++) { // Conversion specifier
+    case '%': out_spec->conv_spec = NPF_FMT_SPEC_CONV_PERCENT;
+#if NANOPRINTF_USE_PRECISION_FORMAT_SPECIFIERS == 1
+      out_spec->prec_opt = NPF_FMT_SPEC_OPT_NONE;
+      out_spec->prec = 0;
+#endif
+      break;
+
+    case 'c': out_spec->conv_spec = NPF_FMT_SPEC_CONV_CHAR;
+#if NANOPRINTF_USE_PRECISION_FORMAT_SPECIFIERS == 1
+      out_spec->prec_opt = NPF_FMT_SPEC_OPT_NONE;
+      out_spec->prec = 0;
+#endif
+      break;
+
+    case 's': out_spec->conv_spec = NPF_FMT_SPEC_CONV_STRING;
+      break;
+
+    case 'i':
+    case 'd': tmp_conv = NPF_FMT_SPEC_CONV_SIGNED_INT; goto finish;
+    case 'o': tmp_conv = NPF_FMT_SPEC_CONV_OCTAL; goto finish;
+    case 'u': tmp_conv = NPF_FMT_SPEC_CONV_UNSIGNED_INT; goto finish;
+    case 'X': out_spec->case_adjust = 0;
+    case 'x': tmp_conv = NPF_FMT_SPEC_CONV_HEX_INT; goto finish;
+    finish:
+      out_spec->conv_spec = (uint8_t)tmp_conv;
+#if (NANOPRINTF_USE_FIELD_WIDTH_FORMAT_SPECIFIERS == 1) && \
+    (NANOPRINTF_USE_PRECISION_FORMAT_SPECIFIERS == 1)
+      if (out_spec->prec_opt != NPF_FMT_SPEC_OPT_NONE) { out_spec->leading_zero_pad = 0; }
+#endif
+      break;
+
+#if NANOPRINTF_USE_FLOAT_FORMAT_SPECIFIERS == 1
+    case 'F': out_spec->case_adjust = 0;
+    case 'f':
+      out_spec->conv_spec = NPF_FMT_SPEC_CONV_FLOAT_DEC;
+      if (out_spec->prec_opt == NPF_FMT_SPEC_OPT_NONE) { out_spec->prec = 6; }
+      break;
+
+    case 'E': out_spec->case_adjust = 0;
+    case 'e':
+      out_spec->conv_spec = NPF_FMT_SPEC_CONV_FLOAT_SCI;
+      if (out_spec->prec_opt == NPF_FMT_SPEC_OPT_NONE) { out_spec->prec = 6; }
+      break;
+
+    case 'G': out_spec->case_adjust = 0;
+    case 'g':
+      out_spec->conv_spec = NPF_FMT_SPEC_CONV_FLOAT_SHORTEST;
+      if (out_spec->prec_opt == NPF_FMT_SPEC_OPT_NONE) { out_spec->prec = 6; }
+      break;
+
+    case 'A': out_spec->case_adjust = 0;
+    case 'a':
+      out_spec->conv_spec = NPF_FMT_SPEC_CONV_FLOAT_HEX;
+      if (out_spec->prec_opt == NPF_FMT_SPEC_OPT_NONE) { out_spec->prec = 6; }
+      break;
+#endif
+
+#if NANOPRINTF_USE_WRITEBACK_FORMAT_SPECIFIERS == 1
+    case 'n':
+      // todo: reject string if flags or width or precision exist
+      out_spec->conv_spec = NPF_FMT_SPEC_CONV_WRITEBACK;
+#if NANOPRINTF_USE_PRECISION_FORMAT_SPECIFIERS == 1
+      out_spec->prec_opt = NPF_FMT_SPEC_OPT_NONE;
+#endif
+      break;
+#endif
+
+    case 'p':
+      out_spec->conv_spec = NPF_FMT_SPEC_CONV_POINTER;
+#if NANOPRINTF_USE_PRECISION_FORMAT_SPECIFIERS == 1
+      out_spec->prec_opt = NPF_FMT_SPEC_OPT_NONE;
+#endif
+      break;
+
+#if NANOPRINTF_USE_BINARY_FORMAT_SPECIFIERS == 1
+    case 'B':
+      out_spec->case_adjust = 0;
+    case 'b':
+      out_spec->conv_spec = NPF_FMT_SPEC_CONV_BINARY;
+      break;
+#endif
+
+    default: return 0;
+  }
+
+  return (int)(cur - format);
+}
+
+static NPF_NOINLINE int npf_utoa_rev(
+    npf_uint_t val, char *buf, uint_fast8_t base, char case_adj) {
+  uint_fast8_t n = 0;
+  do {
+    int_fast8_t const d = (int_fast8_t)(val % base);
+    *buf++ = (char)(((d < 10) ? '0' : ('A' - 10 + case_adj)) + d);
+    ++n;
+    val /= base;
+  } while (val);
+  return (int)n;
+}
+
+#if NANOPRINTF_USE_FLOAT_FORMAT_SPECIFIERS == 1
+
+#include <float.h>
+
+#if (DBL_MANT_DIG <= 11) && (DBL_MAX_EXP <= 16)
+  typedef uint_fast16_t npf_double_bin_t;
+  typedef int_fast8_t npf_ftoa_exp_t;
+#elif (DBL_MANT_DIG <= 24) && (DBL_MAX_EXP <= 128)
+  typedef uint_fast32_t npf_double_bin_t;
+  typedef int_fast8_t npf_ftoa_exp_t;
+#elif (DBL_MANT_DIG <= 53) && (DBL_MAX_EXP <= 1024)
+  typedef uint_fast64_t npf_double_bin_t;
+  typedef int_fast16_t npf_ftoa_exp_t;
+#else
+  #error Unsupported width of the double type.
+#endif
+
+// The floating point conversion code works with an unsigned integer type of any size.
+#ifndef NANOPRINTF_CONVERSION_FLOAT_TYPE
+  #define NANOPRINTF_CONVERSION_FLOAT_TYPE unsigned int
+#endif
+typedef NANOPRINTF_CONVERSION_FLOAT_TYPE npf_ftoa_man_t;
+
+#if (NANOPRINTF_CONVERSION_BUFFER_SIZE <= UINT_FAST8_MAX) && (UINT_FAST8_MAX <= INT_MAX)
+  typedef uint_fast8_t npf_ftoa_dec_t;
+#else
+  typedef int npf_ftoa_dec_t;
+#endif
+
+enum {
+  NPF_DOUBLE_EXP_MASK = DBL_MAX_EXP * 2 - 1,
+  NPF_DOUBLE_EXP_BIAS = DBL_MAX_EXP - 1,
+  NPF_DOUBLE_MAN_BITS = DBL_MANT_DIG - 1,
+  NPF_DOUBLE_BIN_BITS = sizeof(npf_double_bin_t) * CHAR_BIT,
+  NPF_DOUBLE_SIGN_POS = sizeof(double) * CHAR_BIT - 1,
+  NPF_FTOA_MAN_BITS   = sizeof(npf_ftoa_man_t) * CHAR_BIT,
+  NPF_FTOA_SHIFT_BITS =
+    ((NPF_FTOA_MAN_BITS < DBL_MANT_DIG) ? NPF_FTOA_MAN_BITS : DBL_MANT_DIG) - 1
+};
+
+/* Generally, floating-point conversion implementations use
+   grisu2 (https://bit.ly/2JgMggX) and ryu (https://bit.ly/2RLXSg0) algorithms,
+   which are mathematically exact and fast, but require large lookup tables.
+
+   This implementation was inspired by Wojciech Muła's (zdjęcia at garnek.pl)
+   algorithm (http://0x80.pl/notesen/2015-12-29-float-to-string.html) and
+   extended further by adding dynamic scaling and configurable integer width by
+   Oskars Rubenis (https://github.com/Okarss). */
+
+static NPF_FORCE_INLINE npf_double_bin_t npf_double_to_int_rep(double f) {
+  // Union-cast is UB pre-C11 and in all C++; the compiler optimizes the code below.
+  npf_double_bin_t bin;
+  char const *src = (char const *)&f;
+  char *dst = (char *)&bin;
+  for (uint_fast8_t i = 0; i < sizeof(f); ++i) { dst[i] = src[i]; }
+  return bin;
+}
+
+static int npf_ftoa_rev(char *buf, npf_format_spec_t const *spec, double f) {
+  char const *ret = NULL;
+  npf_double_bin_t bin = npf_double_to_int_rep(f);
+
+  // Unsigned -> signed int casting is IB and can raise a signal but generally doesn't.
+  npf_ftoa_exp_t exp =
+    (npf_ftoa_exp_t)((npf_ftoa_exp_t)(bin >> NPF_DOUBLE_MAN_BITS) & NPF_DOUBLE_EXP_MASK);
+
+  bin &= ((npf_double_bin_t)0x1 << NPF_DOUBLE_MAN_BITS) - 1;
+  if (exp == (npf_ftoa_exp_t)NPF_DOUBLE_EXP_MASK) { // special value
+    ret = (bin) ? "NAN" : "FNI";
+    goto exit;
+  }
+  if (spec->prec > (NANOPRINTF_CONVERSION_BUFFER_SIZE - 2)) { goto exit; }
+  if (exp) { // normal number
+    bin |= (npf_double_bin_t)0x1 << NPF_DOUBLE_MAN_BITS;
+  } else { // subnormal number
+    ++exp;
+  }
+  exp = (npf_ftoa_exp_t)(exp - NPF_DOUBLE_EXP_BIAS);
+
+  uint_fast8_t carry; carry = 0;
+  npf_ftoa_dec_t end, dec; dec = (npf_ftoa_dec_t)spec->prec;
+  if (dec
+#if NANOPRINTF_USE_ALT_FORM_FLAG == 1
+      || spec->alt_form
+#endif
+  ) {
+    buf[dec++] = '.';
+  }
+
+  { // Integer part
+    npf_ftoa_man_t man_i;
+
+    if (exp >= 0) {
+      int_fast8_t shift_i =
+        (int_fast8_t)((exp > NPF_FTOA_SHIFT_BITS) ? (int)NPF_FTOA_SHIFT_BITS : exp);
+      npf_ftoa_exp_t exp_i = (npf_ftoa_exp_t)(exp - shift_i);
+      shift_i = (int_fast8_t)(NPF_DOUBLE_MAN_BITS - shift_i);
+      man_i = (npf_ftoa_man_t)(bin >> shift_i);
+
+      if (exp_i) {
+        if (shift_i) {
+          carry = (bin >> (shift_i - 1)) & 0x1;
+        }
+        exp = NPF_DOUBLE_MAN_BITS; // invalidate the fraction part
+      }
+
+      // Scale the exponent from base-2 to base-10.
+      for (; exp_i; --exp_i) {
+        if (!(man_i & ((npf_ftoa_man_t)0x1 << (NPF_FTOA_MAN_BITS - 1)))) {
+          man_i = (npf_ftoa_man_t)(man_i << 1);
+          man_i = (npf_ftoa_man_t)(man_i | carry); carry = 0;
+        } else {
+          if (dec >= NANOPRINTF_CONVERSION_BUFFER_SIZE) { goto exit; }
+          buf[dec++] = '0';
+          carry = (((uint_fast8_t)(man_i % 5) + carry) > 2);
+          man_i /= 5;
+        }
+      }
+    } else {
+      man_i = 0;
+    }
+    end = dec;
+
+    do { // Print the integer
+      if (end >= NANOPRINTF_CONVERSION_BUFFER_SIZE) { goto exit; }
+      buf[end++] = (char)('0' + (char)(man_i % 10));
+      man_i /= 10;
+    } while (man_i);
+  }
+
+  { // Fraction part
+    npf_ftoa_man_t man_f;
+    npf_ftoa_dec_t dec_f = (npf_ftoa_dec_t)spec->prec;
+
+    if (exp < NPF_DOUBLE_MAN_BITS) {
+      int_fast8_t shift_f = (int_fast8_t)((exp < 0) ? -1 : exp);
+      npf_ftoa_exp_t exp_f = (npf_ftoa_exp_t)(exp - shift_f);
+      npf_double_bin_t bin_f =
+        bin << ((NPF_DOUBLE_BIN_BITS - NPF_DOUBLE_MAN_BITS) + shift_f);
+
+      // This if-else statement can be completely optimized at compile time.
+      if (NPF_DOUBLE_BIN_BITS > NPF_FTOA_MAN_BITS) {
+        man_f = (npf_ftoa_man_t)(bin_f >> ((unsigned)(NPF_DOUBLE_BIN_BITS -
+                                                      NPF_FTOA_MAN_BITS) %
+                                           NPF_DOUBLE_BIN_BITS));
+        carry = (uint_fast8_t)((bin_f >> ((unsigned)(NPF_DOUBLE_BIN_BITS -
+                                                     NPF_FTOA_MAN_BITS - 1) %
+                                          NPF_DOUBLE_BIN_BITS)) & 0x1);
+      } else {
+        man_f = (npf_ftoa_man_t)((npf_ftoa_man_t)bin_f
+                                 << ((unsigned)(NPF_FTOA_MAN_BITS -
+                                                NPF_DOUBLE_BIN_BITS) % NPF_FTOA_MAN_BITS));
+        carry = 0;
+      }
+
+      // Scale the exponent from base-2 to base-10 and prepare the first digit.
+      for (uint_fast8_t digit = 0; dec_f && (exp_f < 4); ++exp_f) {
+        if ((man_f > ((npf_ftoa_man_t)-4 / 5)) || digit) {
+          carry = (uint_fast8_t)(man_f & 0x1);
+          man_f = (npf_ftoa_man_t)(man_f >> 1);
+        } else {
+          man_f = (npf_ftoa_man_t)(man_f * 5);
+          if (carry) { man_f = (npf_ftoa_man_t)(man_f + 3); carry = 0; }
+          if (exp_f < 0) {
+            buf[--dec_f] = '0';
+          } else {
+            ++digit;
+          }
+        }
+      }
+      man_f = (npf_ftoa_man_t)(man_f + carry);
+      carry = (exp_f >= 0);
+      dec = 0;
+    } else {
+      man_f = 0;
+    }
+
+    if (dec_f) {
+      // Print the fraction
+      for (;;) {
+        buf[--dec_f] = (char)('0' + (char)(man_f >> (NPF_FTOA_MAN_BITS - 4)));
+        man_f = (npf_ftoa_man_t)(man_f & ~((npf_ftoa_man_t)0xF << (NPF_FTOA_MAN_BITS - 4)));
+        if (!dec_f) { break; }
+        man_f = (npf_ftoa_man_t)(man_f * 10);
+      }
+      man_f = (npf_ftoa_man_t)(man_f << 4);
+    }
+    if (exp < NPF_DOUBLE_MAN_BITS) {
+      carry &= (uint_fast8_t)(man_f >> (NPF_FTOA_MAN_BITS - 1));
+    }
+  }
+
+  // Round the number
+  for (; carry; ++dec) {
+    if (dec >= NANOPRINTF_CONVERSION_BUFFER_SIZE) { goto exit; }
+    if (dec >= end) { buf[end++] = '0'; }
+    if (buf[dec] == '.') { continue; }
+    carry = (buf[dec] == '9');
+    buf[dec] = (char)(carry ? '0' : (buf[dec] + 1));
+  }
+
+  return (int)end;
+exit:
+  if (!ret) { ret = "RRE"; }
+  uint_fast8_t i;
+  for (i = 0; ret[i]; ++i) { buf[i] = (char)(ret[i] + spec->case_adjust); }
+  return -(int)i;
+}
+
+#endif // NANOPRINTF_USE_FLOAT_FORMAT_SPECIFIERS
+
+#if NANOPRINTF_USE_BINARY_FORMAT_SPECIFIERS == 1
+static int npf_bin_len(npf_uint_t u) {
+  // Return the length of the binary string format of 'u', preferring intrinsics.
+  // A zero value still needs one digit, so it is answered before any bit scan.
+  if (!u) { return 1; }
+
+  // Exactly one of the preprocessor branches below supplies the return path:
+  // an MSVC bit-scan, a GCC/Clang count-leading-zeros builtin, or (when
+  // NPF_HAVE_BUILTIN_CLZ ends up undefined) the shift-and-count loop.
+#ifdef _MSC_VER // Win64, use _BSR64 for everything. If x86, use _BSR when non-large.
+  #ifdef _M_X64
+    #define NPF_HAVE_BUILTIN_CLZ
+    #define NPF_CLZ _BitScanReverse64
+  #elif NANOPRINTF_USE_LARGE_FORMAT_SPECIFIERS == 0
+    #define NPF_HAVE_BUILTIN_CLZ
+    #define NPF_CLZ _BitScanReverse
+  #endif
+  #ifdef NPF_HAVE_BUILTIN_CLZ
+    unsigned long idx;
+    NPF_CLZ(&idx, u);
+    return (int)(idx + 1);
+  #endif
+#elif NPF_CLANG || NPF_GCC_PAST_4_6
+  #define NPF_HAVE_BUILTIN_CLZ
+  // Length = width of the scanned type in bits minus the number of leading zero bits.
+  #if NANOPRINTF_USE_LARGE_FORMAT_SPECIFIERS == 1
+    #define NPF_CLZ(X) ((sizeof(long long) * CHAR_BIT) - (size_t)__builtin_clzll(X))
+  #else
+    #define NPF_CLZ(X) ((sizeof(long) * CHAR_BIT) - (size_t)__builtin_clzl(X))
+  #endif
+  return (int)NPF_CLZ(u);
+#endif
+
+#ifndef NPF_HAVE_BUILTIN_CLZ
+  int n;
+  for (n = 0; u; ++n, u >>= 1); // slow but small software fallback
+  return n;
+#else
+  // The helper macros are local to this function; drop them again.
+  #undef NPF_HAVE_BUILTIN_CLZ
+  #undef NPF_CLZ
+#endif
+}
+#endif
+
+static void npf_bufputc(int c, void *ctx) {
+  // Store 'c' at the write cursor; characters beyond 'len' are dropped silently.
+  npf_bufputc_ctx_t *const sink = (npf_bufputc_ctx_t *)ctx;
+  if (sink->cur >= sink->len) { return; }
+  sink->dst[sink->cur] = (char)c;
+  ++sink->cur;
+}
+
+static void npf_bufputc_nop(int c, void *ctx) {
+  // Output sink that discards everything it is given.
+  (void)ctx;
+  (void)c;
+}
+
+// Context handed to npf_putc_cnt: a wrapped output callback plus a running character count.
+typedef struct npf_cnt_putc_ctx {
+  npf_putc pc; // wrapped output callback
+  void *ctx;   // context pointer forwarded to 'pc' on every call
+  int n;       // incremented by npf_putc_cnt once per character
+} npf_cnt_putc_ctx_t;
+
+static void npf_putc_cnt(int c, void *ctx) {
+  // Count the character first, then hand it to the wrapped callback.
+  npf_cnt_putc_ctx_t *const counter = (npf_cnt_putc_ctx_t *)ctx;
+  counter->n += 1;
+  counter->pc(c, counter->ctx); // last statement, so it can become a sibling call
+}
+
+// Emit one character through the counting wrapper. Expects a local 'pc_cnt' in scope.
+#define NPF_PUTC(VAL) do { npf_putc_cnt((int)(VAL), &pc_cnt); } while (0)
+
+// Expands to one 'case' of a length-modifier switch: fetch the next variadic argument as
+// EXTRACT_AS, cast it to CAST_TO and store it in the local 'val'. Expects 'args' in scope.
+#define NPF_EXTRACT(MOD, CAST_TO, EXTRACT_AS) \
+  case NPF_FMT_SPEC_LEN_MOD_##MOD: val = (CAST_TO)va_arg(args, EXTRACT_AS); break
+
+// Expands to one 'case' of a length-modifier switch: fetch a TYPE pointer from 'args' and
+// store the current character count (pc_cnt.n) through it.
+#define NPF_WRITEBACK(MOD, TYPE) \
+  case NPF_FMT_SPEC_LEN_MOD_##MOD: *(va_arg(args, TYPE *)) = (TYPE)pc_cnt.n; break
+
+// Core formatter. Walks 'format', converts each format specification using the matching
+// argument(s) from 'args' and sends every resulting character to 'pc' (together with
+// 'pc_ctx') through the counting wrapper. Returns the number of characters that were
+// handed to 'pc' (pc_cnt.n).
+int npf_vpprintf(npf_putc pc, void *pc_ctx, char const *format, va_list args) {
+  npf_format_spec_t fs;
+  char const *cur = format;
+  npf_cnt_putc_ctx_t pc_cnt;
+  pc_cnt.pc = pc;
+  pc_cnt.ctx = pc_ctx;
+  pc_cnt.n = 0;
+
+  while (*cur) {
+    // fs_len is 0 for any character other than '%' and whenever npf_parse_format_spec
+    // returns 0; in both cases the current character is emitted as-is.
+    int const fs_len = (*cur != '%') ? 0 : npf_parse_format_spec(cur, &fs);
+    if (!fs_len) { NPF_PUTC(*cur++); continue; }
+    cur += fs_len;
+
+    // Extract star-args immediately
+#if NANOPRINTF_USE_FIELD_WIDTH_FORMAT_SPECIFIERS == 1
+    if (fs.field_width_opt == NPF_FMT_SPEC_OPT_STAR) {
+      fs.field_width = va_arg(args, int);
+      // A negative '*' width means: left-justify and use the absolute value.
+      if (fs.field_width < 0) {
+        fs.field_width = -fs.field_width;
+        fs.left_justified = 1;
+      }
+    }
+#endif
+#if NANOPRINTF_USE_PRECISION_FORMAT_SPECIFIERS == 1
+    if (fs.prec_opt == NPF_FMT_SPEC_OPT_STAR) {
+      fs.prec = va_arg(args, int);
+      // A negative '*' precision is handled as if no precision had been given.
+      if (fs.prec < 0) { fs.prec_opt = NPF_FMT_SPEC_OPT_NONE; }
+    }
+#endif
+
+    // Scratch storage for the conversion. Binary conversions keep the raw value in
+    // 'binval' instead of writing digit characters into 'cbuf_mem'.
+    union { char cbuf_mem[NANOPRINTF_CONVERSION_BUFFER_SIZE]; npf_uint_t binval; } u;
+    char *cbuf = u.cbuf_mem, sign_c = 0;
+    int cbuf_len = 0;
+    char need_0x = 0;
+#if NANOPRINTF_USE_FIELD_WIDTH_FORMAT_SPECIFIERS == 1
+    int field_pad = 0;
+    char pad_c = 0;
+#endif
+#if NANOPRINTF_USE_PRECISION_FORMAT_SPECIFIERS == 1
+    int prec_pad = 0;
+#if NANOPRINTF_USE_FIELD_WIDTH_FORMAT_SPECIFIERS == 1
+    uint_fast8_t zero = 0;
+#endif
+#endif
+
+    // Extract and convert the argument to string, point cbuf at the text.
+    switch (fs.conv_spec) {
+      case NPF_FMT_SPEC_CONV_PERCENT:
+        *cbuf = '%';
+        cbuf_len = 1;
+        break;
+
+      case NPF_FMT_SPEC_CONV_CHAR:
+        *cbuf = (char)va_arg(args, int);
+        // A NUL character argument produces no output at all.
+        cbuf_len = (*cbuf) ? 1 : 0;
+        break;
+
+      case NPF_FMT_SPEC_CONV_STRING: {
+        // cbuf is pointed at the caller's string; a NULL pointer yields length 0.
+        cbuf = va_arg(args, char *);
+#if NANOPRINTF_USE_PRECISION_FORMAT_SPECIFIERS == 1
+        for (char const *s = cbuf;
+             ((fs.prec_opt == NPF_FMT_SPEC_OPT_NONE) || (cbuf_len < fs.prec)) && cbuf && *s;
+             ++s, ++cbuf_len);
+#else
+        for (char const *s = cbuf; cbuf && *s; ++s, ++cbuf_len); // strlen
+#endif
+      } break;
+
+      case NPF_FMT_SPEC_CONV_SIGNED_INT: {
+        npf_int_t val = 0;
+        switch (fs.length_modifier) {
+          NPF_EXTRACT(NONE, int, int);
+#if NANOPRINTF_USE_SMALL_FORMAT_SPECIFIERS == 1
+          NPF_EXTRACT(SHORT, short, int);
+          NPF_EXTRACT(CHAR, signed char, int);
+#endif
+          NPF_EXTRACT(LONG, long, long);
+#if NANOPRINTF_USE_LARGE_FORMAT_SPECIFIERS == 1
+          NPF_EXTRACT(LARGE_LONG_LONG, long long, long long);
+          NPF_EXTRACT(LARGE_INTMAX, intmax_t, intmax_t);
+          NPF_EXTRACT(LARGE_SIZET, npf_ssize_t, npf_ssize_t);
+          NPF_EXTRACT(LARGE_PTRDIFFT, ptrdiff_t, ptrdiff_t);
+#endif
+          default: break;
+        }
+
+        sign_c = (val < 0) ? '-' : fs.prepend;
+
+#if NANOPRINTF_USE_PRECISION_FORMAT_SPECIFIERS == 1
+#if NANOPRINTF_USE_FIELD_WIDTH_FORMAT_SPECIFIERS == 1
+        zero = !val;
+#endif
+        // special case, if prec and value are 0, skip
+        if (!val && (fs.prec_opt != NPF_FMT_SPEC_OPT_NONE) && !fs.prec) {
+          cbuf_len = 0;
+        } else
+#endif
+        {
+          // Convert the magnitude; the negation is done on the unsigned value.
+          npf_uint_t uval = (npf_uint_t)val;
+          if (val < 0) { uval = 0 - uval; }
+          cbuf_len = npf_utoa_rev(uval, cbuf, 10, fs.case_adjust);
+        }
+      } break;
+
+#if NANOPRINTF_USE_BINARY_FORMAT_SPECIFIERS == 1
+      case NPF_FMT_SPEC_CONV_BINARY:
+#endif
+      case NPF_FMT_SPEC_CONV_OCTAL:
+      case NPF_FMT_SPEC_CONV_HEX_INT:
+      case NPF_FMT_SPEC_CONV_UNSIGNED_INT:
+      case NPF_FMT_SPEC_CONV_POINTER: {
+        npf_uint_t val = 0;
+
+        if (fs.conv_spec == NPF_FMT_SPEC_CONV_POINTER) {
+          val = (npf_uint_t)(uintptr_t)va_arg(args, void *);
+        } else {
+          switch (fs.length_modifier) {
+            NPF_EXTRACT(NONE, unsigned, unsigned);
+#if NANOPRINTF_USE_SMALL_FORMAT_SPECIFIERS == 1
+            NPF_EXTRACT(SHORT, unsigned short, unsigned);
+            NPF_EXTRACT(CHAR, unsigned char, unsigned);
+#endif
+            NPF_EXTRACT(LONG, unsigned long, unsigned long);
+#if NANOPRINTF_USE_LARGE_FORMAT_SPECIFIERS == 1
+            NPF_EXTRACT(LARGE_LONG_LONG, unsigned long long, unsigned long long);
+            NPF_EXTRACT(LARGE_INTMAX, uintmax_t, uintmax_t);
+            NPF_EXTRACT(LARGE_SIZET, size_t, size_t);
+            NPF_EXTRACT(LARGE_PTRDIFFT, npf_uptrdiff_t, npf_uptrdiff_t);
+#endif
+            default: break;
+          }
+        }
+
+#if NANOPRINTF_USE_PRECISION_FORMAT_SPECIFIERS == 1
+#if NANOPRINTF_USE_FIELD_WIDTH_FORMAT_SPECIFIERS == 1
+        zero = !val;
+#endif
+        if (!val && (fs.prec_opt != NPF_FMT_SPEC_OPT_NONE) && !fs.prec) {
+          // Zero value and explicitly-requested zero precision means "print nothing".
+#if NANOPRINTF_USE_ALT_FORM_FLAG == 1
+          if ((fs.conv_spec == NPF_FMT_SPEC_CONV_OCTAL) && fs.alt_form) {
+            fs.prec = 1; // octal special case, print a single '0'
+          }
+#endif
+        } else
+#endif
+#if NANOPRINTF_USE_BINARY_FORMAT_SPECIFIERS == 1
+        if (fs.conv_spec == NPF_FMT_SPEC_CONV_BINARY) {
+          // Only the digit count is computed here; the bits are emitted from u.binval later.
+          cbuf_len = npf_bin_len(val); u.binval = val;
+        } else
+#endif
+        {
+          uint_fast8_t const base = (fs.conv_spec == NPF_FMT_SPEC_CONV_OCTAL) ?
+            8u : ((fs.conv_spec == NPF_FMT_SPEC_CONV_UNSIGNED_INT) ? 10u : 16u);
+          cbuf_len = npf_utoa_rev(val, cbuf, base, fs.case_adjust);
+        }
+
+#if NANOPRINTF_USE_ALT_FORM_FLAG == 1
+        if (val && fs.alt_form && (fs.conv_spec == NPF_FMT_SPEC_CONV_OCTAL)) {
+          cbuf[cbuf_len++] = '0'; // OK to add leading octal '0' immediately.
+        }
+
+        if (val && fs.alt_form) { // 0x or 0b but can't write it yet.
+          if ((fs.conv_spec == NPF_FMT_SPEC_CONV_HEX_INT) ||
+              (fs.conv_spec == NPF_FMT_SPEC_CONV_POINTER)) { need_0x = 'X'; }
+#if NANOPRINTF_USE_BINARY_FORMAT_SPECIFIERS == 1
+          else if (fs.conv_spec == NPF_FMT_SPEC_CONV_BINARY) { need_0x = 'B'; }
+#endif
+          if (need_0x) { need_0x = (char)(need_0x + fs.case_adjust); }
+        }
+#endif
+      } break;
+
+#if NANOPRINTF_USE_WRITEBACK_FORMAT_SPECIFIERS == 1
+      case NPF_FMT_SPEC_CONV_WRITEBACK:
+        switch (fs.length_modifier) {
+          NPF_WRITEBACK(NONE, int);
+#if NANOPRINTF_USE_SMALL_FORMAT_SPECIFIERS == 1
+          NPF_WRITEBACK(SHORT, short);
+          NPF_WRITEBACK(CHAR, signed char);
+#endif
+          NPF_WRITEBACK(LONG, long);
+#if NANOPRINTF_USE_LARGE_FORMAT_SPECIFIERS == 1
+          NPF_WRITEBACK(LARGE_LONG_LONG, long long);
+          NPF_WRITEBACK(LARGE_INTMAX, intmax_t);
+          NPF_WRITEBACK(LARGE_SIZET, npf_ssize_t);
+          NPF_WRITEBACK(LARGE_PTRDIFFT, ptrdiff_t);
+#endif
+          default: break;
+        } break;
+#endif
+
+#if NANOPRINTF_USE_FLOAT_FORMAT_SPECIFIERS == 1
+      case NPF_FMT_SPEC_CONV_FLOAT_DEC:
+      case NPF_FMT_SPEC_CONV_FLOAT_SCI:
+      case NPF_FMT_SPEC_CONV_FLOAT_SHORTEST:
+      case NPF_FMT_SPEC_CONV_FLOAT_HEX: {
+        double val;
+        if (fs.length_modifier == NPF_FMT_SPEC_LEN_MOD_LONG_DOUBLE) {
+          val = (double)va_arg(args, long double);
+        } else {
+          val = va_arg(args, double);
+        }
+
+        sign_c = (npf_double_to_int_rep(val) >> NPF_DOUBLE_SIGN_POS) ? '-' : fs.prepend;
+#if NANOPRINTF_USE_FIELD_WIDTH_FORMAT_SPECIFIERS == 1
+        zero = (val == 0.);
+#endif
+        cbuf_len = npf_ftoa_rev(cbuf, &fs, val);
+        if (cbuf_len < 0) { // negative means text (not number), so ignore the '0' flag
+           cbuf_len = -cbuf_len;
+#if NANOPRINTF_USE_FIELD_WIDTH_FORMAT_SPECIFIERS == 1
+           fs.leading_zero_pad = 0;
+#endif
+        }
+      } break;
+#endif
+      default: break;
+    }
+
+#if NANOPRINTF_USE_FIELD_WIDTH_FORMAT_SPECIFIERS == 1
+    // Compute the field width pad character
+    // pad_c stays 0 when no field width was given, which disables padding below.
+    if (fs.field_width_opt != NPF_FMT_SPEC_OPT_NONE) {
+      if (fs.leading_zero_pad) {
+#if NANOPRINTF_USE_PRECISION_FORMAT_SPECIFIERS == 1
+        if ((fs.prec_opt != NPF_FMT_SPEC_OPT_NONE) && !fs.prec && zero) {
+          pad_c = ' ';
+        } else
+#endif
+        { pad_c = '0'; }
+      } else { pad_c = ' '; }
+    }
+#endif
+
+    // Compute the number of bytes to truncate or '0'-pad.
+#if NANOPRINTF_USE_PRECISION_FORMAT_SPECIFIERS == 1
+    if (fs.conv_spec != NPF_FMT_SPEC_CONV_STRING) {
+#if NANOPRINTF_USE_FLOAT_FORMAT_SPECIFIERS == 1
+      // float precision is after the decimal point
+      if ((fs.conv_spec != NPF_FMT_SPEC_CONV_FLOAT_DEC) &&
+          (fs.conv_spec != NPF_FMT_SPEC_CONV_FLOAT_SCI) &&
+          (fs.conv_spec != NPF_FMT_SPEC_CONV_FLOAT_SHORTEST) &&
+          (fs.conv_spec != NPF_FMT_SPEC_CONV_FLOAT_HEX))
+#endif
+      { prec_pad = NPF_MAX(0, fs.prec - cbuf_len); }
+    }
+#endif
+
+#if NANOPRINTF_USE_FIELD_WIDTH_FORMAT_SPECIFIERS == 1
+    // Given the full converted length, how many pad bytes?
+    field_pad = fs.field_width - cbuf_len - !!sign_c;
+    if (need_0x) { field_pad -= 2; }
+#if NANOPRINTF_USE_PRECISION_FORMAT_SPECIFIERS == 1
+    field_pad -= prec_pad;
+#endif
+    field_pad = NPF_MAX(0, field_pad);
+
+    // Apply right-justified field width if requested
+    if (!fs.left_justified && pad_c) { // If leading zeros pad, sign goes first.
+      if (pad_c == '0') {
+        if (sign_c) { NPF_PUTC(sign_c); sign_c = 0; }
+        // Pad byte is '0', write '0x' before '0' pad chars.
+        if (need_0x) { NPF_PUTC('0'); NPF_PUTC(need_0x); }
+      }
+      while (field_pad-- > 0) { NPF_PUTC(pad_c); }
+      // Pad byte is ' ', write '0x' after ' ' pad chars but before number.
+      if ((pad_c != '0') && need_0x) { NPF_PUTC('0'); NPF_PUTC(need_0x); }
+    } else
+#endif
+    { if (need_0x) { NPF_PUTC('0'); NPF_PUTC(need_0x); } } // no pad, '0x' requested.
+
+    // Write the converted payload
+    if (fs.conv_spec == NPF_FMT_SPEC_CONV_STRING) {
+      // Strings are emitted front to back straight from the caller's buffer.
+      for (int i = 0; cbuf && (i < cbuf_len); ++i) { NPF_PUTC(cbuf[i]); }
+    } else {
+      if (sign_c) { NPF_PUTC(sign_c); }
+#if NANOPRINTF_USE_PRECISION_FORMAT_SPECIFIERS == 1
+      while (prec_pad-- > 0) { NPF_PUTC('0'); } // int precision leads.
+#endif
+#if NANOPRINTF_USE_BINARY_FORMAT_SPECIFIERS == 1
+      if (fs.conv_spec == NPF_FMT_SPEC_CONV_BINARY) {
+        // Emit the bits of u.binval from the highest counted bit down to bit 0.
+        while (cbuf_len) { NPF_PUTC('0' + ((u.binval >> --cbuf_len) & 1)); }
+      } else
+#endif
+      { while (cbuf_len-- > 0) { NPF_PUTC(cbuf[cbuf_len]); } } // payload is reversed
+    }
+
+#if NANOPRINTF_USE_FIELD_WIDTH_FORMAT_SPECIFIERS == 1
+    if (fs.left_justified && pad_c) { // Apply left-justified field width
+      while (field_pad-- > 0) { NPF_PUTC(pad_c); }
+    }
+#endif
+  }
+
+  return pc_cnt.n;
+}
+
+#undef NPF_PUTC
+#undef NPF_EXTRACT
+#undef NPF_WRITEBACK
+
+int npf_pprintf(npf_putc pc,
+                void * NPF_RESTRICT pc_ctx,
+                char const * NPF_RESTRICT format,
+                ...) {
+  // Variadic front end: collect the trailing arguments and defer to npf_vpprintf.
+  int written;
+  va_list ap;
+
+  va_start(ap, format);
+  written = npf_vpprintf(pc, pc_ctx, format, ap);
+  va_end(ap);
+
+  return written;
+}
+
+int npf_snprintf(char * NPF_RESTRICT buffer,
+                 size_t bufsz,
+                 const char * NPF_RESTRICT format,
+                 ...) {
+  // Variadic front end: collect the trailing arguments and defer to npf_vsnprintf.
+  int written;
+  va_list ap;
+
+  va_start(ap, format);
+  written = npf_vsnprintf(buffer, bufsz, format, ap);
+  va_end(ap);
+
+  return written;
+}
+
+int npf_vsnprintf(char * NPF_RESTRICT buffer,
+                  size_t bufsz,
+                  char const * NPF_RESTRICT format,
+                  va_list vlist) {
+  // Format into 'buffer' (at most 'bufsz' bytes). Without a buffer the output is only
+  // counted. The value returned is whatever npf_vpprintf reports.
+  npf_bufputc_ctx_t sink;
+  sink.cur = 0;
+  sink.len = bufsz;
+  sink.dst = buffer;
+
+  npf_putc emit = npf_bufputc_nop;
+  if (buffer) { emit = npf_bufputc; }
+
+  int const n = npf_vpprintf(emit, &sink, format, vlist);
+
+  // Nothing to terminate when there is no buffer or it has no room at all.
+  if (!buffer || !bufsz) { return n; }
+
+#ifdef NANOPRINTF_SNPRINTF_SAFE_EMPTY_STRING_ON_OVERFLOW
+  buffer[(n < 0 || (unsigned)n >= bufsz) ? 0 : n] = '\0';
+#else
+  buffer[n < 0 ? 0 : NPF_MIN((unsigned)n, bufsz - 1)] = '\0';
+#endif
+
+  return n;
+}
+
+#if NPF_HAVE_GCC_WARNING_PRAGMAS
+  #pragma GCC diagnostic pop
+#endif
+
+#ifdef _MSC_VER
+  #pragma warning(pop)
+#endif
+
+#endif // NPF_IMPLEMENTATION_INCLUDED
+#endif // NANOPRINTF_IMPLEMENTATION
+
+/*
+  nanoprintf is dual-licensed under both the "Unlicense" and the
+  "Zero-Clause BSD" (0BSD) licenses. The intent of this dual-licensing
+  structure is to make nanoprintf as consumable as possible in as many
+  environments / countries / companies as possible without any
+  encumbrances.
+
+  The text of the two licenses follows below:
+
+  ============================== UNLICENSE ==============================
+
+  This is free and unencumbered software released into the public domain.
+
+  Anyone is free to copy, modify, publish, use, compile, sell, or
+  distribute this software, either in source code form or as a compiled
+  binary, for any purpose, commercial or non-commercial, and by any
+  means.
+
+  In jurisdictions that recognize copyright laws, the author or authors
+  of this software dedicate any and all copyright interest in the
+  software to the public domain. We make this dedication for the benefit
+  of the public at large and to the detriment of our heirs and
+  successors. We intend this dedication to be an overt act of
+  relinquishment in perpetuity of all present and future rights to this
+  software under copyright law.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+  IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+  OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+  ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+  OTHER DEALINGS IN THE SOFTWARE.
+
+  For more information, please refer to <http://unlicense.org>
+
+  ================================ 0BSD =================================
+
+  Copyright (C) 2019- by Charles Nicholson <charles.nicholson+nanoprintf at gmail.com>
+
+  Permission to use, copy, modify, and/or distribute this software for
+  any purpose with or without fee is hereby granted.
+
+  THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+*/
diff --git a/target/linux/realtek/image/rt-loader/linker/linker.ld b/target/linux/realtek/image/rt-loader/linker/linker.ld
new file mode 100644
index 0000000000..dd1fb5aaa0
--- /dev/null
+++ b/target/linux/realtek/image/rt-loader/linker/linker.ld
@@ -0,0 +1,41 @@
+/* The entry point of the linked image is the _start symbol. */
+ENTRY(_start)
+
+SECTIONS {
+/* Program code taken from the .text input sections. */
+	.text : {
+		*(.text)
+	}
+
+/* Initialized data: small data (.sdata*) followed by regular data (.data*), 32 byte aligned. */
+	.data : ALIGN(32) {
+		*(.sdata*)
+		*(.data*)
+	}
+/*
+ * In MIPS position independent code (PIC), the global offset table (GOT) is a data structure
+ * used to facilitate access to global variables and functions when the code's final memory
+ * location is not known at compile time. The GOT contains absolute addresses of global symbols,
+ * but is itself located using a relative reference. This allows the code to be relocated at
+ * runtime without modification.
+ *
+ * __got_start and __got_end mark the boundaries of the table.
+ */
+	.got : ALIGN(32) {
+		__got_start = .;
+		*(.got*)
+		__got_end = .;
+	}
+/*
+ * Storage for the compressed kernel image that was integrated into the loader during link time.
+ * No code just binary data. KEEP() protects the section from linker garbage collection and
+ * __kernel_data_start / __kernel_data_end mark its boundaries.
+ */
+	.kernel : ALIGN(1) {
+		__kernel_data_start = .;
+		KEEP(*(.kernel))
+		__kernel_data_end = .;
+	}
+
+/*
+ * Uninitialized data (.bss, .sbss and COMMON symbols). The section is marked NOLOAD and is
+ * 4 byte aligned. __bss_start and __bss_end mark its boundaries.
+ */
+	.bss (NOLOAD) : ALIGN(4) {
+		__bss_start = .;
+		*(.bss)
+		*(.sbss)
+		*(COMMON)
+		__bss_end = .;
+	}
+}
diff --git a/target/linux/realtek/image/rt-loader/src/board.c b/target/linux/realtek/image/rt-loader/src/board.c
new file mode 100644
index 0000000000..d6d5865673
--- /dev/null
+++ b/target/linux/realtek/image/rt-loader/src/board.c
@@ -0,0 +1,110 @@
+/*
+ * rt-loader board functions
+ * (c) 2025 Markus Stockhausen
+ */
+
+#include "globals.h"
+#include "memory.h"
+#include "nanoprintf.h"
+
+#define DRAM_CONFIG_REG			0xb8001004
+#define UART_BUFFER_REG			0xb8002000
+#define UART_LINE_STATUS_REG		0xb8002014
+#define UART_TX_READY			(1 << 29)
+
+#define RTL838X_ENABLE_RW_MASK		0x3
+#define RTL838X_INT_RW_CTRL_REG		0xbb000058
+#define RTL838X_MODEL_NAME_INFO_REG	0xbb0000d4
+#define RTL839X_MODEL_NAME_INFO_REG	0xbb000ff0
+#define RTL83XX_CHIP_INFO_EN		0xa0000000
+#define RTL93XX_MODEL_NAME_INFO_REG	0xbb000004
+#define RTL93XX_CHIP_INFO_EN		0xa0000
+
+/*
+ * board_putchar() is the central function to write to serial console of the device. Some printf
+ * libraries (e.g. https://github.com/mpaland/printf) need a fixed function name like _putchar.
+ * To keep the original library as is, link the two functions with gcc compiler option
+ * -D_putchar=board_putchar
+ */
+
+void board_putchar(int ch, void *ctx)
+{
+	/* busy wait until the line status register reports the transmitter as ready */
+	while ((ioread32(UART_LINE_STATUS_REG) & UART_TX_READY) == 0)
+		;
+
+	/* the character is written into the top byte of the buffer register */
+	iowrite32(((int)ch) << 24, UART_BUFFER_REG);
+
+	if (ch != '\n')
+		return;
+
+	/* a line feed is followed by a carriage return */
+	board_putchar('\r', ctx);
+}
+
+/*
+ * board_get_memory() does what it is named after. On Realtek switches the DRAM config register
+ * has information about bank count, bus width, ... From that the memory size can be derived.
+ */
+
+unsigned int board_get_memory(void)
+{
+	/* lookup tables that translate the register fields into address bit counts */
+	char bank_bits[] = {1, 2, 3};
+	char width_bits[] = {0, 1, 2};
+	char row_bits[] = {11, 12, 13, 14, 15, 16};
+	char col_bits[] = {8, 9, 10, 11, 12};
+	unsigned int dcr = ioread32(DRAM_CONFIG_REG);
+	unsigned int bnk = (dcr >> 28) & 0x3;
+	unsigned int wid = (dcr >> 24) & 0x3;
+	unsigned int row = (dcr >> 20) & 0xf;
+	unsigned int col = (dcr >> 16) & 0xf;
+
+	/* the memory size is two to the power of the summed up bit counts */
+	return 1 << (bank_bits[bnk] + width_bits[wid] + row_bits[row] + col_bits[col]);
+}
+
+/*
+ * board_get_system() generates a readable system name that will be printed during startup.
+ * Formatting can be whatever is helpful.
+ */
+
+void board_get_system(char *buffer, int len)
+{
+	unsigned int model_id, model_version, chip_id, chip_version;
+	unsigned int model_suffix, chip_suffix;
+	unsigned int info_reg, info_val, enable;
+
+	/* Probe the model name registers one after another: RTL93xx, RTL839x and RTL838x last. */
+	enable = RTL93XX_CHIP_INFO_EN;
+	info_reg = RTL93XX_MODEL_NAME_INFO_REG;
+	info_val = ioread32(info_reg);
+
+	if ((info_val & 0xffec0000) != 0x93000000) {
+		enable = RTL83XX_CHIP_INFO_EN;
+		info_reg = RTL839X_MODEL_NAME_INFO_REG;
+		info_val = ioread32(info_reg);
+
+		if ((info_val & 0xfff80000) != 0x83900000) {
+			/* neither matched, use the RTL838x register without further checks */
+			iowrite32(0x3, RTL838X_INT_RW_CTRL_REG);
+			info_reg = RTL838X_MODEL_NAME_INFO_REG;
+			info_val = ioread32(info_reg);
+		}
+	}
+
+	model_id = info_val >> 16;
+	model_version = (info_val >> 11) & 0x1f;
+
+	/* the chip info sits in the following register and is read after writing the enable value */
+	iowrite32(enable, info_reg + 4);
+	info_val = ioread32(info_reg + 4);
+	chip_id = info_val & 0xffff;
+	chip_version = (model_id < 0x9300) ? ((info_val >> 16) & 0x1f) : ((info_val >> 28) & 0x0f);
+
+	/* versions are printed as letters (1 = 'A'), a version of zero is passed as NUL character */
+	model_suffix = model_version ? model_version + 64 : 0;
+	chip_suffix = chip_version ? chip_version + 64 : 0;
+
+	snprintf(buffer, len, "RTL%04X%c (chip id %04x%c)",
+		 model_id, model_suffix, chip_id, chip_suffix);
+}
+
+/*
+ * board_panic() is called in critical cases. Whatever is needed can be done here. Maybe
+ * an automatic reboot can be issued some day. For now just halt processing.
+ */
+
+void board_panic(void)
+{
+	printf("halt system\n");
+
+	/* never return to the caller */
+	for (;;)
+		;
+}
diff --git a/target/linux/realtek/image/rt-loader/src/main.c b/target/linux/realtek/image/rt-loader/src/main.c
new file mode 100644
index 0000000000..747881ea06
--- /dev/null
+++ b/target/linux/realtek/image/rt-loader/src/main.c
@@ -0,0 +1,123 @@
+/*
+ * rt-loader main program
+ * (c) 2025 Markus Stockhausen
+ *
+ * This code was inspired by the OpenWrt lzma loader. Thanks to
+ *
+ * Copyright (C) 2004 Manuel Novoa III (mjn3 at codepoet.org)
+ * Copyright (C) 2005 Mineharu Takahara <mtakahar at yahoo.com>
+ * Copyright (C) 2005 by Oleg I. Vdovikin <oleg at cs.msu.su>
+ * Copyright (C) 2011 Gabor Juhos <juhosg at openwrt.org>
+ */
+
+#include "board.h"
+#include "globals.h"
+#include "memory.h"
+
+#define NANOPRINTF_USE_FIELD_WIDTH_FORMAT_SPECIFIERS	1
+#define NANOPRINTF_USE_LARGE_FORMAT_SPECIFIERS		0
+#define NANOPRINTF_USE_SMALL_FORMAT_SPECIFIERS		0
+#define NANOPRINTF_USE_BINARY_FORMAT_SPECIFIERS		0
+#define NANOPRINTF_USE_WRITEBACK_FORMAT_SPECIFIERS	0
+#define NANOPRINTF_USE_PRECISION_FORMAT_SPECIFIERS	0
+#define NANOPRINTF_USE_FLOAT_FORMAT_SPECIFIERS		0
+#define NANOPRINTF_IMPLEMENTATION
+#include "nanoprintf.h"
+
+extern void *_kernel_load_addr;
+extern void *_kernel_data_addr;
+extern int _kernel_data_size;
+extern void *_my_load_addr;
+extern int _my_load_size;
+
+extern int unlzma(unsigned char *buf, long in_len,
+	   long (*fill)(void*, unsigned long),
+	   long (*flush)(void*, unsigned long),
+	   unsigned char *output,
+	   long *outlen,
+	   long *posp,
+	   void(*error)(char *x));
+
+typedef void (*entry_func_t)(unsigned long reg_a0, unsigned long reg_a1,
+			     unsigned long reg_a2, unsigned long reg_a3);
+
+void *relocate(void *src, int len)
+{
+	unsigned int offs;
+	void *target;
+
+	/*
+	 * The target is the highest possible memory address: RAM size minus the room needed for
+	 * stack, heap, the image itself and 1024 spare bytes. The mask rounds down to a 64KB
+	 * boundary and keeps the offset below 256MB because no highmem features are available.
+	 */
+
+	offs = board_get_memory() - STACK_SIZE - HEAP_SIZE - len - 1024;
+	offs &= 0xfff0000;
+	target = (void *)KSEG0 + offs;
+
+	printf("Relocate %d bytes from 0x%08x to 0x%08x\n", len, src, target);
+
+	/* copy the image and flush the caches for the copied range */
+	memcpy(target, src, len);
+	flush_cache(target, len);
+
+	return target;
+}
+
+void welcome(void)
+{
+	char name[80];
+	unsigned int megabytes;
+
+	/* print the loader banner followed by system name and memory size */
+	board_get_system(name, sizeof(name));
+	printf("rt-loader\n");
+
+	megabytes = board_get_memory() >> 20;
+	printf("Running on %s with %dMB\n", name, megabytes);
+}
+
+/* Error callback that is handed over to unlzma(): print the message x on its own line. */
+void decompress_error(char *x)
+{
+	printf("%s\n", x);
+}
+
+void *decompress(void *out, void *in, int len)
+{
+	long extracted;
+	int rc;
+
+	printf("Extract kernel with %d bytes from 0x%08x to 0x%08x ...\n", len, in, out);
+
+	/* a non-zero result of unlzma() is treated as fatal */
+	rc = unlzma(in, len, 0, 0, out, &extracted, 0, decompress_error);
+	if (rc)
+		board_panic();
+
+	printf("Extracted kernel size is %d bytes\n", extracted);
+
+	/* flush the caches for the extracted range before handing it back */
+	flush_cache(out, extracted);
+
+	return out;
+}
+
+void main(unsigned long reg_a0, unsigned long reg_a1,
+	  unsigned long reg_a2, unsigned long reg_a3)
+{
+	entry_func_t next;
+
+	if (_kernel_load_addr != _my_load_addr) {
+		/*
+		 * Second run: extract the attached kernel image to the memory address that the
+		 * loader was loaded to in the first run and jump into it.
+		 */
+
+		next = decompress(_kernel_load_addr, _kernel_data_addr, _kernel_data_size);
+
+		printf("Booting kernel from 0x%08x ...\n\n", next);
+		next(reg_a0, reg_a1, reg_a2, reg_a3);
+		return;
+	}
+
+	/*
+	 * First run: relocate the whole package to the end of memory and start it over from
+	 * there. _my_load_size is used as relocation length. That includes the bss section,
+	 * aka uninitialized globals. So globals that are initialized during the first run are
+	 * still at hand after relocation.
+	 */
+
+	welcome();
+	next = relocate(_my_load_addr, _my_load_size);
+	next(reg_a0, reg_a1, reg_a2, reg_a3);
+}
diff --git a/target/linux/realtek/image/rt-loader/src/memory.c b/target/linux/realtek/image/rt-loader/src/memory.c
new file mode 100644
index 0000000000..6ff5a44897
--- /dev/null
+++ b/target/linux/realtek/image/rt-loader/src/memory.c
@@ -0,0 +1,122 @@
+/*
+ * rt-loader memory functions
+ * (c) 2025 Markus Stockhausen
+ *
+ * This is a small function collection to get some rudimentary memory management working when
+ * running bare metal. None of these functions is optimized but works well for current needs.
+ */
+
+#include "board.h"
+#include "globals.h"
+#include "memory.h"
+#include "nanoprintf.h"
+
+/*
+ * CACHE_OP() emits a single MIPS "cache" instruction. The operation code op is passed as an
+ * immediate operand ("i" constraint), addr is dereferenced as a byte and handed over as a
+ * memory operand ("R" constraint). The assembler settings are saved with ".set push" and
+ * restored with ".set pop" around the instruction.
+ */
+#define CACHE_OP(op, addr)			\
+	__asm__ __volatile__(			\
+	"	.set	push		\n"	\
+	"	.set	noreorder	\n"	\
+	"	.set	mips3\n\t	\n"	\
+	"	cache	%0, %1		\n"	\
+	"	.set	pop		\n"	\
+	:					\
+	: "i" (op), "R" (*(unsigned char *)(addr)))
+
+/*
+ * flush_cache() walks the memory range of size bytes that begins at start_addr in cache line
+ * steps. For every line it issues a CACHE_HIT_INVALIDATE_I and a CACHE_HIT_WRITEBACK_INV_D
+ * cache operation. A size of zero is a no-op.
+ */
+void flush_cache(void *start_addr, unsigned long size)
+{
+	/*
+	 * MIPS cores may have different cache lines. Most common are 16 and 32 bytes. Avoid
+	 * detection routines or multiple implementations and take the lowest known value that
+	 * will fit fine for cores with longer cache lines
+	 */
+
+	unsigned long lsize = 16;
+	unsigned long addr, aend;
+
+	/*
+	 * An empty range has no last byte. Without this check aend would be calculated from
+	 * start_addr - 1 and could end up below addr. The loop would then run far beyond the
+	 * requested range before it finally hits aend.
+	 */
+	if (!size)
+		return;
+
+	/* first and last cache line that are touched by the range */
+	addr = (unsigned long)start_addr & ~(lsize - 1);
+	aend = ((unsigned long)start_addr + size - 1) & ~(lsize - 1);
+
+	while (1) {
+		CACHE_OP(CACHE_HIT_INVALIDATE_I, addr);
+		CACHE_OP(CACHE_HIT_WRITEBACK_INV_D, addr);
+		if (addr == aend)
+			break;
+		addr += lsize;
+	}
+}
+
+void free(void *ptr)
+{
+	/* memory is handed out only once and never taken back, so there is nothing to do */
+	(void)ptr;
+}
+
+/*
+ * memcmp() compares the first count bytes of s1 and s2. It returns zero if they are equal,
+ * otherwise the difference of the first two bytes that differ. Bytes are compared as
+ * unsigned char so that values of 0x80 and above sort after lower values regardless of the
+ * signedness of plain char.
+ */
+int memcmp(const void *s1, const void *s2, size_t count)
+{
+	/* NOTE(review): volatile presumably keeps the compiler from replacing the loop - confirm */
+	volatile unsigned char *p1 = (volatile unsigned char *)s1;
+	volatile unsigned char *p2 = (volatile unsigned char *)s2;
+
+	while (count--) {
+		unsigned char c1 = *p1++;
+		unsigned char c2 = *p2++;
+
+		if (c1 != c2)
+			return (int)c1 - (int)c2;
+	}
+
+	return 0;
+}
+
+void *memmove(void *dst, const void *src, size_t count)
+{
+	volatile char *to = (volatile char *)dst;
+	volatile char *from = (volatile char *)src;
+	size_t i;
+
+	if (to == from)
+		return dst;
+
+	if (to < from) {
+		/* destination below source: copy front to back */
+		for (i = 0; i < count; i++)
+			to[i] = from[i];
+	} else {
+		/* destination above source: copy back to front so overlapping bytes survive */
+		for (i = count; i > 0; i--)
+			to[i - 1] = from[i - 1];
+	}
+
+	return dst;
+}
+
+/*
+ * memcpy() copies count bytes from src to dst and returns dst. The work is delegated to
+ * memmove(), whose result is passed on to the caller.
+ */
+void *memcpy(void *dst, const void *src, size_t count)
+{
+	return memmove(dst, src, count);
+}
+
+/*
+ * memset() fills the first count bytes of dst with the byte value c and returns dst, that is
+ * the start of the filled area and not its end.
+ */
+void *memset(void *dst, int c, size_t count)
+{
+	/* NOTE(review): volatile presumably keeps the compiler from replacing the loop - confirm */
+	volatile char *d = (volatile char *)dst;
+
+	while (count--)
+		*d++ = c;
+
+	return dst;
+}
+
+/*
+ * malloc() hands out size bytes from the heap, starting at the next address that is aligned
+ * to MEMORY_ALIGNMENT. If the request does not fit below _heap_addr_max a message is printed
+ * and board_panic() is called. Memory is never taken back (see free()).
+ */
+void *malloc(size_t size)
+{
+	unsigned int base = (unsigned int)_heap_addr;
+	unsigned int aligned = (base + MEMORY_ALIGNMENT - 1) & ~(MEMORY_ALIGNMENT - 1);
+	void *start = (void *)aligned;
+
+	if ((start + size) > _heap_addr_max) {
+		printf("malloc(%d) failed. Only %dkB of %dkB heap left.\n",
+		       size, (_heap_addr_max - start) >> 10, HEAP_SIZE >> 10);
+		board_panic();
+	}
+
+	/*
+	 * Advance the heap pointer past the end of the returned block. The alignment padding
+	 * must be consumed too, otherwise the next allocation could overlap with this one.
+	 */
+	_heap_addr += (aligned - base) + size;
+
+	return start;
+}
+
+size_t strlen(const char *s)
+{
+	size_t n = 0;
+
+	/* count the characters in front of the terminating NUL */
+	while (s[n] != '\0')
+		n++;
+
+	return n;
+}
diff --git a/target/linux/realtek/image/rt-loader/src/startup.S b/target/linux/realtek/image/rt-loader/src/startup.S
new file mode 100644
index 0000000000..898f7e1a16
--- /dev/null
+++ b/target/linux/realtek/image/rt-loader/src/startup.S
@@ -0,0 +1,182 @@
+# rt-loader assembler startup code
+# (c) 2025 Markus Stockhausen
+
+#include "globals.h"
+
+# This startup code allows position independent code (PIC) to run on bare metal. In that case
+# all addresses are looked up via the global offset table (GOT), which must be filled in during
+# this initialization sequence. Until the GOT is valid, the standard "la" instruction will not
+# work. Provide a macro that computes addresses relative to the load address in $t9 instead.
+
+.macro _LA reg, symbol
+	lui \reg, %hi(\symbol)
+	addi \reg, \reg, %lo(\symbol)
+	add \reg, $t9
+.endm
+
+	.section .text
+	.globl _start
+	.ent _start
+_start:
+	.set noreorder
+
+# Determine current program load address and store it into t9.
+
+	bal	_where_am_i
+	nop
+_where_am_i:
+	move	$t9, $ra
+	subu	$t9, $t9, 0x8
+
+
+# Check if this is our first run (_kernel_load_addr = 0?)
+
+	_LA	$t6, _kernel_load_addr
+	lw	$t7, 0($t6)
+	bne	$zero, $t7, _init_done
+	nop
+
+# During first run store the current load address as the target kernel load address.
+
+	sw	$t9, 0($t6)
+
+# Same for the global variables in the BSS section. Clear them only during the first run. This
+# way the "global program state" can be copied over to the relocation address.
+
+	_LA	$t3, __bss_start
+	_LA	$t4, __bss_end
+_bss_zero:
+	beq	$t3, $t4, _init_done
+	nop
+	sw	$zero, 0($t3)
+	addiu	$t3, $t3, 4
+	b	_bss_zero
+	nop
+
+_init_done:
+
+# Code is running bare metal and no one initializes the global offset table. After the build
+# process the table is relative to address 0x0. Starting from anywhere else breaks the program.
+# A manual update is required during startup. Usually this is quite easy by simply adding the
+# current load address to all entries.
+# But this code relocates itself to another memory address and starts itself over. At the new
+# address it will find a global offset table that fits to the previous execution. To solve this
+# store a copy of the last load address in got_delta variable and only add the difference after
+# a relocation. Sequence is as follows
+#
+# - U-Boot loads the code to 0x80100000
+# - U-Boot runs the code at 0x80100000
+# - code identifies its dynamic start_address = 0x80100000
+# - code reads (initial) _got_delta = 0x00000000
+# - code adds 0x80100000 to all GOT entries
+# - code stores _got_delta with 0x80100000
+# - code copies itself over to a new location 0x85000000
+# - code starts itself from 0x85000000
+# - code identifies its dynamic start_address = 0x85000000
+# - code reads (pre-filled) _got_delta = 0x80100000
+# - code adds 0x4f00000 (= 0x85000000 - 0x80100000) to all GOT entries
+# - ...
+#
+
+	_LA	$t6, _got_delta
+	lw	$t5, 0($t6)
+	subu    $t7, $t9, $t5
+	sw	$t9, 0($t6)
+	_LA	$t3, __got_start
+	_LA	$t4, __got_end
+_got_patch:
+	beq	$t3, $t4, _got_done
+	nop
+	lw	$t5, 0($t3)
+	addu	$t5, $t5, $t7
+	sw	$t5, 0($t3)
+	addiu	$t3, $t3, 4
+	b	_got_patch
+	nop
+_got_done:
+
+# Linker attached kernel to end of package. Store addresses in global variables
+
+	_LA	$t8, _my_load_addr
+	sw	$t9, 0($t8)
+
+	_LA	$t5, __kernel_data_start
+	_LA	$t4, _kernel_data_addr
+	sw	$t5, 0($t4)
+
+	_LA	$t3, __kernel_data_end
+	subu	$t3, $t3, $t5
+	_LA	$t4, _kernel_data_size
+	sw	$t3, 0($t4)
+
+# Determine own code size by looking where BSS ends.
+
+	_LA	$t3, __bss_end
+	subu	$t6, $t3, $t9
+	_LA	$t4, _my_load_size
+	sw	$t6, 0($t4)
+
+# Setup heap. It will start directly behind BSS
+
+	addiu 	$t3, MEMORY_ALIGNMENT
+	li	$t4, ~(MEMORY_ALIGNMENT - 1)
+	and	$t3, $t4
+
+	_LA	$t5, _heap_addr
+	sw	$t3, 0($t5)
+
+	li	$t4, HEAP_SIZE
+	add	$t3, $t4
+
+	_LA	$t5, _heap_addr_max
+	sw	$t3, 0($t5)
+
+# Setup stack that is located on top of heap.
+
+	li	$t4, STACK_SIZE
+	add	$sp, $t3, $t4
+
+# Adapt t9 so it points to main(). This is needed so main() can find the GOT via t9/gp
+
+	_LA     $t8, main
+	move	$t9, $t8
+
+# Call main(). a0-a3 pass through unmodified; kernel location is provided via the globals above.
+	bal	main
+	nop
+
+	.end _start
+
+	.section .data
+	.align 4
+# delta for global offset table initialization
+_got_delta:
+	.word 0
+# current heap address for malloc() / free()
+	.globl _heap_addr
+_heap_addr:
+	.word 0
+# maximum heap address
+	.globl _heap_addr_max
+_heap_addr_max:
+	.word 0
+# current program load address
+	.globl _my_load_addr
+_my_load_addr:
+	.word 0
+# total size of code including attached kernel and bss (uninitialized global variables)
+	.globl _my_load_size
+_my_load_size:
+	.word 0
+# target load address of kernel = this program's address during initial run
+	.globl _kernel_load_addr
+_kernel_load_addr:
+	.word 0
+# absolute start address of attached kernel
+	.globl _kernel_data_addr
+_kernel_data_addr:
+	.word 0
+# size of attached kernel
+	.globl _kernel_data_size
+_kernel_data_size:
+	.word 0
diff --git a/target/linux/realtek/image/rt-loader/src/unlzma.c b/target/linux/realtek/image/rt-loader/src/unlzma.c
new file mode 100644
index 0000000000..a7ddc004ea
--- /dev/null
+++ b/target/linux/realtek/image/rt-loader/src/unlzma.c
@@ -0,0 +1,663 @@
+/* Lzma decompressor for Linux kernel. Shamelessly snarfed
+ *from busybox 1.1.1
+ *
+ *Linux kernel adaptation
+ *Copyright (C) 2006  Alain < alain at knaff.lu >
+ *
+ *Based on small lzma deflate implementation/Small range coder
+ *implementation for lzma.
+ *Copyright (C) 2006  Aurelien Jacobs < aurel at gnuage.org >
+ *
+ *Based on LzmaDecode.c from the LZMA SDK 4.22 (https://www.7-zip.org/)
+ *Copyright (C) 1999-2005  Igor Pavlov
+ *
+ *Copyrights of the parts, see headers below.
+ *
+ *
+ *This program is free software; you can redistribute it and/or
+ *modify it under the terms of the GNU Lesser General Public
+ *License as published by the Free Software Foundation; either
+ *version 2.1 of the License, or (at your option) any later version.
+ *
+ *This program is distributed in the hope that it will be useful,
+ *but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *Lesser General Public License for more details.
+ *
+ *You should have received a copy of the GNU Lesser General Public
+ *License along with this library; if not, write to the Free Software
+ *Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#ifndef MIN
+#define	MIN(a, b) (((a) < (b)) ? (a) : (b))
+#endif
+
+static long long read_int(unsigned char *ptr, int size)
+{
+	int i;
+	long long ret = 0;
+
+	for (i = 0; i < size; i++)
+		ret = (ret << 8) | ptr[size-i-1];
+	return ret;
+}
+
+#define ENDIAN_CONVERT(x) \
+  x = (typeof(x))read_int((unsigned char *)&x, sizeof(x))
+
+
+/* Small range coder implementation for lzma.
+ *Copyright (C) 2006  Aurelien Jacobs < aurel at gnuage.org >
+ *
+ *Based on LzmaDecode.c from the LZMA SDK 4.22 (https://www.7-zip.org/)
+ *Copyright (c) 1999-2005  Igor Pavlov
+ */
+
+#include "memory.h"
+#include <stddef.h>
+#include <stdint.h>
+
+#define LZMA_IOBUF_SIZE	0x10000
+
+struct rc {
+	long (*fill)(void*, unsigned long);
+	uint8_t *ptr;
+	uint8_t *buffer;
+	uint8_t *buffer_end;
+	long buffer_size;
+	uint32_t code;
+	uint32_t range;
+	uint32_t bound;
+	void (*error)(char *);
+};
+
+
+#define RC_TOP_BITS 24
+#define RC_MOVE_BITS 5
+#define RC_MODEL_TOTAL_BITS 11
+
+
+static long nofill(void *buffer, unsigned long len)
+{
+	return -1;
+}
+
+/* Called twice: once at startup and once in rc_normalize() */
+static void rc_read(struct rc *rc)
+{
+	rc->buffer_size = rc->fill((char *)rc->buffer, LZMA_IOBUF_SIZE);
+	if (rc->buffer_size <= 0)
+		rc->error("unexpected EOF");
+	rc->ptr = rc->buffer;
+	rc->buffer_end = rc->buffer + rc->buffer_size;
+}
+
+/* Called once */
+static inline void rc_init(struct rc *rc,
+			   long (*fill)(void*, unsigned long),
+			   char *buffer, long buffer_size)
+{
+	if (fill)
+		rc->fill = fill;
+	else
+		rc->fill = nofill;
+	rc->buffer = (uint8_t *)buffer;
+	rc->buffer_size = buffer_size;
+	rc->buffer_end = rc->buffer + rc->buffer_size;
+	rc->ptr = rc->buffer;
+
+	rc->code = 0;
+	rc->range = 0xFFFFFFFF;
+}
+
+static inline void rc_init_code(struct rc *rc)
+{
+	int i;
+
+	for (i = 0; i < 5; i++) {
+		if (rc->ptr >= rc->buffer_end)
+			rc_read(rc);
+		rc->code = (rc->code << 8) | *rc->ptr++;
+	}
+}
+
+
+/* Called twice, but one callsite is in inline'd rc_is_bit_0_helper() */
+static void rc_do_normalize(struct rc *rc)
+{
+	if (rc->ptr >= rc->buffer_end)
+		rc_read(rc);
+	rc->range <<= 8;
+	rc->code = (rc->code << 8) | *rc->ptr++;
+}
+static inline void rc_normalize(struct rc *rc)
+{
+	if (rc->range < (1 << RC_TOP_BITS))
+		rc_do_normalize(rc);
+}
+
+/* Called 9 times */
+/* Why rc_is_bit_0_helper exists?
+ *Because we want to always expose (rc->code < rc->bound) to optimizer
+ */
+static inline uint32_t rc_is_bit_0_helper(struct rc *rc, uint16_t *p)
+{
+	rc_normalize(rc);
+	rc->bound = *p * (rc->range >> RC_MODEL_TOTAL_BITS);
+	return rc->bound;
+}
+static inline int rc_is_bit_0(struct rc *rc, uint16_t *p)
+{
+	uint32_t t = rc_is_bit_0_helper(rc, p);
+	return rc->code < t;
+}
+
+/* Called ~10 times, but very small, thus inlined */
+static inline void rc_update_bit_0(struct rc *rc, uint16_t *p)
+{
+	rc->range = rc->bound;
+	*p += ((1 << RC_MODEL_TOTAL_BITS) - *p) >> RC_MOVE_BITS;
+}
+static inline void rc_update_bit_1(struct rc *rc, uint16_t *p)
+{
+	rc->range -= rc->bound;
+	rc->code -= rc->bound;
+	*p -= *p >> RC_MOVE_BITS;
+}
+
+/* Called 4 times in unlzma loop */
+static int rc_get_bit(struct rc *rc, uint16_t *p, int *symbol)
+{
+	if (rc_is_bit_0(rc, p)) {
+		rc_update_bit_0(rc, p);
+		*symbol *= 2;
+		return 0;
+	} else {
+		rc_update_bit_1(rc, p);
+		*symbol = *symbol * 2 + 1;
+		return 1;
+	}
+}
+
+/* Called once */
+static inline int rc_direct_bit(struct rc *rc)
+{
+	rc_normalize(rc);
+	rc->range >>= 1;
+	if (rc->code >= rc->range) {
+		rc->code -= rc->range;
+		return 1;
+	}
+	return 0;
+}
+
+/* Called twice */
+static inline void rc_bit_tree_decode(struct rc *rc, uint16_t *p, int num_levels, int *symbol)
+{
+	int i = num_levels;
+
+	*symbol = 1;
+	while (i--)
+		rc_get_bit(rc, p + *symbol, symbol);
+	*symbol -= 1 << num_levels;
+}
+
+
+/*
+ * Small lzma deflate implementation.
+ * Copyright (C) 2006  Aurelien Jacobs < aurel at gnuage.org >
+ *
+ * Based on LzmaDecode.c from the LZMA SDK 4.22 (https://www.7-zip.org/)
+ * Copyright (C) 1999-2005  Igor Pavlov
+ */
+
+
+struct lzma_header {
+	uint8_t pos;
+	uint32_t dict_size;
+	uint64_t dst_size;
+} __attribute__ ((packed)) ;
+
+
+#define LZMA_BASE_SIZE 1846
+#define LZMA_LIT_SIZE 768
+
+#define LZMA_NUM_POS_BITS_MAX 4
+
+#define LZMA_LEN_NUM_LOW_BITS 3
+#define LZMA_LEN_NUM_MID_BITS 3
+#define LZMA_LEN_NUM_HIGH_BITS 8
+
+#define LZMA_LEN_CHOICE 0
+#define LZMA_LEN_CHOICE_2 (LZMA_LEN_CHOICE + 1)
+#define LZMA_LEN_LOW (LZMA_LEN_CHOICE_2 + 1)
+#define LZMA_LEN_MID (LZMA_LEN_LOW \
+		      + (1 << (LZMA_NUM_POS_BITS_MAX + LZMA_LEN_NUM_LOW_BITS)))
+#define LZMA_LEN_HIGH (LZMA_LEN_MID \
+		       +(1 << (LZMA_NUM_POS_BITS_MAX + LZMA_LEN_NUM_MID_BITS)))
+#define LZMA_NUM_LEN_PROBS (LZMA_LEN_HIGH + (1 << LZMA_LEN_NUM_HIGH_BITS))
+
+#define LZMA_NUM_STATES 12
+#define LZMA_NUM_LIT_STATES 7
+
+#define LZMA_START_POS_MODEL_INDEX 4
+#define LZMA_END_POS_MODEL_INDEX 14
+#define LZMA_NUM_FULL_DISTANCES (1 << (LZMA_END_POS_MODEL_INDEX >> 1))
+
+#define LZMA_NUM_POS_SLOT_BITS 6
+#define LZMA_NUM_LEN_TO_POS_STATES 4
+
+#define LZMA_NUM_ALIGN_BITS 4
+
+#define LZMA_MATCH_MIN_LEN 2
+
+#define LZMA_IS_MATCH 0
+#define LZMA_IS_REP (LZMA_IS_MATCH + (LZMA_NUM_STATES << LZMA_NUM_POS_BITS_MAX))
+#define LZMA_IS_REP_G0 (LZMA_IS_REP + LZMA_NUM_STATES)
+#define LZMA_IS_REP_G1 (LZMA_IS_REP_G0 + LZMA_NUM_STATES)
+#define LZMA_IS_REP_G2 (LZMA_IS_REP_G1 + LZMA_NUM_STATES)
+#define LZMA_IS_REP_0_LONG (LZMA_IS_REP_G2 + LZMA_NUM_STATES)
+#define LZMA_POS_SLOT (LZMA_IS_REP_0_LONG \
+		       + (LZMA_NUM_STATES << LZMA_NUM_POS_BITS_MAX))
+#define LZMA_SPEC_POS (LZMA_POS_SLOT \
+		       +(LZMA_NUM_LEN_TO_POS_STATES << LZMA_NUM_POS_SLOT_BITS))
+#define LZMA_ALIGN (LZMA_SPEC_POS \
+		    + LZMA_NUM_FULL_DISTANCES - LZMA_END_POS_MODEL_INDEX)
+#define LZMA_LEN_CODER (LZMA_ALIGN + (1 << LZMA_NUM_ALIGN_BITS))
+#define LZMA_REP_LEN_CODER (LZMA_LEN_CODER + LZMA_NUM_LEN_PROBS)
+#define LZMA_LITERAL (LZMA_REP_LEN_CODER + LZMA_NUM_LEN_PROBS)
+
+
+struct writer {
+	uint8_t *buffer;
+	uint8_t previous_byte;
+	size_t buffer_pos;
+	int bufsize;
+	size_t global_pos;
+	long (*flush)(void*, unsigned long);
+	struct lzma_header *header;
+};
+
+struct cstate {
+	int state;
+	uint32_t rep0, rep1, rep2, rep3;
+};
+
+static inline size_t get_pos(struct writer *wr)
+{
+	return
+		wr->global_pos + wr->buffer_pos;
+}
+
+static inline uint8_t peek_old_byte(struct writer *wr, uint32_t offs)
+{
+	if (!wr->flush) {
+		int32_t pos;
+		while (offs > wr->header->dict_size)
+			offs -= wr->header->dict_size;
+		pos = wr->buffer_pos - offs;
+		return wr->buffer[pos];
+	} else {
+		uint32_t pos = wr->buffer_pos - offs;
+		while (pos >= wr->header->dict_size)
+			pos += wr->header->dict_size;
+		return wr->buffer[pos];
+	}
+
+}
+
+static inline int write_byte(struct writer *wr, uint8_t byte)
+{
+	wr->buffer[wr->buffer_pos++] = wr->previous_byte = byte;
+	if (wr->flush && wr->buffer_pos == wr->header->dict_size) {
+		wr->buffer_pos = 0;
+		wr->global_pos += wr->header->dict_size;
+		if (wr->flush((char *)wr->buffer, wr->header->dict_size)
+				!= wr->header->dict_size)
+			return -1;
+	}
+	return 0;
+}
+
+
+static inline int copy_byte(struct writer *wr, uint32_t offs)
+{
+	return write_byte(wr, peek_old_byte(wr, offs));
+}
+
+static inline int copy_bytes(struct writer *wr,
+					 uint32_t rep0, int len)
+{
+	do {
+		if (copy_byte(wr, rep0))
+			return -1;
+		len--;
+	} while (len != 0 && wr->buffer_pos < wr->header->dst_size);
+
+	return len;
+}
+
+static inline int process_bit0(struct writer *wr, struct rc *rc,
+				     struct cstate *cst, uint16_t *p,
+				     int pos_state, uint16_t *prob,
+				     int lc, uint32_t literal_pos_mask) {
+	int mi = 1;
+	rc_update_bit_0(rc, prob);
+	prob = (p + LZMA_LITERAL +
+		(LZMA_LIT_SIZE
+		 * (((get_pos(wr) & literal_pos_mask) << lc)
+		    + (wr->previous_byte >> (8 - lc))))
+		);
+
+	if (cst->state >= LZMA_NUM_LIT_STATES) {
+		int match_byte = peek_old_byte(wr, cst->rep0);
+		do {
+			int bit;
+			uint16_t *prob_lit;
+
+			match_byte <<= 1;
+			bit = match_byte & 0x100;
+			prob_lit = prob + 0x100 + bit + mi;
+			if (rc_get_bit(rc, prob_lit, &mi)) {
+				if (!bit)
+					break;
+			} else {
+				if (bit)
+					break;
+			}
+		} while (mi < 0x100);
+	}
+	while (mi < 0x100) {
+		uint16_t *prob_lit = prob + mi;
+		rc_get_bit(rc, prob_lit, &mi);
+	}
+	if (cst->state < 4)
+		cst->state = 0;
+	else if (cst->state < 10)
+		cst->state -= 3;
+	else
+		cst->state -= 6;
+
+	return write_byte(wr, mi);
+}
+
+static inline int process_bit1(struct writer *wr, struct rc *rc,
+			       struct cstate *cst, uint16_t *p,
+			       int pos_state, uint16_t *prob) {
+	int offset;
+	uint16_t *prob_len;
+	int num_bits;
+	int len;
+
+	rc_update_bit_1(rc, prob);
+	prob = p + LZMA_IS_REP + cst->state;
+	if (rc_is_bit_0(rc, prob)) {
+		rc_update_bit_0(rc, prob);
+		cst->rep3 = cst->rep2;
+		cst->rep2 = cst->rep1;
+		cst->rep1 = cst->rep0;
+		cst->state = cst->state < LZMA_NUM_LIT_STATES ? 0 : 3;
+		prob = p + LZMA_LEN_CODER;
+	} else {
+		rc_update_bit_1(rc, prob);
+		prob = p + LZMA_IS_REP_G0 + cst->state;
+		if (rc_is_bit_0(rc, prob)) {
+			rc_update_bit_0(rc, prob);
+			prob = (p + LZMA_IS_REP_0_LONG
+				+ (cst->state <<
+				   LZMA_NUM_POS_BITS_MAX) +
+				pos_state);
+			if (rc_is_bit_0(rc, prob)) {
+				rc_update_bit_0(rc, prob);
+
+				cst->state = cst->state < LZMA_NUM_LIT_STATES ?
+					9 : 11;
+				return copy_byte(wr, cst->rep0);
+			} else {
+				rc_update_bit_1(rc, prob);
+			}
+		} else {
+			uint32_t distance;
+
+			rc_update_bit_1(rc, prob);
+			prob = p + LZMA_IS_REP_G1 + cst->state;
+			if (rc_is_bit_0(rc, prob)) {
+				rc_update_bit_0(rc, prob);
+				distance = cst->rep1;
+			} else {
+				rc_update_bit_1(rc, prob);
+				prob = p + LZMA_IS_REP_G2 + cst->state;
+				if (rc_is_bit_0(rc, prob)) {
+					rc_update_bit_0(rc, prob);
+					distance = cst->rep2;
+				} else {
+					rc_update_bit_1(rc, prob);
+					distance = cst->rep3;
+					cst->rep3 = cst->rep2;
+				}
+				cst->rep2 = cst->rep1;
+			}
+			cst->rep1 = cst->rep0;
+			cst->rep0 = distance;
+		}
+		cst->state = cst->state < LZMA_NUM_LIT_STATES ? 8 : 11;
+		prob = p + LZMA_REP_LEN_CODER;
+	}
+
+	prob_len = prob + LZMA_LEN_CHOICE;
+	if (rc_is_bit_0(rc, prob_len)) {
+		rc_update_bit_0(rc, prob_len);
+		prob_len = (prob + LZMA_LEN_LOW
+			    + (pos_state <<
+			       LZMA_LEN_NUM_LOW_BITS));
+		offset = 0;
+		num_bits = LZMA_LEN_NUM_LOW_BITS;
+	} else {
+		rc_update_bit_1(rc, prob_len);
+		prob_len = prob + LZMA_LEN_CHOICE_2;
+		if (rc_is_bit_0(rc, prob_len)) {
+			rc_update_bit_0(rc, prob_len);
+			prob_len = (prob + LZMA_LEN_MID
+				    + (pos_state <<
+				       LZMA_LEN_NUM_MID_BITS));
+			offset = 1 << LZMA_LEN_NUM_LOW_BITS;
+			num_bits = LZMA_LEN_NUM_MID_BITS;
+		} else {
+			rc_update_bit_1(rc, prob_len);
+			prob_len = prob + LZMA_LEN_HIGH;
+			offset = ((1 << LZMA_LEN_NUM_LOW_BITS)
+				  + (1 << LZMA_LEN_NUM_MID_BITS));
+			num_bits = LZMA_LEN_NUM_HIGH_BITS;
+		}
+	}
+
+	rc_bit_tree_decode(rc, prob_len, num_bits, &len);
+	len += offset;
+
+	if (cst->state < 4) {
+		int pos_slot;
+
+		cst->state += LZMA_NUM_LIT_STATES;
+		prob =
+			p + LZMA_POS_SLOT +
+			((len <
+			  LZMA_NUM_LEN_TO_POS_STATES ? len :
+			  LZMA_NUM_LEN_TO_POS_STATES - 1)
+			 << LZMA_NUM_POS_SLOT_BITS);
+		rc_bit_tree_decode(rc, prob,
+				   LZMA_NUM_POS_SLOT_BITS,
+				   &pos_slot);
+		if (pos_slot >= LZMA_START_POS_MODEL_INDEX) {
+			int i, mi;
+			num_bits = (pos_slot >> 1) - 1;
+			cst->rep0 = 2 | (pos_slot & 1);
+			if (pos_slot < LZMA_END_POS_MODEL_INDEX) {
+				cst->rep0 <<= num_bits;
+				prob = p + LZMA_SPEC_POS +
+					cst->rep0 - pos_slot - 1;
+			} else {
+				num_bits -= LZMA_NUM_ALIGN_BITS;
+				while (num_bits--)
+					cst->rep0 = (cst->rep0 << 1) |
+						rc_direct_bit(rc);
+				prob = p + LZMA_ALIGN;
+				cst->rep0 <<= LZMA_NUM_ALIGN_BITS;
+				num_bits = LZMA_NUM_ALIGN_BITS;
+			}
+			i = 1;
+			mi = 1;
+			while (num_bits--) {
+				if (rc_get_bit(rc, prob + mi, &mi))
+					cst->rep0 |= i;
+				i <<= 1;
+			}
+		} else
+			cst->rep0 = pos_slot;
+		if (++(cst->rep0) == 0)
+			return 0;
+		if (cst->rep0 > wr->header->dict_size
+				|| cst->rep0 > get_pos(wr))
+			return -1;
+	}
+
+	len += LZMA_MATCH_MIN_LEN;
+
+	return copy_bytes(wr, cst->rep0, len);
+}
+
+
+
+int unlzma(unsigned char *buf, long in_len,
+	   long (*fill)(void*, unsigned long),
+	   long (*flush)(void*, unsigned long),
+	   unsigned char *output,
+	   long *outlen,
+	   long *posp,
+	   void(*error)(char *x))
+{
+	struct lzma_header header;
+	int lc, pb, lp;
+	uint32_t pos_state_mask;
+	uint32_t literal_pos_mask;
+	uint16_t *p;
+	int num_probs;
+	struct rc rc;
+	int i, mi;
+	struct writer wr;
+	struct cstate cst;
+	unsigned char *inbuf;
+	int ret = -1;
+
+	rc.error = error;
+
+	if (buf)
+		inbuf = buf;
+	else
+		inbuf = malloc(LZMA_IOBUF_SIZE);
+	if (!inbuf) {
+		error("Could not allocate input buffer");
+		goto exit_0;
+	}
+
+	cst.state = 0;
+	cst.rep0 = cst.rep1 = cst.rep2 = cst.rep3 = 1;
+
+	wr.header = &header;
+	wr.flush = flush;
+	wr.global_pos = 0;
+	wr.previous_byte = 0;
+	wr.buffer_pos = 0;
+
+	rc_init(&rc, fill, inbuf, in_len);
+
+	for (i = 0; i < sizeof(header); i++) {
+		if (rc.ptr >= rc.buffer_end)
+			rc_read(&rc);
+		((unsigned char *)&header)[i] = *rc.ptr++;
+	}
+
+	if (header.pos >= (9 * 5 * 5)) {
+		error("bad header");
+		goto exit_1;
+	}
+
+	mi = 0;
+	lc = header.pos;
+	while (lc >= 9) {
+		mi++;
+		lc -= 9;
+	}
+	pb = 0;
+	lp = mi;
+	while (lp >= 5) {
+		pb++;
+		lp -= 5;
+	}
+	pos_state_mask = (1 << pb) - 1;
+	literal_pos_mask = (1 << lp) - 1;
+
+	ENDIAN_CONVERT(header.dict_size);
+	ENDIAN_CONVERT(header.dst_size);
+
+	if (header.dict_size == 0)
+		header.dict_size = 1;
+
+	if (output)
+		wr.buffer = output;
+	else {
+		wr.bufsize = MIN(header.dst_size, header.dict_size);
+		wr.buffer = malloc(wr.bufsize);
+	}
+	if (wr.buffer == NULL)
+		goto exit_1;
+
+	num_probs = LZMA_BASE_SIZE + (LZMA_LIT_SIZE << (lc + lp));
+	p = (uint16_t *) malloc(num_probs * sizeof(*p));
+	if (p == NULL)
+		goto exit_2;
+	num_probs = LZMA_LITERAL + (LZMA_LIT_SIZE << (lc + lp));
+	for (i = 0; i < num_probs; i++)
+		p[i] = (1 << RC_MODEL_TOTAL_BITS) >> 1;
+
+	rc_init_code(&rc);
+
+	while (get_pos(&wr) < header.dst_size) {
+		int pos_state =	get_pos(&wr) & pos_state_mask;
+		uint16_t *prob = p + LZMA_IS_MATCH +
+			(cst.state << LZMA_NUM_POS_BITS_MAX) + pos_state;
+		if (rc_is_bit_0(&rc, prob)) {
+			if (process_bit0(&wr, &rc, &cst, p, pos_state, prob,
+					lc, literal_pos_mask)) {
+				error("LZMA data is corrupt");
+				goto exit_3;
+			}
+		} else {
+			if (process_bit1(&wr, &rc, &cst, p, pos_state, prob)) {
+				error("LZMA data is corrupt");
+				goto exit_3;
+			}
+			if (cst.rep0 == 0)
+				break;
+		}
+		if (rc.buffer_size <= 0)
+			goto exit_3;
+	}
+
+	*outlen = get_pos(&wr);
+
+	if (posp)
+		*posp = rc.ptr-rc.buffer;
+	if (!wr.flush || wr.flush(wr.buffer, wr.buffer_pos) == wr.buffer_pos)
+		ret = 0;
+exit_3:
+	free(p);
+exit_2:
+	if (!output)
+		free(wr.buffer);
+exit_1:
+	if (!buf)
+		free(inbuf);
+exit_0:
+	return ret;
+}