[RFC] IXP4xx little-endian data-coherent support

Krzysztof Halasa khc at pm.waw.pl
Thu Mar 25 20:35:52 EDT 2010


Hi,

I'm finally trying to eliminate the performance hit caused by IXP4xx NPE
network engines working in big-endian mode only (which means on LE
system network buffers have to be byte-swapped by the CPU).

I have already booted Linux in LE-DC mode and it generally works, and
the remaining changes needed seem trivial, though a few questions
remain.

The LE data-coherent mode on IXP4xx is achieved by setting a certain bit
(9) in first level page table descriptors. This means we can control
data-coherent vs value-coherent with 1 MB of virtual address space
granularity (LE DC is just hardware byte-swapping).

Options:

1. use DC mode on whole virtual address space.
Pro: simplicity
Con: using value-coherent mode for certain (most) peripherals
(registers) is faster, since there is no byte-unswapping to be done.

2. use DC mode for most devices (including RAM and PCI address space
(but not PCI controller registers)), and value-coherent mode for the
peripherals.
Pro: faster
Con: we have to provide different mappings ("memory types"?) for
different devices. It could use something like ioremap_byteswapped()
- does it make sense?

Another possibility for #2: map all those peripherals at boot (we're
already doing it except for a small QMgr region), set their page table
entry bit statically in MMU code (= value-coherent), then everything
else is DC. Much simpler, though it means the ROM area (EXP bus) has to
be mapped DC as well - perhaps an advantage (a few drivers, including
the MTD and IDE drivers, have to be modified).
That's what I personally prefer at this point.


Another question is entering the DC mode. It's only possible with MMU,
so the "boot" code has to be value-coherent. Then, just before enabling
the MMU, all active memory resources (such as the page table, the boot
loader tags, kernel image and possibly the external initramfs) have to
be byte-swapped. Possibly, the kernel and/or external initramfs can be
pre-swapped (only parts which are running with MMU - not sure if it's
practical with the kernel). I'm currently just byte-swapping the entire
RAM except for a small area in which the swapping code (and MMU-on)
resides. Guess I could use a "trampoline" in QMgr SRAM area (with a
"section" mapping for simplicity).

Comments?

I'm attaching a working patch (core only, no drivers), it isn't pretty
but I think it shows the idea. It needs another trivial patch which adds
Kbuild option IXP4XX_SUPPORT_425A0 (LE data-coherent mode requires
IXP425 stepping B0 or later CPU).
-- 
Krzysztof Halasa

--- a/arch/arm/mm/Kconfig
+++ b/arch/arm/mm/Kconfig
@@ -649,6 +649,16 @@ config CPU_ENDIAN_BE32
 	help
 	  Support for the BE-32 (big-endian) mode on pre-ARMv6 processors.
 
+config CPU_LITTLE_ENDIAN_DATA_COHERENT
+	bool "Data-coherent CPU mode"
+	depends on !CPU_BIG_ENDIAN && ARCH_IXP4XX && !IXP4XX_SUPPORT_425A0
+	help
+	  Use data-coherent mode to access peripherals. This will improve
+	  performance of certain Ethernet and WAN drivers, at the cost of
+	  added complexity. Not very well tested.
+
+	  If unsure, say "N".
+
 config CPU_HIGH_VECTOR
 	depends on !MMU && CPU_CP15 && !CPU_ARM740T
 	bool "Select the High exception vector"

Perhaps MT_DEVICE_VALUE_COHERENT isn't the best name - maybe MT_DEVICE_BYTESWAPPED?
--- a/arch/arm/include/asm/mach/map.h
+++ b/arch/arm/include/asm/mach/map.h
@@ -27,6 +27,7 @@ struct map_desc {
 #define MT_MEMORY		9
 #define MT_ROM			10
 #define MT_MEMORY_NONCACHED	11
+#define MT_DEVICE_VALUE_COHERENT 12
 
 #ifdef CONFIG_MMU
 extern void iotable_init(struct map_desc *, int);



The following also removes PMD_BIT4, it seems XScale wants it cleared:
--- a/arch/arm/include/asm/pgalloc.h
+++ b/arch/arm/include/asm/pgalloc.h
@@ -20,8 +20,15 @@
 
 #ifdef CONFIG_MMU
 
+
+#if defined(CONFIG_ARCH_IXP4XX) && defined(CONFIG_CPU_LITTLE_ENDIAN_DATA_COHERENT)
+#define _PAGE_USER_TABLE	(PMD_TYPE_TABLE | PMD_DOMAIN(DOMAIN_USER) | (1 << 9))
+#define _PAGE_KERNEL_TABLE	(PMD_TYPE_TABLE | PMD_DOMAIN(DOMAIN_KERNEL) | (1 << 9))
+#else
 #define _PAGE_USER_TABLE	(PMD_TYPE_TABLE | PMD_BIT4 | PMD_DOMAIN(DOMAIN_USER))
 #define _PAGE_KERNEL_TABLE	(PMD_TYPE_TABLE | PMD_BIT4 | PMD_DOMAIN(DOMAIN_KERNEL))
+#endif
+
 
 /*
  * Since we have only two-level page tables, these are trivial



Since our RAM is big-endian, we have to swap PCI accesses:
--- a/arch/arm/mach-ixp4xx/common-pci.c
+++ b/arch/arm/mach-ixp4xx/common-pci.c
@@ -415,6 +415,6 @@ void __init ixp4xx_pci_preinit(void)
 	 * little-endian PCI and the big-endian AHB bus
 	 */
-#ifdef __ARMEB__
+#if defined(__ARMEB__) || defined(CONFIG_CPU_LITTLE_ENDIAN_DATA_COHERENT)
 	*PCI_CSR = PCI_CSR_IC | PCI_CSR_ABE | PCI_CSR_PDS | PCI_CSR_ADS;
 #else
 	*PCI_CSR = PCI_CSR_IC | PCI_CSR_ABE;
--- a/arch/arm/mach-ixp4xx/common.c
+++ b/arch/arm/mach-ixp4xx/common.c
@@ -53,24 +53,29 @@ static struct map_desc ixp4xx_io_desc[] __initdata = {
 		.virtual	= IXP4XX_PERIPHERAL_BASE_VIRT,
 		.pfn		= __phys_to_pfn(IXP4XX_PERIPHERAL_BASE_PHYS),
 		.length		= IXP4XX_PERIPHERAL_REGION_SIZE,
-		.type		= MT_DEVICE
+		.type		= MT_DEVICE_VALUE_COHERENT
 	}, {	/* Expansion Bus Config Registers */
 		.virtual	= IXP4XX_EXP_CFG_BASE_VIRT,
 		.pfn		= __phys_to_pfn(IXP4XX_EXP_CFG_BASE_PHYS),
 		.length		= IXP4XX_EXP_CFG_REGION_SIZE,
-		.type		= MT_DEVICE
+		.type		= MT_DEVICE_VALUE_COHERENT
 	}, {	/* PCI Registers */
 		.virtual	= IXP4XX_PCI_CFG_BASE_VIRT,
 		.pfn		= __phys_to_pfn(IXP4XX_PCI_CFG_BASE_PHYS),
 		.length		= IXP4XX_PCI_CFG_REGION_SIZE,
-		.type		= MT_DEVICE
+		.type		= MT_DEVICE_VALUE_COHERENT
+	}, {	/* Queue Manager */
+		.virtual	= IXP4XX_QMGR_BASE_VIRT,
+		.pfn		= __phys_to_pfn(IXP4XX_QMGR_BASE_PHYS),
+		.length		= IXP4XX_QMGR_REGION_SIZE,
+		.type		= MT_DEVICE_VALUE_COHERENT
 	},
 #ifdef CONFIG_DEBUG_LL
 	{	/* Debug UART mapping */
 		.virtual	= IXP4XX_DEBUG_UART_BASE_VIRT,
 		.pfn		= __phys_to_pfn(IXP4XX_DEBUG_UART_BASE_PHYS),
 		.length		= IXP4XX_DEBUG_UART_REGION_SIZE,
-		.type		= MT_DEVICE
+		.type		= MT_DEVICE_VALUE_COHERENT
 	}
 #endif
 };
--- a/arch/arm/mach-ixp4xx/include/mach/ixp4xx-regs.h
+++ b/arch/arm/mach-ixp4xx/include/mach/ixp4xx-regs.h
@@ -30,19 +30,24 @@
  *
  * 0x50000000	0x10000000	ioremap'd	EXP BUS
  *
- * 0x6000000	0x00004000	ioremap'd	QMgr
+ * 0xFFA00000 -> 0xFFBFFFFF is value-preserving in little-endian mode
  *
- * 0xC0000000	0x00001000	0xffbff000	PCI CFG
+ * 0x60000000	0x00004000	0xffbe7000	QMgr
+ *
+ * 0xC8000000	0x00013000	0xffbeb000	On-Chip Peripherals
  *
  * 0xC4000000	0x00001000	0xffbfe000	EXP CFG
  *
- * 0xC8000000	0x00013000	0xffbeb000	On-Chip Peripherals
+ * 0xC0000000	0x00001000	0xffbff000	PCI CFG
+ *
+ * (this should end on 0xFEFFFFFF, only VMALLOC_END -> 0xFEFFFFFF is for platform usage)
  */
 
 /*
  * Queue Manager
  */
 #define IXP4XX_QMGR_BASE_PHYS		(0x60000000)
+#define IXP4XX_QMGR_BASE_VIRT		(0xFFBE7000)
 #define IXP4XX_QMGR_REGION_SIZE		(0x00004000)
 
 /*
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -257,6 +257,13 @@ static struct mem_type mem_types[] = {
 		.prot_sect = PMD_TYPE_SECT | PMD_SECT_AP_WRITE,
 		.domain    = DOMAIN_KERNEL,
 	},
+	[MT_DEVICE_VALUE_COHERENT] = {
+		.prot_pte	= PROT_PTE_DEVICE | L_PTE_MT_DEV_SHARED |
+				  L_PTE_SHARED,
+		.prot_l1	= PMD_TYPE_TABLE,
+		.prot_sect	= PROT_SECT_DEVICE | PMD_SECT_S,
+		.domain		= DOMAIN_IO,
+	},
 };
 
 const struct mem_type *get_mem_type(unsigned int type)
@@ -315,6 +322,12 @@ static void __init build_mem_type_table(void)
 		for (i = 0; i < ARRAY_SIZE(mem_types); i++) {
 			mem_types[i].prot_sect &= ~PMD_BIT4;
 			mem_types[i].prot_l1 &= ~PMD_BIT4;
+#if defined(CONFIG_ARCH_IXP4XX) && defined(CONFIG_CPU_LITTLE_ENDIAN_DATA_COHERENT)
+			if (i != MT_DEVICE_VALUE_COHERENT) {
+				mem_types[i].prot_l1 |= 1 << 9;
+				mem_types[i].prot_sect |= 1 << 9;
+			}
+#endif
 		}
 	} else if (cpu_arch < CPU_ARCH_ARMv6) {
 		for (i = 0; i < ARRAY_SIZE(mem_types); i++) {
diff --git a/arch/arm/mm/proc-xscale.S b/arch/arm/mm/proc-xscale.S
index 93df472..796eb87 100644
--- a/arch/arm/mm/proc-xscale.S
+++ b/arch/arm/mm/proc-xscale.S
@@ -23,11 +23,13 @@
 #include <linux/linkage.h>
 #include <linux/init.h>
 #include <asm/assembler.h>
+#include <asm/domain.h>
 #include <asm/hwcap.h>
 #include <asm/pgtable.h>
 #include <asm/pgtable-hwdef.h>
 #include <asm/page.h>
 #include <asm/ptrace.h>
+#include <asm/system.h>
 #include "proc-macros.S"
 
 /*
@@ -480,6 +482,64 @@ __xscale_setup:
 	mcr	p15, 0, ip, c7, c7, 0		@ invalidate I, D caches & BTB
 	mcr	p15, 0, ip, c7, c10, 4		@ Drain Write (& Fill) Buffer
 	mcr	p15, 0, ip, c8, c7, 0		@ invalidate I, D TLBs
+#ifndef __ARMEB__
+	mrc	p15, 0, r0, c1, c0, 1
+	orr	r0, r0, #2			@ set the page table P bit
+	mcr	p15, 0, r0, c1, c0, 1
+
+#define TRAMPOLINE 0x200
+
+	/*
+	 * Create identity mapping for on-chip Queue Manager SRAM to cater for
+	 * the MMU enable.  This identity mapping will be removed by
+	 * paging_init().  We use our current program counter to determine
+	 * corresponding section base address.
+	 */
+#if 0
+	mov	r0, #0x60000000
+	orr	r0, #0x00000C00
+	orr	r0, #0x0000000E
+#endif
+	mov	r0, #0x00000C00
+	orr	r0, #0x0000000E
+	@ add	r3, r4, #0x600 << 2		@ r4 = page table address
+	str	r0, [r4]			@ identity mapping @ 0x60002100
+
+	adr	r6, BSYM(__xscale_setup_moved)
+	@mov	r7, #0x60000000
+	@orr	r7, #0x00002100
+	mov	r7, #TRAMPOLINE
+	mov	r3, #0x100
+1:	ldr	r0, [r6], #4
+	str	r0, [r7], #4
+	subs	r3, r3, #1
+	bne	1b
+	@mov	r7, #0x60000000
+	@orr	r7, #0x00002100
+	mov	r7, #TRAMPOLINE
+	mov	pc, r7
+
+__xscale_setup_moved:
+	mov	r6, #0				@ base address to swap
+2:	ldr	r0, [r6]
+	eor	r7, r0, r0, ror #16
+	bic	r7, r7, #0x00ff0000
+	mov	r0, r0, ror #8
+	eor	r0, r0, r7, lsr #8
+	str	r0, [r6], #4
+	cmp	r6, #0x200			@ end address
+	bne	2b
+
+	mov	r6, #0x400			@ base address to swap
+3:	ldr	r0, [r6]
+	eor	r7, r0, r0, ror #16
+	bic	r7, r7, #0x00ff0000
+	mov	r0, r0, ror #8
+	eor	r0, r0, r7, lsr #8
+	str	r0, [r6], #4
+	cmp	r6, #64 * 1024 * 1024		@ end address
+	bne	3b
+#endif
 	mov	r0, #1 << 6			@ cp6 for IOP3xx and Bulverde
 	orr	r0, r0, #1 << 13		@ Its undefined whether this
 	mcr	p15, 0, r0, c15, c1, 0		@ affects USR or SVC modes
@@ -489,7 +549,57 @@ __xscale_setup:
 	mrc	p15, 0, r0, c1, c0, 0		@ get control register
 	bic	r0, r0, r5
 	orr	r0, r0, r6
+
+#ifdef __ARMEB__
 	mov	pc, lr
+#else
+/*
+ * Setup common bits before finally enabling the MMU.  Essentially
+ * this is just loading the page table pointer and domain access
+ * registers.
+ */
+#ifdef CONFIG_ALIGNMENT_TRAP
+	orr	r0, r0, #CR_A
+#else
+	bic	r0, r0, #CR_A
+#endif
+#ifdef CONFIG_CPU_DCACHE_DISABLE
+	bic	r0, r0, #CR_C
+#endif
+#ifdef CONFIG_CPU_BPREDICT_DISABLE
+	bic	r0, r0, #CR_Z
+#endif
+#ifdef CONFIG_CPU_ICACHE_DISABLE
+	bic	r0, r0, #CR_I
+#endif
+	mov	r5, #(domain_val(DOMAIN_USER, DOMAIN_MANAGER) | \
+		      domain_val(DOMAIN_KERNEL, DOMAIN_MANAGER) | \
+		      domain_val(DOMAIN_TABLE, DOMAIN_MANAGER) | \
+		      domain_val(DOMAIN_IO, DOMAIN_CLIENT))
+	mcr	p15, 0, r5, c3, c0, 0		@ load domain access register
+	mcr	p15, 0, r4, c2, c0, 0		@ load page table pointer
+	b	__xscale_turn_mmu_on
+
+/*
+ * Enable the MMU.  This completely changes the structure of the visible
+ * memory space.  You will not be able to trace execution through this.
+ * If you have an enquiry about this, *please* check the linux-arm-kernel
+ * mailing list archives BEFORE sending another post to the list.
+ *
+ *  r0  = cp#15 control register
+ *  r13 = *virtual* address to jump to upon completion
+ *
+ * other registers depend on the function called upon completion
+ */
+	.align	5
+__xscale_turn_mmu_on:
+	mov	r0, r0
+	mcr	p15, 0, r0, c1, c0, 0		@ write control reg
+	mrc	p15, 0, r3, c0, c0, 0		@ read id reg
+	mov	r3, r3
+	mov	r3, r13
+	mov	pc, r3
+#endif
 	.size	__xscale_setup, . - __xscale_setup
 
 	/*
@@ -823,7 +933,7 @@ __ixp42x_proc_info:
 		PMD_SECT_BUFFERABLE | \
 		PMD_SECT_CACHEABLE | \
 		PMD_SECT_AP_WRITE | \
-		PMD_SECT_AP_READ
+		PMD_SECT_AP_READ | (1 << 9)
 	.long   PMD_TYPE_SECT | \
 		PMD_SECT_AP_WRITE | \
 		PMD_SECT_AP_READ



More information about the linux-arm-kernel mailing list