performance counter support for ARM architecture
nelakurthi koteswararao
koteswararao18 at gmail.com
Wed Sep 30 03:26:18 EDT 2009
Dear all,
I will change the naming conventions and symbolic names once the perf counter
support for ARM is complete. In the meantime I want to post intermediate
releases for review.

1. With the attached patch, together with a small test application, I am able
to count page faults on ARM (this is against the linux-2.6.29 kernel). A
minimal user-space sketch of the new syscall follows the sample output below.
-bash-3.2# ./perf stat ./array
Performance counter stats for './array':

      2005.297192  task-clock-msecs     #     0.998 CPUs
                7  context-switches     #     0.000 M/sec
                0  CPU-migrations       #     0.000 M/sec
               76  page-faults          #     0.000 M/sec
    <not counted>  cycles
    <not counted>  instructions
    <not counted>  cache-references
    <not counted>  cache-misses

      2.009101297  seconds time elapsed
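For anyone who wants to exercise the new syscall without going through the
perf tool, here is a minimal user-space sketch (illustration only, not part of
the patch). It assumes an EABI build where __NR_perf_counter_open resolves to
the raw number 361 added in unistd.h below, and that the kernel's
<linux/perf_counter.h> (which defines struct perf_counter_attr,
PERF_TYPE_SOFTWARE and PERF_COUNT_SW_PAGE_FAULTS) is reachable from user space:

/*
 * Sketch only: count this task's page faults via the new
 * perf_counter_open syscall.  The raw syscall number 361 is an
 * assumption for an ARM EABI build; on other setups take it from
 * the patched asm/unistd.h instead.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_counter.h>

#ifndef __NR_perf_counter_open
#define __NR_perf_counter_open 361
#endif

int main(void)
{
	struct perf_counter_attr attr;
	unsigned long long count;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_PAGE_FAULTS;

	/* pid = 0: current task, cpu = -1: any CPU, no group, no flags */
	fd = syscall(__NR_perf_counter_open, &attr, 0, -1, -1, 0);
	if (fd < 0) {
		perror("perf_counter_open");
		return 1;
	}

	/* ... run the workload to be measured here ... */

	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("page faults: %llu\n", count);
	close(fd);
	return 0;
}

The counter is enabled from creation (attr.disabled is left at 0), so a single
read() at the end returns the total number of page faults taken by the task.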
Please look at it and give your review comments.
Regards,
Koteswararao.
-------------- next part --------------
Support performance counters for the ARM architecture.
ChangeLog:
2009/09/24
Location: Linux-2.6.29.y-BRANCH_SS
refs #6657
First changelog version.
---
arch/arm/Kconfig | 3 3 + 0 - 0 !
arch/arm/include/asm/atomic.h | 1 1 + 0 - 0 !
arch/arm/include/asm/perf_counter.h | 8 8 + 0 - 0 !
arch/arm/include/asm/unistd.h | 3 2 + 1 - 0 !
arch/arm/kernel/calls.S | 1 1 + 0 - 0 !
arch/arm/mm/fault.c | 10 9 + 1 - 0 !
include/asm-generic/atomic64.h | 42 42 + 0 - 0 !
lib/Kconfig | 6 6 + 0 - 0 !
lib/Makefile | 2 2 + 0 - 0 !
lib/atomic64.c | 175 175 + 0 - 0 !
tools/perf/perf.h | 6 6 + 0 - 0 !
11 files changed, 255 insertions(+), 2 deletions(-)
Index: b/arch/arm/include/asm/unistd.h
===================================================================
--- a/arch/arm/include/asm/unistd.h
+++ b/arch/arm/include/asm/unistd.h
@@ -387,8 +387,9 @@
#define __NR_dup3 (__NR_SYSCALL_BASE+358)
#define __NR_pipe2 (__NR_SYSCALL_BASE+359)
#define __NR_inotify_init1 (__NR_SYSCALL_BASE+360)
+#define __NR_perf_counter_open (__NR_SYSCALL_BASE+361)
-#define __NR_syscall_max 361
+#define __NR_syscall_max 362
/*
* The following SWIs are ARM private.
Index: b/arch/arm/kernel/calls.S
===================================================================
--- a/arch/arm/kernel/calls.S
+++ b/arch/arm/kernel/calls.S
@@ -370,6 +370,7 @@
CALL(sys_dup3)
CALL(sys_pipe2)
/* 360 */ CALL(sys_inotify_init1)
+ CALL(sys_perf_counter_open)
#ifndef syscalls_counted
.equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls
#define syscalls_counted
Index: b/tools/perf/perf.h
===================================================================
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -25,6 +25,12 @@
#define cpu_relax() asm volatile("" ::: "memory");
#endif
+#ifdef __arm__
+#include "../../arch/arm/include/asm/unistd.h"
+#define rmb() asm volatile("" ::: "memory")
+#define cpu_relax() asm volatile("" ::: "memory");
+#endif
+
#include <time.h>
#include <unistd.h>
#include <sys/types.h>
Index: b/arch/arm/Kconfig
===================================================================
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -20,6 +20,7 @@ config ARM
select HAVE_FUNCTION_TRACER if (!XIP_KERNEL)
select HAVE_FUNCTION_GRAPH_TRACER if (!XIP_KERNEL)
select HAVE_GENERIC_DMA_COHERENT
+ select GENERIC_ATOMIC64
help
The ARM series is a line of low-power-consumption RISC chip designs
licensed by ARM Ltd and targeted at embedded applications and
@@ -253,6 +254,7 @@ config ARCH_NE1
# select PCI
select GENERIC_TIME
select GENERIC_CLOCKEVENTS
+ select HAVE_PERF_COUNTERS
help
This enables support for NEC-EL NaviEngine1-based boards.
@@ -463,6 +465,7 @@ config ARCH_MXC
select ARCH_MTD_XIP
select GENERIC_GPIO
select ARCH_REQUIRE_GPIOLIB
+ select HAVE_PERF_COUNTERS
help
Support for Freescale MXC/iMX-based family of processors
Index: b/arch/arm/include/asm/perf_counter.h
===================================================================
--- /dev/null
+++ b/arch/arm/include/asm/perf_counter.h
@@ -0,0 +1,8 @@
+#ifndef _ASM_ARM_PERF_COUNTER_H
+#define _ASM_ARM_PERF_COUNTER_H
+
+#define PERF_COUNTER_INDEX_OFFSET 1
+/* ARM only supports software counters through this interface. */
+static inline void set_perf_counter_pending(void)
+{ }
+#endif /* _ASM_ARM_PERF_COUNTER_H */
Index: b/arch/arm/include/asm/atomic.h
===================================================================
--- a/arch/arm/include/asm/atomic.h
+++ b/arch/arm/include/asm/atomic.h
@@ -225,6 +225,7 @@ static inline int atomic_add_unless(atom
#define smp_mb__before_atomic_inc() barrier()
#define smp_mb__after_atomic_inc() barrier()
+#include <asm-generic/atomic64.h>
#include <asm-generic/atomic.h>
#endif
#endif
Index: b/include/asm-generic/atomic64.h
===================================================================
--- /dev/null
+++ b/include/asm-generic/atomic64.h
@@ -0,0 +1,42 @@
+/*
+ * Generic implementation of 64-bit atomics using spinlocks,
+ * useful on processors that don't have 64-bit atomic instructions.
+ *
+ * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus at au1.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#ifndef _ASM_GENERIC_ATOMIC64_H
+#define _ASM_GENERIC_ATOMIC64_H
+
+typedef struct {
+ long long counter;
+} atomic64_t;
+
+#define ATOMIC64_INIT(i) { (i) }
+
+extern long long atomic64_read(const atomic64_t *v);
+extern void atomic64_set(atomic64_t *v, long long i);
+extern void atomic64_add(long long a, atomic64_t *v);
+extern long long atomic64_add_return(long long a, atomic64_t *v);
+extern void atomic64_sub(long long a, atomic64_t *v);
+extern long long atomic64_sub_return(long long a, atomic64_t *v);
+extern long long atomic64_dec_if_positive(atomic64_t *v);
+extern long long atomic64_cmpxchg(atomic64_t *v, long long o, long long n);
+extern long long atomic64_xchg(atomic64_t *v, long long new);
+extern int atomic64_add_unless(atomic64_t *v, long long a, long long u);
+
+#define atomic64_add_negative(a, v) (atomic64_add_return((a), (v)) < 0)
+#define atomic64_inc(v) atomic64_add(1LL, (v))
+#define atomic64_inc_return(v) atomic64_add_return(1LL, (v))
+#define atomic64_inc_and_test(v) (atomic64_inc_return(v) == 0)
+#define atomic64_sub_and_test(a, v) (atomic64_sub_return((a), (v)) == 0)
+#define atomic64_dec(v) atomic64_sub(1LL, (v))
+#define atomic64_dec_return(v) atomic64_sub_return(1LL, (v))
+#define atomic64_dec_and_test(v) (atomic64_dec_return((v)) == 0)
+#define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1LL, 0LL)
+
+#endif /* _ASM_GENERIC_ATOMIC64_H */
Index: b/lib/atomic64.c
===================================================================
--- /dev/null
+++ b/lib/atomic64.c
@@ -0,0 +1,175 @@
+/*
+ * Generic implementation of 64-bit atomics using spinlocks,
+ * useful on processors that don't have 64-bit atomic instructions.
+ *
+ * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus at au1.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/types.h>
+#include <linux/cache.h>
+#include <linux/spinlock.h>
+#include <linux/init.h>
+#include <asm/atomic.h>
+
+/*
+ * We use a hashed array of spinlocks to provide exclusive access
+ * to each atomic64_t variable. Since this is expected to be used on
+ * systems with small numbers of CPUs (<= 4 or so), we use a
+ * relatively small array of 16 spinlocks to avoid wasting too much
+ * memory on the spinlock array.
+ */
+#define NR_LOCKS 16
+
+/*
+ * Ensure each lock is in a separate cacheline.
+ */
+static union {
+ spinlock_t lock;
+ char pad[L1_CACHE_BYTES];
+} atomic64_lock[NR_LOCKS] __cacheline_aligned_in_smp;
+
+static inline spinlock_t *lock_addr(const atomic64_t *v)
+{
+ unsigned long addr = (unsigned long) v;
+
+ addr >>= L1_CACHE_SHIFT;
+ addr ^= (addr >> 8) ^ (addr >> 16);
+ return &atomic64_lock[addr & (NR_LOCKS - 1)].lock;
+}
+
+long long atomic64_read(const atomic64_t *v)
+{
+ unsigned long flags;
+ spinlock_t *lock = lock_addr(v);
+ long long val;
+
+ spin_lock_irqsave(lock, flags);
+ val = v->counter;
+ spin_unlock_irqrestore(lock, flags);
+ return val;
+}
+
+void atomic64_set(atomic64_t *v, long long i)
+{
+ unsigned long flags;
+ spinlock_t *lock = lock_addr(v);
+
+ spin_lock_irqsave(lock, flags);
+ v->counter = i;
+ spin_unlock_irqrestore(lock, flags);
+}
+
+void atomic64_add(long long a, atomic64_t *v)
+{
+ unsigned long flags;
+ spinlock_t *lock = lock_addr(v);
+
+ spin_lock_irqsave(lock, flags);
+ v->counter += a;
+ spin_unlock_irqrestore(lock, flags);
+}
+
+long long atomic64_add_return(long long a, atomic64_t *v)
+{
+ unsigned long flags;
+ spinlock_t *lock = lock_addr(v);
+ long long val;
+
+ spin_lock_irqsave(lock, flags);
+ val = v->counter += a;
+ spin_unlock_irqrestore(lock, flags);
+ return val;
+}
+
+void atomic64_sub(long long a, atomic64_t *v)
+{
+ unsigned long flags;
+ spinlock_t *lock = lock_addr(v);
+
+ spin_lock_irqsave(lock, flags);
+ v->counter -= a;
+ spin_unlock_irqrestore(lock, flags);
+}
+
+long long atomic64_sub_return(long long a, atomic64_t *v)
+{
+ unsigned long flags;
+ spinlock_t *lock = lock_addr(v);
+ long long val;
+
+ spin_lock_irqsave(lock, flags);
+ val = v->counter -= a;
+ spin_unlock_irqrestore(lock, flags);
+ return val;
+}
+
+long long atomic64_dec_if_positive(atomic64_t *v)
+{
+ unsigned long flags;
+ spinlock_t *lock = lock_addr(v);
+ long long val;
+
+ spin_lock_irqsave(lock, flags);
+ val = v->counter - 1;
+ if (val >= 0)
+ v->counter = val;
+ spin_unlock_irqrestore(lock, flags);
+ return val;
+}
+
+long long atomic64_cmpxchg(atomic64_t *v, long long o, long long n)
+{
+ unsigned long flags;
+ spinlock_t *lock = lock_addr(v);
+ long long val;
+
+ spin_lock_irqsave(lock, flags);
+ val = v->counter;
+ if (val == o)
+ v->counter = n;
+ spin_unlock_irqrestore(lock, flags);
+ return val;
+}
+
+long long atomic64_xchg(atomic64_t *v, long long new)
+{
+ unsigned long flags;
+ spinlock_t *lock = lock_addr(v);
+ long long val;
+
+ spin_lock_irqsave(lock, flags);
+ val = v->counter;
+ v->counter = new;
+ spin_unlock_irqrestore(lock, flags);
+ return val;
+}
+
+int atomic64_add_unless(atomic64_t *v, long long a, long long u)
+{
+ unsigned long flags;
+ spinlock_t *lock = lock_addr(v);
+ int ret = 1;
+
+ spin_lock_irqsave(lock, flags);
+ if (v->counter != u) {
+ v->counter += a;
+ ret = 0;
+ }
+ spin_unlock_irqrestore(lock, flags);
+ return ret;
+}
+
+static int init_atomic64_lock(void)
+{
+ int i;
+
+ for (i = 0; i < NR_LOCKS; ++i)
+ spin_lock_init(&atomic64_lock[i].lock);
+ return 0;
+}
+
+pure_initcall(init_atomic64_lock);
Index: b/lib/Makefile
===================================================================
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -88,6 +88,8 @@ obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += sys
obj-$(CONFIG_DYNAMIC_PRINTK_DEBUG) += dynamic_printk.o
+obj-$(CONFIG_GENERIC_ATOMIC64) += atomic64.o
+
hostprogs-y := gen_crc32table
clean-files := crc32table.h
Index: b/lib/Kconfig
===================================================================
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -177,4 +177,10 @@ config DISABLE_OBSOLETE_CPUMASK_FUNCTION
bool "Disable obsolete cpumask functions" if DEBUG_PER_CPU_MAPS
depends on EXPERIMENTAL && BROKEN
+#
+# Generic 64-bit atomic support is selected if needed
+#
+config GENERIC_ATOMIC64
+ bool
+
endmenu
Index: b/arch/arm/mm/fault.c
===================================================================
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -16,6 +16,7 @@
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/page-flags.h>
+#include <linux/perf_counter.h>
#include <asm/system.h>
#include <asm/pgtable.h>
@@ -145,7 +146,6 @@ __do_user_fault(struct task_struct *tsk,
show_regs(regs);
}
#endif
-
tsk->thread.address = addr;
tsk->thread.error_code = fsr;
tsk->thread.trap_no = 14;
@@ -254,6 +254,7 @@ do_page_fault(unsigned long addr, unsign
tsk = current;
mm = tsk->mm;
+ perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, addr);
/*
* If we're in an interrupt or have no user
* context, we must not take the fault..
@@ -281,6 +282,13 @@ do_page_fault(unsigned long addr, unsign
if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS))))
return 0;
+ if (tsk->maj_flt)
+ perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
+ regs, addr);
+ if (tsk->min_flt)
+ perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
+ regs, addr);
+
/*
* If we are in kernel mode at this point, we
* have no context to handle this fault with.
-------------- next part --------------
/* Test application attached with the patch; built as the './array' workload above. */

int fast_multiply(int x, int y)
{
	return x * y;
}

int slow_multiply(int x, int y)
{
	int i, z;

	for (i = 0, z = 0; i < x; i++)
		z = z + y;
	return z;
}

int main(void)
{
	int i, j;
	int x, y;

	for (i = 0; i < 200; i++) {
		for (j = 0; j < 3000; j++) {
			x = fast_multiply(i, j);
			y = slow_multiply(i, j);
		}
	}
	return 0;
}