[PATCH 1/5] PMFS: Add Persistent Memory File System
Vishal Verma
vishal.l.verma at linux.intel.com
Fri Apr 26 18:20:42 EDT 2013
Initial version of PMFS, the Persistent Memory File System.
This commit rebases to Linux 3.9-rc7
Signed-off-by: Sanjay Kumar <sanjay.k.kumar at intel.com>
Signed-off-by: Anil S Keshavamurthy <anil.s.keshavamurthy at intel.com>
Signed-off-by: Dulloor <subramanya.r.dulloor at intel.com>
Signed-off-by: Edmund Nadolski <edmund.nadolski at intel.com>
Signed-off-by: Ross Zwisler <ross.zwisler at linux.intel.com>
Signed-off-by: Vishal Verma <vishal.l.verma at linux.intel.com>
---
Documentation/filesystems/pmfs.txt | 140 +++
arch/Kconfig | 3 +
arch/x86/Kconfig | 2 +
arch/x86/include/asm/io.h | 6 +
arch/x86/include/asm/pgtable_types.h | 1 +
arch/x86/kernel/setup.c | 17 +-
arch/x86/kernel/sys_x86_64.c | 150 ++++
arch/x86/mm/ioremap.c | 90 ++-
arch/x86/mm/pat.c | 2 +
arch/x86/mm/pgtable.c | 2 +
fs/Kconfig | 3 +-
fs/Makefile | 1 +
fs/pmfs/Kconfig | 37 +
fs/pmfs/Makefile | 11 +
fs/pmfs/balloc.c | 256 ++++++
fs/pmfs/bbuild.c | 509 +++++++++++
fs/pmfs/dir.c | 310 +++++++
fs/pmfs/file.c | 333 +++++++
fs/pmfs/inode.c | 1568 ++++++++++++++++++++++++++++++++++
fs/pmfs/ioctl.c | 150 ++++
fs/pmfs/journal.c | 866 +++++++++++++++++++
fs/pmfs/journal.h | 101 +++
fs/pmfs/namei.c | 797 +++++++++++++++++
fs/pmfs/persist.c | 238 +++++
fs/pmfs/pmfs.h | 576 +++++++++++++
fs/pmfs/pmfs_test.c | 50 ++
fs/pmfs/super.c | 1217 ++++++++++++++++++++++++++
fs/pmfs/symlink.c | 71 ++
fs/pmfs/wprotect.c | 91 ++
fs/pmfs/wprotect.h | 166 ++++
fs/pmfs/xip.c | 672 +++++++++++++++
fs/pmfs/xip.h | 28 +
include/asm-generic/pgtable.h | 8 +
include/linux/io.h | 10 +
include/linux/mm.h | 15 +
include/linux/pmfs_def.h | 206 +++++
include/linux/pmfs_sb.h | 83 ++
include/linux/vmalloc.h | 2 +-
include/uapi/linux/magic.h | 1 +
lib/ioremap.c | 126 +++-
mm/madvise.c | 6 +
mm/memcontrol.c | 4 +-
mm/memory.c | 194 +++++-
mm/mlock.c | 5 +
mm/mmap.c | 3 +
mm/mprotect.c | 4 +
mm/msync.c | 10 +-
mm/vmalloc.c | 10 +-
48 files changed, 9109 insertions(+), 42 deletions(-)
create mode 100644 Documentation/filesystems/pmfs.txt
create mode 100644 fs/pmfs/Kconfig
create mode 100644 fs/pmfs/Makefile
create mode 100644 fs/pmfs/balloc.c
create mode 100644 fs/pmfs/bbuild.c
create mode 100644 fs/pmfs/dir.c
create mode 100644 fs/pmfs/file.c
create mode 100644 fs/pmfs/inode.c
create mode 100644 fs/pmfs/ioctl.c
create mode 100644 fs/pmfs/journal.c
create mode 100644 fs/pmfs/journal.h
create mode 100644 fs/pmfs/namei.c
create mode 100644 fs/pmfs/persist.c
create mode 100644 fs/pmfs/pmfs.h
create mode 100644 fs/pmfs/pmfs_test.c
create mode 100644 fs/pmfs/super.c
create mode 100644 fs/pmfs/symlink.c
create mode 100644 fs/pmfs/wprotect.c
create mode 100644 fs/pmfs/wprotect.h
create mode 100644 fs/pmfs/xip.c
create mode 100644 fs/pmfs/xip.h
create mode 100644 include/linux/pmfs_def.h
create mode 100644 include/linux/pmfs_sb.h
diff --git a/Documentation/filesystems/pmfs.txt b/Documentation/filesystems/pmfs.txt
new file mode 100644
index 0000000..e9f2bb4
--- /dev/null
+++ b/Documentation/filesystems/pmfs.txt
@@ -0,0 +1,140 @@
+
+PMFS Introduction
+=================
+
+PMFS is a file system for persistent memory. The file system is optimized to be
+lightweight and efficient in providing access to persistent memory that is
+directly accessible via CPU load/store instructions. It manages the persistent
+memory directly and avoids the block driver layer and page cache layer and thus
+provides synchronous reads and writes to persistent area. It supports all the
+existing POSIX style file system APIs so that the applications need not be
+modified to use this file system. In addition, PMFS provides support for huge
+pages to minimize TLB entry usage and speed up virtual address lookup. PMFS's
+mmap interface can map a file's data directly into the process's address space
+without any intermediate buffering. This file system has been validated using
+DRAM to emulate persistent memory. Hence, PMFS also provides an option to load
+the file system from a disk-based file into memory during mount and save the
+file system from memory into the disk-based file during unmount. PMFS also
+guarantees consistent and durable updates to the file system meta-data against
+arbitrary system and power failures. PMFS uses journaling (undo log) to provide
+consistent updates to meta-data.
+
+
+Configuring PMFS
+================
+
+PMFS uses a physically contiguous area of DRAM (which is not used by the
+kernel) as the file system space. To make sure that the kernel doesn't use a
+certain contiguous physical memory area you can boot the kernel with 'memmap'
+kernel command line option. For more information on this, please see
+Documentation/kernel-parameters.txt.
+
+For example, adding 'memmap=2G$4G' to the kernel boot parameters will reserve
+2G of memory, starting at 4G. (You may have to escape the $ so it isn't
+interpreted by GRUB 2, if you use that as your boot loader.)
+
+After the OS has booted, you can initialize PMFS during mount command by
+passing 'init=' mount option.
+
+For example,
+
+#mount -t pmfs -o physaddr=0x100000000,init=2G none /mnt/pmfs
+
+The above command will create a PMFS file system in the 2GB region starting at
+0x100000000 (4GB) and mount it at /mnt/pmfs. There are many other mount time
+options supported by pmfs. Some of the main options include:
+
+wprotect: This option protects pmfs from stray writes (e.g., because of kernel
+bugs). It makes sure that the file system is mapped read-only into the kernel
+and makes it writable only for a brief period when writing to it. (EXPERIMENTAL
+- Use with Caution).
+
+jsize: This option specifies the journal size. Default is 4MB.
+
+hugemmap: This option enables support for using huge pages in memory-mapped
+files.
+
+backing: This option specifies a disk based file which should be used as a
+persistent backing store for pmfs during mount and unmount.
+
+#mount -t pmfs -o physaddr=0x100000000,init=2G,backing="/data/pmfs.img" none /mnt/pmfs
+
+The above example initializes a 2GB PMFS and during unmount it saves the file
+system into a file /data/pmfs.img
+
+#mount -t pmfs -o physaddr=0x100000000,backing="/data/pmfs.img" none /mnt/pmfs
+
+The above example loads the PMFS from /data/pmfs.img during mount and saves
+the file system to /data/pmfs.img during unmount.
+
+backing_opt: This option specifies how the backing file should be used. It can
+have 2 values:
+
+1: This value means that PMFS will not be loaded from the backing file during
+mount. It is either created using 'init=' option, or the pre-existing file
+system in the memory is used.
+
+2: This value means that the PMFS will not be stored to the backing file during
+unmount.
+
+If backing_opt is not specified, PMFS will load the file system from backing
+file (if init= option is not specified) during mount and store the file system
+to the backing file during unmount.
+
+#mount -t pmfs -o physaddr=0x100000000,backing="/data/pmfs.img",backing_opt=2 none /mnt/pmfs
+
+The above example loads the PMFS from /data/pmfs.img during mount but does not
+save the file system to /data/pmfs.img during unmount.
+
+#mount -t pmfs -o physaddr=0x100000000,backing="/data/pmfs.img",backing_opt=1 none /mnt/pmfs
+
+The above example assumes that there is a PMFS already present at the specified
+physical address (created during an earlier mount). It uses that same PMFS
+instead of loading it from /data/pmfs.img. It, however, saves the file system
+to /data/pmfs.img during unmount.
+
+For full list of options, please refer to the source code.
+
+
+Using Huge Pages with PMFS
+==========================
+
+PMFS supports the use of huge-pages through the fallocate(), and ftruncate()
+system calls. These functions set the file size and also provide PMFS with a
+hint about what data-block size to use (fallocate() also pre-allocates the
+data-blocks). For example, if we set the file size below 2MB, 4KB blocksize is
+used. If we set the file size between >= 2MB but < 1GB, 2MB block size is used,
+and if we set the file size >= 1GB, 1GB block-size is used. fallocate() or
+ftruncate() should be called on an empty file (size 0) for the block-size hint
+to be applied properly. So, a good way to use Huge Pages in PMFS is to open a
+new file through the open() system call, and call fallocate() or ftruncate() to
+set the file size and block-size hint. Remember, that it is only a hint, so if
+PMFS can't find enough free blocks of a particular size, it will try to use
+smaller block-size. If the block-size hint is not set, default 4KB block-size
+will be used for file's data-blocks.
+
+
+Current Limitations
+===================
+
+a) PMFS uses a memory region not used by the kernel. Hence the memory needs to
+be reserved by using the memmap= option or using BIOS ACPI tables.
+
+b) Because of multiple blocksize support, PMFS supports multiple max file
+sizes. For example, if the file's block size is 4KB, the file can grow up to
+512 GB in size, if blocksize is 2MB, file can grow up to 256 TB, and if the
+blocksize is 1GB, the file can grow up to 128 PB.
+
+c) PMFS does not currently support extended attributes.
+
+d) PMFS currently only works with x86_64 kernels.
+
+e) We ran out of bits in vma's vm_flags field, so we reused a flag that is
+guaranteed not to be used on x86_64.
+
+
+Contact Information
+=====================
+
+Please send bug reports/comments/feedback to the PMFS development
+list: linux-pmfs at intel.com
diff --git a/arch/Kconfig b/arch/Kconfig
index 1455579..82a5965 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -174,6 +174,9 @@ config USER_RETURN_NOTIFIER
config HAVE_IOREMAP_PROT
bool
+config HAVE_SET_MEMORY_RO
+ bool
+
config HAVE_KPROBES
bool
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 70c0f3d..b94d591 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -28,6 +28,8 @@ config X86
select HAVE_OPROFILE
select HAVE_PCSPKR_PLATFORM
select HAVE_PERF_EVENTS
+ select HAVE_IRQ_WORK
+ select HAVE_SET_MEMORY_RO
select HAVE_IOREMAP_PROT
select HAVE_KPROBES
select HAVE_MEMBLOCK
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index d8e8eef..1cfda69 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -173,9 +173,15 @@ static inline unsigned int isa_virt_to_bus(volatile void *address)
*/
extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
+extern void __iomem * ioremap_cache_ro(resource_size_t offset,
+ unsigned long size);
extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size,
unsigned long prot_val);
+extern void __iomem *
+ioremap_hpage_cache_ro(resource_size_t phys_addr, unsigned long size);
+extern void __iomem *
+ioremap_hpage_cache(resource_size_t phys_addr, unsigned long size);
/*
* The default ioremap() behavior is non-cached:
*/
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 567b5d0..7b84690 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -157,6 +157,7 @@
#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE)
#define PAGE_KERNEL_IO_UC_MINUS __pgprot(__PAGE_KERNEL_IO_UC_MINUS)
#define PAGE_KERNEL_IO_WC __pgprot(__PAGE_KERNEL_IO_WC)
+#define PAGE_KERNEL_IO_LARGE __pgprot(__PAGE_KERNEL_IO | _PAGE_PSE)
/* xwr */
#define __P000 PAGE_NONE
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 90d8cc9..3be22d8 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -281,7 +281,22 @@ void * __init extend_brk(size_t size, size_t align)
return ret;
}
-#ifdef CONFIG_X86_32
+#ifdef CONFIG_X86_64
+static void __init init_gbpages(void)
+{
+ if (direct_gbpages && cpu_has_gbpages)
+ printk(KERN_INFO "Using GB pages for direct mapping\n");
+ else
+ {
+ printk(KERN_INFO "direct_gbpages(%d). cpu_has_gbpages(%d)."
+ "GB pages not supported.\n", direct_gbpages, cpu_has_gbpages);
+ direct_gbpages = 0;
+ }
+}
+#else
+static inline void init_gbpages(void)
+{
+}
static void __init cleanup_highmap(void)
{
}
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index dbded5a..d607f1a 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -15,6 +15,7 @@
#include <linux/random.h>
#include <linux/uaccess.h>
#include <linux/elf.h>
+#include <linux/export.h>
#include <asm/ia32.h>
#include <asm/syscalls.h>
@@ -190,3 +191,152 @@ bottomup:
*/
return arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
}
+
+
+static unsigned long arch_get_unmapped_area_bottomup_sz(struct file *file,
+ unsigned long addr, unsigned long len, unsigned long align_size,
+ unsigned long pgoff, unsigned long flags)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ unsigned long start_addr;
+
+ if (len > mm->cached_hole_size) {
+ start_addr = mm->free_area_cache;
+ } else {
+ start_addr = TASK_UNMAPPED_BASE;
+ mm->cached_hole_size = 0;
+ }
+
+full_search:
+ addr = ALIGN(start_addr, align_size);
+
+ for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
+ /* At this point: (!vma || addr < vma->vm_end). */
+ if (TASK_SIZE - len < addr) {
+ /*
+ * Start a new search - just in case we missed
+ * some holes.
+ */
+ if (start_addr != TASK_UNMAPPED_BASE) {
+ start_addr = TASK_UNMAPPED_BASE;
+ mm->cached_hole_size = 0;
+ goto full_search;
+ }
+ return -ENOMEM;
+ }
+ if (!vma || addr + len <= vma->vm_start) {
+ mm->free_area_cache = addr + len;
+ return addr;
+ }
+ if (addr + mm->cached_hole_size < vma->vm_start)
+ mm->cached_hole_size = vma->vm_start - addr;
+ addr = ALIGN(vma->vm_end, align_size);
+ }
+}
+
+static unsigned long arch_get_unmapped_area_topdown_sz(struct file *file,
+ unsigned long addr0, unsigned long len, unsigned long align_size,
+ unsigned long pgoff, unsigned long flags)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma, *prev_vma;
+ unsigned long base = mm->mmap_base, addr = addr0;
+ unsigned long largest_hole = mm->cached_hole_size;
+ unsigned long align_mask = ~(align_size - 1);
+ int first_time = 1;
+
+ /* don't allow allocations above current base */
+ if (mm->free_area_cache > base)
+ mm->free_area_cache = base;
+
+ if (len <= largest_hole) {
+ largest_hole = 0;
+ mm->free_area_cache = base;
+ }
+try_again:
+ /* make sure it can fit in the remaining address space */
+ if (mm->free_area_cache < len)
+ goto fail;
+
+ /* either no address requested or can't fit in requested address hole */
+ addr = (mm->free_area_cache - len) & align_mask;
+ do {
+ /*
+ * Lookup failure means no vma is above this address,
+ * i.e. return with success:
+ */
+ vma = find_vma(mm, addr);
+ if (!vma)
+ return addr;
+
+ /*
+ * new region fits between prev_vma->vm_end and
+ * vma->vm_start, use it:
+ */
+ prev_vma = vma->vm_prev;
+ if (addr + len <= vma->vm_start &&
+ (!prev_vma || (addr >= prev_vma->vm_end))) {
+ /* remember the address as a hint for next time */
+ mm->cached_hole_size = largest_hole;
+ return (mm->free_area_cache = addr);
+ } else {
+ /* pull free_area_cache down to the first hole */
+ if (mm->free_area_cache == vma->vm_end) {
+ mm->free_area_cache = vma->vm_start;
+ mm->cached_hole_size = largest_hole;
+ }
+ }
+
+ /* remember the largest hole we saw so far */
+ if (addr + largest_hole < vma->vm_start)
+ largest_hole = vma->vm_start - addr;
+
+ /* try just below the current vma->vm_start */
+ addr = (vma->vm_start - len) & align_mask;
+ } while (len <= vma->vm_start);
+
+fail:
+ /*
+ * if hint left us with no space for the requested
+ * mapping then try again:
+ */
+ if (first_time) {
+ mm->free_area_cache = base;
+ largest_hole = 0;
+ first_time = 0;
+ goto try_again;
+ }
+ /*
+ * A failed mmap() very likely causes application failure,
+ * so fall back to the bottom-up function here. This scenario
+ * can happen with large stack limits and large mmap()
+ * allocations.
+ */
+ mm->free_area_cache = TASK_UNMAPPED_BASE;
+ mm->cached_hole_size = ~0UL;
+ addr = arch_get_unmapped_area_bottomup_sz(file, addr0, len, align_size,
+ pgoff, flags);
+
+ /*
+ * Restore the topdown base:
+ */
+ mm->free_area_cache = base;
+ mm->cached_hole_size = ~0UL;
+
+ return addr;
+}
+
+unsigned long arch_get_unmapped_area_sz(struct file *file,
+ unsigned long addr, unsigned long len, unsigned long align_size,
+ unsigned long pgoff, unsigned long flags)
+{
+ struct mm_struct *mm = current->mm;
+ if (mm->get_unmapped_area == arch_get_unmapped_area)
+ return arch_get_unmapped_area_bottomup_sz(file, addr, len, align_size,
+ pgoff, flags);
+ return arch_get_unmapped_area_topdown_sz(file, addr, len, align_size,
+ pgoff, flags);
+}
+EXPORT_SYMBOL(arch_get_unmapped_area_sz);
+
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 78fe3f1..a212b3f 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -21,6 +21,7 @@
#include <asm/tlbflush.h>
#include <asm/pgalloc.h>
#include <asm/pat.h>
+#include <asm/cpufeature.h>
#include "physaddr.h"
@@ -50,17 +51,9 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size,
return err;
}
-/*
- * Remap an arbitrary physical address space into the kernel virtual
- * address space. Needed when the kernel wants to access high addresses
- * directly.
- *
- * NOTE! We need to allow non-page-aligned mappings too: we will obviously
- * have to convert them into an offset in a page-aligned mapping, but the
- * caller shouldn't need to know that small detail.
- */
-static void __iomem *__ioremap_caller(resource_size_t phys_addr,
- unsigned long size, unsigned long prot_val, void *caller)
+static void __iomem *___ioremap_caller(resource_size_t phys_addr,
+ unsigned long size, unsigned long prot_val, void *caller,
+ unsigned int hpages, unsigned int readonly)
{
unsigned long offset, vaddr;
resource_size_t pfn, last_pfn, last_addr;
@@ -94,12 +87,15 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
* Don't allow anybody to remap normal RAM that we're using..
*/
last_pfn = last_addr >> PAGE_SHIFT;
- for (pfn = phys_addr >> PAGE_SHIFT; pfn <= last_pfn; pfn++) {
- int is_ram = page_is_ram(pfn);
-
- if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn)))
- return NULL;
- WARN_ON_ONCE(is_ram);
+ if ((phys_addr >> PAGE_SHIFT) < max_pfn)
+ {
+ for (pfn = phys_addr >> PAGE_SHIFT; pfn <= last_pfn; pfn++) {
+ int is_ram = page_is_ram(pfn);
+
+ if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn)))
+ return NULL;
+ WARN_ON_ONCE(is_ram);
+ }
}
/*
@@ -145,6 +141,10 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
break;
}
+ /* Map pages RO */
+ if (readonly)
+ prot = __pgprot((unsigned long)prot.pgprot & ~_PAGE_RW);
+
/*
* Ok, go for it..
*/
@@ -157,8 +157,16 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
if (kernel_map_sync_memtype(phys_addr, size, prot_val))
goto err_free_area;
- if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot))
- goto err_free_area;
+ if (hpages)
+ {
+ if (ioremap_hpage_range(vaddr, vaddr + size, phys_addr, prot))
+ goto err_free_area;
+ }
+ else
+ {
+ if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot))
+ goto err_free_area;
+ }
ret_addr = (void __iomem *) (vaddr + offset);
mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
@@ -178,6 +186,21 @@ err_free_memtype:
return NULL;
}
+/*
+ * Remap an arbitrary physical address space into the kernel virtual
+ * address space. Needed when the kernel wants to access high addresses
+ * directly.
+ *
+ * NOTE! We need to allow non-page-aligned mappings too: we will obviously
+ * have to convert them into an offset in a page-aligned mapping, but the
+ * caller shouldn't need to know that small detail.
+ */
+static void __iomem *__ioremap_caller(resource_size_t phys_addr,
+ unsigned long size, unsigned long prot_val, void *caller)
+{
+ return ___ioremap_caller(phys_addr, size, prot_val, caller, 0, 0);
+}
+
/**
* ioremap_nocache - map bus memory into CPU space
* @phys_addr: bus address of the memory
@@ -235,13 +258,40 @@ void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
}
EXPORT_SYMBOL(ioremap_wc);
+void __iomem *
+ioremap_hpage_cache(resource_size_t phys_addr, unsigned long size)
+{
+ /* Map using hugepages */
+ return ___ioremap_caller(phys_addr, size, _PAGE_CACHE_WB,
+ __builtin_return_address(0), 1, 0);
+}
+EXPORT_SYMBOL(ioremap_hpage_cache);
+
+void __iomem *
+ioremap_hpage_cache_ro(resource_size_t phys_addr, unsigned long size)
+{
+ /* Map using hugepages */
+ return ___ioremap_caller(phys_addr, size, _PAGE_CACHE_WB,
+ __builtin_return_address(0), 1, 1);
+}
+EXPORT_SYMBOL(ioremap_hpage_cache_ro);
+
void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
{
- return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WB,
- __builtin_return_address(0));
+ /* Map using 4k pages */
+ return ___ioremap_caller(phys_addr, size, _PAGE_CACHE_WB,
+ __builtin_return_address(0), 0, 0);
}
EXPORT_SYMBOL(ioremap_cache);
+void __iomem *ioremap_cache_ro(resource_size_t phys_addr, unsigned long size)
+{
+ /* Map using 4k pages */
+ return ___ioremap_caller(phys_addr, size, _PAGE_CACHE_WB,
+ __builtin_return_address(0), 0, 1);
+}
+
+EXPORT_SYMBOL(ioremap_cache_ro);
void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
unsigned long prot_val)
{
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 6574388..333a795 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -183,6 +183,8 @@ static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end)
unsigned long end_pfn = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
struct pagerange_state state = {start_pfn, 0, 0};
+ if (start_pfn >= max_pfn)
+ return 0;
/*
* For legacy reasons, physical address range in the legacy ISA
* region is tracked as non-RAM. This will allow users of
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 17fda6a..58e4e52 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -1,5 +1,6 @@
#include <linux/mm.h>
#include <linux/gfp.h>
+#include <linux/export.h>
#include <asm/pgalloc.h>
#include <asm/pgtable.h>
#include <asm/tlb.h>
@@ -328,6 +329,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
return changed;
}
+EXPORT_SYMBOL(ptep_set_access_flags);
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_set_access_flags(struct vm_area_struct *vma,
diff --git a/fs/Kconfig b/fs/Kconfig
index 780725a..7901c54 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -17,7 +17,7 @@ source "fs/ext4/Kconfig"
config FS_XIP
# execute in place
bool
- depends on EXT2_FS_XIP
+ depends on EXT2_FS_XIP || PMFS_XIP
default y
source "fs/jbd/Kconfig"
@@ -209,6 +209,7 @@ source "fs/romfs/Kconfig"
source "fs/pstore/Kconfig"
source "fs/sysv/Kconfig"
source "fs/ufs/Kconfig"
+source "fs/pmfs/Kconfig"
source "fs/exofs/Kconfig"
source "fs/f2fs/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index 9d53192..4a3aad5 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -127,3 +127,4 @@ obj-$(CONFIG_F2FS_FS) += f2fs/
obj-y += exofs/ # Multiple modules
obj-$(CONFIG_CEPH_FS) += ceph/
obj-$(CONFIG_PSTORE) += pstore/
+obj-$(CONFIG_PMFS) += pmfs/
diff --git a/fs/pmfs/Kconfig b/fs/pmfs/Kconfig
new file mode 100644
index 0000000..7173b04
--- /dev/null
+++ b/fs/pmfs/Kconfig
@@ -0,0 +1,37 @@
+config PMFS
+ tristate "Persistent and Protected PM file system support"
+ depends on HAS_IOMEM
+ select CRC16
+ help
+ If your system has a block of fast (comparable in access speed to
+ system memory) and non-volatile byte-addressable memory and you wish to
+ mount a light-weight, full-featured, and space-efficient filesystem over
+ it, say Y here, and read <file:Documentation/filesystems/pmfs.txt>.
+
+ To compile this as a module, choose M here: the module will be
+ called pmfs.
+
+config PMFS_XIP
+ bool "Execute-in-place in PMFS"
+ depends on PMFS && BLOCK
+ help
+ Say Y here to enable XIP feature of PMFS.
+
+config PMFS_WRITE_PROTECT
+ bool "PMFS write protection"
+ depends on PMFS && MMU && HAVE_SET_MEMORY_RO
+ default y
+ help
+ Say Y here to enable the write protect feature of PMFS.
+
+config PMFS_TEST
+ boolean
+ depends on PMFS
+
+config PMFS_TEST_MODULE
+ tristate "PMFS Test"
+ depends on PMFS && PMFS_WRITE_PROTECT && m
+ select PMFS_TEST
+ help
+ Say Y here to build a simple module to test the protection of
+ PMFS. The module will be called pmfs_test.
diff --git a/fs/pmfs/Makefile b/fs/pmfs/Makefile
new file mode 100644
index 0000000..806c19d
--- /dev/null
+++ b/fs/pmfs/Makefile
@@ -0,0 +1,11 @@
+#
+# Makefile for the linux pmfs-filesystem routines.
+#
+
+obj-$(CONFIG_PMFS) += pmfs.o
+obj-$(CONFIG_PMFS_TEST_MODULE) += pmfs_test.o
+
+pmfs-y := bbuild.o balloc.o dir.o file.o inode.o namei.o super.o symlink.o ioctl.o persist.o journal.o
+
+pmfs-$(CONFIG_PMFS_WRITE_PROTECT) += wprotect.o
+pmfs-$(CONFIG_PMFS_XIP) += xip.o
diff --git a/fs/pmfs/balloc.c b/fs/pmfs/balloc.c
new file mode 100644
index 0000000..3acc81d
--- /dev/null
+++ b/fs/pmfs/balloc.c
@@ -0,0 +1,256 @@
+/*
+ * PMFS emulated persistence. This file contains code to
+ * handle data blocks of various sizes efficiently.
+ *
+ * Persistent Memory File System
+ * Copyright (c) 2012-2013, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/bitops.h>
+#include "pmfs.h"
+
+void pmfs_init_blockmap(struct super_block *sb, unsigned long init_used_size)
+{
+ struct pmfs_sb_info *sbi = PMFS_SB(sb);
+ unsigned long num_used_block;
+ struct pmfs_blocknode *blknode;
+
+ num_used_block = (init_used_size + sb->s_blocksize - 1) >>
+ sb->s_blocksize_bits;
+
+ blknode = pmfs_alloc_blocknode(sb);
+ if (blknode == NULL)
+ PMFS_ASSERT(0);
+ blknode->block_low = sbi->block_start;
+ blknode->block_high = sbi->block_start + num_used_block - 1;
+ sbi->num_free_blocks -= num_used_block;
+ list_add(&blknode->link, &sbi->block_inuse_head);
+}
+
+void pmfs_free_block(struct super_block *sb, unsigned long blocknr,
+ unsigned short btype)
+{
+ struct pmfs_sb_info *sbi = PMFS_SB(sb);
+ struct list_head *head = &(sbi->block_inuse_head);
+ unsigned long new_block_low;
+ unsigned long new_block_high;
+ unsigned long num_blocks = 0;
+ struct pmfs_blocknode *i;
+ struct pmfs_blocknode *free_blocknode= NULL;
+ struct pmfs_blocknode *curr_node;
+
+ num_blocks = pmfs_get_numblocks(btype);
+ new_block_low = blocknr;
+ new_block_high = blocknr + num_blocks - 1;
+
+ mutex_lock(&sbi->s_lock);
+
+ /* Traverse each blocknode entry */
+ list_for_each_entry(i, head, link) {
+
+ if (new_block_low > i->block_high) {
+ /* skip to next blocknode */
+ continue;
+ }
+
+ if ((new_block_low == i->block_low) &&
+ (new_block_high == i->block_high)) {
+ /* fits entire datablock */
+ list_del(&i->link);
+ free_blocknode = i;
+ sbi->num_blocknode_allocated--;
+ sbi->num_free_blocks += num_blocks;
+ break;
+ }
+ if ((new_block_low == i->block_low) &&
+ (new_block_high < i->block_high)) {
+ /* Aligns left */
+ i->block_low = new_block_high + 1;
+ sbi->num_free_blocks += num_blocks;
+ break;
+ }
+ if ((new_block_low > i->block_low) &&
+ (new_block_high == i->block_high)) {
+ /* Aligns right */
+ i->block_high = new_block_low - 1;
+ sbi->num_free_blocks += num_blocks;
+ break;
+ }
+ if ((new_block_low > i->block_low) &&
+ (new_block_high < i->block_high)) {
+ /* Aligns somewhere in the middle */
+ curr_node = pmfs_alloc_blocknode(sb);
+ PMFS_ASSERT(curr_node);
+ if (curr_node == NULL) {
+ /* returning without freeing the block*/
+ break;
+ }
+ curr_node->block_low = new_block_high + 1;
+ curr_node->block_high = i->block_high;
+ i->block_high = new_block_low - 1;
+ list_add(&curr_node->link, &i->link);
+ sbi->num_free_blocks += num_blocks;
+ break;
+ }
+ }
+
+ mutex_unlock(&sbi->s_lock);
+
+ if (free_blocknode)
+ __pmfs_free_blocknode(free_blocknode);
+
+ return;
+}
+
+
+int pmfs_new_block(struct super_block *sb, unsigned long *blocknr,
+ unsigned short btype, int zero)
+{
+ struct pmfs_sb_info *sbi = PMFS_SB(sb);
+ struct list_head *head = &(sbi->block_inuse_head);
+ struct pmfs_blocknode *i, *next_i;
+ struct pmfs_blocknode *free_blocknode= NULL;
+ void *bp;
+ unsigned long num_blocks = 0;
+ struct pmfs_blocknode *curr_node;
+ int errval = 0;
+ bool found = 0;
+ unsigned long next_block_low;
+ unsigned long new_block_low;
+ unsigned long new_block_high;
+
+ num_blocks = pmfs_get_numblocks(btype);
+
+ mutex_lock(&sbi->s_lock);
+
+ /* Traverse each blocknode entry */
+ list_for_each_entry(i, head, link) {
+ if (i->link.next == head) {
+ next_i = NULL;
+ next_block_low = sbi->block_end;
+ } else {
+ next_i = list_entry(i->link.next, typeof(*i), link);
+ next_block_low = next_i->block_low;
+ }
+
+ new_block_low = (i->block_high + num_blocks) & ~(num_blocks - 1);
+ new_block_high = new_block_low + num_blocks - 1;
+
+ if (new_block_high >= next_block_low) {
+ /* Does not fit - skip to next blocknode */
+ continue;
+ }
+
+ if ((new_block_low == (i->block_high + 1)) &&
+ (new_block_high == (next_block_low - 1)))
+ {
+ /* Fill the gap completely */
+ if (next_i) {
+ i->block_high = next_i->block_high;
+ list_del(&next_i->link);
+ free_blocknode = next_i;
+ sbi->num_blocknode_allocated--;
+ } else {
+ i->block_high = new_block_high;
+ }
+ found = 1;
+ break;
+ }
+
+ if ((new_block_low == (i->block_high + 1)) &&
+ (new_block_high < (next_block_low - 1))) {
+ /* Aligns to left */
+ i->block_high = new_block_high;
+ found = 1;
+ break;
+ }
+
+ if ((new_block_low > (i->block_high + 1)) &&
+ (new_block_high == (next_block_low - 1))) {
+ /* Aligns to right */
+ if (next_i) {
+ /* right node exist */
+ next_i->block_low = new_block_low;
+ } else {
+ /* right node does NOT exist */
+ curr_node = pmfs_alloc_blocknode(sb);
+ PMFS_ASSERT(curr_node);
+ if (curr_node == NULL) {
+ errval = -ENOSPC;
+ break;
+ }
+ curr_node->block_low = new_block_low;
+ curr_node->block_high = new_block_high;
+ list_add(&curr_node->link, &i->link);
+ }
+ found = 1;
+ break;
+ }
+
+ if ((new_block_low > (i->block_high + 1)) &&
+ (new_block_high < (next_block_low - 1))) {
+ /* Aligns somewhere in the middle */
+ curr_node = pmfs_alloc_blocknode(sb);
+ PMFS_ASSERT(curr_node);
+ if (curr_node == NULL) {
+ errval = -ENOSPC;
+ break;
+ }
+ curr_node->block_low = new_block_low;
+ curr_node->block_high = new_block_high;
+ list_add(&curr_node->link, &i->link);
+ found = 1;
+ break;
+ }
+ }
+
+ if (found == 1) {
+ sbi->num_free_blocks -= num_blocks;
+ }
+
+ mutex_unlock(&sbi->s_lock);
+
+ if (free_blocknode)
+ __pmfs_free_blocknode(free_blocknode);
+
+ if (found == 0) {
+ return -ENOSPC;
+ }
+
+ if (zero) {
+ size_t size;
+ bp = pmfs_get_block(sb, pmfs_get_block_off(sb, new_block_low, btype));
+ pmfs_memunlock_block(sb, bp); //TBDTBD: Need to fix this
+ if (btype == PMFS_BLOCK_TYPE_4K)
+ size = 0x1 << 12;
+ else if (btype == PMFS_BLOCK_TYPE_2M)
+ size = 0x1 << 21;
+ else
+ size = 0x1 << 30;
+ memset_nt(bp, 0, size);
+ pmfs_memlock_block(sb, bp);
+ }
+ *blocknr = new_block_low;
+
+ return errval;
+}
+
+unsigned long pmfs_count_free_blocks(struct super_block *sb)
+{
+ struct pmfs_sb_info *sbi = PMFS_SB(sb);
+ return sbi->num_free_blocks;
+}
diff --git a/fs/pmfs/bbuild.c b/fs/pmfs/bbuild.c
new file mode 100644
index 0000000..e279b64
--- /dev/null
+++ b/fs/pmfs/bbuild.c
@@ -0,0 +1,509 @@
+/*
+ * PMFS block-map rebuild ("bbuild"): save and restore the in-memory
+ * block allocator state across mounts, with a full inode-table scan
+ * as the fallback path.
+ *
+ * Persistent Memory File System
+ * Copyright (c) 2012-2013, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include "pmfs.h"
+
+/*
+ * Zero the reserved blocknode-list inode (PMFS_BLOCKNODE_IN0) under a
+ * journal transaction so the clear is crash-consistent.  Called once the
+ * saved allocator state has been consumed, or before a fresh full scan.
+ */
+static void pmfs_clear_datablock_inode(struct super_block *sb)
+{
+	struct pmfs_inode *pi = pmfs_get_inode(sb, PMFS_BLOCKNODE_IN0);
+	pmfs_transaction_t *trans;
+
+	/* 2 log entry for inode */
+	trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES);
+	if (IS_ERR(trans))
+		return;	/* best effort: without a transaction, skip the clear */
+	pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA);
+
+	/* the inode area is normally write-protected; open a write window */
+	pmfs_memunlock_inode(sb, pi);
+	memset(pi, 0, MAX_DATA_PER_LENTRY);
+	pmfs_memlock_inode(sb, pi);
+
+	/* commit the transaction */
+	pmfs_commit_transaction(sb, trans);
+}
+
+/*
+ * Rebuild the in-DRAM list of in-use block extents from the copy saved
+ * in the reserved inode PMFS_BLOCKNODE_IN0 by
+ * pmfs_save_blocknode_mappings().  Each 4K data block of that inode
+ * holds 256 little-endian pmfs_blocknode_lowhigh pairs.
+ */
+static void pmfs_init_blockmap_from_inode(struct super_block *sb)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	struct pmfs_inode *pi = pmfs_get_inode(sb, PMFS_BLOCKNODE_IN0);
+	struct pmfs_blocknode_lowhigh *p = NULL;
+	struct pmfs_blocknode *blknode;
+	unsigned long index;
+	unsigned long blocknr;
+	unsigned long i;
+	unsigned long num_blocknode;
+	u64 bp;
+
+	num_blocknode = sbi->num_blocknode_allocated;
+	/* reset; presumably pmfs_alloc_blocknode() re-counts as nodes are
+	 * recreated below -- TODO confirm */
+	sbi->num_blocknode_allocated = 0;
+	for (i=0; i<num_blocknode; i++) {
+		index = i & 0xFF;	/* slot within the current 256-entry block */
+		if (i == (i & 0xFFFFFFFFFFFFFF00)) {
+			/* first slot of a block: find and get new data block */
+			blocknr = i >> 8; //256 Entries in a block
+			bp = __pmfs_find_data_block(sb, pi, blocknr);
+			p = pmfs_get_block(sb, bp);
+		}
+		PMFS_ASSERT(p);
+		blknode = pmfs_alloc_blocknode(sb);
+		if (blknode == NULL)
+			PMFS_ASSERT(0);
+		blknode->block_low = le64_to_cpu(p[index].block_low);
+		blknode->block_high = le64_to_cpu(p[index].block_high);
+		/* saved extents are already sorted; append in order */
+		list_add_tail(&blknode->link, &sbi->block_inuse_head);
+	}
+
+	return;
+}
+
+/*
+ * Fast-mount path: when the blocknode-list inode has a saved b-tree
+ * (pi->root != 0), restore the allocator and inode statistics from the
+ * persistent superblock and the saved extent list instead of crawling
+ * every inode.  Returns true when the full scan can be skipped.
+ */
+static bool pmfs_can_skip_full_scan(struct super_block *sb)
+{
+	struct pmfs_inode *pi = pmfs_get_inode(sb, PMFS_BLOCKNODE_IN0);
+	struct pmfs_super_block *super = pmfs_get_super(sb);
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	u64 root, isize;
+	unsigned int height, btype;
+
+	if (!pi->root)
+	{
+		/* nothing was saved; a full scan is required */
+		return false;
+	}
+
+	/* restore cached counters from the persistent superblock */
+	sbi->num_blocknode_allocated =
+		le64_to_cpu(super->s_num_blocknode_allocated);
+	sbi->num_free_blocks = le64_to_cpu(super->s_num_free_blocks);
+	sbi->s_inodes_count = le32_to_cpu(super->s_inodes_count);
+	sbi->s_free_inodes_count = le32_to_cpu(super->s_free_inodes_count);
+	sbi->s_inodes_used_count = le32_to_cpu(super->s_inodes_used_count);
+	sbi->s_free_inode_hint = le32_to_cpu(super->s_free_inode_hint);
+
+	pmfs_init_blockmap_from_inode(sb);
+
+	/* capture the b-tree before the inode is zeroed below */
+	root = pi->root;
+	height = pi->height;
+	btype = pi->i_blk_type;
+	isize = le64_to_cpu(pi->i_size);
+
+	/* Clearing the datablock inode */
+	pmfs_clear_datablock_inode(sb);
+
+	/* the saved list has been consumed; release its data blocks */
+	pmfs_free_inode_subtree(sb, root, height, btype, isize);
+
+	return true;
+}
+
+
+/*
+ * (Re)initialize the reserved blocknode-list inode as a plain file of
+ * @num_blocks 4K blocks and allocate its data blocks, all inside the
+ * caller's transaction.  Returns 0 or a negative errno from the block
+ * allocator.
+ */
+static int pmfs_allocate_datablock_block_inode(pmfs_transaction_t *trans,
+	struct super_block *sb, struct pmfs_inode *pi, unsigned long num_blocks)
+{
+	int errval;
+
+	/* inode area is write-protected; open a write window for the init */
+	pmfs_memunlock_inode(sb, pi);
+	pi->i_mode = 0;
+	pi->i_links_count = cpu_to_le16(1);
+	pi->i_blk_type = PMFS_BLOCK_TYPE_4K;
+	pi->i_flags = 0;
+	pi->height = 0;
+	pi->i_dtime = 0;
+	pi->i_size = cpu_to_le64(num_blocks << sb->s_blocksize_bits);
+	pmfs_memlock_inode(sb, pi);
+
+	/* zero=false: the caller overwrites the live entries, and reads
+	 * back are bounded by num_blocknode_allocated */
+	errval = __pmfs_alloc_blocks(trans, sb, pi, 0, num_blocks, false);
+
+	return errval;
+}
+
+/*
+ * Persist the in-DRAM list of in-use block extents into the reserved
+ * blocknode-list inode, and the allocator/inode counters into the
+ * superblock, so the next mount can skip the full scan
+ * (see pmfs_can_skip_full_scan()).  Packs 256 little-endian
+ * pmfs_blocknode_lowhigh pairs per 4K block.
+ */
+void pmfs_save_blocknode_mappings(struct super_block *sb)
+{
+	unsigned long num_blocks, blocknr;
+	struct pmfs_inode *pi = pmfs_get_inode(sb, PMFS_BLOCKNODE_IN0);
+	struct pmfs_blocknode_lowhigh *p;
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	struct list_head *head = &(sbi->block_inuse_head);
+	struct pmfs_blocknode *i;
+	struct pmfs_super_block *super;
+	pmfs_transaction_t *trans;
+	u64 bp;
+	int j, k;
+	int errval;
+
+	/* blocks needed to hold all extent pairs, rounded up */
+	num_blocks = ((sbi->num_blocknode_allocated * sizeof(struct
+		pmfs_blocknode_lowhigh) - 1) >> sb->s_blocksize_bits) + 1;
+
+	/* 2 log entry for inode, 2 lentry for super-block */
+	trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES + MAX_SB_LENTRIES);
+	if (IS_ERR(trans))
+		return;
+
+	pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA);
+
+	errval = pmfs_allocate_datablock_block_inode(trans, sb, pi, num_blocks);
+
+	if (errval != 0) {
+		pmfs_dbg("Error saving the blocknode mappings: %d\n", errval);
+		pmfs_abort_transaction(sb, trans);
+		return;
+	}
+
+	/* j = slot within the current block, k = overall entry index */
+	j = 0;
+	k = 0;
+	p = NULL;
+	list_for_each_entry(i, head, link) {
+		blocknr = k >> 8;
+		if (j == 0) {
+			/* Find, get and unlock new data block */
+			bp = __pmfs_find_data_block(sb, pi, blocknr);
+			p = pmfs_get_block(sb, bp);
+			pmfs_memunlock_block(sb, p);
+		}
+		p[j].block_low = cpu_to_le64(i->block_low);
+		p[j].block_high = cpu_to_le64(i->block_high);
+		j++;
+
+		if (j == 256) {
+			j = 0;
+			/* Lock the data block */
+			pmfs_memlock_block(sb, p);
+			pmfs_flush_buffer(p, 4096, false);
+		}
+
+		k++;
+	}
+
+	/* Lock the partially-filled last block.
+	 * j << 4: assumes sizeof(struct pmfs_blocknode_lowhigh) == 16 --
+	 * TODO confirm.  NOTE(review): here the flush precedes memlock,
+	 * the reverse of the order used in the loop above -- confirm
+	 * intentional. */
+	if (j) {
+		pmfs_flush_buffer(p, j << 4, false);
+		pmfs_memlock_block(sb, p);
+	}
+
+	/*
+	 * save the total allocated blocknode mappings
+	 * in super block
+	 */
+	super = pmfs_get_super(sb);
+	pmfs_add_logentry(sb, trans, &super->s_wtime,
+			PMFS_FAST_MOUNT_FIELD_SIZE, LE_DATA);
+
+	pmfs_memunlock_range(sb, &super->s_wtime, PMFS_FAST_MOUNT_FIELD_SIZE);
+
+	super->s_wtime = cpu_to_le32(get_seconds());
+	super->s_num_blocknode_allocated =
+		cpu_to_le64(sbi->num_blocknode_allocated);
+	super->s_num_free_blocks = cpu_to_le64(sbi->num_free_blocks);
+	super->s_inodes_count = cpu_to_le32(sbi->s_inodes_count);
+	super->s_free_inodes_count = cpu_to_le32(sbi->s_free_inodes_count);
+	super->s_inodes_used_count = cpu_to_le32(sbi->s_inodes_used_count);
+	super->s_free_inode_hint = cpu_to_le32(sbi->s_free_inode_hint);
+
+	pmfs_memlock_range(sb, &super->s_wtime, PMFS_FAST_MOUNT_FIELD_SIZE);
+	/* commit the transaction */
+	pmfs_commit_transaction(sb, trans);
+}
+
+/*
+ * Mark every block reachable from one file b-tree in the scan bitmaps.
+ * Interior (meta) blocks are always 4K; only height-0 leaves use the
+ * inode's block type.
+ */
+static void pmfs_inode_crawl_recursive(struct super_block *sb,
+				unsigned long block, u32 height,
+				u32 btype)
+{
+	u64 *node;
+	unsigned int i;
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+
+	if (height == 0) {
+		/* This is the data block.
+		 * NOTE(review): btype arrives as a host-order u32 (straight
+		 * from pi->i_blk_type) but is compared against cpu_to_le16()
+		 * constants; this only matches on little-endian -- confirm.
+		 */
+		if (btype == cpu_to_le16(PMFS_BLOCK_TYPE_4K)) {
+			set_bit(block >> PAGE_SHIFT, sbi->bitmap_4k);
+		} else if (btype == cpu_to_le16(PMFS_BLOCK_TYPE_2M)) {
+			set_bit(block >> PAGE_SHIFT_2M, sbi->bitmap_2M);
+		} else {
+			set_bit(block >> PAGE_SHIFT_1G, sbi->bitmap_1G);
+
+		}
+		return;
+	}
+
+	/* meta block itself is a 4K block */
+	node = pmfs_get_block(sb, block);
+	set_bit(block >> PAGE_SHIFT, sbi->bitmap_4k);
+	for (i = 0; i < (1 << META_BLK_SHIFT); i++) {
+		if (node[i] == 0)
+			continue;	/* hole in the b-tree */
+		pmfs_inode_crawl_recursive(sb,
+			le64_to_cpu(node[i]), height - 1, btype);
+	}
+}
+
+/*
+ * Account all blocks owned by @pi in the scan bitmaps.  An inode with
+ * no root has no block tree and therefore nothing to mark.
+ */
+static inline void pmfs_inode_crawl(struct super_block *sb,
+				struct pmfs_inode *pi)
+{
+	if (pi->root != 0)
+		pmfs_inode_crawl_recursive(sb, le64_to_cpu(pi->root),
+					pi->height, pi->i_blk_type);
+}
+
+/*
+ * Walk the inode-table b-tree.  At the leaves, mark the inode-table
+ * block itself in-use, count the inodes it holds, and crawl every live
+ * inode's block tree.  Interior meta blocks are always 4K.
+ */
+static void pmfs_inode_table_crawl_recursive(struct super_block *sb,
+				unsigned long block, u32 height, u32 btype)
+{
+	u64 *node;
+	unsigned int i;
+	struct pmfs_inode *pi;
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+
+	node = pmfs_get_block(sb, block);
+
+	if (height == 0) {
+		unsigned int inodes_per_block = INODES_PER_BLOCK(btype);
+		/* inode-table leaves are expected to be 2M blocks */
+		if (likely(btype == PMFS_BLOCK_TYPE_2M))
+			set_bit(block >> PAGE_SHIFT_2M, sbi->bitmap_2M);
+		else
+			set_bit(block >> PAGE_SHIFT, sbi->bitmap_4k);
+
+		sbi->s_inodes_count += inodes_per_block;
+		for (i = 0; i < inodes_per_block; i++) {
+			pi = (struct pmfs_inode *)((void *)node +
+					PMFS_INODE_SIZE * i);
+			/* live inode: has links and either a mode or no
+			 * deletion time */
+			if (le16_to_cpu(pi->i_links_count) == 0 &&
+				(le16_to_cpu(pi->i_mode) == 0 ||
+				le32_to_cpu(pi->i_dtime))) {
+				/* Empty inode */
+				continue;
+			}
+			sbi->s_inodes_used_count++;
+			pmfs_inode_crawl(sb, pi);
+		}
+		return;
+	}
+
+	set_bit(block >> PAGE_SHIFT , sbi->bitmap_4k);
+	for (i = 0; i < (1 << META_BLK_SHIFT); i++) {
+		if (node[i] == 0)
+			continue;	/* hole in the b-tree */
+		pmfs_inode_table_crawl_recursive(sb,
+			le64_to_cpu(node[i]), height - 1, btype);
+	}
+}
+
+/*
+ * Insert the in-use extent [@low, @high] (4K block numbers) into the
+ * sorted sbi->block_inuse_head list, coalescing with the predecessor
+ * and/or successor where the extents become adjacent.  Called from
+ * __pmfs_build_blocknode_map() during mount; unlike the runtime
+ * allocator it takes no lock (presumably mount is single-threaded --
+ * TODO confirm).  Returns 0 on success, -ENOSPC if no gap fits or a
+ * blocknode cannot be allocated.
+ */
+static int pmfs_alloc_insert_blocknode_map(struct super_block *sb,
+	unsigned long low, unsigned long high)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	struct list_head *head = &(sbi->block_inuse_head);
+	struct pmfs_blocknode *i, *next_i;
+	struct pmfs_blocknode *free_blocknode= NULL;
+	unsigned long num_blocks = 0;
+	struct pmfs_blocknode *curr_node;
+	int errval = 0;
+	bool found = 0;
+	unsigned long next_block_low;
+	unsigned long new_block_low;
+	unsigned long new_block_high;
+
+	//num_blocks = pmfs_get_numblocks(btype);
+
+	new_block_low = low;
+	new_block_high = high;
+	num_blocks = high - low + 1;
+
+	/* Traverse each blocknode entry */
+	list_for_each_entry(i, head, link) {
+		if (i->link.next == head) {
+			/* last node: the gap extends to the device end */
+			next_i = NULL;
+			next_block_low = sbi->block_end;
+		} else {
+			next_i = list_entry(i->link.next, typeof(*i), link);
+			next_block_low = next_i->block_low;
+		}
+
+
+		if (new_block_high >= next_block_low) {
+			/* Does not fit - skip to next blocknode */
+			continue;
+		}
+
+		if ((new_block_low == (i->block_high + 1)) &&
+			(new_block_high == (next_block_low - 1)))
+		{
+			/* Fill the gap completely: extend this node and
+			 * merge with (or absorb) the right neighbour */
+			if (next_i) {
+				i->block_high = next_i->block_high;
+				list_del(&next_i->link);
+				free_blocknode = next_i;
+			} else {
+				i->block_high = new_block_high;
+			}
+			found = 1;
+			break;
+		}
+
+		if ((new_block_low == (i->block_high + 1)) &&
+			(new_block_high < (next_block_low - 1))) {
+			/* Aligns to left */
+			i->block_high = new_block_high;
+			found = 1;
+			break;
+		}
+
+		if ((new_block_low > (i->block_high + 1)) &&
+			(new_block_high == (next_block_low - 1))) {
+			/* Aligns to right */
+			if (next_i) {
+				/* right node exist */
+				next_i->block_low = new_block_low;
+			} else {
+				/* right node does NOT exist */
+				curr_node = pmfs_alloc_blocknode(sb);
+				PMFS_ASSERT(curr_node);
+				if (curr_node == NULL) {
+					errval = -ENOSPC;
+					break;
+				}
+				curr_node->block_low = new_block_low;
+				curr_node->block_high = new_block_high;
+				list_add(&curr_node->link, &i->link);
+			}
+			found = 1;
+			break;
+		}
+
+		if ((new_block_low > (i->block_high + 1)) &&
+			(new_block_high < (next_block_low - 1))) {
+			/* Aligns somewhere in the middle: new node needed */
+			curr_node = pmfs_alloc_blocknode(sb);
+			PMFS_ASSERT(curr_node);
+			if (curr_node == NULL) {
+				errval = -ENOSPC;
+				break;
+			}
+			curr_node->block_low = new_block_low;
+			curr_node->block_high = new_block_high;
+			list_add(&curr_node->link, &i->link);
+			found = 1;
+			break;
+		}
+	}
+
+	if (found == 1) {
+		sbi->num_free_blocks -= num_blocks;
+	}
+
+	if (free_blocknode)
+		pmfs_free_blocknode(sb, free_blocknode);
+
+	if (found == 0) {
+		return -ENOSPC;
+	}
+
+
+	return errval;
+}
+
+/*
+ * Convert one allocation bitmap into in-use extents: walk @bitmap and
+ * insert every contiguous run of set bits into the blocknode list.
+ * Bit i covers 4K blocks (i << scale) .. ((i + 1) << scale) - 1.
+ * Scanning starts at bit 1; block 0 is handled separately by
+ * pmfs_setup_blocknode_map() / pmfs_init_blockmap().
+ */
+static int __pmfs_build_blocknode_map(struct super_block *sb,
+	unsigned long *bitmap, unsigned long bsize, unsigned long scale)
+{
+	unsigned long next = 1;
+	unsigned long low = 0;
+
+	while (1) {
+		next = find_next_bit(bitmap, bsize, next);
+		if (next == bsize)
+			break;
+		low = next;
+		next = find_next_zero_bit(bitmap, bsize, next);
+		/* insertion failure is logged but not fatal; the extent is
+		 * simply left out of the in-use list */
+		if (pmfs_alloc_insert_blocknode_map(sb, low << scale,
+				(next << scale) - 1)) {
+			/* was an unleveled printk(); errors must carry
+			 * KERN_ERR so they are not dropped at quiet
+			 * console loglevels */
+			printk(KERN_ERR "PMFS: Error could not insert 0x%lx-0x%lx\n",
+				low << scale, ((next << scale) - 1));
+		}
+		if (next == bsize)
+			break;
+	}
+	return 0;
+}
+
+/*
+ * Turn the three granularity bitmaps (4K / 2M / 1G) produced by the
+ * inode-table crawl into the in-use blocknode list.  Each bitmap's bit
+ * count is its byte size * 8; the scale converts a bit index into 4K
+ * block numbers.
+ */
+static int pmfs_build_blocknode_map(struct super_block *sb)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	unsigned long bits_4k = sbi->bitmap_4k_size * 8;
+	unsigned long bits_2M = sbi->bitmap_2M_size * 8;
+	unsigned long bits_1G = sbi->bitmap_1G_size * 8;
+
+	__pmfs_build_blocknode_map(sb, sbi->bitmap_4k, bits_4k,
+				PAGE_SHIFT - 12);
+	__pmfs_build_blocknode_map(sb, sbi->bitmap_2M, bits_2M,
+				PAGE_SHIFT_2M - 12);
+	__pmfs_build_blocknode_map(sb, sbi->bitmap_1G, bits_1G,
+				PAGE_SHIFT_1G - 12);
+
+	return 0;
+}
+
+/*
+ * Build the in-DRAM block allocator state at mount time.  Tries the
+ * fast path first (state saved by the previous unmount); otherwise
+ * crawls the whole inode table into temporary bitmaps and converts
+ * them into the in-use extent list.
+ *
+ * Returns 0 on success, -ENOMEM if the scan bitmaps cannot be
+ * allocated (previously this failure was silently reported as
+ * success, leaving the allocator uninitialized).
+ */
+int pmfs_setup_blocknode_map(struct super_block *sb)
+{
+	struct pmfs_super_block *super = pmfs_get_super(sb);
+	struct pmfs_inode *pi = pmfs_get_inode_table(sb);
+	pmfs_journal_t *journal = pmfs_get_journal(sb);
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	unsigned long initsize = le64_to_cpu(super->s_size);
+	bool value = false;
+	int ret = 0;
+
+	mutex_init(&sbi->inode_table_mutex);
+	sbi->block_start = (unsigned long)0;
+	sbi->block_end = ((unsigned long)(initsize) >> PAGE_SHIFT);
+
+	value = pmfs_can_skip_full_scan(sb);
+	if (value) {
+		pmfs_dbg_verbose("PMFS: Skipping full scan of inodes...\n");
+		return 0;
+	}
+
+	/* one bit per block of each size, in bytes (+1 for rounding) */
+	sbi->bitmap_4k_size = (initsize >> (PAGE_SHIFT + 0x3)) + 1;
+	sbi->bitmap_2M_size = (initsize >> (PAGE_SHIFT_2M + 0x3)) + 1;
+	sbi->bitmap_1G_size = (initsize >> (PAGE_SHIFT_1G + 0x3)) + 1;
+
+	/* Alloc memory to hold the block alloc bitmap */
+	sbi->bitmap_4k = kzalloc(sbi->bitmap_4k_size, GFP_KERNEL);
+	sbi->bitmap_2M = kzalloc(sbi->bitmap_2M_size, GFP_KERNEL);
+	sbi->bitmap_1G = kzalloc(sbi->bitmap_1G_size, GFP_KERNEL);
+	if (!sbi->bitmap_4k || !sbi->bitmap_2M || !sbi->bitmap_1G) {
+		ret = -ENOMEM;
+		goto skip;
+	}
+
+	/* Clearing the datablock inode */
+	pmfs_clear_datablock_inode(sb);
+
+	pmfs_inode_table_crawl_recursive(sb, le64_to_cpu(pi->root), pi->height,
+				pi->i_blk_type);
+
+	/* Reserving two inodes - Inode 0 and Inode for datablock */
+	sbi->s_free_inodes_count = sbi->s_inodes_count -
+		(sbi->s_inodes_used_count + 2);
+
+	/* set the block 0 as this is used */
+	sbi->s_free_inode_hint = PMFS_FREE_INODE_HINT_START;
+
+	/* initialize num_free_blocks to the total block count;
+	 * pmfs_init_blockmap() then accounts for the reserved region up
+	 * to the end of the journal */
+	sbi->num_free_blocks = ((unsigned long)(initsize) >> PAGE_SHIFT);
+	pmfs_init_blockmap(sb, le64_to_cpu(journal->base) + sbi->jsize);
+
+	pmfs_build_blocknode_map(sb);
+
+skip:
+	/* the bitmaps are scan scratch space only; kfree(NULL) is a no-op */
+	kfree(sbi->bitmap_4k);
+	kfree(sbi->bitmap_2M);
+	kfree(sbi->bitmap_1G);
+
+	return ret;
+}
diff --git a/fs/pmfs/dir.c b/fs/pmfs/dir.c
new file mode 100644
index 0000000..b3ddb3c
--- /dev/null
+++ b/fs/pmfs/dir.c
@@ -0,0 +1,310 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * File operations for directories.
+ *
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.stornelli at gmail.com>
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include "pmfs.h"
+
+/*
+ * Parent is locked.
+ */
+
+#define DT2IF(dt) (((dt) << 12) & S_IFMT)
+#define IF2DT(sif) (((sif) & S_IFMT) >> 12)
+
+/*
+ * Insert a directory entry for @dentry (pointing at @inode, or a
+ * zero-ino placeholder when @inode is NULL) into the directory block
+ * @blk_base.  If @de is NULL, scan the block for a slot that is either
+ * unused (ino == 0) or has enough slack after its name; otherwise use
+ * @de directly.  Changes to pre-existing bytes are journaled through
+ * @trans.  Returns 0, -ENOSPC if the block has no room, (or -EIO /
+ * -EEXIST from the disabled checks).
+ */
+static int pmfs_add_dirent_to_buf(pmfs_transaction_t *trans,
+	struct dentry *dentry, struct inode *inode,
+	struct pmfs_direntry *de, u8 *blk_base, struct pmfs_inode *pidir)
+{
+	struct inode *dir = dentry->d_parent->d_inode;
+	const char *name = dentry->d_name.name;
+	int namelen = dentry->d_name.len;
+	unsigned short reclen;
+	int nlen, rlen;
+	char *top;
+
+	reclen = PMFS_DIR_REC_LEN(namelen);
+	if (!de) {
+		/* find a slot: either a free entry or one with enough slack */
+		de = (struct pmfs_direntry *)blk_base;
+		top = blk_base + dir->i_sb->s_blocksize - reclen;
+		while ((char *)de <= top) {
+#if 0
+			if (!pmfs_check_dir_entry("pmfs_add_dirent_to_buf",
+				dir, de, blk_base, offset))
+				return -EIO;
+			if (pmfs_match(namelen, name, de))
+				return -EEXIST;
+#endif
+			rlen = le16_to_cpu(de->de_len);
+			if (de->ino) {
+				nlen = PMFS_DIR_REC_LEN(de->name_len);
+				if ((rlen - nlen) >= reclen)
+					break;	/* enough slack after name */
+			} else if (rlen >= reclen)
+				break;		/* free entry large enough */
+			de = (struct pmfs_direntry *)((char *)de + rlen);
+		}
+		if ((char *)de > top)
+			return -ENOSPC;
+	}
+	rlen = le16_to_cpu(de->de_len);
+
+	if (de->ino) {
+		/* split the slack off the live entry and use it */
+		struct pmfs_direntry *de1;
+		pmfs_add_logentry(dir->i_sb, trans, &de->de_len,
+			sizeof(de->de_len), LE_DATA);
+		nlen = PMFS_DIR_REC_LEN(de->name_len);
+		de1 = (struct pmfs_direntry *)((char *)de + nlen);
+		pmfs_memunlock_block(dir->i_sb, blk_base);
+		de1->de_len = cpu_to_le16(rlen - nlen);
+		de->de_len = cpu_to_le16(nlen);
+		pmfs_memlock_block(dir->i_sb, blk_base);
+		de = de1;
+	} else {
+		/* reuse the free entry in place; only its ino is logged */
+		pmfs_add_logentry(dir->i_sb, trans, &de->ino,
+			sizeof(de->ino), LE_DATA);
+	}
+	pmfs_memunlock_block(dir->i_sb, blk_base);
+	/*de->file_type = 0;*/
+	if (inode) {
+		de->ino = cpu_to_le64(inode->i_ino);
+		/*de->file_type = IF2DT(inode->i_mode); */
+	} else {
+		de->ino = 0;
+	}
+	de->name_len = namelen;
+	memcpy(de->name, name, namelen);
+	pmfs_memlock_block(dir->i_sb, blk_base);
+	pmfs_flush_buffer(de, reclen, false);
+	/*
+	 * XXX shouldn't update any times until successful
+	 * completion of syscall, but too many callers depend
+	 * on this.
+	 */
+	dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+	/*dir->i_version++; */
+
+	/* mirror the timestamps into the persistent directory inode */
+	pmfs_memunlock_inode(dir->i_sb, pidir);
+	pidir->i_mtime = cpu_to_le32(dir->i_mtime.tv_sec);
+	pidir->i_ctime = cpu_to_le32(dir->i_ctime.tv_sec);
+	pmfs_memlock_inode(dir->i_sb, pidir);
+	return 0;
+}
+
+/*
+ * Add a directory entry for @dentry pointing to @inode, inside the
+ * caller's transaction.  Assumes @inode itself has already been logged
+ * for consistency.  Tries every existing directory block first; when
+ * all are full, grows the directory by one block, initializes it as a
+ * single free entry spanning the block, and inserts there.
+ */
+int pmfs_add_entry(pmfs_transaction_t *trans, struct dentry *dentry,
+		struct inode *inode)
+{
+	struct inode *dir = dentry->d_parent->d_inode;
+	struct super_block *sb = dir->i_sb;
+	int retval = -EINVAL;
+	unsigned long block, blocks;
+	struct pmfs_direntry *de;
+	char *blk_base;
+	struct pmfs_inode *pidir;
+
+	if (!dentry->d_name.len)
+		return -EINVAL;
+
+	/* directory inode is modified (timestamps/size): log it up front */
+	pidir = pmfs_get_inode(sb, dir->i_ino);
+	pmfs_add_logentry(sb, trans, pidir, MAX_DATA_PER_LENTRY, LE_DATA);
+
+	blocks = dir->i_size >> sb->s_blocksize_bits;
+	for (block = 0; block < blocks; block++) {
+		blk_base =
+			pmfs_get_block(sb, pmfs_find_data_block(dir, block));
+		if (!blk_base) {
+			retval = -EIO;
+			goto out;
+		}
+		retval = pmfs_add_dirent_to_buf(trans, dentry, inode,
+				NULL, blk_base, pidir);
+		/* success or hard error ends the search; only -ENOSPC
+		 * means "this block is full, try the next" */
+		if (retval != -ENOSPC)
+			goto out;
+	}
+	/* all blocks full: append one new block at index 'blocks' */
+	retval = pmfs_alloc_blocks(trans, dir, blocks, 1, false);
+	if (retval)
+		goto out;
+
+	dir->i_size += dir->i_sb->s_blocksize;
+	pmfs_update_isize(dir, pidir);
+
+	blk_base = pmfs_get_block(sb, pmfs_find_data_block(dir, blocks));
+	if (!blk_base) {
+		retval = -ENOSPC;
+		goto out;
+	}
+	/* No need to log the changes to this de because its a new block */
+	de = (struct pmfs_direntry *)blk_base;
+	pmfs_memunlock_block(sb, blk_base);
+	de->ino = 0;
+	de->de_len = cpu_to_le16(sb->s_blocksize);
+	pmfs_memlock_block(sb, blk_base);
+	/* Since this is a new block, no need to log changes to this block */
+	retval = pmfs_add_dirent_to_buf(NULL, dentry, inode, de, blk_base,
+			pidir);
+out:
+	return retval;
+}
+
+/*
+ * Remove the directory entry named by @de that points to @inode,
+ * inside the caller's transaction.  Assumes @inode has already been
+ * logged for consistency.  The entry is deleted either by merging its
+ * record length into the previous entry (when one exists in the same
+ * block) or by zeroing its ino.
+ */
+int pmfs_remove_entry(pmfs_transaction_t *trans, struct dentry *de,
+		struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	struct inode *dir = de->d_parent->d_inode;
+	struct pmfs_inode *pidir;
+	struct qstr *entry = &de->d_name;
+	struct pmfs_direntry *res_entry, *prev_entry;
+	int retval = -EINVAL;
+	unsigned long blocks, block;
+	char *blk_base = NULL;
+
+	if (!de->d_name.len)
+		return -EINVAL;
+
+	blocks = dir->i_size >> sb->s_blocksize_bits;
+
+	/* linear search of every directory block for the name */
+	for (block = 0; block < blocks; block++) {
+		blk_base =
+			pmfs_get_block(sb, pmfs_find_data_block(dir, block));
+		if (!blk_base)
+			goto out;
+		if (pmfs_search_dirblock(blk_base, dir, entry,
+					  block << sb->s_blocksize_bits,
+					  &res_entry, &prev_entry) == 1)
+			break;
+	}
+
+	if (block == blocks)
+		goto out;	/* not found */
+	if (prev_entry) {
+		/* absorb the removed record into its predecessor */
+		pmfs_add_logentry(sb, trans, &prev_entry->de_len,
+				sizeof(prev_entry->de_len), LE_DATA);
+		pmfs_memunlock_block(sb, blk_base);
+		prev_entry->de_len =
+			cpu_to_le16(le16_to_cpu(prev_entry->de_len) +
+				    le16_to_cpu(res_entry->de_len));
+		pmfs_memlock_block(sb, blk_base);
+	} else {
+		/* first entry of the block: just mark it free */
+		pmfs_add_logentry(sb, trans, &res_entry->ino,
+				sizeof(res_entry->ino), LE_DATA);
+		pmfs_memunlock_block(sb, blk_base);
+		res_entry->ino = 0;
+		pmfs_memlock_block(sb, blk_base);
+	}
+	/*dir->i_version++; */
+	dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
+
+	/* mirror timestamps into the persistent directory inode */
+	pidir = pmfs_get_inode(sb, dir->i_ino);
+	pmfs_add_logentry(sb, trans, pidir, MAX_DATA_PER_LENTRY, LE_DATA);
+
+	pmfs_memunlock_inode(sb, pidir);
+	pidir->i_mtime = cpu_to_le32(dir->i_mtime.tv_sec);
+	pidir->i_ctime = cpu_to_le32(dir->i_ctime.tv_sec);
+	pmfs_memlock_inode(sb, pidir);
+	retval = 0;
+out:
+	return retval;
+}
+
+/*
+ * readdir: emit directory entries starting at filp->f_pos via
+ * @filldir.  Sparse directory blocks (holes) are skipped a block at a
+ * time.
+ *
+ * Fix: after skipping a hole, f_pos is advanced to the next block
+ * boundary, so the intra-block offset must be reset to 0.  The old
+ * code carried the stale offset into the next block, skipping its
+ * leading entries and mis-advancing f_pos across consecutive holes.
+ * (An "#if 0" f_version resync block that referenced an undeclared
+ * variable was also dropped.)
+ */
+static int pmfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct super_block *sb = inode->i_sb;
+	struct pmfs_inode *pi;
+	char *blk_base;
+	int ret = 0, stored;
+	int error = 0;
+	unsigned long offset;
+	struct pmfs_direntry *de;
+	ino_t ino;
+
+	stored = 0;
+	offset = filp->f_pos & (sb->s_blocksize - 1);
+	while (!error && !stored && filp->f_pos < inode->i_size) {
+		unsigned long blk = filp->f_pos >> sb->s_blocksize_bits;
+
+		blk_base =
+			pmfs_get_block(sb, pmfs_find_data_block(inode, blk));
+		if (!blk_base) {
+			pmfs_dbg("directory %lu contains a hole at offset %lld\n",
+				inode->i_ino, filp->f_pos);
+			/* advance to the next block boundary ... */
+			filp->f_pos += sb->s_blocksize - offset;
+			/* ... and restart that block from its beginning */
+			offset = 0;
+			continue;
+		}
+		while (!error && filp->f_pos < inode->i_size
+		       && offset < sb->s_blocksize) {
+			de = (struct pmfs_direntry *)(blk_base + offset);
+			if (!pmfs_check_dir_entry("pmfs_readdir", inode, de,
+						   blk_base, offset)) {
+				/* On error, skip the f_pos to the next block. */
+				filp->f_pos = (filp->f_pos | (sb->s_blocksize - 1)) + 1;
+				ret = stored;
+				goto out;
+			}
+			offset += le16_to_cpu(de->de_len);
+			if (de->ino) {
+				ino = le64_to_cpu(de->ino);
+				pi = pmfs_get_inode(sb, ino);
+				error = filldir(dirent, de->name, de->name_len,
+						filp->f_pos, ino,
+						IF2DT(le16_to_cpu(pi->i_mode)));
+				if (error)
+					break;
+				stored++;
+			}
+			filp->f_pos += le16_to_cpu(de->de_len);
+		}
+		offset = 0;
+	}
+out:
+	return ret;
+}
+
+/* file_operations for PMFS directories: reads go through the generic
+ * directory read stub, iteration through pmfs_readdir; fsync is a
+ * no-op since directory updates are journaled in place. */
+const struct file_operations pmfs_dir_operations = {
+	.read		= generic_read_dir,
+	.readdir	= pmfs_readdir,
+	.fsync		= noop_fsync,
+	.unlocked_ioctl = pmfs_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= pmfs_compat_ioctl,
+#endif
+};
diff --git a/fs/pmfs/file.c b/fs/pmfs/file.c
new file mode 100644
index 0000000..e6c3812
--- /dev/null
+++ b/fs/pmfs/file.c
@@ -0,0 +1,333 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * File operations for files.
+ *
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.stornelli at gmail.com>
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/uio.h>
+#include <linux/mm.h>
+#include <linux/uaccess.h>
+#include <linux/falloc.h>
+#include <asm/mman.h>
+#include "pmfs.h"
+#include "xip.h"
+
+/*
+ * A block-size hint may only be set while the file owns no data
+ * blocks: PMFS does not deallocate data blocks until the file is
+ * deleted, so once pi->root is set the hint is frozen.
+ * @new_size is currently unused.
+ */
+static inline int pmfs_can_set_blocksize_hint(struct pmfs_inode *pi,
+		loff_t new_size)
+{
+	return le64_to_cpu(pi->root) == 0;
+}
+
+/*
+ * Record a preferred allocation block type (4K / 2M / 1G) on the
+ * inode, chosen from the anticipated file size.  Silently does nothing
+ * once the file already has data blocks.  Always returns 0.
+ */
+int pmfs_set_blocksize_hint(struct super_block *sb, struct pmfs_inode *pi,
+		loff_t new_size)
+{
+	unsigned short hint;
+
+	if (!pmfs_can_set_blocksize_hint(pi, new_size))
+		return 0;
+
+	if (new_size >= 0x40000000)		/* 1G */
+		hint = PMFS_BLOCK_TYPE_1G;
+	else if (new_size >= 0x200000)		/* 2M */
+		hint = PMFS_BLOCK_TYPE_2M;
+	else					/* defaulting to 4K */
+		hint = PMFS_BLOCK_TYPE_4K;
+
+	pmfs_dbg_verbose(
+		"Hint: new_size 0x%llx, i_size 0x%llx, root 0x%llx\n",
+		new_size, pi->i_size, le64_to_cpu(pi->root));
+	pmfs_dbg_verbose("Setting the hint to 0x%x\n", hint);
+
+	/* inode area is write-protected; open a write window */
+	pmfs_memunlock_inode(sb, pi);
+	pi->i_blk_type = hint;
+	pmfs_memlock_inode(sb, pi);
+	return 0;
+}
+
+/*
+ * fallocate: preallocate blocks for [offset, offset+len) under a
+ * journal transaction.  Only FALLOC_FL_KEEP_SIZE is supported beyond
+ * the default mode.  Also takes the opportunity to set the inode's
+ * block-size hint from the anticipated size.
+ */
+static long pmfs_fallocate(struct file *file, int mode, loff_t offset,
+			    loff_t len)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct super_block *sb = inode->i_sb;
+	long ret = 0;
+	unsigned long blocknr, blockoff;
+	int num_blocks, blocksize_mask;
+	struct pmfs_inode *pi;
+	pmfs_transaction_t *trans;
+	loff_t new_size;
+
+	/* We only support the FALLOC_FL_KEEP_SIZE mode */
+	if (mode & ~FALLOC_FL_KEEP_SIZE)
+		return -EOPNOTSUPP;
+
+	if (S_ISDIR(inode->i_mode))
+		return -ENODEV;
+
+	mutex_lock(&inode->i_mutex);
+
+	new_size = len + offset;
+	if (!(mode & FALLOC_FL_KEEP_SIZE) && new_size > inode->i_size) {
+		ret = inode_newsize_ok(inode, new_size);
+		if (ret)
+			goto out;
+	}
+
+	pi = pmfs_get_inode(sb, inode->i_ino);
+	if (!pi) {
+		ret = -EACCES;
+		goto out;
+	}
+	trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES +
+			MAX_METABLOCK_LENTRIES);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out;
+	}
+	pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA);
+
+	/* Set the block size hint */
+	pmfs_set_blocksize_hint(sb, pi, new_size);
+
+	/* block range covering [offset, offset+len), inclusive of partials */
+	blocksize_mask = sb->s_blocksize - 1;
+	blocknr = offset >> sb->s_blocksize_bits;
+	blockoff = offset & blocksize_mask;
+	num_blocks = (blockoff + len + blocksize_mask) >> sb->s_blocksize_bits;
+	ret = pmfs_alloc_blocks(trans, inode, blocknr, num_blocks, true);
+
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
+
+	pmfs_memunlock_inode(sb, pi);
+	/* blocks may now extend past i_size: flag them as EOF blocks.
+	 * NOTE(review): on allocation failure the transaction is still
+	 * committed below rather than aborted -- confirm intentional. */
+	if (ret || (mode & FALLOC_FL_KEEP_SIZE)) {
+		pi->i_flags |= cpu_to_le32(PMFS_EOFBLOCKS_FL);
+	}
+
+	if (!(mode & FALLOC_FL_KEEP_SIZE) && new_size > inode->i_size) {
+		inode->i_size = new_size;
+		pi->i_size = cpu_to_le64(inode->i_size);
+	}
+	pi->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
+	pi->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
+	pmfs_memlock_inode(sb, pi);
+
+	pmfs_commit_transaction(sb, trans);
+
+out:
+	mutex_unlock(&inode->i_mutex);
+	return ret;
+}
+
+/*
+ * llseek with SEEK_DATA/SEEK_HOLE support.  Anything else is handled
+ * by generic_file_llseek(); for the two region seeks, pmfs_find_region()
+ * adjusts @offset to the next data (hole=0) or hole (hole=1) position
+ * under i_mutex, and the result is range-checked before being applied.
+ */
+loff_t pmfs_llseek(struct file *file, loff_t offset, int origin)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	int rc;
+
+	if (origin != SEEK_DATA && origin != SEEK_HOLE)
+		return generic_file_llseek(file, offset, origin);
+
+	mutex_lock(&inode->i_mutex);
+
+	/* hole flag: 0 = seek to data, 1 = seek to hole */
+	rc = pmfs_find_region(inode, &offset, origin == SEEK_HOLE);
+	if (rc) {
+		mutex_unlock(&inode->i_mutex);
+		return rc;
+	}
+
+	if ((offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) ||
+	    offset > inode->i_sb->s_maxbytes) {
+		mutex_unlock(&inode->i_mutex);
+		return -EINVAL;
+	}
+
+	if (offset != file->f_pos) {
+		file->f_pos = offset;
+		file->f_version = 0;
+	}
+
+	mutex_unlock(&inode->i_mutex);
+	return offset;
+}
+
+/* This function is called by both msync() and fsync().
+ * TODO: Check if we can avoid calling pmfs_flush_buffer() for fsync. We use
+ * movnti to write data to files, so we may want to avoid doing unnecessary
+ * pmfs_flush_buffer() on fsync() */
+/*
+ * Flush the cachelines backing [start, end] (end inclusive) of the
+ * XIP mapping, then issue a persistence barrier.  Holes are skipped.
+ * Returns 0, or -ENODATA for an empty/inverted range.
+ */
+int pmfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+{
+	/* Sync from start to end[inclusive] */
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	loff_t isize;
+	int error;
+
+	end += 1; /* end is inclusive. We like our indices normal please ! */
+
+	isize = i_size_read(inode);
+
+	/* NOTE(review): these unsigned long casts truncate 64-bit loff_t
+	 * values on 32-bit kernels -- confirm PMFS is 64-bit only. */
+	if ((unsigned long)end > (unsigned long)isize)
+		end = isize;
+	if (!isize || (start >= end))
+	{
+		pmfs_dbg_verbose("[%s:%d] : (ERR) isize(%llx), start(%llx),"
+			" end(%llx)\n", __func__, __LINE__, isize, start, end);
+		return -ENODATA;
+	}
+
+	/* Align start and end to cacheline boundaries */
+	start = start & CACHELINE_MASK;
+	end = CACHELINE_ALIGN(end);
+	do {
+		void *xip_mem;
+		pgoff_t pgoff;
+		loff_t offset;
+		unsigned long xip_pfn, nr_flush_bytes;
+
+		pgoff = start >> PAGE_CACHE_SHIFT;
+		offset = start & ~PAGE_CACHE_MASK;
+
+		/* flush at most up to the end of this page */
+		nr_flush_bytes = PAGE_CACHE_SIZE - offset;
+		if (nr_flush_bytes > (end - start))
+			nr_flush_bytes = end - start;
+
+		error = mapping->a_ops->get_xip_mem(mapping, pgoff, 0,
+			&xip_mem, &xip_pfn);
+
+		if (unlikely(error)) {
+			/* sparse files could have such holes */
+			pmfs_dbg_verbose("[%s:%d] : start(%llx), end(%llx),"
+			" pgoff(%lx)\n", __func__, __LINE__, start, end, pgoff);
+		} else {
+			/* flush the range */
+			pmfs_flush_buffer(xip_mem+offset, nr_flush_bytes, 0);
+		}
+
+		start += nr_flush_bytes;
+	} while (start < end);
+
+	/* make all the flushed lines durable before returning */
+	PERSISTENT_MARK();
+	PERSISTENT_BARRIER();
+	return 0;
+}
+
+/* This callback is called when a file is closed */
+/*
+ * If the file was opened for writing, force previously-written data to
+ * persistence before the close returns.
+ * TODO: Should we be more smart to check if the file was modified?
+ */
+static int pmfs_flush(struct file *file, fl_owner_t id)
+{
+	int ret = 0;
+
+	/*
+	 * Fix: FMODE_WRITE lives in f_mode, not f_flags.  Testing
+	 * f_flags & FMODE_WRITE (0x2) matched only O_RDWR opens and
+	 * missed O_WRONLY, skipping the persistence barrier for
+	 * write-only files.
+	 */
+	if (file->f_mode & FMODE_WRITE) {
+		PERSISTENT_MARK();
+		PERSISTENT_BARRIER();
+	}
+
+	return ret;
+}
+
+extern unsigned long arch_get_unmapped_area_sz(struct file *file,
+ unsigned long addr0, unsigned long len, unsigned long align_size,
+ unsigned long pgoff, unsigned long flags);
+
+/*
+ * Choose an mmap address aligned to the file's block size (4K/2M/1G)
+ * so large-page XIP mappings stay possible; the actual search is
+ * delegated to arch_get_unmapped_area_sz().  MAP_FIXED requests must
+ * already be suitably aligned.  (A dead "#if 0" bottomup/topdown
+ * dispatch that followed the unconditional return was removed.)
+ */
+unsigned long
+pmfs_get_unmapped_area(struct file *file, unsigned long addr,
+			unsigned long len, unsigned long pgoff,
+			unsigned long flags)
+{
+	unsigned long align_size;
+	struct vm_area_struct *vma;
+	struct mm_struct *mm = current->mm;
+	struct inode *inode = file->f_mapping->host;
+	struct pmfs_inode *pi = pmfs_get_inode(inode->i_sb, inode->i_ino);
+
+	if (len > TASK_SIZE)
+		return -ENOMEM;
+
+	/* alignment follows the inode's allocation block type */
+	if (pi->i_blk_type == PMFS_BLOCK_TYPE_1G)
+		align_size = PUD_SIZE;
+	else if (pi->i_blk_type == PMFS_BLOCK_TYPE_2M)
+		align_size = PMD_SIZE;
+	else
+		align_size = PAGE_SIZE;
+
+	if (flags & MAP_FIXED) {
+		/* FIXME: We could use 4K mappings as fallback. */
+		if (len & (align_size - 1))
+			return -EINVAL;
+		if (addr & (align_size - 1))
+			return -EINVAL;
+		return addr;
+	}
+
+	/* honor the caller's hint when the aligned range is free */
+	if (addr) {
+		addr = ALIGN(addr, align_size);
+		vma = find_vma(mm, addr);
+		if (TASK_SIZE - len >= addr &&
+		    (!vma || addr + len <= vma->vm_start))
+			return addr;
+	}
+
+	return arch_get_unmapped_area_sz(file, addr, len, align_size, pgoff,
+					flags);
+}
+
+/* file_operations for regular PMFS files: reads/writes/mmaps go
+ * through the XIP (execute-in-place) paths directly on persistent
+ * memory; mmap addresses are picked by pmfs_get_unmapped_area to
+ * preserve large-page alignment. */
+const struct file_operations pmfs_xip_file_operations = {
+	.llseek			= pmfs_llseek,
+	.read			= pmfs_xip_file_read,
+	.write			= pmfs_xip_file_write,
+	.mmap			= pmfs_xip_file_mmap,
+	.open			= generic_file_open,
+	.fsync			= pmfs_fsync,
+	.flush			= pmfs_flush,
+	.get_unmapped_area	= pmfs_get_unmapped_area,
+	.unlocked_ioctl		= pmfs_ioctl,
+	.fallocate		= pmfs_fallocate,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl		= pmfs_compat_ioctl,
+#endif
+};
+
+/* inode_operations for regular PMFS files; no ACL support. */
+const struct inode_operations pmfs_file_inode_operations = {
+	.setattr	= pmfs_notify_change,
+	.getattr	= pmfs_getattr,
+	.get_acl	= NULL,
+};
diff --git a/fs/pmfs/inode.c b/fs/pmfs/inode.c
new file mode 100644
index 0000000..08bdc87
--- /dev/null
+++ b/fs/pmfs/inode.c
@@ -0,0 +1,1568 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Inode methods (allocate/free/read/write).
+ *
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.stornelli at gmail.com>
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/highuid.h>
+#include <linux/module.h>
+#include <linux/mpage.h>
+#include <linux/backing-dev.h>
+#include <linux/types.h>
+#include "pmfs.h"
+#include "xip.h"
+
+/*
+ * PMFS does no readahead and no writeback accounting: file data lives in
+ * byte-addressable persistent memory, not the page cache.
+ */
+struct backing_dev_info pmfs_backing_dev_info __read_mostly = {
+	.ra_pages       = 0, /* No readahead */
+	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
+};
+
+/* Shift and byte-size of each data-block type, indexed by
+ * PMFS_BLOCK_TYPE_{4K,2M,1G} (pi->i_blk_type). */
+unsigned int blk_type_to_shift[PMFS_BLOCK_TYPE_MAX] = {12, 21, 30};
+uint32_t blk_type_to_size[PMFS_BLOCK_TYPE_MAX] = {0x1000, 0x200000, 0x40000000};
+
+/*
+ * allocate a data block for inode and return it's absolute blocknr.
+ * Zeroes out the block if zero set. Increments inode->i_blocks.
+ *
+ * On success pi->i_blocks grows by the number of 4K filesystem blocks
+ * the new (possibly 2M/1G) data block spans.  The inode must be
+ * write-unlocked around the update (pmfs_memunlock_inode).
+ */
+static int pmfs_new_data_block(struct super_block *sb, struct pmfs_inode *pi,
+		unsigned long *blocknr, int zero)
+{
+	unsigned int data_bits = blk_type_to_shift[pi->i_blk_type];
+
+	int errval = pmfs_new_block(sb, blocknr, pi->i_blk_type, zero);
+
+	if (!errval) {
+		/* account in units of the 4K base block size */
+		pmfs_memunlock_inode(sb, pi);
+		le64_add_cpu(&pi->i_blocks,
+			(1 << (data_bits - sb->s_blocksize_bits)));
+		pmfs_memlock_inode(sb, pi);
+	}
+
+	return errval;
+}
+
+/*
+ * find the offset to the block represented by the given inode's file
+ * relative block number.
+ *
+ * @file_blocknr is in units of the 4K base block size; it is converted
+ * to the inode's actual block size before the b-tree walk.  Returns the
+ * byte offset (within the PMFS device) of the 4K block, or 0 if the
+ * block is a hole / out of range.
+ */
+u64 pmfs_find_data_block(struct inode *inode, unsigned long file_blocknr)
+{
+	struct super_block *sb = inode->i_sb;
+	struct pmfs_inode *pi = pmfs_get_inode(sb, inode->i_ino);
+	u32 blk_shift;
+	unsigned long blk_offset, blocknr = file_blocknr;
+	unsigned int data_bits = blk_type_to_shift[pi->i_blk_type];
+	unsigned int meta_bits = META_BLK_SHIFT;
+	u64 bp;
+
+	/* convert the 4K blocks into the actual blocks the inode is using */
+	blk_shift = data_bits - sb->s_blocksize_bits;
+	blk_offset = file_blocknr & ((1 << blk_shift) - 1);
+	blocknr = file_blocknr >> blk_shift;
+
+	/* beyond the reach of a tree of this height => hole */
+	if (blocknr >= (1UL << (pi->height * meta_bits)))
+		return 0;
+
+	bp = __pmfs_find_data_block(sb, pi, blocknr);
+	pmfs_dbg1("find_data_block %lx, %x %llx blk_p %p blk_shift %x"
+		" blk_offset %lx\n", file_blocknr, pi->height, bp,
+		pmfs_get_block(sb, bp), blk_shift, blk_offset);
+
+	if (bp == 0)
+		return 0;
+	/* re-add the sub-block offset in 4K units */
+	return bp + (blk_offset << sb->s_blocksize_bits);
+}
+
+/* recursive_find_region: recursively search the btree to find hole or data
+ * in the specified range
+ * Input:
+ * block: points to the root of the b-tree
+ * height: height of the btree
+ * first_blocknr: first block in the specified range
+ * last_blocknr: last_blocknr in the specified range
+ * @data_found: indicates whether data blocks were found
+ * @hole_found: indicates whether a hole was found
+ * hole: whether we are looking for a hole or data
+ *
+ * Returns the number of data blocks scanned before the target (data or
+ * hole) was reached.
+ *
+ * NOTE(review): return type is int while the internal counter is
+ * unsigned long; for very large files the count could be truncated —
+ * confirm against maximum tree height.  Also @block is declared
+ * unsigned long here but u64 in the sibling recursive_truncate_blocks;
+ * identical on 64-bit, but worth unifying.
+ */
+static int recursive_find_region(struct super_block *sb, unsigned long block,
+	u32 height, unsigned long first_blocknr, unsigned long last_blocknr,
+	int *data_found, int *hole_found, int hole)
+{
+	unsigned int meta_bits = META_BLK_SHIFT;
+	u64 *node;
+	unsigned long first_blk, last_blk, node_bits, blocks = 0;
+	unsigned int first_index, last_index, i;
+
+	node_bits = (height - 1) * meta_bits;
+
+	first_index = first_blocknr >> node_bits;
+	last_index = last_blocknr >> node_bits;
+
+	node = pmfs_get_block(sb, le64_to_cpu(block));
+
+	for (i = first_index; i <= last_index; i++) {
+		if (height == 1 || node[i] == 0) {
+			if (node[i]) {
+				/* leaf entry with data */
+				*data_found = 1;
+				if (!hole)
+					goto done;
+			} else {
+				/* NULL entry at any level is a hole */
+				*hole_found = 1;
+			}
+
+			/* keep counting blocks until the first hole when
+			 * searching for a hole */
+			if (!*hole_found || !hole)
+				blocks += (1UL << node_bits);
+		} else {
+			/* clamp the child range at the edges of the span */
+			first_blk = (i == first_index) ? (first_blocknr &
+				((1 << node_bits) - 1)) : 0;
+
+			last_blk = (i == last_index) ? (last_blocknr &
+				((1 << node_bits) - 1)) : (1 << node_bits) - 1;
+
+			blocks += recursive_find_region(sb, node[i], height - 1,
+				first_blk, last_blk, data_found, hole_found,
+				hole);
+			if (!hole && *data_found)
+				goto done;
+			/* cond_resched(); */
+		}
+	}
+done:
+	return blocks;
+}
+
+/*
+ * find the file offset for SEEK_DATA/SEEK_HOLE
+ *
+ * On success *offset is advanced to the start of the next data region
+ * (hole == 0) or the next hole (hole != 0) and 0 is returned; -ENXIO is
+ * returned when no such region exists before EOF.
+ */
+unsigned long pmfs_find_region(struct inode *inode, loff_t *offset, int hole)
+{
+	struct super_block *sb = inode->i_sb;
+	struct pmfs_inode *pi = pmfs_get_inode(sb, inode->i_ino);
+	unsigned int data_bits = blk_type_to_shift[pi->i_blk_type];
+	unsigned long first_blocknr, last_blocknr;
+	unsigned long blocks = 0, offset_in_block;
+	int data_found = 0, hole_found = 0;
+
+	if (*offset >= inode->i_size)
+		return -ENXIO;
+
+	/* no blocks at all: the whole file is one hole */
+	if (!inode->i_blocks || !pi->root) {
+		if (hole)
+			return inode->i_size;
+		else
+			return -ENXIO;
+	}
+
+	offset_in_block = *offset & ((1UL << data_bits) - 1);
+
+	/* height 0 means the root itself is the single data block */
+	if (pi->height == 0) {
+		data_found = 1;
+		goto out;
+	}
+
+	first_blocknr = *offset >> data_bits;
+	last_blocknr = inode->i_size >> data_bits;
+
+	pmfs_dbg_verbose("find_region offset %llx, first_blocknr %lx,"
+		" last_blocknr %lx hole %d\n",
+		  *offset, first_blocknr, last_blocknr, hole);
+
+	blocks = recursive_find_region(inode->i_sb, pi->root, pi->height,
+		first_blocknr, last_blocknr, &data_found, &hole_found, hole);
+
+out:
+	/* Searching data but only hole found till the end */
+	if (!hole && !data_found && hole_found)
+		return -ENXIO;
+
+	if (data_found && !hole_found) {
+		/* Only data in the range.  If searching for data, we are
+		 * already inside it; if searching for a hole, the first
+		 * hole is the implicit one at EOF. */
+		if (hole)
+			*offset = inode->i_size;
+		return 0;
+	}
+
+	/* Searching for hole, hole found and starting inside an hole */
+	if (hole && hole_found && !blocks) {
+		/* no data after the hole: report the last (EOF) hole */
+		if (!data_found)
+			*offset = inode->i_size;
+		return 0;
+	}
+
+	if (offset_in_block) {
+		/* the first (partial) block was already counted; step to
+		 * the next block boundary instead */
+		blocks--;
+		*offset += (blocks << data_bits) +
+			((1 << data_bits) - offset_in_block);
+	} else {
+		*offset += blocks << data_bits;
+	}
+
+	return 0;
+}
+
+/* examine the meta-data block node outside the [start_idx, end_idx]
+ * range for any non-null pointers. if found return false, else return
+ * true.  (Entries inside the range are known-freed by the caller.)
+ * required to determine if a meta-data block contains no pointers and
+ * hence can be freed.
+ */
+static inline bool is_empty_meta_block(u64 *node, unsigned int start_idx,
+	unsigned int end_idx)
+{
+	int i, last_idx = (1 << META_BLK_SHIFT) - 1;
+	for (i = 0; i < start_idx; i++)
+		if (unlikely(node[i]))
+			return false;
+	for (i = end_idx + 1; i <= last_idx; i++)
+		if (unlikely(node[i]))
+			return false;
+	return true;
+}
+
+/* recursive_truncate_blocks: recursively deallocate a range of blocks from
+ * first_blocknr to last_blocknr in the inode's btree.
+ * Input:
+ * block: points to the root of the (sub-)tree whose blocks are freed
+ * height: height of the btree
+ * btype: block type (4K/2M/1G) of the data blocks at the leaves
+ * first_blocknr: first block in the specified range
+ * last_blocknr: last_blocknr in the specified range
+ * meta_empty: set true if this meta-data block became empty and can
+ *             itself be freed by the caller
+ * Returns the number of data blocks freed.
+ */
+static int recursive_truncate_blocks(struct super_block *sb, u64 block,
+	u32 height, u32 btype, unsigned long first_blocknr,
+	unsigned long last_blocknr, bool *meta_empty)
+{
+	unsigned long blocknr, first_blk, last_blk;
+	unsigned int node_bits, first_index, last_index, i;
+	u64 *node;
+	unsigned int freed = 0, bzero;
+	int start, end;
+	bool mpty, all_range_freed = true;
+
+	node = pmfs_get_block(sb, le64_to_cpu(block));
+
+	node_bits = (height - 1) * META_BLK_SHIFT;
+
+	start = first_index = first_blocknr >> node_bits;
+	end = last_index = last_blocknr >> node_bits;
+
+	if (height == 1) {
+		/* leaf level: entries point directly at data blocks */
+		for (i = first_index; i <= last_index; i++) {
+			if (unlikely(!node[i]))
+				continue;
+			/* Freeing the data block */
+			blocknr = pmfs_get_blocknr(sb, le64_to_cpu(node[i]),
+				btype);
+			pmfs_free_block(sb, blocknr, btype);
+			freed++;
+		}
+	} else {
+		for (i = first_index; i <= last_index; i++) {
+			if (unlikely(!node[i]))
+				continue;
+			/* clamp the child range at the edges of the span */
+			first_blk = (i == first_index) ? (first_blocknr &
+				((1 << node_bits) - 1)) : 0;
+
+			last_blk = (i == last_index) ? (last_blocknr &
+				((1 << node_bits) - 1)) : (1 << node_bits) - 1;
+
+			freed += recursive_truncate_blocks(sb, node[i],
+				height - 1, btype, first_blk, last_blk, &mpty);
+			/* cond_resched(); */
+			if (mpty) {
+				/* Freeing the meta-data block */
+				blocknr = pmfs_get_blocknr(sb, le64_to_cpu(
+					node[i]), PMFS_BLOCK_TYPE_4K);
+				pmfs_free_block(sb, blocknr, PMFS_BLOCK_TYPE_4K);
+			} else {
+				/* shrink the zero-out window to the fully
+				 * freed entries only */
+				if (i == first_index)
+					start++;
+				else if (i == last_index)
+					end--;
+				all_range_freed = false;
+			}
+		}
+	}
+	if (all_range_freed &&
+		is_empty_meta_block(node, first_index, last_index)) {
+		*meta_empty = true;
+	} else {
+		/* Zero-out the freed range if the meta-block is not empty */
+		if (start <= end) {
+			bzero = (end - start + 1) * sizeof(u64);
+			pmfs_memunlock_block(sb, node);
+			memset(&node[start], 0, bzero);
+			pmfs_memlock_block(sb, node);
+			pmfs_flush_buffer(&node[start], bzero, false);
+		}
+		*meta_empty = false;
+	}
+	return freed;
+}
+
+/*
+ * Free an inode's entire block b-tree: all data blocks covering bytes
+ * [0, end) plus every meta-data block, including the root.  Returns the
+ * number of data blocks freed.
+ */
+unsigned int pmfs_free_inode_subtree(struct super_block *sb,
+		u64 root, u32 height, u32 btype, loff_t end)
+{
+	unsigned long first_blocknr, last_blocknr;
+	unsigned int freed;
+	unsigned int data_bits = blk_type_to_shift[btype];
+	bool mpty;
+
+	if (!root)
+		return 0;
+
+	if (height == 0) {
+		/* the root itself is the lone data block */
+		first_blocknr = pmfs_get_blocknr(sb, le64_to_cpu(root),
+			btype);
+		pmfs_free_block(sb, first_blocknr, btype);
+		freed = 1;
+	} else {
+		first_blocknr = 0;
+		last_blocknr = (end - 1) >> data_bits;
+
+		freed = recursive_truncate_blocks(sb, root, height, btype,
+			first_blocknr, last_blocknr, &mpty);
+		/* freeing the full range must empty the root node */
+		BUG_ON(!mpty);
+		first_blocknr = pmfs_get_blocknr(sb, le64_to_cpu(root),
+			PMFS_BLOCK_TYPE_4K);
+		pmfs_free_block(sb, first_blocknr, PMFS_BLOCK_TYPE_4K);
+	}
+	return freed;
+}
+
+/*
+ * Shrink the inode's b-tree to the minimum height that can address
+ * @newsize bytes, freeing the now-superfluous top-level meta blocks,
+ * then atomically publish the new (root, height) pair in the pmfs inode.
+ */
+static void pmfs_decrease_btree_height(struct super_block *sb,
+	struct pmfs_inode *pi, unsigned long newsize, u64 newroot)
+{
+	unsigned int height = pi->height, new_height = 0;
+	unsigned long blocknr, last_blocknr;
+	u64 *root;
+	char b[8];
+
+	if (pi->i_blocks == 0 || newsize == 0) {
+		/* root must be NULL */
+		BUG_ON(newroot != 0);
+		goto update_root_and_height;
+	}
+
+	/* minimum height needed to address the last remaining block */
+	last_blocknr = ((newsize + pmfs_inode_blk_size(pi) - 1) >>
+		pmfs_inode_blk_shift(pi)) - 1;
+	while (last_blocknr > 0) {
+		last_blocknr = last_blocknr >> META_BLK_SHIFT;
+		new_height++;
+	}
+	if (height == new_height)
+		return;
+	pmfs_dbg_verbose("reducing tree height %x->%x\n", height, new_height);
+	while (height > new_height) {
+		/* freeing the meta block; entry 0 becomes the new root */
+		root = pmfs_get_block(sb, le64_to_cpu(newroot));
+		blocknr = pmfs_get_blocknr(sb, le64_to_cpu(newroot),
+			PMFS_BLOCK_TYPE_4K);
+		newroot = root[0];
+		pmfs_free_block(sb, blocknr, PMFS_BLOCK_TYPE_4K);
+		height--;
+	}
+update_root_and_height:
+	/* pi->height and pi->root need to be atomically updated. use
+	 * cmpxchg16 here. The following is dependent on a specific layout of
+	 * inode fields */
+	*(u64 *)b = *(u64 *)pi;
+	/* pi->height is at offset 2 from pi */
+	b[2] = (u8)new_height;
+	/* TODO: the following function assumes cmpxchg16b instruction writes
+	 * 16 bytes atomically. Confirm if it is really true.
+	 * NOTE(review): the *(u64 *)pi / char-buffer round-trip relies on
+	 * pi being 16-byte aligned and on type-punning the first 8 bytes
+	 * of the inode — verify alignment of struct pmfs_inode. */
+	cmpxchg_double_local((u64 *)pi, &pi->root, *(u64 *)pi, pi->root,
+		*(u64 *)b, newroot);
+	return;
+}
+
+/*
+ * Walk a (sub-)tree and count the blocks it references: every leaf
+ * contributes 1, interior meta blocks are not counted.  @block is a
+ * CPU-order block offset; child entries are little-endian on media.
+ */
+static unsigned long pmfs_inode_count_iblocks_recursive(struct super_block *sb,
+		unsigned long block, u32 height)
+{
+	u64 *node;
+	unsigned int i;
+	unsigned long i_blocks = 0;
+
+	if (height == 0)
+		return 1;
+	node = pmfs_get_block(sb, block);
+	for (i = 0; i < (1 << META_BLK_SHIFT); i++) {
+		if (node[i] == 0)
+			continue;
+		i_blocks += pmfs_inode_count_iblocks_recursive(sb,
+			le64_to_cpu(node[i]), height - 1);
+	}
+	return i_blocks;
+}
+
+/*
+ * Recount the inode's i_blocks by rescanning the b-tree, returning the
+ * total expressed in 4K base-blocksize units.  Used after a crash, when
+ * the recorded i_blocks cannot be trusted.
+ */
+static inline unsigned long pmfs_inode_count_iblocks(struct super_block *sb,
+	struct pmfs_inode *pi, u64 root)
+{
+	unsigned long iblocks;
+	if (root == 0)
+		return 0;
+	iblocks = pmfs_inode_count_iblocks_recursive(sb, le64_to_cpu(root),
+						pi->height);
+	/* convert inode-blocksize units to 4K units */
+	return (iblocks << (pmfs_inode_blk_shift(pi) - sb->s_blocksize_bits));
+}
+
+/*
+ * Free data blocks from inode in the range start <=> end
+ *
+ * Frees the blocks backing byte offsets [start, end), updates
+ * i_blocks/mtime/ctime in both the VFS and pmfs inode, shrinks the
+ * b-tree height if possible and flushes the modified inode cacheline.
+ */
+static void __pmfs_truncate_blocks(struct inode *inode, loff_t start,
+				    loff_t end)
+{
+	struct super_block *sb = inode->i_sb;
+	struct pmfs_inode *pi = pmfs_get_inode(sb, inode->i_ino);
+	unsigned long first_blocknr, last_blocknr;
+	u64 root;
+	unsigned int freed = 0;
+	unsigned int data_bits = blk_type_to_shift[pi->i_blk_type];
+	unsigned int meta_bits = META_BLK_SHIFT;
+	bool mpty;
+
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
+
+	if (!pi->root)
+		goto end_truncate_blocks;
+
+	pmfs_dbg_verbose("truncate: pi %p iblocks %llx %llx %llx %x %llx\n", pi,
+			 pi->i_blocks, start, end, pi->height, pi->i_size);
+
+	/* round start up: a partially covered first block is kept */
+	first_blocknr = (start + (1UL << data_bits) - 1) >> data_bits;
+
+	if (pi->i_flags & cpu_to_le32(PMFS_EOFBLOCKS_FL)) {
+		/* blocks may exist beyond i_size (failed alloc recovery):
+		 * truncate everything the tree could address */
+		last_blocknr = (1UL << (pi->height * meta_bits)) - 1;
+	} else {
+		if (end == 0)
+			goto end_truncate_blocks;
+		last_blocknr = (end - 1) >> data_bits;
+	}
+
+	if (first_blocknr > last_blocknr)
+		goto end_truncate_blocks;
+	root = pi->root;
+
+	if (pi->height == 0) {
+		/* root is the single data block */
+		first_blocknr = pmfs_get_blocknr(sb, le64_to_cpu(root),
+			pi->i_blk_type);
+		pmfs_free_block(sb, first_blocknr, pi->i_blk_type);
+		root = 0;
+		freed = 1;
+	} else {
+		freed = recursive_truncate_blocks(sb, root, pi->height,
+			pi->i_blk_type, first_blocknr, last_blocknr, &mpty);
+		if (mpty) {
+			first_blocknr = pmfs_get_blocknr(sb, le64_to_cpu(root),
+				PMFS_BLOCK_TYPE_4K);
+			pmfs_free_block(sb, first_blocknr, PMFS_BLOCK_TYPE_4K);
+			root = 0;
+		}
+	}
+	/* if we are called during mount, a power/system failure had happened.
+	 * Dont trust inode->i_blocks; recalculate it by rescanning the inode */
+	if (pmfs_is_mounting(sb))
+		inode->i_blocks = pmfs_inode_count_iblocks(sb, pi, root);
+	else
+		inode->i_blocks -= (freed * (1 << (data_bits -
+				sb->s_blocksize_bits)));
+
+	pmfs_memunlock_inode(sb, pi);
+	pi->i_blocks = cpu_to_le64(inode->i_blocks);
+	pi->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
+	pi->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
+	pmfs_decrease_btree_height(sb, pi, start, root);
+	/* Check for the flag EOFBLOCKS is still valid after the set size */
+	check_eof_blocks(sb, pi, inode->i_size);
+	pmfs_memlock_inode(sb, pi);
+	/* now flush the inode's first cacheline which was modified */
+	pmfs_flush_buffer(pi, 1, false);
+	return;
+end_truncate_blocks:
+	/* we still need to update ctime and mtime */
+	pmfs_memunlock_inode(sb, pi);
+	pi->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
+	pi->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
+	pmfs_memlock_inode(sb, pi);
+	pmfs_flush_buffer(pi, 1, false);
+	return;
+}
+
+
+/*
+ * Grow the inode's b-tree to @new_height by stacking new 4K meta blocks
+ * on top of the current root; each new level's entry 0 points at the
+ * previous root.  On partial failure the tree is left at the height
+ * reached so far (still consistent) and the error is returned.
+ */
+static int pmfs_increase_btree_height(struct super_block *sb,
+		struct pmfs_inode *pi, u32 new_height)
+{
+	u32 height = pi->height;
+	u64 *root, prev_root = pi->root;
+	unsigned long blocknr;
+	int errval = 0;
+
+	pmfs_dbg_verbose("increasing tree height %x:%x\n", height, new_height);
+	while (height < new_height) {
+		/* allocate the meta block (zeroed) */
+		errval = pmfs_new_block(sb, &blocknr, PMFS_BLOCK_TYPE_4K, 1);
+		if (errval) {
+			pmfs_err(sb, "failed to increase btree height\n");
+			break;
+		}
+		/* blocknr now holds the block *offset*, not the number */
+		blocknr = pmfs_get_block_off(sb, blocknr, PMFS_BLOCK_TYPE_4K);
+		root = pmfs_get_block(sb, blocknr);
+		pmfs_memunlock_block(sb, root);
+		root[0] = prev_root;	/* already little-endian */
+		pmfs_memlock_block(sb, root);
+		pmfs_flush_buffer(root, sizeof(*root), false);
+		prev_root = cpu_to_le64(blocknr);
+		height++;
+	}
+	pmfs_memunlock_inode(sb, pi);
+	pi->root = prev_root;
+	pi->height = height;
+	pmfs_memlock_inode(sb, pi);
+	return errval;
+}
+
+/* recursive_alloc_blocks: recursively allocate a range of blocks from
+ * first_blocknr to last_blocknr in the inode's btree.
+ * Input:
+ * block: points to the root of the b-tree where the blocks need to be allocated
+ * height: height of the btree
+ * first_blocknr: first block in the specified range
+ * last_blocknr: last_blocknr in the specified range
+ * new_node: true if @block was freshly allocated (its contents need no
+ *           journaling, only flushing)
+ * zero: whether to zero-out the allocated block(s)
+ */
+static int recursive_alloc_blocks(pmfs_transaction_t *trans,
+	struct super_block *sb, struct pmfs_inode *pi, u64 block, u32 height,
+	unsigned long first_blocknr, unsigned long last_blocknr, bool new_node,
+	bool zero)
+{
+	int i, errval;
+	unsigned int meta_bits = META_BLK_SHIFT, node_bits;
+	u64 *node;
+	bool journal_saved = false;
+	unsigned long blocknr, first_blk, last_blk;
+	unsigned int first_index, last_index;
+	unsigned int flush_bytes;
+
+	node = pmfs_get_block(sb, le64_to_cpu(block));
+
+	node_bits = (height - 1) * meta_bits;
+
+	first_index = first_blocknr >> node_bits;
+	last_index = last_blocknr >> node_bits;
+
+	for (i = first_index; i <= last_index; i++) {
+		if (height == 1) {
+			/* leaf level: fill holes with new data blocks */
+			if (node[i] == 0) {
+				errval = pmfs_new_data_block(sb, pi, &blocknr,
+							zero);
+				if (errval) {
+					pmfs_dbg_verbose("alloc data blk failed"
+						" %d\n", errval);
+					/* For later recovery in truncate... */
+					pmfs_memunlock_inode(sb, pi);
+					pi->i_flags |= cpu_to_le32(
+							PMFS_EOFBLOCKS_FL);
+					pmfs_memlock_inode(sb, pi);
+					return errval;
+				}
+				/* save the meta-data into the journal before
+				 * modifying */
+				if (!new_node && !journal_saved) {
+					int le_size = (last_index - i + 1) << 3;
+					pmfs_add_logentry(sb, trans, &node[i],
+						le_size, LE_DATA);
+					journal_saved = true;
+				}
+				pmfs_memunlock_block(sb, node);
+				node[i] = cpu_to_le64(pmfs_get_block_off(sb,
+						blocknr, pi->i_blk_type));
+				pmfs_memlock_block(sb, node);
+			}
+		} else {
+			if (node[i] == 0) {
+				/* allocate the meta block */
+				errval = pmfs_new_block(sb, &blocknr,
+						PMFS_BLOCK_TYPE_4K, 1);
+				if (errval) {
+					pmfs_dbg_verbose("alloc meta blk"
+						" failed\n");
+					goto fail;
+				}
+				/* save the meta-data into the journal before
+				 * modifying */
+				if (!new_node && !journal_saved) {
+					int le_size = (last_index - i + 1) << 3;
+					pmfs_add_logentry(sb, trans, &node[i],
+						le_size, LE_DATA);
+					journal_saved = true;
+				}
+				pmfs_memunlock_block(sb, node);
+				node[i] = cpu_to_le64(pmfs_get_block_off(sb,
+					    blocknr, PMFS_BLOCK_TYPE_4K));
+				pmfs_memlock_block(sb, node);
+				/* children of a fresh node need no journal */
+				new_node = true;
+			}
+
+			first_blk = (i == first_index) ? (first_blocknr &
+				((1 << node_bits) - 1)) : 0;
+
+			last_blk = (i == last_index) ? (last_blocknr &
+				((1 << node_bits) - 1)) : (1 << node_bits) - 1;
+
+			errval = recursive_alloc_blocks(trans, sb, pi, node[i],
+			height - 1, first_blk, last_blk, new_node, zero);
+			if (errval < 0)
+				goto fail;
+		}
+	}
+	if (new_node || trans == NULL) {
+		/* if the changes were not logged, flush the cachelines we may
+		 * have modified */
+		flush_bytes = (last_index - first_index + 1) * sizeof(node[0]);
+		pmfs_flush_buffer(&node[first_index], flush_bytes, false);
+	}
+	errval = 0;
+fail:
+	return errval;
+}
+
+/*
+ * Allocate @num 4K-unit blocks for @pi starting at file block
+ * @file_blocknr, growing the b-tree height first when the range falls
+ * beyond what the current height can address.  Maximum height is 3.
+ * Changes to pre-existing meta blocks are journaled via @trans.
+ */
+int __pmfs_alloc_blocks(pmfs_transaction_t *trans, struct super_block *sb,
+	struct pmfs_inode *pi, unsigned long file_blocknr, unsigned int num,
+	bool zero)
+{
+	int errval;
+	unsigned long max_blocks;
+	unsigned int height;
+	unsigned int data_bits = blk_type_to_shift[pi->i_blk_type];
+	unsigned int blk_shift, meta_bits = META_BLK_SHIFT;
+	unsigned long blocknr, first_blocknr, last_blocknr, total_blocks;
+	/* convert the 4K blocks into the actual blocks the inode is using */
+	blk_shift = data_bits - sb->s_blocksize_bits;
+
+	first_blocknr = file_blocknr >> blk_shift;
+	last_blocknr = (file_blocknr + num - 1) >> blk_shift;
+
+	pmfs_dbg_verbose("alloc_blocks height %d file_blocknr %lx num %x, "
+			"first blocknr 0x%lx, last_blocknr 0x%lx\n",
+			pi->height, file_blocknr, num, first_blocknr, last_blocknr);
+
+	height = pi->height;
+
+	blk_shift = height * meta_bits;
+
+	max_blocks = 0x1UL << blk_shift;
+
+	if (last_blocknr > max_blocks - 1) {
+		/* B-tree height increases as a result of this allocation */
+		total_blocks = last_blocknr >> blk_shift;
+		while (total_blocks > 0) {
+			total_blocks = total_blocks >> meta_bits;
+			height++;
+		}
+		if (height > 3) {
+			pmfs_dbg("[%s:%d] Max file size. Cant grow the file\n",
+				__func__, __LINE__);
+			errval = -ENOSPC;
+			goto fail;
+		}
+	}
+
+	if (!pi->root) {
+		if (height == 0) {
+			/* single block: the root is the data block itself */
+			u64 root;
+			errval = pmfs_new_data_block(sb, pi, &blocknr, zero);
+			if (errval) {
+				pmfs_dbg_verbose("[%s:%d] failed: alloc data"
+					" block\n", __func__, __LINE__);
+				goto fail;
+			}
+			root = cpu_to_le64(pmfs_get_block_off(sb, blocknr,
+					   pi->i_blk_type));
+			/* TODO: use RTM for in-place atomic update */
+			pmfs_memunlock_inode(sb, pi);
+			pi->root = root;
+			pi->height = height;
+			pmfs_memlock_inode(sb, pi);
+		} else {
+			errval = pmfs_increase_btree_height(sb, pi, height);
+			if (errval) {
+				pmfs_dbg_verbose("[%s:%d] failed: inc btree"
+					" height\n", __func__, __LINE__);
+				goto fail;
+			}
+			/* fresh tree: new_node = 1, nothing to journal */
+			errval = recursive_alloc_blocks(trans, sb, pi, pi->root,
+			pi->height, first_blocknr, last_blocknr, 1, zero);
+			if (errval < 0)
+				goto fail;
+		}
+	} else {
+		/* Go forward only if the height of the tree is non-zero. */
+		if (height == 0)
+			return 0;
+
+		if (height > pi->height) {
+			errval = pmfs_increase_btree_height(sb, pi, height);
+			if (errval) {
+				pmfs_dbg_verbose("Err: inc height %x:%x tot %lx"
+					"\n", pi->height, height, total_blocks);
+				goto fail;
+			}
+		}
+		errval = recursive_alloc_blocks(trans, sb, pi, pi->root, height,
+				first_blocknr, last_blocknr, 0, zero);
+		if (errval < 0)
+			goto fail;
+	}
+	return 0;
+fail:
+	return errval;
+}
+
+/*
+ * Allocate num data blocks for inode, starting at given file-relative
+ * block number, then mirror the on-media block count back into the VFS
+ * inode.  Returns 0 on success or a negative errno.
+ */
+inline int pmfs_alloc_blocks(pmfs_transaction_t *trans, struct inode *inode,
+		unsigned long file_blocknr, unsigned int num, bool zero)
+{
+	struct super_block *sb = inode->i_sb;
+	struct pmfs_inode *pi = pmfs_get_inode(sb, inode->i_ino);
+	int err = __pmfs_alloc_blocks(trans, sb, pi, file_blocknr, num, zero);
+
+	/* keep the VFS view of i_blocks in sync with persistent state */
+	inode->i_blocks = le64_to_cpu(pi->i_blocks);
+	return err;
+}
+
+/* Initialize the inode table. The pmfs_inode struct corresponding to the
+ * inode table has already been zero'd out.
+ * Sizes the table from the mount options (or a filesystem-size-based
+ * default), allocates its backing blocks and seeds the free-inode
+ * accounting in the in-memory superblock. */
+int pmfs_init_inode_table(struct super_block *sb)
+{
+	struct pmfs_inode *pi = pmfs_get_inode_table(sb);
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	unsigned long num_blocks = 0, init_inode_table_size;
+	int errval;
+
+	if (sbi->num_inodes == 0) {
+		/* initial inode table size was not specified. */
+		if (sbi->initsize >= PMFS_LARGE_INODE_TABLE_THREASHOLD)
+			init_inode_table_size = PMFS_LARGE_INODE_TABLE_SIZE;
+		else
+			init_inode_table_size = PMFS_DEF_BLOCK_SIZE_4K;
+	} else {
+		init_inode_table_size = sbi->num_inodes << PMFS_INODE_BITS;
+	}
+
+	pmfs_memunlock_inode(sb, pi);
+	pi->i_mode = 0;
+	pi->i_uid = 0;
+	pi->i_gid = 0;
+	pi->i_links_count = cpu_to_le16(1);
+	pi->i_flags = 0;
+	pi->height = 0;
+	pi->i_dtime = 0;
+	/* large tables use 2M blocks to cut b-tree depth */
+	if (init_inode_table_size >= PMFS_LARGE_INODE_TABLE_SIZE)
+		pi->i_blk_type = PMFS_BLOCK_TYPE_2M;
+	else
+		pi->i_blk_type = PMFS_BLOCK_TYPE_4K;
+
+	num_blocks = (init_inode_table_size + pmfs_inode_blk_size(pi) - 1) >>
+		pmfs_inode_blk_shift(pi);
+
+	pi->i_size = cpu_to_le64(num_blocks << pmfs_inode_blk_shift(pi));
+	/* pmfs_sync_inode(pi); */
+	pmfs_memlock_inode(sb, pi);
+
+	sbi->s_inodes_count = num_blocks <<
+		(pmfs_inode_blk_shift(pi) - PMFS_INODE_BITS);
+	/* calculate num_blocks in terms of 4k blocksize */
+	num_blocks = num_blocks << (pmfs_inode_blk_shift(pi) -
+		sb->s_blocksize_bits);
+	errval = __pmfs_alloc_blocks(NULL, sb, pi, 0, num_blocks, true);
+
+	if (errval != 0) {
+		pmfs_err(sb, "Err: initializing the Inode Table: %d\n", errval);
+		return errval;
+	}
+
+	/* inode 0 is considered invalid and hence never used */
+	sbi->s_free_inodes_count =
+		(sbi->s_inodes_count - PMFS_FREE_INODE_HINT_START);
+	sbi->s_free_inode_hint = (PMFS_FREE_INODE_HINT_START);
+
+	return 0;
+}
+
+/*
+ * Populate a fresh VFS inode from its on-media pmfs_inode, wiring up the
+ * per-type inode/file operations.  Returns 0, or -ESTALE for a deleted
+ * inode (the VFS inode is marked bad).
+ */
+static int pmfs_read_inode(struct inode *inode, struct pmfs_inode *pi)
+{
+	int ret = -EIO;
+
+#if 0
+	if (pmfs_calc_checksum((u8 *)pi, PMFS_INODE_SIZE)) {
+		pmfs_err(inode->i_sb, "checksum error in inode %lx\n",
+			  (u64)inode->i_ino);
+		goto bad_inode;
+	}
+#endif
+
+	inode->i_mode = le16_to_cpu(pi->i_mode);
+	/* NOTE(review): raw assignment of uid/gid — i_uid/i_gid are kuid_t/
+	 * kgid_t in 3.9; confirm this builds / handles user namespaces */
+	inode->i_uid = le32_to_cpu(pi->i_uid);
+	inode->i_gid = le32_to_cpu(pi->i_gid);
+	set_nlink(inode, le16_to_cpu(pi->i_links_count));
+	inode->i_size = le64_to_cpu(pi->i_size);
+	inode->i_atime.tv_sec = le32_to_cpu(pi->i_atime);
+	inode->i_ctime.tv_sec = le32_to_cpu(pi->i_ctime);
+	inode->i_mtime.tv_sec = le32_to_cpu(pi->i_mtime);
+	/* media stores second granularity only */
+	inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec =
+		inode->i_ctime.tv_nsec = 0;
+	inode->i_generation = le32_to_cpu(pi->i_generation);
+	pmfs_set_inode_flags(inode, pi);
+
+	/* check if the inode is active. */
+	if (inode->i_nlink == 0 &&
+	   (inode->i_mode == 0 || le32_to_cpu(pi->i_dtime))) {
+		/* this inode is deleted */
+		ret = -ESTALE;
+		goto bad_inode;
+	}
+
+	inode->i_blocks = le64_to_cpu(pi->i_blocks);
+	inode->i_mapping->a_ops = &pmfs_aops_xip;
+	inode->i_mapping->backing_dev_info = &pmfs_backing_dev_info;
+
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFREG:
+		inode->i_op = &pmfs_file_inode_operations;
+		inode->i_fop = &pmfs_xip_file_operations;
+		break;
+	case S_IFDIR:
+		inode->i_op = &pmfs_dir_inode_operations;
+		inode->i_fop = &pmfs_dir_operations;
+		break;
+	case S_IFLNK:
+		inode->i_op = &pmfs_symlink_inode_operations;
+		break;
+	default:
+		/* char/block/fifo/socket: device number lives in pi->dev */
+		inode->i_size = 0;
+		inode->i_op = &pmfs_special_inode_operations;
+		init_special_inode(inode, inode->i_mode,
+				   le32_to_cpu(pi->dev.rdev));
+		break;
+	}
+
+	return 0;
+
+bad_inode:
+	make_bad_inode(inode);
+	return ret;
+}
+
+/*
+ * Copy the VFS inode's fields into the persistent pmfs_inode (little-
+ * endian on media).  The caller is responsible for journaling/flushing.
+ */
+static void pmfs_update_inode(struct inode *inode, struct pmfs_inode *pi)
+{
+	pmfs_memunlock_inode(inode->i_sb, pi);
+	pi->i_mode = cpu_to_le16(inode->i_mode);
+	pi->i_uid = cpu_to_le32(inode->i_uid);
+	pi->i_gid = cpu_to_le32(inode->i_gid);
+	pi->i_links_count = cpu_to_le16(inode->i_nlink);
+	pi->i_size = cpu_to_le64(inode->i_size);
+	pi->i_blocks = cpu_to_le64(inode->i_blocks);
+	pi->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
+	pi->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
+	pi->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
+	pi->i_generation = cpu_to_le32(inode->i_generation);
+	pmfs_get_inode_flags(inode, pi);
+
+	/* device inodes persist their device number */
+	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
+		pi->dev.rdev = cpu_to_le32(inode->i_rdev);
+
+	pmfs_memlock_inode(inode->i_sb, pi);
+}
+
+/*
+ * NOTE! When we get the inode, we're the only people
+ * that have access to it, and as such there are no
+ * race conditions we have to worry about. The inode
+ * is not on the hash-lists, and it cannot be reached
+ * through the filesystem because the directory entry
+ * has been deleted earlier.
+ *
+ * Marks the on-media inode deleted (root cleared, dtime set) inside a
+ * journal transaction and returns it to the free-inode pool, updating
+ * the hint used by pmfs_new_inode().
+ */
+static int pmfs_free_inode(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	struct pmfs_inode *pi;
+	unsigned long inode_nr;
+	pmfs_transaction_t *trans;
+	int err = 0;
+
+	mutex_lock(&sbi->inode_table_mutex);
+
+	pmfs_dbg_verbose("free_inode: %lx free_nodes %x tot nodes %x hint %x\n",
+		inode->i_ino, sbi->s_free_inodes_count, sbi->s_inodes_count,
+		sbi->s_free_inode_hint);
+	inode_nr = inode->i_ino >> PMFS_INODE_BITS;
+
+	pi = pmfs_get_inode(sb, inode->i_ino);
+
+	/* This transaction can be avoided if using RTM */
+	trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES);
+	if (IS_ERR(trans)) {
+		err = PTR_ERR(trans);
+		goto out;
+	}
+
+	pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA);
+
+	/* TODO: use RTM to write the below cacheline atomically */
+	pmfs_memunlock_inode(sb, pi);
+	pi->root = 0;
+	/* pi->i_links_count = 0;
+	pi->i_xattr = 0; */
+	pi->i_size = 0;
+	pi->i_dtime = cpu_to_le32(get_seconds());
+	pmfs_memlock_inode(sb, pi);
+
+	pmfs_commit_transaction(sb, trans);
+
+	/* increment s_free_inodes_count */
+	if (inode_nr < (sbi->s_free_inode_hint))
+		sbi->s_free_inode_hint = (inode_nr);
+
+	sbi->s_free_inodes_count += 1;
+
+	if ((sbi->s_free_inodes_count) ==
+	    (sbi->s_inodes_count) - PMFS_FREE_INODE_HINT_START) {
+		/* filesystem is empty */
+		pmfs_dbg_verbose("fs is empty!\n");
+		sbi->s_free_inode_hint = (PMFS_FREE_INODE_HINT_START);
+	}
+
+	pmfs_dbg_verbose("free_inode: free_nodes %x total_nodes %x hint %x\n",
+		sbi->s_free_inodes_count, sbi->s_inodes_count,
+		sbi->s_free_inode_hint);
+out:
+	mutex_unlock(&sbi->inode_table_mutex);
+	return err;
+}
+
+/*
+ * Look up (or construct) the VFS inode for inode number @ino.  Returns
+ * the inode with an elevated reference, or an ERR_PTR on failure.
+ */
+struct inode *pmfs_iget(struct super_block *sb, unsigned long ino)
+{
+	struct inode *inode;
+	struct pmfs_inode *pi;
+	int err;
+
+	inode = iget_locked(sb, ino);
+	if (unlikely(!inode))
+		return ERR_PTR(-ENOMEM);
+	/* cache hit: already initialized */
+	if (!(inode->i_state & I_NEW))
+		return inode;
+
+	pi = pmfs_get_inode(sb, ino);
+	if (!pi) {
+		err = -EACCES;
+		goto fail;
+	}
+	err = pmfs_read_inode(inode, pi);
+	if (unlikely(err))
+		goto fail;
+	inode->i_ino = ino;
+
+	unlock_new_inode(inode);
+	return inode;
+fail:
+	iget_failed(inode);
+	return ERR_PTR(err);
+}
+
+/*
+ * Final teardown when the VFS drops an inode.  If the inode has no
+ * remaining links, free the persistent inode first, then release its
+ * whole block b-tree.  Always removes the inode from the truncate
+ * (recovery) list.
+ */
+void pmfs_evict_inode(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	struct pmfs_inode *pi = pmfs_get_inode(sb, inode->i_ino);
+	u64 root;
+	unsigned int height, btype;
+	int err = 0;
+
+	if (!inode->i_nlink && !is_bad_inode(inode)) {
+		if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+		      S_ISLNK(inode->i_mode)))
+			goto out;
+		if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+			goto out;
+
+		/* snapshot the tree before pmfs_free_inode clears pi->root */
+		root = pi->root;
+		height = pi->height;
+		btype = pi->i_blk_type;
+
+		/* first free the inode */
+		err = pmfs_free_inode(inode);
+		if (err)
+			goto out;
+		/* then free the blocks from the inode's b-tree */
+		pmfs_free_inode_subtree(sb, root, height, btype,
+					inode->i_size);
+		inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
+		inode->i_size = 0;
+	}
+out:
+	/* now it is safe to remove the inode from the truncate list */
+	pmfs_truncate_del(inode);
+	/* TODO: Since we don't use page-cache, do we really need the following
+	 * call? */
+	truncate_inode_pages(&inode->i_data, 0);
+
+	clear_inode(inode);
+}
+
+/*
+ * Grow the inode table by one inode-table block inside a journal
+ * transaction, updating the free-inode accounting.  Called with
+ * sbi->inode_table_mutex held by pmfs_new_inode().
+ */
+static int pmfs_increase_inode_table_size(struct super_block *sb)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	struct pmfs_inode *pi = pmfs_get_inode_table(sb);
+	pmfs_transaction_t *trans;
+	int errval;
+
+	/* 1 log entry for inode-table inode, 1 lentry for inode-table b-tree */
+	trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+
+	pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA);
+
+	/* append one block past the current end of the table */
+	errval = __pmfs_alloc_blocks(trans, sb, pi,
+			pi->i_size >> sb->s_blocksize_bits, 1, true);
+
+	if (errval == 0) {
+		u64 i_size = le64_to_cpu(pi->i_size);
+
+		/* the new inodes begin right at the old table size */
+		sbi->s_free_inode_hint = i_size >> PMFS_INODE_BITS;
+		i_size += pmfs_inode_blk_size(pi);
+
+		pmfs_memunlock_inode(sb, pi);
+		pi->i_size = cpu_to_le64(i_size);
+		pmfs_memlock_inode(sb, pi);
+
+		sbi->s_free_inodes_count += INODES_PER_BLOCK(pi->i_blk_type);
+		sbi->s_inodes_count = i_size >> PMFS_INODE_BITS;
+	} else
+		pmfs_dbg_verbose("no space left to inc inode table!\n");
+	/* commit the transaction */
+	pmfs_commit_transaction(sb, trans);
+	return errval;
+}
+
+/*
+ * Allocate a new inode in directory @dir: scan the persistent inode
+ * table (from the free hint) for an unused slot, growing the table if
+ * none is found, then initialize both the persistent and the VFS inode.
+ * The slot claim is journaled via @trans.  Returns the locked VFS inode
+ * or an ERR_PTR.
+ */
+struct inode *pmfs_new_inode(pmfs_transaction_t *trans, struct inode *dir,
+		umode_t mode, const struct qstr *qstr)
+{
+	struct super_block *sb;
+	struct pmfs_sb_info *sbi;
+	struct inode *inode;
+	struct pmfs_inode *pi = NULL, *inode_table;
+	struct pmfs_inode *diri = NULL;
+	int i, errval;
+	u32 num_inodes, inodes_per_block;
+	ino_t ino = 0;
+
+	sb = dir->i_sb;
+	sbi = (struct pmfs_sb_info *)sb->s_fs_info;
+	inode = new_inode(sb);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	inode_init_owner(inode, dir, mode);
+	inode->i_blocks = inode->i_size = 0;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+
+	inode->i_generation = atomic_add_return(1, &sbi->next_generation);
+
+	inode_table = pmfs_get_inode_table(sb);
+
+	pmfs_dbg_verbose("inode: %p free_inodes %x total_inodes %x hint %x\n",
+		inode, sbi->s_free_inodes_count, sbi->s_inodes_count,
+		sbi->s_free_inode_hint);
+
+	diri = pmfs_get_inode(sb, dir->i_ino);
+	if (!diri) {
+		/* don't leak the VFS inode allocated above */
+		errval = -EACCES;
+		goto fail1;
+	}
+
+	mutex_lock(&sbi->inode_table_mutex);
+
+	/* find the oldest unused pmfs inode */
+	i = (sbi->s_free_inode_hint);
+	inodes_per_block = INODES_PER_BLOCK(inode_table->i_blk_type);
+retry:
+	num_inodes = (sbi->s_inodes_count);
+	while (i < num_inodes) {
+		u32 end_ino;
+		/* scan the remainder of the current inode-table block */
+		end_ino = i + (inodes_per_block - (i & (inodes_per_block - 1)));
+		ino = i << PMFS_INODE_BITS;
+		pi = pmfs_get_inode(sb, ino);
+		for (; i < end_ino; i++) {
+			/* check if the inode is active. */
+			if (le16_to_cpu(pi->i_links_count) == 0 &&
+			   (le16_to_cpu(pi->i_mode) == 0 ||
+			    le32_to_cpu(pi->i_dtime)))
+				/* this inode is free */
+				break;
+			pi = (struct pmfs_inode *)((void *)pi +
+							PMFS_INODE_SIZE);
+		}
+		/* found a free inode */
+		if (i < end_ino)
+			break;
+	}
+	if (unlikely(i >= num_inodes)) {
+		/* table exhausted: grow it and rescan the new slots */
+		errval = pmfs_increase_inode_table_size(sb);
+		if (errval == 0)
+			goto retry;
+		mutex_unlock(&sbi->inode_table_mutex);
+		pmfs_dbg("PMFS: could not find a free inode\n");
+		goto fail1;
+	}
+
+	ino = i << PMFS_INODE_BITS;
+	pmfs_dbg_verbose("allocating inode %lx\n", ino);
+
+	/* chosen inode is in ino */
+	inode->i_ino = ino;
+	pmfs_add_logentry(sb, trans, pi, sizeof(*pi), LE_DATA);
+
+	pmfs_memunlock_inode(sb, pi);
+	pi->i_blk_type = PMFS_DEFAULT_BLOCK_TYPE;
+	pi->i_flags = pmfs_mask_flags(mode, diri->i_flags);
+	pi->height = 0;
+	pi->i_dtime = 0;
+	pmfs_memlock_inode(sb, pi);
+
+	sbi->s_free_inodes_count -= 1;
+
+	if (i < (sbi->s_inodes_count) - 1)
+		sbi->s_free_inode_hint = (i + 1);
+	else
+		sbi->s_free_inode_hint = (PMFS_FREE_INODE_HINT_START);
+
+	mutex_unlock(&sbi->inode_table_mutex);
+
+	pmfs_update_inode(inode, pi);
+
+	pmfs_set_inode_flags(inode, pi);
+
+	if (insert_inode_locked(inode) < 0) {
+		pmfs_err(sb, "pmfs_new_inode failed ino %lx\n", inode->i_ino);
+		errval = -EINVAL;
+		goto fail1;
+	}
+
+	return inode;
+fail1:
+	make_bad_inode(inode);
+	iput(inode);
+	return ERR_PTR(errval);
+}
+
+/* Copy the VFS link count into the persistent inode. No flush is done
+ * here; callers are responsible for persisting the update. */
+inline void pmfs_update_nlink(struct inode *inode, struct pmfs_inode *pi)
+{
+ pmfs_memunlock_inode(inode->i_sb, pi);
+ pi->i_links_count = cpu_to_le16(inode->i_nlink);
+ pmfs_memlock_inode(inode->i_sb, pi);
+}
+
+/* Copy the VFS i_size into the persistent inode. No flush is done
+ * here; callers are responsible for persisting the update. */
+inline void pmfs_update_isize(struct inode *inode, struct pmfs_inode *pi)
+{
+ pmfs_memunlock_inode(inode->i_sb, pi);
+ pi->i_size = cpu_to_le64(inode->i_size);
+ pmfs_memlock_inode(inode->i_sb, pi);
+}
+
+/* Copy ctime and mtime into the persistent inode. No flush is done
+ * here; callers are responsible for persisting the update. */
+inline void pmfs_update_time(struct inode *inode, struct pmfs_inode *pi)
+{
+ pmfs_memunlock_inode(inode->i_sb, pi);
+ pi->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
+ pi->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
+ pmfs_memlock_inode(inode->i_sb, pi);
+}
+
+/* Return true when any field of the VFS inode disagrees with the
+ * persistent inode, i.e. the two copies are out of sync. */
+static bool pmfs_is_inode_dirty(struct inode *inode, struct pmfs_inode *pi)
+{
+ return inode->i_size != le64_to_cpu(pi->i_size) ||
+ inode->i_blocks != le64_to_cpu(pi->i_blocks) ||
+ inode->i_mode != le16_to_cpu(pi->i_mode) ||
+ inode->i_nlink != le16_to_cpu(pi->i_links_count) ||
+ inode->i_uid != le32_to_cpu(pi->i_uid) ||
+ inode->i_gid != le32_to_cpu(pi->i_gid) ||
+ inode->i_atime.tv_sec != le32_to_cpu(pi->i_atime) ||
+ inode->i_mtime.tv_sec != le32_to_cpu(pi->i_mtime) ||
+ inode->i_ctime.tv_sec != le32_to_cpu(pi->i_ctime);
+}
+
+/* .write_inode handler: must never run, since every PMFS update path
+ * persists the inode synchronously. A call here means some path left
+ * the inode dirty. */
+int pmfs_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+ /* write_inode should never be called because we always keep our inodes
+ * clean. So let us know if write_inode ever gets called. */
+ BUG();
+ return 0;
+}
+
+/*
+ * dirty_inode() is called from mark_inode_dirty_sync()
+ * usually dirty_inode should not be called because PMFS always keeps its inodes
+ * clean. Only exception is touch_atime which calls dirty_inode to update the
+ * i_atime field.
+ */
+/* @flags is required by the s_op signature but unused here. */
+void pmfs_dirty_inode(struct inode *inode, int flags)
+{
+ struct super_block *sb = inode->i_sb;
+ struct pmfs_inode *pi = pmfs_get_inode(sb, inode->i_ino);
+
+ /* only i_atime should have changed if at all.
+ * we can do in-place atomic update */
+ pmfs_memunlock_inode(sb, pi);
+ pi->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
+ pmfs_memlock_inode(sb, pi);
+ /* persist just the updated i_atime field */
+ pmfs_flush_buffer(&pi->i_atime, sizeof(pi->i_atime), true);
+
+ /* any other field out of sync means some path forgot to persist
+ * its update - treat that as a bug */
+ if (pmfs_is_inode_dirty(inode, pi))
+ BUG();
+}
+
+/*
+ * Called to zeros out a single block. It's used in the "resize"
+ * to avoid to keep data in case the file grow up again.
+ */
+/* Make sure to zero out just a single 4K page in case of 2M or 1G blocks */
+static void pmfs_block_truncate_page(struct inode *inode, loff_t newsize)
+{
+ struct super_block *sb = inode->i_sb;
+ unsigned long offset = newsize & (sb->s_blocksize - 1);
+ unsigned long blocknr, length;
+ u64 blockoff;
+ char *bp;
+
+ /* Block boundary or extending ? */
+ if (!offset || newsize > inode->i_size)
+ return;
+
+ /* zero from the new EOF to the end of that block */
+ length = sb->s_blocksize - offset;
+ blocknr = newsize >> sb->s_blocksize_bits;
+
+ blockoff = pmfs_find_data_block(inode, blocknr);
+
+ /* Hole ? */
+ if (!blockoff)
+ return;
+
+ bp = pmfs_get_block(sb, blockoff);
+ if (!bp)
+ return;
+ pmfs_memunlock_block(sb, bp);
+ memset(bp + offset, 0, length);
+ pmfs_memlock_block(sb, bp);
+ /* no fence here; callers issue the persistent barrier */
+ pmfs_flush_buffer(bp + offset, length, false);
+ return;
+}
+
+/*
+ * Remove @inode from the persistent truncate list (see
+ * pmfs_truncate_add for what the list is for). The on-media list is
+ * singly linked through per-inode truncate items; the in-DRAM mirror
+ * (sbi->s_truncate) supplies the predecessor, so the unlink is a
+ * single pointer update on media.
+ */
+void pmfs_truncate_del(struct inode *inode)
+{
+ struct list_head *prev;
+ struct pmfs_inode_vfs *si = PMFS_I(inode);
+ struct super_block *sb = inode->i_sb;
+ struct pmfs_sb_info *sbi = PMFS_SB(sb);
+ struct pmfs_inode_truncate_item *head = pmfs_get_truncate_list_head(sb);
+ struct pmfs_inode_truncate_item *li;
+ unsigned long ino_next;
+
+ mutex_lock(&sbi->s_truncate_lock);
+ if (list_empty(&si->i_truncated))
+ goto out;
+ /* Make sure all truncate operation is persistent before removing the
+ * inode from the truncate list */
+ PERSISTENT_MARK();
+
+ li = pmfs_get_truncate_item(sb, inode->i_ino);
+
+ ino_next = le64_to_cpu(li->i_next_truncate);
+ prev = si->i_truncated.prev;
+
+ /* drop from the DRAM mirror first */
+ list_del_init(&si->i_truncated);
+ PERSISTENT_BARRIER();
+
+ /* Atomically delete the inode from the truncate list */
+ if (prev == &sbi->s_truncate) {
+ /* we were the first entry: update the list head item */
+ pmfs_memunlock_range(sb, head, sizeof(*head));
+ head->i_next_truncate = cpu_to_le64(ino_next);
+ pmfs_memlock_range(sb, head, sizeof(*head));
+ pmfs_flush_buffer(&head->i_next_truncate,
+ sizeof(head->i_next_truncate), false);
+ } else {
+ /* splice our successor into the predecessor's item */
+ struct inode *i_prv = &list_entry(prev,
+ struct pmfs_inode_vfs, i_truncated)->vfs_inode;
+ struct pmfs_inode_truncate_item *li_prv =
+ pmfs_get_truncate_item(sb, i_prv->i_ino);
+ pmfs_memunlock_range(sb, li_prv, sizeof(*li_prv));
+ li_prv->i_next_truncate = ino_next;
+ pmfs_memlock_range(sb, li_prv, sizeof(*li_prv));
+ pmfs_flush_buffer(&li_prv->i_next_truncate,
+ sizeof(li_prv->i_next_truncate), false);
+ }
+ PERSISTENT_MARK();
+ PERSISTENT_BARRIER();
+out:
+ mutex_unlock(&sbi->s_truncate_lock);
+}
+
+/* PMFS maintains a so-called truncate list, which is a linked list of inodes
+ * which require further processing in case of a power failure. Currently, PMFS
+ * uses the truncate list for two purposes.
+ * 1) When removing a file, if the i_links_count becomes zero (i.e., the file
+ * is not referenced by any directory entry), the inode needs to be freed.
+ * However, if the file is currently in use (e.g., opened) it can't be freed
+ * until all references are closed. Hence PMFS adds the inode to the truncate
+ * list during directory entry removal, and removes it from the truncate list
+ * when VFS calls evict_inode. If a power failure happens before evict_inode,
+ * the inode is freed during the next mount when we recover the truncate list
+ * 2) When truncating a file (reducing the file size and freeing the blocks),
+ * we dont want to return the freed blocks to the free list until the whole
+ * truncate operation is complete. So we add the inode to the truncate list with
+ * the specified truncate_size. Now we can return freed blocks to the free list
+ * even before the transaction is complete. Because if a power failure happens
+ * before freeing of all the blocks is complete, PMFS will free the remaining
+ * blocks during the next mount when we recover the truncate list */
+void pmfs_truncate_add(struct inode *inode, u64 truncate_size)
+{
+ struct super_block *sb = inode->i_sb;
+ struct pmfs_inode_truncate_item *head = pmfs_get_truncate_list_head(sb);
+ struct pmfs_inode_truncate_item *li;
+
+ mutex_lock(&PMFS_SB(sb)->s_truncate_lock);
+ /* already on the list: nothing to do */
+ if (!list_empty(&PMFS_I(inode)->i_truncated))
+ goto out_unlock;
+
+ li = pmfs_get_truncate_item(sb, inode->i_ino);
+
+ /* fill in our item so it points at the current list head */
+ pmfs_memunlock_range(sb, li, sizeof(*li));
+ li->i_next_truncate = head->i_next_truncate;
+ li->i_truncatesize = cpu_to_le64(truncate_size);
+ pmfs_memlock_range(sb, li, sizeof(*li));
+ pmfs_flush_buffer(li, sizeof(*li), false);
+ /* make sure above is persistent before changing the head pointer */
+ PERSISTENT_MARK();
+ PERSISTENT_BARRIER();
+ /* Atomically insert this inode at the head of the truncate list. */
+ pmfs_memunlock_range(sb, head, sizeof(*head));
+ head->i_next_truncate = cpu_to_le64(inode->i_ino);
+ pmfs_memlock_range(sb, head, sizeof(*head));
+ pmfs_flush_buffer(&head->i_next_truncate,
+ sizeof(head->i_next_truncate), false);
+ /* No need to make the head persistent here if we are called from
+ * within a transaction, because the transaction will provide a
+ * subsequent persistent barrier */
+ if (pmfs_current_transaction() == NULL) {
+ PERSISTENT_MARK();
+ PERSISTENT_BARRIER();
+ }
+ /* mirror the insertion in DRAM */
+ list_add(&PMFS_I(inode)->i_truncated, &PMFS_SB(sb)->s_truncate);
+
+out_unlock:
+ mutex_unlock(&PMFS_SB(sb)->s_truncate_lock);
+}
+
+/*
+ * Resize @inode to @newsize. Only regular files, directories and
+ * symlinks may be resized; other modes are rejected with an error
+ * message. Zeroes the partial tail block (shrink only), updates
+ * i_size, then adjusts the block b-tree via __pmfs_truncate_blocks().
+ */
+void pmfs_setsize(struct inode *inode, loff_t newsize)
+{
+ loff_t oldsize = inode->i_size;
+
+ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ S_ISLNK(inode->i_mode))) {
+ /* bug fix: the "%s" conversion had no matching argument, so
+ * i_mode was consumed as a string pointer (undefined
+ * behavior) and the %x argument was missing entirely */
+ pmfs_err(inode->i_sb, "%s:wrong file mode %x\n", __func__,
+ inode->i_mode);
+ return;
+ }
+
+ if (newsize != oldsize) {
+ pmfs_block_truncate_page(inode, newsize);
+ i_size_write(inode, newsize);
+ }
+ /* FIXME: we should make sure that there is nobody reading the inode
+ * before truncating it. Also we need to munmap the truncated range
+ * from application address space, if mmapped. */
+ /* synchronize_rcu(); */
+ __pmfs_truncate_blocks(inode, newsize, oldsize);
+ /* No need to make the b-tree persistent here if we are called from
+ * within a transaction, because the transaction will provide a
+ * subsequent persistent barrier */
+ if (pmfs_current_transaction() == NULL) {
+ PERSISTENT_MARK();
+ PERSISTENT_BARRIER();
+ }
+ return;
+}
+
+/* .getattr: fill @stat from the VFS inode, converting i_blocks from
+ * filesystem-blocksize units to the 512-byte units stat expects. */
+int pmfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat)
+{
+ struct inode *inode = dentry->d_inode;
+
+ generic_fillattr(inode, stat);
+ /* stat->blocks should be the number of 512B blocks */
+ stat->blocks = (inode->i_blocks << inode->i_sb->s_blocksize_bits) >> 9;
+ return 0;
+}
+
+/* Update exactly one persistent inode field in place, without a
+ * transaction; @ia_valid carries a single ATTR_* bit selecting the
+ * field. The whole inode is flushed with a fence afterwards. */
+static int pmfs_update_single_field(struct super_block *sb, struct inode *inode,
+ struct pmfs_inode *pi, unsigned int ia_valid)
+{
+ pmfs_memunlock_inode(sb, pi);
+ if (ia_valid == ATTR_MODE)
+ pi->i_mode = cpu_to_le16(inode->i_mode);
+ else if (ia_valid == ATTR_UID)
+ pi->i_uid = cpu_to_le32(inode->i_uid);
+ else if (ia_valid == ATTR_GID)
+ pi->i_gid = cpu_to_le32(inode->i_gid);
+ else if (ia_valid == ATTR_SIZE)
+ pi->i_size = cpu_to_le64(inode->i_size);
+ else if (ia_valid == ATTR_ATIME)
+ pi->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
+ else if (ia_valid == ATTR_CTIME)
+ pi->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
+ else if (ia_valid == ATTR_MTIME)
+ pi->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
+ pmfs_memlock_inode(sb, pi);
+ pmfs_flush_buffer(pi, sizeof(*pi), true);
+ return 0;
+}
+
+/*
+ * .setattr for PMFS. ATTR_SIZE goes through the truncate list plus
+ * pmfs_setsize() so a crash mid-truncate is recoverable at mount time.
+ * The remaining attributes are copied to the persistent inode either
+ * with one in-place update (single field changed) or under a
+ * transaction (multiple fields).
+ */
+int pmfs_notify_change(struct dentry *dentry, struct iattr *attr)
+{
+ struct inode *inode = dentry->d_inode;
+ struct super_block *sb = inode->i_sb;
+ struct pmfs_inode *pi = pmfs_get_inode(sb, inode->i_ino);
+ pmfs_transaction_t *trans;
+ int ret;
+ unsigned int ia_valid = attr->ia_valid, attr_mask;
+
+ if (!pi)
+ return -EACCES;
+
+ ret = inode_change_ok(inode, attr);
+ if (ret)
+ return ret;
+
+ if ((ia_valid & ATTR_SIZE) && (attr->ia_size != inode->i_size ||
+ pi->i_flags & cpu_to_le32(PMFS_EOFBLOCKS_FL))) {
+
+ pmfs_truncate_add(inode, attr->ia_size);
+ /* set allocation hint */
+ pmfs_set_blocksize_hint(sb, pi, attr->ia_size);
+
+ /* now we can freely truncate the inode */
+ pmfs_setsize(inode, attr->ia_size);
+ pmfs_update_isize(inode, pi);
+ pmfs_flush_buffer(pi, CACHELINE_SIZE, false);
+ /* we have also updated the i_ctime and i_mtime, so no
+ * need to update them again */
+ ia_valid = ia_valid & ~(ATTR_CTIME | ATTR_MTIME);
+ /* now it is safe to remove the inode from the truncate list */
+ pmfs_truncate_del(inode);
+ }
+ setattr_copy(inode, attr);
+
+ /* we have already handled ATTR_SIZE above so no need to check for it */
+ attr_mask = ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_ATIME | ATTR_MTIME |
+ ATTR_CTIME;
+
+ ia_valid = ia_valid & attr_mask;
+
+ if (ia_valid == 0)
+ return ret;
+ /* check if we need to update only a single field. we could avoid using
+ * a transaction */
+ /* (x & (x - 1)) == 0 <=> at most one bit set in x */
+ if ((ia_valid & (ia_valid - 1)) == 0) {
+ pmfs_update_single_field(sb, inode, pi, ia_valid);
+ return ret;
+ }
+
+ BUG_ON(pmfs_current_transaction());
+ /* multiple fields are modified. Use a transaction for atomicity */
+ trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+ pmfs_add_logentry(sb, trans, pi, sizeof(*pi), LE_DATA);
+
+ pmfs_update_inode(inode, pi);
+
+ pmfs_commit_transaction(sb, trans);
+
+ return ret;
+}
+
+/* Propagate the persistent inode's FS_*_FL flags into the VFS
+ * inode's S_* flags. */
+void pmfs_set_inode_flags(struct inode *inode, struct pmfs_inode *pi)
+{
+ unsigned int src = le32_to_cpu(pi->i_flags);
+ unsigned int dst = inode->i_flags &
+ ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC);
+
+ if (src & FS_SYNC_FL)
+ dst |= S_SYNC;
+ if (src & FS_APPEND_FL)
+ dst |= S_APPEND;
+ if (src & FS_IMMUTABLE_FL)
+ dst |= S_IMMUTABLE;
+ if (src & FS_NOATIME_FL)
+ dst |= S_NOATIME;
+ if (src & FS_DIRSYNC_FL)
+ dst |= S_DIRSYNC;
+ inode->i_flags = dst;
+ if (!pi->i_xattr)
+ inode_has_no_xattr(inode);
+}
+
+/* Fold the VFS inode's S_* flags back into the persistent inode's
+ * FS_*_FL flag word (other persistent flag bits are kept). */
+void pmfs_get_inode_flags(struct inode *inode, struct pmfs_inode *pi)
+{
+ unsigned int vfs_flags = inode->i_flags;
+ unsigned int media_flags = le32_to_cpu(pi->i_flags) &
+ ~(FS_SYNC_FL | FS_APPEND_FL | FS_IMMUTABLE_FL |
+ FS_NOATIME_FL | FS_DIRSYNC_FL);
+
+ if (vfs_flags & S_SYNC)
+ media_flags |= FS_SYNC_FL;
+ if (vfs_flags & S_APPEND)
+ media_flags |= FS_APPEND_FL;
+ if (vfs_flags & S_IMMUTABLE)
+ media_flags |= FS_IMMUTABLE_FL;
+ if (vfs_flags & S_NOATIME)
+ media_flags |= FS_NOATIME_FL;
+ if (vfs_flags & S_DIRSYNC)
+ media_flags |= FS_DIRSYNC_FL;
+
+ pi->i_flags = cpu_to_le32(media_flags);
+}
+
+/* XIP address-space operations: only get_xip_mem is populated. */
+const struct address_space_operations pmfs_aops_xip = {
+ .get_xip_mem = pmfs_get_xip_mem,
+ /*.xip_mem_protect = pmfs_xip_mem_protect,*/
+};
diff --git a/fs/pmfs/ioctl.c b/fs/pmfs/ioctl.c
new file mode 100644
index 0000000..c9623ed
--- /dev/null
+++ b/fs/pmfs/ioctl.c
@@ -0,0 +1,150 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Ioctl operations.
+ *
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2010-2011 Marco Stornelli <marco.stornelli at gmail.com>
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include <linux/capability.h>
+#include <linux/time.h>
+#include <linux/sched.h>
+#include <linux/compat.h>
+#include <linux/mount.h>
+#include "pmfs.h"
+
+/*
+ * Ioctl dispatcher. Supported commands:
+ * FS_IOC_GETFLAGS/SETFLAGS - read/update the inode flag word
+ * FS_IOC_GETVERSION/SETVERSION - read/update i_generation
+ * The SETFLAGS/SETVERSION updates are journaled so the persistent
+ * inode changes atomically.
+ */
+long pmfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ struct inode *inode = filp->f_dentry->d_inode;
+ struct pmfs_inode *pi;
+ struct super_block *sb = inode->i_sb;
+ unsigned int flags;
+ int ret;
+ pmfs_transaction_t *trans;
+
+ pi = pmfs_get_inode(sb, inode->i_ino);
+ if (!pi)
+ return -EACCES;
+
+ switch (cmd) {
+ case FS_IOC_GETFLAGS:
+ flags = le32_to_cpu(pi->i_flags) & PMFS_FL_USER_VISIBLE;
+ return put_user(flags, (int __user *)arg);
+ case FS_IOC_SETFLAGS: {
+ unsigned int oldflags;
+
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
+ if (!inode_owner_or_capable(inode)) {
+ ret = -EPERM;
+ goto flags_out;
+ }
+
+ if (get_user(flags, (int __user *)arg)) {
+ ret = -EFAULT;
+ goto flags_out;
+ }
+
+ mutex_lock(&inode->i_mutex);
+ oldflags = le32_to_cpu(pi->i_flags);
+
+ /* only CAP_LINUX_IMMUTABLE may toggle append/immutable */
+ if ((flags ^ oldflags) &
+ (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
+ if (!capable(CAP_LINUX_IMMUTABLE)) {
+ mutex_unlock(&inode->i_mutex);
+ ret = -EPERM;
+ goto flags_out;
+ }
+ }
+
+ if (!S_ISDIR(inode->i_mode))
+ flags &= ~FS_DIRSYNC_FL;
+
+ /* keep kernel-private bits from the old flag word */
+ flags = flags & FS_FL_USER_MODIFIABLE;
+ flags |= oldflags & ~FS_FL_USER_MODIFIABLE;
+ inode->i_ctime = CURRENT_TIME_SEC;
+ /*TODO: This transaction can be avoided if we had RTM */
+ trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+ pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA);
+ pmfs_memunlock_inode(sb, pi);
+ pi->i_flags = cpu_to_le32(flags);
+ pi->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
+ pmfs_set_inode_flags(inode, pi);
+ pmfs_memlock_inode(sb, pi);
+ pmfs_commit_transaction(sb, trans);
+/* note: "out" is also the error target of the SETVERSION case below;
+ * it unlocks i_mutex and falls through to flags_out */
+out:
+ mutex_unlock(&inode->i_mutex);
+flags_out:
+ mnt_drop_write_file(filp);
+ return ret;
+ }
+ case FS_IOC_GETVERSION:
+ return put_user(inode->i_generation, (int __user *)arg);
+ case FS_IOC_SETVERSION: {
+ __u32 generation;
+ if (!inode_owner_or_capable(inode))
+ return -EPERM;
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+ if (get_user(generation, (int __user *)arg)) {
+ ret = -EFAULT;
+ goto setversion_out;
+ }
+ mutex_lock(&inode->i_mutex);
+ trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ /* jumps into the SETFLAGS exit path above */
+ goto out;
+ }
+ pmfs_add_logentry(sb, trans, pi, sizeof(*pi), LE_DATA);
+ inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_generation = generation;
+ pmfs_memunlock_inode(sb, pi);
+ pi->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
+ pi->i_generation = cpu_to_le32(inode->i_generation);
+ pmfs_memlock_inode(sb, pi);
+ pmfs_commit_transaction(sb, trans);
+ mutex_unlock(&inode->i_mutex);
+setversion_out:
+ mnt_drop_write_file(filp);
+ return ret;
+ }
+ default:
+ return -ENOTTY;
+ }
+}
+
+#ifdef CONFIG_COMPAT
+/* 32-bit compat entry point: translate the FS_IOC32_* commands to
+ * their native equivalents and forward to pmfs_ioctl(). */
+long pmfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ if (cmd == FS_IOC32_GETFLAGS)
+ cmd = FS_IOC_GETFLAGS;
+ else if (cmd == FS_IOC32_SETFLAGS)
+ cmd = FS_IOC_SETFLAGS;
+ else if (cmd == FS_IOC32_GETVERSION)
+ cmd = FS_IOC_GETVERSION;
+ else if (cmd == FS_IOC32_SETVERSION)
+ cmd = FS_IOC_SETVERSION;
+ else
+ return -ENOIOCTLCMD;
+ return pmfs_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
+}
+#endif
diff --git a/fs/pmfs/journal.c b/fs/pmfs/journal.c
new file mode 100644
index 0000000..bb710a2
--- /dev/null
+++ b/fs/pmfs/journal.c
@@ -0,0 +1,866 @@
+/*
+ * PMFS journaling facility. This file contains code to log changes to pmfs
+ * meta-data to facilitate consistent meta-data updates against arbitrary
+ * power and system failures.
+ *
+ * Persistent Memory File System
+ * Copyright (c) 2012-2013, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/vfs.h>
+#include <linux/uaccess.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/kthread.h>
+#include "pmfs.h"
+#include "journal.h"
+
+/* Debug helper: print every log entry of @trans. @sbi is unused but
+ * kept for the existing call signature. (Currently only referenced
+ * from commented-out debug code.) */
+static void dump_transaction(struct pmfs_sb_info *sbi,
+ pmfs_transaction_t *trans)
+{
+ int i;
+ pmfs_logentry_t *le = trans->start_addr;
+
+ for (i = 0; i < trans->num_entries; i++) {
+ pmfs_dbg_trans("ao %llx tid %x gid %x type %x sz %x\n",
+ le->addr_offset, le->transaction_id, le->gen_id,
+ le->type, le->size);
+ le++;
+ }
+}
+
+/* Advance a journal offset by one log entry, wrapping to 0 at the
+ * end of the journal area. */
+static inline uint32_t next_log_entry(uint32_t jsize, uint32_t le_off)
+{
+ uint32_t next = le_off + LOGENTRY_SIZE;
+
+ return (next >= jsize) ? 0 : next;
+}
+
+/* Step a journal offset back by one log entry, wrapping from offset 0
+ * to the last entry of the journal area. */
+static inline uint32_t prev_log_entry(uint32_t jsize, uint32_t le_off)
+{
+ uint32_t base = (le_off == 0) ? jsize : le_off;
+
+ return base - LOGENTRY_SIZE;
+}
+
+/* Generation ids cycle through 1..0xffff; 0 marks an invalid entry,
+ * so skip it on wraparound. */
+static inline uint16_t next_gen_id(uint16_t gen_id)
+{
+ uint16_t next = gen_id + 1;
+
+ return (next == 0) ? 1 : next;
+}
+
+/* Inverse of next_gen_id(): step back one generation, skipping the
+ * reserved value 0 on wraparound. */
+static inline uint16_t prev_gen_id(uint16_t gen_id)
+{
+ uint16_t prev = gen_id - 1;
+
+ return (prev == 0) ? (uint16_t)0xffff : prev;
+}
+
+/* Undo a valid log entry: copy the logged old image in le->data back
+ * over its home location and flush it. */
+static inline void pmfs_undo_logentry(struct super_block *sb,
+ pmfs_logentry_t *le)
+{
+ char *data;
+
+ if (le->size > 0) {
+ data = pmfs_get_block(sb, le64_to_cpu(le->addr_offset));
+ /* Undo changes by flushing the log entry to pmfs */
+ pmfs_memunlock_range(sb, data, le->size);
+ memcpy(data, le->data, le->size);
+ pmfs_memlock_range(sb, data, le->size);
+ pmfs_flush_buffer(data, le->size, false);
+ }
+}
+
+/* can be called during journal recovery or transaction abort */
+/* We need to Undo in the reverse order */
+static void pmfs_undo_transaction(struct super_block *sb,
+ pmfs_transaction_t *trans)
+{
+ pmfs_logentry_t *le;
+ int i;
+ uint16_t gen_id = trans->gen_id;
+
+ /* walk from the last used entry back to start_addr, undoing only
+ * entries whose gen_id matches this transaction's generation */
+ le = trans->start_addr + trans->num_used;
+ le--;
+ for (i = trans->num_used - 1; i >= 0; i--, le--) {
+ if (gen_id == le16_to_cpu(le->gen_id))
+ pmfs_undo_logentry(sb, le);
+ }
+}
+
+/* can be called by either during log cleaning or during journal recovery */
+static void pmfs_flush_transaction(struct super_block *sb,
+ pmfs_transaction_t *trans)
+{
+ struct pmfs_sb_info *sbi = PMFS_SB(sb);
+ pmfs_logentry_t *le = trans->start_addr;
+ int i;
+ char *data;
+
+ for (i = 0; i < trans->num_used; i++, le++) {
+ if (le->size) {
+ data = pmfs_get_block(sb,le64_to_cpu(le->addr_offset));
+ if (sbi->redo_log) {
+ /* redo log: apply the logged new image to
+ * the home location.
+ * NOTE(review): no flush is issued on this
+ * branch; presumably the caller provides the
+ * subsequent flush/barrier - confirm */
+ pmfs_memunlock_range(sb, data, le->size);
+ memcpy(data, le->data, le->size);
+ pmfs_memlock_range(sb, data, le->size);
+ } else
+ /* undo log: data is already in place, just
+ * make it persistent */
+ pmfs_flush_buffer(data, le->size, false);
+ }
+ }
+}
+
+/* Mark a log entry invalid by zeroing its gen_id, then flush it.
+ * Callers must have the entry's range memunlocked. */
+static inline void invalidate_gen_id(pmfs_logentry_t *le)
+{
+ le->gen_id = 0;
+ pmfs_flush_buffer(le, LOGENTRY_SIZE, false);
+}
+
+/* can be called by either during log cleaning or during journal recovery */
+static void pmfs_invalidate_logentries(struct super_block *sb,
+ pmfs_transaction_t *trans)
+{
+ pmfs_logentry_t *le = trans->start_addr;
+ int i;
+
+ pmfs_memunlock_range(sb, trans->start_addr,
+ trans->num_entries * LOGENTRY_SIZE);
+ for (i = 0; i < trans->num_entries; i++) {
+ invalidate_gen_id(le);
+ /* the start entry is invalidated last-resistant: barrier
+ * after it so the rest are persistent before it is reused */
+ if (le->type == LE_START) {
+ PERSISTENT_MARK();
+ PERSISTENT_BARRIER();
+ }
+ le++;
+ }
+ pmfs_memlock_range(sb, trans->start_addr,
+ trans->num_entries * LOGENTRY_SIZE);
+}
+
+/* can be called by either during log cleaning or during journal recovery */
+static void pmfs_redo_transaction(struct super_block *sb,
+ pmfs_transaction_t *trans, bool recover)
+{
+ pmfs_logentry_t *le = trans->start_addr;
+ int i;
+ uint16_t gen_id = trans->gen_id;
+ char *data;
+
+ for (i = 0; i < trans->num_entries; i++) {
+ if (gen_id == le16_to_cpu(le->gen_id) && le->size > 0) {
+ data = pmfs_get_block(sb,le64_to_cpu(le->addr_offset));
+ /* flush data if we are called during recovery */
+ if (recover) {
+ /* re-apply the logged new image */
+ pmfs_memunlock_range(sb, data, le->size);
+ memcpy(data, le->data, le->size);
+ pmfs_memlock_range(sb, data, le->size);
+ }
+ pmfs_flush_buffer(data, le->size, false);
+ }
+ le++;
+ }
+}
+
+/* recover the transaction ending at a valid log entry *le */
+/* called for Undo log and traverses the journal backward */
+static uint32_t pmfs_recover_transaction(struct super_block *sb, uint32_t head,
+ uint32_t tail, pmfs_logentry_t *le)
+{
+ struct pmfs_sb_info *sbi = PMFS_SB(sb);
+ pmfs_transaction_t trans;
+ bool cmt_or_abrt_found = false, start_found = false;
+ uint16_t gen_id = le16_to_cpu(le->gen_id);
+
+ /* build a transient transaction descriptor for this transaction */
+ memset(&trans, 0, sizeof(trans));
+ trans.transaction_id = le32_to_cpu(le->transaction_id);
+ trans.gen_id = gen_id;
+
+ /* walk backward from the tail entry until we hit this
+ * transaction's LE_START, the journal head, or another
+ * transaction's commit/abort record */
+ do {
+ trans.num_entries++;
+ trans.num_used++;
+
+ if (gen_id == le16_to_cpu(le->gen_id)) {
+ /* Handle committed/aborted transactions */
+ if (le->type & LE_COMMIT || le->type & LE_ABORT)
+ cmt_or_abrt_found = true;
+ if (le->type & LE_START) {
+ trans.start_addr = le;
+ start_found = true;
+ break;
+ }
+ }
+ if (tail == 0 || tail == head)
+ break;
+ /* prev log entry */
+ le--;
+ /* Handle uncommitted transactions */
+ if ((gen_id == le16_to_cpu(le->gen_id))
+ && (le->type & LE_COMMIT || le->type & LE_ABORT)) {
+ BUG_ON(trans.transaction_id ==
+ le32_to_cpu(le->transaction_id));
+ le++;
+ break;
+ }
+ tail = prev_log_entry(sbi->jsize, tail);
+ } while (1);
+
+ /* a started but never committed/aborted transaction must be undone */
+ if (start_found && !cmt_or_abrt_found)
+ pmfs_undo_transaction(sb, &trans);
+
+ /* at the last generation the entries cannot simply be left to be
+ * out-generation'd; they must be explicitly invalidated */
+ if (gen_id == MAX_GEN_ID) {
+ if (!start_found)
+ trans.start_addr = le;
+ /* make sure the changes made by pmfs_undo_transaction() are
+ * persistent before invalidating the log entries */
+ if (start_found && !cmt_or_abrt_found) {
+ PERSISTENT_MARK();
+ PERSISTENT_BARRIER();
+ }
+ pmfs_invalidate_logentries(sb, &trans);
+ }
+ return tail;
+}
+
+/* process the transaction starting at a valid log entry *le */
+/* called by the log cleaner and journal recovery */
+static uint32_t pmfs_process_transaction(struct super_block *sb, uint32_t head,
+ uint32_t tail, pmfs_logentry_t *le, bool recover)
+{
+ struct pmfs_sb_info *sbi = PMFS_SB(sb);
+ pmfs_transaction_t trans;
+ uint16_t gen_id;
+ uint32_t new_head = head;
+
+ gen_id = le16_to_cpu(le->gen_id);
+ if (!(le->type & LE_START)) {
+ pmfs_dbg("start of trans %x but LE_START not set. gen_id %d\n",
+ le32_to_cpu(le->transaction_id), gen_id);
+ /* skip the malformed entry and let the caller continue */
+ return next_log_entry(sbi->jsize, new_head);
+ }
+ memset(&trans, 0, sizeof(trans));
+ trans.transaction_id = le32_to_cpu(le->transaction_id);
+ trans.start_addr = le;
+ trans.gen_id = gen_id;
+ /* walk forward until the commit/abort record, the tail, or the
+ * start of a different valid transaction */
+ do {
+ trans.num_entries++;
+ trans.num_used++;
+ new_head = next_log_entry(sbi->jsize, new_head);
+
+ /* Handle committed/aborted transactions */
+ if ((gen_id == le16_to_cpu(le->gen_id)) && (le->type & LE_COMMIT
+ || le->type & LE_ABORT)) {
+ head = new_head;
+ if ((le->type & LE_COMMIT) && sbi->redo_log)
+ pmfs_redo_transaction(sb, &trans, recover);
+
+ /* last generation: entries must be explicitly
+ * invalidated, after persisting any redo writes */
+ if (gen_id == MAX_GEN_ID) {
+ if ((le->type & LE_COMMIT) && sbi->redo_log) {
+ PERSISTENT_MARK();
+ PERSISTENT_BARRIER();
+ }
+ pmfs_invalidate_logentries(sb, &trans);
+ }
+ break;
+ }
+ /* next log entry */
+ le++;
+ /* Handle uncommitted transactions */
+ if ((new_head == tail) || ((gen_id == le16_to_cpu(le->gen_id))
+ && (le->type & LE_START))) {
+ /* found a new valid transaction w/o finding a commit */
+ if (recover) {
+ /* if this function is called by recovery, move
+ * ahead even if we didn't find a commit record
+ * for this transaction */
+ head = new_head;
+ if (gen_id == MAX_GEN_ID)
+ pmfs_invalidate_logentries(sb, &trans);
+ }
+ pmfs_dbg_trans("no cmt tid %d sa %p nle %d tail %x"
+ " gen %d\n",
+ trans.transaction_id,trans.start_addr,trans.num_entries,
+ trans.num_used, trans.gen_id);
+ /* dump_transaction(sbi, &trans); */
+ break;
+ }
+ } while (new_head != tail);
+
+ return head;
+}
+
+/*
+ * Advance the journal head past all completed transactions, making
+ * their updates persistent first. @unmount additionally reports (via
+ * dbg) a non-empty journal at unmount time.
+ */
+static void pmfs_clean_journal(struct super_block *sb, bool unmount)
+{
+ struct pmfs_sb_info *sbi = PMFS_SB(sb);
+ pmfs_journal_t *journal = pmfs_get_journal(sb);
+ uint32_t head = le32_to_cpu(journal->head);
+ uint32_t new_head, tail;
+ uint16_t gen_id;
+ volatile u64 *ptr_tail_genid = (volatile u64 *)&journal->tail;
+ u64 tail_genid;
+ pmfs_logentry_t *le;
+
+ /* atomically read both tail and gen_id of journal. Normally use of
+ * volatile is prohibited in kernel code but since we use volatile
+ * to write to journal's tail and gen_id atomically, we thought we
+ * should use volatile to read them simultaneously and avoid locking
+ * them. */
+ tail_genid = *ptr_tail_genid;
+ tail = le32_to_cpu(tail_genid & 0xFFFFFFFF);
+ gen_id = le16_to_cpu((tail_genid >> 32) & 0xFFFF);
+
+ /* journal wraparound happened. so head points to prev generation id */
+ if (tail < head)
+ gen_id = prev_gen_id(gen_id);
+ pmfs_dbg_trans("starting journal cleaning %x %x\n", head, tail);
+ while (head != tail) {
+ le = (pmfs_logentry_t *)(sbi->journal_base_addr + head);
+ if (gen_id == le16_to_cpu(le->gen_id)) {
+ /* found a valid log entry, process the transaction */
+ new_head = pmfs_process_transaction(sb, head, tail,
+ le, false);
+ /* no progress was made. return */
+ if (new_head == head)
+ break;
+ head = new_head;
+ } else {
+ /* stale entry; at the last generation it must be
+ * explicitly invalidated before the id is reused */
+ if (gen_id == MAX_GEN_ID) {
+ pmfs_memunlock_range(sb, le, sizeof(*le));
+ invalidate_gen_id(le);
+ pmfs_memlock_range(sb, le, sizeof(*le));
+ }
+ head = next_log_entry(sbi->jsize, head);
+ }
+ /* handle journal wraparound */
+ if (head == 0)
+ gen_id = next_gen_id(gen_id);
+ }
+ /* persist all applied updates before publishing the new head */
+ PERSISTENT_MARK();
+ PERSISTENT_BARRIER();
+ pmfs_memunlock_range(sb, journal, sizeof(*journal));
+ journal->head = cpu_to_le32(head);
+ pmfs_memlock_range(sb, journal, sizeof(*journal));
+ pmfs_flush_buffer(&journal->head, sizeof(journal->head), true);
+ if (unmount) {
+ PERSISTENT_MARK();
+ if (journal->head != journal->tail)
+ pmfs_dbg("PMFS: umount but journal not empty %x:%x\n",
+ le32_to_cpu(journal->head), le32_to_cpu(journal->tail));
+ PERSISTENT_BARRIER();
+ }
+ pmfs_dbg_trans("leaving journal cleaning %x %x\n", head, tail);
+}
+
+/* Park the cleaner kthread on its waitqueue until wakeup_log_cleaner()
+ * or kthread_stop() wakes it. */
+static void log_cleaner_try_sleeping(struct pmfs_sb_info *sbi)
+{
+ DEFINE_WAIT(wait);
+ prepare_to_wait(&sbi->log_cleaner_wait, &wait, TASK_INTERRUPTIBLE);
+ schedule();
+ finish_wait(&sbi->log_cleaner_wait, &wait);
+}
+
+/*
+ * Body of the journal-cleaner kthread: sleep until woken, clean the
+ * journal, and on kthread_stop() do one last cleaning pass with
+ * unmount semantics before exiting.
+ */
+static int pmfs_log_cleaner(void *arg)
+{
+ struct super_block *sb = arg;
+ struct pmfs_sb_info *sbi = PMFS_SB(sb);
+
+ pmfs_dbg_trans("Running log cleaner thread\n");
+ while (1) {
+ log_cleaner_try_sleeping(sbi);
+
+ if (kthread_should_stop())
+ break;
+
+ pmfs_clean_journal(sb, false);
+ }
+ pmfs_clean_journal(sb, true);
+ pmfs_dbg_trans("Exiting log cleaner thread\n");
+ return 0;
+}
+
+/*
+ * Start the background journal-cleaner kthread for this mount.
+ * Returns 0 on success or a negative errno on failure (startup
+ * failure is fatal for the mount).
+ */
+static int pmfs_journal_cleaner_run(struct super_block *sb)
+{
+ int ret = 0;
+ struct pmfs_sb_info *sbi = PMFS_SB(sb);
+
+ init_waitqueue_head(&sbi->log_cleaner_wait);
+
+ sbi->log_cleaner_thread = kthread_run(pmfs_log_cleaner, sb,
+ "pmfs_log_cleaner_0x%llx", sbi->phys_addr);
+ if (IS_ERR(sbi->log_cleaner_thread)) {
+ /* failure at boot is fatal */
+ pmfs_err(sb, "Failed to start pmfs log cleaner thread\n");
+ /* bug fix: propagate the real errno instead of an opaque -1,
+ * and clear the stale ERR_PTR so pmfs_journal_uninit() does
+ * not pass it to kthread_stop() */
+ ret = PTR_ERR(sbi->log_cleaner_thread);
+ sbi->log_cleaner_thread = NULL;
+ }
+ return ret;
+}
+
+/*
+ * Initialize the in-DRAM journal state (base address, size, redo/undo
+ * mode, transaction-id counter, mutex) from the on-media journal
+ * descriptor, then start the cleaner thread. Runs on every mount;
+ * pmfs_journal_hard_init() calls this after formatting the journal.
+ */
+int pmfs_journal_soft_init(struct super_block *sb)
+{
+ struct pmfs_sb_info *sbi = PMFS_SB(sb);
+ pmfs_journal_t *journal = pmfs_get_journal(sb);
+
+ sbi->next_transaction_id = 0;
+ sbi->journal_base_addr = pmfs_get_block(sb,le64_to_cpu(journal->base));
+ sbi->jsize = le32_to_cpu(journal->size);
+ mutex_init(&sbi->journal_mutex);
+ sbi->redo_log = !!le16_to_cpu(journal->redo_logging);
+
+ return pmfs_journal_cleaner_run(sb);
+}
+
+/*
+ * Format the journal area at @base of @size bytes: write the on-media
+ * descriptor (empty journal, gen_id starts at 1, undo logging), zero
+ * the log region, then perform the soft init.
+ */
+int pmfs_journal_hard_init(struct super_block *sb, uint64_t base,
+ uint32_t size)
+{
+ struct pmfs_sb_info *sbi = PMFS_SB(sb);
+ pmfs_journal_t *journal = pmfs_get_journal(sb);
+
+ pmfs_memunlock_range(sb, journal, sizeof(*journal));
+ journal->base = cpu_to_le64(base);
+ journal->size = cpu_to_le32(size);
+ journal->gen_id = cpu_to_le16(1);
+ journal->head = journal->tail = 0;
+ /* lets do Undo logging for now */
+ journal->redo_logging = 0;
+ pmfs_memlock_range(sb, journal, sizeof(*journal));
+
+ sbi->journal_base_addr = pmfs_get_block(sb, base);
+ pmfs_memunlock_range(sb, sbi->journal_base_addr, size);
+ memset_nt(sbi->journal_base_addr, 0, size);
+ pmfs_memlock_range(sb, sbi->journal_base_addr, size);
+
+ return pmfs_journal_soft_init(sb);
+}
+
+/* Kick the log-cleaner kthread if it is currently parked on its
+ * waitqueue; otherwise do nothing. */
+static void wakeup_log_cleaner(struct pmfs_sb_info *sbi)
+{
+ if (waitqueue_active(&sbi->log_cleaner_wait)) {
+ pmfs_dbg_trans("waking up the cleaner thread\n");
+ wake_up_interruptible(&sbi->log_cleaner_wait);
+ }
+}
+
+/*
+ * Stop the log cleaner kthread at unmount.  kthread_stop() blocks until
+ * the thread exits; pmfs_log_cleaner does a final pmfs_clean_journal()
+ * pass before returning.  Always returns 0.
+ */
+int pmfs_journal_uninit(struct super_block *sb)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+
+	if (sbi->log_cleaner_thread)
+		kthread_stop(sbi->log_cleaner_thread);
+	return 0;
+}
+
+/* Return the transaction attached to the current task (NULL if none).
+ * Like other journaling filesystems, PMFS piggybacks its per-task
+ * transaction pointer on current->journal_info. */
+inline pmfs_transaction_t *pmfs_current_transaction(void)
+{
+	return (pmfs_transaction_t *)current->journal_info;
+}
+
+/*
+ * Stub: synchronously reclaim journal space for @max_log_entries.
+ * Not implemented yet; always reports failure with -ENOMEM (and frees
+ * nothing).  NOTE(review): the caller in pmfs_new_transaction() stores
+ * this negative return into an unsigned "freed_size" — harmless today
+ * (the journal-full path is still taken) but fragile once implemented.
+ */
+static int pmfs_free_logentries(int max_log_entries)
+{
+	pmfs_dbg("pmfs_free_logentries: Not Implemented\n");
+	return -ENOMEM;
+}
+
+/*
+ * Allocate a new transaction with room for @max_log_entries log entries
+ * in the circular journal (undo mode adds one entry for the commit
+ * record).  On success the journal tail has been advanced and flushed,
+ * and the transaction is pushed onto current->journal_info (the
+ * previous value is saved in trans->parent for nesting).
+ * Returns ERR_PTR(-ENOMEM) if the descriptor allocation fails and
+ * ERR_PTR(-EAGAIN) if the journal is full even after attempting to
+ * free log entries.
+ */
+pmfs_transaction_t *pmfs_new_transaction(struct super_block *sb,
+		int max_log_entries)
+{
+	pmfs_journal_t *journal = pmfs_get_journal(sb);
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	pmfs_transaction_t *trans;
+	uint32_t head, tail, req_size, avail_size;
+	uint64_t base;
+#if 0
+	trans = pmfs_current_transaction();
+
+	if (trans) {
+		BUG_ON(trans->t_journal != journal);
+		return trans;
+	}
+#endif
+	/* If it is an undo log, need one more log-entry for commit record */
+	if (!sbi->redo_log)
+		max_log_entries++;
+
+	trans = pmfs_alloc_transaction();
+	if (!trans)
+		return ERR_PTR(-ENOMEM);
+	memset(trans, 0, sizeof(*trans));
+
+	trans->num_used = 0;
+	trans->num_entries = max_log_entries;
+	trans->t_journal = journal;
+	/* each log entry is one cacheline: bytes = entries << LESIZE_SHIFT */
+	req_size = max_log_entries << LESIZE_SHIFT;
+
+	mutex_lock(&sbi->journal_mutex);
+
+	tail = le32_to_cpu(journal->tail);
+	head = le32_to_cpu(journal->head);
+	trans->transaction_id = sbi->next_transaction_id++;
+again:
+	trans->gen_id = le16_to_cpu(journal->gen_id);
+	/* free space in the ring; one LOGENTRY_SIZE is held back,
+	 * presumably so a full journal never degenerates to head == tail
+	 * (which is reserved for "empty") — TODO confirm */
+	avail_size = (tail >= head) ?
+		(sbi->jsize - (tail - head)) : (head - tail);
+	avail_size = avail_size - LOGENTRY_SIZE;
+
+	if (avail_size < req_size) {
+		uint32_t freed_size;
+		/* run the log cleaner function to free some log entries */
+		freed_size = pmfs_free_logentries(max_log_entries);
+		if ((avail_size + freed_size) < req_size)
+			goto journal_full;
+	}
+	base = le64_to_cpu(journal->base) + tail;
+	tail = tail + req_size;
+	/* journal wraparound because of this transaction allocation.
+	 * start the transaction from the beginning of the journal so
+	 * that we dont have any wraparound within a transaction */
+	pmfs_memunlock_range(sb, journal, sizeof(*journal));
+	if (tail >= sbi->jsize) {
+		volatile u64 *ptr;
+		tail = 0;
+		/* write the gen_id and tail atomically. Use of volatile is
+		 * normally prohibited in kernel code, but it is required here
+		 * because we want to write atomically against power failures
+		 * and locking can't provide that. */
+		ptr = (volatile u64 *)&journal->tail;
+		/* writing 8-bytes atomically setting tail to 0 */
+		set_64bit(ptr, (u64)cpu_to_le16(next_gen_id(le16_to_cpu(
+			journal->gen_id))) << 32);
+		pmfs_memlock_range(sb, journal, sizeof(*journal));
+		pmfs_dbg_trans("journal wrapped. tail %x gid %d cur tid %d\n",
+			le32_to_cpu(journal->tail),le16_to_cpu(journal->gen_id),
+			sbi->next_transaction_id - 1);
+		/* retry the allocation from offset 0 with the new gen_id */
+		goto again;
+	} else {
+		journal->tail = cpu_to_le32(tail);
+		pmfs_memlock_range(sb, journal, sizeof(*journal));
+	}
+	mutex_unlock(&sbi->journal_mutex);
+
+	avail_size = avail_size - req_size;
+	/* wake up the log cleaner if required */
+	if ((sbi->jsize - avail_size) > (sbi->jsize >> 3))
+		wakeup_log_cleaner(sbi);
+	/* persist the advanced tail before any log entry is written */
+	pmfs_flush_buffer(&journal->tail, sizeof(u64), false);
+
+	pmfs_dbg_trans("new transaction tid %d nle %d avl sz %x sa %llx\n",
+		trans->transaction_id, max_log_entries, avail_size, base);
+	trans->start_addr = pmfs_get_block(sb, base);
+
+	/* push onto the per-task transaction stack */
+	trans->parent = (pmfs_transaction_t *)current->journal_info;
+	current->journal_info = trans;
+	return trans;
+journal_full:
+	mutex_unlock(&sbi->journal_mutex);
+	pmfs_err(sb, "Journal full. base %llx sz %x head:tail %x:%x ncl %x\n",
+		le64_to_cpu(journal->base), le32_to_cpu(journal->size),
+		le32_to_cpu(journal->head), le32_to_cpu(journal->tail),
+		max_log_entries);
+	pmfs_free_transaction(trans);
+	return ERR_PTR(-EAGAIN);
+}
+
+/*
+ * Make the commit record @le durable and apply the transaction.
+ * Ordering differs by journaling mode:
+ *  - redo: persist all prior log entries first, then atomically
+ *    validate the commit record (type then gen_id), then copy the
+ *    logged data into the FS in place;
+ *  - undo: the FS was already updated in place, so flush those updates
+ *    first, then atomically validate the commit record.
+ * Writing gen_id last is what makes the entry valid atomically; the
+ * compiler barrier() keeps the type store ahead of the gen_id store.
+ */
+static inline void pmfs_commit_logentry(struct super_block *sb,
+		pmfs_transaction_t *trans, pmfs_logentry_t *le)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	if (sbi->redo_log) {
+		/* Redo Log */
+		PERSISTENT_MARK();
+		PERSISTENT_BARRIER();
+		/* Atomically write the commit type */
+		le->type |= LE_COMMIT;
+		barrier();
+		/* Atomically make the log entry valid */
+		le->gen_id = cpu_to_le16(trans->gen_id);
+		pmfs_flush_buffer(le, LOGENTRY_SIZE, false);
+		PERSISTENT_MARK();
+		PERSISTENT_BARRIER();
+		/* Update the FS in place */
+		pmfs_flush_transaction(sb, trans);
+	} else {
+		/* Undo Log */
+		/* Update the FS in place: currently already done. so
+		 * only need to clflush */
+		pmfs_flush_transaction(sb, trans);
+		PERSISTENT_MARK();
+		PERSISTENT_BARRIER();
+		/* Atomically write the commit type */
+		le->type |= LE_COMMIT;
+		barrier();
+		/* Atomically make the log entry valid */
+		le->gen_id = cpu_to_le16(trans->gen_id);
+		pmfs_flush_buffer(le, LOGENTRY_SIZE, true);
+	}
+	return;
+}
+
+/*
+ * Append @size bytes starting at @addr to transaction @trans as one or
+ * more log entries of kind @type.  @size == 0 with LE_COMMIT/LE_ABORT
+ * appends a bare commit/abort record.  Data is split across entries of
+ * sizeof(le->data) bytes; each entry is made valid atomically by
+ * writing gen_id last.  Returns 0, -EINVAL if @trans is NULL, or
+ * -ENOMEM if the transaction has no room left.
+ *
+ * Fix vs. original: the closing pmfs_memlock_range() calls used the
+ * loop cursor `le`, which by then pointed one-past (or at the last of)
+ * the written entries, so the wrong region was re-locked and the
+ * actual log entries were left writeable.  We now remember the start
+ * of the range and re-lock exactly what was unlocked.
+ */
+int pmfs_add_logentry(struct super_block *sb,
+		pmfs_transaction_t *trans, void *addr, uint16_t size, u8 type)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	pmfs_logentry_t *le;
+	pmfs_logentry_t *start_le;
+	int num_les = 0, i;
+	uint64_t le_start = size ? pmfs_get_addr_off(sbi, addr) : 0;
+	uint8_t le_size;
+
+	if (trans == NULL)
+		return -EINVAL;
+	le = trans->start_addr + trans->num_used;
+	start_le = le;
+
+	if (size == 0) {
+		/* At least one log entry required for commit/abort log entry */
+		if ((type & LE_COMMIT) || (type & LE_ABORT))
+			num_les = 1;
+	} else
+		num_les = (size + sizeof(le->data) - 1)/sizeof(le->data);
+
+	pmfs_dbg_trans("add le id %d size %x, num_les %d tail %x le %p\n",
+		trans->transaction_id, size, trans->num_entries,
+		trans->num_used, le);
+
+	if ((trans->num_used + num_les) > trans->num_entries) {
+		pmfs_err(sb, "Log Entry full. tid %x ne %x tail %x size %x\n",
+			trans->transaction_id, trans->num_entries,
+			trans->num_used, size);
+		dump_transaction(sbi, trans);
+		dump_stack();
+		return -ENOMEM;
+	}
+
+	pmfs_memunlock_range(sb, start_le, sizeof(*le) * num_les);
+	for (i = 0; i < num_les; i++) {
+		le->addr_offset = cpu_to_le64(le_start);
+		le->transaction_id = cpu_to_le32(trans->transaction_id);
+		/* last entry carries the remainder; earlier ones are full */
+		le_size = (i == (num_les - 1)) ? size : sizeof(le->data);
+		le->size = le_size;
+		size -= le_size;
+		if (le_size)
+			memcpy(le->data, addr, le_size);
+		le->type = type;
+
+		if (i == 0 && trans->num_used == 0)
+			le->type |= LE_START;
+		trans->num_used++;
+
+		/* handle special log entry */
+		if (i == (num_les - 1) && (type & LE_COMMIT)) {
+			pmfs_commit_logentry(sb, trans, le);
+			/* re-lock the region that was unlocked above */
+			pmfs_memlock_range(sb, start_le,
+					sizeof(*le) * num_les);
+			return 0;
+		}
+		/* put a compile time barrier so that compiler doesnt reorder
+		 * the writes to the log entry */
+		barrier();
+
+		/* Atomically make the log entry valid */
+		le->gen_id = cpu_to_le16(trans->gen_id);
+		pmfs_flush_buffer(le, LOGENTRY_SIZE, false);
+
+		addr += le_size;
+		le_start += le_size;
+		le++;
+	}
+	/* re-lock the region that was unlocked above (not the advanced le) */
+	pmfs_memlock_range(sb, start_le, sizeof(*le) * num_les);
+	if (!sbi->redo_log) {
+		PERSISTENT_MARK();
+		PERSISTENT_BARRIER();
+	}
+	return 0;
+}
+
+/*
+ * Commit @trans: append the commit record, pop the transaction off the
+ * per-task stack and free it.  A NULL @trans is a no-op.
+ * NOTE(review): the return value of pmfs_add_logentry() is ignored
+ * here; a full transaction would fail to commit silently — confirm
+ * callers always size their transactions correctly.
+ */
+int pmfs_commit_transaction(struct super_block *sb,
+		pmfs_transaction_t *trans)
+{
+	if (trans == NULL)
+		return 0;
+	/* Add the commit log-entry */
+	pmfs_add_logentry(sb, trans, NULL, 0, LE_COMMIT);
+
+	pmfs_dbg_trans("completing transaction for id %d\n",
+		trans->transaction_id);
+
+	/* pop the per-task transaction stack */
+	current->journal_info = trans->parent;
+	pmfs_free_transaction(trans);
+	return 0;
+}
+
+/*
+ * Abort @trans.  In undo mode the in-place updates are rolled back
+ * from the undo records and persisted before the abort record is
+ * logged; in redo mode nothing was applied yet, so logging the abort
+ * record suffices.  A NULL @trans is a no-op.  Always returns 0.
+ */
+int pmfs_abort_transaction(struct super_block *sb, pmfs_transaction_t *trans)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+
+	if (trans == NULL)
+		return 0;
+	pmfs_dbg_trans("abort trans for tid %x sa %p numle %d tail %x gen %d\n",
+		trans->transaction_id, trans->start_addr, trans->num_entries,
+		trans->num_used, trans->gen_id);
+	dump_transaction(sbi, trans);
+	/*dump_stack();*/
+
+	if (!sbi->redo_log) {
+		/* Undo Log */
+		pmfs_undo_transaction(sb, trans);
+		PERSISTENT_MARK();
+		PERSISTENT_BARRIER();
+	}
+	/* add a abort log entry */
+	pmfs_add_logentry(sb, trans, NULL, 0, LE_ABORT);
+	current->journal_info = trans->parent;
+	pmfs_free_transaction(trans);
+	return 0;
+}
+
+/*
+ * Invalidate every log entry from @jtail to the end of the journal area
+ * by clearing its gen_id, so stale entries can never match a future
+ * generation after gen_id wraps.
+ *
+ * Fix vs. original: the length for the closing pmfs_memlock_range()
+ * was computed as "jsize - jtail" *after* the loop, by which point
+ * jtail had been advanced to jsize — re-locking zero bytes and leaving
+ * the region writeable.  Compute the length once, up front.
+ */
+static void invalidate_remaining_journal(struct super_block *sb,
+	void *journal_vaddr, uint32_t jtail, uint32_t jsize)
+{
+	pmfs_logentry_t *le = (pmfs_logentry_t *)(journal_vaddr + jtail);
+	void *start = le;
+	uint32_t length = jsize - jtail;
+
+	pmfs_memunlock_range(sb, start, length);
+	while (jtail < jsize) {
+		invalidate_gen_id(le);
+		le++;
+		jtail += LOGENTRY_SIZE;
+	}
+	pmfs_memlock_range(sb, start, length);
+}
+
+/* we need to increase the gen_id to invalidate all the journal log
+ * entries. This is because after the recovery, we may still have some
+ * valid log entries beyond the tail (before power failure, they became
+ * persistent before the journal tail could become persistent.
+ * should gen_id and head be updated atomically? not necessarily? we
+ * can update gen_id before journal head because gen_id and head are in
+ * the same cacheline */
+static void pmfs_forward_journal(struct super_block *sb, struct pmfs_sb_info
+	*sbi, pmfs_journal_t *journal)
+{
+	uint16_t gen_id = le16_to_cpu(journal->gen_id);
+	/* handle gen_id wrap around */
+	if (gen_id == MAX_GEN_ID) {
+		/* wipe stale entries so they can't match generation 1 again */
+		invalidate_remaining_journal(sb, sbi->journal_base_addr,
+			le32_to_cpu(journal->tail), sbi->jsize);
+	}
+	PERSISTENT_MARK();
+	gen_id = next_gen_id(gen_id);
+	/* make all changes persistent before advancing gen_id and head */
+	PERSISTENT_BARRIER();
+	pmfs_memunlock_range(sb, journal, sizeof(*journal));
+	journal->gen_id = cpu_to_le16(gen_id);
+	barrier();
+	/* head == tail marks the journal empty again */
+	journal->head = journal->tail;
+	pmfs_memlock_range(sb, journal, sizeof(*journal));
+	pmfs_flush_buffer(journal, sizeof(*journal), false);
+}
+
+/*
+ * Mount-time recovery for undo mode: walk the log *backwards* from
+ * tail to head, rolling back each transaction whose entries carry the
+ * current generation id.  Entries with a stale gen_id were never
+ * validated and are skipped (and invalidated if gen_id is about to
+ * wrap).  Finishes by forwarding the journal to mark it empty.
+ */
+static int pmfs_recover_undo_journal(struct super_block *sb)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	pmfs_journal_t *journal = pmfs_get_journal(sb);
+	uint32_t tail = le32_to_cpu(journal->tail);
+	uint32_t head = le32_to_cpu(journal->head);
+	uint16_t gen_id = le16_to_cpu(journal->gen_id);
+	pmfs_logentry_t *le;
+
+	while (head != tail) {
+		/* handle journal wraparound */
+		if (tail == 0)
+			gen_id = prev_gen_id(gen_id);
+		tail = prev_log_entry(sbi->jsize, tail);
+
+		le = (pmfs_logentry_t *)(sbi->journal_base_addr + tail);
+		if (gen_id == le16_to_cpu(le->gen_id)) {
+			/* valid entry: undo the transaction it belongs to */
+			tail = pmfs_recover_transaction(sb, head, tail, le);
+		} else {
+			if (gen_id == MAX_GEN_ID) {
+				pmfs_memunlock_range(sb, le, sizeof(*le));
+				invalidate_gen_id(le);
+				pmfs_memlock_range(sb, le, sizeof(*le));
+			}
+		}
+	}
+	pmfs_forward_journal(sb, sbi, journal);
+	PERSISTENT_MARK();
+	PERSISTENT_BARRIER();
+	return 0;
+}
+
+/*
+ * Mount-time recovery for redo mode: walk the log *forwards* from head
+ * to tail, replaying each transaction whose entries carry the matching
+ * generation id.  Stale entries are skipped (and invalidated when
+ * gen_id is about to wrap).  Finishes by forwarding the journal to
+ * mark it empty.
+ */
+static int pmfs_recover_redo_journal(struct super_block *sb)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	pmfs_journal_t *journal = pmfs_get_journal(sb);
+	uint32_t tail = le32_to_cpu(journal->tail);
+	uint32_t head = le32_to_cpu(journal->head);
+	uint16_t gen_id = le16_to_cpu(journal->gen_id);
+	pmfs_logentry_t *le;
+
+	/* journal wrapped around. so head points to previous generation id */
+	if (tail < head)
+		gen_id = prev_gen_id(gen_id);
+
+	while (head != tail) {
+		le = (pmfs_logentry_t *)(sbi->journal_base_addr + head);
+		if (gen_id == le16_to_cpu(le->gen_id)) {
+			/* valid entry: replay (apply) this transaction */
+			head = pmfs_process_transaction(sb, head, tail,
+				le, true);
+		} else {
+			if (gen_id == MAX_GEN_ID) {
+				pmfs_memunlock_range(sb, le, sizeof(*le));
+				invalidate_gen_id(le);
+				pmfs_memlock_range(sb, le, sizeof(*le));
+			}
+			head = next_log_entry(sbi->jsize, head);
+		}
+		/* handle journal wraparound */
+		if (head == 0)
+			gen_id = next_gen_id(gen_id);
+	}
+	pmfs_forward_journal(sb, sbi, journal);
+	PERSISTENT_MARK();
+	PERSISTENT_BARRIER();
+	return 0;
+}
+
+/*
+ * Mount-time journal recovery entry point.  A clean unmount leaves
+ * head == tail (empty journal) and there is nothing to do; otherwise
+ * dispatch to the redo or undo recovery pass depending on the
+ * journaling mode recorded in the superblock info.  Always returns 0.
+ */
+int pmfs_recover_journal(struct super_block *sb)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	pmfs_journal_t *journal = pmfs_get_journal(sb);
+	uint32_t tail = le32_to_cpu(journal->tail);
+	uint32_t head = le32_to_cpu(journal->head);
+	uint16_t gen_id = le16_to_cpu(journal->gen_id);
+
+	/* is the journal empty? true if unmounted properly. */
+	if (head == tail)
+		return 0;
+	pmfs_dbg("PMFS: journal recovery. head:tail %x:%x gen_id %d\n",
+		head, tail, gen_id);
+	if (sbi->redo_log)
+		pmfs_recover_redo_journal(sb);
+	else
+		pmfs_recover_undo_journal(sb);
+	return 0;
+}
+
diff --git a/fs/pmfs/journal.h b/fs/pmfs/journal.h
new file mode 100644
index 0000000..6781029
--- /dev/null
+++ b/fs/pmfs/journal.h
@@ -0,0 +1,101 @@
+/*
+ * Persistent Memory File System
+ * Copyright (c) 2012-2013, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#ifndef __PMFS_JOURNAL_H__
+#define __PMFS_JOURNAL_H__
+#include <linux/slab.h>
+
+/* default pmfs journal size 4MB */
+#define PMFS_DEFAULT_JOURNAL_SIZE (4 << 20)
+/* minimum pmfs journal size 64KB */
+#define PMFS_MINIMUM_JOURNAL_SIZE (1 << 16)
+
+#define CACHELINE_SIZE (64)
+#define CLINE_SHIFT (6)
+#define CACHELINE_MASK (~(CACHELINE_SIZE - 1))
+#define CACHELINE_ALIGN(addr) (((addr)+CACHELINE_SIZE-1) & CACHELINE_MASK)
+
+#define LOGENTRY_SIZE CACHELINE_SIZE
+#define LESIZE_SHIFT CLINE_SHIFT
+
+#define MAX_INODE_LENTRIES (2)
+#define MAX_SB_LENTRIES (2)
+/* 1 le for dir entry and 1 le for potentially allocating a new dir block */
+#define MAX_DIRENTRY_LENTRIES (2)
+/* 2 le for adding or removing the inode from truncate list. used to log
+ * potential changes to inode table's i_next_truncate and i_sum */
+#define MAX_TRUNCATE_LENTRIES (2)
+#define MAX_DATA_PER_LENTRY 48
+/* blocksize * max_btree_height */
+#define MAX_METABLOCK_LENTRIES \
+	((PMFS_DEF_BLOCK_SIZE_4K * 3)/MAX_DATA_PER_LENTRY)
+
+#define MAX_PTRS_PER_LENTRY (MAX_DATA_PER_LENTRY / sizeof(u64))
+
+#define TRANS_RUNNING 1
+#define TRANS_COMMITTED 2
+#define TRANS_ABORTED 3
+
+#define LE_DATA 0
+#define LE_START 1
+#define LE_COMMIT 2
+#define LE_ABORT 4
+
+#define MAX_GEN_ID ((uint16_t)-1)
+
+/* persistent data structure to describe a single log-entry */
+/* every log entry is max CACHELINE_SIZE bytes in size */
+typedef struct {
+	__le64 addr_offset;
+	__le32 transaction_id;
+	__le16 gen_id;
+	u8 type; /* normal, commit, or abort */
+	u8 size;
+	char data[48];
+} pmfs_logentry_t;
+
+/* volatile data structure to describe a transaction */
+typedef struct pmfs_transaction {
+	u32 transaction_id;
+	u16 num_entries;
+	u16 num_used;
+	u16 gen_id;
+	u16 status;
+	pmfs_journal_t *t_journal;
+	pmfs_logentry_t *start_addr;
+	struct pmfs_transaction *parent;
+} pmfs_transaction_t;
+
+/* plain extern prototypes: "extern inline" declarations without a
+ * visible body are meaningless to callers and break under C99 inline
+ * semantics; the out-of-line definitions live in the .c files. */
+extern pmfs_transaction_t *pmfs_alloc_transaction(void);
+extern void pmfs_free_transaction(pmfs_transaction_t *trans);
+
+extern int pmfs_journal_soft_init(struct super_block *sb);
+extern int pmfs_journal_hard_init(struct super_block *sb,
+	uint64_t base, uint32_t size);
+extern int pmfs_journal_uninit(struct super_block *sb);
+extern pmfs_transaction_t *pmfs_new_transaction(struct super_block *sb,
+	int nclines);
+extern pmfs_transaction_t *pmfs_current_transaction(void);
+extern int pmfs_add_logentry(struct super_block *sb,
+	pmfs_transaction_t *trans, void *addr, uint16_t size, u8 type);
+extern int pmfs_commit_transaction(struct super_block *sb,
+	pmfs_transaction_t *trans);
+extern int pmfs_abort_transaction(struct super_block *sb,
+	pmfs_transaction_t *trans);
+extern int pmfs_recover_journal(struct super_block *sb);
+
+#endif /* __PMFS_JOURNAL_H__ */
diff --git a/fs/pmfs/namei.c b/fs/pmfs/namei.c
new file mode 100644
index 0000000..490d09d
--- /dev/null
+++ b/fs/pmfs/namei.c
@@ -0,0 +1,797 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Inode operations for directories.
+ *
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.stornelli at gmail.com>
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include "pmfs.h"
+#include "xip.h"
+
+/*
+ * Couple of helper functions - make the code slightly cleaner.
+ */
+/* Bump the VFS link count and propagate it to the persistent inode. */
+static inline void pmfs_inc_count(struct inode *inode, struct pmfs_inode *pi)
+{
+	inc_nlink(inode);
+	pmfs_update_nlink(inode, pi);
+}
+
+/* Drop one VFS link (if any remain) and propagate the new count to the
+ * persistent inode; a no-op when i_nlink is already zero. */
+static inline void pmfs_dec_count(struct inode *inode, struct pmfs_inode *pi)
+{
+	if (!inode->i_nlink)
+		return;
+
+	drop_nlink(inode);
+	pmfs_update_nlink(inode, pi);
+}
+
+/*
+ * Link a freshly created non-directory inode into @dir under @dentry.
+ * On success instantiates the dentry and returns 0; on failure drops
+ * the link count on the persistent inode, releases the inode and
+ * returns the error from pmfs_add_entry().  In both cases the
+ * I_NEW state is cleared via unlock_new_inode().
+ */
+static inline int pmfs_add_nondir(pmfs_transaction_t *trans,
+		struct inode *dir, struct dentry *dentry, struct inode *inode)
+{
+	struct pmfs_inode *pi;
+	int err = pmfs_add_entry(trans, dentry, inode);
+
+	if (!err) {
+		d_instantiate(dentry, inode);
+		unlock_new_inode(inode);
+		return 0;
+	}
+	pi = pmfs_get_inode(inode->i_sb, inode->i_ino);
+	pmfs_dec_count(inode, pi);
+	unlock_new_inode(inode);
+	iput(inode);
+	return err;
+}
+
+/* Step to the next directory entry by adding this entry's on-disk
+ * record length to its address. */
+static inline struct pmfs_direntry *pmfs_next_entry(struct pmfs_direntry *p)
+{
+	unsigned rec_len = le16_to_cpu(p->de_len);
+
+	return (struct pmfs_direntry *)((char *)p + rec_len);
+}
+
+/*
+ * Methods themselves.
+ */
+/*
+ * Sanity-check a directory entry: record length must be at least the
+ * minimal record, 4-byte aligned, large enough for the name, and must
+ * not cross a block boundary.  Logs a debug message naming @function
+ * on failure.  Returns 1 if the entry looks valid, 0 otherwise.
+ */
+int pmfs_check_dir_entry(const char *function, struct inode *dir,
+			  struct pmfs_direntry *de, u8 *base,
+			  unsigned long offset)
+{
+	const char *error_msg = NULL;
+	const int rlen = le16_to_cpu(de->de_len);
+
+	if (unlikely(rlen < PMFS_DIR_REC_LEN(1)))
+		error_msg = "de_len is smaller than minimal";
+	else if (unlikely(rlen % 4 != 0))
+		error_msg = "de_len % 4 != 0";
+	else if (unlikely(rlen < PMFS_DIR_REC_LEN(de->name_len)))
+		error_msg = "de_len is too small for name_len";
+	else if (unlikely((((u8 *)de - base) + rlen > dir->i_sb->s_blocksize)))
+		error_msg = "directory entry across blocks";
+
+	if (unlikely(error_msg != NULL)) {
+		pmfs_dbg("bad entry in directory #%lu: %s - "
+			  "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
+			  dir->i_ino, error_msg, offset,
+			  (unsigned long)le64_to_cpu(de->ino), rlen,
+			  de->name_len);
+	}
+
+	return error_msg == NULL ? 1 : 0;
+}
+
+/*
+ * Returns 0 if not found, -1 on failure, and 1 on success
+ */
+/*
+ * Scan one directory block at @blk_base for the name in @child.
+ * On a match, *res_dir is set to the entry and, if @prev_dir is
+ * non-NULL, *prev_dir to the entry preceding it (NULL when the match
+ * is the first entry in the block).  @offset is only used for error
+ * reporting by pmfs_check_dir_entry().
+ */
+int pmfs_search_dirblock(u8 *blk_base, struct inode *dir, struct qstr *child,
+			  unsigned long offset,
+			  struct pmfs_direntry **res_dir,
+			  struct pmfs_direntry **prev_dir)
+{
+	struct pmfs_direntry *de;
+	struct pmfs_direntry *pde = NULL;
+	char *dlimit;
+	int de_len;
+	const char *name = child->name;
+	int namelen = child->len;
+
+	de = (struct pmfs_direntry *)blk_base;
+	dlimit = blk_base + dir->i_sb->s_blocksize;
+	while ((char *)de < dlimit) {
+		/* this code is executed quadratically often */
+		/* do minimal checking `by hand' */
+
+		if ((char *)de + namelen <= dlimit &&
+		    pmfs_match(namelen, name, de)) {
+			/* found a match - just to be sure, do a full check */
+			if (!pmfs_check_dir_entry("pmfs_inode_by_name",
+						  dir, de, blk_base, offset))
+				return -1;
+			*res_dir = de;
+			if (prev_dir)
+				*prev_dir = pde;
+			return 1;
+		}
+		/* prevent looping on a bad block */
+		de_len = le16_to_cpu(de->de_len);
+		if (de_len <= 0)
+			return -1;
+		offset += de_len;
+		pde = de;
+		de = (struct pmfs_direntry *)((char *)de + de_len);
+	}
+	return 0;
+}
+
+/*
+ * Look up @entry in directory @dir and return its inode number, or 0
+ * if not found.  *res_entry is set to the matching directory entry.
+ * The scan starts at the cached per-inode i_dir_start_lookup block and
+ * wraps around; if the directory grew during the scan, the new tail
+ * blocks are searched once before giving up.
+ */
+static ino_t pmfs_inode_by_name(struct inode *dir, struct qstr *entry,
+				 struct pmfs_direntry **res_entry)
+{
+	struct pmfs_inode *pi;
+	ino_t i_no = 0;
+	int namelen, nblocks, i;
+	u8 *blk_base;
+	const u8 *name = entry->name;
+	struct super_block *sb = dir->i_sb;
+	unsigned long block, start;
+	struct pmfs_inode_vfs *si = PMFS_I(dir);
+
+	pi = pmfs_get_inode(sb, dir->i_ino);
+
+	namelen = entry->len;
+	if (namelen > PMFS_NAME_LEN)
+		return 0;
+	if ((namelen <= 2) && (name[0] == '.') &&
+	    (name[1] == '.' || name[1] == 0)) {
+		/*
+		 * "." or ".." will only be in the first block
+		 */
+		block = start = 0;
+		nblocks = 1;
+		goto restart;
+	}
+	nblocks = dir->i_size >> dir->i_sb->s_blocksize_bits;
+	start = si->i_dir_start_lookup;
+	if (start >= nblocks)
+		start = 0;
+	block = start;
+restart:
+	do {
+		blk_base =
+			pmfs_get_block(sb, pmfs_find_data_block(dir, block));
+		if (!blk_base)
+			goto done;
+		i = pmfs_search_dirblock(blk_base, dir, entry,
+					  block << sb->s_blocksize_bits,
+					  res_entry, NULL);
+		if (i == 1) {
+			/* cache the hit block for the next lookup */
+			si->i_dir_start_lookup = block;
+			i_no = le64_to_cpu((*res_entry)->ino);
+			goto done;
+		} else {
+			if (i < 0)
+				goto done;
+		}
+		if (++block >= nblocks)
+			block = 0;
+	} while (block != start);
+	/*
+	 * If the directory has grown while we were searching, then
+	 * search the last part of the directory before giving up.
+	 */
+	block = nblocks;
+	nblocks = dir->i_size >> sb->s_blocksize_bits;
+	if (block < nblocks) {
+		start = 0;
+		goto restart;
+	}
+done:
+	return i_no;
+}
+
+/*
+ * VFS ->lookup: find @dentry's name in @dir and splice in the matching
+ * inode (a NULL inode yields a negative dentry).  Returns
+ * ERR_PTR(-ENAMETOOLONG) for oversized names and ERR_PTR(-EIO) when a
+ * directory entry references a deleted (stale) inode.
+ *
+ * Fix vs. original: pmfs_err() was called with __func__ as the format
+ * string ("%s"-less format-string misuse — the real message and the
+ * inode number were passed as unformatted varargs).  Pass a proper
+ * format string instead.
+ */
+static struct dentry *pmfs_lookup(struct inode *dir, struct dentry *dentry,
+				   struct nameidata *nd)
+{
+	struct inode *inode = NULL;
+	struct pmfs_direntry *de;
+	ino_t ino;
+
+	if (dentry->d_name.len > PMFS_NAME_LEN)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	ino = pmfs_inode_by_name(dir, &dentry->d_name, &de);
+	if (ino) {
+		inode = pmfs_iget(dir->i_sb, ino);
+		if (inode == ERR_PTR(-ESTALE)) {
+			pmfs_err(dir->i_sb,
+				"%s: deleted inode referenced: %lu\n",
+				__func__, (unsigned long)ino);
+			return ERR_PTR(-EIO);
+		}
+	}
+
+	return d_splice_alias(inode, dentry);
+}
+
+/*
+ * By the time this is called, we already have created
+ * the directory cache entry for the new file, but it
+ * is so far negative - it has no inode.
+ *
+ * If the create succeeds, we fill in the inode information
+ * with d_instantiate().
+ */
+static int pmfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+			struct nameidata *nd)
+{
+	struct inode *inode = NULL;
+	int err = PTR_ERR(inode);
+	struct super_block *sb = dir->i_sb;
+	pmfs_transaction_t *trans;
+
+	/* two log entries for new inode, 1 lentry for dir inode, 1 for dir
+	 * inode's b-tree, 2 lentries for logging dir entry
+	 */
+	trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES * 2 +
+			MAX_DIRENTRY_LENTRIES);
+	if (IS_ERR(trans)) {
+		err = PTR_ERR(trans);
+		goto out;
+	}
+
+	inode = pmfs_new_inode(trans, dir, mode, &dentry->d_name);
+	if (IS_ERR(inode))
+		goto out_err;
+	/* regular files use the XIP (execute-in-place) file/aops */
+	inode->i_op = &pmfs_file_inode_operations;
+	inode->i_mapping->a_ops = &pmfs_aops_xip;
+	inode->i_fop = &pmfs_xip_file_operations;
+	err = pmfs_add_nondir(trans, dir, dentry, inode);
+	if (err)
+		goto out_err;
+	pmfs_commit_transaction(sb, trans);
+out:
+	return err;
+out_err:
+	pmfs_abort_transaction(sb, trans);
+	return err;
+}
+
+/*
+ * VFS ->mknod: create a special file (device node, fifo, socket) named
+ * by @dentry in @dir.  For block/char devices the device number is
+ * recorded in the persistent inode.
+ *
+ * Fix vs. original: the error check after pmfs_new_inode() was
+ * inverted ("if (!IS_ERR(inode)) goto out_err;"), so a successful
+ * allocation jumped straight to the abort path while a failure fell
+ * through and dereferenced an ERR_PTR.  Also, err is no longer
+ * pre-initialized from PTR_ERR(NULL).
+ */
+static int pmfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
+		       dev_t rdev)
+{
+	struct inode *inode = NULL;
+	int err;
+	pmfs_transaction_t *trans;
+	struct super_block *sb = dir->i_sb;
+	struct pmfs_inode *pi;
+
+	/* 2 log entries for new inode, 1 lentry for dir inode, 1 for dir
+	 * inode's b-tree, 2 lentries for logging dir entry
+	 */
+	trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES * 2 +
+			MAX_DIRENTRY_LENTRIES);
+	if (IS_ERR(trans)) {
+		err = PTR_ERR(trans);
+		goto out;
+	}
+
+	inode = pmfs_new_inode(trans, dir, mode, &dentry->d_name);
+	err = PTR_ERR(inode);
+	if (IS_ERR(inode))
+		goto out_err;
+	init_special_inode(inode, mode, rdev);
+	inode->i_op = &pmfs_special_inode_operations;
+
+	pi = pmfs_get_inode(sb, inode->i_ino);
+	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
+		pi->dev.rdev = cpu_to_le32(inode->i_rdev);
+	err = pmfs_add_nondir(trans, dir, dentry, inode);
+	if (err)
+		goto out_err;
+	pmfs_commit_transaction(sb, trans);
+out:
+	return err;
+out_err:
+	pmfs_abort_transaction(sb, trans);
+	return err;
+}
+
+/*
+ * VFS ->symlink: create a symbolic link @dentry in @dir whose target
+ * is @symname.  The target string is stored in one data block
+ * (pmfs_block_symlink), so names that do not fit in a block (plus the
+ * NUL) are rejected with -ENAMETOOLONG up front.
+ */
+static int pmfs_symlink(struct inode *dir, struct dentry *dentry,
+			 const char *symname)
+{
+	struct super_block *sb = dir->i_sb;
+	int err = -ENAMETOOLONG;
+	unsigned len = strlen(symname);
+	struct inode *inode;
+	pmfs_transaction_t *trans;
+	struct pmfs_inode *pi;
+
+	if (len + 1 > sb->s_blocksize)
+		goto out;
+
+	/* 2 log entries for new inode, 1 lentry for dir inode, 1 for dir
+	 * inode's b-tree, 2 lentries for logging dir entry
+	 */
+	trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES * 2 +
+			MAX_DIRENTRY_LENTRIES);
+	if (IS_ERR(trans)) {
+		err = PTR_ERR(trans);
+		goto out;
+	}
+
+	inode = pmfs_new_inode(trans, dir, S_IFLNK|S_IRWXUGO, &dentry->d_name);
+	err = PTR_ERR(inode);
+	if (IS_ERR(inode)) {
+		pmfs_abort_transaction(sb, trans);
+		goto out;
+	}
+
+	inode->i_op = &pmfs_symlink_inode_operations;
+	inode->i_mapping->a_ops = &pmfs_aops_xip;
+
+	pi = pmfs_get_inode(sb, inode->i_ino);
+	/* copy the target path into the link's data block */
+	err = pmfs_block_symlink(inode, symname, len);
+	if (err)
+		goto out_fail;
+
+	inode->i_size = len;
+	pmfs_update_isize(inode, pi);
+
+	err = pmfs_add_nondir(trans, dir, dentry, inode);
+	if (err) {
+		/* free up the allocated block to the symlink inode */
+		pmfs_setsize(inode, 0);
+		pmfs_abort_transaction(sb, trans);
+		goto out;
+	}
+
+	pmfs_commit_transaction(sb, trans);
+out:
+	return err;
+
+out_fail:
+	pmfs_dec_count(inode, pi);
+	unlock_new_inode(inode);
+	iput(inode);
+	pmfs_abort_transaction(sb, trans);
+	goto out;
+}
+
+/*
+ * VFS ->link: add a new hard link @dentry in @dir to the inode behind
+ * @dest_dentry.  Refuses with -EMLINK once PMFS_LINK_MAX is reached.
+ * Only ctime and the link count of the target's persistent inode
+ * change, so just its first 48 bytes are journaled.
+ */
+static int pmfs_link(struct dentry *dest_dentry, struct inode *dir,
+		      struct dentry *dentry)
+{
+	struct inode *inode = dest_dentry->d_inode;
+	int err = -ENOMEM;
+	pmfs_transaction_t *trans;
+	struct super_block *sb = inode->i_sb;
+	struct pmfs_inode *pi = pmfs_get_inode(sb, inode->i_ino);
+
+	if (inode->i_nlink >= PMFS_LINK_MAX)
+		return -EMLINK;
+
+	trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES * 2 +
+			MAX_DIRENTRY_LENTRIES);
+	if (IS_ERR(trans)) {
+		err = PTR_ERR(trans);
+		goto out;
+	}
+	/* only need to log the first 48 bytes since we only modify ctime and
+	 * i_links_count in this system call */
+	pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA);
+
+	/* extra reference for the new link; dropped again on failure */
+	ihold(inode);
+
+	err = pmfs_add_entry(trans, dentry, inode);
+	if (!err) {
+		inode->i_ctime = CURRENT_TIME_SEC;
+		inc_nlink(inode);
+
+		pmfs_memunlock_inode(sb, pi);
+		pi->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
+		pi->i_links_count = cpu_to_le16(inode->i_nlink);
+		pmfs_memlock_inode(sb, pi);
+
+		d_instantiate(dentry, inode);
+		pmfs_commit_transaction(sb, trans);
+	} else {
+		iput(inode);
+		pmfs_abort_transaction(sb, trans);
+	}
+out:
+	return err;
+}
+
+/*
+ * VFS ->unlink: remove @dentry's directory entry from @dir and drop one
+ * link on the target inode.  When this is the last link, the inode is
+ * put on the persistent truncate list first so a crash before the
+ * later evict_inode cannot leak its blocks.
+ */
+static int pmfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	int retval = -ENOMEM;
+	pmfs_transaction_t *trans;
+	struct super_block *sb = inode->i_sb;
+	struct pmfs_inode *pi = pmfs_get_inode(sb, inode->i_ino);
+
+	trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES * 2 +
+		MAX_DIRENTRY_LENTRIES);
+	if (IS_ERR(trans)) {
+		retval = PTR_ERR(trans);
+		goto out;
+	}
+	/* journal the first 48 bytes of the persistent inode (ctime/nlink) */
+	pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA);
+
+	retval = pmfs_remove_entry(trans, dentry, inode);
+	if (retval)
+		goto end_unlink;
+
+	if (inode->i_nlink == 1)
+		pmfs_truncate_add(inode, inode->i_size);
+	inode->i_ctime = dir->i_ctime;
+
+	pmfs_memunlock_inode(sb, pi);
+	if (inode->i_nlink) {
+		drop_nlink(inode);
+		pi->i_links_count = cpu_to_le16(inode->i_nlink);
+	}
+	pi->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
+	pmfs_memlock_inode(sb, pi);
+
+	pmfs_commit_transaction(sb, trans);
+	return 0;
+end_unlink:
+	pmfs_abort_transaction(sb, trans);
+out:
+	return retval;
+}
+
+/*
+ * VFS ->mkdir: create directory @dentry in @dir.  Allocates the new
+ * inode, writes the "." and ".." entries directly into its first data
+ * block (persisted with a flush, not journaled, since the block is
+ * fresh), links it into the parent and bumps the parent's link count.
+ */
+static int pmfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	struct inode *inode;
+	struct pmfs_inode *pi, *pidir;
+	struct pmfs_direntry *de = NULL;
+	struct super_block *sb = dir->i_sb;
+	pmfs_transaction_t *trans;
+	int err = -EMLINK;
+	char *blk_base;
+
+	if (dir->i_nlink >= PMFS_LINK_MAX)
+		goto out;
+
+	trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES * 2 +
+			MAX_DIRENTRY_LENTRIES);
+	if (IS_ERR(trans)) {
+		err = PTR_ERR(trans);
+		goto out;
+	}
+
+	inode = pmfs_new_inode(trans, dir, S_IFDIR | mode, &dentry->d_name);
+	err = PTR_ERR(inode);
+	if (IS_ERR(inode)) {
+		pmfs_abort_transaction(sb, trans);
+		goto out;
+	}
+
+	inode->i_op = &pmfs_dir_inode_operations;
+	inode->i_fop = &pmfs_dir_operations;
+	inode->i_mapping->a_ops = &pmfs_aops_xip;
+
+	/* since this is a new inode so we dont need to include this
+	 * pmfs_alloc_blocks in the transaction
+	 */
+	err = pmfs_alloc_blocks(NULL, inode, 0, 1, false);
+	if (err)
+		goto out_clear_inode;
+	inode->i_size = sb->s_blocksize;
+
+	blk_base = pmfs_get_block(sb, pmfs_find_data_block(inode, 0));
+	de = (struct pmfs_direntry *)blk_base;
+	pmfs_memunlock_range(sb, blk_base, sb->s_blocksize);
+	/* "." entry, minimal record length */
+	de->ino = cpu_to_le64(inode->i_ino);
+	de->name_len = 1;
+	de->de_len = cpu_to_le16(PMFS_DIR_REC_LEN(de->name_len));
+	strcpy(de->name, ".");
+	/*de->file_type = S_IFDIR; */
+	de = pmfs_next_entry(de);
+	/* ".." entry spans the rest of the block */
+	de->ino = cpu_to_le64(dir->i_ino);
+	de->de_len = cpu_to_le16(sb->s_blocksize - PMFS_DIR_REC_LEN(1));
+	de->name_len = 2;
+	strcpy(de->name, "..");
+	/*de->file_type = S_IFDIR; */
+	pmfs_memlock_range(sb, blk_base, sb->s_blocksize);
+
+	/* No need to journal the dir entries but we need to persist them */
+	pmfs_flush_buffer(blk_base, PMFS_DIR_REC_LEN(1) +
+			PMFS_DIR_REC_LEN(2), true);
+
+	set_nlink(inode, 2);
+
+	err = pmfs_add_entry(trans, dentry, inode);
+	if (err) {
+		pmfs_dbg_verbose("failed to add dir entry\n");
+		goto out_clear_inode;
+	}
+	pi = pmfs_get_inode(sb, inode->i_ino);
+	pmfs_memunlock_inode(sb, pi);
+	pi->i_links_count = cpu_to_le16(inode->i_nlink);
+	pi->i_size = cpu_to_le64(inode->i_size);
+	pmfs_memlock_inode(sb, pi);
+
+	/* the new subdirectory's ".." adds one link to the parent */
+	pidir = pmfs_get_inode(sb, dir->i_ino);
+	pmfs_inc_count(dir, pidir);
+	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
+
+	pmfs_commit_transaction(sb, trans);
+
+out:
+	return err;
+
+out_clear_inode:
+	clear_nlink(inode);
+	unlock_new_inode(inode);
+	iput(inode);
+	pmfs_abort_transaction(sb, trans);
+	goto out;
+}
+
+/*
+ * routine to check that the specified directory is empty (for rmdir)
+ */
+/*
+ * Returns 1 if @inode contains only "." and ".." (or is too corrupt to
+ * walk — corruption is treated as empty, matching ext2's behavior),
+ * 0 if any live entry (nonzero ino) is found.
+ */
+static int pmfs_empty_dir(struct inode *inode)
+{
+	unsigned long offset;
+	struct pmfs_direntry *de, *de1;
+	struct super_block *sb;
+	char *blk_base;
+	int err = 0;
+
+	sb = inode->i_sb;
+	if (inode->i_size < PMFS_DIR_REC_LEN(1) + PMFS_DIR_REC_LEN(2)) {
+		pmfs_dbg("bad directory (dir #%lu)-no data block",
+			  inode->i_ino);
+		return 1;
+	}
+
+	blk_base = pmfs_get_block(sb, pmfs_find_data_block(inode, 0));
+	if (!blk_base) {
+		pmfs_dbg("bad directory (dir #%lu)-no data block",
+			  inode->i_ino);
+		return 1;
+	}
+
+	/* the first two entries must be "." and ".." */
+	de = (struct pmfs_direntry *)blk_base;
+	de1 = pmfs_next_entry(de);
+
+	if (le64_to_cpu(de->ino) != inode->i_ino || !le64_to_cpu(de1->ino) ||
+	    strcmp(".", de->name) || strcmp("..", de1->name)) {
+		pmfs_dbg("bad directory (dir #%lu) - no `.' or `..'",
+			  inode->i_ino);
+		return 1;
+	}
+	offset = le16_to_cpu(de->de_len) + le16_to_cpu(de1->de_len);
+	de = pmfs_next_entry(de1);
+	while (offset < inode->i_size) {
+		/* crossed a block boundary: map the next directory block */
+		if (!blk_base || (void *)de >= (void *)(blk_base +
+					sb->s_blocksize)) {
+			err = 0;
+			blk_base = pmfs_get_block(sb, pmfs_find_data_block(
+				    inode, offset >> sb->s_blocksize_bits));
+			if (!blk_base) {
+				pmfs_dbg("Error: reading dir #%lu offset %lu\n",
+					  inode->i_ino, offset);
+				offset += sb->s_blocksize;
+				continue;
+			}
+			de = (struct pmfs_direntry *)blk_base;
+		}
+		if (!pmfs_check_dir_entry("empty_dir", inode, de, blk_base,
+					offset)) {
+			/* bad entry: skip the rest of this block */
+			de = (struct pmfs_direntry *)(blk_base +
+				sb->s_blocksize);
+			offset = (offset | (sb->s_blocksize - 1)) + 1;
+			continue;
+		}
+		if (le64_to_cpu(de->ino))
+			return 0;
+		offset += le16_to_cpu(de->de_len);
+		de = pmfs_next_entry(de);
+	}
+	return 1;
+}
+
+/*
+ * VFS ->rmdir: remove the (empty) directory @dentry from @dir.  Drops
+ * the child's links to zero, drops the parent's extra link, and puts
+ * the child on the persistent truncate list so a crash before
+ * evict_inode cannot leak its blocks.
+ *
+ * Fix vs. original: the NULL check on @inode came *after*
+ * inode->i_sb and pmfs_get_inode() had already dereferenced it;
+ * reordered so the check is useful.
+ */
+static int pmfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	struct pmfs_direntry *de;
+	pmfs_transaction_t *trans;
+	struct super_block *sb;
+	struct pmfs_inode *pi, *pidir;
+	int err = -ENOTEMPTY;
+
+	if (!inode)
+		return -ENOENT;
+
+	sb = inode->i_sb;
+	pi = pmfs_get_inode(sb, inode->i_ino);
+
+	if (pmfs_inode_by_name(dir, &dentry->d_name, &de) == 0)
+		return -ENOENT;
+
+	if (!pmfs_empty_dir(inode))
+		return err;
+
+	if (inode->i_nlink != 2)
+		pmfs_dbg("empty directory has nlink!=2 (%d)", inode->i_nlink);
+
+	trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES * 2 +
+			MAX_DIRENTRY_LENTRIES);
+	if (IS_ERR(trans)) {
+		err = PTR_ERR(trans);
+		return err;
+	}
+	/* journal the first 48 bytes of the persistent inode */
+	pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA);
+
+	err = pmfs_remove_entry(trans, dentry, inode);
+	if (err)
+		goto end_rmdir;
+
+	/*inode->i_version++; */
+	clear_nlink(inode);
+	inode->i_ctime = dir->i_ctime;
+
+	pmfs_memunlock_inode(sb, pi);
+	pi->i_links_count = cpu_to_le16(inode->i_nlink);
+	pi->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+	pmfs_memlock_inode(sb, pi);
+
+	/* add the inode to truncate list in case a crash happens before the
+	 * subsequent evict_inode is called. It will be deleted from the
+	 * truncate list during evict_inode.
+	 */
+	pmfs_truncate_add(inode, inode->i_size);
+
+	pidir = pmfs_get_inode(sb, dir->i_ino);
+	pmfs_dec_count(dir, pidir);
+
+	pmfs_commit_transaction(sb, trans);
+	return err;
+end_rmdir:
+	pmfs_abort_transaction(sb, trans);
+	return err;
+}
+
+/*
+ * Journaled rename.  When the target name exists its entry is atomically
+ * retargeted at old_inode and the victim's link count dropped (queued for
+ * truncation when it hits zero); on a cross-directory directory move the
+ * parents' link counts are adjusted.
+ * Fix: a failed pmfs_new_transaction() used to "goto out", which called
+ * pmfs_abort_transaction() on an ERR_PTR value; it now returns directly.
+ */
+static int pmfs_rename(struct inode *old_dir,
+			struct dentry *old_dentry,
+			struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct inode *old_inode = old_dentry->d_inode;
+	struct inode *new_inode = new_dentry->d_inode;
+	struct pmfs_direntry *new_de = NULL, *old_de = NULL;
+	pmfs_transaction_t *trans;
+	struct super_block *sb = old_inode->i_sb;
+	struct pmfs_inode *pi, *new_pidir, *old_pidir;
+	int err = -ENOENT;
+
+	pmfs_inode_by_name(new_dir, &new_dentry->d_name, &new_de);
+	pmfs_inode_by_name(old_dir, &old_dentry->d_name, &old_de);
+
+	trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES * 4 +
+		MAX_DIRENTRY_LENTRIES * 2);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+
+	if (new_inode) {
+		/* replacing a directory requires the victim to be empty */
+		err = -ENOTEMPTY;
+		if (S_ISDIR(old_inode->i_mode) && !pmfs_empty_dir(new_inode))
+			goto out;
+	} else {
+		if (S_ISDIR(old_inode->i_mode)) {
+			err = -EMLINK;
+			if (new_dir->i_nlink >= PMFS_LINK_MAX)
+				goto out;
+		}
+	}
+
+	new_pidir = pmfs_get_inode(sb, new_dir->i_ino);
+
+	pi = pmfs_get_inode(sb, old_inode->i_ino);
+	pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA);
+
+	if (!new_de) {
+		/* link it into the new directory. */
+		err = pmfs_add_entry(trans, new_dentry, old_inode);
+		if (err)
+			goto out;
+	} else {
+		/* target name exists: atomically point it at old_inode */
+		pmfs_add_logentry(sb, trans, &new_de->ino, sizeof(new_de->ino),
+			LE_DATA);
+
+		pmfs_memunlock_range(sb, new_de, sb->s_blocksize);
+		new_de->ino = cpu_to_le64(old_inode->i_ino);
+		/*new_de->file_type = old_de->file_type; */
+		pmfs_memlock_range(sb, new_de, sb->s_blocksize);
+
+		pmfs_add_logentry(sb, trans, new_pidir, MAX_DATA_PER_LENTRY,
+			LE_DATA);
+		/*new_dir->i_version++; */
+		new_dir->i_ctime = new_dir->i_mtime = CURRENT_TIME_SEC;
+		pmfs_update_time(new_dir, new_pidir);
+	}
+
+	/* and unlink the inode from the old directory ... */
+	err = pmfs_remove_entry(trans, old_dentry, old_inode);
+	if (err)
+		goto out;
+
+	if (new_inode) {
+		pi = pmfs_get_inode(sb, new_inode->i_ino);
+		pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA);
+		new_inode->i_ctime = CURRENT_TIME;
+
+		pmfs_memunlock_inode(sb, pi);
+		if (S_ISDIR(old_inode->i_mode)) {
+			/* a replaced directory also loses its ".." link */
+			if (new_inode->i_nlink)
+				drop_nlink(new_inode);
+		}
+		/* NOTE(review): pmfs_rmdir stores this field with
+		 * cpu_to_le64 — confirm pi->i_ctime's width in pmfs_def.h */
+		pi->i_ctime = cpu_to_le32(new_inode->i_ctime.tv_sec);
+		if (new_inode->i_nlink)
+			drop_nlink(new_inode);
+		pi->i_links_count = cpu_to_le16(new_inode->i_nlink);
+		pmfs_memlock_inode(sb, pi);
+
+		/* victim dropped to zero links: queue for safe deletion */
+		if (!new_inode->i_nlink)
+			pmfs_truncate_add(new_inode, new_inode->i_size);
+	} else {
+		if (S_ISDIR(old_inode->i_mode)) {
+			pmfs_inc_count(new_dir, new_pidir);
+			old_pidir = pmfs_get_inode(sb, old_dir->i_ino);
+			pmfs_dec_count(old_dir, old_pidir);
+		}
+	}
+
+	pmfs_commit_transaction(sb, trans);
+	return 0;
+out:
+	pmfs_abort_transaction(sb, trans);
+	return err;
+}
+
+/* Export-ops helper: resolve the ".." entry of @child and return a
+ * dentry for the parent directory. */
+struct dentry *pmfs_get_parent(struct dentry *child)
+{
+	struct qstr dotdot = { .name = "..", .len = 2 };
+	struct pmfs_direntry *de = NULL;
+	ino_t parent_ino;
+
+	pmfs_inode_by_name(child->d_inode, &dotdot, &de);
+	if (!de)
+		return ERR_PTR(-ENOENT);
+
+	parent_ino = le64_to_cpu(de->ino);
+	if (!parent_ino)
+		return ERR_PTR(-ENOENT);
+
+	return d_obtain_alias(pmfs_iget(child->d_inode->i_sb, parent_ino));
+}
+
+/* inode operations for PMFS directories */
+const struct inode_operations pmfs_dir_inode_operations = {
+	.create = pmfs_create,
+	.lookup = pmfs_lookup,
+	.link = pmfs_link,
+	.unlink = pmfs_unlink,
+	.symlink = pmfs_symlink,
+	.mkdir = pmfs_mkdir,
+	.rmdir = pmfs_rmdir,
+	.mknod = pmfs_mknod,
+	.rename = pmfs_rename,
+	.setattr = pmfs_notify_change,
+	.get_acl = NULL,
+};
+
+/* inode operations for special inodes (device nodes, fifos, sockets) */
+const struct inode_operations pmfs_special_inode_operations = {
+	.setattr = pmfs_notify_change,
+	.get_acl = NULL,
+};
diff --git a/fs/pmfs/persist.c b/fs/pmfs/persist.c
new file mode 100644
index 0000000..a039028
--- /dev/null
+++ b/fs/pmfs/persist.c
@@ -0,0 +1,238 @@
+/*
+ * PMFS emulated persistence. this file contains code to load pmfs from a
+ * file into memory and store pmfs to a file from memory.
+ *
+ * Persistent Memory File System
+ * Copyright (c) 2012-2013, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/vfs.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
+#include <linux/mount.h>
+#include <linux/mm.h>
+#include <linux/bitops.h>
+#include <linux/cred.h>
+#include <linux/backing-dev.h>
+#include "pmfs.h"
+
+/* Write @bytes from @src into the backing file at *@woff with the
+ * address limit lifted to kernel space; returns the vfs_write() result,
+ * or 0 when @bytes <= 0. */
+static ssize_t pmfs_write_backing_store(struct file *flp, char *src,
+	ssize_t bytes, loff_t *woff)
+{
+	mm_segment_t saved_fs;
+	ssize_t written;
+
+	if (bytes <= 0)
+		return 0;
+
+	saved_fs = get_fs();
+	set_fs(get_ds());
+	written = vfs_write(flp, src, bytes, woff);
+	set_fs(saved_fs);
+
+	if (written <= 0)
+		pmfs_dbg_verbose("Could not write file or corrupted pmfs\n");
+	return written;
+}
+
+/* Read @bytes from the backing file at *@roff into @dest with the
+ * address limit lifted to kernel space; returns the vfs_read() result,
+ * or 0 when @bytes <= 0. */
+static ssize_t pmfs_read_backing_store(struct file *flp, char *dest,
+	ssize_t bytes, loff_t *roff)
+{
+	mm_segment_t saved_fs;
+	ssize_t nread;
+
+	if (bytes <= 0)
+		return 0;
+
+	saved_fs = get_fs();
+	set_fs(get_ds());
+	nread = vfs_read(flp, dest, bytes, roff);
+	set_fs(saved_fs);
+
+	if (nread <= 0)
+		pmfs_dbg_verbose("Could not read file or corrupted pmfs\n");
+	return nread;
+}
+
+/* Stores PMFS memory image into a storage file. Uses the allocation blocknode
+ * linked list to determine which memory ranges to save.
+ * File layout: [u64 count][lowhigh pairs][pad to page boundary][raw data of
+ * each in-use range].  Returns 0 on success, -EINVAL on any short write.
+ * Fix: the alignment mask is now derived from PAGE_SIZE instead of the
+ * hard-coded 0xFFF the original mixed with PAGE_SIZE. */
+static int pmfs_storefs(struct file *flp, struct super_block *sb)
+{
+	loff_t woff = 0;
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	u64 num_blocknodes = sbi->num_blocknode_allocated, size;
+	struct list_head *head = &(sbi->block_inuse_head);
+	struct pmfs_blocknode *i;
+	struct pmfs_blocknode_lowhigh p;
+	char *ptr;
+
+	pmfs_info("storing pmfs to %s with 0x%llx blknodes\n",
+		sbi->pmfs_backing_file, num_blocknodes);
+	/* first save the number of blocknodes */
+	if (pmfs_write_backing_store(flp, (char *)&num_blocknodes, sizeof(u64),
+			&woff) != sizeof(u64))
+		goto out;
+	/* Then save the blocks containing blocknodes. */
+	list_for_each_entry(i, head, link) {
+		p.block_low = cpu_to_le64(i->block_low);
+		p.block_high = cpu_to_le64(i->block_high);
+		if (pmfs_write_backing_store(flp, (char *)&p, sizeof(p), &woff)
+				!= sizeof(p))
+			goto out;
+	}
+	/* align the write offset on a page boundary */
+	woff = (woff + PAGE_SIZE - 1) & ~((loff_t)PAGE_SIZE - 1);
+	/* Now save all the memory ranges allocated in the PMFS. These ranges
+	 * are specified by the block_low and block_high fields of every
+	 * struct pmfs_blocknode_lowhigh */
+	list_for_each_entry(i, head, link) {
+		/* block 0 is the superblock page at the image base */
+		if (i->block_low == 0)
+			ptr = (char *)pmfs_get_super(sb);
+		else
+			ptr = pmfs_get_block(sb, i->block_low << PAGE_SHIFT);
+		size = (i->block_high - i->block_low + 1) << PAGE_SHIFT;
+		if (pmfs_write_backing_store(flp, ptr, size, &woff) != size)
+			goto out;
+	}
+	vfs_fsync(flp, 0);
+	return 0;
+out:
+	return -EINVAL;
+}
+
+/* Inverse of pmfs_storefs: read the blocknode count and list, validate
+ * the saved superblock page, ioremap the PMFS area, then copy each saved
+ * range from the backing file into it.  Returns 0 on success, -EINVAL on
+ * a short read, allocation failure, or invalid image.
+ * Fix: the alignment mask is now derived from PAGE_SIZE instead of the
+ * hard-coded 0xFFF the original mixed with PAGE_SIZE. */
+static int pmfs_loadfs(struct file *flp, struct super_block *sb)
+{
+	char *pmfs_base, *buf1, *buf2, *ptr;
+	struct pmfs_super_block *super;
+	loff_t roff = 0;
+	int retval = -EINVAL;
+	u64 pmfs_size, buf_sz, num_blocknodes, i, size;
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	struct pmfs_blocknode_lowhigh *p;
+
+	if (pmfs_read_backing_store(flp, (char *)&num_blocknodes, sizeof(u64),
+			&roff) != sizeof(u64))
+		return retval;
+
+	pmfs_info("Loading PMFS from %s to phys %llx with 0x%llx blknodes\n",
+		sbi->pmfs_backing_file, sbi->phys_addr, num_blocknodes);
+	/* NOTE(review): num_blocknodes comes from the file unvalidated; a
+	 * corrupt image can request an arbitrarily large allocation here */
+	buf_sz = num_blocknodes * sizeof(struct pmfs_blocknode_lowhigh);
+
+	buf1 = kmalloc(buf_sz, GFP_KERNEL);
+	if (buf1 == NULL)
+		return retval;
+
+	if (pmfs_read_backing_store(flp, buf1, buf_sz, &roff) != buf_sz)
+		goto out1;
+	p = (struct pmfs_blocknode_lowhigh *)buf1;
+
+	/* align the read offset on a page boundary */
+	roff = (roff + PAGE_SIZE - 1) & ~((loff_t)PAGE_SIZE - 1);
+
+	/* the first saved page is the superblock: validate before ioremap */
+	buf2 = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (buf2 == NULL)
+		goto out1;
+	if (pmfs_read_backing_store(flp, buf2, PAGE_SIZE, &roff) != PAGE_SIZE)
+		goto out2;
+
+	super = (struct pmfs_super_block *)buf2;
+	if (pmfs_check_integrity(NULL, super) == 0) {
+		pmfs_err(sb, "file contains invalid pmfs\n");
+		goto out2;
+	}
+	pmfs_size = le64_to_cpu(super->s_size);
+	pmfs_base = pmfs_ioremap(NULL, sbi->phys_addr, pmfs_size);
+	if (!pmfs_base) {
+		pmfs_err(sb, "ioremap of the pmfs image failed\n");
+		goto out2;
+	}
+	memcpy(pmfs_base, buf2, PAGE_SIZE);
+	/* now walk through the blocknode list and copy every range specified
+	 * in the list to PMFS area */
+	for (i = 0; i < num_blocknodes; i++, p++) {
+		if (p->block_low == 0) {
+			/* page 0 was already copied from buf2 above, so the
+			 * rest of this range starts one page in */
+			ptr = pmfs_base + PAGE_SIZE;
+			size = (le64_to_cpu(p->block_high) -
+				le64_to_cpu(p->block_low)) << PAGE_SHIFT;
+		} else {
+			ptr = pmfs_base + (le64_to_cpu(p->block_low) <<
+					PAGE_SHIFT);
+			size = (le64_to_cpu(p->block_high) -
+				le64_to_cpu(p->block_low) + 1) << PAGE_SHIFT;
+		}
+		if (pmfs_read_backing_store(flp, ptr, size, &roff) != size)
+			goto out;
+	}
+	retval = 0;
+out:
+	iounmap(pmfs_base);
+	release_mem_region(sbi->phys_addr, pmfs_size);
+out2:
+	kfree(buf2);
+out1:
+	kfree(buf1);
+	return retval;
+}
+
+/* Populate the PMFS memory area from the configured backing file at
+ * mount time; a no-op when no file is set or pmfs_backing_option == 1. */
+void pmfs_load_from_file(struct super_block *sb)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	mm_segment_t saved_fs;
+	struct file *flp;
+
+	if (!strlen(sbi->pmfs_backing_file) || sbi->pmfs_backing_option == 1)
+		return;
+
+	saved_fs = get_fs();
+	set_fs(get_ds());
+	flp = filp_open(sbi->pmfs_backing_file, O_RDONLY | O_LARGEFILE,
+			S_IRWXU);
+	set_fs(saved_fs);
+
+	if (IS_ERR(flp)) {
+		pmfs_info("Can't open backing file %s\n",
+			sbi->pmfs_backing_file);
+		return;
+	}
+
+	pmfs_loadfs(flp, sb);
+
+	saved_fs = get_fs();
+	set_fs(get_ds());
+	filp_close(flp, current->files);
+	set_fs(saved_fs);
+}
+
+/* Dump the PMFS image to the configured backing file (skipped when none
+ * is set or pmfs_backing_option == 2), then unconditionally clear the
+ * backing-file configuration. */
+void pmfs_store_to_file(struct super_block *sb)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	mm_segment_t saved_fs;
+	struct file *flp;
+
+	if (!strlen(sbi->pmfs_backing_file) || sbi->pmfs_backing_option == 2)
+		goto reset;
+
+	saved_fs = get_fs();
+	set_fs(get_ds());
+	flp = filp_open(sbi->pmfs_backing_file,
+		O_WRONLY | O_CREAT | O_TRUNC | O_LARGEFILE, S_IRWXU);
+	set_fs(saved_fs);
+
+	if (IS_ERR(flp)) {
+		pmfs_info("Can't open file %s\n",
+			sbi->pmfs_backing_file);
+	} else {
+		pmfs_storefs(flp, sb);
+		saved_fs = get_fs();
+		set_fs(get_ds());
+		filp_close(flp, current->files);
+		set_fs(saved_fs);
+	}
+reset:
+	sbi->pmfs_backing_file[0] = '\0';
+	sbi->pmfs_backing_option = 0;
+}
diff --git a/fs/pmfs/pmfs.h b/fs/pmfs/pmfs.h
new file mode 100644
index 0000000..5dbb74c
--- /dev/null
+++ b/fs/pmfs/pmfs.h
@@ -0,0 +1,576 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Definitions for the PMFS filesystem.
+ *
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.stornelli at gmail.com>
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+#ifndef __PMFS_H
+#define __PMFS_H
+
+#include <linux/buffer_head.h>
+#include <linux/pmfs_def.h>
+#include <linux/pmfs_sb.h>
+#include <linux/crc16.h>
+#include <linux/mutex.h>
+#include <linux/rcupdate.h>
+#include <linux/types.h>
+#include "wprotect.h"
+#include "journal.h"
+
+#define PAGE_SHIFT_2M 21
+#define PAGE_SHIFT_1G 30
+
+#define PMFS_ASSERT(x) \
+ if (!(x)) { \
+ printk(KERN_WARNING "assertion failed %s:%d: %s\n", \
+ __FILE__, __LINE__, #x); \
+ }
+
+/*
+ * Debug code
+ */
+#ifdef pr_fmt
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#endif
+
+/* #define pmfs_dbg(s, args...) pr_debug(s, ## args) */
+#define pmfs_dbg(s, args ...) pr_info(s, ## args)
+/* pmfs_dbg1 intentionally compiles away to nothing */
+#define pmfs_dbg1(s, args ...)
+#define pmfs_err(sb, s, args ...) pmfs_error_mng(sb, s, ## args)
+/* NOTE(review): pr_warning is the older spelling of pr_warn */
+#define pmfs_warn(s, args ...) pr_warning(s, ## args)
+#define pmfs_info(s, args ...) pr_info(s, ## args)
+
+/* runtime debug mask (defined in super.c); bits below gate the
+ * conditional pmfs_dbg_* macros */
+extern unsigned int pmfs_dbgmask;
+#define PMFS_DBGMASK_MMAPHUGE (0x00000001)
+#define PMFS_DBGMASK_MMAP4K (0x00000002)
+#define PMFS_DBGMASK_MMAPVERBOSE (0x00000004)
+#define PMFS_DBGMASK_MMAPVVERBOSE (0x00000008)
+#define PMFS_DBGMASK_VERBOSE (0x00000010)
+#define PMFS_DBGMASK_TRANSACTION (0x00000020)
+
+/* Conditional debug helpers gated on pmfs_dbgmask bits.  The mmap
+ * variants now forward "## args" like pmfs_dbg_verbose/pmfs_dbg_trans
+ * below (the original passed bare "args", which breaks for calls with a
+ * format string and no varargs). */
+#define pmfs_dbg_mmaphuge(s, args ...)		 \
+	((pmfs_dbgmask & PMFS_DBGMASK_MMAPHUGE) ? pmfs_dbg(s, ##args) : 0)
+#define pmfs_dbg_mmap4k(s, args ...)		 \
+	((pmfs_dbgmask & PMFS_DBGMASK_MMAP4K) ? pmfs_dbg(s, ##args) : 0)
+#define pmfs_dbg_mmapv(s, args ...)		 \
+	((pmfs_dbgmask & PMFS_DBGMASK_MMAPVERBOSE) ? pmfs_dbg(s, ##args) : 0)
+#define pmfs_dbg_mmapvv(s, args ...)		 \
+	((pmfs_dbgmask & PMFS_DBGMASK_MMAPVVERBOSE) ? pmfs_dbg(s, ##args) : 0)
+
+#define pmfs_dbg_verbose(s, args ...) \
+	((pmfs_dbgmask & PMFS_DBGMASK_VERBOSE) ? pmfs_dbg(s, ##args) : 0)
+#define pmfs_dbg_trans(s, args ...) \
+	((pmfs_dbgmask & PMFS_DBGMASK_TRANSACTION) ? pmfs_dbg(s, ##args) : 0)
+
+/* little-endian bitmap helpers */
+#define pmfs_set_bit __test_and_set_bit_le
+#define pmfs_clear_bit __test_and_clear_bit_le
+#define pmfs_find_next_zero_bit find_next_zero_bit_le
+
+/* mount-option bit manipulation keyed by PMFS_MOUNT_* suffix */
+#define clear_opt(o, opt) (o &= ~PMFS_MOUNT_ ## opt)
+#define set_opt(o, opt) (o |= PMFS_MOUNT_ ## opt)
+#define test_opt(sb, opt) (PMFS_SB(sb)->s_mount_opt & PMFS_MOUNT_ ## opt)
+
+#define PMFS_LARGE_INODE_TABLE_SIZE (0x200000)
+/* PMFS size threshold for using 2M blocks for inode table */
+#define PMFS_LARGE_INODE_TABLE_THREASHOLD (0x20000000)
+/*
+ * pmfs inode flags
+ *
+ * PMFS_EOFBLOCKS_FL	There are blocks allocated beyond eof
+ */
+#define PMFS_EOFBLOCKS_FL 0x20000000
+/* Flags that should be inherited by new inodes from their parent. */
+#define PMFS_FL_INHERITED (FS_SECRM_FL | FS_UNRM_FL | FS_COMPR_FL | \
+			FS_SYNC_FL | FS_NODUMP_FL | FS_NOATIME_FL | \
+			FS_COMPRBLK_FL | FS_NOCOMP_FL | FS_JOURNAL_DATA_FL | \
+			FS_NOTAIL_FL | FS_DIRSYNC_FL)
+/* Flags that are appropriate for regular files (all but dir-specific ones). */
+#define PMFS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL))
+/* Flags that are appropriate for non-directories/regular files. */
+#define PMFS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL)
+#define PMFS_FL_USER_VISIBLE (FS_FL_USER_VISIBLE | PMFS_EOFBLOCKS_FL)
+
+/* inodes per allocation block for a given block type */
+#define INODES_PER_BLOCK(bt) (1 << (blk_type_to_shift[bt] - PMFS_INODE_BITS))
+
+/* per-block-type shift/size tables (defined in super.c or balloc.c) */
+extern unsigned int blk_type_to_shift[PMFS_BLOCK_TYPE_MAX];
+extern unsigned int blk_type_to_size[PMFS_BLOCK_TYPE_MAX];
+
+/* Function Prototypes */
+extern void pmfs_error_mng(struct super_block *sb, const char *fmt, ...);
+
+/* file.c */
+extern int pmfs_mmap(struct file *file, struct vm_area_struct *vma);
+
+/* balloc.c */
+int pmfs_setup_blocknode_map(struct super_block *sb);
+extern struct pmfs_blocknode *pmfs_alloc_blocknode(struct super_block *sb);
+extern void pmfs_free_blocknode(struct super_block *sb, struct pmfs_blocknode *bnode);
+extern void pmfs_init_blockmap(struct super_block *sb,
+ unsigned long init_used_size);
+extern void pmfs_free_block(struct super_block *sb, unsigned long blocknr,
+ unsigned short btype);
+extern int pmfs_new_block(struct super_block *sb, unsigned long *blocknr,
+ unsigned short btype, int zero);
+extern unsigned long pmfs_count_free_blocks(struct super_block *sb);
+
+/* dir.c */
+extern int pmfs_add_entry(pmfs_transaction_t *trans,
+ struct dentry *dentry, struct inode *inode);
+extern int pmfs_remove_entry(pmfs_transaction_t *trans,
+ struct dentry *dentry, struct inode *inode);
+
+/* namei.c */
+extern struct dentry *pmfs_get_parent(struct dentry *child);
+
+/* inode.c */
+extern unsigned int pmfs_free_inode_subtree(struct super_block *sb,
+ u64 root, u32 height, u32 btype, loff_t end);
+extern int __pmfs_alloc_blocks(pmfs_transaction_t *trans,
+ struct super_block *sb, struct pmfs_inode *pi,
+ unsigned long file_blocknr, unsigned int num, bool zero);
+extern int pmfs_init_inode_table(struct super_block *sb);
+extern int pmfs_alloc_blocks(pmfs_transaction_t *trans, struct inode *inode,
+ unsigned long file_blocknr, unsigned int num, bool zero);
+extern u64 pmfs_find_data_block(struct inode *inode,
+ unsigned long file_blocknr);
+int pmfs_set_blocksize_hint(struct super_block *sb, struct pmfs_inode *pi,
+ loff_t new_size);
+void pmfs_setsize(struct inode *inode, loff_t newsize);
+
+extern struct inode *pmfs_iget(struct super_block *sb, unsigned long ino);
+extern void pmfs_put_inode(struct inode *inode);
+extern void pmfs_evict_inode(struct inode *inode);
+extern struct inode *pmfs_new_inode(pmfs_transaction_t *trans,
+ struct inode *dir, umode_t mode, const struct qstr *qstr);
+extern inline void pmfs_update_isize(struct inode *inode,
+ struct pmfs_inode *pi);
+extern inline void pmfs_update_nlink(struct inode *inode,
+ struct pmfs_inode *pi);
+extern inline void pmfs_update_time(struct inode *inode,
+ struct pmfs_inode *pi);
+extern int pmfs_write_inode(struct inode *inode,
+ struct writeback_control *wbc);
+extern void pmfs_dirty_inode(struct inode *inode, int flags);
+extern int pmfs_notify_change(struct dentry *dentry, struct iattr *attr);
+int pmfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat);
+extern void pmfs_set_inode_flags(struct inode *inode, struct pmfs_inode *pi);
+extern void pmfs_get_inode_flags(struct inode *inode, struct pmfs_inode *pi);
+extern unsigned long pmfs_find_region(struct inode *inode, loff_t *offset,
+ int hole);
+extern void pmfs_truncate_del(struct inode *inode);
+extern void pmfs_truncate_add(struct inode *inode, u64 truncate_size);
+
+/* ioctl.c */
+extern long pmfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
+#ifdef CONFIG_COMPAT
+extern long pmfs_compat_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg);
+#endif
+
+/* super.c */
+#ifdef CONFIG_PMFS_TEST
+extern struct pmfs_super_block *get_pmfs_super(void);
+#endif
+extern void __pmfs_free_blocknode(struct pmfs_blocknode *bnode);
+extern struct super_block *pmfs_read_super(struct super_block *sb, void *data,
+ int silent);
+extern int pmfs_statfs(struct dentry *d, struct kstatfs *buf);
+extern int pmfs_remount(struct super_block *sb, int *flags, char *data);
+
+/* Provides ordering from all previous clflush too */
+static inline void PERSISTENT_MARK(void)
+{
+	/* TODO: Fix me. */
+}
+
+/* store fence: orders all previously issued stores */
+static inline void PERSISTENT_BARRIER(void)
+{
+	asm volatile ("sfence\n" : : );
+}
+
+/* Flush [buf, buf+len) from the CPU cache one cacheline at a time; len
+ * is widened by buf's offset within its cacheline so the loop still
+ * covers the first line when buf is unaligned. */
+static inline void pmfs_flush_buffer(void *buf, uint32_t len, bool fence)
+{
+	uint32_t i;
+	len = len + ((unsigned long)(buf) & (CACHELINE_SIZE - 1));
+	for (i = 0; i < len; i += CACHELINE_SIZE)
+		asm volatile ("clflush %0\n" : "+m" (*(char *)(buf+i)));
+	/* Do a fence only if asked. We often don't need to do a fence
+	 * immediately after clflush because even if we get context switched
+	 * between clflush and subsequent fence, the context switch operation
+	 * provides implicit fence. */
+	if (fence)
+		asm volatile ("sfence\n" : : );
+}
+
+/* symlink.c */
+extern int pmfs_block_symlink(struct inode *inode, const char *symname,
+ int len);
+
+/* Inline functions start here */
+
+/* Mask out flags that are inappropriate for the given type of inode:
+ * directories keep all inherited flags, regular files drop the
+ * dir-specific ones, everything else keeps only NODUMP/NOATIME. */
+static inline __le32 pmfs_mask_flags(umode_t mode, __le32 flags)
+{
+	__le32 inherited = flags & cpu_to_le32(PMFS_FL_INHERITED);
+
+	if (S_ISDIR(mode))
+		return inherited;
+	if (S_ISREG(mode))
+		return inherited & cpu_to_le32(PMFS_REG_FLMASK);
+	return inherited & cpu_to_le32(PMFS_OTHER_FLMASK);
+}
+
+/* Verify the crc16 stored in the leading __le16 of @data against a crc
+ * computed over the remaining n-2 bytes; returns 0 on match, 1 on
+ * mismatch. */
+static inline int pmfs_calc_checksum(u8 *data, int n)
+{
+	u16 computed;
+
+	computed = crc16(~0, (__u8 *)data + sizeof(__le16),
+			 n - sizeof(__le16));
+	return (*((__le16 *)data) == cpu_to_le16(computed)) ? 0 : 1;
+}
+
+/* serialized form of one allocated block range in the backing file;
+ * NOTE(review): fields are unsigned long but pmfs_storefs writes them
+ * with cpu_to_le64 — assumes a 64-bit build, confirm for 32-bit */
+struct pmfs_blocknode_lowhigh {
+	unsigned long block_low;
+	unsigned long block_high;
+};
+
+/* in-DRAM node of the allocated-range list (sbi->block_inuse_head) */
+struct pmfs_blocknode {
+	struct list_head link;
+	unsigned long block_low;
+	unsigned long block_high;
+};
+
+/* PMFS in-memory inode, embedding the VFS inode */
+struct pmfs_inode_vfs {
+	__u32 i_dir_start_lookup;
+	struct list_head i_truncated;
+	struct inode vfs_inode;
+};
+
+/* sb->s_fs_info holds the pmfs_sb_info */
+static inline struct pmfs_sb_info *PMFS_SB(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+/* map a VFS inode back to its containing pmfs_inode_vfs */
+static inline struct pmfs_inode_vfs *PMFS_I(struct inode *inode)
+{
+	return container_of(inode, struct pmfs_inode_vfs, vfs_inode);
+}
+
+/* If this is part of a read-modify-write of the super block,
+ * pmfs_memunlock_super() before calling! */
+static inline struct pmfs_super_block *pmfs_get_super(struct super_block *sb)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+
+	return (struct pmfs_super_block *)sbi->virt_addr;
+}
+
+/* journal area lives at a superblock-recorded offset inside the image */
+static inline pmfs_journal_t *pmfs_get_journal(struct super_block *sb)
+{
+	struct pmfs_super_block *ps = pmfs_get_super(sb);
+
+	return (pmfs_journal_t *)((char *)ps +
+			le64_to_cpu(ps->s_journal_offset));
+}
+
+/* inode table lives at a superblock-recorded offset inside the image */
+static inline struct pmfs_inode *pmfs_get_inode_table(struct super_block *sb)
+{
+	struct pmfs_super_block *ps = pmfs_get_super(sb);
+
+	return (struct pmfs_inode *)((char *)ps +
+			le64_to_cpu(ps->s_inode_table_offset));
+}
+
+/* redundant copy of the superblock, stored PMFS_SB_SIZE past the base */
+static inline struct pmfs_super_block *pmfs_get_redund_super(struct super_block *sb)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+
+	return (struct pmfs_super_block *)(sbi->virt_addr + PMFS_SB_SIZE);
+}
+
+/* If this is part of a read-modify-write of the block,
+ * pmfs_memunlock_block() before calling! */
+static inline void *pmfs_get_block(struct super_block *sb, u64 block)
+{
+	/* @block is a byte offset from the image base; 0 means "no block" */
+	struct pmfs_super_block *ps = pmfs_get_super(sb);
+
+	return block ? ((void *)ps + block) : NULL;
+}
+
+/* uses CPU instructions to atomically write upto 8 bytes */
+static inline void pmfs_memcpy_atomic (void *dst, const void *src, u8 size)
+{
+	/* NOTE(review): the 2/4/8-byte cases pass the source through
+	 * cpu_to_le* — a no-op on little-endian x86 but a byte swap on
+	 * big-endian; confirm intent if this is ever built elsewhere */
+	switch (size) {
+	case 1: {
+		volatile u8 *daddr = dst;
+		const u8 *saddr = src;
+		*daddr = *saddr;
+		break;
+	}
+	case 2: {
+		volatile u16 *daddr = dst;
+		const u16 *saddr = src;
+		*daddr = cpu_to_le16(*saddr);
+		break;
+	}
+	case 4: {
+		volatile u32 *daddr = dst;
+		const u32 *saddr = src;
+		*daddr = cpu_to_le32(*saddr);
+		break;
+	}
+	case 8: {
+		volatile u64 *daddr = dst;
+		const u64 *saddr = src;
+		*daddr = cpu_to_le64(*saddr);
+		break;
+	}
+	default:
+		/* unsupported size is logged and silently ignored */
+		pmfs_dbg("error: memcpy_atomic called with %d bytes\n", size);
+		//BUG();
+	}
+}
+
+/* atomically publish i_size together with i_ctime/i_mtime via a 16-byte
+ * compare-exchange over the adjacent on-media fields */
+static inline void pmfs_update_time_and_size(struct inode *inode,
+	struct pmfs_inode *pi)
+{
+	uint32_t words[2];
+	/* pi->i_size, pi->i_ctime, and pi->i_mtime need to be atomically updated.
+	 * So use cmpxchg16b here. */
+	words[0] = cpu_to_le32(inode->i_ctime.tv_sec);
+	words[1] = cpu_to_le32(inode->i_mtime.tv_sec);
+	/* TODO: the following function assumes cmpxchg16b instruction writes
+	 * 16 bytes atomically. Confirm if it is really true. */
+	cmpxchg_double_local(&pi->i_size, (u64 *)&pi->i_ctime, pi->i_size,
+		*(u64 *)&pi->i_ctime, inode->i_size, *(u64 *)words);
+}
+
+/* Non-temporal memset: fills @length bytes at @dest with the repeated
+ * 32-bit pattern @dword using movnti (no sfence is issued here; callers
+ * needing ordering must fence, e.g. via PERSISTENT_BARRIER).
+ * assumes the length to be 4-byte aligned.
+ * Fix: the original listed the 6th store of the 64-byte loop under a
+ * duplicate "8:" label; renumbered to 6/7/8 (none of these labels is a
+ * branch target, so generated code is unchanged). */
+static inline void memset_nt(void *dest, uint32_t dword, size_t length)
+{
+	uint64_t dummy1, dummy2;
+	uint64_t qword = ((uint64_t)dword << 32) | dword;
+
+	/* 64-byte chunks, then 8-byte chunks, then one final 4-byte store */
+	asm volatile ("movl %%edx,%%ecx\n"
+		"andl $63,%%edx\n"
+		"shrl $6,%%ecx\n"
+		"jz 9f\n"
+		"1:      movnti %%rax,(%%rdi)\n"
+		"2:      movnti %%rax,1*8(%%rdi)\n"
+		"3:      movnti %%rax,2*8(%%rdi)\n"
+		"4:      movnti %%rax,3*8(%%rdi)\n"
+		"5:      movnti %%rax,4*8(%%rdi)\n"
+		"6:      movnti %%rax,5*8(%%rdi)\n"
+		"7:      movnti %%rax,6*8(%%rdi)\n"
+		"8:      movnti %%rax,7*8(%%rdi)\n"
+		"leaq 64(%%rdi),%%rdi\n"
+		"decl %%ecx\n"
+		"jnz 1b\n"
+		"9:     movl %%edx,%%ecx\n"
+		"andl $7,%%edx\n"
+		"shrl $3,%%ecx\n"
+		"jz 11f\n"
+		"10:     movnti %%rax,(%%rdi)\n"
+		"leaq 8(%%rdi),%%rdi\n"
+		"decl %%ecx\n"
+		"jnz 10b\n"
+		"11:     movl %%edx,%%ecx\n"
+		"shrl $2,%%ecx\n"
+		"jz 12f\n"
+		"movnti %%eax,(%%rdi)\n"
+		"12:\n"
+		: "=D"(dummy1), "=d" (dummy2) : "D" (dest), "a" (qword), "d" (length) : "memory", "rcx");
+}
+
+/* Walk the inode's block tree: each level consumes META_BLK_SHIFT bits
+ * of @blocknr as an index.  Returns the image byte offset of the data
+ * block, or 0 for a hole. */
+static inline u64 __pmfs_find_data_block(struct super_block *sb,
+		struct pmfs_inode *pi, unsigned long blocknr)
+{
+	u64 *level_ptr, bp = 0;
+	u32 height, bit_shift;
+	unsigned int idx;
+
+	height = pi->height;
+	bp = le64_to_cpu(pi->root);
+
+	while (height > 0) {
+		level_ptr = pmfs_get_block(sb, bp);
+		bit_shift = (height - 1) * META_BLK_SHIFT;
+		idx = blocknr >> bit_shift;
+		bp = le64_to_cpu(level_ptr[idx]);
+		if (bp == 0)
+			return 0;	/* hole */
+		/* keep only the index bits for the remaining levels */
+		blocknr = blocknr & ((1 << bit_shift) - 1);
+		height--;
+	}
+	return bp;
+}
+
+/* log2 of the inode's allocation block size (4K/2M/1G) */
+static inline unsigned int pmfs_inode_blk_shift (struct pmfs_inode *pi)
+{
+	return blk_type_to_shift[pi->i_blk_type];
+}
+
+/* the inode's allocation block size in bytes */
+static inline uint32_t pmfs_inode_blk_size (struct pmfs_inode *pi)
+{
+	return blk_type_to_size[pi->i_blk_type];
+}
+
+/* If this is part of a read-modify-write of the inode metadata,
+ * pmfs_memunlock_inode() before calling!
+ * Locates on-media inode @ino inside the inode table; returns NULL for
+ * ino 0 or when the covering table block is not allocated. */
+static inline struct pmfs_inode *pmfs_get_inode(struct super_block *sb,
+						  u64 ino)
+{
+	struct pmfs_super_block *ps = pmfs_get_super(sb);
+	struct pmfs_inode *inode_table = pmfs_get_inode_table(sb);
+	u64 bp, block, ino_offset;
+
+	if (ino == 0)
+		return NULL;
+
+	/* which inode-table block holds @ino, then its offset within it */
+	block = ino >> pmfs_inode_blk_shift(inode_table);
+	bp = __pmfs_find_data_block(sb, inode_table, block);
+
+	if (bp == 0)
+		return NULL;
+	ino_offset = (ino & (pmfs_inode_blk_size(inode_table) - 1));
+	return (struct pmfs_inode *)((void *)ps + bp + ino_offset);
+}
+
+/* byte offset of @addr within the mapped PMFS image (asserts bounds) */
+static inline u64
+pmfs_get_addr_off(struct pmfs_sb_info *sbi, void *addr)
+{
+	PMFS_ASSERT((addr >= sbi->virt_addr) &&
+			(addr < (sbi->virt_addr + sbi->initsize)));
+	return (u64)(addr - sbi->virt_addr);
+}
+
+/* block number -> image byte offset; @btype is accepted for interface
+ * symmetry but unused (offsets are always in 4K pages) */
+static inline u64
+pmfs_get_block_off(struct super_block *sb, unsigned long blocknr,
+		    unsigned short btype)
+{
+	return (u64)blocknr << PAGE_SHIFT;
+}
+
+/* number of 4K pages making up one allocation unit of @btype */
+static inline unsigned long
+pmfs_get_numblocks(unsigned short btype)
+{
+	switch (btype) {
+	case PMFS_BLOCK_TYPE_4K:
+		return 1;
+	case PMFS_BLOCK_TYPE_2M:
+		return 512;
+	default:
+		/* PMFS_BLOCK_TYPE_1G */
+		return 0x40000;
+	}
+}
+
+/* image byte offset -> 4K block number; @btype unused */
+static inline unsigned long
+pmfs_get_blocknr(struct super_block *sb, u64 block, unsigned short btype)
+{
+	return block >> PAGE_SHIFT;
+}
+
+/* physical page frame number backing image byte offset @block */
+static inline unsigned long pmfs_get_pfn(struct super_block *sb, u64 block)
+{
+	return (PMFS_SB(sb)->phys_addr + block) >> PAGE_SHIFT;
+}
+
+/* true while the initial mount is still in progress */
+static inline int pmfs_is_mounting(struct super_block *sb)
+{
+	struct pmfs_sb_info *sbi = (struct pmfs_sb_info *)sb->s_fs_info;
+	return sbi->s_mount_opt & PMFS_MOUNT_MOUNTING;
+}
+
+/* per-inode truncate-list item stored immediately after the on-media
+ * inode (pi + 1) */
+static inline struct pmfs_inode_truncate_item * pmfs_get_truncate_item (struct
+		super_block *sb, u64 ino)
+{
+	struct pmfs_inode *pi = pmfs_get_inode(sb, ino);
+	return (struct pmfs_inode_truncate_item *)(pi + 1);
+}
+
+/* head of the crash-recovery truncate list, stored right after the
+ * inode-table inode */
+static inline struct pmfs_inode_truncate_item * pmfs_get_truncate_list_head (
+		struct super_block *sb)
+{
+	struct pmfs_inode *pi = pmfs_get_inode_table(sb);
+	return (struct pmfs_inode_truncate_item *)(pi + 1);
+}
+
+/* clear PMFS_EOFBLOCKS_FL once @size grows past the allocated blocks;
+ * NOTE(review): pi->i_blocks << s_blocksize_bits reads as a block count
+ * converted to bytes — confirm the field's unit in pmfs_def.h */
+static inline void check_eof_blocks(struct super_block *sb,
+		struct pmfs_inode *pi, loff_t size)
+{
+	if ((pi->i_flags & cpu_to_le32(PMFS_EOFBLOCKS_FL)) &&
+		(size + sb->s_blocksize) > (le64_to_cpu(pi->i_blocks)
+			<< sb->s_blocksize_bits))
+		pi->i_flags &= cpu_to_le32(~PMFS_EOFBLOCKS_FL);
+}
+
+/*
+ * Inodes and files operations
+ */
+
+/* dir.c */
+extern const struct file_operations pmfs_dir_operations;
+
+/* file.c */
+extern const struct inode_operations pmfs_file_inode_operations;
+extern const struct file_operations pmfs_xip_file_operations;
+
+/* inode.c */
+extern const struct address_space_operations pmfs_aops_xip;
+
+/* bbuild.c */
+void pmfs_save_blocknode_mappings(struct super_block *sb);
+
+/* namei.c */
+extern const struct inode_operations pmfs_dir_inode_operations;
+extern const struct inode_operations pmfs_special_inode_operations;
+
+/* symlink.c */
+extern const struct inode_operations pmfs_symlink_inode_operations;
+
+extern struct backing_dev_info pmfs_backing_dev_info;
+
+int pmfs_check_integrity(struct super_block *sb,
+ struct pmfs_super_block *super);
+void *pmfs_ioremap(struct super_block *sb, phys_addr_t phys_addr,
+ ssize_t size);
+
+/* Emulated persistence APIs */
+void pmfs_set_backing_file(char *file_str);
+void pmfs_set_backing_option(int option);
+void pmfs_load_from_file(struct super_block *sb);
+void pmfs_store_to_file(struct super_block *sb);
+
+int pmfs_check_dir_entry(const char *function, struct inode *dir,
+ struct pmfs_direntry *de, u8 *base,
+ unsigned long offset);
+
+/* true when @de is a live entry (non-zero ino) whose name equals the
+ * @len-byte string @name */
+static inline int pmfs_match(int len, const char *const name,
+			      struct pmfs_direntry *de)
+{
+	if (len != de->name_len)
+		return 0;
+	if (!de->ino)
+		return 0;
+	return memcmp(de->name, name, len) == 0;
+}
+
+int pmfs_search_dirblock(u8 *blk_base, struct inode *dir, struct qstr *child,
+ unsigned long offset,
+ struct pmfs_direntry **res_dir,
+ struct pmfs_direntry **prev_dir);
+
+#endif /* __PMFS_H */
diff --git a/fs/pmfs/pmfs_test.c b/fs/pmfs/pmfs_test.c
new file mode 100644
index 0000000..94edbef
--- /dev/null
+++ b/fs/pmfs/pmfs_test.c
@@ -0,0 +1,50 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * pmfs test module.
+ *
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.stornelli at gmail.com>
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+#include <linux/module.h>
+#include <linux/version.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include "pmfs.h"
+
+/*
+ * Module init for the write-protection test: fails (returns 1) when no
+ * PMFS instance is mounted, otherwise stores into the live superblock
+ * without unlocking it, which is expected to trigger a kernel page
+ * protection fault when PMFS write protection is active.
+ * NOTE(review): module_init conventionally returns a negative errno on
+ * failure; returning 1 here works but is unidiomatic.
+ */
+int __init test_pmfs_write(void)
+{
+	struct pmfs_super_block *psb;
+
+	psb = get_pmfs_super();
+	if (!psb) {
+		printk(KERN_ERR
+		       "%s: PMFS super block not found (not mounted?)\n",
+		       __func__);
+		return 1;
+	}
+
+	/*
+	 * Attempt an unprotected clear of checksum information in the
+	 * superblock, this should cause a kernel page protection fault.
+	 */
+	printk("%s: writing to kernel VA %p\n", __func__, psb);
+	psb->s_sum = 0;
+
+	return 0;
+}
+
+/* module teardown: nothing to undo — the write test leaves no state.
+ * NOTE(review): conventionally this would be static and __exit-marked */
+void test_pmfs_write_cleanup(void)
+{
+}
+
+/* Module information */
+MODULE_LICENSE("GPL");
+module_init(test_pmfs_write);
+module_exit(test_pmfs_write_cleanup);
diff --git a/fs/pmfs/super.c b/fs/pmfs/super.c
new file mode 100644
index 0000000..7f708e8
--- /dev/null
+++ b/fs/pmfs/super.c
@@ -0,0 +1,1217 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Super block operations.
+ *
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.stornelli at gmail.com>
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/parser.h>
+#include <linux/vfs.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
+#include <linux/mm.h>
+#include <linux/ctype.h>
+#include <linux/bitops.h>
+#include <linux/magic.h>
+#include <linux/exportfs.h>
+#include <linux/random.h>
+#include <linux/cred.h>
+#include <linux/backing-dev.h>
+#include <linux/list.h>
+#include "pmfs.h"
+
+static struct super_operations pmfs_sops;
+static const struct export_operations pmfs_export_ops;
+static struct kmem_cache *pmfs_inode_cachep;
+static struct kmem_cache *pmfs_blocknode_cachep;
+static struct kmem_cache *pmfs_transaction_cachep;
+/* FIXME: should the following variable be one per PMFS instance? */
+unsigned int pmfs_dbgmask = 0;
+
+#ifdef CONFIG_PMFS_TEST
+/* Virtual base address of the first mounted PMFS instance (see pmfs_init/
+ * pmfs_fill_super); cleared again in pmfs_put_super. */
+static void *first_pmfs_super;
+
+/* Return the superblock of the first mounted PMFS instance, or NULL if no
+ * instance is currently mounted.  Used by the pmfs_test module. */
+struct pmfs_super_block *get_pmfs_super(void)
+{
+	return (struct pmfs_super_block *)first_pmfs_super;
+}
+EXPORT_SYMBOL(get_pmfs_super);
+#endif
+
+/*
+ * Report a filesystem error and apply the mount-time error policy:
+ * panic (errors=panic), remount read-only (errors=remount-ro), or just
+ * log and continue.  The printk calls now carry an explicit KERN_* level
+ * (the originals had none, so they were logged at the default level).
+ */
+void pmfs_error_mng(struct super_block *sb, const char *fmt, ...)
+{
+	va_list args;
+
+	printk(KERN_ERR "pmfs error: ");
+	va_start(args, fmt);
+	vprintk(fmt, args);
+	va_end(args);
+
+	if (test_opt(sb, ERRORS_PANIC))
+		panic("pmfs: panic from previous error\n");
+	if (test_opt(sb, ERRORS_RO)) {
+		printk(KERN_CRIT "pmfs err: remounting filesystem read-only\n");
+		sb->s_flags |= MS_RDONLY;
+	}
+}
+
+/*
+ * Record the block size in the VFS superblock.  The caller has already
+ * validated @size: it is a power of two between PMFS_MIN_BLOCK_SIZE and
+ * PMFS_MAX_BLOCK_SIZE, so fls(size) - 1 is exactly log2(size).
+ */
+static void pmfs_set_blocksize(struct super_block *sb, unsigned long size)
+{
+	sb->s_blocksize_bits = fls(size) - 1;
+	sb->s_blocksize = 1UL << sb->s_blocksize_bits;
+}
+
+/* Non-zero iff this mount was configured to use huge-page ioremap. */
+static inline int pmfs_has_huge_ioremap(struct super_block *sb)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+
+	return sbi->s_mount_opt & PMFS_MOUNT_HUGEIOREMAP;
+}
+
+/*
+ * Reserve the physical range [phys_addr, phys_addr + size) exclusively
+ * and map it cached into kernel virtual space.  With a superblock the
+ * mount options choose read-only vs writable and huge vs 4K mappings;
+ * without one (early probe) we default to a writable huge mapping.
+ *
+ * Returns the kernel virtual address, or NULL on failure.  On failure no
+ * resource is left reserved: the original leaked the mem-region
+ * reservation when the ioremap step itself failed; we now release it.
+ */
+void *pmfs_ioremap(struct super_block *sb, phys_addr_t phys_addr, ssize_t size)
+{
+	void *virt_addr;
+	int protect, hugeioremap;
+
+	if (sb) {
+		protect = pmfs_is_wprotected(sb);
+		hugeioremap = pmfs_has_huge_ioremap(sb);
+	} else {
+		protect = 0;
+		hugeioremap = 1;
+	}
+
+	/*
+	 * NOTE: Userland may not map this resource, we will mark the region so
+	 * /dev/mem and the sysfs MMIO access will not be allowed. This
+	 * restriction depends on STRICT_DEVMEM option. If this option is
+	 * disabled or not available we mark the region only as busy.
+	 */
+	if (!request_mem_region_exclusive(phys_addr, size, "pmfs"))
+		return NULL;
+
+	if (protect) {
+		if (hugeioremap)
+			virt_addr = ioremap_hpage_cache_ro(phys_addr, size);
+		else
+			virt_addr = ioremap_cache_ro(phys_addr, size);
+	} else {
+		if (hugeioremap)
+			virt_addr = ioremap_hpage_cache(phys_addr, size);
+		else
+			virt_addr = ioremap_cache(phys_addr, size);
+	}
+
+	/* Don't leak the exclusive region reservation if the remap failed. */
+	if (!virt_addr)
+		release_mem_region(phys_addr, size);
+
+	return virt_addr;
+}
+
+/*
+ * Undo a pmfs_ioremap() mapping.  @size and @protected are currently
+ * unused; callers release the mem-region reservation separately.
+ */
+static inline int pmfs_iounmap(void *virt_addr, ssize_t size, int protected)
+{
+	iounmap(virt_addr);
+	return 0;
+}
+
+/*
+ * Largest representable file size for a given block size.  The exponent
+ * 3 * 9 + bits suggests a 3-level block tree with 2^9 entries per level;
+ * the result is capped at MAX_LFS_FILESIZE.
+ */
+static loff_t pmfs_max_size(int bits)
+{
+	loff_t res;
+
+	res = (1ULL << (3 * 9 + bits)) - 1;
+
+	if (res > MAX_LFS_FILESIZE)
+		res = MAX_LFS_FILESIZE;
+
+	pmfs_dbg_verbose("max file size %llu bytes\n", res);
+	return res;
+}
+
+enum {
+ Opt_addr, Opt_bpi, Opt_size, Opt_jsize,
+ Opt_num_inodes, Opt_mode, Opt_uid,
+ Opt_gid, Opt_blocksize, Opt_wprotect, Opt_wprotectold,
+ Opt_err_cont, Opt_err_panic, Opt_err_ro,
+ Opt_backing, Opt_backing_opt,
+ Opt_hugemmap, Opt_nohugeioremap, Opt_dbgmask, Opt_err
+};
+
+static const match_table_t tokens = {
+ { Opt_addr, "physaddr=%x" },
+ { Opt_bpi, "bpi=%u" },
+ { Opt_size, "init=%s" },
+ { Opt_jsize, "jsize=%s" },
+ { Opt_num_inodes,"num_inodes=%u" },
+ { Opt_mode, "mode=%o" },
+ { Opt_uid, "uid=%u" },
+ { Opt_gid, "gid=%u" },
+ { Opt_wprotect, "wprotect" },
+ { Opt_wprotectold, "wprotectold" },
+ { Opt_err_cont, "errors=continue" },
+ { Opt_err_panic, "errors=panic" },
+ { Opt_err_ro, "errors=remount-ro" },
+ { Opt_backing, "backing=%s" },
+ { Opt_backing_opt, "backing_opt=%u" },
+ { Opt_hugemmap, "hugemmap" },
+ { Opt_nohugeioremap, "nohugeioremap" },
+ { Opt_dbgmask, "dbgmask=%u" },
+ { Opt_err, NULL },
+};
+
+/*
+ * Consume a leading "physaddr=<hex>" token from the mount data string.
+ * The option must come first; on success the value is returned and *data
+ * is advanced past the token (and a trailing comma) so the remaining
+ * options can go through the normal parser.  Returns ULLONG_MAX if the
+ * option is absent, malformed, or not page-aligned.
+ */
+static phys_addr_t get_phys_addr(void **data)
+{
+	phys_addr_t phys_addr;
+	char *options = (char *)*data;
+
+	if (!options || strncmp(options, "physaddr=", 9) != 0)
+		return (phys_addr_t)ULLONG_MAX;
+	options += 9;
+	phys_addr = (phys_addr_t)simple_strtoull(options, &options, 0);
+	/* The number must be terminated by end-of-string or a comma. */
+	if (*options && *options != ',') {
+		printk(KERN_ERR "Invalid phys addr specification: %s\n",
+		       (char *)*data);
+		return (phys_addr_t)ULLONG_MAX;
+	}
+	if (phys_addr & (PAGE_SIZE - 1)) {
+		printk(KERN_ERR "physical address 0x%16llx for pmfs isn't "
+		       "aligned to a page boundary\n", (u64)phys_addr);
+		return (phys_addr_t)ULLONG_MAX;
+	}
+	if (*options == ',')
+		options++;
+	*data = (void *)options;
+	return phys_addr;
+}
+
+/*
+ * Parse the comma-separated mount option string into @sbi.  @remount
+ * restricts which options may change: anything affecting the on-media
+ * layout or the protection mode is rejected on remount.
+ *
+ * Returns 0 on success, -EINVAL on a bad option or bad value.
+ *
+ * Fix: the "backing=" path was copied with strncpy(.., 255), which does
+ * not guarantee NUL termination; the copy is now bounded by the buffer
+ * size and explicitly terminated.
+ */
+static int pmfs_parse_options(char *options, struct pmfs_sb_info *sbi,
+			      bool remount)
+{
+	char *p, *rest;
+	substring_t args[MAX_OPT_ARGS];
+	int option;
+
+	if (!options)
+		return 0;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token;
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_addr:
+			if (remount)
+				goto bad_opt;
+			/* physaddr managed in get_phys_addr() */
+			break;
+		case Opt_bpi:
+			if (remount)
+				goto bad_opt;
+			if (match_int(&args[0], &option))
+				goto bad_val;
+			sbi->bpi = option;
+			break;
+		case Opt_uid:
+			if (remount)
+				goto bad_opt;
+			if (match_int(&args[0], &option))
+				goto bad_val;
+			sbi->uid = option;
+			break;
+		case Opt_gid:
+			if (match_int(&args[0], &option))
+				goto bad_val;
+			sbi->gid = option;
+			break;
+		case Opt_mode:
+			if (match_octal(&args[0], &option))
+				goto bad_val;
+			sbi->mode = option & 01777U;
+			break;
+		case Opt_size:
+			if (remount)
+				goto bad_opt;
+			/* memparse() will accept a K/M/G without a digit */
+			if (!isdigit(*args[0].from))
+				goto bad_val;
+			sbi->initsize = memparse(args[0].from, &rest);
+			set_opt(sbi->s_mount_opt, FORMAT);
+			break;
+		case Opt_jsize:
+			if (remount)
+				goto bad_opt;
+			/* memparse() will accept a K/M/G without a digit */
+			if (!isdigit(*args[0].from))
+				goto bad_val;
+			sbi->jsize = memparse(args[0].from, &rest);
+			/* make sure journal size is integer power of 2 */
+			if (sbi->jsize & (sbi->jsize - 1) ||
+			    sbi->jsize < PMFS_MINIMUM_JOURNAL_SIZE) {
+				pmfs_dbg("Invalid jsize: "
+					"must be whole power of 2 & >= 64KB\n");
+				goto bad_val;
+			}
+			break;
+		case Opt_num_inodes:
+			if (remount)
+				goto bad_opt;
+			if (match_int(&args[0], &option))
+				goto bad_val;
+			sbi->num_inodes = option;
+			break;
+		case Opt_err_panic:
+			clear_opt(sbi->s_mount_opt, ERRORS_CONT);
+			clear_opt(sbi->s_mount_opt, ERRORS_RO);
+			set_opt(sbi->s_mount_opt, ERRORS_PANIC);
+			break;
+		case Opt_err_ro:
+			clear_opt(sbi->s_mount_opt, ERRORS_CONT);
+			clear_opt(sbi->s_mount_opt, ERRORS_PANIC);
+			set_opt(sbi->s_mount_opt, ERRORS_RO);
+			break;
+		case Opt_err_cont:
+			clear_opt(sbi->s_mount_opt, ERRORS_RO);
+			clear_opt(sbi->s_mount_opt, ERRORS_PANIC);
+			set_opt(sbi->s_mount_opt, ERRORS_CONT);
+			break;
+		case Opt_wprotect:
+			if (remount)
+				goto bad_opt;
+			set_opt(sbi->s_mount_opt, PROTECT);
+			pmfs_info
+				("PMFS: Enabling new Write Protection (CR0.WP)\n");
+			break;
+		case Opt_wprotectold:
+			if (remount)
+				goto bad_opt;
+			set_opt(sbi->s_mount_opt, PROTECT_OLD);
+			pmfs_info
+				("PMFS: Enabling old Write Protection (PAGE RW Bit)\n");
+			break;
+		case Opt_hugemmap:
+			if (remount)
+				goto bad_opt;
+			set_opt(sbi->s_mount_opt, HUGEMMAP);
+			pmfs_info("PMFS: Enabling huge mappings for mmap\n");
+			break;
+		case Opt_nohugeioremap:
+			if (remount)
+				goto bad_opt;
+			clear_opt(sbi->s_mount_opt, HUGEIOREMAP);
+			pmfs_info("PMFS: Disabling huge ioremap\n");
+			break;
+		case Opt_dbgmask:
+			if (match_int(&args[0], &option))
+				goto bad_val;
+			pmfs_dbgmask = option;
+			break;
+		case Opt_backing:
+			/* Bounded copy with guaranteed NUL termination. */
+			strncpy(sbi->pmfs_backing_file, args[0].from,
+				sizeof(sbi->pmfs_backing_file) - 1);
+			sbi->pmfs_backing_file[
+				sizeof(sbi->pmfs_backing_file) - 1] = '\0';
+			break;
+		case Opt_backing_opt:
+			if (match_int(&args[0], &option))
+				goto bad_val;
+			sbi->pmfs_backing_option = option;
+			break;
+		default: {
+			goto bad_opt;
+		}
+		}
+	}
+
+	return 0;
+
+bad_val:
+	printk(KERN_INFO "Bad value '%s' for mount option '%s'\n", args[0].from,
+	       p);
+	return -EINVAL;
+bad_opt:
+	printk(KERN_INFO "Bad mount option: \"%s\"\n", p);
+	return -EINVAL;
+}
+
+/*
+ * Is @size large enough to hold a freshly formatted PMFS instance?
+ * Accounts for the superblock + root directory (2 blocks), the inode
+ * table, and the journal.  Returns true if the size is sufficient.
+ */
+static bool pmfs_check_size (struct super_block *sb, unsigned long size)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	unsigned long minimum_size, num_blocks;
+
+	/* space required for super block and root directory */
+	minimum_size = 2 << sb->s_blocksize_bits;
+
+	/* space required for inode table */
+	if (sbi->num_inodes > 0)
+		/* inodes-per-block is 2^(blocksize_bits - PMFS_INODE_BITS) */
+		num_blocks = (sbi->num_inodes >>
+			(sb->s_blocksize_bits - PMFS_INODE_BITS)) + 1;
+	else
+		num_blocks = 1;
+	minimum_size += (num_blocks << sb->s_blocksize_bits);
+	/* space required for journal */
+	minimum_size += sbi->jsize;
+
+	if (size < minimum_size)
+		return false;
+
+	return true;
+}
+
+
+/*
+ * Format a brand-new PMFS instance of @size bytes at sbi->phys_addr:
+ * map the region, lay out superblock / journal / inode table, create the
+ * root directory with "." and ".." entries, and persist everything.
+ * Returns the root pmfs_inode, or an ERR_PTR on failure.
+ *
+ * NOTE(review): on the error returns after the ioremap succeeds, the
+ * mapping and mem-region are not torn down here; pmfs_fill_super()'s
+ * error path unwinds them via sbi->virt_addr.
+ */
+static struct pmfs_inode *pmfs_init(struct super_block *sb,
+				      unsigned long size)
+{
+	unsigned long blocksize;
+	u64 journal_meta_start, journal_data_start, inode_table_start;
+	struct pmfs_inode *root_i;
+	struct pmfs_super_block *super;
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	struct pmfs_direntry *de;
+	unsigned long blocknr;
+
+	pmfs_info("creating an empty pmfs of size %lu\n", size);
+	sbi->virt_addr = pmfs_ioremap(sb, sbi->phys_addr, size);
+	sbi->block_start = (unsigned long)0;
+	sbi->block_end = ((unsigned long)(size) >> PAGE_SHIFT);
+	sbi->num_free_blocks = ((unsigned long)(size) >> PAGE_SHIFT);
+
+	if (!sbi->virt_addr) {
+		printk(KERN_ERR "ioremap of the pmfs image failed(1)\n");
+		return ERR_PTR(-EINVAL);
+	}
+#ifdef CONFIG_PMFS_TEST
+	if (!first_pmfs_super)
+		first_pmfs_super = sbi->virt_addr;
+#endif
+
+	pmfs_dbg_verbose("pmfs: Default block size set to 4K\n");
+	blocksize = sbi->blocksize = PMFS_DEF_BLOCK_SIZE_4K;
+
+	pmfs_set_blocksize(sb, blocksize);
+	blocksize = sb->s_blocksize;
+
+	if (sbi->blocksize && sbi->blocksize != blocksize)
+		sbi->blocksize = blocksize;
+
+	if (!pmfs_check_size(sb, size)) {
+		pmfs_dbg("Specified PMFS size too small 0x%lx. Either increase"
+			" PMFS size, or reduce num. of inodes (minimum 32)"
+			" or journal size (minimum 64KB)\n", size);
+		return ERR_PTR(-EINVAL);
+	}
+
+	/* Journal metadata sits right after the superblock, cacheline
+	 * aligned; the inode table follows it, again cacheline aligned. */
+	journal_meta_start = sizeof(struct pmfs_super_block);
+	journal_meta_start = (journal_meta_start + CACHELINE_SIZE - 1) &
+		~(CACHELINE_SIZE - 1);
+	inode_table_start = journal_meta_start + sizeof(pmfs_journal_t);
+	inode_table_start = (inode_table_start + CACHELINE_SIZE - 1) &
+		~(CACHELINE_SIZE - 1);
+
+	if ((inode_table_start + sizeof(struct pmfs_inode)) > PMFS_SB_SIZE) {
+		pmfs_dbg("PMFS super block defined too small. defined 0x%x, "
+			"required 0x%llx\n", PMFS_SB_SIZE,
+			inode_table_start + sizeof(struct pmfs_inode));
+		return ERR_PTR(-EINVAL);
+	}
+
+	/* Journal data starts after the primary + redundant superblocks,
+	 * rounded up to a block boundary. */
+	journal_data_start = PMFS_SB_SIZE * 2;
+	journal_data_start = (journal_data_start + blocksize - 1) &
+		~(blocksize - 1);
+
+	pmfs_dbg_verbose("journal meta start %llx data start 0x%llx, "
+		"journal size 0x%x, inode_table 0x%llx\n", journal_meta_start,
+		journal_data_start, sbi->jsize, inode_table_start);
+	pmfs_dbg_verbose("max file name len %d\n", (unsigned int)PMFS_NAME_LEN);
+
+	super = pmfs_get_super(sb);
+	pmfs_memunlock_range(sb, super, journal_data_start);
+
+	/* clear out super-block and inode table */
+	memset_nt(super, 0, journal_data_start);
+	super->s_size = cpu_to_le64(size);
+	super->s_blocksize = cpu_to_le32(blocksize);
+	super->s_magic = cpu_to_le16(PMFS_SUPER_MAGIC);
+	super->s_journal_offset = cpu_to_le64(journal_meta_start);
+	super->s_inode_table_offset = cpu_to_le64(inode_table_start);
+
+	pmfs_init_blockmap(sb, journal_data_start + sbi->jsize);
+	pmfs_memlock_range(sb, super, journal_data_start);
+
+	if (pmfs_journal_hard_init(sb, journal_data_start, sbi->jsize) < 0) {
+		printk(KERN_ERR "Journal hard initialization failed\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (pmfs_init_inode_table(sb) < 0)
+		return ERR_PTR(-EINVAL);
+
+	pmfs_memunlock_range(sb, super, PMFS_SB_SIZE*2);
+	pmfs_sync_super(super);
+	pmfs_memlock_range(sb, super, PMFS_SB_SIZE*2);
+
+	pmfs_flush_buffer(super, PMFS_SB_SIZE, false);
+	pmfs_flush_buffer((char *)super + PMFS_SB_SIZE, sizeof(*super), false);
+
+	/* NOTE(review): pmfs_new_block() return value is not checked here;
+	 * presumably it cannot fail right after format since the size was
+	 * validated above — confirm. */
+	pmfs_new_block(sb, &blocknr, PMFS_BLOCK_TYPE_4K, 1);
+
+	root_i = pmfs_get_inode(sb, PMFS_ROOT_INO);
+
+	pmfs_memunlock_inode(sb, root_i);
+	root_i->i_mode = cpu_to_le16(sbi->mode | S_IFDIR);
+	root_i->i_uid = cpu_to_le32(sbi->uid);
+	root_i->i_gid = cpu_to_le32(sbi->gid);
+	root_i->i_links_count = cpu_to_le16(2);
+	root_i->i_blk_type = PMFS_BLOCK_TYPE_4K;
+	root_i->i_flags = 0;
+	root_i->i_blocks = cpu_to_le32(1);
+	root_i->i_size = cpu_to_le32(sb->s_blocksize);
+	root_i->root = cpu_to_le64(pmfs_get_block_off(sb, blocknr,
+						       PMFS_BLOCK_TYPE_4K));
+	root_i->height = 0;
+	/* pmfs_sync_inode(root_i); */
+	pmfs_memlock_inode(sb, root_i);
+	pmfs_flush_buffer(root_i, sizeof(*root_i), false);
+	/* Populate the root directory block with "." and "..": the second
+	 * entry's de_len spans the rest of the block. */
+	de = (struct pmfs_direntry *)
+		pmfs_get_block(sb, pmfs_get_block_off(sb, blocknr, PMFS_BLOCK_TYPE_4K));
+
+	pmfs_memunlock_range(sb, de, sb->s_blocksize);
+	de->ino = cpu_to_le64(PMFS_ROOT_INO);
+	de->name_len = 1;
+	de->de_len = cpu_to_le16(PMFS_DIR_REC_LEN(de->name_len));
+	strcpy(de->name, ".");
+	de = (struct pmfs_direntry *)((char *)de + le16_to_cpu(de->de_len));
+	de->ino = cpu_to_le64(PMFS_ROOT_INO);
+	de->de_len = cpu_to_le16(sb->s_blocksize - PMFS_DIR_REC_LEN(1));
+	de->name_len = 2;
+	strcpy(de->name, "..");
+	pmfs_memlock_range(sb, de, sb->s_blocksize);
+	pmfs_flush_buffer(de, PMFS_DIR_REC_LEN(2), false);
+	PERSISTENT_MARK();
+	PERSISTENT_BARRIER();
+	return root_i;
+}
+
+/* Seed sbi with the defaults used before option parsing: huge ioremap,
+ * errors=continue, no backing file, default journal size. */
+static inline void set_default_opts(struct pmfs_sb_info *sbi)
+{
+	/* set_opt(sbi->s_mount_opt, PROTECT); */
+	set_opt(sbi->s_mount_opt, HUGEIOREMAP);
+	set_opt(sbi->s_mount_opt, ERRORS_CONT);
+	sbi->pmfs_backing_file[0] = '\0';
+	sbi->pmfs_backing_option = 0;
+	sbi->jsize = PMFS_DEFAULT_JOURNAL_SIZE;
+}
+
+/*
+ * Sanity-check the persistent root inode at mount time.  Note that the
+ * mode is unconditionally reset to a sticky world-rwx directory (and
+ * flushed), not only when the S_ISDIR check fails.
+ */
+static void pmfs_root_check(struct super_block *sb, struct pmfs_inode *root_pi)
+{
+	pmfs_memunlock_inode(sb, root_pi);
+/*
+ *      if (root_pi->i_d.d_next) {
+ *              pmfs_warn("root->next not NULL, trying to fix\n");
+ *              goto fail1;
+ *      }
+ */
+	if (!S_ISDIR(le16_to_cpu(root_pi->i_mode)))
+		pmfs_warn("root is not a directory, trying to fix\n");
+#if 0
+	if (pmfs_calc_checksum((u8 *)root_pi, PMFS_INODE_SIZE)) {
+		pmfs_dbg("checksum error in root inode, trying to fix\n");
+		goto fail3;
+	}
+#endif
+	root_pi->i_mode = cpu_to_le16(S_IRWXUGO | S_ISVTX | S_IFDIR);
+	pmfs_memlock_inode(sb, root_pi);
+	pmfs_flush_buffer(&root_pi->i_mode, sizeof(root_pi->i_mode), false);
+}
+
+/*
+ * Validate the primary superblock (magic, then checksum); if either check
+ * fails but the redundant copy (at offset PMFS_SB_SIZE) is good, repair
+ * the primary from the redundant copy and flush both.  @sb may be NULL
+ * during early probing, in which case no memlock toggling is done.
+ *
+ * Returns 1 if a valid superblock is (now) present, 0 otherwise.
+ */
+int pmfs_check_integrity(struct super_block *sb,
+			  struct pmfs_super_block *super)
+{
+	struct pmfs_super_block *super_redund;
+
+	super_redund =
+		(struct pmfs_super_block *)((char *)super + PMFS_SB_SIZE);
+
+	/* Do sanity checks on the superblock */
+	if (le16_to_cpu(super->s_magic) != PMFS_SUPER_MAGIC) {
+		if (le16_to_cpu(super_redund->s_magic) != PMFS_SUPER_MAGIC) {
+			printk(KERN_ERR "Can't find a valid pmfs partition\n");
+			goto out;
+		} else {
+			pmfs_warn
+				("Error in super block: try to repair it with "
+				"the redundant copy");
+			/* Try to auto-recover the super block */
+			if (sb)
+				pmfs_memunlock_super(sb, super);
+			memcpy(super, super_redund,
+				sizeof(struct pmfs_super_block));
+			if (sb)
+				pmfs_memlock_super(sb, super);
+			pmfs_flush_buffer(super, sizeof(*super), false);
+			pmfs_flush_buffer((char *)super + PMFS_SB_SIZE,
+				sizeof(*super), false);
+
+		}
+	}
+
+	/* Read the superblock */
+	if (pmfs_calc_checksum((u8 *)super, PMFS_SB_STATIC_SIZE(super))) {
+		if (pmfs_calc_checksum((u8 *)super_redund,
+					PMFS_SB_STATIC_SIZE(super_redund))) {
+			printk(KERN_ERR "checksum error in super block\n");
+			goto out;
+		} else {
+			pmfs_warn
+				("Error in super block: try to repair it with "
+				"the redundant copy");
+			/* Try to auto-recover the super block */
+			if (sb)
+				pmfs_memunlock_super(sb, super);
+			memcpy(super, super_redund,
+				sizeof(struct pmfs_super_block));
+			if (sb)
+				pmfs_memlock_super(sb, super);
+			pmfs_flush_buffer(super, sizeof(*super), false);
+			pmfs_flush_buffer((char *)super + PMFS_SB_SIZE,
+				sizeof(*super), false);
+		}
+	}
+
+	return 1;
+out:
+	return 0;
+}
+
+/*
+ * Replay the persistent truncate list after a crash: for each inode on
+ * the list, either complete the interrupted truncate (if still linked)
+ * or let iput() reap the unreferenced inode.  Finally the list head is
+ * cleared and persisted.
+ */
+static void pmfs_recover_truncate_list(struct super_block *sb)
+{
+	struct pmfs_inode_truncate_item *head = pmfs_get_truncate_list_head(sb);
+	u64 ino_next = le64_to_cpu(head->i_next_truncate);
+	struct pmfs_inode *pi;
+	struct pmfs_inode_truncate_item *li;
+	struct inode *inode;
+
+	if (ino_next == 0)
+		return;
+
+	while (ino_next != 0) {
+		pi = pmfs_get_inode(sb, ino_next);
+		/* the truncate item is stored immediately after the inode */
+		li = (struct pmfs_inode_truncate_item *)(pi + 1);
+		inode = pmfs_iget(sb, ino_next);
+		if (IS_ERR(inode))
+			break;
+		pmfs_dbg("Recover ino %llx nlink %d sz %llx:%llx\n", ino_next,
+			inode->i_nlink, pi->i_size, li->i_truncatesize);
+		if (inode->i_nlink) {
+			/* set allocation hint */
+			pmfs_set_blocksize_hint(sb, pi,
+				le64_to_cpu(li->i_truncatesize));
+			pmfs_setsize(inode, le64_to_cpu(li->i_truncatesize));
+			pmfs_update_isize(inode, pi);
+		} else {
+			/* free the inode */
+			pmfs_dbg("deleting unreferenced inode %lx\n",
+				inode->i_ino);
+		}
+		iput(inode);
+		pmfs_flush_buffer(pi, CACHELINE_SIZE, false);
+		ino_next = le64_to_cpu(li->i_next_truncate);
+	}
+	PERSISTENT_MARK();
+	PERSISTENT_BARRIER();
+	/* reset the truncate_list */
+	pmfs_memunlock_range(sb, head, sizeof(*head));
+	head->i_next_truncate = 0;
+	pmfs_memlock_range(sb, head, sizeof(*head));
+	pmfs_flush_buffer(head, sizeof(*head), false);
+	PERSISTENT_MARK();
+	PERSISTENT_BARRIER();
+}
+
+/*
+ * Mount entry point (via mount_nodev): either format a fresh PMFS
+ * instance (init= given) or probe/map an existing image, recover the
+ * journal and truncate list, and wire up the VFS superblock.
+ *
+ * Returns 0 on success, negative errno on failure.
+ *
+ * Fix: the error path freed sbi but left sb->s_fs_info pointing at the
+ * freed memory; it is now cleared so later kill_sb processing cannot
+ * dereference a dangling pointer.
+ */
+static int pmfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct pmfs_super_block *super;
+	struct pmfs_inode *root_pi;
+	struct pmfs_sb_info *sbi = NULL;
+	struct inode *root_i = NULL;
+	unsigned long blocksize, initsize = 0;
+	u32 random = 0;
+	int retval = -EINVAL;
+
+	BUILD_BUG_ON(sizeof(struct pmfs_super_block) > PMFS_SB_SIZE);
+	BUILD_BUG_ON(sizeof(struct pmfs_inode) > PMFS_INODE_SIZE);
+
+	sbi = kzalloc(sizeof(struct pmfs_sb_info), GFP_KERNEL);
+	if (!sbi)
+		return -ENOMEM;
+	sb->s_fs_info = sbi;
+
+	set_default_opts(sbi);
+
+	/* "physaddr=" must lead the option string; consume it first. */
+	sbi->phys_addr = get_phys_addr(&data);
+	if (sbi->phys_addr == (phys_addr_t)ULLONG_MAX)
+		goto out;
+
+	get_random_bytes(&random, sizeof(u32));
+	atomic_set(&sbi->next_generation, random);
+
+	/* Init with default values */
+	INIT_LIST_HEAD(&sbi->block_inuse_head);
+	sbi->mode = (S_IRWXUGO | S_ISVTX);
+	sbi->uid = current_fsuid();
+	sbi->gid = current_fsgid();
+	set_opt(sbi->s_mount_opt, XIP);
+	clear_opt(sbi->s_mount_opt, PROTECT);
+	set_opt(sbi->s_mount_opt, HUGEIOREMAP);
+
+	INIT_LIST_HEAD(&sbi->s_truncate);
+	mutex_init(&sbi->s_truncate_lock);
+	mutex_init(&sbi->inode_table_mutex);
+	mutex_init(&sbi->s_lock);
+
+	if (pmfs_parse_options(data, sbi, 0))
+		goto out;
+
+	set_opt(sbi->s_mount_opt, MOUNTING);
+	initsize = sbi->initsize;
+
+	/* Init a new pmfs instance */
+	if (initsize) {
+		root_pi = pmfs_init(sb, initsize);
+
+		if (IS_ERR(root_pi))
+			goto out;
+
+		super = pmfs_get_super(sb);
+
+		goto setup_sb;
+	} else {
+		pmfs_load_from_file(sb);
+	}
+	pmfs_dbg_verbose("checking physical address 0x%016llx for pmfs image\n",
+			  (u64)sbi->phys_addr);
+
+	/* Map only one page for now. Will remap it when fs size is known. */
+	initsize = PAGE_SIZE;
+	sbi->virt_addr = pmfs_ioremap(sb, sbi->phys_addr, initsize);
+	if (!sbi->virt_addr) {
+		printk(KERN_ERR "ioremap of the pmfs image failed(2)\n");
+		goto out;
+	}
+
+	super = pmfs_get_super(sb);
+
+	/* Read the on-media size, then remap the whole image. */
+	initsize = le64_to_cpu(super->s_size);
+	sbi->initsize = initsize;
+	pmfs_dbg_verbose("pmfs image appears to be %lu KB in size\n",
+			   initsize >> 10);
+
+	pmfs_iounmap(sbi->virt_addr, PAGE_SIZE, pmfs_is_wprotected(sb));
+
+	/* Remap the whole filesystem now */
+	release_mem_region(sbi->phys_addr, PAGE_SIZE);
+	/* FIXME: Remap the whole filesystem in pmfs virtual address range. */
+	sbi->virt_addr = pmfs_ioremap(sb, sbi->phys_addr, initsize);
+	if (!sbi->virt_addr) {
+		printk(KERN_ERR "ioremap of the pmfs image failed(3)\n");
+		goto out;
+	}
+
+	super = pmfs_get_super(sb);
+
+	if (pmfs_journal_soft_init(sb)) {
+		retval = -EINVAL;
+		printk(KERN_ERR "Journal initialization failed\n");
+		goto out;
+	}
+	if (pmfs_recover_journal(sb)) {
+		retval = -EINVAL;
+		printk(KERN_ERR "Journal recovery failed\n");
+		goto out;
+	}
+
+	if (pmfs_check_integrity(sb, super) == 0) {
+		pmfs_dbg("Memory contains invalid pmfs %x:%x\n",
+				le16_to_cpu(super->s_magic), PMFS_SUPER_MAGIC);
+		goto out;
+	}
+
+	blocksize = le32_to_cpu(super->s_blocksize);
+	pmfs_set_blocksize(sb, blocksize);
+
+	pmfs_dbg_verbose("blocksize %lu\n", blocksize);
+
+	/* Read the root inode */
+	root_pi = pmfs_get_inode(sb, PMFS_ROOT_INO);
+
+	/* Check that the root inode is in a sane state */
+	pmfs_root_check(sb, root_pi);
+
+#ifdef CONFIG_PMFS_TEST
+	if (!first_pmfs_super)
+		first_pmfs_super = sbi->virt_addr;
+#endif
+
+	/* Set it all up.. */
+setup_sb:
+	sb->s_magic = le16_to_cpu(super->s_magic);
+	sb->s_op = &pmfs_sops;
+	sb->s_maxbytes = pmfs_max_size(sb->s_blocksize_bits);
+	sb->s_time_gran = 1;
+	sb->s_export_op = &pmfs_export_ops;
+	sb->s_xattr = NULL;
+	sb->s_flags |= MS_NOSEC;
+	root_i = pmfs_iget(sb, PMFS_ROOT_INO);
+	if (IS_ERR(root_i)) {
+		retval = PTR_ERR(root_i);
+		goto out;
+	}
+
+	sb->s_root = d_make_root(root_i);
+	if (!sb->s_root) {
+		printk(KERN_ERR "get pmfs root inode failed\n");
+		retval = -ENOMEM;
+		goto out;
+	}
+
+	pmfs_recover_truncate_list(sb);
+	/* If the FS was not formatted on this mount, scan the meta-data after
+	 * truncate list has been processed */
+	if ((sbi->s_mount_opt & PMFS_MOUNT_FORMAT) == 0)
+		pmfs_setup_blocknode_map(sb);
+
+	if (!(sb->s_flags & MS_RDONLY)) {
+		u64 mnt_write_time;
+		/* update mount time and write time atomically. */
+		mnt_write_time = (get_seconds() & 0xFFFFFFFF);
+		mnt_write_time = mnt_write_time | (mnt_write_time << 32);
+
+		pmfs_memunlock_range(sb, &super->s_mtime, 8);
+		pmfs_memcpy_atomic(&super->s_mtime, &mnt_write_time, 8);
+		pmfs_memlock_range(sb, &super->s_mtime, 8);
+
+		pmfs_flush_buffer(&super->s_mtime, 8, false);
+		PERSISTENT_MARK();
+		PERSISTENT_BARRIER();
+	}
+
+	clear_opt(sbi->s_mount_opt, MOUNTING);
+	retval = 0;
+	return retval;
+out:
+	if (sbi->virt_addr) {
+		pmfs_iounmap(sbi->virt_addr, initsize, pmfs_is_wprotected(sb));
+		release_mem_region(sbi->phys_addr, initsize);
+	}
+
+	/* Don't leave a dangling pointer behind for later sb teardown. */
+	sb->s_fs_info = NULL;
+	kfree(sbi);
+	return retval;
+}
+
+/* super_operations.statfs: report block/inode totals and free counts. */
+int pmfs_statfs(struct dentry *d, struct kstatfs *buf)
+{
+	struct super_block *sb = d->d_sb;
+	unsigned long count = 0;
+	struct pmfs_sb_info *sbi = (struct pmfs_sb_info *)sb->s_fs_info;
+
+	buf->f_type = PMFS_SUPER_MAGIC;
+	buf->f_bsize = sb->s_blocksize;
+
+	count = sbi->block_end;
+	buf->f_blocks = sbi->block_end;
+	buf->f_bfree = buf->f_bavail = pmfs_count_free_blocks(sb);
+	buf->f_files = (sbi->s_inodes_count);
+	buf->f_ffree = (sbi->s_free_inodes_count);
+	buf->f_namelen = PMFS_NAME_LEN;
+	pmfs_dbg("pmfs_stats: total 4k free blocks 0x%llx\n", buf->f_bfree);
+	pmfs_dbg("total inodes 0x%x, free inodes 0x%x, blocknodes 0x%lx\n",
+		(sbi->s_inodes_count),
+		(sbi->s_free_inodes_count), (sbi->num_blocknode_allocated));
+	return 0;
+}
+
+/* super_operations.show_options: emit the non-default mount options for
+ * /proc/mounts; options matching the defaults are suppressed. */
+static int pmfs_show_options(struct seq_file *seq, struct dentry *root)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(root->d_sb);
+
+	seq_printf(seq, ",physaddr=0x%016llx", (u64)sbi->phys_addr);
+	if (sbi->initsize)
+		seq_printf(seq, ",init=%luk", sbi->initsize >> 10);
+	if (sbi->blocksize)
+		seq_printf(seq, ",bs=%lu", sbi->blocksize);
+	if (sbi->bpi)
+		seq_printf(seq, ",bpi=%lu", sbi->bpi);
+	if (sbi->num_inodes)
+		seq_printf(seq, ",N=%lu", sbi->num_inodes);
+	if (sbi->mode != (S_IRWXUGO | S_ISVTX))
+		seq_printf(seq, ",mode=%03o", sbi->mode);
+	if (sbi->uid != 0)
+		seq_printf(seq, ",uid=%u", sbi->uid);
+	if (sbi->gid != 0)
+		seq_printf(seq, ",gid=%u", sbi->gid);
+	if (test_opt(root->d_sb, ERRORS_RO))
+		seq_puts(seq, ",errors=remount-ro");
+	if (test_opt(root->d_sb, ERRORS_PANIC))
+		seq_puts(seq, ",errors=panic");
+	/* memory protection disabled by default */
+	if (test_opt(root->d_sb, PROTECT))
+		seq_puts(seq, ",wprotect");
+	if (test_opt(root->d_sb, HUGEMMAP))
+		seq_puts(seq, ",hugemmap");
+	if (test_opt(root->d_sb, HUGEIOREMAP))
+		seq_puts(seq, ",hugeioremap");
+	/* xip not enabled by default */
+	if (test_opt(root->d_sb, XIP))
+		seq_puts(seq, ",xip");
+
+	return 0;
+}
+
+/*
+ * super_operations.remount_fs: re-parse the (restricted) remount options
+ * under s_lock, restoring the previous flags/options on parse failure.
+ * When the ro/rw state changes, the on-media mount/write time stamp is
+ * updated atomically (both 32-bit halves packed into one 64-bit store).
+ */
+int pmfs_remount(struct super_block *sb, int *mntflags, char *data)
+{
+	unsigned long old_sb_flags;
+	unsigned long old_mount_opt;
+	struct pmfs_super_block *ps;
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	int ret = -EINVAL;
+
+	/* Store the old options */
+	mutex_lock(&sbi->s_lock);
+	old_sb_flags = sb->s_flags;
+	old_mount_opt = sbi->s_mount_opt;
+
+	if (pmfs_parse_options(data, sbi, 1))
+		goto restore_opt;
+
+	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+		      ((sbi->s_mount_opt & PMFS_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
+
+	if ((*mntflags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
+		u64 mnt_write_time;
+		ps = pmfs_get_super(sb);
+		/* update mount time and write time atomically. */
+		mnt_write_time = (get_seconds() & 0xFFFFFFFF);
+		mnt_write_time = mnt_write_time | (mnt_write_time << 32);
+
+		pmfs_memunlock_range(sb, &ps->s_mtime, 8);
+		pmfs_memcpy_atomic(&ps->s_mtime, &mnt_write_time, 8);
+		pmfs_memlock_range(sb, &ps->s_mtime, 8);
+
+		pmfs_flush_buffer(&ps->s_mtime, 8, false);
+		PERSISTENT_MARK();
+		PERSISTENT_BARRIER();
+	}
+
+	mutex_unlock(&sbi->s_lock);
+	ret = 0;
+	return ret;
+
+restore_opt:
+	sb->s_flags = old_sb_flags;
+	sbi->s_mount_opt = old_mount_opt;
+	mutex_unlock(&sbi->s_lock);
+	return ret;
+}
+
+/*
+ * super_operations.put_super: persist the block-node map and journal
+ * state, unmap/release the PM region, free the in-memory block-node
+ * list, and tear down the per-mount state.
+ */
+void pmfs_put_super(struct super_block *sb)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	struct pmfs_super_block *ps = pmfs_get_super(sb);
+	u64 size = le64_to_cpu(ps->s_size);
+	struct pmfs_blocknode *i;
+	struct list_head *head = &(sbi->block_inuse_head);
+
+#ifdef CONFIG_PMFS_TEST
+	if (first_pmfs_super == sbi->virt_addr)
+		first_pmfs_super = NULL;
+#endif
+
+	/* It's unmount time, so unmap the pmfs memory */
+	if (sbi->virt_addr) {
+		pmfs_save_blocknode_mappings(sb);
+		pmfs_journal_uninit(sb);
+		pmfs_store_to_file(sb);
+		pmfs_iounmap(sbi->virt_addr, size, pmfs_is_wprotected(sb));
+		sbi->virt_addr = NULL;
+		release_mem_region(sbi->phys_addr, size);
+	}
+
+	/* Free all the pmfs_blocknodes */
+	while (!list_empty(head)) {
+		i = list_first_entry(head, struct pmfs_blocknode, link);
+		list_del(&i->link);
+		pmfs_free_blocknode(sb, i);
+	}
+	sb->s_fs_info = NULL;
+	pmfs_dbgmask = 0;
+	kfree(sbi);
+}
+
+/* Return a journal transaction descriptor to its slab cache. */
+inline void pmfs_free_transaction(pmfs_transaction_t *trans)
+{
+	kmem_cache_free(pmfs_transaction_cachep, trans);
+}
+
+/* Free a block-node without touching per-mount accounting. */
+void __pmfs_free_blocknode(struct pmfs_blocknode *bnode)
+{
+	kmem_cache_free(pmfs_blocknode_cachep, bnode);
+}
+
+/* Free a block-node and decrement the per-mount allocation counter. */
+void pmfs_free_blocknode(struct super_block *sb, struct pmfs_blocknode *bnode)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	sbi->num_blocknode_allocated--;
+	__pmfs_free_blocknode(bnode);
+}
+
+/*
+ * Allocate a journal transaction descriptor from its slab cache.
+ * May return NULL under memory pressure.  (kmem_cache_alloc() returns
+ * void *, so the explicit cast in the original was redundant in C.)
+ */
+inline pmfs_transaction_t *pmfs_alloc_transaction(void)
+{
+	return kmem_cache_alloc(pmfs_transaction_cachep, GFP_NOFS);
+}
+
+/*
+ * Allocate a block-node from the slab cache and bump the per-mount
+ * counter on success.  Returns NULL under memory pressure.  The
+ * redundant cast of kmem_cache_alloc() and single-statement braces
+ * were dropped per kernel style.
+ */
+struct pmfs_blocknode *pmfs_alloc_blocknode(struct super_block *sb)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	struct pmfs_blocknode *p;
+
+	p = kmem_cache_alloc(pmfs_blocknode_cachep, GFP_NOFS);
+	if (p)
+		sbi->num_blocknode_allocated++;
+	return p;
+}
+
+/*
+ * super_operations.alloc_inode: allocate the PMFS in-memory inode
+ * wrapper and hand the embedded VFS inode back to the VFS.  Returns
+ * NULL under memory pressure.  (Dropped the redundant cast of
+ * kmem_cache_alloc().)
+ */
+static struct inode *pmfs_alloc_inode(struct super_block *sb)
+{
+	struct pmfs_inode_vfs *vi;
+
+	vi = kmem_cache_alloc(pmfs_inode_cachep, GFP_NOFS);
+	if (!vi)
+		return NULL;
+	vi->vfs_inode.i_version = 1;
+	return &vi->vfs_inode;
+}
+
+/* RCU callback that actually frees the inode wrapper after a grace
+ * period has elapsed. */
+static void pmfs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+
+	kmem_cache_free(pmfs_inode_cachep, PMFS_I(inode));
+}
+
+/* super_operations.destroy_inode: defer the free until after RCU
+ * readers are done with the inode. */
+static void pmfs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, pmfs_i_callback);
+}
+
+/* Slab constructor: runs once per object when a new slab page is
+ * populated, not on every allocation. */
+static void init_once(void *foo)
+{
+	struct pmfs_inode_vfs *vi = (struct pmfs_inode_vfs *)foo;
+
+	vi->i_dir_start_lookup = 0;
+	INIT_LIST_HEAD(&vi->i_truncated);
+	inode_init_once(&vi->vfs_inode);
+}
+
+
+/* Create the slab cache for block-node tracking objects; -ENOMEM on
+ * failure. */
+static int __init init_blocknode_cache(void)
+{
+	pmfs_blocknode_cachep = kmem_cache_create("pmfs_blocknode_cache",
+					sizeof(struct pmfs_blocknode),
+					0, (SLAB_RECLAIM_ACCOUNT |
+					SLAB_MEM_SPREAD), NULL);
+	if (pmfs_blocknode_cachep == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+
+/* Create the slab cache for in-memory inode wrappers, with init_once as
+ * the per-object constructor; -ENOMEM on failure. */
+static int __init init_inodecache(void)
+{
+	pmfs_inode_cachep = kmem_cache_create("pmfs_inode_cache",
+					sizeof(struct pmfs_inode_vfs),
+					0, (SLAB_RECLAIM_ACCOUNT |
+					SLAB_MEM_SPREAD), init_once);
+	if (pmfs_inode_cachep == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+/* Create the slab cache for journal transaction descriptors; -ENOMEM on
+ * failure. */
+static int __init init_transaction_cache(void)
+{
+	pmfs_transaction_cachep = kmem_cache_create("pmfs_journal_transaction",
+			sizeof(pmfs_transaction_t), 0, (SLAB_RECLAIM_ACCOUNT |
+			SLAB_MEM_SPREAD), NULL);
+	if (pmfs_transaction_cachep == NULL) {
+		pmfs_dbg("PMFS: failed to init transaction cache\n");
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+/* Destroy the transaction cache; guarded against a NULL cache so it is
+ * safe to call from partially-failed init paths. */
+static void destroy_transaction_cache(void)
+{
+	if (pmfs_transaction_cachep)
+		kmem_cache_destroy(pmfs_transaction_cachep);
+	pmfs_transaction_cachep = NULL;
+}
+
+/* Destroy the inode cache (only called when creation succeeded). */
+static void destroy_inodecache(void)
+{
+	kmem_cache_destroy(pmfs_inode_cachep);
+}
+
+/* Destroy the block-node cache (only called when creation succeeded). */
+static void destroy_blocknode_cache(void)
+{
+	kmem_cache_destroy(pmfs_blocknode_cachep);
+}
+
+/*
+ * the super block writes are all done "on the fly", so the
+ * super block is never in a "dirty" state, so there's no need
+ * for write_super.
+ */
+static struct super_operations pmfs_sops = {
+ .alloc_inode = pmfs_alloc_inode,
+ .destroy_inode = pmfs_destroy_inode,
+ .write_inode = pmfs_write_inode,
+ .dirty_inode = pmfs_dirty_inode,
+ .evict_inode = pmfs_evict_inode,
+ .put_super = pmfs_put_super,
+ .statfs = pmfs_statfs,
+ .remount_fs = pmfs_remount,
+ .show_options = pmfs_show_options,
+};
+
+/* file_system_type.mount: PMFS has no backing block device (it maps a
+ * raw physical range), so mount as a nodev filesystem. */
+static struct dentry *pmfs_mount(struct file_system_type *fs_type,
+				  int flags, const char *dev_name, void *data)
+{
+	return mount_nodev(fs_type, flags, data, pmfs_fill_super);
+}
+
+static struct file_system_type pmfs_fs_type = {
+	.owner = THIS_MODULE,
+	.name = "pmfs",
+	.mount = pmfs_mount,
+	.kill_sb = kill_anon_super,
+};
+
+/*
+ * NFS export helper: look up an inode by number + generation from a file
+ * handle.  Returns ERR_PTR(-ESTALE) for out-of-range inode numbers or a
+ * generation mismatch (the handle refers to a deleted/reused inode).
+ */
+static struct inode *pmfs_nfs_get_inode(struct super_block *sb,
+					 u64 ino, u32 generation)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	struct inode *inode;
+
+	if (ino < PMFS_ROOT_INO)
+		return ERR_PTR(-ESTALE);
+
+	if ((ino >> PMFS_INODE_BITS) > (sbi->s_inodes_count))
+		return ERR_PTR(-ESTALE);
+
+	inode = pmfs_iget(sb, ino);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+
+	if (generation && inode->i_generation != generation) {
+		/* we didn't find the right inode.. */
+		iput(inode);
+		return ERR_PTR(-ESTALE);
+	}
+
+	return inode;
+}
+
+/* export_operations.fh_to_dentry: decode an NFS file handle into the
+ * target dentry via the generic helper. */
+static struct dentry *pmfs_fh_to_dentry(struct super_block *sb,
+					 struct fid *fid, int fh_len,
+					 int fh_type)
+{
+	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
+				    pmfs_nfs_get_inode);
+}
+
+/* export_operations.fh_to_parent: decode the parent directory encoded in
+ * an NFS file handle via the generic helper. */
+static struct dentry *pmfs_fh_to_parent(struct super_block *sb,
+					 struct fid *fid, int fh_len,
+					 int fh_type)
+{
+	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
+				    pmfs_nfs_get_inode);
+}
+
+static const struct export_operations pmfs_export_ops = {
+ .fh_to_dentry = pmfs_fh_to_dentry,
+ .fh_to_parent = pmfs_fh_to_parent,
+ .get_parent = pmfs_get_parent,
+};
+
+/*
+ * Module init: create the three slab caches, init the backing-dev info,
+ * then register the filesystem.  Uses the standard goto-unwind pattern:
+ * each out label undoes everything initialized before the failure.
+ */
+static int __init init_pmfs_fs(void)
+{
+	int rc = 0;
+
+	rc = init_blocknode_cache();
+	if (rc)
+		return rc;
+
+	rc = init_transaction_cache();
+	if (rc)
+		goto out1;
+
+	rc = init_inodecache();
+	if (rc)
+		goto out2;
+
+	rc = bdi_init(&pmfs_backing_dev_info);
+	if (rc)
+		goto out3;
+
+	rc = register_filesystem(&pmfs_fs_type);
+	if (rc)
+		goto out4;
+
+	return 0;
+
+out4:
+	bdi_destroy(&pmfs_backing_dev_info);
+out3:
+	destroy_inodecache();
+out2:
+	destroy_transaction_cache();
+out1:
+	destroy_blocknode_cache();
+	return rc;
+}
+
+/*
+ * Module exit: unregister the filesystem first (no new mounts can start),
+ * then tear down the bdi and slab caches in reverse order of creation.
+ * (Dropped the redundant trailing "return;" from the void function.)
+ */
+static void __exit exit_pmfs_fs(void)
+{
+	unregister_filesystem(&pmfs_fs_type);
+	bdi_destroy(&pmfs_backing_dev_info);
+	destroy_inodecache();
+	destroy_blocknode_cache();
+	destroy_transaction_cache();
+}
+
+MODULE_AUTHOR("Intel Corporation <linux-pmfs at intel.com>");
+MODULE_DESCRIPTION("Persistent Memory File System");
+MODULE_LICENSE("GPL");
+
+module_init(init_pmfs_fs)
+module_exit(exit_pmfs_fs)
diff --git a/fs/pmfs/symlink.c b/fs/pmfs/symlink.c
new file mode 100644
index 0000000..f02cbe2
--- /dev/null
+++ b/fs/pmfs/symlink.c
@@ -0,0 +1,71 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Symlink operations
+ *
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.stornelli at gmail.com>
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include <linux/fs.h>
+#include "pmfs.h"
+
+/*
+ * Store the symlink target @symname (length @len, excluding the NUL we
+ * append) in the inode's first data block, allocating that block first.
+ * The target plus terminator must fit within one block; presumably the
+ * namei caller enforces len < blocksize -- TODO confirm at call site.
+ * Returns 0 on success or the allocation error.
+ */
+int pmfs_block_symlink(struct inode *inode, const char *symname, int len)
+{
+ struct super_block *sb = inode->i_sb;
+ u64 block;
+ char *blockp;
+ int err;
+
+ err = pmfs_alloc_blocks(NULL, inode, 0, 1, false);
+ if (err)
+ return err;
+
+ block = pmfs_find_data_block(inode, 0);
+ blockp = pmfs_get_block(sb, block);
+
+ /* open the write-protection window only around the store */
+ pmfs_memunlock_block(sb, blockp);
+ memcpy(blockp, symname, len);
+ blockp[len] = '\0';
+ pmfs_memlock_block(sb, blockp);
+ pmfs_flush_buffer(blockp, len+1, false);
+ return 0;
+}
+
+/*
+ * ->readlink: copy the target straight out of the inode's first
+ * persistent-memory data block via the generic vfs_readlink() helper.
+ */
+static int pmfs_readlink(struct dentry *dentry, char *buffer, int buflen)
+{
+ struct inode *inode = dentry->d_inode;
+ struct super_block *sb = inode->i_sb;
+ u64 block;
+ char *blockp;
+
+ block = pmfs_find_data_block(inode, 0);
+ blockp = pmfs_get_block(sb, block);
+ return vfs_readlink(dentry, buffer, buflen, blockp);
+}
+
+/*
+ * ->follow_link: hand the in-PM symlink body (stored in the inode's
+ * first data block) to vfs_follow_link(), returning the ERR_PTR-encoded
+ * status the VFS expects.
+ */
+static void *pmfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+ struct inode *inode = dentry->d_inode;
+ struct super_block *sb = inode->i_sb;
+ u64 block; /* was off_t; match pmfs_find_data_block()'s u64 */
+ int status;
+ char *blockp;
+
+ block = pmfs_find_data_block(inode, 0);
+ blockp = pmfs_get_block(sb, block);
+ status = vfs_follow_link(nd, blockp);
+ return ERR_PTR(status);
+}
+
+/* Symlink targets are read directly from the inode's first PM block. */
+const struct inode_operations pmfs_symlink_inode_operations = {
+ .readlink = pmfs_readlink,
+ .follow_link = pmfs_follow_link,
+ .setattr = pmfs_notify_change,
+};
diff --git a/fs/pmfs/wprotect.c b/fs/pmfs/wprotect.c
new file mode 100644
index 0000000..d80b4fe
--- /dev/null
+++ b/fs/pmfs/wprotect.c
@@ -0,0 +1,91 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Write protection for the filesystem pages.
+ *
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.stornelli at gmail.com>
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/io.h>
+#include "pmfs.h"
+
+DEFINE_SPINLOCK(pmfs_writeable_lock);
+
+/*
+ * Clear CR0.WP on the local CPU, letting the kernel write through
+ * read-only PTEs. Only affects the CPU this runs on; callers open the
+ * window with interrupts disabled (see pmfs_writeable()).
+ */
+static inline void wprotect_disable(void)
+{
+ unsigned long cr0_val;
+
+ cr0_val = read_cr0();
+ cr0_val &= (~X86_CR0_WP);
+ write_cr0(cr0_val);
+ return;
+}
+
+/* Re-set CR0.WP on the local CPU, restoring write protection. */
+static inline void wprotect_enable(void)
+{
+ unsigned long cr0_val;
+
+ cr0_val = read_cr0();
+ cr0_val |= X86_CR0_WP;
+ write_cr0(cr0_val);
+ return;
+}
+
+/* FIXME: Use PAGE RW Bit */
+int pmfs_writeable_old(void *vaddr, unsigned long size, int rw)
+{
+ int ret = 0;
+ unsigned long nrpages = size >> PAGE_SHIFT;
+ unsigned long addr = (unsigned long)vaddr;
+
+ /* Page aligned */
+ addr &= PAGE_MASK;
+
+ if (size & (PAGE_SIZE - 1))
+ nrpages++;
+
+ if (rw)
+ ret = set_memory_rw(addr, nrpages);
+ else
+ ret = set_memory_ro(addr, nrpages);
+
+ BUG_ON(ret);
+ return 0;
+}
+
+/* FIXME: Assumes that we are always called in the right order.
+ * pmfs_writeable(vaddr, size, 1);
+ * pmfs_writeable(vaddr, size, 0);
+ */
+/*
+ * CR0.WP-based protection: rw=1 disables interrupts and clears WP on
+ * this CPU; the matching rw=0 call restores WP and the saved irq flags.
+ * NOTE(review): `flags` is a single function-static shared by all
+ * callers and CPUs, so nested or concurrent open/close pairs would
+ * clobber each other -- exactly the ordering assumption the FIXME
+ * above describes.
+ */
+int pmfs_writeable(void *vaddr, unsigned long size, int rw)
+{
+ static unsigned long flags;
+ if (rw) {
+ local_irq_save(flags);
+ wprotect_disable();
+ } else {
+ wprotect_enable();
+ local_irq_restore(flags);
+ }
+ return 0;
+}
+
+/*
+ * Dispatch to whichever write-protection scheme the mount selected:
+ * no-op when unprotected, page-table based for the legacy "old" mode,
+ * CR0.WP based otherwise.
+ */
+int pmfs_xip_mem_protect(struct super_block *sb, void *vaddr,
+ unsigned long size, int rw)
+{
+ if (!pmfs_is_wprotected(sb))
+ return 0;
+ if (pmfs_is_protected_old(sb))
+ return pmfs_writeable_old(vaddr, size, rw);
+ return pmfs_writeable(vaddr, size, rw);
+}
diff --git a/fs/pmfs/wprotect.h b/fs/pmfs/wprotect.h
new file mode 100644
index 0000000..818638b
--- /dev/null
+++ b/fs/pmfs/wprotect.h
@@ -0,0 +1,166 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Memory protection definitions for the PMFS filesystem.
+ *
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2010-2011 Marco Stornelli <marco.stornelli at gmail.com>
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#ifndef __WPROTECT_H
+#define __WPROTECT_H
+
+#include <linux/pmfs_def.h>
+#include <linux/fs.h>
+
+/* pmfs_memunlock_super() before calling! */
+static inline void pmfs_sync_super(struct pmfs_super_block *ps)
+{
+ u16 crc = 0;
+
+ ps->s_wtime = cpu_to_le32(get_seconds());
+ ps->s_sum = 0;
+ crc = crc16(~0, (__u8 *)ps + sizeof(__le16),
+ PMFS_SB_STATIC_SIZE(ps) - sizeof(__le16));
+ ps->s_sum = cpu_to_le16(crc);
+ /* Keep sync redundant super block */
+ memcpy((void *)ps + PMFS_SB_SIZE, (void *)ps,
+ sizeof(struct pmfs_super_block));
+}
+
+#if 0
+/* pmfs_memunlock_inode() before calling! */
+/* Disabled: per-inode CRC16 checksumming is currently compiled out. */
+static inline void pmfs_sync_inode(struct pmfs_inode *pi)
+{
+ u16 crc = 0;
+
+ pi->i_sum = 0;
+ crc = crc16(~0, (__u8 *)pi + sizeof(__le16), PMFS_INODE_SIZE -
+ sizeof(__le16));
+ pi->i_sum = cpu_to_le16(crc);
+}
+#endif
+
+extern int pmfs_writeable(void *vaddr, unsigned long size, int rw);
+extern int pmfs_xip_mem_protect(struct super_block *sb,
+ void *vaddr, unsigned long size, int rw);
+
+extern spinlock_t pmfs_writeable_lock;
+/* True when the CR0.WP-based protect mount option is active. */
+static inline int pmfs_is_protected(struct super_block *sb)
+{
+ struct pmfs_sb_info *sbi = (struct pmfs_sb_info *)sb->s_fs_info;
+
+ return sbi->s_mount_opt & PMFS_MOUNT_PROTECT;
+}
+
+/* True when the legacy page-table (set_memory_ro/rw) mode is active. */
+static inline int pmfs_is_protected_old(struct super_block *sb)
+{
+ struct pmfs_sb_info *sbi = (struct pmfs_sb_info *)sb->s_fs_info;
+
+ return sbi->s_mount_opt & PMFS_MOUNT_PROTECT_OLD;
+}
+
+/* True when either write-protection scheme is active. */
+static inline int pmfs_is_wprotected(struct super_block *sb)
+{
+ return pmfs_is_protected(sb) || pmfs_is_protected_old(sb);
+}
+
+/*
+ * Make [p, p+len) writable. For the legacy scheme (hold_lock set) a
+ * global spinlock serializes the window, since page-table permissions
+ * are visible to every CPU; the CR0.WP scheme is per-CPU and needs no
+ * lock here.
+ */
+static inline void
+__pmfs_memunlock_range(void *p, unsigned long len, int hold_lock)
+{
+ /*
+ * NOTE: Ideally we should lock all the kernel to be memory safe
+ * and avoid to write in the protected memory,
+ * obviously it's not possible, so we only serialize
+ * the operations at fs level. We can't disable the interrupts
+ * because we could have a deadlock in this path.
+ */
+ if (hold_lock)
+ spin_lock(&pmfs_writeable_lock);
+ pmfs_writeable(p, len, 1);
+}
+
+/* Re-protect [p, p+len); releases the lock __pmfs_memunlock_range took. */
+static inline void
+__pmfs_memlock_range(void *p, unsigned long len, int hold_lock)
+{
+ pmfs_writeable(p, len, 0);
+ if (hold_lock)
+ spin_unlock(&pmfs_writeable_lock);
+}
+
+/*
+ * Open a writable window over an arbitrary PM range. The hold_lock
+ * argument (0 vs 1) selects lockless CR0.WP mode vs the serialized
+ * legacy mode.
+ */
+static inline void pmfs_memunlock_range(struct super_block *sb, void *p,
+ unsigned long len)
+{
+ if (pmfs_is_protected(sb))
+ __pmfs_memunlock_range(p, len, 0);
+ else if (pmfs_is_protected_old(sb))
+ __pmfs_memunlock_range(p, len, 1);
+}
+
+/* Close the writable window opened by pmfs_memunlock_range(). */
+static inline void pmfs_memlock_range(struct super_block *sb, void *p,
+ unsigned long len)
+{
+ if (pmfs_is_protected(sb))
+ __pmfs_memlock_range(p, len, 0);
+ else if (pmfs_is_protected_old(sb))
+ __pmfs_memlock_range(p, len, 1);
+}
+
+/* Open a writable window over the on-PM superblock. */
+static inline void pmfs_memunlock_super(struct super_block *sb,
+ struct pmfs_super_block *ps)
+{
+ if (pmfs_is_protected(sb))
+ __pmfs_memunlock_range(ps, PMFS_SB_SIZE, 0);
+ else if (pmfs_is_protected_old(sb))
+ __pmfs_memunlock_range(ps, PMFS_SB_SIZE, 1);
+}
+
+/*
+ * Close the superblock window; first recompute its checksum and mirror
+ * it to the redundant copy while the range is still writable.
+ */
+static inline void pmfs_memlock_super(struct super_block *sb,
+ struct pmfs_super_block *ps)
+{
+ pmfs_sync_super(ps);
+ if (pmfs_is_protected(sb))
+ __pmfs_memlock_range(ps, PMFS_SB_SIZE, 0);
+ else if (pmfs_is_protected_old(sb))
+ __pmfs_memlock_range(ps, PMFS_SB_SIZE, 1);
+}
+
+/*
+ * Open a writable window over an on-PM inode.
+ * NOTE(review): the length passed is PMFS_SB_SIZE although the object
+ * is a pmfs_inode; PMFS_INODE_SIZE (used by the disabled
+ * pmfs_sync_inode() above) looks like the intended length -- confirm
+ * against pmfs_def.h before changing.
+ */
+static inline void pmfs_memunlock_inode(struct super_block *sb,
+ struct pmfs_inode *pi)
+{
+ if (pmfs_is_protected(sb))
+ __pmfs_memunlock_range(pi, PMFS_SB_SIZE, 0);
+ else if (pmfs_is_protected_old(sb))
+ __pmfs_memunlock_range(pi, PMFS_SB_SIZE, 1);
+}
+
+/* Close the inode window (same length caveat as above). */
+static inline void pmfs_memlock_inode(struct super_block *sb,
+ struct pmfs_inode *pi)
+{
+ /* pmfs_sync_inode(pi); */
+ if (pmfs_is_protected(sb))
+ __pmfs_memlock_range(pi, PMFS_SB_SIZE, 0);
+ else if (pmfs_is_protected_old(sb))
+ __pmfs_memlock_range(pi, PMFS_SB_SIZE, 1);
+}
+
+/* Open a writable window over one filesystem block at bp. */
+static inline void pmfs_memunlock_block(struct super_block *sb, void *bp)
+{
+ if (pmfs_is_protected(sb))
+ __pmfs_memunlock_range(bp, sb->s_blocksize, 0);
+ else if (pmfs_is_protected_old(sb))
+ __pmfs_memunlock_range(bp, sb->s_blocksize, 1);
+}
+
+/* Close the block window opened by pmfs_memunlock_block(). */
+static inline void pmfs_memlock_block(struct super_block *sb, void *bp)
+{
+ if (pmfs_is_protected(sb))
+ __pmfs_memlock_range(bp, sb->s_blocksize, 0);
+ else if (pmfs_is_protected_old(sb))
+ __pmfs_memlock_range(bp, sb->s_blocksize, 1);
+}
+
+#endif
diff --git a/fs/pmfs/xip.c b/fs/pmfs/xip.c
new file mode 100644
index 0000000..a7cf780
--- /dev/null
+++ b/fs/pmfs/xip.c
@@ -0,0 +1,672 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * XIP operations.
+ *
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.stornelli at gmail.com>
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <asm/cpufeature.h>
+#include <asm/pgtable.h>
+#include "pmfs.h"
+#include "xip.h"
+
+/*
+ * Wrappers. We need to use the rcu read lock to avoid
+ * concurrent truncate operation. No problem for write because we held
+ * i_mutex.
+ */
+/* Reader side of the truncate race: see the locking comment above. */
+ssize_t pmfs_xip_file_read(struct file *filp, char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ ssize_t res;
+
+ rcu_read_lock();
+ res = xip_file_read(filp, buf, len, ppos);
+ rcu_read_unlock();
+ return res;
+}
+
+/*
+ * __copy_from_user_inatomic_nocache() falls back to cacheable stores
+ * for 8-byte-unaligned head/tail bytes, so flush the cacheline at each
+ * unaligned edge of the copied region (skipping the tail flush when
+ * both edges share one cacheline).
+ */
+static inline void pmfs_flush_edge_cachelines(loff_t pos, ssize_t len,
+ void *start_addr)
+{
+ if (unlikely(pos & 0x7))
+ pmfs_flush_buffer(start_addr, 1, false);
+ if (unlikely(((pos + len) & 0x7) && ((pos & (CACHELINE_SIZE - 1)) !=
+ ((pos + len) & (CACHELINE_SIZE - 1)))))
+ pmfs_flush_buffer(start_addr + len, 1, false);
+}
+
+/*
+ * Core copy loop of the write path: for each filesystem block covered
+ * by [pos, pos+count), look up the PM address, open a write-protection
+ * window, copy user data in with non-temporal stores, and flush any
+ * unaligned-edge cachelines. Updates i_size at the end if the write
+ * extended the file. Caller holds i_mutex and has already allocated
+ * all needed blocks.
+ */
+static ssize_t
+__pmfs_xip_file_write(struct address_space *mapping, const char __user *buf,
+ size_t count, loff_t pos, loff_t *ppos)
+{
+ struct inode *inode = mapping->host;
+ struct super_block *sb = inode->i_sb;
+ long status = 0;
+ size_t bytes;
+ ssize_t written = 0;
+ struct pmfs_inode *pi;
+
+ pi = pmfs_get_inode(sb, inode->i_ino);
+ do {
+ unsigned long index;
+ unsigned long offset;
+ size_t copied;
+ void *xmem;
+ unsigned long xpfn;
+
+ offset = (pos & (sb->s_blocksize - 1)); /* Within page */
+ index = pos >> sb->s_blocksize_bits;
+ bytes = sb->s_blocksize - offset;
+ if (bytes > count)
+ bytes = count;
+
+ status = pmfs_get_xip_mem(mapping, index, 1, &xmem, &xpfn);
+ if (status)
+ break;
+ pmfs_xip_mem_protect(sb, xmem + offset, bytes, 1);
+ copied = bytes -
+ __copy_from_user_inatomic_nocache(xmem + offset, buf, bytes);
+ pmfs_xip_mem_protect(sb, xmem + offset, bytes, 0);
+
+ /* if start or end dest address is not 8 byte aligned,
+ * __copy_from_user_inatomic_nocache uses cacheable instructions
+ * (instead of movnti) to write. So flush those cachelines. */
+ pmfs_flush_edge_cachelines(pos, copied, xmem + offset);
+
+ if (likely(copied > 0)) {
+ status = copied;
+
+ if (status >= 0) {
+ written += status;
+ count -= status;
+ pos += status;
+ buf += status;
+ }
+ }
+ /* short copy => user fault; report -EFAULT unless an
+ * earlier error is already pending */
+ if (unlikely(copied != bytes))
+ if (status >= 0)
+ status = -EFAULT;
+ if (status < 0)
+ break;
+ } while (count);
+ *ppos = pos;
+ /*
+ * No need to use i_size_read() here, the i_size
+ * cannot change under us because we hold i_mutex.
+ */
+ if (pos > inode->i_size) {
+ i_size_write(inode, pos);
+ pmfs_update_isize(inode, pi);
+ }
+
+ return written ? written : status;
+}
+
+/* optimized path for file write that doesn't require a transaction. In this
+ * path we dont need to allocate any new data blocks. So the only meta-data
+ * modified in path is inode's i_size, i_ctime, and i_mtime fields */
+/* optimized path for file write that doesn't require a transaction. In this
+ * path we dont need to allocate any new data blocks. So the only meta-data
+ * modified in path is inode's i_size, i_ctime, and i_mtime fields */
+static ssize_t pmfs_file_write_fast(struct super_block *sb, struct inode *inode,
+ struct pmfs_inode *pi, const char __user *buf, size_t count, loff_t pos,
+ loff_t *ppos, u64 block)
+{
+ void *xmem = pmfs_get_block(sb, block);
+ size_t copied, ret = 0, offset;
+
+ offset = pos & (sb->s_blocksize - 1);
+
+ /* writable window only around the non-temporal user copy */
+ pmfs_xip_mem_protect(sb, xmem + offset, count, 1);
+ copied = count - __copy_from_user_inatomic_nocache(xmem
+ + offset, buf, count);
+ pmfs_xip_mem_protect(sb, xmem + offset, count, 0);
+
+ pmfs_flush_edge_cachelines(pos, copied, xmem + offset);
+
+ if (likely(copied > 0)) {
+ pos += copied;
+ ret = copied;
+ }
+ /* only a total failure is -EFAULT; a partial copy is returned */
+ if (unlikely(copied != count && copied == 0))
+ ret = -EFAULT;
+ *ppos = pos;
+ inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
+ if (pos > inode->i_size) {
+ /* make sure written data is persistent before updating
+ * time and size */
+ PERSISTENT_MARK();
+ i_size_write(inode, pos);
+ PERSISTENT_BARRIER();
+ pmfs_memunlock_inode(sb, pi);
+ pmfs_update_time_and_size(inode, pi);
+ pmfs_memlock_inode(sb, pi);
+ } else {
+ u64 c_m_time;
+ /* update c_time and m_time atomically. We don't need to make the data
+ * persistent because the expectation is that the close() or an explicit
+ * fsync will do that. */
+ /* pack the 32-bit seconds value into both halves so one
+ * atomic 8-byte store updates i_ctime and i_mtime together */
+ c_m_time = (inode->i_ctime.tv_sec & 0xFFFFFFFF);
+ c_m_time = c_m_time | (c_m_time << 32);
+ pmfs_memunlock_inode(sb, pi);
+ pmfs_memcpy_atomic(&pi->i_ctime, &c_m_time, 8);
+ pmfs_memlock_inode(sb, pi);
+ }
+ pmfs_flush_buffer(pi, 1, false);
+ return ret;
+}
+
+/*
+ * If @new_blk indicates the block at file-block @block was freshly
+ * allocated, zero the whole inode-blocksize block (the write that
+ * follows will only partially cover it). @block is in units of the
+ * superblock blocksize and is converted to the inode's block size.
+ */
+static inline void pmfs_clear_edge_blk (struct super_block *sb, struct
+ pmfs_inode *pi, bool new_blk, unsigned long block)
+{
+ void *ptr;
+ unsigned long blknr;
+
+ if (new_blk) {
+ blknr = block >> (pmfs_inode_blk_shift(pi) -
+ sb->s_blocksize_bits);
+ ptr = pmfs_get_block(sb, __pmfs_find_data_block(sb, pi, blknr));
+ if (ptr != NULL) {
+ pmfs_memunlock_range(sb, ptr, pmfs_inode_blk_size(pi));
+ memset_nt(ptr, 0, pmfs_inode_blk_size(pi));
+ pmfs_memlock_range(sb, ptr, pmfs_inode_blk_size(pi));
+ }
+ }
+}
+
+/*
+ * Write entry point for XIP files. Fast path: if the write fits in one
+ * (existing) inode block, pmfs_file_write_fast() updates data and times
+ * without a transaction. Slow path: open a transaction sized for the
+ * inode plus b-tree log entries, allocate the needed blocks without
+ * zeroing, zero only the partially-written edge blocks, then copy via
+ * __pmfs_xip_file_write(). Serialized by i_mutex; sb_start_write()
+ * excludes freezes.
+ */
+ssize_t pmfs_xip_file_write(struct file *filp, const char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ struct address_space *mapping = filp->f_mapping;
+ struct inode *inode = mapping->host;
+ struct super_block *sb = inode->i_sb;
+ pmfs_transaction_t *trans;
+ struct pmfs_inode *pi;
+ ssize_t written = 0;
+ loff_t pos;
+ u64 block;
+ bool new_sblk = false, new_eblk = false;
+ size_t count, offset;
+ /* must be signed: holds -EFAULT / PTR_ERR() error codes */
+ ssize_t ret;
+ unsigned long start_blk, num_blocks, max_logentries;
+
+ sb_start_write(inode->i_sb);
+ mutex_lock(&inode->i_mutex);
+
+ if (!access_ok(VERIFY_READ, buf, len)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ pos = *ppos;
+ count = len;
+
+ /* We can write back this queue in page reclaim */
+ current->backing_dev_info = mapping->backing_dev_info;
+
+ ret = generic_write_checks(filp, &pos, &count, S_ISBLK(inode->i_mode));
+ if (ret || count == 0)
+ goto out_backing;
+
+ pi = pmfs_get_inode(sb, inode->i_ino);
+
+ offset = pos & (sb->s_blocksize - 1);
+ num_blocks = ((count + offset - 1) >> sb->s_blocksize_bits) + 1;
+ /* offset in the actual block size block */
+ offset = pos & (pmfs_inode_blk_size(pi) - 1);
+ start_blk = pos >> sb->s_blocksize_bits;
+
+ /* fast path: write stays within one already-mapped inode block */
+ if ((((count + offset - 1) >> pmfs_inode_blk_shift(pi)) == 0) &&
+ (block = pmfs_find_data_block(inode, start_blk))) {
+ ret = pmfs_file_write_fast(sb, inode, pi, buf, count, pos,
+ ppos, block);
+ goto out_backing;
+ }
+ max_logentries = num_blocks / MAX_PTRS_PER_LENTRY + 2;
+ if (max_logentries > MAX_METABLOCK_LENTRIES)
+ max_logentries = MAX_METABLOCK_LENTRIES;
+
+ trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES + max_logentries);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out_backing;
+ }
+ pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA);
+
+ ret = file_remove_suid(filp);
+ if (ret) {
+ pmfs_abort_transaction(sb, trans);
+ goto out_backing;
+ }
+ inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
+ pmfs_update_time(inode, pi);
+
+ /* We avoid zeroing the alloc'd range, which is going to be overwritten
+ * by this system call anyway */
+ if (offset != 0) {
+ if (pmfs_find_data_block(inode, start_blk) == 0)
+ new_sblk = true;
+ }
+ if (((count + offset - 1) >> pmfs_inode_blk_shift(pi)) != 0 &&
+ ((pos + count) & (pmfs_inode_blk_size(pi) - 1)) != 0) {
+ if (pmfs_find_data_block(inode, start_blk + num_blocks - 1)
+ == 0)
+ new_eblk = true;
+ }
+
+ /* don't zero-out the allocated blocks */
+ pmfs_alloc_blocks(trans, inode, start_blk, num_blocks, false);
+
+ /* now zero out the edge blocks which will be partially written */
+ pmfs_clear_edge_blk(sb, pi, new_sblk, start_blk);
+ pmfs_clear_edge_blk(sb, pi, new_eblk, start_blk + num_blocks - 1);
+
+ written = __pmfs_xip_file_write(mapping, buf, count, pos, ppos);
+ if (written < 0 || written != count)
+ pmfs_dbg_verbose("write incomplete/failed: written %ld len %ld"
+ " pos %llx start_blk %lx num_blocks %lx\n",
+ written, count, pos, start_blk, num_blocks);
+
+ pmfs_commit_transaction(sb, trans);
+ ret = written;
+out_backing:
+ current->backing_dev_info = NULL;
+out:
+ mutex_unlock(&inode->i_mutex);
+ sb_end_write(inode->i_sb);
+ return ret;
+}
+
+/* OOM err return with xip file fault handlers doesn't mean anything.
+ * It would just cause the OS to go an unnecessary killing spree !
+ */
+/* OOM err return with xip file fault handlers doesn't mean anything.
+ * It would just cause the OS to go an unnecessary killing spree !
+ */
+/*
+ * 4K fault handler: look up (allocating if needed) the PM page backing
+ * vmf->pgoff and insert its pfn directly into the page tables with
+ * vm_insert_mixed(). Faults past EOF or lookup failures get SIGBUS.
+ */
+static int __pmfs_xip_file_fault(struct vm_area_struct *vma,
+ struct vm_fault *vmf)
+{
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ struct inode *inode = mapping->host;
+ pgoff_t size;
+ void *xip_mem;
+ unsigned long xip_pfn;
+ int err;
+
+ size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ if (vmf->pgoff >= size) {
+ pmfs_dbg("[%s:%d] pgoff >= size(SIGBUS). vm_start(0x%lx),"
+ " vm_end(0x%lx), pgoff(0x%lx), VA(%lx)\n",
+ __func__, __LINE__, vma->vm_start, vma->vm_end,
+ vmf->pgoff, (unsigned long)vmf->virtual_address);
+ return VM_FAULT_SIGBUS;
+ }
+
+ err = pmfs_get_xip_mem(mapping, vmf->pgoff, 1, &xip_mem, &xip_pfn);
+ if (unlikely(err)) {
+ pmfs_dbg("[%s:%d] get_xip_mem failed(OOM). vm_start(0x%lx),"
+ " vm_end(0x%lx), pgoff(0x%lx), VA(%lx)\n",
+ __func__, __LINE__, vma->vm_start, vma->vm_end,
+ vmf->pgoff, (unsigned long)vmf->virtual_address);
+ return VM_FAULT_SIGBUS;
+ }
+
+ pmfs_dbg_mmapv("[%s:%d] vm_start(0x%lx), vm_end(0x%lx), pgoff(0x%lx), "
+ "BlockSz(0x%lx), VA(0x%lx)->PA(0x%lx)\n", __func__,
+ __LINE__, vma->vm_start, vma->vm_end, vmf->pgoff,
+ PAGE_SIZE, (unsigned long)vmf->virtual_address,
+ (unsigned long)xip_pfn << PAGE_SHIFT);
+
+ err = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, xip_pfn);
+
+ if (err == -ENOMEM)
+ return VM_FAULT_SIGBUS;
+ /*
+ * err == -EBUSY is fine, we've raced against another thread
+ * that faulted-in the same page
+ */
+ if (err != -EBUSY)
+ BUG_ON(err);
+ return VM_FAULT_NOPAGE;
+}
+
+/*
+ * ->fault handler for 4K XIP mappings. The RCU read lock guards
+ * against a concurrent truncate (see pmfs_xip_file_read()).
+ * Note: the definition previously carried a spurious `extern`;
+ * external linkage is already the default for a definition.
+ */
+int
+pmfs_xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ int ret = 0;
+
+ rcu_read_lock();
+ ret = __pmfs_xip_file_fault(vma, vmf);
+ rcu_read_unlock();
+ return ret;
+}
+
+/*
+ * Map file block @iblock to its PM block number, allocating it when
+ * @create is set. If no transaction is current, a private one is
+ * opened -- and for that the function drops the caller's RCU read
+ * lock to take i_mutex, reacquiring RCU afterwards; callers from the
+ * fault path must tolerate that window. Returns 0 with *data_block
+ * set, or -ENODATA / allocation error.
+ */
+static int pmfs_find_and_alloc_blocks(struct inode *inode, sector_t iblock,
+ sector_t *data_block, int create)
+{
+ int err = -EIO;
+ u64 block;
+ pmfs_transaction_t *trans;
+ struct pmfs_inode *pi;
+
+ block = pmfs_find_data_block(inode, iblock);
+
+ if (!block) {
+ struct super_block *sb = inode->i_sb;
+ if (!create) {
+ err = -ENODATA;
+ goto err;
+ }
+
+ pi = pmfs_get_inode(sb, inode->i_ino);
+ trans = pmfs_current_transaction();
+ if (trans) {
+ /* piggy-back on the caller's open transaction */
+ err = pmfs_alloc_blocks(trans, inode, iblock, 1, true);
+ if (err) {
+ pmfs_dbg_verbose("[%s:%d] Alloc failed!\n",
+ __func__, __LINE__);
+ goto err;
+ }
+ } else {
+ /* 1 lentry for inode, 1 lentry for inode's b-tree */
+ trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto err;
+ }
+
+ /* can't sleep in RCU; swap it for i_mutex while
+ * the allocation runs */
+ rcu_read_unlock();
+ mutex_lock(&inode->i_mutex);
+
+ pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY,
+ LE_DATA);
+ err = pmfs_alloc_blocks(trans, inode, iblock, 1, true);
+
+ pmfs_commit_transaction(sb, trans);
+
+ mutex_unlock(&inode->i_mutex);
+ rcu_read_lock();
+ if (err) {
+ pmfs_dbg_verbose("[%s:%d] Alloc failed!\n",
+ __func__, __LINE__);
+ goto err;
+ }
+ }
+ block = pmfs_find_data_block(inode, iblock);
+ if (!block) {
+ pmfs_dbg("[%s:%d] But alloc didn't fail!\n",
+ __func__, __LINE__);
+ err = -ENODATA;
+ goto err;
+ }
+ }
+ pmfs_dbg_mmapvv("iblock 0x%lx allocated_block 0x%llx\n", iblock,
+ block);
+
+ *data_block = block;
+ err = 0;
+
+err:
+ return err;
+}
+
+/* Thin pgoff_t -> sector_t adapter around pmfs_find_and_alloc_blocks(). */
+static inline int __pmfs_get_block(struct inode *inode, pgoff_t pgoff,
+ int create, sector_t *result)
+{
+ int rc = 0;
+
+ rc = pmfs_find_and_alloc_blocks(inode, (sector_t)pgoff, result,
+ create);
+ return rc;
+}
+
+/*
+ * a_ops->get_xip_mem: translate (mapping, pgoff) to the kernel virtual
+ * address and pfn of the backing PM block, allocating it when @create
+ * is set. *kmem and *pfn are valid only on a 0 return.
+ */
+int pmfs_get_xip_mem(struct address_space *mapping, pgoff_t pgoff, int create,
+ void **kmem, unsigned long *pfn)
+{
+ int rc;
+ sector_t block = 0;
+ struct inode *inode = mapping->host;
+
+ rc = __pmfs_get_block(inode, pgoff, create, &block);
+ if (rc) {
+ /* don't print *pfn here: it is an output parameter and
+ * still holds whatever (possibly uninitialized) value
+ * the caller passed in */
+ pmfs_dbg1("[%s:%d] rc(%d), sb->physaddr(0x%llx), block(0x%llx),"
+ " pgoff(0x%lx), flag(0x%x)\n", __func__,
+ __LINE__, rc, PMFS_SB(inode->i_sb)->phys_addr,
+ block, pgoff, create);
+ return rc;
+ }
+
+ *kmem = pmfs_get_block(inode->i_sb, block);
+ *pfn = pmfs_get_pfn(inode->i_sb, block);
+
+ pmfs_dbg_mmapvv("[%s:%d] sb->physaddr(0x%llx), block(0x%lx),"
+ " pgoff(0x%lx), flag(0x%x), PFN(0x%lx)\n", __func__, __LINE__,
+ PMFS_SB(inode->i_sb)->phys_addr, block, pgoff, create, *pfn);
+ return 0;
+}
+
+/*
+ * Pick the largest page size usable to map @addr: PUD_SIZE (1G) when
+ * the CPU supports gbpages, the inode uses 1G blocks, and the aligned
+ * 1G region fits wholly inside the VMA; else PMD_SIZE (2M) under the
+ * analogous conditions; else PAGE_SIZE. Returns -EFAULT if @addr lies
+ * outside the VMA.
+ */
+unsigned long pmfs_data_block_size(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long pgoff)
+{
+ struct file *file = vma->vm_file;
+ struct inode *inode = file->f_mapping->host;
+ struct pmfs_inode *pi;
+ unsigned long map_virt;
+
+ if (addr < vma->vm_start || addr >= vma->vm_end)
+ return -EFAULT;
+
+ pi = pmfs_get_inode(inode->i_sb, inode->i_ino);
+
+ map_virt = addr & PUD_MASK;
+
+ if (!cpu_has_gbpages || pi->i_blk_type != PMFS_BLOCK_TYPE_1G ||
+ (vma->vm_start & ~PUD_MASK) ||
+ map_virt < vma->vm_start ||
+ (map_virt + PUD_SIZE) > vma->vm_end)
+ goto use_2M_mappings;
+
+ pmfs_dbg_mmapv("[%s:%d] Using 1G Mappings : "
+ "vma_start(0x%lx), vma_end(0x%lx), file_pgoff(0x%lx), "
+ "VA(0x%lx), MAP_VA(%lx)\n", __func__, __LINE__,
+ vma->vm_start, vma->vm_end, pgoff, addr, map_virt);
+ return PUD_SIZE;
+
+use_2M_mappings:
+ map_virt = addr & PMD_MASK;
+
+ if (!cpu_has_pse || pi->i_blk_type != PMFS_BLOCK_TYPE_2M ||
+ (vma->vm_start & ~PMD_MASK) ||
+ map_virt < vma->vm_start ||
+ (map_virt + PMD_SIZE) > vma->vm_end)
+ goto use_4K_mappings;
+
+ pmfs_dbg_mmapv("[%s:%d] Using 2M Mappings : "
+ "vma_start(0x%lx), vma_end(0x%lx), file_pgoff(0x%lx), "
+ "VA(0x%lx), MAP_VA(%lx)\n", __func__, __LINE__,
+ vma->vm_start, vma->vm_end, pgoff, addr, map_virt);
+
+ return PMD_SIZE;
+
+use_4K_mappings:
+ pmfs_dbg_mmapvv("[%s:%d] 4K Mappings : "
+ "vma_start(0x%lx), vma_end(0x%lx), file_pgoff(0x%lx), "
+ "VA(0x%lx)\n", __func__, __LINE__,
+ vma->vm_start, vma->vm_end, pgoff, addr);
+
+ return PAGE_SIZE;
+}
+
+/* Wrapper around the page-size-aware pte lookup added in mm/memory.c. */
+static inline pte_t *pmfs_xip_hugetlb_pte_offset(struct mm_struct *mm,
+ unsigned long addr,
+ unsigned long *sz)
+{
+ return pte_offset_pagesz(mm, addr, sz);
+}
+
+/* Wrapper around the page-size-aware pte allocation (see mm/memory.c). */
+static inline pte_t *pmfs_pte_alloc(struct mm_struct *mm,
+ unsigned long addr, unsigned long sz)
+{
+ return pte_alloc_pagesz(mm, addr, sz);
+}
+
+/*
+ * Build a special, young pte for @pfn: writable+dirty when @writable,
+ * else write-protected; marked huge when @sz is PMD_SIZE or PUD_SIZE
+ * (any other non-4K size is a bug).
+ */
+static pte_t pmfs_make_huge_pte(struct vm_area_struct *vma,
+ unsigned long pfn, unsigned long sz,
+ int writable)
+{
+ pte_t entry;
+
+ if (writable)
+ entry = pte_mkwrite(pte_mkdirty(pfn_pte(pfn, vma->vm_page_prot)));
+ else
+ entry = pte_wrprotect(pfn_pte(pfn, vma->vm_page_prot));
+
+ /* special: no struct page backs PM, so the VM must not touch it */
+ entry = pte_mkspecial(pte_mkyoung(entry));
+
+ if (sz != PAGE_SIZE) {
+ BUG_ON(sz != PMD_SIZE && sz != PUD_SIZE);
+ entry = pte_mkhuge(entry);
+ }
+
+ return entry;
+}
+
+/*
+ * Huge-page fault handler: determine the mapping size (2M/1G) for the
+ * faulting address, allocate the page-table slot at that level, and --
+ * under a global instantiation mutex to avoid duplicate allocation
+ * races -- install a huge pte pointing at the PM region aligned to the
+ * block size. Faults past EOF get SIGBUS.
+ */
+static int __pmfs_xip_file_hpage_fault(struct vm_area_struct *vma,
+ struct vm_fault *vmf)
+{
+ int ret;
+ pte_t *ptep, new_pte;
+ unsigned long size, block_sz;
+ struct mm_struct *mm = vma->vm_mm;
+ struct inode *inode = vma->vm_file->f_mapping->host;
+ unsigned long address = (unsigned long)vmf->virtual_address;
+
+ static DEFINE_MUTEX(pmfs_instantiation_mutex);
+
+ size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
+ if (vmf->pgoff >= size) {
+ pmfs_dbg("[%s:%d] pgoff >= size(SIGBUS). vm_start(0x%lx),"
+ " vm_end(0x%lx), pgoff(0x%lx), VA(%lx)\n",
+ __func__, __LINE__, vma->vm_start, vma->vm_end,
+ vmf->pgoff, (unsigned long)vmf->virtual_address);
+ return VM_FAULT_SIGBUS;
+ }
+
+ block_sz = pmfs_data_block_size(vma, address, vmf->pgoff);
+ /* align the VA down to the mapping size; this handler is only
+ * installed for 2M/1G-capable VMAs, so 4K here is a bug */
+ address &= ~(block_sz - 1);
+ BUG_ON(block_sz == PAGE_SIZE);
+ pmfs_dbg_mmapvv("[%s:%d] BlockSz : %lx",
+ __func__, __LINE__, block_sz);
+
+ ptep = pmfs_pte_alloc(mm, address, block_sz);
+ if (!ptep) {
+ pmfs_dbg("[%s:%d] pmfs_pte_alloc failed(OOM). vm_start(0x%lx),"
+ " vm_end(0x%lx), pgoff(0x%lx), VA(%lx)\n",
+ __func__, __LINE__, vma->vm_start, vma->vm_end,
+ vmf->pgoff, (unsigned long)vmf->virtual_address);
+ return VM_FAULT_SIGBUS;
+ }
+
+ /* Serialize hugepage allocation and instantiation, so that we don't
+ * get spurious allocation failures if two CPUs race to instantiate
+ * the same page in the page cache.
+ */
+ mutex_lock(&pmfs_instantiation_mutex);
+ if (pte_none(*ptep)) {
+ void *xip_mem;
+ unsigned long xip_pfn;
+ if (pmfs_get_xip_mem(vma->vm_file->f_mapping, vmf->pgoff, 1,
+ &xip_mem, &xip_pfn) != 0) {
+ pmfs_dbg("[%s:%d] get_xip_mem failed(OOM). vm_start(0x"
+ "%lx), vm_end(0x%lx), pgoff(0x%lx), VA(%lx)\n",
+ __func__, __LINE__, vma->vm_start,
+ vma->vm_end, vmf->pgoff,
+ (unsigned long)vmf->virtual_address);
+ ret = VM_FAULT_SIGBUS;
+ goto out_mutex;
+ }
+
+ /* VA has already been aligned. Align xip_pfn to block_sz. */
+ xip_pfn <<= PAGE_SHIFT;
+ xip_pfn &= ~(block_sz - 1);
+ xip_pfn >>= PAGE_SHIFT;
+ new_pte = pmfs_make_huge_pte(vma, xip_pfn, block_sz,
+ ((vma->vm_flags & VM_WRITE) &&
+ (vma->vm_flags & VM_SHARED)));
+ /* FIXME: Is lock necessary ? */
+ spin_lock(&mm->page_table_lock);
+ set_pte_at(mm, address, ptep, new_pte);
+ spin_unlock(&mm->page_table_lock);
+
+ if (ptep_set_access_flags(vma, address, ptep, new_pte,
+ vmf->flags & FAULT_FLAG_WRITE))
+ update_mmu_cache(vma, address, ptep);
+ }
+ ret = VM_FAULT_NOPAGE;
+
+out_mutex:
+ mutex_unlock(&pmfs_instantiation_mutex);
+ return ret;
+}
+
+/*
+ * ->fault handler for huge-page XIP mappings; RCU guards against a
+ * concurrent truncate, as in the 4K path.
+ */
+int pmfs_xip_file_hpage_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ int ret = 0;
+
+ rcu_read_lock();
+ ret = __pmfs_xip_file_hpage_fault(vma, vmf);
+ rcu_read_unlock();
+ return ret;
+}
+
+/* vm_ops for 4K XIP mappings. */
+static const struct vm_operations_struct pmfs_xip_vm_ops = {
+ .fault = pmfs_xip_file_fault,
+};
+
+/* vm_ops for 2M/1G huge-page XIP mappings. */
+static const struct vm_operations_struct pmfs_xip_hpage_vm_ops = {
+ .fault = pmfs_xip_file_hpage_fault,
+};
+
+/* True when the hugemmap mount option is set. */
+static inline int pmfs_has_huge_mmap(struct super_block *sb)
+{
+ struct pmfs_sb_info *sbi = (struct pmfs_sb_info *)sb->s_fs_info;
+
+ return sbi->s_mount_opt & PMFS_MOUNT_HUGEMMAP;
+}
+
+/*
+ * ->mmap for XIP files: select huge-page vm_ops when the mount allows
+ * it, the mapping is shared, and the inode's block size supports 2M or
+ * 1G mappings at this VMA; otherwise fall back to 4K vm_ops.
+ */
+int pmfs_xip_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ unsigned long block_sz;
+
+ BUG_ON(!file->f_mapping->a_ops->get_xip_mem);
+
+ file_accessed(file);
+
+ /* pfns without struct page => mixed map */
+ vma->vm_flags |= VM_MIXEDMAP;
+
+ block_sz = pmfs_data_block_size(vma, vma->vm_start, 0);
+ if (pmfs_has_huge_mmap(file->f_mapping->host->i_sb) &&
+ (vma->vm_flags & VM_SHARED) &&
+ (block_sz == PUD_SIZE || block_sz == PMD_SIZE)) {
+ /* vma->vm_flags |= (VM_XIP_HUGETLB | VM_SHARED | VM_DONTCOPY); */
+ vma->vm_flags |= VM_XIP_HUGETLB;
+ vma->vm_ops = &pmfs_xip_hpage_vm_ops;
+ pmfs_dbg_mmaphuge("[%s:%d] MMAP HUGEPAGE vm_start(0x%lx),"
+ " vm_end(0x%lx), vm_flags(0x%lx), "
+ "vm_page_prot(0x%lx)\n", __func__,
+ __LINE__, vma->vm_start, vma->vm_end, vma->vm_flags,
+ pgprot_val(vma->vm_page_prot));
+ } else {
+ vma->vm_ops = &pmfs_xip_vm_ops;
+ pmfs_dbg_mmap4k("[%s:%d] MMAP 4KPAGE vm_start(0x%lx),"
+ " vm_end(0x%lx), vm_flags(0x%lx), "
+ "vm_page_prot(0x%lx)\n", __func__,
+ __LINE__, vma->vm_start, vma->vm_end,
+ vma->vm_flags, pgprot_val(vma->vm_page_prot));
+ }
+
+ return 0;
+}
diff --git a/fs/pmfs/xip.h b/fs/pmfs/xip.h
new file mode 100644
index 0000000..3bd9306
--- /dev/null
+++ b/fs/pmfs/xip.h
@@ -0,0 +1,28 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * XIP operations.
+ *
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.stornelli at gmail.com>
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+int pmfs_get_xip_mem(struct address_space *, pgoff_t, int, void **,
+ unsigned long *);
+ssize_t pmfs_xip_file_read(struct file *filp, char __user *buf, size_t len,
+ loff_t *ppos);
+ssize_t pmfs_xip_file_write(struct file *filp, const char __user *buf,
+ size_t len, loff_t *ppos);
+int pmfs_xip_file_mmap(struct file *file, struct vm_area_struct *vma);
+
+/* True when the xip mount option is set. */
+static inline int pmfs_use_xip(struct super_block *sb)
+{
+ struct pmfs_sb_info *sbi = PMFS_SB(sb);
+
+ return sbi->s_mount_opt & PMFS_MOUNT_XIP;
+}
+
+#define mapping_is_xip(map) (map->a_ops->get_xip_mem)
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index bfd8768..7a62889 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -274,6 +274,10 @@ static inline int pud_none_or_clear_bad(pud_t *pud)
{
if (pud_none(*pud))
return 1;
+ if (unlikely(pud_large(*pud))) {
+ pud_clear(pud);
+ return 1;
+ }
if (unlikely(pud_bad(*pud))) {
pud_clear_bad(pud);
return 1;
@@ -285,6 +289,10 @@ static inline int pmd_none_or_clear_bad(pmd_t *pmd)
{
if (pmd_none(*pmd))
return 1;
+ if (unlikely(pmd_large(*pmd))) {
+ pmd_clear(pmd);
+ return 1;
+ }
if (unlikely(pmd_bad(*pmd))) {
pmd_clear_bad(pmd);
return 1;
diff --git a/include/linux/io.h b/include/linux/io.h
index 069e407..db3de04 100644
--- a/include/linux/io.h
+++ b/include/linux/io.h
@@ -38,6 +38,16 @@ static inline int ioremap_page_range(unsigned long addr, unsigned long end,
}
#endif
+#ifdef CONFIG_MMU
+int ioremap_hpage_range(unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot);
+#else
+static inline int ioremap_hpage_range(unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot)
+{
+ return 0;
+}
+#endif
/*
* Managed iomap interface
*/
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e19ff30..f7a1aa7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -84,6 +84,7 @@ extern unsigned int kobjsize(const void *objp);
#define VM_MAYSHARE 0x00000080
#define VM_GROWSDOWN 0x00000100 /* general info on the segment */
+#define VM_XIP_HUGETLB 0x00000200
#define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */
#define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */
@@ -96,6 +97,7 @@ extern unsigned int kobjsize(const void *objp);
#define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */
#define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */
+#define VM_PFN_AT_MMAP 0x00080000 /* PFNMAP vma that is fully mapped at mmap time */
#define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */
#define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */
#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */
@@ -165,6 +167,11 @@ extern pgprot_t protection_map[16];
#define FAULT_FLAG_KILLABLE 0x20 /* The fault task is in SIGKILL killable region */
#define FAULT_FLAG_TRIED 0x40 /* second try */
+/* True for VMAs created by PMFS's huge-page XIP mmap path. */
+static inline int is_xip_hugetlb_mapping(struct vm_area_struct *vma)
+{
+ return !!(vma->vm_flags & VM_XIP_HUGETLB);
+}
+
/*
* vm_fault is filled by the the pagefault handler and passed to the vma's
* ->fault function. The vma's ->fault is responsible for returning a bitmask
@@ -1010,6 +1017,14 @@ static inline int fixup_user_fault(struct task_struct *tsk,
}
#endif
+extern pte_t *pte_alloc_pagesz(struct mm_struct *mm, unsigned long addr,
+ unsigned long sz);
+extern pte_t *pte_offset_pagesz(struct mm_struct *mm, unsigned long addr,
+ unsigned long *sz);
+extern void unmap_xip_hugetlb_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end);
+
+
extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
void *buf, int len, int write);
diff --git a/include/linux/pmfs_def.h b/include/linux/pmfs_def.h
new file mode 100644
index 0000000..e52741e
--- /dev/null
+++ b/include/linux/pmfs_def.h
@@ -0,0 +1,206 @@
+/*
+ * FILE NAME include/linux/pmfs_def.h
+ *
+ * BRIEF DESCRIPTION
+ *
+ * Definitions for the PMFS filesystem.
+ *
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.stornelli at gmail.com>
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+#ifndef _LINUX_PMFS_DEF_H
+#define _LINUX_PMFS_DEF_H
+
+#include <linux/types.h>
+#include <linux/magic.h>
+
+/*
+ * The PMFS filesystem constants/structures
+ */
+
+/*
+ * Mount flags
+ */
+#define PMFS_MOUNT_PROTECT 0x000001 /* wprotect CR0.WP */
+#define PMFS_MOUNT_XATTR_USER 0x000002 /* Extended user attributes */
+#define PMFS_MOUNT_POSIX_ACL 0x000004 /* POSIX Access Control Lists */
+#define PMFS_MOUNT_XIP 0x000008 /* Execute in place */
+#define PMFS_MOUNT_ERRORS_CONT 0x000010 /* Continue on errors */
+#define PMFS_MOUNT_ERRORS_RO 0x000020 /* Remount fs ro on errors */
+#define PMFS_MOUNT_ERRORS_PANIC 0x000040 /* Panic on errors */
+#define PMFS_MOUNT_HUGEMMAP 0x000080 /* Huge mappings with mmap */
+#define PMFS_MOUNT_HUGEIOREMAP 0x000100 /* Huge mappings with ioremap */
+#define PMFS_MOUNT_PROTECT_OLD 0x000200 /* wprotect PAGE RW Bit */
+#define PMFS_MOUNT_FORMAT 0x000400 /* was FS formatted on mount? */
+#define PMFS_MOUNT_MOUNTING 0x000800 /* FS currently being mounted */
+
+/*
+ * Maximal count of links to a file
+ */
+#define PMFS_LINK_MAX 32000
+
+#define PMFS_DEF_BLOCK_SIZE_4K 4096
+
+#define PMFS_INODE_SIZE 128 /* must be power of two */
+#define PMFS_INODE_BITS 7
+
+#define PMFS_NAME_LEN 255
+/*
+ * Structure of a directory entry in PMFS.
+ */
+struct pmfs_direntry {
+ __le64 ino; /* inode no pointed to by this entry */
+ __le16 de_len; /* length of this directory entry */
+ u8 name_len; /* length of the directory entry name */
+ u8 file_type; /* file type */
+ char name[PMFS_NAME_LEN]; /* File name */
+};
+
+#define PMFS_DIR_PAD 4
+#define PMFS_DIR_ROUND (PMFS_DIR_PAD - 1)
+#define PMFS_DIR_REC_LEN(name_len) (((name_len) + 12 + PMFS_DIR_ROUND) & \
+ ~PMFS_DIR_ROUND)
+
+/* PMFS supported data blocks */
+#define PMFS_BLOCK_TYPE_4K 0
+#define PMFS_BLOCK_TYPE_2M 1
+#define PMFS_BLOCK_TYPE_1G 2
+#define PMFS_BLOCK_TYPE_MAX 3
+
+#define META_BLK_SHIFT 9
+
+/*
+ * Play with this knob to change the default block type.
+ * By changing the PMFS_DEFAULT_BLOCK_TYPE to 2M or 1G,
+ * we should get pretty good coverage in testing.
+ */
+#define PMFS_DEFAULT_BLOCK_TYPE PMFS_BLOCK_TYPE_4K
+
+/*
+ * Structure of an inode in PMFS. Things to keep in mind when modifying it.
+ * 1) Keep the inode size to within 96 bytes if possible. This is because
+ * a 64 byte log-entry can store 48 bytes of data and we would like
+ * to log an inode using only 2 log-entries
+ * 2) root must be immediately after the qw containing height because we update
+ * root and height atomically using cmpxchg16b in pmfs_decrease_btree_height
+ * 3) i_size, i_ctime, and i_mtime must be in that order and i_size must be at
+ * 16 byte aligned offset from the start of the inode. We use cmpxchg16b to
+ * update these three fields atomically.
+ */
+struct pmfs_inode {
+ /* first 48 bytes */
+ __le16 i_rsvd; /* reserved. used to be checksum */
+ u8 height; /* height of data b-tree; max 3 for now */
+ u8 i_blk_type; /* data block size this inode uses */
+ __le32 i_flags; /* Inode flags */
+ __le64 root; /* btree root. must be below qw w/ height */
+ __le64 i_size; /* Size of data in bytes */
+ __le32 i_ctime; /* Inode modification time */
+ __le32 i_mtime; /* Inode b-tree Modification time */
+ __le32 i_dtime; /* Deletion Time */
+ __le16 i_mode; /* File mode */
+ __le16 i_links_count; /* Links count */
+ __le64 i_blocks; /* Blocks count */
+
+ /* second 48 bytes */
+ __le64 i_xattr; /* Extended attribute block */
+ __le32 i_uid; /* Owner Uid */
+ __le32 i_gid; /* Group Id */
+ __le32 i_generation; /* File version (for NFS) */
+ __le32 i_atime; /* Access time */
+
+ struct {
+ __le32 rdev; /* major/minor # */
+ } dev; /* device inode */
+ __le32 padding; /* pad to ensure truncate_item starts 8-byte aligned */
+};
+
+/* This is a per-inode structure and follows immediately after the
+ * struct pmfs_inode. It is used to implement the truncate linked list and is
+ * by pmfs_truncate_add(), pmfs_truncate_del(), and pmfs_recover_truncate_list()
+ * functions to manage the truncate list */
+struct pmfs_inode_truncate_item {
+ __le64 i_truncatesize; /* Size of truncated inode */
+ __le64 i_next_truncate; /* inode num of the next truncated inode */
+};
+
+/*
+ * #define PMFS_NAME_LEN (PMFS_INODE_SIZE - offsetof(struct pmfs_inode,
+ * i_d.d_name) - 1)
+ */
+
+/* #define PMFS_SB_SIZE 128 */ /* must be power of two */
+#define PMFS_SB_SIZE 512 /* must be power of two */
+
+typedef struct pmfs_journal {
+ __le64 base;
+ __le32 size;
+ __le32 head;
+ /* the next three fields must be in the same order and together.
+ * tail and gen_id must fall in the same 8-byte quadword */
+ __le32 tail;
+ __le16 gen_id; /* generation id of the log */
+ __le16 pad;
+ __le16 redo_logging;
+} pmfs_journal_t;
+
+
+/*
+ * Structure of the super block in PMFS
+ * The fields are partitioned into static and dynamic fields. The static fields
+ * never change after file system creation. This was primarily done because
+ * pmfs_get_block() returns NULL if the block offset is 0 (helps in catching
+ * bugs). So if we modify any field using journaling (for consistency), we
+ * will have to modify s_sum which is at offset 0. So journaling code fails.
+ * This (static+dynamic fields) is a temporary solution and can be avoided
+ * once the file system becomes stable and pmfs_get_block() returns correct
+ * pointers even for offset 0.
+ */
+struct pmfs_super_block {
+ /* static fields. they never change after file system creation.
+ * checksum only validates up to the s_start_dynamic field below */
+ __le16 s_sum; /* checksum of this sb */
+ __le16 s_magic; /* magic signature */
+ __le32 s_blocksize; /* blocksize in bytes */
+ __le64 s_size; /* total size of fs in bytes */
+ char s_volume_name[16]; /* volume name */
+ /* points to the location of pmfs_journal_t */
+ __le64 s_journal_offset;
+ /* points to the location of struct pmfs_inode for the inode table */
+ __le64 s_inode_table_offset;
+
+ __le64 s_start_dynamic;
+
+ /* all the dynamic fields should go here */
+ /* s_mtime and s_wtime should be together and their order should not be
+ * changed. we use an 8 byte write to update both of them atomically */
+ __le32 s_mtime; /* mount time */
+ __le32 s_wtime; /* write time */
+ /* fields for fast mount support. Always keep them together */
+ __le64 s_num_blocknode_allocated;
+ __le64 s_num_free_blocks;
+ __le32 s_inodes_count;
+ __le32 s_free_inodes_count;
+ __le32 s_inodes_used_count;
+ __le32 s_free_inode_hint;
+};
+
+#define PMFS_SB_STATIC_SIZE(ps) ((u64)&ps->s_start_dynamic - (u64)ps)
+
+/* the above fast mount fields total 32 bytes; note the macro below is 36 -- verify */
+#define PMFS_FAST_MOUNT_FIELD_SIZE (36)
+
+/* The root inode follows immediately after the redundant super block */
+#define PMFS_ROOT_INO (PMFS_INODE_SIZE)
+#define PMFS_BLOCKNODE_IN0 (PMFS_ROOT_INO + PMFS_INODE_SIZE)
+
+/* INODE HINT START at 3 */
+#define PMFS_FREE_INODE_HINT_START (3)
+
+#endif /* _LINUX_PMFS_DEF_H */
diff --git a/include/linux/pmfs_sb.h b/include/linux/pmfs_sb.h
new file mode 100644
index 0000000..d483f2c
--- /dev/null
+++ b/include/linux/pmfs_sb.h
@@ -0,0 +1,83 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Definitions for the PMFS.
+ *
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.stornelli at gmail.com>
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#ifndef _LINUX_PMFS_SB
+#define _LINUX_PMFS_SB
+
+/*
+ * PMFS super-block data in memory
+ */
+struct pmfs_sb_info {
+ /*
+ * base physical and virtual address of PMFS (which is also
+ * the pointer to the super block)
+ */
+ phys_addr_t phys_addr;
+ void *virt_addr;
+ struct list_head block_inuse_head;
+ unsigned long block_start;
+ unsigned long block_end;
+ unsigned long num_free_blocks;
+ char pmfs_backing_file[256];
+ struct mutex s_lock; /* protects the SB's buffer-head */
+
+ /*
+ * Backing store option:
+ * 1 = no load, 2 = no store,
+ * else do both
+ */
+ unsigned int pmfs_backing_option;
+
+ /* Mount options */
+ unsigned long bpi;
+ unsigned long num_inodes;
+ unsigned long blocksize;
+ unsigned long initsize;
+ unsigned long s_mount_opt;
+ uid_t uid; /* Mount uid for root directory */
+ gid_t gid; /* Mount gid for root directory */
+ umode_t mode; /* Mount mode for root directory */
+ atomic_t next_generation;
+ /* inode tracking */
+ struct mutex inode_table_mutex;
+ unsigned int s_inodes_count; /* total inodes count (used or free) */
+ unsigned int s_free_inodes_count; /* free inodes count */
+ unsigned int s_inodes_used_count;
+ unsigned int s_free_inode_hint;
+
+ /* temp bitmap space */
+ unsigned long num_blocknode_allocated;
+ unsigned long bitmap_4k_size;
+ unsigned long bitmap_2M_size;
+ unsigned long bitmap_1G_size;
+ unsigned long *bitmap_4k;
+ unsigned long *bitmap_2M;
+ unsigned long *bitmap_1G;
+
+ /* Journaling related structures */
+ uint32_t next_transaction_id;
+ uint32_t jsize;
+ void *journal_base_addr;
+ struct mutex journal_mutex;
+ struct task_struct *log_cleaner_thread;
+ wait_queue_head_t log_cleaner_wait;
+ bool redo_log;
+
+ /* truncate list related structures */
+ struct list_head s_truncate;
+ struct mutex s_truncate_lock;
+};
+
+#endif /* _LINUX_PMFS_SB */
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 6071e91..d6cd080 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -21,7 +21,7 @@ struct vm_area_struct; /* vma defining user mapping in mm_types.h */
* Can be overriden by arch-specific value.
*/
#ifndef IOREMAP_MAX_ORDER
-#define IOREMAP_MAX_ORDER (7 + PAGE_SHIFT) /* 128 pages */
+#define IOREMAP_MAX_ORDER (PUD_SHIFT) /* 1G pages */
#endif
struct vm_struct {
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index 873e086..a070c14 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -64,6 +64,7 @@
#define FUTEXFS_SUPER_MAGIC 0xBAD1DEA
#define PIPEFS_MAGIC 0x50495045
#define PROC_SUPER_MAGIC 0x9fa0
+#define PMFS_SUPER_MAGIC 0xEFFC
#define SOCKFS_MAGIC 0x534F434B
#define SYSFS_MAGIC 0x62656572
#define USBDEVICE_SUPER_MAGIC 0x9fa2
diff --git a/lib/ioremap.c b/lib/ioremap.c
index 0c9216c..8eeaa60 100644
--- a/lib/ioremap.c
+++ b/lib/ioremap.c
@@ -12,6 +12,7 @@
#include <linux/export.h>
#include <asm/cacheflush.h>
#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
static int ioremap_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
@@ -32,37 +33,104 @@ static int ioremap_pte_range(pmd_t *pmd, unsigned long addr,
}
static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
+ unsigned long end, phys_addr_t phys_addr, pgprot_t prot, int hpages)
{
- pmd_t *pmd;
+ pmd_t *pmd_page, *pmd;
unsigned long next;
phys_addr -= addr;
- pmd = pmd_alloc(&init_mm, pud, addr);
- if (!pmd)
+ pmd_page = pmd_alloc(&init_mm, pud, addr);
+ if (!pmd_page)
return -ENOMEM;
+
+ if (hpages)
+ {
+ printk (KERN_INFO "PMD_MAPPING (START) [%s,%d]"
+ " VA START(0x%lx), VA END(0x%lx), "
+ "PA(0x%lx), SIZE(0x%lx)\n", __FUNCTION__, __LINE__,
+ addr, end, (unsigned long)(phys_addr+addr), (end-addr));
+
+ }
+
+ pmd = pmd_page;
do {
next = pmd_addr_end(addr, end);
- if (ioremap_pte_range(pmd, addr, next, phys_addr + addr, prot))
- return -ENOMEM;
+ if (hpages && cpu_has_pse && ((next-addr)>=PMD_SIZE))
+ {
+ u64 pfn = ((u64)(phys_addr + addr)) >> PAGE_SHIFT;
+ prot = __pgprot((unsigned long)prot.pgprot | _PAGE_PSE);
+
+ if ((s64)pfn < 0)
+ {
+ printk (KERN_INFO "MAPPING ERROR [%s, %d] : phys_addr(0x%lx)"
+ "addr(0x%lx), next(0x%lx), end(0x%lx),"
+ "pfn(0x%lx)\n", __FUNCTION__, __LINE__,
+ (unsigned long)phys_addr,
+ (unsigned long)addr, (unsigned long)next,
+ (unsigned long)end, (unsigned long)pfn);
+ return -ENOMEM;
+ }
+
+ spin_lock(&init_mm.page_table_lock);
+ set_pte((pte_t *)pmd, pfn_pte(pfn, prot));
+ spin_unlock(&init_mm.page_table_lock);
+ }
+ else
+ {
+ if (ioremap_pte_range(pmd, addr, next, phys_addr + addr, prot))
+ return -ENOMEM;
+ }
} while (pmd++, addr = next, addr != end);
return 0;
}
static inline int ioremap_pud_range(pgd_t *pgd, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
+ unsigned long end, phys_addr_t phys_addr, pgprot_t prot, int hpages)
{
- pud_t *pud;
+ pud_t *pud_page, *pud;
unsigned long next;
phys_addr -= addr;
- pud = pud_alloc(&init_mm, pgd, addr);
- if (!pud)
+ pud_page = pud_alloc(&init_mm, pgd, addr);
+ if (!pud_page)
return -ENOMEM;
+
+ if (hpages)
+ {
+ printk (KERN_INFO "PUD_MAPPING (START) [%s,%d]"
+ " VA START(0x%lx), VA END(0x%lx), "
+ "PA(0x%lx), SIZE(0x%lx)\n", __FUNCTION__, __LINE__,
+ addr, end, (unsigned long)(phys_addr+addr), (end-addr));
+ }
+
+ pud = pud_page;
do {
next = pud_addr_end(addr, end);
- if (ioremap_pmd_range(pud, addr, next, phys_addr + addr, prot))
- return -ENOMEM;
+ if (hpages && cpu_has_gbpages && ((next-addr)>=PUD_SIZE))
+ {
+ u64 pfn = ((u64)(phys_addr + addr)) >> PAGE_SHIFT;
+ prot = __pgprot((unsigned long)prot.pgprot | _PAGE_PSE);
+ if ((s64)pfn < 0)
+ {
+ printk (KERN_INFO "MAPPING ERROR [%s, %d] : phys_addr(0x%lx)"
+ "addr(0x%lx), next(0x%lx), end(0x%lx),"
+ "pfn(0x%lx)\n", __FUNCTION__, __LINE__,
+ (unsigned long)phys_addr,
+ (unsigned long)addr, (unsigned long)next,
+ (unsigned long)end, (unsigned long)pfn);
+ return -ENOMEM;
+ }
+
+ spin_lock(&init_mm.page_table_lock);
+ set_pte((pte_t *)pud, pfn_pte(pfn, prot));
+ spin_unlock(&init_mm.page_table_lock);
+ }
+ else
+ {
+ if (ioremap_pmd_range(pud, addr, next, phys_addr + addr,
+ prot, hpages))
+ return -ENOMEM;
+ }
} while (pud++, addr = next, addr != end);
return 0;
}
@@ -82,7 +150,7 @@ int ioremap_page_range(unsigned long addr,
pgd = pgd_offset_k(addr);
do {
next = pgd_addr_end(addr, end);
- err = ioremap_pud_range(pgd, addr, next, phys_addr+addr, prot);
+ err = ioremap_pud_range(pgd, addr, next, phys_addr+addr, prot, 0);
if (err)
break;
} while (pgd++, addr = next, addr != end);
@@ -92,3 +160,35 @@ int ioremap_page_range(unsigned long addr,
return err;
}
EXPORT_SYMBOL_GPL(ioremap_page_range);
+
+int ioremap_hpage_range(unsigned long addr,
+ unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
+{
+ pgd_t *pgd;
+ unsigned long start;
+ unsigned long next;
+ int err;
+
+ BUG_ON(addr >= end);
+
+ printk (KERN_INFO "[%s,%d] hpages ON; startVA(0x%lx), endVA(0x%lx), "
+ "startPA(0x%lx), startPFN(0x%lx)\n", __FUNCTION__, __LINE__,
+ addr, end, (unsigned long)phys_addr,
+ (unsigned long)phys_addr >> PAGE_SHIFT);
+
+ start = addr;
+ phys_addr -= addr;
+ pgd = pgd_offset_k(addr);
+
+ do {
+ next = pgd_addr_end(addr, end);
+ err = ioremap_pud_range(pgd, addr, next, phys_addr+addr, prot, 1);
+ if (err)
+ break;
+ } while (pgd++, addr = next, addr != end);
+
+ flush_cache_vmap(start, end);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(ioremap_hpage_range);
diff --git a/mm/madvise.c b/mm/madvise.c
index c58c94b..11bba93 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -511,6 +511,12 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
if (!vma)
goto out_plug;
+ /* madvise not supported with XIP_HUGETLB */
+ if (is_xip_hugetlb_mapping(vma)) {
+ error = -EINVAL;
+ goto out;
+ }
+
/* Here start < (end|vma->vm_end). */
if (start < vma->vm_start) {
unmapped_error = -ENOMEM;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2b55222..8f49564 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6480,7 +6480,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
.mm = mm,
.private = vma,
};
- if (is_vm_hugetlb_page(vma))
+ if (is_vm_hugetlb_page(vma) || is_xip_hugetlb_mapping(vma))
continue;
walk_page_range(vma->vm_start, vma->vm_end,
&mem_cgroup_count_precharge_walk);
@@ -6743,7 +6743,7 @@ retry:
.mm = mm,
.private = vma,
};
- if (is_vm_hugetlb_page(vma))
+ if (is_vm_hugetlb_page(vma) || is_xip_hugetlb_mapping(vma))
continue;
ret = walk_page_range(vma->vm_start, vma->vm_end,
&mem_cgroup_move_charge_walk);
diff --git a/mm/memory.c b/mm/memory.c
index 13cbc42..5a8bd23 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1054,6 +1054,10 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
return 0;
}
+ /* FIXME: For now, don't copy ptes and let it fault. */
+ if (is_xip_hugetlb_mapping(vma))
+ return 0;
+
if (is_vm_hugetlb_page(vma))
return copy_hugetlb_page_range(dst_mm, src_mm, vma);
@@ -1353,6 +1357,9 @@ static void unmap_single_vma(struct mmu_gather *tlb,
__unmap_hugepage_range_final(tlb, vma, start, end, NULL);
mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
}
+ } else if (is_xip_hugetlb_mapping(vma)) {
+ unmap_xip_hugetlb_range(vma, start, end);
+ start = end;
} else
unmap_page_range(tlb, vma, start, end, details);
}
@@ -1650,6 +1657,54 @@ static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long add
stack_guard_page_end(vma, addr+PAGE_SIZE);
}
+/* FIXME : Move it to the right place ! */
+static int follow_xip_hugetlb_page(
+ struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long *position, int *length, int i, unsigned int flags)
+{
+ unsigned long vaddr = *position;
+ int remainder = *length;
+
+ while (vaddr < vma->vm_end && remainder) {
+ int err, absent;
+ pte_t *pte;
+ unsigned long size;
+ struct vm_fault vmf;
+
+ pte = pte_offset_pagesz(mm, vaddr, &size);
+ absent = !pte || pte_none(*pte);
+
+ /* populate an entry */
+ if (absent || ((flags & FOLL_WRITE) && !pte_write(*pte))) {
+ vmf.virtual_address = (void __user *)(vaddr & PAGE_MASK);
+ vmf.pgoff = (((vaddr & PAGE_MASK) - vma->vm_start) >> PAGE_SHIFT)
+ + vma->vm_pgoff;
+ vmf.flags = (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0;
+ vmf.page = NULL;
+ err = vma->vm_ops->fault(vma, &vmf);
+
+ if (!err || (err == VM_FAULT_NOPAGE)) {
+ pte = pte_offset_pagesz(mm, vaddr, &size);
+ vaddr = (vaddr & ~(size-1)) + size;
+ remainder -= size>>PAGE_SHIFT;
+ i += size>>PAGE_SHIFT;
+ continue;
+ }
+
+ remainder = 0;
+ break;
+ }
+
+ vaddr = (vaddr & ~(size-1)) + size;
+ remainder -= size>>PAGE_SHIFT;
+ i += size>>PAGE_SHIFT;
+ }
+
+ *length = remainder;
+ *position = vaddr;
+ return i ? i : -EFAULT;
+}
+
/**
* __get_user_pages() - pin user pages in memory
* @tsk: task_struct of target task
@@ -1790,9 +1845,20 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
if (!vma ||
(vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
- !(vm_flags & vma->vm_flags))
+ !(vm_flags & vma->vm_flags) || is_xip_hugetlb_mapping(vma))
return i ? : -EFAULT;
+#if 0
+ /* FIXME : Requires more testing */
+ if (is_xip_hugetlb_mapping(vma)) {
+ /* caller expects vmas or pages to be populated. */
+ if (vmas || pages)
+ return -EFAULT;
+ i = follow_xip_hugetlb_page(mm, vma,
+ &start, &nr_pages, i, gup_flags);
+ continue;
+ }
+#endif
if (is_vm_hugetlb_page(vma)) {
i = follow_hugetlb_page(mm, vma, pages, vmas,
&start, &nr_pages, i, gup_flags);
@@ -3724,6 +3790,21 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
/* do counter updates before entering really critical section. */
check_sync_rss_stat(current);
+ /* FIXME : Can't find a single flag in vm_area_struct->vma_flags. */
+ if (is_xip_hugetlb_mapping(vma))
+ {
+ int err;
+ struct vm_fault vmf;
+ vmf.virtual_address = (void __user *)(address & PAGE_MASK);
+ vmf.pgoff = (((address & PAGE_MASK) - vma->vm_start) >> PAGE_SHIFT)
+ + vma->vm_pgoff;
+ vmf.flags = flags;
+ vmf.page = NULL;
+ err = vma->vm_ops->fault(vma, &vmf);
+ if (!err || (err == VM_FAULT_NOPAGE))
+ return 0;
+ }
+
if (unlikely(is_vm_hugetlb_page(vma)))
return hugetlb_fault(mm, vma, address, flags);
@@ -3857,6 +3938,117 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
}
#endif /* __PAGETABLE_PMD_FOLDED */
+/****************************************************************************/
+/* XIP_HUGETLB support */
+pte_t *pte_offset_pagesz(struct mm_struct *mm, unsigned long addr,
+ unsigned long *sz)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd = NULL;
+
+ pgd = pgd_offset(mm, addr);
+ if (!pgd_present(*pgd)) {
+ *sz = PGDIR_SIZE;
+ return (pte_t *)pgd;
+ }
+
+ pud = pud_offset(pgd, addr);
+ if (pud_none(*pud) || pud_large(*pud)) {
+ *sz = PUD_SIZE;
+ return (pte_t *)pud;
+ }
+ pmd = pmd_offset(pud, addr);
+ //if (pmd_none(*pmd) || pmd_large(*pmd)) {
+ *sz = PMD_SIZE;
+ return (pte_t *)pmd;
+}
+EXPORT_SYMBOL(pte_offset_pagesz);
+
+pte_t *pte_alloc_pagesz(struct mm_struct *mm, unsigned long addr,
+ unsigned long sz)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pte_t *pte = NULL;
+
+ pgd = pgd_offset(mm, addr);
+ pud = pud_alloc(mm, pgd, addr);
+ if (pud) {
+ if (sz == PUD_SIZE) {
+ pte = (pte_t *)pud;
+ } else {
+ BUG_ON(sz != PMD_SIZE);
+ pte = (pte_t *) pmd_alloc(mm, pud, addr);
+ }
+ }
+ BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
+
+ return pte;
+}
+EXPORT_SYMBOL(pte_alloc_pagesz);
+
+static void __unmap_xip_hugetlb_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long address;
+ pte_t *ptep;
+ pte_t pte;
+ unsigned long sz;
+
+ WARN_ON(!is_xip_hugetlb_mapping(vma));
+
+ mmu_notifier_invalidate_range_start(mm, start, end);
+ spin_lock(&mm->page_table_lock);
+ for (address = start, sz=PMD_SIZE; address < end; address += sz) {
+ ptep = pte_offset_pagesz(mm, address, &sz);
+ if (!ptep)
+ continue;
+
+ pte = ptep_get_and_clear(mm, address, ptep);
+ if (pte_none(pte))
+ continue;
+ }
+ flush_tlb_range(vma, start, end);
+ spin_unlock(&mm->page_table_lock);
+ mmu_notifier_invalidate_range_end(mm, start, end);
+}
+
+void unmap_xip_hugetlb_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
+ __unmap_xip_hugetlb_range(vma, start, end);
+ mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
+}
+EXPORT_SYMBOL(unmap_xip_hugetlb_range);
+
+/****************************************************************************/
+int make_pages_present(unsigned long addr, unsigned long end)
+{
+ int ret, len, write;
+ struct vm_area_struct * vma;
+
+ vma = find_vma(current->mm, addr);
+ if (!vma)
+ return -ENOMEM;
+ /*
+ * We want to touch writable mappings with a write fault in order
+ * to break COW, except for shared mappings because these don't COW
+ * and we would not want to dirty them for nothing.
+ */
+ write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE;
+ BUG_ON(addr >= end);
+ BUG_ON(end > vma->vm_end);
+ len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
+ ret = get_user_pages(current, current->mm, addr,
+ len, write, 0, NULL, NULL);
+ if (ret < 0)
+ return ret;
+ return ret == len ? 0 : -EFAULT;
+}
+
#if !defined(__HAVE_ARCH_GATE_AREA)
#if defined(AT_SYSINFO_EHDR)
diff --git a/mm/mlock.c b/mm/mlock.c
index 79b7cf7..f92ef16 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -173,6 +173,11 @@ long __mlock_vma_pages_range(struct vm_area_struct *vma,
VM_BUG_ON(end > vma->vm_end);
VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
+ if (is_xip_hugetlb_mapping(vma)) {
+ vma->vm_flags &= ~VM_LOCKED;
+ return nr_pages;
+ }
+
gup_flags = FOLL_TOUCH | FOLL_MLOCK;
/*
* We want to touch writable mappings with a write fault in order
diff --git a/mm/mmap.c b/mm/mmap.c
index 0db0de1..79f4003 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2355,6 +2355,9 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
struct vm_area_struct *new;
int err = -ENOMEM;
+ if (is_xip_hugetlb_mapping(vma))
+ return -EINVAL;
+
if (is_vm_hugetlb_page(vma) && (addr &
~(huge_page_mask(hstate_vma(vma)))))
return -EINVAL;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 94722a4..f1e8754 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -250,6 +250,10 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
return 0;
}
+ /* FIXME: Do nothing for now */
+ if (is_xip_hugetlb_mapping(vma))
+ return -EINVAL;
+
/*
* If we make a private mapping writable we increase our commit;
* but (without finer accounting) cannot reduce our commit if we
diff --git a/mm/msync.c b/mm/msync.c
index 632df45..a2abc04 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -30,9 +30,10 @@
*/
SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
{
- unsigned long end;
+ unsigned long end, fsync_start, fsync_end;
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
+ size_t file_offset;
int unmapped_error = 0;
int error = -EINVAL;
@@ -77,12 +78,17 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
goto out_unlock;
}
file = vma->vm_file;
+ fsync_start = start;
+ fsync_end = min(end, vma->vm_end);
start = vma->vm_end;
if ((flags & MS_SYNC) && file &&
(vma->vm_flags & VM_SHARED)) {
get_file(file);
up_read(&mm->mmap_sem);
- error = vfs_fsync(file, 0);
+ file_offset = vma->vm_pgoff * PAGE_SIZE;
+ error = vfs_fsync_range(file,
+ file_offset + fsync_start - vma->vm_start,
+ file_offset + fsync_end - vma->vm_start - 1, 0);
fput(file);
if (error || start >= end)
goto out;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 0f751f2..31b9206 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -52,6 +52,10 @@ static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
+ if (pmd_large(*pmd)) {
+ pmd_clear(pmd);
+ continue;
+ }
if (pmd_none_or_clear_bad(pmd))
continue;
vunmap_pte_range(pmd, addr, next);
@@ -66,6 +70,10 @@ static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end)
pud = pud_offset(pgd, addr);
do {
next = pud_addr_end(addr, end);
+ if (pud_large(*pud)) {
+ pud_clear(pud);
+ continue;
+ }
if (pud_none_or_clear_bad(pud))
continue;
vunmap_pmd_range(pud, addr, next);
@@ -1329,7 +1337,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
BUG_ON(in_interrupt());
if (flags & VM_IOREMAP) {
- int bit = fls(size);
+ int bit = fls64((__u64)size);
if (bit > IOREMAP_MAX_ORDER)
bit = IOREMAP_MAX_ORDER;
--
1.7.0.4
More information about the Linux-pmfs
mailing list