[PATCH 1/5] PMFS: Add Persistent Memory File System

Vishal Verma vishal.l.verma at linux.intel.com
Fri Apr 26 18:20:42 EDT 2013


Initial version of PMFS, the Persistent Memory File System.
This patch set is based on Linux 3.9-rc7.

Signed-off-by: Sanjay Kumar <sanjay.k.kumar at intel.com>
Signed-off-by: Anil S Keshavamurthy <anil.s.keshavamurthy at intel.com>
Signed-off-by: Dulloor <subramanya.r.dulloor at intel.com>
Signed-off-by: Edmund Nadolski <edmund.nadolski at intel.com>
Signed-off-by: Ross Zwisler <ross.zwisler at linux.intel.com>
Signed-off-by: Vishal Verma <vishal.l.verma at linux.intel.com>
---
 Documentation/filesystems/pmfs.txt   |  140 +++
 arch/Kconfig                         |    3 +
 arch/x86/Kconfig                     |    2 +
 arch/x86/include/asm/io.h            |    6 +
 arch/x86/include/asm/pgtable_types.h |    1 +
 arch/x86/kernel/setup.c              |   17 +-
 arch/x86/kernel/sys_x86_64.c         |  150 ++++
 arch/x86/mm/ioremap.c                |   90 ++-
 arch/x86/mm/pat.c                    |    2 +
 arch/x86/mm/pgtable.c                |    2 +
 fs/Kconfig                           |    3 +-
 fs/Makefile                          |    1 +
 fs/pmfs/Kconfig                      |   37 +
 fs/pmfs/Makefile                     |   11 +
 fs/pmfs/balloc.c                     |  256 ++++++
 fs/pmfs/bbuild.c                     |  509 +++++++++++
 fs/pmfs/dir.c                        |  310 +++++++
 fs/pmfs/file.c                       |  333 +++++++
 fs/pmfs/inode.c                      | 1568 ++++++++++++++++++++++++++++++++++
 fs/pmfs/ioctl.c                      |  150 ++++
 fs/pmfs/journal.c                    |  866 +++++++++++++++++++
 fs/pmfs/journal.h                    |  101 +++
 fs/pmfs/namei.c                      |  797 +++++++++++++++++
 fs/pmfs/persist.c                    |  238 +++++
 fs/pmfs/pmfs.h                       |  576 +++++++++++++
 fs/pmfs/pmfs_test.c                  |   50 ++
 fs/pmfs/super.c                      | 1217 ++++++++++++++++++++++++++
 fs/pmfs/symlink.c                    |   71 ++
 fs/pmfs/wprotect.c                   |   91 ++
 fs/pmfs/wprotect.h                   |  166 ++++
 fs/pmfs/xip.c                        |  672 +++++++++++++++
 fs/pmfs/xip.h                        |   28 +
 include/asm-generic/pgtable.h        |    8 +
 include/linux/io.h                   |   10 +
 include/linux/mm.h                   |   15 +
 include/linux/pmfs_def.h             |  206 +++++
 include/linux/pmfs_sb.h              |   83 ++
 include/linux/vmalloc.h              |    2 +-
 include/uapi/linux/magic.h           |    1 +
 lib/ioremap.c                        |  126 +++-
 mm/madvise.c                         |    6 +
 mm/memcontrol.c                      |    4 +-
 mm/memory.c                          |  194 +++++-
 mm/mlock.c                           |    5 +
 mm/mmap.c                            |    3 +
 mm/mprotect.c                        |    4 +
 mm/msync.c                           |   10 +-
 mm/vmalloc.c                         |   10 +-
 48 files changed, 9109 insertions(+), 42 deletions(-)
 create mode 100644 Documentation/filesystems/pmfs.txt
 create mode 100644 fs/pmfs/Kconfig
 create mode 100644 fs/pmfs/Makefile
 create mode 100644 fs/pmfs/balloc.c
 create mode 100644 fs/pmfs/bbuild.c
 create mode 100644 fs/pmfs/dir.c
 create mode 100644 fs/pmfs/file.c
 create mode 100644 fs/pmfs/inode.c
 create mode 100644 fs/pmfs/ioctl.c
 create mode 100644 fs/pmfs/journal.c
 create mode 100644 fs/pmfs/journal.h
 create mode 100644 fs/pmfs/namei.c
 create mode 100644 fs/pmfs/persist.c
 create mode 100644 fs/pmfs/pmfs.h
 create mode 100644 fs/pmfs/pmfs_test.c
 create mode 100644 fs/pmfs/super.c
 create mode 100644 fs/pmfs/symlink.c
 create mode 100644 fs/pmfs/wprotect.c
 create mode 100644 fs/pmfs/wprotect.h
 create mode 100644 fs/pmfs/xip.c
 create mode 100644 fs/pmfs/xip.h
 create mode 100644 include/linux/pmfs_def.h
 create mode 100644 include/linux/pmfs_sb.h

diff --git a/Documentation/filesystems/pmfs.txt b/Documentation/filesystems/pmfs.txt
new file mode 100644
index 0000000..e9f2bb4
--- /dev/null
+++ b/Documentation/filesystems/pmfs.txt
@@ -0,0 +1,140 @@
+
+PMFS Introduction
+=================
+
+PMFS is a file system for persistent memory. The file system is optimized to be
+lightweight and efficient in providing access to persistent memory that is
+directly accessible via CPU load/store instructions. It manages the persistent
+memory directly and avoids the block driver layer and page cache layer and thus
+provides synchronous reads and writes to persistent area. It supports all the
+existing POSIX style file system APIs so that the applications need not be
+modified to use this file system. In addition, PMFS provides support for huge
+pages to minimize TLB entry usage and speed up virtual address lookup. PMFS's
+mmap interface can map a file's data directly into the process's address space
+without any intermediate buffering. This file system has been validated using
+DRAM to emulate persistent memory. Hence, PMFS also provides an option to load
+the file system from a disk-based file into memory during mount and save the
+file system from memory into the disk-based file during unmount. PMFS also
+guarantees consistent and durable updates to the file system meta-data against
+arbitrary system and power failures. PMFS uses journaling (undo log) to provide
+consistent updates to meta-data.
+
+
+Configuring PMFS
+================
+
+PMFS uses a physically contiguous area of DRAM (which is not used by the
+kernel) as the file system space. To make sure that the kernel doesn't use a
+certain contiguous physical memory area, you can boot the kernel with the
+'memmap' kernel command-line option.  For more information, please see
+Documentation/kernel-parameters.txt.
+
+For example, adding 'memmap=2G$4G' to the kernel boot parameters will reserve
+2G of memory, starting at 4G.  (You may have to escape the $ so it isn't
+interpreted by GRUB 2, if you use that as your boot loader.)
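+For instance, in a GRUB 2 configuration file the parameter is commonly written
+as 'memmap=2G\$4G' so that the $ reaches the kernel unmodified (the exact
+quoting needed depends on your boot loader and its configuration).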
+
+After the OS has booted, you can initialize PMFS at mount time by passing the
+'init=' mount option.
+
+For example,
+
+#mount -t pmfs -o physaddr=0x100000000,init=2G none /mnt/pmfs
+
+The above command will create a PMFS file system in the 2GB region starting at
+0x100000000 (4GB) and mount it at /mnt/pmfs.  There are many other mount-time
+options supported by PMFS. Some of the main options include:
+
+wprotect: This option protects PMFS from stray writes (e.g., because of kernel
+bugs). It makes sure that the file system is mapped read-only into the kernel
+and is made writable only for brief periods when PMFS itself writes to it.
+(EXPERIMENTAL - use with caution.)
+
+jsize: This option specifies the journal size. Default is 4MB.
+
+hugemmap: This option enables support for using huge pages in memory-mapped
+files.  
+
+backing: This option specifies a disk-based file to be used as a persistent
+backing store for PMFS during mount and unmount.
+
+#mount -t pmfs -o physaddr=0x100000000,init=2G,backing="/data/pmfs.img" none /mnt/pmfs
+
+The above example initializes a 2GB PMFS file system and, during unmount, saves
+it to the file /data/pmfs.img.
+
+#mount -t pmfs -o physaddr=0x100000000,backing="/data/pmfs.img" none /mnt/pmfs
+
+The above example loads the PMFS from /data/pmfs.img during mount and saves
+the file system to /data/pmfs.img during unmount.
+
+backing_opt: This option specifies how the backing file should be used. It can
+take two values:
+
+1: The file system is not loaded from the backing file during mount. It is
+either created using the 'init=' option, or the pre-existing file system in
+memory is used.
+
+2: The file system is not stored to the backing file during unmount.
+
+If backing_opt is not specified, PMFS loads the file system from the backing
+file during mount (unless the init= option is given) and stores the file
+system to the backing file during unmount.
+
+#mount -t pmfs -o physaddr=0x100000000,backing="/data/pmfs.img",backing_opt=2 none /mnt/pmfs
+
+The above example loads the PMFS from /data/pmfs.img during mount but does not
+save the file system to /data/pmfs.img during unmount.
+
+#mount -t pmfs -o physaddr=0x100000000,backing="/data/pmfs.img",backing_opt=1 none /mnt/pmfs
+
+The above example assumes that a PMFS file system is already present at the
+specified physical address (created during an earlier mount). That file system
+is used instead of loading it from /data/pmfs.img; the file system is,
+however, still saved to /data/pmfs.img during unmount.
+
+For the full list of options, please refer to the source code.
+
+
+Using Huge Pages with PMFS 
+==========================
+
+PMFS supports the use of huge pages through the fallocate() and ftruncate()
+system calls. These functions set the file size and also provide PMFS with a
+hint about what data-block size to use (fallocate() also pre-allocates the
+data blocks).  For example, if the file size is set below 2MB, a 4KB block
+size is used.  If the file size is >= 2MB but < 1GB, a 2MB block size is used,
+and if the file size is >= 1GB, a 1GB block size is used.  fallocate() or
+ftruncate() should be called on an empty file (size 0) for the block-size hint
+to be applied properly. So, a good way to use huge pages in PMFS is to open a
+new file with the open() system call and then call fallocate() or ftruncate()
+to set the file size and block-size hint.  Remember that it is only a hint: if
+PMFS can't find enough free blocks of a particular size, it will fall back to
+a smaller block size.  If the block-size hint is not set, the default 4KB
+block size is used for the file's data blocks.
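+
+A minimal userspace sketch (untested; the path and the 16MB size are only
+examples) that hints a 2MB block size for a new file:
+
+	#define _GNU_SOURCE
+	#include <fcntl.h>
+	#include <unistd.h>
+
+	int main(void)
+	{
+		/* the hint is only honoured on a brand-new, empty file */
+		int fd = open("/mnt/pmfs/bigfile", O_CREAT | O_RDWR, 0644);
+
+		if (fd < 0)
+			return 1;
+		/* 16MB is >= 2MB and < 1GB, so PMFS is hinted to use 2MB
+		 * blocks; fallocate() also pre-allocates the blocks, while
+		 * ftruncate(fd, 16 << 20) would set the same hint without
+		 * pre-allocation. */
+		if (fallocate(fd, 0, 0, 16 << 20) < 0) {
+			close(fd);
+			return 1;
+		}
+		close(fd);
+		return 0;
+	}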
+
+
+Current Limitations
+===================
+
+a) PMFS uses a memory region not used by the kernel. Hence the memory needs to
+be reserved using the memmap= option or via the BIOS/ACPI tables.
+
+b) Because of multiple block-size support, PMFS supports multiple maximum file
+sizes. For example, if the file's block size is 4KB, the file can grow up to
+512 GB; if the block size is 2MB, it can grow up to 256 TB; and if the block
+size is 1GB, it can grow up to 128 PB (see the arithmetic sketch after this
+list).
+
+c) PMFS does not currently support extended attributes.
+
+d) PMFS currently only works with x86_64 kernels.
+
+e) We ran out of bits in the vma's vm_flags field, so we reused a flag that is
+guaranteed not to be used on x86_64.
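+
+As a sketch of where the limits in (b) come from (assuming 512 8-byte block
+pointers per 4KB meta-data block and a tree of height 3):
+
+	512^3 * 4KB = 2^27 * 2^12 bytes = 512 GB
+	512^3 * 2MB = 2^27 * 2^21 bytes = 256 TB
+	512^3 * 1GB = 2^27 * 2^30 bytes = 128 PB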
+
+
+Contact Information
+=====================
+
+Please send bug reports/comments/feedback to the PMFS development
+list: linux-pmfs at intel.com
diff --git a/arch/Kconfig b/arch/Kconfig
index 1455579..82a5965 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -174,6 +174,9 @@ config USER_RETURN_NOTIFIER
 config HAVE_IOREMAP_PROT
 	bool
 
+config HAVE_SET_MEMORY_RO
+	bool
+
 config HAVE_KPROBES
 	bool
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 70c0f3d..b94d591 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -28,6 +28,8 @@ config X86
 	select HAVE_OPROFILE
 	select HAVE_PCSPKR_PLATFORM
 	select HAVE_PERF_EVENTS
+	select HAVE_IRQ_WORK
+	select HAVE_SET_MEMORY_RO
 	select HAVE_IOREMAP_PROT
 	select HAVE_KPROBES
 	select HAVE_MEMBLOCK
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index d8e8eef..1cfda69 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -173,9 +173,15 @@ static inline unsigned int isa_virt_to_bus(volatile void *address)
  */
 extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
 extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
+extern void __iomem *ioremap_cache_ro(resource_size_t offset,
+				unsigned long size);
 extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size,
 				unsigned long prot_val);
 
+extern void __iomem *
+ioremap_hpage_cache_ro(resource_size_t phys_addr, unsigned long size);
+extern void __iomem *
+ioremap_hpage_cache(resource_size_t phys_addr, unsigned long size);
 /*
  * The default ioremap() behavior is non-cached:
  */
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 567b5d0..7b84690 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -157,6 +157,7 @@
 #define PAGE_KERNEL_IO_NOCACHE		__pgprot(__PAGE_KERNEL_IO_NOCACHE)
 #define PAGE_KERNEL_IO_UC_MINUS		__pgprot(__PAGE_KERNEL_IO_UC_MINUS)
 #define PAGE_KERNEL_IO_WC		__pgprot(__PAGE_KERNEL_IO_WC)
+#define PAGE_KERNEL_IO_LARGE	__pgprot(__PAGE_KERNEL_IO | _PAGE_PSE)
 
 /*         xwr */
 #define __P000	PAGE_NONE
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 90d8cc9..3be22d8 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -281,7 +281,22 @@ void * __init extend_brk(size_t size, size_t align)
 	return ret;
 }
 
-#ifdef CONFIG_X86_32
+#ifdef CONFIG_X86_64
+static void __init init_gbpages(void)
+{
+	if (direct_gbpages && cpu_has_gbpages) {
+		printk(KERN_INFO "Using GB pages for direct mapping\n");
+	} else {
+		printk(KERN_INFO "direct_gbpages(%d), cpu_has_gbpages(%d): "
+			"GB pages not supported\n", direct_gbpages,
+			cpu_has_gbpages);
+		direct_gbpages = 0;
+	}
+}
+#else
+static inline void init_gbpages(void)
+{
+}
 static void __init cleanup_highmap(void)
 {
 }
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index dbded5a..d607f1a 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -15,6 +15,7 @@
 #include <linux/random.h>
 #include <linux/uaccess.h>
 #include <linux/elf.h>
+#include <linux/export.h>
 
 #include <asm/ia32.h>
 #include <asm/syscalls.h>
@@ -190,3 +191,152 @@ bottomup:
 	 */
 	return arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
 }
+
+
+static unsigned long arch_get_unmapped_area_bottomup_sz(struct file *file,
+		unsigned long addr, unsigned long len, unsigned long align_size,
+		unsigned long pgoff, unsigned long flags)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	unsigned long start_addr;
+
+	if (len > mm->cached_hole_size) {
+	        start_addr = mm->free_area_cache;
+	} else {
+	        start_addr = TASK_UNMAPPED_BASE;
+	        mm->cached_hole_size = 0;
+	}
+
+full_search:
+	addr = ALIGN(start_addr, align_size);
+
+	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
+		/* At this point:  (!vma || addr < vma->vm_end). */
+		if (TASK_SIZE - len < addr) {
+			/*
+			 * Start a new search - just in case we missed
+			 * some holes.
+			 */
+			if (start_addr != TASK_UNMAPPED_BASE) {
+				start_addr = TASK_UNMAPPED_BASE;
+				mm->cached_hole_size = 0;
+				goto full_search;
+			}
+			return -ENOMEM;
+		}
+		if (!vma || addr + len <= vma->vm_start) {
+			mm->free_area_cache = addr + len;
+			return addr;
+		}
+		if (addr + mm->cached_hole_size < vma->vm_start)
+		        mm->cached_hole_size = vma->vm_start - addr;
+		addr = ALIGN(vma->vm_end, align_size);
+	}
+}
+
+static unsigned long arch_get_unmapped_area_topdown_sz(struct file *file,
+		unsigned long addr0, unsigned long len, unsigned long align_size,
+		unsigned long pgoff, unsigned long flags)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma, *prev_vma;
+	unsigned long base = mm->mmap_base, addr = addr0;
+	unsigned long largest_hole = mm->cached_hole_size;
+	unsigned long align_mask = ~(align_size - 1);
+	int first_time = 1;
+
+	/* don't allow allocations above current base */
+	if (mm->free_area_cache > base)
+		mm->free_area_cache = base;
+
+	if (len <= largest_hole) {
+	        largest_hole = 0;
+		mm->free_area_cache  = base;
+	}
+try_again:
+	/* make sure it can fit in the remaining address space */
+	if (mm->free_area_cache < len)
+		goto fail;
+
+	/* either no address requested or can't fit in requested address hole */
+	addr = (mm->free_area_cache - len) & align_mask;
+	do {
+		/*
+		 * Lookup failure means no vma is above this address,
+		 * i.e. return with success:
+		 */
+		vma = find_vma(mm, addr);
+		if (!vma)
+			return addr;
+
+		/*
+		 * new region fits between prev_vma->vm_end and
+		 * vma->vm_start, use it:
+		 */
+		prev_vma = vma->vm_prev;
+		if (addr + len <= vma->vm_start &&
+		            (!prev_vma || (addr >= prev_vma->vm_end))) {
+			/* remember the address as a hint for next time */
+		        mm->cached_hole_size = largest_hole;
+		        return (mm->free_area_cache = addr);
+		} else {
+			/* pull free_area_cache down to the first hole */
+		        if (mm->free_area_cache == vma->vm_end) {
+				mm->free_area_cache = vma->vm_start;
+				mm->cached_hole_size = largest_hole;
+			}
+		}
+
+		/* remember the largest hole we saw so far */
+		if (addr + largest_hole < vma->vm_start)
+		        largest_hole = vma->vm_start - addr;
+
+		/* try just below the current vma->vm_start */
+		addr = (vma->vm_start - len) & align_mask;
+	} while (len <= vma->vm_start);
+
+fail:
+	/*
+	 * if hint left us with no space for the requested
+	 * mapping then try again:
+	 */
+	if (first_time) {
+		mm->free_area_cache = base;
+		largest_hole = 0;
+		first_time = 0;
+		goto try_again;
+	}
+	/*
+	 * A failed mmap() very likely causes application failure,
+	 * so fall back to the bottom-up function here. This scenario
+	 * can happen with large stack limits and large mmap()
+	 * allocations.
+	 */
+	mm->free_area_cache = TASK_UNMAPPED_BASE;
+	mm->cached_hole_size = ~0UL;
+	addr = arch_get_unmapped_area_bottomup_sz(file, addr0, len, align_size,
+						pgoff, flags);
+
+	/*
+	 * Restore the topdown base:
+	 */
+	mm->free_area_cache = base;
+	mm->cached_hole_size = ~0UL;
+
+	return addr;
+}
+
+unsigned long arch_get_unmapped_area_sz(struct file *file,
+		unsigned long addr, unsigned long len, unsigned long align_size,
+		unsigned long pgoff, unsigned long flags)
+{
+	struct mm_struct *mm = current->mm;
+	if (mm->get_unmapped_area == arch_get_unmapped_area)
+		return arch_get_unmapped_area_bottomup_sz(file, addr, len, align_size,
+				pgoff, flags);
+	return arch_get_unmapped_area_topdown_sz(file, addr, len, align_size,
+				pgoff, flags);
+}
+EXPORT_SYMBOL(arch_get_unmapped_area_sz);
+
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 78fe3f1..a212b3f 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -21,6 +21,7 @@
 #include <asm/tlbflush.h>
 #include <asm/pgalloc.h>
 #include <asm/pat.h>
+#include <asm/cpufeature.h>
 
 #include "physaddr.h"
 
@@ -50,17 +51,9 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size,
 	return err;
 }
 
-/*
- * Remap an arbitrary physical address space into the kernel virtual
- * address space. Needed when the kernel wants to access high addresses
- * directly.
- *
- * NOTE! We need to allow non-page-aligned mappings too: we will obviously
- * have to convert them into an offset in a page-aligned mapping, but the
- * caller shouldn't need to know that small detail.
- */
-static void __iomem *__ioremap_caller(resource_size_t phys_addr,
-		unsigned long size, unsigned long prot_val, void *caller)
+static void __iomem *___ioremap_caller(resource_size_t phys_addr,
+		unsigned long size, unsigned long prot_val, void *caller,
+				unsigned int hpages, unsigned int readonly)
 {
 	unsigned long offset, vaddr;
 	resource_size_t pfn, last_pfn, last_addr;
@@ -94,12 +87,15 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
 	 * Don't allow anybody to remap normal RAM that we're using..
 	 */
 	last_pfn = last_addr >> PAGE_SHIFT;
-	for (pfn = phys_addr >> PAGE_SHIFT; pfn <= last_pfn; pfn++) {
-		int is_ram = page_is_ram(pfn);
-
-		if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn)))
-			return NULL;
-		WARN_ON_ONCE(is_ram);
+	if ((phys_addr >> PAGE_SHIFT) < max_pfn) {
+		for (pfn = phys_addr >> PAGE_SHIFT; pfn <= last_pfn; pfn++) {
+			int is_ram = page_is_ram(pfn);
+
+			if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn)))
+				return NULL;
+			WARN_ON_ONCE(is_ram);
+		}
 	}
 
 	/*
@@ -145,6 +141,10 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
 		break;
 	}
 
+	/* Map pages RO */
+	if (readonly)
+		prot = __pgprot((unsigned long)prot.pgprot & ~_PAGE_RW);
+
 	/*
 	 * Ok, go for it..
 	 */
@@ -157,8 +157,16 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
 	if (kernel_map_sync_memtype(phys_addr, size, prot_val))
 		goto err_free_area;
 
-	if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot))
-		goto err_free_area;
+	if (hpages) {
+		if (ioremap_hpage_range(vaddr, vaddr + size, phys_addr, prot))
+			goto err_free_area;
+	} else {
+		if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot))
+			goto err_free_area;
+	}
 
 	ret_addr = (void __iomem *) (vaddr + offset);
 	mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
@@ -178,6 +186,21 @@ err_free_memtype:
 	return NULL;
 }
 
+/*
+ * Remap an arbitrary physical address space into the kernel virtual
+ * address space. Needed when the kernel wants to access high addresses
+ * directly.
+ *
+ * NOTE! We need to allow non-page-aligned mappings too: we will obviously
+ * have to convert them into an offset in a page-aligned mapping, but the
+ * caller shouldn't need to know that small detail.
+ */
+static void __iomem *__ioremap_caller(resource_size_t phys_addr,
+		unsigned long size, unsigned long prot_val, void *caller)
+{
+	return ___ioremap_caller(phys_addr, size, prot_val, caller, 0, 0);
+}
+
 /**
  * ioremap_nocache     -   map bus memory into CPU space
  * @phys_addr:    bus address of the memory
@@ -235,13 +258,40 @@ void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
 }
 EXPORT_SYMBOL(ioremap_wc);
 
+void __iomem *
+ioremap_hpage_cache(resource_size_t phys_addr, unsigned long size)
+{
+	/* Map using hugepages */
+	return ___ioremap_caller(phys_addr, size, _PAGE_CACHE_WB,
+				__builtin_return_address(0), 1, 0);
+}
+EXPORT_SYMBOL(ioremap_hpage_cache);
+
+void __iomem *
+ioremap_hpage_cache_ro(resource_size_t phys_addr, unsigned long size)
+{
+	/* Map using hugepages */
+	return ___ioremap_caller(phys_addr, size, _PAGE_CACHE_WB,
+				__builtin_return_address(0), 1, 1);
+}
+EXPORT_SYMBOL(ioremap_hpage_cache_ro);
+
 void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
 {
-	return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WB,
-				__builtin_return_address(0));
+	/* Map using 4k pages */
+	return ___ioremap_caller(phys_addr, size, _PAGE_CACHE_WB,
+				__builtin_return_address(0), 0, 0);
 }
 EXPORT_SYMBOL(ioremap_cache);
 
+void __iomem *ioremap_cache_ro(resource_size_t phys_addr, unsigned long size)
+{
+	/* Map using 4k pages */
+	return ___ioremap_caller(phys_addr, size, _PAGE_CACHE_WB,
+				__builtin_return_address(0), 0, 1);
+}
+EXPORT_SYMBOL(ioremap_cache_ro);
+
 void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
 				unsigned long prot_val)
 {
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 6574388..333a795 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -183,6 +183,8 @@ static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end)
 	unsigned long end_pfn = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	struct pagerange_state state = {start_pfn, 0, 0};
 
+	if (start_pfn >= max_pfn)
+		return 0;
 	/*
 	 * For legacy reasons, physical address range in the legacy ISA
 	 * region is tracked as non-RAM. This will allow users of
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 17fda6a..58e4e52 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -1,5 +1,6 @@
 #include <linux/mm.h>
 #include <linux/gfp.h>
+#include <linux/export.h>
 #include <asm/pgalloc.h>
 #include <asm/pgtable.h>
 #include <asm/tlb.h>
@@ -328,6 +329,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
 
 	return changed;
 }
+EXPORT_SYMBOL(ptep_set_access_flags);
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 int pmdp_set_access_flags(struct vm_area_struct *vma,
diff --git a/fs/Kconfig b/fs/Kconfig
index 780725a..7901c54 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -17,7 +17,7 @@ source "fs/ext4/Kconfig"
 config FS_XIP
 # execute in place
 	bool
-	depends on EXT2_FS_XIP
+	depends on EXT2_FS_XIP || PMFS_XIP
 	default y
 
 source "fs/jbd/Kconfig"
@@ -209,6 +209,7 @@ source "fs/romfs/Kconfig"
 source "fs/pstore/Kconfig"
 source "fs/sysv/Kconfig"
 source "fs/ufs/Kconfig"
+source "fs/pmfs/Kconfig"
 source "fs/exofs/Kconfig"
 source "fs/f2fs/Kconfig"
 
diff --git a/fs/Makefile b/fs/Makefile
index 9d53192..4a3aad5 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -127,3 +127,4 @@ obj-$(CONFIG_F2FS_FS)		+= f2fs/
 obj-y				+= exofs/ # Multiple modules
 obj-$(CONFIG_CEPH_FS)		+= ceph/
 obj-$(CONFIG_PSTORE)		+= pstore/
+obj-$(CONFIG_PMFS)		+= pmfs/
diff --git a/fs/pmfs/Kconfig b/fs/pmfs/Kconfig
new file mode 100644
index 0000000..7173b04
--- /dev/null
+++ b/fs/pmfs/Kconfig
@@ -0,0 +1,37 @@
+config PMFS
+	tristate "Persistent and Protected PM file system support"
+	depends on HAS_IOMEM
+	select CRC16
+	help
+	   If your system has a block of fast (comparable in access speed to
+	   system memory) and non-volatile byte-addressable memory and you wish to
+	   mount a light-weight, full-featured, and space-efficient filesystem over
+	   it, say Y here, and read <file:Documentation/filesystems/pmfs.txt>.
+
+	   To compile this as a module, choose M here: the module will be
+	   called pmfs.
+
+config PMFS_XIP
+	bool "Execute-in-place in PMFS"
+	depends on PMFS && BLOCK
+	help
+	   Say Y here to enable XIP feature of PMFS.
+
+config PMFS_WRITE_PROTECT
+	bool "PMFS write protection"
+	depends on PMFS && MMU && HAVE_SET_MEMORY_RO
+	default y
+	help
+	   Say Y here to enable the write protect feature of PMFS.
+
+config PMFS_TEST
+	boolean
+	depends on PMFS
+
+config PMFS_TEST_MODULE
+	tristate "PMFS Test"
+	depends on PMFS && PMFS_WRITE_PROTECT && m
+	select PMFS_TEST
+	help
+	  Say Y here to build a simple module to test the protection of
+	  PMFS. The module will be called pmfs_test.
diff --git a/fs/pmfs/Makefile b/fs/pmfs/Makefile
new file mode 100644
index 0000000..806c19d
--- /dev/null
+++ b/fs/pmfs/Makefile
@@ -0,0 +1,11 @@
+#
+# Makefile for the linux pmfs-filesystem routines.
+#
+
+obj-$(CONFIG_PMFS) += pmfs.o
+obj-$(CONFIG_PMFS_TEST_MODULE) += pmfs_test.o
+
+pmfs-y := bbuild.o balloc.o dir.o file.o inode.o namei.o super.o symlink.o ioctl.o persist.o journal.o
+
+pmfs-$(CONFIG_PMFS_WRITE_PROTECT) += wprotect.o
+pmfs-$(CONFIG_PMFS_XIP) += xip.o
diff --git a/fs/pmfs/balloc.c b/fs/pmfs/balloc.c
new file mode 100644
index 0000000..3acc81d
--- /dev/null
+++ b/fs/pmfs/balloc.c
@@ -0,0 +1,256 @@
+/*
+ * PMFS emulated persistence. This file contains code to 
+ * handle data blocks of various sizes efficiently.
+ *
+ * Persistent Memory File System
+ * Copyright (c) 2012-2013, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/bitops.h>
+#include "pmfs.h"
+
+void pmfs_init_blockmap(struct super_block *sb, unsigned long init_used_size)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	unsigned long num_used_block;
+	struct pmfs_blocknode *blknode;
+
+	num_used_block = (init_used_size + sb->s_blocksize - 1) >>
+		sb->s_blocksize_bits;
+
+	blknode = pmfs_alloc_blocknode(sb);
+	if (blknode == NULL)
+		PMFS_ASSERT(0);
+	blknode->block_low = sbi->block_start;
+	blknode->block_high = sbi->block_start + num_used_block - 1;
+	sbi->num_free_blocks -= num_used_block;
+	list_add(&blknode->link, &sbi->block_inuse_head);
+}
+
+void pmfs_free_block(struct super_block *sb, unsigned long blocknr,
+		      unsigned short btype)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	struct list_head *head = &(sbi->block_inuse_head);
+	unsigned long new_block_low;
+	unsigned long new_block_high;
+	unsigned long num_blocks = 0;
+	struct pmfs_blocknode *i;
+	struct pmfs_blocknode *free_blocknode = NULL;
+	struct pmfs_blocknode *curr_node;
+
+	num_blocks = pmfs_get_numblocks(btype);
+	new_block_low = blocknr;
+	new_block_high = blocknr + num_blocks - 1;
+
+	mutex_lock(&sbi->s_lock);
+
+	/* Traverse each blocknode entry */
+	list_for_each_entry(i, head, link) {
+
+		if (new_block_low > i->block_high) {
+			/* skip to next blocknode */
+			continue;
+		}
+
+		if ((new_block_low == i->block_low) &&
+			(new_block_high == i->block_high)) {
+			/* fits entire datablock */
+			list_del(&i->link);
+			free_blocknode = i;
+			sbi->num_blocknode_allocated--;
+			sbi->num_free_blocks += num_blocks;
+			break;
+		}
+		if ((new_block_low == i->block_low) &&
+			(new_block_high < i->block_high)) {
+			/* Aligns left */
+			i->block_low = new_block_high + 1;
+			sbi->num_free_blocks += num_blocks;
+			break;
+		}
+		if ((new_block_low > i->block_low) && 
+			(new_block_high == i->block_high)) {
+			/* Aligns right */
+			i->block_high = new_block_low - 1;
+			sbi->num_free_blocks += num_blocks;
+			break;
+		}
+		if ((new_block_low > i->block_low) &&
+			(new_block_high < i->block_high)) {
+			/* Aligns somewhere in the middle */
+			curr_node = pmfs_alloc_blocknode(sb);
+			PMFS_ASSERT(curr_node);
+			if (curr_node == NULL) {
+				/* returning without freeing the block*/
+				break;
+			}
+			curr_node->block_low = new_block_high + 1;
+			curr_node->block_high = i->block_high;
+			i->block_high = new_block_low - 1;
+			list_add(&curr_node->link, &i->link);
+			sbi->num_free_blocks += num_blocks;
+			break;
+		}
+	}
+
+	mutex_unlock(&sbi->s_lock);
+
+	if (free_blocknode)
+		__pmfs_free_blocknode(free_blocknode);
+		
+	return;
+}
+
+
+int pmfs_new_block(struct super_block *sb, unsigned long *blocknr,
+	unsigned short btype, int zero)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	struct list_head *head = &(sbi->block_inuse_head);
+	struct pmfs_blocknode *i, *next_i;
+	struct pmfs_blocknode *free_blocknode = NULL;
+	void *bp;
+	unsigned long num_blocks = 0;
+	struct pmfs_blocknode *curr_node;
+	int errval = 0;
+	bool found = 0;
+	unsigned long next_block_low;
+	unsigned long new_block_low;
+	unsigned long new_block_high;
+
+	num_blocks = pmfs_get_numblocks(btype);
+
+	mutex_lock(&sbi->s_lock);
+
+	/* Traverse each blocknode entry */
+	list_for_each_entry(i, head, link) {
+		if (i->link.next == head) {
+			next_i = NULL;
+			next_block_low = sbi->block_end;
+		} else {
+			next_i = list_entry(i->link.next, typeof(*i), link);
+			next_block_low = next_i->block_low;
+		}
+
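+		/* first num_blocks-aligned block at or after i->block_high + 1 */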
+		new_block_low = (i->block_high + num_blocks) & ~(num_blocks - 1);
+		new_block_high = new_block_low + num_blocks - 1;
+
+		if (new_block_high >= next_block_low) {
+			/* Does not fit - skip to next blocknode */
+			continue;
+		}
+
+		if ((new_block_low == (i->block_high + 1)) &&
+			(new_block_high == (next_block_low - 1)))
+		{
+			/* Fill the gap completely */
+			if (next_i) {
+				i->block_high = next_i->block_high;
+				list_del(&next_i->link);
+				free_blocknode = next_i;
+				sbi->num_blocknode_allocated--;
+			} else {
+				i->block_high = new_block_high;
+			}
+			found = 1;
+			break;
+		}
+
+		if ((new_block_low == (i->block_high + 1)) &&
+			(new_block_high < (next_block_low - 1))) {
+			/* Aligns to left */
+			i->block_high = new_block_high;
+			found = 1;
+			break;
+		}
+
+		if ((new_block_low > (i->block_high + 1)) &&
+			(new_block_high == (next_block_low - 1))) {
+			/* Aligns to right */
+			if (next_i) {
+				/* right node exist */
+				next_i->block_low = new_block_low;
+			} else {
+				/* right node does NOT exist */
+				curr_node = pmfs_alloc_blocknode(sb);
+				PMFS_ASSERT(curr_node);
+				if (curr_node == NULL) {
+					errval = -ENOSPC;
+					break;
+				}
+				curr_node->block_low = new_block_low;
+				curr_node->block_high = new_block_high;
+				list_add(&curr_node->link, &i->link);
+			}
+			found = 1;
+			break;
+		}
+
+		if ((new_block_low > (i->block_high + 1)) &&
+			(new_block_high < (next_block_low - 1))) {
+			/* Aligns somewhere in the middle */
+			curr_node = pmfs_alloc_blocknode(sb);
+			PMFS_ASSERT(curr_node);
+			if (curr_node == NULL) {
+				errval = -ENOSPC;
+				break;
+			}
+			curr_node->block_low = new_block_low;
+			curr_node->block_high = new_block_high;
+			list_add(&curr_node->link, &i->link);
+			found = 1;
+			break;
+		}
+	}
+	
+	if (found == 1) {
+		sbi->num_free_blocks -= num_blocks;
+	}	
+
+	mutex_unlock(&sbi->s_lock);
+
+	if (free_blocknode)
+		__pmfs_free_blocknode(free_blocknode);
+
+	if (found == 0) {
+		return -ENOSPC;
+	}
+
+	if (zero) {
+		size_t size;
+		bp = pmfs_get_block(sb, pmfs_get_block_off(sb, new_block_low, btype));
+		pmfs_memunlock_block(sb, bp); /* TBD: need to fix this */
+		if (btype == PMFS_BLOCK_TYPE_4K)
+			size = 0x1 << 12;
+		else if (btype == PMFS_BLOCK_TYPE_2M)
+			size = 0x1 << 21;
+		else
+			size = 0x1 << 30;
+		memset_nt(bp, 0, size);
+		pmfs_memlock_block(sb, bp);
+	}
+	*blocknr = new_block_low;
+
+	return errval;
+}
+
+unsigned long pmfs_count_free_blocks(struct super_block *sb)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	return sbi->num_free_blocks; 
+}
diff --git a/fs/pmfs/bbuild.c b/fs/pmfs/bbuild.c
new file mode 100644
index 0000000..e279b64
--- /dev/null
+++ b/fs/pmfs/bbuild.c
@@ -0,0 +1,509 @@
+/*
+ * PMFS emulated persistence. This file contains code to 
+ * handle data blocks of various sizes efficiently.
+ *
+ * Persistent Memory File System
+ * Copyright (c) 2012-2013, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include "pmfs.h"
+
+static void pmfs_clear_datablock_inode(struct super_block *sb)
+{
+	struct pmfs_inode *pi =  pmfs_get_inode(sb, PMFS_BLOCKNODE_IN0);
+	pmfs_transaction_t *trans;
+
+	/* 2 log entry for inode */
+	trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES);
+	if (IS_ERR(trans))
+		return;
+	pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA);
+
+	pmfs_memunlock_inode(sb, pi);
+	memset(pi, 0, MAX_DATA_PER_LENTRY);
+	pmfs_memlock_inode(sb, pi);
+
+	/* commit the transaction */
+	pmfs_commit_transaction(sb, trans);
+}
+
+static void pmfs_init_blockmap_from_inode(struct super_block *sb)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	struct pmfs_inode *pi =  pmfs_get_inode(sb, PMFS_BLOCKNODE_IN0);
+	struct pmfs_blocknode_lowhigh *p = NULL;
+	struct pmfs_blocknode *blknode;
+	unsigned long index;
+	unsigned long blocknr;
+	unsigned long i;
+	unsigned long num_blocknode;
+	u64 bp;
+
+	num_blocknode = sbi->num_blocknode_allocated;
+	sbi->num_blocknode_allocated = 0;
+	for (i = 0; i < num_blocknode; i++) {
+		index = i & 0xFF;
+		if (i == (i & 0xFFFFFFFFFFFFFF00)) {
+			/* Find and get new data block */
+			blocknr = i >> 8; /* 256 entries in a block */
+			bp = __pmfs_find_data_block(sb, pi, blocknr);
+			p = pmfs_get_block(sb, bp);
+		}
+		PMFS_ASSERT(p);
+		blknode = pmfs_alloc_blocknode(sb);
+		if (blknode == NULL)
+			PMFS_ASSERT(0);
+		blknode->block_low = le64_to_cpu(p[index].block_low);
+		blknode->block_high = le64_to_cpu(p[index].block_high);
+		list_add_tail(&blknode->link, &sbi->block_inuse_head);
+	}
+
+	return;
+}
+
+static bool pmfs_can_skip_full_scan(struct super_block *sb)
+{
+	struct pmfs_inode *pi =  pmfs_get_inode(sb, PMFS_BLOCKNODE_IN0);
+	struct pmfs_super_block *super = pmfs_get_super(sb);
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	u64 root, isize;
+	unsigned int height, btype;
+
+	if (!pi->root)
+		return false;
+
+	sbi->num_blocknode_allocated =
+		le64_to_cpu(super->s_num_blocknode_allocated);
+	sbi->num_free_blocks = le64_to_cpu(super->s_num_free_blocks);
+	sbi->s_inodes_count = le32_to_cpu(super->s_inodes_count);
+	sbi->s_free_inodes_count = le32_to_cpu(super->s_free_inodes_count);
+	sbi->s_inodes_used_count = le32_to_cpu(super->s_inodes_used_count);
+	sbi->s_free_inode_hint = le32_to_cpu(super->s_free_inode_hint);
+
+	pmfs_init_blockmap_from_inode(sb);
+
+	root = pi->root;
+	height = pi->height;
+	btype = pi->i_blk_type;
+	isize = le64_to_cpu(pi->i_size);
+
+	/* Clearing the datablock inode */
+	pmfs_clear_datablock_inode(sb);
+
+	pmfs_free_inode_subtree(sb, root, height, btype, isize);
+	
+	return true;
+}
+
+
+static int pmfs_allocate_datablock_block_inode(pmfs_transaction_t *trans,
+	struct super_block *sb, struct pmfs_inode *pi, unsigned long num_blocks)
+{
+	int errval;
+	
+	pmfs_memunlock_inode(sb, pi);
+	pi->i_mode = 0;
+	pi->i_links_count = cpu_to_le16(1);
+	pi->i_blk_type = PMFS_BLOCK_TYPE_4K;
+	pi->i_flags = 0;
+	pi->height = 0;
+	pi->i_dtime = 0; 
+	pi->i_size = cpu_to_le64(num_blocks << sb->s_blocksize_bits);
+	pmfs_memlock_inode(sb, pi);
+
+	errval = __pmfs_alloc_blocks(trans, sb, pi, 0, num_blocks, false);
+
+	return errval;
+}
+
+void pmfs_save_blocknode_mappings(struct super_block *sb)
+{
+	unsigned long num_blocks, blocknr;
+	struct pmfs_inode *pi =  pmfs_get_inode(sb, PMFS_BLOCKNODE_IN0);
+	struct pmfs_blocknode_lowhigh *p;
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	struct list_head *head = &(sbi->block_inuse_head);
+	struct pmfs_blocknode *i;
+	struct pmfs_super_block *super;
+	pmfs_transaction_t *trans;
+	u64 bp;
+	int j, k;
+	int errval;
+	
+	num_blocks = ((sbi->num_blocknode_allocated *
+		sizeof(struct pmfs_blocknode_lowhigh) - 1) >>
+		sb->s_blocksize_bits) + 1;
+
+	/* 2 log entry for inode, 2 lentry for super-block */
+	trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES + MAX_SB_LENTRIES);
+	if (IS_ERR(trans))
+		return;
+
+	pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA);
+
+	errval = pmfs_allocate_datablock_block_inode(trans, sb, pi, num_blocks);
+
+	if (errval != 0) {
+		pmfs_dbg("Error saving the blocknode mappings: %d\n", errval);
+		pmfs_abort_transaction(sb, trans);
+		return;
+	}
+
+	j = 0;
+	k = 0;
+	p = NULL;
+	list_for_each_entry(i, head, link) {
+		blocknr = k >> 8;
+		if (j == 0) {
+			/* Find, get and unlock new data block */
+			bp = __pmfs_find_data_block(sb, pi, blocknr);
+			p = pmfs_get_block(sb, bp); 
+			pmfs_memunlock_block(sb, p);
+		}
+		p[j].block_low = cpu_to_le64(i->block_low);
+		p[j].block_high = cpu_to_le64(i->block_high);
+		j++;
+
+		if (j == 256) {
+			j = 0;
+			/* Lock the data block */
+			pmfs_memlock_block(sb, p);
+			pmfs_flush_buffer(p, 4096, false);
+		}
+		
+		k++;
+	}
+	
+	/* Lock the block */	
+	if (j) {
+		pmfs_flush_buffer(p, j << 4, false);
+		pmfs_memlock_block(sb, p);	
+	}	
+
+	/* 
+	 * save the total allocated blocknode mappings 
+	 * in super block
+	 */
+	super = pmfs_get_super(sb);
+	pmfs_add_logentry(sb, trans, &super->s_wtime,
+			PMFS_FAST_MOUNT_FIELD_SIZE, LE_DATA);
+
+	pmfs_memunlock_range(sb, &super->s_wtime, PMFS_FAST_MOUNT_FIELD_SIZE);
+
+	super->s_wtime = cpu_to_le32(get_seconds());
+	super->s_num_blocknode_allocated = 
+			cpu_to_le64(sbi->num_blocknode_allocated);
+	super->s_num_free_blocks = cpu_to_le64(sbi->num_free_blocks);
+	super->s_inodes_count = cpu_to_le32(sbi->s_inodes_count);
+	super->s_free_inodes_count = cpu_to_le32(sbi->s_free_inodes_count);
+	super->s_inodes_used_count = cpu_to_le32(sbi->s_inodes_used_count);
+	super->s_free_inode_hint = cpu_to_le32(sbi->s_free_inode_hint);
+
+	pmfs_memlock_range(sb, &super->s_wtime, PMFS_FAST_MOUNT_FIELD_SIZE);
+	/* commit the transaction */
+	pmfs_commit_transaction(sb, trans);
+}
+
+static void pmfs_inode_crawl_recursive(struct super_block *sb,
+                                        unsigned long block, u32 height,
+					u32 btype)
+{
+	u64 *node;
+	unsigned int i;
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	
+	if (height == 0) {
+		/* This is the data block */
+		if (btype == cpu_to_le16(PMFS_BLOCK_TYPE_4K)) {
+			set_bit(block >> PAGE_SHIFT, sbi->bitmap_4k);
+		} else if (btype == cpu_to_le16(PMFS_BLOCK_TYPE_2M)) {
+			set_bit(block >> PAGE_SHIFT_2M, sbi->bitmap_2M);
+		} else {
+			set_bit(block >> PAGE_SHIFT_1G, sbi->bitmap_1G);
+			
+		}
+		return;
+	}
+
+	node = pmfs_get_block(sb, block);
+	set_bit(block >> PAGE_SHIFT, sbi->bitmap_4k);
+	for (i = 0; i < (1 << META_BLK_SHIFT); i++) {
+		if (node[i] == 0)
+			continue;
+		pmfs_inode_crawl_recursive(sb, 
+			le64_to_cpu(node[i]), height - 1, btype);
+	}
+}
+
+static inline void pmfs_inode_crawl(struct super_block *sb,
+	struct pmfs_inode *pi)
+{
+	if (pi->root == 0)
+		return;
+	pmfs_inode_crawl_recursive(sb, le64_to_cpu(pi->root), pi->height,
+		pi->i_blk_type);
+}
+
+static void pmfs_inode_table_crawl_recursive(struct super_block *sb,
+			unsigned long block, u32 height, u32 btype)
+{
+	u64 *node;
+	unsigned int i;
+	struct pmfs_inode *pi;
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	
+	node = pmfs_get_block(sb, block);
+
+	if (height == 0) {
+		unsigned int inodes_per_block = INODES_PER_BLOCK(btype);
+		if (likely(btype == PMFS_BLOCK_TYPE_2M))
+			set_bit(block >> PAGE_SHIFT_2M, sbi->bitmap_2M);
+		else
+			set_bit(block >> PAGE_SHIFT, sbi->bitmap_4k);
+
+		sbi->s_inodes_count += inodes_per_block;
+		for (i = 0; i < inodes_per_block; i++) {
+			pi = (struct pmfs_inode *)((void *)node +
+                                                        PMFS_INODE_SIZE * i);
+			if (le16_to_cpu(pi->i_links_count) == 0 &&
+                        	(le16_to_cpu(pi->i_mode) == 0 ||
+                         	le32_to_cpu(pi->i_dtime))) {
+					/* Empty inode */
+					continue;
+			}
+			sbi->s_inodes_used_count++;
+			pmfs_inode_crawl(sb, pi);
+		}
+		return; 
+	}
+
+	set_bit(block >> PAGE_SHIFT , sbi->bitmap_4k);
+	for (i = 0; i < (1 << META_BLK_SHIFT); i++) {
+		if (node[i] == 0)
+			continue;
+		pmfs_inode_table_crawl_recursive(sb, 
+			le64_to_cpu(node[i]), height - 1, btype);
+	}
+}
+
+static int pmfs_alloc_insert_blocknode_map(struct super_block *sb,
+	unsigned long low, unsigned long high)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	struct list_head *head = &(sbi->block_inuse_head);
+	struct pmfs_blocknode *i, *next_i;
+	struct pmfs_blocknode *free_blocknode = NULL;
+	unsigned long num_blocks = 0;
+	struct pmfs_blocknode *curr_node;
+	int errval = 0;
+	bool found = 0;
+	unsigned long next_block_low;
+	unsigned long new_block_low;
+	unsigned long new_block_high;
+
+	//num_blocks = pmfs_get_numblocks(btype);
+
+	new_block_low = low;
+	new_block_high = high;
+	num_blocks = high - low + 1;
+
+	/* Traverse each blocknode entry */
+	list_for_each_entry(i, head, link) {
+		if (i->link.next == head) {
+			next_i = NULL;
+			next_block_low = sbi->block_end;
+		} else {
+			next_i = list_entry(i->link.next, typeof(*i), link);
+			next_block_low = next_i->block_low;
+		}
+
+
+		if (new_block_high >= next_block_low) {
+			/* Does not fit - skip to next blocknode */
+			continue;
+		}
+
+		if ((new_block_low == (i->block_high + 1)) &&
+			(new_block_high == (next_block_low - 1)))
+		{
+			/* Fill the gap completely */
+			if (next_i) {
+				i->block_high = next_i->block_high;
+				list_del(&next_i->link);
+				free_blocknode = next_i;
+			} else {
+				i->block_high = new_block_high;
+			}
+			found = 1;
+			break;
+		}
+
+		if ((new_block_low == (i->block_high + 1)) &&
+			(new_block_high < (next_block_low - 1))) {
+			/* Aligns to left */
+			i->block_high = new_block_high;
+			found = 1;
+			break;
+		}
+
+		if ((new_block_low > (i->block_high + 1)) &&
+			(new_block_high == (next_block_low - 1))) {
+			/* Aligns to right */
+			if (next_i) {
+				/* right node exist */
+				next_i->block_low = new_block_low;
+			} else {
+				/* right node does NOT exist */
+				curr_node = pmfs_alloc_blocknode(sb);
+				PMFS_ASSERT(curr_node);
+				if (curr_node == NULL) {
+					errval = -ENOSPC;
+					break;
+				}
+				curr_node->block_low = new_block_low;
+				curr_node->block_high = new_block_high;
+				list_add(&curr_node->link, &i->link);
+			}
+			found = 1;
+			break;
+		}
+
+		if ((new_block_low > (i->block_high + 1)) &&
+			(new_block_high < (next_block_low - 1))) {
+			/* Aligns somewhere in the middle */
+			curr_node = pmfs_alloc_blocknode(sb);
+			PMFS_ASSERT(curr_node);
+			if (curr_node == NULL) {
+				errval = -ENOSPC;
+				break;
+			}
+			curr_node->block_low = new_block_low;
+			curr_node->block_high = new_block_high;
+			list_add(&curr_node->link, &i->link);
+			found = 1;
+			break;
+		}
+	}
+	
+	if (found == 1) {
+		sbi->num_free_blocks -= num_blocks;
+	}	
+
+	if (free_blocknode)
+		pmfs_free_blocknode(sb, free_blocknode);
+
+	if (found == 0) {
+		return -ENOSPC;
+	}
+
+
+	return errval;
+}
+
+static int __pmfs_build_blocknode_map(struct super_block *sb,
+	unsigned long *bitmap, unsigned long bsize, unsigned long scale)
+{
+	unsigned long next = 1;
+	unsigned long low = 0;
+
+	while (1) {
+		next = find_next_bit(bitmap, bsize, next);
+		if (next == bsize)
+			break;
+		low = next;
+		next = find_next_zero_bit(bitmap, bsize, next);
+		if (pmfs_alloc_insert_blocknode_map(sb, low << scale,
+				(next << scale) - 1)) {
+			printk("PMFS: Error could not insert 0x%lx-0x%lx\n",
+				low << scale, (next << scale) - 1);
+		}
+		if (next == bsize)
+			break;
+	}
+	return 0;
+}
+
+static int pmfs_build_blocknode_map(struct super_block *sb)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	
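+	/* the scale argument converts a bitmap bit index into a 4KB block number */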
+	__pmfs_build_blocknode_map(sb, sbi->bitmap_4k, sbi->bitmap_4k_size * 8, PAGE_SHIFT - 12);
+	__pmfs_build_blocknode_map(sb, sbi->bitmap_2M, sbi->bitmap_2M_size * 8, PAGE_SHIFT_2M - 12);
+	__pmfs_build_blocknode_map(sb, sbi->bitmap_1G, sbi->bitmap_1G_size * 8, PAGE_SHIFT_1G - 12);
+
+	return 0;
+}
+
+int pmfs_setup_blocknode_map(struct super_block *sb)
+{
+	struct pmfs_super_block *super = pmfs_get_super(sb);
+	struct pmfs_inode *pi = pmfs_get_inode_table(sb);
+	pmfs_journal_t *journal = pmfs_get_journal(sb);
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	unsigned long initsize = le64_to_cpu(super->s_size);
+	bool value = false;
+
+	mutex_init(&sbi->inode_table_mutex);
+	sbi->block_start = (unsigned long)0;
+	sbi->block_end = ((unsigned long)(initsize) >> PAGE_SHIFT);
+	
+	value = pmfs_can_skip_full_scan(sb);
+	if (value) {
+		pmfs_dbg_verbose("PMFS: Skipping full scan of inodes...\n");
+		return 0;
+	}
+
+	sbi->bitmap_4k_size = (initsize >> (PAGE_SHIFT + 0x3)) + 1;
+	sbi->bitmap_2M_size = (initsize >> (PAGE_SHIFT_2M + 0x3)) + 1;
+	sbi->bitmap_1G_size = (initsize >> (PAGE_SHIFT_1G + 0x3)) + 1;
+
+	/* Alloc memory to hold the block alloc bitmap */
+	sbi->bitmap_4k = kzalloc(sbi->bitmap_4k_size, GFP_KERNEL);
+	sbi->bitmap_2M = kzalloc(sbi->bitmap_2M_size, GFP_KERNEL);
+	sbi->bitmap_1G = kzalloc(sbi->bitmap_1G_size, GFP_KERNEL);
+	if (!sbi->bitmap_4k || !sbi->bitmap_2M || !sbi->bitmap_1G) {
+		goto skip;
+	}
+	
+	/* Clearing the datablock inode */
+	pmfs_clear_datablock_inode(sb);
+
+	pmfs_inode_table_crawl_recursive(sb, le64_to_cpu(pi->root), pi->height,
+		pi->i_blk_type);
+
+	/* Reserving two inodes - inode 0 and the datablock inode */
+	sbi->s_free_inodes_count = sbi->s_inodes_count -
+		(sbi->s_inodes_used_count + 2);
+	
+	/* set the block 0 as this is used */
+	sbi->s_free_inode_hint = PMFS_FREE_INODE_HINT_START;
+
+	/* initialize num_free_blocks to the total number of 4K blocks */
+	sbi->num_free_blocks = ((unsigned long)(initsize) >> PAGE_SHIFT);
+	pmfs_init_blockmap(sb, le64_to_cpu(journal->base) + sbi->jsize);
+
+	pmfs_build_blocknode_map(sb);
+
+skip:
+	
+	kfree(sbi->bitmap_4k);
+	kfree(sbi->bitmap_2M);
+	kfree(sbi->bitmap_1G);
+
+	return 0;
+}
diff --git a/fs/pmfs/dir.c b/fs/pmfs/dir.c
new file mode 100644
index 0000000..b3ddb3c
--- /dev/null
+++ b/fs/pmfs/dir.c
@@ -0,0 +1,310 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * File operations for directories.
+ *
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.stornelli at gmail.com>
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include "pmfs.h"
+
+/*
+ *	Parent is locked.
+ */
+
+#define DT2IF(dt) (((dt) << 12) & S_IFMT)
+#define IF2DT(sif) (((sif) & S_IFMT) >> 12)
+
+static int pmfs_add_dirent_to_buf(pmfs_transaction_t *trans,
+	struct dentry *dentry, struct inode *inode,
+	struct pmfs_direntry *de, u8 *blk_base,  struct pmfs_inode *pidir)
+{
+	struct inode *dir = dentry->d_parent->d_inode;
+	const char *name = dentry->d_name.name;
+	int namelen = dentry->d_name.len;
+	unsigned short reclen;
+	int nlen, rlen;
+	char *top;
+
+	reclen = PMFS_DIR_REC_LEN(namelen);
+	if (!de) {
+		de = (struct pmfs_direntry *)blk_base;
+		top = blk_base + dir->i_sb->s_blocksize - reclen;
+		while ((char *)de <= top) {
+#if 0
+			if (!pmfs_check_dir_entry("pmfs_add_dirent_to_buf",
+			    dir, de, blk_base, offset))
+				return -EIO;
+			if (pmfs_match(namelen, name, de))
+				return -EEXIST;
+#endif
+			rlen = le16_to_cpu(de->de_len);
+			if (de->ino) {
+				nlen = PMFS_DIR_REC_LEN(de->name_len);
+				if ((rlen - nlen) >= reclen)
+					break;
+			} else if (rlen >= reclen)
+				break;
+			de = (struct pmfs_direntry *)((char *)de + rlen);
+		}
+		if ((char *)de > top)
+			return -ENOSPC;
+	}
+	rlen = le16_to_cpu(de->de_len);
+
+	if (de->ino) {
+		struct pmfs_direntry *de1;
+		pmfs_add_logentry(dir->i_sb, trans, &de->de_len,
+			sizeof(de->de_len), LE_DATA);
+		nlen = PMFS_DIR_REC_LEN(de->name_len);
+		de1 = (struct pmfs_direntry *)((char *)de + nlen);
+		pmfs_memunlock_block(dir->i_sb, blk_base);
+		de1->de_len = cpu_to_le16(rlen - nlen);
+		de->de_len = cpu_to_le16(nlen);
+		pmfs_memlock_block(dir->i_sb, blk_base);
+		de = de1;
+	} else {
+		pmfs_add_logentry(dir->i_sb, trans, &de->ino,
+			sizeof(de->ino), LE_DATA);
+	}
+	pmfs_memunlock_block(dir->i_sb, blk_base);
+	/*de->file_type = 0;*/
+	if (inode) {
+		de->ino = cpu_to_le64(inode->i_ino);
+		/*de->file_type = IF2DT(inode->i_mode); */
+	} else {
+		de->ino = 0;
+	}
+	de->name_len = namelen;
+	memcpy(de->name, name, namelen);
+	pmfs_memlock_block(dir->i_sb, blk_base);
+	pmfs_flush_buffer(de, reclen, false);
+	/*
+	 * XXX shouldn't update any times until successful
+	 * completion of syscall, but too many callers depend
+	 * on this.
+	 */
+	dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+	/*dir->i_version++; */
+
+	pmfs_memunlock_inode(dir->i_sb, pidir);
+	pidir->i_mtime = cpu_to_le32(dir->i_mtime.tv_sec);
+	pidir->i_ctime = cpu_to_le32(dir->i_ctime.tv_sec);
+	pmfs_memlock_inode(dir->i_sb, pidir);
+	return 0;
+}
+
+/* adds a directory entry pointing to the inode. assumes the inode has
+ * already been logged for consistency
+ */
+int pmfs_add_entry(pmfs_transaction_t *trans, struct dentry *dentry,
+		struct inode *inode)
+{
+	struct inode *dir = dentry->d_parent->d_inode;
+	struct super_block *sb = dir->i_sb;
+	int retval = -EINVAL;
+	unsigned long block, blocks;
+	struct pmfs_direntry *de;
+	char *blk_base;
+	struct pmfs_inode *pidir;
+
+	if (!dentry->d_name.len)
+		return -EINVAL;
+
+	pidir = pmfs_get_inode(sb, dir->i_ino);
+	pmfs_add_logentry(sb, trans, pidir, MAX_DATA_PER_LENTRY, LE_DATA);
+
+	blocks = dir->i_size >> sb->s_blocksize_bits;
+	for (block = 0; block < blocks; block++) {
+		blk_base =
+			pmfs_get_block(sb, pmfs_find_data_block(dir, block));
+		if (!blk_base) {
+			retval = -EIO;
+			goto out;
+		}
+		retval = pmfs_add_dirent_to_buf(trans, dentry, inode,
+				NULL, blk_base, pidir);
+		if (retval != -ENOSPC)
+			goto out;
+	}
+	retval = pmfs_alloc_blocks(trans, dir, blocks, 1, false);
+	if (retval)
+		goto out;
+
+	dir->i_size += dir->i_sb->s_blocksize;
+	pmfs_update_isize(dir, pidir);
+
+	blk_base = pmfs_get_block(sb, pmfs_find_data_block(dir, blocks));
+	if (!blk_base) {
+		retval = -ENOSPC;
+		goto out;
+	}
+	/* No need to log the changes to this de because it's a new block */
+	de = (struct pmfs_direntry *)blk_base;
+	pmfs_memunlock_block(sb, blk_base);
+	de->ino = 0;
+	de->de_len = cpu_to_le16(sb->s_blocksize);
+	pmfs_memlock_block(sb, blk_base);
+	/* Since this is a new block, no need to log changes to this block */
+	retval = pmfs_add_dirent_to_buf(NULL, dentry, inode, de, blk_base,
+		pidir);
+out:
+	return retval;
+}
+
+/* removes a directory entry pointing to the inode. assumes the inode has
+ * already been logged for consistency
+ */
+int pmfs_remove_entry(pmfs_transaction_t *trans, struct dentry *de,
+		struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	struct inode *dir = de->d_parent->d_inode;
+	struct pmfs_inode *pidir;
+	struct qstr *entry = &de->d_name;
+	struct pmfs_direntry *res_entry, *prev_entry;
+	int retval = -EINVAL;
+	unsigned long blocks, block;
+	char *blk_base = NULL;
+
+	if (!de->d_name.len)
+		return -EINVAL;
+
+	blocks = dir->i_size >> sb->s_blocksize_bits;
+
+	for (block = 0; block < blocks; block++) {
+		blk_base =
+			pmfs_get_block(sb, pmfs_find_data_block(dir, block));
+		if (!blk_base)
+			goto out;
+		if (pmfs_search_dirblock(blk_base, dir, entry,
+					  block << sb->s_blocksize_bits,
+					  &res_entry, &prev_entry) == 1)
+			break;
+	}
+
+	if (block == blocks)
+		goto out;
+	if (prev_entry) {
+		pmfs_add_logentry(sb, trans, &prev_entry->de_len,
+				sizeof(prev_entry->de_len), LE_DATA);
+		pmfs_memunlock_block(sb, blk_base);
+		prev_entry->de_len =
+			cpu_to_le16(le16_to_cpu(prev_entry->de_len) +
+				    le16_to_cpu(res_entry->de_len));
+		pmfs_memlock_block(sb, blk_base);
+	} else {
+		pmfs_add_logentry(sb, trans, &res_entry->ino,
+				sizeof(res_entry->ino), LE_DATA);
+		pmfs_memunlock_block(sb, blk_base);
+		res_entry->ino = 0;
+		pmfs_memlock_block(sb, blk_base);
+	}
+	/*dir->i_version++; */
+	dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
+
+	pidir = pmfs_get_inode(sb, dir->i_ino);
+	pmfs_add_logentry(sb, trans, pidir, MAX_DATA_PER_LENTRY, LE_DATA);
+
+	pmfs_memunlock_inode(sb, pidir);
+	pidir->i_mtime = cpu_to_le32(dir->i_mtime.tv_sec);
+	pidir->i_ctime = cpu_to_le32(dir->i_ctime.tv_sec);
+	pmfs_memlock_inode(sb, pidir);
+	retval = 0;
+out:
+	return retval;
+}
+
+static int pmfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct super_block *sb = inode->i_sb;
+	struct pmfs_inode *pi;
+	char *blk_base;
+	int ret = 0, stored;
+	int error = 0;
+	unsigned long offset;
+	struct pmfs_direntry *de;
+	ino_t ino;
+
+	stored = 0;
+	offset = filp->f_pos & (sb->s_blocksize - 1);
+	while (!error && !stored && filp->f_pos < inode->i_size) {
+		unsigned long blk = filp->f_pos >> sb->s_blocksize_bits;
+
+		blk_base =
+			pmfs_get_block(sb, pmfs_find_data_block(inode, blk));
+		if (!blk_base) {
+			pmfs_dbg("directory %lu contains a hole at offset %lld\n",
+				inode->i_ino, filp->f_pos);
+			filp->f_pos += sb->s_blocksize - offset;
+			continue;
+		}
+#if 0
+		if (filp->f_version != inode->i_version) {
+			for (i = 0; i < sb->s_blocksize && i < offset; ) {
+				de = (struct pmfs_direntry *)(blk_base + i);
+				/* It's too expensive to do a full
+				 * dirent test each time round this
+				 * loop, but we do have to test at
+				 * least that it is non-zero.  A
+				 * failure will be detected in the
+				 * dirent test below. */
+				if (le16_to_cpu(de->de_len) <
+				    PMFS_DIR_REC_LEN(1))
+					break;
+				i += le16_to_cpu(de->de_len);
+			}
+			offset = i;
+			filp->f_pos =
+				(filp->f_pos & ~(sb->s_blocksize - 1)) | offset;
+			filp->f_version = inode->i_version;
+		}
+#endif
+		while (!error && filp->f_pos < inode->i_size
+		       && offset < sb->s_blocksize) {
+			de = (struct pmfs_direntry *)(blk_base + offset);
+			if (!pmfs_check_dir_entry("pmfs_readdir", inode, de,
+						   blk_base, offset)) {
+				/* On error, skip the f_pos to the next block. */
+				filp->f_pos = (filp->f_pos | (sb->s_blocksize - 1)) + 1;
+				ret = stored;
+				goto out;
+			}
+			offset += le16_to_cpu(de->de_len);
+			if (de->ino) {
+				ino = le64_to_cpu(de->ino);
+				pi = pmfs_get_inode(sb, ino);
+				error = filldir(dirent, de->name, de->name_len,
+						filp->f_pos, ino,
+						IF2DT(le16_to_cpu(pi->i_mode)));
+				if (error)
+					break;
+				stored++;
+			}
+			filp->f_pos += le16_to_cpu(de->de_len);
+		}
+		offset = 0;
+	}
+out:
+	return ret;
+}
+
+const struct file_operations pmfs_dir_operations = {
+	.read		= generic_read_dir,
+	.readdir	= pmfs_readdir,
+	.fsync		= noop_fsync,
+	.unlocked_ioctl = pmfs_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= pmfs_compat_ioctl,
+#endif
+};
diff --git a/fs/pmfs/file.c b/fs/pmfs/file.c
new file mode 100644
index 0000000..e6c3812
--- /dev/null
+++ b/fs/pmfs/file.c
@@ -0,0 +1,333 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * File operations for files.
+ *
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.stornelli at gmail.com>
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/uio.h>
+#include <linux/mm.h>
+#include <linux/uaccess.h>
+#include <linux/falloc.h>
+#include <asm/mman.h>
+#include "pmfs.h"
+#include "xip.h"
+
+static inline int pmfs_can_set_blocksize_hint(struct pmfs_inode *pi,
+					       loff_t new_size)
+{
+	/* Currently, we don't deallocate data blocks till the file is deleted.
+	 * So no changing blocksize hints once allocation is done. */
+	if (le64_to_cpu(pi->root))
+		return 0;
+	return 1;
+}
+
+int pmfs_set_blocksize_hint(struct super_block *sb, struct pmfs_inode *pi,
+		loff_t new_size)
+{
+	unsigned short block_type;
+
+	if (!pmfs_can_set_blocksize_hint(pi, new_size))
+		return 0;
+
+	if (new_size >= 0x40000000) {   /* 1G */
+		block_type = PMFS_BLOCK_TYPE_1G;
+		goto hint_set;
+	}
+
+	if (new_size >= 0x200000) {     /* 2M */
+		block_type = PMFS_BLOCK_TYPE_2M;
+		goto hint_set;
+	}
+
+	/* defaulting to 4K */
+	block_type = PMFS_BLOCK_TYPE_4K;
+
+hint_set:
+	pmfs_dbg_verbose(
+		"Hint: new_size 0x%llx, i_size 0x%llx, root 0x%llx\n",
+		new_size, pi->i_size, le64_to_cpu(pi->root));
+	pmfs_dbg_verbose("Setting the hint to 0x%x\n", block_type);
+	pmfs_memunlock_inode(sb, pi);
+	pi->i_blk_type = block_type;
+	pmfs_memlock_inode(sb, pi);
+	return 0;
+}
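+
+/* Worked example of the thresholds above: a file fallocate()d or truncated
+ * to 5GB gets PMFS_BLOCK_TYPE_1G, a 16MB file gets PMFS_BLOCK_TYPE_2M, and
+ * anything below 2MB keeps the 4K default. The hint can only change while
+ * pi->root is still 0, i.e. before the first data block is allocated.
+ */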
+
+static long pmfs_fallocate(struct file *file, int mode, loff_t offset,
+			    loff_t len)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct super_block *sb = inode->i_sb;
+	long ret = 0;
+	unsigned long blocknr, blockoff;
+	int num_blocks, blocksize_mask;
+	struct pmfs_inode *pi;
+	pmfs_transaction_t *trans;
+	loff_t new_size;
+
+	/* We only support the FALLOC_FL_KEEP_SIZE mode */
+	if (mode & ~FALLOC_FL_KEEP_SIZE)
+		return -EOPNOTSUPP;
+
+	if (S_ISDIR(inode->i_mode))
+		return -ENODEV;
+
+	mutex_lock(&inode->i_mutex);
+
+	new_size = len + offset;
+	if (!(mode & FALLOC_FL_KEEP_SIZE) && new_size > inode->i_size) {
+		ret = inode_newsize_ok(inode, new_size);
+		if (ret)
+			goto out;
+	}
+
+	pi = pmfs_get_inode(sb, inode->i_ino);
+	if (!pi) {
+		ret = -EACCES;
+		goto out;
+	}
+	trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES +
+			MAX_METABLOCK_LENTRIES);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out;
+	}
+	pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA);
+
+	/* Set the block size hint */
+	pmfs_set_blocksize_hint(sb, pi, new_size);
+
+	blocksize_mask = sb->s_blocksize - 1;
+	blocknr = offset >> sb->s_blocksize_bits;
+	blockoff = offset & blocksize_mask;
+	num_blocks = (blockoff + len + blocksize_mask) >> sb->s_blocksize_bits;
+	ret = pmfs_alloc_blocks(trans, inode, blocknr, num_blocks, true);
+
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
+
+	pmfs_memunlock_inode(sb, pi);
+	if (ret || (mode & FALLOC_FL_KEEP_SIZE)) {
+		pi->i_flags |= cpu_to_le32(PMFS_EOFBLOCKS_FL);
+	}
+
+	if (!(mode & FALLOC_FL_KEEP_SIZE) && new_size > inode->i_size) {
+		inode->i_size = new_size;
+		pi->i_size = cpu_to_le64(inode->i_size);
+	}
+	pi->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
+	pi->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
+	pmfs_memlock_inode(sb, pi);
+
+	pmfs_commit_transaction(sb, trans);
+
+out:
+	mutex_unlock(&inode->i_mutex);
+	return ret;
+}
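+
+/* Usage note: only preallocation is supported here. From user space both
+ * fallocate(fd, 0, off, len) and fallocate(fd, FALLOC_FL_KEEP_SIZE, off,
+ * len) allocate backing blocks; any other mode (hole punching, etc.)
+ * returns -EOPNOTSUPP, and directories are rejected with -ENODEV. With
+ * KEEP_SIZE the i_size is left unchanged and PMFS_EOFBLOCKS_FL records
+ * that blocks exist beyond end of file.
+ */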
+
+loff_t pmfs_llseek(struct file *file, loff_t offset, int origin)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	int retval;
+
+	if (origin != SEEK_DATA && origin != SEEK_HOLE)
+		return generic_file_llseek(file, offset, origin);
+
+	mutex_lock(&inode->i_mutex);
+	switch (origin) {
+	case SEEK_DATA:
+		retval = pmfs_find_region(inode, &offset, 0);
+		if (retval) {
+			mutex_unlock(&inode->i_mutex);
+			return retval;
+		}
+		break;
+	case SEEK_HOLE:
+		retval = pmfs_find_region(inode, &offset, 1);
+		if (retval) {
+			mutex_unlock(&inode->i_mutex);
+			return retval;
+		}
+		break;
+	}
+
+	if ((offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) ||
+	    offset > inode->i_sb->s_maxbytes) {
+		mutex_unlock(&inode->i_mutex);
+		return -EINVAL;
+	}
+
+	if (offset != file->f_pos) {
+		file->f_pos = offset;
+		file->f_version = 0;
+	}
+
+	mutex_unlock(&inode->i_mutex);
+	return offset;
+}
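+
+/* From user space the above maps to the usual sparse-file idiom; a minimal
+ * sketch (plain C, error handling omitted):
+ *
+ *	off_t data = lseek(fd, 0, SEEK_DATA);    // first byte of data
+ *	off_t hole = lseek(fd, data, SEEK_HOLE); // end of that data extent
+ *
+ * pmfs_find_region() returns -ENXIO when the offset is at or beyond i_size,
+ * or when data is requested and only holes remain, matching the lseek(2)
+ * semantics for SEEK_DATA/SEEK_HOLE. All other origins fall back to
+ * generic_file_llseek().
+ */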
+
+/* This function is called by both msync() and fsync().
+ * TODO: Check if we can avoid calling pmfs_flush_buffer() for fsync. We use
+ * movnti to write data to files, so we may want to avoid doing unnecessary
+ * pmfs_flush_buffer() on fsync() */
+int pmfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+{
+	/* Sync from start to end[inclusive] */
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	loff_t isize;
+	int error;
+
+	end += 1; /* end is inclusive; convert it to an exclusive bound */
+
+	isize = i_size_read(inode);
+
+	if ((unsigned long)end > (unsigned long)isize)
+		end = isize;
+	if (!isize || (start >= end))
+	{
+		pmfs_dbg_verbose("[%s:%d] : (ERR) isize(%llx), start(%llx),"
+			" end(%llx)\n", __func__, __LINE__, isize, start, end);
+		return -ENODATA;
+	}
+
+	/* Align start and end to cacheline boundaries */
+	start = start & CACHELINE_MASK;
+	end = CACHELINE_ALIGN(end);
+	do {
+		void *xip_mem;
+		pgoff_t pgoff;
+		loff_t offset;
+		unsigned long xip_pfn, nr_flush_bytes;
+
+		pgoff = start >> PAGE_CACHE_SHIFT;
+		offset = start & ~PAGE_CACHE_MASK;
+
+		nr_flush_bytes = PAGE_CACHE_SIZE - offset;
+		if (nr_flush_bytes > (end - start))
+			nr_flush_bytes = end - start;
+
+		error = mapping->a_ops->get_xip_mem(mapping, pgoff, 0,
+		&xip_mem, &xip_pfn);
+
+		if (unlikely(error)) {
+			/* sparse files could have such holes */
+			pmfs_dbg_verbose("[%s:%d] : start(%llx), end(%llx),"
+			" pgoff(%lx)\n", __func__, __LINE__, start, end, pgoff);
+		} else {
+			/* flush the range */
+			pmfs_flush_buffer(xip_mem+offset, nr_flush_bytes, 0);
+		}
+
+		start += nr_flush_bytes;
+	} while (start < end);
+
+	PERSISTENT_MARK();
+	PERSISTENT_BARRIER();
+	return 0;
+}
+
+/* This callback is called when a file is closed */
+static int pmfs_flush(struct file *file, fl_owner_t id)
+{
+	int ret = 0;
+	/* if the file was opened for writing, make it persistent.
+	 * TODO: Should we be smarter and check whether the file was modified? */
+	if (file->f_mode & FMODE_WRITE) {
+		PERSISTENT_MARK();
+		PERSISTENT_BARRIER();
+	}
+
+	return ret;
+}
+
+extern unsigned long arch_get_unmapped_area_sz(struct file *file,
+	unsigned long addr0, unsigned long len, unsigned long align_size,
+	unsigned long pgoff, unsigned long flags);
+
+unsigned long
+pmfs_get_unmapped_area(struct file *file, unsigned long addr,
+			unsigned long len, unsigned long pgoff,
+			unsigned long flags)
+{
+	unsigned long align_size;
+	struct vm_area_struct *vma;
+	struct mm_struct *mm = current->mm;
+	struct inode *inode = file->f_mapping->host;
+	struct pmfs_inode *pi = pmfs_get_inode(inode->i_sb, inode->i_ino);
+
+	if (len > TASK_SIZE)
+		return -ENOMEM;
+
+	if (pi->i_blk_type == PMFS_BLOCK_TYPE_1G)
+		align_size = PUD_SIZE;
+	else if (pi->i_blk_type == PMFS_BLOCK_TYPE_2M)
+		align_size = PMD_SIZE;
+	else
+		align_size = PAGE_SIZE;
+
+	if (flags & MAP_FIXED) {
+		/* FIXME: We could use 4K mappings as fallback. */
+		if (len & (align_size - 1))
+			return -EINVAL;
+		if (addr & (align_size - 1))
+			return -EINVAL;
+		return addr;
+	}
+
+	if (addr) {
+		addr = ALIGN(addr, align_size);
+		vma = find_vma(mm, addr);
+		if (TASK_SIZE - len >= addr &&
+		    (!vma || addr + len <= vma->vm_start))
+			return addr;
+	}
+
+	return arch_get_unmapped_area_sz(file, addr, len, align_size, pgoff,
+					 flags);
+#if 0
+	if (mm->get_unmapped_area == arch_get_unmapped_area)
+		return pmfs_get_unmapped_area_bottomup(file, addr, len,
+							align_size, pgoff,
+							flags);
+	else
+		return pmfs_get_unmapped_area_topdown(file, addr, len,
+						       align_size, pgoff,
+						       flags);
+#endif
+}
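+
+/* Example: for a file whose inode hint is PMFS_BLOCK_TYPE_2M, align_size is
+ * PMD_SIZE (2MB on x86-64), so a MAP_FIXED request must supply an address
+ * and a length that are both multiples of 2MB or it fails with -EINVAL;
+ * non-fixed requests are simply rounded up to the next 2MB boundary before
+ * the vma search.
+ */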
+
+const struct file_operations pmfs_xip_file_operations = {
+	.llseek			= pmfs_llseek,
+	.read			= pmfs_xip_file_read,
+	.write			= pmfs_xip_file_write,
+	.mmap			= pmfs_xip_file_mmap,
+	.open			= generic_file_open,
+	.fsync			= pmfs_fsync,
+	.flush			= pmfs_flush,
+	.get_unmapped_area	= pmfs_get_unmapped_area,
+	.unlocked_ioctl		= pmfs_ioctl,
+	.fallocate		= pmfs_fallocate,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl		= pmfs_compat_ioctl,
+#endif
+};
+
+const struct inode_operations pmfs_file_inode_operations = {
+	.setattr	= pmfs_notify_change,
+	.getattr	= pmfs_getattr,
+	.get_acl	= NULL,
+};
diff --git a/fs/pmfs/inode.c b/fs/pmfs/inode.c
new file mode 100644
index 0000000..08bdc87
--- /dev/null
+++ b/fs/pmfs/inode.c
@@ -0,0 +1,1568 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Inode methods (allocate/free/read/write).
+ *
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.stornelli at gmail.com>
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/highuid.h>
+#include <linux/module.h>
+#include <linux/mpage.h>
+#include <linux/backing-dev.h>
+#include <linux/types.h>
+#include "pmfs.h"
+#include "xip.h"
+
+struct backing_dev_info pmfs_backing_dev_info __read_mostly = {
+	.ra_pages	= 0,                          /* No readahead */
+	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
+};
+
+unsigned int blk_type_to_shift[PMFS_BLOCK_TYPE_MAX] = {12, 21, 30};
+uint32_t blk_type_to_size[PMFS_BLOCK_TYPE_MAX] = {0x1000, 0x200000, 0x40000000};
+
+/*
+ * allocate a data block for inode and return its absolute blocknr.
+ * Zeroes out the block if zero set. Increments inode->i_blocks.
+ */
+static int pmfs_new_data_block(struct super_block *sb, struct pmfs_inode *pi,
+		unsigned long *blocknr, int zero)
+{
+	unsigned int data_bits = blk_type_to_shift[pi->i_blk_type];
+
+	int errval = pmfs_new_block(sb, blocknr, pi->i_blk_type, zero);
+
+	if (!errval) {
+		pmfs_memunlock_inode(sb, pi);
+		le64_add_cpu(&pi->i_blocks,
+			(1 << (data_bits - sb->s_blocksize_bits)));
+		pmfs_memlock_inode(sb, pi);
+	}
+
+	return errval;
+}
+
+/*
+ * find the offset to the block represented by the given inode's file
+ * relative block number.
+ */
+u64 pmfs_find_data_block(struct inode *inode, unsigned long file_blocknr)
+{
+	struct super_block *sb = inode->i_sb;
+	struct pmfs_inode *pi = pmfs_get_inode(sb, inode->i_ino);
+	u32 blk_shift;
+	unsigned long blk_offset, blocknr = file_blocknr;
+	unsigned int data_bits = blk_type_to_shift[pi->i_blk_type];
+	unsigned int meta_bits = META_BLK_SHIFT;
+	u64 bp;
+
+	/* convert the 4K blocks into the actual blocks the inode is using */
+	blk_shift = data_bits - sb->s_blocksize_bits;
+	blk_offset = file_blocknr & ((1 << blk_shift) - 1);
+	blocknr = file_blocknr >> blk_shift;
+
+	if (blocknr >= (1UL << (pi->height * meta_bits)))
+		return 0;
+
+	bp = __pmfs_find_data_block(sb, pi, blocknr);
+	pmfs_dbg1("find_data_block %lx, %x %llx blk_p %p blk_shift %x"
+		" blk_offset %lx\n", file_blocknr, pi->height, bp,
+		pmfs_get_block(sb, bp), blk_shift, blk_offset);
+
+	if (bp == 0)
+		return 0;
+	return bp + (blk_offset << sb->s_blocksize_bits);
+}
+
+/* recursive_find_region: recursively search the btree to find hole or data
+ * in the specified range
+ * Input:
+ * block: points to the root of the b-tree
+ * height: height of the btree
+ * first_blocknr: first block in the specified range
+ * last_blocknr: last block in the specified range
+ * data_found: out-parameter; set if any data blocks were found
+ * hole_found: out-parameter; set if a hole was found
+ * hole: whether we are looking for a hole or for data
+ */
+static int recursive_find_region(struct super_block *sb, unsigned long block,
+	u32 height, unsigned long first_blocknr, unsigned long last_blocknr,
+	int *data_found, int *hole_found, int hole)
+{
+	unsigned int meta_bits = META_BLK_SHIFT;
+	u64 *node;
+	unsigned long first_blk, last_blk, node_bits, blocks = 0;
+	unsigned int first_index, last_index, i;
+
+	node_bits = (height - 1) * meta_bits;
+
+	first_index = first_blocknr >> node_bits;
+	last_index = last_blocknr >> node_bits;
+
+	node = pmfs_get_block(sb, le64_to_cpu(block));
+
+	for (i = first_index; i <= last_index; i++) {
+		if (height == 1 || node[i] == 0) {
+			if (node[i]) {
+				*data_found = 1;
+				if (!hole)
+					goto done;
+			} else {
+				*hole_found = 1;
+			}
+
+			if (!*hole_found || !hole)
+				blocks += (1UL << node_bits);
+		} else {
+			first_blk = (i == first_index) ?  (first_blocknr &
+				((1 << node_bits) - 1)) : 0;
+
+			last_blk = (i == last_index) ? (last_blocknr &
+				((1 << node_bits) - 1)) : (1 << node_bits) - 1;
+
+			blocks += recursive_find_region(sb, node[i], height - 1,
+				first_blk, last_blk, data_found, hole_found,
+				hole);
+			if (!hole && *data_found)
+				goto done;
+			/* cond_resched(); */
+		}
+	}
+done:
+	return blocks;
+}
+
+/*
+ * find the file offset for SEEK_DATA/SEEK_HOLE
+ */
+unsigned long pmfs_find_region(struct inode *inode, loff_t *offset, int hole)
+{
+	struct super_block *sb = inode->i_sb;
+	struct pmfs_inode *pi = pmfs_get_inode(sb, inode->i_ino);
+	unsigned int data_bits = blk_type_to_shift[pi->i_blk_type];
+	unsigned long first_blocknr, last_blocknr;
+	unsigned long blocks = 0, offset_in_block;
+	int data_found = 0, hole_found = 0;
+
+	if (*offset >= inode->i_size)
+		return -ENXIO;
+
+	if (!inode->i_blocks || !pi->root) {
+		if (hole)
+			return inode->i_size;
+		else
+			return -ENXIO;
+	}
+
+	offset_in_block = *offset & ((1UL << data_bits) - 1);
+
+	if (pi->height == 0) {
+		data_found = 1;
+		goto out;
+	}
+
+	first_blocknr = *offset >> data_bits;
+	last_blocknr = inode->i_size >> data_bits;
+
+	pmfs_dbg_verbose("find_region offset %llx, first_blocknr %lx,"
+		" last_blocknr %lx hole %d\n",
+		  *offset, first_blocknr, last_blocknr, hole);
+
+	blocks = recursive_find_region(inode->i_sb, pi->root, pi->height,
+		first_blocknr, last_blocknr, &data_found, &hole_found, hole);
+
+out:
+	/* Searching data but only hole found till the end */
+	if (!hole && !data_found && hole_found)
+		return -ENXIO;
+
+	if (data_found && !hole_found) {
+		/* Searching for data and we are already inside a data region */
+		if (hole)
+			/* Searching hole but only data found, go to the end */
+			*offset = inode->i_size;
+		return 0;
+	}
+
+	/* Searching for a hole, a hole was found, and we start inside a hole */
+	if (hole && hole_found && !blocks) {
+		/* we found data after it */
+		if (!data_found)
+			/* last hole */
+			*offset = inode->i_size;
+		return 0;
+	}
+
+	if (offset_in_block) {
+		blocks--;
+		*offset += (blocks << data_bits) +
+			   ((1 << data_bits) - offset_in_block);
+	} else {
+		*offset += blocks << data_bits;
+	}
+
+	return 0;
+}
+
+/* examine the meta-data block's slots outside the range [start_idx, end_idx]
+ * for any non-null pointers. If one is found return false, else return true.
+ * Required to determine whether a meta-data block holds no live pointers
+ * (other than the range being freed) and hence can itself be freed.
+ */
+static inline bool is_empty_meta_block(u64 *node, unsigned int start_idx,
+	unsigned int end_idx)
+{
+	int i, last_idx = (1 << META_BLK_SHIFT) - 1;
+	for (i = 0; i < start_idx; i++)
+		if (unlikely(node[i]))
+			return false;
+	for (i = end_idx + 1; i <= last_idx; i++)
+		if (unlikely(node[i]))
+			return false;
+	return true;
+}
+
+/* recursive_truncate_blocks: recursively deallocate a range of blocks from
+ * first_blocknr to last_blocknr in the inode's btree.
+ * Input:
+ * block: points to the root of the b-tree from which blocks are to be freed
+ * height: height of the btree
+ * btype: block type (4K/2M/1G) used for the inode's data blocks
+ * first_blocknr: first block in the specified range
+ * last_blocknr: last block in the specified range
+ * meta_empty: out-parameter; set if this meta-data block became empty
+ */
+static int recursive_truncate_blocks(struct super_block *sb, u64 block,
+	u32 height, u32 btype, unsigned long first_blocknr,
+	unsigned long last_blocknr, bool *meta_empty)
+{
+	unsigned long blocknr, first_blk, last_blk;
+	unsigned int node_bits, first_index, last_index, i;
+	u64 *node;
+	unsigned int freed = 0, bzero;
+	int start, end;
+	bool mpty, all_range_freed = true;
+
+	node = pmfs_get_block(sb, le64_to_cpu(block));
+
+	node_bits = (height - 1) * META_BLK_SHIFT;
+
+	start = first_index = first_blocknr >> node_bits;
+	end = last_index = last_blocknr >> node_bits;
+
+	if (height == 1) {
+		for (i = first_index; i <= last_index; i++) {
+			if (unlikely(!node[i]))
+				continue;
+			/* Freeing the data block */
+			blocknr = pmfs_get_blocknr(sb, le64_to_cpu(node[i]),
+				    btype);
+			pmfs_free_block(sb, blocknr, btype);
+			freed++;
+		}
+	} else {
+		for (i = first_index; i <= last_index; i++) {
+			if (unlikely(!node[i]))
+				continue;
+			first_blk = (i == first_index) ? (first_blocknr &
+				((1 << node_bits) - 1)) : 0;
+
+			last_blk = (i == last_index) ? (last_blocknr &
+				((1 << node_bits) - 1)) : (1 << node_bits) - 1;
+
+			freed += recursive_truncate_blocks(sb, node[i],
+				height - 1, btype, first_blk, last_blk, &mpty);
+			/* cond_resched(); */
+			if (mpty) {
+				/* Freeing the meta-data block */
+				blocknr = pmfs_get_blocknr(sb, le64_to_cpu(
+					    node[i]), PMFS_BLOCK_TYPE_4K);
+				pmfs_free_block(sb, blocknr,
+					PMFS_BLOCK_TYPE_4K);
+			} else {
+				if (i == first_index)
+				    start++;
+				else if (i == last_index)
+				    end--;
+				all_range_freed = false;
+			}
+		}
+	}
+	if (all_range_freed &&
+		is_empty_meta_block(node, first_index, last_index)) {
+		*meta_empty = true;
+	} else {
+		/* Zero-out the freed range if the meta-block is not empty */
+		if (start <= end) {
+			bzero = (end - start + 1) * sizeof(u64);
+			pmfs_memunlock_block(sb, node);
+			memset(&node[start], 0, bzero);
+			pmfs_memlock_block(sb, node);
+			pmfs_flush_buffer(&node[start], bzero, false);
+		}
+		*meta_empty = false;
+	}
+	return freed;
+}
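+
+/* Note on the zeroing above: when a meta-data block survives a partial
+ * truncate (some of its slots still point to live blocks), the slots in the
+ * freed [start, end] index range are memset to 0 and flushed so they never
+ * point at blocks that have already been returned to the free list; a fully
+ * emptied meta block is instead reported to the caller via *meta_empty and
+ * freed one level up.
+ */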
+
+unsigned int pmfs_free_inode_subtree(struct super_block *sb,
+		u64 root, u32 height, u32 btype, loff_t end)
+{
+	unsigned long first_blocknr, last_blocknr;
+	unsigned int freed;
+	unsigned int data_bits = blk_type_to_shift[btype];
+	bool mpty;
+
+	if (!root)
+		return 0;
+
+	if (height == 0) {
+		first_blocknr = pmfs_get_blocknr(sb, le64_to_cpu(root),
+			btype);
+		pmfs_free_block(sb, first_blocknr, btype);
+		freed = 1;
+	} else {
+		first_blocknr = 0;
+		last_blocknr = (end - 1) >> data_bits;
+
+		freed = recursive_truncate_blocks(sb, root, height, btype,
+			first_blocknr, last_blocknr, &mpty);
+		BUG_ON(!mpty);
+		first_blocknr = pmfs_get_blocknr(sb, le64_to_cpu(root),
+			PMFS_BLOCK_TYPE_4K);
+		pmfs_free_block(sb, first_blocknr, PMFS_BLOCK_TYPE_4K);
+	}
+	return freed;
+}
+
+static void pmfs_decrease_btree_height(struct super_block *sb,
+	struct pmfs_inode *pi, unsigned long newsize, u64 newroot)
+{
+	unsigned int height = pi->height, new_height = 0;
+	unsigned long blocknr, last_blocknr;
+	u64 *root;
+	char b[8];
+
+	if (pi->i_blocks == 0 || newsize == 0) {
+		/* root must be NULL */
+		BUG_ON(newroot != 0);
+		goto update_root_and_height;
+	}
+
+	last_blocknr = ((newsize + pmfs_inode_blk_size(pi) - 1) >>
+		pmfs_inode_blk_shift(pi)) - 1;
+	while (last_blocknr > 0) {
+		last_blocknr = last_blocknr >> META_BLK_SHIFT;
+		new_height++;
+	}
+	if (height == new_height)
+		return;
+	pmfs_dbg_verbose("reducing tree height %x->%x\n", height, new_height);
+	while (height > new_height) {
+		/* freeing the meta block */
+		root = pmfs_get_block(sb, le64_to_cpu(newroot));
+		blocknr = pmfs_get_blocknr(sb, le64_to_cpu(newroot),
+			PMFS_BLOCK_TYPE_4K);
+		newroot = root[0];
+		pmfs_free_block(sb, blocknr, PMFS_BLOCK_TYPE_4K);
+		height--;
+	}
+update_root_and_height:
+	/* pi->height and pi->root need to be atomically updated. use
+	 * cmpxchg16 here. The following is dependent on a specific layout of
+	 * inode fields */
+	*(u64 *)b = *(u64 *)pi;
+	/* pi->height is at offset 2 from pi */
+	b[2] = (u8)new_height;
+	/* TODO: the following function assumes cmpxchg16b instruction writes
+	 * 16 bytes atomically. Confirm if it is really true. */
+	cmpxchg_double_local((u64 *)pi, &pi->root, *(u64 *)pi, pi->root,
+		*(u64 *)b, newroot);
+	return;
+}
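+
+/* Illustration of the atomic update above: the first 8 bytes of the
+ * pmfs_inode (which hold the height field at byte offset 2) and the root
+ * field, which the code assumes immediately follows them, are swapped as a
+ * single 16-byte unit by cmpxchg_double_local() (cmpxchg16b on x86-64), so
+ * a crash can never observe a new height paired with the old root pointer
+ * or vice versa.
+ */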
+
+static unsigned long pmfs_inode_count_iblocks_recursive(struct super_block *sb,
+		unsigned long block, u32 height)
+{
+	u64 *node;
+	unsigned int i;
+	unsigned long i_blocks = 0;
+
+	if (height == 0)
+		return 1;
+	node = pmfs_get_block(sb, block);
+	for (i = 0; i < (1 << META_BLK_SHIFT); i++) {
+		if (node[i] == 0)
+			continue;
+		i_blocks += pmfs_inode_count_iblocks_recursive(sb,
+			le64_to_cpu(node[i]), height - 1);
+	}
+	return i_blocks;
+}
+
+static inline unsigned long pmfs_inode_count_iblocks(struct super_block *sb,
+	struct pmfs_inode *pi, u64 root)
+{
+	unsigned long iblocks;
+	if (root == 0)
+		return 0;
+	iblocks = pmfs_inode_count_iblocks_recursive(sb, le64_to_cpu(root),
+						pi->height);
+	return (iblocks << (pmfs_inode_blk_shift(pi) - sb->s_blocksize_bits));
+}
+
+/*
+ * Free data blocks from inode in the range start <=> end
+ */
+static void __pmfs_truncate_blocks(struct inode *inode, loff_t start,
+				    loff_t end)
+{
+	struct super_block *sb = inode->i_sb;
+	struct pmfs_inode *pi = pmfs_get_inode(sb, inode->i_ino);
+	unsigned long first_blocknr, last_blocknr;
+	u64 root;
+	unsigned int freed = 0;
+	unsigned int data_bits = blk_type_to_shift[pi->i_blk_type];
+	unsigned int meta_bits = META_BLK_SHIFT;
+	bool mpty;
+
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
+
+	if (!pi->root)
+		goto end_truncate_blocks;
+
+	pmfs_dbg_verbose("truncate: pi %p iblocks %llx %llx %llx %x %llx\n", pi,
+			 pi->i_blocks, start, end, pi->height, pi->i_size);
+
+	first_blocknr = (start + (1UL << data_bits) - 1) >> data_bits;
+
+	if (pi->i_flags & cpu_to_le32(PMFS_EOFBLOCKS_FL)) {
+		last_blocknr = (1UL << (pi->height * meta_bits)) - 1;
+	} else {
+		if (end == 0)
+			goto end_truncate_blocks;
+		last_blocknr = (end - 1) >> data_bits;
+	}
+
+	if (first_blocknr > last_blocknr)
+		goto end_truncate_blocks;
+	root = pi->root;
+
+	if (pi->height == 0) {
+		first_blocknr = pmfs_get_blocknr(sb, le64_to_cpu(root),
+			pi->i_blk_type);
+		pmfs_free_block(sb, first_blocknr, pi->i_blk_type);
+		root = 0;
+		freed = 1;
+	} else {
+		freed = recursive_truncate_blocks(sb, root, pi->height,
+			pi->i_blk_type, first_blocknr, last_blocknr, &mpty);
+		if (mpty) {
+			first_blocknr = pmfs_get_blocknr(sb, le64_to_cpu(root),
+				PMFS_BLOCK_TYPE_4K);
+			pmfs_free_block(sb, first_blocknr, PMFS_BLOCK_TYPE_4K);
+			root = 0;
+		}
+	}
+	/* if we are called during mount, a power/system failure happened earlier.
+	 * Don't trust inode->i_blocks; recalculate it by rescanning the inode */
+	if (pmfs_is_mounting(sb))
+		inode->i_blocks = pmfs_inode_count_iblocks(sb, pi, root);
+	else
+		inode->i_blocks -= (freed * (1 << (data_bits -
+				sb->s_blocksize_bits)));
+
+	pmfs_memunlock_inode(sb, pi);
+	pi->i_blocks = cpu_to_le64(inode->i_blocks);
+	pi->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
+	pi->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
+	pmfs_decrease_btree_height(sb, pi, start, root);
+	/* Check for the flag EOFBLOCKS is still valid after the set size */
+	check_eof_blocks(sb, pi, inode->i_size);
+	pmfs_memlock_inode(sb, pi);
+	/* now flush the inode's first cacheline which was modified */
+	pmfs_flush_buffer(pi, 1, false);
+	return;
+end_truncate_blocks:
+	/* we still need to update ctime and mtime */
+	pmfs_memunlock_inode(sb, pi);
+	pi->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
+	pi->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
+	pmfs_memlock_inode(sb, pi);
+	pmfs_flush_buffer(pi, 1, false);
+	return;
+}
+
+
+static int pmfs_increase_btree_height(struct super_block *sb,
+		struct pmfs_inode *pi, u32 new_height)
+{
+	u32 height = pi->height;
+	u64 *root, prev_root = pi->root;
+	unsigned long blocknr;
+	int errval = 0;
+
+	pmfs_dbg_verbose("increasing tree height %x:%x\n", height, new_height);
+	while (height < new_height) {
+		/* allocate the meta block */
+		errval = pmfs_new_block(sb, &blocknr, PMFS_BLOCK_TYPE_4K, 1);
+		if (errval) {
+			pmfs_err(sb, "failed to increase btree height\n");
+			break;
+		}
+		blocknr = pmfs_get_block_off(sb, blocknr, PMFS_BLOCK_TYPE_4K);
+		root = pmfs_get_block(sb, blocknr);
+		pmfs_memunlock_block(sb, root);
+		root[0] = prev_root;
+		pmfs_memlock_block(sb, root);
+		pmfs_flush_buffer(root, sizeof(*root), false);
+		prev_root = cpu_to_le64(blocknr);
+		height++;
+	}
+	pmfs_memunlock_inode(sb, pi);
+	pi->root = prev_root;
+	pi->height = height;
+	pmfs_memlock_inode(sb, pi);
+	return errval;
+}
+
+/* recursive_alloc_blocks: recursively allocate a range of blocks from
+ * first_blocknr to last_blocknr in the inode's btree.
+ * Input:
+ * block: points to the root of the b-tree where the blocks need to be allocated
+ * height: height of the btree
+ * first_blocknr: first block in the specified range
+ * last_blocknr: last_blocknr in the specified range
+ * zero: whether to zero-out the allocated block(s)
+ */
+static int recursive_alloc_blocks(pmfs_transaction_t *trans,
+	struct super_block *sb, struct pmfs_inode *pi, u64 block, u32 height,
+	unsigned long first_blocknr, unsigned long last_blocknr, bool new_node,
+	bool zero)
+{
+	int i, errval;
+	unsigned int meta_bits = META_BLK_SHIFT, node_bits;
+	u64 *node;
+	bool journal_saved = 0;
+	unsigned long blocknr, first_blk, last_blk;
+	unsigned int first_index, last_index;
+	unsigned int flush_bytes;
+
+	node = pmfs_get_block(sb, le64_to_cpu(block));
+
+	node_bits = (height - 1) * meta_bits;
+
+	first_index = first_blocknr >> node_bits;
+	last_index = last_blocknr >> node_bits;
+
+	for (i = first_index; i <= last_index; i++) {
+		if (height == 1) {
+			if (node[i] == 0) {
+				errval = pmfs_new_data_block(sb, pi, &blocknr,
+							zero);
+				if (errval) {
+					pmfs_dbg_verbose("alloc data blk failed"
+						" %d\n", errval);
+					/* For later recovery in truncate... */
+					pmfs_memunlock_inode(sb, pi);
+					pi->i_flags |= cpu_to_le32(
+							PMFS_EOFBLOCKS_FL);
+					pmfs_memlock_inode(sb, pi);
+					return errval;
+				}
+				/* save the meta-data into the journal before
+				 * modifying */
+				if (new_node == 0 && journal_saved == 0) {
+					int le_size = (last_index - i + 1) << 3;
+					pmfs_add_logentry(sb, trans, &node[i],
+						le_size, LE_DATA);
+					journal_saved = 1;
+				}
+				pmfs_memunlock_block(sb, node);
+				node[i] = cpu_to_le64(pmfs_get_block_off(sb,
+						blocknr, pi->i_blk_type));
+				pmfs_memlock_block(sb, node);
+			}
+		} else {
+			if (node[i] == 0) {
+				/* allocate the meta block */
+				errval = pmfs_new_block(sb, &blocknr,
+						PMFS_BLOCK_TYPE_4K, 1);
+				if (errval) {
+					pmfs_dbg_verbose("alloc meta blk"
+						" failed\n");
+					goto fail;
+				}
+				/* save the meta-data into the journal before
+				 * modifying */
+				if (new_node == 0 && journal_saved == 0) {
+					int le_size = (last_index - i + 1) << 3;
+					pmfs_add_logentry(sb, trans, &node[i],
+						le_size, LE_DATA);
+					journal_saved = 1;
+				}
+				pmfs_memunlock_block(sb, node);
+				node[i] = cpu_to_le64(pmfs_get_block_off(sb,
+					    blocknr, PMFS_BLOCK_TYPE_4K));
+				pmfs_memlock_block(sb, node);
+				new_node = 1;
+			}
+
+			first_blk = (i == first_index) ? (first_blocknr &
+				((1 << node_bits) - 1)) : 0;
+
+			last_blk = (i == last_index) ? (last_blocknr &
+				((1 << node_bits) - 1)) : (1 << node_bits) - 1;
+
+			errval = recursive_alloc_blocks(trans, sb, pi, node[i],
+			height - 1, first_blk, last_blk, new_node, zero);
+			if (errval < 0)
+				goto fail;
+		}
+	}
+	if (new_node || trans == NULL) {
+		/* if the changes were not logged, flush the cachelines we may
+		 * have modified */
+		flush_bytes = (last_index - first_index + 1) * sizeof(node[0]);
+		pmfs_flush_buffer(&node[first_index], flush_bytes, false);
+	}
+	errval = 0;
+fail:
+	return errval;
+}
+
+int __pmfs_alloc_blocks(pmfs_transaction_t *trans, struct super_block *sb,
+	struct pmfs_inode *pi, unsigned long file_blocknr, unsigned int num,
+	bool zero)
+{
+	int errval;
+	unsigned long max_blocks;
+	unsigned int height;
+	unsigned int data_bits = blk_type_to_shift[pi->i_blk_type];
+	unsigned int blk_shift, meta_bits = META_BLK_SHIFT;
+	unsigned long blocknr, first_blocknr, last_blocknr, total_blocks;
+	/* convert the 4K blocks into the actual blocks the inode is using */
+	blk_shift = data_bits - sb->s_blocksize_bits;
+
+	first_blocknr = file_blocknr >> blk_shift;
+	last_blocknr = (file_blocknr + num - 1) >> blk_shift;
+
+	pmfs_dbg_verbose("alloc_blocks height %d file_blocknr %lx num %x, "
+		   "first blocknr 0x%lx, last_blocknr 0x%lx\n",
+		   pi->height, file_blocknr, num, first_blocknr, last_blocknr);
+
+	height = pi->height;
+
+	blk_shift = height * meta_bits;
+
+	max_blocks = 0x1UL << blk_shift;
+
+	if (last_blocknr > max_blocks - 1) {
+		/* B-tree height increases as a result of this allocation */
+		total_blocks = last_blocknr >> blk_shift;
+		while (total_blocks > 0) {
+			total_blocks = total_blocks >> meta_bits;
+			height++;
+		}
+		if (height > 3) {
+			pmfs_dbg("[%s:%d] Max file size. Cant grow the file\n",
+				__func__, __LINE__);
+			errval = -ENOSPC;
+			goto fail;
+		}
+	}
+
+	if (!pi->root) {
+		if (height == 0) {
+			u64 root;
+			errval = pmfs_new_data_block(sb, pi, &blocknr, zero);
+			if (errval) {
+				pmfs_dbg_verbose("[%s:%d] failed: alloc data"
+					" block\n", __func__, __LINE__);
+				goto fail;
+			}
+			root = cpu_to_le64(pmfs_get_block_off(sb, blocknr,
+					   pi->i_blk_type));
+			/* TODO: use RTM for in-place atomic update */
+			pmfs_memunlock_inode(sb, pi);
+			pi->root = root;
+			pi->height = height;
+			pmfs_memlock_inode(sb, pi);
+		} else {
+			errval = pmfs_increase_btree_height(sb, pi, height);
+			if (errval) {
+				pmfs_dbg_verbose("[%s:%d] failed: inc btree"
+					" height\n", __func__, __LINE__);
+				goto fail;
+			}
+			errval = recursive_alloc_blocks(trans, sb, pi, pi->root,
+			pi->height, first_blocknr, last_blocknr, 1, zero);
+			if (errval < 0)
+				goto fail;
+		}
+	} else {
+		/* Go forward only if the height of the tree is non-zero. */
+		if (height == 0)
+			return 0;
+
+		if (height > pi->height) {
+			errval = pmfs_increase_btree_height(sb, pi, height);
+			if (errval) {
+				pmfs_dbg_verbose("Err: inc height %x:%x tot %lx"
+					"\n", pi->height, height, total_blocks);
+				goto fail;
+			}
+		}
+		errval = recursive_alloc_blocks(trans, sb, pi, pi->root, height,
+				first_blocknr, last_blocknr, 0, zero);
+		if (errval < 0)
+			goto fail;
+	}
+	return 0;
+fail:
+	return errval;
+}
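+
+/* The height > 3 limit above bounds the maximum file size. Assuming
+ * META_BLK_SHIFT is 9 (512 8-byte pointers per 4K meta block), a
+ * three-level tree addresses 2^27 leaf blocks, i.e. roughly 512GB with 4K
+ * data blocks, 256TB with 2M blocks and 128PB with 1G blocks.
+ */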
+
+/*
+ * Allocate num data blocks for inode, starting at given file-relative
+ * block number.
+ */
+inline int pmfs_alloc_blocks(pmfs_transaction_t *trans, struct inode *inode,
+		unsigned long file_blocknr, unsigned int num, bool zero)
+{
+	struct super_block *sb = inode->i_sb;
+	struct pmfs_inode *pi = pmfs_get_inode(sb, inode->i_ino);
+	int errval;
+
+	errval = __pmfs_alloc_blocks(trans, sb, pi, file_blocknr, num, zero);
+	inode->i_blocks = le64_to_cpu(pi->i_blocks);
+
+	return errval;
+}
+
+/* Initialize the inode table. The pmfs_inode struct corresponding to the
+ * inode table has already been zeroed out */
+int pmfs_init_inode_table(struct super_block *sb)
+{
+	struct pmfs_inode *pi = pmfs_get_inode_table(sb);
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	unsigned long num_blocks = 0, init_inode_table_size;
+	int errval;
+
+	if (sbi->num_inodes == 0) {
+		/* initial inode table size was not specified. */
+		if (sbi->initsize >= PMFS_LARGE_INODE_TABLE_THREASHOLD)
+			init_inode_table_size = PMFS_LARGE_INODE_TABLE_SIZE;
+		else
+			init_inode_table_size = PMFS_DEF_BLOCK_SIZE_4K;
+	} else {
+		init_inode_table_size = sbi->num_inodes << PMFS_INODE_BITS;
+	}
+
+	pmfs_memunlock_inode(sb, pi);
+	pi->i_mode = 0;
+	pi->i_uid = 0;
+	pi->i_gid = 0;
+	pi->i_links_count = cpu_to_le16(1);
+	pi->i_flags = 0;
+	pi->height = 0;
+	pi->i_dtime = 0;
+	if (init_inode_table_size >= PMFS_LARGE_INODE_TABLE_SIZE)
+		pi->i_blk_type = PMFS_BLOCK_TYPE_2M;
+	else
+		pi->i_blk_type = PMFS_BLOCK_TYPE_4K;
+
+	num_blocks = (init_inode_table_size + pmfs_inode_blk_size(pi) - 1) >>
+				pmfs_inode_blk_shift(pi);
+
+	pi->i_size = cpu_to_le64(num_blocks << pmfs_inode_blk_shift(pi));
+	/* pmfs_sync_inode(pi); */
+	pmfs_memlock_inode(sb, pi);
+
+	sbi->s_inodes_count = num_blocks <<
+			(pmfs_inode_blk_shift(pi) - PMFS_INODE_BITS);
+	/* calculate num_blocks in terms of 4k blocksize */
+	num_blocks = num_blocks << (pmfs_inode_blk_shift(pi) -
+					sb->s_blocksize_bits);
+	errval = __pmfs_alloc_blocks(NULL, sb, pi, 0, num_blocks, true);
+
+	if (errval != 0) {
+		pmfs_err(sb, "Err: initializing the Inode Table: %d\n", errval);
+		return errval;
+	}
+
+	/* inode 0 is considered invalid and hence never used */
+	sbi->s_free_inodes_count =
+		(sbi->s_inodes_count - PMFS_FREE_INODE_HINT_START);
+	sbi->s_free_inode_hint = (PMFS_FREE_INODE_HINT_START);
+
+	return 0;
+}
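+
+/* Inode numbers are byte offsets into this table in units of the on-media
+ * inode size: ino == slot << PMFS_INODE_BITS. Assuming 128-byte inodes
+ * (PMFS_INODE_BITS == 7), slot 3 is ino 0x180 and a 4K table block holds
+ * 32 inodes. Slot 0 is reserved as invalid (see
+ * PMFS_FREE_INODE_HINT_START).
+ */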
+
+static int pmfs_read_inode(struct inode *inode, struct pmfs_inode *pi)
+{
+	int ret = -EIO;
+
+#if 0
+	if (pmfs_calc_checksum((u8 *)pi, PMFS_INODE_SIZE)) {
+		pmfs_err(inode->i_sb, "checksum error in inode %lx\n",
+			  (u64)inode->i_ino);
+		goto bad_inode;
+	}
+#endif
+
+	inode->i_mode = le16_to_cpu(pi->i_mode);
+	inode->i_uid = le32_to_cpu(pi->i_uid);
+	inode->i_gid = le32_to_cpu(pi->i_gid);
+	set_nlink(inode, le16_to_cpu(pi->i_links_count));
+	inode->i_size = le64_to_cpu(pi->i_size);
+	inode->i_atime.tv_sec = le32_to_cpu(pi->i_atime);
+	inode->i_ctime.tv_sec = le32_to_cpu(pi->i_ctime);
+	inode->i_mtime.tv_sec = le32_to_cpu(pi->i_mtime);
+	inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec =
+					 inode->i_ctime.tv_nsec = 0;
+	inode->i_generation = le32_to_cpu(pi->i_generation);
+	pmfs_set_inode_flags(inode, pi);
+
+	/* check if the inode is active. */
+	if (inode->i_nlink == 0 &&
+	   (inode->i_mode == 0 || le32_to_cpu(pi->i_dtime))) {
+		/* this inode is deleted */
+		ret = -ESTALE;
+		goto bad_inode;
+	}
+
+	inode->i_blocks = le64_to_cpu(pi->i_blocks);
+	inode->i_mapping->a_ops = &pmfs_aops_xip;
+	inode->i_mapping->backing_dev_info = &pmfs_backing_dev_info;
+
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFREG:
+		inode->i_op = &pmfs_file_inode_operations;
+		inode->i_fop = &pmfs_xip_file_operations;
+		break;
+	case S_IFDIR:
+		inode->i_op = &pmfs_dir_inode_operations;
+		inode->i_fop = &pmfs_dir_operations;
+		break;
+	case S_IFLNK:
+		inode->i_op = &pmfs_symlink_inode_operations;
+		break;
+	default:
+		inode->i_size = 0;
+		inode->i_op = &pmfs_special_inode_operations;
+		init_special_inode(inode, inode->i_mode,
+				   le32_to_cpu(pi->dev.rdev));
+		break;
+	}
+
+	return 0;
+
+bad_inode:
+	make_bad_inode(inode);
+	return ret;
+}
+
+static void pmfs_update_inode(struct inode *inode, struct pmfs_inode *pi)
+{
+	pmfs_memunlock_inode(inode->i_sb, pi);
+	pi->i_mode = cpu_to_le16(inode->i_mode);
+	pi->i_uid = cpu_to_le32(inode->i_uid);
+	pi->i_gid = cpu_to_le32(inode->i_gid);
+	pi->i_links_count = cpu_to_le16(inode->i_nlink);
+	pi->i_size = cpu_to_le64(inode->i_size);
+	pi->i_blocks = cpu_to_le64(inode->i_blocks);
+	pi->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
+	pi->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
+	pi->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
+	pi->i_generation = cpu_to_le32(inode->i_generation);
+	pmfs_get_inode_flags(inode, pi);
+
+	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
+		pi->dev.rdev = cpu_to_le32(inode->i_rdev);
+
+	pmfs_memlock_inode(inode->i_sb, pi);
+
+	return;
+}
+
+/*
+ * NOTE! When we get the inode, we're the only people
+ * that have access to it, and as such there are no
+ * race conditions we have to worry about. The inode
+ * is not on the hash-lists, and it cannot be reached
+ * through the filesystem because the directory entry
+ * has been deleted earlier.
+ */
+static int pmfs_free_inode(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	struct pmfs_inode *pi;
+	unsigned long inode_nr;
+	pmfs_transaction_t *trans;
+	int err = 0;
+
+	mutex_lock(&PMFS_SB(sb)->inode_table_mutex);
+
+	pmfs_dbg_verbose("free_inode: %lx free_nodes %x tot nodes %x hint %x\n",
+		   inode->i_ino, sbi->s_free_inodes_count, sbi->s_inodes_count,
+		   sbi->s_free_inode_hint);
+	inode_nr = inode->i_ino >> PMFS_INODE_BITS;
+
+	pi = pmfs_get_inode(sb, inode->i_ino);
+
+	/* This transaction can be avoided if using RTM */
+	trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES);
+	if (IS_ERR(trans)) {
+		err = PTR_ERR(trans);
+		goto out;
+	}
+
+	pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA);
+
+	/* TODO: use RTM to write the below cacheline atomically */
+	pmfs_memunlock_inode(sb, pi);
+	pi->root = 0;
+	/* pi->i_links_count = 0;
+	pi->i_xattr = 0; */
+	pi->i_size = 0;
+	pi->i_dtime = cpu_to_le32(get_seconds());
+	pmfs_memlock_inode(sb, pi);
+
+	pmfs_commit_transaction(sb, trans);
+
+	/* increment s_free_inodes_count */
+	if (inode_nr < (sbi->s_free_inode_hint))
+		sbi->s_free_inode_hint = (inode_nr);
+
+	sbi->s_free_inodes_count += 1;
+
+	if ((sbi->s_free_inodes_count) ==
+	    (sbi->s_inodes_count) - PMFS_FREE_INODE_HINT_START) {
+		/* filesystem is empty */
+		pmfs_dbg_verbose("fs is empty!\n");
+		sbi->s_free_inode_hint = (PMFS_FREE_INODE_HINT_START);
+	}
+
+	pmfs_dbg_verbose("free_inode: free_nodes %x total_nodes %x hint %x\n",
+		   sbi->s_free_inodes_count, sbi->s_inodes_count,
+		   sbi->s_free_inode_hint);
+out:
+	mutex_unlock(&PMFS_SB(sb)->inode_table_mutex);
+	return err;
+}
+
+struct inode *pmfs_iget(struct super_block *sb, unsigned long ino)
+{
+	struct inode *inode;
+	struct pmfs_inode *pi;
+	int err;
+
+	inode = iget_locked(sb, ino);
+	if (unlikely(!inode))
+		return ERR_PTR(-ENOMEM);
+	if (!(inode->i_state & I_NEW))
+		return inode;
+
+	pi = pmfs_get_inode(sb, ino);
+	if (!pi) {
+		err = -EACCES;
+		goto fail;
+	}
+	err = pmfs_read_inode(inode, pi);
+	if (unlikely(err))
+		goto fail;
+	inode->i_ino = ino;
+
+	unlock_new_inode(inode);
+	return inode;
+fail:
+	iget_failed(inode);
+	return ERR_PTR(err);
+}
+
+void pmfs_evict_inode(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	struct pmfs_inode *pi = pmfs_get_inode(sb, inode->i_ino);
+	u64 root;
+	unsigned int height, btype;
+	int err = 0;
+
+	if (!inode->i_nlink && !is_bad_inode(inode)) {
+		if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+			S_ISLNK(inode->i_mode)))
+			goto out;
+		if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+			goto out;
+
+		root = pi->root;
+		height = pi->height;
+		btype = pi->i_blk_type;
+
+		/* first free the inode */
+		err = pmfs_free_inode(inode);
+		if (err)
+			goto out;
+		/* then free the blocks from the inode's b-tree */
+		pmfs_free_inode_subtree(sb, root, height, btype,
+			inode->i_size);
+		inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
+		inode->i_size = 0;
+	}
+out:
+	/* now it is safe to remove the inode from the truncate list */
+	pmfs_truncate_del(inode);
+	/* TODO: Since we don't use page-cache, do we really need the following
+	 * call? */
+	truncate_inode_pages(&inode->i_data, 0);
+
+	clear_inode(inode);
+}
+
+static int pmfs_increase_inode_table_size(struct super_block *sb)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	struct pmfs_inode *pi = pmfs_get_inode_table(sb);
+	pmfs_transaction_t *trans;
+	int errval;
+
+	/* 1 log entry for inode-table inode, 1 lentry for inode-table b-tree */
+	trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+
+	pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA);
+
+	errval = __pmfs_alloc_blocks(trans, sb, pi,
+			pi->i_size >> sb->s_blocksize_bits, 1, true);
+
+	if (errval == 0) {
+		u64 i_size = le64_to_cpu(pi->i_size);
+
+		sbi->s_free_inode_hint = i_size >> PMFS_INODE_BITS;
+		i_size += pmfs_inode_blk_size(pi);
+
+		pmfs_memunlock_inode(sb, pi);
+		pi->i_size = cpu_to_le64(i_size);
+		pmfs_memlock_inode(sb, pi);
+
+		sbi->s_free_inodes_count += INODES_PER_BLOCK(pi->i_blk_type);
+		sbi->s_inodes_count = i_size >> PMFS_INODE_BITS;
+	} else
+		pmfs_dbg_verbose("no space left to inc inode table!\n");
+	/* commit the transaction */
+	pmfs_commit_transaction(sb, trans);
+	return errval;
+}
+
+struct inode *pmfs_new_inode(pmfs_transaction_t *trans, struct inode *dir,
+		umode_t mode, const struct qstr *qstr)
+{
+	struct super_block *sb;
+	struct pmfs_sb_info *sbi;
+	struct inode *inode;
+	struct pmfs_inode *pi = NULL, *inode_table;
+	struct pmfs_inode *diri = NULL;
+	int i, errval;
+	u32 num_inodes, inodes_per_block;
+	ino_t ino = 0;
+
+	sb = dir->i_sb;
+	sbi = (struct pmfs_sb_info *)sb->s_fs_info;
+	inode = new_inode(sb);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	inode_init_owner(inode, dir, mode);
+	inode->i_blocks = inode->i_size = 0;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+
+	inode->i_generation = atomic_add_return(1, &sbi->next_generation);
+
+	inode_table = pmfs_get_inode_table(sb);
+
+	pmfs_dbg_verbose("inode: %p free_inodes %x total_inodes %x hint %x\n",
+		inode, sbi->s_free_inodes_count, sbi->s_inodes_count,
+		sbi->s_free_inode_hint);
+
+	diri = pmfs_get_inode(sb, dir->i_ino);
+	if (!diri)
+		return ERR_PTR(-EACCES);
+
+	mutex_lock(&sbi->inode_table_mutex);
+
+	/* find the oldest unused pmfs inode */
+	i = (sbi->s_free_inode_hint);
+	inodes_per_block = INODES_PER_BLOCK(inode_table->i_blk_type);
+retry:
+	num_inodes = (sbi->s_inodes_count);
+	while (i < num_inodes) {
+		u32 end_ino;
+		end_ino = i + (inodes_per_block - (i & (inodes_per_block - 1)));
+		ino = i <<  PMFS_INODE_BITS;
+		pi = pmfs_get_inode(sb, ino);
+		for (; i < end_ino; i++) {
+			/* check if the inode is active. */
+			if (le16_to_cpu(pi->i_links_count) == 0 &&
+			(le16_to_cpu(pi->i_mode) == 0 ||
+			 le32_to_cpu(pi->i_dtime)))
+				/* this inode is free */
+				break;
+			pi = (struct pmfs_inode *)((void *)pi +
+							PMFS_INODE_SIZE);
+		}
+		/* found a free inode */
+		if (i < end_ino)
+			break;
+	}
+	if (unlikely(i >= num_inodes)) {
+		errval = pmfs_increase_inode_table_size(sb);
+		if (errval == 0)
+			goto retry;
+		mutex_unlock(&PMFS_SB(sb)->inode_table_mutex);
+		pmfs_dbg("PMFS: could not find a free inode\n");
+		goto fail1;
+	}
+
+	ino = i << PMFS_INODE_BITS;
+	pmfs_dbg_verbose("allocating inode %lx\n", ino);
+
+	/* chosen inode is in ino */
+	inode->i_ino = ino;
+	pmfs_add_logentry(sb, trans, pi, sizeof(*pi), LE_DATA);
+
+	pmfs_memunlock_inode(sb, pi);
+	pi->i_blk_type = PMFS_DEFAULT_BLOCK_TYPE;
+	pi->i_flags = pmfs_mask_flags(mode, diri->i_flags);
+	pi->height = 0;
+	pi->i_dtime = 0;
+	pmfs_memlock_inode(sb, pi);
+
+	sbi->s_free_inodes_count -= 1;
+
+	if (i < (sbi->s_inodes_count) - 1)
+		sbi->s_free_inode_hint = (i + 1);
+	else
+		sbi->s_free_inode_hint = (PMFS_FREE_INODE_HINT_START);
+
+	mutex_unlock(&sbi->inode_table_mutex);
+
+	pmfs_update_inode(inode, pi);
+
+	pmfs_set_inode_flags(inode, pi);
+
+	if (insert_inode_locked(inode) < 0) {
+		pmfs_err(sb, "pmfs_new_inode failed ino %lx\n", inode->i_ino);
+		errval = -EINVAL;
+		goto fail1;
+	}
+
+	return inode;
+fail1:
+	make_bad_inode(inode);
+	iput(inode);
+	return ERR_PTR(errval);
+}
+
+inline void pmfs_update_nlink(struct inode *inode, struct pmfs_inode *pi)
+{
+	pmfs_memunlock_inode(inode->i_sb, pi);
+	pi->i_links_count = cpu_to_le16(inode->i_nlink);
+	pmfs_memlock_inode(inode->i_sb, pi);
+}
+
+inline void pmfs_update_isize(struct inode *inode, struct pmfs_inode *pi)
+{
+	pmfs_memunlock_inode(inode->i_sb, pi);
+	pi->i_size = cpu_to_le64(inode->i_size);
+	pmfs_memlock_inode(inode->i_sb, pi);
+}
+
+inline void pmfs_update_time(struct inode *inode, struct pmfs_inode *pi)
+{
+	pmfs_memunlock_inode(inode->i_sb, pi);
+	pi->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
+	pi->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
+	pmfs_memlock_inode(inode->i_sb, pi);
+}
+
+/* Check whether the VFS inode and the PMFS inode have gone out of sync */
+static bool pmfs_is_inode_dirty(struct inode *inode, struct pmfs_inode *pi)
+{
+	if (inode->i_ctime.tv_sec != le32_to_cpu(pi->i_ctime) ||
+		inode->i_mtime.tv_sec != le32_to_cpu(pi->i_mtime) ||
+		inode->i_size != le64_to_cpu(pi->i_size) ||
+		inode->i_mode != le16_to_cpu(pi->i_mode) ||
+		inode->i_uid != le32_to_cpu(pi->i_uid) ||
+		inode->i_gid != le32_to_cpu(pi->i_gid) ||
+		inode->i_nlink != le16_to_cpu(pi->i_links_count) ||
+		inode->i_blocks != le64_to_cpu(pi->i_blocks) ||
+		inode->i_atime.tv_sec != le32_to_cpu(pi->i_atime))
+		return true;
+	return false;
+}
+
+int pmfs_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	/* write_inode should never be called because we always keep our inodes
+	 * clean. So let us know if write_inode ever gets called. */
+	BUG();
+	return 0;
+}
+
+/*
+ * dirty_inode() is called from mark_inode_dirty_sync()
+ * usually dirty_inode should not be called because PMFS always keeps its inodes
+ * clean. Only exception is touch_atime which calls dirty_inode to update the
+ * i_atime field.
+ */
+void pmfs_dirty_inode(struct inode *inode, int flags)
+{
+	struct super_block *sb = inode->i_sb;
+	struct pmfs_inode *pi = pmfs_get_inode(sb, inode->i_ino);
+
+	/* only i_atime should have changed if at all.
+	 * we can do in-place atomic update */
+	pmfs_memunlock_inode(sb, pi);
+	pi->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
+	pmfs_memlock_inode(sb, pi);
+	pmfs_flush_buffer(&pi->i_atime, sizeof(pi->i_atime), true);
+
+	if (pmfs_is_inode_dirty(inode, pi))
+		BUG();
+}
+
+/*
+ * Called to zero out the partial block at the new end of file during a
+ * "resize" (truncate), so that stale data does not reappear if the file
+ * grows again later. Only a single 4K page is zeroed, even for 2M or 1G
+ * blocks.
+ */
+static void pmfs_block_truncate_page(struct inode *inode, loff_t newsize)
+{
+	struct super_block *sb = inode->i_sb;
+	unsigned long offset = newsize & (sb->s_blocksize - 1);
+	unsigned long blocknr, length;
+	u64 blockoff;
+	char *bp;
+
+	/* Block boundary or extending ? */
+	if (!offset || newsize > inode->i_size)
+		return;
+
+	length = sb->s_blocksize - offset;
+	blocknr = newsize >> sb->s_blocksize_bits;
+
+	blockoff = pmfs_find_data_block(inode, blocknr);
+
+	/* Hole ? */
+	if (!blockoff)
+		return;
+
+	bp = pmfs_get_block(sb, blockoff);
+	if (!bp)
+		return;
+	pmfs_memunlock_block(sb, bp);
+	memset(bp + offset, 0, length);
+	pmfs_memlock_block(sb, bp);
+	pmfs_flush_buffer(bp + offset, length, false);
+	return;
+}
+
+void pmfs_truncate_del(struct inode *inode)
+{
+	struct list_head *prev;
+	struct pmfs_inode_vfs *si = PMFS_I(inode);
+	struct super_block *sb = inode->i_sb;
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	struct pmfs_inode_truncate_item *head = pmfs_get_truncate_list_head(sb);
+	struct pmfs_inode_truncate_item *li;
+	unsigned long ino_next;
+
+	mutex_lock(&sbi->s_truncate_lock);
+	if (list_empty(&si->i_truncated))
+		goto out;
+	/* Make sure all truncate operations are persistent before removing the
+	 * inode from the truncate list */
+	PERSISTENT_MARK();
+
+	li = pmfs_get_truncate_item(sb, inode->i_ino);
+
+	ino_next = le64_to_cpu(li->i_next_truncate);
+	prev = si->i_truncated.prev;
+
+	list_del_init(&si->i_truncated);
+	PERSISTENT_BARRIER();
+
+	/* Atomically delete the inode from the truncate list */
+	if (prev == &sbi->s_truncate) {
+		pmfs_memunlock_range(sb, head, sizeof(*head));
+		head->i_next_truncate = cpu_to_le64(ino_next);
+		pmfs_memlock_range(sb, head, sizeof(*head));
+		pmfs_flush_buffer(&head->i_next_truncate,
+			sizeof(head->i_next_truncate), false);
+	} else {
+		struct inode *i_prv = &list_entry(prev,
+			struct pmfs_inode_vfs, i_truncated)->vfs_inode;
+		struct pmfs_inode_truncate_item *li_prv = 
+				pmfs_get_truncate_item(sb, i_prv->i_ino);
+		pmfs_memunlock_range(sb, li_prv, sizeof(*li_prv));
+		li_prv->i_next_truncate = ino_next;
+		pmfs_memlock_range(sb, li_prv, sizeof(*li_prv));
+		pmfs_flush_buffer(&li_prv->i_next_truncate,
+			sizeof(li_prv->i_next_truncate), false);
+	}
+	PERSISTENT_MARK();
+	PERSISTENT_BARRIER();
+out:
+	mutex_unlock(&sbi->s_truncate_lock);
+}
+
+/* PMFS maintains a so-called truncate list, which is a linked list of inodes
+ * which require further processing in case of a power failure. Currently, PMFS
+ * uses the truncate list for two purposes.
+ * 1) When removing a file, if the i_links_count becomes zero (i.e., the file
+ * is not referenced by any directory entry), the inode needs to be freed.
+ * However, if the file is currently in use (e.g., opened) it can't be freed
+ * until all references are closed. Hence PMFS adds the inode to the truncate
+ * list during directory entry removal, and removes it from the truncate list
+ * when VFS calls evict_inode. If a power failure happens before evict_inode,
+ * the inode is freed during the next mount when we recover the truncate list.
+ * 2) When truncating a file (reducing the file size and freeing the blocks),
+ * we don't want to return the freed blocks to the free list until the whole
+ * truncate operation is complete. So we add the inode to the truncate list with
+ * the specified truncate_size. Now we can return freed blocks to the free list
+ * even before the transaction is complete. Because if a power failure happens
+ * before freeing of all the blocks is complete, PMFS will free the remaining
+ * blocks during the next mount when we recover the truncate list */
+void pmfs_truncate_add(struct inode *inode, u64 truncate_size)
+{
+	struct super_block *sb = inode->i_sb;
+	struct pmfs_inode_truncate_item *head = pmfs_get_truncate_list_head(sb);
+	struct pmfs_inode_truncate_item *li;
+
+	mutex_lock(&PMFS_SB(sb)->s_truncate_lock);
+	if (!list_empty(&PMFS_I(inode)->i_truncated))
+		goto out_unlock;
+
+	li = pmfs_get_truncate_item(sb, inode->i_ino);
+
+	pmfs_memunlock_range(sb, li, sizeof(*li));
+	li->i_next_truncate = head->i_next_truncate;
+	li->i_truncatesize = cpu_to_le64(truncate_size);
+	pmfs_memlock_range(sb, li, sizeof(*li));
+	pmfs_flush_buffer(li, sizeof(*li), false);
+	/* make sure above is persistent before changing the head pointer */
+	PERSISTENT_MARK();
+	PERSISTENT_BARRIER();
+	/* Atomically insert this inode at the head of the truncate list. */
+	pmfs_memunlock_range(sb, head, sizeof(*head));
+	head->i_next_truncate = cpu_to_le64(inode->i_ino);
+	pmfs_memlock_range(sb, head, sizeof(*head));
+	pmfs_flush_buffer(&head->i_next_truncate,
+		sizeof(head->i_next_truncate), false);
+	/* No need to make the head persistent here if we are called from
+	 * within a transaction, because the transaction will provide a
+	 * subsequent persistent barrier */
+	if (pmfs_current_transaction() == NULL) {
+		PERSISTENT_MARK();
+		PERSISTENT_BARRIER();
+	}
+	list_add(&PMFS_I(inode)->i_truncated, &PMFS_SB(sb)->s_truncate);
+
+out_unlock:
+	mutex_unlock(&PMFS_SB(sb)->s_truncate_lock);
+}
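+
+/* Ordering sketch for the list insert above: (1) fill in the new item's
+ * i_next_truncate/i_truncatesize and flush it, (2) persistent barrier,
+ * (3) point head->i_next_truncate at the new inode and flush. If power
+ * fails between (2) and (3), the item is simply unreachable and harmless;
+ * the head can never reference a half-written item.
+ */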
+
+void pmfs_setsize(struct inode *inode, loff_t newsize)
+{
+	loff_t oldsize = inode->i_size;
+
+	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+	      S_ISLNK(inode->i_mode))) {
+		pmfs_err(inode->i_sb, "%s:wrong file mode %x\n", inode->i_mode);
+		return;
+	}
+
+	if (newsize != oldsize) {
+		pmfs_block_truncate_page(inode, newsize);
+		i_size_write(inode, newsize);
+	}
+	/* FIXME: we should make sure that there is nobody reading the inode
+	 * before truncating it. Also we need to munmap the truncated range
+	 * from application address space, if mmapped. */
+	/* synchronize_rcu(); */
+	__pmfs_truncate_blocks(inode, newsize, oldsize);
+	/* No need to make the b-tree persistent here if we are called from
+	 * within a transaction, because the transaction will provide a
+	 * subsequent persistent barrier */
+	if (pmfs_current_transaction() == NULL) {
+		PERSISTENT_MARK();
+		PERSISTENT_BARRIER();
+	}
+	return;
+}
+
+int pmfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		         struct kstat *stat)
+{
+	struct inode *inode;
+
+	inode = dentry->d_inode;
+	generic_fillattr(inode, stat);
+	/* stat->blocks should be the number of 512B blocks */
+	stat->blocks = (inode->i_blocks << inode->i_sb->s_blocksize_bits) >> 9;
+	return 0;
+}
+
+/* update a single inode field atomically without using a transaction */
+static int pmfs_update_single_field(struct super_block *sb, struct inode *inode,
+	struct pmfs_inode *pi, unsigned int ia_valid)
+{
+	pmfs_memunlock_inode(sb, pi);
+	switch (ia_valid) {
+		case ATTR_MODE:
+			pi->i_mode = cpu_to_le16(inode->i_mode);
+			break;
+		case ATTR_UID:
+			pi->i_uid = cpu_to_le32(inode->i_uid);
+			break;
+		case ATTR_GID:
+			pi->i_gid = cpu_to_le32(inode->i_gid);
+			break;
+		case ATTR_SIZE:
+			pi->i_size = cpu_to_le64(inode->i_size);
+			break;
+		case ATTR_ATIME:
+			pi->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
+			break;
+		case ATTR_CTIME:
+			pi->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
+			break;
+		case ATTR_MTIME:
+			pi->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
+			break;
+	}
+	pmfs_memlock_inode(sb, pi);
+	pmfs_flush_buffer(pi, sizeof(*pi), true);
+	return 0;
+}
+
+int pmfs_notify_change(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = dentry->d_inode;
+	struct super_block *sb = inode->i_sb;
+	struct pmfs_inode *pi = pmfs_get_inode(sb, inode->i_ino);
+	pmfs_transaction_t *trans;
+	int ret;
+	unsigned int ia_valid = attr->ia_valid, attr_mask;
+
+	if (!pi)
+		return -EACCES;
+
+	ret = inode_change_ok(inode, attr);
+	if (ret)
+		return ret;
+
+	if ((ia_valid & ATTR_SIZE) && (attr->ia_size != inode->i_size ||
+			pi->i_flags & cpu_to_le32(PMFS_EOFBLOCKS_FL))) {
+
+		pmfs_truncate_add(inode, attr->ia_size);
+		/* set allocation hint */
+		pmfs_set_blocksize_hint(sb, pi, attr->ia_size);
+
+		/* now we can freely truncate the inode */
+		pmfs_setsize(inode, attr->ia_size);
+		pmfs_update_isize(inode, pi);
+		pmfs_flush_buffer(pi, CACHELINE_SIZE, false);
+		/* we have also updated the i_ctime and i_mtime, so no
+		 * need to update them again */
+		ia_valid = ia_valid & ~(ATTR_CTIME | ATTR_MTIME);
+		/* now it is safe to remove the inode from the truncate list */
+		pmfs_truncate_del(inode);
+	}
+	setattr_copy(inode, attr);
+
+	/* we have already handled ATTR_SIZE above so no need to check for it */
+	attr_mask = ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_ATIME | ATTR_MTIME |
+		ATTR_CTIME;
+
+	ia_valid = ia_valid & attr_mask;
+
+	if (ia_valid == 0)
+		return ret;
+	/* If only one attribute bit is set (ia_valid is a power of two), the
+	 * single field can be updated in place, avoiding a transaction. */
+	if ((ia_valid & (ia_valid - 1)) == 0) {
+		pmfs_update_single_field(sb, inode, pi, ia_valid);
+		return ret;
+	}
+
+	BUG_ON(pmfs_current_transaction());
+	/* multiple fields are modified. Use a transaction for atomicity */
+	trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+	pmfs_add_logentry(sb, trans, pi, sizeof(*pi), LE_DATA);
+
+	pmfs_update_inode(inode, pi);
+
+	pmfs_commit_transaction(sb, trans);
+
+	return ret;
+}
+
+void pmfs_set_inode_flags(struct inode *inode, struct pmfs_inode *pi)
+{
+	unsigned int flags = le32_to_cpu(pi->i_flags);
+
+	inode->i_flags &=
+		~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC);
+	if (flags & FS_SYNC_FL)
+		inode->i_flags |= S_SYNC;
+	if (flags & FS_APPEND_FL)
+		inode->i_flags |= S_APPEND;
+	if (flags & FS_IMMUTABLE_FL)
+		inode->i_flags |= S_IMMUTABLE;
+	if (flags & FS_NOATIME_FL)
+		inode->i_flags |= S_NOATIME;
+	if (flags & FS_DIRSYNC_FL)
+		inode->i_flags |= S_DIRSYNC;
+	if (!pi->i_xattr)
+		inode_has_no_xattr(inode);
+}
+
+void pmfs_get_inode_flags(struct inode *inode, struct pmfs_inode *pi)
+{
+	unsigned int flags = inode->i_flags;
+	unsigned int pmfs_flags = le32_to_cpu(pi->i_flags);
+
+	pmfs_flags &= ~(FS_SYNC_FL | FS_APPEND_FL | FS_IMMUTABLE_FL |
+			 FS_NOATIME_FL | FS_DIRSYNC_FL);
+	if (flags & S_SYNC)
+		pmfs_flags |= FS_SYNC_FL;
+	if (flags & S_APPEND)
+		pmfs_flags |= FS_APPEND_FL;
+	if (flags & S_IMMUTABLE)
+		pmfs_flags |= FS_IMMUTABLE_FL;
+	if (flags & S_NOATIME)
+		pmfs_flags |= FS_NOATIME_FL;
+	if (flags & S_DIRSYNC)
+		pmfs_flags |= FS_DIRSYNC_FL;
+
+	pi->i_flags = cpu_to_le32(pmfs_flags);
+}
+
+const struct address_space_operations pmfs_aops_xip = {
+	.get_xip_mem		= pmfs_get_xip_mem,
+	/*.xip_mem_protect	= pmfs_xip_mem_protect,*/
+};
diff --git a/fs/pmfs/ioctl.c b/fs/pmfs/ioctl.c
new file mode 100644
index 0000000..c9623ed
--- /dev/null
+++ b/fs/pmfs/ioctl.c
@@ -0,0 +1,150 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Ioctl operations.
+ *
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2010-2011 Marco Stornelli <marco.stornelli at gmail.com>
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include <linux/capability.h>
+#include <linux/time.h>
+#include <linux/sched.h>
+#include <linux/compat.h>
+#include <linux/mount.h>
+#include "pmfs.h"
+
+long pmfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct pmfs_inode *pi;
+	struct super_block *sb = inode->i_sb;
+	unsigned int flags;
+	int ret;
+	pmfs_transaction_t *trans;
+
+	pi = pmfs_get_inode(sb, inode->i_ino);
+	if (!pi)
+		return -EACCES;
+
+	switch (cmd) {
+	case FS_IOC_GETFLAGS:
+		flags = le32_to_cpu(pi->i_flags) & PMFS_FL_USER_VISIBLE;
+		return put_user(flags, (int __user *)arg);
+	case FS_IOC_SETFLAGS: {
+		unsigned int oldflags;
+
+		ret = mnt_want_write_file(filp);
+		if (ret)
+			return ret;
+
+		if (!inode_owner_or_capable(inode)) {
+			ret = -EPERM;
+			goto flags_out;
+		}
+
+		if (get_user(flags, (int __user *)arg)) {
+			ret = -EFAULT;
+			goto flags_out;
+		}
+
+		mutex_lock(&inode->i_mutex);
+		oldflags = le32_to_cpu(pi->i_flags);
+
+		if ((flags ^ oldflags) &
+		    (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
+			if (!capable(CAP_LINUX_IMMUTABLE)) {
+				mutex_unlock(&inode->i_mutex);
+				ret = -EPERM;
+				goto flags_out;
+			}
+		}
+
+		if (!S_ISDIR(inode->i_mode))
+			flags &= ~FS_DIRSYNC_FL;
+
+		flags = flags & FS_FL_USER_MODIFIABLE;
+		flags |= oldflags & ~FS_FL_USER_MODIFIABLE;
+		inode->i_ctime = CURRENT_TIME_SEC;
+		/*TODO: This transaction can be avoided if we had RTM */
+		trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			goto out;
+		}
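+		/* as in pmfs_link(), log only the first 48 bytes of the
+		 * inode; i_flags and i_ctime are assumed to lie within them */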
+		pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA);
+		pmfs_memunlock_inode(sb, pi);
+		pi->i_flags = cpu_to_le32(flags);
+		pi->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
+		pmfs_set_inode_flags(inode, pi);
+		pmfs_memlock_inode(sb, pi);
+		pmfs_commit_transaction(sb, trans);
+out:
+		mutex_unlock(&inode->i_mutex);
+flags_out:
+		mnt_drop_write_file(filp);
+		return ret;
+	}
+	case FS_IOC_GETVERSION:
+		return put_user(inode->i_generation, (int __user *)arg);
+	case FS_IOC_SETVERSION: {
+		__u32 generation;
+		if (!inode_owner_or_capable(inode))
+			return -EPERM;
+		ret = mnt_want_write_file(filp);
+		if (ret)
+			return ret;
+		if (get_user(generation, (int __user *)arg)) {
+			ret = -EFAULT;
+			goto setversion_out;
+		}
+		mutex_lock(&inode->i_mutex);
+		trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			goto out;
+		}
+		pmfs_add_logentry(sb, trans, pi, sizeof(*pi), LE_DATA);
+		inode->i_ctime = CURRENT_TIME_SEC;
+		inode->i_generation = generation;
+		pmfs_memunlock_inode(sb, pi);
+		pi->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
+		pi->i_generation = cpu_to_le32(inode->i_generation);
+		pmfs_memlock_inode(sb, pi);
+		pmfs_commit_transaction(sb, trans);
+		mutex_unlock(&inode->i_mutex);
+setversion_out:
+		mnt_drop_write_file(filp);
+		return ret;
+	}
+	default:
+		return -ENOTTY;
+	}
+}
+
+#ifdef CONFIG_COMPAT
+long pmfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	switch (cmd) {
+	case FS_IOC32_GETFLAGS:
+		cmd = FS_IOC_GETFLAGS;
+		break;
+	case FS_IOC32_SETFLAGS:
+		cmd = FS_IOC_SETFLAGS;
+		break;
+	case FS_IOC32_GETVERSION:
+		cmd = FS_IOC_GETVERSION;
+		break;
+	case FS_IOC32_SETVERSION:
+		cmd = FS_IOC_SETVERSION;
+		break;
+	default:
+		return -ENOIOCTLCMD;
+	}
+	return pmfs_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
+}
+#endif
diff --git a/fs/pmfs/journal.c b/fs/pmfs/journal.c
new file mode 100644
index 0000000..bb710a2
--- /dev/null
+++ b/fs/pmfs/journal.c
@@ -0,0 +1,866 @@
+/*
+ * PMFS journaling facility. This file contains code to log changes to pmfs
+ * meta-data to facilitate consistent meta-data updates against arbitrary
+ * power and system failures.
+ *
+ * Persistent Memory File System
+ * Copyright (c) 2012-2013, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/vfs.h>
+#include <linux/uaccess.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/kthread.h>
+#include "pmfs.h"
+#include "journal.h"
+
+static void dump_transaction(struct pmfs_sb_info *sbi,
+		pmfs_transaction_t *trans)
+{
+	int i;
+	pmfs_logentry_t *le = trans->start_addr;
+
+	for (i = 0; i < trans->num_entries; i++) {
+		pmfs_dbg_trans("ao %llx tid %x gid %x type %x sz %x\n",
+			le->addr_offset, le->transaction_id, le->gen_id,
+			le->type, le->size);
+		le++;
+	}
+}
+
+static inline uint32_t next_log_entry(uint32_t jsize, uint32_t le_off)
+{
+	le_off = le_off + LOGENTRY_SIZE;
+	if (le_off >= jsize)
+		le_off = 0;
+	return le_off;
+}
+
+static inline uint32_t prev_log_entry(uint32_t jsize, uint32_t le_off)
+{
+	if (le_off == 0)
+		le_off = jsize;
+	le_off = le_off - LOGENTRY_SIZE;
+	return le_off;
+}
+
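+/* gen_id 0 marks an invalid (cleared) log entry, so the generation counter
+ * wraps from MAX_GEN_ID back to 1 and never to 0 */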
+static inline uint16_t next_gen_id(uint16_t gen_id)
+{
+	gen_id++;
+	/* check for wraparound */
+	if (gen_id == 0)
+		gen_id++;
+	return gen_id;
+}
+
+static inline uint16_t prev_gen_id(uint16_t gen_id)
+{
+	gen_id--;
+	/* check for wraparound */
+	if (gen_id == 0)
+		gen_id--;
+	return gen_id;
+}
+
+/* Undo a valid log entry */
+static inline void pmfs_undo_logentry(struct super_block *sb,
+	pmfs_logentry_t *le)
+{
+	char *data;
+
+	if (le->size > 0) {
+		data = pmfs_get_block(sb, le64_to_cpu(le->addr_offset));
+		/* Undo changes by flushing the log entry to pmfs */
+		pmfs_memunlock_range(sb, data, le->size);
+		memcpy(data, le->data, le->size);
+		pmfs_memlock_range(sb, data, le->size);
+		pmfs_flush_buffer(data, le->size, false);
+	}
+}
+
+/* Called during journal recovery or transaction abort. Log entries must be
+ * undone in the reverse of the order in which they were written. */
+static void pmfs_undo_transaction(struct super_block *sb,
+		pmfs_transaction_t *trans)
+{
+	pmfs_logentry_t *le;
+	int i;
+	uint16_t gen_id = trans->gen_id;
+
+	le = trans->start_addr + trans->num_used;
+	le--;
+	for (i = trans->num_used - 1; i >= 0; i--, le--) {
+		if (gen_id == le16_to_cpu(le->gen_id))
+			pmfs_undo_logentry(sb, le);
+	}
+}
+
+/* called either during log cleaning or during journal recovery */
+static void pmfs_flush_transaction(struct super_block *sb,
+		pmfs_transaction_t *trans)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	pmfs_logentry_t *le = trans->start_addr;
+	int i;
+	char *data;
+
+	for (i = 0; i < trans->num_used; i++, le++) {
+		if (le->size) {
+			data = pmfs_get_block(sb, le64_to_cpu(le->addr_offset));
+			if (sbi->redo_log) {
+				pmfs_memunlock_range(sb, data, le->size);
+				memcpy(data, le->data, le->size);
+				pmfs_memlock_range(sb, data, le->size);
+			} else
+				pmfs_flush_buffer(data, le->size, false);
+		}
+	}
+}
+
+static inline void invalidate_gen_id(pmfs_logentry_t *le)
+{
+	le->gen_id = 0;
+	pmfs_flush_buffer(le, LOGENTRY_SIZE, false);
+}
+
+/* called either during log cleaning or during journal recovery */
+static void pmfs_invalidate_logentries(struct super_block *sb,
+		pmfs_transaction_t *trans)
+{
+	pmfs_logentry_t *le = trans->start_addr;
+	int i;
+
+	pmfs_memunlock_range(sb, trans->start_addr,
+			trans->num_entries * LOGENTRY_SIZE);
+	for (i = 0; i < trans->num_entries; i++) {
+		invalidate_gen_id(le);
+		if (le->type == LE_START) {
+			PERSISTENT_MARK();
+			PERSISTENT_BARRIER();
+		}
+		le++;
+	}
+	pmfs_memlock_range(sb, trans->start_addr,
+			trans->num_entries * LOGENTRY_SIZE);
+}
+
+/* called either during log cleaning or during journal recovery */
+static void pmfs_redo_transaction(struct super_block *sb,
+		pmfs_transaction_t *trans, bool recover)
+{
+	pmfs_logentry_t *le = trans->start_addr;
+	int i;
+	uint16_t gen_id = trans->gen_id;
+	char *data;
+
+	for (i = 0; i < trans->num_entries; i++) {
+		if (gen_id == le16_to_cpu(le->gen_id) && le->size > 0) {
+			data = pmfs_get_block(sb, le64_to_cpu(le->addr_offset));
+			/* flush data if we are called during recovery */
+			if (recover) {
+				pmfs_memunlock_range(sb, data, le->size);
+				memcpy(data, le->data, le->size);
+				pmfs_memlock_range(sb, data, le->size);
+			}
+			pmfs_flush_buffer(data, le->size, false);
+		}
+		le++;
+	}
+}
+
+/* recover the transaction ending at a valid log entry *le */
+/* called for Undo log and traverses the journal backward */
+static uint32_t pmfs_recover_transaction(struct super_block *sb, uint32_t head,
+		uint32_t tail, pmfs_logentry_t *le)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	pmfs_transaction_t trans;
+	bool cmt_or_abrt_found = false, start_found = false;
+	uint16_t gen_id = le16_to_cpu(le->gen_id);
+
+	memset(&trans, 0, sizeof(trans));
+	trans.transaction_id = le32_to_cpu(le->transaction_id);
+	trans.gen_id = gen_id;
+
+	do {
+		trans.num_entries++;
+		trans.num_used++;
+
+		if (gen_id == le16_to_cpu(le->gen_id)) {
+			/* Handle committed/aborted transactions */
+			if (le->type & LE_COMMIT || le->type & LE_ABORT)
+				cmt_or_abrt_found = true;
+			if (le->type & LE_START) {
+				trans.start_addr = le;
+				start_found = true;
+				break;
+			}
+		}
+		if (tail == 0 || tail == head)
+			break;
+		/* prev log entry */
+		le--;
+		/* Handle uncommitted transactions */
+		if ((gen_id == le16_to_cpu(le->gen_id))
+			&& (le->type & LE_COMMIT || le->type & LE_ABORT)) {
+			BUG_ON(trans.transaction_id ==
+				le32_to_cpu(le->transaction_id));
+			le++;
+			break;
+		}
+		tail = prev_log_entry(sbi->jsize, tail);
+	} while (1);
+
+	if (start_found && !cmt_or_abrt_found)
+		pmfs_undo_transaction(sb, &trans);
+
+	if (gen_id == MAX_GEN_ID) {
+		if (!start_found)
+			trans.start_addr = le;
+		/* make sure the changes made by pmfs_undo_transaction() are
+		 * persistent before invalidating the log entries */
+		if (start_found && !cmt_or_abrt_found) {
+			PERSISTENT_MARK();
+			PERSISTENT_BARRIER();
+		}
+		pmfs_invalidate_logentries(sb, &trans);
+	}
+	return tail;
+}
+
+/* process the transaction starting at a valid log entry *le */
+/* called by the log cleaner and journal recovery */
+static uint32_t pmfs_process_transaction(struct super_block *sb, uint32_t head,
+		uint32_t tail, pmfs_logentry_t *le, bool recover)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	pmfs_transaction_t trans;
+	uint16_t gen_id;
+	uint32_t new_head = head;
+
+	gen_id = le16_to_cpu(le->gen_id);
+	if (!(le->type & LE_START)) {
+		pmfs_dbg("start of trans %x but LE_START not set. gen_id %d\n",
+				le32_to_cpu(le->transaction_id), gen_id);
+		return next_log_entry(sbi->jsize, new_head);
+	}
+	memset(&trans, 0, sizeof(trans));
+	trans.transaction_id = le32_to_cpu(le->transaction_id);
+	trans.start_addr = le;
+	trans.gen_id = gen_id;
+	do {
+		trans.num_entries++;
+		trans.num_used++;
+		new_head = next_log_entry(sbi->jsize, new_head);
+
+		/* Handle committed/aborted transactions */
+		if ((gen_id == le16_to_cpu(le->gen_id)) && (le->type & LE_COMMIT
+					|| le->type & LE_ABORT)) {
+			head = new_head;
+			if ((le->type & LE_COMMIT) && sbi->redo_log)
+				pmfs_redo_transaction(sb, &trans, recover);
+
+			if (gen_id == MAX_GEN_ID) {
+				if ((le->type & LE_COMMIT) && sbi->redo_log) {
+					PERSISTENT_MARK();
+					PERSISTENT_BARRIER();
+				}
+				pmfs_invalidate_logentries(sb, &trans);
+			}
+			break;
+		}
+		/* next log entry */
+		le++;
+		/* Handle uncommitted transactions */
+		if ((new_head == tail) || ((gen_id == le16_to_cpu(le->gen_id))
+			    && (le->type & LE_START))) {
+			/* found a new valid transaction w/o finding a commit */
+			if (recover) {
+				/* if this function is called by recovery, move
+				 * ahead even if we didn't find a commit record
+				 * for this transaction */
+				head = new_head;
+				if (gen_id == MAX_GEN_ID)
+					pmfs_invalidate_logentries(sb, &trans);
+			}
+			pmfs_dbg_trans("no cmt tid %d sa %p nle %d tail %x"
+				" gen %d\n", trans.transaction_id,
+				trans.start_addr, trans.num_entries,
+				trans.num_used, trans.gen_id);
+			/* dump_transaction(sbi, &trans); */
+			break;
+		}
+	} while (new_head != tail);
+
+	return head;
+}
+
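+/* The log cleaner advances the journal head towards the tail, flushing the
+ * in-place data of committed transactions and, when the generation id is
+ * about to wrap, invalidating their log entries. */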
+static void pmfs_clean_journal(struct super_block *sb, bool unmount)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	pmfs_journal_t *journal = pmfs_get_journal(sb);
+	uint32_t head = le32_to_cpu(journal->head);
+	uint32_t new_head, tail;
+	uint16_t gen_id;
+	volatile u64 *ptr_tail_genid = (volatile u64 *)&journal->tail;
+	u64 tail_genid;
+	pmfs_logentry_t *le;
+
+	/* Read the journal's tail and gen_id together, atomically. The use of
+	 * volatile is normally discouraged in kernel code, but tail and
+	 * gen_id are written atomically through a volatile pointer, so we
+	 * read them through one as well to get a consistent snapshot without
+	 * taking a lock. */
+	tail_genid = *ptr_tail_genid;
+	tail = le32_to_cpu(tail_genid & 0xFFFFFFFF);
+	gen_id = le16_to_cpu((tail_genid >> 32) & 0xFFFF);
+
+	/* journal wraparound happened, so head points to the previous gen_id */
+	if (tail < head)
+		gen_id = prev_gen_id(gen_id);
+	pmfs_dbg_trans("starting journal cleaning %x %x\n", head, tail);
+	while (head != tail) {
+		le = (pmfs_logentry_t *)(sbi->journal_base_addr + head);
+		if (gen_id == le16_to_cpu(le->gen_id)) {
+			/* found a valid log entry, process the transaction */
+			new_head = pmfs_process_transaction(sb, head, tail,
+				le, false);
+			/* no progress was made. return */
+			if (new_head == head)
+				break;
+			head = new_head;
+		} else {
+			if (gen_id == MAX_GEN_ID) {
+				pmfs_memunlock_range(sb, le, sizeof(*le));
+				invalidate_gen_id(le);
+				pmfs_memlock_range(sb, le, sizeof(*le));
+			}
+			head = next_log_entry(sbi->jsize, head);
+		}
+		/* handle journal wraparound */
+		if (head == 0)
+			gen_id = next_gen_id(gen_id);
+	}
+	PERSISTENT_MARK();
+	PERSISTENT_BARRIER();
+	pmfs_memunlock_range(sb, journal, sizeof(*journal));
+	journal->head = cpu_to_le32(head);
+	pmfs_memlock_range(sb, journal, sizeof(*journal));
+	pmfs_flush_buffer(&journal->head, sizeof(journal->head), true);
+	if (unmount) {
+		PERSISTENT_MARK();
+		if (journal->head != journal->tail)
+			pmfs_dbg("PMFS: umount but journal not empty %x:%x\n",
+			le32_to_cpu(journal->head), le32_to_cpu(journal->tail));
+		PERSISTENT_BARRIER();
+	}
+	pmfs_dbg_trans("leaving journal cleaning %x %x\n", head, tail);
+}
+
+static void log_cleaner_try_sleeping(struct pmfs_sb_info *sbi)
+{
+	DEFINE_WAIT(wait);
+	prepare_to_wait(&sbi->log_cleaner_wait, &wait, TASK_INTERRUPTIBLE);
+	schedule();
+	finish_wait(&sbi->log_cleaner_wait, &wait);
+}
+
+static int pmfs_log_cleaner(void *arg)
+{
+	struct super_block *sb = (struct super_block *)arg;
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+
+	pmfs_dbg_trans("Running log cleaner thread\n");
+	for ( ; ; ) {
+		log_cleaner_try_sleeping(sbi);
+
+		if (kthread_should_stop())
+			break;
+
+		pmfs_clean_journal(sb, false);
+	}
+	pmfs_clean_journal(sb, true);
+	pmfs_dbg_trans("Exiting log cleaner thread\n");
+	return 0;
+}
+
+static int pmfs_journal_cleaner_run(struct super_block *sb)
+{
+	int ret = 0;
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+
+	init_waitqueue_head(&sbi->log_cleaner_wait);
+
+	sbi->log_cleaner_thread = kthread_run(pmfs_log_cleaner, sb,
+			"pmfs_log_cleaner_0x%llx", sbi->phys_addr);
+	if (IS_ERR(sbi->log_cleaner_thread)) {
+		/* failure at boot is fatal */
+		pmfs_err(sb, "Failed to start pmfs log cleaner thread\n");
+		ret = PTR_ERR(sbi->log_cleaner_thread);
+		sbi->log_cleaner_thread = NULL;
+	}
+	return ret;
+}
+
+int pmfs_journal_soft_init(struct super_block *sb)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	pmfs_journal_t *journal = pmfs_get_journal(sb);
+
+	sbi->next_transaction_id = 0;
+	sbi->journal_base_addr =
+		pmfs_get_block(sb, le64_to_cpu(journal->base));
+	sbi->jsize = le32_to_cpu(journal->size);
+	mutex_init(&sbi->journal_mutex);
+	sbi->redo_log = !!le16_to_cpu(journal->redo_logging);
+
+	return pmfs_journal_cleaner_run(sb);
+}
+
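+/* Hard init formats the journal: record the journal base and size, reset
+ * head and tail, start gen_id at 1, and zero the log area with non-temporal
+ * stores before performing a regular soft init. */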
+int pmfs_journal_hard_init(struct super_block *sb, uint64_t base,
+	uint32_t size)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	pmfs_journal_t *journal = pmfs_get_journal(sb);
+
+	pmfs_memunlock_range(sb, journal, sizeof(*journal));
+	journal->base = cpu_to_le64(base);
+	journal->size = cpu_to_le32(size);
+	journal->gen_id = cpu_to_le16(1);
+	journal->head = journal->tail = 0;
+	/* let's use undo logging for now */
+	journal->redo_logging = 0;
+	pmfs_memlock_range(sb, journal, sizeof(*journal));
+
+	sbi->journal_base_addr = pmfs_get_block(sb, base);
+	pmfs_memunlock_range(sb, sbi->journal_base_addr, size);
+	memset_nt(sbi->journal_base_addr, 0, size);
+	pmfs_memlock_range(sb, sbi->journal_base_addr, size);
+
+	return pmfs_journal_soft_init(sb);
+}
+
+static void wakeup_log_cleaner(struct pmfs_sb_info *sbi)
+{
+	if (!waitqueue_active(&sbi->log_cleaner_wait))
+		return;
+	pmfs_dbg_trans("waking up the cleaner thread\n");
+	wake_up_interruptible(&sbi->log_cleaner_wait);
+}
+
+int pmfs_journal_uninit(struct super_block *sb)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+
+	if (sbi->log_cleaner_thread)
+		kthread_stop(sbi->log_cleaner_thread);
+	return 0;
+}
+
+inline pmfs_transaction_t *pmfs_current_transaction(void)
+{
+	return (pmfs_transaction_t *)current->journal_info;
+}
+
+static int pmfs_free_logentries(int max_log_entries)
+{
+	pmfs_dbg("pmfs_free_logentries: Not Implemented\n");
+	return -ENOMEM;
+}
+
+pmfs_transaction_t *pmfs_new_transaction(struct super_block *sb,
+		int max_log_entries)
+{
+	pmfs_journal_t *journal = pmfs_get_journal(sb);
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	pmfs_transaction_t *trans;
+	uint32_t head, tail, req_size, avail_size;
+	uint64_t base;
+#if 0
+	trans = pmfs_current_transaction();
+
+	if (trans) {
+		BUG_ON(trans->t_journal != journal);
+		return trans;
+	}
+#endif
+	/* If it is an undo log, need one more log-entry for commit record */
+	if (!sbi->redo_log)
+		max_log_entries++;
+
+	trans = pmfs_alloc_transaction();
+	if (!trans)
+		return ERR_PTR(-ENOMEM);
+	memset(trans, 0, sizeof(*trans));
+
+	trans->num_used = 0;
+	trans->num_entries = max_log_entries;
+	trans->t_journal = journal;
+	req_size = max_log_entries << LESIZE_SHIFT;
+
+	mutex_lock(&sbi->journal_mutex);
+
+	tail = le32_to_cpu(journal->tail);
+	head = le32_to_cpu(journal->head);
+	trans->transaction_id = sbi->next_transaction_id++;
+again:
+	trans->gen_id = le16_to_cpu(journal->gen_id);
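+	/* the journal is a circular buffer; keep one log entry's worth of
+	 * space free so that the tail never catches up with the head */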
+	avail_size = (tail >= head) ?
+		(sbi->jsize - (tail - head)) : (head - tail);
+	avail_size = avail_size - LOGENTRY_SIZE;
+
+	if (avail_size < req_size) {
+		int freed_size;
+		/* run the log cleaner function to free some log entries */
+		freed_size = pmfs_free_logentries(max_log_entries);
+		if (freed_size < 0)
+			freed_size = 0;
+		if ((avail_size + freed_size) < req_size)
+			goto journal_full;
+	}
+	base = le64_to_cpu(journal->base) + tail;
+	tail = tail + req_size;
+	/* if this allocation would wrap the journal, start the transaction
+	 * at the beginning of the journal instead, so that a single
+	 * transaction never wraps around */
+	pmfs_memunlock_range(sb, journal, sizeof(*journal));
+	if (tail >= sbi->jsize) {
+		volatile u64 *ptr;
+		tail = 0;
+		/* tail occupies the low 32 bits and gen_id the next 16 bits
+		 * of the same 64-bit word, so one 8-byte store updates both
+		 * atomically with respect to power failures. Locking cannot
+		 * provide that guarantee, hence the volatile pointer. */
+		ptr = (volatile u64 *)&journal->tail;
+		/* write 8 bytes atomically: tail becomes 0, gen_id advances */
+		set_64bit(ptr, (u64)cpu_to_le16(next_gen_id(le16_to_cpu(
+				journal->gen_id))) << 32);
+		pmfs_memlock_range(sb, journal, sizeof(*journal));
+		pmfs_dbg_trans("journal wrapped. tail %x gid %d cur tid %d\n",
+			le32_to_cpu(journal->tail),
+			le16_to_cpu(journal->gen_id),
+			sbi->next_transaction_id - 1);
+		goto again;
+	} else {
+		journal->tail = cpu_to_le32(tail);
+		pmfs_memlock_range(sb, journal, sizeof(*journal));
+	}
+	mutex_unlock(&sbi->journal_mutex);
+
+	avail_size = avail_size - req_size;
+	/* wake up the log cleaner if required */
+	if ((sbi->jsize - avail_size) > (sbi->jsize >> 3))
+		wakeup_log_cleaner(sbi);
+	pmfs_flush_buffer(&journal->tail, sizeof(u64), false);
+
+	pmfs_dbg_trans("new transaction tid %d nle %d avl sz %x sa %llx\n",
+		trans->transaction_id, max_log_entries, avail_size, base);
+	trans->start_addr = pmfs_get_block(sb, base);
+
+	trans->parent = (pmfs_transaction_t *)current->journal_info;
+	current->journal_info = trans;
+	return trans;
+journal_full:
+	mutex_unlock(&sbi->journal_mutex);
+	pmfs_err(sb, "Journal full. base %llx sz %x head:tail %x:%x ncl %x\n",
+		le64_to_cpu(journal->base), le32_to_cpu(journal->size),
+		le32_to_cpu(journal->head), le32_to_cpu(journal->tail),
+		max_log_entries);
+	pmfs_free_transaction(trans);
+	return ERR_PTR(-EAGAIN);
+}
+
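+/* Write the commit record. For a redo log, the logged data is persisted
+ * first and then copied in place; for an undo log, the in-place updates are
+ * flushed first. In both cases, writing gen_id is what makes the commit
+ * entry valid. */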
+static inline void pmfs_commit_logentry(struct super_block *sb,
+		pmfs_transaction_t *trans, pmfs_logentry_t *le)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	if (sbi->redo_log) {
+		/* Redo Log */
+		PERSISTENT_MARK();
+		PERSISTENT_BARRIER();
+		/* Atomically write the commit type */
+		le->type |= LE_COMMIT;
+		barrier();
+		/* Atomically make the log entry valid */
+		le->gen_id = cpu_to_le16(trans->gen_id);
+		pmfs_flush_buffer(le, LOGENTRY_SIZE, false);
+		PERSISTENT_MARK();
+		PERSISTENT_BARRIER();
+		/* Update the FS in place */
+		pmfs_flush_transaction(sb, trans);
+	} else {
+		/* Undo Log */
+		/* Update the FS in place: currently already done. so
+		 * only need to clflush */
+		pmfs_flush_transaction(sb, trans);
+		PERSISTENT_MARK();
+		PERSISTENT_BARRIER();
+		/* Atomically write the commit type */
+		le->type |= LE_COMMIT;
+		barrier();
+		/* Atomically make the log entry valid */
+		le->gen_id = cpu_to_le16(trans->gen_id);
+		pmfs_flush_buffer(le, LOGENTRY_SIZE, true);
+	}
+	return;
+}
+
+int pmfs_add_logentry(struct super_block *sb,
+		pmfs_transaction_t *trans, void *addr, uint16_t size, u8 type)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	pmfs_logentry_t *le;
+	int num_les = 0, i;
+	uint64_t le_start = size ? pmfs_get_addr_off(sbi, addr) : 0;
+	uint8_t le_size;
+
+	if (trans == NULL)
+		return -EINVAL;
+	le = trans->start_addr + trans->num_used;
+
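+	/* a log entry carries at most sizeof(le->data) (48) bytes of data, so
+	 * larger updates are split across consecutive log entries */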
+	if (size == 0) {
+		/* At least one log entry required for commit/abort log entry */
+		if ((type & LE_COMMIT) || (type & LE_ABORT))
+			num_les = 1;
+	} else
+		num_les = (size + sizeof(le->data) - 1)/sizeof(le->data);
+
+	pmfs_dbg_trans("add le id %d size %x, num_les %d tail %x le %p\n",
+		trans->transaction_id, size, trans->num_entries,
+		trans->num_used, le);
+
+	if ((trans->num_used + num_les) > trans->num_entries) {
+		pmfs_err(sb, "Log Entry full. tid %x ne %x tail %x size %x\n",
+			trans->transaction_id, trans->num_entries,
+			trans->num_used, size);
+		dump_transaction(sbi, trans);
+		dump_stack();
+		return -ENOMEM;
+	}
+
+	pmfs_memunlock_range(sb, le, sizeof(*le) * num_les);
+	for (i = 0; i < num_les; i++) {
+		le->addr_offset = cpu_to_le64(le_start);
+		le->transaction_id = cpu_to_le32(trans->transaction_id);
+		le_size = (i == (num_les - 1)) ? size : sizeof(le->data);
+		le->size = le_size;
+		size -= le_size;
+		if (le_size)
+			memcpy(le->data, addr, le_size);
+		le->type = type;
+
+		if (i == 0 && trans->num_used == 0)
+			le->type |= LE_START;
+		trans->num_used++;
+
+		/* handle special log entry */
+		if (i == (num_les - 1) && (type & LE_COMMIT)) {
+			pmfs_commit_logentry(sb, trans, le);
+			pmfs_memlock_range(sb, le, sizeof(*le) * num_les);
+			return 0;
+		}
+		/* compile-time barrier so that the compiler doesn't reorder
+		 * the writes to the log entry */
+		barrier();
+
+		/* Atomically make the log entry valid */
+		le->gen_id = cpu_to_le16(trans->gen_id);
+		pmfs_flush_buffer(le, LOGENTRY_SIZE, false);
+
+		addr += le_size;
+		le_start += le_size;
+		le++;
+	}
+	pmfs_memlock_range(sb, le, sizeof(*le) * num_les);
+	if (!sbi->redo_log) {
+		PERSISTENT_MARK();
+		PERSISTENT_BARRIER();
+	}
+	return 0;
+}
+
+int pmfs_commit_transaction(struct super_block *sb,
+		pmfs_transaction_t *trans)
+{
+	if (trans == NULL)
+		return 0;
+	/* Add the commit log-entry */
+	pmfs_add_logentry(sb, trans, NULL, 0, LE_COMMIT);
+
+	pmfs_dbg_trans("completing transaction for id %d\n",
+		trans->transaction_id);
+
+	current->journal_info = trans->parent;
+	pmfs_free_transaction(trans);
+	return 0;
+}
+
+int pmfs_abort_transaction(struct super_block *sb, pmfs_transaction_t *trans)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+
+	if (trans == NULL)
+		return 0;
+	pmfs_dbg_trans("abort trans for tid %x sa %p numle %d tail %x gen %d\n",
+		trans->transaction_id, trans->start_addr, trans->num_entries,
+		trans->num_used, trans->gen_id);
+	dump_transaction(sbi, trans);
+	/*dump_stack();*/
+
+	if (!sbi->redo_log) {
+		/* Undo Log */
+		pmfs_undo_transaction(sb, trans);
+		PERSISTENT_MARK();
+		PERSISTENT_BARRIER();
+	}
+	/* add an abort log entry */
+	pmfs_add_logentry(sb, trans, NULL, 0, LE_ABORT);
+	current->journal_info = trans->parent;
+	pmfs_free_transaction(trans);
+	return 0;
+}
+
+static void invalidate_remaining_journal(struct super_block *sb,
+	void *journal_vaddr, uint32_t jtail, uint32_t jsize)
+{
+	pmfs_logentry_t *le = (pmfs_logentry_t *)(journal_vaddr + jtail);
+	void *start = le;
+	uint32_t length = jsize - jtail;
+
+	pmfs_memunlock_range(sb, start, length);
+	while (jtail < jsize) {
+		invalidate_gen_id(le);
+		le++;
+		jtail += LOGENTRY_SIZE;
+	}
+	/* jtail has been advanced by the loop; re-lock the same range that
+	 * was unlocked above */
+	pmfs_memlock_range(sb, start, length);
+}
+
+/* We need to increment the gen_id to invalidate all remaining journal log
+ * entries. This is needed because, after recovery, some valid log entries
+ * may still exist beyond the tail (before the power failure they became
+ * persistent before the journal tail did). gen_id and head need not be
+ * updated atomically with each other: gen_id can be written before head
+ * because both fields live in the same cacheline. */
+static void pmfs_forward_journal(struct super_block *sb, struct pmfs_sb_info
+		*sbi, pmfs_journal_t *journal)
+{
+	uint16_t gen_id = le16_to_cpu(journal->gen_id);
+	/* handle gen_id wrap around */
+	if (gen_id == MAX_GEN_ID) {
+		invalidate_remaining_journal(sb, sbi->journal_base_addr,
+			le32_to_cpu(journal->tail), sbi->jsize);
+	}
+	PERSISTENT_MARK();
+	gen_id = next_gen_id(gen_id);
+	/* make all changes persistent before advancing gen_id and head */
+	PERSISTENT_BARRIER();
+	pmfs_memunlock_range(sb, journal, sizeof(*journal));
+	journal->gen_id = cpu_to_le16(gen_id);
+	barrier();
+	journal->head = journal->tail;
+	pmfs_memlock_range(sb, journal, sizeof(*journal));
+	pmfs_flush_buffer(journal, sizeof(*journal), false);
+}
+
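+/* Undo-log recovery: walk the journal backwards from tail to head and roll
+ * back every transaction that has a start record but no commit/abort record.
+ */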
+static int pmfs_recover_undo_journal(struct super_block *sb)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	pmfs_journal_t *journal = pmfs_get_journal(sb);
+	uint32_t tail = le32_to_cpu(journal->tail);
+	uint32_t head = le32_to_cpu(journal->head);
+	uint16_t gen_id = le16_to_cpu(journal->gen_id);
+	pmfs_logentry_t *le;
+
+	while (head != tail) {
+		/* handle journal wraparound */
+		if (tail == 0)
+			gen_id = prev_gen_id(gen_id);
+		tail = prev_log_entry(sbi->jsize, tail);
+
+		le = (pmfs_logentry_t *)(sbi->journal_base_addr + tail);
+		if (gen_id == le16_to_cpu(le->gen_id)) {
+			tail = pmfs_recover_transaction(sb, head, tail, le);
+		} else {
+			if (gen_id == MAX_GEN_ID) {
+				pmfs_memunlock_range(sb, le, sizeof(*le));
+				invalidate_gen_id(le);
+				pmfs_memlock_range(sb, le, sizeof(*le));
+			}
+		}
+	}
+	pmfs_forward_journal(sb, sbi, journal);
+	PERSISTENT_MARK();
+	PERSISTENT_BARRIER();
+	return 0;
+}
+
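+/* Redo-log recovery: walk the journal forward from head to tail, re-applying
+ * committed transactions; uncommitted transactions are skipped and their log
+ * entries invalidated when the gen_id is about to wrap. */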
+static int pmfs_recover_redo_journal(struct super_block *sb)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	pmfs_journal_t *journal = pmfs_get_journal(sb);
+	uint32_t tail = le32_to_cpu(journal->tail);
+	uint32_t head = le32_to_cpu(journal->head);
+	uint16_t gen_id = le16_to_cpu(journal->gen_id);
+	pmfs_logentry_t *le;
+
+	/* journal wrapped around, so head points to the previous gen_id */
+	if (tail < head)
+		gen_id = prev_gen_id(gen_id);
+
+	while (head != tail) {
+		le = (pmfs_logentry_t *)(sbi->journal_base_addr + head);
+		if (gen_id == le16_to_cpu(le->gen_id)) {
+			head = pmfs_process_transaction(sb, head, tail,
+				le, true);
+		} else {
+			if (gen_id == MAX_GEN_ID) {
+				pmfs_memunlock_range(sb, le, sizeof(*le));
+				invalidate_gen_id(le);
+				pmfs_memlock_range(sb, le, sizeof(*le));
+			}
+			head = next_log_entry(sbi->jsize, head);
+		}
+		/* handle journal wraparound */
+		if (head == 0)
+			gen_id = next_gen_id(gen_id);
+	}
+	pmfs_forward_journal(sb, sbi, journal);
+	PERSISTENT_MARK();
+	PERSISTENT_BARRIER();
+	return 0;
+}
+
+int pmfs_recover_journal(struct super_block *sb)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	pmfs_journal_t *journal = pmfs_get_journal(sb);
+	uint32_t tail = le32_to_cpu(journal->tail);
+	uint32_t head = le32_to_cpu(journal->head);
+	uint16_t gen_id = le16_to_cpu(journal->gen_id);
+
+	/* is the journal empty? true if unmounted properly. */
+	if (head == tail)
+		return 0;
+	pmfs_dbg("PMFS: journal recovery. head:tail %x:%x gen_id %d\n",
+		head, tail, gen_id);
+	if (sbi->redo_log)
+		pmfs_recover_redo_journal(sb);
+	else
+		pmfs_recover_undo_journal(sb);
+	return 0;
+}
+
diff --git a/fs/pmfs/journal.h b/fs/pmfs/journal.h
new file mode 100644
index 0000000..6781029
--- /dev/null
+++ b/fs/pmfs/journal.h
@@ -0,0 +1,101 @@
+/*
+ * Persistent Memory File System
+ * Copyright (c) 2012-2013, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#ifndef __PMFS_JOURNAL_H__
+#define __PMFS_JOURNAL_H__
+#include <linux/slab.h>
+
+/* default pmfs journal size 4MB */
+#define PMFS_DEFAULT_JOURNAL_SIZE  (4 << 20)
+/* minimum pmfs journal size 64KB */
+#define PMFS_MINIMUM_JOURNAL_SIZE  (1 << 16)
+
+#define CACHELINE_SIZE  (64)
+#define CLINE_SHIFT		(6)
+#define CACHELINE_MASK  (~(CACHELINE_SIZE - 1))
+#define CACHELINE_ALIGN(addr) (((addr)+CACHELINE_SIZE-1) & CACHELINE_MASK)
+
+#define LOGENTRY_SIZE  CACHELINE_SIZE
+#define LESIZE_SHIFT   CLINE_SHIFT
+
+#define MAX_INODE_LENTRIES (2)
+#define MAX_SB_LENTRIES (2)
+/* 1 le for dir entry and 1 le for potentially allocating a new dir block */
+#define MAX_DIRENTRY_LENTRIES   (2)
+/* 2 le for adding or removing the inode from the truncate list; used to log
+ * potential changes to the inode table's i_next_truncate and i_sum */
+#define MAX_TRUNCATE_LENTRIES (2)
+#define MAX_DATA_PER_LENTRY  48
+/* enough log entries to cover blocksize * max btree height (3) of metadata */
+#define MAX_METABLOCK_LENTRIES \
+	((PMFS_DEF_BLOCK_SIZE_4K * 3) / MAX_DATA_PER_LENTRY)
+
+#define MAX_PTRS_PER_LENTRY (MAX_DATA_PER_LENTRY / sizeof(u64))
+
+#define TRANS_RUNNING    1
+#define TRANS_COMMITTED  2
+#define TRANS_ABORTED    3
+
+#define LE_DATA        0
+#define LE_START       1
+#define LE_COMMIT      2
+#define LE_ABORT       4
+
+#define MAX_GEN_ID  ((uint16_t)-1)
+
+/* persistent data structure to describe a single log-entry */
+/* every log entry is max CACHELINE_SIZE bytes in size */
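+/* 8 (addr) + 4 (tid) + 2 (gen_id) + 1 (type) + 1 (size) + 48 (data) = 64 */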
+typedef struct {
+	__le64   addr_offset;
+	__le32   transaction_id;
+	__le16   gen_id;
+	u8       type;  /* normal, commit, or abort */
+	u8       size;
+	char     data[48];
+} pmfs_logentry_t;
+
+/* volatile data structure to describe a transaction */
+typedef struct pmfs_transaction {
+	u32              transaction_id;
+	u16              num_entries;
+	u16              num_used;
+	u16              gen_id;
+	u16              status;
+	pmfs_journal_t  *t_journal;
+	pmfs_logentry_t *start_addr;
+	struct pmfs_transaction *parent;
+} pmfs_transaction_t;
+
+extern inline pmfs_transaction_t *pmfs_alloc_transaction(void);
+extern inline void pmfs_free_transaction(pmfs_transaction_t *trans);
+
+extern int pmfs_journal_soft_init(struct super_block *sb);
+extern int pmfs_journal_hard_init(struct super_block *sb,
+		uint64_t base, uint32_t size);
+extern int pmfs_journal_uninit(struct super_block *sb);
+extern pmfs_transaction_t *pmfs_new_transaction(struct super_block *sb,
+		int nclines);
+extern inline pmfs_transaction_t *pmfs_current_transaction(void);
+extern int pmfs_add_logentry(struct super_block *sb,
+		pmfs_transaction_t *trans, void *addr, uint16_t size, u8 type);
+extern int pmfs_commit_transaction(struct super_block *sb,
+		pmfs_transaction_t *trans);
+extern int pmfs_abort_transaction(struct super_block *sb,
+			pmfs_transaction_t *trans);
+extern int pmfs_recover_journal(struct super_block *sb);
+
+#endif    /* __PMFS_JOURNAL_H__ */
diff --git a/fs/pmfs/namei.c b/fs/pmfs/namei.c
new file mode 100644
index 0000000..490d09d
--- /dev/null
+++ b/fs/pmfs/namei.c
@@ -0,0 +1,797 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Inode operations for directories.
+ *
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.stornelli at gmail.com>
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include "pmfs.h"
+#include "xip.h"
+
+/*
+ * A couple of helper functions to make the code slightly cleaner.
+ */
+static inline void pmfs_inc_count(struct inode *inode, struct pmfs_inode *pi)
+{
+	inc_nlink(inode);
+	pmfs_update_nlink(inode, pi);
+}
+
+static inline void pmfs_dec_count(struct inode *inode, struct pmfs_inode *pi)
+{
+	if (inode->i_nlink) {
+		drop_nlink(inode);
+		pmfs_update_nlink(inode, pi);
+	}
+}
+
+static inline int pmfs_add_nondir(pmfs_transaction_t *trans,
+		struct inode *dir, struct dentry *dentry, struct inode *inode)
+{
+	struct pmfs_inode *pi;
+	int err = pmfs_add_entry(trans, dentry, inode);
+
+	if (!err) {
+		d_instantiate(dentry, inode);
+		unlock_new_inode(inode);
+		return 0;
+	}
+	pi = pmfs_get_inode(inode->i_sb, inode->i_ino);
+	pmfs_dec_count(inode, pi);
+	unlock_new_inode(inode);
+	iput(inode);
+	return err;
+}
+
+static inline struct pmfs_direntry *pmfs_next_entry(struct pmfs_direntry *p)
+{
+	return (struct pmfs_direntry *)((char *)p + le16_to_cpu(p->de_len));
+}
+
+/*
+ * Methods themselves.
+ */
+int pmfs_check_dir_entry(const char *function, struct inode *dir,
+			  struct pmfs_direntry *de, u8 *base,
+			  unsigned long offset)
+{
+	const char *error_msg = NULL;
+	const int rlen = le16_to_cpu(de->de_len);
+
+	if (unlikely(rlen < PMFS_DIR_REC_LEN(1)))
+		error_msg = "de_len is smaller than minimal";
+	else if (unlikely(rlen % 4 != 0))
+		error_msg = "de_len % 4 != 0";
+	else if (unlikely(rlen < PMFS_DIR_REC_LEN(de->name_len)))
+		error_msg = "de_len is too small for name_len";
+	else if (unlikely((((u8 *)de - base) + rlen > dir->i_sb->s_blocksize)))
+		error_msg = "directory entry across blocks";
+
+	if (unlikely(error_msg != NULL)) {
+		pmfs_dbg("bad entry in directory #%lu: %s - "
+			  "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
+			  dir->i_ino, error_msg, offset,
+			  (unsigned long)le64_to_cpu(de->ino), rlen,
+			  de->name_len);
+	}
+
+	return error_msg == NULL ? 1 : 0;
+}
+
+/*
+ * Returns 0 if not found, -1 on failure, and 1 on success
+ */
+int pmfs_search_dirblock(u8 *blk_base, struct inode *dir, struct qstr *child,
+			  unsigned long	offset,
+			  struct pmfs_direntry **res_dir,
+			  struct pmfs_direntry **prev_dir)
+{
+	struct pmfs_direntry *de;
+	struct pmfs_direntry *pde = NULL;
+	char *dlimit;
+	int de_len;
+	const char *name = child->name;
+	int namelen = child->len;
+
+	de = (struct pmfs_direntry *)blk_base;
+	dlimit = blk_base + dir->i_sb->s_blocksize;
+	while ((char *)de < dlimit) {
+		/* this code is executed quadratically often */
+		/* do minimal checking `by hand' */
+
+		if ((char *)de + namelen <= dlimit &&
+		    pmfs_match(namelen, name, de)) {
+			/* found a match - just to be sure, do a full check */
+			if (!pmfs_check_dir_entry("pmfs_inode_by_name",
+						   dir, de, blk_base, offset))
+				return -1;
+			*res_dir = de;
+			if (prev_dir)
+				*prev_dir = pde;
+			return 1;
+		}
+		/* prevent looping on a bad block */
+		de_len = le16_to_cpu(de->de_len);
+		if (de_len <= 0)
+			return -1;
+		offset += de_len;
+		pde = de;
+		de = (struct pmfs_direntry *)((char *)de + de_len);
+	}
+	return 0;
+}
+
+static ino_t pmfs_inode_by_name(struct inode *dir, struct qstr *entry,
+				 struct pmfs_direntry **res_entry)
+{
+	struct pmfs_inode *pi;
+	ino_t i_no = 0;
+	int namelen, nblocks, i;
+	u8 *blk_base;
+	const u8 *name = entry->name;
+	struct super_block *sb = dir->i_sb;
+	unsigned long block, start;
+	struct pmfs_inode_vfs *si = PMFS_I(dir);
+
+	pi = pmfs_get_inode(sb, dir->i_ino);
+
+	namelen = entry->len;
+	if (namelen > PMFS_NAME_LEN)
+		return 0;
+	if ((namelen <= 2) && (name[0] == '.') &&
+	    (name[1] == '.' || name[1] == 0)) {
+		/*
+		 * "." or ".." will only be in the first block
+		 */
+		block = start = 0;
+		nblocks = 1;
+		goto restart;
+	}
+	nblocks = dir->i_size >> dir->i_sb->s_blocksize_bits;
+	start = si->i_dir_start_lookup;
+	if (start >= nblocks)
+		start = 0;
+	block = start;
+restart:
+	do {
+		blk_base =
+			pmfs_get_block(sb, pmfs_find_data_block(dir, block));
+		if (!blk_base)
+			goto done;
+		i = pmfs_search_dirblock(blk_base, dir, entry,
+					  block << sb->s_blocksize_bits,
+					  res_entry, NULL);
+		if (i == 1) {
+			si->i_dir_start_lookup = block;
+			i_no = le64_to_cpu((*res_entry)->ino);
+			goto done;
+		} else {
+			if (i < 0)
+				goto done;
+		}
+		if (++block >= nblocks)
+			block = 0;
+	} while (block != start);
+	/*
+	 * If the directory has grown while we were searching, then
+	 * search the last part of the directory before giving up.
+	 */
+	block = nblocks;
+	nblocks = dir->i_size >> sb->s_blocksize_bits;
+	if (block < nblocks) {
+		start = 0;
+		goto restart;
+	}
+done:
+	return i_no;
+}
+
+static struct dentry *pmfs_lookup(struct inode *dir, struct dentry *dentry,
+				   unsigned int flags)
+{
+	struct inode *inode = NULL;
+	struct pmfs_direntry *de;
+	ino_t ino;
+
+	if (dentry->d_name.len > PMFS_NAME_LEN)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	ino = pmfs_inode_by_name(dir, &dentry->d_name, &de);
+	if (ino) {
+		inode = pmfs_iget(dir->i_sb, ino);
+		if (inode == ERR_PTR(-ESTALE)) {
+			pmfs_err(dir->i_sb, __func__,
+				  "deleted inode referenced: %lu",
+				  (unsigned long)ino);
+			return ERR_PTR(-EIO);
+		}
+	}
+
+	return d_splice_alias(inode, dentry);
+}
+
+/*
+ * By the time this is called, we already have created
+ * the directory cache entry for the new file, but it
+ * is so far negative - it has no inode.
+ *
+ * If the create succeeds, we fill in the inode information
+ * with d_instantiate().
+ */
+static int pmfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+			bool excl)
+{
+	struct inode *inode = NULL;
+	int err = PTR_ERR(inode);
+	struct super_block *sb = dir->i_sb;
+	pmfs_transaction_t *trans;
+
+	/* 2 log entries for the new inode, 1 for the dir inode, 1 for the dir
+	 * inode's b-tree, and 2 for logging the dir entry
+	 */
+	trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES * 2 +
+		MAX_DIRENTRY_LENTRIES);
+	if (IS_ERR(trans)) {
+		err = PTR_ERR(trans);
+		goto out;
+	}
+
+	inode = pmfs_new_inode(trans, dir, mode, &dentry->d_name);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out_err;
+	}
+	inode->i_op = &pmfs_file_inode_operations;
+	inode->i_mapping->a_ops = &pmfs_aops_xip;
+	inode->i_fop = &pmfs_xip_file_operations;
+	err = pmfs_add_nondir(trans, dir, dentry, inode);
+	if (err)
+		goto out_err;
+	pmfs_commit_transaction(sb, trans);
+out:
+	return err;
+out_err:
+	pmfs_abort_transaction(sb, trans);
+	return err;
+}
+
+static int pmfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
+		       dev_t rdev)
+{
+	struct inode *inode = NULL;
+	int err = PTR_ERR(inode);
+	pmfs_transaction_t *trans;
+	struct super_block *sb = dir->i_sb;
+	struct pmfs_inode *pi;
+
+	/* 2 log entries for new inode, 1 lentry for dir inode, 1 for dir
+	 * inode's b-tree, 2 lentries for logging dir entry
+	 */
+	trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES * 2 +
+			MAX_DIRENTRY_LENTRIES);
+	if (IS_ERR(trans)) {
+		err = PTR_ERR(trans);
+		goto out;
+	}
+
+	inode = pmfs_new_inode(trans, dir, mode, &dentry->d_name);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out_err;
+	}
+	init_special_inode(inode, mode, rdev);
+	inode->i_op = &pmfs_special_inode_operations;
+
+	pi = pmfs_get_inode(sb, inode->i_ino);
+	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
+		pi->dev.rdev = cpu_to_le32(inode->i_rdev);
+	err = pmfs_add_nondir(trans, dir, dentry, inode);
+	if (err)
+		goto out_err;
+	pmfs_commit_transaction(sb, trans);
+out:
+	return err;
+out_err:
+	pmfs_abort_transaction(sb, trans);
+	return err;
+}
+
+static int pmfs_symlink(struct inode *dir, struct dentry *dentry,
+			 const char *symname)
+{
+	struct super_block *sb = dir->i_sb;
+	int err = -ENAMETOOLONG;
+	unsigned len = strlen(symname);
+	struct inode *inode;
+	pmfs_transaction_t *trans;
+	struct pmfs_inode *pi;
+
+	if (len + 1 > sb->s_blocksize)
+		goto out;
+
+	/* 2 log entries for new inode, 1 lentry for dir inode, 1 for dir
+	 * inode's b-tree, 2 lentries for logging dir entry
+	 */
+	trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES * 2 +
+			MAX_DIRENTRY_LENTRIES);
+	if (IS_ERR(trans)) {
+		err = PTR_ERR(trans);
+		goto out;
+	}
+
+	inode = pmfs_new_inode(trans, dir, S_IFLNK|S_IRWXUGO, &dentry->d_name);
+	err = PTR_ERR(inode);
+	if (IS_ERR(inode)) {
+		pmfs_abort_transaction(sb, trans);
+		goto out;
+	}
+
+	inode->i_op = &pmfs_symlink_inode_operations;
+	inode->i_mapping->a_ops = &pmfs_aops_xip;
+
+	pi = pmfs_get_inode(sb, inode->i_ino);
+	err = pmfs_block_symlink(inode, symname, len);
+	if (err)
+		goto out_fail;
+
+	inode->i_size = len;
+	pmfs_update_isize(inode, pi);
+
+	err = pmfs_add_nondir(trans, dir, dentry, inode);
+	if (err) {
+		/* free up the allocated block to the symlink inode */
+		pmfs_setsize(inode, 0);
+		pmfs_abort_transaction(sb, trans);
+		goto out;
+	}
+
+	pmfs_commit_transaction(sb, trans);
+out:
+	return err;
+
+out_fail:
+	pmfs_dec_count(inode, pi);
+	unlock_new_inode(inode);
+	iput(inode);
+	pmfs_abort_transaction(sb, trans);
+	goto out;
+}
+
+static int pmfs_link(struct dentry *dest_dentry, struct inode *dir,
+		      struct dentry *dentry)
+{
+	struct inode *inode = dest_dentry->d_inode;
+	int err = -ENOMEM;
+	pmfs_transaction_t *trans;
+	struct super_block *sb = inode->i_sb;
+	struct pmfs_inode *pi = pmfs_get_inode(sb, inode->i_ino);
+
+	if (inode->i_nlink >= PMFS_LINK_MAX)
+		return -EMLINK;
+
+	trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES * 2 +
+			MAX_DIRENTRY_LENTRIES);
+	if (IS_ERR(trans)) {
+		err = PTR_ERR(trans);
+		goto out;
+	}
+	/* only need to log the first 48 bytes since we only modify ctime and
+	 * i_links_count in this system call */
+	pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA);
+
+	ihold(inode);
+
+	err = pmfs_add_entry(trans, dentry, inode);
+	if (!err) {
+		inode->i_ctime = CURRENT_TIME_SEC;
+		inc_nlink(inode);
+
+		pmfs_memunlock_inode(sb, pi);
+		pi->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
+		pi->i_links_count = cpu_to_le16(inode->i_nlink);
+		pmfs_memlock_inode(sb, pi);
+
+		d_instantiate(dentry, inode);
+		pmfs_commit_transaction(sb, trans);
+	} else {
+		iput(inode);
+		pmfs_abort_transaction(sb, trans);
+	}
+out:
+	return err;
+}
+
+static int pmfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	int retval = -ENOMEM;
+	pmfs_transaction_t *trans;
+	struct super_block *sb = inode->i_sb;
+	struct pmfs_inode *pi = pmfs_get_inode(sb, inode->i_ino);
+
+	trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES * 2 +
+		MAX_DIRENTRY_LENTRIES);
+	if (IS_ERR(trans)) {
+		retval = PTR_ERR(trans);
+		goto out;
+	}
+	pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA);
+
+	retval = pmfs_remove_entry(trans, dentry, inode);
+	if (retval)
+		goto end_unlink;
+
+	if (inode->i_nlink == 1)
+		pmfs_truncate_add(inode, inode->i_size);
+	inode->i_ctime = dir->i_ctime;
+
+	pmfs_memunlock_inode(sb, pi);
+	if (inode->i_nlink) {
+		drop_nlink(inode);
+		pi->i_links_count = cpu_to_le16(inode->i_nlink);
+	}
+	pi->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
+	pmfs_memlock_inode(sb, pi);
+
+	pmfs_commit_transaction(sb, trans);
+	return 0;
+end_unlink:
+	pmfs_abort_transaction(sb, trans);
+out:
+	return retval;
+}
+
+static int pmfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	struct inode *inode;
+	struct pmfs_inode *pi, *pidir;
+	struct pmfs_direntry *de = NULL;
+	struct super_block *sb = dir->i_sb;
+	pmfs_transaction_t *trans;
+	int err = -EMLINK;
+	char *blk_base;
+
+	if (dir->i_nlink >= PMFS_LINK_MAX)
+		goto out;
+
+	trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES * 2 +
+			MAX_DIRENTRY_LENTRIES);
+	if (IS_ERR(trans)) {
+		err = PTR_ERR(trans);
+		goto out;
+	}
+
+	inode = pmfs_new_inode(trans, dir, S_IFDIR | mode, &dentry->d_name);
+	err = PTR_ERR(inode);
+	if (IS_ERR(inode)) {
+		pmfs_abort_transaction(sb, trans);
+		goto out;
+	}
+
+	inode->i_op = &pmfs_dir_inode_operations;
+	inode->i_fop = &pmfs_dir_operations;
+	inode->i_mapping->a_ops = &pmfs_aops_xip;
+
+	/* since this is a new inode, we don't need to include this
+	 * pmfs_alloc_blocks() call in the transaction
+	 */
+	err = pmfs_alloc_blocks(NULL, inode, 0, 1, false);
+	if (err)
+		goto out_clear_inode;
+	inode->i_size = sb->s_blocksize;
+
+	blk_base = pmfs_get_block(sb, pmfs_find_data_block(inode, 0));
+	de = (struct pmfs_direntry *)blk_base;
+	pmfs_memunlock_range(sb, blk_base, sb->s_blocksize);
+	de->ino = cpu_to_le64(inode->i_ino);
+	de->name_len = 1;
+	de->de_len = cpu_to_le16(PMFS_DIR_REC_LEN(de->name_len));
+	strcpy(de->name, ".");
+	/*de->file_type = S_IFDIR; */
+	de = pmfs_next_entry(de);
+	de->ino = cpu_to_le64(dir->i_ino);
+	de->de_len = cpu_to_le16(sb->s_blocksize - PMFS_DIR_REC_LEN(1));
+	de->name_len = 2;
+	strcpy(de->name, "..");
+	/*de->file_type =  S_IFDIR; */
+	pmfs_memlock_range(sb, blk_base, sb->s_blocksize);
+
+	/* No need to journal the dir entries but we need to persist them */
+	pmfs_flush_buffer(blk_base, PMFS_DIR_REC_LEN(1) +
+			PMFS_DIR_REC_LEN(2), true);
+
+	set_nlink(inode, 2);
+
+	err = pmfs_add_entry(trans, dentry, inode);
+	if (err) {
+		pmfs_dbg_verbose("failed to add dir entry\n");
+		goto out_clear_inode;
+	}
+	pi = pmfs_get_inode(sb, inode->i_ino);
+	pmfs_memunlock_inode(sb, pi);
+	pi->i_links_count = cpu_to_le16(inode->i_nlink);
+	pi->i_size = cpu_to_le64(inode->i_size);
+	pmfs_memlock_inode(sb, pi);
+
+	pidir = pmfs_get_inode(sb, dir->i_ino);
+	pmfs_inc_count(dir, pidir);
+	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
+
+	pmfs_commit_transaction(sb, trans);
+
+out:
+	return err;
+
+out_clear_inode:
+	clear_nlink(inode);
+	unlock_new_inode(inode);
+	iput(inode);
+	pmfs_abort_transaction(sb, trans);
+	goto out;
+}
+
+/*
+ * routine to check that the specified directory is empty (for rmdir)
+ */
+static int pmfs_empty_dir(struct inode *inode)
+{
+	unsigned long offset;
+	struct pmfs_direntry *de, *de1;
+	struct super_block *sb;
+	char *blk_base;
+	int err = 0;
+
+	sb = inode->i_sb;
+	if (inode->i_size < PMFS_DIR_REC_LEN(1) + PMFS_DIR_REC_LEN(2)) {
+		pmfs_dbg("bad directory (dir #%lu)-no data block",
+			  inode->i_ino);
+		return 1;
+	}
+
+	blk_base = pmfs_get_block(sb, pmfs_find_data_block(inode, 0));
+	if (!blk_base) {
+		pmfs_dbg("bad directory (dir #%lu)-no data block",
+			  inode->i_ino);
+		return 1;
+	}
+
+	de = (struct pmfs_direntry *)blk_base;
+	de1 = pmfs_next_entry(de);
+
+	if (le64_to_cpu(de->ino) != inode->i_ino || !le64_to_cpu(de1->ino) ||
+	    strcmp(".", de->name) || strcmp("..", de1->name)) {
+		pmfs_dbg("bad directory (dir #%lu) - no `.' or `..'",
+			  inode->i_ino);
+		return 1;
+	}
+	offset = le16_to_cpu(de->de_len) + le16_to_cpu(de1->de_len);
+	de = pmfs_next_entry(de1);
+	while (offset < inode->i_size) {
+		if (!blk_base || (void *)de >= (void *)(blk_base +
+					sb->s_blocksize)) {
+			err = 0;
+			blk_base = pmfs_get_block(sb, pmfs_find_data_block(
+				    inode, offset >> sb->s_blocksize_bits));
+			if (!blk_base) {
+				pmfs_dbg("Error: reading dir #%lu offset %lu\n",
+					  inode->i_ino, offset);
+				offset += sb->s_blocksize;
+				continue;
+			}
+			de = (struct pmfs_direntry *)blk_base;
+		}
+		if (!pmfs_check_dir_entry("empty_dir", inode, de, blk_base,
+					offset)) {
+			de = (struct pmfs_direntry *)(blk_base +
+				sb->s_blocksize);
+			offset = (offset | (sb->s_blocksize - 1)) + 1;
+			continue;
+		}
+		if (le64_to_cpu(de->ino))
+			return 0;
+		offset += le16_to_cpu(de->de_len);
+		de = pmfs_next_entry(de);
+	}
+	return 1;
+}
+
+static int pmfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	struct pmfs_direntry *de;
+	pmfs_transaction_t *trans;
+	struct super_block *sb;
+	struct pmfs_inode *pi, *pidir;
+	int err = -ENOTEMPTY;
+
+	if (!inode)
+		return -ENOENT;
+
+	sb = inode->i_sb;
+	pi = pmfs_get_inode(sb, inode->i_ino);
+
+	if (pmfs_inode_by_name(dir, &dentry->d_name, &de) == 0)
+		return -ENOENT;
+
+	if (!pmfs_empty_dir(inode))
+		return err;
+
+	if (inode->i_nlink != 2)
+		pmfs_dbg("empty directory has nlink!=2 (%d)", inode->i_nlink);
+
+	trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES * 2 +
+			MAX_DIRENTRY_LENTRIES);
+	if (IS_ERR(trans)) {
+		err = PTR_ERR(trans);
+		return err;
+	}
+	pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA);
+
+	err = pmfs_remove_entry(trans, dentry, inode);
+	if (err)
+		goto end_rmdir;
+
+	/*inode->i_version++; */
+	clear_nlink(inode);
+	inode->i_ctime = dir->i_ctime;
+
+	pmfs_memunlock_inode(sb, pi);
+	pi->i_links_count = cpu_to_le16(inode->i_nlink);
+	pi->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+	pmfs_memlock_inode(sb, pi);
+
+	/* add the inode to truncate list in case a crash happens before the
+	 * subsequent evict_inode is called. It will be deleted from the
+	 * truncate list during evict_inode.
+	 */
+	pmfs_truncate_add(inode, inode->i_size);
+
+	pidir = pmfs_get_inode(sb, dir->i_ino);
+	pmfs_dec_count(dir, pidir);
+
+	pmfs_commit_transaction(sb, trans);
+	return err;
+end_rmdir:
+	pmfs_abort_transaction(sb, trans);
+	return err;
+}
+
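+/* rename logs up to four inodes (the old and new directories, the inode
+ * being moved, and a possible victim) plus two directory entries, which
+ * determines the transaction size below */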
+static int pmfs_rename(struct inode *old_dir,
+			struct dentry *old_dentry,
+			struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct inode *old_inode = old_dentry->d_inode;
+	struct inode *new_inode = new_dentry->d_inode;
+	struct pmfs_direntry *new_de = NULL, *old_de = NULL;
+	pmfs_transaction_t *trans;
+	struct super_block *sb = old_inode->i_sb;
+	struct pmfs_inode *pi, *new_pidir, *old_pidir;
+	int err = -ENOENT;
+
+	pmfs_inode_by_name(new_dir, &new_dentry->d_name, &new_de);
+	pmfs_inode_by_name(old_dir, &old_dentry->d_name, &old_de);
+
+	trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES * 4 +
+			MAX_DIRENTRY_LENTRIES * 2);
+	if (IS_ERR(trans)) {
+		err = PTR_ERR(trans);
+		goto out;
+	}
+
+	if (new_inode) {
+		err = -ENOTEMPTY;
+		if (S_ISDIR(old_inode->i_mode) && !pmfs_empty_dir(new_inode))
+			goto out;
+	} else {
+		if (S_ISDIR(old_inode->i_mode)) {
+			err = -EMLINK;
+			if (new_dir->i_nlink >= PMFS_LINK_MAX)
+				goto out;
+		}
+	}
+
+	new_pidir = pmfs_get_inode(sb, new_dir->i_ino);
+
+	pi = pmfs_get_inode(sb, old_inode->i_ino);
+	pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA);
+
+	if (!new_de) {
+		/* link it into the new directory. */
+		err = pmfs_add_entry(trans, new_dentry, old_inode);
+		if (err)
+			goto out;
+	} else {
+		pmfs_add_logentry(sb, trans, &new_de->ino, sizeof(new_de->ino),
+			LE_DATA);
+
+		pmfs_memunlock_range(sb, new_de, sb->s_blocksize);
+		new_de->ino = cpu_to_le64(old_inode->i_ino);
+		/*new_de->file_type = old_de->file_type; */
+		pmfs_memlock_range(sb, new_de, sb->s_blocksize);
+
+		pmfs_add_logentry(sb, trans, new_pidir, MAX_DATA_PER_LENTRY,
+			LE_DATA);
+		/*new_dir->i_version++; */
+		new_dir->i_ctime = new_dir->i_mtime = CURRENT_TIME_SEC;
+		pmfs_update_time(new_dir, new_pidir);
+	}
+
+	/* and unlink the inode from the old directory ... */
+	err = pmfs_remove_entry(trans, old_dentry, old_inode);
+	if (err)
+		goto out;
+
+	if (new_inode) {
+		pi = pmfs_get_inode(sb, new_inode->i_ino);
+		pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA);
+		new_inode->i_ctime = CURRENT_TIME;
+
+		pmfs_memunlock_inode(sb, pi);
+		if (S_ISDIR(old_inode->i_mode)) {
+			if (new_inode->i_nlink)
+				drop_nlink(new_inode);
+		}
+		pi->i_ctime = cpu_to_le32(new_inode->i_ctime.tv_sec);
+		if (new_inode->i_nlink)
+			drop_nlink(new_inode);
+		pi->i_links_count = cpu_to_le16(new_inode->i_nlink);
+		pmfs_memlock_inode(sb, pi);
+
+		if (!new_inode->i_nlink)
+			pmfs_truncate_add(new_inode, new_inode->i_size);
+	} else {
+		if (S_ISDIR(old_inode->i_mode)) {
+			pmfs_inc_count(new_dir, new_pidir);
+			old_pidir = pmfs_get_inode(sb, old_dir->i_ino);
+			pmfs_dec_count(old_dir, old_pidir);
+		}
+	}
+
+	pmfs_commit_transaction(sb, trans);
+	return 0;
+out:
+	pmfs_abort_transaction(sb, trans);
+	return err;
+}
+
+struct dentry *pmfs_get_parent(struct dentry *child)
+{
+	struct inode *inode;
+	struct qstr dotdot = { .name = "..", .len = 2 };
+	struct pmfs_direntry *de = NULL;
+	ino_t ino;
+
+	pmfs_inode_by_name(child->d_inode, &dotdot, &de);
+	if (!de)
+		return ERR_PTR(-ENOENT);
+	ino = le64_to_cpu(de->ino);
+
+	if (ino)
+		inode = pmfs_iget(child->d_inode->i_sb, ino);
+	else
+		return ERR_PTR(-ENOENT);
+
+	return d_obtain_alias(inode);
+}
+
+const struct inode_operations pmfs_dir_inode_operations = {
+	.create		= pmfs_create,
+	.lookup		= pmfs_lookup,
+	.link		= pmfs_link,
+	.unlink		= pmfs_unlink,
+	.symlink	= pmfs_symlink,
+	.mkdir		= pmfs_mkdir,
+	.rmdir		= pmfs_rmdir,
+	.mknod		= pmfs_mknod,
+	.rename		= pmfs_rename,
+	.setattr	= pmfs_notify_change,
+	.get_acl	= NULL,
+};
+
+const struct inode_operations pmfs_special_inode_operations = {
+	.setattr	= pmfs_notify_change,
+	.get_acl	= NULL,
+};
diff --git a/fs/pmfs/persist.c b/fs/pmfs/persist.c
new file mode 100644
index 0000000..a039028
--- /dev/null
+++ b/fs/pmfs/persist.c
@@ -0,0 +1,238 @@
+/*
+ * PMFS emulated persistence. This file contains code to load a PMFS image
+ * from a file into memory and to store it back to the file from memory.
+ *
+ * Persistent Memory File System
+ * Copyright (c) 2012-2013, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/vfs.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
+#include <linux/mount.h>
+#include <linux/mm.h>
+#include <linux/bitops.h>
+#include <linux/cred.h>
+#include <linux/backing-dev.h>
+#include "pmfs.h"
+
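+/* Helpers to access the backing file with kernel-space buffers: temporarily
+ * widen the address limit with set_fs(get_ds()) so vfs_read()/vfs_write()
+ * accept pointers into kernel memory, then restore the old limit. */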
+static ssize_t pmfs_write_backing_store(struct file *flp, char *src,
+		ssize_t bytes, loff_t *woff)
+{
+	mm_segment_t old_fs;
+	ssize_t len = 0;
+
+	if (bytes > 0) {
+		old_fs = get_fs();
+		set_fs(get_ds());
+		len = vfs_write(flp, src, bytes, woff);
+		set_fs(old_fs);
+		if (len <= 0)
+			pmfs_dbg_verbose("Could not write file or corrupted pmfs\n");
+	}
+	return len;
+}
+
+static ssize_t pmfs_read_backing_store(struct file *flp, char *dest,
+	ssize_t bytes, loff_t *roff)
+{
+	mm_segment_t old_fs;
+	ssize_t len = 0;
+
+	if (bytes > 0) {
+		old_fs = get_fs();
+		set_fs(get_ds());
+		len = vfs_read(flp, dest, bytes, roff);
+		set_fs(old_fs);
+		if (len <= 0)
+			pmfs_dbg_verbose("Could not read file or corrupted pmfs\n");
+	}
+	return len;
+}
+
+/* Stores PMFS memory image into a storage file. Uses the allocation blocknode
+ * linked list to determine which memory ranges to save */
+static int pmfs_storefs(struct file *flp, struct super_block *sb)
+{
+	loff_t woff = 0;
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	u64 num_blocknodes = sbi->num_blocknode_allocated, size;
+	struct list_head *head = &(sbi->block_inuse_head);
+	struct pmfs_blocknode *i;
+	struct pmfs_blocknode_lowhigh p;
+	char *ptr;
+
+	pmfs_info("storing pmfs to %s with 0x%llx blknodes\n",
+			   sbi->pmfs_backing_file, num_blocknodes);
+	/* first save the number of blocknodes */
+	if (pmfs_write_backing_store(flp, (char *)&num_blocknodes, sizeof(u64),
+		    &woff) != sizeof(u64))
+		goto out;
+	/* Then save the blocks containing blocknodes. */
+	list_for_each_entry(i, head, link) {
+		p.block_low = cpu_to_le64(i->block_low);
+		p.block_high = cpu_to_le64(i->block_high);
+		if (pmfs_write_backing_store(flp, (char *)&p, sizeof(p), &woff)
+				!= sizeof(p))
+			goto out;
+	}
+	/* align the write offset on 4K boundary */
+	woff = (woff + PAGE_SIZE - 1) & ~(0xFFFUL);
+	/* Now save all the memory ranges allocated in the PMFS. These ranges
+	 * are specified by the block_low and block_high fields of every
+	 * struct pmfs_blocknode_lowhigh */
+	list_for_each_entry(i, head, link) {
+		if (i->block_low == 0)
+			ptr = (char *)pmfs_get_super(sb);
+		else
+			ptr = pmfs_get_block(sb, i->block_low << PAGE_SHIFT);
+		size = (i->block_high - i->block_low + 1) << PAGE_SHIFT;
+		if (pmfs_write_backing_store(flp, ptr, size, &woff) != size)
+			goto out;
+	}
+	vfs_fsync(flp, 0);
+	return 0;
+out:
+	return -EINVAL;
+}
+
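+/* Load a PMFS image from the backing file: read the blocknode count and the
+ * blocknode list, ioremap the PM region, then copy every saved block range
+ * back into the mapped region (the first page, holding the super block, is
+ * restored separately). */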
+static int pmfs_loadfs(struct file *flp, struct super_block *sb)
+{
+	char *pmfs_base, *buf1, *buf2, *ptr;
+	struct pmfs_super_block *super;
+	loff_t roff = 0;
+	int retval = -EINVAL;
+	u64 pmfs_size, buf_sz, num_blocknodes, i, size;
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	struct pmfs_blocknode_lowhigh *p;
+
+	if (pmfs_read_backing_store(flp, (char *)&num_blocknodes, sizeof(u64),
+		    &roff) != sizeof(u64))
+		return retval;
+
+	pmfs_info("Loading PMFS from %s to phys %llx with 0x%llx blknodes\n",
+		sbi->pmfs_backing_file, sbi->phys_addr, num_blocknodes);
+	buf_sz = num_blocknodes * sizeof(struct pmfs_blocknode_lowhigh);
+
+	buf1 = kmalloc(buf_sz, GFP_KERNEL);
+	if (buf1 == NULL)
+		return retval;
+
+	if (pmfs_read_backing_store(flp, buf1, buf_sz, &roff) != buf_sz)
+		goto out1;
+	p = (struct pmfs_blocknode_lowhigh *)buf1;
+
+	/* align the read offset on 4K boundary */
+	roff = (roff + PAGE_SIZE - 1) & ~(0xFFFUL);
+
+	buf2 = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (buf2 == NULL)
+		goto out1;
+	if (pmfs_read_backing_store(flp, buf2, PAGE_SIZE, &roff) != PAGE_SIZE)
+		goto out2;
+
+	super = (struct pmfs_super_block *)buf2;
+	if (pmfs_check_integrity(NULL, super) == 0) {
+		pmfs_err(sb, "file contains invalid pmfs\n");
+		goto out2;
+	}
+	pmfs_size = le64_to_cpu(super->s_size);
+	pmfs_base = pmfs_ioremap(NULL, sbi->phys_addr, pmfs_size);
+	if (!pmfs_base) {
+		pmfs_err(sb, "ioremap of the pmfs image failed\n");
+		goto out2;
+	}
+	memcpy(pmfs_base, buf2, PAGE_SIZE);
+	/* now walk through the blocknode list and copy every range specified
+	 * in the list to PMFS area */
+	for (i = 0; i < num_blocknodes; i++, p++) {
+		if (p->block_low == 0) {
+			ptr = pmfs_base + PAGE_SIZE;
+			size = (le64_to_cpu(p->block_high) -
+				le64_to_cpu(p->block_low)) << PAGE_SHIFT;
+		} else {
+			ptr = pmfs_base + (le64_to_cpu(p->block_low) <<
+				PAGE_SHIFT);
+			size = (le64_to_cpu(p->block_high) -
+				le64_to_cpu(p->block_low) + 1) << PAGE_SHIFT;
+		}
+		if (pmfs_read_backing_store(flp, ptr, size, &roff) != size)
+			goto out;
+	}
+	retval = 0;
+out:
+	iounmap(pmfs_base);
+	release_mem_region(sbi->phys_addr, pmfs_size);
+out2:
+	kfree(buf2);
+out1:
+	kfree(buf1);
+	return retval;
+}
+
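+/* Called at mount time: populate the PM region from the backing file, unless
+ * no backing file is set or backing_opt=1 (skip the load). */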
+void pmfs_load_from_file(struct super_block *sb)
+{
+	struct file *flp;
+	mm_segment_t oldfs;
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+
+	if (strlen(sbi->pmfs_backing_file) && sbi->pmfs_backing_option != 1) {
+		oldfs = get_fs();
+		set_fs(get_ds());
+		flp = filp_open(sbi->pmfs_backing_file, O_RDONLY | O_LARGEFILE,
+			S_IRWXU);
+		set_fs(oldfs);
+		if (IS_ERR(flp)) {
+			pmfs_info("Can't open backing file %s\n",
+				   sbi->pmfs_backing_file);
+		} else {
+			pmfs_loadfs(flp, sb);
+			oldfs = get_fs();
+			set_fs(get_ds());
+			filp_close(flp, current->files);
+			set_fs(oldfs);
+		}
+	}
+}
+
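+/* Called at unmount time: save the PM region to the backing file, unless no
+ * backing file is set or backing_opt=2 (skip the store). */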
+void pmfs_store_to_file(struct super_block *sb)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	if (strlen(sbi->pmfs_backing_file) && sbi->pmfs_backing_option != 2) {
+		struct file *flp;
+		mm_segment_t oldfs;
+		oldfs = get_fs();
+		set_fs(get_ds());
+		flp = filp_open(sbi->pmfs_backing_file,
+			O_WRONLY | O_CREAT | O_TRUNC | O_LARGEFILE, S_IRWXU);
+		set_fs(oldfs);
+		if (IS_ERR(flp)) {
+			pmfs_info("Can't open file %s\n",
+				   sbi->pmfs_backing_file);
+		} else {
+			pmfs_storefs(flp, sb);
+			oldfs = get_fs();
+			set_fs(get_ds());
+			filp_close(flp, current->files);
+			set_fs(oldfs);
+		}
+	}
+	sbi->pmfs_backing_file[0] = '\0';
+	sbi->pmfs_backing_option = 0;
+}
diff --git a/fs/pmfs/pmfs.h b/fs/pmfs/pmfs.h
new file mode 100644
index 0000000..5dbb74c
--- /dev/null
+++ b/fs/pmfs/pmfs.h
@@ -0,0 +1,576 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Definitions for the PMFS filesystem.
+ *
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.stornelli at gmail.com>
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+#ifndef __PMFS_H
+#define __PMFS_H
+
+#include <linux/buffer_head.h>
+#include <linux/pmfs_def.h>
+#include <linux/pmfs_sb.h>
+#include <linux/crc16.h>
+#include <linux/mutex.h>
+#include <linux/rcupdate.h>
+#include <linux/types.h>
+#include "wprotect.h"
+#include "journal.h"
+
+#define PAGE_SHIFT_2M 21
+#define PAGE_SHIFT_1G 30
+
+#define PMFS_ASSERT(x)                                                 \
+	if (!(x)) {                                                     \
+		printk(KERN_WARNING "assertion failed %s:%d: %s\n",     \
+	               __FILE__, __LINE__, #x);                         \
+	}
+
+/*
+ * Debug code
+ */
+#ifdef pr_fmt
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#endif
+
+/* #define pmfs_dbg(s, args...)         pr_debug(s, ## args) */
+#define pmfs_dbg(s, args ...)           pr_info(s, ## args)
+#define pmfs_dbg1(s, args ...)
+#define pmfs_err(sb, s, args ...)       pmfs_error_mng(sb, s, ## args)
+#define pmfs_warn(s, args ...)          pr_warning(s, ## args)
+#define pmfs_info(s, args ...)          pr_info(s, ## args)
+
+extern unsigned int pmfs_dbgmask;
+#define PMFS_DBGMASK_MMAPHUGE          (0x00000001)
+#define PMFS_DBGMASK_MMAP4K            (0x00000002)
+#define PMFS_DBGMASK_MMAPVERBOSE       (0x00000004)
+#define PMFS_DBGMASK_MMAPVVERBOSE      (0x00000008)
+#define PMFS_DBGMASK_VERBOSE           (0x00000010)
+#define PMFS_DBGMASK_TRANSACTION       (0x00000020)
+
+#define pmfs_dbg_mmaphuge(s, args ...)		 \
+	((pmfs_dbgmask & PMFS_DBGMASK_MMAPHUGE) ? pmfs_dbg(s, args) : 0)
+#define pmfs_dbg_mmap4k(s, args ...)		 \
+	((pmfs_dbgmask & PMFS_DBGMASK_MMAP4K) ? pmfs_dbg(s, args) : 0)
+#define pmfs_dbg_mmapv(s, args ...)		 \
+	((pmfs_dbgmask & PMFS_DBGMASK_MMAPVERBOSE) ? pmfs_dbg(s, args) : 0)
+#define pmfs_dbg_mmapvv(s, args ...)		 \
+	((pmfs_dbgmask & PMFS_DBGMASK_MMAPVVERBOSE) ? pmfs_dbg(s, args) : 0)
+
+#define pmfs_dbg_verbose(s, args ...)		 \
+	((pmfs_dbgmask & PMFS_DBGMASK_VERBOSE) ? pmfs_dbg(s, ##args) : 0)
+#define pmfs_dbg_trans(s, args ...)		 \
+	((pmfs_dbgmask & PMFS_DBGMASK_TRANSACTION) ? pmfs_dbg(s, ##args) : 0)
+
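+/* Use little-endian bitmap operations so the on-media bitmaps have the same
+ * layout regardless of host endianness. */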
+#define pmfs_set_bit                   __test_and_set_bit_le
+#define pmfs_clear_bit                 __test_and_clear_bit_le
+#define pmfs_find_next_zero_bit                find_next_zero_bit_le
+
+#define clear_opt(o, opt)       (o &= ~PMFS_MOUNT_ ## opt)
+#define set_opt(o, opt)         (o |= PMFS_MOUNT_ ## opt)
+#define test_opt(sb, opt)       (PMFS_SB(sb)->s_mount_opt & PMFS_MOUNT_ ## opt)
+
+#define PMFS_LARGE_INODE_TABLE_SIZE    (0x200000)
+/* PMFS size threshold for using 2M blocks for inode table */
+#define PMFS_LARGE_INODE_TABLE_THREASHOLD    (0x20000000)
+/*
+ * pmfs inode flags
+ *
+ * PMFS_EOFBLOCKS_FL	There are blocks allocated beyond eof
+ */
+#define PMFS_EOFBLOCKS_FL      0x20000000
+/* Flags that should be inherited by new inodes from their parent. */
+#define PMFS_FL_INHERITED (FS_SECRM_FL | FS_UNRM_FL | FS_COMPR_FL | \
+			    FS_SYNC_FL | FS_NODUMP_FL | FS_NOATIME_FL |	\
+			    FS_COMPRBLK_FL | FS_NOCOMP_FL | FS_JOURNAL_DATA_FL | \
+			    FS_NOTAIL_FL | FS_DIRSYNC_FL)
+/* Flags that are appropriate for regular files (all but dir-specific ones). */
+#define PMFS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL))
+/* Flags that are appropriate for non-directories/regular files. */
+#define PMFS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL)
+#define PMFS_FL_USER_VISIBLE (FS_FL_USER_VISIBLE | PMFS_EOFBLOCKS_FL)
+
+#define INODES_PER_BLOCK(bt) (1 << (blk_type_to_shift[bt] - PMFS_INODE_BITS))
+
+extern unsigned int blk_type_to_shift[PMFS_BLOCK_TYPE_MAX];
+extern unsigned int blk_type_to_size[PMFS_BLOCK_TYPE_MAX];
+
+/* Function Prototypes */
+extern void pmfs_error_mng(struct super_block *sb, const char *fmt, ...);
+
+/* file.c */
+extern int pmfs_mmap(struct file *file, struct vm_area_struct *vma);
+
+/* balloc.c */
+int pmfs_setup_blocknode_map(struct super_block *sb);
+extern struct pmfs_blocknode *pmfs_alloc_blocknode(struct super_block *sb);
+extern void pmfs_free_blocknode(struct super_block *sb, struct pmfs_blocknode *bnode);
+extern void pmfs_init_blockmap(struct super_block *sb,
+		unsigned long init_used_size);
+extern void pmfs_free_block(struct super_block *sb, unsigned long blocknr,
+	unsigned short btype);
+extern int pmfs_new_block(struct super_block *sb, unsigned long *blocknr,
+	unsigned short btype, int zero);
+extern unsigned long pmfs_count_free_blocks(struct super_block *sb);
+
+/* dir.c */
+extern int pmfs_add_entry(pmfs_transaction_t *trans,
+		struct dentry *dentry, struct inode *inode);
+extern int pmfs_remove_entry(pmfs_transaction_t *trans,
+		struct dentry *dentry, struct inode *inode);
+
+/* namei.c */
+extern struct dentry *pmfs_get_parent(struct dentry *child);
+
+/* inode.c */
+extern unsigned int pmfs_free_inode_subtree(struct super_block *sb,
+                u64 root, u32 height, u32 btype, loff_t end);
+extern int __pmfs_alloc_blocks(pmfs_transaction_t *trans,
+		struct super_block *sb, struct pmfs_inode *pi,
+		unsigned long file_blocknr, unsigned int num, bool zero);
+extern int pmfs_init_inode_table(struct super_block *sb);
+extern int pmfs_alloc_blocks(pmfs_transaction_t *trans, struct inode *inode,
+		unsigned long file_blocknr, unsigned int num, bool zero);
+extern u64 pmfs_find_data_block(struct inode *inode,
+	unsigned long file_blocknr);
+int pmfs_set_blocksize_hint(struct super_block *sb, struct pmfs_inode *pi,
+		loff_t new_size);
+void pmfs_setsize(struct inode *inode, loff_t newsize);
+
+extern struct inode *pmfs_iget(struct super_block *sb, unsigned long ino);
+extern void pmfs_put_inode(struct inode *inode);
+extern void pmfs_evict_inode(struct inode *inode);
+extern struct inode *pmfs_new_inode(pmfs_transaction_t *trans,
+	struct inode *dir, umode_t mode, const struct qstr *qstr);
+extern inline void pmfs_update_isize(struct inode *inode,
+		struct pmfs_inode *pi);
+extern inline void pmfs_update_nlink(struct inode *inode,
+		struct pmfs_inode *pi);
+extern inline void pmfs_update_time(struct inode *inode,
+		struct pmfs_inode *pi);
+extern int pmfs_write_inode(struct inode *inode,
+	struct writeback_control *wbc);
+extern void pmfs_dirty_inode(struct inode *inode, int flags);
+extern int pmfs_notify_change(struct dentry *dentry, struct iattr *attr);
+int pmfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		struct kstat *stat);
+extern void pmfs_set_inode_flags(struct inode *inode, struct pmfs_inode *pi);
+extern void pmfs_get_inode_flags(struct inode *inode, struct pmfs_inode *pi);
+extern unsigned long pmfs_find_region(struct inode *inode, loff_t *offset,
+		int hole);
+extern void pmfs_truncate_del(struct inode *inode);
+extern void pmfs_truncate_add(struct inode *inode, u64 truncate_size);
+
+/* ioctl.c */
+extern long pmfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
+#ifdef CONFIG_COMPAT
+extern long pmfs_compat_ioctl(struct file *file, unsigned int cmd,
+	unsigned long arg);
+#endif
+
+/* super.c */
+#ifdef CONFIG_PMFS_TEST
+extern struct pmfs_super_block *get_pmfs_super(void);
+#endif
+extern void __pmfs_free_blocknode(struct pmfs_blocknode *bnode);
+extern struct super_block *pmfs_read_super(struct super_block *sb, void *data,
+	int silent);
+extern int pmfs_statfs(struct dentry *d, struct kstatfs *buf);
+extern int pmfs_remount(struct super_block *sb, int *flags, char *data);
+
+/* Provides ordering from all previous clflush too */
+static inline void PERSISTENT_MARK(void)
+{
+	/* TODO: Fix me. */
+}
+
+static inline void PERSISTENT_BARRIER(void)
+{
+	asm volatile ("sfence\n" : : );
+}
+
+static inline void pmfs_flush_buffer(void *buf, uint32_t len, bool fence)
+{
+	uint32_t i;
+	len = len + ((unsigned long)(buf) & (CACHELINE_SIZE - 1));
+	for (i = 0; i < len; i += CACHELINE_SIZE)
+		asm volatile ("clflush %0\n" : "+m" (*(char *)(buf+i)));
+	/* Do a fence only if asked. We often don't need to do a fence
+	 * immediately after clflush because even if we get context switched
+	 * between clflush and the subsequent fence, the context switch
+	 * operation provides an implicit fence. */
+	if (fence)
+		asm volatile ("sfence\n" : : );
+}
+
+/* symlink.c */
+extern int pmfs_block_symlink(struct inode *inode, const char *symname,
+	int len);
+
+/* Inline functions start here */
+
+/* Mask out flags that are inappropriate for the given type of inode. */
+static inline __le32 pmfs_mask_flags(umode_t mode, __le32 flags)
+{
+	flags &= cpu_to_le32(PMFS_FL_INHERITED);
+	if (S_ISDIR(mode))
+		return flags;
+	else if (S_ISREG(mode))
+		return flags & cpu_to_le32(PMFS_REG_FLMASK);
+	else
+		return flags & cpu_to_le32(PMFS_OTHER_FLMASK);
+}
+
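+/* Verify a structure whose first 16 bits hold a crc16 of the remaining bytes;
+ * returns 0 if the stored checksum matches, 1 otherwise. */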
+static inline int pmfs_calc_checksum(u8 *data, int n)
+{
+	u16 crc = 0;
+
+	crc = crc16(~0, (__u8 *)data + sizeof(__le16), n - sizeof(__le16));
+	if (*((__le16 *)data) == cpu_to_le16(crc))
+		return 0;
+	else
+		return 1;
+}
+
+struct pmfs_blocknode_lowhigh {
+       unsigned long block_low;
+       unsigned long block_high;
+};
+
+struct pmfs_blocknode {
+	struct list_head link;
+	unsigned long block_low;
+	unsigned long block_high;
+};
+
+struct pmfs_inode_vfs {
+	__u32   i_dir_start_lookup;
+	struct list_head i_truncated;
+	struct inode	vfs_inode;
+};
+
+static inline struct pmfs_sb_info *PMFS_SB(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+static inline struct pmfs_inode_vfs *PMFS_I(struct inode *inode)
+{
+	return container_of(inode, struct pmfs_inode_vfs, vfs_inode);
+}
+
+/* If this is part of a read-modify-write of the super block,
+ * pmfs_memunlock_super() before calling! */
+static inline struct pmfs_super_block *pmfs_get_super(struct super_block *sb)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+
+	return (struct pmfs_super_block *)sbi->virt_addr;
+}
+
+static inline pmfs_journal_t *pmfs_get_journal(struct super_block *sb)
+{
+	struct pmfs_super_block *ps = pmfs_get_super(sb);
+
+	return (pmfs_journal_t *)((char *)ps +
+			le64_to_cpu(ps->s_journal_offset));
+}
+
+static inline struct pmfs_inode *pmfs_get_inode_table(struct super_block *sb)
+{
+	struct pmfs_super_block *ps = pmfs_get_super(sb);
+
+	return (struct pmfs_inode *)((char *)ps +
+			le64_to_cpu(ps->s_inode_table_offset));
+}
+
+static inline struct pmfs_super_block *pmfs_get_redund_super(struct super_block *sb)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+
+	return (struct pmfs_super_block *)(sbi->virt_addr + PMFS_SB_SIZE);
+}
+
+/* If this is part of a read-modify-write of the block,
+ * pmfs_memunlock_block() before calling! */
+static inline void *pmfs_get_block(struct super_block *sb, u64 block)
+{
+	struct pmfs_super_block *ps = pmfs_get_super(sb);
+
+	return block ? ((void *)ps + block) : NULL;
+}
+
+/* uses CPU instructions to atomically write up to 8 bytes */
+static inline void pmfs_memcpy_atomic(void *dst, const void *src, u8 size)
+{
+	switch (size) {
+		case 1: {
+			volatile u8 *daddr = dst;
+			const u8 *saddr = src;
+			*daddr = *saddr;
+			break;
+		}
+		case 2: {
+			volatile u16 *daddr = dst;
+			const u16 *saddr = src;
+			*daddr = cpu_to_le16(*saddr);
+			break;
+		}
+		case 4: {
+			volatile u32 *daddr = dst;
+			const u32 *saddr = src;
+			*daddr = cpu_to_le32(*saddr);
+			break;
+		}
+		case 8: {
+			volatile u64 *daddr = dst;
+			const u64 *saddr = src;
+			*daddr = cpu_to_le64(*saddr);
+			break;
+		}
+		default:
+			pmfs_dbg("error: memcpy_atomic called with %d bytes\n", size);
+			//BUG();
+	}
+}
+
+static inline void pmfs_update_time_and_size(struct inode *inode,
+	struct pmfs_inode *pi)
+{
+	uint32_t words[2];
+	/* pi->i_size, pi->i_ctime, and pi->i_mtime need to be atomically updated.
+ 	* So use cmpxchg16b here. */
+	words[0] = cpu_to_le32(inode->i_ctime.tv_sec);
+	words[1] = cpu_to_le32(inode->i_mtime.tv_sec);
+	/* TODO: the following function assumes cmpxchg16b instruction writes
+ 	* 16 bytes atomically. Confirm if it is really true. */
+	cmpxchg_double_local(&pi->i_size, (u64 *)&pi->i_ctime, pi->i_size,
+		*(u64 *)&pi->i_ctime, inode->i_size, *(u64 *)words);
+}
+
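+/* Non-temporal memset: movnti stores bypass the CPU cache, so the data does
+ * not need a subsequent clflush to reach persistent memory. */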
+/* assumes the length to be 4-byte aligned */
+static inline void memset_nt(void *dest, uint32_t dword, size_t length)
+{
+	uint64_t dummy1, dummy2;
+	uint64_t qword = ((uint64_t)dword << 32) | dword;
+
+	asm volatile ("movl %%edx,%%ecx\n"
+		"andl $63,%%edx\n"
+		"shrl $6,%%ecx\n"
+		"jz 9f\n"
+		"1:      movnti %%rax,(%%rdi)\n"
+		"2:      movnti %%rax,1*8(%%rdi)\n"
+		"3:      movnti %%rax,2*8(%%rdi)\n"
+		"4:      movnti %%rax,3*8(%%rdi)\n"
+		"5:      movnti %%rax,4*8(%%rdi)\n"
+		"8:      movnti %%rax,5*8(%%rdi)\n"
+		"7:      movnti %%rax,6*8(%%rdi)\n"
+		"8:      movnti %%rax,7*8(%%rdi)\n"
+		"leaq 64(%%rdi),%%rdi\n"
+		"decl %%ecx\n"
+		"jnz 1b\n"
+		"9:     movl %%edx,%%ecx\n"
+		"andl $7,%%edx\n"
+		"shrl $3,%%ecx\n"
+		"jz 11f\n"
+		"10:     movnti %%rax,(%%rdi)\n"
+		"leaq 8(%%rdi),%%rdi\n"
+		"decl %%ecx\n"
+		"jnz 10b\n"
+		"11:     movl %%edx,%%ecx\n"
+		"shrl $2,%%ecx\n"
+		"jz 12f\n"
+		"movnti %%eax,(%%rdi)\n"
+		"12:\n"
+		: "=D"(dummy1), "=d" (dummy2) : "D" (dest), "a" (qword), "d" (length) : "memory", "rcx");
+}
+
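+/* Walk the inode's block tree: each level consumes META_BLK_SHIFT bits of the
+ * file block number and yields the offset of the next-level block. Returns
+ * the byte offset of the data block, or 0 if the block is a hole. */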
+static inline u64 __pmfs_find_data_block(struct super_block *sb,
+		struct pmfs_inode *pi, unsigned long blocknr)
+{
+	u64 *level_ptr, bp = 0;
+	u32 height, bit_shift;
+	unsigned int idx;
+
+	height = pi->height;
+	bp = le64_to_cpu(pi->root);
+
+	while (height > 0) {
+		level_ptr = pmfs_get_block(sb, bp);
+		bit_shift = (height - 1) * META_BLK_SHIFT;
+		idx = blocknr >> bit_shift;
+		bp = le64_to_cpu(level_ptr[idx]);
+		if (bp == 0)
+			return 0;
+		blocknr = blocknr & ((1 << bit_shift) - 1);
+		height--;
+	}
+	return bp;
+}
+
+static inline unsigned int pmfs_inode_blk_shift(struct pmfs_inode *pi)
+{
+	return blk_type_to_shift[pi->i_blk_type];
+}
+
+static inline uint32_t pmfs_inode_blk_size(struct pmfs_inode *pi)
+{
+	return blk_type_to_size[pi->i_blk_type];
+}
+
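+/* The inode number is the byte offset of the inode within the inode table,
+ * so locating an inode is a block-tree lookup plus an offset within that
+ * block. */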
+/* If this is part of a read-modify-write of the inode metadata,
+ * pmfs_memunlock_inode() before calling! */
+static inline struct pmfs_inode *pmfs_get_inode(struct super_block *sb,
+						  u64	ino)
+{
+	struct pmfs_super_block *ps = pmfs_get_super(sb);
+	struct pmfs_inode *inode_table = pmfs_get_inode_table(sb);
+	u64 bp, block, ino_offset;
+
+	if (ino == 0)
+		return NULL;
+
+	block = ino >> pmfs_inode_blk_shift(inode_table);
+	bp = __pmfs_find_data_block(sb, inode_table, block);
+
+	if (bp == 0)
+		return NULL;
+	ino_offset = (ino & (pmfs_inode_blk_size(inode_table) - 1));
+	return (struct pmfs_inode *)((void *)ps + bp + ino_offset);
+}
+
+static inline u64
+pmfs_get_addr_off(struct pmfs_sb_info *sbi, void *addr)
+{
+	PMFS_ASSERT((addr >= sbi->virt_addr) &&
+			(addr < (sbi->virt_addr + sbi->initsize)));
+	return (u64)(addr - sbi->virt_addr);
+}
+
+static inline u64
+pmfs_get_block_off(struct super_block *sb, unsigned long blocknr,
+		    unsigned short btype)
+{
+	return (u64)blocknr << PAGE_SHIFT;
+}
+
+static inline unsigned long
+pmfs_get_numblocks(unsigned short btype)
+{
+	unsigned long num_blocks;
+
+	if (btype == PMFS_BLOCK_TYPE_4K) {
+		num_blocks = 1;
+	} else if (btype == PMFS_BLOCK_TYPE_2M) {
+		num_blocks = 512;
+	} else {
+		/* btype == PMFS_BLOCK_TYPE_1G */
+		num_blocks = 0x40000;
+	}
+	return num_blocks;
+}
+
+static inline unsigned long
+pmfs_get_blocknr(struct super_block *sb, u64 block, unsigned short btype)
+{
+	return block >> PAGE_SHIFT;
+}
+
+static inline unsigned long pmfs_get_pfn(struct super_block *sb, u64 block)
+{
+	return (PMFS_SB(sb)->phys_addr + block) >> PAGE_SHIFT;
+}
+
+static inline int pmfs_is_mounting(struct super_block *sb)
+{
+	struct pmfs_sb_info *sbi = (struct pmfs_sb_info *)sb->s_fs_info;
+	return sbi->s_mount_opt & PMFS_MOUNT_MOUNTING;
+}
+
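+/* The truncate item for an inode is stored in the bytes immediately following
+ * the on-media inode (pi + 1); the list head follows the inode-table inode
+ * itself. */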
+static inline struct pmfs_inode_truncate_item *
+pmfs_get_truncate_item(struct super_block *sb, u64 ino)
+{
+	struct pmfs_inode *pi = pmfs_get_inode(sb, ino);
+	return (struct pmfs_inode_truncate_item *)(pi + 1);
+}
+
+static inline struct pmfs_inode_truncate_item *
+pmfs_get_truncate_list_head(struct super_block *sb)
+{
+	struct pmfs_inode *pi = pmfs_get_inode_table(sb);
+	return (struct pmfs_inode_truncate_item *)(pi + 1);
+}
+
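+/* Clear PMFS_EOFBLOCKS_FL once the file size covers the blocks that were
+ * allocated beyond the old end of file. */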
+static inline void check_eof_blocks(struct super_block *sb,
+		struct pmfs_inode *pi, loff_t size)
+{
+	if ((pi->i_flags & cpu_to_le32(PMFS_EOFBLOCKS_FL)) &&
+		(size + sb->s_blocksize) > (le64_to_cpu(pi->i_blocks)
+			<< sb->s_blocksize_bits))
+		pi->i_flags &= cpu_to_le32(~PMFS_EOFBLOCKS_FL);
+}
+
+/*
+ * Inodes and files operations
+ */
+
+/* dir.c */
+extern const struct file_operations pmfs_dir_operations;
+
+/* file.c */
+extern const struct inode_operations pmfs_file_inode_operations;
+extern const struct file_operations pmfs_xip_file_operations;
+
+/* inode.c */
+extern const struct address_space_operations pmfs_aops_xip;
+
+/* bbuild.c */
+void pmfs_save_blocknode_mappings(struct super_block *sb);
+
+/* namei.c */
+extern const struct inode_operations pmfs_dir_inode_operations;
+extern const struct inode_operations pmfs_special_inode_operations;
+
+/* symlink.c */
+extern const struct inode_operations pmfs_symlink_inode_operations;
+
+extern struct backing_dev_info pmfs_backing_dev_info;
+
+int pmfs_check_integrity(struct super_block *sb,
+	struct pmfs_super_block *super);
+void *pmfs_ioremap(struct super_block *sb, phys_addr_t phys_addr,
+	ssize_t size);
+
+/* Emulated persistence APIs */
+void pmfs_set_backing_file(char *file_str);
+void pmfs_set_backing_option(int option);
+void pmfs_load_from_file(struct super_block *sb);
+void pmfs_store_to_file(struct super_block *sb);
+
+int pmfs_check_dir_entry(const char *function, struct inode *dir,
+			  struct pmfs_direntry *de, u8 *base,
+			  unsigned long offset);
+
+static inline int pmfs_match(int len, const char *const name,
+			      struct pmfs_direntry *de)
+{
+	if (len == de->name_len && de->ino && !memcmp(de->name, name, len))
+		return 1;
+	return 0;
+}
+
+int pmfs_search_dirblock(u8 *blk_base, struct inode *dir, struct qstr *child,
+			  unsigned long offset,
+			  struct pmfs_direntry **res_dir,
+			  struct pmfs_direntry **prev_dir);
+
+#endif /* __PMFS_H */
diff --git a/fs/pmfs/pmfs_test.c b/fs/pmfs/pmfs_test.c
new file mode 100644
index 0000000..94edbef
--- /dev/null
+++ b/fs/pmfs/pmfs_test.c
@@ -0,0 +1,50 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * pmfs test module.
+ *
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.stornelli at gmail.com>
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+#include <linux/module.h>
+#include <linux/version.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include "pmfs.h"
+
+static int __init test_pmfs_write(void)
+{
+	struct pmfs_super_block *psb;
+
+	psb = get_pmfs_super();
+	if (!psb) {
+		printk(KERN_ERR
+		       "%s: PMFS super block not found (not mounted?)\n",
+		       __func__);
+		return -ENODEV;
+	}
+
+	/*
+	 * Attempt an unprotected clear of checksum information in the
+	 * superblock, this should cause a kernel page protection fault.
+	 */
+	printk("%s: writing to kernel VA %p\n", __func__, psb);
+	psb->s_sum = 0;
+
+	return 0;
+}
+
+static void __exit test_pmfs_write_cleanup(void)
+{
+}
+
+/* Module information */
+MODULE_LICENSE("GPL");
+module_init(test_pmfs_write);
+module_exit(test_pmfs_write_cleanup);
diff --git a/fs/pmfs/super.c b/fs/pmfs/super.c
new file mode 100644
index 0000000..7f708e8
--- /dev/null
+++ b/fs/pmfs/super.c
@@ -0,0 +1,1217 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Super block operations.
+ *
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.stornelli at gmail.com>
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/parser.h>
+#include <linux/vfs.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
+#include <linux/mm.h>
+#include <linux/ctype.h>
+#include <linux/bitops.h>
+#include <linux/magic.h>
+#include <linux/exportfs.h>
+#include <linux/random.h>
+#include <linux/cred.h>
+#include <linux/backing-dev.h>
+#include <linux/list.h>
+#include "pmfs.h"
+
+static struct super_operations pmfs_sops;
+static const struct export_operations pmfs_export_ops;
+static struct kmem_cache *pmfs_inode_cachep;
+static struct kmem_cache *pmfs_blocknode_cachep;
+static struct kmem_cache *pmfs_transaction_cachep;
+/* FIXME: should the following variable be one per PMFS instance? */
+unsigned int pmfs_dbgmask = 0;
+
+#ifdef CONFIG_PMFS_TEST
+static void *first_pmfs_super;
+
+struct pmfs_super_block *get_pmfs_super(void)
+{
+	return (struct pmfs_super_block *)first_pmfs_super;
+}
+EXPORT_SYMBOL(get_pmfs_super);
+#endif
+
+void pmfs_error_mng(struct super_block *sb, const char *fmt, ...)
+{
+	va_list args;
+
+	printk("pmfs error: ");
+	va_start(args, fmt);
+	vprintk(fmt, args);
+	va_end(args);
+
+	if (test_opt(sb, ERRORS_PANIC))
+		panic("pmfs: panic from previous error\n");
+	if (test_opt(sb, ERRORS_RO)) {
+		printk(KERN_CRIT "pmfs err: remounting filesystem read-only");
+		sb->s_flags |= MS_RDONLY;
+	}
+}
+
+static void pmfs_set_blocksize(struct super_block *sb, unsigned long size)
+{
+	int bits;
+
+	/*
+	 * We've already validated the user input and the value here must be
+	 * between PMFS_MAX_BLOCK_SIZE and PMFS_MIN_BLOCK_SIZE
+	 * and it must be a power of 2.
+	 */
+	bits = fls(size) - 1;
+	sb->s_blocksize_bits = bits;
+	sb->s_blocksize = (1 << bits);
+}
+
+static inline int pmfs_has_huge_ioremap(struct super_block *sb)
+{
+	struct pmfs_sb_info *sbi = (struct pmfs_sb_info *)sb->s_fs_info;
+
+	return sbi->s_mount_opt & PMFS_MOUNT_HUGEIOREMAP;
+}
+
+void *pmfs_ioremap(struct super_block *sb, phys_addr_t phys_addr, ssize_t size)
+{
+	void *retval;
+	int protect, hugeioremap;
+
+	if (sb) {
+		protect = pmfs_is_wprotected(sb);
+		hugeioremap = pmfs_has_huge_ioremap(sb);
+	} else {
+		protect = 0;
+		hugeioremap = 1;
+	}
+
+	/*
+	 * NOTE: Userland may not map this resource, we will mark the region so
+	 * /dev/mem and the sysfs MMIO access will not be allowed. This
+	 * restriction depends on STRICT_DEVMEM option. If this option is
+	 * disabled or not available we mark the region only as busy.
+	 */
+	retval = request_mem_region_exclusive(phys_addr, size, "pmfs");
+	if (!retval)
+		goto fail;
+
+	if (protect) {
+		if (hugeioremap)
+			retval = ioremap_hpage_cache_ro(phys_addr, size);
+		else
+			retval = ioremap_cache_ro(phys_addr, size);
+	} else {
+		if (hugeioremap)
+			retval = ioremap_hpage_cache(phys_addr, size);
+		else
+			retval = ioremap_cache(phys_addr, size);
+	}
+
+fail:
+	return retval;
+}
+
+static inline int pmfs_iounmap(void *virt_addr, ssize_t size, int protected)
+{
+	iounmap(virt_addr);
+	return 0;
+}
+
+static loff_t pmfs_max_size(int bits)
+{
+	loff_t res;
+
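+	/* up to 3 tree levels, each indexing 9 bits of the block number, on
+	 * top of the block size shift (bits) */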
+	res = (1ULL << (3 * 9 + bits)) - 1;
+
+	if (res > MAX_LFS_FILESIZE)
+		res = MAX_LFS_FILESIZE;
+
+	pmfs_dbg_verbose("max file size %llu bytes\n", res);
+	return res;
+}
+
+enum {
+	Opt_addr, Opt_bpi, Opt_size, Opt_jsize,
+	Opt_num_inodes, Opt_mode, Opt_uid,
+	Opt_gid, Opt_blocksize, Opt_wprotect, Opt_wprotectold,
+	Opt_err_cont, Opt_err_panic, Opt_err_ro,
+	Opt_backing, Opt_backing_opt,
+	Opt_hugemmap, Opt_nohugeioremap, Opt_dbgmask, Opt_err
+};
+
+static const match_table_t tokens = {
+	{ Opt_addr,	     "physaddr=%x"	  },
+	{ Opt_bpi,	     "bpi=%u"		  },
+	{ Opt_size,	     "init=%s"		  },
+	{ Opt_jsize,     "jsize=%s"		  },
+	{ Opt_num_inodes,"num_inodes=%u"  },
+	{ Opt_mode,	     "mode=%o"		  },
+	{ Opt_uid,	     "uid=%u"		  },
+	{ Opt_gid,	     "gid=%u"		  },
+	{ Opt_wprotect,	     "wprotect"		  },
+	{ Opt_wprotectold,   "wprotectold"	  },
+	{ Opt_err_cont,	     "errors=continue"	  },
+	{ Opt_err_panic,     "errors=panic"	  },
+	{ Opt_err_ro,	     "errors=remount-ro"  },
+	{ Opt_backing,	     "backing=%s"	  },
+	{ Opt_backing_opt,   "backing_opt=%u"	  },
+	{ Opt_hugemmap,	     "hugemmap"		  },
+	{ Opt_nohugeioremap, "nohugeioremap"	  },
+	{ Opt_dbgmask,	     "dbgmask=%u"	  },
+	{ Opt_err,	     NULL		  },
+};
+
+static phys_addr_t get_phys_addr(void **data)
+{
+	phys_addr_t phys_addr;
+	char *options = (char *)*data;
+
+	if (!options || strncmp(options, "physaddr=", 9) != 0)
+		return (phys_addr_t)ULLONG_MAX;
+	options += 9;
+	phys_addr = (phys_addr_t)simple_strtoull(options, &options, 0);
+	if (*options && *options != ',') {
+		printk(KERN_ERR "Invalid phys addr specification: %s\n",
+		       (char *)*data);
+		return (phys_addr_t)ULLONG_MAX;
+	}
+	if (phys_addr & (PAGE_SIZE - 1)) {
+		printk(KERN_ERR "physical address 0x%16llx for pmfs isn't "
+		       "aligned to a page boundary\n", (u64)phys_addr);
+		return (phys_addr_t)ULLONG_MAX;
+	}
+	if (*options == ',')
+		options++;
+	*data = (void *)options;
+	return phys_addr;
+}
+
+static int pmfs_parse_options(char *options, struct pmfs_sb_info *sbi,
+			       bool remount)
+{
+	char *p, *rest;
+	substring_t args[MAX_OPT_ARGS];
+	int option;
+
+	if (!options)
+		return 0;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token;
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_addr:
+			if (remount)
+				goto bad_opt;
+			/* physaddr managed in get_phys_addr() */
+			break;
+		case Opt_bpi:
+			if (remount)
+				goto bad_opt;
+			if (match_int(&args[0], &option))
+				goto bad_val;
+			sbi->bpi = option;
+			break;
+		case Opt_uid:
+			if (remount)
+				goto bad_opt;
+			if (match_int(&args[0], &option))
+				goto bad_val;
+			sbi->uid = option;
+			break;
+		case Opt_gid:
+			if (match_int(&args[0], &option))
+				goto bad_val;
+			sbi->gid = option;
+			break;
+		case Opt_mode:
+			if (match_octal(&args[0], &option))
+				goto bad_val;
+			sbi->mode = option & 01777U;
+			break;
+		case Opt_size:
+			if (remount)
+				goto bad_opt;
+			/* memparse() will accept a K/M/G without a digit */
+			if (!isdigit(*args[0].from))
+				goto bad_val;
+			sbi->initsize = memparse(args[0].from, &rest);
+			set_opt(sbi->s_mount_opt, FORMAT);
+			break;
+		case Opt_jsize:
+			if (remount)
+				goto bad_opt;
+			/* memparse() will accept a K/M/G without a digit */
+			if (!isdigit(*args[0].from))
+				goto bad_val;
+			sbi->jsize = memparse(args[0].from, &rest);
+			/* make sure journal size is integer power of 2 */
+			if (sbi->jsize & (sbi->jsize - 1) ||
+				sbi->jsize < PMFS_MINIMUM_JOURNAL_SIZE) {
+				pmfs_dbg("Invalid jsize: "
+					"must be whole power of 2 & >= 64KB\n");
+				goto bad_val;
+			}
+			break;
+		case Opt_num_inodes:
+			if (remount)
+				goto bad_opt;
+			if (match_int(&args[0], &option))
+				goto bad_val;
+			sbi->num_inodes = option;
+			break;
+		case Opt_err_panic:
+			clear_opt(sbi->s_mount_opt, ERRORS_CONT);
+			clear_opt(sbi->s_mount_opt, ERRORS_RO);
+			set_opt(sbi->s_mount_opt, ERRORS_PANIC);
+			break;
+		case Opt_err_ro:
+			clear_opt(sbi->s_mount_opt, ERRORS_CONT);
+			clear_opt(sbi->s_mount_opt, ERRORS_PANIC);
+			set_opt(sbi->s_mount_opt, ERRORS_RO);
+			break;
+		case Opt_err_cont:
+			clear_opt(sbi->s_mount_opt, ERRORS_RO);
+			clear_opt(sbi->s_mount_opt, ERRORS_PANIC);
+			set_opt(sbi->s_mount_opt, ERRORS_CONT);
+			break;
+		case Opt_wprotect:
+			if (remount)
+				goto bad_opt;
+			set_opt(sbi->s_mount_opt, PROTECT);
+			pmfs_info
+				("PMFS: Enabling new Write Protection (CR0.WP)\n");
+			break;
+		case Opt_wprotectold:
+			if (remount)
+				goto bad_opt;
+			set_opt(sbi->s_mount_opt, PROTECT_OLD);
+			pmfs_info
+				("PMFS: Enabling old Write Protection (PAGE RW Bit)\n");
+			break;
+		case Opt_hugemmap:
+			if (remount)
+				goto bad_opt;
+			set_opt(sbi->s_mount_opt, HUGEMMAP);
+			pmfs_info("PMFS: Enabling huge mappings for mmap\n");
+			break;
+		case Opt_nohugeioremap:
+			if (remount)
+				goto bad_opt;
+			clear_opt(sbi->s_mount_opt, HUGEIOREMAP);
+			pmfs_info("PMFS: Disabling huge ioremap\n");
+			break;
+		case Opt_dbgmask:
+			if (match_int(&args[0], &option))
+				goto bad_val;
+			pmfs_dbgmask = option;
+			break;
+		case Opt_backing:
+			strncpy(sbi->pmfs_backing_file, args[0].from, 255);
+			break;
+		case Opt_backing_opt:
+			if (match_int(&args[0], &option))
+				goto bad_val;
+			sbi->pmfs_backing_option = option;
+			break;
+		default: {
+			goto bad_opt;
+		}
+		}
+	}
+
+	return 0;
+
+bad_val:
+	printk(KERN_INFO "Bad value '%s' for mount option '%s'\n", args[0].from,
+	       p);
+	return -EINVAL;
+bad_opt:
+	printk(KERN_INFO "Bad mount option: \"%s\"\n", p);
+	return -EINVAL;
+}
+
+static bool pmfs_check_size(struct super_block *sb, unsigned long size)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	unsigned long minimum_size, num_blocks;
+
+	/* space required for super block and root directory */
+	minimum_size = 2 << sb->s_blocksize_bits;
+
+	/* space required for inode table */
+	if (sbi->num_inodes > 0)
+		num_blocks = (sbi->num_inodes >>
+			(sb->s_blocksize_bits - PMFS_INODE_BITS)) + 1;
+	else
+		num_blocks = 1;
+	minimum_size += (num_blocks << sb->s_blocksize_bits);
+	/* space required for journal */
+	minimum_size += sbi->jsize;
+
+	if (size < minimum_size)
+	    return false;
+
+	return true;
+}
+
+
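+/* Format a new PMFS instance: ioremap the region, lay out the super block,
+ * journal and inode table, initialize the block allocator, and create the
+ * root directory with its "." and ".." entries. */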
+static struct pmfs_inode *pmfs_init(struct super_block *sb,
+				      unsigned long size)
+{
+	unsigned long blocksize;
+	u64 journal_meta_start, journal_data_start, inode_table_start;
+	struct pmfs_inode *root_i;
+	struct pmfs_super_block *super;
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	struct pmfs_direntry *de;
+	unsigned long blocknr;
+
+	pmfs_info("creating an empty pmfs of size %lu\n", size);
+	sbi->virt_addr = pmfs_ioremap(sb, sbi->phys_addr, size);
+	sbi->block_start = (unsigned long)0;
+	sbi->block_end = ((unsigned long)(size) >> PAGE_SHIFT);
+	sbi->num_free_blocks = ((unsigned long)(size) >> PAGE_SHIFT);
+
+	if (!sbi->virt_addr) {
+		printk(KERN_ERR "ioremap of the pmfs image failed(1)\n");
+		return ERR_PTR(-EINVAL);
+	}
+#ifdef CONFIG_PMFS_TEST
+	if (!first_pmfs_super)
+		first_pmfs_super = sbi->virt_addr;
+#endif
+
+	pmfs_dbg_verbose("pmfs: Default block size set to 4K\n");
+	blocksize = sbi->blocksize = PMFS_DEF_BLOCK_SIZE_4K;
+
+	pmfs_set_blocksize(sb, blocksize);
+	blocksize = sb->s_blocksize;
+
+	if (sbi->blocksize && sbi->blocksize != blocksize)
+		sbi->blocksize = blocksize;
+
+	if (!pmfs_check_size(sb, size)) {
+		pmfs_dbg("Specified PMFS size too small 0x%lx. Either increase"
+			" PMFS size, or reduce num. of inodes (minimum 32)" 
+			" or journal size (minimum 64KB)\n", size);
+		return ERR_PTR(-EINVAL);
+	}
+
+	journal_meta_start = sizeof(struct pmfs_super_block);
+	journal_meta_start = (journal_meta_start + CACHELINE_SIZE - 1) &
+		~(CACHELINE_SIZE - 1);
+	inode_table_start = journal_meta_start + sizeof(pmfs_journal_t);
+	inode_table_start = (inode_table_start + CACHELINE_SIZE - 1) &
+		~(CACHELINE_SIZE - 1);
+
+	if ((inode_table_start + sizeof(struct pmfs_inode)) > PMFS_SB_SIZE) {
+		pmfs_dbg("PMFS super block defined too small. defined 0x%x, "
+				"required 0x%llx\n", PMFS_SB_SIZE,
+			inode_table_start + sizeof(struct pmfs_inode));
+		return ERR_PTR(-EINVAL);
+	}
+
+	journal_data_start = PMFS_SB_SIZE * 2;
+	journal_data_start = (journal_data_start + blocksize - 1) &
+		~(blocksize - 1);
+
+	pmfs_dbg_verbose("journal meta start %llx data start 0x%llx, "
+		"journal size 0x%x, inode_table 0x%llx\n", journal_meta_start,
+		journal_data_start, sbi->jsize, inode_table_start);
+	pmfs_dbg_verbose("max file name len %d\n", (unsigned int)PMFS_NAME_LEN);
+
+	super = pmfs_get_super(sb);
+	pmfs_memunlock_range(sb, super, journal_data_start);
+
+	/* clear out super-block and inode table */
+	memset_nt(super, 0, journal_data_start);
+	super->s_size = cpu_to_le64(size);
+	super->s_blocksize = cpu_to_le32(blocksize);
+	super->s_magic = cpu_to_le16(PMFS_SUPER_MAGIC);
+	super->s_journal_offset = cpu_to_le64(journal_meta_start);
+	super->s_inode_table_offset = cpu_to_le64(inode_table_start);
+
+	pmfs_init_blockmap(sb, journal_data_start + sbi->jsize);
+	pmfs_memlock_range(sb, super, journal_data_start);
+
+	if (pmfs_journal_hard_init(sb, journal_data_start, sbi->jsize) < 0) {
+		printk(KERN_ERR "Journal hard initialization failed\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (pmfs_init_inode_table(sb) < 0)
+		return ERR_PTR(-EINVAL);
+
+	pmfs_memunlock_range(sb, super, PMFS_SB_SIZE*2);
+	pmfs_sync_super(super);
+	pmfs_memlock_range(sb, super, PMFS_SB_SIZE*2);
+
+	pmfs_flush_buffer(super, PMFS_SB_SIZE, false);
+	pmfs_flush_buffer((char *)super + PMFS_SB_SIZE, sizeof(*super), false);
+
+	pmfs_new_block(sb, &blocknr, PMFS_BLOCK_TYPE_4K, 1);
+
+	root_i = pmfs_get_inode(sb, PMFS_ROOT_INO);
+
+	pmfs_memunlock_inode(sb, root_i);
+	root_i->i_mode = cpu_to_le16(sbi->mode | S_IFDIR);
+	root_i->i_uid = cpu_to_le32(sbi->uid);
+	root_i->i_gid = cpu_to_le32(sbi->gid);
+	root_i->i_links_count = cpu_to_le16(2);
+	root_i->i_blk_type = PMFS_BLOCK_TYPE_4K;
+	root_i->i_flags = 0;
+	root_i->i_blocks = cpu_to_le32(1);
+	root_i->i_size = cpu_to_le32(sb->s_blocksize);
+	root_i->root = cpu_to_le64(pmfs_get_block_off(sb, blocknr,
+						       PMFS_BLOCK_TYPE_4K));
+	root_i->height = 0;
+	/* pmfs_sync_inode(root_i); */
+	pmfs_memlock_inode(sb, root_i);
+	pmfs_flush_buffer(root_i, sizeof(*root_i), false);
+	de = (struct pmfs_direntry *)
+		pmfs_get_block(sb, pmfs_get_block_off(sb, blocknr, PMFS_BLOCK_TYPE_4K));
+
+	pmfs_memunlock_range(sb, de, sb->s_blocksize);
+	de->ino = cpu_to_le64(PMFS_ROOT_INO);
+	de->name_len = 1;
+	de->de_len = cpu_to_le16(PMFS_DIR_REC_LEN(de->name_len));
+	strcpy(de->name, ".");
+	de = (struct pmfs_direntry *)((char *)de + le16_to_cpu(de->de_len));
+	de->ino = cpu_to_le64(PMFS_ROOT_INO);
+	de->de_len = cpu_to_le16(sb->s_blocksize - PMFS_DIR_REC_LEN(1));
+	de->name_len = 2;
+	strcpy(de->name, "..");
+	pmfs_memlock_range(sb, de, sb->s_blocksize);
+	pmfs_flush_buffer(de, PMFS_DIR_REC_LEN(2), false);
+	PERSISTENT_MARK();
+	PERSISTENT_BARRIER();
+	return root_i;
+}
+
+static inline void set_default_opts(struct pmfs_sb_info *sbi)
+{
+	/* set_opt(sbi->s_mount_opt, PROTECT); */
+	set_opt(sbi->s_mount_opt, HUGEIOREMAP);
+	set_opt(sbi->s_mount_opt, ERRORS_CONT);
+	sbi->pmfs_backing_file[0] = '\0';
+	sbi->pmfs_backing_option = 0;
+	sbi->jsize = PMFS_DEFAULT_JOURNAL_SIZE;
+}
+
+static void pmfs_root_check(struct super_block *sb, struct pmfs_inode *root_pi)
+{
+	pmfs_memunlock_inode(sb, root_pi);
+/*
+ *      if (root_pi->i_d.d_next) {
+ *              pmfs_warn("root->next not NULL, trying to fix\n");
+ *              goto fail1;
+ *      }
+ */
+	if (!S_ISDIR(le16_to_cpu(root_pi->i_mode)))
+		pmfs_warn("root is not a directory, trying to fix\n");
+#if 0
+	if (pmfs_calc_checksum((u8 *)root_pi, PMFS_INODE_SIZE)) {
+		pmfs_dbg("checksum error in root inode, trying to fix\n");
+		goto fail3;
+	}
+#endif
+	root_pi->i_mode = cpu_to_le16(S_IRWXUGO | S_ISVTX | S_IFDIR);
+	pmfs_memlock_inode(sb, root_pi);
+	pmfs_flush_buffer(&root_pi->i_mode, sizeof(root_pi->i_mode), false);
+}
+
+int pmfs_check_integrity(struct super_block *sb,
+			  struct pmfs_super_block *super)
+{
+	struct pmfs_super_block *super_redund;
+
+	super_redund =
+		(struct pmfs_super_block *)((char *)super + PMFS_SB_SIZE);
+
+	/* Do sanity checks on the superblock */
+	if (le16_to_cpu(super->s_magic) != PMFS_SUPER_MAGIC) {
+		if (le16_to_cpu(super_redund->s_magic) != PMFS_SUPER_MAGIC) {
+			printk(KERN_ERR "Can't find a valid pmfs partition\n");
+			goto out;
+		} else {
+			pmfs_warn
+				("Error in super block: trying to repair it "
+				"with the redundant copy\n");
+			/* Try to auto-recover the super block */
+			if (sb)
+				pmfs_memunlock_super(sb, super);
+			memcpy(super, super_redund,
+				sizeof(struct pmfs_super_block));
+			if (sb)
+				pmfs_memlock_super(sb, super);
+			pmfs_flush_buffer(super, sizeof(*super), false);
+			pmfs_flush_buffer((char *)super + PMFS_SB_SIZE,
+				sizeof(*super), false);
+
+		}
+	}
+
+	/* Read the superblock */
+	if (pmfs_calc_checksum((u8 *)super, PMFS_SB_STATIC_SIZE(super))) {
+		if (pmfs_calc_checksum((u8 *)super_redund,
+					PMFS_SB_STATIC_SIZE(super_redund))) {
+			printk(KERN_ERR "checksum error in super block\n");
+			goto out;
+		} else {
+			pmfs_warn
+				("Error in super block: trying to repair it "
+				"with the redundant copy\n");
+			/* Try to auto-recover the super block */
+			if (sb)
+				pmfs_memunlock_super(sb, super);
+			memcpy(super, super_redund,
+				sizeof(struct pmfs_super_block));
+			if (sb)
+				pmfs_memlock_super(sb, super);
+			pmfs_flush_buffer(super, sizeof(*super), false);
+			pmfs_flush_buffer((char *)super + PMFS_SB_SIZE,
+				sizeof(*super), false);
+		}
+	}
+
+	return 1;
+out:
+	return 0;
+}
+
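+/* Walk the persistent truncate list at mount time and finish any truncate or
+ * inode-free operation that was interrupted by a crash, then reset the list
+ * head. */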
+static void pmfs_recover_truncate_list(struct super_block *sb)
+{
+	struct pmfs_inode_truncate_item *head = pmfs_get_truncate_list_head(sb);
+	u64 ino_next = le64_to_cpu(head->i_next_truncate);
+	struct pmfs_inode *pi;
+	struct pmfs_inode_truncate_item *li;
+	struct inode *inode;
+
+	if (ino_next == 0)
+		return;
+
+	while (ino_next != 0) {
+		pi = pmfs_get_inode(sb, ino_next);
+		li = (struct pmfs_inode_truncate_item *)(pi + 1);
+		inode = pmfs_iget(sb, ino_next);
+		if (IS_ERR(inode))
+			break;
+		pmfs_dbg("Recover ino %llx nlink %d sz %llx:%llx\n", ino_next,
+			inode->i_nlink, pi->i_size, li->i_truncatesize);
+		if (inode->i_nlink) {
+			/* set allocation hint */
+			pmfs_set_blocksize_hint(sb, pi, 
+					le64_to_cpu(li->i_truncatesize));
+			pmfs_setsize(inode, le64_to_cpu(li->i_truncatesize));
+			pmfs_update_isize(inode, pi);
+		} else {
+			/* free the inode */
+			pmfs_dbg("deleting unreferenced inode %lx\n",
+				inode->i_ino);
+		}
+		iput(inode);
+		pmfs_flush_buffer(pi, CACHELINE_SIZE, false);
+		ino_next = le64_to_cpu(li->i_next_truncate);
+	}
+	PERSISTENT_MARK();
+	PERSISTENT_BARRIER();
+	/* reset the truncate_list */
+	pmfs_memunlock_range(sb, head, sizeof(*head));
+	head->i_next_truncate = 0;
+	pmfs_memlock_range(sb, head, sizeof(*head));
+	pmfs_flush_buffer(head, sizeof(*head), false);
+	PERSISTENT_MARK();
+	PERSISTENT_BARRIER();
+}
+
+static int pmfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct pmfs_super_block *super;
+	struct pmfs_inode *root_pi;
+	struct pmfs_sb_info *sbi = NULL;
+	struct inode *root_i = NULL;
+	unsigned long blocksize, initsize = 0;
+	u32 random = 0;
+	int retval = -EINVAL;
+
+	BUILD_BUG_ON(sizeof(struct pmfs_super_block) > PMFS_SB_SIZE);
+	BUILD_BUG_ON(sizeof(struct pmfs_inode) > PMFS_INODE_SIZE);
+
+	sbi = kzalloc(sizeof(struct pmfs_sb_info), GFP_KERNEL);
+	if (!sbi)
+		return -ENOMEM;
+	sb->s_fs_info = sbi;
+
+	set_default_opts(sbi);
+
+	sbi->phys_addr = get_phys_addr(&data);
+	if (sbi->phys_addr == (phys_addr_t)ULLONG_MAX)
+		goto out;
+
+	get_random_bytes(&random, sizeof(u32));
+	atomic_set(&sbi->next_generation, random);
+
+	/* Init with default values */
+	INIT_LIST_HEAD(&sbi->block_inuse_head);
+	sbi->mode = (S_IRWXUGO | S_ISVTX);
+	sbi->uid = current_fsuid();
+	sbi->gid = current_fsgid();
+	set_opt(sbi->s_mount_opt, XIP);
+	clear_opt(sbi->s_mount_opt, PROTECT);
+	set_opt(sbi->s_mount_opt, HUGEIOREMAP);
+
+	INIT_LIST_HEAD(&sbi->s_truncate);
+	mutex_init(&sbi->s_truncate_lock);
+	mutex_init(&sbi->inode_table_mutex);
+	mutex_init(&sbi->s_lock);
+
+	if (pmfs_parse_options(data, sbi, 0))
+		goto out;
+
+	set_opt(sbi->s_mount_opt, MOUNTING);
+	initsize = sbi->initsize;
+
+	/* Init a new pmfs instance */
+	if (initsize) {
+		root_pi = pmfs_init(sb, initsize);
+
+		if (IS_ERR(root_pi))
+			goto out;
+
+		super = pmfs_get_super(sb);
+
+		goto setup_sb;
+	} else {
+		pmfs_load_from_file(sb);
+	}
+	pmfs_dbg_verbose("checking physical address 0x%016llx for pmfs image\n",
+		  (u64)sbi->phys_addr);
+
+	/* Map only one page for now. Will remap it when fs size is known. */
+	initsize = PAGE_SIZE;
+	sbi->virt_addr = pmfs_ioremap(sb, sbi->phys_addr, initsize);
+	if (!sbi->virt_addr) {
+		printk(KERN_ERR "ioremap of the pmfs image failed(2)\n");
+		goto out;
+	}
+
+	super = pmfs_get_super(sb);
+
+	initsize = le64_to_cpu(super->s_size);
+	sbi->initsize = initsize;
+	pmfs_dbg_verbose("pmfs image appears to be %lu KB in size\n",
+		   initsize >> 10);
+
+	pmfs_iounmap(sbi->virt_addr, PAGE_SIZE, pmfs_is_wprotected(sb));
+
+	/* Remap the whole filesystem now */
+	release_mem_region(sbi->phys_addr, PAGE_SIZE);
+	/* FIXME: Remap the whole filesystem in pmfs virtual address range. */
+	sbi->virt_addr = pmfs_ioremap(sb, sbi->phys_addr, initsize);
+	if (!sbi->virt_addr) {
+		printk(KERN_ERR "ioremap of the pmfs image failed(3)\n");
+		goto out;
+	}
+
+	super = pmfs_get_super(sb);
+
+	if (pmfs_journal_soft_init(sb)) {
+		retval = -EINVAL;
+		printk(KERN_ERR "Journal initialization failed\n");
+		goto out;
+	}
+	if (pmfs_recover_journal(sb)) {
+		retval = -EINVAL;
+		printk(KERN_ERR "Journal recovery failed\n");
+		goto out;
+	}
+
+	if (pmfs_check_integrity(sb, super) == 0) {
+		pmfs_dbg("Memory contains invalid pmfs %x:%x\n",
+				le16_to_cpu(super->s_magic), PMFS_SUPER_MAGIC);
+		goto out;
+	}
+
+	blocksize = le32_to_cpu(super->s_blocksize);
+	pmfs_set_blocksize(sb, blocksize);
+
+	pmfs_dbg_verbose("blocksize %lu\n", blocksize);
+
+	/* Read the root inode */
+	root_pi = pmfs_get_inode(sb, PMFS_ROOT_INO);
+
+	/* Check that the root inode is in a sane state */
+	pmfs_root_check(sb, root_pi);
+
+#ifdef CONFIG_PMFS_TEST
+	if (!first_pmfs_super)
+		first_pmfs_super = sbi->virt_addr;
+#endif
+
+	/* Set it all up.. */
+setup_sb:
+	sb->s_magic = le16_to_cpu(super->s_magic);
+	sb->s_op = &pmfs_sops;
+	sb->s_maxbytes = pmfs_max_size(sb->s_blocksize_bits);
+	sb->s_time_gran = 1;
+	sb->s_export_op = &pmfs_export_ops;
+	sb->s_xattr = NULL;
+	sb->s_flags |= MS_NOSEC;
+	root_i = pmfs_iget(sb, PMFS_ROOT_INO);
+	if (IS_ERR(root_i)) {
+		retval = PTR_ERR(root_i);
+		goto out;
+	}
+
+	sb->s_root = d_make_root(root_i);
+	if (!sb->s_root) {
+		printk(KERN_ERR "get pmfs root inode failed\n");
+		retval = -ENOMEM;
+		goto out;
+	}
+
+	pmfs_recover_truncate_list(sb);
+	/* If the FS was not formatted on this mount, scan the meta-data after
+	 * truncate list has been processed */
+	if ((sbi->s_mount_opt & PMFS_MOUNT_FORMAT) == 0)
+		pmfs_setup_blocknode_map(sb);
+
+	if (!(sb->s_flags & MS_RDONLY)) {
+		u64 mnt_write_time;
+		/* update mount time and write time atomically. */
+		mnt_write_time = (get_seconds() & 0xFFFFFFFF);
+		mnt_write_time = mnt_write_time | (mnt_write_time << 32);
+
+		pmfs_memunlock_range(sb, &super->s_mtime, 8);
+		pmfs_memcpy_atomic(&super->s_mtime, &mnt_write_time, 8);
+		pmfs_memlock_range(sb, &super->s_mtime, 8);
+
+		pmfs_flush_buffer(&super->s_mtime, 8, false);
+		PERSISTENT_MARK();
+		PERSISTENT_BARRIER();
+	}
+
+	clear_opt(sbi->s_mount_opt, MOUNTING);
+	retval = 0;
+	return retval;
+out:
+	if (sbi->virt_addr) {
+		pmfs_iounmap(sbi->virt_addr, initsize, pmfs_is_wprotected(sb));
+		release_mem_region(sbi->phys_addr, initsize);
+	}
+
+	kfree(sbi);
+	return retval;
+}
+
+int pmfs_statfs(struct dentry *d, struct kstatfs *buf)
+{
+	struct super_block *sb = d->d_sb;
+	unsigned long count = 0;
+	struct pmfs_sb_info *sbi = (struct pmfs_sb_info *)sb->s_fs_info;
+
+	buf->f_type = PMFS_SUPER_MAGIC;
+	buf->f_bsize = sb->s_blocksize;
+
+	count = sbi->block_end;
+	buf->f_blocks = sbi->block_end;
+	buf->f_bfree = buf->f_bavail = pmfs_count_free_blocks(sb);
+	buf->f_files = (sbi->s_inodes_count);
+	buf->f_ffree = (sbi->s_free_inodes_count);
+	buf->f_namelen = PMFS_NAME_LEN;
+	pmfs_dbg("pmfs_stats: total 4k free blocks 0x%llx\n", buf->f_bfree);
+	pmfs_dbg("total inodes 0x%x, free inodes 0x%x, blocknodes 0x%lx\n",
+		(sbi->s_inodes_count),
+		(sbi->s_free_inodes_count), (sbi->num_blocknode_allocated));
+	return 0;
+}
+
+static int pmfs_show_options(struct seq_file *seq, struct dentry *root)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(root->d_sb);
+
+	seq_printf(seq, ",physaddr=0x%016llx", (u64)sbi->phys_addr);
+	if (sbi->initsize)
+		seq_printf(seq, ",init=%luk", sbi->initsize >> 10);
+	if (sbi->blocksize)
+		seq_printf(seq, ",bs=%lu", sbi->blocksize);
+	if (sbi->bpi)
+		seq_printf(seq, ",bpi=%lu", sbi->bpi);
+	if (sbi->num_inodes)
+		seq_printf(seq, ",N=%lu", sbi->num_inodes);
+	if (sbi->mode != (S_IRWXUGO | S_ISVTX))
+		seq_printf(seq, ",mode=%03o", sbi->mode);
+	if (sbi->uid != 0)
+		seq_printf(seq, ",uid=%u", sbi->uid);
+	if (sbi->gid != 0)
+		seq_printf(seq, ",gid=%u", sbi->gid);
+	if (test_opt(root->d_sb, ERRORS_RO))
+		seq_puts(seq, ",errors=remount-ro");
+	if (test_opt(root->d_sb, ERRORS_PANIC))
+		seq_puts(seq, ",errors=panic");
+	/* memory protection disabled by default */
+	if (test_opt(root->d_sb, PROTECT))
+		seq_puts(seq, ",wprotect");
+	if (test_opt(root->d_sb, HUGEMMAP))
+		seq_puts(seq, ",hugemmap");
+	if (test_opt(root->d_sb, HUGEIOREMAP))
+		seq_puts(seq, ",hugeioremap");
+	/* xip not enabled by default */
+	if (test_opt(root->d_sb, XIP))
+		seq_puts(seq, ",xip");
+
+	return 0;
+}
+
+int pmfs_remount(struct super_block *sb, int *mntflags, char *data)
+{
+	unsigned long old_sb_flags;
+	unsigned long old_mount_opt;
+	struct pmfs_super_block *ps;
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	int ret = -EINVAL;
+
+	/* Store the old options */
+	mutex_lock(&sbi->s_lock);
+	old_sb_flags = sb->s_flags;
+	old_mount_opt = sbi->s_mount_opt;
+
+	if (pmfs_parse_options(data, sbi, 1))
+		goto restore_opt;
+
+	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+		      ((sbi->s_mount_opt & PMFS_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
+
+	if ((*mntflags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
+		u64 mnt_write_time;
+		ps = pmfs_get_super(sb);
+		/* update mount time and write time atomically. */
+		mnt_write_time = (get_seconds() & 0xFFFFFFFF);
+		mnt_write_time = mnt_write_time | (mnt_write_time << 32);
+
+		pmfs_memunlock_range(sb, &ps->s_mtime, 8);
+		pmfs_memcpy_atomic(&ps->s_mtime, &mnt_write_time, 8);
+		pmfs_memlock_range(sb, &ps->s_mtime, 8);
+
+		pmfs_flush_buffer(&ps->s_mtime, 8, false);
+		PERSISTENT_MARK();
+		PERSISTENT_BARRIER();
+	}
+
+	mutex_unlock(&sbi->s_lock);
+	ret = 0;
+	return ret;
+
+restore_opt:
+	sb->s_flags = old_sb_flags;
+	sbi->s_mount_opt = old_mount_opt;
+	mutex_unlock(&sbi->s_lock);
+	return ret;
+}
+
+void pmfs_put_super(struct super_block *sb)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	struct pmfs_super_block *ps = pmfs_get_super(sb);
+	u64 size = le64_to_cpu(ps->s_size);
+	struct pmfs_blocknode *i;
+	struct list_head *head = &(sbi->block_inuse_head);
+
+#ifdef CONFIG_PMFS_TEST
+	if (first_pmfs_super == sbi->virt_addr)
+		first_pmfs_super = NULL;
+#endif
+
+	/* It's unmount time, so unmap the pmfs memory */
+	if (sbi->virt_addr) {
+		pmfs_save_blocknode_mappings(sb);
+		pmfs_journal_uninit(sb);
+		pmfs_store_to_file(sb);
+		pmfs_iounmap(sbi->virt_addr, size, pmfs_is_wprotected(sb));
+		sbi->virt_addr = NULL;
+		release_mem_region(sbi->phys_addr, size);
+	}
+
+	/* Free all the pmfs_blocknodes */
+	while (!list_empty(head)) {
+		i = list_first_entry(head, struct pmfs_blocknode, link);
+		list_del(&i->link);
+		pmfs_free_blocknode(sb, i);
+	}
+	sb->s_fs_info = NULL;
+	pmfs_dbgmask = 0;
+	kfree(sbi);
+}
+
+inline void pmfs_free_transaction(pmfs_transaction_t *trans)
+{
+	kmem_cache_free(pmfs_transaction_cachep, trans);
+}
+
+void __pmfs_free_blocknode(struct pmfs_blocknode *bnode)
+{
+	kmem_cache_free(pmfs_blocknode_cachep, bnode);
+}
+
+void pmfs_free_blocknode(struct super_block *sb, struct pmfs_blocknode *bnode)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	sbi->num_blocknode_allocated--;
+	__pmfs_free_blocknode(bnode);
+}
+
+inline pmfs_transaction_t *pmfs_alloc_transaction(void)
+{
+	return (pmfs_transaction_t *)
+		kmem_cache_alloc(pmfs_transaction_cachep, GFP_NOFS);
+}
+
+struct pmfs_blocknode *pmfs_alloc_blocknode(struct super_block *sb)
+{
+	struct pmfs_blocknode *p;
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	p = (struct pmfs_blocknode *)
+		kmem_cache_alloc(pmfs_blocknode_cachep, GFP_NOFS);
+	if (p) {
+		sbi->num_blocknode_allocated++;
+	}
+	return p;
+}
+
+static struct inode *pmfs_alloc_inode(struct super_block *sb)
+{
+	struct pmfs_inode_vfs *vi = (struct pmfs_inode_vfs *)
+				     kmem_cache_alloc(pmfs_inode_cachep, GFP_NOFS);
+
+	if (!vi)
+		return NULL;
+	vi->vfs_inode.i_version = 1;
+	return &vi->vfs_inode;
+}
+
+static void pmfs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+
+	kmem_cache_free(pmfs_inode_cachep, PMFS_I(inode));
+}
+
+static void pmfs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, pmfs_i_callback);
+}
+
+static void init_once(void *foo)
+{
+	struct pmfs_inode_vfs *vi = (struct pmfs_inode_vfs *)foo;
+
+	vi->i_dir_start_lookup = 0;
+	INIT_LIST_HEAD(&vi->i_truncated);
+	inode_init_once(&vi->vfs_inode);
+}
+
+
+static int __init init_blocknode_cache(void)
+{
+	pmfs_blocknode_cachep = kmem_cache_create("pmfs_blocknode_cache",
+					sizeof(struct pmfs_blocknode),
+					0, (SLAB_RECLAIM_ACCOUNT |
+                                        SLAB_MEM_SPREAD), NULL);
+	if (pmfs_blocknode_cachep == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+
+static int __init init_inodecache(void)
+{
+	pmfs_inode_cachep = kmem_cache_create("pmfs_inode_cache",
+					       sizeof(struct pmfs_inode_vfs),
+					       0, (SLAB_RECLAIM_ACCOUNT |
+						   SLAB_MEM_SPREAD), init_once);
+	if (pmfs_inode_cachep == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+static int __init init_transaction_cache(void)
+{
+	pmfs_transaction_cachep = kmem_cache_create("pmfs_journal_transaction",
+			sizeof(pmfs_transaction_t), 0, (SLAB_RECLAIM_ACCOUNT |
+			SLAB_MEM_SPREAD), NULL);
+	if (pmfs_transaction_cachep == NULL) {
+		pmfs_dbg("PMFS: failed to init transaction cache\n");
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+static void destroy_transaction_cache(void)
+{
+	if (pmfs_transaction_cachep)
+		kmem_cache_destroy(pmfs_transaction_cachep);
+	pmfs_transaction_cachep = NULL;
+}
+
+static void destroy_inodecache(void)
+{
+	/* flush the delayed rcu-freed inodes before destroying the cache */
+	rcu_barrier();
+	kmem_cache_destroy(pmfs_inode_cachep);
+}
+
+static void destroy_blocknode_cache(void)
+{
+	kmem_cache_destroy(pmfs_blocknode_cachep);
+}
+
+/*
+ * The super block writes are all done "on the fly", so the
+ * super block is never in a "dirty" state and there is no need
+ * for write_super.
+ */
+static struct super_operations pmfs_sops = {
+	.alloc_inode	= pmfs_alloc_inode,
+	.destroy_inode	= pmfs_destroy_inode,
+	.write_inode	= pmfs_write_inode,
+	.dirty_inode	= pmfs_dirty_inode,
+	.evict_inode	= pmfs_evict_inode,
+	.put_super	= pmfs_put_super,
+	.statfs		= pmfs_statfs,
+	.remount_fs	= pmfs_remount,
+	.show_options	= pmfs_show_options,
+};
+
+static struct dentry *pmfs_mount(struct file_system_type *fs_type,
+				  int flags, const char *dev_name, void *data)
+{
+	return mount_nodev(fs_type, flags, data, pmfs_fill_super);
+}
+
+static struct file_system_type pmfs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "pmfs",
+	.mount		= pmfs_mount,
+	.kill_sb	= kill_anon_super,
+};
+
+static struct inode *pmfs_nfs_get_inode(struct super_block *sb,
+					 u64 ino, u32 generation)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+	struct inode *inode;
+
+	if (ino < PMFS_ROOT_INO)
+		return ERR_PTR(-ESTALE);
+
+	if ((ino >> PMFS_INODE_BITS) > (sbi->s_inodes_count))
+		return ERR_PTR(-ESTALE);
+
+	inode = pmfs_iget(sb, ino);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+
+	if (generation && inode->i_generation != generation) {
+		/* we didn't find the right inode.. */
+		iput(inode);
+		return ERR_PTR(-ESTALE);
+	}
+
+	return inode;
+}
+
+static struct dentry *pmfs_fh_to_dentry(struct super_block *sb,
+					 struct fid *fid, int fh_len,
+					 int fh_type)
+{
+	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
+				    pmfs_nfs_get_inode);
+}
+
+static struct dentry *pmfs_fh_to_parent(struct super_block *sb,
+					 struct fid *fid, int fh_len,
+					 int fh_type)
+{
+	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
+				    pmfs_nfs_get_inode);
+}
+
+static const struct export_operations pmfs_export_ops = {
+	.fh_to_dentry	= pmfs_fh_to_dentry,
+	.fh_to_parent	= pmfs_fh_to_parent,
+	.get_parent	= pmfs_get_parent,
+};
+
+static int __init init_pmfs_fs(void)
+{
+	int rc = 0;
+
+	rc = init_blocknode_cache();
+	if (rc)
+		return rc;
+
+	rc = init_transaction_cache();
+	if (rc)
+		goto out1;
+
+	rc = init_inodecache();
+	if (rc)
+		goto out2;
+
+	rc = bdi_init(&pmfs_backing_dev_info);
+	if (rc)
+		goto out3;
+
+	rc = register_filesystem(&pmfs_fs_type);
+	if (rc)
+		goto out4;
+
+	return 0;
+
+out4:
+	bdi_destroy(&pmfs_backing_dev_info);
+out3:
+	destroy_inodecache();
+out2:
+	destroy_transaction_cache();
+out1:
+	destroy_blocknode_cache();
+	return rc;
+}
+
+static void __exit exit_pmfs_fs(void)
+{
+	unregister_filesystem(&pmfs_fs_type);
+	bdi_destroy(&pmfs_backing_dev_info);
+	destroy_inodecache();
+	destroy_blocknode_cache();
+	destroy_transaction_cache();
+	return;
+}
+
+MODULE_AUTHOR("Intel Corporation <linux-pmfs at intel.com>");
+MODULE_DESCRIPTION("Persistent Memory File System");
+MODULE_LICENSE("GPL");
+
+module_init(init_pmfs_fs)
+module_exit(exit_pmfs_fs)
diff --git a/fs/pmfs/symlink.c b/fs/pmfs/symlink.c
new file mode 100644
index 0000000..f02cbe2
--- /dev/null
+++ b/fs/pmfs/symlink.c
@@ -0,0 +1,71 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Symlink operations
+ *
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.stornelli at gmail.com>
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include <linux/fs.h>
+#include "pmfs.h"
+
+int pmfs_block_symlink(struct inode *inode, const char *symname, int len)
+{
+	struct super_block *sb = inode->i_sb;
+	u64 block;
+	char *blockp;
+	int err;
+
+	err = pmfs_alloc_blocks(NULL, inode, 0, 1, false);
+	if (err)
+		return err;
+
+	block = pmfs_find_data_block(inode, 0);
+	blockp = pmfs_get_block(sb, block);
+
+	pmfs_memunlock_block(sb, blockp);
+	memcpy(blockp, symname, len);
+	blockp[len] = '\0';
+	pmfs_memlock_block(sb, blockp);
+	pmfs_flush_buffer(blockp, len+1, false);
+	return 0;
+}
+
+static int pmfs_readlink(struct dentry *dentry, char *buffer, int buflen)
+{
+	struct inode *inode = dentry->d_inode;
+	struct super_block *sb = inode->i_sb;
+	u64 block;
+	char *blockp;
+
+	block = pmfs_find_data_block(inode, 0);
+	blockp = pmfs_get_block(sb, block);
+	return vfs_readlink(dentry, buffer, buflen, blockp);
+}
+
+static void *pmfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct inode *inode = dentry->d_inode;
+	struct super_block *sb = inode->i_sb;
+	u64 block;
+	int status;
+	char *blockp;
+
+	block = pmfs_find_data_block(inode, 0);
+	blockp = pmfs_get_block(sb, block);
+	status = vfs_follow_link(nd, blockp);
+	return ERR_PTR(status);
+}
+
+const struct inode_operations pmfs_symlink_inode_operations = {
+	.readlink	= pmfs_readlink,
+	.follow_link	= pmfs_follow_link,
+	.setattr	= pmfs_notify_change,
+};
diff --git a/fs/pmfs/wprotect.c b/fs/pmfs/wprotect.c
new file mode 100644
index 0000000..d80b4fe
--- /dev/null
+++ b/fs/pmfs/wprotect.c
@@ -0,0 +1,91 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Write protection for the filesystem pages.
+ *
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.stornelli at gmail.com>
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/io.h>
+#include "pmfs.h"
+
+DEFINE_SPINLOCK(pmfs_writeable_lock);
+
+static inline void wprotect_disable(void)
+{
+	unsigned long cr0_val;
+
+	cr0_val = read_cr0();
+	cr0_val &= (~X86_CR0_WP);
+	write_cr0(cr0_val);
+	return;
+}
+
+static inline void wprotect_enable(void)
+{
+	unsigned long cr0_val;
+
+	cr0_val = read_cr0();
+	cr0_val |= X86_CR0_WP;
+	write_cr0(cr0_val);
+	return;
+}
+
+/* FIXME: Use PAGE RW Bit */
+int pmfs_writeable_old(void *vaddr, unsigned long size, int rw)
+{
+	int ret = 0;
+	unsigned long nrpages = size >> PAGE_SHIFT;
+	unsigned long addr = (unsigned long)vaddr;
+
+	/* Page aligned */
+	addr &= PAGE_MASK;
+
+	if (size & (PAGE_SIZE - 1))
+		nrpages++;
+
+	if (rw)
+		ret = set_memory_rw(addr, nrpages);
+	else
+		ret = set_memory_ro(addr, nrpages);
+
+	BUG_ON(ret);
+	return 0;
+}
+
+/* FIXME: Assumes that we are always called in the right order.
+ * pmfs_writeable(vaddr, size, 1);
+ * pmfs_writeable(vaddr, size, 0);
+ */
+int pmfs_writeable(void *vaddr, unsigned long size, int rw)
+{
+	static unsigned long flags;
+	if (rw) {
+		local_irq_save(flags);
+		wprotect_disable();
+	} else {
+		wprotect_enable();
+		local_irq_restore(flags);
+	}
+	return 0;
+}
+
+int pmfs_xip_mem_protect(struct super_block *sb, void *vaddr,
+			  unsigned long size, int rw)
+{
+	if (!pmfs_is_wprotected(sb))
+		return 0;
+	if (pmfs_is_protected_old(sb))
+		return pmfs_writeable_old(vaddr, size, rw);
+	return pmfs_writeable(vaddr, size, rw);
+}
diff --git a/fs/pmfs/wprotect.h b/fs/pmfs/wprotect.h
new file mode 100644
index 0000000..818638b
--- /dev/null
+++ b/fs/pmfs/wprotect.h
@@ -0,0 +1,166 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Memory protection definitions for the PMFS filesystem.
+ *
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2010-2011 Marco Stornelli <marco.stornelli at gmail.com>
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#ifndef __WPROTECT_H
+#define __WPROTECT_H
+
+#include <linux/crc16.h>
+#include <linux/pmfs_def.h>
+#include <linux/fs.h>
+
+/* pmfs_memunlock_super() before calling! */
+static inline void pmfs_sync_super(struct pmfs_super_block *ps)
+{
+	u16 crc = 0;
+
+	ps->s_wtime = cpu_to_le32(get_seconds());
+	ps->s_sum = 0;
+	crc = crc16(~0, (__u8 *)ps + sizeof(__le16),
+			PMFS_SB_STATIC_SIZE(ps) - sizeof(__le16));
+	ps->s_sum = cpu_to_le16(crc);
+	/* Keep sync redundant super block */
+	memcpy((void *)ps + PMFS_SB_SIZE, (void *)ps,
+		sizeof(struct pmfs_super_block));
+}
+
+#if 0
+/* pmfs_memunlock_inode() before calling! */
+static inline void pmfs_sync_inode(struct pmfs_inode *pi)
+{
+	u16 crc = 0;
+
+	pi->i_sum = 0;
+	crc = crc16(~0, (__u8 *)pi + sizeof(__le16), PMFS_INODE_SIZE -
+		    sizeof(__le16));
+	pi->i_sum = cpu_to_le16(crc);
+}
+#endif
+
+extern int pmfs_writeable(void *vaddr, unsigned long size, int rw);
+extern int pmfs_xip_mem_protect(struct super_block *sb,
+				 void *vaddr, unsigned long size, int rw);
+
+extern spinlock_t pmfs_writeable_lock;
+static inline int pmfs_is_protected(struct super_block *sb)
+{
+	struct pmfs_sb_info *sbi = (struct pmfs_sb_info *)sb->s_fs_info;
+
+	return sbi->s_mount_opt & PMFS_MOUNT_PROTECT;
+}
+
+static inline int pmfs_is_protected_old(struct super_block *sb)
+{
+	struct pmfs_sb_info *sbi = (struct pmfs_sb_info *)sb->s_fs_info;
+
+	return sbi->s_mount_opt & PMFS_MOUNT_PROTECT_OLD;
+}
+
+static inline int pmfs_is_wprotected(struct super_block *sb)
+{
+	return pmfs_is_protected(sb) || pmfs_is_protected_old(sb);
+}
+
+static inline void
+__pmfs_memunlock_range(void *p, unsigned long len, int hold_lock)
+{
+	/*
+	 * NOTE: Ideally we would lock down the whole kernel to be memory
+	 * safe and avoid writes to the protected memory. Obviously that
+	 * is not possible, so we only serialize the operations at the fs
+	 * level. We can't disable interrupts here because that could
+	 * deadlock on this path.
+	 */
+	if (hold_lock)
+		spin_lock(&pmfs_writeable_lock);
+	pmfs_writeable(p, len, 1);
+}
+
+static inline void
+__pmfs_memlock_range(void *p, unsigned long len, int hold_lock)
+{
+	pmfs_writeable(p, len, 0);
+	if (hold_lock)
+		spin_unlock(&pmfs_writeable_lock);
+}
+
+static inline void pmfs_memunlock_range(struct super_block *sb, void *p,
+					 unsigned long len)
+{
+	if (pmfs_is_protected(sb))
+		__pmfs_memunlock_range(p, len, 0);
+	else if (pmfs_is_protected_old(sb))
+		__pmfs_memunlock_range(p, len, 1);
+}
+
+static inline void pmfs_memlock_range(struct super_block *sb, void *p,
+				       unsigned long len)
+{
+	if (pmfs_is_protected(sb))
+		__pmfs_memlock_range(p, len, 0);
+	else if (pmfs_is_protected_old(sb))
+		__pmfs_memlock_range(p, len, 1);
+}
+
+static inline void pmfs_memunlock_super(struct super_block *sb,
+					 struct pmfs_super_block *ps)
+{
+	if (pmfs_is_protected(sb))
+		__pmfs_memunlock_range(ps, PMFS_SB_SIZE, 0);
+	else if (pmfs_is_protected_old(sb))
+		__pmfs_memunlock_range(ps, PMFS_SB_SIZE, 1);
+}
+
+static inline void pmfs_memlock_super(struct super_block *sb,
+				       struct pmfs_super_block *ps)
+{
+	pmfs_sync_super(ps);
+	if (pmfs_is_protected(sb))
+		__pmfs_memlock_range(ps, PMFS_SB_SIZE, 0);
+	else if (pmfs_is_protected_old(sb))
+		__pmfs_memlock_range(ps, PMFS_SB_SIZE, 1);
+}
+
+static inline void pmfs_memunlock_inode(struct super_block *sb,
+					 struct pmfs_inode *pi)
+{
+	if (pmfs_is_protected(sb))
+		__pmfs_memunlock_range(pi, PMFS_SB_SIZE, 0);
+	else if (pmfs_is_protected_old(sb))
+		__pmfs_memunlock_range(pi, PMFS_SB_SIZE, 1);
+}
+
+static inline void pmfs_memlock_inode(struct super_block *sb,
+				       struct pmfs_inode *pi)
+{
+	/* pmfs_sync_inode(pi); */
+	if (pmfs_is_protected(sb))
+		__pmfs_memlock_range(pi, PMFS_SB_SIZE, 0);
+	else if (pmfs_is_protected_old(sb))
+		__pmfs_memlock_range(pi, PMFS_SB_SIZE, 1);
+}
+
+static inline void pmfs_memunlock_block(struct super_block *sb, void *bp)
+{
+	if (pmfs_is_protected(sb))
+		__pmfs_memunlock_range(bp, sb->s_blocksize, 0);
+	else if (pmfs_is_protected_old(sb))
+		__pmfs_memunlock_range(bp, sb->s_blocksize, 1);
+}
+
+static inline void pmfs_memlock_block(struct super_block *sb, void *bp)
+{
+	if (pmfs_is_protected(sb))
+		__pmfs_memlock_range(bp, sb->s_blocksize, 0);
+	else if (pmfs_is_protected_old(sb))
+		__pmfs_memlock_range(bp, sb->s_blocksize, 1);
+}
+
+#endif
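
For reference, the calling convention these helpers expect is the same
unlock/modify/lock/flush sequence already used by pmfs_block_symlink() and
pmfs_remount() above. A minimal sketch (illustration only, assuming the
pmfs.h context; the helper name is made up):

static void pmfs_update_pmem(struct super_block *sb, void *dst,
			     const void *src, size_t len)
{
	pmfs_memunlock_range(sb, dst, len);	/* drop write protection */
	memcpy(dst, src, len);			/* modify persistent memory */
	pmfs_memlock_range(sb, dst, len);	/* restore write protection */
	pmfs_flush_buffer(dst, len, false);	/* write back the cachelines */
	PERSISTENT_MARK();
	PERSISTENT_BARRIER();			/* order before dependent updates */
}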
diff --git a/fs/pmfs/xip.c b/fs/pmfs/xip.c
new file mode 100644
index 0000000..a7cf780
--- /dev/null
+++ b/fs/pmfs/xip.c
@@ -0,0 +1,672 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * XIP operations.
+ *
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.stornelli at gmail.com>
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <asm/cpufeature.h>
+#include <asm/pgtable.h>
+#include "pmfs.h"
+#include "xip.h"
+
+/*
+ * Wrappers. We need to hold the rcu read lock to protect against a
+ * concurrent truncate operation. No problem for write because we hold
+ * i_mutex.
+ */
+ssize_t pmfs_xip_file_read(struct file *filp, char __user *buf,
+			    size_t len, loff_t *ppos)
+{
+	ssize_t res;
+
+	rcu_read_lock();
+	res = xip_file_read(filp, buf, len, ppos);
+	rcu_read_unlock();
+	return res;
+}
+
+static inline void pmfs_flush_edge_cachelines(loff_t pos, ssize_t len,
+	void *start_addr)
+{
+	if (unlikely(pos & 0x7))
+		pmfs_flush_buffer(start_addr, 1, false);
+	if (unlikely(((pos + len) & 0x7) && ((pos & (CACHELINE_SIZE - 1)) !=
+			((pos + len) & (CACHELINE_SIZE - 1)))))
+		pmfs_flush_buffer(start_addr + len, 1, false);
+}
+
+static ssize_t
+__pmfs_xip_file_write(struct address_space *mapping, const char __user *buf,
+          size_t count, loff_t pos, loff_t *ppos)
+{
+	struct inode    *inode = mapping->host;
+	struct super_block *sb = inode->i_sb;
+	long        status = 0;
+	size_t      bytes;
+	ssize_t     written = 0;
+	struct pmfs_inode *pi;
+
+	pi = pmfs_get_inode(sb, inode->i_ino);
+	do {
+		unsigned long index;
+		unsigned long offset;
+		size_t copied;
+		void *xmem;
+		unsigned long xpfn;
+
+		offset = (pos & (sb->s_blocksize - 1)); /* Within page */
+		index = pos >> sb->s_blocksize_bits;
+		bytes = sb->s_blocksize - offset;
+		if (bytes > count)
+			bytes = count;
+
+		status = pmfs_get_xip_mem(mapping, index, 1, &xmem, &xpfn);
+		if (status)
+			break;
+		pmfs_xip_mem_protect(sb, xmem + offset, bytes, 1);
+		copied = bytes -
+		__copy_from_user_inatomic_nocache(xmem + offset, buf, bytes);
+		pmfs_xip_mem_protect(sb, xmem + offset, bytes, 0);
+
+		/* If the start or end destination address is not 8-byte aligned,
+		 * __copy_from_user_inatomic_nocache uses cacheable instructions
+		 * (instead of movnti) to write. So flush those cachelines. */
+		pmfs_flush_edge_cachelines(pos, copied, xmem + offset);
+
+		if (likely(copied > 0)) {
+			status = copied;
+
+			if (status >= 0) {
+				written += status;
+				count -= status;
+				pos += status;
+				buf += status;
+			}
+		}
+		if (unlikely(copied != bytes))
+			if (status >= 0)
+				status = -EFAULT;
+		if (status < 0)
+			break;
+	} while (count);
+	*ppos = pos;
+	/*
+	 * No need to use i_size_read() here, the i_size
+	 * cannot change under us because we hold i_mutex.
+	 */
+	if (pos > inode->i_size) {
+		i_size_write(inode, pos);
+		pmfs_update_isize(inode, pi);
+	}
+
+	return written ? written : status;
+}
+
+/* Optimized path for a file write that doesn't require a transaction. In
+ * this path we don't need to allocate any new data blocks, so the only
+ * metadata modified are the inode's i_size, i_ctime, and i_mtime fields. */
+static ssize_t pmfs_file_write_fast(struct super_block *sb, struct inode *inode,
+	struct pmfs_inode *pi, const char __user *buf, size_t count, loff_t pos,
+	loff_t *ppos, u64 block)
+{
+	void *xmem = pmfs_get_block(sb, block);
+	size_t copied, ret = 0, offset;
+
+	offset = pos & (sb->s_blocksize - 1);
+
+	pmfs_xip_mem_protect(sb, xmem + offset, count, 1);
+	copied = count - __copy_from_user_inatomic_nocache(xmem
+		+ offset, buf, count);
+	pmfs_xip_mem_protect(sb, xmem + offset, count, 0);
+
+	pmfs_flush_edge_cachelines(pos, copied, xmem + offset);
+
+	if (likely(copied > 0)) {
+		pos += copied;
+		ret = copied;
+	}
+	if (unlikely(copied != count && copied == 0))
+		ret = -EFAULT;
+	*ppos = pos;
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
+	if (pos > inode->i_size) {
+		/* make sure written data is persistent before updating
+		 * time and size */
+		PERSISTENT_MARK();
+		i_size_write(inode, pos);
+		PERSISTENT_BARRIER();
+		pmfs_memunlock_inode(sb, pi);
+		pmfs_update_time_and_size(inode, pi);
+		pmfs_memlock_inode(sb, pi);
+	} else {
+		u64 c_m_time;
+		/* update c_time and m_time atomically. We don't need to make the data
+		 * persistent because the expectation is that the close() or an explicit
+		 * fsync will do that. */
+		c_m_time = (inode->i_ctime.tv_sec & 0xFFFFFFFF);
+		c_m_time = c_m_time | (c_m_time << 32);
+		pmfs_memunlock_inode(sb, pi);
+		pmfs_memcpy_atomic(&pi->i_ctime, &c_m_time, 8);
+		pmfs_memlock_inode(sb, pi);
+	}
+	pmfs_flush_buffer(pi, 1, false);
+	return ret;
+}
+
+static inline void pmfs_clear_edge_blk(struct super_block *sb,
+	struct pmfs_inode *pi, bool new_blk, unsigned long block)
+{
+	void *ptr;
+	unsigned long blknr;
+
+	if (new_blk) {
+		blknr = block >> (pmfs_inode_blk_shift(pi) -
+			sb->s_blocksize_bits);
+		ptr = pmfs_get_block(sb, __pmfs_find_data_block(sb, pi, blknr));
+		if (ptr != NULL) {
+			pmfs_memunlock_range(sb, ptr,  pmfs_inode_blk_size(pi));
+			memset_nt(ptr, 0, pmfs_inode_blk_size(pi));
+			pmfs_memlock_range(sb, ptr,  pmfs_inode_blk_size(pi));
+		}
+	}
+}
+
+ssize_t pmfs_xip_file_write(struct file *filp, const char __user *buf,
+          size_t len, loff_t *ppos)
+{
+	struct address_space *mapping = filp->f_mapping;
+	struct inode    *inode = mapping->host;
+	struct super_block *sb = inode->i_sb;
+	pmfs_transaction_t *trans;
+	struct pmfs_inode *pi;
+	ssize_t     written = 0;
+	loff_t pos;
+	u64 block;
+	bool new_sblk = false, new_eblk = false;
+	size_t count, offset;
+	ssize_t ret;
+	unsigned long start_blk, num_blocks, max_logentries;
+
+	sb_start_write(inode->i_sb);
+	mutex_lock(&inode->i_mutex);
+
+	if (!access_ok(VERIFY_READ, buf, len)) {
+		ret = -EFAULT;
+		goto out;
+	}
+	pos = *ppos;
+	count = len;
+
+	/* We can write back this queue in page reclaim */
+	current->backing_dev_info = mapping->backing_dev_info;
+
+	ret = generic_write_checks(filp, &pos, &count, S_ISBLK(inode->i_mode));
+	if (ret || count == 0)
+		goto out_backing;
+
+	pi = pmfs_get_inode(sb, inode->i_ino);
+
+	offset = pos & (sb->s_blocksize - 1);
+	num_blocks = ((count + offset - 1) >> sb->s_blocksize_bits) + 1;
+	/* offset in the actual block size block */
+	offset = pos & (pmfs_inode_blk_size(pi) - 1);
+	start_blk = pos >> sb->s_blocksize_bits;
+
+	if ((((count + offset - 1) >> pmfs_inode_blk_shift(pi)) == 0) &&
+		(block = pmfs_find_data_block(inode, start_blk))) {
+		ret = pmfs_file_write_fast(sb, inode, pi, buf, count, pos,
+			ppos, block);
+		goto out_backing;
+	}
+	max_logentries = num_blocks / MAX_PTRS_PER_LENTRY + 2;
+	if (max_logentries > MAX_METABLOCK_LENTRIES)
+		max_logentries = MAX_METABLOCK_LENTRIES;
+
+	trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES + max_logentries);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out_backing;
+	}
+	pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA);
+
+	ret = file_remove_suid(filp);
+	if (ret) {
+		pmfs_abort_transaction(sb, trans);
+		goto out_backing;
+	}
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
+	pmfs_update_time(inode, pi);
+
+	/* We avoid zeroing the alloc'd range, which is going to be overwritten
+	 * by this system call anyway */
+	if (offset != 0) {
+		if (pmfs_find_data_block(inode, start_blk) == 0)
+		    new_sblk = true;
+	}
+	if (((count + offset - 1) >> pmfs_inode_blk_shift(pi)) != 0 &&
+			((pos + count) & (pmfs_inode_blk_size(pi) - 1)) != 0) {
+		if (pmfs_find_data_block(inode, start_blk + num_blocks - 1)
+			== 0)
+		    new_eblk = true;
+	}
+
+	/* don't zero-out the allocated blocks */
+	pmfs_alloc_blocks(trans, inode, start_blk, num_blocks, false);
+
+	/* now zero out the edge blocks which will be partially written */
+	pmfs_clear_edge_blk(sb, pi, new_sblk, start_blk);
+	pmfs_clear_edge_blk(sb, pi, new_eblk, start_blk + num_blocks - 1);
+
+	written = __pmfs_xip_file_write(mapping, buf, count, pos, ppos);
+	if (written < 0 || written != count)
+		pmfs_dbg_verbose("write incomplete/failed: written %ld len %ld"
+			" pos %llx start_blk %lx num_blocks %lx\n",
+			written, count, pos, start_blk, num_blocks);
+
+	pmfs_commit_transaction(sb, trans);
+	ret = written;
+out_backing:
+	current->backing_dev_info = NULL;
+out:
+	mutex_unlock(&inode->i_mutex);
+	sb_end_write(inode->i_sb);
+	return ret;
+}
+
+/* An OOM error return from the xip file fault handlers doesn't mean much.
+ * It would just send the OS on an unnecessary killing spree!
+ */
+static int __pmfs_xip_file_fault(struct vm_area_struct *vma,
+				  struct vm_fault *vmf)
+{
+	struct address_space *mapping = vma->vm_file->f_mapping;
+	struct inode *inode = mapping->host;
+	pgoff_t size;
+	void *xip_mem;
+	unsigned long xip_pfn;
+	int err;
+
+	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	if (vmf->pgoff >= size) {
+		pmfs_dbg("[%s:%d] pgoff >= size(SIGBUS). vm_start(0x%lx),"
+			" vm_end(0x%lx), pgoff(0x%lx), VA(%lx)\n",
+			__func__, __LINE__, vma->vm_start, vma->vm_end,
+			vmf->pgoff, (unsigned long)vmf->virtual_address);
+		return VM_FAULT_SIGBUS;
+	}
+
+	err = pmfs_get_xip_mem(mapping, vmf->pgoff, 1, &xip_mem, &xip_pfn);
+	if (unlikely(err)) {
+		pmfs_dbg("[%s:%d] get_xip_mem failed(OOM). vm_start(0x%lx),"
+			" vm_end(0x%lx), pgoff(0x%lx), VA(%lx)\n",
+			__func__, __LINE__, vma->vm_start, vma->vm_end,
+			vmf->pgoff, (unsigned long)vmf->virtual_address);
+		return VM_FAULT_SIGBUS;
+	}
+
+	pmfs_dbg_mmapv("[%s:%d] vm_start(0x%lx), vm_end(0x%lx), pgoff(0x%lx), "
+			"BlockSz(0x%lx), VA(0x%lx)->PA(0x%lx)\n", __func__,
+			__LINE__, vma->vm_start, vma->vm_end, vmf->pgoff,
+			PAGE_SIZE, (unsigned long)vmf->virtual_address,
+			(unsigned long)xip_pfn << PAGE_SHIFT);
+
+	err = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, xip_pfn);
+
+	if (err == -ENOMEM)
+		return VM_FAULT_SIGBUS;
+	/*
+	 * err == -EBUSY is fine, we've raced against another thread
+	 * that faulted-in the same page
+	 */
+	if (err != -EBUSY)
+		BUG_ON(err);
+	return VM_FAULT_NOPAGE;
+}
+
+int pmfs_xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	int ret = 0;
+
+	rcu_read_lock();
+	ret = __pmfs_xip_file_fault(vma, vmf);
+	rcu_read_unlock();
+	return ret;
+}
+
+static int pmfs_find_and_alloc_blocks(struct inode *inode, sector_t iblock,
+				       sector_t *data_block, int create)
+{
+	int err = -EIO;
+	u64 block;
+	pmfs_transaction_t *trans;
+	struct pmfs_inode *pi;
+
+	block = pmfs_find_data_block(inode, iblock);
+
+	if (!block) {
+		struct super_block *sb = inode->i_sb;
+		if (!create) {
+			err = -ENODATA;
+			goto err;
+		}
+
+		pi = pmfs_get_inode(sb, inode->i_ino);
+		trans = pmfs_current_transaction();
+		if (trans) {
+			err = pmfs_alloc_blocks(trans, inode, iblock, 1, true);
+			if (err) {
+				pmfs_dbg_verbose("[%s:%d] Alloc failed!\n",
+					__func__, __LINE__);
+				goto err;
+			}
+		} else {
+			/* 1 lentry for inode, 1 lentry for inode's b-tree */
+			trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES);
+			if (IS_ERR(trans)) {
+				err = PTR_ERR(trans);
+				goto err;
+			}
+
+			rcu_read_unlock();
+			mutex_lock(&inode->i_mutex);
+
+			pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY,
+				LE_DATA);
+			err = pmfs_alloc_blocks(trans, inode, iblock, 1, true);
+
+			pmfs_commit_transaction(sb, trans);
+
+			mutex_unlock(&inode->i_mutex);
+			rcu_read_lock();
+			if (err) {
+				pmfs_dbg_verbose("[%s:%d] Alloc failed!\n",
+					__func__, __LINE__);
+				goto err;
+			}
+		}
+		block = pmfs_find_data_block(inode, iblock);
+		if (!block) {
+			pmfs_dbg("[%s:%d] But alloc didn't fail!\n",
+				  __func__, __LINE__);
+			err = -ENODATA;
+			goto err;
+		}
+	}
+	pmfs_dbg_mmapvv("iblock 0x%lx allocated_block 0x%llx\n", iblock,
+			 block);
+
+	*data_block = block;
+	err = 0;
+
+err:
+	return err;
+}
+
+static inline int __pmfs_get_block(struct inode *inode, pgoff_t pgoff,
+				    int create, sector_t *result)
+{
+	int rc = 0;
+
+	rc = pmfs_find_and_alloc_blocks(inode, (sector_t)pgoff, result,
+					 create);
+	return rc;
+}
+
+int pmfs_get_xip_mem(struct address_space *mapping, pgoff_t pgoff, int create,
+		      void **kmem, unsigned long *pfn)
+{
+	int rc;
+	sector_t block = 0;
+	struct inode *inode = mapping->host;
+
+	rc = __pmfs_get_block(inode, pgoff, create, &block);
+	if (rc) {
+		pmfs_dbg1("[%s:%d] rc(%d), sb->physaddr(0x%llx), block(0x%llx),"
+			" pgoff(0x%lx), flag(0x%x), PFN(0x%lx)\n", __func__,
+			__LINE__, rc, PMFS_SB(inode->i_sb)->phys_addr,
+			block, pgoff, create, *pfn);
+		return rc;
+	}
+
+	*kmem = pmfs_get_block(inode->i_sb, block);
+	*pfn = pmfs_get_pfn(inode->i_sb, block);
+
+	pmfs_dbg_mmapvv("[%s:%d] sb->physaddr(0x%llx), block(0x%lx),"
+		" pgoff(0x%lx), flag(0x%x), PFN(0x%lx)\n", __func__, __LINE__,
+		PMFS_SB(inode->i_sb)->phys_addr, block, pgoff, create, *pfn);
+	return 0;
+}
+
+unsigned long pmfs_data_block_size(struct vm_area_struct *vma,
+				    unsigned long addr, unsigned long pgoff)
+{
+	struct file *file = vma->vm_file;
+	struct inode *inode = file->f_mapping->host;
+	struct pmfs_inode *pi;
+	unsigned long map_virt;
+
+	if (addr < vma->vm_start || addr >= vma->vm_end)
+		return -EFAULT;
+
+	pi = pmfs_get_inode(inode->i_sb, inode->i_ino);
+
+	map_virt = addr & PUD_MASK;
+
+	if (!cpu_has_gbpages || pi->i_blk_type != PMFS_BLOCK_TYPE_1G ||
+	    (vma->vm_start & ~PUD_MASK) ||
+	    map_virt < vma->vm_start ||
+	    (map_virt + PUD_SIZE) > vma->vm_end)
+		goto use_2M_mappings;
+
+	pmfs_dbg_mmapv("[%s:%d] Using 1G Mappings : "
+			"vma_start(0x%lx), vma_end(0x%lx), file_pgoff(0x%lx), "
+			"VA(0x%lx), MAP_VA(%lx)\n", __func__, __LINE__,
+			vma->vm_start, vma->vm_end, pgoff, addr, map_virt);
+	return PUD_SIZE;
+
+use_2M_mappings:
+	map_virt = addr & PMD_MASK;
+
+	if (!cpu_has_pse || pi->i_blk_type != PMFS_BLOCK_TYPE_2M ||
+	    (vma->vm_start & ~PMD_MASK) ||
+	    map_virt < vma->vm_start ||
+	    (map_virt + PMD_SIZE) > vma->vm_end)
+		goto use_4K_mappings;
+
+	pmfs_dbg_mmapv("[%s:%d] Using 2M Mappings : "
+			"vma_start(0x%lx), vma_end(0x%lx), file_pgoff(0x%lx), "
+			"VA(0x%lx), MAP_VA(%lx)\n", __func__, __LINE__,
+			vma->vm_start, vma->vm_end, pgoff, addr, map_virt);
+
+	return PMD_SIZE;
+
+use_4K_mappings:
+	pmfs_dbg_mmapvv("[%s:%d] 4K Mappings : "
+			 "vma_start(0x%lx), vma_end(0x%lx), file_pgoff(0x%lx), "
+			 "VA(0x%lx)\n", __func__, __LINE__,
+			 vma->vm_start, vma->vm_end, pgoff, addr);
+
+	return PAGE_SIZE;
+}
+
+static inline pte_t *pmfs_xip_hugetlb_pte_offset(struct mm_struct *mm,
+						  unsigned long	addr,
+						  unsigned long *sz)
+{
+	return pte_offset_pagesz(mm, addr, sz);
+}
+
+static inline pte_t *pmfs_pte_alloc(struct mm_struct *mm,
+				     unsigned long addr, unsigned long sz)
+{
+	return pte_alloc_pagesz(mm, addr, sz);
+}
+
+static pte_t pmfs_make_huge_pte(struct vm_area_struct *vma,
+				 unsigned long pfn, unsigned long sz,
+				 int writable)
+{
+	pte_t entry;
+
+	if (writable)
+		entry = pte_mkwrite(pte_mkdirty(pfn_pte(pfn, vma->vm_page_prot)));
+	else
+		entry = pte_wrprotect(pfn_pte(pfn, vma->vm_page_prot));
+
+	entry = pte_mkspecial(pte_mkyoung(entry));
+
+	if (sz != PAGE_SIZE) {
+		BUG_ON(sz != PMD_SIZE && sz != PUD_SIZE);
+		entry = pte_mkhuge(entry);
+	}
+
+	return entry;
+}
+
+static int __pmfs_xip_file_hpage_fault(struct vm_area_struct *vma,
+					struct vm_fault *vmf)
+{
+	int ret;
+	pte_t *ptep, new_pte;
+	unsigned long size, block_sz;
+	struct mm_struct *mm = vma->vm_mm;
+	struct inode *inode = vma->vm_file->f_mapping->host;
+	unsigned long address = (unsigned long)vmf->virtual_address;
+
+	static DEFINE_MUTEX(pmfs_instantiation_mutex);
+
+	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
+	if (vmf->pgoff >= size) {
+		pmfs_dbg("[%s:%d] pgoff >= size(SIGBUS). vm_start(0x%lx),"
+			" vm_end(0x%lx), pgoff(0x%lx), VA(%lx)\n",
+			__func__, __LINE__, vma->vm_start, vma->vm_end,
+			vmf->pgoff, (unsigned long)vmf->virtual_address);
+		return VM_FAULT_SIGBUS;
+	}
+
+	block_sz = pmfs_data_block_size(vma, address, vmf->pgoff);
+	address &= ~(block_sz - 1);
+	BUG_ON(block_sz == PAGE_SIZE);
+	pmfs_dbg_mmapvv("[%s:%d] BlockSz : %lx\n",
+			 __func__, __LINE__, block_sz);
+
+	ptep = pmfs_pte_alloc(mm, address, block_sz);
+	if (!ptep) {
+		pmfs_dbg("[%s:%d] pmfs_pte_alloc failed(OOM). vm_start(0x%lx),"
+			" vm_end(0x%lx), pgoff(0x%lx), VA(%lx)\n",
+			__func__, __LINE__, vma->vm_start, vma->vm_end,
+			vmf->pgoff, (unsigned long)vmf->virtual_address);
+		return VM_FAULT_SIGBUS;
+	}
+
+	/* Serialize hugepage allocation and instantiation, so that we don't
+	 * get spurious allocation failures if two CPUs race to instantiate
+	 * the same page in the page cache.
+	 */
+	mutex_lock(&pmfs_instantiation_mutex);
+	if (pte_none(*ptep)) {
+		void *xip_mem;
+		unsigned long xip_pfn;
+		if (pmfs_get_xip_mem(vma->vm_file->f_mapping, vmf->pgoff, 1,
+				      &xip_mem, &xip_pfn) != 0) {
+			pmfs_dbg("[%s:%d] get_xip_mem failed(OOM). vm_start(0x"
+				"%lx), vm_end(0x%lx), pgoff(0x%lx), VA(%lx)\n",
+				__func__, __LINE__, vma->vm_start,
+				vma->vm_end, vmf->pgoff,
+				(unsigned long)vmf->virtual_address);
+			ret = VM_FAULT_SIGBUS;
+			goto out_mutex;
+		}
+
+		/* VA has already been aligned. Align xip_pfn to block_sz. */
+		xip_pfn <<= PAGE_SHIFT;
+		xip_pfn &= ~(block_sz - 1);
+		xip_pfn >>= PAGE_SHIFT;
+		new_pte = pmfs_make_huge_pte(vma, xip_pfn, block_sz,
+					      ((vma->vm_flags & VM_WRITE) &&
+					       (vma->vm_flags & VM_SHARED)));
+		/* FIXME: Is lock necessary ? */
+		spin_lock(&mm->page_table_lock);
+		set_pte_at(mm, address, ptep, new_pte);
+		spin_unlock(&mm->page_table_lock);
+
+		if (ptep_set_access_flags(vma, address, ptep, new_pte,
+					  vmf->flags & FAULT_FLAG_WRITE))
+			update_mmu_cache(vma, address, ptep);
+	}
+	ret = VM_FAULT_NOPAGE;
+
+out_mutex:
+	mutex_unlock(&pmfs_instantiation_mutex);
+	return ret;
+}
+
+int pmfs_xip_file_hpage_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	int ret = 0;
+
+	rcu_read_lock();
+	ret = __pmfs_xip_file_hpage_fault(vma, vmf);
+	rcu_read_unlock();
+	return ret;
+}
+
+static const struct vm_operations_struct pmfs_xip_vm_ops = {
+	.fault	= pmfs_xip_file_fault,
+};
+
+static const struct vm_operations_struct pmfs_xip_hpage_vm_ops = {
+	.fault	= pmfs_xip_file_hpage_fault,
+};
+
+static inline int pmfs_has_huge_mmap(struct super_block *sb)
+{
+	struct pmfs_sb_info *sbi = (struct pmfs_sb_info *)sb->s_fs_info;
+
+	return sbi->s_mount_opt & PMFS_MOUNT_HUGEMMAP;
+}
+
+int pmfs_xip_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	unsigned long block_sz;
+
+	BUG_ON(!file->f_mapping->a_ops->get_xip_mem);
+
+	file_accessed(file);
+
+	vma->vm_flags |= VM_MIXEDMAP;
+
+	block_sz = pmfs_data_block_size(vma, vma->vm_start, 0);
+	if (pmfs_has_huge_mmap(file->f_mapping->host->i_sb) &&
+	    (vma->vm_flags & VM_SHARED) &&
+	    (block_sz == PUD_SIZE || block_sz == PMD_SIZE)) {
+		/* vma->vm_flags |= (VM_XIP_HUGETLB | VM_SHARED | VM_DONTCOPY); */
+		vma->vm_flags |= VM_XIP_HUGETLB;
+		vma->vm_ops = &pmfs_xip_hpage_vm_ops;
+		pmfs_dbg_mmaphuge("[%s:%d] MMAP HUGEPAGE vm_start(0x%lx),"
+			" vm_end(0x%lx), vm_flags(0x%lx), "
+			"vm_page_prot(0x%lx)\n", __func__,
+			__LINE__, vma->vm_start, vma->vm_end, vma->vm_flags,
+			pgprot_val(vma->vm_page_prot));
+	} else {
+		vma->vm_ops = &pmfs_xip_vm_ops;
+		pmfs_dbg_mmap4k("[%s:%d] MMAP 4KPAGE vm_start(0x%lx),"
+			" vm_end(0x%lx), vm_flags(0x%lx), "
+			"vm_page_prot(0x%lx)\n", __func__,
+			__LINE__, vma->vm_start, vma->vm_end,
+			vma->vm_flags, pgprot_val(vma->vm_page_prot));
+	}
+
+	return 0;
+}
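
From user space, the net effect of the XIP path above is that a shared,
writable mmap() of a PMFS file is served directly from persistent memory by
pmfs_xip_file_mmap() and the fault handlers. A hypothetical sketch (the
mount point and file name are illustrative only):

#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/pmfs/data", O_RDWR | O_CREAT, 0644);
	char *p;

	if (fd < 0)
		return 1;
	if (ftruncate(fd, 4096) < 0)
		return 1;
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;
	/* stores go straight to the persistent-memory mapping; an
	 * msync()/fsync() is still needed for durability guarantees */
	memcpy(p, "hello", 6);
	munmap(p, 4096);
	close(fd);
	return 0;
}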
diff --git a/fs/pmfs/xip.h b/fs/pmfs/xip.h
new file mode 100644
index 0000000..3bd9306
--- /dev/null
+++ b/fs/pmfs/xip.h
@@ -0,0 +1,28 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * XIP operations.
+ *
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.stornelli at gmail.com>
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+int pmfs_get_xip_mem(struct address_space *, pgoff_t, int, void **,
+		      unsigned long *);
+ssize_t pmfs_xip_file_read(struct file *filp, char __user *buf, size_t len,
+			    loff_t *ppos);
+ssize_t pmfs_xip_file_write(struct file *filp, const char __user *buf,
+		size_t len, loff_t *ppos);
+int pmfs_xip_file_mmap(struct file *file, struct vm_area_struct *vma);
+
+static inline int pmfs_use_xip(struct super_block *sb)
+{
+	struct pmfs_sb_info *sbi = PMFS_SB(sb);
+
+	return sbi->s_mount_opt & PMFS_MOUNT_XIP;
+}
+
+#define mapping_is_xip(map) ((map)->a_ops->get_xip_mem)
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index bfd8768..7a62889 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -274,6 +274,10 @@ static inline int pud_none_or_clear_bad(pud_t *pud)
 {
 	if (pud_none(*pud))
 		return 1;
+	if (unlikely(pud_large(*pud))) {
+		pud_clear(pud);
+		return 1;
+	}
 	if (unlikely(pud_bad(*pud))) {
 		pud_clear_bad(pud);
 		return 1;
@@ -285,6 +289,10 @@ static inline int pmd_none_or_clear_bad(pmd_t *pmd)
 {
 	if (pmd_none(*pmd))
 		return 1;
+	if (unlikely(pmd_large(*pmd))) {
+		pmd_clear(pmd);
+		return 1;
+	}
 	if (unlikely(pmd_bad(*pmd))) {
 		pmd_clear_bad(pmd);
 		return 1;
diff --git a/include/linux/io.h b/include/linux/io.h
index 069e407..db3de04 100644
--- a/include/linux/io.h
+++ b/include/linux/io.h
@@ -38,6 +38,16 @@ static inline int ioremap_page_range(unsigned long addr, unsigned long end,
 }
 #endif
 
+#ifdef CONFIG_MMU
+int ioremap_hpage_range(unsigned long addr, unsigned long end,
+		       phys_addr_t phys_addr, pgprot_t prot);
+#else
+static inline int ioremap_hpage_range(unsigned long addr, unsigned long end,
+				     phys_addr_t phys_addr, pgprot_t prot)
+{
+	return 0;
+}
+#endif
 /*
  * Managed iomap interface
  */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e19ff30..f7a1aa7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -84,6 +84,7 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_MAYSHARE	0x00000080
 
 #define VM_GROWSDOWN	0x00000100	/* general info on the segment */
+#define VM_XIP_HUGETLB	0x00000200
 #define VM_PFNMAP	0x00000400	/* Page-ranges managed without "struct page", just pure PFN */
 #define VM_DENYWRITE	0x00000800	/* ETXTBSY on write attempts.. */
 
@@ -96,6 +97,7 @@ extern unsigned int kobjsize(const void *objp);
 
 #define VM_DONTCOPY	0x00020000      /* Do not copy this vma on fork */
 #define VM_DONTEXPAND	0x00040000	/* Cannot expand with mremap() */
+#define VM_PFN_AT_MMAP	0x00080000	/* PFNMAP vma that is fully mapped at mmap time */
 #define VM_ACCOUNT	0x00100000	/* Is a VM accounted object */
 #define VM_NORESERVE	0x00200000	/* should the VM suppress accounting */
 #define VM_HUGETLB	0x00400000	/* Huge TLB Page VM */
@@ -165,6 +167,11 @@ extern pgprot_t protection_map[16];
 #define FAULT_FLAG_KILLABLE	0x20	/* The fault task is in SIGKILL killable region */
 #define FAULT_FLAG_TRIED	0x40	/* second try */
 
+static inline int is_xip_hugetlb_mapping(struct vm_area_struct *vma)
+{
+	return !!(vma->vm_flags & VM_XIP_HUGETLB);
+}
+
 /*
  * vm_fault is filled by the the pagefault handler and passed to the vma's
  * ->fault function. The vma's ->fault is responsible for returning a bitmask
@@ -1010,6 +1017,14 @@ static inline int fixup_user_fault(struct task_struct *tsk,
 }
 #endif
 
+extern pte_t *pte_alloc_pagesz(struct mm_struct *mm, unsigned long addr,
+			       unsigned long sz);
+extern pte_t *pte_offset_pagesz(struct mm_struct *mm, unsigned long addr,
+				unsigned long *sz);
+extern void unmap_xip_hugetlb_range(struct vm_area_struct *vma,
+				     unsigned long start, unsigned long end);
+
+
 extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
 extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
 		void *buf, int len, int write);
diff --git a/include/linux/pmfs_def.h b/include/linux/pmfs_def.h
new file mode 100644
index 0000000..e52741e
--- /dev/null
+++ b/include/linux/pmfs_def.h
@@ -0,0 +1,206 @@
+/*
+ * FILE NAME include/linux/pmfs_def.h
+ *
+ * BRIEF DESCRIPTION
+ *
+ * Definitions for the PMFS filesystem.
+ *
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.stornelli at gmail.com>
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+#ifndef _LINUX_PMFS_DEF_H
+#define _LINUX_PMFS_DEF_H
+
+#include <linux/types.h>
+#include <linux/magic.h>
+
+/*
+ * The PMFS filesystem constants/structures
+ */
+
+/*
+ * Mount flags
+ */
+#define PMFS_MOUNT_PROTECT 0x000001            /* wprotect CR0.WP */
+#define PMFS_MOUNT_XATTR_USER 0x000002         /* Extended user attributes */
+#define PMFS_MOUNT_POSIX_ACL 0x000004          /* POSIX Access Control Lists */
+#define PMFS_MOUNT_XIP 0x000008                /* Execute in place */
+#define PMFS_MOUNT_ERRORS_CONT 0x000010        /* Continue on errors */
+#define PMFS_MOUNT_ERRORS_RO 0x000020          /* Remount fs ro on errors */
+#define PMFS_MOUNT_ERRORS_PANIC 0x000040       /* Panic on errors */
+#define PMFS_MOUNT_HUGEMMAP 0x000080           /* Huge mappings with mmap */
+#define PMFS_MOUNT_HUGEIOREMAP 0x000100        /* Huge mappings with ioremap */
+#define PMFS_MOUNT_PROTECT_OLD 0x000200        /* wprotect PAGE RW Bit */
+#define PMFS_MOUNT_FORMAT      0x000400        /* was FS formatted on mount? */
+#define PMFS_MOUNT_MOUNTING    0x000800        /* FS currently being mounted */
+
+/*
+ * Maximal count of links to a file
+ */
+#define PMFS_LINK_MAX          32000
+
+#define PMFS_DEF_BLOCK_SIZE_4K 4096
+
+#define PMFS_INODE_SIZE 128    /* must be power of two */
+#define PMFS_INODE_BITS   7
+
+#define PMFS_NAME_LEN 255
+/*
+ * Structure of a directory entry in PMFS.
+ */
+struct pmfs_direntry {
+	__le64	ino;                    /* inode no pointed to by this entry */
+	__le16	de_len;                 /* length of this directory entry */
+	u8	name_len;               /* length of the directory entry name */
+	u8	file_type;              /* file type */
+	char	name[PMFS_NAME_LEN];   /* File name */
+};
+
+#define PMFS_DIR_PAD            4
+#define PMFS_DIR_ROUND          (PMFS_DIR_PAD - 1)
+#define PMFS_DIR_REC_LEN(name_len)  (((name_len) + 12 + PMFS_DIR_ROUND) & \
+				      ~PMFS_DIR_ROUND)
+
+/* PMFS supported data blocks */
+#define PMFS_BLOCK_TYPE_4K     0
+#define PMFS_BLOCK_TYPE_2M     1
+#define PMFS_BLOCK_TYPE_1G     2
+#define PMFS_BLOCK_TYPE_MAX    3
+
+#define META_BLK_SHIFT 9
+
+/*
+ * Play with this knob to change the default block type.
+ * By changing the PMFS_DEFAULT_BLOCK_TYPE to 2M or 1G,
+ * we should get pretty good coverage in testing.
+ */
+#define PMFS_DEFAULT_BLOCK_TYPE PMFS_BLOCK_TYPE_4K
+
+/*
+ * Structure of an inode in PMFS. Things to keep in mind when modifying it.
+ * 1) Keep the inode size to within 96 bytes if possible. This is because
+ *    a 64 byte log-entry can store 48 bytes of data and we would like
+ *    to log an inode using only 2 log-entries
+ * 2) root must be immediately after the qw containing height because we update
+ *    root and height atomically using cmpxchg16b in pmfs_decrease_btree_height 
+ * 3) i_size, i_ctime, and i_mtime must be in that order and i_size must be at
+ *    16 byte aligned offset from the start of the inode. We use cmpxchg16b to
+ *    update these three fields atomically.
+ */
+struct pmfs_inode {
+	/* first 48 bytes */
+	__le16	i_rsvd;         /* reserved. used to be checksum */
+	u8	    height;         /* height of data b-tree; max 3 for now */
+	u8	    i_blk_type;     /* data block size this inode uses */
+	__le32	i_flags;            /* Inode flags */
+	__le64	root;               /* btree root. must be below qw w/ height */
+	__le64	i_size;             /* Size of data in bytes */
+	__le32	i_ctime;            /* Inode modification time */
+	__le32	i_mtime;            /* Inode b-tree Modification time */
+	__le32	i_dtime;            /* Deletion Time */
+	__le16	i_mode;             /* File mode */
+	__le16	i_links_count;      /* Links count */
+	__le64	i_blocks;           /* Blocks count */
+
+	/* second 48 bytes */
+	__le64	i_xattr;            /* Extended attribute block */
+	__le32	i_uid;              /* Owner Uid */
+	__le32	i_gid;              /* Group Id */
+	__le32	i_generation;       /* File version (for NFS) */
+	__le32	i_atime;            /* Access time */
+
+	struct {
+		__le32 rdev;    /* major/minor # */
+	} dev;              /* device inode */
+	__le32 padding;     /* pad to ensure truncate_item starts 8-byte aligned */
+};
+
+/* This is a per-inode structure and follows immediately after the
+ * struct pmfs_inode. It is used to implement the truncate linked list and
+ * is used by pmfs_truncate_add(), pmfs_truncate_del(), and
+ * pmfs_recover_truncate_list() to manage the truncate list. */
+struct pmfs_inode_truncate_item {
+	__le64	i_truncatesize;     /* Size of truncated inode */
+	__le64  i_next_truncate;    /* inode num of the next truncated inode */
+};
+
+/*
+ * #define PMFS_NAME_LEN (PMFS_INODE_SIZE - offsetof(struct pmfs_inode,
+ *         i_d.d_name) - 1)
+ */
+
+/* #define PMFS_SB_SIZE 128 */ /* must be power of two */
+#define PMFS_SB_SIZE 512       /* must be power of two */
+
+typedef struct pmfs_journal {
+	__le64     base;
+	__le32     size;
+	__le32     head;
+	/* the next three fields must stay together and in this order;
+	 * tail and gen_id must fall in the same 8-byte quadword */
+	__le32     tail;
+	__le16     gen_id;   /* generation id of the log */
+	__le16     pad;
+	__le16     redo_logging;
+} pmfs_journal_t;
+
+
+/*
+ * Structure of the super block in PMFS
+ * The fields are partitioned into static and dynamic fields. The static
+ * fields never change after file system creation. The split was primarily
+ * done because pmfs_get_block() returns NULL if the block offset is 0
+ * (which helps catch bugs), so journaling an update to any field would
+ * also require journaling s_sum at offset 0, and that fails. The
+ * static+dynamic split is a temporary solution and can be dropped once
+ * the file system becomes stable and pmfs_get_block() returns correct
+ * pointers even for offset 0.
+ */
+struct pmfs_super_block {
+	/* static fields. They never change after file system creation.
+	 * The checksum only validates up to the s_start_dynamic field below */
+	__le16		s_sum;              /* checksum of this sb */
+	__le16		s_magic;            /* magic signature */
+	__le32		s_blocksize;        /* blocksize in bytes */
+	__le64		s_size;             /* total size of fs in bytes */
+	char		s_volume_name[16];  /* volume name */
+	/* points to the location of pmfs_journal_t */
+	__le64          s_journal_offset;
+	/* points to the location of struct pmfs_inode for the inode table */
+	__le64          s_inode_table_offset;
+
+	__le64       s_start_dynamic; 
+
+	/* all the dynamic fields should go here */
+	/* s_mtime and s_wtime should be together and their order should not be
+	 * changed. We use an 8 byte write to update both of them atomically */
+	__le32		s_mtime;            /* mount time */
+	__le32		s_wtime;            /* write time */
+	/* fields for fast mount support. Always keep them together */
+	__le64		s_num_blocknode_allocated;
+	__le64		s_num_free_blocks;
+	__le32		s_inodes_count;
+	__le32		s_free_inodes_count;
+	__le32		s_inodes_used_count;
+	__le32		s_free_inode_hint;
+};
+
+#define PMFS_SB_STATIC_SIZE(ps) ((u64)&ps->s_start_dynamic - (u64)ps)
+
+/* the above fast mount fields take total 32 bytes in the super block */
+#define PMFS_FAST_MOUNT_FIELD_SIZE  (36)
+
+/* The root inode follows immediately after the redundant super block */
+#define PMFS_ROOT_INO (PMFS_INODE_SIZE)
+#define PMFS_BLOCKNODE_IN0 (PMFS_ROOT_INO + PMFS_INODE_SIZE)
+
+/* INODE HINT START at 3 */
+#define PMFS_FREE_INODE_HINT_START      (3)
+
+#endif /* _LINUX_PMFS_DEF_H */
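
A quick illustration (not part of the patch) of the directory entry size
arithmetic: the fixed part of struct pmfs_direntry is ino(8) + de_len(2) +
name_len(1) + file_type(1) = 12 bytes, which is where the 12 in
PMFS_DIR_REC_LEN() comes from, and the total is rounded up to the 4-byte
PMFS_DIR_PAD boundary:

#include <linux/bug.h>
#include <linux/pmfs_def.h>

static inline void pmfs_direntry_len_example(void)
{
	BUILD_BUG_ON(PMFS_DIR_REC_LEN(1)  != 16);	/* "a":    1 + 12 -> 16 */
	BUILD_BUG_ON(PMFS_DIR_REC_LEN(4)  != 16);	/* "file": 4 + 12 -> 16 */
	BUILD_BUG_ON(PMFS_DIR_REC_LEN(13) != 28);	/* 13 + 12 = 25 -> 28 */
}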
diff --git a/include/linux/pmfs_sb.h b/include/linux/pmfs_sb.h
new file mode 100644
index 0000000..d483f2c
--- /dev/null
+++ b/include/linux/pmfs_sb.h
@@ -0,0 +1,83 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Definitions for the PMFS.
+ *
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.stornelli at gmail.com>
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#ifndef _LINUX_PMFS_SB
+#define _LINUX_PMFS_SB
+
+/*
+ * PMFS super-block data in memory
+ */
+struct pmfs_sb_info {
+	/*
+	 * base physical and virtual address of PMFS (which is also
+	 * the pointer to the super block)
+	 */
+	phys_addr_t	phys_addr;
+	void		*virt_addr;
+	struct list_head block_inuse_head;
+	unsigned long	block_start;
+	unsigned long	block_end;
+	unsigned long	num_free_blocks;
+	char		pmfs_backing_file[256];
+	struct mutex	s_lock;		/* protects the super block */
+
+	/*
+	 * Backing store option:
+	 * 1 = no load, 2 = no store,
+	 * else do both
+	 */
+	unsigned int	pmfs_backing_option;
+
+	/* Mount options */
+	unsigned long	bpi;
+	unsigned long	num_inodes;
+	unsigned long	blocksize;
+	unsigned long	initsize;
+	unsigned long	s_mount_opt;
+	uid_t		uid;    /* Mount uid for root directory */
+	gid_t		gid;    /* Mount gid for root directory */
+	umode_t		mode;   /* Mount mode for root directory */
+	atomic_t	next_generation;
+	/* inode tracking */
+	struct mutex inode_table_mutex;
+	unsigned int	s_inodes_count;  /* total inodes count (used or free) */
+	unsigned int	s_free_inodes_count;    /* free inodes count */
+	unsigned int	s_inodes_used_count;
+	unsigned int	s_free_inode_hint;
+
+	/* temp bitmap space */
+	unsigned long num_blocknode_allocated;
+	unsigned long bitmap_4k_size;
+	unsigned long bitmap_2M_size;
+	unsigned long bitmap_1G_size;
+	unsigned long *bitmap_4k;
+	unsigned long *bitmap_2M;
+	unsigned long *bitmap_1G;
+
+	/* Journaling related structures */
+	uint32_t    next_transaction_id;
+	uint32_t    jsize;
+	void       *journal_base_addr;
+	struct mutex journal_mutex;
+	struct task_struct *log_cleaner_thread;
+	wait_queue_head_t  log_cleaner_wait;
+	bool redo_log;
+
+	/* truncate list related structures */
+	struct list_head s_truncate;
+	struct mutex s_truncate_lock;
+};
+
+#endif /* _LINUX_PMFS_SB */
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 6071e91..d6cd080 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -21,7 +21,7 @@ struct vm_area_struct;		/* vma defining user mapping in mm_types.h */
  * Can be overriden by arch-specific value.
  */
 #ifndef IOREMAP_MAX_ORDER
-#define IOREMAP_MAX_ORDER	(7 + PAGE_SHIFT)	/* 128 pages */
+#define IOREMAP_MAX_ORDER	(PUD_SHIFT)	/* 1G pages */
 #endif
 
 struct vm_struct {
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index 873e086..a070c14 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -64,6 +64,7 @@
 #define FUTEXFS_SUPER_MAGIC	0xBAD1DEA
 #define PIPEFS_MAGIC            0x50495045
 #define PROC_SUPER_MAGIC	0x9fa0
+#define PMFS_SUPER_MAGIC	0xEFFC
 #define SOCKFS_MAGIC		0x534F434B
 #define SYSFS_MAGIC		0x62656572
 #define USBDEVICE_SUPER_MAGIC	0x9fa2
diff --git a/lib/ioremap.c b/lib/ioremap.c
index 0c9216c..8eeaa60 100644
--- a/lib/ioremap.c
+++ b/lib/ioremap.c
@@ -12,6 +12,7 @@
 #include <linux/export.h>
 #include <asm/cacheflush.h>
 #include <asm/pgtable.h>
+#include <asm/tlbflush.h>
 
 static int ioremap_pte_range(pmd_t *pmd, unsigned long addr,
 		unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
@@ -32,37 +33,104 @@ static int ioremap_pte_range(pmd_t *pmd, unsigned long addr,
 }
 
 static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
-		unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
+		unsigned long end, phys_addr_t phys_addr, pgprot_t prot, int hpages)
 {
-	pmd_t *pmd;
+	pmd_t *pmd_page, *pmd;
 	unsigned long next;
 
 	phys_addr -= addr;
-	pmd = pmd_alloc(&init_mm, pud, addr);
-	if (!pmd)
+	pmd_page = pmd_alloc(&init_mm, pud, addr);
+	if (!pmd_page)
 		return -ENOMEM;
+
+	if (hpages)
+		printk(KERN_INFO "PMD_MAPPING (START) [%s,%d]"
+			" VA START(0x%lx), VA END(0x%lx), "
+			"PA(0x%lx), SIZE(0x%lx)\n", __func__, __LINE__,
+			addr, end, (unsigned long)(phys_addr + addr),
+			(end - addr));
+
+	pmd = pmd_page;
 	do {
 		next = pmd_addr_end(addr, end);
-		if (ioremap_pte_range(pmd, addr, next, phys_addr + addr, prot))
-			return -ENOMEM;
+		if (hpages && cpu_has_pse && ((next - addr) >= PMD_SIZE)) {
+			u64 pfn = ((u64)(phys_addr + addr)) >> PAGE_SHIFT;
+
+			prot = __pgprot((unsigned long)prot.pgprot | _PAGE_PSE);
+			if ((s64)pfn < 0) {
+				printk(KERN_INFO "MAPPING ERROR [%s, %d] : "
+					"phys_addr(0x%lx), addr(0x%lx), "
+					"next(0x%lx), end(0x%lx), pfn(0x%lx)\n",
+					__func__, __LINE__,
+					(unsigned long)phys_addr,
+					(unsigned long)addr, (unsigned long)next,
+					(unsigned long)end, (unsigned long)pfn);
+				return -ENOMEM;
+			}
+
+			spin_lock(&init_mm.page_table_lock);
+			set_pte((pte_t *)pmd, pfn_pte(pfn, prot));
+			spin_unlock(&init_mm.page_table_lock);
+		} else {
+			if (ioremap_pte_range(pmd, addr, next,
+					phys_addr + addr, prot))
+				return -ENOMEM;
+		}
 	} while (pmd++, addr = next, addr != end);
 	return 0;
 }
 
 static inline int ioremap_pud_range(pgd_t *pgd, unsigned long addr,
-		unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
+		unsigned long end, phys_addr_t phys_addr, pgprot_t prot, int hpages)
 {
-	pud_t *pud;
+	pud_t *pud_page, *pud;
 	unsigned long next;
 
 	phys_addr -= addr;
-	pud = pud_alloc(&init_mm, pgd, addr);
-	if (!pud)
+	pud_page = pud_alloc(&init_mm, pgd, addr);
+	if (!pud_page)
 		return -ENOMEM;
+
+	if (hpages)
+		printk(KERN_INFO "PUD_MAPPING (START) [%s,%d]"
+			" VA START(0x%lx), VA END(0x%lx), "
+			"PA(0x%lx), SIZE(0x%lx)\n", __func__, __LINE__,
+			addr, end, (unsigned long)(phys_addr + addr),
+			(end - addr));
+
+	pud = pud_page;
 	do {
 		next = pud_addr_end(addr, end);
-		if (ioremap_pmd_range(pud, addr, next, phys_addr + addr, prot))
-			return -ENOMEM;
+		if (hpages && cpu_has_gbpages && ((next - addr) >= PUD_SIZE)) {
+			u64 pfn = ((u64)(phys_addr + addr)) >> PAGE_SHIFT;
+
+			prot = __pgprot((unsigned long)prot.pgprot | _PAGE_PSE);
+			if ((s64)pfn < 0) {
+				printk(KERN_INFO "MAPPING ERROR [%s, %d] : "
+					"phys_addr(0x%lx), addr(0x%lx), "
+					"next(0x%lx), end(0x%lx), pfn(0x%lx)\n",
+					__func__, __LINE__,
+					(unsigned long)phys_addr,
+					(unsigned long)addr, (unsigned long)next,
+					(unsigned long)end, (unsigned long)pfn);
+				return -ENOMEM;
+			}
+
+			spin_lock(&init_mm.page_table_lock);
+			set_pte((pte_t *)pud, pfn_pte(pfn, prot));
+			spin_unlock(&init_mm.page_table_lock);
+		} else {
+			if (ioremap_pmd_range(pud, addr, next,
+					phys_addr + addr, prot, hpages))
+				return -ENOMEM;
+		}
 	} while (pud++, addr = next, addr != end);
 	return 0;
 }
@@ -82,7 +150,7 @@ int ioremap_page_range(unsigned long addr,
 	pgd = pgd_offset_k(addr);
 	do {
 		next = pgd_addr_end(addr, end);
-		err = ioremap_pud_range(pgd, addr, next, phys_addr+addr, prot);
+		err = ioremap_pud_range(pgd, addr, next, phys_addr+addr, prot, 0);
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
@@ -92,3 +160,35 @@ int ioremap_page_range(unsigned long addr,
 	return err;
 }
 EXPORT_SYMBOL_GPL(ioremap_page_range);
+
+int ioremap_hpage_range(unsigned long addr,
+		       unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
+{
+	pgd_t *pgd;
+	unsigned long start;
+	unsigned long next;
+	int err;
+
+	BUG_ON(addr >= end);
+
+	printk(KERN_INFO "[%s,%d] hpages ON; startVA(0x%lx), endVA(0x%lx), "
+			"startPA(0x%lx), startPFN(0x%lx)\n", __func__, __LINE__,
+			addr, end, (unsigned long)phys_addr,
+			(unsigned long)phys_addr >> PAGE_SHIFT);
+
+	start = addr;
+	phys_addr -= addr;
+	pgd = pgd_offset_k(addr);
+
+	do {
+		next = pgd_addr_end(addr, end);
+		err = ioremap_pud_range(pgd, addr, next, phys_addr+addr, prot, 1);
+		if (err)
+			break;
+	} while (pgd++, addr = next, addr != end);
+
+	flush_cache_vmap(start, end);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(ioremap_hpage_range);
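
ioremap_hpage_range() takes the same arguments as ioremap_page_range() but
opportunistically installs PMD/PUD entries when the range is large enough
and the CPU supports it. A hypothetical caller sketch (not the actual
caller in this patch; it assumes the region returned by get_vm_area() is
suitably aligned for huge mappings):

#include <linux/io.h>
#include <linux/vmalloc.h>
#include <asm/pgtable.h>

static void __iomem *example_hpage_ioremap(phys_addr_t phys, size_t size)
{
	struct vm_struct *area;
	unsigned long vaddr;

	area = get_vm_area(size, VM_IOREMAP);
	if (!area)
		return NULL;
	vaddr = (unsigned long)area->addr;
	if (ioremap_hpage_range(vaddr, vaddr + size, phys, PAGE_KERNEL)) {
		free_vm_area(area);
		return NULL;
	}
	return (void __iomem *)vaddr;
}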
diff --git a/mm/madvise.c b/mm/madvise.c
index c58c94b..11bba93 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -511,6 +511,12 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 		if (!vma)
 			goto out_plug;
 
+		/* madvise not supported with XIP_HUGETLB */
+		if (is_xip_hugetlb_mapping(vma)) {
+			error = -EINVAL;
+			goto out;
+		}
+
 		/* Here start < (end|vma->vm_end). */
 		if (start < vma->vm_start) {
 			unmapped_error = -ENOMEM;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2b55222..8f49564 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6480,7 +6480,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
 			.mm = mm,
 			.private = vma,
 		};
-		if (is_vm_hugetlb_page(vma))
+		if (is_vm_hugetlb_page(vma) || is_xip_hugetlb_mapping(vma))
 			continue;
 		walk_page_range(vma->vm_start, vma->vm_end,
 					&mem_cgroup_count_precharge_walk);
@@ -6743,7 +6743,7 @@ retry:
 			.mm = mm,
 			.private = vma,
 		};
-		if (is_vm_hugetlb_page(vma))
+		if (is_vm_hugetlb_page(vma) || is_xip_hugetlb_mapping(vma))
 			continue;
 		ret = walk_page_range(vma->vm_start, vma->vm_end,
 						&mem_cgroup_move_charge_walk);
diff --git a/mm/memory.c b/mm/memory.c
index 13cbc42..5a8bd23 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1054,6 +1054,10 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 			return 0;
 	}
 
+	/* FIXME: For now, don't copy ptes and let it fault. */
+	if (is_xip_hugetlb_mapping(vma))
+		return 0;
+
 	if (is_vm_hugetlb_page(vma))
 		return copy_hugetlb_page_range(dst_mm, src_mm, vma);
 
@@ -1353,6 +1357,9 @@ static void unmap_single_vma(struct mmu_gather *tlb,
 				__unmap_hugepage_range_final(tlb, vma, start, end, NULL);
 				mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
 			}
+		} else if (is_xip_hugetlb_mapping(vma)) {
+				unmap_xip_hugetlb_range(vma, start, end);
+				start = end;
 		} else
 			unmap_page_range(tlb, vma, start, end, details);
 	}
@@ -1650,6 +1657,54 @@ static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long add
 	       stack_guard_page_end(vma, addr+PAGE_SIZE);
 }
 
+/* FIXME: move this to the right place! */
+static int follow_xip_hugetlb_page(struct mm_struct *mm,
+		struct vm_area_struct *vma, unsigned long *position,
+		int *length, int i, unsigned int flags)
+{
+	unsigned long vaddr = *position;
+	int remainder = *length;
+
+	while (vaddr < vma->vm_end && remainder) {
+		int err, absent;
+		pte_t *pte;
+		unsigned long size;
+		struct vm_fault vmf;
+
+		pte = pte_offset_pagesz(mm, vaddr, &size);
+		absent = !pte || pte_none(*pte);
+
+		/* populate an entry */
+		if (absent || ((flags & FOLL_WRITE) && !pte_write(*pte))) {
+			vmf.virtual_address = (void __user *)(vaddr & PAGE_MASK);
+			vmf.pgoff = vma->vm_pgoff +
+				(((vaddr & PAGE_MASK) - vma->vm_start) >> PAGE_SHIFT);
+			vmf.flags = (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0;
+			vmf.page = NULL;
+			err = vma->vm_ops->fault(vma, &vmf);
+
+			if (!err || (err == VM_FAULT_NOPAGE)) {
+				pte = pte_offset_pagesz(mm, vaddr, &size);
+				vaddr = (vaddr & ~(size-1)) + size;
+				remainder -= size>>PAGE_SHIFT;
+				i += size>>PAGE_SHIFT;
+				continue;
+			}
+
+			remainder = 0;
+			break;
+		}
+
+		vaddr = (vaddr & ~(size-1)) + size;
+		remainder -= size>>PAGE_SHIFT;
+		i += size>>PAGE_SHIFT;
+	}
+
+	*length = remainder;
+	*position = vaddr;
+	return i ? i : -EFAULT;
+}
+
 /**
  * __get_user_pages() - pin user pages in memory
  * @tsk:	task_struct of target task
@@ -1790,9 +1845,20 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 
 		if (!vma ||
 		    (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
-		    !(vm_flags & vma->vm_flags))
+		    !(vm_flags & vma->vm_flags) || is_xip_hugetlb_mapping(vma))
 			return i ? : -EFAULT;
 
+#if 0
+		/* FIXME : Requires more testing */
+		if (is_xip_hugetlb_mapping(vma)) {
+			/* caller expects vmas or pages to be populated. */
+			if (vmas || pages)
+				return -EFAULT;
+			i = follow_xip_hugetlb_page(mm, vma,
+						&start, &nr_pages, i, gup_flags);
+			continue;
+		}
+#endif
 		if (is_vm_hugetlb_page(vma)) {
 			i = follow_hugetlb_page(mm, vma, pages, vmas,
 					&start, &nr_pages, i, gup_flags);
@@ -3724,6 +3790,21 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	/* do counter updates before entering really critical section. */
 	check_sync_rss_stat(current);
 
+	/*
+	 * FIXME: no dedicated vm_flags bit marks XIP_HUGETLB mappings, so hand
+	 * the fault straight to the filesystem's ->fault handler here.
+	 */
+	if (is_xip_hugetlb_mapping(vma)) {
+		int err;
+		struct vm_fault vmf;
+		vmf.virtual_address = (void __user *)(address & PAGE_MASK);
+		vmf.pgoff = vma->vm_pgoff +
+			(((address & PAGE_MASK) - vma->vm_start) >> PAGE_SHIFT);
+		vmf.flags = flags;
+		vmf.page = NULL;
+		err = vma->vm_ops->fault(vma, &vmf);
+		if (!err || (err == VM_FAULT_NOPAGE))
+			return 0;
+	}
+
 	if (unlikely(is_vm_hugetlb_page(vma)))
 		return hugetlb_fault(mm, vma, address, flags);
 
@@ -3857,6 +3938,117 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 }
 #endif /* __PAGETABLE_PMD_FOLDED */
 
+/****************************************************************************/
+/* XIP_HUGETLB support */
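+/*
+ * Walk the page tables for @addr and return the first missing or huge
+ * entry, never descending below the PMD level; *sz is set to the size
+ * covered by the returned entry.
+ */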
+pte_t *pte_offset_pagesz(struct mm_struct *mm, unsigned long addr,
+			 unsigned long *sz)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd = NULL;
+
+	pgd = pgd_offset(mm, addr);
+	if (!pgd_present(*pgd)) {
+		*sz = PGDIR_SIZE;
+		return (pte_t *)pgd;
+	}
+
+	pud = pud_offset(pgd, addr);
+	if (pud_none(*pud) || pud_large(*pud)) {
+		*sz = PUD_SIZE;
+		return (pte_t *)pud;
+	}
+	pmd = pmd_offset(pud, addr);
+	/* Fall back to the PMD entry whether or not it is populated. */
+	*sz = PMD_SIZE;
+	return (pte_t *)pmd;
+}
+EXPORT_SYMBOL(pte_offset_pagesz);
+
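+/*
+ * Allocate page-table levels for @addr down to the entry that will hold a
+ * huge mapping of size @sz (PUD_SIZE or PMD_SIZE) and return that entry.
+ */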
+pte_t *pte_alloc_pagesz(struct mm_struct *mm, unsigned long addr,
+			unsigned long sz)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pte_t *pte = NULL;
+
+	pgd = pgd_offset(mm, addr);
+	pud = pud_alloc(mm, pgd, addr);
+	if (pud) {
+		if (sz == PUD_SIZE) {
+			pte = (pte_t *)pud;
+		} else {
+			BUG_ON(sz != PMD_SIZE);
+			pte = (pte_t *) pmd_alloc(mm, pud, addr);
+		}
+	}
+	BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
+
+	return pte;
+}
+EXPORT_SYMBOL(pte_alloc_pagesz);
+
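+/*
+ * Clear every huge entry in [start, end) and flush the TLB.  No page
+ * reference counting or rmap work is done for XIP_HUGETLB mappings.
+ */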
+static void __unmap_xip_hugetlb_range(struct vm_area_struct *vma,
+								unsigned long start, unsigned long end)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long address;
+	pte_t *ptep;
+	pte_t pte;
+	unsigned long sz;
+
+	WARN_ON(!is_xip_hugetlb_mapping(vma));
+
+	mmu_notifier_invalidate_range_start(mm, start, end);
+	spin_lock(&mm->page_table_lock);
+	for (address = start, sz = PMD_SIZE; address < end; address += sz) {
+		ptep = pte_offset_pagesz(mm, address, &sz);
+		if (!ptep)
+			continue;
+
+		pte = ptep_get_and_clear(mm, address, ptep);
+		if (pte_none(pte))
+			continue;
+	}
+	flush_tlb_range(vma, start, end);
+	spin_unlock(&mm->page_table_lock);
+	mmu_notifier_invalidate_range_end(mm, start, end);
+}
+
+void unmap_xip_hugetlb_range(struct vm_area_struct *vma,
+							unsigned long start, unsigned long end)
+{
+	mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
+	__unmap_xip_hugetlb_range(vma, start, end);
+	mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
+}
+EXPORT_SYMBOL(unmap_xip_hugetlb_range);
+
+/****************************************************************************/
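+/*
+ * Fault in every page in [addr, end); the range must lie within a single
+ * vma of the current process.
+ */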
+int make_pages_present(unsigned long addr, unsigned long end)
+{
+	int ret, len, write;
+	struct vm_area_struct *vma;
+
+	vma = find_vma(current->mm, addr);
+	if (!vma)
+		return -ENOMEM;
+	/*
+	 * We want to touch writable mappings with a write fault in order
+	 * to break COW, except for shared mappings because these don't COW
+	 * and we would not want to dirty them for nothing.
+	 */
+	write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE;
+	BUG_ON(addr >= end);
+	BUG_ON(end > vma->vm_end);
+	len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
+	ret = get_user_pages(current, current->mm, addr,
+			len, write, 0, NULL, NULL);
+	if (ret < 0)
+		return ret;
+	return ret == len ? 0 : -EFAULT;
+}
+
 #if !defined(__HAVE_ARCH_GATE_AREA)
 
 #if defined(AT_SYSINFO_EHDR)
diff --git a/mm/mlock.c b/mm/mlock.c
index 79b7cf7..f92ef16 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -173,6 +173,11 @@ long __mlock_vma_pages_range(struct vm_area_struct *vma,
 	VM_BUG_ON(end   > vma->vm_end);
 	VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
 
+	/*
+	 * get_user_pages() cannot populate XIP_HUGETLB mappings, so drop
+	 * VM_LOCKED and report the range as handled.
+	 */
+	if (is_xip_hugetlb_mapping(vma)) {
+		vma->vm_flags &= ~VM_LOCKED;
+		return nr_pages;
+	}
+
 	gup_flags = FOLL_TOUCH | FOLL_MLOCK;
 	/*
 	 * We want to touch writable mappings with a write fault in order
diff --git a/mm/mmap.c b/mm/mmap.c
index 0db0de1..79f4003 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2355,6 +2355,9 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
 	struct vm_area_struct *new;
 	int err = -ENOMEM;
 
+	if (is_xip_hugetlb_mapping(vma))
+		return -EINVAL;
+
 	if (is_vm_hugetlb_page(vma) && (addr &
 					~(huge_page_mask(hstate_vma(vma)))))
 		return -EINVAL;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 94722a4..f1e8754 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -250,6 +250,10 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 		return 0;
 	}
 
+	/* FIXME: mprotect() is not yet supported for XIP_HUGETLB mappings */
+	if (is_xip_hugetlb_mapping(vma))
+		return -EINVAL;
+
 	/*
 	 * If we make a private mapping writable we increase our commit;
 	 * but (without finer accounting) cannot reduce our commit if we
diff --git a/mm/msync.c b/mm/msync.c
index 632df45..a2abc04 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -30,9 +30,10 @@
  */
 SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
 {
-	unsigned long end;
+	unsigned long end, fsync_start, fsync_end;
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
+	size_t file_offset;
 	int unmapped_error = 0;
 	int error = -EINVAL;
 
@@ -77,12 +78,17 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
 			goto out_unlock;
 		}
 		file = vma->vm_file;
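+		/* Note the portion of this vma that lies inside [start, end). */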
+		fsync_start = start;
+		fsync_end = min(end, vma->vm_end);
 		start = vma->vm_end;
 		if ((flags & MS_SYNC) && file &&
 				(vma->vm_flags & VM_SHARED)) {
 			get_file(file);
 			up_read(&mm->mmap_sem);
-			error = vfs_fsync(file, 0);
+			file_offset = vma->vm_pgoff * PAGE_SIZE;
+			error = vfs_fsync_range(file,
+					file_offset + fsync_start - vma->vm_start,
+					file_offset + fsync_end - vma->vm_start - 1, 0);
 			fput(file);
 			if (error || start >= end)
 				goto out;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 0f751f2..31b9206 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -52,6 +52,10 @@ static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
 	pmd = pmd_offset(pud, addr);
 	do {
 		next = pmd_addr_end(addr, end);
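+		/* Clear huge PMD entries directly; there are no lower levels to unmap. */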
+		if (pmd_large(*pmd)) {
+			pmd_clear(pmd);
+			continue;
+		}
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
 		vunmap_pte_range(pmd, addr, next);
@@ -66,6 +70,10 @@ static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end)
 	pud = pud_offset(pgd, addr);
 	do {
 		next = pud_addr_end(addr, end);
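+		/* Likewise, clear huge PUD entries directly. */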
+		if (pud_large(*pud)) {
+			pud_clear(pud);
+			continue;
+		}
 		if (pud_none_or_clear_bad(pud))
 			continue;
 		vunmap_pmd_range(pud, addr, next);
@@ -1329,7 +1337,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
 
 	BUG_ON(in_interrupt());
 	if (flags & VM_IOREMAP) {
-		int bit = fls(size);
+		int bit = fls64((__u64)size);
 
 		if (bit > IOREMAP_MAX_ORDER)
 			bit = IOREMAP_MAX_ORDER;
-- 
1.7.0.4