[PATCH 8/9] ARM: socfpga: Import sequencer code from generated uboot

Steffen Trumtrar s.trumtrar at pengutronix.de
Thu Jan 8 03:46:23 PST 2015


From: Markus Pargmann <mpa at pengutronix.de>

This patch imports the sequencer code from uboot using the new script
scripts/socfpga_get_sequencer.

Signed-off-by: Markus Pargmann <mpa at pengutronix.de>
Signed-off-by: Steffen Trumtrar <s.trumtrar at pengutronix.de>
---
 arch/arm/mach-socfpga/include/mach/sdram_io.h      |   58 +
 arch/arm/mach-socfpga/include/mach/sequencer.c     | 6131 ++++++++++++--------
 arch/arm/mach-socfpga/include/mach/sequencer.h     |  397 +-
 .../mach-socfpga/include/mach/sequencer_defines.h  |    6 +
 arch/arm/mach-socfpga/include/mach/system.h        |   37 +
 arch/arm/mach-socfpga/include/mach/tclrpt.h        |   38 +
 6 files changed, 4015 insertions(+), 2652 deletions(-)
 create mode 100755 arch/arm/mach-socfpga/include/mach/sdram_io.h
 create mode 100644 arch/arm/mach-socfpga/include/mach/sequencer_defines.h
 create mode 100755 arch/arm/mach-socfpga/include/mach/system.h
 create mode 100755 arch/arm/mach-socfpga/include/mach/tclrpt.h

diff --git a/arch/arm/mach-socfpga/include/mach/sdram_io.h b/arch/arm/mach-socfpga/include/mach/sdram_io.h
new file mode 100755
index 000000000000..62698000f6d8
--- /dev/null
+++ b/arch/arm/mach-socfpga/include/mach/sdram_io.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright Altera Corporation (C) 2012-2014. All rights reserved
+ *
+ * SPDX-License-Identifier:    BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ *    * Neither the name of Altera Corporation nor the
+ *      names of its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL ALTERA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <mach/sdram.h>
+
+#define MGR_SELECT_MASK   0xf8000
+
+#define APB_BASE_SCC_MGR	SDR_PHYGRP_SCCGRP_ADDRESS
+#define APB_BASE_PHY_MGR	SDR_PHYGRP_PHYMGRGRP_ADDRESS
+#define APB_BASE_RW_MGR		SDR_PHYGRP_RWMGRGRP_ADDRESS
+#define APB_BASE_DATA_MGR	SDR_PHYGRP_DATAMGRGRP_ADDRESS
+#define APB_BASE_REG_FILE	SDR_PHYGRP_REGFILEGRP_ADDRESS
+#define APB_BASE_MMR		SDR_CTRLGRP_ADDRESS
+
+#define __AVL_TO_APB(ADDR) \
+	((((ADDR) & MGR_SELECT_MASK) == (BASE_PHY_MGR))  ? (APB_BASE_PHY_MGR)  | (((ADDR) >> (14-6)) & (0x1<<6))  | ((ADDR) & 0x3f) : \
+	 (((ADDR) & MGR_SELECT_MASK) == (BASE_RW_MGR))   ? (APB_BASE_RW_MGR)   | ((ADDR) & 0x1fff) : \
+ 	 (((ADDR) & MGR_SELECT_MASK) == (BASE_DATA_MGR)) ? (APB_BASE_DATA_MGR) | ((ADDR) & 0x7ff) : \
+	 (((ADDR) & MGR_SELECT_MASK) == (BASE_SCC_MGR))  ? (APB_BASE_SCC_MGR)  | ((ADDR) & 0xfff) : \
+	 (((ADDR) & MGR_SELECT_MASK) == (BASE_REG_FILE)) ? (APB_BASE_REG_FILE) | ((ADDR) & 0x7ff) : \
+	 (((ADDR) & MGR_SELECT_MASK) == (BASE_MMR))      ? (APB_BASE_MMR)      | ((ADDR) & 0xfff) : \
+	 -1)
+
+#define IOWR_32DIRECT(BASE, OFFSET, DATA) \
+	write_register(HPS_SDR_BASE, __AVL_TO_APB((uint32_t)((BASE) + (OFFSET))), DATA)
+
+#define IORD_32DIRECT(BASE, OFFSET) \
+	read_register(HPS_SDR_BASE, __AVL_TO_APB((uint32_t)((BASE) + (OFFSET))))
+	#define write_register(BASE, OFFSET, DATA) \
+		writel(DATA, ((BASE) + (OFFSET)))
+	#define read_register(BASE, OFFSET) \
+		readl((BASE) + (OFFSET))
+	#define HPS_SDR_BASE 0xffc20000
diff --git a/arch/arm/mach-socfpga/include/mach/sequencer.c b/arch/arm/mach-socfpga/include/mach/sequencer.c
index 87dc6677ec8b..259a400db49d 100644
--- a/arch/arm/mach-socfpga/include/mach/sequencer.c
+++ b/arch/arm/mach-socfpga/include/mach/sequencer.c
@@ -1,174 +1,337 @@
 /*
- * Copyright Altera Corporation (C) 2012-2014. All rights reserved
- *
- * SPDX-License-Identifier:  BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *  * Redistributions of source code must retain the above copyright
- *  notice, this list of conditions and the following disclaimer.
- *  * Redistributions in binary form must reproduce the above copyright
- *  notice, this list of conditions and the following disclaimer in the
- *  documentation and/or other materials provided with the distribution.
- *  * Neither the name of Altera Corporation nor the
- *  names of its contributors may be used to endorse or promote products
- *  derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL ALTERA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <common.h>
-#include <io.h>
-#include <mach/socfpga-regs.h>
-#include <mach/sdram.h>
-#include <mach/sequencer.h>
+* Copyright Altera Corporation (C) 2012-2014. All rights reserved
+*
+* SPDX-License-Identifier:  BSD-3-Clause
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions are met:
+*  * Redistributions of source code must retain the above copyright
+*  notice, this list of conditions and the following disclaimer.
+*  * Redistributions in binary form must reproduce the above copyright
+*  notice, this list of conditions and the following disclaimer in the
+*  documentation and/or other materials provided with the distribution.
+*  * Neither the name of Altera Corporation nor the
+*  names of its contributors may be used to endorse or promote products
+*  derived from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+* DISCLAIMED. IN NO EVENT SHALL ALTERA CORPORATION BE LIABLE FOR ANY
+* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "sequencer_defines.h"
+
+#include "system.h"
+#include "sdram_io.h"
+#include "sequencer.h"
+#include "tclrpt.h"
+
+/******************************************************************************
+ ******************************************************************************
+ ** NOTE: Special Rules for Globale Variables                                **
+ **                                                                          **
+ ** All global variables that are explicitly initialized (including          **
+ ** explicitly initialized to zero), are only initialized once, during       **
+ ** configuration time, and not again on reset.  This means that they        **
+ ** preserve their current contents across resets, which is needed for some  **
+ ** special cases involving communication with external modules.  In         **
+ ** addition, this avoids paying the price to have the memory initialized,   **
+ ** even for zeroed data, provided it is explicitly set to zero in the code, **
+ ** and doesn't rely on implicit initialization.                             **
+ ******************************************************************************
+ ******************************************************************************/
+
+// Temporary workaround to place the initial stack pointer at a safe offset from end
+#define STRINGIFY(s)		STRINGIFY_STR(s)
+#define STRINGIFY_STR(s)	#s
+asm(".global __alt_stack_pointer");
+asm("__alt_stack_pointer = " STRINGIFY(STACK_POINTER));
 
-static void IOWR_32DIRECT(uint32_t base, uint32_t ofs, uint32_t val)
-{
-	writel(val, CYCLONE5_SDR_ADDRESS + base + ofs);
-}
+#include <mach/sdram.h>
 
-static uint32_t IORD_32DIRECT(uint32_t base, uint32_t ofs)
-{
-	return readl(CYCLONE5_SDR_ADDRESS + base + ofs);
-}
+#define NEWVERSION_RDDESKEW 1
+#define NEWVERSION_WRDESKEW 1
+#define NEWVERSION_GW 1
+#define NEWVERSION_WL 1
+#define NEWVERSION_DQSEN 1
 
-/* Just to make the debugging code more uniform */
-#ifndef RW_MGR_MEM_NUMBER_OF_CS_PER_DIMM
-#define RW_MGR_MEM_NUMBER_OF_CS_PER_DIMM 0
-#endif
+// Just to make the debugging code more uniform
 
-#if HALF_RATE
-#define HALF_RATE_MODE 1
-#else
 #define HALF_RATE_MODE 0
-#endif
 
-#if QUARTER_RATE
-#define QUARTER_RATE_MODE 1
-#else
 #define QUARTER_RATE_MODE 0
-#endif
 #define DELTA_D 1
 
-#define BTFLD_FMT "%x"
+// case:56390
+// VFIFO_CONTROL_WIDTH_PER_DQS is the number of VFIFOs actually instantiated per DQS. This is always one except:
+// AV QDRII where it is 2 for x18 and x18w2, and 4 for x36 and x36w2
+// RLDRAMII x36 and x36w2 where it is 2.
+// In 12.0sp1 we set this to 4 for all of the special cases above to keep it simple.
+// In 12.0sp2 or 12.1 this should get moved to generation and unified with the same constant used in the phy mgr
+
+#define VFIFO_CONTROL_WIDTH_PER_DQS 1
+
+// In order to reduce ROM size, most of the selectable calibration steps are
+// decided at compile time based on the user's calibration mode selection,
+// as captured by the STATIC_CALIB_STEPS selection below.
+//
+// However, to support simulation-time selection of fast simulation mode, where
+// we skip everything except the bare minimum, we need a few of the steps to
+// be dynamic.  In those cases, we either use the DYNAMIC_CALIB_STEPS for the
+// check, which is based on the rtl-supplied value, or we dynamically compute the
+// value to use based on the dynamically-chosen calibration mode
+
+#define BTFLD_FMT "%lx"
+
+// For HPS running on actual hardware
+
+#define DLEVEL 0
+#ifdef HPS_HW_SERIAL_SUPPORT
+// space around comma is required for varargs macro to remove comma if args is empty
+#define DPRINT(level, fmt, args...) 	if (DLEVEL >= (level)) printf("SEQ.C: " fmt "\n" , ## args)
+#define IPRINT(fmt, args...) 	        printf("SEQ.C: " fmt "\n" , ## args)
+#else
+#define DPRINT(level, fmt, args...)
+#define IPRINT(fmt, args...)
+#endif
+#define BFM_GBL_SET(field,value)
+#define BFM_GBL_GET(field) 		((long unsigned int)0)
+#define BFM_STAGE(stage)
+#define BFM_INC_VFIFO
+#define COV(label)
+
+#define TRACE_FUNC(fmt, args...) DPRINT(1, "%s[%d]: " fmt, __func__, __LINE__ , ## args)
 
-#define STATIC_CALIB_STEPS (CALIB_SKIP_FULL_TEST)
+#define DYNAMIC_CALIB_STEPS (dyn_calib_steps)
 
-/* calibration steps requested by the rtl */
-static uint16_t dyn_calib_steps;
+#define STATIC_IN_RTL_SIM 0
 
-static uint32_t vfifo_idx;
+#define STATIC_SKIP_DELAY_LOOPS 0
 
-/*
- * To make CALIB_SKIP_DELAY_LOOPS a dynamic conditional option
- * instead of static, we use boolean logic to select between
- * non-skip and skip values
- *
- * The mask is set to include all bits when not-skipping, but is
- * zero when skipping
- */
+#define STATIC_CALIB_STEPS (STATIC_IN_RTL_SIM | CALIB_SKIP_FULL_TEST | STATIC_SKIP_DELAY_LOOPS)
+
+// calibration steps requested by the rtl
+static uint16_t dyn_calib_steps = 0;
 
-static uint16_t skip_delay_mask;	/* mask off bits when skipping/not-skipping */
+// To make CALIB_SKIP_DELAY_LOOPS a dynamic conditional option
+// instead of static, we use boolean logic to select between
+// non-skip and skip values
+//
+// The mask is set to include all bits when not-skipping, but is
+// zero when skipping
+
+static uint16_t skip_delay_mask = 0;	// mask off bits when skipping/not-skipping
 
 #define SKIP_DELAY_LOOP_VALUE_OR_ZERO(non_skip_value) \
 	((non_skip_value) & skip_delay_mask)
 
-static gbl_t *gbl;
-static param_t *param;
+// TODO: The skip group strategy is completely missing
 
-static uint32_t rw_mgr_mem_calibrate_write_test(uint32_t rank_bgn,
-	uint32_t write_group, uint32_t use_dm,
-	uint32_t all_correct, t_btfld * bit_chk, uint32_t all_ranks);
+static gbl_t *gbl = 0;
+static param_t *param = 0;
 
-/*
- * This (TEST_SIZE) is used to test handling of large roms, to make
- * sure we are sizing things correctly
- * Note, the initialized data takes up twice the space in rom, since
- * there needs to be a copy with the initial value and a copy that is
- * written too, since on soft-reset, it needs to have the initial values
- * without reloading the memory from external sources
- */
-
-static void reg_file_set_group(uint32_t set_group)
+static uint32_t curr_shadow_reg = 0;
+
+static uint32_t rw_mgr_mem_calibrate_write_test(uint32_t rank_bgn, uint32_t write_group,
+						uint32_t use_dm, uint32_t all_correct,
+						t_btfld * bit_chk, uint32_t all_ranks);
+
+// This (TEST_SIZE) is used to test handling of large roms, to make
+// sure we are sizing things correctly
+// Note, the initialized data takes up twice the space in rom, since
+// there needs to be a copy with the initial value and a copy that is
+// written too, since on soft-reset, it needs to have the initial values
+// without reloading the memory from external sources
+
+// #define TEST_SIZE    (6*1024)
+
+#ifdef TEST_SIZE
+
+#define PRE_POST_TEST_SIZE 3
+
+static unsigned int pre_test_size_mem[PRE_POST_TEST_SIZE] = { 1, 2, 3 };
+
+static unsigned int test_size_mem[TEST_SIZE / sizeof(unsigned int)] = { 100, 200, 300 };
+
+static unsigned int post_test_size_mem[PRE_POST_TEST_SIZE] = { 10, 20, 30 };
+
+static void write_test_mem(void)
+{
+	int i;
+
+	for (i = 0; i < PRE_POST_TEST_SIZE; i++) {
+		pre_test_size_mem[i] = (i + 1) * 10;
+		post_test_size_mem[i] = (i + 1);
+	}
+
+	for (i = 0; i < sizeof(test_size_mem) / sizeof(unsigned int); i++) {
+		test_size_mem[i] = i;
+	}
+
+}
+
+static int check_test_mem(int start)
+{
+	int i;
+
+	for (i = 0; i < PRE_POST_TEST_SIZE; i++) {
+		if (start) {
+			if (pre_test_size_mem[i] != (i + 1)) {
+				return 0;
+			}
+			if (post_test_size_mem[i] != (i + 1) * 10) {
+				return 0;
+			}
+		} else {
+			if (pre_test_size_mem[i] != (i + 1) * 10) {
+				return 0;
+			}
+			if (post_test_size_mem[i] != (i + 1)) {
+				return 0;
+			}
+		}
+	}
+
+	for (i = 0; i < sizeof(test_size_mem) / sizeof(unsigned int); i++) {
+		if (start) {
+			if (i < 3) {
+				if (test_size_mem[i] != (i + 1) * 100) {
+					return 0;
+				}
+			} else {
+				if (test_size_mem[i] != 0) {
+					return 0;
+				}
+			}
+		} else {
+			if (test_size_mem[i] != i) {
+				return 0;
+			}
+		}
+	}
+
+	return 1;
+}
+
+#endif // TEST_SIZE
+
+static void set_failing_group_stage(uint32_t group, uint32_t stage, uint32_t substage)
+{
+	ALTERA_ASSERT(group < RW_MGR_MEM_IF_WRITE_DQS_WIDTH);
+
+	// Only set the global stage if there was not been any other failing group
+	if (gbl->error_stage == CAL_STAGE_NIL) {
+		gbl->error_substage = substage;
+		gbl->error_stage = stage;
+		gbl->error_group = group;
+		TCLRPT_SET(debug_summary_report->error_sub_stage, substage);
+		TCLRPT_SET(debug_summary_report->error_stage, stage);
+		TCLRPT_SET(debug_summary_report->error_group, group);
+
+	}
+	// Always set the group specific errors
+	TCLRPT_SET(debug_cal_report->cal_status_per_group[curr_shadow_reg][group].error_stage,
+		   stage);
+	TCLRPT_SET(debug_cal_report->cal_status_per_group[curr_shadow_reg][group].error_sub_stage,
+		   substage);
+
+}
+
+static inline void reg_file_set_group(uint32_t set_group)
 {
-	/* Read the current group and stage */
+	// Read the current group and stage
 	uint32_t cur_stage_group = IORD_32DIRECT(REG_FILE_CUR_STAGE, 0);
 
-	/* Clear the group */
+	// Clear the group
 	cur_stage_group &= 0x0000FFFF;
 
-	/* Set the group */
+	// Set the group
 	cur_stage_group |= (set_group << 16);
 
-	/* Write the data back */
+	// Write the data back
 	IOWR_32DIRECT(REG_FILE_CUR_STAGE, 0, cur_stage_group);
 }
 
-static void reg_file_set_stage(uint32_t set_stage)
+static inline void reg_file_set_stage(uint32_t set_stage)
 {
-	/* Read the current group and stage */
+	// Read the current group and stage
 	uint32_t cur_stage_group = IORD_32DIRECT(REG_FILE_CUR_STAGE, 0);
 
-	/* Clear the stage and substage */
+	// Clear the stage and substage
 	cur_stage_group &= 0xFFFF0000;
 
-	/* Set the stage */
+	// Set the stage
 	cur_stage_group |= (set_stage & 0x000000FF);
 
-	/* Write the data back */
+	// Write the data back
 	IOWR_32DIRECT(REG_FILE_CUR_STAGE, 0, cur_stage_group);
 }
 
-static void reg_file_set_sub_stage(uint32_t set_sub_stage)
+static inline void reg_file_set_sub_stage(uint32_t set_sub_stage)
 {
-	/* Read the current group and stage */
+	// Read the current group and stage
 	uint32_t cur_stage_group = IORD_32DIRECT(REG_FILE_CUR_STAGE, 0);
 
-	/* Clear the substage */
+	// Clear the substage
 	cur_stage_group &= 0xFFFF00FF;
 
-	/* Set the sub stage */
+	// Set the sub stage
 	cur_stage_group |= ((set_sub_stage << 8) & 0x0000FF00);
 
-	/* Write the data back */
+	// Write the data back
 	IOWR_32DIRECT(REG_FILE_CUR_STAGE, 0, cur_stage_group);
 }
 
+static inline uint32_t is_write_group_enabled_for_dm(uint32_t write_group)
+{
+	return 1;
+}
+
+static inline void select_curr_shadow_reg_using_rank(uint32_t rank)
+{
+}
+
 static void initialize(void)
 {
-	/*
-	 * In Hard PHY this is a 2-bit control:
-	 * 0: AFI Mux Select
-	 * 1: DDIO Mux Select
-	 */
+	TRACE_FUNC();
+
+	//USER calibration has control over path to memory 
+
+	// In Hard PHY this is a 2-bit control:
+	// 0: AFI Mux Select
+	// 1: DDIO Mux Select
 	IOWR_32DIRECT(PHY_MGR_MUX_SEL, 0, 0x3);
 
-	/* USER memory clock is not stable we begin initialization  */
+	//USER memory clock is not stable we begin initialization 
 
 	IOWR_32DIRECT(PHY_MGR_RESET_MEM_STBL, 0, 0);
 
-	/* USER calibration status all set to zero */
+	//USER calibration status all set to zero 
 
 	IOWR_32DIRECT(PHY_MGR_CAL_STATUS, 0, 0);
 	IOWR_32DIRECT(PHY_MGR_CAL_DEBUG_INFO, 0, 0);
 
-	param->read_correct_mask_vg  = ((t_btfld)1 << (RW_MGR_MEM_DQ_PER_READ_DQS / RW_MGR_MEM_VIRTUAL_GROUPS_PER_READ_DQS)) - 1;
-	param->write_correct_mask_vg = ((t_btfld)1 << (RW_MGR_MEM_DQ_PER_READ_DQS / RW_MGR_MEM_VIRTUAL_GROUPS_PER_READ_DQS)) - 1;
-	param->read_correct_mask     = ((t_btfld)1 << RW_MGR_MEM_DQ_PER_READ_DQS) - 1;
-	param->write_correct_mask    = ((t_btfld)1 << RW_MGR_MEM_DQ_PER_WRITE_DQS) - 1;
+	if (((DYNAMIC_CALIB_STEPS) & CALIB_SKIP_ALL) != CALIB_SKIP_ALL) {
+		param->read_correct_mask_vg =
+		    ((t_btfld) 1 <<
+		     (RW_MGR_MEM_DQ_PER_READ_DQS / RW_MGR_MEM_VIRTUAL_GROUPS_PER_READ_DQS)) - 1;
+		param->write_correct_mask_vg =
+		    ((t_btfld) 1 <<
+		     (RW_MGR_MEM_DQ_PER_READ_DQS / RW_MGR_MEM_VIRTUAL_GROUPS_PER_READ_DQS)) - 1;
+		param->read_correct_mask = ((t_btfld) 1 << RW_MGR_MEM_DQ_PER_READ_DQS) - 1;
+		param->write_correct_mask = ((t_btfld) 1 << RW_MGR_MEM_DQ_PER_WRITE_DQS) - 1;
+		param->dm_correct_mask =
+		    ((t_btfld) 1 << (RW_MGR_MEM_DATA_WIDTH / RW_MGR_MEM_DATA_MASK_WIDTH)) - 1;
+	}
 }
 
-#if DDR3
 static void set_rank_and_odt_mask(uint32_t rank, uint32_t odt_mode)
 {
 	uint32_t odt_mask_0 = 0;
@@ -176,70 +339,93 @@ static void set_rank_and_odt_mask(uint32_t rank, uint32_t odt_mode)
 	uint32_t cs_and_odt_mask;
 
 	if (odt_mode == RW_MGR_ODT_MODE_READ_WRITE) {
-		if (RW_MGR_MEM_NUMBER_OF_RANKS == 1) {
-			/*
-			 * 1 Rank
-			 * Read: ODT = 0
-			 * Write: ODT = 1
-			 */
+
+		if (LRDIMM) {
+			// USER LRDIMMs have two cases to consider: single-slot and dual-slot.
+			// USER In single-slot, assert ODT for write only.
+			// USER In dual-slot, assert ODT for both slots for write,
+			// USER and on the opposite slot only for reads.
+			// USER
+			// USER Further complicating this is that both DIMMs have either 1 or 2 ODT
+			// USER inputs, which do the same thing (only one is actually required).
+			if ((RW_MGR_MEM_CHIP_SELECT_WIDTH / RW_MGR_MEM_NUMBER_OF_CS_PER_DIMM) == 1) {
+				// USER Single-slot case
+				if (RW_MGR_MEM_ODT_WIDTH == 1) {
+					// USER Read = 0, Write = 1
+					odt_mask_0 = 0x0;
+					odt_mask_1 = 0x1;
+				} else if (RW_MGR_MEM_ODT_WIDTH == 2) {
+					// USER Read = 00, Write = 11
+					odt_mask_0 = 0x0;
+					odt_mask_1 = 0x3;
+				}
+			} else if ((RW_MGR_MEM_CHIP_SELECT_WIDTH / RW_MGR_MEM_NUMBER_OF_CS_PER_DIMM)
+				   == 2) {
+				// USER Dual-slot case
+				if (RW_MGR_MEM_ODT_WIDTH == 2) {
+					// USER Read: asserted for opposite slot, Write: asserted for both
+					odt_mask_0 = (rank < 2) ? 0x2 : 0x1;
+					odt_mask_1 = 0x3;
+				} else if (RW_MGR_MEM_ODT_WIDTH == 4) {
+					// USER Read: asserted for opposite slot, Write: asserted for both
+					odt_mask_0 = (rank < 2) ? 0xC : 0x3;
+					odt_mask_1 = 0xF;
+				}
+			}
+		} else if (RW_MGR_MEM_NUMBER_OF_RANKS == 1) {
+			//USER 1 Rank
+			//USER Read: ODT = 0
+			//USER Write: ODT = 1
 			odt_mask_0 = 0x0;
 			odt_mask_1 = 0x1;
 		} else if (RW_MGR_MEM_NUMBER_OF_RANKS == 2) {
-			/* 2 Ranks */
+			//USER 2 Ranks
 			if (RW_MGR_MEM_NUMBER_OF_CS_PER_DIMM == 1 ||
-			   (RDIMM && RW_MGR_MEM_NUMBER_OF_CS_PER_DIMM == 2
-			   && RW_MGR_MEM_CHIP_SELECT_WIDTH == 4)) {
-				/* - Dual-Slot , Single-Rank
-				 * (1 chip-select per DIMM)
-				 * OR
-				 * - RDIMM, 4 total CS (2 CS per DIMM)
-				 * means 2 DIMM
-				 * Since MEM_NUMBER_OF_RANKS is 2 they are
-				 * both single rank
-				 * with 2 CS each (special for RDIMM)
-				 * Read: Turn on ODT on the opposite rank
-				 * Write: Turn on ODT on all ranks
-				 */
+			    (RDIMM && RW_MGR_MEM_NUMBER_OF_CS_PER_DIMM == 2
+			     && RW_MGR_MEM_CHIP_SELECT_WIDTH == 4)) {
+				//USER - Dual-Slot , Single-Rank (1 chip-select per DIMM)
+				//USER OR
+				//USER - RDIMM, 4 total CS (2 CS per DIMM) means 2 DIMM
+				//USER Since MEM_NUMBER_OF_RANKS is 2 they are both single rank
+				//USER with 2 CS each (special for RDIMM)
+				//USER Read: Turn on ODT on the opposite rank
+				//USER Write: Turn on ODT on all ranks
 				odt_mask_0 = 0x3 & ~(1 << rank);
 				odt_mask_1 = 0x3;
 			} else {
-				/*
-				 * USER - Single-Slot , Dual-rank DIMMs
-				 * (2 chip-selects per DIMM)
-				 * USER Read: Turn on ODT off on all ranks
-				 * USER Write: Turn on ODT on active rank
-				 */
+				//USER - Single-Slot , Dual-rank DIMMs (2 chip-selects per DIMM)
+				//USER Read: Turn on ODT off on all ranks
+				//USER Write: Turn on ODT on active rank
 				odt_mask_0 = 0x0;
 				odt_mask_1 = 0x3 & (1 << rank);
 			}
-				} else {
-			/* 4 Ranks
-			 * Read:
-			 * ----------+-----------------------+
-			 *           |                       |
-			 *           |         ODT           |
-			 * Read From +-----------------------+
-			 *   Rank    |  3  |  2  |  1  |  0  |
-			 * ----------+-----+-----+-----+-----+
-			 *     0     |  0  |  1  |  0  |  0  |
-			 *     1     |  1  |  0  |  0  |  0  |
-			 *     2     |  0  |  0  |  0  |  1  |
-			 *     3     |  0  |  0  |  1  |  0  |
-			 * ----------+-----+-----+-----+-----+
-			 *
-			 * Write:
-			 * ----------+-----------------------+
-			 *           |                       |
-			 *           |         ODT           |
-			 * Write To  +-----------------------+
-			 *   Rank    |  3  |  2  |  1  |  0  |
-			 * ----------+-----+-----+-----+-----+
-			 *     0     |  0  |  1  |  0  |  1  |
-			 *     1     |  1  |  0  |  1  |  0  |
-			 *     2     |  0  |  1  |  0  |  1  |
-			 *     3     |  1  |  0  |  1  |  0  |
-			 * ----------+-----+-----+-----+-----+
-			 */
+		} else {
+			//USER 4 Ranks
+			//USER Read:
+			//USER ----------+-----------------------+
+			//USER           |                       |
+			//USER           |         ODT           |
+			//USER Read From +-----------------------+
+			//USER   Rank    |  3  |  2  |  1  |  0  |
+			//USER ----------+-----+-----+-----+-----+
+			//USER     0     |  0  |  1  |  0  |  0  |
+			//USER     1     |  1  |  0  |  0  |  0  |
+			//USER     2     |  0  |  0  |  0  |  1  |
+			//USER     3     |  0  |  0  |  1  |  0  |
+			//USER ----------+-----+-----+-----+-----+
+			//USER
+			//USER Write:
+			//USER ----------+-----------------------+
+			//USER           |                       |
+			//USER           |         ODT           |
+			//USER Write To  +-----------------------+
+			//USER   Rank    |  3  |  2  |  1  |  0  |
+			//USER ----------+-----+-----+-----+-----+
+			//USER     0     |  0  |  1  |  0  |  1  |
+			//USER     1     |  1  |  0  |  1  |  0  |
+			//USER     2     |  0  |  1  |  0  |  1  |
+			//USER     3     |  1  |  0  |  1  |  0  |
+			//USER ----------+-----+-----+-----+-----+
 			switch (rank) {
 			case 0:
 				odt_mask_0 = 0x4;
@@ -265,169 +451,93 @@ static void set_rank_and_odt_mask(uint32_t rank, uint32_t odt_mode)
 	}
 
 	if (RDIMM && RW_MGR_MEM_NUMBER_OF_CS_PER_DIMM == 2
-		&& RW_MGR_MEM_CHIP_SELECT_WIDTH == 4
-		&& RW_MGR_MEM_NUMBER_OF_RANKS == 2) {
-		/* See RDIMM special case above */
+	    && RW_MGR_MEM_CHIP_SELECT_WIDTH == 4 && RW_MGR_MEM_NUMBER_OF_RANKS == 2) {
+		//USER See RDIMM special case above
 		cs_and_odt_mask =
-			(0xFF & ~(1 << (2*rank))) |
-			((0xFF & odt_mask_0) << 8) |
-			((0xFF & odt_mask_1) << 16);
+		    (0xFF & ~(1 << (2 * rank))) |
+		    ((0xFF & odt_mask_0) << 8) | ((0xFF & odt_mask_1) << 16);
+	} else if (LRDIMM) {
 	} else {
 		cs_and_odt_mask =
-			(0xFF & ~(1 << rank)) |
-			((0xFF & odt_mask_0) << 8) |
-			((0xFF & odt_mask_1) << 16);
+		    (0xFF & ~(1 << rank)) |
+		    ((0xFF & odt_mask_0) << 8) | ((0xFF & odt_mask_1) << 16);
 	}
 
 	IOWR_32DIRECT(RW_MGR_SET_CS_AND_ODT_MASK, 0, cs_and_odt_mask);
 }
-#else
-static void set_rank_and_odt_mask(uint32_t rank, uint32_t odt_mode)
-{
-	uint32_t odt_mask_0 = 0;
-	uint32_t odt_mask_1 = 0;
-	uint32_t cs_and_odt_mask;
-
-	if (odt_mode == RW_MGR_ODT_MODE_READ_WRITE) {
-		if (RW_MGR_MEM_NUMBER_OF_RANKS == 1) {
-			/*
-			 * 1 Rank
-			 * Read: ODT = 0
-			 * Write: ODT = 1
-			 */
-			odt_mask_0 = 0x0;
-			odt_mask_1 = 0x1;
-		} else if (RW_MGR_MEM_NUMBER_OF_RANKS == 2) {
-			/* 2 Ranks */
-			if (RW_MGR_MEM_NUMBER_OF_CS_PER_DIMM == 1) {
-				/* USER - Dual-Slot ,
-				 * Single-Rank (1 chip-select per DIMM)
-				 * OR
-				 * - RDIMM, 4 total CS (2 CS per DIMM) means
-				 * 2 DIMM
-				 * Since MEM_NUMBER_OF_RANKS is 2 they are both
-				 * single rank with 2 CS each (special for
-				 * RDIMM)
-				 * Read/Write: Turn on ODT on the opposite rank
-				 */
-				odt_mask_0 = 0x3 & ~(1 << rank);
-				odt_mask_1 = 0x3 & ~(1 << rank);
-			} else {
-				/*
-				 * USER - Single-Slot , Dual-rank DIMMs
-				 * (2 chip-selects per DIMM)
-				 * Read: Turn on ODT off on all ranks
-				 * Write: Turn on ODT on active rank
-				 */
-				odt_mask_0 = 0x0;
-				odt_mask_1 = 0x3 & (1 << rank);
-			}
-		} else {
-			/*
-			 * 4 Ranks
-			 * Read/Write:
-			 * -----------+-----------------------+
-			 *            |                       |
-			 *            |         ODT           |
-			 * Read/Write |                       |
-			 *   From     +-----------------------+
-			 *   Rank     |  3  |  2  |  1  |  0  |
-			 * -----------+-----+-----+-----+-----+
-			 *     0      |  0  |  1  |  0  |  0  |
-			 *     1      |  1  |  0  |  0  |  0  |
-			 *     2      |  0  |  0  |  0  |  1  |
-			 *     3      |  0  |  0  |  1  |  0  |
-			 * -----------+-----+-----+-----+-----+
-			 */
-			switch (rank) {
-			case 0:
-				odt_mask_0 = 0x4;
-				odt_mask_1 = 0x4;
-				break;
-			case 1:
-				odt_mask_0 = 0x8;
-				odt_mask_1 = 0x8;
-				break;
-			case 2:
-				odt_mask_0 = 0x1;
-				odt_mask_1 = 0x1;
-				break;
-			case 3:
-				odt_mask_0 = 0x2;
-				odt_mask_1 = 0x2;
-				break;
-			}
-		}
-	} else {
-		odt_mask_0 = 0x0;
-		odt_mask_1 = 0x0;
-	}
-
-	cs_and_odt_mask = (0xFF & ~(1 << rank)) |
-		((0xFF & odt_mask_0) << 8) |
-		((0xFF & odt_mask_1) << 16);
 
-	IOWR_32DIRECT(RW_MGR_SET_CS_AND_ODT_MASK, 0, cs_and_odt_mask);
+//USER Given a rank, select the set of shadow registers that is responsible for the
+//USER delays of such rank, so that subsequent SCC updates will go to those shadow
+//USER registers. 
+static void select_shadow_regs_for_update(uint32_t rank, uint32_t group,
+					  uint32_t update_scan_chains)
+{
 }
-#endif
 
 static void scc_mgr_initialize(void)
 {
-	/*
-	 * Clear register file for HPS
-	 * 16 (2^4) is the size of the full register file in the scc mgr:
-	 *	RFILE_DEPTH = log2(MEM_DQ_PER_DQS + 1 + MEM_DM_PER_DQS +
-	 * MEM_IF_READ_DQS_WIDTH - 1) + 1;
-	 */
+	// Clear register file for HPS
+	// 16 (2^4) is the size of the full register file in the scc mgr:
+	//      RFILE_DEPTH = log2(MEM_DQ_PER_DQS + 1 + MEM_DM_PER_DQS + MEM_IF_READ_DQS_WIDTH - 1) + 1;
 	uint32_t i;
 	for (i = 0; i < 16; i++) {
-		pr_debug("Clearing SCC RFILE index %u\n", i);
+		DPRINT(1, "Clearing SCC RFILE index %lu", i);
 		IOWR_32DIRECT(SCC_MGR_HHP_RFILE, i << 2, 0);
 	}
 }
 
-static void scc_mgr_set_dqs_bus_in_delay(uint32_t read_group, uint32_t delay)
+static inline void scc_mgr_set_dqs_bus_in_delay(uint32_t read_group, uint32_t delay)
 {
 	ALTERA_ASSERT(read_group < RW_MGR_MEM_IF_READ_DQS_WIDTH);
 
-	/* Load the setting in the SCC manager */
+	// Load the setting in the SCC manager
 	WRITE_SCC_DQS_IN_DELAY(read_group, delay);
+
+	// Make the setting in the TCL report
+	TCLRPT_SET(debug_cal_report->cal_dqs_in_settings[curr_shadow_reg][read_group].
+		   dqs_bus_in_delay, delay);
+
 }
 
-static void scc_mgr_set_dqs_io_in_delay(uint32_t write_group,
-	uint32_t delay)
+static inline void scc_mgr_set_dqs_io_in_delay(uint32_t write_group, uint32_t delay)
 {
 	ALTERA_ASSERT(write_group < RW_MGR_MEM_IF_WRITE_DQS_WIDTH);
 
-	/* Load the setting in the SCC manager */
+	// Load the setting in the SCC manager
 	WRITE_SCC_DQS_IO_IN_DELAY(delay);
+
+	// Make the setting in the TCL report
+	TCLRPT_SET(debug_cal_report->cal_dqs_out_settings[curr_shadow_reg][write_group].
+		   dqs_io_in_delay, delay);
+
 }
 
-static void scc_mgr_set_dqs_en_phase(uint32_t read_group, uint32_t phase)
+static inline void scc_mgr_set_dqs_en_phase(uint32_t read_group, uint32_t phase)
 {
 	ALTERA_ASSERT(read_group < RW_MGR_MEM_IF_READ_DQS_WIDTH);
 
-	/* Load the setting in the SCC manager */
+	// Load the setting in the SCC manager
 	WRITE_SCC_DQS_EN_PHASE(read_group, phase);
+
+	// Make the setting in the TCL report
+	TCLRPT_SET(debug_cal_report->cal_dqs_in_settings[curr_shadow_reg][read_group].dqs_en_phase,
+		   phase);
+
 }
 
-static void scc_mgr_set_dqs_en_phase_all_ranks (uint32_t read_group, uint32_t phase)
+static void scc_mgr_set_dqs_en_phase_all_ranks(uint32_t read_group, uint32_t phase)
 {
 	uint32_t r;
 	uint32_t update_scan_chains;
 
-	for (r = 0; r < RW_MGR_MEM_NUMBER_OF_RANKS;
-		r += NUM_RANKS_PER_SHADOW_REG) {
-		/*
-		 * USER although the h/w doesn't support different phases per
-		 * shadow register, for simplicity our scc manager modeling
-		 * keeps different phase settings per shadow reg, and it's
-		 * important for us to keep them in sync to match h/w.
-		 * for efficiency, the scan chain update should occur only
-		 * once to sr0.
-		 */
+	for (r = 0; r < RW_MGR_MEM_NUMBER_OF_RANKS; r += NUM_RANKS_PER_SHADOW_REG) {
+		//USER although the h/w doesn't support different phases per shadow register,
+		//USER for simplicity our scc manager modeling keeps different phase settings per 
+		//USER shadow reg, and it's important for us to keep them in sync to match h/w.
+		//USER for efficiency, the scan chain update should occur only once to sr0.
 		update_scan_chains = (r == 0) ? 1 : 0;
 
+		select_shadow_regs_for_update(r, read_group, update_scan_chains);
 		scc_mgr_set_dqs_en_phase(read_group, phase);
 
 		if (update_scan_chains) {
@@ -437,33 +547,32 @@ static void scc_mgr_set_dqs_en_phase_all_ranks (uint32_t read_group, uint32_t ph
 	}
 }
 
-static void scc_mgr_set_dqdqs_output_phase(uint32_t write_group,
-	uint32_t phase)
+static inline void scc_mgr_set_dqdqs_output_phase(uint32_t write_group, uint32_t phase)
 {
 	ALTERA_ASSERT(write_group < RW_MGR_MEM_IF_WRITE_DQS_WIDTH);
 
-	/* Load the setting in the SCC manager */
+	// Load the setting in the SCC manager
 	WRITE_SCC_DQDQS_OUT_PHASE(write_group, phase);
+
+	// Make the setting in the TCL report
+	TCLRPT_SET(debug_cal_report->cal_dqs_out_settings[curr_shadow_reg][write_group].
+		   dqdqs_out_phase, phase);
+
 }
 
-static void scc_mgr_set_dqdqs_output_phase_all_ranks (uint32_t write_group,
-	uint32_t phase)
+static void scc_mgr_set_dqdqs_output_phase_all_ranks(uint32_t write_group, uint32_t phase)
 {
 	uint32_t r;
 	uint32_t update_scan_chains;
 
-	for (r = 0; r < RW_MGR_MEM_NUMBER_OF_RANKS;
-		r += NUM_RANKS_PER_SHADOW_REG) {
-		/*
-		 * USER although the h/w doesn't support different phases per
-		 * shadow register, for simplicity our scc manager modeling
-		 * keeps different phase settings per shadow reg, and it's
-		 * important for us to keep them in sync to match h/w.
-		 * for efficiency, the scan chain update should occur only
-		 * once to sr0.
-		 */
+	for (r = 0; r < RW_MGR_MEM_NUMBER_OF_RANKS; r += NUM_RANKS_PER_SHADOW_REG) {
+		//USER although the h/w doesn't support different phases per shadow register,
+		//USER for simplicity our scc manager modeling keeps different phase settings per 
+		//USER shadow reg, and it's important for us to keep them in sync to match h/w.
+		//USER for efficiency, the scan chain update should occur only once to sr0.
 		update_scan_chains = (r == 0) ? 1 : 0;
 
+		select_shadow_regs_for_update(r, write_group, update_scan_chains);
 		scc_mgr_set_dqdqs_output_phase(write_group, phase);
 
 		if (update_scan_chains) {
@@ -473,32 +582,36 @@ static void scc_mgr_set_dqdqs_output_phase_all_ranks (uint32_t write_group,
 	}
 }
 
-static void scc_mgr_set_dqs_en_delay(uint32_t read_group, uint32_t delay)
+static inline void scc_mgr_set_dqs_en_delay(uint32_t read_group, uint32_t delay)
 {
 	ALTERA_ASSERT(read_group < RW_MGR_MEM_IF_READ_DQS_WIDTH);
 
-	/* Load the setting in the SCC manager */
+	// Load the setting in the SCC manager
 	WRITE_SCC_DQS_EN_DELAY(read_group, delay);
+
+	// Make the setting in the TCL report
+	TCLRPT_SET(debug_cal_report->cal_dqs_in_settings[curr_shadow_reg][read_group].dqs_en_delay,
+		   delay);
+
 }
 
-static void scc_mgr_set_dqs_en_delay_all_ranks (uint32_t read_group, uint32_t delay)
+static void scc_mgr_set_dqs_en_delay_all_ranks(uint32_t read_group, uint32_t delay)
 {
 	uint32_t r;
 
 	for (r = 0; r < RW_MGR_MEM_NUMBER_OF_RANKS; r += NUM_RANKS_PER_SHADOW_REG) {
 
+		select_shadow_regs_for_update(r, read_group, 0);
+
 		scc_mgr_set_dqs_en_delay(read_group, delay);
 
 		IOWR_32DIRECT(SCC_MGR_DQS_ENA, 0, read_group);
 
-		/*
-		 * In shadow register mode, the T11 settings are stored in
-		 * registers in the core, which are updated by the DQS_ENA
-		 * signals. Not issuing the SCC_MGR_UPD command allows us to
-		 * save lots of rank switching overhead, by calling
-		 * select_shadow_regs_for_update with update_scan_chains
-		 * set to 0.
-		 */
+		// In shadow register mode, the T11 settings are stored in registers
+		// in the core, which are updated by the DQS_ENA signals. Not issuing
+		// the SCC_MGR_UPD command allows us to save lots of rank switching
+		// overhead, by calling select_shadow_regs_for_update with update_scan_chains
+		// set to 0.
 		IOWR_32DIRECT(SCC_MGR_UPD, 0, 0);
 	}
 }
@@ -509,17 +622,23 @@ static void scc_mgr_set_oct_out1_delay(uint32_t write_group, uint32_t delay)
 
 	ALTERA_ASSERT(write_group < RW_MGR_MEM_IF_WRITE_DQS_WIDTH);
 
-	/*
-	 * Load the setting in the SCC manager
-	 * Although OCT affects only write data, the OCT delay is controlled
-	 * by the DQS logic block which is instantiated once per read group.
-	 * For protocols where a write group consists of multiple read groups,
-	 * the setting must be set multiple times.
-	 */
-	for (read_group = write_group * RW_MGR_NUM_DQS_PER_WRITE_GROUP;
-		read_group < (write_group + 1) * RW_MGR_NUM_DQS_PER_WRITE_GROUP;
-		 ++read_group)
+	// Load the setting in the SCC manager
+	// Although OCT affects only write data, the OCT delay is controlled by the DQS logic block
+	// which is instantiated once per read group. For protocols where a write group consists
+	// of multiple read groups, the setting must be set multiple times.
+	for (read_group =
+	     write_group * RW_MGR_MEM_IF_READ_DQS_WIDTH / RW_MGR_MEM_IF_WRITE_DQS_WIDTH;
+	     read_group <
+	     (write_group + 1) * RW_MGR_MEM_IF_READ_DQS_WIDTH / RW_MGR_MEM_IF_WRITE_DQS_WIDTH;
+	     ++read_group) {
+
 		WRITE_SCC_OCT_OUT1_DELAY(read_group, delay);
+	}
+
+	// Make the setting in the TCL report
+	TCLRPT_SET(debug_cal_report->cal_dqs_out_settings[curr_shadow_reg][write_group].
+		   oct_out_delay1, delay);
+
 }
 
 static void scc_mgr_set_oct_out2_delay(uint32_t write_group, uint32_t delay)
@@ -528,147 +647,207 @@ static void scc_mgr_set_oct_out2_delay(uint32_t write_group, uint32_t delay)
 
 	ALTERA_ASSERT(write_group < RW_MGR_MEM_IF_WRITE_DQS_WIDTH);
 
-	/*
-	 * Load the setting in the SCC manager
-	 * Although OCT affects only write data, the OCT delay is controlled
-	 * by the DQS logic block which is instantiated once per read group.
-	 * For protocols where a write group consists
-	 * of multiple read groups, the setting must be set multiple times.
-	 */
-	for (read_group = write_group * RW_MGR_NUM_DQS_PER_WRITE_GROUP;
-		read_group < (write_group + 1) * RW_MGR_NUM_DQS_PER_WRITE_GROUP;
-		 ++read_group)
+	// Load the setting in the SCC manager
+	// Although OCT affects only write data, the OCT delay is controlled by the DQS logic block
+	// which is instantiated once per read group. For protocols where a write group consists
+	// of multiple read groups, the setting must be set multiple times.
+	for (read_group =
+	     write_group * RW_MGR_MEM_IF_READ_DQS_WIDTH / RW_MGR_MEM_IF_WRITE_DQS_WIDTH;
+	     read_group <
+	     (write_group + 1) * RW_MGR_MEM_IF_READ_DQS_WIDTH / RW_MGR_MEM_IF_WRITE_DQS_WIDTH;
+	     ++read_group) {
+
 		WRITE_SCC_OCT_OUT2_DELAY(read_group, delay);
+	}
+
+	// Make the setting in the TCL report
+	TCLRPT_SET(debug_cal_report->cal_dqs_out_settings[curr_shadow_reg][write_group].
+		   oct_out_delay2, delay);
+
 }
 
-static void scc_mgr_set_dq_out1_delay(uint32_t write_group,
-	uint32_t dq_in_group, uint32_t delay)
+static inline void scc_mgr_set_dqs_bypass(uint32_t write_group, uint32_t bypass)
 {
+	// Load the setting in the SCC manager
+	WRITE_SCC_DQS_BYPASS(write_group, bypass);
+}
+
+static inline void scc_mgr_set_dq_out1_delay(uint32_t write_group, uint32_t dq_in_group,
+					     uint32_t delay)
+{
+
 	ALTERA_ASSERT(write_group < RW_MGR_MEM_IF_WRITE_DQS_WIDTH);
 	ALTERA_ASSERT(dq < RW_MGR_MEM_DATA_WIDTH);
 
-	/* Load the setting in the SCC manager */
+	// Load the setting in the SCC manager
 	WRITE_SCC_DQ_OUT1_DELAY(dq_in_group, delay);
+
+	// Make the setting in the TCL report
+	TCLRPT_SET(debug_cal_report->cal_dq_settings[curr_shadow_reg][dq].dq_out_delay1, delay);
+
 }
 
-static void scc_mgr_set_dq_out2_delay(uint32_t write_group,
-	uint32_t dq_in_group, uint32_t delay)
+static inline void scc_mgr_set_dq_out2_delay(uint32_t write_group, uint32_t dq_in_group,
+					     uint32_t delay)
 {
+
 	ALTERA_ASSERT(write_group < RW_MGR_MEM_IF_WRITE_DQS_WIDTH);
 	ALTERA_ASSERT(dq < RW_MGR_MEM_DATA_WIDTH);
 
-	/* Load the setting in the SCC manager */
+	// Load the setting in the SCC manager
 	WRITE_SCC_DQ_OUT2_DELAY(dq_in_group, delay);
+
+	// Make the setting in the TCL report
+	TCLRPT_SET(debug_cal_report->cal_dq_settings[curr_shadow_reg][dq].dq_out_delay2, delay);
+
 }
 
-static void scc_mgr_set_dq_in_delay(uint32_t write_group,
-	uint32_t dq_in_group, uint32_t delay)
+static inline void scc_mgr_set_dq_in_delay(uint32_t write_group, uint32_t dq_in_group,
+					   uint32_t delay)
 {
+
 	ALTERA_ASSERT(write_group < RW_MGR_MEM_IF_WRITE_DQS_WIDTH);
 	ALTERA_ASSERT(dq < RW_MGR_MEM_DATA_WIDTH);
 
-	/* Load the setting in the SCC manager */
+	// Load the setting in the SCC manager
 	WRITE_SCC_DQ_IN_DELAY(dq_in_group, delay);
+
+	// Make the setting in the TCL report
+	TCLRPT_SET(debug_cal_report->cal_dq_settings[curr_shadow_reg][dq].dq_in_delay, delay);
+
+}
+
+static inline void scc_mgr_set_dq_bypass(uint32_t write_group, uint32_t dq_in_group,
+					 uint32_t bypass)
+{
+	// Load the setting in the SCC manager
+	WRITE_SCC_DQ_BYPASS(dq_in_group, bypass);
+}
+
+static inline void scc_mgr_set_rfifo_mode(uint32_t write_group, uint32_t dq_in_group, uint32_t mode)
+{
+	// Load the setting in the SCC manager
+	WRITE_SCC_RFIFO_MODE(dq_in_group, mode);
 }
 
-static void scc_mgr_set_hhp_extras(void)
+static inline void scc_mgr_set_hhp_extras(void)
 {
-	/*
-	 * Load the fixed setting in the SCC manager
-	 * bits: 0:0 = 1'b1   - dqs bypass
-	 * bits: 1:1 = 1'b1   - dq bypass
-	 * bits: 4:2 = 3'b001   - rfifo_mode
-	 * bits: 6:5 = 2'b01  - rfifo clock_select
-	 * bits: 7:7 = 1'b0  - separate gating from ungating setting
-	 * bits: 8:8 = 1'b0  - separate OE from Output delay setting
-	 */
-	uint32_t value = (0<<8) | (0<<7) | (1<<5) | (1<<2) | (1<<1) | (1<<0);
+	// Load the fixed setting in the SCC manager
+	// bits: 0:0 = 1'b1   - dqs bypass
+	// bits: 1:1 = 1'b1   - dq bypass
+	// bits: 4:2 = 3'b001   - rfifo_mode
+	// bits: 6:5 = 2'b01  - rfifo clock_select
+	// bits: 7:7 = 1'b0  - separate gating from ungating setting
+	// bits: 8:8 = 1'b0  - separate OE from Output delay setting
+	uint32_t value = (0 << 8) | (0 << 7) | (1 << 5) | (1 << 2) | (1 << 1) | (1 << 0);
 	WRITE_SCC_HHP_EXTRAS(value);
 }
 
-static void scc_mgr_set_dqs_out1_delay(uint32_t write_group,
-	uint32_t delay)
+static inline void scc_mgr_set_hhp_dqse_map(void)
+{
+	// Load the fixed setting in the SCC manager
+	WRITE_SCC_HHP_DQSE_MAP(0);
+}
+
+static inline void scc_mgr_set_dqs_out1_delay(uint32_t write_group, uint32_t delay)
 {
 	ALTERA_ASSERT(write_group < RW_MGR_MEM_IF_WRITE_DQS_WIDTH);
 
-	/* Load the setting in the SCC manager */
+	// Load the setting in the SCC manager
 	WRITE_SCC_DQS_IO_OUT1_DELAY(delay);
+
+	// Make the setting in the TCL report
+	TCLRPT_SET(debug_cal_report->cal_dqs_out_settings[curr_shadow_reg][write_group].
+		   dqs_out_delay1, delay);
+
 }
 
-static void scc_mgr_set_dqs_out2_delay(uint32_t write_group, uint32_t delay)
+static inline void scc_mgr_set_dqs_out2_delay(uint32_t write_group, uint32_t delay)
 {
 	ALTERA_ASSERT(write_group < RW_MGR_MEM_IF_WRITE_DQS_WIDTH);
 
-	/* Load the setting in the SCC manager */
+	// Load the setting in the SCC manager
 	WRITE_SCC_DQS_IO_OUT2_DELAY(delay);
+
+	// Make the setting in the TCL report
+	TCLRPT_SET(debug_cal_report->cal_dqs_out_settings[curr_shadow_reg][write_group].
+		   dqs_out_delay2, delay);
+
 }
 
-static void scc_mgr_set_dm_out1_delay(uint32_t write_group,
-	uint32_t dm, uint32_t delay)
+static inline void scc_mgr_set_dm_out1_delay(uint32_t write_group, uint32_t dm, uint32_t delay)
 {
 	ALTERA_ASSERT(write_group < RW_MGR_MEM_IF_WRITE_DQS_WIDTH);
 	ALTERA_ASSERT(dm < RW_MGR_NUM_DM_PER_WRITE_GROUP);
 
-	/* Load the setting in the SCC manager */
+	// Load the setting in the SCC manager
 	WRITE_SCC_DM_IO_OUT1_DELAY(dm, delay);
+
+	// Make the setting in the TCL report
+
+	if (RW_MGR_NUM_TRUE_DM_PER_WRITE_GROUP > 0) {
+		TCLRPT_SET(debug_cal_report->cal_dm_settings[curr_shadow_reg][write_group][dm].
+			   dm_out_delay1, delay);
+	}
 }
 
-static void scc_mgr_set_dm_out2_delay(uint32_t write_group, uint32_t dm,
-	uint32_t delay)
+static inline void scc_mgr_set_dm_out2_delay(uint32_t write_group, uint32_t dm, uint32_t delay)
 {
 	ALTERA_ASSERT(write_group < RW_MGR_MEM_IF_WRITE_DQS_WIDTH);
 	ALTERA_ASSERT(dm < RW_MGR_NUM_DM_PER_WRITE_GROUP);
 
-	/* Load the setting in the SCC manager */
+	// Load the setting in the SCC manager
 	WRITE_SCC_DM_IO_OUT2_DELAY(dm, delay);
+
+	// Make the setting in the TCL report
+
+	if (RW_MGR_NUM_TRUE_DM_PER_WRITE_GROUP > 0) {
+		TCLRPT_SET(debug_cal_report->cal_dm_settings[curr_shadow_reg][write_group][dm].
+			   dm_out_delay2, delay);
+	}
 }
 
-static void scc_mgr_set_dm_in_delay(uint32_t write_group,
-	uint32_t dm, uint32_t delay)
+static inline void scc_mgr_set_dm_in_delay(uint32_t write_group, uint32_t dm, uint32_t delay)
 {
 	ALTERA_ASSERT(write_group < RW_MGR_MEM_IF_WRITE_DQS_WIDTH);
 	ALTERA_ASSERT(dm < RW_MGR_NUM_DM_PER_WRITE_GROUP);
 
-	/* Load the setting in the SCC manager */
+	// Load the setting in the SCC manager
 	WRITE_SCC_DM_IO_IN_DELAY(dm, delay);
+
+	// Make the setting in the TCL report
+
+	if (RW_MGR_NUM_TRUE_DM_PER_WRITE_GROUP > 0) {
+		TCLRPT_SET(debug_cal_report->cal_dm_settings[curr_shadow_reg][write_group][dm].
+			   dm_in_delay, delay);
+	}
 }
 
-static void scc_mgr_load_dqs_for_write_group (uint32_t write_group)
+static inline void scc_mgr_set_dm_bypass(uint32_t write_group, uint32_t dm, uint32_t bypass)
 {
-	uint32_t read_group;
-
-	/*
-	 * Although OCT affects only write data, the OCT delay is controlled
-	 * by the DQS logic block which is instantiated once per read group.
-	 * For protocols where a write group consists of multiple read groups,
-	 * the setting must be scanned multiple times.
-	 */
-	for (read_group = write_group * RW_MGR_NUM_DQS_PER_WRITE_GROUP;
-		read_group < (write_group + 1) * RW_MGR_NUM_DQS_PER_WRITE_GROUP;
-		++read_group)
-		IOWR_32DIRECT(SCC_MGR_DQS_ENA, 0, read_group);
+	// Load the setting in the SCC manager
+	WRITE_SCC_DM_BYPASS(dm, bypass);
 }
 
-/*
- * USER Zero all DQS config
- * TODO: maybe rename to scc_mgr_zero_dqs_config (or something)
- */
-static void scc_mgr_zero_all (void)
+//USER Zero all DQS config
+// TODO: maybe rename to scc_mgr_zero_dqs_config (or something)
+static void scc_mgr_zero_all(void)
 {
 	uint32_t i, r;
 
-	/*
-	 * USER Zero all DQS config settings, across all groups and all
-	 * shadow registers
-	 */
+	//USER Zero all DQS config settings, across all groups and all shadow registers
 	for (r = 0; r < RW_MGR_MEM_NUMBER_OF_RANKS; r += NUM_RANKS_PER_SHADOW_REG) {
 
+		// Strictly speaking this should be called once per group to make
+		// sure each group's delay chain is refreshed from the SCC register file,
+		// but since we're resetting all delay chains anyway, we can save some
+		// runtime by calling select_shadow_regs_for_update just once to switch
+		// rank.
+		select_shadow_regs_for_update(r, 0, 1);
+
 		for (i = 0; i < RW_MGR_MEM_IF_READ_DQS_WIDTH; i++) {
-			/*
-			 * The phases actually don't exist on a per-rank basis,
-			 * but there's no harm updating them several times, so
-			 * let's keep the code simple.
-			 */
+			// The phases actually don't exist on a per-rank basis, but there's
+			// no harm updating them several times, so let's keep the code simple.
 			scc_mgr_set_dqs_bus_in_delay(i, IO_DQS_IN_RESERVE);
 			scc_mgr_set_dqs_en_phase(i, 0);
 			scc_mgr_set_dqs_en_delay(i, 0);
@@ -676,235 +855,302 @@ static void scc_mgr_zero_all (void)
 
 		for (i = 0; i < RW_MGR_MEM_IF_WRITE_DQS_WIDTH; i++) {
 			scc_mgr_set_dqdqs_output_phase(i, 0);
-#if ARRIAV || CYCLONEV
-			/* av/cv don't have out2 */
+			// av/cv don't have out2
 			scc_mgr_set_oct_out1_delay(i, IO_DQS_OUT_RESERVE);
-#else
-			scc_mgr_set_oct_out1_delay(i, 0);
-			scc_mgr_set_oct_out2_delay(i, IO_DQS_OUT_RESERVE);
-#endif
 		}
 
-		/* multicast to all DQS group enables */
+		//USER multicast to all DQS group enables
 		IOWR_32DIRECT(SCC_MGR_DQS_ENA, 0, 0xff);
 
 		IOWR_32DIRECT(SCC_MGR_UPD, 0, 0);
 	}
 }
 
-static void scc_set_bypass_mode(uint32_t write_group)
+static void scc_set_bypass_mode(uint32_t write_group, uint32_t mode)
 {
-	/* only need to set once for all groups, pins, dq, dqs, dm */
+	// mode = 0 : Do NOT bypass - Half Rate Mode
+	// mode = 1 : Bypass - Full Rate Mode
+
+	// only need to set once for all groups, pins, dq, dqs, dm
 	if (write_group == 0) {
-		pr_debug("Setting HHP Extras\n");
+		DPRINT(1, "Setting HHP Extras");
 		scc_mgr_set_hhp_extras();
-		pr_debug("Done Setting HHP Extras\n");
+		DPRINT(1, "Done Setting HHP Extras");
 	}
 
-	/* multicast to all DQ enables */
+	//USER multicast to all DQ enables 
 	IOWR_32DIRECT(SCC_MGR_DQ_ENA, 0, 0xff);
 
 	IOWR_32DIRECT(SCC_MGR_DM_ENA, 0, 0xff);
 
-	/* update current DQS IO enable */
+	//USER update current DQS IO enable
 	IOWR_32DIRECT(SCC_MGR_DQS_IO_ENA, 0, 0);
 
-	/* update the DQS logic */
+	//USER update the DQS logic
 	IOWR_32DIRECT(SCC_MGR_DQS_ENA, 0, write_group);
 
-	/* hit update */
+	//USER hit update
 	IOWR_32DIRECT(SCC_MGR_UPD, 0, 0);
 }
 
-static void scc_mgr_zero_group (uint32_t write_group, uint32_t test_begin,
-	int32_t out_only)
+// Moving up to avoid warnings
+static void scc_mgr_load_dqs_for_write_group(uint32_t write_group)
+{
+	uint32_t read_group;
+
+	// Although OCT affects only write data, the OCT delay is controlled by the DQS logic block
+	// which is instantiated once per read group. For protocols where a write group consists
+	// of multiple read groups, the setting must be scanned multiple times.
+	for (read_group =
+	     write_group * RW_MGR_MEM_IF_READ_DQS_WIDTH / RW_MGR_MEM_IF_WRITE_DQS_WIDTH;
+	     read_group <
+	     (write_group + 1) * RW_MGR_MEM_IF_READ_DQS_WIDTH / RW_MGR_MEM_IF_WRITE_DQS_WIDTH;
+	     ++read_group) {
+
+		IOWR_32DIRECT(SCC_MGR_DQS_ENA, 0, read_group);
+	}
+}
+
+static void scc_mgr_zero_group(uint32_t write_group, uint32_t test_begin, int32_t out_only)
 {
 	uint32_t i, r;
 
-	for (r = 0; r < RW_MGR_MEM_NUMBER_OF_RANKS; r +=
-		NUM_RANKS_PER_SHADOW_REG) {
+	for (r = 0; r < RW_MGR_MEM_NUMBER_OF_RANKS; r += NUM_RANKS_PER_SHADOW_REG) {
+
+		select_shadow_regs_for_update(r, write_group, 1);
 
-		/* Zero all DQ config settings */
+		//USER Zero all DQ config settings
 		for (i = 0; i < RW_MGR_MEM_DQ_PER_WRITE_DQS; i++) {
 			scc_mgr_set_dq_out1_delay(write_group, i, 0);
-			scc_mgr_set_dq_out2_delay(write_group, i,
-				IO_DQ_OUT_RESERVE);
+			scc_mgr_set_dq_out2_delay(write_group, i, IO_DQ_OUT_RESERVE);
 			if (!out_only) {
 				scc_mgr_set_dq_in_delay(write_group, i, 0);
 			}
 		}
 
-		/* multicast to all DQ enables */
+		//USER multicast to all DQ enables
 		IOWR_32DIRECT(SCC_MGR_DQ_ENA, 0, 0xff);
 
-		/* Zero all DM config settings */
+		//USER Zero all DM config settings 
 		for (i = 0; i < RW_MGR_NUM_DM_PER_WRITE_GROUP; i++) {
 			if (!out_only) {
-				/* Do we really need this? */
+				// Do we really need this?
 				scc_mgr_set_dm_in_delay(write_group, i, 0);
 			}
 			scc_mgr_set_dm_out1_delay(write_group, i, 0);
-			scc_mgr_set_dm_out2_delay(write_group, i,
-				IO_DM_OUT_RESERVE);
+			scc_mgr_set_dm_out2_delay(write_group, i, IO_DM_OUT_RESERVE);
 		}
 
-		/* multicast to all DM enables */
+		//USER multicast to all DM enables
 		IOWR_32DIRECT(SCC_MGR_DM_ENA, 0, 0xff);
 
-		/* zero all DQS io settings */
+		//USER zero all DQS io settings 
 		if (!out_only) {
 			scc_mgr_set_dqs_io_in_delay(write_group, 0);
 		}
-#if ARRIAV || CYCLONEV
-		/* av/cv don't have out2 */
+		// av/cv don't have out2
 		scc_mgr_set_dqs_out1_delay(write_group, IO_DQS_OUT_RESERVE);
 		scc_mgr_set_oct_out1_delay(write_group, IO_DQS_OUT_RESERVE);
 		scc_mgr_load_dqs_for_write_group(write_group);
-#else
-		scc_mgr_set_dqs_out1_delay(write_group, 0);
-		scc_mgr_set_dqs_out2_delay(write_group, IO_DQS_OUT_RESERVE);
-		scc_mgr_set_oct_out1_delay(write_group, 0);
-		scc_mgr_set_oct_out2_delay(write_group, IO_DQS_OUT_RESERVE);
-		scc_mgr_load_dqs_for_write_group(write_group);
-#endif
 
-		/* multicast to all DQS IO enables (only 1) */
+		//USER multicast to all DQS IO enables (only 1)
 		IOWR_32DIRECT(SCC_MGR_DQS_IO_ENA, 0, 0);
 
-#if USE_SHADOW_REGS
-		/*
-		 * in shadow-register mode, SCC_UPDATE is done on a per-group basis
-		 * unless we explicitly ask for a multicast via the group counter
-		 */
-		IOWR_32DIRECT(SCC_MGR_UPD, 0, 0);
-#endif
-		/* hit update to zero everything */
+		//USER hit update to zero everything 
 		IOWR_32DIRECT(SCC_MGR_UPD, 0, 0);
 	}
 }
 
-/* load up dqs config settings */
+//USER load up dqs config settings 
 
-static void scc_mgr_load_dqs (uint32_t dqs)
+static void scc_mgr_load_dqs(uint32_t dqs)
 {
 	IOWR_32DIRECT(SCC_MGR_DQS_ENA, 0, dqs);
 }
 
-/* load up dqs io config settings */
+//USER load up dqs io config settings 
 
-static void scc_mgr_load_dqs_io (void)
+static void scc_mgr_load_dqs_io(void)
 {
 	IOWR_32DIRECT(SCC_MGR_DQS_IO_ENA, 0, 0);
 }
 
-/* load up dq config settings */
+//USER load up dq config settings 
 
-static void scc_mgr_load_dq (uint32_t dq_in_group)
+static void scc_mgr_load_dq(uint32_t dq_in_group)
 {
 	IOWR_32DIRECT(SCC_MGR_DQ_ENA, 0, dq_in_group);
 }
 
-/* load up dm config settings */
+//USER load up dm config settings 
 
-static void scc_mgr_load_dm (uint32_t dm)
+static void scc_mgr_load_dm(uint32_t dm)
 {
 	IOWR_32DIRECT(SCC_MGR_DM_ENA, 0, dm);
 }
 
-/* apply and load a particular input delay for the DQ pins in a group */
-/* group_bgn is the index of the first dq pin (in the write group) */
+//USER apply and load a particular input delay for the DQ pins in a group
+//USER group_bgn is the index of the first dq pin (in the write group)
 
-static void scc_mgr_apply_group_dq_in_delay (uint32_t write_group,
-	uint32_t group_bgn, uint32_t delay)
+static void scc_mgr_apply_group_dq_in_delay(uint32_t write_group, uint32_t group_bgn,
+					    uint32_t delay)
 {
 	uint32_t i, p;
 
 	for (i = 0, p = group_bgn; i < RW_MGR_MEM_DQ_PER_READ_DQS; i++, p++) {
 		scc_mgr_set_dq_in_delay(write_group, p, delay);
-		scc_mgr_load_dq (p);
+		scc_mgr_load_dq(p);
 	}
 }
 
-/* apply and load a particular output delay for the DQ pins in a group */
+//USER apply and load a particular output delay for the DQ pins in a group
 
-static void scc_mgr_apply_group_dq_out1_delay (uint32_t write_group, uint32_t group_bgn,
-	uint32_t delay1)
+static void scc_mgr_apply_group_dq_out1_delay(uint32_t write_group, uint32_t group_bgn,
+					      uint32_t delay1)
 {
 	uint32_t i, p;
 
 	for (i = 0, p = group_bgn; i < RW_MGR_MEM_DQ_PER_WRITE_DQS; i++, p++) {
 		scc_mgr_set_dq_out1_delay(write_group, i, delay1);
-		scc_mgr_load_dq (i);
+		scc_mgr_load_dq(i);
+	}
+}
+
+static void scc_mgr_apply_group_dq_out2_delay(uint32_t write_group, uint32_t group_bgn,
+					      uint32_t delay2)
+{
+	uint32_t i, p;
+
+	for (i = 0, p = group_bgn; i < RW_MGR_MEM_DQ_PER_WRITE_DQS; i++, p++) {
+		scc_mgr_set_dq_out2_delay(write_group, i, delay2);
+		scc_mgr_load_dq(i);
 	}
 }
 
-/* apply and load a particular output delay for the DM pins in a group */
+//USER apply and load a particular output delay for the DM pins in a group
 
-static void scc_mgr_apply_group_dm_out1_delay (uint32_t write_group, uint32_t delay1)
+static void scc_mgr_apply_group_dm_out1_delay(uint32_t write_group, uint32_t delay1)
 {
 	uint32_t i;
 
 	for (i = 0; i < RW_MGR_NUM_DM_PER_WRITE_GROUP; i++) {
 		scc_mgr_set_dm_out1_delay(write_group, i, delay1);
-		scc_mgr_load_dm (i);
+		scc_mgr_load_dm(i);
 	}
 }
 
-
-/* apply and load delay on both DQS and OCT out1 */
-static void scc_mgr_apply_group_dqs_io_and_oct_out1 (uint32_t write_group, uint32_t delay)
+//USER apply and load delay on both DQS and OCT out1
+static void scc_mgr_apply_group_dqs_io_and_oct_out1(uint32_t write_group, uint32_t delay)
 {
 	scc_mgr_set_dqs_out1_delay(write_group, delay);
-	scc_mgr_load_dqs_io ();
+	scc_mgr_load_dqs_io();
 
 	scc_mgr_set_oct_out1_delay(write_group, delay);
-	scc_mgr_load_dqs_for_write_group (write_group);
+	scc_mgr_load_dqs_for_write_group(write_group);
+}
+
+//USER apply and load delay on both DQS and OCT out2
+static void scc_mgr_apply_group_dqs_io_and_oct_out2(uint32_t write_group, uint32_t delay)
+{
+	scc_mgr_set_dqs_out2_delay(write_group, delay);
+	scc_mgr_load_dqs_io();
+
+	scc_mgr_set_oct_out2_delay(write_group, delay);
+	scc_mgr_load_dqs_for_write_group(write_group);
+}
+
+//USER set delay on both DQS and OCT out1 by incrementally changing
+//USER the settings one dtap at a time towards the target value, to avoid
+//USER breaking the lock of the DLL/PLL on the memory device.
+static void scc_mgr_set_group_dqs_io_and_oct_out1_gradual(uint32_t write_group, uint32_t delay)
+{
+	uint32_t d = READ_SCC_DQS_IO_OUT1_DELAY();
+
+	while (d > delay) {
+		--d;
+		scc_mgr_apply_group_dqs_io_and_oct_out1(write_group, d);
+		IOWR_32DIRECT(SCC_MGR_UPD, 0, 0);
+		if (QDRII) {
+			rw_mgr_mem_dll_lock_wait();
+		}
+	}
+	while (d < delay) {
+		++d;
+		scc_mgr_apply_group_dqs_io_and_oct_out1(write_group, d);
+		IOWR_32DIRECT(SCC_MGR_UPD, 0, 0);
+		if (QDRII) {
+			rw_mgr_mem_dll_lock_wait();
+		}
+	}
+}
+
+//USER set delay on both DQS and OCT out2 by incrementally changing
+//USER the settings one dtap at a time towards the target value, to avoid
+//USER breaking the lock of the DLL/PLL on the memory device.
+static void scc_mgr_set_group_dqs_io_and_oct_out2_gradual(uint32_t write_group, uint32_t delay)
+{
+	uint32_t d = READ_SCC_DQS_IO_OUT2_DELAY();
+
+	while (d > delay) {
+		--d;
+		scc_mgr_apply_group_dqs_io_and_oct_out2(write_group, d);
+		IOWR_32DIRECT(SCC_MGR_UPD, 0, 0);
+		if (QDRII) {
+			rw_mgr_mem_dll_lock_wait();
+		}
+	}
+	while (d < delay) {
+		++d;
+		scc_mgr_apply_group_dqs_io_and_oct_out2(write_group, d);
+		IOWR_32DIRECT(SCC_MGR_UPD, 0, 0);
+		if (QDRII) {
+			rw_mgr_mem_dll_lock_wait();
+		}
+	}
 }
 
-/* apply a delay to the entire output side: DQ, DM, DQS, OCT */
+//USER apply a delay to the entire output side: DQ, DM, DQS, OCT 
 
-static void scc_mgr_apply_group_all_out_delay (uint32_t write_group,
-	uint32_t group_bgn, uint32_t delay)
+static void scc_mgr_apply_group_all_out_delay(uint32_t write_group, uint32_t group_bgn,
+					      uint32_t delay)
 {
-	/* dq shift */
+	//USER dq shift 
 
-	scc_mgr_apply_group_dq_out1_delay (write_group, group_bgn, delay);
+	scc_mgr_apply_group_dq_out1_delay(write_group, group_bgn, delay);
 
-	/* dm shift */
+	//USER dm shift 
 
-	scc_mgr_apply_group_dm_out1_delay (write_group, delay);
+	scc_mgr_apply_group_dm_out1_delay(write_group, delay);
 
-	/* dqs and oct shift */
+	//USER dqs and oct shift 
 
-	scc_mgr_apply_group_dqs_io_and_oct_out1 (write_group, delay);
+	scc_mgr_apply_group_dqs_io_and_oct_out1(write_group, delay);
 }
 
-/*
- * USER apply a delay to the entire output side (DQ, DM, DQS, OCT)
- * and to all ranks
- */
-static void scc_mgr_apply_group_all_out_delay_all_ranks (uint32_t write_group,
-	uint32_t group_bgn, uint32_t delay)
+//USER apply a delay to the entire output side (DQ, DM, DQS, OCT) and to all ranks
+static void scc_mgr_apply_group_all_out_delay_all_ranks(uint32_t write_group, uint32_t group_bgn,
+							uint32_t delay)
 {
 	uint32_t r;
 
-	for (r = 0; r < RW_MGR_MEM_NUMBER_OF_RANKS;
-		r += NUM_RANKS_PER_SHADOW_REG) {
+	for (r = 0; r < RW_MGR_MEM_NUMBER_OF_RANKS; r += NUM_RANKS_PER_SHADOW_REG) {
+
+		select_shadow_regs_for_update(r, write_group, 1);
 
-		scc_mgr_apply_group_all_out_delay (write_group, group_bgn, delay);
+		scc_mgr_apply_group_all_out_delay(write_group, group_bgn, delay);
 
 		IOWR_32DIRECT(SCC_MGR_UPD, 0, 0);
 	}
 }
 
-/* apply a delay to the entire output side: DQ, DM, DQS, OCT */
+//USER apply a delay to the entire output side: DQ, DM, DQS, OCT 
 
-static void scc_mgr_apply_group_all_out_delay_add (uint32_t write_group,
-	uint32_t group_bgn, uint32_t delay)
+static void scc_mgr_apply_group_all_out_delay_add(uint32_t write_group, uint32_t group_bgn,
+						  uint32_t delay)
 {
 	uint32_t i, p, new_delay;
 
-	/* dq shift */
+	//USER dq shift 
 
 	for (i = 0, p = group_bgn; i < RW_MGR_MEM_DQ_PER_WRITE_DQS; i++, p++) {
 
@@ -912,70 +1158,63 @@ static void scc_mgr_apply_group_all_out_delay_add (uint32_t write_group,
 		new_delay += delay;
 
 		if (new_delay > IO_IO_OUT2_DELAY_MAX) {
-			pr_debug("%s(%u, %u, %u) DQ[%u,%u]: %u >"
-				" %u => %u\n", __func__, write_group,
-				group_bgn, delay, i, p,
-				new_delay,
-				IO_IO_OUT2_DELAY_MAX,
-				IO_IO_OUT2_DELAY_MAX);
+			DPRINT(1, "%s(%lu, %lu, %lu) DQ[%lu,%lu]: %lu > %lu => %lu",
+			       __func__, write_group, group_bgn, delay, i, p,
+			       new_delay, (long unsigned int)IO_IO_OUT2_DELAY_MAX,
+			       (long unsigned int)IO_IO_OUT2_DELAY_MAX);
 			new_delay = IO_IO_OUT2_DELAY_MAX;
 		}
 
 		scc_mgr_set_dq_out2_delay(write_group, i, new_delay);
-		scc_mgr_load_dq (i);
+		scc_mgr_load_dq(i);
 	}
 
-	/* dm shift */
+	//USER dm shift 
 
 	for (i = 0; i < RW_MGR_NUM_DM_PER_WRITE_GROUP; i++) {
 		new_delay = READ_SCC_DM_IO_OUT2_DELAY(i);
 		new_delay += delay;
 
 		if (new_delay > IO_IO_OUT2_DELAY_MAX) {
-			pr_debug("%s(%u, %u, %u) DM[%u]: %u > %u => %u\n",
-				__func__, write_group, group_bgn, delay, i,
-				new_delay,
-				IO_IO_OUT2_DELAY_MAX,
-				IO_IO_OUT2_DELAY_MAX);
+			DPRINT(1, "%s(%lu, %lu, %lu) DM[%lu]: %lu > %lu => %lu",
+			       __func__, write_group, group_bgn, delay, i,
+			       new_delay, (long unsigned int)IO_IO_OUT2_DELAY_MAX,
+			       (long unsigned int)IO_IO_OUT2_DELAY_MAX);
 			new_delay = IO_IO_OUT2_DELAY_MAX;
 		}
 
 		scc_mgr_set_dm_out2_delay(write_group, i, new_delay);
-		scc_mgr_load_dm (i);
+		scc_mgr_load_dm(i);
 	}
 
-	/* dqs shift */
+	//USER dqs shift 
 
 	new_delay = READ_SCC_DQS_IO_OUT2_DELAY();
 	new_delay += delay;
 
 	if (new_delay > IO_IO_OUT2_DELAY_MAX) {
-		pr_debug("%s(%u, %u, %u) DQS: %u > %d => %d;"
-			" adding %u to OUT1\n",
-			__func__, write_group, group_bgn, delay,
-			new_delay, IO_IO_OUT2_DELAY_MAX, IO_IO_OUT2_DELAY_MAX,
-			new_delay - IO_IO_OUT2_DELAY_MAX);
-		scc_mgr_set_dqs_out1_delay(write_group, new_delay -
-			IO_IO_OUT2_DELAY_MAX);
+		DPRINT(1, "%s(%lu, %lu, %lu) DQS: %lu > %d => %d; adding %lu to OUT1",
+		       __func__, write_group, group_bgn, delay,
+		       new_delay, IO_IO_OUT2_DELAY_MAX, IO_IO_OUT2_DELAY_MAX,
+		       new_delay - IO_IO_OUT2_DELAY_MAX);
+		scc_mgr_set_dqs_out1_delay(write_group, new_delay - IO_IO_OUT2_DELAY_MAX);
 		new_delay = IO_IO_OUT2_DELAY_MAX;
 	}
 
 	scc_mgr_set_dqs_out2_delay(write_group, new_delay);
-	scc_mgr_load_dqs_io ();
+	scc_mgr_load_dqs_io();
 
-	/* oct shift */
+	//USER oct shift 
 
 	new_delay = READ_SCC_OCT_OUT2_DELAY(write_group);
 	new_delay += delay;
 
 	if (new_delay > IO_IO_OUT2_DELAY_MAX) {
-		pr_debug("%s(%u, %u, %u) DQS: %u > %d => %d;"
-			" adding %u to OUT1\n",
-			__func__, write_group, group_bgn, delay,
-			new_delay, IO_IO_OUT2_DELAY_MAX, IO_IO_OUT2_DELAY_MAX,
-			new_delay - IO_IO_OUT2_DELAY_MAX);
-		scc_mgr_set_oct_out1_delay(write_group, new_delay -
-			IO_IO_OUT2_DELAY_MAX);
+		DPRINT(1, "%s(%lu, %lu, %lu) DQS: %lu > %d => %d; adding %lu to OUT1",
+		       __func__, write_group, group_bgn, delay,
+		       new_delay, IO_IO_OUT2_DELAY_MAX, IO_IO_OUT2_DELAY_MAX,
+		       new_delay - IO_IO_OUT2_DELAY_MAX);
+		scc_mgr_set_oct_out1_delay(write_group, new_delay - IO_IO_OUT2_DELAY_MAX);
 		new_delay = IO_IO_OUT2_DELAY_MAX;
 	}
 
@@ -983,852 +1222,568 @@ static void scc_mgr_apply_group_all_out_delay_add (uint32_t write_group,
 	scc_mgr_load_dqs_for_write_group(write_group);
 }
 
-/*
- * USER apply a delay to the entire output side (DQ, DM, DQS, OCT)
- * and to all ranks
- */
-static void scc_mgr_apply_group_all_out_delay_add_all_ranks (uint32_t write_group,
-	uint32_t group_bgn, uint32_t delay)
+//USER apply a delay to the entire output side (DQ, DM, DQS, OCT) and to all ranks
+static void scc_mgr_apply_group_all_out_delay_add_all_ranks(uint32_t write_group,
+							    uint32_t group_bgn, uint32_t delay)
 {
 	uint32_t r;
 
 	for (r = 0; r < RW_MGR_MEM_NUMBER_OF_RANKS; r += NUM_RANKS_PER_SHADOW_REG) {
-		scc_mgr_apply_group_all_out_delay_add (write_group,
-			group_bgn, delay);
+
+		select_shadow_regs_for_update(r, write_group, 1);
+
+		scc_mgr_apply_group_all_out_delay_add(write_group, group_bgn, delay);
 
 		IOWR_32DIRECT(SCC_MGR_UPD, 0, 0);
 	}
 }
 
-static void scc_mgr_spread_out2_delay_all_ranks (uint32_t write_group,
-	uint32_t test_bgn)
+static inline void scc_mgr_spread_out2_delay_all_ranks(uint32_t write_group, uint32_t test_bgn)
 {
-#if STRATIXV || ARRIAVGZ
-	uint32_t found;
-	uint32_t i;
-	uint32_t p;
-	uint32_t d;
-	uint32_t r;
-
-	const uint32_t delay_step = IO_IO_OUT2_DELAY_MAX /
-		(RW_MGR_MEM_DQ_PER_WRITE_DQS-1);
-		/* we start at zero, so have one less dq to devide among */
-
-	for (r = 0; r < RW_MGR_MEM_NUMBER_OF_RANKS;
-		r += NUM_RANKS_PER_SHADOW_REG) {
-		for (i = 0, p = test_bgn, d = 0;
-			i < RW_MGR_MEM_DQ_PER_WRITE_DQS;
-			i++, p++, d += delay_step) {
-			pr_debug("rw_mgr_mem_calibrate_vfifo_find"
-				"_dqs_en_phase_sweep_dq_in_delay: g=%u r=%u,"
-				" i=%u p=%u d=%u\n",
-				write_group, r, i, p, d);
-			scc_mgr_set_dq_out2_delay(write_group, i, d);
-			scc_mgr_load_dq (i);
-		}
-		IOWR_32DIRECT(SCC_MGR_UPD, 0, 0);
-	}
-#endif
 }
 
-#if DDR3
-/* optimization used to recover some slots in ddr3 inst_rom */
-/* could be applied to other protocols if we wanted to */
+// optimization used to recover some slots in ddr3 inst_rom
+// could be applied to other protocols if we wanted to
 static void set_jump_as_return(void)
 {
-	/*
-	 * to save space, we replace return with jump to special shared
-	 * RETURN instruction so we set the counter to large value so that
-	 * we always jump
-	 */
+	// to save space, we replace return with jump to special shared RETURN instruction
+	// so we set the counter to large value so that we always jump
 	IOWR_32DIRECT(RW_MGR_LOAD_CNTR_0, 0, 0xFF);
 	IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_0, 0, __RW_MGR_RETURN);
 
 }
-#endif
 
-/*
- * should always use constants as argument to ensure all computations are
- * performed at compile time
- */
-static void delay_for_n_mem_clocks(const uint32_t clocks)
+// should always use constants as argument to ensure all computations are performed at compile time
+static inline void delay_for_n_mem_clocks(const uint32_t clocks)
 {
 	uint32_t afi_clocks;
 	uint8_t inner;
 	uint8_t outer;
 	uint16_t c_loop;
 
-	afi_clocks = (clocks + AFI_RATE_RATIO-1) / AFI_RATE_RATIO;
-	/* scale (rounding up) to get afi clocks */
+	TRACE_FUNC("clocks=%lu ... start", clocks);
+
+	afi_clocks = (clocks + AFI_RATE_RATIO - 1) / AFI_RATE_RATIO;	/* scale (rounding up) to get afi clocks */
 
-	/*
-	 * Note, we don't bother accounting for being off a little bit
-	 * because of a few extra instructions in outer loops
-	 * Note, the loops have a test at the end, and do the test before
-	 * the decrement, and so always perform the loop
-	 * 1 time more than the counter value
-	 */
+	// Note, we don't bother accounting for being off a little bit because of a few extra instructions in outer loops
+	// Note, the loops have a test at the end, and do the test before the decrement, and so always perform the loop
+	// 1 time more than the counter value
 	if (afi_clocks == 0) {
 		inner = outer = c_loop = 0;
 	} else if (afi_clocks <= 0x100) {
-		inner = afi_clocks-1;
+		inner = afi_clocks - 1;
 		outer = 0;
 		c_loop = 0;
 	} else if (afi_clocks <= 0x10000) {
 		inner = 0xff;
-		outer = (afi_clocks-1) >> 8;
+		outer = (afi_clocks - 1) >> 8;
 		c_loop = 0;
 	} else {
 		inner = 0xff;
 		outer = 0xff;
-		c_loop = (afi_clocks-1) >> 16;
-	}
-
-	/*
-	 * rom instructions are structured as follows:
-	 *
-	 *    IDLE_LOOP2: jnz cntr0, TARGET_A
-	 *    IDLE_LOOP1: jnz cntr1, TARGET_B
-	 *                return
-	 *
-	 * so, when doing nested loops, TARGET_A is set to IDLE_LOOP2, and
-	 * TARGET_B is set to IDLE_LOOP2 as well
-	 *
-	 * if we have no outer loop, though, then we can use IDLE_LOOP1 only,
-	 * and set TARGET_B to IDLE_LOOP1 and we skip IDLE_LOOP2 entirely
-	 *
-	 * a little confusing, but it helps save precious space in the inst_rom
-	 * and sequencer rom and keeps the delays more accurate and reduces
-	 * overhead
-	 */
+		c_loop = (afi_clocks - 1) >> 16;
+	}
+
+	// rom instructions are structured as follows:
+	//
+	//    IDLE_LOOP2: jnz cntr0, TARGET_A
+	//    IDLE_LOOP1: jnz cntr1, TARGET_B
+	//                return
+	//
+	// so, when doing nested loops, TARGET_A is set to IDLE_LOOP2, and TARGET_B is
+	// set to IDLE_LOOP2 as well
+	//
+	// if we have no outer loop, though, then we can use IDLE_LOOP1 only, and set
+	// TARGET_B to IDLE_LOOP1 and we skip IDLE_LOOP2 entirely
+	//
+	// a little confusing, but it helps save precious space in the inst_rom and sequencer rom
+	// and keeps the delays more accurate and reduces overhead
 	if (afi_clocks <= 0x100) {
-		IOWR_32DIRECT(RW_MGR_LOAD_CNTR_1, 0,
-			SKIP_DELAY_LOOP_VALUE_OR_ZERO(inner));
+
+		IOWR_32DIRECT(RW_MGR_LOAD_CNTR_1, 0, SKIP_DELAY_LOOP_VALUE_OR_ZERO(inner));
 		IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_1, 0, __RW_MGR_IDLE_LOOP1);
 		IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_IDLE_LOOP1);
+
 	} else {
-		IOWR_32DIRECT(RW_MGR_LOAD_CNTR_0, 0,
-			SKIP_DELAY_LOOP_VALUE_OR_ZERO(inner));
-		IOWR_32DIRECT(RW_MGR_LOAD_CNTR_1, 0,
-			SKIP_DELAY_LOOP_VALUE_OR_ZERO(outer));
+		IOWR_32DIRECT(RW_MGR_LOAD_CNTR_0, 0, SKIP_DELAY_LOOP_VALUE_OR_ZERO(inner));
+		IOWR_32DIRECT(RW_MGR_LOAD_CNTR_1, 0, SKIP_DELAY_LOOP_VALUE_OR_ZERO(outer));
 
 		IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_0, 0, __RW_MGR_IDLE_LOOP2);
 		IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_1, 0, __RW_MGR_IDLE_LOOP2);
 
-		/* hack to get around compiler not being smart enough */
+		// hack to get around compiler not being smart enough
 		if (afi_clocks <= 0x10000) {
-			/* only need to run once */
-			IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0,
-				__RW_MGR_IDLE_LOOP2);
+			// only need to run once
+			IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_IDLE_LOOP2);
 		} else {
 			do {
-				IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0,
-					__RW_MGR_IDLE_LOOP2);
+				IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_IDLE_LOOP2);
 			} while (c_loop-- != 0);
 		}
 	}
+
+	TRACE_FUNC("clocks=%lu ... end", clocks);
 }
 
-/* Special routine to recover memory device from illegal state after */
-/* ck/dqs relationship is violated. */
-static void recover_mem_device_after_ck_dqs_violation(void)
+// should always use constants as argument to ensure all computations are performed at compile time
+static inline void delay_for_n_ns(const uint32_t nanoseconds)
 {
-	/* Current protocol doesn't require any special recovery */
+	TRACE_FUNC("nanoseconds=%lu ... end", nanoseconds);
+	delay_for_n_mem_clocks((1000 * nanoseconds) / (1000000 / AFI_CLK_FREQ) * AFI_RATE_RATIO);
 }
 
-static void rw_mgr_rdimm_initialize(void) { }
-
-#if DDR3
+// Special routine to recover memory device from illegal state after
+// ck/dqs relationship is violated.
+static inline void recover_mem_device_after_ck_dqs_violation(void)
+{
+	// Current protocol doesn't require any special recovery
+}
 
+static void rw_mgr_rdimm_initialize(void)
+{
+}
 
-static void rw_mgr_mem_initialize (void)
+static void rw_mgr_mem_initialize(void)
 {
 	uint32_t r;
 
+	TRACE_FUNC();
 
-	/* The reset / cke part of initialization is broadcasted to all ranks */
+	//USER The reset / cke part of initialization is broadcasted to all ranks
 	IOWR_32DIRECT(RW_MGR_SET_CS_AND_ODT_MASK, 0, RW_MGR_RANK_ALL);
 
-	/*
-	 * Here's how you load register for a loop
-	 * Counters are located @ 0x800
-	 * Jump address are located @ 0xC00
-	 * For both, registers 0 to 3 are selected using bits 3 and 2, like
-	 * in 0x800, 0x804, 0x808, 0x80C and 0xC00, 0xC04, 0xC08, 0xC0C
-	 * I know this ain't pretty, but Avalon bus throws away the 2 least
-	 * significant bits
-	 */
-
-	/* start with memory RESET activated */
-
-	/* tINIT is typically 200us (but can be adjusted in the GUI)
-	 * The total number of cycles required for this nested counter structure to
-	 * complete is defined by:
-	 * num_cycles = (CTR2 + 1) * [(CTR1 + 1) * (2 * (CTR0 + 1) + 1) + 1] + 1
-	 */
-
-	/* Load counters */
-	IOWR_32DIRECT(RW_MGR_LOAD_CNTR_0, 0,
-		SKIP_DELAY_LOOP_VALUE_OR_ZERO(SEQ_TINIT_CNTR0_VAL));
-	IOWR_32DIRECT(RW_MGR_LOAD_CNTR_1, 0,
-		SKIP_DELAY_LOOP_VALUE_OR_ZERO(SEQ_TINIT_CNTR1_VAL));
-	IOWR_32DIRECT(RW_MGR_LOAD_CNTR_2, 0,
-		SKIP_DELAY_LOOP_VALUE_OR_ZERO(SEQ_TINIT_CNTR2_VAL));
-
-	/* Load jump address */
-	IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_0, 0,
-		__RW_MGR_INIT_RESET_0_CKE_0);
-	IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_1, 0,
-		__RW_MGR_INIT_RESET_0_CKE_0);
-	IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_2, 0,
-		__RW_MGR_INIT_RESET_0_CKE_0);
-
-	/* Execute count instruction */
-	IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_INIT_RESET_0_CKE_0);
+	// Here's how you load register for a loop
+	//USER Counters are located @ 0x800
+	//USER Jump address are located @ 0xC00
+	//USER For both, registers 0 to 3 are selected using bits 3 and 2, like in
+	//USER 0x800, 0x804, 0x808, 0x80C and 0xC00, 0xC04, 0xC08, 0xC0C
+	// I know this ain't pretty, but Avalon bus throws away the 2 least significant bits
 
-	/* indicate that memory is stable */
-	IOWR_32DIRECT(PHY_MGR_RESET_MEM_STBL, 0, 1);
+	//USER start with memory RESET activated
+
+	//USER tINIT is typically 200us (but can be adjusted in the GUI)
+	//USER The total number of cycles required for this nested counter structure to
+	//USER complete is defined by:
+	//USER        num_cycles = (CTR2 + 1) * [(CTR1 + 1) * (2 * (CTR0 + 1) + 1) + 1] + 1
+
+	//USER Load counters
+	IOWR_32DIRECT(RW_MGR_LOAD_CNTR_0, 0, SKIP_DELAY_LOOP_VALUE_OR_ZERO(SEQ_TINIT_CNTR0_VAL));
+	IOWR_32DIRECT(RW_MGR_LOAD_CNTR_1, 0, SKIP_DELAY_LOOP_VALUE_OR_ZERO(SEQ_TINIT_CNTR1_VAL));
+	IOWR_32DIRECT(RW_MGR_LOAD_CNTR_2, 0, SKIP_DELAY_LOOP_VALUE_OR_ZERO(SEQ_TINIT_CNTR2_VAL));
 
-	/* transition the RESET to high */
-	/* Wait for 500us */
+	//USER Load jump address
+	IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_0, 0, __RW_MGR_INIT_RESET_0_CKE_0);
+	IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_1, 0, __RW_MGR_INIT_RESET_0_CKE_0);
+	IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_2, 0, __RW_MGR_INIT_RESET_0_CKE_0);
 
-	/* num_cycles = (CTR2 + 1) * [(CTR1 + 1) * (2 * (CTR0 + 1) + 1) + 1] + 1 */
+	//USER Execute count instruction
+	IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_INIT_RESET_0_CKE_0);
+
+	//USER indicate that memory is stable
+	IOWR_32DIRECT(PHY_MGR_RESET_MEM_STBL, 0, 1);
 
-	/* Load counters */
-	IOWR_32DIRECT(RW_MGR_LOAD_CNTR_0, 0,
-		SKIP_DELAY_LOOP_VALUE_OR_ZERO(SEQ_TRESET_CNTR0_VAL));
-	IOWR_32DIRECT(RW_MGR_LOAD_CNTR_1, 0,
-		SKIP_DELAY_LOOP_VALUE_OR_ZERO(SEQ_TRESET_CNTR1_VAL));
-	IOWR_32DIRECT(RW_MGR_LOAD_CNTR_2, 0,
-		SKIP_DELAY_LOOP_VALUE_OR_ZERO(SEQ_TRESET_CNTR2_VAL));
+	//USER transition the RESET to high 
+	//USER Wait for 500us
+	//USER        num_cycles = (CTR2 + 1) * [(CTR1 + 1) * (2 * (CTR0 + 1) + 1) + 1] + 1
+	//USER Load counters
+	IOWR_32DIRECT(RW_MGR_LOAD_CNTR_0, 0, SKIP_DELAY_LOOP_VALUE_OR_ZERO(SEQ_TRESET_CNTR0_VAL));
+	IOWR_32DIRECT(RW_MGR_LOAD_CNTR_1, 0, SKIP_DELAY_LOOP_VALUE_OR_ZERO(SEQ_TRESET_CNTR1_VAL));
+	IOWR_32DIRECT(RW_MGR_LOAD_CNTR_2, 0, SKIP_DELAY_LOOP_VALUE_OR_ZERO(SEQ_TRESET_CNTR2_VAL));
 
-	/* Load jump address */
+	//USER Load jump address
 	IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_0, 0, __RW_MGR_INIT_RESET_1_CKE_0);
 	IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_1, 0, __RW_MGR_INIT_RESET_1_CKE_0);
 	IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_2, 0, __RW_MGR_INIT_RESET_1_CKE_0);
 
-
 	IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_INIT_RESET_1_CKE_0);
 
-	/* bring up clock enable */
+	//USER bring up clock enable 
 
-	/* tXRP < 250 ck cycles */
+	//USER tXRP < 250 ck cycles
 	delay_for_n_mem_clocks(250);
 
-	/*
-	 * USER initialize RDIMM buffer so MRS and RZQ Calibrate commands will
-	 * USER be propagated to discrete memory devices
-	 */
+	// USER initialize RDIMM buffer so MRS and RZQ Calibrate commands will be 
+	// USER propagated to discrete memory devices
 	rw_mgr_rdimm_initialize();
 
-
 	for (r = 0; r < RW_MGR_MEM_NUMBER_OF_RANKS; r++) {
-		/* set rank */
+		if (param->skip_ranks[r]) {
+			//USER request to skip the rank
+
+			continue;
+		}
+		//USER set rank 
 		set_rank_and_odt_mask(r, RW_MGR_ODT_MODE_OFF);
 
-		/*
-		 * USER Use Mirror-ed commands for odd ranks if address
-		 * mirrorring is on
-		 */
+		//USER Use Mirror-ed commands for odd ranks if address mirrorring is on
 		if ((RW_MGR_MEM_ADDRESS_MIRRORING >> r) & 0x1) {
 			set_jump_as_return();
-			IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0,
-				__RW_MGR_MRS2_MIRR);
+			IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_MRS2_MIRR);
 			delay_for_n_mem_clocks(4);
 			set_jump_as_return();
-			IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0,
-				__RW_MGR_MRS3_MIRR);
+			IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_MRS3_MIRR);
 			delay_for_n_mem_clocks(4);
 			set_jump_as_return();
-			IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0,
-				__RW_MGR_MRS1_MIRR);
+			IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_MRS1_MIRR);
 			delay_for_n_mem_clocks(4);
 			set_jump_as_return();
-			IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0,
-				__RW_MGR_MRS0_DLL_RESET_MIRR);
+			IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_MRS0_DLL_RESET_MIRR);
 		} else {
 			set_jump_as_return();
-			IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0,
-				__RW_MGR_MRS2);
+			IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_MRS2);
 			delay_for_n_mem_clocks(4);
 			set_jump_as_return();
-			IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0,
-				__RW_MGR_MRS3);
+			IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_MRS3);
 			delay_for_n_mem_clocks(4);
 			set_jump_as_return();
-			IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0,
-				__RW_MGR_MRS1);
+			IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_MRS1);
 			set_jump_as_return();
-			IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0,
-				__RW_MGR_MRS0_DLL_RESET);
+			IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_MRS0_DLL_RESET);
 		}
 
 		set_jump_as_return();
 		IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_ZQCL);
 
-		/* tZQinit = tDLLK = 512 ck cycles */
+		//USER tZQinit = tDLLK = 512 ck cycles
 		delay_for_n_mem_clocks(512);
 	}
 }
-#endif /* DDR3 */
-
-#if DDR2
-static void rw_mgr_mem_initialize (void)
-{
-	uint32_t r;
-
-	/* *** NOTE *** */
-	/* The following STAGE (n) notation refers to the corresponding
-	stage in the Micron datasheet */
-
-	/*
-	 *Here's how you load register for a loop
-	 * Counters are located @ 0x800
-	 * Jump address are located @ 0xC00
-	 * For both, registers 0 to 3 are selected using bits 3 and 2,
-	 like in
-	 * 0x800, 0x804, 0x808, 0x80C and 0xC00, 0xC04, 0xC08, 0xC0C
-	 * I know this ain't pretty, but Avalon bus throws away the 2 least
-	 significant bits
-	 */
-
-	/* *** STAGE (1, 2, 3) *** */
-
-	/* start with CKE low */
-
-	/* tINIT = 200us */
-
-	/* tINIT is typically 200us (but can be adjusted in the GUI)
-	 * The total number of cycles required for this nested counter structure to
-	 * complete is defined by:
-	 * num_cycles = (CTR0 + 1) * [(CTR1 + 1) * (2 * (CTR2 + 1) + 1) + 1] + 1
-	 */
-
-	/*TODO: Need to manage multi-rank */
-
-	/* Load counters */
-	IOWR_32DIRECT(RW_MGR_LOAD_CNTR_0, 0,
-		SKIP_DELAY_LOOP_VALUE_OR_ZERO(SEQ_TINIT_CNTR0_VAL));
-	IOWR_32DIRECT(RW_MGR_LOAD_CNTR_1, 0,
-		SKIP_DELAY_LOOP_VALUE_OR_ZERO(SEQ_TINIT_CNTR1_VAL));
-	IOWR_32DIRECT(RW_MGR_LOAD_CNTR_2, 0,
-		SKIP_DELAY_LOOP_VALUE_OR_ZERO(SEQ_TINIT_CNTR2_VAL));
-
-	/* Load jump address */
-	IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_0, 0, __RW_MGR_INIT_CKE_0);
-	IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_1, 0, __RW_MGR_INIT_CKE_0);
-	IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_2, 0, __RW_MGR_INIT_CKE_0);
-
-	/* Execute count instruction */
-	IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_INIT_CKE_0);
-
-	/* indicate that memory is stable */
-	IOWR_32DIRECT(PHY_MGR_RESET_MEM_STBL, 0, 1);
-
-	/* Bring up CKE */
-	IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_NOP);
-
-	/* *** STAGE (4) */
-
-	/* Wait for 400ns */
-	delay_for_n_ns(400);
-
-	/* Multi-rank section begins here */
-	for (r = 0; r < RW_MGR_MEM_NUMBER_OF_RANKS; r++) {
-		/* set rank */
-		set_rank_and_odt_mask(r, RW_MGR_ODT_MODE_OFF);
-
-		/*
-		 * * **** *
-		 * * NOTE *
-		 * * **** *
-		 * The following commands must be spaced by tMRD or tRPA
-		 *which are in the order
-		 * of 2 to 4 full rate cycles. This is peanuts in the
-		 *NIOS domain, so for now
-		 * we can avoid redundant wait loops
-		 */
-
-		/* Possible FIXME BEN: for HHP, we need to add delay loops
-		 * to be sure although, the sequencer write interface by itself
-		 * likely has enough delay
-		 */
-
-		IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0,
-			__RW_MGR_PRECHARGE_ALL);
-
-		/* *** STAGE (5) */
-		IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_EMR2);
-
-		/* *** STAGE (6) */
-		IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_EMR3);
-
-		/* *** STAGE (7) */
-		IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_EMR);
-
-		/* *** STAGE (8) */
-		/* DLL reset */
-		IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0,
-			__RW_MGR_MR_DLL_RESET);
-
-		/* *** STAGE (9) */
-		IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0,
-			__RW_MGR_PRECHARGE_ALL);
-
-		/* *** STAGE (10) */
-
-		/* Issue 2 refresh commands spaced by tREF */
-
-		/* First REFRESH */
-		IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_REFRESH);
-
-		/* tREF = 200ns */
-		delay_for_n_ns(200);
-
-		/* Second REFRESH */
-		IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_REFRESH);
-
-		/* Second idle loop */
-		delay_for_n_ns(200);
-
-		/* *** STAGE (11) */
-		IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0,
-			__RW_MGR_MR_CALIB);
-
-		/* *** STAGE (12) */
-		/* OCD defaults */
-		IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0,
-			__RW_MGR_EMR_OCD_ENABLE);
-
-		/* *** STAGE (13) */
-		/* OCD exit */
-		IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_EMR);
-
-		/* *** STAGE (14) */
-
-		/*
-		 * The memory is now initialized. Before being able to
-		 *use it, we must still
-		 * wait for the DLL to lock, 200 clock cycles after it
-		 *was reset @ STAGE (8).
-		 * Since we cannot keep track of time in any other way,
-		 *let's start counting from now
-		 */
-		delay_for_n_mem_clocks(200);
-	}
-}
-#endif /* DDR2 */
 
-#if LPDDR2
-static void rw_mgr_mem_initialize (void)
+static void rw_mgr_mem_dll_lock_wait(void)
 {
-	uint32_t r;
-
-	/* *** NOTE *** */
-	/* The following STAGE (n) notation refers to the corresponding
-	stage in the Micron datasheet */
-
-	/*
-	 *Here's how you load register for a loop
-	 * Counters are located @ 0x800
-	 * Jump address are located @ 0xC00
-	 * For both, registers 0 to 3 are selected using bits 3 and 2,
-	 *like in
-	 * 0x800, 0x804, 0x808, 0x80C and 0xC00, 0xC04, 0xC08, 0xC0C
-	 *I know this ain't pretty, but Avalon bus throws away the 2 least
-	 *significant bits
-	 */
-
-
-	/* *** STAGE (1, 2, 3) *** */
-
-	/* start with CKE low */
-
-	/* tINIT1 = 100ns */
-
-	/*
-	 * 100ns @ 300MHz (3.333 ns) ~ 30 cycles
-	 * If a is the number of iteration in a loop
-	 * it takes the following number of cycles to complete the operation
-	 * number_of_cycles = (2 + n) * a
-	 * where n is the number of instruction in the inner loop
-	 * One possible solution is n = 0 , a = 15 => a = 0x10
-	 */
-
-	/* Load counter */
-	IOWR_32DIRECT(RW_MGR_LOAD_CNTR_0, 0, SKIP_DELAY_LOOP_VALUE_OR_ZERO(0x10));
-
-	/* Load jump address */
-	IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_0, 0, __RW_MGR_INIT_CKE_0);
-
-	/* Execute count instruction */
-	IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_INIT_CKE_0);
-
-	/* tINIT3 = 200us */
-	delay_for_n_ns(200000);
-
-	/* indicate that memory is stable */
-	IOWR_32DIRECT(PHY_MGR_RESET_MEM_STBL, 0, 1);
-
-	/* Multi-rank section begins here */
-	for (r = 0; r < RW_MGR_MEM_NUMBER_OF_RANKS; r++) {
-		/* set rank */
-		set_rank_and_odt_mask(r, RW_MGR_ODT_MODE_OFF);
-
-		/* MRW RESET */
-		IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_MR63_RESET);
-	}
-
-	/* tINIT5 = 10us */
-	delay_for_n_ns(10000);
-
-	/* Multi-rank section begins here */
-	for (r = 0; r < RW_MGR_MEM_NUMBER_OF_RANKS; r++) {
-		/* set rank */
-		set_rank_and_odt_mask(r, RW_MGR_ODT_MODE_OFF);
-
-		/* MRW ZQC */
-		/* Note: We cannot calibrate other ranks when the current rank
-		is calibrating for tZQINIT */
-		IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_MR10_ZQC);
-
-		/* tZQINIT = 1us */
-		delay_for_n_ns(1000);
-
-		/*
-		 * * **** *
-		 * * NOTE *
-		 * * **** *
-		 * The following commands must be spaced by tMRW which is
-		 *in the order
-		 * of 3 to 5 full rate cycles. This is peanuts in the NIOS
-		 *domain, so for now
-		 * we can avoid redundant wait loops
-		 */
-
-		/* MRW MR1 */
-		IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_MR1_CALIB);
-
-		/* MRW MR2 */
-		IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_MR2);
-
-		/* MRW MR3 */
-		IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_MR3);
-	}
 }
-#endif /* LPDDR2 */
 
-/*  At the end of calibration we have to program the user settings in, and
-  USER  hand off the memory to the user. */
+//USER  At the end of calibration we have to program the user settings in, and
+//USER  hand off the memory to the user.
 
-#if DDR3
-static void rw_mgr_mem_handoff (void)
+static void rw_mgr_mem_handoff(void)
 {
 	uint32_t r;
 
+	TRACE_FUNC();
 
 	for (r = 0; r < RW_MGR_MEM_NUMBER_OF_RANKS; r++) {
-		/* set rank */
+		if (param->skip_ranks[r]) {
+			//USER request to skip the rank
+
+			continue;
+		}
+		//USER set rank
 		set_rank_and_odt_mask(r, RW_MGR_ODT_MODE_OFF);
 
-		/* precharge all banks ... */
+		//USER precharge all banks ... 
 
 		IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_PRECHARGE_ALL);
 
-		/* load up MR settings specified by user */
+		//USER load up MR settings specified by user 
 
-		/* Use Mirror-ed commands for odd ranks if address
-		mirrorring is on */
+		//USER Use Mirror-ed commands for odd ranks if address mirrorring is on
 		if ((RW_MGR_MEM_ADDRESS_MIRRORING >> r) & 0x1) {
 			set_jump_as_return();
-			IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0,
-				__RW_MGR_MRS2_MIRR);
+			IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_MRS2_MIRR);
 			delay_for_n_mem_clocks(4);
 			set_jump_as_return();
-			IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0,
-				__RW_MGR_MRS3_MIRR);
+			IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_MRS3_MIRR);
 			delay_for_n_mem_clocks(4);
 			set_jump_as_return();
-			IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0,
-				__RW_MGR_MRS1_MIRR);
+			IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_MRS1_MIRR);
 			delay_for_n_mem_clocks(4);
 			set_jump_as_return();
-			IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0,
-				__RW_MGR_MRS0_USER_MIRR);
+			IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_MRS0_USER_MIRR);
 		} else {
 			set_jump_as_return();
-			IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0,
-				__RW_MGR_MRS2);
+			IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_MRS2);
 			delay_for_n_mem_clocks(4);
 			set_jump_as_return();
-			IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0,
-				__RW_MGR_MRS3);
+			IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_MRS3);
 			delay_for_n_mem_clocks(4);
 			set_jump_as_return();
-			IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0,
-				__RW_MGR_MRS1);
+			IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_MRS1);
 			delay_for_n_mem_clocks(4);
 			set_jump_as_return();
-			IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0,
-				__RW_MGR_MRS0_USER);
+			IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_MRS0_USER);
 		}
-		/* USER  need to wait tMOD (12CK or 15ns) time before issuing
-		 * other commands, but we will have plenty of NIOS cycles before
-		 * actual handoff so its okay.
-		 */
-	}
-
-}
-#endif /* DDR3 */
-
-#if DDR2
-static void rw_mgr_mem_handoff (void)
-{
-	uint32_t r;
-
-	for (r = 0; r < RW_MGR_MEM_NUMBER_OF_RANKS; r++) {
-		/* set rank */
-		set_rank_and_odt_mask(r, RW_MGR_ODT_MODE_OFF);
-
-		/* precharge all banks ... */
-
-		IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0,
-			__RW_MGR_PRECHARGE_ALL);
-
-		/* load up MR settings specified by user */
-
-		/*
-		 * FIXME BEN: for HHP, we need to add delay loops to be sure
-		 * We can check this with BFM perhaps
-		 * Likely enough delay in RW_MGR though
-		 */
-
-		IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_EMR2);
-
-		IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_EMR3);
-
-		IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_EMR);
-
-		IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_MR_USER);
-
-		/*
-		 * USER need to wait tMOD (12CK or 15ns) time before issuing
-		 * other commands,
-		 * USER but we will have plenty of NIOS cycles before actual
-		 * handoff so its okay.
-		 */
+		//USER  need to wait tMOD (12CK or 15ns) time before issuing other commands,
+		//USER  but we will have plenty of NIOS cycles before actual handoff so its okay.
 	}
-}
-#endif /* DDR2 */
-
-#if LPDDR2
-static void rw_mgr_mem_handoff (void)
-{
-	uint32_t r;
-
-	for (r = 0; r < RW_MGR_MEM_NUMBER_OF_RANKS; r++) {
-		/* set rank */
-		set_rank_and_odt_mask(r, RW_MGR_ODT_MODE_OFF);
 
-		/* precharge all banks... */
-
-		IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0,
-			__RW_MGR_PRECHARGE_ALL);
-
-		/* load up MR settings specified by user */
-
-		IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_MR1_USER);
-
-		IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_MR2);
-
-		IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_MR3);
-	}
 }
-#endif /* LPDDR2 */
 
-/*
- * performs a guaranteed read on the patterns we are going to use during a
- * read test to ensure memory works
- */
-static uint32_t rw_mgr_mem_calibrate_read_test_patterns (uint32_t rank_bgn,
-	uint32_t group, uint32_t num_tries, t_btfld *bit_chk, uint32_t all_ranks)
+//USER performs a guaranteed read on the patterns we are going to use during a read test to ensure memory works
+static uint32_t rw_mgr_mem_calibrate_read_test_patterns(uint32_t rank_bgn, uint32_t group,
+							uint32_t num_tries, t_btfld * bit_chk,
+							uint32_t all_ranks)
 {
 	uint32_t r, vg;
 	t_btfld correct_mask_vg;
 	t_btfld tmp_bit_chk;
-	uint32_t rank_end = all_ranks ? RW_MGR_MEM_NUMBER_OF_RANKS :
-		(rank_bgn + NUM_RANKS_PER_SHADOW_REG);
+	uint32_t rank_end =
+	    all_ranks ? RW_MGR_MEM_NUMBER_OF_RANKS : (rank_bgn + NUM_RANKS_PER_SHADOW_REG);
 
 	*bit_chk = param->read_correct_mask;
 	correct_mask_vg = param->read_correct_mask_vg;
 
 	for (r = rank_bgn; r < rank_end; r++) {
-		/* set rank */
+		if (param->skip_ranks[r]) {
+			//USER request to skip the rank
+
+			continue;
+		}
+		//USER set rank
 		set_rank_and_odt_mask(r, RW_MGR_ODT_MODE_READ_WRITE);
 
-		/* Load up a constant bursts of read commands */
+		//USER Load up a constant bursts of read commands
 
 		IOWR_32DIRECT(RW_MGR_LOAD_CNTR_0, 0, 0x20);
-		IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_0, 0,
-			__RW_MGR_GUARANTEED_READ);
+		IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_0, 0, __RW_MGR_GUARANTEED_READ);
 
 		IOWR_32DIRECT(RW_MGR_LOAD_CNTR_1, 0, 0x20);
-		IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_1, 0,
-			__RW_MGR_GUARANTEED_READ_CONT);
+		IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_1, 0, __RW_MGR_GUARANTEED_READ_CONT);
 
 		tmp_bit_chk = 0;
-		for (vg = RW_MGR_MEM_VIRTUAL_GROUPS_PER_READ_DQS-1; ; vg--) {
-			/* reset the fifos to get pointers to known state */
+		for (vg = RW_MGR_MEM_VIRTUAL_GROUPS_PER_READ_DQS - 1;; vg--) {
+			//USER reset the fifos to get pointers to known state
 
 			IOWR_32DIRECT(PHY_MGR_CMD_FIFO_RESET, 0, 0);
 			IOWR_32DIRECT(RW_MGR_RESET_READ_DATAPATH, 0, 0);
 
-			tmp_bit_chk = tmp_bit_chk << (RW_MGR_MEM_DQ_PER_READ_DQS
-				/ RW_MGR_MEM_VIRTUAL_GROUPS_PER_READ_DQS);
+			tmp_bit_chk =
+			    tmp_bit_chk << (RW_MGR_MEM_DQ_PER_READ_DQS /
+					    RW_MGR_MEM_VIRTUAL_GROUPS_PER_READ_DQS);
 
 			IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP,
-				((group*RW_MGR_MEM_VIRTUAL_GROUPS_PER_READ_DQS +
-					vg) << 2), __RW_MGR_GUARANTEED_READ);
-			tmp_bit_chk = tmp_bit_chk | (correct_mask_vg &
-				~(IORD_32DIRECT(BASE_RW_MGR, 0)));
+				      ((group * RW_MGR_MEM_VIRTUAL_GROUPS_PER_READ_DQS + vg) << 2),
+				      __RW_MGR_GUARANTEED_READ);
+			tmp_bit_chk =
+			    tmp_bit_chk | (correct_mask_vg & ~(IORD_32DIRECT(BASE_RW_MGR, 0)));
 
-			if (vg == 0)
+			if (vg == 0) {
 				break;
+			}
 		}
 		*bit_chk &= tmp_bit_chk;
 	}
 
-	IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, (group << 2),
-		__RW_MGR_CLEAR_DQS_ENABLE);
+	IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, (group << 2), __RW_MGR_CLEAR_DQS_ENABLE);
 
 	set_rank_and_odt_mask(0, RW_MGR_ODT_MODE_OFF);
-	pr_debug("test_load_patterns(%u,ALL) => (%u == %u) => %u\n",
-		group, *bit_chk, param->read_correct_mask,
-		(*bit_chk == param->read_correct_mask));
+	DPRINT(2, "test_load_patterns(%lu,ALL) => (%lu == %lu) => %lu", group, *bit_chk,
+	       param->read_correct_mask, (long unsigned int)(*bit_chk == param->read_correct_mask));
 	return (*bit_chk == param->read_correct_mask);
 }
 
-static uint32_t rw_mgr_mem_calibrate_read_test_patterns_all_ranks
-	(uint32_t group, uint32_t num_tries, t_btfld *bit_chk)
+static uint32_t rw_mgr_mem_calibrate_read_test_patterns_all_ranks(uint32_t group,
+								  uint32_t num_tries,
+								  t_btfld * bit_chk)
 {
 	if (rw_mgr_mem_calibrate_read_test_patterns(0, group, num_tries, bit_chk, 1)) {
 		return 1;
 	} else {
-		/* case:139851 - if guaranteed read fails, we can retry using
-		 * different dqs enable phases. It is possible that with the
-		 * initial phase, dqs enable is asserted/deasserted too close
-		 * to an dqs edge, truncating the read burst.
-		 */
+		// case:139851 - if guaranteed read fails, we can retry using different dqs enable phases.
+		// It is possible that with the initial phase, dqs enable is asserted/deasserted too close 
+		// to an dqs edge, truncating the read burst.
 		uint32_t p;
 		for (p = 0; p <= IO_DQS_EN_PHASE_MAX; p++) {
-			scc_mgr_set_dqs_en_phase_all_ranks (group, p);
-			if (rw_mgr_mem_calibrate_read_test_patterns(0,
-				group, num_tries, bit_chk, 1)) {
+			scc_mgr_set_dqs_en_phase_all_ranks(group, p);
+			if (rw_mgr_mem_calibrate_read_test_patterns
+			    (0, group, num_tries, bit_chk, 1)) {
 				return 1;
 			}
 		}
-
 		return 0;
 	}
 }
 
-/* load up the patterns we are going to use during a read test */
-static void rw_mgr_mem_calibrate_read_load_patterns (uint32_t rank_bgn,
-	uint32_t all_ranks)
+//USER load up the patterns we are going to use during a read test 
+static void rw_mgr_mem_calibrate_read_load_patterns(uint32_t rank_bgn, uint32_t all_ranks)
 {
 	uint32_t r;
-	uint32_t rank_end = all_ranks ? RW_MGR_MEM_NUMBER_OF_RANKS :
-		(rank_bgn + NUM_RANKS_PER_SHADOW_REG);
+	uint32_t rank_end =
+	    all_ranks ? RW_MGR_MEM_NUMBER_OF_RANKS : (rank_bgn + NUM_RANKS_PER_SHADOW_REG);
+
+	TRACE_FUNC();
 
 	for (r = rank_bgn; r < rank_end; r++) {
-		/* set rank */
+		if (param->skip_ranks[r]) {
+			//USER request to skip the rank
+
+			continue;
+		}
+		//USER set rank
 		set_rank_and_odt_mask(r, RW_MGR_ODT_MODE_READ_WRITE);
 
-		/* Load up a constant bursts */
+		//USER Load up a constant bursts
 
 		IOWR_32DIRECT(RW_MGR_LOAD_CNTR_0, 0, 0x20);
-		IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_0, 0,
-			__RW_MGR_GUARANTEED_WRITE_WAIT0);
+		IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_0, 0, __RW_MGR_GUARANTEED_WRITE_WAIT0);
 
 		IOWR_32DIRECT(RW_MGR_LOAD_CNTR_1, 0, 0x20);
-		IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_1, 0,
-			__RW_MGR_GUARANTEED_WRITE_WAIT1);
+		IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_1, 0, __RW_MGR_GUARANTEED_WRITE_WAIT1);
 
-#if QUARTER_RATE
-		IOWR_32DIRECT(RW_MGR_LOAD_CNTR_2, 0, 0x01);
-#endif
-#if HALF_RATE
-		IOWR_32DIRECT(RW_MGR_LOAD_CNTR_2, 0, 0x02);
-#endif
-#if FULL_RATE
 		IOWR_32DIRECT(RW_MGR_LOAD_CNTR_2, 0, 0x04);
-#endif
-		IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_2, 0,
-			__RW_MGR_GUARANTEED_WRITE_WAIT2);
+		IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_2, 0, __RW_MGR_GUARANTEED_WRITE_WAIT2);
 
-#if QUARTER_RATE
-		IOWR_32DIRECT(RW_MGR_LOAD_CNTR_3, 0, 0x01);
-#endif
-#if HALF_RATE
-		IOWR_32DIRECT(RW_MGR_LOAD_CNTR_3, 0, 0x02);
-#endif
-#if FULL_RATE
 		IOWR_32DIRECT(RW_MGR_LOAD_CNTR_3, 0, 0x04);
-#endif
-		IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_3, 0,
-			__RW_MGR_GUARANTEED_WRITE_WAIT3);
+		IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_3, 0, __RW_MGR_GUARANTEED_WRITE_WAIT3);
 
-		IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0,
-			__RW_MGR_GUARANTEED_WRITE);
+		IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_GUARANTEED_WRITE);
 	}
 
 	set_rank_and_odt_mask(0, RW_MGR_ODT_MODE_OFF);
 }
 
-static void rw_mgr_mem_calibrate_read_load_patterns_all_ranks (void)
+static inline void rw_mgr_mem_calibrate_read_load_patterns_all_ranks(void)
 {
-	rw_mgr_mem_calibrate_read_load_patterns (0, 1);
+	rw_mgr_mem_calibrate_read_load_patterns(0, 1);
 }
 
-/*
- * try a read and see if it returns correct data back. has dummy reads
- * inserted into the mix used to align dqs enable. has more thorough checks
- * than the regular read test.
- */
-
-static uint32_t rw_mgr_mem_calibrate_read_test (uint32_t rank_bgn, uint32_t group,
-	uint32_t num_tries, uint32_t all_correct, t_btfld *bit_chk,
-	uint32_t all_groups, uint32_t all_ranks)
+// pe checkout pattern for harden managers
+//void pe_checkout_pattern (void)
+//{
+//    // test RW manager
+//    
+//    // do some reads to check load buffer
+//      IOWR_32DIRECT (RW_MGR_LOAD_CNTR_1, 0, 0x0);
+//      IOWR_32DIRECT (RW_MGR_LOAD_JUMP_ADD_1, 0, __RW_MGR_READ_B2B_WAIT1);
+//
+//      IOWR_32DIRECT (RW_MGR_LOAD_CNTR_2, 0, 0x0);
+//      IOWR_32DIRECT (RW_MGR_LOAD_JUMP_ADD_2, 0, __RW_MGR_READ_B2B_WAIT2);
+//              
+//      IOWR_32DIRECT (RW_MGR_LOAD_CNTR_0, 0, 0x0);
+//      IOWR_32DIRECT (RW_MGR_LOAD_JUMP_ADD_0, 0, __RW_MGR_READ_B2B);
+//      
+//      IOWR_32DIRECT (RW_MGR_LOAD_CNTR_3, 0, 0x0);
+//      IOWR_32DIRECT (RW_MGR_LOAD_JUMP_ADD_3, 0, __RW_MGR_READ_B2B);
+//      
+//      // clear error word
+//      IOWR_32DIRECT (RW_MGR_RESET_READ_DATAPATH, 0, 0);
+//      
+//      IOWR_32DIRECT (RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_READ_B2B);
+//      
+//      uint32_t readdata;
+//      
+//      // read error word
+//      readdata = IORD_32DIRECT(BASE_RW_MGR, 0);
+//      
+//      // read DI buffer
+//      readdata = IORD_32DIRECT(RW_MGR_DI_BASE + 0*4, 0);
+//      readdata = IORD_32DIRECT(RW_MGR_DI_BASE + 1*4, 0);
+//      readdata = IORD_32DIRECT(RW_MGR_DI_BASE + 2*4, 0);
+//      readdata = IORD_32DIRECT(RW_MGR_DI_BASE + 3*4, 0);
+//      
+//      IOWR_32DIRECT (RW_MGR_LOAD_CNTR_1, 0, 0x0);
+//      IOWR_32DIRECT (RW_MGR_LOAD_JUMP_ADD_1, 0, __RW_MGR_READ_B2B_WAIT1);
+//
+//      IOWR_32DIRECT (RW_MGR_LOAD_CNTR_2, 0, 0x0);
+//      IOWR_32DIRECT (RW_MGR_LOAD_JUMP_ADD_2, 0, __RW_MGR_READ_B2B_WAIT2);
+//              
+//      IOWR_32DIRECT (RW_MGR_LOAD_CNTR_0, 0, 0x0);
+//      IOWR_32DIRECT (RW_MGR_LOAD_JUMP_ADD_0, 0, __RW_MGR_READ_B2B);
+//      
+//      IOWR_32DIRECT (RW_MGR_LOAD_CNTR_3, 0, 0x0);
+//      IOWR_32DIRECT (RW_MGR_LOAD_JUMP_ADD_3, 0, __RW_MGR_READ_B2B);
+//      
+//      // clear error word
+//      IOWR_32DIRECT (RW_MGR_RESET_READ_DATAPATH, 0, 0);
+//      
+//      // do read
+//      IOWR_32DIRECT (RW_MGR_LOOPBACK_MODE, 0, __RW_MGR_READ_B2B);
+//      
+//      // read error word
+//      readdata = IORD_32DIRECT(BASE_RW_MGR, 0);
+//      
+//      // error word should be 0x00
+//      
+//      // read DI buffer
+//      readdata = IORD_32DIRECT(RW_MGR_DI_BASE + 0*4, 0);
+//      readdata = IORD_32DIRECT(RW_MGR_DI_BASE + 1*4, 0);
+//      readdata = IORD_32DIRECT(RW_MGR_DI_BASE + 2*4, 0);
+//      readdata = IORD_32DIRECT(RW_MGR_DI_BASE + 3*4, 0);
+//      
+//      // clear error word
+//      IOWR_32DIRECT (RW_MGR_RESET_READ_DATAPATH, 0, 0);
+//      
+//      // do dm read   
+//      IOWR_32DIRECT (RW_MGR_LOOPBACK_MODE, 0, __RW_MGR_LFSR_WR_RD_DM_BANK_0_WL_1);
+//      
+//      // read error word
+//      readdata = IORD_32DIRECT(BASE_RW_MGR, 0);
+//      
+//      // error word should be ff
+//      
+//      // read DI buffer
+//      readdata = IORD_32DIRECT(RW_MGR_DI_BASE + 0*4, 0);
+//      readdata = IORD_32DIRECT(RW_MGR_DI_BASE + 1*4, 0);
+//      readdata = IORD_32DIRECT(RW_MGR_DI_BASE + 2*4, 0);
+//      readdata = IORD_32DIRECT(RW_MGR_DI_BASE + 3*4, 0);
+//      
+//      // exit loopback mode
+//      IOWR_32DIRECT (BASE_RW_MGR, 0, __RW_MGR_IDLE_LOOP2);
+//      
+//      // start of phy manager access
+//      
+//      readdata = IORD_32DIRECT (PHY_MGR_MAX_RLAT_WIDTH, 0);
+//      readdata = IORD_32DIRECT (PHY_MGR_MAX_AFI_WLAT_WIDTH, 0);
+//      readdata = IORD_32DIRECT (PHY_MGR_MAX_AFI_RLAT_WIDTH, 0);
+//      readdata = IORD_32DIRECT (PHY_MGR_CALIB_SKIP_STEPS, 0);
+//      readdata = IORD_32DIRECT (PHY_MGR_CALIB_VFIFO_OFFSET, 0);       
+//      readdata = IORD_32DIRECT (PHY_MGR_CALIB_LFIFO_OFFSET, 0);
+//      
+//      // start of data manager test
+//      
+//      readdata = IORD_32DIRECT (DATA_MGR_DRAM_CFG         , 0);
+//      readdata = IORD_32DIRECT (DATA_MGR_MEM_T_WL         , 0);
+//      readdata = IORD_32DIRECT (DATA_MGR_MEM_T_ADD    , 0);
+//      readdata = IORD_32DIRECT (DATA_MGR_MEM_T_RL         , 0);
+//      readdata = IORD_32DIRECT (DATA_MGR_MEM_T_RFC    , 0);
+//      readdata = IORD_32DIRECT (DATA_MGR_MEM_T_REFI   , 0);
+//      readdata = IORD_32DIRECT (DATA_MGR_MEM_T_WR         , 0);
+//      readdata = IORD_32DIRECT (DATA_MGR_MEM_T_MRD    , 0);
+//      readdata = IORD_32DIRECT (DATA_MGR_COL_WIDTH    , 0);
+//      readdata = IORD_32DIRECT (DATA_MGR_ROW_WIDTH    , 0);
+//      readdata = IORD_32DIRECT (DATA_MGR_BANK_WIDTH   , 0);
+//      readdata = IORD_32DIRECT (DATA_MGR_CS_WIDTH         , 0);
+//      readdata = IORD_32DIRECT (DATA_MGR_ITF_WIDTH    , 0);
+//      readdata = IORD_32DIRECT (DATA_MGR_DVC_WIDTH    , 0);
+//      
+//}
+
+//USER  try a read and see if it returns correct data back. has dummy reads inserted into the mix
+//USER  used to align dqs enable. has more thorough checks than the regular read test.
+
+static uint32_t rw_mgr_mem_calibrate_read_test(uint32_t rank_bgn, uint32_t group,
+					       uint32_t num_tries, uint32_t all_correct,
+					       t_btfld * bit_chk, uint32_t all_groups,
+					       uint32_t all_ranks)
 {
 	uint32_t r, vg;
-	uint32_t quick_read_mode;
 	t_btfld correct_mask_vg;
 	t_btfld tmp_bit_chk;
-	uint32_t rank_end = all_ranks ? RW_MGR_MEM_NUMBER_OF_RANKS :
-		(rank_bgn + NUM_RANKS_PER_SHADOW_REG);
-
+	uint32_t rank_end =
+	    all_ranks ? RW_MGR_MEM_NUMBER_OF_RANKS : (rank_bgn + NUM_RANKS_PER_SHADOW_REG);
 
 	*bit_chk = param->read_correct_mask;
 	correct_mask_vg = param->read_correct_mask_vg;
 
-	quick_read_mode = (((STATIC_CALIB_STEPS) &
-		CALIB_SKIP_DELAY_SWEEPS) && ENABLE_SUPER_QUICK_CALIBRATION) ||
-		BFM_MODE;
+	uint32_t quick_read_mode = (((STATIC_CALIB_STEPS) & CALIB_SKIP_DELAY_SWEEPS)
+				    && ENABLE_SUPER_QUICK_CALIBRATION) || BFM_MODE;
 
 	for (r = rank_bgn; r < rank_end; r++) {
-		/* set rank */
+		if (param->skip_ranks[r]) {
+			//USER request to skip the rank
+
+			continue;
+		}
+		//USER set rank
 		set_rank_and_odt_mask(r, RW_MGR_ODT_MODE_READ_WRITE);
 
 		IOWR_32DIRECT(RW_MGR_LOAD_CNTR_1, 0, 0x10);
-		IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_1, 0,
-			__RW_MGR_READ_B2B_WAIT1);
+		IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_1, 0, __RW_MGR_READ_B2B_WAIT1);
 		IOWR_32DIRECT(RW_MGR_LOAD_CNTR_2, 0, 0x10);
-		IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_2, 0,
-			__RW_MGR_READ_B2B_WAIT2);
+		IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_2, 0, __RW_MGR_READ_B2B_WAIT2);
 
 		if (quick_read_mode) {
-			IOWR_32DIRECT(RW_MGR_LOAD_CNTR_0, 0, 0x1);
-			/* need at least two (1+1) reads to capture failures */
+			IOWR_32DIRECT(RW_MGR_LOAD_CNTR_0, 0, 0x1);	/* need at least two (1+1) reads to capture failures */
 		} else if (all_groups) {
 			IOWR_32DIRECT(RW_MGR_LOAD_CNTR_0, 0, 0x06);
 		} else {
@@ -1837,29 +1792,29 @@ static uint32_t rw_mgr_mem_calibrate_read_test (uint32_t rank_bgn, uint32_t grou
 		IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_0, 0, __RW_MGR_READ_B2B);
 		if (all_groups) {
 			IOWR_32DIRECT(RW_MGR_LOAD_CNTR_3, 0,
-				RW_MGR_MEM_IF_READ_DQS_WIDTH *
-				RW_MGR_MEM_VIRTUAL_GROUPS_PER_READ_DQS - 1);
+				      RW_MGR_MEM_IF_READ_DQS_WIDTH *
+				      RW_MGR_MEM_VIRTUAL_GROUPS_PER_READ_DQS - 1);
 		} else {
 			IOWR_32DIRECT(RW_MGR_LOAD_CNTR_3, 0, 0x0);
 		}
 		IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_3, 0, __RW_MGR_READ_B2B);
 
 		tmp_bit_chk = 0;
-		for (vg = RW_MGR_MEM_VIRTUAL_GROUPS_PER_READ_DQS-1; ; vg--) {
-			/* reset the fifos to get pointers to known state */
+		for (vg = RW_MGR_MEM_VIRTUAL_GROUPS_PER_READ_DQS - 1;; vg--) {
+			//USER reset the fifos to get pointers to known state 
 
 			IOWR_32DIRECT(PHY_MGR_CMD_FIFO_RESET, 0, 0);
 			IOWR_32DIRECT(RW_MGR_RESET_READ_DATAPATH, 0, 0);
 
-			tmp_bit_chk = tmp_bit_chk << (RW_MGR_MEM_DQ_PER_READ_DQS
-				/ RW_MGR_MEM_VIRTUAL_GROUPS_PER_READ_DQS);
+			tmp_bit_chk =
+			    tmp_bit_chk << (RW_MGR_MEM_DQ_PER_READ_DQS /
+					    RW_MGR_MEM_VIRTUAL_GROUPS_PER_READ_DQS);
 
-			IOWR_32DIRECT(all_groups ? RW_MGR_RUN_ALL_GROUPS :
-				RW_MGR_RUN_SINGLE_GROUP, ((group *
-				RW_MGR_MEM_VIRTUAL_GROUPS_PER_READ_DQS+vg)
-				<< 2), __RW_MGR_READ_B2B);
-			tmp_bit_chk = tmp_bit_chk | (correct_mask_vg &
-				~(IORD_32DIRECT(BASE_RW_MGR, 0)));
+			IOWR_32DIRECT(all_groups ? RW_MGR_RUN_ALL_GROUPS : RW_MGR_RUN_SINGLE_GROUP,
+				      ((group * RW_MGR_MEM_VIRTUAL_GROUPS_PER_READ_DQS + vg) << 2),
+				      __RW_MGR_READ_B2B);
+			tmp_bit_chk =
+			    tmp_bit_chk | (correct_mask_vg & ~(IORD_32DIRECT(BASE_RW_MGR, 0)));
 
 			if (vg == 0) {
 				break;
@@ -1868,36 +1823,34 @@ static uint32_t rw_mgr_mem_calibrate_read_test (uint32_t rank_bgn, uint32_t grou
 		*bit_chk &= tmp_bit_chk;
 	}
 
-	IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, (group << 2),
-		__RW_MGR_CLEAR_DQS_ENABLE);
+	IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, (group << 2), __RW_MGR_CLEAR_DQS_ENABLE);
 
 	if (all_correct) {
 		set_rank_and_odt_mask(0, RW_MGR_ODT_MODE_OFF);
-		pr_debug("read_test(%u,ALL,%u) => (%u == %u) => %u\n",
-			group, all_groups, *bit_chk, param->read_correct_mask,
-			(*bit_chk ==
-			param->read_correct_mask));
+		DPRINT(2, "read_test(%lu,ALL,%lu) => (%lu == %lu) => %lu", group, all_groups,
+		       *bit_chk, param->read_correct_mask,
+		       (long unsigned int)(*bit_chk == param->read_correct_mask));
 		return (*bit_chk == param->read_correct_mask);
-	} else	{
+	} else {
 		set_rank_and_odt_mask(0, RW_MGR_ODT_MODE_OFF);
-		pr_debug("read_test(%u,ONE,%u) => (%u != %u) => %u\n",
-			group, all_groups, *bit_chk, 0,
-			(*bit_chk != 0x00));
+		DPRINT(2, "read_test(%lu,ONE,%lu) => (%lu != %lu) => %lu", group, all_groups,
+		       *bit_chk, (long unsigned int)0, (long unsigned int)(*bit_chk != 0x00));
 		return (*bit_chk != 0x00);
 	}
 }
 
-static uint32_t rw_mgr_mem_calibrate_read_test_all_ranks (uint32_t group,
-	uint32_t num_tries, uint32_t all_correct, t_btfld *bit_chk,
-	uint32_t all_groups)
+static inline uint32_t rw_mgr_mem_calibrate_read_test_all_ranks(uint32_t group, uint32_t num_tries,
+								uint32_t all_correct,
+								t_btfld * bit_chk,
+								uint32_t all_groups)
 {
-	return rw_mgr_mem_calibrate_read_test (0, group, num_tries, all_correct,
-		bit_chk, all_groups, 1);
+	return rw_mgr_mem_calibrate_read_test(0, group, num_tries, all_correct, bit_chk, all_groups,
+					      1);
 }
 
-static void rw_mgr_incr_vfifo(uint32_t grp, uint32_t *v)
+static void rw_mgr_incr_vfifo(uint32_t grp, uint32_t * v)
 {
-	/* fiddle with FIFO */
+	//USER fiddle with FIFO 
 	if (HARD_PHY) {
 		IOWR_32DIRECT(PHY_MGR_CMD_INC_VFIFO_HARD_PHY, 0, grp);
 	} else if (QUARTER_RATE_MODE && !HARD_VFIFO) {
@@ -1911,8 +1864,7 @@ static void rw_mgr_incr_vfifo(uint32_t grp, uint32_t *v)
 			IOWR_32DIRECT(PHY_MGR_CMD_INC_VFIFO_FR, 0, grp);
 		}
 	} else if (HARD_VFIFO) {
-		/* Arria V & Cyclone V have a hard full-rate VFIFO that only
-		has a single incr signal */
+		// Arria V & Cyclone V have a hard full-rate VFIFO that only has a single incr signal
 		IOWR_32DIRECT(PHY_MGR_CMD_INC_VFIFO_FR, 0, grp);
 	} else {
 		if (!HALF_RATE_MODE || (*v & 1) == 1) {
@@ -1923,22 +1875,44 @@ static void rw_mgr_incr_vfifo(uint32_t grp, uint32_t *v)
 	}
 
 	(*v)++;
+	BFM_INC_VFIFO;
+}
+
+//Used in quick cal to properly loop through the duplicated VFIFOs in AV QDRII/RLDRAM
+static inline void rw_mgr_incr_vfifo_all(uint32_t grp, uint32_t * v)
+{
+#if VFIFO_CONTROL_WIDTH_PER_DQS == 1
+	rw_mgr_incr_vfifo(grp, v);
+#else
+	uint32_t i;
+	for (i = 0; i < VFIFO_CONTROL_WIDTH_PER_DQS; i++) {
+		rw_mgr_incr_vfifo(grp * VFIFO_CONTROL_WIDTH_PER_DQS + i, v);
+		if (i != 0) {
+			(*v)--;
+		}
+	}
+#endif
 }
 
-static void rw_mgr_decr_vfifo(uint32_t grp, uint32_t *v)
+static void rw_mgr_decr_vfifo(uint32_t grp, uint32_t * v)
 {
 
 	uint32_t i;
 
-	for (i = 0; i < VFIFO_SIZE-1; i++) {
+	for (i = 0; i < VFIFO_SIZE - 1; i++) {
 		rw_mgr_incr_vfifo(grp, v);
 	}
 }
 
-/* find a good dqs enable to use */
-static uint32_t rw_mgr_mem_calibrate_vfifo_find_dqs_en_phase (uint32_t grp)
+//USER find a good dqs enable to use 
+
+#if NEWVERSION_DQSEN
+
+// Navid's version 
+
+static uint32_t rw_mgr_mem_calibrate_vfifo_find_dqs_en_phase(uint32_t grp)
 {
-	uint32_t i, d, v, p;
+	uint32_t i, d, v, p, sr;
 	uint32_t max_working_cnt;
 	uint32_t fail_cnt;
 	t_btfld bit_chk;
@@ -1948,6 +1922,8 @@ static uint32_t rw_mgr_mem_calibrate_vfifo_find_dqs_en_phase (uint32_t grp)
 	uint32_t test_status;
 	uint32_t found_passing_read, found_failing_read, initial_failing_dtap;
 
+	TRACE_FUNC("%lu", grp);
+	BFM_STAGE("find_dqs_en_phase");
 	ALTERA_ASSERT(grp < RW_MGR_MEM_IF_READ_DQS_WIDTH);
 
 	reg_file_set_sub_stage(CAL_SUBSTAGE_VFIFO_CENTER);
@@ -1957,8 +1933,8 @@ static uint32_t rw_mgr_mem_calibrate_vfifo_find_dqs_en_phase (uint32_t grp)
 
 	fail_cnt = 0;
 
-	/* ************************************************************** */
-	/* * Step 0 : Determine number of delay taps for each phase tap * */
+	//USER **************************************************************
+	//USER * Step 0 : Determine number of delay taps for each phase tap *
 
 	dtaps_per_ptap = 0;
 	tmp_delay = 0;
@@ -1969,51 +1945,52 @@ static uint32_t rw_mgr_mem_calibrate_vfifo_find_dqs_en_phase (uint32_t grp)
 	dtaps_per_ptap--;
 	ALTERA_ASSERT(dtaps_per_ptap <= IO_DQS_EN_DELAY_MAX);
 	tmp_delay = 0;
+	TCLRPT_SET(debug_summary_report->computed_dtap_per_ptap, dtaps_per_ptap);
+
+	// VFIFO sweep
 
-	/* ********************************************************* */
-	/* * Step 1 : First push vfifo until we get a failing read * */
-	for (v = 0; v < VFIFO_SIZE; ) {
-		pr_debug("find_dqs_en_phase: vfifo %u\n", vfifo_idx);
-		test_status = rw_mgr_mem_calibrate_read_test_all_ranks
-			(grp, 1, PASS_ONE_BIT, &bit_chk, 0);
+	//USER *********************************************************
+	//USER * Step 1 : First push vfifo until we get a failing read *
+	for (v = 0; v < VFIFO_SIZE;) {
+		DPRINT(2, "find_dqs_en_phase: vfifo %lu", BFM_GBL_GET(vfifo_idx));
+		test_status =
+		    rw_mgr_mem_calibrate_read_test_all_ranks(grp, 1, PASS_ONE_BIT, &bit_chk, 0);
 		if (!test_status) {
 			fail_cnt++;
 
-			if (fail_cnt == 2)
+			if (fail_cnt == 2) {
 				break;
+			}
 		}
-
-		/* fiddle with FIFO */
+		//USER fiddle with FIFO
 		rw_mgr_incr_vfifo(grp, &v);
 	}
 
 	if (v >= VFIFO_SIZE) {
-		/* no failing read found!! Something must have gone wrong */
-		pr_debug("find_dqs_en_phase: vfifo failed\n");
+		//USER no failing read found!! Something must have gone wrong
+		DPRINT(2, "find_dqs_en_phase: vfifo failed");
 		return 0;
 	}
 
 	max_working_cnt = 0;
 
-	/* ******************************************************** */
-	/* * step 2: find first working phase, increment in ptaps * */
+	//USER ********************************************************
+	//USER * step 2: find first working phase, increment in ptaps *
 	found_begin = 0;
 	work_bgn = 0;
-	for (d = 0; d <= dtaps_per_ptap; d++, tmp_delay +=
-		IO_DELAY_PER_DQS_EN_DCHAIN_TAP) {
+	for (d = 0; d <= dtaps_per_ptap; d++, tmp_delay += IO_DELAY_PER_DQS_EN_DCHAIN_TAP) {
 		work_bgn = tmp_delay;
 		scc_mgr_set_dqs_en_delay_all_ranks(grp, d);
 
 		for (i = 0; i < VFIFO_SIZE; i++) {
-			for (p = 0; p <= IO_DQS_EN_PHASE_MAX; p++, work_bgn +=
-				IO_DELAY_PER_OPA_TAP) {
-				pr_debug("find_dqs_en_phase: begin: vfifo=%u"
-					" ptap=%u dtap=%u\n", vfifo_idx, p, d);
+			for (p = 0; p <= IO_DQS_EN_PHASE_MAX; p++, work_bgn += IO_DELAY_PER_OPA_TAP) {
+				DPRINT(2, "find_dqs_en_phase: begin: vfifo=%lu ptap=%lu dtap=%lu",
+				       BFM_GBL_GET(vfifo_idx), p, d);
 				scc_mgr_set_dqs_en_phase_all_ranks(grp, p);
 
 				test_status =
-				rw_mgr_mem_calibrate_read_test_all_ranks
-				(grp, 1, PASS_ONE_BIT, &bit_chk, 0);
+				    rw_mgr_mem_calibrate_read_test_all_ranks(grp, 1, PASS_ONE_BIT,
+									     &bit_chk, 0);
 
 				if (test_status) {
 					max_working_cnt = 1;
@@ -2022,38 +1999,39 @@ static uint32_t rw_mgr_mem_calibrate_vfifo_find_dqs_en_phase (uint32_t grp)
 				}
 			}
 
-			if (found_begin)
+			if (found_begin) {
 				break;
+			}
 
 			if (p > IO_DQS_EN_PHASE_MAX) {
-				/* fiddle with FIFO */
+				//USER fiddle with FIFO
 				rw_mgr_incr_vfifo(grp, &v);
 			}
 		}
 
-		if (found_begin)
+		if (found_begin) {
 			break;
+		}
 	}
 
 	if (i >= VFIFO_SIZE) {
-		/* cannot find working solution */
-		pr_debug("find_dqs_en_phase: no vfifo/ptap/dtap\n");
+		//USER cannot find working solution 
+		DPRINT(2, "find_dqs_en_phase: no vfifo/ptap/dtap");
 		return 0;
 	}
 
 	work_end = work_bgn;
 
-	/*  If d is 0 then the working window covers a phase tap and
-	we can follow the old procedure otherwise, we've found the beginning,
-	and we need to increment the dtaps until we find the end */
+	//USER  If d is 0 then the working window covers a phase tap and we can follow the old procedure
+	//USER  otherwise, we've found the beginning, and we need to increment the dtaps until we find the end 
 	if (d == 0) {
-		/* ********************************************************* */
-		/* * step 3a: if we have room, back off by one and
-		increment in dtaps * */
+		//USER ********************************************************************
+		//USER * step 3a: if we have room, back off by one and increment in dtaps *
+		COV(EN_PHASE_PTAP_OVERLAP);
 
-		/* Special case code for backing up a phase */
+		//USER Special case code for backing up a phase 
 		if (p == 0) {
-			p = IO_DQS_EN_PHASE_MAX ;
+			p = IO_DQS_EN_PHASE_MAX;
 			rw_mgr_decr_vfifo(grp, &v);
 		} else {
 			p = p - 1;
@@ -2063,28 +2041,55 @@ static uint32_t rw_mgr_mem_calibrate_vfifo_find_dqs_en_phase (uint32_t grp)
 
 		found_begin = 0;
 		for (d = 0; d <= IO_DQS_EN_DELAY_MAX && tmp_delay < work_bgn;
-			d++, tmp_delay += IO_DELAY_PER_DQS_EN_DCHAIN_TAP) {
+		     d++, tmp_delay += IO_DELAY_PER_DQS_EN_DCHAIN_TAP) {
 
-			pr_debug("find_dqs_en_phase: begin-2: vfifo=%u "
-				"ptap=%u dtap=%u\n", vfifo_idx, p, d);
+			DPRINT(2, "find_dqs_en_phase: begin-2: vfifo=%lu ptap=%lu dtap=%lu",
+			       BFM_GBL_GET(vfifo_idx), p, d);
 
 			scc_mgr_set_dqs_en_delay_all_ranks(grp, d);
 
-			if (rw_mgr_mem_calibrate_read_test_all_ranks (grp, 1,
-				PASS_ONE_BIT, &bit_chk, 0)) {
+			if (rw_mgr_mem_calibrate_read_test_all_ranks
+			    (grp, 1, PASS_ONE_BIT, &bit_chk, 0)) {
 				found_begin = 1;
 				work_bgn = tmp_delay;
 				break;
 			}
 		}
 
-		/* We have found a working dtap before the ptap found above */
+		// Record the debug data
+		// Currently dqsen is the same for all ranks
+		for (sr = 0; sr < NUM_SHADOW_REGS; sr++) {
+			TCLRPT_SET(debug_cal_report->cal_dqsen_margins[sr][grp].work_begin,
+				   work_bgn);
+			if (found_begin) {
+				TCLRPT_SET(debug_cal_report->cal_dqsen_margins[sr][grp].phase_begin,
+					   p);
+				TCLRPT_SET(debug_cal_report->cal_dqsen_margins[sr][grp].delay_begin,
+					   d);
+				TCLRPT_SET(debug_cal_report->cal_dqsen_margins[sr][grp].vfifo_begin,
+					   v % VFIFO_SIZE);
+			} else if (p == IO_DQS_EN_PHASE_MAX) {
+				TCLRPT_SET(debug_cal_report->cal_dqsen_margins[sr][grp].phase_begin,
+					   0);
+				TCLRPT_SET(debug_cal_report->cal_dqsen_margins[sr][grp].delay_begin,
+					   0);
+				TCLRPT_SET(debug_cal_report->cal_dqsen_margins[sr][grp].vfifo_begin,
+					   (v + 1) % VFIFO_SIZE);
+			} else {
+				TCLRPT_SET(debug_cal_report->cal_dqsen_margins[sr][grp].phase_begin,
+					   p + 1);
+				TCLRPT_SET(debug_cal_report->cal_dqsen_margins[sr][grp].delay_begin,
+					   0);
+				TCLRPT_SET(debug_cal_report->cal_dqsen_margins[sr][grp].vfifo_begin,
+					   v % VFIFO_SIZE);
+			}
+		}
+
+		//USER We have found a working dtap before the ptap found above 
 		if (found_begin == 1) {
 			max_working_cnt++;
 		}
-
-		/* Restore VFIFO to old state before we decremented it
-		(if needed) */
+		//USER Restore VFIFO to old state before we decremented it (if needed)
 		p = p + 1;
 		if (p > IO_DQS_EN_PHASE_MAX) {
 			p = 0;
@@ -2093,54 +2098,52 @@ static uint32_t rw_mgr_mem_calibrate_vfifo_find_dqs_en_phase (uint32_t grp)
 
 		scc_mgr_set_dqs_en_delay_all_ranks(grp, 0);
 
-		/* ********************************************************* */
-		/* * step 4a: go forward from working phase to non working
-		phase, increment in ptaps * */
+		//USER ***********************************************************************************
+		//USER * step 4a: go forward from working phase to non working phase, increment in ptaps *
 		p = p + 1;
 		work_end += IO_DELAY_PER_OPA_TAP;
 		if (p > IO_DQS_EN_PHASE_MAX) {
-			/* fiddle with FIFO */
+			//USER fiddle with FIFO
 			p = 0;
 			rw_mgr_incr_vfifo(grp, &v);
 		}
 
 		found_end = 0;
 		for (; i < VFIFO_SIZE + 1; i++) {
-			for (; p <= IO_DQS_EN_PHASE_MAX; p++, work_end
-				+= IO_DELAY_PER_OPA_TAP) {
-				pr_debug("find_dqs_en_phase: end: vfifo=%u "
-					"ptap=%u dtap=%u\n", vfifo_idx, p, 0);
+			for (; p <= IO_DQS_EN_PHASE_MAX; p++, work_end += IO_DELAY_PER_OPA_TAP) {
+				DPRINT(2, "find_dqs_en_phase: end: vfifo=%lu ptap=%lu dtap=%lu",
+				       BFM_GBL_GET(vfifo_idx), p, (long unsigned int)0);
 				scc_mgr_set_dqs_en_phase_all_ranks(grp, p);
 
 				if (!rw_mgr_mem_calibrate_read_test_all_ranks
-					(grp, 1, PASS_ONE_BIT, &bit_chk, 0)) {
+				    (grp, 1, PASS_ONE_BIT, &bit_chk, 0)) {
 					found_end = 1;
 					break;
+				} else {
+					max_working_cnt++;
 				}
-
-				max_working_cnt++;
 			}
 
-			if (found_end)
+			if (found_end) {
 				break;
+			}
 
 			if (p > IO_DQS_EN_PHASE_MAX) {
-				/* fiddle with FIFO */
+				//USER fiddle with FIFO
 				rw_mgr_incr_vfifo(grp, &v);
 				p = 0;
 			}
 		}
 
 		if (i >= VFIFO_SIZE + 1) {
-			/* cannot see edge of failing read */
-			pr_debug("find_dqs_en_phase: end: failed\n");
+			//USER cannot see edge of failing read 
+			DPRINT(2, "find_dqs_en_phase: end: failed");
 			return 0;
 		}
+		//USER *********************************************************
+		//USER * step 5a:  back off one from last, increment in dtaps  *
 
-		/* ********************************************************* */
-		/* * step 5a:  back off one from last, increment in dtaps  * */
-
-		/* Special case code for backing up a phase */
+		//USER Special case code for backing up a phase 
 		if (p == 0) {
 			p = IO_DQS_EN_PHASE_MAX;
 			rw_mgr_decr_vfifo(grp, &v);
@@ -2151,134 +2154,134 @@ static uint32_t rw_mgr_mem_calibrate_vfifo_find_dqs_en_phase (uint32_t grp)
 		work_end -= IO_DELAY_PER_OPA_TAP;
 		scc_mgr_set_dqs_en_phase_all_ranks(grp, p);
 
-		/* * The actual increment of dtaps is done outside of
-		the if/else loop to share code */
+		//USER * The actual increment of dtaps is done outside of the if/else loop to share code
 		d = 0;
 
-		pr_debug("find_dqs_en_phase: found end v/p: vfifo=%u ptap=%u\n",
-				vfifo_idx, p);
+		DPRINT(2, "find_dqs_en_phase: found end v/p: vfifo=%lu ptap=%lu",
+		       BFM_GBL_GET(vfifo_idx), p);
 	} else {
 
-		/* ******************************************************* */
-		/* * step 3-5b:  Find the right edge of the window using
-		delay taps   * */
+		//USER ********************************************************************
+		//USER * step 3-5b:  Find the right edge of the window using delay taps   *             
+		COV(EN_PHASE_PTAP_NO_OVERLAP);
 
-		pr_debug("find_dqs_en_phase: begin found: vfifo=%u ptap=%u "
-			"dtap=%u begin=%u\n", vfifo_idx, p, d,
-			work_bgn);
+		DPRINT(2, "find_dqs_en_phase: begin found: vfifo=%lu ptap=%lu dtap=%lu begin=%lu",
+		       BFM_GBL_GET(vfifo_idx), p, d, work_bgn);
+		BFM_GBL_SET(dqs_enable_left_edge[grp].v, BFM_GBL_GET(vfifo_idx));
+		BFM_GBL_SET(dqs_enable_left_edge[grp].p, p);
+		BFM_GBL_SET(dqs_enable_left_edge[grp].d, d);
+		BFM_GBL_SET(dqs_enable_left_edge[grp].ps, work_bgn);
 
 		work_end = work_bgn;
 
-		/* * The actual increment of dtaps is done outside of the
-		if/else loop to share code */
+		//USER * The actual increment of dtaps is done outside of the if/else loop to share code
 
-		/* Only here to counterbalance a subtract later on which is
-		not needed if this branch of the algorithm is taken */
+		//USER Only here to counterbalance a subtract later on which is not needed if this branch
+		//USER  of the algorithm is taken 
 		max_working_cnt++;
 	}
 
-	/* The dtap increment to find the failing edge is done here */
-	for (; d <= IO_DQS_EN_DELAY_MAX; d++, work_end +=
-		IO_DELAY_PER_DQS_EN_DCHAIN_TAP) {
+	//USER The dtap increment to find the failing edge is done here
+	for (; d <= IO_DQS_EN_DELAY_MAX; d++, work_end += IO_DELAY_PER_DQS_EN_DCHAIN_TAP) {
 
-			pr_debug("find_dqs_en_phase: end-2: dtap=%u\n", d);
-			scc_mgr_set_dqs_en_delay_all_ranks(grp, d);
+		DPRINT(2, "find_dqs_en_phase: end-2: dtap=%lu", d);
+		scc_mgr_set_dqs_en_delay_all_ranks(grp, d);
 
-			if (!rw_mgr_mem_calibrate_read_test_all_ranks (grp, 1,
-				PASS_ONE_BIT, &bit_chk, 0)) {
-				break;
-			}
+		if (!rw_mgr_mem_calibrate_read_test_all_ranks(grp, 1, PASS_ONE_BIT, &bit_chk, 0)) {
+			break;
 		}
+	}
 
-	/* Go back to working dtap */
+	//USER Go back to working dtap 
 	if (d != 0) {
 		work_end -= IO_DELAY_PER_DQS_EN_DCHAIN_TAP;
 	}
 
-	pr_debug("find_dqs_en_phase: found end v/p/d: vfifo=%u ptap=%u "
-		"dtap=%u end=%u\n", vfifo_idx, p, d-1, work_end);
+	DPRINT(2, "find_dqs_en_phase: found end v/p/d: vfifo=%lu ptap=%lu dtap=%lu end=%lu",
+	       BFM_GBL_GET(vfifo_idx), p, d - 1, work_end);
+	BFM_GBL_SET(dqs_enable_right_edge[grp].v, BFM_GBL_GET(vfifo_idx));
+	BFM_GBL_SET(dqs_enable_right_edge[grp].p, p);
+	BFM_GBL_SET(dqs_enable_right_edge[grp].d, d - 1);
+	BFM_GBL_SET(dqs_enable_right_edge[grp].ps, work_end);
+
+	// Record the debug data
+	for (sr = 0; sr < NUM_SHADOW_REGS; sr++) {
+		TCLRPT_SET(debug_cal_report->cal_dqsen_margins[sr][grp].work_end, work_end);
+		TCLRPT_SET(debug_cal_report->cal_dqsen_margins[sr][grp].phase_end, p);
+		TCLRPT_SET(debug_cal_report->cal_dqsen_margins[sr][grp].delay_end, d - 1);
+		TCLRPT_SET(debug_cal_report->cal_dqsen_margins[sr][grp].vfifo_end, v % VFIFO_SIZE);
+	}
 
 	if (work_end >= work_bgn) {
-		/* we have a working range */
+		//USER we have a working range 
 	} else {
-		/* nil range */
-		pr_debug("find_dqs_en_phase: end-2: failed\n");
+		//USER nil range 
+		DPRINT(2, "find_dqs_en_phase: end-2: failed");
 		return 0;
 	}
 
-	pr_debug("find_dqs_en_phase: found range [%u,%u]\n",
-		work_bgn, work_end);
+	DPRINT(2, "find_dqs_en_phase: found range [%lu,%lu]", work_bgn, work_end);
 
-#if USE_DQS_TRACKING
-	/* *************************************************************** */
-	/*
-	 * * We need to calculate the number of dtaps that equal a ptap
-	 * * To do that we'll back up a ptap and re-find the edge of the
-	 * * window using dtaps
-	 */
+	// ***************************************************************
+	//USER * We need to calculate the number of dtaps that equal a ptap
+	//USER * To do that we'll back up a ptap and re-find the edge of the 
+	//USER * window using dtaps
 
-	pr_debug("find_dqs_en_phase: calculate dtaps_per_ptap for tracking\n");
+	DPRINT(2, "find_dqs_en_phase: calculate dtaps_per_ptap for tracking");
 
-	/* Special case code for backing up a phase */
+	//USER Special case code for backing up a phase 
 	if (p == 0) {
 		p = IO_DQS_EN_PHASE_MAX;
 		rw_mgr_decr_vfifo(grp, &v);
-		pr_debug("find_dqs_en_phase: backed up cycle/phase: "
-			"v=%u p=%u\n", vfifo_idx, p);
+		DPRINT(2, "find_dqs_en_phase: backed up cycle/phase: v=%lu p=%lu",
+		       BFM_GBL_GET(vfifo_idx), p);
 	} else {
 		p = p - 1;
-		pr_debug("find_dqs_en_phase: backed up phase only: v=%u "
-			"p=%u\n", vfifo_idx, p);
+		DPRINT(2, "find_dqs_en_phase: backed up phase only: v=%lu p=%lu",
+		       BFM_GBL_GET(vfifo_idx), p);
 	}
 
 	scc_mgr_set_dqs_en_phase_all_ranks(grp, p);
 
-	/*
-	 * Increase dtap until we first see a passing read (in case the
-	 * window is smaller than a ptap),
-	 * and then a failing read to mark the edge of the window again
-	 */
+	//USER Increase dtap until we first see a passing read (in case the window is smaller than a ptap),
+	//USER and then a failing read to mark the edge of the window again
 
-	/* Find a passing read */
-	pr_debug("find_dqs_en_phase: find passing read\n");
+	//USER Find a passing read
+	DPRINT(2, "find_dqs_en_phase: find passing read");
 	found_passing_read = 0;
 	found_failing_read = 0;
 	initial_failing_dtap = d;
 	for (; d <= IO_DQS_EN_DELAY_MAX; d++) {
-		pr_debug("find_dqs_en_phase: testing read d=%u\n", d);
+		DPRINT(2, "find_dqs_en_phase: testing read d=%lu", d);
 		scc_mgr_set_dqs_en_delay_all_ranks(grp, d);
 
-		if (rw_mgr_mem_calibrate_read_test_all_ranks (grp, 1,
-			PASS_ONE_BIT, &bit_chk, 0)) {
+		if (rw_mgr_mem_calibrate_read_test_all_ranks(grp, 1, PASS_ONE_BIT, &bit_chk, 0)) {
 			found_passing_read = 1;
 			break;
 		}
 	}
 
 	if (found_passing_read) {
-		/* Find a failing read */
-		pr_debug("find_dqs_en_phase: find failing read\n");
+		//USER Find a failing read 
+		DPRINT(2, "find_dqs_en_phase: find failing read");
 		for (d = d + 1; d <= IO_DQS_EN_DELAY_MAX; d++) {
-			pr_debug("find_dqs_en_phase: testing read d=%u\n", d);
+			DPRINT(2, "find_dqs_en_phase: testing read d=%lu", d);
 			scc_mgr_set_dqs_en_delay_all_ranks(grp, d);
 
 			if (!rw_mgr_mem_calibrate_read_test_all_ranks
-				(grp, 1, PASS_ONE_BIT, &bit_chk, 0)) {
+			    (grp, 1, PASS_ONE_BIT, &bit_chk, 0)) {
 				found_failing_read = 1;
 				break;
 			}
 		}
 	} else {
-		pr_debug("find_dqs_en_phase: failed to calculate dtaps "
-			"per ptap. Fall back on static value\n");
+		DPRINT(1,
+		       "find_dqs_en_phase: failed to calculate dtaps per ptap. Fall back on static value");
 	}
 
-	/*
-	 * The dynamically calculated dtaps_per_ptap is only valid if we
-	 * found a passing/failing read. If we didn't, it means d hit the max
-	 * (IO_DQS_EN_DELAY_MAX). Otherwise, dtaps_per_ptap retains its
-	 * statically calculated value.
-	 */
+	//USER The dynamically calculated dtaps_per_ptap is only valid if we found a passing/failing read
+	//USER If we didn't, it means d hit the max (IO_DQS_EN_DELAY_MAX).
+	//USER Otherwise, dtaps_per_ptap retains its statically calculated value.
 	if (found_passing_read && found_failing_read) {
 		dtaps_per_ptap = d - initial_failing_dtap;
 	}
@@ -2286,233 +2289,657 @@ static uint32_t rw_mgr_mem_calibrate_vfifo_find_dqs_en_phase (uint32_t grp)
 	ALTERA_ASSERT(dtaps_per_ptap <= IO_DQS_EN_DELAY_MAX);
 	IOWR_32DIRECT(REG_FILE_DTAPS_PER_PTAP, 0, dtaps_per_ptap);
 
-	pr_debug("find_dqs_en_phase: dtaps_per_ptap=%u - %u = %u\n", d,
-		initial_failing_dtap, dtaps_per_ptap);
-#endif
+	DPRINT(2, "find_dqs_en_phase: dtaps_per_ptap=%lu - %lu = %lu", d, initial_failing_dtap,
+	       dtaps_per_ptap);
 
-	/* ******************************************** */
-	/* * step 6:  Find the centre of the window   * */
+	//USER ********************************************
+	//USER * step 6:  Find the centre of the window   *
 
 	work_mid = (work_bgn + work_end) / 2;
 	tmp_delay = 0;
 
-	pr_debug("work_bgn=%d work_end=%d work_mid=%d\n", work_bgn,
-		work_end, work_mid);
-	/* Get the middle delay to be less than a VFIFO delay */
-	for (p = 0; p <= IO_DQS_EN_PHASE_MAX;
-		p++, tmp_delay += IO_DELAY_PER_OPA_TAP)
-		;
-	pr_debug("vfifo ptap delay %d\n", tmp_delay);
+	DPRINT(2, "work_bgn=%ld work_end=%ld work_mid=%ld", work_bgn, work_end, work_mid);
+	//USER Get the middle delay to be less than a VFIFO delay 
+	for (p = 0; p <= IO_DQS_EN_PHASE_MAX; p++, tmp_delay += IO_DELAY_PER_OPA_TAP) ;
+	DPRINT(2, "vfifo ptap delay %ld", tmp_delay);
 	while (work_mid > tmp_delay)
 		work_mid -= tmp_delay;
-	pr_debug("new work_mid %d\n", work_mid);
+	DPRINT(2, "new work_mid %ld", work_mid);
 	tmp_delay = 0;
 	for (p = 0; p <= IO_DQS_EN_PHASE_MAX && tmp_delay < work_mid;
-		p++, tmp_delay += IO_DELAY_PER_OPA_TAP)
-		;
+	     p++, tmp_delay += IO_DELAY_PER_OPA_TAP) ;
 	tmp_delay -= IO_DELAY_PER_OPA_TAP;
-	pr_debug("new p %d, tmp_delay=%d\n", p-1, tmp_delay);
-	for (d = 0; d <= IO_DQS_EN_DELAY_MAX && tmp_delay < work_mid; d++,
-		tmp_delay += IO_DELAY_PER_DQS_EN_DCHAIN_TAP)
-		;
-	pr_debug("new d %d, tmp_delay=%d\n", d, tmp_delay);
+	DPRINT(2, "new p %ld, tmp_delay=%ld", p - 1, tmp_delay);
+	for (d = 0; d <= IO_DQS_EN_DELAY_MAX && tmp_delay < work_mid;
+	     d++, tmp_delay += IO_DELAY_PER_DQS_EN_DCHAIN_TAP) ;
+	DPRINT(2, "new d %ld, tmp_delay=%ld", d, tmp_delay);
+
+	// DQSEN same for all shadow reg
+	for (sr = 0; sr < NUM_SHADOW_REGS; sr++) {
+		TCLRPT_SET(debug_cal_report->cal_dqs_in_margins[sr][grp].dqsen_margin,
+			   max_working_cnt - 1);
+	}
 
-	scc_mgr_set_dqs_en_phase_all_ranks(grp, p-1);
+	scc_mgr_set_dqs_en_phase_all_ranks(grp, p - 1);
 	scc_mgr_set_dqs_en_delay_all_ranks(grp, d);
 
-	/* push vfifo until we can successfully calibrate. We can do this
-	because the largest possible margin in 1 VFIFO cycle */
+	//USER push vfifo until we can successfully calibrate. We can do this because
+	//USER the largest possible margin in 1 VFIFO cycle
 
 	for (i = 0; i < VFIFO_SIZE; i++) {
-		pr_debug("find_dqs_en_phase: center: vfifo=%u\n", vfifo_idx);
-		if (rw_mgr_mem_calibrate_read_test_all_ranks (grp, 1,
-			PASS_ONE_BIT, &bit_chk, 0)) {
+		DPRINT(2, "find_dqs_en_phase: center: vfifo=%lu", BFM_GBL_GET(vfifo_idx));
+		if (rw_mgr_mem_calibrate_read_test_all_ranks(grp, 1, PASS_ONE_BIT, &bit_chk, 0)) {
 			break;
 		}
-
-		/* fiddle with FIFO */
+		//USER fiddle with FIFO
 		rw_mgr_incr_vfifo(grp, &v);
 	}
 
 	if (i >= VFIFO_SIZE) {
-		pr_debug("find_dqs_en_phase: center: failed\n");
+		DPRINT(2, "find_dqs_en_phase: center: failed");
 		return 0;
 	}
-	pr_debug("find_dqs_en_phase: center found: vfifo=%u ptap=%u "
-		"dtap=%u\n", vfifo_idx, p-1, d);
+	DPRINT(2, "find_dqs_en_phase: center found: vfifo=%li ptap=%lu dtap=%lu",
+	       BFM_GBL_GET(vfifo_idx), p - 1, d);
+	BFM_GBL_SET(dqs_enable_mid[grp].v, BFM_GBL_GET(vfifo_idx));
+	BFM_GBL_SET(dqs_enable_mid[grp].p, p - 1);
+	BFM_GBL_SET(dqs_enable_mid[grp].d, d);
+	BFM_GBL_SET(dqs_enable_mid[grp].ps, work_mid);
 	return 1;
 }
 
-/* Try rw_mgr_mem_calibrate_vfifo_find_dqs_en_phase across different
-dq_in_delay values */
-static uint32_t
-rw_mgr_mem_calibrate_vfifo_find_dqs_en_phase_sweep_dq_in_delay(uint32_t write_group, uint32_t read_group, uint32_t test_bgn)
-{
-#if STRATIXV || ARRIAV || CYCLONEV || ARRIAVGZ
-	uint32_t found;
-	uint32_t i;
-	uint32_t p;
-	uint32_t d;
-	uint32_t r;
-	const uint32_t delay_step = IO_IO_IN_DELAY_MAX / (RW_MGR_MEM_DQ_PER_READ_DQS - 1);
-	/* we start at zero, so have one less dq to devide among */
+#if 0
+// Ryan's algorithm 
 
-	/* try different dq_in_delays since the dq path is shorter than dqs */
+static uint32_t rw_mgr_mem_calibrate_vfifo_find_dqs_en_phase(uint32_t grp)
+{
+	uint32_t i, d, v, p;
+	uint32_t min_working_p, max_working_p, min_working_d, max_working_d, max_working_cnt;
+	uint32_t fail_cnt;
+	t_btfld bit_chk;
+	uint32_t dtaps_per_ptap;
+	uint32_t found_begin, found_end;
+	uint32_t tmp_delay;
 
-	for (r = 0; r < RW_MGR_MEM_NUMBER_OF_RANKS; r += NUM_RANKS_PER_SHADOW_REG) {
-		for (i = 0, p = test_bgn, d = 0; i < RW_MGR_MEM_DQ_PER_READ_DQS;
-			i++, p++, d += delay_step) {
-			pr_debug("rw_mgr_mem_calibrate_vfifo_find_dqs_"
-				"en_phase_sweep_dq_in_delay: g=%u/%u "
-				"r=%u, i=%u p=%u d=%u\n",
-				write_group, read_group, r, i, p, d);
-			scc_mgr_set_dq_in_delay(write_group, p, d);
-			scc_mgr_load_dq (p);
-		}
-		IOWR_32DIRECT(SCC_MGR_UPD, 0, 0);
-	}
+	TRACE_FUNC("%lu", grp);
 
-	found = rw_mgr_mem_calibrate_vfifo_find_dqs_en_phase(read_group);
+	reg_file_set_sub_stage(CAL_SUBSTAGE_VFIFO_CENTER);
 
-	pr_debug("rw_mgr_mem_calibrate_vfifo_find_dqs_en_phase_sweep_dq"
-		"_in_delay: g=%u/%u found=%u; Reseting delay chain to zero\n",
-		write_group, read_group, found);
+	scc_mgr_set_dqs_en_delay_all_ranks(grp, 0);
+	scc_mgr_set_dqs_en_phase_all_ranks(grp, 0);
 
-	for (r = 0; r < RW_MGR_MEM_NUMBER_OF_RANKS;
-		r += NUM_RANKS_PER_SHADOW_REG) {
-		for (i = 0, p = test_bgn; i < RW_MGR_MEM_DQ_PER_READ_DQS;
-			i++, p++) {
-			scc_mgr_set_dq_in_delay(write_group, p, 0);
-			scc_mgr_load_dq (p);
-		}
-		IOWR_32DIRECT(SCC_MGR_UPD, 0, 0);
-	}
+	fail_cnt = 0;
 
-	return found;
-#else
-	return rw_mgr_mem_calibrate_vfifo_find_dqs_en_phase(read_group);
-#endif
-}
+	//USER **************************************************************
+	//USER * Step 0 : Determine number of delay taps for each phase tap *
 
-/* per-bit deskew DQ and center */
-static uint32_t rw_mgr_mem_calibrate_vfifo_center (uint32_t rank_bgn,
-	uint32_t write_group, uint32_t read_group, uint32_t test_bgn,
-	uint32_t use_read_test, uint32_t update_fom)
-{
-	uint32_t i, p, d, min_index;
-	/* Store these as signed since there are comparisons with
-	signed numbers */
-	t_btfld bit_chk;
-	t_btfld sticky_bit_chk;
-	int32_t left_edge[RW_MGR_MEM_DQ_PER_READ_DQS];
-	int32_t right_edge[RW_MGR_MEM_DQ_PER_READ_DQS];
-	int32_t final_dq[RW_MGR_MEM_DQ_PER_READ_DQS];
-	int32_t mid;
-	int32_t orig_mid_min, mid_min;
-	int32_t new_dqs, start_dqs, start_dqs_en, shift_dq, final_dqs,
-		final_dqs_en;
-	int32_t dq_margin, dqs_margin;
-	uint32_t stop;
+	dtaps_per_ptap = 0;
+	tmp_delay = 0;
+	while (tmp_delay < IO_DELAY_PER_OPA_TAP) {
+		dtaps_per_ptap++;
+		tmp_delay += IO_DELAY_PER_DQS_EN_DCHAIN_TAP;
+	}
+	dtaps_per_ptap--;
 
-	ALTERA_ASSERT(read_group < RW_MGR_MEM_IF_READ_DQS_WIDTH);
-	ALTERA_ASSERT(write_group < RW_MGR_MEM_IF_WRITE_DQS_WIDTH);
+	//USER *********************************************************
+	//USER * Step 1 : First push vfifo until we get a failing read *
+	for (v = 0; v < VFIFO_SIZE;) {
+		if (!rw_mgr_mem_calibrate_read_test_all_ranks(grp, 1, PASS_ONE_BIT, &bit_chk, 0)) {
+			fail_cnt++;
 
-	start_dqs = READ_SCC_DQS_IN_DELAY(read_group);
-	if (IO_SHIFT_DQS_EN_WHEN_SHIFT_DQS) {
-		start_dqs_en = READ_SCC_DQS_EN_DELAY(read_group);
+			if (fail_cnt == 2) {
+				break;
+			}
+		}
+		//USER fiddle with FIFO
+		rw_mgr_incr_vfifo(grp, &v);
 	}
 
-	/* per-bit deskew */
-
-	/* set the left and right edge of each bit to an illegal value */
-	/* use (IO_IO_IN_DELAY_MAX + 1) as an illegal value */
-	sticky_bit_chk = 0;
-	for (i = 0; i < RW_MGR_MEM_DQ_PER_READ_DQS; i++) {
-		left_edge[i]  = IO_IO_IN_DELAY_MAX + 1;
-		right_edge[i] = IO_IO_IN_DELAY_MAX + 1;
+	if (i >= VFIFO_SIZE) {
+		//USER no failing read found!! Something must have gone wrong
+		return 0;
 	}
 
-	/* Search for the left edge of the window for each bit */
-	for (d = 0; d <= IO_IO_IN_DELAY_MAX; d++) {
-		scc_mgr_apply_group_dq_in_delay (write_group, test_bgn, d);
+	max_working_cnt = 0;
+	min_working_p = 0;
 
-		IOWR_32DIRECT(SCC_MGR_UPD, 0, 0);
+	//USER ********************************************************
+	//USER * step 2: find first working phase, increment in ptaps *
+	found_begin = 0;
+	for (d = 0; d <= dtaps_per_ptap; d++) {
+		scc_mgr_set_dqs_en_delay_all_ranks(grp, d);
+
+		for (i = 0; i < VFIFO_SIZE; i++) {
+			for (p = 0; p <= IO_DQS_EN_PHASE_MAX; p++) {
+				scc_mgr_set_dqs_en_phase_all_ranks(grp, p);
+
+				if (rw_mgr_mem_calibrate_read_test_all_ranks
+				    (grp, 1, PASS_ONE_BIT, &bit_chk, 0)) {
+					max_working_cnt = 1;
+					found_begin = 1;
+					break;
+				}
+			}
+
+			if (found_begin) {
+				break;
+			}
+
+			if (p > IO_DQS_EN_PHASE_MAX) {
+				//USER fiddle with FIFO
+				rw_mgr_incr_vfifo(grp, &v);
+			}
+		}
+
+		if (found_begin) {
+			break;
+		}
+	}
+
+	if (i >= VFIFO_SIZE) {
+		//USER cannot find working solution 
+		return 0;
+	}
+
+	min_working_p = p;
+
+	//USER  If d is 0 then the working window covers a phase tap and we can follow the old procedure
+	//USER  otherwise, we've found the beginning, and we need to increment the dtaps until we find the end 
+	if (d == 0) {
+		//USER ********************************************************************
+		//USER * step 3a: if we have room, back off by one and increment in dtaps *
+		min_working_d = 0;
+
+		//USER Special case code for backing up a phase 
+		if (p == 0) {
+			p = IO_DQS_EN_PHASE_MAX;
+			rw_mgr_decr_vfifo(grp, &v);
+		} else {
+			p = p - 1;
+		}
+		scc_mgr_set_dqs_en_phase_all_ranks(grp, p);
+
+		found_begin = 0;
+		for (d = 0; d <= dtaps_per_ptap; d++) {
+			scc_mgr_set_dqs_en_delay_all_ranks(grp, d);
+
+			if (rw_mgr_mem_calibrate_read_test_all_ranks
+			    (grp, 1, PASS_ONE_BIT, &bit_chk, 0)) {
+				found_begin = 1;
+				min_working_d = d;
+				break;
+			}
+		}
+
+		//USER We have found a working dtap before the ptap found above 
+		if (found_begin == 1) {
+			min_working_p = p;
+			max_working_cnt++;
+		}
+		//USER Restore VFIFO to old state before we decremented it 
+		p = p + 1;
+		if (p > IO_DQS_EN_PHASE_MAX) {
+			p = 0;
+			rw_mgr_incr_vfifo(grp, &v);
+		}
+
+		scc_mgr_set_dqs_en_delay_all_ranks(grp, 0);
+
+		//USER ***********************************************************************************
+		//USER * step 4a: go forward from working phase to non working phase, increment in ptaps *
+		p = p + 1;
+		if (p > IO_DQS_EN_PHASE_MAX) {
+			//USER fiddle with FIFO
+			p = 0;
+			rw_mgr_incr_vfifo(grp, &v);
+		}
+
+		found_end = 0;
+		for (; i < VFIFO_SIZE + 1; i++) {
+			for (; p <= IO_DQS_EN_PHASE_MAX; p++) {
+				scc_mgr_set_dqs_en_phase_all_ranks(grp, p);
+
+				if (!rw_mgr_mem_calibrate_read_test_all_ranks
+				    (grp, 1, PASS_ONE_BIT, &bit_chk, 0)) {
+					found_end = 1;
+					break;
+				} else {
+					max_working_cnt++;
+				}
+			}
+
+			if (found_end) {
+				break;
+			}
+
+			if (p > IO_DQS_EN_PHASE_MAX) {
+				//USER fiddle with FIFO
+				rw_mgr_incr_vfifo(grp, &v);
+				p = 0;
+			}
+		}
+
+		if (i >= VFIFO_SIZE + 1) {
+			//USER cannot see edge of failing read 
+			return 0;
+		}
+		//USER *********************************************************
+		//USER * step 5a:  back off one from last, increment in dtaps  *
+		max_working_d = 0;
+
+		//USER Special case code for backing up a phase 
+		if (p == 0) {
+			p = IO_DQS_EN_PHASE_MAX;
+			rw_mgr_decr_vfifo(grp, &v);
+		} else {
+			p = p - 1;
+		}
+
+		max_working_p = p;
+		scc_mgr_set_dqs_en_phase_all_ranks(grp, p);
+
+		for (d = 0; d <= IO_DQS_EN_DELAY_MAX; d++) {
+			scc_mgr_set_dqs_en_delay_all_ranks(grp, d);
+
+			if (!rw_mgr_mem_calibrate_read_test_all_ranks
+			    (grp, 1, PASS_ONE_BIT, &bit_chk, 0)) {
+				break;
+			}
+		}
+
+		//USER Go back to working dtap 
+		if (d != 0) {
+			max_working_d = d - 1;
+		}
+
+	} else {
+
+		//USER ********************************************************************
+		//USER * step 3-5b:  Find the right edge of the window using delay taps   *             
+
+		max_working_p = min_working_p;
+		min_working_d = d;
+
+		for (; d <= IO_DQS_EN_DELAY_MAX; d++) {
+			scc_mgr_set_dqs_en_delay_all_ranks(grp, d);
+
+			if (!rw_mgr_mem_calibrate_read_test_all_ranks
+			    (grp, 1, PASS_ONE_BIT, &bit_chk, 0)) {
+				break;
+			}
+		}
+
+		//USER Go back to working dtap 
+		if (d != 0) {
+			max_working_d = d - 1;
+		}
+		//USER Only here to counterbalance a subtract later on which is not needed if this branch
+		//USER of the algorithm is taken 
+		max_working_cnt++;
+	}
+
+	//USER ********************************************
+	//USER * step 6:  Find the centre of the window   *
+
+	//USER If the number of working phases is even we will step back a phase and find the
+	//USER  edge with a larger delay chain tap 
+	if ((max_working_cnt & 1) == 0) {
+		p = min_working_p + (max_working_cnt - 1) / 2;
+
+		//USER Special case code for backing up a phase 
+		if (max_working_p == 0) {
+			max_working_p = IO_DQS_EN_PHASE_MAX;
+			rw_mgr_decr_vfifo(grp, &v);
+		} else {
+			max_working_p = max_working_p - 1;
+		}
+
+		scc_mgr_set_dqs_en_phase_all_ranks(grp, max_working_p);
+
+		//USER Code to determine at which dtap we should start searching again for a failure
+		//USER If we've moved back such that the max and min p are the same, we should start searching
+		//USER from where the window actually exists
+		if (max_working_p == min_working_p) {
+			d = min_working_d;
+		} else {
+			d = max_working_d;
+		}
+
+		for (; d <= IO_DQS_EN_DELAY_MAX; d++) {
+			scc_mgr_set_dqs_en_delay_all_ranks(grp, d);
+
+			if (!rw_mgr_mem_calibrate_read_test_all_ranks
+			    (grp, 1, PASS_ONE_BIT, &bit_chk, 0)) {
+				break;
+			}
+		}
+
+		//USER Go back to working dtap 
+		if (d != 0) {
+			max_working_d = d - 1;
+		}
+	} else {
+		p = min_working_p + (max_working_cnt) / 2;
+	}
+
+	while (p > IO_DQS_EN_PHASE_MAX) {
+		p -= (IO_DQS_EN_PHASE_MAX + 1);
+	}
+
+	d = (min_working_d + max_working_d) / 2;
+
+	scc_mgr_set_dqs_en_phase_all_ranks(grp, p);
+	scc_mgr_set_dqs_en_delay_all_ranks(grp, d);
+
+	//USER push vfifo until we can successfully calibrate 
+
+	for (i = 0; i < VFIFO_SIZE; i++) {
+		if (rw_mgr_mem_calibrate_read_test_all_ranks(grp, 1, PASS_ONE_BIT, &bit_chk, 0)) {
+			break;
+		}
+		//USER fiddle with FIFO
+		rw_mgr_incr_vfifo(grp, &v);
+	}
+
+	if (i >= VFIFO_SIZE) {
+		return 0;
+	}
+
+	return 1;
+}
+
+#endif
+
+#else
+// Val's original version 
+
+static uint32_t rw_mgr_mem_calibrate_vfifo_find_dqs_en_phase(uint32_t grp)
+{
+	uint32_t i, j, v, d;
+	uint32_t min_working_d, max_working_cnt;
+	uint32_t fail_cnt;
+	t_btfld bit_chk;
+	uint32_t delay_per_ptap_mid;
+
+	TRACE_FUNC("%lu", grp);
+
+	reg_file_set_sub_stage(CAL_SUBSTAGE_VFIFO_CENTER);
+
+	scc_mgr_set_dqs_en_delay_all_ranks(grp, 0);
+	scc_mgr_set_dqs_en_phase_all_ranks(grp, 0);
+
+	fail_cnt = 0;
+
+	//USER first push vfifo until we get a failing read 
+	v = 0;
+	for (i = 0; i < VFIFO_SIZE; i++) {
+		if (!rw_mgr_mem_calibrate_read_test_all_ranks(grp, 1, PASS_ONE_BIT, &bit_chk, 0)) {
+			fail_cnt++;
+
+			if (fail_cnt == 2) {
+				break;
+			}
+		}
+		//USER fiddle with FIFO
+		rw_mgr_incr_vfifo(grp, &v);
+	}
+
+	if (v >= VFIFO_SIZE) {
+		//USER no failing read found!! Something must have gone wrong
+
+		return 0;
+	}
+
+	max_working_cnt = 0;
+	min_working_d = 0;
+
+	for (i = 0; i < VFIFO_SIZE + 1; i++) {
+		for (d = 0; d <= IO_DQS_EN_PHASE_MAX; d++) {
+			scc_mgr_set_dqs_en_phase_all_ranks(grp, d);
+
+			rw_mgr_mem_calibrate_read_test_all_ranks(grp, NUM_READ_PB_TESTS,
+								 PASS_ONE_BIT, &bit_chk, 0);
+			if (bit_chk) {
+				//USER passing read 
+
+				if (max_working_cnt == 0) {
+					min_working_d = d;
+				}
+
+				max_working_cnt++;
+			} else {
+				if (max_working_cnt > 0) {
+					//USER already have one working value 
+					break;
+				}
+			}
+		}
+
+		if (d > IO_DQS_EN_PHASE_MAX) {
+			//USER fiddle with FIFO
+			rw_mgr_incr_vfifo(grp, &v);
+		} else {
+			//USER found working solution! 
+
+			d = min_working_d + (max_working_cnt - 1) / 2;
+
+			while (d > IO_DQS_EN_PHASE_MAX) {
+				d -= (IO_DQS_EN_PHASE_MAX + 1);
+			}
+
+			break;
+		}
+	}
+
+	if (i >= VFIFO_SIZE + 1) {
+		//USER cannot find working solution or cannot see edge of failing read 
+
+		return 0;
+	}
+	//USER in the case the number of working steps is even, use 50ps taps to further center the window 
+
+	if ((max_working_cnt & 1) == 0) {
+		delay_per_ptap_mid = IO_DELAY_PER_OPA_TAP / 2;
+
+		//USER increment in 50ps taps until we reach the required amount 
+
+		for (i = 0, j = 0; i <= IO_DQS_EN_DELAY_MAX && j < delay_per_ptap_mid;
+		     i++, j += IO_DELAY_PER_DQS_EN_DCHAIN_TAP) ;
+
+		scc_mgr_set_dqs_en_delay_all_ranks(grp, i - 1);
+	}
+
+	scc_mgr_set_dqs_en_phase_all_ranks(grp, d);
+
+	//USER push vfifo until we can successfully calibrate 
+
+	for (i = 0; i < VFIFO_SIZE; i++) {
+		if (rw_mgr_mem_calibrate_read_test_all_ranks
+		    (grp, NUM_READ_PB_TESTS, PASS_ONE_BIT, &bit_chk, 0)) {
+			break;
+		}
+		//USER fiddle with FIFO
+		rw_mgr_incr_vfifo(grp, &v);
+	}
+
+	if (i >= VFIFO_SIZE) {
+		return 0;
+	}
+
+	return 1;
+}
+
+#endif
+
+// Try rw_mgr_mem_calibrate_vfifo_find_dqs_en_phase across different dq_in_delay values
+static inline uint32_t rw_mgr_mem_calibrate_vfifo_find_dqs_en_phase_sweep_dq_in_delay(uint32_t
+										      write_group,
+										      uint32_t
+										      read_group,
+										      uint32_t
+										      test_bgn)
+{
+	uint32_t found;
+	uint32_t i;
+	uint32_t p;
+	uint32_t d;
+	uint32_t r;
+
+	const uint32_t delay_step = IO_IO_IN_DELAY_MAX / (RW_MGR_MEM_DQ_PER_READ_DQS - 1);	/* we start at zero, so have one less dq to devide among */
+
+	TRACE_FUNC("(%lu,%lu,%lu)", write_group, read_group, test_bgn);
+
+	// try different dq_in_delays since the dq path is shorter than dqs
+
+	for (r = 0; r < RW_MGR_MEM_NUMBER_OF_RANKS; r += NUM_RANKS_PER_SHADOW_REG) {
+		select_shadow_regs_for_update(r, write_group, 1);
+		for (i = 0, p = test_bgn, d = 0; i < RW_MGR_MEM_DQ_PER_READ_DQS;
+		     i++, p++, d += delay_step) {
+			DPRINT(1,
+			       "rw_mgr_mem_calibrate_vfifo_find_dqs_en_phase_sweep_dq_in_delay: g=%lu/%lu r=%lu, i=%lu p=%lu d=%lu",
+			       write_group, read_group, r, i, p, d);
+			scc_mgr_set_dq_in_delay(write_group, p, d);
+			scc_mgr_load_dq(p);
+		}
+		IOWR_32DIRECT(SCC_MGR_UPD, 0, 0);
+	}
+
+	found = rw_mgr_mem_calibrate_vfifo_find_dqs_en_phase(read_group);
+
+	DPRINT(1,
+	       "rw_mgr_mem_calibrate_vfifo_find_dqs_en_phase_sweep_dq_in_delay: g=%lu/%lu found=%lu; Reseting delay chain to zero",
+	       write_group, read_group, found);
+
+	for (r = 0; r < RW_MGR_MEM_NUMBER_OF_RANKS; r += NUM_RANKS_PER_SHADOW_REG) {
+		select_shadow_regs_for_update(r, write_group, 1);
+		for (i = 0, p = test_bgn; i < RW_MGR_MEM_DQ_PER_READ_DQS; i++, p++) {
+			scc_mgr_set_dq_in_delay(write_group, p, 0);
+			scc_mgr_load_dq(p);
+		}
+		IOWR_32DIRECT(SCC_MGR_UPD, 0, 0);
+	}
+
+	return found;
+}
+
+//USER per-bit deskew DQ and center 
+
+#if NEWVERSION_RDDESKEW
+
+static uint32_t rw_mgr_mem_calibrate_vfifo_center(uint32_t rank_bgn, uint32_t write_group,
+						  uint32_t read_group, uint32_t test_bgn,
+						  uint32_t use_read_test, uint32_t update_fom)
+{
+	uint32_t i, p, d, min_index;
+	//USER Store these as signed since there are comparisons with signed numbers
+	t_btfld bit_chk;
+	t_btfld sticky_bit_chk;
+	int32_t left_edge[RW_MGR_MEM_DQ_PER_READ_DQS];
+	int32_t right_edge[RW_MGR_MEM_DQ_PER_READ_DQS];
+	int32_t final_dq[RW_MGR_MEM_DQ_PER_READ_DQS];
+	int32_t mid;
+	int32_t orig_mid_min, mid_min;
+	int32_t new_dqs, start_dqs, start_dqs_en, shift_dq, final_dqs, final_dqs_en;
+	int32_t dq_margin, dqs_margin;
+	uint32_t stop;
+
+	TRACE_FUNC("%lu %lu", read_group, test_bgn);
+
+	ALTERA_ASSERT(read_group < RW_MGR_MEM_IF_READ_DQS_WIDTH);
+	ALTERA_ASSERT(write_group < RW_MGR_MEM_IF_WRITE_DQS_WIDTH);
+
+	start_dqs = READ_SCC_DQS_IN_DELAY(read_group);
+	if (IO_SHIFT_DQS_EN_WHEN_SHIFT_DQS) {
+		start_dqs_en = READ_SCC_DQS_EN_DELAY(read_group);
+	}
+
+	select_curr_shadow_reg_using_rank(rank_bgn);
+
+	//USER per-bit deskew 
+
+	//USER set the left and right edge of each bit to an illegal value 
+	//USER use (IO_IO_IN_DELAY_MAX + 1) as an illegal value 
+	sticky_bit_chk = 0;
+	for (i = 0; i < RW_MGR_MEM_DQ_PER_READ_DQS; i++) {
+		left_edge[i] = IO_IO_IN_DELAY_MAX + 1;
+		right_edge[i] = IO_IO_IN_DELAY_MAX + 1;
+	}
+
+	//USER Search for the left edge of the window for each bit
+	for (d = 0; d <= IO_IO_IN_DELAY_MAX; d++) {
+		scc_mgr_apply_group_dq_in_delay(write_group, test_bgn, d);
 
-		/* Stop searching when the read test doesn't pass AND when
-		we've seen a passing read on every bit */
+		IOWR_32DIRECT(SCC_MGR_UPD, 0, 0);
+
+		//USER Stop searching when the read test doesn't pass AND when we've seen a passing read on every bit
 		if (use_read_test) {
-			stop = !rw_mgr_mem_calibrate_read_test (rank_bgn,
-				read_group, NUM_READ_PB_TESTS, PASS_ONE_BIT,
-				&bit_chk, 0, 0);
+			stop =
+			    !rw_mgr_mem_calibrate_read_test(rank_bgn, read_group, NUM_READ_PB_TESTS,
+							    PASS_ONE_BIT, &bit_chk, 0, 0);
 		} else {
-			rw_mgr_mem_calibrate_write_test (rank_bgn, write_group,
-				0, PASS_ONE_BIT, &bit_chk, 0);
-			bit_chk = bit_chk >> (RW_MGR_MEM_DQ_PER_READ_DQS *
-				(read_group - (write_group * RW_MGR_NUM_DQS_PER_WRITE_GROUP)));
+			rw_mgr_mem_calibrate_write_test(rank_bgn, write_group, 0, PASS_ONE_BIT,
+							&bit_chk, 0);
+			bit_chk =
+			    bit_chk >> (RW_MGR_MEM_DQ_PER_READ_DQS *
+					(read_group -
+					 (write_group * RW_MGR_MEM_IF_READ_DQS_WIDTH /
+					  RW_MGR_MEM_IF_WRITE_DQS_WIDTH)));
 			stop = (bit_chk == 0);
 		}
 		sticky_bit_chk = sticky_bit_chk | bit_chk;
 		stop = stop && (sticky_bit_chk == param->read_correct_mask);
-		pr_debug("vfifo_center(left): dtap=%u => " BTFLD_FMT " == "
-			BTFLD_FMT " && %u\n", d, sticky_bit_chk,
-			param->read_correct_mask, stop);
+		DPRINT(2, "vfifo_center(left): dtap=%lu => " BTFLD_FMT " == " BTFLD_FMT " && %lu",
+		       d, sticky_bit_chk, param->read_correct_mask, stop);
 
-		if (stop == 1)
+		if (stop == 1) {
 			break;
-		for (i = 0; i < RW_MGR_MEM_DQ_PER_READ_DQS; i++) {
-			if (bit_chk & 1) {
-				/* Remember a passing test as the
-				left_edge */
-				left_edge[i] = d;
-			} else {
-				/* If a left edge has not been seen yet,
-				then a future passing test will mark
-				this edge as the right edge */
-				if (left_edge[i] ==
-					IO_IO_IN_DELAY_MAX + 1) {
-					right_edge[i] = -(d + 1);
+		} else {
+			for (i = 0; i < RW_MGR_MEM_DQ_PER_READ_DQS; i++) {
+				if (bit_chk & 1) {
+					//USER Remember a passing test as the left_edge
+					left_edge[i] = d;
+				} else {
+					//USER If a left edge has not been seen yet, then a future passing test will mark this edge as the right edge 
+					if (left_edge[i] == IO_IO_IN_DELAY_MAX + 1) {
+						right_edge[i] = -(d + 1);
+					}
 				}
+				DPRINT(2,
+				       "vfifo_center[l,d=%lu]: bit_chk_test=%d left_edge[%lu]: %ld right_edge[%lu]: %ld",
+				       d, (int)(bit_chk & 1), i, left_edge[i], i, right_edge[i]);
+				bit_chk = bit_chk >> 1;
 			}
-			pr_debug("vfifo_center[l,d=%u]: "
-				"bit_chk_test=%d left_edge[%u]: "
-				"%d right_edge[%u]: %d\n",
-				d, (int)(bit_chk & 1), i, left_edge[i],
-				i, right_edge[i]);
-			bit_chk = bit_chk >> 1;
 		}
 	}
 
-	/* Reset DQ delay chains to 0 */
-	scc_mgr_apply_group_dq_in_delay (write_group, test_bgn, 0);
+	//USER Reset DQ delay chains to 0 
+	scc_mgr_apply_group_dq_in_delay(write_group, test_bgn, 0);
 	sticky_bit_chk = 0;
 	for (i = RW_MGR_MEM_DQ_PER_READ_DQS - 1;; i--) {
 
-		pr_debug("vfifo_center: left_edge[%u]: %d right_edge[%u]: "
-			"%d\n", i, left_edge[i], i, right_edge[i]);
+		DPRINT(2, "vfifo_center: left_edge[%lu]: %ld right_edge[%lu]: %ld", i, left_edge[i],
+		       i, right_edge[i]);
 
-		/* Check for cases where we haven't found the left edge,
-		which makes our assignment of the the right edge invalid.
-		Reset it to the illegal value. */
-		if ((left_edge[i] == IO_IO_IN_DELAY_MAX + 1) && (
-			right_edge[i] != IO_IO_IN_DELAY_MAX + 1)) {
+		//USER Check for cases where we haven't found the left edge, which makes our assignment of the the 
+		//USER right edge invalid.  Reset it to the illegal value. 
+		if ((left_edge[i] == IO_IO_IN_DELAY_MAX + 1)
+		    && (right_edge[i] != IO_IO_IN_DELAY_MAX + 1)) {
 			right_edge[i] = IO_IO_IN_DELAY_MAX + 1;
-			pr_debug("vfifo_center: reset right_edge[%u]: %d\n",
-				i, right_edge[i]);
+			DPRINT(2, "vfifo_center: reset right_edge[%lu]: %ld", i, right_edge[i]);
 		}
-
-		/* Reset sticky bit (except for bits where we have seen
-		both the left and right edge) */
+		//USER Reset sticky bit (except for bits where we have seen both the left and right edge) 
 		sticky_bit_chk = sticky_bit_chk << 1;
-		if ((left_edge[i] != IO_IO_IN_DELAY_MAX + 1) &&
-			(right_edge[i] != IO_IO_IN_DELAY_MAX + 1)) {
+		if ((left_edge[i] != IO_IO_IN_DELAY_MAX + 1)
+		    && (right_edge[i] != IO_IO_IN_DELAY_MAX + 1)) {
 			sticky_bit_chk = sticky_bit_chk | 1;
 		}
 
-		if (i == 0)
+		if (i == 0) {
 			break;
+		}
 	}
 
-	/* Search for the right edge of the window for each bit */
+	//USER Search for the right edge of the window for each bit 
 	for (d = 0; d <= IO_DQS_IN_DELAY_MAX - start_dqs; d++) {
 		scc_mgr_set_dqs_bus_in_delay(read_group, d + start_dqs);
 		if (IO_SHIFT_DQS_EN_WHEN_SHIFT_DQS) {
@@ -2522,110 +2949,101 @@ static uint32_t rw_mgr_mem_calibrate_vfifo_center (uint32_t rank_bgn,
 			}
 			scc_mgr_set_dqs_en_delay(read_group, delay);
 		}
-		scc_mgr_load_dqs (read_group);
+		scc_mgr_load_dqs(read_group);
 
 		IOWR_32DIRECT(SCC_MGR_UPD, 0, 0);
 
-		/* Stop searching when the read test doesn't pass AND when
-		we've seen a passing read on every bit */
+		//USER Stop searching when the read test doesn't pass AND when we've seen a passing read on every bit 
 		if (use_read_test) {
-			stop = !rw_mgr_mem_calibrate_read_test (rank_bgn,
-				read_group, NUM_READ_PB_TESTS, PASS_ONE_BIT,
-				&bit_chk, 0, 0);
+			stop =
+			    !rw_mgr_mem_calibrate_read_test(rank_bgn, read_group, NUM_READ_PB_TESTS,
+							    PASS_ONE_BIT, &bit_chk, 0, 0);
 		} else {
-			rw_mgr_mem_calibrate_write_test (rank_bgn, write_group,
-				0, PASS_ONE_BIT, &bit_chk, 0);
-			bit_chk = bit_chk >> (RW_MGR_MEM_DQ_PER_READ_DQS *
-				(read_group - (write_group * RW_MGR_NUM_DQS_PER_WRITE_GROUP)));
+			rw_mgr_mem_calibrate_write_test(rank_bgn, write_group, 0, PASS_ONE_BIT,
+							&bit_chk, 0);
+			bit_chk =
+			    bit_chk >> (RW_MGR_MEM_DQ_PER_READ_DQS *
+					(read_group -
+					 (write_group * RW_MGR_MEM_IF_READ_DQS_WIDTH /
+					  RW_MGR_MEM_IF_WRITE_DQS_WIDTH)));
 			stop = (bit_chk == 0);
 		}
 		sticky_bit_chk = sticky_bit_chk | bit_chk;
 		stop = stop && (sticky_bit_chk == param->read_correct_mask);
 
-		pr_debug("vfifo_center(right): dtap=%u => " BTFLD_FMT " == "
-			BTFLD_FMT " && %u\n", d, sticky_bit_chk,
-			param->read_correct_mask, stop);
+		DPRINT(2, "vfifo_center(right): dtap=%lu => " BTFLD_FMT " == " BTFLD_FMT " && %lu",
+		       d, sticky_bit_chk, param->read_correct_mask, stop);
 
 		if (stop == 1) {
 			break;
 		} else {
 			for (i = 0; i < RW_MGR_MEM_DQ_PER_READ_DQS; i++) {
 				if (bit_chk & 1) {
-					/* Remember a passing test as
-					the right_edge */
+					//USER Remember a passing test as the right_edge 
 					right_edge[i] = d;
 				} else {
 					if (d != 0) {
-						/* If a right edge has not been
-						seen yet, then a future passing
-						test will mark this edge as the
-						left edge */
-						if (right_edge[i] ==
-						IO_IO_IN_DELAY_MAX + 1) {
+						//USER If a right edge has not been seen yet, then a future passing test will mark this edge as the left edge 
+						if (right_edge[i] == IO_IO_IN_DELAY_MAX + 1) {
 							left_edge[i] = -(d + 1);
 						}
 					} else {
-						/* d = 0 failed, but it passed
-						when testing the left edge,
-						so it must be marginal,
-						set it to -1 */
-						if (right_edge[i] ==
-							IO_IO_IN_DELAY_MAX + 1
-							&& left_edge[i] !=
-							IO_IO_IN_DELAY_MAX
-							+ 1) {
+						//USER d = 0 failed, but it passed when testing the left edge, so it must be marginal, set it to -1
+						if (right_edge[i] == IO_IO_IN_DELAY_MAX + 1
+						    && left_edge[i] != IO_IO_IN_DELAY_MAX + 1) {
 							right_edge[i] = -1;
 						}
-						/* If a right edge has not been
-						seen yet, then a future passing
-						test will mark this edge as the
-						left edge */
-						else if (right_edge[i] ==
-							IO_IO_IN_DELAY_MAX +
-							1) {
+						//USER If a right edge has not been seen yet, then a future passing test will mark this edge as the left edge 
+						else if (right_edge[i] == IO_IO_IN_DELAY_MAX + 1) {
 							left_edge[i] = -(d + 1);
 						}
 
 					}
 				}
 
-				pr_debug("vfifo_center[r,d=%u]: "
-					"bit_chk_test=%d left_edge[%u]: %d "
-					"right_edge[%u]: %d\n",
-					d, (int)(bit_chk & 1), i, left_edge[i],
-					i, right_edge[i]);
+				DPRINT(2,
+				       "vfifo_center[r,d=%lu]: bit_chk_test=%d left_edge[%lu]: %ld right_edge[%lu]: %ld",
+				       d, (int)(bit_chk & 1), i, left_edge[i], i, right_edge[i]);
 				bit_chk = bit_chk >> 1;
 			}
 		}
 	}
 
-	/* Store all observed margins */
+	// Store all observed margins
 
-	/* Check that all bits have a window */
+	//USER Check that all bits have a window
 	for (i = 0; i < RW_MGR_MEM_DQ_PER_READ_DQS; i++) {
-		pr_debug("vfifo_center: left_edge[%u]: %d right_edge[%u]:"
-			" %d\n", i, left_edge[i], i, right_edge[i]);
-		if ((left_edge[i] == IO_IO_IN_DELAY_MAX + 1) || (right_edge[i]
-			== IO_IO_IN_DELAY_MAX + 1)) {
-
-			/* Restore delay chain settings before letting the loop
-			in rw_mgr_mem_calibrate_vfifo to retry different
-			dqs/ck relationships */
+		DPRINT(2, "vfifo_center: left_edge[%lu]: %ld right_edge[%lu]: %ld", i, left_edge[i],
+		       i, right_edge[i]);
+		BFM_GBL_SET(dq_read_left_edge[read_group][i], left_edge[i]);
+		BFM_GBL_SET(dq_read_right_edge[read_group][i], right_edge[i]);
+		if ((left_edge[i] == IO_IO_IN_DELAY_MAX + 1)
+		    || (right_edge[i] == IO_IO_IN_DELAY_MAX + 1)) {
+
+			//USER Restore delay chain settings before letting the loop in 
+			//USER rw_mgr_mem_calibrate_vfifo to retry different dqs/ck relationships
 			scc_mgr_set_dqs_bus_in_delay(read_group, start_dqs);
 			if (IO_SHIFT_DQS_EN_WHEN_SHIFT_DQS) {
-				scc_mgr_set_dqs_en_delay(read_group,
-					start_dqs_en);
+				scc_mgr_set_dqs_en_delay(read_group, start_dqs_en);
 			}
-			scc_mgr_load_dqs (read_group);
+			scc_mgr_load_dqs(read_group);
 			IOWR_32DIRECT(SCC_MGR_UPD, 0, 0);
 
-			pr_debug("vfifo_center: failed to find edge [%u]: "
-				"%d %d\n", i, left_edge[i], right_edge[i]);
+			DPRINT(1, "vfifo_center: failed to find edge [%lu]: %ld %ld", i,
+			       left_edge[i], right_edge[i]);
+			if (use_read_test) {
+				set_failing_group_stage(read_group * RW_MGR_MEM_DQ_PER_READ_DQS + i,
+							CAL_STAGE_VFIFO, CAL_SUBSTAGE_VFIFO_CENTER);
+			} else {
+				set_failing_group_stage(read_group * RW_MGR_MEM_DQ_PER_READ_DQS + i,
+							CAL_STAGE_VFIFO_AFTER_WRITES,
+							CAL_SUBSTAGE_VFIFO_CENTER);
+			}
 			return 0;
 		}
 	}
 
-	/* Find middle of window for each DQ bit */
+	//USER Find middle of window for each DQ bit 
 	mid_min = left_edge[0] - right_edge[0];
 	min_index = 0;
 	for (i = 1; i < RW_MGR_MEM_DQ_PER_READ_DQS; i++) {
@@ -2636,58 +3054,64 @@ static uint32_t rw_mgr_mem_calibrate_vfifo_center (uint32_t rank_bgn,
 		}
 	}
 
-	/*  -mid_min/2 represents the amount that we need to move DQS.
-	If mid_min is odd and positive we'll need to add one to
-	make sure the rounding in further calculations is correct
-	(always bias to the right), so just add 1 for all positive values */
+	//USER  -mid_min/2 represents the amount that we need to move DQS.  If mid_min is odd and positive we'll need to add one to
+	//USER make sure the rounding in further calculations is correct (always bias to the right), so just add 1 for all positive values
 	if (mid_min > 0) {
 		mid_min++;
 	}
 	mid_min = mid_min / 2;
 
-	pr_debug("vfifo_center: mid_min=%d (index=%u)\n", mid_min, min_index);
+	DPRINT(1, "vfifo_center: mid_min=%ld (index=%lu)", mid_min, min_index);
 
-	/* Determine the amount we can change DQS (which is -mid_min) */
+	//USER Determine the amount we can change DQS (which is -mid_min)
 	orig_mid_min = mid_min;
-	new_dqs = start_dqs;
-	mid_min = 0;
+	new_dqs = start_dqs - mid_min;
+	if (new_dqs > IO_DQS_IN_DELAY_MAX) {
+		new_dqs = IO_DQS_IN_DELAY_MAX;
+	} else if (new_dqs < 0) {
+		new_dqs = 0;
+	}
+	mid_min = start_dqs - new_dqs;
+	DPRINT(1, "vfifo_center: new mid_min=%ld new_dqs=%ld", mid_min, new_dqs);
+
+	if (IO_SHIFT_DQS_EN_WHEN_SHIFT_DQS) {
+		if (start_dqs_en - mid_min > IO_DQS_EN_DELAY_MAX) {
+			mid_min += start_dqs_en - mid_min - IO_DQS_EN_DELAY_MAX;
+		} else if (start_dqs_en - mid_min < 0) {
+			mid_min += start_dqs_en - mid_min;
+		}
+	}
+	new_dqs = start_dqs - mid_min;
 
-	pr_debug("vfifo_center: start_dqs=%d start_dqs_en=%d "
-		"new_dqs=%d mid_min=%d\n",
-		start_dqs, IO_SHIFT_DQS_EN_WHEN_SHIFT_DQS ? start_dqs_en : -1,
-		new_dqs, mid_min);
+	DPRINT(1, "vfifo_center: start_dqs=%ld start_dqs_en=%ld new_dqs=%ld mid_min=%ld",
+	       start_dqs, IO_SHIFT_DQS_EN_WHEN_SHIFT_DQS ? start_dqs_en : -1, new_dqs, mid_min);
 
-	/* Initialize data for export structures */
+	//USER Initialize data for export structures 
 	dqs_margin = IO_IO_IN_DELAY_MAX + 1;
-	dq_margin  = IO_IO_IN_DELAY_MAX + 1;
+	dq_margin = IO_IO_IN_DELAY_MAX + 1;
 
-	/* add delay to bring centre of all DQ windows to the same "level" */
+	//USER add delay to bring centre of all DQ windows to the same "level" 
 	for (i = 0, p = test_bgn; i < RW_MGR_MEM_DQ_PER_READ_DQS; i++, p++) {
-		/* Use values before divide by 2 to reduce round off error */
-		shift_dq = (left_edge[i] - right_edge[i] -
-			(left_edge[min_index] - right_edge[min_index]))/2  +
-			(orig_mid_min - mid_min);
-
-		pr_debug("vfifo_center: before: shift_dq[%u]=%d\n", i,
-			shift_dq);
-
-		if (shift_dq + (int32_t)READ_SCC_DQ_IN_DELAY(p) >
-			(int32_t)IO_IO_IN_DELAY_MAX) {
-			shift_dq = (int32_t)IO_IO_IN_DELAY_MAX -
-				READ_SCC_DQ_IN_DELAY(i);
-		} else if (shift_dq + (int32_t)READ_SCC_DQ_IN_DELAY(p) < 0) {
-			shift_dq = -(int32_t)READ_SCC_DQ_IN_DELAY(p);
-		}
-		pr_debug("vfifo_center: after: shift_dq[%u]=%d\n", i,
-			shift_dq);
+		//USER Use values before divide by 2 to reduce round off error 
+		shift_dq =
+		    (left_edge[i] - right_edge[i] -
+		     (left_edge[min_index] - right_edge[min_index])) / 2 + (orig_mid_min - mid_min);
+
+		DPRINT(2, "vfifo_center: before: shift_dq[%lu]=%ld", i, shift_dq);
+
+		if (shift_dq + (int32_t) READ_SCC_DQ_IN_DELAY(p) > (int32_t) IO_IO_IN_DELAY_MAX) {
+			shift_dq = (int32_t) IO_IO_IN_DELAY_MAX - READ_SCC_DQ_IN_DELAY(i);
+		} else if (shift_dq + (int32_t) READ_SCC_DQ_IN_DELAY(p) < 0) {
+			shift_dq = -(int32_t) READ_SCC_DQ_IN_DELAY(p);
+		}
+		DPRINT(2, "vfifo_center: after: shift_dq[%lu]=%ld", i, shift_dq);
 		final_dq[i] = READ_SCC_DQ_IN_DELAY(p) + shift_dq;
 		scc_mgr_set_dq_in_delay(write_group, p, final_dq[i]);
-		scc_mgr_load_dq (p);
+		scc_mgr_load_dq(p);
 
-		pr_debug("vfifo_center: margin[%u]=[%d,%d]\n", i,
-			left_edge[i] - shift_dq + (-mid_min),
-			right_edge[i] + shift_dq - (-mid_min));
-		/* To determine values for export structures */
+		DPRINT(2, "vfifo_center: margin[%lu]=[%ld,%ld]", i,
+		       left_edge[i] - shift_dq + (-mid_min), right_edge[i] + shift_dq - (-mid_min));
+		//USER To determine values for export structures 
 		if (left_edge[i] - shift_dq + (-mid_min) < dq_margin) {
 			dq_margin = left_edge[i] - shift_dq + (-mid_min);
 		}
@@ -2696,352 +3120,593 @@ static uint32_t rw_mgr_mem_calibrate_vfifo_center (uint32_t rank_bgn,
 		}
 	}
 
-#if ENABLE_DQS_IN_CENTERING
 	final_dqs = new_dqs;
 	if (IO_SHIFT_DQS_EN_WHEN_SHIFT_DQS) {
 		final_dqs_en = start_dqs_en - mid_min;
 	}
-#else
-	final_dqs = start_dqs;
-	if (IO_SHIFT_DQS_EN_WHEN_SHIFT_DQS) {
-		final_dqs_en = start_dqs_en;
-	}
-#endif
-
-	/* Move DQS-en */
+	//USER Move DQS-en
 	if (IO_SHIFT_DQS_EN_WHEN_SHIFT_DQS) {
 		scc_mgr_set_dqs_en_delay(read_group, final_dqs_en);
-		scc_mgr_load_dqs (read_group);
+		scc_mgr_load_dqs(read_group);
 	}
-
-	/* Move DQS */
+	//USER Move DQS
 	scc_mgr_set_dqs_bus_in_delay(read_group, final_dqs);
-	scc_mgr_load_dqs (read_group);
+	scc_mgr_load_dqs(read_group);
 
 	if (update_fom) {
-		/* Export values */
-		gbl->fom_in += (dq_margin + dqs_margin) / RW_MGR_NUM_DQS_PER_WRITE_GROUP;
+		//USER Export values 
+		gbl->fom_in +=
+		    (dq_margin +
+		     dqs_margin) / (RW_MGR_MEM_IF_READ_DQS_WIDTH / RW_MGR_MEM_IF_WRITE_DQS_WIDTH);
+		TCLRPT_SET(debug_summary_report->fom_in,
+			   debug_summary_report->fom_in + (dq_margin +
+							   dqs_margin) /
+			   (RW_MGR_MEM_IF_READ_DQS_WIDTH / RW_MGR_MEM_IF_WRITE_DQS_WIDTH));
+		TCLRPT_SET(debug_cal_report->cal_status_per_group[curr_shadow_reg][write_group].
+			   fom_in,
+			   debug_cal_report->cal_status_per_group[curr_shadow_reg][write_group].
+			   fom_in + (dq_margin +
+				     dqs_margin) / (RW_MGR_MEM_IF_READ_DQS_WIDTH /
+						    RW_MGR_MEM_IF_WRITE_DQS_WIDTH));
 	}
 
-	pr_debug("vfifo_center: dq_margin=%d dqs_margin=%d\n",
-		dq_margin, dqs_margin);
+	TCLRPT_SET(debug_cal_report->cal_dqs_in_margins[curr_shadow_reg][read_group].dqs_margin,
+		   dqs_margin);
+	TCLRPT_SET(debug_cal_report->cal_dqs_in_margins[curr_shadow_reg][read_group].dq_margin,
+		   dq_margin);
 
-	/* Do not remove this line as it makes sure all of our decisions
-	have been applied */
+	DPRINT(2, "vfifo_center: dq_margin=%ld dqs_margin=%ld", dq_margin, dqs_margin);
+
+	//USER Do not remove this line as it makes sure all of our decisions have been applied
 	IOWR_32DIRECT(SCC_MGR_UPD, 0, 0);
 	return (dq_margin >= 0) && (dqs_margin >= 0);
 }
 
-/*
- * calibrate the read valid prediction FIFO.
- *
- *  - read valid prediction will consist of finding a good DQS enable phase,
- * DQS enable delay, DQS input phase, and DQS input delay.
- *  - we also do a per-bit deskew on the DQ lines.
- */
+#else
+
+static uint32_t rw_mgr_mem_calibrate_vfifo_center(uint32_t rank_bgn, uint32_t grp,
+						  uint32_t test_bgn, uint32_t use_read_test)
+{
+	uint32_t i, p, d;
+	uint32_t mid;
+	t_btfld bit_chk;
+	uint32_t max_working_dq[RW_MGR_MEM_DQ_PER_READ_DQS];
+	uint32_t dq_margin, dqs_margin;
+	uint32_t start_dqs;
+
+	TRACE_FUNC("%lu %lu", grp, test_bgn);
+
+	//USER per-bit deskew.
+	//USER start of the per-bit sweep with the minimum working delay setting for
+	//USER all bits.
+
+	for (i = 0; i < RW_MGR_MEM_DQ_PER_READ_DQS; i++) {
+		max_working_dq[i] = 0;
+	}
+
+	for (d = 1; d <= IO_IO_IN_DELAY_MAX; d++) {
+		scc_mgr_apply_group_dq_in_delay(write_group, test_bgn, d);
+
+		IOWR_32DIRECT(SCC_MGR_UPD, 0, 0);
+
+		if (!rw_mgr_mem_calibrate_read_test
+		    (rank_bgn, grp, NUM_READ_PB_TESTS, PASS_ONE_BIT, &bit_chk, 0, 0)) {
+			break;
+		} else {
+			for (i = 0; i < RW_MGR_MEM_DQ_PER_READ_DQS; i++) {
+				if (bit_chk & 1) {
+					max_working_dq[i] = d;
+				}
+				bit_chk = bit_chk >> 1;
+			}
+		}
+	}
+
+	//USER determine minimum working value for DQ 
+
+	dq_margin = IO_IO_IN_DELAY_MAX;
+
+	for (i = 0; i < RW_MGR_MEM_DQ_PER_READ_DQS; i++) {
+		if (max_working_dq[i] < dq_margin) {
+			dq_margin = max_working_dq[i];
+		}
+	}
+
+	//USER add delay to bring all DQ windows to the same "level" 
+
+	for (i = 0, p = test_bgn; i < RW_MGR_MEM_DQ_PER_READ_DQS; i++, p++) {
+		if (max_working_dq[i] > dq_margin) {
+			scc_mgr_set_dq_in_delay(write_group, i, max_working_dq[i] - dq_margin);
+		} else {
+			scc_mgr_set_dq_in_delay(write_group, i, 0);
+		}
+
+		scc_mgr_load_dq(p, p);
+	}
+
+	//USER sweep DQS window, may potentially have more window due to per-bit-deskew that was done
+	//USER in the previous step.
+
+	start_dqs = READ_SCC_DQS_IN_DELAY(grp);
+
+	for (d = start_dqs + 1; d <= IO_DQS_IN_DELAY_MAX; d++) {
+		scc_mgr_set_dqs_bus_in_delay(grp, d);
+		scc_mgr_load_dqs(grp);
+
+		IOWR_32DIRECT(SCC_MGR_UPD, 0, 0);
+
+		if (!rw_mgr_mem_calibrate_read_test
+		    (rank_bgn, grp, NUM_READ_TESTS, PASS_ALL_BITS, &bit_chk, 0, 0)) {
+			break;
+		}
+	}
+
+	scc_mgr_set_dqs_bus_in_delay(grp, start_dqs);
 
+	//USER margin on the DQS pin 
 
+	dqs_margin = d - start_dqs - 1;
 
-/* VFIFO Calibration -- Full Calibration */
-static uint32_t rw_mgr_mem_calibrate_vfifo (uint32_t read_group, uint32_t test_bgn)
+	//USER find mid point, +1 so that we don't go crazy pushing DQ 
+
+	mid = (dq_margin + dqs_margin + 1) / 2;
+
+	gbl->fom_in += dq_margin + dqs_margin;
+//      TCLRPT_SET(debug_summary_report->fom_in, debug_summary_report->fom_in + (dq_margin + dqs_margin));
+//      TCLRPT_SET(debug_cal_report->cal_status_per_group[grp].fom_in, (dq_margin + dqs_margin));
+
+	//USER center DQS ... if the headroom is setup properly we shouldn't need to 
+
+	if (dqs_margin > mid) {
+		scc_mgr_set_dqs_bus_in_delay(grp, READ_SCC_DQS_IN_DELAY(grp) + dqs_margin - mid);
+
+		if (DDRX) {
+			uint32_t delay = READ_SCC_DQS_EN_DELAY(grp) + dqs_margin - mid;
+
+			if (delay > IO_DQS_EN_DELAY_MAX) {
+				delay = IO_DQS_EN_DELAY_MAX;
+			}
+
+			scc_mgr_set_dqs_en_delay(grp, delay);
+		}
+	}
+
+	scc_mgr_load_dqs(grp);
+
+	//USER center DQ 
+
+	if (dq_margin > mid) {
+		for (i = 0, p = test_bgn; i < RW_MGR_MEM_DQ_PER_READ_DQS; i++, p++) {
+			scc_mgr_set_dq_in_delay(write_group, i,
+						READ_SCC_DQ_IN_DELAY(i) + dq_margin - mid);
+			scc_mgr_load_dq(p, p);
+		}
+
+		dqs_margin += dq_margin - mid;
+		dq_margin -= dq_margin - mid;
+	}
+
+	IOWR_32DIRECT(SCC_MGR_UPD, 0, 0);
+
+	return (dq_margin + dqs_margin) > 0;
+}
+
+#endif
+
+//USER calibrate the read valid prediction FIFO.
+//USER 
+//USER  - read valid prediction will consist of finding a good DQS enable phase, DQS enable delay, DQS input phase, and DQS input delay.
+//USER  - we also do a per-bit deskew on the DQ lines.
+
+#if NEWVERSION_GW
+
+//USER VFIFO Calibration -- Full Calibration
+static uint32_t rw_mgr_mem_calibrate_vfifo(uint32_t read_group, uint32_t test_bgn)
 {
-	uint32_t p, d, rank_bgn;
+	uint32_t p, d, rank_bgn, sr;
 	uint32_t dtaps_per_ptap;
 	uint32_t tmp_delay;
 	t_btfld bit_chk;
 	uint32_t grp_calibrated;
 	uint32_t write_group, write_test_bgn;
+	uint32_t failed_substage;
+
+	TRACE_FUNC("%lu %lu", read_group, test_bgn);
 
-	/* update info for sims */
+	//USER update info for sims 
 
 	reg_file_set_stage(CAL_STAGE_VFIFO);
 
-	write_group = read_group;
-	write_test_bgn = test_bgn;
+	if (DDRX) {
+		write_group = read_group;
+		write_test_bgn = test_bgn;
+	} else {
+		write_group =
+		    read_group / (RW_MGR_MEM_IF_READ_DQS_WIDTH / RW_MGR_MEM_IF_WRITE_DQS_WIDTH);
+		write_test_bgn = read_group * RW_MGR_MEM_DQ_PER_READ_DQS;
+	}
 
-	/* USER Determine number of delay taps for each phase tap */
+	// USER Determine number of delay taps for each phase tap
 	dtaps_per_ptap = 0;
 	tmp_delay = 0;
-
-	while (tmp_delay < IO_DELAY_PER_OPA_TAP) {
-		dtaps_per_ptap++;
-		tmp_delay += IO_DELAY_PER_DQS_EN_DCHAIN_TAP;
+	if (!QDRII) {
+		while (tmp_delay < IO_DELAY_PER_OPA_TAP) {
+			dtaps_per_ptap++;
+			tmp_delay += IO_DELAY_PER_DQS_EN_DCHAIN_TAP;
+		}
+		dtaps_per_ptap--;
+		tmp_delay = 0;
 	}
-	dtaps_per_ptap--;
-	tmp_delay = 0;
-
-	/* update info for sims */
+	//USER update info for sims 
 
 	reg_file_set_group(read_group);
 
 	grp_calibrated = 0;
 
 	reg_file_set_sub_stage(CAL_SUBSTAGE_GUARANTEED_READ);
+	failed_substage = CAL_SUBSTAGE_GUARANTEED_READ;
 
 	for (d = 0; d <= dtaps_per_ptap && grp_calibrated == 0; d += 2) {
 
-		/* In RLDRAMX we may be messing the delay of pins in
-		the same write group but outside of the current read
-		group, but that's ok because we haven't calibrated the
-		output side yet. */
-		if (d > 0) {
-			scc_mgr_apply_group_all_out_delay_add_all_ranks
-			(write_group, write_test_bgn, d);
+		if (DDRX || RLDRAMX) {
+			// In RLDRAMX we may be messing the delay of pins in the same write group but outside of
+			// the current read group, but that's ok because we haven't calibrated the output side yet.
+			if (d > 0) {
+				scc_mgr_apply_group_all_out_delay_add_all_ranks(write_group,
+										write_test_bgn, d);
+			}
 		}
 
 		for (p = 0; p <= IO_DQDQS_OUT_PHASE_MAX && grp_calibrated == 0; p++) {
-			/* set a particular dqdqs phase */
-			scc_mgr_set_dqdqs_output_phase_all_ranks(
-					read_group, p);
-
-			/* Previous iteration may have failed as a result of
-			ck/dqs or ck/dk violation, in which case the device may
-			require special recovery. */
-			if (d != 0 || p != 0)
-				recover_mem_device_after_ck_dqs_violation();
+			//USER set a particular dqdqs phase 
+			if (DDRX) {
+				scc_mgr_set_dqdqs_output_phase_all_ranks(read_group, p);
+			}
+			//USER Previous iteration may have failed as a result of ck/dqs or ck/dk violation,
+			//USER in which case the device may require special recovery.
+			if (DDRX || RLDRAMX) {
+				if (d != 0 || p != 0) {
+					recover_mem_device_after_ck_dqs_violation();
+				}
+			}
 
-			pr_debug("calibrate_vfifo: g=%u p=%u d=%u\n",
-				read_group, p, d);
+			DPRINT(1, "calibrate_vfifo: g=%lu p=%lu d=%lu", read_group, p, d);
+			BFM_GBL_SET(gwrite_pos[read_group].p, p);
+			BFM_GBL_SET(gwrite_pos[read_group].d, d);
 
-			/* Load up the patterns used by read calibration
-			using current DQDQS phase */
+			//USER Load up the patterns used by read calibration using current DQDQS phase 
 
-			rw_mgr_mem_calibrate_read_load_patterns_all_ranks ();
+			rw_mgr_mem_calibrate_read_load_patterns_all_ranks();
 
-			if (!(gbl->phy_debug_mode_flags &
-				PHY_DEBUG_DISABLE_GUARANTEED_READ)) {
-			if (!rw_mgr_mem_calibrate_read_test_patterns_all_ranks
-				(read_group, 1, &bit_chk)) {
-					pr_debug("Guaranteed read test failed:"
-						" g=%u p=%u d=%u\n",
-						read_group, p, d);
+			if (!(gbl->phy_debug_mode_flags & PHY_DEBUG_DISABLE_GUARANTEED_READ)) {
+				if (!rw_mgr_mem_calibrate_read_test_patterns_all_ranks
+				    (read_group, 1, &bit_chk)) {
+					DPRINT(1, "Guaranteed read test failed: g=%lu p=%lu d=%lu",
+					       read_group, p, d);
 					break;
 				}
 			}
-/* case:56390 */
+// case:56390
 			grp_calibrated = 1;
-
-			if (rw_mgr_mem_calibrate_vfifo_find_dqs_en_phase_sweep_dq_in_delay(write_group, read_group, test_bgn)) {
-				/* USER Read per-bit deskew can be done on a
-				per shadow register basis */
-				for (rank_bgn = 0;
-						rank_bgn < RW_MGR_MEM_NUMBER_OF_RANKS;
-						rank_bgn += NUM_RANKS_PER_SHADOW_REG) {
-					/* Determine if this set of ranks
-					should be skipped entirely */
-					/* If doing read after write
-					calibration, do not update FOM
-					now - do it then */
-					if (!rw_mgr_mem_calibrate_vfifo_center(rank_bgn, write_group, read_group, test_bgn, 1, 0)) {
-						grp_calibrated = 0;
+			if (rw_mgr_mem_calibrate_vfifo_find_dqs_en_phase_sweep_dq_in_delay
+			    (write_group, read_group, test_bgn)) {
+				// USER Read per-bit deskew can be done on a per shadow register basis
+				for (rank_bgn = 0, sr = 0; rank_bgn < RW_MGR_MEM_NUMBER_OF_RANKS;
+				     rank_bgn += NUM_RANKS_PER_SHADOW_REG, ++sr) {
+					//USER Determine if this set of ranks should be skipped entirely
+					if (!param->skip_shadow_regs[sr]) {
+
+						//USER Select shadow register set
+						select_shadow_regs_for_update(rank_bgn, read_group,
+									      1);
+
+						// If doing read after write calibration, do not update FOM now - do it then
+						if (!rw_mgr_mem_calibrate_vfifo_center
+						    (rank_bgn, write_group, read_group, test_bgn, 1,
+						     0)) {
+							grp_calibrated = 0;
+							failed_substage = CAL_SUBSTAGE_VFIFO_CENTER;
+						}
 					}
 				}
 			} else {
 				grp_calibrated = 0;
+				failed_substage = CAL_SUBSTAGE_DQS_EN_PHASE;
+			}
+		}
+	}
+
+	if (grp_calibrated == 0) {
+		set_failing_group_stage(write_group, CAL_STAGE_VFIFO, failed_substage);
+
+		return 0;
+	}
+	//USER Reset the delay chains back to zero if they have moved > 1 (check for > 1 because loop will increase d even when pass in first case)
+	if (DDRX || RLDRAMII) {
+		if (d > 2) {
+			scc_mgr_zero_group(write_group, write_test_bgn, 1);
+		}
+	}
+
+	return 1;
+}
+
+#else
+
+//USER VFIFO Calibration -- Full Calibration
+static uint32_t rw_mgr_mem_calibrate_vfifo(uint32_t g, uint32_t test_bgn)
+{
+	uint32_t p, rank_bgn, sr;
+	uint32_t grp_calibrated;
+	uint32_t failed_substage;
+
+	TRACE_FUNC("%lu %lu", g, test_bgn);
+
+	//USER update info for sims 
+
+	reg_file_set_stage(CAL_STAGE_VFIFO);
+
+	reg_file_set_sub_stage(CAL_SUBSTAGE_GUARANTEED_READ);
+
+	failed_substage = CAL_SUBSTAGE_GUARANTEED_READ;
+
+	//USER update info for sims 
+
+	reg_file_set_group(g);
+
+	grp_calibrated = 0;
+
+	for (p = 0; p <= IO_DQDQS_OUT_PHASE_MAX && grp_calibrated == 0; p++) {
+		//USER set a particular dqdqs phase 
+		if (DDRX) {
+			scc_mgr_set_dqdqs_output_phase_all_ranks(g, p);
+		}
+		//USER Load up the patterns used by read calibration using current DQDQS phase 
+
+		rw_mgr_mem_calibrate_read_load_patterns_all_ranks();
+		if (!(gbl->phy_debug_mode_flags & PHY_DEBUG_DISABLE_GUARANTEED_READ)) {
+			if (!rw_mgr_mem_calibrate_read_test_patterns_all_ranks
+			    (read_group, 1, &bit_chk)) {
+				break;
+			}
+		}
+
+		grp_calibrated = 1;
+		if (rw_mgr_mem_calibrate_vfifo_find_dqs_en_phase_sweep_dq_in_delay(g, g, test_bgn)) {
+			// USER Read per-bit deskew can be done on a per shadow register basis
+			for (rank_bgn = 0, sr = 0; rank_bgn < RW_MGR_MEM_NUMBER_OF_RANKS;
+			     rank_bgn += NUM_RANKS_PER_SHADOW_REG, ++sr) {
+
+				//USER Determine if this set of ranks should be skipped entirely
+				if (!param->skip_shadow_regs[sr]) {
+
+					//USER Select shadow register set
+					select_shadow_regs_for_update(rank_bgn, read_group, 1);
+
+					if (!rw_mgr_mem_calibrate_vfifo_center
+					    (rank_bgn, g, test_bgn, 1)) {
+						grp_calibrated = 0;
+						failed_substage = CAL_SUBSTAGE_VFIFO_CENTER;
+					}
+				}
 			}
+		} else {
+			grp_calibrated = 0;
+			failed_substage = CAL_SUBSTAGE_DQS_EN_PHASE;
 		}
 	}
 
-	/* Reset the delay chains back to zero if they have moved > 1
-	(check for > 1 because loop will increase d even when pass in
-	first case) */
-	if (d > 2)
-		scc_mgr_zero_group(write_group, write_test_bgn, 1);
+	if (grp_calibrated == 0) {
+		set_failing_group_stage(g, CAL_STAGE_VFIFO, failed_substage);
+		return 0;
+	}
 
 	return 1;
 }
 
-/* VFIFO Calibration -- Read Deskew Calibration after write deskew */
-static uint32_t rw_mgr_mem_calibrate_vfifo_end (uint32_t read_group, uint32_t test_bgn)
+#endif
+
+//USER VFIFO Calibration -- Read Deskew Calibration after write deskew
+static uint32_t rw_mgr_mem_calibrate_vfifo_end(uint32_t read_group, uint32_t test_bgn)
 {
-	uint32_t rank_bgn;
+	uint32_t rank_bgn, sr;
 	uint32_t grp_calibrated;
 	uint32_t write_group;
 
-	/* update info for sims */
+	TRACE_FUNC("%lu %lu", read_group, test_bgn);
+
+	//USER update info for sims 
 
 	reg_file_set_stage(CAL_STAGE_VFIFO_AFTER_WRITES);
 	reg_file_set_sub_stage(CAL_SUBSTAGE_VFIFO_CENTER);
 
-	write_group = read_group;
+	if (DDRX) {
+		write_group = read_group;
+	} else {
+		write_group =
+		    read_group / (RW_MGR_MEM_IF_READ_DQS_WIDTH / RW_MGR_MEM_IF_WRITE_DQS_WIDTH);
+	}
 
-	/* update info for sims */
+	//USER update info for sims 
 	reg_file_set_group(read_group);
 
 	grp_calibrated = 1;
+	// USER Read per-bit deskew can be done on a per shadow register basis
+	for (rank_bgn = 0, sr = 0; rank_bgn < RW_MGR_MEM_NUMBER_OF_RANKS;
+	     rank_bgn += NUM_RANKS_PER_SHADOW_REG, ++sr) {
 
-	/* Read per-bit deskew can be done on a per shadow register basis */
-	for (rank_bgn = 0; rank_bgn < RW_MGR_MEM_NUMBER_OF_RANKS;
-		rank_bgn += NUM_RANKS_PER_SHADOW_REG) {
+		//USER Determine if this set of ranks should be skipped entirely
+		if (!param->skip_shadow_regs[sr]) {
 
-		/* This is the last calibration round, update FOM here */
-		if (!rw_mgr_mem_calibrate_vfifo_center (rank_bgn,
-			write_group, read_group, test_bgn, 0, 1))
+			//USER Select shadow register set
+			select_shadow_regs_for_update(rank_bgn, read_group, 1);
+
+			// This is the last calibration round, update FOM here
+			if (!rw_mgr_mem_calibrate_vfifo_center
+			    (rank_bgn, write_group, read_group, test_bgn, 0, 1)) {
 				grp_calibrated = 0;
+			}
+		}
 	}
 
-	if (grp_calibrated == 0)
+	if (grp_calibrated == 0) {
+		set_failing_group_stage(write_group, CAL_STAGE_VFIFO_AFTER_WRITES,
+					CAL_SUBSTAGE_VFIFO_CENTER);
 		return 0;
+	}
 
 	return 1;
 }
 
+//USER Calibrate LFIFO to find smallest read latency
 
-/* Calibrate LFIFO to find smallest read latency */
-
-static uint32_t rw_mgr_mem_calibrate_lfifo (void)
+static uint32_t rw_mgr_mem_calibrate_lfifo(void)
 {
 	uint32_t found_one;
 	t_btfld bit_chk;
+	uint32_t g;
+
+	TRACE_FUNC();
+	BFM_STAGE("lfifo");
 
-	/* update info for sims */
+	//USER update info for sims 
 
 	reg_file_set_stage(CAL_STAGE_LFIFO);
 	reg_file_set_sub_stage(CAL_SUBSTAGE_READ_LATENCY);
 
-	/* Load up the patterns used by read calibration for all ranks */
+	//USER Load up the patterns used by read calibration for all ranks
 
-	rw_mgr_mem_calibrate_read_load_patterns_all_ranks ();
+	rw_mgr_mem_calibrate_read_load_patterns_all_ranks();
 
 	found_one = 0;
 
 	do {
 		IOWR_32DIRECT(PHY_MGR_PHY_RLAT, 0, gbl->curr_read_lat);
-		pr_debug("lfifo: read_lat=%u\n", gbl->curr_read_lat);
+		DPRINT(2, "lfifo: read_lat=%lu", gbl->curr_read_lat);
 
-		if (!rw_mgr_mem_calibrate_read_test_all_ranks (0,
-			NUM_READ_TESTS, PASS_ALL_BITS, &bit_chk, 1)) {
+		if (!rw_mgr_mem_calibrate_read_test_all_ranks
+		    (0, NUM_READ_TESTS, PASS_ALL_BITS, &bit_chk, 1)) {
 			break;
 		}
 
 		found_one = 1;
 
-		/* reduce read latency and see if things are working */
-		/* correctly */
+		//USER reduce read latency and see if things are working
+		//USER correctly
 
 		gbl->curr_read_lat--;
 	} while (gbl->curr_read_lat > 0);
 
-	/* reset the fifos to get pointers to known state */
+	//USER reset the fifos to get pointers to known state 
 
 	IOWR_32DIRECT(PHY_MGR_CMD_FIFO_RESET, 0, 0);
 
 	if (found_one) {
-		/* add a fudge factor to the read latency that was determined */
+		//USER add a fudge factor to the read latency that was determined 
 		gbl->curr_read_lat += 2;
 		IOWR_32DIRECT(PHY_MGR_PHY_RLAT, 0, gbl->curr_read_lat);
-		pr_debug("lfifo: success: using read_lat=%u\n",
-			gbl->curr_read_lat);
+
+		DPRINT(2, "lfifo: success: using read_lat=%lu", gbl->curr_read_lat);
 
 		return 1;
 	} else {
-		pr_debug("lfifo: failed at initial read_lat=%u\n",
-			gbl->curr_read_lat);
+		set_failing_group_stage(0xff, CAL_STAGE_LFIFO, CAL_SUBSTAGE_READ_LATENCY);
+
+		for (g = 0; g < RW_MGR_MEM_IF_WRITE_DQS_WIDTH; g++) {
+			TCLRPT_SET(debug_cal_report->cal_status_per_group[curr_shadow_reg][g].
+				   error_stage, CAL_STAGE_LFIFO);
+			TCLRPT_SET(debug_cal_report->cal_status_per_group[curr_shadow_reg][g].
+				   error_sub_stage, CAL_SUBSTAGE_READ_LATENCY);
+		}
+
+		DPRINT(2, "lfifo: failed at initial read_lat=%lu", gbl->curr_read_lat);
 
 		return 0;
 	}
 }
 
-/*
- * issue write test command.
- * two variants are provided. one that just tests a write pattern and
- * another that tests datamask functionality.
- */
+//USER issue write test command.
+//USER two variants are provided. one that just tests a write pattern and another that
+//USER tests datamask functionality.
 
-static void rw_mgr_mem_calibrate_write_test_issue (uint32_t group, uint32_t test_dm)
+static void rw_mgr_mem_calibrate_write_test_issue(uint32_t group, uint32_t test_dm)
 {
 	uint32_t mcc_instruction;
 	uint32_t quick_write_mode = (((STATIC_CALIB_STEPS) & CALIB_SKIP_WRITES)
-		&& ENABLE_SUPER_QUICK_CALIBRATION) || BFM_MODE;
+				     && ENABLE_SUPER_QUICK_CALIBRATION) || BFM_MODE;
 	uint32_t rw_wl_nop_cycles;
 
-	/*
-	 * Set counter and jump addresses for the right
-	 * number of NOP cycles.
-	 * The number of supported NOP cycles can range from -1 to infinity
-	 * Three different cases are handled:
-	 *
-	 * 1. For a number of NOP cycles greater than 0, the RW Mgr looping
-	 *    mechanism will be used to insert the right number of NOPs
-	 *
-	 * 2. For a number of NOP cycles equals to 0, the micro-instruction
-	 *    issuing the write command will jump straight to the
-	 *    micro-instruction that turns on DQS (for DDRx), or outputs write
-	 *    data (for RLD), skipping
-	 *    the NOP micro-instruction all together
-	 *
-	 * 3. A number of NOP cycles equal to -1 indicates that DQS must be
-	 *    turned on in the same micro-instruction that issues the write
-	 *    command. Then we need
-	 *    to directly jump to the micro-instruction that sends out the data
-	 *
-	 * NOTE: Implementing this mechanism uses 2 RW Mgr jump-counters
-	 *       (2 and 3). One jump-counter (0) is used to perform multiple
-	 *       write-read operations.
-	 *       one counter left to issue this command in "multiple-group" mode
-	 */
-
-#if MULTIPLE_AFI_WLAT
-	rw_wl_nop_cycles = gbl->rw_wl_nop_cycles_per_group[group];
-#else
+	//USER Set counter and jump addresses for the right
+	//USER number of NOP cycles.
+	//USER The number of supported NOP cycles can range from -1 to infinity
+	//USER Three different cases are handled:
+	//USER
+	//USER 1. For a number of NOP cycles greater than 0, the RW Mgr looping
+	//USER    mechanism will be used to insert the right number of NOPs
+	//USER
+	//USER 2. For a number of NOP cycles equals to 0, the micro-instruction
+	//USER    issuing the write command will jump straight to the micro-instruction
+	//USER    that turns on DQS (for DDRx), or outputs write data (for RLD), skipping
+	//USER    the NOP micro-instruction all together
+	//USER
+	//USER 3. A number of NOP cycles equal to -1 indicates that DQS must be turned
+	//USER    on in the same micro-instruction that issues the write command. Then we need
+	//USER    to directly jump to the micro-instruction that sends out the data
+	//USER
+	//USER NOTE: Implementing this mechanism uses 2 RW Mgr jump-counters (2 and 3). One
+	//USER       jump-counter (0) is used to perform multiple write-read operations.
+	//USER       one counter left to issue this command in "multiple-group" mode.
+
 	rw_wl_nop_cycles = gbl->rw_wl_nop_cycles;
-#endif
 
 	if (rw_wl_nop_cycles == -1) {
-		/* CNTR 2 - We want to execute the special write operation that
-		turns on DQS right away and then skip directly to the
-		instruction that sends out the data. We set the counter to a
-		large number so that the jump is always taken */
+		//USER CNTR 2 - We want to execute the special write operation that
+		//USER turns on DQS right away and then skip directly to the instruction that
+		//USER sends out the data. We set the counter to a large number so that the
+		//USER jump is always taken
 		IOWR_32DIRECT(RW_MGR_LOAD_CNTR_2, 0, 0xFF);
 
-		/* CNTR 3 - Not used */
+		//USER CNTR 3 - Not used
 		if (test_dm) {
 			mcc_instruction = __RW_MGR_LFSR_WR_RD_DM_BANK_0_WL_1;
 			IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_2, 0,
-				__RW_MGR_LFSR_WR_RD_DM_BANK_0_DATA);
-			IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_3, 0,
-				__RW_MGR_LFSR_WR_RD_DM_BANK_0_NOP);
+				      __RW_MGR_LFSR_WR_RD_DM_BANK_0_DATA);
+			IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_3, 0, __RW_MGR_LFSR_WR_RD_DM_BANK_0_NOP);
 		} else {
 			mcc_instruction = __RW_MGR_LFSR_WR_RD_BANK_0_WL_1;
-			IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_2, 0,
-				__RW_MGR_LFSR_WR_RD_BANK_0_DATA);
-			IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_3, 0,
-				__RW_MGR_LFSR_WR_RD_BANK_0_NOP);
+			IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_2, 0, __RW_MGR_LFSR_WR_RD_BANK_0_DATA);
+			IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_3, 0, __RW_MGR_LFSR_WR_RD_BANK_0_NOP);
 		}
+
 	} else if (rw_wl_nop_cycles == 0) {
-		/* CNTR 2 - We want to skip the NOP operation and go straight to
-		the DQS enable instruction. We set the counter to a large number
-		so that the jump is always taken */
+		//USER CNTR 2 - We want to skip the NOP operation and go straight to
+		//USER the DQS enable instruction. We set the counter to a large number so that the
+		//USER jump is always taken
 		IOWR_32DIRECT(RW_MGR_LOAD_CNTR_2, 0, 0xFF);
 
-		/* CNTR 3 - Not used */
+		//USER CNTR 3 - Not used
 		if (test_dm) {
 			mcc_instruction = __RW_MGR_LFSR_WR_RD_DM_BANK_0;
-			IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_2, 0,
-				__RW_MGR_LFSR_WR_RD_DM_BANK_0_DQS);
+			IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_2, 0, __RW_MGR_LFSR_WR_RD_DM_BANK_0_DQS);
 		} else {
 			mcc_instruction = __RW_MGR_LFSR_WR_RD_BANK_0;
-			IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_2, 0,
-				__RW_MGR_LFSR_WR_RD_BANK_0_DQS);
+			IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_2, 0, __RW_MGR_LFSR_WR_RD_BANK_0_DQS);
 		}
+
 	} else {
-		/* CNTR 2 - In this case we want to execute the next instruction
-		and NOT take the jump. So we set the counter to 0. The jump
-		address doesn't count */
+		//USER CNTR 2 - In this case we want to execute the next instruction and NOT
+		//USER take the jump. So we set the counter to 0. The jump address doesn't count
 		IOWR_32DIRECT(RW_MGR_LOAD_CNTR_2, 0, 0x0);
 		IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_2, 0, 0x0);
 
-		/* CNTR 3 - Set the nop counter to the number of cycles we
-		need to loop for, minus 1 */
+		//USER CNTR 3 - Set the nop counter to the number of cycles we need to loop for, minus 1
 		IOWR_32DIRECT(RW_MGR_LOAD_CNTR_3, 0, rw_wl_nop_cycles - 1);
 		if (test_dm) {
 			mcc_instruction = __RW_MGR_LFSR_WR_RD_DM_BANK_0;
-			IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_3, 0,
-				__RW_MGR_LFSR_WR_RD_DM_BANK_0_NOP);
+			IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_3, 0, __RW_MGR_LFSR_WR_RD_DM_BANK_0_NOP);
 		} else {
 			mcc_instruction = __RW_MGR_LFSR_WR_RD_BANK_0;
-			IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_3, 0,
-				__RW_MGR_LFSR_WR_RD_BANK_0_NOP);
+			IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_3, 0, __RW_MGR_LFSR_WR_RD_BANK_0_NOP);
 		}
 	}
 
@@ -3054,172 +3719,180 @@ static void rw_mgr_mem_calibrate_write_test_issue (uint32_t group, uint32_t test
 	}
 	IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_0, 0, mcc_instruction);
 
-	/* CNTR 1 - This is used to ensure enough time elapses
-	for read data to come back. */
+	//USER CNTR 1 - This is used to ensure enough time elapses for read data to come back.
 	IOWR_32DIRECT(RW_MGR_LOAD_CNTR_1, 0, 0x30);
 
 	if (test_dm) {
-		IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_1, 0,
-			__RW_MGR_LFSR_WR_RD_DM_BANK_0_WAIT);
+		IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_1, 0, __RW_MGR_LFSR_WR_RD_DM_BANK_0_WAIT);
 	} else {
-		IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_1, 0,
-			__RW_MGR_LFSR_WR_RD_BANK_0_WAIT);
+		IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_1, 0, __RW_MGR_LFSR_WR_RD_BANK_0_WAIT);
 	}
 
 	IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, (group << 2), mcc_instruction);
 }
 
-/* Test writes, can check for a single bit pass or multiple bit pass */
+//USER Test writes, can check for a single bit pass or multiple bit pass
 
-static uint32_t rw_mgr_mem_calibrate_write_test (uint32_t rank_bgn,
-		uint32_t write_group, uint32_t use_dm, uint32_t all_correct,
-		t_btfld *bit_chk, uint32_t all_ranks)
+static uint32_t rw_mgr_mem_calibrate_write_test(uint32_t rank_bgn, uint32_t write_group,
+						uint32_t use_dm, uint32_t all_correct,
+						t_btfld * bit_chk, uint32_t all_ranks)
 {
 	uint32_t r;
 	t_btfld correct_mask_vg;
 	t_btfld tmp_bit_chk;
 	uint32_t vg;
-	uint32_t rank_end = all_ranks ? RW_MGR_MEM_NUMBER_OF_RANKS :
-		(rank_bgn + NUM_RANKS_PER_SHADOW_REG);
+	uint32_t rank_end =
+	    all_ranks ? RW_MGR_MEM_NUMBER_OF_RANKS : (rank_bgn + NUM_RANKS_PER_SHADOW_REG);
 
 	*bit_chk = param->write_correct_mask;
 	correct_mask_vg = param->write_correct_mask_vg;
 
 	for (r = rank_bgn; r < rank_end; r++) {
-		/* set rank */
+		if (param->skip_ranks[r]) {
+			//USER request to skip the rank
+
+			continue;
+		}
+		//USER set rank 
 		set_rank_and_odt_mask(r, RW_MGR_ODT_MODE_READ_WRITE);
 
 		tmp_bit_chk = 0;
-		for (vg = RW_MGR_MEM_VIRTUAL_GROUPS_PER_WRITE_DQS - 1; ; vg--) {
+		for (vg = RW_MGR_MEM_VIRTUAL_GROUPS_PER_WRITE_DQS - 1;; vg--) {
 
-			/* reset the fifos to get pointers to known state */
+			//USER reset the fifos to get pointers to known state 
 			IOWR_32DIRECT(PHY_MGR_CMD_FIFO_RESET, 0, 0);
 
-			tmp_bit_chk = tmp_bit_chk <<
-				(RW_MGR_MEM_DQ_PER_WRITE_DQS /
-				RW_MGR_MEM_VIRTUAL_GROUPS_PER_WRITE_DQS);
-			rw_mgr_mem_calibrate_write_test_issue (write_group *
-				RW_MGR_MEM_VIRTUAL_GROUPS_PER_WRITE_DQS + vg,
-				use_dm);
-
-			tmp_bit_chk = tmp_bit_chk | (correct_mask_vg &
-				~(IORD_32DIRECT(BASE_RW_MGR, 0)));
-			pr_debug("write_test(%u,%u,%u) :[%u,%u] "
-				BTFLD_FMT " & ~%x => " BTFLD_FMT " => "
-				BTFLD_FMT, write_group, use_dm, all_correct,
-				r, vg, correct_mask_vg,
-				IORD_32DIRECT(BASE_RW_MGR, 0), correct_mask_vg
-				& ~IORD_32DIRECT(BASE_RW_MGR, 0),
-				tmp_bit_chk);
-
-			if (vg == 0)
+			tmp_bit_chk =
+			    tmp_bit_chk << (RW_MGR_MEM_DQ_PER_WRITE_DQS /
+					    RW_MGR_MEM_VIRTUAL_GROUPS_PER_WRITE_DQS);
+			rw_mgr_mem_calibrate_write_test_issue(write_group *
+							      RW_MGR_MEM_VIRTUAL_GROUPS_PER_WRITE_DQS
+							      + vg, use_dm);
+
+			tmp_bit_chk =
+			    tmp_bit_chk | (correct_mask_vg & ~(IORD_32DIRECT(BASE_RW_MGR, 0)));
+			DPRINT(2,
+			       "write_test(%lu,%lu,%lu) :[%lu,%lu] " BTFLD_FMT " & ~%x => "
+			       BTFLD_FMT " => " BTFLD_FMT, write_group, use_dm, all_correct, r, vg,
+			       correct_mask_vg, IORD_32DIRECT(BASE_RW_MGR, 0),
+			       correct_mask_vg & ~IORD_32DIRECT(BASE_RW_MGR, 0), tmp_bit_chk);
+
+			if (vg == 0) {
 				break;
+			}
 		}
 		*bit_chk &= tmp_bit_chk;
 	}
 
-	set_rank_and_odt_mask(0, RW_MGR_ODT_MODE_OFF);
-
-	if (all_correct)
+	if (all_correct) {
+		set_rank_and_odt_mask(0, RW_MGR_ODT_MODE_OFF);
+		DPRINT(2, "write_test(%lu,%lu,ALL) : " BTFLD_FMT " == " BTFLD_FMT " => %lu",
+		       write_group, use_dm, *bit_chk, param->write_correct_mask,
+		       (long unsigned int)(*bit_chk == param->write_correct_mask));
 		return (*bit_chk == param->write_correct_mask);
-	else
+	} else {
+		set_rank_and_odt_mask(0, RW_MGR_ODT_MODE_OFF);
+		DPRINT(2, "write_test(%lu,%lu,ONE) : " BTFLD_FMT " != " BTFLD_FMT " => %lu",
+		       write_group, use_dm, *bit_chk, (long unsigned int)0,
+		       (long unsigned int)(*bit_chk != 0));
 		return (*bit_chk != 0x00);
+	}
 }
 
-static uint32_t rw_mgr_mem_calibrate_write_test_all_ranks
-(uint32_t write_group, uint32_t use_dm, uint32_t all_correct, t_btfld *bit_chk)
+static inline uint32_t rw_mgr_mem_calibrate_write_test_all_ranks(uint32_t write_group,
+								 uint32_t use_dm,
+								 uint32_t all_correct,
+								 t_btfld * bit_chk)
 {
-	return rw_mgr_mem_calibrate_write_test (0, write_group,
-		use_dm, all_correct, bit_chk, 1);
+	return rw_mgr_mem_calibrate_write_test(0, write_group, use_dm, all_correct, bit_chk, 1);
 }
 
-/* level the write operations */
-/* Write Levelling -- Full Calibration */
-static uint32_t rw_mgr_mem_calibrate_wlevel (uint32_t g, uint32_t test_bgn)
+//USER level the write operations
+
+#if NEWVERSION_WL
+
+//USER Write Levelling -- Full Calibration
+static uint32_t rw_mgr_mem_calibrate_wlevel(uint32_t g, uint32_t test_bgn)
 {
-	uint32_t p, d;
+	uint32_t p, d, sr;
+
 	uint32_t num_additional_fr_cycles = 0;
+
 	t_btfld bit_chk;
 	uint32_t work_bgn, work_end, work_mid;
 	uint32_t tmp_delay;
 	uint32_t found_begin;
 	uint32_t dtaps_per_ptap;
 
-	/* update info for sims */
+	TRACE_FUNC("%lu %lu", g, test_bgn);
+	BFM_STAGE("wlevel");
+
+	//USER update info for sims
 
 	reg_file_set_stage(CAL_STAGE_WLEVEL);
 	reg_file_set_sub_stage(CAL_SUBSTAGE_WORKING_DELAY);
 
-	/* maximum phases for the sweep */
+	//USER maximum phases for the sweep 
 
-#if USE_DQS_TRACKING
 	dtaps_per_ptap = IORD_32DIRECT(REG_FILE_DTAPS_PER_PTAP, 0);
-#else
-	dtaps_per_ptap = 0;
-	tmp_delay = 0;
-	while (tmp_delay < IO_DELAY_PER_OPA_TAP) {
-		dtaps_per_ptap++;
-		tmp_delay += IO_DELAY_PER_DCHAIN_TAP;
-	}
-	dtaps_per_ptap--;
-#endif
 
-	/* starting phases */
+	//USER starting phases 
 
-	/* update info for sims */
+	//USER update info for sims
 
 	reg_file_set_group(g);
 
-	/* starting and end range where writes work */
+	//USER starting and end range where writes work 
 
-	scc_mgr_spread_out2_delay_all_ranks (g, test_bgn);
+	scc_mgr_spread_out2_delay_all_ranks(g, test_bgn);
 
 	work_bgn = 0;
 	work_end = 0;
 
-	/* step 1: find first working phase, increment in ptaps, and then in
-	dtaps if ptaps doesn't find a working phase */
+	//USER step 1: find first working phase, increment in ptaps, and then in dtaps if ptaps doesn't find a working phase 
 	found_begin = 0;
 	tmp_delay = 0;
-	for (d = 0; d <= dtaps_per_ptap; d++, tmp_delay +=
-		IO_DELAY_PER_DCHAIN_TAP) {
-		scc_mgr_apply_group_all_out_delay_all_ranks (g, test_bgn, d);
+	for (d = 0; d <= dtaps_per_ptap; d++, tmp_delay += IO_DELAY_PER_DCHAIN_TAP) {
+		scc_mgr_apply_group_all_out_delay_all_ranks(g, test_bgn, d);
 
 		work_bgn = tmp_delay;
 
-		for (p = 0; p <= IO_DQDQS_OUT_PHASE_MAX +
-			num_additional_fr_cycles*IO_DLL_CHAIN_LENGTH;
-			p++, work_bgn += IO_DELAY_PER_OPA_TAP) {
-			pr_debug("wlevel: begin-1: p=%u d=%u\n", p, d);
+		for (p = 0;
+		     p <= IO_DQDQS_OUT_PHASE_MAX + num_additional_fr_cycles * IO_DLL_CHAIN_LENGTH;
+		     p++, work_bgn += IO_DELAY_PER_OPA_TAP) {
+			DPRINT(2, "wlevel: begin-1: p=%lu d=%lu", p, d);
 			scc_mgr_set_dqdqs_output_phase_all_ranks(g, p);
 
-			if (rw_mgr_mem_calibrate_write_test_all_ranks (g, 0,
-				PASS_ONE_BIT, &bit_chk)) {
+			if (rw_mgr_mem_calibrate_write_test_all_ranks(g, 0, PASS_ONE_BIT, &bit_chk)) {
 				found_begin = 1;
 				break;
 			}
 		}
 
-		if (found_begin)
+		if (found_begin) {
 			break;
+		}
 	}
 
-	if (p > IO_DQDQS_OUT_PHASE_MAX + num_additional_fr_cycles * IO_DLL_CHAIN_LENGTH)
-		/* fail, cannot find first working phase */
+	if (p > IO_DQDQS_OUT_PHASE_MAX + num_additional_fr_cycles * IO_DLL_CHAIN_LENGTH) {
+		//USER fail, cannot find first working phase 
+
+		set_failing_group_stage(g, CAL_STAGE_WLEVEL, CAL_SUBSTAGE_WORKING_DELAY);
+
 		return 0;
+	}
 
-	pr_debug("wlevel: first valid p=%u d=%u\n", p, d);
+	DPRINT(2, "wlevel: first valid p=%lu d=%lu", p, d);
 
 	reg_file_set_sub_stage(CAL_SUBSTAGE_LAST_WORKING_DELAY);
 
-	/* If d is 0 then the working window covers a phase tap and we can
-	follow the old procedure otherwise, we've found the beginning, and we
-	need to increment the dtaps until we find the end */
+	//USER If d is 0 then the working window covers a phase tap and we can follow the old procedure
+	//USER  otherwise, we've found the beginning, and we need to increment the dtaps until we find the end 
 	if (d == 0) {
+		COV(WLEVEL_PHASE_PTAP_OVERLAP);
 		work_end = work_bgn + IO_DELAY_PER_OPA_TAP;
 
-		/* step 2: if we have room, back off by one and increment
-		in dtaps */
+		//USER step 2: if we have room, back off by one and increment in dtaps 
 
 		if (p > 0) {
 			int found = 0;
@@ -3227,46 +3900,63 @@ static uint32_t rw_mgr_mem_calibrate_wlevel (uint32_t g, uint32_t test_bgn)
 
 			tmp_delay = work_bgn - IO_DELAY_PER_OPA_TAP;
 
-			for (d = 0; d <= IO_IO_OUT1_DELAY_MAX &&
-				tmp_delay < work_bgn; d++,
-				tmp_delay += IO_DELAY_PER_DCHAIN_TAP) {
-				pr_debug("wlevel: begin-2: p=%u d=%u\n",
-					(p - 1), d);
+			for (d = 0; d <= IO_IO_OUT1_DELAY_MAX && tmp_delay < work_bgn;
+			     d++, tmp_delay += IO_DELAY_PER_DCHAIN_TAP) {
+				DPRINT(2, "wlevel: begin-2: p=%lu d=%lu", (p - 1), d);
 				scc_mgr_apply_group_all_out_delay_all_ranks(g, test_bgn, d);
 
-				if (rw_mgr_mem_calibrate_write_test_all_ranks(g, 0, PASS_ONE_BIT, &bit_chk)) {
+				if (rw_mgr_mem_calibrate_write_test_all_ranks
+				    (g, 0, PASS_ONE_BIT, &bit_chk)) {
 					found = 1;
 					work_bgn = tmp_delay;
 					break;
 				}
 			}
 
-			scc_mgr_apply_group_all_out_delay_all_ranks (g,
-				test_bgn, 0);
+			{
+				uint32_t d2;
+				uint32_t p2;
+				if (found) {
+					d2 = d;
+					p2 = p - 1;
+				} else {
+					d2 = 0;
+					p2 = p;
+				}
+
+				DPRINT(2, "wlevel: found begin-A: p=%lu d=%lu ps=%lu", p2, d2,
+				       work_bgn);
+
+				BFM_GBL_SET(dqs_wlevel_left_edge[g].p, p2);
+				BFM_GBL_SET(dqs_wlevel_left_edge[g].d, d2);
+				BFM_GBL_SET(dqs_wlevel_left_edge[g].ps, work_bgn);
+			}
+
+			scc_mgr_apply_group_all_out_delay_all_ranks(g, test_bgn, 0);
 		} else {
-			pr_debug("wlevel: found begin-B: p=%u d=%u ps=%u\n",
-				p, d, work_bgn);
+			DPRINT(2, "wlevel: found begin-B: p=%lu d=%lu ps=%lu", p, d, work_bgn);
+
+			BFM_GBL_SET(dqs_wlevel_left_edge[g].p, p);
+			BFM_GBL_SET(dqs_wlevel_left_edge[g].d, d);
+			BFM_GBL_SET(dqs_wlevel_left_edge[g].ps, work_bgn);
 		}
 
-		/* step 3: go forward from working phase to non working phase,
-		increment in ptaps */
+		//USER step 3: go forward from working phase to non working phase, increment in ptaps 
 
-		for (p = p + 1; p <= IO_DQDQS_OUT_PHASE_MAX +
-			num_additional_fr_cycles * IO_DLL_CHAIN_LENGTH; p++,
-			work_end += IO_DELAY_PER_OPA_TAP) {
-			pr_debug("wlevel: end-0: p=%u d=%u\n", p,
-				0);
+		for (p = p + 1;
+		     p <= IO_DQDQS_OUT_PHASE_MAX + num_additional_fr_cycles * IO_DLL_CHAIN_LENGTH;
+		     p++, work_end += IO_DELAY_PER_OPA_TAP) {
+			DPRINT(2, "wlevel: end-0: p=%lu d=%lu", p, (long unsigned int)0);
 			scc_mgr_set_dqdqs_output_phase_all_ranks(g, p);
 
-			if (!rw_mgr_mem_calibrate_write_test_all_ranks (g, 0,
-				PASS_ONE_BIT, &bit_chk)) {
+			if (!rw_mgr_mem_calibrate_write_test_all_ranks
+			    (g, 0, PASS_ONE_BIT, &bit_chk)) {
 				break;
 			}
 		}
 
-		/* step 4: back off one from last, increment in dtaps */
-		/* The actual increment is done outside the if/else statement
-		since it is shared with other code */
+		//USER step 4: back off one from last, increment in dtaps 
+		//USER The actual increment is done outside the if/else statement since it is shared with other code
 
 		p = p - 1;
 
@@ -3276,52 +3966,61 @@ static uint32_t rw_mgr_mem_calibrate_wlevel (uint32_t g, uint32_t test_bgn)
 		d = 0;
 
 	} else {
-		/* step 5: Window doesn't cover phase tap, just increment
-		dtaps until failure */
-		/* The actual increment is done outside the if/else statement
-		since it is shared with other code */
+		//USER step 5: Window doesn't cover phase tap, just increment dtaps until failure
+		//USER The actual increment is done outside the if/else statement since it is shared with other code
+		COV(WLEVEL_PHASE_PTAP_NO_OVERLAP);
 		work_end = work_bgn;
-		pr_debug("wlevel: found begin-C: p=%u d=%u ps=%u\n", p,
-			d, work_bgn);
+		DPRINT(2, "wlevel: found begin-C: p=%lu d=%lu ps=%lu", p, d, work_bgn);
+		BFM_GBL_SET(dqs_wlevel_left_edge[g].p, p);
+		BFM_GBL_SET(dqs_wlevel_left_edge[g].d, d);
+		BFM_GBL_SET(dqs_wlevel_left_edge[g].ps, work_bgn);
+
 	}
 
-	/* The actual increment until failure */
-	for (; d <= IO_IO_OUT1_DELAY_MAX; d++, work_end +=
-		IO_DELAY_PER_DCHAIN_TAP) {
-		pr_debug("wlevel: end: p=%u d=%u\n", p, d);
-		scc_mgr_apply_group_all_out_delay_all_ranks (g, test_bgn, d);
+	//USER The actual increment until failure
+	for (; d <= IO_IO_OUT1_DELAY_MAX; d++, work_end += IO_DELAY_PER_DCHAIN_TAP) {
+		DPRINT(2, "wlevel: end: p=%lu d=%lu", p, d);
+		scc_mgr_apply_group_all_out_delay_all_ranks(g, test_bgn, d);
 
-		if (!rw_mgr_mem_calibrate_write_test_all_ranks (g, 0,
-			PASS_ONE_BIT, &bit_chk)) {
+		if (!rw_mgr_mem_calibrate_write_test_all_ranks(g, 0, PASS_ONE_BIT, &bit_chk)) {
 			break;
 		}
 	}
-	scc_mgr_zero_group (g, test_bgn, 1);
+	scc_mgr_zero_group(g, test_bgn, 1);
 
 	work_end -= IO_DELAY_PER_DCHAIN_TAP;
 
 	if (work_end >= work_bgn) {
-		/* we have a working range */
+		//USER we have a working range 
 	} else {
-		/* nil range */
+		//USER nil range 
+
+		set_failing_group_stage(g, CAL_STAGE_WLEVEL, CAL_SUBSTAGE_LAST_WORKING_DELAY);
+
 		return 0;
 	}
 
-	pr_debug("wlevel: found end: p=%u d=%u; range: [%u,%u]\n", p,
-		d-1, work_bgn, work_end);
+	DPRINT(2, "wlevel: found end: p=%lu d=%lu; range: [%lu,%lu]", p, d - 1, work_bgn, work_end);
+	BFM_GBL_SET(dqs_wlevel_right_edge[g].p, p);
+	BFM_GBL_SET(dqs_wlevel_right_edge[g].d, d - 1);
+	BFM_GBL_SET(dqs_wlevel_right_edge[g].ps, work_end);
+
+	for (sr = 0; sr < NUM_SHADOW_REGS; sr++) {
+		TCLRPT_SET(debug_cal_report->cal_dqs_out_margins[sr][g].dqdqs_start, work_bgn);
+		TCLRPT_SET(debug_cal_report->cal_dqs_out_margins[sr][g].dqdqs_end, work_end);
+	}
 
-	/* center */
+	//USER center 
 
 	work_mid = (work_bgn + work_end) / 2;
 
-	pr_debug("wlevel: work_mid=%d\n", work_mid);
+	DPRINT(2, "wlevel: work_mid=%ld", work_mid);
 
 	tmp_delay = 0;
 
-	for (p = 0; p <= IO_DQDQS_OUT_PHASE_MAX  +
-		num_additional_fr_cycles * IO_DLL_CHAIN_LENGTH &&
-		tmp_delay < work_mid; p++, tmp_delay += IO_DELAY_PER_OPA_TAP)
-		;
+	for (p = 0;
+	     p <= IO_DQDQS_OUT_PHASE_MAX + num_additional_fr_cycles * IO_DLL_CHAIN_LENGTH
+	     && tmp_delay < work_mid; p++, tmp_delay += IO_DELAY_PER_OPA_TAP) ;
 
 	if (tmp_delay > work_mid) {
 		tmp_delay -= IO_DELAY_PER_OPA_TAP;
@@ -3335,38 +4034,169 @@ static uint32_t rw_mgr_mem_calibrate_wlevel (uint32_t g, uint32_t test_bgn)
 
 	scc_mgr_set_dqdqs_output_phase_all_ranks(g, p);
 
-	pr_debug("wlevel: p=%u tmp_delay=%u left=%u\n", p, tmp_delay,
-		work_mid - tmp_delay);
+	DPRINT(2, "wlevel: p=%lu tmp_delay=%lu left=%lu", p, tmp_delay, work_mid - tmp_delay);
 
-	for (d = 0; d <= IO_IO_OUT1_DELAY_MAX && tmp_delay < work_mid; d++,
-		tmp_delay += IO_DELAY_PER_DCHAIN_TAP)
-		;
+	for (d = 0; d <= IO_IO_OUT1_DELAY_MAX && tmp_delay < work_mid;
+	     d++, tmp_delay += IO_DELAY_PER_DCHAIN_TAP) ;
 
 	if (tmp_delay > work_mid) {
 		tmp_delay -= IO_DELAY_PER_DCHAIN_TAP;
 		d--;
 	}
 
-	pr_debug("wlevel: p=%u d=%u tmp_delay=%u left=%u\n", p, d,
-		tmp_delay, work_mid - tmp_delay);
+	DPRINT(2, "wlevel: p=%lu d=%lu tmp_delay=%lu left=%lu", p, d, tmp_delay,
+	       work_mid - tmp_delay);
+
+	scc_mgr_apply_group_all_out_delay_add_all_ranks(g, test_bgn, d);
+
+	DPRINT(2, "wlevel: found middle: p=%lu d=%lu", p, d);
+	BFM_GBL_SET(dqs_wlevel_mid[g].p, p);
+	BFM_GBL_SET(dqs_wlevel_mid[g].d, d);
+	BFM_GBL_SET(dqs_wlevel_mid[g].ps, work_mid);
+
+	return 1;
+}
+
+#else
+
+//USER Write Levelling -- Full Calibration
+static uint32_t rw_mgr_mem_calibrate_wlevel(uint32_t g, uint32_t test_bgn)
+{
+	uint32_t p, d;
+	t_btfld bit_chk;
+	uint32_t work_bgn, work_end, work_mid;
+	uint32_t tmp_delay;
+
+	TRACE_FUNC("%lu %lu", g, test_bgn);
+
+	//USER update info for sims
+
+	reg_file_set_stage(CAL_STAGE_WLEVEL);
+	reg_file_set_sub_stage(CAL_SUBSTAGE_WORKING_DELAY);
+
+	//USER maximum phases for the sweep 
+
+	//USER starting phases 
+
+	//USER update info for sims
+
+	reg_file_set_group(g);
+
+	//USER starting and end range where writes work 
+
+	work_bgn = 0;
+	work_end = 0;
+
+	//USER step 1: find first working phase, increment in ptaps 
+
+	for (p = 0; p <= IO_DQDQS_OUT_PHASE_MAX; p++, work_bgn += IO_DELAY_PER_OPA_TAP) {
+		scc_mgr_set_dqdqs_output_phase_all_ranks(g, p);
+
+		if (rw_mgr_mem_calibrate_write_test_all_ranks(g, 0, PASS_ONE_BIT, &bit_chk)) {
+			break;
+		}
+	}
+
+	if (p > IO_DQDQS_OUT_PHASE_MAX) {
+		//USER fail, cannot find first working phase 
+
+		set_failing_group_stage(g, CAL_STAGE_WLEVEL, CAL_SUBSTAGE_WORKING_DELAY);
+
+		return 0;
+	}
+
+	work_end = work_bgn + IO_DELAY_PER_OPA_TAP;
+
+	reg_file_set_sub_stage(CAL_SUBSTAGE_LAST_WORKING_DELAY);
+
+	//USER step 2: if we have room, back off by one and increment in dtaps 
+
+	if (p > 0) {
+		scc_mgr_set_dqdqs_output_phase_all_ranks(g, p - 1);
+
+		tmp_delay = work_bgn - IO_DELAY_PER_OPA_TAP;
+
+		for (d = 0; d <= IO_IO_OUT1_DELAY_MAX && tmp_delay < work_bgn;
+		     d++, tmp_delay += IO_DELAY_PER_DCHAIN_TAP) {
+			scc_mgr_apply_group_all_out_delay_all_ranks(g, test_bgn, d);
+
+			if (rw_mgr_mem_calibrate_write_test_all_ranks(g, 0, PASS_ONE_BIT, &bit_chk)) {
+				work_bgn = tmp_delay;
+				break;
+			}
+		}
+
+		scc_mgr_apply_group_all_out_delay_all_ranks(g, test_bgn, 0);
+	}
+	//USER step 3: go forward from working phase to non working phase, increment in ptaps 
+
+	for (p = p + 1; p <= IO_DQDQS_OUT_PHASE_MAX; p++, work_end += IO_DELAY_PER_OPA_TAP) {
+		scc_mgr_set_dqdqs_output_phase_all_ranks(g, p);
+
+		if (!rw_mgr_mem_calibrate_write_test_all_ranks(g, 0, PASS_ONE_BIT, &bit_chk)) {
+			break;
+		}
+	}
+
+	//USER step 4: back off one from last, increment in dtaps 
+
+	scc_mgr_set_dqdqs_output_phase_all_ranks(g, p - 1);
+
+	work_end -= IO_DELAY_PER_OPA_TAP;
+
+	for (d = 0; d <= IO_IO_OUT1_DELAY_MAX; d++, work_end += IO_DELAY_PER_DCHAIN_TAP) {
+		scc_mgr_apply_group_all_out_delay_all_ranks(g, test_bgn, d);
+
+		if (!rw_mgr_mem_calibrate_write_test_all_ranks(g, 0, PASS_ONE_BIT, &bit_chk)) {
+			break;
+		}
+	}
+
+	scc_mgr_apply_group_all_out_delay_all_ranks(g, test_bgn, 0);
+
+	if (work_end > work_bgn) {
+		//USER we have a working range 
+	} else {
+		//USER nil range 
+
+		set_failing_group_stage(g, CAL_STAGE_WLEVEL, CAL_SUBSTAGE_LAST_WORKING_DELAY);
+
+		return 0;
+	}
+
+	//USER center 
 
-	scc_mgr_apply_group_all_out_delay_add_all_ranks (g, test_bgn, d);
+	work_mid = (work_bgn + work_end) / 2;
+
+	tmp_delay = 0;
+
+	for (p = 0; p <= IO_DQDQS_OUT_PHASE_MAX && tmp_delay < work_mid;
+	     p++, tmp_delay += IO_DELAY_PER_OPA_TAP) ;
+
+	tmp_delay -= IO_DELAY_PER_OPA_TAP;
+
+	scc_mgr_set_dqdqs_output_phase_all_ranks(g, p - 1);
+
+	for (d = 0; d <= IO_IO_OUT1_DELAY_MAX && tmp_delay < work_mid;
+	     d++, tmp_delay += IO_DELAY_PER_DCHAIN_TAP) ;
 
-	pr_debug("wlevel: found middle: p=%u d=%u\n", p, d);
+	scc_mgr_apply_group_all_out_delay_add_all_ranks(g, test_bgn, d - 1);
 
 	return 1;
 }
 
-/* center all windows. do per-bit-deskew to possibly increase size of
-certain windows */
+#endif
+
+//USER center all windows. do per-bit-deskew to possibly increase size of certain windows
 
-static uint32_t rw_mgr_mem_calibrate_writes_center (uint32_t rank_bgn,
-	uint32_t write_group, uint32_t test_bgn)
+#if NEWVERSION_WRDESKEW
+
+static uint32_t rw_mgr_mem_calibrate_writes_center(uint32_t rank_bgn, uint32_t write_group,
+						   uint32_t test_bgn)
 {
 	uint32_t i, p, min_index;
 	int32_t d;
-	/* Store these as signed since there are comparisons with
-	signed numbers */
+	//USER Store these as signed since there are comparisons with signed numbers
 	t_btfld bit_chk;
 	t_btfld sticky_bit_chk;
 	int32_t left_edge[RW_MGR_MEM_DQ_PER_WRITE_DQS];
@@ -3376,11 +4206,9 @@ static uint32_t rw_mgr_mem_calibrate_writes_center (uint32_t rank_bgn,
 	int32_t new_dqs, start_dqs, shift_dq;
 	int32_t dq_margin, dqs_margin, dm_margin;
 	uint32_t stop;
-	int32_t bgn_curr;
-	int32_t end_curr;
-	int32_t bgn_best;
-	int32_t end_best;
-	int32_t win_best;
+
+	TRACE_FUNC("%lu %lu", write_group, test_bgn);
+	BFM_STAGE("writes_center");
 
 	ALTERA_ASSERT(write_group < RW_MGR_MEM_IF_WRITE_DQS_WIDTH);
 
@@ -3388,120 +4216,109 @@ static uint32_t rw_mgr_mem_calibrate_writes_center (uint32_t rank_bgn,
 
 	start_dqs = READ_SCC_DQS_IO_OUT1_DELAY();
 
-	/* per-bit deskew */
+	select_curr_shadow_reg_using_rank(rank_bgn);
 
-	/* set the left and right edge of each bit to an illegal value */
-	/* use (IO_IO_OUT1_DELAY_MAX + 1) as an illegal value */
+	//USER per-bit deskew 
+
+	//USER set the left and right edge of each bit to an illegal value 
+	//USER use (IO_IO_OUT1_DELAY_MAX + 1) as an illegal value
 	sticky_bit_chk = 0;
 	for (i = 0; i < RW_MGR_MEM_DQ_PER_WRITE_DQS; i++) {
-		left_edge[i]  = IO_IO_OUT1_DELAY_MAX + 1;
+		left_edge[i] = IO_IO_OUT1_DELAY_MAX + 1;
 		right_edge[i] = IO_IO_OUT1_DELAY_MAX + 1;
 	}
 
-	/* Search for the left edge of the window for each bit */
+	//USER Search for the left edge of the window for each bit
 	for (d = 0; d <= IO_IO_OUT1_DELAY_MAX; d++) {
-		scc_mgr_apply_group_dq_out1_delay (write_group, test_bgn, d);
+		scc_mgr_apply_group_dq_out1_delay(write_group, test_bgn, d);
 
 		IOWR_32DIRECT(SCC_MGR_UPD, 0, 0);
 
-		/* Stop searching when the read test doesn't pass AND when
-		we've seen a passing read on every bit */
-		stop = !rw_mgr_mem_calibrate_write_test (rank_bgn, write_group,
-			0, PASS_ONE_BIT, &bit_chk, 0);
+		//USER Stop searching when the read test doesn't pass AND when we've seen a passing read on every bit 
+		stop =
+		    !rw_mgr_mem_calibrate_write_test(rank_bgn, write_group, 0, PASS_ONE_BIT,
+						     &bit_chk, 0);
 		sticky_bit_chk = sticky_bit_chk | bit_chk;
 		stop = stop && (sticky_bit_chk == param->write_correct_mask);
-		pr_debug("write_center(left): dtap=%u => " BTFLD_FMT
-			" == " BTFLD_FMT " && %u [bit_chk=" BTFLD_FMT "]\n",
-			d, sticky_bit_chk, param->write_correct_mask,
-			stop, bit_chk);
+		DPRINT(2,
+		       "write_center(left): dtap=%lu => " BTFLD_FMT " == " BTFLD_FMT
+		       " && %lu [bit_chk=" BTFLD_FMT "]", d, sticky_bit_chk,
+		       param->write_correct_mask, stop, bit_chk);
 
 		if (stop == 1) {
 			break;
 		} else {
 			for (i = 0; i < RW_MGR_MEM_DQ_PER_WRITE_DQS; i++) {
 				if (bit_chk & 1) {
-					/* Remember a passing test as the
-					left_edge */
+					//USER Remember a passing test as the left_edge
 					left_edge[i] = d;
 				} else {
-					/* If a left edge has not been seen yet,
-					then a future passing test will mark
-					this edge as the right edge */
-					if (left_edge[i] ==
-						IO_IO_OUT1_DELAY_MAX + 1) {
+					//USER If a left edge has not been seen yet, then a future passing test will mark this edge as the right edge 
+					if (left_edge[i] == IO_IO_OUT1_DELAY_MAX + 1) {
 						right_edge[i] = -(d + 1);
 					}
 				}
-				pr_debug("write_center[l,d=%u): "
-					"bit_chk_test=%d left_edge[%u]: %d "
-					"right_edge[%u]: %d\n",
-					d, (int)(bit_chk & 1), i, left_edge[i],
-					i, right_edge[i]);
+				DPRINT(2,
+				       "write_center[l,d=%lu): bit_chk_test=%d left_edge[%lu]: %ld right_edge[%lu]: %ld",
+				       d, (int)(bit_chk & 1), i, left_edge[i], i, right_edge[i]);
 				bit_chk = bit_chk >> 1;
 			}
 		}
 	}
 
-	/* Reset DQ delay chains to 0 */
-	scc_mgr_apply_group_dq_out1_delay (write_group, test_bgn, 0);
+	//USER Reset DQ delay chains to 0 
+	scc_mgr_apply_group_dq_out1_delay(write_group, test_bgn, 0);
 	sticky_bit_chk = 0;
 	for (i = RW_MGR_MEM_DQ_PER_WRITE_DQS - 1;; i--) {
 
-		pr_debug("write_center: left_edge[%u]: %d right_edge[%u]: "
-			"%d\n", i, left_edge[i], i, right_edge[i]);
+		DPRINT(2, "write_center: left_edge[%lu]: %ld right_edge[%lu]: %ld", i, left_edge[i],
+		       i, right_edge[i]);
 
-		/* Check for cases where we haven't found the left edge,
-		which makes our assignment of the the right edge invalid.
-		Reset it to the illegal value. */
-		if ((left_edge[i] == IO_IO_OUT1_DELAY_MAX + 1) &&
-			(right_edge[i] != IO_IO_OUT1_DELAY_MAX + 1)) {
+		//USER Check for cases where we haven't found the left edge, which makes our assignment of the the 
+		//USER right edge invalid.  Reset it to the illegal value. 
+		if ((left_edge[i] == IO_IO_OUT1_DELAY_MAX + 1)
+		    && (right_edge[i] != IO_IO_OUT1_DELAY_MAX + 1)) {
 			right_edge[i] = IO_IO_OUT1_DELAY_MAX + 1;
-			pr_debug("write_center: reset right_edge[%u]: %d\n",
-			i, right_edge[i]);
+			DPRINT(2, "write_center: reset right_edge[%lu]: %ld", i, right_edge[i]);
 		}
-
-		/* Reset sticky bit (except for bits where we have
-		seen the left edge) */
+		//USER Reset sticky bit (except for bits where we have seen the left edge) 
 		sticky_bit_chk = sticky_bit_chk << 1;
-		if ((left_edge[i] != IO_IO_OUT1_DELAY_MAX + 1))
+		if ((left_edge[i] != IO_IO_OUT1_DELAY_MAX + 1)) {
 			sticky_bit_chk = sticky_bit_chk | 1;
+		}
 
-		if (i == 0)
+		if (i == 0) {
 			break;
+		}
 	}
 
-	/* Search for the right edge of the window for each bit */
+	//USER Search for the right edge of the window for each bit 
 	for (d = 0; d <= IO_IO_OUT1_DELAY_MAX - start_dqs; d++) {
-		scc_mgr_apply_group_dqs_io_and_oct_out1 (write_group,
-			d + start_dqs);
+		scc_mgr_apply_group_dqs_io_and_oct_out1(write_group, d + start_dqs);
 
 		IOWR_32DIRECT(SCC_MGR_UPD, 0, 0);
-
-		/* Stop searching when the read test doesn't pass AND when
-		we've seen a passing read on every bit */
-		stop = !rw_mgr_mem_calibrate_write_test (rank_bgn, write_group,
-			0, PASS_ONE_BIT, &bit_chk, 0);
+		if (QDRII) {
+			rw_mgr_mem_dll_lock_wait();
+		}
+		//USER Stop searching when the read test doesn't pass AND when we've seen a passing read on every bit 
+		stop =
+		    !rw_mgr_mem_calibrate_write_test(rank_bgn, write_group, 0, PASS_ONE_BIT,
+						     &bit_chk, 0);
 		if (stop) {
 			recover_mem_device_after_ck_dqs_violation();
 		}
 		sticky_bit_chk = sticky_bit_chk | bit_chk;
 		stop = stop && (sticky_bit_chk == param->write_correct_mask);
 
-		pr_debug("write_center (right): dtap=%u => " BTFLD_FMT " == "
-			BTFLD_FMT " && %u\n", d, sticky_bit_chk,
-			param->write_correct_mask, stop);
+		DPRINT(2, "write_center (right): dtap=%lu => " BTFLD_FMT " == " BTFLD_FMT " && %lu",
+		       d, sticky_bit_chk, param->write_correct_mask, stop);
 
 		if (stop == 1) {
 			if (d == 0) {
-				for (i = 0; i < RW_MGR_MEM_DQ_PER_WRITE_DQS;
-					i++) {
-					/* d = 0 failed, but it passed when
-					testing the left edge, so it must be
-					marginal, set it to -1 */
-					if (right_edge[i] ==
-						IO_IO_OUT1_DELAY_MAX + 1 &&
-						left_edge[i] !=
-						IO_IO_OUT1_DELAY_MAX + 1) {
+				for (i = 0; i < RW_MGR_MEM_DQ_PER_WRITE_DQS; i++) {
+					//USER d = 0 failed, but it passed when testing the left edge, so it must be marginal, set it to -1
+					if (right_edge[i] == IO_IO_OUT1_DELAY_MAX + 1
+					    && left_edge[i] != IO_IO_OUT1_DELAY_MAX + 1) {
 						right_edge[i] = -1;
 					}
 				}
@@ -3510,63 +4327,49 @@ static uint32_t rw_mgr_mem_calibrate_writes_center (uint32_t rank_bgn,
 		} else {
 			for (i = 0; i < RW_MGR_MEM_DQ_PER_WRITE_DQS; i++) {
 				if (bit_chk & 1) {
-					/* Remember a passing test as
-					the right_edge */
+					//USER Remember a passing test as the right_edge 
 					right_edge[i] = d;
 				} else {
 					if (d != 0) {
-						/* If a right edge has not
-						been seen yet, then a future
-						passing test will mark this
-						edge as the left edge */
-						if (right_edge[i] ==
-							IO_IO_OUT1_DELAY_MAX
-							+ 1) {
+						//USER If a right edge has not been seen yet, then a future passing test will mark this edge as the left edge 
+						if (right_edge[i] == IO_IO_OUT1_DELAY_MAX + 1) {
 							left_edge[i] = -(d + 1);
 						}
 					} else {
-						/* d = 0 failed, but it passed
-						when testing the left edge, so
-						it must be marginal, set it
-						to -1 */
-						if (right_edge[i] ==
-							IO_IO_OUT1_DELAY_MAX +
-							1 && left_edge[i] !=
-							IO_IO_OUT1_DELAY_MAX +
-							1) {
+						//USER d = 0 failed, but it passed when testing the left edge, so it must be marginal, set it to -1
+						if (right_edge[i] == IO_IO_OUT1_DELAY_MAX + 1
+						    && left_edge[i] != IO_IO_OUT1_DELAY_MAX + 1) {
 							right_edge[i] = -1;
 						}
-						/* If a right edge has not been
-						seen yet, then a future passing
-						test will mark this edge as the
-						left edge */
-						else if (right_edge[i] ==
-							IO_IO_OUT1_DELAY_MAX +
-						1) {
+						//USER If a right edge has not been seen yet, then a future passing test will mark this edge as the left edge 
+						else if (right_edge[i] == IO_IO_OUT1_DELAY_MAX + 1) {
 							left_edge[i] = -(d + 1);
 						}
 					}
 				}
-				pr_debug("write_center[r,d=%u): "
-					"bit_chk_test=%d left_edge[%u]: %d "
-					"right_edge[%u]: %d\n",
-					d, (int)(bit_chk & 1), i, left_edge[i],
-					i, right_edge[i]);
+				DPRINT(2,
+				       "write_center[r,d=%lu): bit_chk_test=%d left_edge[%lu]: %ld right_edge[%lu]: %ld",
+				       d, (int)(bit_chk & 1), i, left_edge[i], i, right_edge[i]);
 				bit_chk = bit_chk >> 1;
 			}
 		}
 	}
 
-	/* Check that all bits have a window */
+	//USER Check that all bits have a window
 	for (i = 0; i < RW_MGR_MEM_DQ_PER_WRITE_DQS; i++) {
-		pr_debug("write_center: left_edge[%u]: %d right_edge[%u]: "
-			"%d\n", i, left_edge[i], i, right_edge[i]);
-		if ((left_edge[i] == IO_IO_OUT1_DELAY_MAX + 1) ||
-				(right_edge[i] == IO_IO_OUT1_DELAY_MAX + 1))
+		DPRINT(2, "write_center: left_edge[%lu]: %ld right_edge[%lu]: %ld", i, left_edge[i],
+		       i, right_edge[i]);
+		BFM_GBL_SET(dq_write_left_edge[write_group][i], left_edge[i]);
+		BFM_GBL_SET(dq_write_right_edge[write_group][i], right_edge[i]);
+		if ((left_edge[i] == IO_IO_OUT1_DELAY_MAX + 1)
+		    || (right_edge[i] == IO_IO_OUT1_DELAY_MAX + 1)) {
+			set_failing_group_stage(test_bgn + i, CAL_STAGE_WRITES,
+						CAL_SUBSTAGE_WRITES_CENTER);
 			return 0;
+		}
 	}
 
-	/* Find middle of window for each DQ bit */
+	//USER Find middle of window for each DQ bit 
 	mid_min = left_edge[0] - right_edge[0];
 	min_index = 0;
 	for (i = 1; i < RW_MGR_MEM_DQ_PER_WRITE_DQS; i++) {
@@ -3577,369 +4380,686 @@ static uint32_t rw_mgr_mem_calibrate_writes_center (uint32_t rank_bgn,
 		}
 	}
 
-	/*  -mid_min/2 represents the amount that we need to move DQS.
-	If mid_min is odd and positive we'll need to add one to
-	make sure the rounding in further calculations is correct
-	(always bias to the right), so just add 1 for all positive values */
-	if (mid_min > 0)
-		mid_min++;
-
-	mid_min = mid_min / 2;
+	//USER  -mid_min/2 represents the amount that we need to move DQS.  If mid_min is odd and positive we'll need to add one to
+	//USER make sure the rounding in further calculations is correct (always bias to the right), so just add 1 for all positive values
+	if (mid_min > 0) {
+		mid_min++;
+	}
+	mid_min = mid_min / 2;
+
+	DPRINT(1, "write_center: mid_min=%ld", mid_min);
+
+	//USER Determine the amount we can change DQS (which is -mid_min)
+	orig_mid_min = mid_min;
+	new_dqs = start_dqs;
+	mid_min = 0;
+
+	DPRINT(1, "write_center: start_dqs=%ld new_dqs=%ld mid_min=%ld", start_dqs, new_dqs,
+	       mid_min);
+
+	//USER Initialize data for export structures 
+	dqs_margin = IO_IO_OUT1_DELAY_MAX + 1;
+	dq_margin = IO_IO_OUT1_DELAY_MAX + 1;
+
+	//USER add delay to bring centre of all DQ windows to the same "level" 
+	for (i = 0, p = test_bgn; i < RW_MGR_MEM_DQ_PER_WRITE_DQS; i++, p++) {
+		//USER Use values before divide by 2 to reduce round off error 
+		shift_dq =
+		    (left_edge[i] - right_edge[i] -
+		     (left_edge[min_index] - right_edge[min_index])) / 2 + (orig_mid_min - mid_min);
+
+		DPRINT(2, "write_center: before: shift_dq[%lu]=%ld", i, shift_dq);
+
+		if (shift_dq + (int32_t) READ_SCC_DQ_OUT1_DELAY(i) > (int32_t) IO_IO_OUT1_DELAY_MAX) {
+			shift_dq = (int32_t) IO_IO_OUT1_DELAY_MAX - READ_SCC_DQ_OUT1_DELAY(i);
+		} else if (shift_dq + (int32_t) READ_SCC_DQ_OUT1_DELAY(i) < 0) {
+			shift_dq = -(int32_t) READ_SCC_DQ_OUT1_DELAY(i);
+		}
+		DPRINT(2, "write_center: after: shift_dq[%lu]=%ld", i, shift_dq);
+		scc_mgr_set_dq_out1_delay(write_group, i, READ_SCC_DQ_OUT1_DELAY(i) + shift_dq);
+		scc_mgr_load_dq(i);
+
+		DPRINT(2, "write_center: margin[%lu]=[%ld,%ld]", i,
+		       left_edge[i] - shift_dq + (-mid_min), right_edge[i] + shift_dq - (-mid_min));
+		//USER To determine values for export structures 
+		if (left_edge[i] - shift_dq + (-mid_min) < dq_margin) {
+			dq_margin = left_edge[i] - shift_dq + (-mid_min);
+		}
+		if (right_edge[i] + shift_dq - (-mid_min) < dqs_margin) {
+			dqs_margin = right_edge[i] + shift_dq - (-mid_min);
+		}
+	}
+
+	//USER Move DQS 
+	if (QDRII) {
+		scc_mgr_set_group_dqs_io_and_oct_out1_gradual(write_group, new_dqs);
+	} else {
+		scc_mgr_apply_group_dqs_io_and_oct_out1(write_group, new_dqs);
+		IOWR_32DIRECT(SCC_MGR_UPD, 0, 0);
+	}
+
+	//////////////////////
+	//////////////////////
+	//USER Centre DM 
+	//////////////////////
+	//////////////////////
+
+	BFM_STAGE("dm_center");
+
+	DPRINT(2, "write_center: DM");
+
+	//USER set the left and right edge of each bit to an illegal value 
+	//USER use (IO_IO_OUT1_DELAY_MAX + 1) as an illegal value
+	left_edge[0] = IO_IO_OUT1_DELAY_MAX + 1;
+	right_edge[0] = IO_IO_OUT1_DELAY_MAX + 1;
+	int32_t bgn_curr = IO_IO_OUT1_DELAY_MAX + 1;
+	int32_t end_curr = IO_IO_OUT1_DELAY_MAX + 1;
+	int32_t bgn_best = IO_IO_OUT1_DELAY_MAX + 1;
+	int32_t end_best = IO_IO_OUT1_DELAY_MAX + 1;
+	int32_t win_best = 0;
+
+	//USER Search for the/part of the window with DM shift
+	for (d = IO_IO_OUT1_DELAY_MAX; d >= 0; d -= DELTA_D) {
+		scc_mgr_apply_group_dm_out1_delay(write_group, d);
+		IOWR_32DIRECT(SCC_MGR_UPD, 0, 0);
+
+		if (rw_mgr_mem_calibrate_write_test
+		    (rank_bgn, write_group, 1, PASS_ALL_BITS, &bit_chk, 0)) {
+
+			//USE Set current end of the window
+			end_curr = -d;
+			//USER If a starting edge of our window has not been seen this is our current start of the DM window
+			if (bgn_curr == IO_IO_OUT1_DELAY_MAX + 1) {
+				bgn_curr = -d;
+			}
+			//USER If current window is bigger than best seen. Set best seen to be current window 
+			if ((end_curr - bgn_curr + 1) > win_best) {
+				win_best = end_curr - bgn_curr + 1;
+				bgn_best = bgn_curr;
+				end_best = end_curr;
+			}
+		} else {
+			//USER We just saw a failing test. Reset temp edge
+			bgn_curr = IO_IO_OUT1_DELAY_MAX + 1;
+			end_curr = IO_IO_OUT1_DELAY_MAX + 1;
+		}
+
+	}
+
+	//USER Reset DM delay chains to 0
+	scc_mgr_apply_group_dm_out1_delay(write_group, 0);
+
+	//USER Check to see if the current window nudges up aganist 0 delay. If so we need to continue the search by shifting DQS otherwise DQS search begins as a new search
+	if (end_curr != 0) {
+		bgn_curr = IO_IO_OUT1_DELAY_MAX + 1;
+		end_curr = IO_IO_OUT1_DELAY_MAX + 1;
+	}
+	//USER Search for the/part of the window with DQS shifts
+	for (d = 0; d <= IO_IO_OUT1_DELAY_MAX - new_dqs; d += DELTA_D) {
+		// Note: This only shifts DQS, so are we limiting ourselve to
+		// width of DQ unnecessarily
+		scc_mgr_apply_group_dqs_io_and_oct_out1(write_group, d + new_dqs);
+
+		IOWR_32DIRECT(SCC_MGR_UPD, 0, 0);
+
+		if (rw_mgr_mem_calibrate_write_test
+		    (rank_bgn, write_group, 1, PASS_ALL_BITS, &bit_chk, 0)) {
+
+			//USE Set current end of the window
+			end_curr = d;
+			//USER If a beginning edge of our window has not been seen this is our current begin of the DM window
+			if (bgn_curr == IO_IO_OUT1_DELAY_MAX + 1) {
+				bgn_curr = d;
+			}
+			//USER If current window is bigger than best seen. Set best seen to be current window
+			if ((end_curr - bgn_curr + 1) > win_best) {
+				win_best = end_curr - bgn_curr + 1;
+				bgn_best = bgn_curr;
+				end_best = end_curr;
+			}
+		} else {
+			//USER We just saw a failing test. Reset temp edge
+			recover_mem_device_after_ck_dqs_violation();
+			bgn_curr = IO_IO_OUT1_DELAY_MAX + 1;
+			end_curr = IO_IO_OUT1_DELAY_MAX + 1;
+
+			//USER Early exit optimization: if ther remaining delay chain space is less than already seen largest window we can exit
+			if ((win_best - 1) > (IO_IO_OUT1_DELAY_MAX - new_dqs - d)) {
+				break;
+			}
+
+		}
+	}
+
+	//USER assign left and right edge for cal and reporting;
+	left_edge[0] = -1 * bgn_best;
+	right_edge[0] = end_best;
+
+	DPRINT(2, "dm_calib: left=%ld right=%ld", left_edge[0], right_edge[0]);
+	BFM_GBL_SET(dm_left_edge[write_group][0], left_edge[0]);
+	BFM_GBL_SET(dm_right_edge[write_group][0], right_edge[0]);
+
+	//USER Move DQS (back to orig)
+	scc_mgr_apply_group_dqs_io_and_oct_out1(write_group, new_dqs);
+
+	//USER Move DM
+
+	//USER Find middle of window for the DM bit
+	mid = (left_edge[0] - right_edge[0]) / 2;
+
+	//USER only move right, since we are not moving DQS/DQ
+	if (mid < 0) {
+		mid = 0;
+	}
+	//dm_marign should fail if we never find a window
+	if (win_best == 0) {
+		dm_margin = -1;
+	} else {
+		dm_margin = left_edge[0] - mid;
+	}
+
+	scc_mgr_apply_group_dm_out1_delay(write_group, mid);
+	IOWR_32DIRECT(SCC_MGR_UPD, 0, 0);
+
+	DPRINT(2, "dm_calib: left=%ld right=%ld mid=%ld dm_margin=%ld",
+	       left_edge[0], right_edge[0], mid, dm_margin);
+
+	// Store observed DM margins
+	for (i = 0; i < RW_MGR_NUM_TRUE_DM_PER_WRITE_GROUP; i++) {
+		TCLRPT_SET(debug_cal_report->cal_dm_margins[curr_shadow_reg][write_group][i].
+			   left_edge, left_edge[i]);
+		TCLRPT_SET(debug_cal_report->cal_dm_margins[curr_shadow_reg][write_group][i].
+			   right_edge, right_edge[i]);
+	}
+
+	//USER Export values 
+	gbl->fom_out += dq_margin + dqs_margin;
+
+	TCLRPT_SET(debug_cal_report->cal_dqs_out_margins[curr_shadow_reg][write_group].dqs_margin,
+		   dqs_margin);
+	TCLRPT_SET(debug_cal_report->cal_dqs_out_margins[curr_shadow_reg][write_group].dq_margin,
+		   dq_margin);
+
+	TCLRPT_SET(debug_cal_report->cal_dqs_out_margins[curr_shadow_reg][write_group].dm_margin,
+		   dm_margin);
+	TCLRPT_SET(debug_summary_report->fom_out,
+		   debug_summary_report->fom_out + (dq_margin + dqs_margin));
+	TCLRPT_SET(debug_cal_report->cal_status_per_group[curr_shadow_reg][write_group].fom_out,
+		   (dq_margin + dqs_margin));
+
+	DPRINT(2, "write_center: dq_margin=%ld dqs_margin=%ld dm_margin=%ld", dq_margin, dqs_margin,
+	       dm_margin);
+
+	//USER Do not remove this line as it makes sure all of our decisions have been applied
+	IOWR_32DIRECT(SCC_MGR_UPD, 0, 0);
+	return (dq_margin >= 0) && (dqs_margin >= 0) && (dm_margin >= 0);
+}
+
+#else // !NEWVERSION_WRDESKEW
+
+static uint32_t rw_mgr_mem_calibrate_writes_center(uint32_t rank_bgn, uint32_t write_group,
+						   uint32_t test_bgn)
+{
+	uint32_t i, p, d;
+	uint32_t mid;
+	t_btfld bit_chk, sticky_bit_chk;
+	uint32_t max_working_dq[RW_MGR_MEM_DQ_PER_WRITE_DQS];
+	uint32_t max_working_dm[RW_MGR_MEM_DATA_MASK_WIDTH / RW_MGR_MEM_IF_WRITE_DQS_WIDTH];
+	uint32_t dq_margin, dqs_margin, dm_margin;
+	uint32_t start_dqs;
+	uint32_t stop;
+
+	TRACE_FUNC("%lu %lu", write_group, test_bgn);
+
+	ALTERA_ASSERT(write_group < RW_MGR_MEM_IF_WRITE_DQS_WIDTH);
+
+	//USER per-bit deskew 
+
+	for (i = 0; i < RW_MGR_MEM_DQ_PER_WRITE_DQS; i++) {
+		max_working_dq[i] = 0;
+	}
+
+	for (d = 1; d <= IO_IO_OUT1_DELAY_MAX; d++) {
+		scc_mgr_apply_group_dq_out1_delay(write_group, test_bgn, d);
+
+		IOWR_32DIRECT(SCC_MGR_UPD, 0, 0);
+
+		if (!rw_mgr_mem_calibrate_write_test
+		    (rank_bgn, write_group, 0, PASS_ONE_BIT, &bit_chk, 0)) {
+			break;
+		} else {
+			for (i = 0; i < RW_MGR_MEM_DQ_PER_WRITE_DQS; i++) {
+				if (bit_chk & 1) {
+					max_working_dq[i] = d;
+				}
+				bit_chk = bit_chk >> 1;
+			}
+		}
+	}
+
+	scc_mgr_apply_group_dq_out1_delay(write_group, test_bgn, 0);
 
-	pr_debug("write_center: mid_min=%d\n", mid_min);
+	//USER determine minimum of maximums 
 
-	/* Determine the amount we can change DQS (which is -mid_min) */
-	orig_mid_min = mid_min;
-	new_dqs = start_dqs;
-	mid_min = 0;
+	dq_margin = IO_IO_OUT1_DELAY_MAX;
 
-	pr_debug("write_center: start_dqs=%d new_dqs=%d mid_min=%d\n", start_dqs, new_dqs, mid_min);
+	for (i = 0; i < RW_MGR_MEM_DQ_PER_WRITE_DQS; i++) {
+		if (max_working_dq[i] < dq_margin) {
+			dq_margin = max_working_dq[i];
+		}
+	}
 
-	/* Initialize data for export structures */
-	dqs_margin = IO_IO_OUT1_DELAY_MAX + 1;
-	dq_margin  = IO_IO_OUT1_DELAY_MAX + 1;
+	//USER add delay to center DQ windows 
 
-	/* add delay to bring centre of all DQ windows to the same "level" */
 	for (i = 0, p = test_bgn; i < RW_MGR_MEM_DQ_PER_WRITE_DQS; i++, p++) {
-		/* Use values before divide by 2 to reduce round off error */
-		shift_dq = (left_edge[i] - right_edge[i] -
-			(left_edge[min_index] - right_edge[min_index]))/2  +
-		(orig_mid_min - mid_min);
-
-		pr_debug("write_center: before: shift_dq[%u]=%d\n", i,
-			shift_dq);
-
-		if (shift_dq + (int32_t)READ_SCC_DQ_OUT1_DELAY(i) >
-			(int32_t)IO_IO_OUT1_DELAY_MAX) {
-			shift_dq = (int32_t)IO_IO_OUT1_DELAY_MAX -
-			READ_SCC_DQ_OUT1_DELAY(i);
-		} else if (shift_dq + (int32_t)READ_SCC_DQ_OUT1_DELAY(i) < 0) {
-			shift_dq = -(int32_t)READ_SCC_DQ_OUT1_DELAY(i);
-		}
-		pr_debug("write_center: after: shift_dq[%u]=%d\n",
-			i, shift_dq);
-		scc_mgr_set_dq_out1_delay(write_group, i,
-			READ_SCC_DQ_OUT1_DELAY(i) + shift_dq);
-		scc_mgr_load_dq (i);
-
-		pr_debug("write_center: margin[%u]=[%d,%d]\n", i,
-			left_edge[i] - shift_dq + (-mid_min),
-			right_edge[i] + shift_dq - (-mid_min));
-		/* To determine values for export structures */
-		if (left_edge[i] - shift_dq + (-mid_min) < dq_margin)
-			dq_margin = left_edge[i] - shift_dq + (-mid_min);
-		if (right_edge[i] + shift_dq - (-mid_min) < dqs_margin)
-			dqs_margin = right_edge[i] + shift_dq - (-mid_min);
-	}
+		if (max_working_dq[i] > dq_margin) {
+			scc_mgr_set_dq_out1_delay(write_group, i, max_working_dq[i] - dq_margin);
+		} else {
+			scc_mgr_set_dq_out1_delay(write_group, i, 0);
+		}
 
-	/* Move DQS */
-	scc_mgr_apply_group_dqs_io_and_oct_out1 (write_group, new_dqs);
-	IOWR_32DIRECT(SCC_MGR_UPD, 0, 0);
+		scc_mgr_load_dq(p, i);
+	}
 
-	/* Centre DM */
+	//USER sweep DQS window, may potentially have more window due to per-bit-deskew
 
-	pr_debug("write_center: DM\n");
+	start_dqs = READ_SCC_DQS_IO_OUT1_DELAY();
 
-	/* set the left and right edge of each bit to an illegal value */
-	/* use (IO_IO_OUT1_DELAY_MAX + 1) as an illegal value */
-	left_edge[0]  = IO_IO_OUT1_DELAY_MAX + 1;
-	right_edge[0] = IO_IO_OUT1_DELAY_MAX + 1;
-	bgn_curr = IO_IO_OUT1_DELAY_MAX + 1;
-	end_curr = IO_IO_OUT1_DELAY_MAX + 1;
-	bgn_best = IO_IO_OUT1_DELAY_MAX + 1;
-	end_best = IO_IO_OUT1_DELAY_MAX + 1;
-	win_best = 0;
+	for (d = start_dqs + 1; d <= IO_IO_OUT1_DELAY_MAX; d++) {
+		scc_mgr_apply_group_dqs_io_and_oct_out1(write_group, d);
 
-	/* Search for the/part of the window with DM shift */
-	for (d = IO_IO_OUT1_DELAY_MAX; d >= 0; d -= DELTA_D) {
-		scc_mgr_apply_group_dm_out1_delay (write_group, d);
 		IOWR_32DIRECT(SCC_MGR_UPD, 0, 0);
 
-		if (rw_mgr_mem_calibrate_write_test (rank_bgn, write_group, 1,
-			PASS_ALL_BITS, &bit_chk, 0)) {
-
-			/*USE Set current end of the window */
-			end_curr = -d;
-			/* If a starting edge of our window has not been seen
-			this is our current start of the DM window */
-			if (bgn_curr == IO_IO_OUT1_DELAY_MAX + 1)
-				bgn_curr = -d;
+		if (QDRII) {
+			rw_mgr_mem_dll_lock_wait();
+		}
 
-			/* If current window is bigger than best seen.
-			Set best seen to be current window */
-			if ((end_curr-bgn_curr+1) > win_best) {
-				win_best = end_curr-bgn_curr+1;
-				bgn_best = bgn_curr;
-				end_best = end_curr;
-			}
-		} else {
-			/* We just saw a failing test. Reset temp edge */
-			bgn_curr = IO_IO_OUT1_DELAY_MAX + 1;
-			end_curr = IO_IO_OUT1_DELAY_MAX + 1;
+		if (!rw_mgr_mem_calibrate_write_test
+		    (rank_bgn, write_group, 0, PASS_ALL_BITS, &bit_chk, 0)) {
+			break;
 		}
 	}
 
-	/* Reset DM delay chains to 0 */
-	scc_mgr_apply_group_dm_out1_delay (write_group, 0);
+	scc_mgr_set_dqs_out1_delay(write_group, start_dqs);
+	scc_mgr_set_oct_out1_delay(write_group, start_dqs);
 
-	/* Check to see if the current window nudges up aganist 0 delay.
-	If so we need to continue the search by shifting DQS otherwise DQS
-	search begins as a new search */
-	if (end_curr != 0) {
-		bgn_curr = IO_IO_OUT1_DELAY_MAX + 1;
-		end_curr = IO_IO_OUT1_DELAY_MAX + 1;
-	}
+	dqs_margin = d - start_dqs - 1;
 
-	/* Search for the/part of the window with DQS shifts */
-	for (d = 0; d <= IO_IO_OUT1_DELAY_MAX - new_dqs; d += DELTA_D) {
-		/* Note: This only shifts DQS, so are we limiting ourselve to */
-		/* width of DQ unnecessarily */
-		scc_mgr_apply_group_dqs_io_and_oct_out1 (write_group,
-			d + new_dqs);
+	//USER time to center, +1 so that we don't go crazy centering DQ 
 
-		IOWR_32DIRECT(SCC_MGR_UPD, 0, 0);
+	mid = (dq_margin + dqs_margin + 1) / 2;
 
-		if (rw_mgr_mem_calibrate_write_test (rank_bgn, write_group, 1,
-			PASS_ALL_BITS, &bit_chk, 0)) {
+	gbl->fom_out += dq_margin + dqs_margin;
+	TCLRPT_SET(debug_summary_report->fom_out,
+		   debug_summary_report->fom_out + (dq_margin + dqs_margin));
+	TCLRPT_SET(debug_cal_report->cal_status_per_group[curr_shadow_reg][grp].fom_out,
+		   (dq_margin + dqs_margin));
 
-			/*USE Set current end of the window */
-			end_curr = d;
-			/* If a beginning edge of our window has not been seen
-			this is our current begin of the DM window */
-			if (bgn_curr == IO_IO_OUT1_DELAY_MAX + 1)
-				bgn_curr = d;
+	scc_mgr_load_dqs_io();
+	scc_mgr_load_dqs_for_write_group(write_group);
 
-			/* If current window is bigger than best seen. Set best
-			seen to be current window */
-			if ((end_curr-bgn_curr+1) > win_best) {
-				win_best = end_curr-bgn_curr+1;
-				bgn_best = bgn_curr;
-				end_best = end_curr;
-			}
-		} else {
-			/* We just saw a failing test. Reset temp edge */
-			recover_mem_device_after_ck_dqs_violation();
-			bgn_curr = IO_IO_OUT1_DELAY_MAX + 1;
-			end_curr = IO_IO_OUT1_DELAY_MAX + 1;
+	//USER center dq 
 
-			/* Early exit optimization: if ther remaining delay
-			chain space is less than already seen largest window
-			we can exit */
-			if ((win_best - 1) > (IO_IO_OUT1_DELAY_MAX - new_dqs - d))
-				break;
+	if (dq_margin > mid) {
+		for (i = 0, p = test_bgn; i < RW_MGR_MEM_DQ_PER_WRITE_DQS; i++, p++) {
+			scc_mgr_set_dq_out1_delay(write_group, i,
+						  READ_SCC_DQ_OUT1_DELAY(i) + dq_margin - mid);
+			scc_mgr_load_dq(p, i);
 		}
+		dqs_margin += dq_margin - mid;
+		dq_margin -= dq_margin - mid;
 	}
+	//USER do dm centering 
 
-	/* assign left and right edge for cal and reporting; */
-	left_edge[0] = -1*bgn_best;
-	right_edge[0] = end_best;
+	if (!RLDRAMX) {
+		dm_margin = IO_IO_OUT1_DELAY_MAX;
+
+		if (QDRII) {
+			sticky_bit_chk = 0;
+			for (i = 0; i < RW_MGR_MEM_DATA_MASK_WIDTH / RW_MGR_MEM_IF_WRITE_DQS_WIDTH;
+			     i++) {
+				max_working_dm[i] = 0;
+			}
+		}
 
-	pr_debug("dm_calib: left=%d right=%d\n", left_edge[0], right_edge[0]);
+		for (d = 1; d <= IO_IO_OUT1_DELAY_MAX; d++) {
+			scc_mgr_apply_group_dm_out1_delay(write_group, d);
+			IOWR_32DIRECT(SCC_MGR_UPD, 0, 0);
 
-	/* Move DQS (back to orig) */
-	scc_mgr_apply_group_dqs_io_and_oct_out1 (write_group, new_dqs);
+			if (DDRX) {
+				if (rw_mgr_mem_calibrate_write_test
+				    (rank_bgn, write_group, 1, PASS_ALL_BITS, &bit_chk, 0)) {
+					max_working_dm[0] = d;
+				} else {
+					break;
+				}
+			} else {
+				stop =
+				    !rw_mgr_mem_calibrate_write_test(rank_bgn, write_group, 1,
+								     PASS_ALL_BITS, &bit_chk, 0);
+				sticky_bit_chk = sticky_bit_chk | bit_chk;
+				stop = stop && (sticky_bit_chk == param->read_correct_mask);
 
-	/* Move DM */
+				if (stop == 1) {
+					break;
+				} else {
+					for (i = 0;
+					     i <
+					     RW_MGR_MEM_DATA_MASK_WIDTH /
+					     RW_MGR_MEM_IF_WRITE_DQS_WIDTH; i++) {
+						if ((bit_chk & param->dm_correct_mask) ==
+						    param->dm_correct_mask) {
+							max_working_dm[i] = d;
+						}
+						bit_chk =
+						    bit_chk >> (RW_MGR_MEM_DATA_WIDTH /
+								RW_MGR_MEM_DATA_MASK_WIDTH);
+					}
+				}
+			}
+		}
 
-	/* Find middle of window for the DM bit */
-	mid = (left_edge[0] - right_edge[0]) / 2;
+		i = 0;
+		for (i = 0; i < RW_MGR_NUM_DM_PER_WRITE_GROUP; i++) {
+			if (max_working_dm[i] > mid) {
+				scc_mgr_set_dm_out1_delay(write_group, i, max_working_dm[i] - mid);
+			} else {
+				scc_mgr_set_dm_out1_delay(write_group, i, 0);
+			}
 
-	/* only move right, since we are not moving DQS/DQ */
-	if (mid < 0)
-		mid = 0;
+			scc_mgr_load_dm(i);
 
-	/*dm_marign should fail if we never find a window */
-	if (win_best == 0) {
-		dm_margin = -1;
+			if (max_working_dm[i] < dm_margin) {
+				dm_margin = max_working_dm[i];
+			}
+		}
 	} else {
-		dm_margin = left_edge[0] - mid;
+		dm_margin = 0;
 	}
 
-	scc_mgr_apply_group_dm_out1_delay(write_group, mid);
 	IOWR_32DIRECT(SCC_MGR_UPD, 0, 0);
 
-	pr_debug("dm_calib: left=%d right=%d mid=%d dm_margin=%d\n",
-		left_edge[0], right_edge[0], mid, dm_margin);
+	return (dq_margin + dqs_margin) > 0;
+}
 
-	/* Export values */
-	gbl->fom_out += dq_margin + dqs_margin;
+#endif
 
-	pr_debug("write_center: dq_margin=%d dqs_margin=%d dm_margin=%d\n",
-		dq_margin, dqs_margin, dm_margin);
+//USER calibrate the write operations
 
-	/* Do not remove this line as it makes sure all of our
-	decisions have been applied */
-	IOWR_32DIRECT(SCC_MGR_UPD, 0, 0);
-	return (dq_margin >= 0) && (dqs_margin >= 0) && (dm_margin >= 0);
-}
+static uint32_t rw_mgr_mem_calibrate_writes(uint32_t rank_bgn, uint32_t g, uint32_t test_bgn)
+{
+	//USER update info for sims
 
-/* calibrate the write operations */
+	TRACE_FUNC("%lu %lu", g, test_bgn);
 
-static uint32_t rw_mgr_mem_calibrate_writes (uint32_t rank_bgn, uint32_t g,
-	uint32_t test_bgn)
-{
 	reg_file_set_stage(CAL_STAGE_WRITES);
 	reg_file_set_sub_stage(CAL_SUBSTAGE_WRITES_CENTER);
 
+	//USER starting phases 
+
+	//USER update info for sims
+
 	reg_file_set_group(g);
 
-	return rw_mgr_mem_calibrate_writes_center (rank_bgn, g, test_bgn);
+	if (!rw_mgr_mem_calibrate_writes_center(rank_bgn, g, test_bgn)) {
+		set_failing_group_stage(g, CAL_STAGE_WRITES, CAL_SUBSTAGE_WRITES_CENTER);
+		return 0;
+	}
+
+	return 1;
+}
+
+// helpful for creating eye diagrams 
+// TODO: This is for the TCL DBG... but obviously it serves no purpose...
+// Decide what to do with it!
+
+static void rw_mgr_mem_calibrate_eye_diag_aid(void)
+{
+	// no longer exists
+}
+
+// TODO: This needs to be update to properly handle the number of failures
+// Right now it only checks if the write test was successful or not
+static uint32_t rw_mgr_mem_calibrate_full_test(uint32_t min_correct, t_btfld * bit_chk,
+					       uint32_t test_dm)
+{
+	uint32_t g;
+	uint32_t success = 0;
+	uint32_t run_groups = ~param->skip_groups;
+
+	TRACE_FUNC("%lu %lu", min_correct, test_dm);
+
+	for (g = 0; g < RW_MGR_MEM_IF_READ_DQS_WIDTH; g++) {
+		if (run_groups & ((1 << RW_MGR_NUM_DQS_PER_WRITE_GROUP) - 1)) {
+			success =
+			    rw_mgr_mem_calibrate_write_test_all_ranks(g, test_dm, PASS_ALL_BITS,
+								      bit_chk);
+		}
+		run_groups = run_groups >> RW_MGR_NUM_DQS_PER_WRITE_GROUP;
+	}
+
+	return success;
 }
 
-/* precharge all banks and activate row 0 in bank "000..." and bank "111..." */
-static void mem_precharge_and_activate (void)
+//USER precharge all banks and activate row 0 in bank "000..." and bank "111..." 
+static void mem_precharge_and_activate(void)
 {
 	uint32_t r;
 
 	for (r = 0; r < RW_MGR_MEM_NUMBER_OF_RANKS; r++) {
-		/* set rank */
+		if (param->skip_ranks[r]) {
+			//USER request to skip the rank
+
+			continue;
+		}
+		//USER set rank
 		set_rank_and_odt_mask(r, RW_MGR_ODT_MODE_OFF);
 
-		/* precharge all banks ... */
-		IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0,
-			__RW_MGR_PRECHARGE_ALL);
+		//USER precharge all banks ... 
+		IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_PRECHARGE_ALL);
 
 		IOWR_32DIRECT(RW_MGR_LOAD_CNTR_0, 0, 0x0F);
-		IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_0, 0,
-			__RW_MGR_ACTIVATE_0_AND_1_WAIT1);
+		IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_0, 0, __RW_MGR_ACTIVATE_0_AND_1_WAIT1);
 
 		IOWR_32DIRECT(RW_MGR_LOAD_CNTR_1, 0, 0x0F);
-		IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_1, 0,
-			__RW_MGR_ACTIVATE_0_AND_1_WAIT2);
+		IOWR_32DIRECT(RW_MGR_LOAD_JUMP_ADD_1, 0, __RW_MGR_ACTIVATE_0_AND_1_WAIT2);
 
-		/* activate rows */
-		IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0,
-			__RW_MGR_ACTIVATE_0_AND_1);
+		//USER activate rows 
+		IOWR_32DIRECT(RW_MGR_RUN_SINGLE_GROUP, 0, __RW_MGR_ACTIVATE_0_AND_1);
 	}
 }
 
-/* perform all refreshes necessary over all ranks */
+//USER perform all refreshes necessary over all ranks
 
-/* Configure various memory related parameters. */
-static void mem_config (void)
+//USER Configure various memory related parameters.
+
+static void mem_config(void)
 {
 	uint32_t rlat, wlat;
 	uint32_t rw_wl_nop_cycles;
 	uint32_t max_latency;
 
-	/* read in write and read latency */
+	TRACE_FUNC();
 
-	wlat = IORD_32DIRECT (MEM_T_WL_ADD, 0);
-	wlat += IORD_32DIRECT (DATA_MGR_MEM_T_ADD, 0);
-	/* WL for hard phy does not include additive latency */
+	//USER read in write and read latency 
 
-	/*
-	 * YYONG: add addtional write latency to offset the address/command extra clock cycle
-	 * YYONG: We change the AC mux setting causing AC to be delayed by one mem clock cycle
-	 * YYONG: only do this for DDR3
-	 */
-#if DDR3 || DDR2
-	wlat += 1;
-#endif
-	rlat = IORD_32DIRECT (MEM_T_RL_ADD, 0);
+	wlat = IORD_32DIRECT(MEM_T_WL_ADD, 0);
+	wlat += IORD_32DIRECT(DATA_MGR_MEM_T_ADD, 0);	/* WL for hard phy does not include additive latency */
+
+	// YYONG: add addtional write latency to offset the address/command extra clock cycle
+	// YYONG: We change the AC mux setting causing AC to be delayed by one mem clock cycle
+	// YYONG: only do this for DDR3
+	wlat = wlat + 1;
+
+	rlat = IORD_32DIRECT(MEM_T_RL_ADD, 0);
 
 	if (QUARTER_RATE_MODE) {
-		/* In Quarter-Rate the WL-to-nop-cycles works like this */
-		/* 0,1     -> 0 */
-		/* 2,3,4,5 -> 1 */
-		/* 6,7,8,9 -> 2 */
-		/* etc... */
+		//USER In Quarter-Rate the WL-to-nop-cycles works like this
+		//USER 0,1     -> 0
+		//USER 2,3,4,5 -> 1
+		//USER 6,7,8,9 -> 2
+		//USER etc...
 		rw_wl_nop_cycles = (wlat + 6) / 4 - 1;
-	} else if (HALF_RATE_MODE)	{
-		/* In Half-Rate the WL-to-nop-cycles works like this */
-		/* 0,1 -> -1 */
-		/* 2,3 -> 0 */
-		/* 4,5 -> 1 */
-		/* etc... */
-		if (wlat % 2)
+	} else if (HALF_RATE_MODE) {
+		//USER In Half-Rate the WL-to-nop-cycles works like this
+		//USER 0,1 -> -1
+		//USER 2,3 -> 0
+		//USER 4,5 -> 1
+		//USER etc...
+		if (wlat % 2) {
 			rw_wl_nop_cycles = ((wlat - 1) / 2) - 1;
-		else
+		} else {
 			rw_wl_nop_cycles = (wlat / 2) - 1;
+		}
 	} else {
 		rw_wl_nop_cycles = wlat - 2;
-#if LPDDR2
-		rw_wl_nop_cycles = rw_wl_nop_cycles + 1;
-#endif
 	}
-#if MULTIPLE_AFI_WLAT
-	for (i = 0; i < RW_MGR_MEM_IF_WRITE_DQS_WIDTH; i++) {
-		gbl->rw_wl_nop_cycles_per_group[i] = rw_wl_nop_cycles;
-	}
-#endif
 	gbl->rw_wl_nop_cycles = rw_wl_nop_cycles;
 
-#if ARRIAV || CYCLONEV
-	/* For AV/CV, lfifo is hardened and always runs at full rate so
-	max latency in AFI clocks, used here, is correspondingly smaller */
+	//USER For AV/CV, lfifo is hardened and always runs at full rate
+	//USER so max latency in AFI clocks, used here, is correspondingly smaller
 	if (QUARTER_RATE_MODE) {
-		max_latency = (1<<MAX_LATENCY_COUNT_WIDTH)/4 - 1;
+		max_latency = (1 << MAX_LATENCY_COUNT_WIDTH) / 4 - 1;
 	} else if (HALF_RATE_MODE) {
-		max_latency = (1<<MAX_LATENCY_COUNT_WIDTH)/2 - 1;
+		max_latency = (1 << MAX_LATENCY_COUNT_WIDTH) / 2 - 1;
 	} else {
-		max_latency = (1<<MAX_LATENCY_COUNT_WIDTH)/1 - 1;
+		max_latency = (1 << MAX_LATENCY_COUNT_WIDTH) / 1 - 1;
 	}
-#else
-	max_latency = (1<<MAX_LATENCY_COUNT_WIDTH) - 1;
-#endif
-	/* configure for a burst length of 8 */
+	//USER configure for a burst length of 8
 
 	if (QUARTER_RATE_MODE) {
-		/* write latency */
+		//USER write latency 
 		wlat = (wlat + 5) / 4 + 1;
 
-		/* set a pretty high read latency initially */
+		//USER set a pretty high read latency initially
 		gbl->curr_read_lat = (rlat + 1) / 4 + 8;
 	} else if (HALF_RATE_MODE) {
-		/* write latency */
+		//USER write latency 
 		wlat = (wlat - 1) / 2 + 1;
 
-		/* set a pretty high read latency initially */
+		//USER set a pretty high read latency initially 
 		gbl->curr_read_lat = (rlat + 1) / 2 + 8;
 	} else {
-		/* write latency */
-		/* Adjust Write Latency for Hard PHY */
+		//USER write latency 
+		// Adjust Write Latency for Hard PHY
 		wlat = wlat + 1;
-#if LPDDR2
-		/* Add another one in hard for LPDDR2 since this value is raw
-		from controller assume tdqss is one */
-		wlat = wlat + 1;
-#endif
 
-		/* set a pretty high read latency initially */
+		//USER set a pretty high read latency initially 
 		gbl->curr_read_lat = rlat + 16;
 	}
 
-	if (gbl->curr_read_lat > max_latency)
+	if (gbl->curr_read_lat > max_latency) {
 		gbl->curr_read_lat = max_latency;
-
+	}
 	IOWR_32DIRECT(PHY_MGR_PHY_RLAT, 0, gbl->curr_read_lat);
 
-	/* advertise write latency */
+	//USER advertise write latency 
 	gbl->curr_write_lat = wlat;
-#if MULTIPLE_AFI_WLAT
-	for (i = 0; i < RW_MGR_MEM_IF_WRITE_DQS_WIDTH; i++) {
-		IOWR_32DIRECT(PHY_MGR_AFI_WLAT, i*4, wlat - 2);
-	}
-#else
 	IOWR_32DIRECT(PHY_MGR_AFI_WLAT, 0, wlat - 2);
-#endif
 
-	mem_precharge_and_activate ();
+	//USER initialize bit slips
+
+	mem_precharge_and_activate();
+}
+
+//USER Set VFIFO and LFIFO to instant-on settings in skip calibration mode
+
+static void mem_skip_calibrate(void)
+{
+	uint32_t vfifo_offset;
+	uint32_t i, j, r;
+
+	TRACE_FUNC();
+
+	// Need to update every shadow register set used by the interface       
+	for (r = 0; r < RW_MGR_MEM_NUMBER_OF_RANKS; r += NUM_RANKS_PER_SHADOW_REG) {
+
+		// Strictly speaking this should be called once per group to make
+		// sure each group's delay chains are refreshed from the SCC register file,
+		// but since we're resetting all delay chains anyway, we can save some
+		// runtime by calling select_shadow_regs_for_update just once to switch rank.
+		select_shadow_regs_for_update(r, 0, 1);
+
+		//USER Set output phase alignment settings appropriate for skip calibration
+		for (i = 0; i < RW_MGR_MEM_IF_READ_DQS_WIDTH; i++) {
+
+			scc_mgr_set_dqs_en_phase(i, 0);
+			// Case:33398
+			//
+			// Write data arrives to the I/O two cycles before write latency is reached (720 deg).
+			//   -> due to bit-slip in a/c bus
+			//   -> to allow board skew where dqs is longer than ck 
+			//      -> how often can this happen!?
+			//      -> can claim back some ptaps for high freq support if we can relax this, but i digress...
+			//
+			// The write_clk leads mem_ck by 90 deg
+			// The minimum ptap of the OPA is 180 deg
+			// Each ptap has (360 / IO_DLL_CHAIN_LENGH) deg of delay
+			// The write_clk is always delayed by 2 ptaps
+			//
+			// Hence, to make DQS aligned to CK, we need to delay DQS by:
+			//    (720 - 90 - 180 - 2 * (360 / IO_DLL_CHAIN_LENGTH))
+			//
+			// Dividing the above by (360 / IO_DLL_CHAIN_LENGTH) gives us the number of ptaps, which simplies to:
+			//
+			//    (1.25 * IO_DLL_CHAIN_LENGTH - 2)
+			scc_mgr_set_dqdqs_output_phase(i, (1.25 * IO_DLL_CHAIN_LENGTH - 2));
+		}
+
+		IOWR_32DIRECT(SCC_MGR_DQS_ENA, 0, 0xff);
+		IOWR_32DIRECT(SCC_MGR_DQS_IO_ENA, 0, 0xff);
+
+		for (i = 0; i < RW_MGR_MEM_IF_WRITE_DQS_WIDTH; i++) {
+			IOWR_32DIRECT(SCC_MGR_GROUP_COUNTER, 0, i);
+			IOWR_32DIRECT(SCC_MGR_DQ_ENA, 0, 0xff);
+			IOWR_32DIRECT(SCC_MGR_DM_ENA, 0, 0xff);
+		}
+
+		IOWR_32DIRECT(SCC_MGR_UPD, 0, 0);
+	}
+
+	// Compensate for simulation model behaviour 
+	for (i = 0; i < RW_MGR_MEM_IF_READ_DQS_WIDTH; i++) {
+		scc_mgr_set_dqs_bus_in_delay(i, 10);
+		scc_mgr_load_dqs(i);
+	}
+	IOWR_32DIRECT(SCC_MGR_UPD, 0, 0);
+
+	//ArriaV has hard FIFOs that can only be initialized by incrementing in sequencer
+	vfifo_offset = CALIB_VFIFO_OFFSET;
+	for (j = 0; j < vfifo_offset; j++) {
+		if (HARD_PHY) {
+			IOWR_32DIRECT(PHY_MGR_CMD_INC_VFIFO_HARD_PHY, 0, 0xff);
+		} else {
+			IOWR_32DIRECT(PHY_MGR_CMD_INC_VFIFO_FR, 0, 0xff);
+		}
+	}
+
+	IOWR_32DIRECT(PHY_MGR_CMD_FIFO_RESET, 0, 0);
+
+	// For ACV with hard lfifo, we get the skip-cal setting from generation-time constant
+	gbl->curr_read_lat = CALIB_LFIFO_OFFSET;
+	IOWR_32DIRECT(PHY_MGR_PHY_RLAT, 0, gbl->curr_read_lat);
 }
 
-/* Memory calibration entry point */
+//USER Memory calibration entry point
 
-static uint32_t mem_calibrate (void)
+static uint32_t mem_calibrate(void)
 {
 	uint32_t i;
-	uint32_t rank_bgn;
+	uint32_t rank_bgn, sr;
 	uint32_t write_group, write_test_bgn;
 	uint32_t read_group, read_test_bgn;
 	uint32_t run_groups, current_run;
+	uint32_t failing_groups = 0;
+	uint32_t group_failed = 0;
+	uint32_t sr_failed = 0;
+
+	TRACE_FUNC();
 
-	/* Initialize the data settings */
-	pr_debug("Preparing to init data\n");
-	pr_debug("Init complete\n");
+	// Initialize the data settings
+	DPRINT(1, "Preparing to init data");
+	DPRINT(1, "Init complete");
 
 	gbl->error_substage = CAL_SUBSTAGE_NIL;
 	gbl->error_stage = CAL_STAGE_NIL;
@@ -3947,93 +5067,208 @@ static uint32_t mem_calibrate (void)
 	gbl->fom_in = 0;
 	gbl->fom_out = 0;
 
-	mem_config ();
+	TCLRPT_SET(debug_summary_report->cal_read_latency, 0);
+	TCLRPT_SET(debug_summary_report->cal_write_latency, 0);
+
+	mem_config();
 
 	if (ARRIAV || CYCLONEV) {
+		uint32_t bypass_mode = (HARD_PHY) ? 0x1 : 0x0;
 		for (i = 0; i < RW_MGR_MEM_IF_READ_DQS_WIDTH; i++) {
 			IOWR_32DIRECT(SCC_MGR_GROUP_COUNTER, 0, i);
-			scc_set_bypass_mode(i);
+			scc_set_bypass_mode(i, bypass_mode);
 		}
 	}
 
-	/* Zero all delay chain/phase settings for all
-	groups and all shadow register sets */
-	scc_mgr_zero_all ();
+	if (((DYNAMIC_CALIB_STEPS) & CALIB_SKIP_ALL) == CALIB_SKIP_ALL) {
+		//USER Set VFIFO and LFIFO to instant-on settings in skip calibration mode 
 
-	run_groups = ~0;
+		mem_skip_calibrate();
+	} else {
+		for (i = 0; i < NUM_CALIB_REPEAT; i++) {
 
-	for (write_group = 0, write_test_bgn = 0; write_group
-		< RW_MGR_MEM_IF_WRITE_DQS_WIDTH; write_group++,
-		write_test_bgn += RW_MGR_MEM_DQ_PER_WRITE_DQS) {
+			//USER Zero all delay chain/phase settings for all groups and all shadow register sets
+			scc_mgr_zero_all();
 
-		/* Mark the group as being attempted for calibration */
+			run_groups = ~param->skip_groups;
 
-		current_run = run_groups & ((1 << RW_MGR_NUM_DQS_PER_WRITE_GROUP) - 1);
-		run_groups = run_groups >> RW_MGR_NUM_DQS_PER_WRITE_GROUP;
+			for (write_group = 0, write_test_bgn = 0;
+			     write_group < RW_MGR_MEM_IF_WRITE_DQS_WIDTH;
+			     write_group++, write_test_bgn += RW_MGR_MEM_DQ_PER_WRITE_DQS) {
+				// Initialized the group failure
+				group_failed = 0;
 
-		if (current_run == 0)
-			continue;
+				// Mark the group as being attempted for calibration
 
-		IOWR_32DIRECT(SCC_MGR_GROUP_COUNTER, 0, write_group);
-		scc_mgr_zero_group (write_group, write_test_bgn, 0);
+				BFM_GBL_SET(vfifo_idx, 0);
+				current_run =
+				    run_groups & ((1 << RW_MGR_NUM_DQS_PER_WRITE_GROUP) - 1);
+				run_groups = run_groups >> RW_MGR_NUM_DQS_PER_WRITE_GROUP;
 
-		for (read_group = write_group * RW_MGR_NUM_DQS_PER_WRITE_GROUP,
-			read_test_bgn = 0;
-			read_group < (write_group + 1) * RW_MGR_NUM_DQS_PER_WRITE_GROUP;
-			read_group++, read_test_bgn += RW_MGR_MEM_DQ_PER_READ_DQS) {
+				if (current_run == 0) {
+					continue;
+				}
 
-			/* Calibrate the VFIFO */
-			if (!((STATIC_CALIB_STEPS) & CALIB_SKIP_VFIFO)) {
-				if (!rw_mgr_mem_calibrate_vfifo(read_group, read_test_bgn))
-					return 0;
-			}
-		}
+				IOWR_32DIRECT(SCC_MGR_GROUP_COUNTER, 0, write_group);
+				scc_mgr_zero_group(write_group, write_test_bgn, 0);
+
+				for (read_group =
+				     write_group * RW_MGR_MEM_IF_READ_DQS_WIDTH /
+				     RW_MGR_MEM_IF_WRITE_DQS_WIDTH, read_test_bgn = 0;
+				     read_group <
+				     (write_group +
+				      1) * RW_MGR_MEM_IF_READ_DQS_WIDTH /
+				     RW_MGR_MEM_IF_WRITE_DQS_WIDTH && group_failed == 0;
+				     read_group++, read_test_bgn += RW_MGR_MEM_DQ_PER_READ_DQS) {
+
+					//USER Calibrate the VFIFO 
+					if (!((STATIC_CALIB_STEPS) & CALIB_SKIP_VFIFO)) {
+						if (!rw_mgr_mem_calibrate_vfifo
+						    (read_group, read_test_bgn)) {
+							group_failed = 1;
+
+							if (!
+							    (gbl->
+							     phy_debug_mode_flags &
+							     PHY_DEBUG_SWEEP_ALL_GROUPS)) {
+								return 0;
+							}
+						}
+					}
+				}
 
-		/* level writes (or align DK with CK for RLDRAMX) */
-		if (!(ARRIAV || CYCLONEV)) {
-			if (!((STATIC_CALIB_STEPS) & CALIB_SKIP_WLEVEL)) {
-				if (!rw_mgr_mem_calibrate_wlevel(write_group, write_test_bgn))
-					return 0;
-			}
-		}
+				//USER level writes (or align DK with CK for RLDRAMX) 
+				if (group_failed == 0) {
+					if ((DDRX || RLDRAMII) && !(ARRIAV || CYCLONEV)) {
+						if (!((STATIC_CALIB_STEPS) & CALIB_SKIP_WLEVEL)) {
+							if (!rw_mgr_mem_calibrate_wlevel
+							    (write_group, write_test_bgn)) {
+								group_failed = 1;
+
+								if (!
+								    (gbl->
+								     phy_debug_mode_flags &
+								     PHY_DEBUG_SWEEP_ALL_GROUPS)) {
+									return 0;
+								}
+							}
+						}
+					}
+				}
+				//USER Calibrate the output side 
+				if (group_failed == 0) {
+					for (rank_bgn = 0, sr = 0;
+					     rank_bgn < RW_MGR_MEM_NUMBER_OF_RANKS;
+					     rank_bgn += NUM_RANKS_PER_SHADOW_REG, ++sr) {
+						sr_failed = 0;
+						if (!((STATIC_CALIB_STEPS) & CALIB_SKIP_WRITES)) {
+							if ((STATIC_CALIB_STEPS) &
+							    CALIB_SKIP_DELAY_SWEEPS) {
+								//USER not needed in quick mode!
+							} else {
+								//USER Determine if this set of ranks should be skipped entirely
+								if (!param->skip_shadow_regs[sr]) {
+
+									//USER Select shadow register set
+									select_shadow_regs_for_update
+									    (rank_bgn, write_group,
+									     1);
+
+									if (!rw_mgr_mem_calibrate_writes(rank_bgn, write_group, write_test_bgn)) {
+										sr_failed = 1;
+										if (!
+										    (gbl->
+										     phy_debug_mode_flags
+										     &
+										     PHY_DEBUG_SWEEP_ALL_GROUPS))
+										{
+											return 0;
+										}
+									}
+								}
+							}
+						}
+						if (sr_failed == 0) {
+							TCLRPT_SET(debug_cal_report->
+								   cal_status_per_group[sr]
+								   [write_group].error_stage,
+								   CAL_STAGE_NIL);
+						} else {
+							group_failed = 1;
+						}
+					}
+				}
 
-		/* Calibrate the output side */
-		for (rank_bgn = 0; rank_bgn < RW_MGR_MEM_NUMBER_OF_RANKS;
-				rank_bgn += NUM_RANKS_PER_SHADOW_REG) {
-			if (!((STATIC_CALIB_STEPS) & CALIB_SKIP_WRITES)) {
-				if ((STATIC_CALIB_STEPS) & CALIB_SKIP_DELAY_SWEEPS) {
-					/* not needed in quick mode! */
-				} else {
-					/* Determine if this set of
-					 * ranks should be skipped
-					 * entirely */
-					if (!rw_mgr_mem_calibrate_writes(rank_bgn, write_group, write_test_bgn))
-						return 0;
+				if (group_failed == 0) {
+					for (read_group =
+					     write_group * RW_MGR_MEM_IF_READ_DQS_WIDTH /
+					     RW_MGR_MEM_IF_WRITE_DQS_WIDTH, read_test_bgn = 0;
+					     read_group <
+					     (write_group +
+					      1) * RW_MGR_MEM_IF_READ_DQS_WIDTH /
+					     RW_MGR_MEM_IF_WRITE_DQS_WIDTH && group_failed == 0;
+					     read_group++, read_test_bgn +=
+					     RW_MGR_MEM_DQ_PER_READ_DQS) {
+
+						if (!((STATIC_CALIB_STEPS) & CALIB_SKIP_WRITES)) {
+							if (!rw_mgr_mem_calibrate_vfifo_end
+							    (read_group, read_test_bgn)) {
+								group_failed = 1;
+
+								if (!
+								    (gbl->
+								     phy_debug_mode_flags &
+								     PHY_DEBUG_SWEEP_ALL_GROUPS)) {
+									return 0;
+								}
+							}
+						}
+					}
+				}
+
+				if (group_failed == 0) {
+
+#if STATIC_IN_RTL_SIM
+#else
+#endif
 				}
+
+				if (group_failed != 0) {
+					failing_groups++;
+				}
+
 			}
-		}
 
-		for (read_group = write_group * RW_MGR_NUM_DQS_PER_WRITE_GROUP,
-				read_test_bgn = 0;
-				read_group < (write_group + 1) * RW_MGR_NUM_DQS_PER_WRITE_GROUP;
-				read_group++, read_test_bgn += RW_MGR_MEM_DQ_PER_READ_DQS) {
-			if (!((STATIC_CALIB_STEPS) & CALIB_SKIP_WRITES)) {
-				if (!rw_mgr_mem_calibrate_vfifo_end(read_group, read_test_bgn))
-					return 0;
+			// USER If there are any failing groups then report the failure
+			if (failing_groups != 0) {
+				return 0;
+			}
+			//USER Calibrate the LFIFO 
+			if (!((STATIC_CALIB_STEPS) & CALIB_SKIP_LFIFO)) {
+				//USER If we're skipping groups as part of debug, don't calibrate LFIFO
+				if (param->skip_groups == 0) {
+					if (!rw_mgr_mem_calibrate_lfifo()) {
+						return 0;
+					}
+				}
 			}
 		}
 	}
 
-	/* Calibrate the LFIFO */
-	if (!((STATIC_CALIB_STEPS) & CALIB_SKIP_LFIFO)) {
-		/* If we're skipping groups as part of debug,
-		don't calibrate LFIFO */
-		if (!rw_mgr_mem_calibrate_lfifo ())
-			return 0;
+	TCLRPT_SET(debug_summary_report->cal_write_latency, IORD_32DIRECT(MEM_T_WL_ADD, 0));
+	if (QUARTER_RATE == 1) {
+		// The read latency is in terms of AFI cycles so we multiply by 4 in quarter
+		// rate to get the memory cycles.
+		TCLRPT_SET(debug_summary_report->cal_read_latency, gbl->curr_read_lat * 4);
+	} else if (HALF_RATE == 1) {
+		// The read latency is in terms of AFI cycles so we multiply by 2 in half
+		// rate to get the memory cycles.
+		TCLRPT_SET(debug_summary_report->cal_read_latency, gbl->curr_read_lat * 2);
+	} else {
+		TCLRPT_SET(debug_summary_report->cal_read_latency, gbl->curr_read_lat);
 	}
 
-	/* Do not remove this line as it makes sure all of our decisions
-	have been applied */
+	//USER Do not remove this line as it makes sure all of our decisions have been applied
 	IOWR_32DIRECT(SCC_MGR_UPD, 0, 0);
 	return 1;
 }
@@ -4043,32 +5278,59 @@ static uint32_t run_mem_calibrate(void)
 	uint32_t pass;
 	uint32_t debug_info;
 
-	/* Initialize the debug status to show that calibration has started. */
-	/* This should occur before anything else */
-	/* Reset pass/fail status shown on afi_cal_success/fail */
+	// Initialize the debug status to show that calibration has started.
+	// This should occur before anything else
+	// Reset pass/fail status shown on afi_cal_success/fail
 	IOWR_32DIRECT(PHY_MGR_CAL_STATUS, 0, PHY_MGR_CAL_RESET);
 
+	TRACE_FUNC();
+
+	BFM_STAGE("calibrate");
+	//stop tracking manger
+	uint32_t ctrlcfg = IORD_32DIRECT(CTRL_CONFIG_REG, 0);
+
+	IOWR_32DIRECT(CTRL_CONFIG_REG, 0, ctrlcfg & 0xFFBFFFFF);
+
 	initialize();
-	rw_mgr_mem_initialize ();
-	pass = mem_calibrate ();
-	mem_precharge_and_activate ();
+
+	rw_mgr_mem_initialize();
+
+	pass = mem_calibrate();
+
+	mem_precharge_and_activate();
+
+	//pe_checkout_pattern();
 
 	IOWR_32DIRECT(PHY_MGR_CMD_FIFO_RESET, 0, 0);
 
-	/* Handoff */
+	if (pass) {
+		TCLRPT_SET(debug_summary_report->error_stage, CAL_STAGE_NIL);
+
+		BFM_STAGE("handoff");
+
+#ifdef TEST_SIZE
+		if (!check_test_mem(0)) {
+			gbl->error_stage = 0x92;
+			gbl->error_group = 0x92;
+		}
+#endif
+	}
+
+	//USER Handoff 
 
-	/* Don't return control of the PHY back to AFI when in debug mode */
+	//USER Don't return control of the PHY back to AFI when in debug mode
 	if ((gbl->phy_debug_mode_flags & PHY_DEBUG_IN_DEBUG_MODE) == 0) {
-		rw_mgr_mem_handoff ();
+		rw_mgr_mem_handoff();
 
-		/* In Hard PHY this is a 2-bit control: */
-		/* 0: AFI Mux Select */
-		/* 1: DDIO Mux Select */
+		// In Hard PHY this is a 2-bit control:
+		// 0: AFI Mux Select
+		// 1: DDIO Mux Select
 		IOWR_32DIRECT(PHY_MGR_MUX_SEL, 0, 0x2);
 	}
+	IOWR_32DIRECT(CTRL_CONFIG_REG, 0, ctrlcfg);
 
 	if (pass) {
-		pr_debug("CALIBRATION PASSED\n");
+		IPRINT("CALIBRATION PASSED");
 
 		gbl->fom_in /= 2;
 		gbl->fom_out /= 2;
@@ -4081,7 +5343,7 @@ static uint32_t run_mem_calibrate(void)
 			gbl->fom_out = 0xff;
 		}
 
-		/* Update the FOM in the register file */
+		// Update the FOM in the register file
 		debug_info = gbl->fom_in;
 		debug_info |= gbl->fom_out << 8;
 		IOWR_32DIRECT(REG_FILE_FOM, 0, debug_info);
@@ -4090,32 +5352,40 @@ static uint32_t run_mem_calibrate(void)
 		IOWR_32DIRECT(PHY_MGR_CAL_STATUS, 0, PHY_MGR_CAL_SUCCESS);
 
 	} else {
-		pr_debug("CALIBRATION FAILED\n");
+
+		IPRINT("CALIBRATION FAILED");
 
 		debug_info = gbl->error_stage;
 		debug_info |= gbl->error_substage << 8;
 		debug_info |= gbl->error_group << 16;
 
-
 		IOWR_32DIRECT(REG_FILE_FAILING_STAGE, 0, debug_info);
 		IOWR_32DIRECT(PHY_MGR_CAL_DEBUG_INFO, 0, debug_info);
 		IOWR_32DIRECT(PHY_MGR_CAL_STATUS, 0, PHY_MGR_CAL_FAIL);
 
-		/* Update the failing group/stage in the register file */
+		// Update the failing group/stage in the register file
 		debug_info = gbl->error_stage;
 		debug_info |= gbl->error_substage << 8;
 		debug_info |= gbl->error_group << 16;
 		IOWR_32DIRECT(REG_FILE_FAILING_STAGE, 0, debug_info);
+
 	}
 
-	/* Set the debug status to show that calibration has ended. */
-	/* This should occur after everything else */
+	// Mark the reports as being ready to read
+	TCLRPT_SET(debug_summary_report->report_flags, debug_summary_report->report_flags |=
+		   DEBUG_REPORT_STATUS_REPORT_READY);
+	TCLRPT_SET(debug_cal_report->report_flags, debug_cal_report->report_flags |=
+		   DEBUG_REPORT_STATUS_REPORT_READY);
+	TCLRPT_SET(debug_margin_report->report_flags, debug_margin_report->report_flags |=
+		   DEBUG_REPORT_STATUS_REPORT_READY);
+
+	// Set the debug status to show that calibration has ended.
+	// This should occur after everything else
 	return pass;
 
 }
 
-static void hc_initialize_rom_data(const uint32_t *inst_rom_init, uint32_t inst_rom_init_size,
-		const uint32_t *ac_rom_init, uint32_t ac_rom_init_size)
+static void hc_initialize_rom_data(void)
 {
 	uint32_t i;
 
@@ -4132,7 +5402,7 @@ static void hc_initialize_rom_data(const uint32_t *inst_rom_init, uint32_t inst_
 
 static void initialize_reg_file(void)
 {
-	/* Initialize the register file with the correct data */
+	// Initialize the register file with the correct data
 	IOWR_32DIRECT(REG_FILE_SIGNATURE, 0, REG_FILE_INIT_SEQ_SIGNATURE);
 	IOWR_32DIRECT(REG_FILE_DEBUG_DATA_ADDR, 0, 0);
 	IOWR_32DIRECT(REG_FILE_CUR_STAGE, 0, 0);
@@ -4144,59 +5414,45 @@ static void initialize_reg_file(void)
 
 static void initialize_hps_phy(void)
 {
-	/* These may need to be included also: */
-	/* wrap_back_en (false) */
-	/* atpg_en (false) */
-	/* pipelineglobalenable (true) */
+	// These may need to be included also:
+	// wrap_back_en (false)
+	// atpg_en (false)
+	// pipelineglobalenable (true) 
 
 	uint32_t reg;
-	/* Tracking also gets configured here because it's in the
-	same register */
+	// Tracking also gets configured here because it's in the same register
 	uint32_t trk_sample_count = 7500;
-	uint32_t trk_long_idle_sample_count = (10 << 16) | 100;
-	/* Format is number of outer loops in the 16 MSB, sample
-	count in 16 LSB. */
+	uint32_t trk_long_idle_sample_count = (10 << 16) | 100;	// Format is number of outer loops in the 16 MSB, sample count in 16 LSB.
 
 	reg = 0;
-#if DDR3 || DDR2
 	reg |= SDR_CTRLGRP_PHYCTRL_PHYCTRL_0_ACDELAYEN_SET(2);
-#else
-	reg |= SDR_CTRLGRP_PHYCTRL_PHYCTRL_0_ACDELAYEN_SET(1);
-#endif
 	reg |= SDR_CTRLGRP_PHYCTRL_PHYCTRL_0_DQDELAYEN_SET(1);
 	reg |= SDR_CTRLGRP_PHYCTRL_PHYCTRL_0_DQSDELAYEN_SET(1);
 	reg |= SDR_CTRLGRP_PHYCTRL_PHYCTRL_0_DQSLOGICDELAYEN_SET(1);
 	reg |= SDR_CTRLGRP_PHYCTRL_PHYCTRL_0_RESETDELAYEN_SET(0);
-#if LPDDR2
-	reg |= SDR_CTRLGRP_PHYCTRL_PHYCTRL_0_LPDDRDIS_SET(0);
-#else
 	reg |= SDR_CTRLGRP_PHYCTRL_PHYCTRL_0_LPDDRDIS_SET(1);
-#endif
-	/* Fix for long latency VFIFO */
-	/* This field selects the intrinsic latency to RDATA_EN/FULL path.
-	00-bypass, 01- add 5 cycles, 10- add 10 cycles, 11- add 15 cycles. */
+	// Fix for long latency VFIFO
+	// This field selects the intrinsic latency to RDATA_EN/FULL path. 00-bypass, 01- add 5 cycles, 10- add 10 cycles, 11- add 15 cycles.
 	reg |= SDR_CTRLGRP_PHYCTRL_PHYCTRL_0_ADDLATSEL_SET(0);
-	reg |= SDR_CTRLGRP_PHYCTRL_PHYCTRL_0_SAMPLECOUNT_19_0_SET(
-		trk_sample_count);
+	reg |= SDR_CTRLGRP_PHYCTRL_PHYCTRL_0_SAMPLECOUNT_19_0_SET(trk_sample_count);
 	IOWR_32DIRECT(BASE_MMR, SDR_CTRLGRP_PHYCTRL_PHYCTRL_0_OFFSET, reg);
 
 	reg = 0;
-	reg |= SDR_CTRLGRP_PHYCTRL_PHYCTRL_1_SAMPLECOUNT_31_20_SET(
-		trk_sample_count >>
-		SDR_CTRLGRP_PHYCTRL_PHYCTRL_0_SAMPLECOUNT_19_0_WIDTH);
-	reg |= SDR_CTRLGRP_PHYCTRL_PHYCTRL_1_LONGIDLESAMPLECOUNT_19_0_SET(
-		trk_long_idle_sample_count);
+	reg |=
+	    SDR_CTRLGRP_PHYCTRL_PHYCTRL_1_SAMPLECOUNT_31_20_SET(trk_sample_count >>
+								SDR_CTRLGRP_PHYCTRL_PHYCTRL_0_SAMPLECOUNT_19_0_WIDTH);
+	reg |=
+	    SDR_CTRLGRP_PHYCTRL_PHYCTRL_1_LONGIDLESAMPLECOUNT_19_0_SET(trk_long_idle_sample_count);
 	IOWR_32DIRECT(BASE_MMR, SDR_CTRLGRP_PHYCTRL_PHYCTRL_1_OFFSET, reg);
 
 	reg = 0;
-	reg |= SDR_CTRLGRP_PHYCTRL_PHYCTRL_2_LONGIDLESAMPLECOUNT_31_20_SET(
-		trk_long_idle_sample_count >>
-		SDR_CTRLGRP_PHYCTRL_PHYCTRL_1_LONGIDLESAMPLECOUNT_19_0_WIDTH);
+	reg |=
+	    SDR_CTRLGRP_PHYCTRL_PHYCTRL_2_LONGIDLESAMPLECOUNT_31_20_SET(trk_long_idle_sample_count
+									>>
+									SDR_CTRLGRP_PHYCTRL_PHYCTRL_1_LONGIDLESAMPLECOUNT_19_0_WIDTH);
 	IOWR_32DIRECT(BASE_MMR, SDR_CTRLGRP_PHYCTRL_PHYCTRL_2_OFFSET, reg);
 }
 
-#if USE_DQS_TRACKING
-
 static void initialize_tracking(void)
 {
 	uint32_t concatenated_longidle = 0x0;
@@ -4206,8 +5462,7 @@ static void initialize_tracking(void)
 	uint32_t dtaps_per_ptap;
 	uint32_t tmp_delay;
 
-	/* compute usable version of value in case we skip full
-	computation later */
+	// compute usable version of value in case we skip full computation later
 	dtaps_per_ptap = 0;
 	tmp_delay = 0;
 	while (tmp_delay < IO_DELAY_PER_OPA_TAP) {
@@ -4216,25 +5471,18 @@ static void initialize_tracking(void)
 	}
 	dtaps_per_ptap--;
 
-	concatenated_longidle = concatenated_longidle ^ 10;
-		/*longidle outer loop */
+	concatenated_longidle = concatenated_longidle ^ 10;	//longidle outer loop
 	concatenated_longidle = concatenated_longidle << 16;
-	concatenated_longidle = concatenated_longidle ^ 100;
-		/*longidle sample count */
+	concatenated_longidle = concatenated_longidle ^ 100;	//longidle sample count
 
-	concatenated_delays = concatenated_delays ^ 243;
-		/* trfc, worst case of 933Mhz 4Gb */
+	concatenated_delays = concatenated_delays ^ 243;	// trfc, worst case of 933Mhz 4Gb
 	concatenated_delays = concatenated_delays << 8;
-	concatenated_delays = concatenated_delays ^ 14;
-		/* trcd, worst case */
+	concatenated_delays = concatenated_delays ^ 14;	// trcd, worst case
 	concatenated_delays = concatenated_delays << 8;
-	concatenated_delays = concatenated_delays ^ 10;
-		/* vfifo wait */
+	concatenated_delays = concatenated_delays ^ 10;	// vfifo wait
 	concatenated_delays = concatenated_delays << 8;
-	concatenated_delays = concatenated_delays ^ 4;
-		/* mux delay */
+	concatenated_delays = concatenated_delays ^ 4;	// mux delay
 
-#if DDR3 || LPDDR2
 	concatenated_rw_addr = concatenated_rw_addr ^ __RW_MGR_IDLE;
 	concatenated_rw_addr = concatenated_rw_addr << 8;
 	concatenated_rw_addr = concatenated_rw_addr ^ __RW_MGR_ACTIVATE_1;
@@ -4242,31 +5490,39 @@ static void initialize_tracking(void)
 	concatenated_rw_addr = concatenated_rw_addr ^ __RW_MGR_SGLE_READ;
 	concatenated_rw_addr = concatenated_rw_addr << 8;
 	concatenated_rw_addr = concatenated_rw_addr ^ __RW_MGR_PRECHARGE_ALL;
-#endif
 
-#if DDR3 || LPDDR2
 	concatenated_refresh = concatenated_refresh ^ __RW_MGR_REFRESH_ALL;
-#else
-	concatenated_refresh = concatenated_refresh ^ 0;
-#endif
 	concatenated_refresh = concatenated_refresh << 24;
-	concatenated_refresh = concatenated_refresh ^ 1000; /* trefi */
+	concatenated_refresh = concatenated_refresh ^ 1000;	// trefi
 
-	/* Initialize the register file with the correct data */
+	// Initialize the register file with the correct data
 	IOWR_32DIRECT(REG_FILE_DTAPS_PER_PTAP, 0, dtaps_per_ptap);
 	IOWR_32DIRECT(REG_FILE_TRK_SAMPLE_COUNT, 0, 7500);
 	IOWR_32DIRECT(REG_FILE_TRK_LONGIDLE, 0, concatenated_longidle);
 	IOWR_32DIRECT(REG_FILE_DELAYS, 0, concatenated_delays);
 	IOWR_32DIRECT(REG_FILE_TRK_RW_MGR_ADDR, 0, concatenated_rw_addr);
-	IOWR_32DIRECT(REG_FILE_TRK_READ_DQS_WIDTH, 0,
-		RW_MGR_MEM_IF_READ_DQS_WIDTH);
+	IOWR_32DIRECT(REG_FILE_TRK_READ_DQS_WIDTH, 0, RW_MGR_MEM_IF_READ_DQS_WIDTH);
 	IOWR_32DIRECT(REG_FILE_TRK_RFSH, 0, concatenated_refresh);
 }
 
-#endif	/* USE_DQS_TRACKING */
+static void user_init_cal_req(void)
+{
+	uint32_t scc_afi_reg;
+
+	scc_afi_reg = IORD_32DIRECT(SCC_MGR_AFI_CAL_INIT, 0);
+
+	if (scc_afi_reg == 1) {	// 1 is initialization request
+		initialize();
+		rw_mgr_mem_initialize();
+		rw_mgr_mem_handoff();
+		IOWR_32DIRECT(PHY_MGR_MUX_SEL, 0, 0);
+		IOWR_32DIRECT(PHY_MGR_CAL_STATUS, 0, PHY_MGR_CAL_SUCCESS);
+	} else if (scc_afi_reg == 2) {
+		run_mem_calibrate();
+	}
+}
 
-static int socfpga_sdram_calibration(const uint32_t *inst_rom_init, uint32_t inst_rom_init_size,
-		const uint32_t *ac_rom_init, uint32_t ac_rom_init_size)
+static int socfpga_mem_calibration(void)
 {
 	param_t my_param;
 	gbl_t my_gbl;
@@ -4276,93 +5532,102 @@ static int socfpga_sdram_calibration(const uint32_t *inst_rom_init, uint32_t ins
 	param = &my_param;
 	gbl = &my_gbl;
 
-	/* Initialize the debug mode flags */
+	// Initialize the debug mode flags
 	gbl->phy_debug_mode_flags = 0;
-	/* Set the calibration enabled by default */
+	// Set the calibration enabled by default
 	gbl->phy_debug_mode_flags |= PHY_DEBUG_ENABLE_CAL_RPT;
+	// Only enable margining by default if requested
+	// Only sweep all groups (regardless of fail state) by default if requested 
+	//Set enabled read test by default
 
-	/* Initialize the register file */
+	// Initialize the register file
 	initialize_reg_file();
 
-	/* Initialize any PHY CSR */
+	// Initialize any PHY CSR
 	initialize_hps_phy();
 
 	scc_mgr_initialize();
 
-#if USE_DQS_TRACKING
 	initialize_tracking();
-#endif
 
-	/* Enable all ranks, groups */
-	for (i = 0; i < RW_MGR_MEM_NUMBER_OF_RANKS; i++)
-		param->skip_ranks[i] = 0;
+	// Initialize the TCL report. This must occur before any printf
+	// but after the debug mode flags and register file
 
-	for (i = 0; i < NUM_SHADOW_REGS; ++i)
+	// USER Enable all ranks, groups
+	for (i = 0; i < RW_MGR_MEM_NUMBER_OF_RANKS; i++) {
+		param->skip_ranks[i] = 0;
+	}
+	for (i = 0; i < NUM_SHADOW_REGS; ++i) {
 		param->skip_shadow_regs[i] = 0;
-
+	}
 	param->skip_groups = 0;
 
-	pr_debug("Preparing to start memory calibration\n");
-
-	pr_debug("%s%s %s ranks=%u cs/dimm=%u dq/dqs=%u,%u vg/dqs=%u,%u "
-		"dqs=%u,%u dq=%u dm=%u "
-		"ptap_delay=%u dtap_delay=%u dtap_dqsen_delay=%u, dll=%u\n",
-		RDIMM ? "r" : (LRDIMM ? "l" : ""),
-		DDR2 ? "DDR2" : (DDR3 ? "DDR3" : (QDRII ? "QDRII" : (RLDRAMII ?
-		"RLDRAMII" : (RLDRAM3 ? "RLDRAM3" : "??PROTO??")))),
-		FULL_RATE ? "FR" : (HALF_RATE ? "HR" : (QUARTER_RATE ?
-		"QR" : "??RATE??")),
-		RW_MGR_MEM_NUMBER_OF_RANKS,
-		RW_MGR_MEM_NUMBER_OF_CS_PER_DIMM,
-		RW_MGR_MEM_DQ_PER_READ_DQS,
-		RW_MGR_MEM_DQ_PER_WRITE_DQS,
-		RW_MGR_MEM_VIRTUAL_GROUPS_PER_READ_DQS,
-		RW_MGR_MEM_VIRTUAL_GROUPS_PER_WRITE_DQS,
-		RW_MGR_MEM_IF_READ_DQS_WIDTH,
-		RW_MGR_MEM_IF_WRITE_DQS_WIDTH,
-		RW_MGR_MEM_DATA_WIDTH,
-		RW_MGR_MEM_DATA_MASK_WIDTH,
-		IO_DELAY_PER_OPA_TAP,
-		IO_DELAY_PER_DCHAIN_TAP,
-		IO_DELAY_PER_DQS_EN_DCHAIN_TAP,
-		IO_DLL_CHAIN_LENGTH);
-	pr_debug("max values: en_p=%u dqdqs_p=%u en_d=%u dqs_in_d=%u "
-		"io_in_d=%u io_out1_d=%u io_out2_d=%u"
-		"dqs_in_reserve=%u dqs_out_reserve=%u\n",
-		IO_DQS_EN_PHASE_MAX,
-		IO_DQDQS_OUT_PHASE_MAX,
-		IO_DQS_EN_DELAY_MAX,
-		IO_DQS_IN_DELAY_MAX,
-		IO_IO_IN_DELAY_MAX,
-		IO_IO_OUT1_DELAY_MAX,
-		IO_IO_OUT2_DELAY_MAX,
-		IO_DQS_IN_RESERVE,
-		IO_DQS_OUT_RESERVE);
-
-	hc_initialize_rom_data(inst_rom_init, inst_rom_init_size,
-			ac_rom_init, ac_rom_init_size);
-
-	/* update info for sims */
+	IPRINT("Preparing to start memory calibration");
+
+	TRACE_FUNC();
+	DPRINT(1,
+	       "%s%s %s ranks=%lu cs/dimm=%lu dq/dqs=%lu,%lu vg/dqs=%lu,%lu dqs=%lu,%lu dq=%lu dm=%lu "
+	       "ptap_delay=%lu dtap_delay=%lu dtap_dqsen_delay=%lu, dll=%lu",
+	       RDIMM ? "r" : (LRDIMM ? "l" : ""),
+	       DDR2 ? "DDR2" : (DDR3 ? "DDR3"
+				: (QDRII ? "QDRII"
+				   : (RLDRAMII ? "RLDRAMII"
+				      : (RLDRAM3 ? "RLDRAM3" : "??PROTO??")))),
+	       FULL_RATE ? "FR" : (HALF_RATE ? "HR" : (QUARTER_RATE ? "QR" : "??RATE??")),
+	       (long unsigned int)RW_MGR_MEM_NUMBER_OF_RANKS,
+	       (long unsigned int)RW_MGR_MEM_NUMBER_OF_CS_PER_DIMM,
+	       (long unsigned int)RW_MGR_MEM_DQ_PER_READ_DQS,
+	       (long unsigned int)RW_MGR_MEM_DQ_PER_WRITE_DQS,
+	       (long unsigned int)RW_MGR_MEM_VIRTUAL_GROUPS_PER_READ_DQS,
+	       (long unsigned int)RW_MGR_MEM_VIRTUAL_GROUPS_PER_WRITE_DQS,
+	       (long unsigned int)RW_MGR_MEM_IF_READ_DQS_WIDTH,
+	       (long unsigned int)RW_MGR_MEM_IF_WRITE_DQS_WIDTH,
+	       (long unsigned int)RW_MGR_MEM_DATA_WIDTH,
+	       (long unsigned int)RW_MGR_MEM_DATA_MASK_WIDTH,
+	       (long unsigned int)IO_DELAY_PER_OPA_TAP, (long unsigned int)IO_DELAY_PER_DCHAIN_TAP,
+	       (long unsigned int)IO_DELAY_PER_DQS_EN_DCHAIN_TAP,
+	       (long unsigned int)IO_DLL_CHAIN_LENGTH);
+	DPRINT(1,
+	       "max values: en_p=%lu dqdqs_p=%lu en_d=%lu dqs_in_d=%lu io_in_d=%lu io_out1_d=%lu io_out2_d=%lu"
+	       "dqs_in_reserve=%lu dqs_out_reserve=%lu", (long unsigned int)IO_DQS_EN_PHASE_MAX,
+	       (long unsigned int)IO_DQDQS_OUT_PHASE_MAX, (long unsigned int)IO_DQS_EN_DELAY_MAX,
+	       (long unsigned int)IO_DQS_IN_DELAY_MAX, (long unsigned int)IO_IO_IN_DELAY_MAX,
+	       (long unsigned int)IO_IO_OUT1_DELAY_MAX, (long unsigned int)IO_IO_OUT2_DELAY_MAX,
+	       (long unsigned int)IO_DQS_IN_RESERVE, (long unsigned int)IO_DQS_OUT_RESERVE);
+
+	hc_initialize_rom_data();
+
+	//USER update info for sims
 	reg_file_set_stage(CAL_STAGE_NIL);
 	reg_file_set_group(0);
 
-	/* Load global needed for those actions that require */
-	/* some dynamic calibration support */
+	// Load global needed for those actions that require
+	// some dynamic calibration support
 	dyn_calib_steps = STATIC_CALIB_STEPS;
 
-	/* Load global to allow dynamic selection of delay loop settings */
-	/* based on calibration mode */
-	if (!(dyn_calib_steps & CALIB_SKIP_DELAY_LOOPS)) {
+	// Load global to allow dynamic selection of delay loop settings
+	// based on calibration mode
+	if (!((DYNAMIC_CALIB_STEPS) & CALIB_SKIP_DELAY_LOOPS)) {
 		skip_delay_mask = 0xff;
 	} else {
 		skip_delay_mask = 0x0;
 	}
 
-	pass = run_mem_calibrate ();
+#ifdef TEST_SIZE
+	if (!check_test_mem(1)) {
+		IOWR_32DIRECT(PHY_MGR_CAL_DEBUG_INFO, 0, 0x9090);
+		IOWR_32DIRECT(PHY_MGR_CAL_STATUS, 0, PHY_MGR_CAL_FAIL);
+	}
+	write_test_mem();
+	if (!check_test_mem(0)) {
+		IOWR_32DIRECT(PHY_MGR_CAL_DEBUG_INFO, 0, 0x9191);
+		IOWR_32DIRECT(PHY_MGR_CAL_STATUS, 0, PHY_MGR_CAL_FAIL);
+	}
+#endif
+
+	pass = run_mem_calibrate();
 
-	pr_debug("Calibration complete\n");
-	/* Send the end of transmission character */
-	pr_debug("%c\n", 0x4);
+	// EMPTY
 
-	return pass == 0 ? -EINVAL : 0;
+	return pass;
 }
diff --git a/arch/arm/mach-socfpga/include/mach/sequencer.h b/arch/arm/mach-socfpga/include/mach/sequencer.h
index 7c9157a25779..8676b4efdd6b 100644
--- a/arch/arm/mach-socfpga/include/mach/sequencer.h
+++ b/arch/arm/mach-socfpga/include/mach/sequencer.h
@@ -2,70 +2,38 @@
 #define _SEQUENCER_H_
 
 /*
- * Copyright Altera Corporation (C) 2012-2014. All rights reserved
- *
- * SPDX-License-Identifier:  BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *  * Redistributions of source code must retain the above copyright
- *  notice, this list of conditions and the following disclaimer.
- *  * Redistributions in binary form must reproduce the above copyright
- *  notice, this list of conditions and the following disclaimer in the
- *  documentation and/or other materials provided with the distribution.
- *  * Neither the name of Altera Corporation nor the
- *  names of its contributors may be used to endorse or promote products
- *  derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL ALTERA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#define MRS_MIRROR_PING_PONG_ATSO 0
-#define DYNAMIC_CALIBRATION_MODE 0
-#define STATIC_QUICK_CALIBRATION 0
-#define DISABLE_GUARANTEED_READ 0
-#define STATIC_SKIP_CALIBRATION 0
-
-#if ENABLE_ASSERT
-#define ERR_IE_TEXT "Internal Error: Sub-system: %s, File: %s, Line: %d\n%s%s"
-
-#define ALTERA_INTERNAL_ERROR(string) \
-	{err_report_internal_error(string, "SEQ", __FILE__, __LINE__); \
-	exit(-1); }
-
-#define ALTERA_ASSERT(condition) \
-	if (!(condition)) {\
-		ALTERA_INTERNAL_ERROR(#condition); }
-#define ALTERA_INFO_ASSERT(condition, text) \
-	if (!(condition)) {\
-		ALTERA_INTERNAL_ERROR(text); }
-
-#else
+* Copyright Altera Corporation (C) 2012-2014. All rights reserved
+*
+* SPDX-License-Identifier:  BSD-3-Clause
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions are met:
+*  * Redistributions of source code must retain the above copyright
+*  notice, this list of conditions and the following disclaimer.
+*  * Redistributions in binary form must reproduce the above copyright
+*  notice, this list of conditions and the following disclaimer in the
+*  documentation and/or other materials provided with the distribution.
+*  * Neither the name of Altera Corporation nor the
+*  names of its contributors may be used to endorse or promote products
+*  derived from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+* DISCLAIMED. IN NO EVENT SHALL ALTERA CORPORATION BE LIABLE FOR ANY
+* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
 
 #define ALTERA_ASSERT(condition)
-#define ALTERA_INFO_ASSERT(condition, text)
-
-#endif
+#define ALTERA_INFO_ASSERT(condition,text)
 
-
-#if RLDRAMII
-#define RW_MGR_NUM_DM_PER_WRITE_GROUP (1)
-#define RW_MGR_NUM_TRUE_DM_PER_WRITE_GROUP (1)
-#else
-#define RW_MGR_NUM_DM_PER_WRITE_GROUP (RW_MGR_MEM_DATA_MASK_WIDTH \
-	/ RW_MGR_MEM_IF_WRITE_DQS_WIDTH)
-#define RW_MGR_NUM_TRUE_DM_PER_WRITE_GROUP (RW_MGR_TRUE_MEM_DATA_MASK_WIDTH \
-	/ RW_MGR_MEM_IF_WRITE_DQS_WIDTH)
-#endif
+#define RW_MGR_NUM_DM_PER_WRITE_GROUP (RW_MGR_MEM_DATA_MASK_WIDTH / RW_MGR_MEM_IF_WRITE_DQS_WIDTH)
+#define RW_MGR_NUM_TRUE_DM_PER_WRITE_GROUP (RW_MGR_TRUE_MEM_DATA_MASK_WIDTH / RW_MGR_MEM_IF_WRITE_DQS_WIDTH)
 
 #define RW_MGR_NUM_DQS_PER_WRITE_GROUP (RW_MGR_MEM_IF_READ_DQS_WIDTH / RW_MGR_MEM_IF_WRITE_DQS_WIDTH)
 #define NUM_RANKS_PER_SHADOW_REG (RW_MGR_MEM_NUMBER_OF_RANKS / NUM_SHADOW_REGS)
@@ -75,11 +43,9 @@
 
 #define RW_MGR_DI_BASE (BASE_RW_MGR + 0x0020)
 
-#if DDR3
 #define DDR3_MR1_ODT_MASK  0xFFFFFD99
 #define DDR3_MR2_ODT_MASK  0xFFFFF9FF
 #define DDR3_AC_MIRR_MASK  0x020A8
-#endif /* DDR3 */
 
 #define RW_MGR_LOAD_CNTR_0 BASE_RW_MGR + 0x0800
 #define RW_MGR_LOAD_CNTR_1 BASE_RW_MGR + 0x0804
@@ -142,10 +108,7 @@
 #define CAL_SUBSTAGE_REFRESH		1
 
 #define MAX_RANKS			(RW_MGR_MEM_NUMBER_OF_RANKS)
-#define MAX_DQS				(RW_MGR_MEM_IF_WRITE_DQS_WIDTH > \
-					RW_MGR_MEM_IF_READ_DQS_WIDTH ? \
-					RW_MGR_MEM_IF_WRITE_DQS_WIDTH : \
-					RW_MGR_MEM_IF_READ_DQS_WIDTH)
+#define MAX_DQS				(RW_MGR_MEM_IF_WRITE_DQS_WIDTH > RW_MGR_MEM_IF_READ_DQS_WIDTH ? RW_MGR_MEM_IF_WRITE_DQS_WIDTH : RW_MGR_MEM_IF_READ_DQS_WIDTH)
 #define MAX_DQ				(RW_MGR_MEM_DATA_WIDTH)
 #define MAX_DM				(RW_MGR_MEM_DATA_MASK_WIDTH)
 
@@ -158,19 +121,22 @@
  * - The remaining words are part of the transfer.
  */
 
-#define BASE_PTR_MGR 			SEQUENCER_PTR_MGR_INST_BASE
-#define BASE_PHY_MGR 			SDR_PHYGRP_PHYMGRGRP_ADDRESS
-#define BASE_RW_MGR 			SDR_PHYGRP_RWMGRGRP_ADDRESS
-#define BASE_DATA_MGR 			SDR_PHYGRP_DATAMGRGRP_ADDRESS
-#define BASE_SCC_MGR			SDR_PHYGRP_SCCGRP_ADDRESS
-#define BASE_REG_FILE			SDR_PHYGRP_REGFILEGRP_ADDRESS
-#define BASE_TIMER			SEQUENCER_TIMER_INST_BASE
-#define BASE_MMR                        SDR_CTRLGRP_ADDRESS
-#define BASE_TRK_MGR			(0x000D0000)
+/* Define the base address of each manager. */
+
+/* MarkW: how should these base addresses be done for A-V? */
+#define BASE_PTR_MGR 					SEQUENCER_PTR_MGR_INST_BASE
+#define BASE_PHY_MGR 					(0x00088000)
+#define BASE_RW_MGR 					(0x00090000)
+#define BASE_DATA_MGR 					(0x00098000)
+#define BASE_SCC_MGR					SEQUENCER_SCC_MGR_INST_BASE
+#define BASE_REG_FILE					SEQUENCER_REG_FILE_INST_BASE
+#define BASE_TIMER					SEQUENCER_TIMER_INST_BASE
+#define BASE_MMR                                        (0x000C0000)
+#define BASE_TRK_MGR					(0x000D0000)
 
 /* Register file addresses. */
-#define REG_FILE_SIGNATURE		(BASE_REG_FILE + 0x0000)
-#define REG_FILE_DEBUG_DATA_ADDR	(BASE_REG_FILE + 0x0004)
+#define REG_FILE_SIGNATURE				(BASE_REG_FILE + 0x0000)
+#define REG_FILE_DEBUG_DATA_ADDR		(BASE_REG_FILE + 0x0004)
 #define REG_FILE_CUR_STAGE              (BASE_REG_FILE + 0x0008)
 #define REG_FILE_FOM                    (BASE_REG_FILE + 0x000C)
 #define REG_FILE_FAILING_STAGE          (BASE_REG_FILE + 0x0010)
@@ -184,37 +150,28 @@
 #define REG_FILE_TRK_RW_MGR_ADDR        (BASE_REG_FILE + 0x002C)
 #define REG_FILE_TRK_READ_DQS_WIDTH     (BASE_REG_FILE + 0x0030)
 #define REG_FILE_TRK_RFSH               (BASE_REG_FILE + 0x0034)
+#define CTRL_CONFIG_REG			(BASE_MMR      + 0x0000)
 
 /* PHY manager configuration registers. */
 
-#define PHY_MGR_PHY_RLAT			(BASE_PHY_MGR + 0x40 + 0x00)
-#define PHY_MGR_RESET_MEM_STBL			(BASE_PHY_MGR + 0x40 + 0x04)
-#define PHY_MGR_MUX_SEL				(BASE_PHY_MGR + 0x40 + 0x08)
-#define PHY_MGR_CAL_STATUS			(BASE_PHY_MGR + 0x40 + 0x0c)
-#define PHY_MGR_CAL_DEBUG_INFO			(BASE_PHY_MGR + 0x40 + 0x10)
-#define PHY_MGR_VFIFO_RD_EN_OVRD		(BASE_PHY_MGR + 0x40 + 0x14)
-#if CALIBRATE_BIT_SLIPS
-#define PHY_MGR_FR_SHIFT			(BASE_PHY_MGR + 0x40 + 0x20)
-#if MULTIPLE_AFI_WLAT
-#define PHY_MGR_AFI_WLAT			(BASE_PHY_MGR + 0x40 + 0x20 + 4 * \
-						RW_MGR_MEM_IF_WRITE_DQS_WIDTH)
-#else
-#define PHY_MGR_AFI_WLAT			(BASE_PHY_MGR + 0x40 + 0x18)
-#endif
-#else
-#define PHY_MGR_AFI_WLAT			(BASE_PHY_MGR + 0x40 + 0x18)
-#endif
-#define PHY_MGR_AFI_RLAT			(BASE_PHY_MGR + 0x40 + 0x1c)
+#define PHY_MGR_PHY_RLAT				(BASE_PHY_MGR + 0x4000)
+#define PHY_MGR_RESET_MEM_STBL			(BASE_PHY_MGR + 0x4004)
+#define PHY_MGR_MUX_SEL					(BASE_PHY_MGR + 0x4008)
+#define PHY_MGR_CAL_STATUS				(BASE_PHY_MGR + 0x400c)
+#define PHY_MGR_CAL_DEBUG_INFO			(BASE_PHY_MGR + 0x4010)
+#define PHY_MGR_VFIFO_RD_EN_OVRD		(BASE_PHY_MGR + 0x4014)
+#define PHY_MGR_AFI_WLAT				(BASE_PHY_MGR + 0x4018)
+#define PHY_MGR_AFI_RLAT				(BASE_PHY_MGR + 0x401c)
 
-#define PHY_MGR_CAL_RESET			(0)
+#define PHY_MGR_CAL_RESET				(0)
 #define PHY_MGR_CAL_SUCCESS			(1)
-#define PHY_MGR_CAL_FAIL			(2)
+#define PHY_MGR_CAL_FAIL				(2)
 
 /* PHY manager command addresses. */
 
 #define PHY_MGR_CMD_INC_VFIFO_FR		(BASE_PHY_MGR + 0x0000)
 #define PHY_MGR_CMD_INC_VFIFO_HR		(BASE_PHY_MGR + 0x0004)
-#define PHY_MGR_CMD_INC_VFIFO_HARD_PHY		(BASE_PHY_MGR + 0x0004)
+#define PHY_MGR_CMD_INC_VFIFO_HARD_PHY	(BASE_PHY_MGR + 0x0004)
 #define PHY_MGR_CMD_FIFO_RESET			(BASE_PHY_MGR + 0x0008)
 #define PHY_MGR_CMD_INC_VFIFO_FR_HR		(BASE_PHY_MGR + 0x000C)
 #define PHY_MGR_CMD_INC_VFIFO_QR		(BASE_PHY_MGR + 0x0010)
@@ -227,25 +184,25 @@
 #define PHY_MGR_CALIB_SKIP_STEPS		(BASE_PHY_MGR + 0x000c)
 #define PHY_MGR_CALIB_VFIFO_OFFSET		(BASE_PHY_MGR + 0x0010)
 #define PHY_MGR_CALIB_LFIFO_OFFSET		(BASE_PHY_MGR + 0x0014)
-#define PHY_MGR_RDIMM				(BASE_PHY_MGR + 0x0018)
-#define PHY_MGR_MEM_T_WL			(BASE_PHY_MGR + 0x001c)
-#define PHY_MGR_MEM_T_RL			(BASE_PHY_MGR + 0x0020)
+#define PHY_MGR_RDIMM					(BASE_PHY_MGR + 0x0018)
+#define PHY_MGR_MEM_T_WL				(BASE_PHY_MGR + 0x001c)
+#define PHY_MGR_MEM_T_RL				(BASE_PHY_MGR + 0x0020)
 
 /* Data Manager */
-#define DATA_MGR_DRAM_CFG			(BASE_DATA_MGR + 0x0000)
-#define DATA_MGR_MEM_T_WL			(BASE_DATA_MGR + 0x0004)
-#define DATA_MGR_MEM_T_ADD			(BASE_DATA_MGR + 0x0008)
-#define DATA_MGR_MEM_T_RL			(BASE_DATA_MGR + 0x000C)
-#define DATA_MGR_MEM_T_RFC			(BASE_DATA_MGR + 0x0010)
-#define DATA_MGR_MEM_T_REFI			(BASE_DATA_MGR + 0x0014)
-#define DATA_MGR_MEM_T_WR			(BASE_DATA_MGR + 0x0018)
-#define DATA_MGR_MEM_T_MRD			(BASE_DATA_MGR + 0x001C)
-#define DATA_MGR_COL_WIDTH			(BASE_DATA_MGR + 0x0020)
-#define DATA_MGR_ROW_WIDTH			(BASE_DATA_MGR + 0x0024)
-#define DATA_MGR_BANK_WIDTH			(BASE_DATA_MGR + 0x0028)
-#define DATA_MGR_CS_WIDTH			(BASE_DATA_MGR + 0x002C)
-#define DATA_MGR_ITF_WIDTH			(BASE_DATA_MGR + 0x0030)
-#define DATA_MGR_DVC_WIDTH			(BASE_DATA_MGR + 0x0034)
+#define DATA_MGR_DRAM_CFG				(BASE_DATA_MGR + 0x0000)
+#define DATA_MGR_MEM_T_WL				(BASE_DATA_MGR + 0x0004)
+#define DATA_MGR_MEM_T_ADD				(BASE_DATA_MGR + 0x0008)
+#define DATA_MGR_MEM_T_RL				(BASE_DATA_MGR + 0x000C)
+#define DATA_MGR_MEM_T_RFC				(BASE_DATA_MGR + 0x0010)
+#define DATA_MGR_MEM_T_REFI				(BASE_DATA_MGR + 0x0014)
+#define DATA_MGR_MEM_T_WR				(BASE_DATA_MGR + 0x0018)
+#define DATA_MGR_MEM_T_MRD				(BASE_DATA_MGR + 0x001C)
+#define DATA_MGR_COL_WIDTH				(BASE_DATA_MGR + 0x0020)
+#define DATA_MGR_ROW_WIDTH				(BASE_DATA_MGR + 0x0024)
+#define DATA_MGR_BANK_WIDTH				(BASE_DATA_MGR + 0x0028)
+#define DATA_MGR_CS_WIDTH				(BASE_DATA_MGR + 0x002C)
+#define DATA_MGR_ITF_WIDTH				(BASE_DATA_MGR + 0x0030)
+#define DATA_MGR_DVC_WIDTH				(BASE_DATA_MGR + 0x0034)
 
 #define MEM_T_WL_ADD DATA_MGR_MEM_T_WL
 #define MEM_T_RL_ADD DATA_MGR_MEM_T_RL
@@ -253,114 +210,75 @@
 #define CALIB_SKIP_DELAY_LOOPS			(1 << 0)
 #define CALIB_SKIP_ALL_BITS_CHK			(1 << 1)
 #define CALIB_SKIP_DELAY_SWEEPS			(1 << 2)
-#define CALIB_SKIP_VFIFO			(1 << 3)
-#define CALIB_SKIP_LFIFO			(1 << 4)
-#define CALIB_SKIP_WLEVEL			(1 << 5)
-#define CALIB_SKIP_WRITES			(1 << 6)
+#define CALIB_SKIP_VFIFO				(1 << 3)
+#define CALIB_SKIP_LFIFO				(1 << 4)
+#define CALIB_SKIP_WLEVEL				(1 << 5)
+#define CALIB_SKIP_WRITES				(1 << 6)
 #define CALIB_SKIP_FULL_TEST			(1 << 7)
-#define CALIB_SKIP_ALL				(CALIB_SKIP_VFIFO | \
-				CALIB_SKIP_LFIFO | CALIB_SKIP_WLEVEL | \
-				CALIB_SKIP_WRITES | CALIB_SKIP_FULL_TEST)
+#define CALIB_SKIP_ALL					(CALIB_SKIP_VFIFO | CALIB_SKIP_LFIFO | CALIB_SKIP_WLEVEL | CALIB_SKIP_WRITES | CALIB_SKIP_FULL_TEST)
 #define CALIB_IN_RTL_SIM				(1 << 8)
 
 /* Scan chain manager command addresses */
 
-#define WRITE_SCC_DQS_IN_DELAY(group, delay)	\
-	IOWR_32DIRECT(SCC_MGR_DQS_IN_DELAY, (group) << 2, delay)
-#define WRITE_SCC_DQS_EN_DELAY(group, delay)	\
-	IOWR_32DIRECT(SCC_MGR_DQS_EN_DELAY, (group) << 2, (delay) \
-	+ IO_DQS_EN_DELAY_OFFSET)
-#define WRITE_SCC_DQS_EN_PHASE(group, phase)	\
-	IOWR_32DIRECT(SCC_MGR_DQS_EN_PHASE, (group) << 2, phase)
-#define WRITE_SCC_DQDQS_OUT_PHASE(group, phase)	\
-	IOWR_32DIRECT(SCC_MGR_DQDQS_OUT_PHASE, (group) << 2, phase)
-#define WRITE_SCC_OCT_OUT1_DELAY(group, delay)	\
-	IOWR_32DIRECT(SCC_MGR_OCT_OUT1_DELAY, (group) << 2, delay)
+#define WRITE_SCC_DQS_IN_DELAY(group, delay)	IOWR_32DIRECT(SCC_MGR_DQS_IN_DELAY, (group) << 2, delay)
+#define WRITE_SCC_DQS_EN_DELAY(group, delay)	IOWR_32DIRECT(SCC_MGR_DQS_EN_DELAY, (group) << 2, (delay) + IO_DQS_EN_DELAY_OFFSET)
+#define WRITE_SCC_DQS_EN_PHASE(group, phase)	IOWR_32DIRECT(SCC_MGR_DQS_EN_PHASE, (group) << 2, phase)
+#define WRITE_SCC_DQDQS_OUT_PHASE(group, phase)	IOWR_32DIRECT(SCC_MGR_DQDQS_OUT_PHASE, (group) << 2, phase)
+#define WRITE_SCC_OCT_OUT1_DELAY(group, delay)	IOWR_32DIRECT(SCC_MGR_OCT_OUT1_DELAY, (group) << 2, delay)
 #define WRITE_SCC_OCT_OUT2_DELAY(group, delay)
 #define WRITE_SCC_DQS_BYPASS(group, bypass)
 
-#define WRITE_SCC_DQ_OUT1_DELAY(pin, delay)		\
-	IOWR_32DIRECT(SCC_MGR_IO_OUT1_DELAY, (pin) << 2, delay)
+#define WRITE_SCC_DQ_OUT1_DELAY(pin, delay)		IOWR_32DIRECT(SCC_MGR_IO_OUT1_DELAY, (pin) << 2, delay)
 
 #define WRITE_SCC_DQ_OUT2_DELAY(pin, delay)
 
-#define WRITE_SCC_DQ_IN_DELAY(pin, delay)		\
-	IOWR_32DIRECT(SCC_MGR_IO_IN_DELAY, (pin) << 2, delay)
+#define WRITE_SCC_DQ_IN_DELAY(pin, delay)		IOWR_32DIRECT(SCC_MGR_IO_IN_DELAY, (pin) << 2, delay)
 
 #define WRITE_SCC_DQ_BYPASS(pin, bypass)
 
 #define WRITE_SCC_RFIFO_MODE(pin, mode)
 
-#define WRITE_SCC_HHP_EXTRAS(value) 	    \
-	IOWR_32DIRECT(SCC_MGR_HHP_GLOBALS, SCC_MGR_HHP_EXTRAS_OFFSET, value)
-#define WRITE_SCC_HHP_DQSE_MAP(value) 	    \
-	IOWR_32DIRECT(SCC_MGR_HHP_GLOBALS, SCC_MGR_HHP_DQSE_MAP_OFFSET, value)
+#define WRITE_SCC_HHP_EXTRAS(value) 	    IOWR_32DIRECT(SCC_MGR_HHP_GLOBALS, SCC_MGR_HHP_EXTRAS_OFFSET, value)
+#define WRITE_SCC_HHP_DQSE_MAP(value) 	    IOWR_32DIRECT(SCC_MGR_HHP_GLOBALS, SCC_MGR_HHP_DQSE_MAP_OFFSET, value)
 
-#define WRITE_SCC_DQS_IO_OUT1_DELAY(delay)	\
-	IOWR_32DIRECT(SCC_MGR_IO_OUT1_DELAY, \
-	(RW_MGR_MEM_DQ_PER_WRITE_DQS) << 2, delay)
+#define WRITE_SCC_DQS_IO_OUT1_DELAY(delay)	IOWR_32DIRECT(SCC_MGR_IO_OUT1_DELAY, (RW_MGR_MEM_DQ_PER_WRITE_DQS) << 2, delay)
 
 #define WRITE_SCC_DQS_IO_OUT2_DELAY(delay)
 
-#define WRITE_SCC_DQS_IO_IN_DELAY(delay)	\
-	IOWR_32DIRECT(SCC_MGR_IO_IN_DELAY, \
-	(RW_MGR_MEM_DQ_PER_WRITE_DQS) << 2, delay)
+#define WRITE_SCC_DQS_IO_IN_DELAY(delay)	IOWR_32DIRECT(SCC_MGR_IO_IN_DELAY, (RW_MGR_MEM_DQ_PER_WRITE_DQS) << 2, delay)
 
-#define WRITE_SCC_DM_IO_OUT1_DELAY(pin, delay)	\
-	IOWR_32DIRECT(SCC_MGR_IO_OUT1_DELAY, \
-	(RW_MGR_MEM_DQ_PER_WRITE_DQS + 1 + pin) << 2, delay)
+#define WRITE_SCC_DM_IO_OUT1_DELAY(pin, delay)	IOWR_32DIRECT(SCC_MGR_IO_OUT1_DELAY, (RW_MGR_MEM_DQ_PER_WRITE_DQS + 1 + pin) << 2, delay)
 
 #define WRITE_SCC_DM_IO_OUT2_DELAY(pin, delay)
 
-#define WRITE_SCC_DM_IO_IN_DELAY(pin, delay)	\
-	IOWR_32DIRECT(SCC_MGR_IO_IN_DELAY, \
-	(RW_MGR_MEM_DQ_PER_WRITE_DQS + 1 + pin) << 2, delay)
+#define WRITE_SCC_DM_IO_IN_DELAY(pin, delay)	IOWR_32DIRECT(SCC_MGR_IO_IN_DELAY, (RW_MGR_MEM_DQ_PER_WRITE_DQS + 1 + pin) << 2, delay)
 
 #define WRITE_SCC_DM_BYPASS(pin, bypass)
 
-#define READ_SCC_DQS_IN_DELAY(group)	\
-	IORD_32DIRECT(SCC_MGR_DQS_IN_DELAY, (group) << 2)
-#define READ_SCC_DQS_EN_DELAY(group)	\
-	(IORD_32DIRECT(SCC_MGR_DQS_EN_DELAY, (group) << 2) \
-	- IO_DQS_EN_DELAY_OFFSET)
-#define READ_SCC_DQS_EN_PHASE(group)	\
-	IORD_32DIRECT(SCC_MGR_DQS_EN_PHASE, (group) << 2)
-#define READ_SCC_DQDQS_OUT_PHASE(group)	\
-	IORD_32DIRECT(SCC_MGR_DQDQS_OUT_PHASE, (group) << 2)
-#define READ_SCC_OCT_OUT1_DELAY(group)	\
-	IORD_32DIRECT(SCC_MGR_OCT_OUT1_DELAY, \
-	(group * RW_MGR_MEM_IF_READ_DQS_WIDTH / \
-	RW_MGR_MEM_IF_WRITE_DQS_WIDTH) << 2)
+#define READ_SCC_DQS_IN_DELAY(group)	IORD_32DIRECT(SCC_MGR_DQS_IN_DELAY, (group) << 2)
+#define READ_SCC_DQS_EN_DELAY(group)	(IORD_32DIRECT(SCC_MGR_DQS_EN_DELAY, (group) << 2) - IO_DQS_EN_DELAY_OFFSET)
+#define READ_SCC_DQS_EN_PHASE(group)	IORD_32DIRECT(SCC_MGR_DQS_EN_PHASE, (group) << 2)
+#define READ_SCC_DQDQS_OUT_PHASE(group)	IORD_32DIRECT(SCC_MGR_DQDQS_OUT_PHASE, (group) << 2)
+#define READ_SCC_OCT_OUT1_DELAY(group)	IORD_32DIRECT(SCC_MGR_OCT_OUT1_DELAY, (group * RW_MGR_MEM_IF_READ_DQS_WIDTH / RW_MGR_MEM_IF_WRITE_DQS_WIDTH) << 2)
 #define READ_SCC_OCT_OUT2_DELAY(group)	0
 #define READ_SCC_DQS_BYPASS(group) 		0
 #define READ_SCC_DQS_BYPASS(group) 		0
 
-#define READ_SCC_DQ_OUT1_DELAY(pin)		\
-	IORD_32DIRECT(SCC_MGR_IO_OUT1_DELAY, (pin) << 2)
+#define READ_SCC_DQ_OUT1_DELAY(pin)		IORD_32DIRECT(SCC_MGR_IO_OUT1_DELAY, (pin) << 2)
 #define READ_SCC_DQ_OUT2_DELAY(pin)		0
-#define READ_SCC_DQ_IN_DELAY(pin)		\
-	IORD_32DIRECT(SCC_MGR_IO_IN_DELAY, (pin) << 2)
+#define READ_SCC_DQ_IN_DELAY(pin)		IORD_32DIRECT(SCC_MGR_IO_IN_DELAY, (pin) << 2)
 #define READ_SCC_DQ_BYPASS(pin) 	    0
 #define READ_SCC_RFIFO_MODE(pin) 	    0
 
-#define READ_SCC_DQS_IO_OUT1_DELAY()	\
-	IORD_32DIRECT(SCC_MGR_IO_OUT1_DELAY, \
-	(RW_MGR_MEM_DQ_PER_WRITE_DQS) << 2)
+#define READ_SCC_DQS_IO_OUT1_DELAY()	IORD_32DIRECT(SCC_MGR_IO_OUT1_DELAY, (RW_MGR_MEM_DQ_PER_WRITE_DQS) << 2)
 #define READ_SCC_DQS_IO_OUT2_DELAY()	0
-#define READ_SCC_DQS_IO_IN_DELAY()	\
-	IORD_32DIRECT(SCC_MGR_IO_IN_DELAY, \
-	(RW_MGR_MEM_DQ_PER_WRITE_DQS) << 2)
+#define READ_SCC_DQS_IO_IN_DELAY()	IORD_32DIRECT(SCC_MGR_IO_IN_DELAY, (RW_MGR_MEM_DQ_PER_WRITE_DQS) << 2)
 
-#define READ_SCC_DM_IO_OUT1_DELAY(pin)	\
-	IORD_32DIRECT(SCC_MGR_IO_OUT1_DELAY, \
-	(RW_MGR_MEM_DQ_PER_WRITE_DQS + 1 + pin) << 2)
+#define READ_SCC_DM_IO_OUT1_DELAY(pin)	IORD_32DIRECT(SCC_MGR_IO_OUT1_DELAY, (RW_MGR_MEM_DQ_PER_WRITE_DQS + 1 + pin) << 2)
 #define READ_SCC_DM_IO_OUT2_DELAY(pin)	0
-#define READ_SCC_DM_IO_IN_DELAY(pin)	\
-	IORD_32DIRECT(SCC_MGR_IO_IN_DELAY, \
-	(RW_MGR_MEM_DQ_PER_WRITE_DQS + 1 + pin) << 2)
+#define READ_SCC_DM_IO_IN_DELAY(pin)	IORD_32DIRECT(SCC_MGR_IO_IN_DELAY, (RW_MGR_MEM_DQ_PER_WRITE_DQS + 1 + pin) << 2)
 #define READ_SCC_DM_BYPASS(pin) 	    0
 
-
 #define SCC_MGR_GROUP_COUNTER			(BASE_SCC_MGR + 0x0000)
 #define SCC_MGR_DQS_IN_DELAY			(BASE_SCC_MGR + 0x0100)
 #define SCC_MGR_DQS_EN_PHASE			(BASE_SCC_MGR + 0x0200)
@@ -368,28 +286,27 @@
 #define SCC_MGR_DQDQS_OUT_PHASE			(BASE_SCC_MGR + 0x0400)
 #define SCC_MGR_OCT_OUT1_DELAY			(BASE_SCC_MGR + 0x0500)
 #define SCC_MGR_IO_OUT1_DELAY			(BASE_SCC_MGR + 0x0700)
-#define SCC_MGR_IO_IN_DELAY			(BASE_SCC_MGR + 0x0900)
-
+#define SCC_MGR_IO_IN_DELAY				(BASE_SCC_MGR + 0x0900)
 
 /* HHP-HPS-specific versions of some commands */
 #define SCC_MGR_DQS_EN_DELAY_GATE		(BASE_SCC_MGR + 0x0600)
 #define SCC_MGR_IO_OE_DELAY			(BASE_SCC_MGR + 0x0800)
-#define SCC_MGR_HHP_GLOBALS			(BASE_SCC_MGR + 0x0A00)
-#define SCC_MGR_HHP_RFILE			(BASE_SCC_MGR + 0x0B00)
+#define SCC_MGR_HHP_GLOBALS				(BASE_SCC_MGR + 0x0A00)
+#define SCC_MGR_HHP_RFILE				(BASE_SCC_MGR + 0x0B00)
 
 /* HHP-HPS-specific values */
 #define SCC_MGR_HHP_EXTRAS_OFFSET			0
 #define SCC_MGR_HHP_DQSE_MAP_OFFSET			1
 
-#define SCC_MGR_DQS_ENA				(BASE_SCC_MGR + 0x0E00)
-#define SCC_MGR_DQS_IO_ENA			(BASE_SCC_MGR + 0x0E04)
-#define SCC_MGR_DQ_ENA				(BASE_SCC_MGR + 0x0E08)
-#define SCC_MGR_DM_ENA				(BASE_SCC_MGR + 0x0E0C)
-#define SCC_MGR_UPD				(BASE_SCC_MGR + 0x0E20)
-#define SCC_MGR_ACTIVE_RANK			(BASE_SCC_MGR + 0x0E40)
+#define SCC_MGR_DQS_ENA					(BASE_SCC_MGR + 0x0E00)
+#define SCC_MGR_DQS_IO_ENA				(BASE_SCC_MGR + 0x0E04)
+#define SCC_MGR_DQ_ENA					(BASE_SCC_MGR + 0x0E08)
+#define SCC_MGR_DM_ENA					(BASE_SCC_MGR + 0x0E0C)
+#define SCC_MGR_UPD						(BASE_SCC_MGR + 0x0E20)
+#define SCC_MGR_ACTIVE_RANK				(BASE_SCC_MGR + 0x0E40)
 #define SCC_MGR_AFI_CAL_INIT			(BASE_SCC_MGR + 0x0D00)
 
-/* PHY Debug mode flag constants */
+// PHY Debug mode flag constants
 #define PHY_DEBUG_IN_DEBUG_MODE 0x00000001
 #define PHY_DEBUG_ENABLE_CAL_RPT 0x00000002
 #define PHY_DEBUG_ENABLE_MARGIN_RPT 0x00000004
@@ -397,46 +314,44 @@
 #define PHY_DEBUG_DISABLE_GUARANTEED_READ 0x00000010
 #define PHY_DEBUG_ENABLE_NON_DESTRUCTIVE_CALIBRATION 0x00000020
 
-/* Init and Reset delay constants - Only use if defined by sequencer_defines.h,
- * otherwise, revert to defaults
- * Default for Tinit = (0+1) * ((202+1) * (2 * 131 + 1) + 1) = 53532 = 200.75us @ 266MHz
- */
+// Init and Reset delay constants - Only use if defined by sequencer_defines.h,
+// otherwise, revert to defaults
+// Default for Tinit = (0+1) * ((202+1) * (2 * 131 + 1) + 1) = 53532 = 200.75us @ 266MHz
 #ifdef TINIT_CNTR0_VAL
-   #define SEQ_TINIT_CNTR0_VAL TINIT_CNTR0_VAL
+#define SEQ_TINIT_CNTR0_VAL TINIT_CNTR0_VAL
 #else
-   #define SEQ_TINIT_CNTR0_VAL 0
+#define SEQ_TINIT_CNTR0_VAL 0
 #endif
 
 #ifdef TINIT_CNTR1_VAL
-   #define SEQ_TINIT_CNTR1_VAL TINIT_CNTR1_VAL
+#define SEQ_TINIT_CNTR1_VAL TINIT_CNTR1_VAL
 #else
-   #define SEQ_TINIT_CNTR1_VAL 202
+#define SEQ_TINIT_CNTR1_VAL 202
 #endif
 
 #ifdef TINIT_CNTR2_VAL
-   #define SEQ_TINIT_CNTR2_VAL TINIT_CNTR2_VAL
+#define SEQ_TINIT_CNTR2_VAL TINIT_CNTR2_VAL
 #else
-   #define SEQ_TINIT_CNTR2_VAL 131
+#define SEQ_TINIT_CNTR2_VAL 131
 #endif
 
-
-/* Default for Treset = (2+1) * ((252+1) * (2 * 131 + 1) + 1) = 133563 = 500.86us @ 266MHz */
+// Default for Treset = (2+1) * ((252+1) * (2 * 131 + 1) + 1) = 133563 = 500.86us @ 266MHz
 #ifdef TRESET_CNTR0_VAL
-   #define SEQ_TRESET_CNTR0_VAL TRESET_CNTR0_VAL
+#define SEQ_TRESET_CNTR0_VAL TRESET_CNTR0_VAL
 #else
-   #define SEQ_TRESET_CNTR0_VAL 2
+#define SEQ_TRESET_CNTR0_VAL 2
 #endif
 
 #ifdef TRESET_CNTR1_VAL
-   #define SEQ_TRESET_CNTR1_VAL TRESET_CNTR1_VAL
+#define SEQ_TRESET_CNTR1_VAL TRESET_CNTR1_VAL
 #else
-   #define SEQ_TRESET_CNTR1_VAL 252
+#define SEQ_TRESET_CNTR1_VAL 252
 #endif
 
 #ifdef TRESET_CNTR2_VAL
-   #define SEQ_TRESET_CNTR2_VAL TRESET_CNTR2_VAL
+#define SEQ_TRESET_CNTR2_VAL TRESET_CNTR2_VAL
 #else
-   #define SEQ_TRESET_CNTR2_VAL 131
+#define SEQ_TRESET_CNTR2_VAL 131
 #endif
 
 /* Bitfield type changes depending on protocol */
@@ -445,18 +360,32 @@ typedef uint32_t t_btfld;
 #define RW_MGR_INST_ROM_WRITE BASE_RW_MGR + 0x1800
 #define RW_MGR_AC_ROM_WRITE BASE_RW_MGR + 0x1C00
 
+static const uint32_t inst_rom_init_size;
+static const uint32_t inst_rom_init[];
+static const uint32_t ac_rom_init_size;
+static const uint32_t ac_rom_init[];
+
 /* parameter variable holder */
 
 typedef struct param_type {
+	t_btfld dm_correct_mask;
 	t_btfld read_correct_mask;
 	t_btfld read_correct_mask_vg;
 	t_btfld write_correct_mask;
 	t_btfld write_correct_mask_vg;
+
+	/* set a particular entry to 1 if we need to skip a particular rank */
+
 	uint32_t skip_ranks[MAX_RANKS];
+
+	/* set a particular entry to 1 if we need to skip a particular group */
+
 	uint32_t skip_groups;
+
+	/* set a particular entry to 1 if the shadow register (which represents a set of ranks) needs to be skipped */
+
 	uint32_t skip_shadow_regs[NUM_SHADOW_REGS];
 
-	/* set a particular entry to 1 if we need to skip a particular group */
 } param_t;
 
 /* global variable holder */
@@ -484,11 +413,41 @@ typedef struct gbl_type {
 	uint32_t fom_in;
 	uint32_t fom_out;
 
-	/*USER Number of RW Mgr NOP cycles between
-	write command and write data */
-#if MULTIPLE_AFI_WLAT
-	uint32_t rw_wl_nop_cycles_per_group[RW_MGR_MEM_IF_WRITE_DQS_WIDTH];
-#endif
+	//USER Number of RW Mgr NOP cycles between write command and write data
 	uint32_t rw_wl_nop_cycles;
 } gbl_t;
+
+// External global variables
+static gbl_t *gbl;
+static param_t *param;
+
+// External functions
+static uint32_t rw_mgr_mem_calibrate_full_test(uint32_t min_correct, t_btfld * bit_chk,
+					       uint32_t test_dm);
+static uint32_t run_mem_calibrate(void);
+static void rw_mgr_mem_calibrate_eye_diag_aid(void);
+static void rw_mgr_load_mrs_calib(void);
+static void rw_mgr_load_mrs_exec(void);
+static void rw_mgr_mem_initialize(void);
+static void rw_mgr_mem_dll_lock_wait(void);
+static inline void scc_mgr_set_dq_in_delay(uint32_t write_group, uint32_t dq_in_group,
+					   uint32_t delay);
+static inline void scc_mgr_set_dq_out1_delay(uint32_t write_group, uint32_t dq_in_group,
+					     uint32_t delay);
+static inline void scc_mgr_set_dq_out2_delay(uint32_t write_group, uint32_t dq_in_group,
+					     uint32_t delay);
+static inline void scc_mgr_load_dq(uint32_t dq_in_group);
+static inline void scc_mgr_set_dqs_bus_in_delay(uint32_t read_group, uint32_t delay);
+static inline void scc_mgr_load_dqs(uint32_t dqs);
+static void scc_mgr_set_group_dqs_io_and_oct_out1_gradual(uint32_t write_group, uint32_t delay);
+static void scc_mgr_set_group_dqs_io_and_oct_out2_gradual(uint32_t write_group, uint32_t delay);
+static void scc_mgr_set_dqs_en_delay_all_ranks(uint32_t read_group, uint32_t delay);
+static void scc_mgr_set_dqs_en_phase_all_ranks(uint32_t read_group, uint32_t phase);
+static void scc_mgr_set_dqdqs_output_phase_all_ranks(uint32_t write_group, uint32_t phase);
+static inline void scc_mgr_set_dm_out1_delay(uint32_t write_group, uint32_t dm, uint32_t delay);
+static inline void scc_mgr_set_dm_out2_delay(uint32_t write_group, uint32_t dm, uint32_t delay);
+static inline void scc_mgr_load_dm(uint32_t dm);
+static void rw_mgr_incr_vfifo_auto(uint32_t grp);
+static void rw_mgr_decr_vfifo_auto(uint32_t grp);
+static int sdram_calibration(void);
 #endif
diff --git a/arch/arm/mach-socfpga/include/mach/sequencer_defines.h b/arch/arm/mach-socfpga/include/mach/sequencer_defines.h
new file mode 100644
index 000000000000..50598441069d
--- /dev/null
+++ b/arch/arm/mach-socfpga/include/mach/sequencer_defines.h
@@ -0,0 +1,6 @@
+#define TINIT_CNTR1_VAL 32
+#define TINIT_CNTR2_VAL 32
+#define TINIT_CNTR0_VAL 99
+#define TRESET_CNTR1_VAL 99
+#define TRESET_CNTR2_VAL 10
+#define TRESET_CNTR0_VAL 99
diff --git a/arch/arm/mach-socfpga/include/mach/system.h b/arch/arm/mach-socfpga/include/mach/system.h
new file mode 100755
index 000000000000..89527b2c2b40
--- /dev/null
+++ b/arch/arm/mach-socfpga/include/mach/system.h
@@ -0,0 +1,37 @@
+/*
+* Copyright Altera Corporation (C) 2012-2014. All rights reserved
+*
+* SPDX-License-Identifier:  BSD-3-Clause
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions are met:
+*  * Redistributions of source code must retain the above copyright
+*  notice, this list of conditions and the following disclaimer.
+*  * Redistributions in binary form must reproduce the above copyright
+*  notice, this list of conditions and the following disclaimer in the
+*  documentation and/or other materials provided with the distribution.
+*  * Neither the name of Altera Corporation nor the
+*  names of its contributors may be used to endorse or promote products
+*  derived from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+* DISCLAIMED. IN NO EVENT SHALL ALTERA CORPORATION BE LIABLE FOR ANY
+* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#define SEQUENCER_DATA_MGR_INST_BASE 0x60000
+#define SEQUENCER_PHY_MGR_INST_BASE 0x48000
+#define SEQUENCER_PTR_MGR_INST_BASE 0x40000
+#define SEQUENCER_RAM_BASE 0x20000
+#define SEQUENCER_ROM_BASE 0x10000
+#define SEQUENCER_RW_MGR_INST_BASE 0x50000
+#define SEQUENCER_SCC_MGR_INST_BASE 0x58000
+#define SEQUENCER_REG_FILE_INST_BASE 0x70000
+#define SEQUENCER_TIMER_INST_BASE 0x78000
diff --git a/arch/arm/mach-socfpga/include/mach/tclrpt.h b/arch/arm/mach-socfpga/include/mach/tclrpt.h
new file mode 100755
index 000000000000..4345b23ba65c
--- /dev/null
+++ b/arch/arm/mach-socfpga/include/mach/tclrpt.h
@@ -0,0 +1,38 @@
+#ifndef TCLRPT_H_
+#define TCLRPT_H_
+/*
+* Copyright Altera Corporation (C) 2012-2014. All rights reserved
+*
+* SPDX-License-Identifier:  BSD-3-Clause
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions are met:
+*  * Redistributions of source code must retain the above copyright
+*  notice, this list of conditions and the following disclaimer.
+*  * Redistributions in binary form must reproduce the above copyright
+*  notice, this list of conditions and the following disclaimer in the
+*  documentation and/or other materials provided with the distribution.
+*  * Neither the name of Altera Corporation nor the
+*  names of its contributors may be used to endorse or promote products
+*  derived from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+* DISCLAIMED. IN NO EVENT SHALL ALTERA CORPORATION BE LIABLE FOR ANY
+* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "sequencer.h"
+
+#define TCLRPT_SET(item, value)
+
+// None of the rest of the file should be referenced if ENABLE_TCL_DEBUG is not
+// set (although it's not a problem if it is, but this helps catch errors)
+
+#endif
-- 
2.1.4




More information about the barebox mailing list