atj213x: fix loop store/branch ordering and IRQ sp adjustment in sample code snippets

Change-Id: I78781e1a56cb6705d011ee2296f1789b497a566a
diff --git a/utils/atj2137/adfuload/test_binary/backlight_c/crt0.S b/utils/atj2137/adfuload/test_binary/backlight_c/crt0.S
index e03b8b5..485a4fc 100644
--- a/utils/atj2137/adfuload/test_binary/backlight_c/crt0.S
+++ b/utils/atj2137/adfuload/test_binary/backlight_c/crt0.S
@@ -26,9 +26,9 @@
 reloc_loop:
     lw t2, 0(v0)               # src
     addiu v0, 4                # inc src addr
-    sw t2, 0(t0)               # dst
-    bne t0, t1, reloc_loop
     addiu t0, 4                # inc dst addr
+    bne t0, t1, reloc_loop     # loop while t0 != t1
+    sw t2, -4(t0)              # dst = t0-4 (t0 already incremented); branch delay slot, assuming .set noreorder
 
 entry_point_jump:
     la t0, entry_point
@@ -49,17 +49,18 @@
 cache_init_loop:
     cache   8, 0(t0)           # index store icache tag
     cache   9, 0(t0)           # index store dcache tag
-    bne     t0, t1, cache_init_loop
     addiu   t0, t0, 0x10
+    bne     t0, t1, cache_init_loop   # loop while t0 != t1 (t0 advanced by 0x10 above)
+    nop                        # fills the slot after the branch (delay slot, assuming .set noreorder)
 
     # clear bss
     la t0, bssbegin
     la t1, bssend
 
 clear_bss_loop:
-    sw zero, 0(t0)
-    bne t0, t1, clear_bss_loop
     addiu t0, 4
+    bne t0, t1, clear_bss_loop # loop while t0 != t1 (bssend)
+    sw zero, -4(t0)            # zero the word at t0-4 (t0 already incremented); branch delay slot, assuming .set noreorder
 
     # setup stack
     la sp, stackend
@@ -67,9 +68,9 @@
     li t1, 0xdeadbeef
 
 stack_munge_loop:
-    sw t1, 0(t0)
-    bne t0, sp, stack_munge_loop
     addiu t0, 4
+    bne t0, sp, stack_munge_loop # loop while t0 != sp
+    sw t1, -4(t0)              # write fill pattern at t0-4 (t0 already incremented); branch delay slot, assuming .set noreorder
 
     # jump to C code
     j main
diff --git a/utils/atj2137/adfuload/test_binary/timer_irq/crt0.S b/utils/atj2137/adfuload/test_binary/timer_irq/crt0.S
index eb78ba4..29fef64 100644
--- a/utils/atj2137/adfuload/test_binary/timer_irq/crt0.S
+++ b/utils/atj2137/adfuload/test_binary/timer_irq/crt0.S
@@ -29,9 +29,9 @@
 reloc_loop:
     lw t2, 0(v0)               # src
     addiu v0, 4                # inc src addr
-    sw t2, 0(t0)               # dst
-    bne t0, t1, reloc_loop
     addiu t0, 4                # inc dst addr
+    bne t0, t1, reloc_loop     # loop while t0 != t1
+    sw t2, -4(t0)              # dst = t0-4 (t0 already incremented); branch delay slot, assuming .set noreorder
 
 cache_init:
     # setup caches
@@ -47,8 +47,9 @@
 cache_init_loop:
     cache   8, 0(t0)           # index store icache tag
     cache   9, 0(t0)           # index store dcache tag
-    bne     t0, t1, cache_init_loop
     addiu   t0, t0, 0x10
+    bne     t0, t1, cache_init_loop   # loop while t0 != t1 (t0 advanced by 0x10 above)
+    nop                        # fills the slot after the branch (delay slot, assuming .set noreorder)
 
 intc_setup:
     li      t0, 0xb0020000     # INTC base
@@ -73,9 +74,9 @@
     la t1, bssend
 
 clear_bss_loop:
-    sw zero, 0(t0)
-    bne t0, t1, clear_bss_loop
     addiu t0, 4
+    bne t0, t1, clear_bss_loop # loop while t0 != t1 (bssend)
+    sw zero, -4(t0)            # zero the word at t0-4 (t0 already incremented); branch delay slot, assuming .set noreorder
 
     # setup stack
     la k0, irqstackend
@@ -84,9 +85,9 @@
     li t1, 0xdeadbeef
 
 stack_munge_loop:
-    sw t1, 0(t0)
-    bne t0, sp, stack_munge_loop
     addiu t0, 4
+    bne t0, sp, stack_munge_loop # loop while t0 != sp
+    sw t1, -4(t0)              # write fill pattern at t0-4 (t0 already incremented); branch delay slot, assuming .set noreorder
 
     # jump to C code with enabled interrupts
     la t0, main
diff --git a/utils/atj2137/adfuload/test_binary/timer_irq/irq_handler.S b/utils/atj2137/adfuload/test_binary/timer_irq/irq_handler.S
index 1d58c60..a9ae340 100644
--- a/utils/atj2137/adfuload/test_binary/timer_irq/irq_handler.S
+++ b/utils/atj2137/adfuload/test_binary/timer_irq/irq_handler.S
@@ -83,7 +83,7 @@
     lw fp, 68(sp)
     lw ra, 72(sp)
 
-    addiu sp, sp, 88
+    addiu sp, sp, 84           # NOTE(review): sp is overwritten by "move sp, k1" on the next line - confirm this adjustment is still needed
     move sp, k1
     eret
 
diff --git a/utils/atj2137/adfuload/test_binary/timer_no_irq/crt0.S b/utils/atj2137/adfuload/test_binary/timer_no_irq/crt0.S
index e03b8b5..485a4fc 100644
--- a/utils/atj2137/adfuload/test_binary/timer_no_irq/crt0.S
+++ b/utils/atj2137/adfuload/test_binary/timer_no_irq/crt0.S
@@ -26,9 +26,9 @@
 reloc_loop:
     lw t2, 0(v0)               # src
     addiu v0, 4                # inc src addr
-    sw t2, 0(t0)               # dst
-    bne t0, t1, reloc_loop
     addiu t0, 4                # inc dst addr
+    bne t0, t1, reloc_loop     # loop while t0 != t1
+    sw t2, -4(t0)              # dst = t0-4 (t0 already incremented); branch delay slot, assuming .set noreorder
 
 entry_point_jump:
     la t0, entry_point
@@ -49,17 +49,18 @@
 cache_init_loop:
     cache   8, 0(t0)           # index store icache tag
     cache   9, 0(t0)           # index store dcache tag
-    bne     t0, t1, cache_init_loop
     addiu   t0, t0, 0x10
+    bne     t0, t1, cache_init_loop   # loop while t0 != t1 (t0 advanced by 0x10 above)
+    nop                        # fills the slot after the branch (delay slot, assuming .set noreorder)
 
     # clear bss
     la t0, bssbegin
     la t1, bssend
 
 clear_bss_loop:
-    sw zero, 0(t0)
-    bne t0, t1, clear_bss_loop
     addiu t0, 4
+    bne t0, t1, clear_bss_loop # loop while t0 != t1 (bssend)
+    sw zero, -4(t0)            # zero the word at t0-4 (t0 already incremented); branch delay slot, assuming .set noreorder
 
     # setup stack
     la sp, stackend
@@ -67,9 +68,9 @@
     li t1, 0xdeadbeef
 
 stack_munge_loop:
-    sw t1, 0(t0)
-    bne t0, sp, stack_munge_loop
     addiu t0, 4
+    bne t0, sp, stack_munge_loop # loop while t0 != sp
+    sw t1, -4(t0)              # write fill pattern at t0-4 (t0 already incremented); branch delay slot, assuming .set noreorder
 
     # jump to C code
     j main