SPC Codec: Run SPC emulation on COP and audio sample processing on CPU on dual-core PortalPlayer targets.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@15673 a1c6a512-1295-4272-9138-f99709370657
diff --git a/apps/codecs.c b/apps/codecs.c
index f2539dc..9969b6f 100644
--- a/apps/codecs.c
+++ b/apps/codecs.c
@@ -162,7 +162,19 @@
     invalidate_icache,
 #endif
 
-    NULL, /* struct sp_data *dsp */
+    NULL, /* struct dsp_config *dsp */
+
+#if NUM_CORES > 1
+    create_thread,
+    thread_thaw,
+    thread_wait,
+    semaphore_init,
+    semaphore_wait,
+    semaphore_release,
+    event_init,
+    event_wait,
+    event_set_state,
+#endif
 };
 
 void codec_get_full_path(char *path, const char *codec_root_fn)
diff --git a/apps/codecs.h b/apps/codecs.h
index d2ba00c..29ed2d3 100644
--- a/apps/codecs.h
+++ b/apps/codecs.h
@@ -80,7 +80,7 @@
 #define CODEC_ENC_MAGIC 0x52454E43 /* RENC */
 
 /* increase this every time the api struct changes */
-#define CODEC_API_VERSION 20
+#define CODEC_API_VERSION 21
 
 /* update this to latest version if a change to the api struct breaks
    backwards compatibility (and please take the opportunity to sort in any
@@ -236,6 +236,23 @@
 #endif
 
     struct dsp_config *dsp;
+
+#if NUM_CORES > 1
+    struct thread_entry *
+        (*create_thread)(void (*function)(void), void* stack,
+                         int stack_size, unsigned flags, const char *name
+                         IF_PRIO(, int priority)
+                         IF_COP(, unsigned int core));
+
+    void (*thread_thaw)(struct thread_entry *thread);
+    void (*thread_wait)(struct thread_entry *thread);
+    void (*semaphore_init)(struct semaphore *s, int max, int start);
+    void (*semaphore_wait)(struct semaphore *s);
+    void (*semaphore_release)(struct semaphore *s);
+    void (*event_init)(struct event *e, unsigned int flags);
+    void (*event_wait)(struct event *e, unsigned int for_state);
+    void (*event_set_state)(struct event *e, unsigned int state);
+#endif /* NUM_CORES */
 };
 
 /* codec header */
diff --git a/apps/codecs/spc.c b/apps/codecs/spc.c
index f2890cd..ae30263 100644
--- a/apps/codecs/spc.c
+++ b/apps/codecs/spc.c
@@ -185,12 +185,253 @@
 }
 
 /**************** Codec ****************/
-
-static int32_t samples[WAV_CHUNK_SIZE*2] IBSS_ATTR;
-
-static struct Spc_Emu spc_emu IDATA_ATTR;
-
 enum {SAMPLE_RATE = 32000};
+static struct Spc_Emu spc_emu IDATA_ATTR CACHEALIGN_ATTR;
+
+#if SPC_DUAL_CORE
+/** Implementations for pipelined dual-core operation **/
+static int spc_emu_thread_stack[DEFAULT_STACK_SIZE/sizeof(int)]
+    CACHEALIGN_ATTR; /* stack handed to create_thread() for the emu thread */
+/* plain char to match the const char *name parameter of create_thread() */
+static const char * const spc_emu_thread_name = "spc emu";
+static struct thread_entry *emu_thread_p; /* set by spc_emu_start() */
+
+enum /* values of sample_queue_chunk.id */
+{
+    SPC_EMU_AUDIO = 0, /* chunk is an audio buffer, not a command */
+    SPC_EMU_LOAD,      /* command: SPC_Init + SPC_load_spc on the emu thread */
+    SPC_EMU_QUIT,      /* command: make spc_emu_thread leave its loop */
+};
+
+struct spc_load /* argument of SPC_EMU_LOAD, passed by address in chunk->data */
+{
+    uint8_t *buf; /* buffer handed to SPC_load_spc */
+    size_t size;  /* size handed to SPC_load_spc together with buf */
+};
+
+/* sample queue: WAV_NUM_CHUNKS must be a power of 2 for WAV_CHUNK_MASK */
+#define WAV_NUM_CHUNKS 2
+#define WAV_CHUNK_MASK (WAV_NUM_CHUNKS-1)
+struct sample_queue_chunk
+{
+    long id; /* SPC_EMU_AUDIO, or a command id when the chunk is a message */
+    union
+    {
+        intptr_t data; /* command argument (see emu_thread_send_msg) */
+        int32_t audio[WAV_CHUNK_SIZE*2]; /* filled by SPC_play */
+    };
+};
+
+static struct
+{
+    int head, tail; /* read (head) and write (tail) counters, masked on use */
+    struct semaphore emu_sem_head; /* released per written chunk; reader waits */
+    struct semaphore emu_sem_tail; /* released per read chunk; emu thread waits */
+    struct event emu_evt_reply; /* signaled once a command has been processed */
+    intptr_t retval; /* result of the last command */
+    struct sample_queue_chunk wav_chunk[WAV_NUM_CHUNKS];
+} sample_queue NOCACHEBSS_ATTR;
+
+static inline void samples_release_wrbuf(void)
+{   /* emu thread: publish the chunk it has just filled */
+    sample_queue.tail++;
+    ci->semaphore_release(&sample_queue.emu_sem_head); /* lets samples_get_rdbuf proceed */
+}
+
+static inline struct sample_queue_chunk * samples_get_wrbuf(void)
+{   /* emu thread: wait for a released chunk, return the one at the write index */
+    ci->semaphore_wait(&sample_queue.emu_sem_tail);
+    return &sample_queue.wav_chunk[sample_queue.tail & WAV_CHUNK_MASK];
+}
+
+static inline void samples_release_rdbuf(void)
+{   /* reader side: give the current read chunk back to the emu thread */
+    if (sample_queue.head != sample_queue.tail) { /* never step past tail */
+        sample_queue.head++;
+    }
+
+    ci->semaphore_release(&sample_queue.emu_sem_tail); /* lets samples_get_wrbuf proceed */
+}
+
+static inline int32_t * samples_get_rdbuf(void)
+{   /* reader side: wait for a filled chunk; NULL when told to stop */
+    ci->semaphore_wait(&sample_queue.emu_sem_head);
+
+    if (ci->stop_codec || ci->new_track)
+    {
+        /* Told to stop. Buffer must be released. */
+        samples_release_rdbuf();
+        return NULL;
+    }
+
+    return sample_queue.wav_chunk[sample_queue.head & WAV_CHUNK_MASK].audio;
+}
+
+static intptr_t emu_thread_send_msg(long id, intptr_t data)
+{   /* Send command `id` with argument `data`; blocks until the reply event */
+    struct sample_queue_chunk *chunk;
+    /* Grab an audio output buffer */
+    ci->semaphore_wait(&sample_queue.emu_sem_head);
+    chunk = &sample_queue.wav_chunk[sample_queue.head & WAV_CHUNK_MASK];
+    /* Place a message in it instead of audio */
+    chunk->id = id;
+    chunk->data = data;
+    /* Release it to the emu thread */
+    samples_release_rdbuf();
+    /* Wait for a response (signaled at the end of emu_thread_process_msg) */
+    ci->event_wait(&sample_queue.emu_evt_reply, STATE_SIGNALED);
+    return sample_queue.retval; /* result stored by emu_thread_process_msg */
+}
+
+/* emu thread: run the command in `chunk`; returns false only for SPC_EMU_QUIT */
+static bool emu_thread_process_msg(struct sample_queue_chunk *chunk)
+{
+    long id = chunk->id;
+    bool ret = id != SPC_EMU_QUIT;
+
+    chunk->id = SPC_EMU_AUDIO; /* Reset chunk type to audio */
+    sample_queue.retval = 0;
+
+    if (id == SPC_EMU_LOAD)
+    {
+        struct spc_load *ld = (struct spc_load *)chunk->data;
+        invalidate_icache();
+        SPC_Init(&spc_emu);
+        sample_queue.retval = SPC_load_spc(&spc_emu, ld->buf, ld->size);
+    }
+
+    /* Empty the audio queue. This is a dirty hack that a timeout-based wait
+       would make unnecessary, but it is still safe because the other thread
+       is known to be waiting for a reply and is not using the objects. */
+    ci->semaphore_init(&sample_queue.emu_sem_tail, WAV_NUM_CHUNKS,
+                       WAV_NUM_CHUNKS);
+    ci->semaphore_init(&sample_queue.emu_sem_head, WAV_NUM_CHUNKS, 0);
+    sample_queue.head = sample_queue.tail = 0;
+    ci->event_set_state(&sample_queue.emu_evt_reply, STATE_SIGNALED);
+
+    return ret;
+}
+
+static void spc_emu_thread(void)
+{   /* emu thread entry: fill queue chunks with audio until SPC_EMU_QUIT */
+    CPU_Init(&spc_emu);
+
+    while (1) {
+        /* get a buffer for output (blocks until one has been released) */
+        struct sample_queue_chunk *chunk = samples_get_wrbuf();
+
+        if (chunk->id != SPC_EMU_AUDIO) {
+            /* This chunk doesn't contain audio but a command */
+            if (!emu_thread_process_msg(chunk))
+                break; /* SPC_EMU_QUIT: leave the loop so the thread ends */
+            /* Handler re-initialized the queue: re-acquire via samples_get_wrbuf */
+            continue;
+        }
+
+        ENTER_TIMER(render);
+        /* fill samples buffer */
+        if ( SPC_play(&spc_emu, WAV_CHUNK_SIZE*2, chunk->audio) )
+            assert( false );
+        EXIT_TIMER(render);
+
+        /* done so release it to output */
+        samples_release_wrbuf();
+        ci->yield();
+    }
+}
+
+static bool spc_emu_start(void)
+{   /* Create the emu thread on the COP; returns false if creation fails */
+    emu_thread_p = ci->create_thread(spc_emu_thread, spc_emu_thread_stack,
+                           sizeof(spc_emu_thread_stack), CREATE_THREAD_FROZEN,
+                           spc_emu_thread_name IF_PRIO(, PRIORITY_PLAYBACK), COP);
+
+    if (emu_thread_p == NULL)
+        return false;
+
+    /* Initialize audio queue as full to prevent emu thread from trying to run the
+       emulator before loading something */
+    ci->event_init(&sample_queue.emu_evt_reply,
+                   EVENT_AUTOMATIC | STATE_NONSIGNALED);
+    ci->semaphore_init(&sample_queue.emu_sem_tail, WAV_NUM_CHUNKS, 0);
+    ci->semaphore_init(&sample_queue.emu_sem_head, WAV_NUM_CHUNKS,
+                       WAV_NUM_CHUNKS);
+    sample_queue.head = 0;
+    sample_queue.tail = WAV_NUM_CHUNKS;
+
+    ci->thread_thaw(emu_thread_p); /* thread was created frozen; start it */
+    return true;
+}
+
+/* load a new program on the emu thread; returns the SPC_load_spc() result */
+static inline int load_spc_buffer(uint8_t *buf, size_t size)
+{
+    struct spc_load ld = { buf, size }; /* on our stack; send_msg blocks until handled */
+    flush_icache(); /* NOTE(review): presumably makes ld visible to the COP - confirm */
+    return emu_thread_send_msg(SPC_EMU_LOAD, (intptr_t)&ld);
+}
+
+static inline void spc_emu_quit(void)
+{   /* Stop the emu thread; safe to call when spc_emu_start() failed */
+    if (emu_thread_p == NULL) return; /* no thread, queue never initialized */
+    emu_thread_send_msg(SPC_EMU_QUIT, 0);
+    ci->thread_wait(emu_thread_p); /* Wait for emu thread to be killed */
+}
+
+static inline bool spc_play_get_samples(int32_t **samples)
+{   /* Store the next filled chunk in *samples; false when told to stop */
+    /* obtain filled samples buffer */
+    *samples = samples_get_rdbuf();
+    return *samples != NULL;
+}
+
+static inline void spc_play_send_samples(int32_t *samples)
+{   /* Pass both halves of the chunk to pcmbuf_insert, then release the chunk */
+    ci->pcmbuf_insert(samples, samples+WAV_CHUNK_SIZE, WAV_CHUNK_SIZE);
+    /* done with chunk so release it to emu thread */
+    samples_release_rdbuf();
+}
+
+#else /* !SPC_DUAL_CORE */
+/** Implementations for single-core operation **/
+int32_t wav_chunk[WAV_CHUNK_SIZE*2] IBSS_ATTR; /* filled by SPC_play each iteration */
+
+/* load a new program into emu; returns the SPC_load_spc() result */
+static inline int load_spc_buffer(uint8_t *buf, size_t size)
+{
+    SPC_Init(&spc_emu); /* init is re-run before every load */
+    return SPC_load_spc(&spc_emu, buf, size);
+}
+
+static inline bool spc_emu_start(void)
+{   /* Single-core: set up the emulator on the calling thread; always true */
+#ifdef CPU_COLDFIRE
+    /* signed integer mode with saturation */
+    coldfire_set_macsr(EMAC_SATURATE);
+#endif
+    CPU_Init(&spc_emu);
+    return true;
+}
+
+static inline void spc_play_send_samples(int32_t *samples)
+{   /* Pass both halves of the chunk to pcmbuf_insert */
+    ci->pcmbuf_insert(samples, samples+WAV_CHUNK_SIZE, WAV_CHUNK_SIZE);
+}
+
+#define spc_emu_quit() /* expands to nothing: no emu thread in this build */
+#define samples_release_rdbuf() /* expands to nothing: no sample queue here */
+
+static inline bool spc_play_get_samples(int32_t **samples)
+{   /* Render one chunk into wav_chunk on the calling thread; always true */
+    ENTER_TIMER(render);
+    /* fill samples buffer */
+    if ( SPC_play(&spc_emu,WAV_CHUNK_SIZE*2,wav_chunk) )
+        assert( false );
+    EXIT_TIMER(render);
+    *samples = wav_chunk;
+    return true;
+}
+#endif /* SPC_DUAL_CORE */
 
 /* The main decoder loop */
 static int play_track( void )
@@ -206,7 +447,7 @@
         fadedec=0x7fffffffl/(fadeendsample-fadestartsample)+1;
         
     ENTER_TIMER(total);
-    
+
     while ( 1 )
     {
         ci->yield();
@@ -224,14 +465,12 @@
             }
             ci->seek_complete();
         }
-        
-        ENTER_TIMER(render);
-        /* fill samples buffer */
-        if ( SPC_play(&spc_emu,WAV_CHUNK_SIZE*2,samples) )
-            assert( false );
-        EXIT_TIMER(render);
-        
-        sampleswritten+=WAV_CHUNK_SIZE;
+
+        int32_t *samples;
+        if (!spc_play_get_samples(&samples))
+            break;
+
+        sampleswritten += WAV_CHUNK_SIZE;
 
         /* is track timed? */
         if (ci->global_settings->repeat_mode!=REPEAT_ONE && ci->id3->length) {
@@ -241,11 +480,11 @@
             /* fade? */
             if (curtime>ID666.length)
             {
-#ifdef CPU_COLDFIRE
+            #ifdef CPU_COLDFIRE
                 /* Have to switch modes to do this */
                 long macsr = coldfire_get_macsr();
                 coldfire_set_macsr(EMAC_SATURATE | EMAC_FRACTIONAL | EMAC_ROUND);
-#endif
+            #endif
                 int i;
                 for (i=0;i<WAV_CHUNK_SIZE;i++) {
                     if (lasttimesample+i>fadestartsample) {
@@ -256,42 +495,43 @@
                         fadevol-=fadedec;
                     }
                 }
-#ifdef CPU_COLDFIRE
+            #ifdef CPU_COLDFIRE
                coldfire_set_macsr(macsr);
-#endif
+            #endif
             }
             /* end? */
             if (lasttimesample>=fadeendsample)
+            {
+                samples_release_rdbuf();
                 break;
+            }
         }
 
-        ci->pcmbuf_insert(samples, samples+WAV_CHUNK_SIZE, WAV_CHUNK_SIZE);
+        spc_play_send_samples(samples);
 
         if (ci->global_settings->repeat_mode!=REPEAT_ONE)
-        ci->set_elapsed(sampleswritten*1000LL/SAMPLE_RATE);
+            ci->set_elapsed(sampleswritten*1000LL/SAMPLE_RATE);
         else
             ci->set_elapsed(0);
     }
     
     EXIT_TIMER(total);
-    
     return 0;
 }
 
 /* this is the codec entry point */
 enum codec_status codec_main(void)
 {
-#ifdef CPU_COLDFIRE
-    /* signed integer mode with saturation */
-    coldfire_set_macsr(EMAC_SATURATE);
-#endif
-    CPU_Init(&spc_emu);
+    enum codec_status stat = CODEC_ERROR;
+
+    if (!spc_emu_start())
+        goto codec_quit;
 
     do
     {
         DEBUGF("SPC: next_track\n");
         if (codec_init()) {
-            return CODEC_ERROR;
+            goto codec_quit;
         }
         DEBUGF("SPC: after init\n");
 
@@ -301,7 +541,7 @@
 
         /* wait for track info to load */
         while (!*ci->taginfo_ready && !ci->stop_codec)
-            ci->sleep(1);
+            ci->yield();
 
         codec_set_replaygain(ci->id3);
 
@@ -313,20 +553,19 @@
         size_t buffersize;
         uint8_t* buffer = ci->request_buffer(&buffersize, ci->filesize);
         if (!buffer) {
-            return CODEC_ERROR;
+            goto codec_quit;
         }
 
         DEBUGF("SPC: read size = 0x%lx\n",(unsigned long)buffersize);
         do
         {
-            SPC_Init(&spc_emu);
-            if (SPC_load_spc(&spc_emu,buffer,buffersize)) {
+            if (load_spc_buffer(buffer, buffersize)) {
                 DEBUGF("SPC load failure\n");
-                return CODEC_ERROR;
+                goto codec_quit;
             }
 
             LoadID666(buffer+0x2e);
-            
+
             if (ci->global_settings->repeat_mode!=REPEAT_ONE && ID666.length==0) {
                 ID666.length=3*60*1000; /* 3 minutes */
                 ID666.fade=5*1000; /* 5 seconds */
@@ -340,12 +579,16 @@
 
             reset_profile_timers();
         }
-        
         while ( play_track() );
 
         print_timers(ci->id3->path);
     }
     while ( ci->request_next_track() );
+
+    stat = CODEC_OK;
+
+codec_quit:
+    spc_emu_quit();
     
-    return CODEC_OK;
+    return stat;
 }
diff --git a/apps/codecs/spc/spc_codec.h b/apps/codecs/spc/spc_codec.h
index f2677df..c785acc 100644
--- a/apps/codecs/spc/spc_codec.h
+++ b/apps/codecs/spc/spc_codec.h
@@ -32,38 +32,51 @@
 
 /** Basic configuration options **/
 
+#define SPC_DUAL_CORE 1 /* request the pipelined dual-core implementation */
+
+#if !defined(SPC_DUAL_CORE) || NUM_CORES == 1
+#undef  SPC_DUAL_CORE
+#define SPC_DUAL_CORE 0 /* not requested or only one core: force it off */
+#endif
+
 /* TGB is the only target fast enough for gaussian and realtime BRR decode */
 /* echo is almost fast enough but not quite */
-#ifndef TOSHIBA_GIGABEAT_F
-    /* Cache BRR waves */
-    #define SPC_BRRCACHE 1
-
-    /* Disable gaussian interpolation */
-    #define SPC_NOINTERP 1
-
-#ifndef CPU_COLDFIRE
-    /* Disable echo processing */
-    #define SPC_NOECHO 1
-#else
-    /* Enable echo processing */
-    #define SPC_NOECHO 0
-#endif
-#else
+#if defined(TOSHIBA_GIGABEAT_F) || defined(SIMULATOR)
     /* Don't cache BRR waves */
     #define SPC_BRRCACHE 0 
     
     /* Allow gaussian interpolation */
     #define SPC_NOINTERP 0
-    
+
     /* Allow echo processing */
     #define SPC_NOECHO 0
-#endif
+#elif defined(CPU_COLDFIRE)
+    /* Cache BRR waves */
+    #define SPC_BRRCACHE 1 
+    
+    /* Disable gaussian interpolation */
+    #define SPC_NOINTERP 1
 
-/* Samples per channel per iteration */
-#ifdef CPU_COLDFIRE
-#define WAV_CHUNK_SIZE 1024
+    /* Allow echo processing */
+    #define SPC_NOECHO 0
+#elif defined (CPU_PP) && SPC_DUAL_CORE
+    /* Cache BRR waves */
+    #define SPC_BRRCACHE 1 
+    
+    /* Disable gaussian interpolation */
+    #define SPC_NOINTERP 1
+
+    /* Allow echo processing */
+    #define SPC_NOECHO 0
 #else
-#define WAV_CHUNK_SIZE 2048
+    /* Cache BRR waves */
+    #define SPC_BRRCACHE 1 
+    
+    /* Disable gaussian interpolation */
+    #define SPC_NOINTERP 1
+
+    /* Disable echo processing */
+    #define SPC_NOECHO 1
 #endif
 
 #ifdef CPU_ARM
@@ -72,6 +85,26 @@
 
     #undef  IDATA_ATTR
     #define IDATA_ATTR
+
+    #undef  ICONST_ATTR
+    #define ICONST_ATTR
+
+    #undef  IBSS_ATTR
+    #define IBSS_ATTR
+
+#if SPC_DUAL_CORE
+    #undef NOCACHEBSS_ATTR
+    #define NOCACHEBSS_ATTR __attribute__ ((section(".ibss")))
+    #undef NOCACHEDATA_ATTR
+    #define NOCACHEDATA_ATTR __attribute__((section(".idata")))
+#endif
+#endif
+
+/* Samples per channel per iteration */
+#if defined(CPU_PP) && NUM_CORES == 1
+#define WAV_CHUNK_SIZE 2048
+#else
+#define WAV_CHUNK_SIZE 1024
 #endif
 
 /**************** Little-endian handling ****************/
@@ -231,16 +264,26 @@
 
 enum { FIR_BUF_HALF = 8 };
 
-#ifdef CPU_COLDFIRE
+#if defined(CPU_COLDFIRE)
 /* global because of the large aligment requirement for hardware masking -
  * L-R interleaved 16-bit samples for easy loading and mac.w use.
  */
 enum
 {
-    FIR_BUF_SIZE = FIR_BUF_HALF * sizeof ( int32_t ),
-    FIR_BUF_MASK = ~FIR_BUF_SIZE
+    FIR_BUF_CNT   = FIR_BUF_HALF,
+    FIR_BUF_SIZE  = FIR_BUF_CNT * sizeof ( int32_t ),
+    FIR_BUF_ALIGN = FIR_BUF_SIZE * 2,
+    FIR_BUF_MASK  = ~((FIR_BUF_ALIGN / 2) | (sizeof ( int32_t ) - 1))
 };
-#endif /* CPU_COLDFIRE */
+#elif defined (CPU_ARM)
+enum /* ARM layout of the global fir_buf */
+{
+    FIR_BUF_CNT   = FIR_BUF_HALF * 2 * 2, /* NOTE(review): presumably taps x L/R x mirrored copy - confirm */
+    FIR_BUF_SIZE  = FIR_BUF_CNT * sizeof ( int32_t ), /* fir_buf size in bytes */
+    FIR_BUF_ALIGN = FIR_BUF_SIZE, /* alignment requested for fir_buf */
+    FIR_BUF_MASK  = ~((FIR_BUF_ALIGN / 2) | (sizeof ( int32_t ) * 2 - 1)) /* ANDed into fir_ptr after each store */
+};
+#endif /* CPU_* */
 
 struct Spc_Dsp
 {
@@ -257,14 +300,19 @@
     int noise_count;
     uint16_t noise; /* also read as int16_t */
     
-#ifdef CPU_COLDFIRE
+#if defined(CPU_COLDFIRE)
     /* circularly hardware masked address */
     int32_t *fir_ptr;
     /* wrapped address just behind current position -
        allows mac.w to increment and mask fir_ptr */
     int32_t *last_fir_ptr;
     /* copy of echo FIR constants as int16_t for use with mac.w */
-    int16_t fir_coeff[VOICE_COUNT];
+    int16_t fir_coeff [VOICE_COUNT];
+#elif defined (CPU_ARM)
+   /* fir_buf [i + 8] == fir_buf [i], to avoid wrap checking in FIR code */
+    int32_t *fir_ptr;
+    /* copy of echo FIR constants as int32_t, for faster access */
+    int32_t fir_coeff [VOICE_COUNT]; 
 #else
     /* fir_buf [i + 8] == fir_buf [i], to avoid wrap checking in FIR code */
     int fir_pos; /* (0 to 7) */
diff --git a/apps/codecs/spc/spc_dsp.c b/apps/codecs/spc/spc_dsp.c
index 8881788..19986fd 100644
--- a/apps/codecs/spc/spc_dsp.c
+++ b/apps/codecs/spc/spc_dsp.c
@@ -25,14 +25,13 @@
 #include "spc_codec.h"
 #include "spc_profiler.h"
 
-#ifdef CPU_COLDFIRE
-static int32_t fir_buf[FIR_BUF_HALF]
-    __attribute__ ((aligned (FIR_BUF_SIZE*2))) IBSS_ATTR;
+#if defined(CPU_COLDFIRE) || defined (CPU_ARM)
+int32_t fir_buf[FIR_BUF_CNT]
+    __attribute__ ((aligned (FIR_BUF_ALIGN*1))) IBSS_ATTR;
 #endif
-
 #if SPC_BRRCACHE
 /* a little extra for samples that go past end */
-int16_t BRRcache [0x20000 + 32];
+int16_t BRRcache [BRR_CACHE_SIZE];
 #endif
 
 void DSP_write( struct Spc_Dsp* this, int i, int data )
@@ -58,11 +57,12 @@
 
 /* if ( n < -32768 ) out = -32768; */
 /* if ( n >  32767 ) out =  32767; */
-#define CLAMP16( n, out )\
-{\
-    if ( (int16_t) n != n )\
-        out = 0x7FFF ^ (n >> 31);\
-}
+#define CLAMP16( n ) /* clamps n in place and yields it; pass a plain lvalue */ \
+({                              \
+    if ( (int16_t) n != n )     \
+        n = 0x7FFF ^ (n >> 31); \
+    n;                          \
+})
 
 #if SPC_BRRCACHE
 static void decode_brr( struct Spc_Dsp* this, unsigned start_addr,
@@ -181,7 +181,7 @@
                     smp2 = smp1;
                 }
                 
-                CLAMP16( delta, delta );
+                delta = CLAMP16( delta );
                 smp1 = (int16_t) (delta * 2); /* sign-extend */
             }
             while ( (offset += 4) != 0 );
@@ -359,7 +359,7 @@
         #define VOICE_RATE(x) (INT16A(raw_voice->rate) & 0x3FFF)
         #define IF_RBE(...)
     #endif /* ROCKBOX_BIG_ENDIAN */
-    
+   
 #if !SPC_NOINTERP
     int const slow_gaussian = (this->r.g.pitch_mods >> 1) |
         this->r.g.noise_enables;
@@ -431,7 +431,7 @@
             
             /* Envelope */
             {
-                int const env_range = 0x800;
+                int const ENV_RANGE = 0x800;
                 int env_mode = voice->env_mode;
                 int adsr0 = raw_voice->adsr [0];
                 int env_timer;
@@ -482,14 +482,14 @@
                             
                             int envx = voice->envx;
                             
-                            int const step = env_range / 64;
+                            int const step = ENV_RANGE / 64;
                             envx += step;
                             if ( t == 15 )
-                                envx += env_range / 2 - step;
+                                envx += ENV_RANGE / 2 - step;
                             
-                            if ( envx >= env_range )
+                            if ( envx >= ENV_RANGE )
                             {
-                                envx = env_range - 1;
+                                envx = ENV_RANGE - 1;
                                 voice->env_mode = state_decay;
                             }
                             voice->envx = envx;
@@ -516,7 +516,7 @@
                             int mode = t >> 5;
                             if ( mode <= 5 ) /* decay */
                             {
-                                int step = env_range / 64;
+                                int step = ENV_RANGE / 64;
                                 if ( mode == 5 ) /* exponential */
                                 {
                                     envx--; /* envx *= 255 / 256 */
@@ -531,14 +531,14 @@
                             }
                             else /* attack */
                             {
-                                int const step = env_range / 64;
+                                int const step = ENV_RANGE / 64;
                                 envx += step;
                                 if ( mode == 7 &&
-                                     envx >= env_range * 3 / 4 + step )
-                                    envx += env_range / 256 - step;
+                                     envx >= ENV_RANGE * 3 / 4 + step )
+                                    envx += ENV_RANGE / 256 - step;
                                 
-                                if ( envx >= env_range )
-                                    envx = env_range - 1;
+                                if ( envx >= ENV_RANGE )
+                                    envx = ENV_RANGE - 1;
                             }
                             voice->envx = envx;
                             /* TODO: should this be 8? */
@@ -550,7 +550,7 @@
                 else /* state_release */
                 {
                     int envx = voice->envx;
-                    if ( (envx -= env_range / 256) > 0 )
+                    if ( (envx -= ENV_RANGE / 256) > 0 )
                     {
                         voice->envx = envx;
                         raw_voice->envx = envx >> 8;
@@ -683,7 +683,7 @@
                         smp2 = smp1;
                     }
                     
-                    CLAMP16( delta, delta );
+                    delta = CLAMP16( delta );
                     smp1 = (int16_t) (delta * 2); /* sign-extend */
                 }
                 while ( (offset += 4) != 0 );
@@ -778,7 +778,7 @@
                     output = (output + rev [1] * interp [2]) >> 12;
                     output = (int16_t) (output * 2);
                     output += ((rev [0] * interp [3]) >> 12) * 2;
-                    CLAMP16( output, output );
+                    output = CLAMP16( output );
                 }
                 output = (output * voice->envx) >> 11 & ~1;
                 
@@ -788,7 +788,7 @@
                 prev_outx = output;
                 raw_voice->outx = (int8_t) (output >> 8);
             }
-        #else
+        #else /* SPC_NOINTERP */
         /* two-point linear interpolation */
         #ifdef CPU_COLDFIRE
             int amp_0 = (int16_t)this->noise;
@@ -822,7 +822,7 @@
                 /* output = y0 + (result >> 12)          */
                 "asr.l      %[sh], %[y1]              \r\n"
                 "add.l      %[y0], %[y1]              \r\n"
-                : [f]"+&d"(f), [y0]"=&a"(y0), [y1]"=&d"(amp_0)
+                : [f]"+d"(f), [y0]"=&a"(y0), [y1]"=&d"(amp_0)
                 : [s]"a"(voice->samples), [sh]"d"(12)
                     );
             }
@@ -861,17 +861,49 @@
             "movclr.l %%acc1, %[amp_1] \r\n"
             : [amp_0]"=r"(amp_0), [amp_1]"=r"(amp_1)
             );
-        #else
+        #elif defined (CPU_ARM)
+            int amp_0, amp_1;
+            
+            if ( (this->r.g.noise_enables & vbit) != 0 ) {
+                amp_0 = *(int16_t *)&this->noise;
+            } else {
+                uint32_t f = voice->position;
+                amp_0 = (uint32_t)voice->samples;
 
-            /* Try this one out on ARM and see - similar to above but the asm
-               on coldfire removes a redundant register load worth 1 or 2%;
-               switching to loading two samples at once may help too. That's
-               done above and while 6 to 7% faster on cf over two 16 bit loads
-               it makes it endian dependant.
-               
-               measured small improvement (~1.5%) - hcs
-            */
+                asm volatile(
+                "mov    %[y1], %[f], lsr #12        \r\n"
+                "eor    %[f], %[f], %[y1], lsl #12  \r\n" 
+                "add    %[y1], %[y0], %[y1], lsl #1 \r\n"
+                "ldrsh  %[y0], [%[y1], #2]          \r\n"
+                "ldrsh  %[y1], [%[y1], #4]          \r\n"
+                "sub    %[y1], %[y1], %[y0]         \r\n"
+                "mul    %[f], %[y1], %[f]           \r\n"
+                "add    %[y0], %[y0], %[f], asr #12 \r\n"
+                : [f]"+r"(f), [y0]"+r"(amp_0), [y1]"=&r"(amp_1)
+                );
+            }
 
+            voice->position += rate;
+
+            asm volatile(
+            "mul    %[amp_1], %[amp_0], %[envx] \r\n"
+            "mov    %[amp_0], %[amp_1], asr #11 \r\n"
+            "mov    %[amp_1], %[amp_0], asr #8  \r\n"
+            : [amp_0]"+r"(amp_0), [amp_1]"=&r"(amp_1)
+            : [envx]"r"(voice->envx)
+            );
+
+            prev_outx = amp_0;
+            raw_voice->outx = (int8_t)amp_1;
+
+            asm volatile(
+            "mul    %[amp_1], %[amp_0], %[vol_1] \r\n"
+            "mul    %[amp_0], %[vol_0], %[amp_0] \r\n"
+            : [amp_0]"+r"(amp_0), [amp_1]"+r"(amp_1)
+            : [vol_0]"r"((int)voice->volume[0]),
+              [vol_1]"r"((int)voice->volume[1])
+            );
+        #else /* Unoptimized CPU */
             int output;
             
             if ( (this->r.g.noise_enables & vbit) == 0 )
@@ -884,19 +916,7 @@
             }
 
             voice->position += rate;
-            
-            /* old version */
-#if 0
-            int fraction = voice->position & 0xFFF;
-            short const* const pos = voice->samples + (voice->position >> 12);
-            voice->position += rate;
-            int output =
-                (pos [2] * fraction + pos [1] * (0x1000 - fraction)) >> 12;
-            /* no interpolation (hardly faster, and crappy sounding) */
-            /*int output = pos [0];*/
-            if ( this->r.g.noise_enables & vbit )
-                output = *(int16_t*) &this->noise;
-#endif
+
             output = (output * voice->envx) >> 11;
 
             /* duplicated here to give compiler more to run in parallel */
@@ -905,8 +925,8 @@
 
             prev_outx = output;
             raw_voice->outx = (int8_t) (output >> 8);
-        #endif /* CPU_COLDFIRE */
-        #endif
+        #endif /* CPU_* */
+        #endif /* SPC_NOINTERP */
         
         #if SPC_BRRCACHE
             if ( voice->position >= voice->wave_end )
@@ -1033,7 +1053,7 @@
             "or.l       %[sh], %[e0]                \r\n"
             /* save final feedback into echo buffer    */
             "move.l     %[e0], (%[echo_ptr])        \r\n"
-            : [e0]"+&d"(echo_0), [e1]"+&d"(echo_1)
+            : [e0]"+d"(echo_0), [e1]"+d"(echo_1)
             : [out_0]"r"(out_0), [out_1]"r"(out_1),
               [ef]"r"((int)this->r.g.echo_feedback),
               [echo_ptr]"a"((int32_t *)echo_ptr),
@@ -1056,7 +1076,88 @@
         out_buf [             0] = out_0;
         out_buf [WAV_CHUNK_SIZE] = out_1;
         out_buf ++;
-    #else /* !CPU_COLDFIRE */
+    #elif defined (CPU_ARM)
+        /* Read feedback from echo buffer */
+        int echo_pos = this->echo_pos;
+        uint8_t* const echo_ptr = RAM +
+                ((this->r.g.echo_page * 0x100 + echo_pos) & 0xFFFF);
+        echo_pos += 4;
+        if ( echo_pos >= (this->r.g.echo_delay & 15) * 0x800 )
+            echo_pos = 0;
+        this->echo_pos = echo_pos;
+
+        int fb_0 = GET_LE16SA( echo_ptr     );
+        int fb_1 = GET_LE16SA( echo_ptr + 2 );
+
+        /* Keep last 8 samples */
+        int32_t *fir_ptr = this->fir_ptr;
+
+        /* Apply FIR */
+        asm volatile (
+        "str    %[fb_0], [%[fir_p]], #4  \r\n"
+        "str    %[fb_1], [%[fir_p]], #4  \r\n"
+        /* duplicate at +8 eliminates wrap checking below */
+        "str    %[fb_0], [%[fir_p], #56] \r\n"
+        "str    %[fb_1], [%[fir_p], #60] \r\n"
+        : [fir_p]"+r"(fir_ptr)
+        : [fb_0]"r"(fb_0), [fb_1]"r"(fb_1)
+        );
+
+        this->fir_ptr = (int32_t *)((intptr_t)fir_ptr & FIR_BUF_MASK);
+        int32_t *fir_coeff = this->fir_coeff;
+
+        asm volatile (
+        "ldmia  %[fir_c]!, { r0-r1 }     \r\n"
+        "ldmia  %[fir_p]!, { r4-r5 }     \r\n"
+        "mul    %[fb_0],     r0, %[fb_0] \r\n"
+        "mul    %[fb_1],     r0, %[fb_1] \r\n"
+        "mla    %[fb_0], r4, r1, %[fb_0] \r\n"
+        "mla    %[fb_1], r5, r1, %[fb_1] \r\n"
+        "ldmia  %[fir_c]!, { r0-r1 }     \r\n"
+        "ldmia  %[fir_p]!, { r2-r5 }     \r\n"
+        "mla    %[fb_0], r2, r0, %[fb_0] \r\n"
+        "mla    %[fb_1], r3, r0, %[fb_1] \r\n"
+        "mla    %[fb_0], r4, r1, %[fb_0] \r\n"
+        "mla    %[fb_1], r5, r1, %[fb_1] \r\n"
+        "ldmia  %[fir_c]!, { r0-r1 }     \r\n"
+        "ldmia  %[fir_p]!, { r2-r5 }     \r\n"
+        "mla    %[fb_0], r2, r0, %[fb_0] \r\n"
+        "mla    %[fb_1], r3, r0, %[fb_1] \r\n"
+        "mla    %[fb_0], r4, r1, %[fb_0] \r\n"
+        "mla    %[fb_1], r5, r1, %[fb_1] \r\n"
+        "ldmia  %[fir_c]!, { r0-r1 }     \r\n"
+        "ldmia  %[fir_p]!, { r2-r5 }     \r\n"
+        "mla    %[fb_0], r2, r0, %[fb_0] \r\n"
+        "mla    %[fb_1], r3, r0, %[fb_1] \r\n"
+        "mla    %[fb_0], r4, r1, %[fb_0] \r\n"
+        "mla    %[fb_1], r5, r1, %[fb_1] \r\n"
+        : [fb_0]"+r"(fb_0), [fb_1]"+r"(fb_1),
+          [fir_p]"+r"(fir_ptr), [fir_c]"+r"(fir_coeff)
+        :
+        : "r0", "r1", "r2", "r3", "r4", "r5"
+        );
+
+        /* Generate output */
+        int amp_0 = (chans_0 * global_vol_0 + fb_0 * this->r.g.echo_volume_0)
+                    >> global_muting;
+        int amp_1 = (chans_1 * global_vol_1 + fb_1 * this->r.g.echo_volume_1)
+                    >> global_muting;
+
+        out_buf [             0] = amp_0;
+        out_buf [WAV_CHUNK_SIZE] = amp_1;
+        out_buf ++;
+
+        if ( !(this->r.g.flags & 0x20) )
+        {
+            /* Feedback into echo buffer */
+            int e0 = (echo_0 >> 7) + ((fb_0 * this->r.g.echo_feedback) >> 14);
+            int e1 = (echo_1 >> 7) + ((fb_1 * this->r.g.echo_feedback) >> 14);
+            e0 = CLAMP16( e0 );
+            SET_LE16A( echo_ptr    , e0 );
+            e1 = CLAMP16( e1 );
+            SET_LE16A( echo_ptr + 2, e1 );
+        }
+    #else /* Unoptimized CPU */
         /* Read feedback from echo buffer */
         int echo_pos = this->echo_pos;
         uint8_t* const echo_ptr = RAM +
@@ -1102,25 +1203,25 @@
         out_buf [WAV_CHUNK_SIZE] = amp_1;
         out_buf ++;
         
-        /* Feedback into echo buffer */
-        int e0 = (echo_0 >> 7) + ((fb_0 * this->r.g.echo_feedback) >> 14);
-        int e1 = (echo_1 >> 7) + ((fb_1 * this->r.g.echo_feedback) >> 14);
         if ( !(this->r.g.flags & 0x20) )
         {
-            CLAMP16( e0, e0 );
+            /* Feedback into echo buffer */
+            int e0 = (echo_0 >> 7) + ((fb_0 * this->r.g.echo_feedback) >> 14);
+            int e1 = (echo_1 >> 7) + ((fb_1 * this->r.g.echo_feedback) >> 14);
+            e0 = CLAMP16( e0 );
             SET_LE16A( echo_ptr    , e0 );
-            CLAMP16( e1, e1 );
+            e1 = CLAMP16( e1 );
             SET_LE16A( echo_ptr + 2, e1 );
         }
-    #endif /* CPU_COLDFIRE */
-    #else
+    #endif /* CPU_* */
+    #else /* SPC_NOECHO == 1 */
         /* Generate output  */
         int amp_0 = (chans_0 * global_vol_0) >> global_muting;
         int amp_1 = (chans_1 * global_vol_1) >> global_muting;
         out_buf [             0] = amp_0;
         out_buf [WAV_CHUNK_SIZE] = amp_1;
         out_buf ++;
-    #endif
+    #endif /* SPC_NOECHO */
     }
     while ( --count );
 #if 0
@@ -1155,10 +1256,13 @@
             this->wave_entry [i].start_addr = -1;
     #endif
 
-#ifdef CPU_COLDFIRE
-    this->fir_ptr      = fir_buf;
+#if defined(CPU_COLDFIRE)
+    this->fir_ptr = fir_buf;
     this->last_fir_ptr = &fir_buf [7];
     ci->memset( fir_buf, 0, sizeof fir_buf );
+#elif defined (CPU_ARM)
+    this->fir_ptr = fir_buf;
+    ci->memset( fir_buf, 0, sizeof fir_buf );
 #else
     this->fir_pos = 0;
     ci->memset( this->fir_buf, 0, sizeof this->fir_buf );