iPhone audio short<->float

Hello,

I’ve just been trying to speed up my short-to-float (and back again) conversions for iPhone audio. Thought it might be useful here. It involves some ARM assembler (which I’m no expert in but have spent the last week or so looking at). Anyway, I think I have around a 3x speed-up doing the int/float conversion semi-vectorized (the actual conversion has to be scalar, but you can load a block of 8 samples into registers in one instruction and do the scaling up/down to ±32767 in one instruction; the float<->short conversion needs to be done with 8 separate single instructions). This code does the operation on two blocks of 8 at a time, loading 16 samples into two banks of 8 registers. Then it does the conversion (using two multiplies and 16 int/float conversions). There may be a quicker way round the SInt32<->SInt16 packing/unpacking which I’m doing here (though manually unrolling the loop was definitely quicker than a while loop). Any ARM experts?..

BTW you need to make sure Compile for Thumb is OFF for this code to work without changes (and I have only tested it with optimisations -Os [fastest/smallest] and without other optimisations e.g., -ftree-vectorize and -ffast-math)

// short to float…

/*
 * Convert `length` signed 16-bit samples from `src` into floats in `dst`,
 * multiplying each sample by 1/32767.
 *
 * On 32-bit ARM builds with VFP, complete groups of 16 samples go through the
 * hand-written VFP assembly below; leftover samples (and every sample on any
 * other target) are handled by the plain C loop at the end.
 *
 * The assembly path relies on the VFP short-vector length being 8 when this
 * function is called: each fmuls is then applied to a whole bank of eight
 * registers. With the default vector length only s8 and s16 would be scaled.
 */
static inline void audioShortToFloat(const short *src, float* dst, unsigned int length)
{
 static const float scale = 1.f / 32767.f;
 unsigned int numScalars = length;

#if defined(__arm__) && defined(__VFP_FP__)
 int temp[16] __attribute__ ((aligned));
 int *tempa = temp;
 int *tempb = tempa+8;
 unsigned int numVectors = length >> 4U;

 numScalars = length & 15U;

 while(numVectors--)
 {
  // widen 16 samples to 32-bit ints so they can be loaded straight into VFP registers
  temp[ 0] = (int)src[ 0];
  temp[ 1] = (int)src[ 1];
  temp[ 2] = (int)src[ 2];
  temp[ 3] = (int)src[ 3];
  temp[ 4] = (int)src[ 4];
  temp[ 5] = (int)src[ 5];
  temp[ 6] = (int)src[ 6];
  temp[ 7] = (int)src[ 7];
  temp[ 8] = (int)src[ 8];
  temp[ 9] = (int)src[ 9];
  temp[10] = (int)src[10];
  temp[11] = (int)src[11];
  temp[12] = (int)src[12];
  temp[13] = (int)src[13];
  temp[14] = (int)src[14];
  temp[15] = (int)src[15];
  src += 16;

  // The scale factor is (re)loaded into s0 inside this statement: a register
  // value is not guaranteed to survive between two separate asm statements.
  // dst is a read-write operand because fstmias writes the advanced address
  // back into it; the C tail loop below continues from that address.
  __asm__ volatile (
  	"flds s0, [%[scale]]			\n\t"
  	"fldmias  %[lo], {s8-s15}		\n\t"
  	"fldmias  %[hi], {s16-s23}		\n\t"
  	"fsitos s8, s8				\n\t"
  	"fsitos s9, s9				\n\t"
  	"fsitos s10, s10			\n\t"
  	"fsitos s11, s11			\n\t"
  	"fsitos s12, s12			\n\t"
  	"fsitos s13, s13			\n\t"
  	"fsitos s14, s14			\n\t"
  	"fsitos s15, s15			\n\t"
  	"fsitos s16, s16			\n\t"
  	"fsitos s17, s17			\n\t"
  	"fsitos s18, s18			\n\t"
  	"fsitos s19, s19			\n\t"
  	"fsitos s20, s20			\n\t"
  	"fsitos s21, s21			\n\t"
  	"fsitos s22, s22			\n\t"
  	"fsitos s23, s23			\n\t"
  	"fmuls s8, s8, s0			\n\t"
  	"fmuls s16, s16, s0			\n\t"
  	"fstmias  %[dst]!, {s8-s15}		\n\t"
  	"fstmias  %[dst]!, {s16-s23}	\n\t"
  	: [dst] "+r" (dst)
  	: [lo] "r" (tempa), [hi] "r" (tempb), [scale] "r" (&scale)
  	: "cc", "memory", "s0", "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15", "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23"
  	);
 }
#endif

// just in case the buffersize isn't a multiple of 16 (unlikely?)
// NOTE(review): if the caller still has the VFP vector length set to 8 here,
// this compiler-generated multiply also executes in short-vector mode --
// confirm that is safe, or keep `length` a multiple of 16.
 while(numScalars--)
 {
  *dst++ = (float)(*src++) * scale;
 }
}

/*
 * Convert `numChannels` buffers of 16-bit samples held in `src` to float,
 * writing channel i to dst[i]; `length` samples are converted per channel.
 *
 * On ARM VFP builds the FPSCR is reprogrammed around the conversion: bits
 * 16-18 and 20-21 are cleared and bits 16-18 are then set (short-vector
 * length 8), and the same bits are cleared again afterwards.
 */
static inline void audioShortToFloatChannels(AudioBufferList* src, float* dst[], unsigned int length, unsigned int numChannels)
{
#if defined(__arm__) && defined(__VFP_FP__)
// vec length to 8
 __asm__ volatile (
 	"fmrx    r0, fpscr                         \n\t" 
 	"bic     r0, r0, #0x00370000               \n\t" 
 	"orr     r0, r0, #0x00070000 				\n\t" 
 	"fmxr    fpscr, r0                         \n\t" 
 	: 
 	: 
 	: "r0" 
 	);
#endif

 for (UInt32 channel = 0; channel < numChannels; channel++)
 {
  // read this channel's own buffer (index `channel`, not always buffer 0)
  AudioSampleType *audioUnitBuffer = (AudioSampleType*)src->mBuffers[channel].mData;
  audioShortToFloat(audioUnitBuffer, dst[channel], length);
 }

#if defined(__arm__) && defined(__VFP_FP__)
// reset vec length
 __asm__ volatile (
 	"fmrx    r0, fpscr            \n\t" 
 	"bic     r0, r0, #0x00370000  \n\t" 
 	"fmxr    fpscr, r0            \n\t" 
 	: 
 	: 
 	: "r0" 
 	);
#endif
}

float to short…

/*
 * Convert `length` floats from `src` into signed 16-bit samples in `dst`,
 * multiplying each value by 32767 before the conversion.
 *
 * On 32-bit ARM builds with VFP, complete groups of 16 samples go through the
 * hand-written VFP assembly below; leftover samples (and every sample on any
 * other target) are handled by the plain C loop at the end.
 *
 * The assembly path relies on the VFP short-vector length being 8 when this
 * function is called: each fmuls is then applied to a whole bank of eight
 * registers. With the default vector length only s8 and s16 would be scaled.
 *
 * NOTE(review): ftosis rounds according to the FPSCR rounding mode, while the
 * C cast in the tail loop truncates toward zero, so the two paths can differ
 * by one step -- confirm this is acceptable.
 */
static inline void audioFloatToShort(const float *src, short *dst, unsigned int length)
{
 static const float scale = 32767.f;
 unsigned int numScalars = length;

#if defined(__arm__) && defined(__VFP_FP__)
 int temp[16] __attribute__ ((aligned));
 int *tempa = temp;
 int *tempb = temp+8;
 unsigned int numVectors = length >> 4U;

 numScalars = length & 15U;

 while(numVectors--)
 {
  // The scale factor is (re)loaded into s0 inside this statement: a register
  // value is not guaranteed to survive between two separate asm statements.
  // src is a read-write operand because fldmias writes the advanced address
  // back into it; the C tail loop below continues from that address.
  __asm__ volatile (
   "flds s0, [%[scale]] \n\t"
   "fldmias %[src]!, {s8-s15} \n\t"
   "fldmias %[src]!, {s16-s23} \n\t"
   "fmuls s8, s8, s0 \n\t"
   "fmuls s16, s16, s0 \n\t"
   "ftosis s8, s8 \n\t"
   "ftosis s9, s9 \n\t"
   "ftosis s10, s10 \n\t"
   "ftosis s11, s11 \n\t"
   "ftosis s12, s12 \n\t"
   "ftosis s13, s13 \n\t"
   "ftosis s14, s14 \n\t"
   "ftosis s15, s15 \n\t"
   "ftosis s16, s16 \n\t"
   "ftosis s17, s17 \n\t"
   "ftosis s18, s18 \n\t"
   "ftosis s19, s19 \n\t"
   "ftosis s20, s20 \n\t"
   "ftosis s21, s21 \n\t"
   "ftosis s22, s22 \n\t"
   "ftosis s23, s23 \n\t"
   "fstmias %[lo], {s8-s15} \n\t"
   "fstmias %[hi], {s16-s23} \n\t"
   : [src] "+r" (src)
   : [lo] "r" (tempa), [hi] "r" (tempb), [scale] "r" (&scale)
   : "cc", "memory", "s0", "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15", "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23"
  );

  // narrow the 16 converted 32-bit ints to 16-bit samples
  dst[ 0] = (short)temp[ 0];
  dst[ 1] = (short)temp[ 1];
  dst[ 2] = (short)temp[ 2];
  dst[ 3] = (short)temp[ 3];
  dst[ 4] = (short)temp[ 4];
  dst[ 5] = (short)temp[ 5];
  dst[ 6] = (short)temp[ 6];
  dst[ 7] = (short)temp[ 7];
  dst[ 8] = (short)temp[ 8];
  dst[ 9] = (short)temp[ 9];
  dst[10] = (short)temp[10];
  dst[11] = (short)temp[11];
  dst[12] = (short)temp[12];
  dst[13] = (short)temp[13];
  dst[14] = (short)temp[14];
  dst[15] = (short)temp[15];
  dst += 16;
 }
#endif

 // just in case the buffersize isn't a multiple of 16 (unlikely?)
 while(numScalars--)
 {
  *dst++ = (short)(*src++ * scale);
 }
}

/*
 * Convert `numChannels` float buffers (src[i]) to 16-bit samples, writing
 * channel i into dst->mBuffers[i].mData; `length` samples are converted per
 * channel.
 *
 * On ARM VFP builds the FPSCR is reprogrammed around the conversion: bits
 * 16-18 and 20-21 are cleared and bits 16-18 are then set (short-vector
 * length 8), and the same bits are cleared again afterwards.
 */
static inline void audioFloatToShortChannels(const float* src[], AudioBufferList* dst, unsigned int length, unsigned int numChannels)
{
#if defined(__arm__) && defined(__VFP_FP__)
 // vec length to 8
 __asm__ volatile (
  "fmrx r0, fpscr \n\t"
  "bic r0, r0, #0x00370000 \n\t"
  "orr r0, r0, #0x00070000 \n\t"
  "fmxr fpscr, r0 \n\t"
  :
  :
  : "r0"
 );
#endif

 for (UInt32 channel = 0; channel < numChannels; channel++)
 {
  AudioSampleType *audioUnitBuffer = (AudioSampleType*)dst->mBuffers[channel].mData;
  audioFloatToShort(src[channel], audioUnitBuffer, length);
 }

#if defined(__arm__) && defined(__VFP_FP__)
 // reset vec length
 __asm__ volatile (
  "fmrx r0, fpscr \n\t"
  "bic r0, r0, #0x00370000 \n\t"
  "fmxr fpscr, r0 \n\t"
  :
  :
  : "r0"
 );
#endif
}

Nice work!

Was this a bottleneck for you? In my iPhone app the conversion didn’t seem to be a big deal in terms of CPU…

Actually, are you on the 3GS? I think its ARM7 (?) chip is better in any case. I’m on a 2nd Gen iPod Touch (which is the same CPU as the 3G iPhone I think). Perhaps the optimiser works better for the 3GS in addition to it being generally a little faster.

In any case I’ve had no luck getting profiles back from Shark remotely from the iPod. It samples, analyses, then Shark crashes as it’s retrieving the data.

This optimisation was based on the assumption that it could be done faster in assembly and, since the conversion is going on all the time, it might help (rather than anything sensible like profiling!). I’ve measured the throughput, though, compared to standard one-by-one casts in C (which is where the 3x figure came from).

You’re on 10.6 though (I’m still on 10.5). I’d be interested to know if it’s just me that can’t get Shark reports back from the device on 10.5!

Doh! Just realised my gaffe with the subject ^^^

I’ve got a 3GS, which is indeed very quick compared to my old 1st-gen iPod. But it still needs all the optimisation it can get, so this certainly isn’t a bad idea!

Just got round to finding someone’s 3GS to test this on. The VFP assembler stuff is a lot SLOWER on that (something to do with different pipelining and VFP only being supported for backwards compatibility). NEON (ARM SIMD) assembly or intrinsics are the way to go, I think.

There’s a lot to be gained by just using the “optimized armv6 armv7” architecture option in Xcode.

I’m looking at using vfp for armv6 and neon for armv7…