Hello,
I’ve just been trying to speed up my short-to-float (and back again) conversions for iPhone audio, and thought the result might be useful here. It involves some ARM assembler (which I am no expert in, but have spent the last week or so looking at). Anyway, I think I have around a 3x speed-up doing the int/float conversion semi-vectorized: the actual conversion has to be scalar, but you can load a block of 8 samples into registers in one instruction and do the scaling up/down to ±32767 in one instruction, while the float<->short conversion needs to be done with 8 separate single instructions. This code does the operation on two blocks of 8 at a time, loading 16 samples into two banks of 8 registers. It then does the conversion (using two multiplies and 16 int/float conversions). There may be a quicker way round the SInt32<->SInt16 packing/unpacking which I’m doing here (though manually unrolling the loop was definitely quicker than a while loop). Any ARM experts?..
BTW you need to make sure Compile for Thumb is OFF for this code to work without changes (and I have only tested it with optimisations -Os [fastest/smallest] and without other optimisations e.g., -ftree-vectorize and -ffast-math)
// short to float…
// Convert `length` signed 16-bit samples at `src` into floats at `dst`,
// multiplying each sample by 1/32767.
//
// src    - input samples (at least `length` elements)
// dst    - output buffer (at least `length` elements)
// length - number of samples to convert; 0 is a no-op
//
// ARM (non-Thumb) builds use the VFP path below, which handles 16 samples per
// iteration. That path relies on the VFP short-vector length in FPSCR being 8
// so that each of the two fmuls instructions scales a whole bank of eight
// registers; audioShortToFloatChannels sets this up before calling.
// All other builds use a plain scalar loop with no such precondition.
static inline void audioShortToFloat(const short *src, float* dst, unsigned int length)
{
    static const float scale = 1.f / 32767.f;
#if defined(__arm__) && !defined(__thumb__)
    // Staging area: the shorts are widened to 32-bit ints here so that they can
    // be block-loaded into the VFP registers.
    int temp[16] __attribute__ ((aligned));
    unsigned int numVectors = length >> 4U;  // whole groups of 16 samples
    unsigned int numScalars = length & 15U;  // leftover samples

    while (numVectors--)
    {
        // Manually unrolled on purpose (the author measured it as faster than a loop).
        temp[ 0] = (int)src[ 0];
        temp[ 1] = (int)src[ 1];
        temp[ 2] = (int)src[ 2];
        temp[ 3] = (int)src[ 3];
        temp[ 4] = (int)src[ 4];
        temp[ 5] = (int)src[ 5];
        temp[ 6] = (int)src[ 6];
        temp[ 7] = (int)src[ 7];
        temp[ 8] = (int)src[ 8];
        temp[ 9] = (int)src[ 9];
        temp[10] = (int)src[10];
        temp[11] = (int)src[11];
        temp[12] = (int)src[12];
        temp[13] = (int)src[13];
        temp[14] = (int)src[14];
        temp[15] = (int)src[15];
        src += 16;

        __asm__ __volatile__ (
            // The scale factor is (re)loaded inside this statement: nothing
            // guarantees that s0 survives between separate asm statements.
            "flds s0, [%3] \n\t"
            "fldmias %1, {s8-s15} \n\t"
            "fldmias %2, {s16-s23} \n\t"
            // int -> float conversions are scalar, one per register
            "fsitos s8, s8 \n\t"
            "fsitos s9, s9 \n\t"
            "fsitos s10, s10 \n\t"
            "fsitos s11, s11 \n\t"
            "fsitos s12, s12 \n\t"
            "fsitos s13, s13 \n\t"
            "fsitos s14, s14 \n\t"
            "fsitos s15, s15 \n\t"
            "fsitos s16, s16 \n\t"
            "fsitos s17, s17 \n\t"
            "fsitos s18, s18 \n\t"
            "fsitos s19, s19 \n\t"
            "fsitos s20, s20 \n\t"
            "fsitos s21, s21 \n\t"
            "fsitos s22, s22 \n\t"
            "fsitos s23, s23 \n\t"
            // with vector length 8 each multiply scales eight registers by s0
            "fmuls s8, s8, s0 \n\t"
            "fmuls s16, s16, s0 \n\t"
            // store with writeback: dst advances by 16 floats in total
            "fstmias %0!, {s8-s15} \n\t"
            "fstmias %0!, {s16-s23} \n\t"
            : "+r" (dst)  // read-write: the writeback above modifies it
            : "r" (temp), "r" (temp + 8), "r" (&scale)
            : "s0", "cc", "memory", "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15", "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23"
        );
    }

    // just in case the buffersize isn't a multiple of 16 (unlikely?)
    // NOTE(review): this is compiler-generated FP code that runs while the
    // caller still has the FPSCR vector length set to 8 - confirm that the
    // compiler cannot pick registers that turn the multiply into a vector op.
    while (numScalars--)
    {
        *dst++ = (float)(*src++) * scale;
    }
#else
    // Portable path for targets without ARM-mode VFP assembly.
    for (unsigned int i = 0; i < length; i++)
    {
        dst[i] = (float)src[i] * scale;
    }
#endif
}
// Convert `numChannels` buffers of 16-bit samples in `src` to float, writing
// channel N to dst[N].
//
// src         - buffer list; mBuffers[N].mData is read as AudioSampleType samples
//               (assumes one buffer per channel, i.e. non-interleaved - confirm
//               against the caller)
// dst         - one output float buffer per channel, each holding `length` floats
// length      - samples per channel
// numChannels - number of channels to convert
//
// The VFP short-vector length is switched to 8 for the duration of the
// conversion, because audioShortToFloat's vector multiplies depend on it, and
// is cleared again before returning.
static inline void audioShortToFloatChannels(AudioBufferList* src, float* dst[], unsigned int length, unsigned int numChannels)
{
    // vec length to 8: clear the FPSCR LEN/STRIDE fields, then set LEN to 7
    // (the field encodes length - 1)
    asm volatile (
        "fmrx r0, fpscr \n\t"
        "bic r0, r0, #0x00370000 \n\t"
        "orr r0, r0, #0x00070000 \n\t"
        "fmxr fpscr, r0 \n\t"
        :
        :
        : "r0"
    );
    for (UInt32 channel = 0; channel < numChannels; channel++)
    {
        // Read this channel's own buffer (index 0 here would convert the first
        // buffer into every output channel).
        AudioSampleType *audioUnitBuffer = (AudioSampleType*)src->mBuffers[channel].mData;
        audioShortToFloat(audioUnitBuffer, dst[channel], length);
    }
    // reset vec length: clearing LEN/STRIDE returns the VFP to scalar mode
    asm volatile (
        "fmrx r0, fpscr \n\t"
        "bic r0, r0, #0x00370000 \n\t"
        "fmxr fpscr, r0 \n\t"
        :
        :
        : "r0"
    );
}
// float to short…
// Convert `length` floats at `src` into signed 16-bit samples at `dst`,
// multiplying each value by 32767 before the conversion.
//
// src    - input floats (at least `length` elements)
// dst    - output samples (at least `length` elements)
// length - number of samples to convert; 0 is a no-op
//
// ARM (non-Thumb) builds use the VFP path below, which handles 16 samples per
// iteration. That path relies on the VFP short-vector length in FPSCR being 8
// so that each of the two fmuls instructions scales a whole bank of eight
// registers; audioFloatToShortChannels sets this up before calling.
// All other builds use a plain scalar loop with no such precondition.
static inline void audioFloatToShort(const float *src, short *dst, unsigned int length)
{
    static const float scale = 32767.f;
#if defined(__arm__) && !defined(__thumb__)
    // Staging area: the VFP registers are stored here as 32-bit ints and then
    // narrowed to shorts in C.
    int temp[16] __attribute__ ((aligned));
    unsigned int numVectors = length >> 4U;  // whole groups of 16 samples
    unsigned int numScalars = length & 15U;  // leftover samples

    while (numVectors--)
    {
        __asm__ __volatile__ (
            // The scale factor is (re)loaded inside this statement: nothing
            // guarantees that s0 survives between separate asm statements.
            "flds s0, [%3] \n\t"
            // load with writeback: src advances by 16 floats in total
            "fldmias %0!, {s8-s15} \n\t"
            "fldmias %0!, {s16-s23} \n\t"
            // with vector length 8 each multiply scales eight registers by s0
            "fmuls s8, s8, s0 \n\t"
            "fmuls s16, s16, s0 \n\t"
            // float -> int conversions are scalar, one per register
            "ftosis s8, s8 \n\t"
            "ftosis s9, s9 \n\t"
            "ftosis s10, s10 \n\t"
            "ftosis s11, s11 \n\t"
            "ftosis s12, s12 \n\t"
            "ftosis s13, s13 \n\t"
            "ftosis s14, s14 \n\t"
            "ftosis s15, s15 \n\t"
            "ftosis s16, s16 \n\t"
            "ftosis s17, s17 \n\t"
            "ftosis s18, s18 \n\t"
            "ftosis s19, s19 \n\t"
            "ftosis s20, s20 \n\t"
            "ftosis s21, s21 \n\t"
            "ftosis s22, s22 \n\t"
            "ftosis s23, s23 \n\t"
            "fstmias %1, {s8-s15} \n\t"
            "fstmias %2, {s16-s23} \n\t"
            : "+r" (src)  // read-write: the writeback above modifies it
            : "r" (temp), "r" (temp + 8), "r" (&scale)
            : "s0", "cc", "memory", "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15", "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23"
        );
        // Manually unrolled on purpose (the author measured it as faster than a loop).
        dst[ 0] = (short)temp[ 0];
        dst[ 1] = (short)temp[ 1];
        dst[ 2] = (short)temp[ 2];
        dst[ 3] = (short)temp[ 3];
        dst[ 4] = (short)temp[ 4];
        dst[ 5] = (short)temp[ 5];
        dst[ 6] = (short)temp[ 6];
        dst[ 7] = (short)temp[ 7];
        dst[ 8] = (short)temp[ 8];
        dst[ 9] = (short)temp[ 9];
        dst[10] = (short)temp[10];
        dst[11] = (short)temp[11];
        dst[12] = (short)temp[12];
        dst[13] = (short)temp[13];
        dst[14] = (short)temp[14];
        dst[15] = (short)temp[15];
        dst += 16;
    }

    // just in case the buffersize isn't a multiple of 16 (unlikely?)
    // NOTE(review): this is compiler-generated FP code that runs while the
    // caller still has the FPSCR vector length set to 8 - confirm that the
    // compiler cannot pick registers that turn the multiply into a vector op.
    while (numScalars--)
    {
        *dst++ = (short)(*src++ * scale);
    }
#else
    // Portable path for targets without ARM-mode VFP assembly.
    for (unsigned int i = 0; i < length; i++)
    {
        dst[i] = (short)(src[i] * scale);
    }
#endif
}
// Convert `numChannels` float buffers in `src` to 16-bit samples, writing
// channel N into dst->mBuffers[N].mData.
//
// src         - one input float buffer per channel, each holding `length` floats
// dst         - buffer list; mBuffers[N].mData is written as AudioSampleType samples
// length      - samples per channel
// numChannels - number of channels to convert
//
// The VFP short-vector length is switched to 8 for the duration of the
// conversion, because audioFloatToShort's vector multiplies depend on it, and
// is cleared again before returning.
static inline void audioFloatToShortChannels(const float* src[], AudioBufferList* dst, unsigned int length, unsigned int numChannels)
{
    // vec length to 8: clear the FPSCR LEN/STRIDE fields, then set LEN to 7
    // (the field encodes length - 1)
    asm volatile (
        "fmrx r0, fpscr \n\t"
        "bic r0, r0, #0x00370000 \n\t"
        "orr r0, r0, #0x00070000 \n\t"
        "fmxr fpscr, r0 \n\t"
        :
        :
        : "r0"
    );
    for (UInt32 channel = 0; channel < numChannels; channel++)
    {
        AudioSampleType* audioUnitBuffer = (AudioSampleType*)dst->mBuffers[channel].mData;
        audioFloatToShort(src[channel], audioUnitBuffer, length);
    }
    // reset vec length: clearing LEN/STRIDE returns the VFP to scalar mode
    asm volatile (
        "fmrx r0, fpscr \n\t"
        "bic r0, r0, #0x00370000 \n\t"
        "fmxr fpscr, r0 \n\t"
        :
        :
        : "r0"
    );
}
[/code]