Thought this could be cool for 64-bit goers. Also, I’ve no idea about this vDSP business since I’m a Windows type, so I’ll leave that up to whomever.
(…and excuse the unDRYness - I’m having difficulty visualising the benefits of doing so, other than avoiding re-creating the macros.)
Header:
#ifndef DOUBLE_VECTOR_OPERATIONS_H
#define DOUBLE_VECTOR_OPERATIONS_H
#include "JuceHeader.h"
/**
A collection of simple vector operations on arrays of doubles, accelerated with
SIMD instructions where possible.
*/
class JUCE_API DoubleVectorOperations
{
public:
//==============================================================================
/** Clears a vector of doubles. */
static void JUCE_CALLTYPE clear (double* dest, int numValues) noexcept;
/** Copies a repeated value into a vector of doubles. */
static void JUCE_CALLTYPE fill (double* dest, double valueToFill, int numValues) noexcept;
/** Copies a vector of doubles. */
static void JUCE_CALLTYPE copy (double* dest, const double* src, int numValues) noexcept;
/** Copies a vector of doubles, multiplying each value by a given multiplier */
static void JUCE_CALLTYPE copyWithMultiply (double* dest, const double* src, double multiplier, int numValues) noexcept;
/** Adds the source values to the destination values. */
static void JUCE_CALLTYPE add (double* dest, const double* src, int numValues) noexcept;
/** Adds a fixed value to the destination values. */
static void JUCE_CALLTYPE add (double* dest, double amount, int numValues) noexcept;
/** Multiplies each source value by the given multiplier, then adds it to the destination value. */
static void JUCE_CALLTYPE addWithMultiply (double* dest, const double* src, double multiplier, int numValues) noexcept;
/** Multiplies the destination values by the source values. */
static void JUCE_CALLTYPE multiply (double* dest, const double* src, int numValues) noexcept;
/** Multiplies each of the destination values by a fixed multiplier. */
static void JUCE_CALLTYPE multiply (double* dest, double multiplier, int numValues) noexcept;
/** Converts a stream of integers to doubles, multiplying each one by the given multiplier. */
static void JUCE_CALLTYPE convertFixedToFloat (double* dest, const int* src, double multiplier, int numValues) noexcept;
/** Finds the miniumum and maximum values in the given array. */
static void JUCE_CALLTYPE findMinAndMax (const double* src, int numValues, double& minResult, double& maxResult) noexcept;
/** Finds the miniumum value in the given array. */
static double JUCE_CALLTYPE findMinimum (const double* src, int numValues) noexcept;
/** Finds the maximum value in the given array. */
static double JUCE_CALLTYPE findMaximum (const double* src, int numValues) noexcept;
};
#endif //JUCE_DOUBLEVECTOROPERATIONS_H_INCLUDED
CPP:
#include "DoubleVectorOperations.h"
/** This here just in case such is already defined.
Such is undef'ed once again at the bottom of the file...
*/
#undef JUCE_MINIMUMMAXIMUM_SSE_LOOP
#undef JUCE_BEGIN_SSE_OP
#undef JUCE_FINISH_SSE_OP
#undef JUCE_SSE_LOOP
#undef JUCE_INCREMENT_SRC_DEST
#undef JUCE_INCREMENT_DEST
#undef JUCE_LOAD_NONE
#undef JUCE_LOAD_DEST
#undef JUCE_LOAD_SRC
#undef JUCE_LOAD_SRC_DEST
#undef JUCE_PERFORM_SSE_OP_DEST
#undef JUCE_PERFORM_SSE_OP_SRC_DEST
#if JUCE_USE_SSE_INTRINSICS
namespace DoubleVectorHelpers
{
static bool sse2Present = false;
static bool isSSE2Available() noexcept
{
if (sse2Present)
return true;
sse2Present = juce::SystemStats::hasSSE2();
return sse2Present;
}
inline static bool isAligned (const void* p) noexcept
{
return (((juce::pointer_sized_int) p) & 15) == 0;
}
inline static void mmEmpty() noexcept
{
#if ! JUCE_64BIT
_mm_empty();
#endif
}
static inline double findMinimumOrMaximum (const double* src, int num, const bool isMinimum) noexcept
{
#if JUCE_USE_SSE_INTRINSICS
const int numLongOps = num / 4;
if (numLongOps > 1 && DoubleVectorHelpers::isSSE2Available())
{
__m128d val;
#define JUCE_MINIMUMMAXIMUM_SSE_LOOP(loadOp, minMaxOp) \
val = loadOp (src); \
src += 4; \
for (int i = 1; i < numLongOps; ++i) \
{ \
const __m128d s = loadOp (src); \
val = minMaxOp (val, s); \
src += 4; \
}
if (isMinimum)
{
if (DoubleVectorHelpers::isAligned (src)) { JUCE_MINIMUMMAXIMUM_SSE_LOOP (_mm_load_pd, _mm_min_pd) }
else { JUCE_MINIMUMMAXIMUM_SSE_LOOP (_mm_loadu_pd, _mm_min_pd) }
}
else
{
if (DoubleVectorHelpers::isAligned (src)) { JUCE_MINIMUMMAXIMUM_SSE_LOOP (_mm_load_pd, _mm_max_pd) }
else { JUCE_MINIMUMMAXIMUM_SSE_LOOP (_mm_loadu_pd,_mm_max_pd) }
}
double localVal;
{
double vals[4];
_mm_storeu_pd (vals, val);
DoubleVectorHelpers::mmEmpty();
localVal = isMinimum ? juce::jmin (vals[0], vals[1], vals[2], vals[3])
: juce::jmax (vals[0], vals[1], vals[2], vals[3]);
}
num &= 3;
for (int i = 0; i < num; ++i)
localVal = isMinimum ? juce::jmin (localVal, src[i])
: juce::jmax (localVal, src[i]);
return localVal;
}
#endif
return isMinimum ? juce::findMinimum (src, num)
: juce::findMaximum (src, num);
}
}
#define JUCE_BEGIN_SSE_OP \
if (DoubleVectorHelpers::isSSE2Available()) \
{ \
const int numLongOps = num / 4;
#define JUCE_FINISH_SSE_OP(normalOp) \
DoubleVectorHelpers::mmEmpty(); \
num &= 3; \
if (num == 0) return; \
} \
for (int i = 0; i < num; ++i) normalOp;
#define JUCE_SSE_LOOP(sseOp, srcLoad, dstLoad, dstStore, locals, increment) \
for (int i = 0; i < numLongOps; ++i) \
{ \
locals (srcLoad, dstLoad); \
dstStore (dest, sseOp); \
increment; \
}
#define JUCE_INCREMENT_SRC_DEST dest += 4; src += 4;
#define JUCE_INCREMENT_DEST dest += 4;
#define JUCE_LOAD_NONE(srcLoad, dstLoad)
#define JUCE_LOAD_DEST(srcLoad, dstLoad) const __m128d d = dstLoad (dest);
#define JUCE_LOAD_SRC(srcLoad, dstLoad) const __m128d s = srcLoad (src);
#define JUCE_LOAD_SRC_DEST(srcLoad, dstLoad) const __m128d d = dstLoad (dest); const __m128d s = srcLoad (src);
#define JUCE_PERFORM_SSE_OP_DEST(normalOp, sseOp, locals) \
JUCE_BEGIN_SSE_OP \
if (DoubleVectorHelpers::isAligned (dest)) JUCE_SSE_LOOP (sseOp, dummy, _mm_load_pd, _mm_store_pd, locals, JUCE_INCREMENT_DEST) \
else JUCE_SSE_LOOP (sseOp, dummy, _mm_loadu_pd, _mm_storeu_pd, locals, JUCE_INCREMENT_DEST) \
JUCE_FINISH_SSE_OP (normalOp)
#define JUCE_PERFORM_SSE_OP_SRC_DEST(normalOp, sseOp, locals, increment) \
JUCE_BEGIN_SSE_OP \
if (DoubleVectorHelpers::isAligned (dest)) \
{ \
if (DoubleVectorHelpers::isAligned (src)) JUCE_SSE_LOOP (sseOp, _mm_load_pd, _mm_load_pd, _mm_store_pd, locals, increment) \
else JUCE_SSE_LOOP (sseOp, _mm_loadu_pd, _mm_load_pd, _mm_store_pd, locals, increment) \
}\
else \
{ \
if (DoubleVectorHelpers::isAligned (src)) JUCE_SSE_LOOP (sseOp, _mm_load_pd, _mm_loadu_pd, _mm_storeu_pd, locals, increment) \
else JUCE_SSE_LOOP (sseOp, _mm_loadu_pd, _mm_loadu_pd, _mm_storeu_pd, locals, increment) \
} \
JUCE_FINISH_SSE_OP (normalOp)
#else
#define JUCE_PERFORM_SSE_OP_DEST(normalOp, unused1, unused2) for (int i = 0; i < num; ++i) normalOp;
#define JUCE_PERFORM_SSE_OP_SRC_DEST(normalOp, sseOp, locals, increment) for (int i = 0; i < num; ++i) normalOp;
#endif
void JUCE_CALLTYPE DoubleVectorOperations::clear (double* dest, const int num) noexcept
{
#if JUCE_USE_VDSP_FRAMEWORK
vDSP_vclr (dest, 1, num);
#else
juce::zeromem (dest, num * sizeof (double));
#endif
}
void JUCE_CALLTYPE DoubleVectorOperations::fill (double* dest, const double valueToFill, int num) noexcept
{
#if JUCE_USE_VDSP_FRAMEWORK
vDSP_vfill (&valueToFill, dest, 1, num);
#else
#if JUCE_USE_SSE_INTRINSICS
const __m128d val = _mm_load1_pd (&valueToFill);
#endif
JUCE_PERFORM_SSE_OP_DEST (dest[i] = valueToFill, val, JUCE_LOAD_NONE)
#endif
}
void JUCE_CALLTYPE DoubleVectorOperations::copy (double* dest, const double* src, const int num) noexcept
{
std::memcpy (dest, src, num * sizeof (double));
}
void JUCE_CALLTYPE DoubleVectorOperations::copyWithMultiply (double* dest, const double* src, double multiplier, int num) noexcept
{
#if JUCE_USE_VDSP_FRAMEWORK
vDSP_vsmul (src, 1, &multiplier, dest, 1, num);
#else
#if JUCE_USE_SSE_INTRINSICS
const __m128d mult = _mm_load1_pd (&multiplier);
#endif
JUCE_PERFORM_SSE_OP_SRC_DEST (dest[i] = src[i] * multiplier,
_mm_mul_pd (mult, s),
JUCE_LOAD_SRC, JUCE_INCREMENT_SRC_DEST)
#endif
}
void JUCE_CALLTYPE DoubleVectorOperations::add (double* dest, const double* src, int num) noexcept
{
#if JUCE_USE_VDSP_FRAMEWORK
vDSP_vadd (src, 1, dest, 1, dest, 1, num);
#else
JUCE_PERFORM_SSE_OP_SRC_DEST (dest[i] += src[i],
_mm_add_pd (d, s),
JUCE_LOAD_SRC_DEST, JUCE_INCREMENT_SRC_DEST)
#endif
}
void JUCE_CALLTYPE DoubleVectorOperations::add (double* dest, double amount, int num) noexcept
{
#if JUCE_USE_SSE_INTRINSICS
const __m128d amountToAdd = _mm_load1_pd (&amount);
#endif
JUCE_PERFORM_SSE_OP_DEST (dest[i] += amount,
_mm_add_pd (d, amountToAdd),
JUCE_LOAD_DEST)
}
void JUCE_CALLTYPE DoubleVectorOperations::addWithMultiply (double* dest, const double* src, double multiplier, int num) noexcept
{
#if JUCE_USE_SSE_INTRINSICS
const __m128d mult = _mm_load1_pd (&multiplier);
#endif
JUCE_PERFORM_SSE_OP_SRC_DEST (dest[i] += src[i] * multiplier,
_mm_add_pd (d, _mm_mul_pd (mult, s)),
JUCE_LOAD_SRC_DEST, JUCE_INCREMENT_SRC_DEST)
}
void JUCE_CALLTYPE DoubleVectorOperations::multiply (double* dest, const double* src, int num) noexcept
{
#if JUCE_USE_VDSP_FRAMEWORK
vDSP_vmul (src, 1, dest, 1, dest, 1, num);
#else
JUCE_PERFORM_SSE_OP_SRC_DEST (dest[i] *= src[i],
_mm_mul_pd (d, s),
JUCE_LOAD_SRC_DEST, JUCE_INCREMENT_SRC_DEST)
#endif
}
void JUCE_CALLTYPE DoubleVectorOperations::multiply (double* dest, double multiplier, int num) noexcept
{
#if JUCE_USE_VDSP_FRAMEWORK
vDSP_vsmul (dest, 1, &multiplier, dest, 1, num);
#else
#if JUCE_USE_SSE_INTRINSICS
const __m128d mult = _mm_load1_pd (&multiplier);
#endif
JUCE_PERFORM_SSE_OP_DEST (dest[i] *= multiplier,
_mm_mul_pd (d, mult),
JUCE_LOAD_DEST)
#endif
}
void JUCE_CALLTYPE DoubleVectorOperations::convertFixedToFloat (double* dest, const int* src, const double multiplier, int num) noexcept
{
#if JUCE_USE_SSE_INTRINSICS
const __m128d mult = _mm_load1_pd (&multiplier);
#endif
JUCE_PERFORM_SSE_OP_SRC_DEST (dest[i] = src[i] * multiplier,
_mm_mul_pd (mult, _mm_cvtepi32_pd (_mm_loadu_si128 ((const __m128i*) src))),
JUCE_LOAD_NONE, JUCE_INCREMENT_SRC_DEST)
}
void JUCE_CALLTYPE DoubleVectorOperations::findMinAndMax (const double* src, int num, double& minResult, double& maxResult) noexcept
{
#if JUCE_USE_SSE_INTRINSICS
const int numLongOps = num / 4;
if (numLongOps > 1 && DoubleVectorHelpers::isSSE2Available())
{
__m128d mn, mx;
#define JUCE_MINMAX_SSE_LOOP(loadOp) \
mn = loadOp (src); \
mx = mn; \
src += 4; \
for (int i = 1; i < numLongOps; ++i) \
{ \
const __m128d s = loadOp (src); \
mn = _mm_min_pd (mn, s); \
mx = _mm_max_pd (mx, s); \
src += 4; \
}
if (DoubleVectorHelpers::isAligned (src)) { JUCE_MINMAX_SSE_LOOP (_mm_load_pd) }
else { JUCE_MINMAX_SSE_LOOP (_mm_loadu_pd) }
double localMin, localMax;
{
double mns[4], mxs[4];
_mm_storeu_pd (mns, mn);
_mm_storeu_pd (mxs, mx);
DoubleVectorHelpers::mmEmpty();
localMin = juce::jmin (mns[0], mns[1], mns[2], mns[3]);
localMax = juce::jmax (mxs[0], mxs[1], mxs[2], mxs[3]);
}
num &= 3;
for (int i = 0; i < num; ++i)
{
const double s = src[i];
localMin = juce::jmin (localMin, s);
localMax = juce::jmax (localMax, s);
}
minResult = localMin;
maxResult = localMax;
return;
}
#endif
juce::findMinAndMax (src, num, minResult, maxResult);
}
double JUCE_CALLTYPE DoubleVectorOperations::findMinimum (const double* src, const int num) noexcept
{
#if JUCE_USE_SSE_INTRINSICS
return DoubleVectorHelpers::findMinimumOrMaximum (src, num, true);
#else
return juce::findMinimum (src, num);
#endif
}
double JUCE_CALLTYPE DoubleVectorOperations::findMaximum (const double* src, const int num) noexcept
{
#if JUCE_USE_SSE_INTRINSICS
return DoubleVectorHelpers::findMinimumOrMaximum (src, num, false);
#else
return juce::findMaximum (src, num);
#endif
}
#undef JUCE_MINIMUMMAXIMUM_SSE_LOOP
#undef JUCE_BEGIN_SSE_OP
#undef JUCE_FINISH_SSE_OP
#undef JUCE_SSE_LOOP
#undef JUCE_INCREMENT_SRC_DEST
#undef JUCE_INCREMENT_DEST
#undef JUCE_LOAD_NONE
#undef JUCE_LOAD_DEST
#undef JUCE_LOAD_SRC
#undef JUCE_LOAD_SRC_DEST
#undef JUCE_PERFORM_SSE_OP_DEST
#undef JUCE_PERFORM_SSE_OP_SRC_DEST