DoubleVectorOperations


#1

Thought this could be cool for 64-bit goers. Also, I’ve no idea about this vDSP business since I’m a Windows type, so I’ll leave that up to whomever.

(…and excuse the unDRYness - I’m having difficulty visualising the benefits of doing so, other than avoiding re-creating the macros.)

Header:

#ifndef DOUBLE_VECTOR_OPERATIONS_H
#define DOUBLE_VECTOR_OPERATIONS_H

#include "JuceHeader.h"

/**
    A collection of simple vector operations on arrays of doubles, accelerated with
    SIMD instructions where possible.
*/
class JUCE_API DoubleVectorOperations
{
public:
    //==============================================================================
    /** Clears a vector of doubles. */
    static void JUCE_CALLTYPE clear (double* dest, int numValues) noexcept;

    /** Copies a repeated value into a vector of doubles. */
    static void JUCE_CALLTYPE fill (double* dest, double valueToFill, int numValues) noexcept;

    /** Copies a vector of doubles. */
    static void JUCE_CALLTYPE copy (double* dest, const double* src, int numValues) noexcept;

    /** Copies a vector of doubles, multiplying each value by a given multiplier */
    static void JUCE_CALLTYPE copyWithMultiply (double* dest, const double* src, double multiplier, int numValues) noexcept;

    /** Adds the source values to the destination values. */
    static void JUCE_CALLTYPE add (double* dest, const double* src, int numValues) noexcept;

    /** Adds a fixed value to the destination values. */
    static void JUCE_CALLTYPE add (double* dest, double amount, int numValues) noexcept;

    /** Multiplies each source value by the given multiplier, then adds it to the destination value. */
    static void JUCE_CALLTYPE addWithMultiply (double* dest, const double* src, double multiplier, int numValues) noexcept;

    /** Multiplies the destination values by the source values. */
    static void JUCE_CALLTYPE multiply (double* dest, const double* src, int numValues) noexcept;

    /** Multiplies each of the destination values by a fixed multiplier. */
    static void JUCE_CALLTYPE multiply (double* dest, double multiplier, int numValues) noexcept;

    /** Converts a stream of integers to doubles, multiplying each one by the given multiplier. */
    static void JUCE_CALLTYPE convertFixedToFloat (double* dest, const int* src, double multiplier, int numValues) noexcept;

    /** Finds the miniumum and maximum values in the given array. */
    static void JUCE_CALLTYPE findMinAndMax (const double* src, int numValues, double& minResult, double& maxResult) noexcept;

    /** Finds the miniumum value in the given array. */
    static double JUCE_CALLTYPE findMinimum (const double* src, int numValues) noexcept;

    /** Finds the maximum value in the given array. */
    static double JUCE_CALLTYPE findMaximum (const double* src, int numValues) noexcept;
};

#endif //JUCE_DOUBLEVECTOROPERATIONS_H_INCLUDED

CPP:

#include "DoubleVectorOperations.h"

/** This here just in case such is already defined.
    Such is undef'ed once again at the bottom of the file...
*/
#undef JUCE_MINIMUMMAXIMUM_SSE_LOOP
#undef JUCE_BEGIN_SSE_OP
#undef JUCE_FINISH_SSE_OP
#undef JUCE_SSE_LOOP
#undef JUCE_INCREMENT_SRC_DEST
#undef JUCE_INCREMENT_DEST
#undef JUCE_LOAD_NONE
#undef JUCE_LOAD_DEST
#undef JUCE_LOAD_SRC
#undef JUCE_LOAD_SRC_DEST
#undef JUCE_PERFORM_SSE_OP_DEST
#undef JUCE_PERFORM_SSE_OP_SRC_DEST

#if JUCE_USE_SSE_INTRINSICS
namespace DoubleVectorHelpers
{
    static bool sse2Present = false;

    static bool isSSE2Available() noexcept
    {
        if (sse2Present)
            return true;

        sse2Present = juce::SystemStats::hasSSE2();
        return sse2Present;
    }

    inline static bool isAligned (const void* p) noexcept
    {
        return (((juce::pointer_sized_int) p) & 15) == 0;
    }

    inline static void mmEmpty() noexcept
    {
        #if ! JUCE_64BIT
            _mm_empty();
        #endif
    }

    static inline double findMinimumOrMaximum (const double* src, int num, const bool isMinimum) noexcept
    {
        #if JUCE_USE_SSE_INTRINSICS
            const int numLongOps = num / 4;

            if (numLongOps > 1 && DoubleVectorHelpers::isSSE2Available())
            {
                __m128d val;

                #define JUCE_MINIMUMMAXIMUM_SSE_LOOP(loadOp, minMaxOp) \
                    val = loadOp (src); \
                    src += 4; \
                    for (int i = 1; i < numLongOps; ++i) \
                    { \
                        const __m128d s = loadOp (src); \
                        val = minMaxOp (val, s); \
                        src += 4; \
                    }

                if (isMinimum)
                {
                    if (DoubleVectorHelpers::isAligned (src))   { JUCE_MINIMUMMAXIMUM_SSE_LOOP (_mm_load_pd,  _mm_min_pd) }
                    else                                        { JUCE_MINIMUMMAXIMUM_SSE_LOOP (_mm_loadu_pd, _mm_min_pd) }
                }
                else
                {
                    if (DoubleVectorHelpers::isAligned (src))   { JUCE_MINIMUMMAXIMUM_SSE_LOOP (_mm_load_pd, _mm_max_pd) }
                    else                                        { JUCE_MINIMUMMAXIMUM_SSE_LOOP (_mm_loadu_pd,_mm_max_pd) }
                }

                double localVal;

                {
                    double vals[4];
                    _mm_storeu_pd (vals, val);
                    DoubleVectorHelpers::mmEmpty();

                    localVal = isMinimum ? juce::jmin (vals[0], vals[1], vals[2], vals[3])
                                         : juce::jmax (vals[0], vals[1], vals[2], vals[3]);
                }

                num &= 3;

                for (int i = 0; i < num; ++i)
                    localVal = isMinimum ? juce::jmin (localVal, src[i])
                                         : juce::jmax (localVal, src[i]);

                return localVal;
            }
        #endif

        return isMinimum ? juce::findMinimum (src, num)
                         : juce::findMaximum (src, num);
    }
}

#define JUCE_BEGIN_SSE_OP \
    if (DoubleVectorHelpers::isSSE2Available()) \
    { \
        const int numLongOps = num / 4;

#define JUCE_FINISH_SSE_OP(normalOp) \
        DoubleVectorHelpers::mmEmpty(); \
        num &= 3; \
        if (num == 0) return; \
    } \
    for (int i = 0; i < num; ++i) normalOp;

#define JUCE_SSE_LOOP(sseOp, srcLoad, dstLoad, dstStore, locals, increment) \
    for (int i = 0; i < numLongOps; ++i) \
    { \
        locals (srcLoad, dstLoad); \
        dstStore (dest, sseOp); \
        increment; \
    }

#define JUCE_INCREMENT_SRC_DEST    dest += 4; src += 4;
#define JUCE_INCREMENT_DEST        dest += 4;

#define JUCE_LOAD_NONE(srcLoad, dstLoad)
#define JUCE_LOAD_DEST(srcLoad, dstLoad)     const __m128d d = dstLoad (dest);
#define JUCE_LOAD_SRC(srcLoad, dstLoad)      const __m128d s = srcLoad (src);
#define JUCE_LOAD_SRC_DEST(srcLoad, dstLoad) const __m128d d = dstLoad (dest); const __m128d s = srcLoad (src);

#define JUCE_PERFORM_SSE_OP_DEST(normalOp, sseOp, locals) \
    JUCE_BEGIN_SSE_OP \
    if (DoubleVectorHelpers::isAligned (dest))  JUCE_SSE_LOOP (sseOp, dummy, _mm_load_pd,  _mm_store_pd,  locals, JUCE_INCREMENT_DEST) \
    else                                        JUCE_SSE_LOOP (sseOp, dummy, _mm_loadu_pd, _mm_storeu_pd, locals, JUCE_INCREMENT_DEST) \
    JUCE_FINISH_SSE_OP (normalOp)

#define JUCE_PERFORM_SSE_OP_SRC_DEST(normalOp, sseOp, locals, increment) \
    JUCE_BEGIN_SSE_OP \
    if (DoubleVectorHelpers::isAligned (dest)) \
    { \
        if (DoubleVectorHelpers::isAligned (src))   JUCE_SSE_LOOP (sseOp, _mm_load_pd,  _mm_load_pd, _mm_store_pd, locals, increment) \
        else                                        JUCE_SSE_LOOP (sseOp, _mm_loadu_pd, _mm_load_pd, _mm_store_pd, locals, increment) \
    }\
    else \
    { \
        if (DoubleVectorHelpers::isAligned (src))   JUCE_SSE_LOOP (sseOp, _mm_load_pd,  _mm_loadu_pd, _mm_storeu_pd, locals, increment) \
        else                                        JUCE_SSE_LOOP (sseOp, _mm_loadu_pd, _mm_loadu_pd, _mm_storeu_pd, locals, increment) \
    } \
    JUCE_FINISH_SSE_OP (normalOp)


#else
    #define JUCE_PERFORM_SSE_OP_DEST(normalOp, unused1, unused2)              for (int i = 0; i < num; ++i) normalOp;
    #define JUCE_PERFORM_SSE_OP_SRC_DEST(normalOp, sseOp, locals, increment)  for (int i = 0; i < num; ++i) normalOp;
#endif

void JUCE_CALLTYPE DoubleVectorOperations::clear (double* dest, const int num) noexcept
{
    #if JUCE_USE_VDSP_FRAMEWORK
        vDSP_vclr (dest, 1, num);
    #else
        juce::zeromem (dest, num * sizeof (double));
    #endif
}

void JUCE_CALLTYPE DoubleVectorOperations::fill (double* dest, const double valueToFill, int num) noexcept
{
    #if JUCE_USE_VDSP_FRAMEWORK
        vDSP_vfill (&valueToFill, dest, 1, num);
    #else
        #if JUCE_USE_SSE_INTRINSICS
            const __m128d val = _mm_load1_pd (&valueToFill);
        #endif

        JUCE_PERFORM_SSE_OP_DEST (dest[i] = valueToFill, val, JUCE_LOAD_NONE)
    #endif
}

void JUCE_CALLTYPE DoubleVectorOperations::copy (double* dest, const double* src, const int num) noexcept
{
    std::memcpy (dest, src, num * sizeof (double));
}

void JUCE_CALLTYPE DoubleVectorOperations::copyWithMultiply (double* dest, const double* src, double multiplier, int num) noexcept
{
    #if JUCE_USE_VDSP_FRAMEWORK
        vDSP_vsmul (src, 1, &multiplier, dest, 1, num);
    #else
        #if JUCE_USE_SSE_INTRINSICS
            const __m128d mult = _mm_load1_pd (&multiplier);
        #endif

        JUCE_PERFORM_SSE_OP_SRC_DEST (dest[i] = src[i] * multiplier,
                                      _mm_mul_pd (mult, s),
                                      JUCE_LOAD_SRC, JUCE_INCREMENT_SRC_DEST)
    #endif
}

void JUCE_CALLTYPE DoubleVectorOperations::add (double* dest, const double* src, int num) noexcept
{
    #if JUCE_USE_VDSP_FRAMEWORK
        vDSP_vadd (src, 1, dest, 1, dest, 1, num);
    #else
        JUCE_PERFORM_SSE_OP_SRC_DEST (dest[i] += src[i],
                                      _mm_add_pd (d, s),
                                      JUCE_LOAD_SRC_DEST, JUCE_INCREMENT_SRC_DEST)
    #endif
}

void JUCE_CALLTYPE DoubleVectorOperations::add (double* dest, double amount, int num) noexcept
{
    #if JUCE_USE_SSE_INTRINSICS
        const __m128d amountToAdd = _mm_load1_pd (&amount);
    #endif

    JUCE_PERFORM_SSE_OP_DEST (dest[i] += amount,
                              _mm_add_pd (d, amountToAdd),
                              JUCE_LOAD_DEST)
}

void JUCE_CALLTYPE DoubleVectorOperations::addWithMultiply (double* dest, const double* src, double multiplier, int num) noexcept
{
    #if JUCE_USE_SSE_INTRINSICS
        const __m128d mult = _mm_load1_pd (&multiplier);
    #endif

    JUCE_PERFORM_SSE_OP_SRC_DEST (dest[i] += src[i] * multiplier,
                                  _mm_add_pd (d, _mm_mul_pd (mult, s)),
                                  JUCE_LOAD_SRC_DEST, JUCE_INCREMENT_SRC_DEST)
}

void JUCE_CALLTYPE DoubleVectorOperations::multiply (double* dest, const double* src, int num) noexcept
{
    #if JUCE_USE_VDSP_FRAMEWORK
        vDSP_vmul (src, 1, dest, 1, dest, 1, num);
    #else
        JUCE_PERFORM_SSE_OP_SRC_DEST (dest[i] *= src[i],
                                      _mm_mul_pd (d, s),
                                      JUCE_LOAD_SRC_DEST, JUCE_INCREMENT_SRC_DEST)
    #endif
}

void JUCE_CALLTYPE DoubleVectorOperations::multiply (double* dest, double multiplier, int num) noexcept
{
    #if JUCE_USE_VDSP_FRAMEWORK
        vDSP_vsmul (dest, 1, &multiplier, dest, 1, num);
    #else
        #if JUCE_USE_SSE_INTRINSICS
            const __m128d mult = _mm_load1_pd (&multiplier);
        #endif

        JUCE_PERFORM_SSE_OP_DEST (dest[i] *= multiplier,
                                  _mm_mul_pd (d, mult),
                                  JUCE_LOAD_DEST)
    #endif
}

void JUCE_CALLTYPE DoubleVectorOperations::convertFixedToFloat (double* dest, const int* src, const double multiplier, int num) noexcept
{
    #if JUCE_USE_SSE_INTRINSICS
        const __m128d mult = _mm_load1_pd (&multiplier);
    #endif

    JUCE_PERFORM_SSE_OP_SRC_DEST (dest[i] = src[i] * multiplier,
                                  _mm_mul_pd (mult, _mm_cvtepi32_pd (_mm_loadu_si128 ((const __m128i*) src))),
                                  JUCE_LOAD_NONE, JUCE_INCREMENT_SRC_DEST)
}

void JUCE_CALLTYPE DoubleVectorOperations::findMinAndMax (const double* src, int num, double& minResult, double& maxResult) noexcept
{
    #if JUCE_USE_SSE_INTRINSICS
        const int numLongOps = num / 4;

        if (numLongOps > 1 && DoubleVectorHelpers::isSSE2Available())
        {
            __m128d mn, mx;

            #define JUCE_MINMAX_SSE_LOOP(loadOp) \
                mn = loadOp (src); \
                mx = mn; \
                src += 4; \
                for (int i = 1; i < numLongOps; ++i) \
                { \
                    const __m128d s = loadOp (src); \
                    mn = _mm_min_pd (mn, s); \
                    mx = _mm_max_pd (mx, s); \
                    src += 4; \
                }

            if (DoubleVectorHelpers::isAligned (src))   { JUCE_MINMAX_SSE_LOOP (_mm_load_pd) }
            else                                        { JUCE_MINMAX_SSE_LOOP (_mm_loadu_pd) }

            double localMin, localMax;

            {
                double mns[4], mxs[4];
                _mm_storeu_pd (mns, mn);
                _mm_storeu_pd (mxs, mx);
                DoubleVectorHelpers::mmEmpty();

                localMin = juce::jmin (mns[0], mns[1], mns[2], mns[3]);
                localMax = juce::jmax (mxs[0], mxs[1], mxs[2], mxs[3]);
            }

            num &= 3;

            for (int i = 0; i < num; ++i)
            {
                const double s = src[i];
                localMin = juce::jmin (localMin, s);
                localMax = juce::jmax (localMax, s);
            }

            minResult = localMin;
            maxResult = localMax;
            return;
        }
    #endif

    juce::findMinAndMax (src, num, minResult, maxResult);
}

double JUCE_CALLTYPE DoubleVectorOperations::findMinimum (const double* src, const int num) noexcept
{
    #if JUCE_USE_SSE_INTRINSICS
        return DoubleVectorHelpers::findMinimumOrMaximum (src, num, true);
    #else
        return juce::findMinimum (src, num);
    #endif
}

double JUCE_CALLTYPE DoubleVectorOperations::findMaximum (const double* src, const int num) noexcept
{
    #if JUCE_USE_SSE_INTRINSICS
        return DoubleVectorHelpers::findMinimumOrMaximum (src, num, false);
    #else
        return juce::findMaximum (src, num);
    #endif
}

#undef JUCE_MINIMUMMAXIMUM_SSE_LOOP
#undef JUCE_BEGIN_SSE_OP
#undef JUCE_FINISH_SSE_OP
#undef JUCE_SSE_LOOP
#undef JUCE_INCREMENT_SRC_DEST
#undef JUCE_INCREMENT_DEST
#undef JUCE_LOAD_NONE
#undef JUCE_LOAD_DEST
#undef JUCE_LOAD_SRC
#undef JUCE_LOAD_SRC_DEST
#undef JUCE_PERFORM_SSE_OP_DEST
#undef JUCE_PERFORM_SSE_OP_SRC_DEST

#2

This has been on my to-do-list because at some point I’ll need to give tracktion a fully 64-bit audio pipeline (sigh).

In fact my plan was not to have a “DoubleVectorOperations” class, but to add methods to FloatVectorOperations so that there are float and double versions of all its operations. Doing it that way means that when you have templated classes that could work with either float or double types, they can just call the methods in FloatVectorOperations and the correct one will be selected, rather than needing to mess around working out which class should be called.


#3

Ah, that makes sense and would definitely simplify… should be an easy shift over of what I wrote above (minus corrections as per your needs).


#4

Might I suggest adding a unit test for FloatVectorOperations (and the “double” based version as well, when you get around to it)?


#5

Yep.


#6

Merci