How to organize SSE code better + Loop Unrolling?

A basic SSE wrapper is very simple, and it is easy to add the rest of the operations. 🙂 All thanks to the person who started this. I tried to contact him because I want to credit him for starting this up, but so far I have had no response…

/**
	Thin value wrapper around a single __m128 register of packed floats.

	The arithmetic operators are free functions that read the public member
	`v`; this class only covers construction, loading, storing and assignment.

	All pointer-based loads and stores use _mm_load_ps / _mm_store_ps, which
	require the address to be 16-byte aligned.
*/
JUCE_ALIGN(16) class sse4
{
public:
	/** The wrapped register. Public so the free operators can access it directly. */
	__m128 v;

	/** Broadcasts x into every lane. */
	forcedinline sse4(float x) : v(_mm_set1_ps(x)) {}

	/** Loads packed floats from px (must be 16-byte aligned). Accepts read-only buffers. */
	forcedinline sse4(const float* px) : v(_mm_load_ps(px)) {}

	/** Wraps an existing register value. */
	forcedinline sse4(__m128 reg) : v(reg) {}

	/** Stores the lanes to target (must be 16-byte aligned). Does not modify this object. */
	forcedinline void write(float* target) const { _mm_store_ps(target, v); }

	/** Replaces the held register with the one in value. */
	forcedinline void set(sse4 value) { v = value.v; }

	/**
		Copy assignment. Takes a const reference so that temporaries, such as
		the result of `a + b`, can be assigned on conforming compilers.
	*/
	forcedinline sse4& operator = (const sse4& _v2) { v = _v2.v; return *this; }

	/** Reloads the register from _v2 (must be 16-byte aligned). */
	forcedinline sse4& operator = (const float* _v2) { v = _mm_load_ps(_v2); return *this; }
};

/** Lane-wise sum of two vectors, computed with _mm_add_ps. */
forcedinline sse4 operator + (const sse4 &lhs, const sse4 &rhs)
{
	const __m128 sum = _mm_add_ps(lhs.v, rhs.v);
	return sse4(sum);
}
/** Lane-wise difference (left minus right) of two vectors, computed with _mm_sub_ps. */
forcedinline sse4 operator - (const sse4 &lhs, const sse4 &rhs)
{
	const __m128 difference = _mm_sub_ps(lhs.v, rhs.v);
	return sse4(difference);
}
/** Lane-wise product of two vectors, computed with _mm_mul_ps. */
forcedinline sse4 operator * (const sse4 &lhs, const sse4 &rhs)
{
	const __m128 product = _mm_mul_ps(lhs.v, rhs.v);
	return sse4(product);
}
/** Lane-wise quotient (left divided by right), computed with _mm_div_ps. No zero check is performed. */
forcedinline sse4 operator / (const sse4 &lhs, const sse4 &rhs)
{
	const __m128 quotient = _mm_div_ps(lhs.v, rhs.v);
	return sse4(quotient);
}
/** Adds a scalar to every lane: the scalar is broadcast with _mm_set1_ps, then added with _mm_add_ps. */
forcedinline sse4 operator + (const sse4 &lhs, const float &scalar)
{
	const __m128 broadcast = _mm_set1_ps(scalar);
	return sse4(_mm_add_ps(lhs.v, broadcast));
}
/** Subtracts a scalar from every lane: the scalar is broadcast with _mm_set1_ps, then subtracted with _mm_sub_ps. */
forcedinline sse4 operator - (const sse4 &lhs, const float &scalar)
{
	const __m128 broadcast = _mm_set1_ps(scalar);
	return sse4(_mm_sub_ps(lhs.v, broadcast));
}
/** Multiplies every lane by a scalar: the scalar is broadcast with _mm_set1_ps, then multiplied with _mm_mul_ps. */
forcedinline sse4 operator * (const sse4 &lhs, const float &scalar)
{
	const __m128 broadcast = _mm_set1_ps(scalar);
	return sse4(_mm_mul_ps(lhs.v, broadcast));
}
/** Divides every lane by a scalar: the scalar is broadcast with _mm_set1_ps, then used as the divisor in _mm_div_ps. No zero check is performed. */
forcedinline sse4 operator / (const sse4 &lhs, const float &scalar)
{
	const __m128 broadcast = _mm_set1_ps(scalar);
	return sse4(_mm_div_ps(lhs.v, broadcast));
}