The input must be 16-byte aligned (not 16-bit) for this to work correctly, because the aligned SSE load/store instructions used below require it.
`drive` ranges from 0 to 4, or even higher.
So, it is like this:
#include <xmmintrin.h> /* SSE: __m128 and the _mm_*_ps intrinsics */

/*
 * Square-root waveshaper applied to four samples at once with SSE.
 *
 * For every sample x the result written back is
 *
 *     (((s - x) * drive) + x) * (1 - ((drive / 4) * 0.5))
 *
 * where s = sign(x) * sqrt(|x|), i.e. the square root is taken on the
 * magnitude and the original sign is restored afterwards so that negative
 * samples do not turn into NaN.
 *
 * input  Four floats, processed in place. The buffer MUST be 16-byte
 *        aligned because _mm_load_ps / _mm_store_ps are aligned accesses;
 *        declare it e.g. as `_Alignas(16) float input[4];`.
 * drive  Amount of shaping. 0 leaves finite samples unchanged; larger
 *        values blend towards the square-root curve while the trailing
 *        gain factor (1 - drive / 8) attenuates the output.
 */
static inline void waveshape4_sse(float *input, float drive)
{
    /* -0.0f has only the IEEE-754 sign bit set in each lane. */
    const __m128 sign_bit = _mm_set1_ps(-0.0f);

    const __m128 x = _mm_load_ps(input);
    const __m128 vdrive = _mm_set1_ps(drive);

    /* Split each sample into sign and magnitude, take the root of the
     * magnitude, then put the sign back. */
    const __m128 sign = _mm_and_ps(x, sign_bit);
    const __m128 magnitude = _mm_andnot_ps(sign_bit, x);
    const __m128 shaped = _mm_or_ps(_mm_sqrt_ps(magnitude), sign);

    /* Blend between the dry sample and the shaped one. */
    const __m128 mixed =
        _mm_add_ps(_mm_mul_ps(_mm_sub_ps(shaped, x), vdrive), x);

    /* (drive / 4) * 0.5 is drive * 0.125; scaling by a power of two is
     * exact, so one multiply replaces the divide + multiply. */
    const __m128 gain =
        _mm_sub_ps(_mm_set1_ps(1.0f),
                   _mm_mul_ps(vdrive, _mm_set1_ps(0.125f)));

    _mm_store_ps(input, _mm_mul_ps(mixed, gain));
}