State of the Art Denormal Prevention

Does anybody have reliable information on how denormals and these flags affect AMD processors? I couldn't find any information...

Here is some test code. If somebody has an AMD processor, it would be great if they could check this out.

 


#ifndef MAINCOMPONENT_H_INCLUDED
#define MAINCOMPONENT_H_INCLUDED
#include <atomic>
#include <limits>

#include "../JuceLibraryCode/JuceHeader.h"
#include "xmmintrin.h"

/**
    Scope guard that sets the flush-to-zero (FTZ) and denormals-are-zero (DAZ)
    bits of the SSE control/status register (MXCSR) on construction and writes
    the previously saved register value back on destruction.

    Typical use:

        {
            ScopedNoDenormals guard;
            // SSE arithmetic in here runs with FTZ and DAZ set
        }
*/
class ScopedNoDenormals
{
public:
    /** Saves the current MXCSR value, then sets the FTZ and DAZ bits. */
    ScopedNoDenormals()
        : oldMXCSR (static_cast<int> (_mm_getcsr()))
    {
        _mm_setcsr (static_cast<unsigned int> (oldMXCSR | flushToZeroBit | denormalsAreZeroBit));
    }

    /** Writes the MXCSR value captured by the constructor back to the register. */
    ~ScopedNoDenormals()
    {
        _mm_setcsr (static_cast<unsigned int> (oldMXCSR));
    }

    // A copy would carry the same saved value and write it back a second time
    // whenever the copy is destroyed, possibly undoing a nested guard's
    // settings, so copying is disabled.
    ScopedNoDenormals (const ScopedNoDenormals&) = delete;
    ScopedNoDenormals& operator= (const ScopedNoDenormals&) = delete;

    /** MXCSR value read in the constructor; restored by the destructor. */
    int oldMXCSR;

private:
    enum
    {
        flushToZeroBit      = 0x8000,   // MXCSR bit 15 (FTZ)
        denormalsAreZeroBit = 0x0040    // MXCSR bit 6  (DAZ)
    };
};

class MainContentComponent   : public Component, public Thread, public Timer
{
public:
   //==============================================================================
   
    
    
    MainContentComponent()
    : Thread("DenormalTest")
    {
        improvement=0.f;
        setSize (600, 400);
    
        startThread();
        startTimer(1000);
    
    }
    ~MainContentComponent()
    {
    }
    void timerCallback() override
    {
        repaint();
    }
    void paint (Graphics& g)
    {
        g.fillAll (Colour (0xff001F36));
        g.setFont (Font (16.0f));
        g.setColour (Colours::white);
        g.drawText ("ScopedNoDenormals is x"+String(improvement,5)+" faster" , getLocalBounds(), Justification::centred, true);
    }
    void resized()
    {
        // This is called when the MainContentComponent is resized.
        // If you add any child components, this is where you should
        // update their positions.
    }
    double calc()
    {
        double denormal=std::numeric_limits<double>::min()*0.5;
        
        double half=0.5;
    
        for (int i=0; i<10000000;i++)
        {
            
#if JUCE_32BIT && JUCE_WINDOWS
            __asm
            {
                movsd xmm0,denormal ;
                movsd xmm1,half ;
                mulsd xmm0,xmm1 ;
                mulsd xmm0,xmm1 ;
                mulsd xmm0,xmm1 ;
                mulsd xmm0,xmm1 ;

            };
#else
        
            // We use intrinsics, because normal arithmetic code would be optimized
            __m128d r1;
            __m128d r2;
        
            r1 =_mm_load_sd (&denormal);
            r2 =_mm_load_sd (&half);
            r1 =_mm_mul_sd(r1,r2);
            r1 =_mm_mul_sd(r1,r2);
            r1 =_mm_mul_sd(r1,r2);
            r1 =_mm_mul_sd(r1,r2);
    
#endif
        };
        return 0;
    }
    void run()
    {
        while (!threadShouldExit())
        {
            int64 before=Time::getHighResolutionTicks();
            calc();
            int64 usedTimeDenormal = Time::getHighResolutionTicks() - before;
            before=Time::getHighResolutionTicks();
            {
                ScopedNoDenormals n;
                calc();
            }
            
            int64 usedTimeNoDenormal = Time::getHighResolutionTicks() - before;
            improvement=(double)usedTimeDenormal/(double)usedTimeNoDenormal;
        
        };
    }


    double improvement;
private:
    //==============================================================================
    JUCE_DECLARE_NON_COPYABLE_WITH_LEAK_DETECTOR (MainContentComponent)
};

#endif  // MAINCOMPONENT_H_INCLUDED

3 Likes