I’ve been in the computer game industry for years, on many many platforms, and I’ve never seen a difference like this before! It’s quite astonishing how slow it is in debug.
I’m going to try the LLVM (Clang -cl) option and see if it makes a difference.
Here’s some disassembly of just one line of code from the debug build, with the calls in-lined:
Unfortunately, for some reason I can’t use breakpoints in debug with optimisation turned on, so I can’t show the fast version…
This line:
pos = ((pos + buff_wrap) & mask) + (pos & (~mask));
compiles to this code:
00007FFDA8D56E4A mov rax,qword ptr [this]
00007FFDA8D56E52 movups xmm0,xmmword ptr [rax+90h]
00007FFDA8D56E59 movdqa xmmword ptr [rsp+0C20h],xmm0
00007FFDA8D56E62 mov rax,qword ptr [rsp+300h]
00007FFDA8D56E6A mov qword ptr [rsp+20h],rax
00007FFDA8D56E6F movaps xmm0,xmmword ptr [rsp+0C20h]
00007FFDA8D56E77 movaps xmmword ptr [rsp+0C40h],xmm0
00007FFDA8D56E7F movaps xmm0,xmmword ptr [pos]
00007FFDA8D56E87 movaps xmmword ptr [rsp+0C30h],xmm0
00007FFDA8D56E8F movaps xmm0,xmmword ptr [rsp+0C30h]
00007FFDA8D56E97 addpd xmm0,xmmword ptr [rsp+0C40h]
00007FFDA8D56EA0 movaps xmmword ptr [rsp+0C50h],xmm0
00007FFDA8D56EA8 movaps xmm0,xmmword ptr [rsp+0C50h]
00007FFDA8D56EB0 movaps xmmword ptr [rsp+0C60h],xmm0
00007FFDA8D56EB8 movaps xmm0,xmmword ptr [rsp+0C60h]
00007FFDA8D56EC0 movaps xmmword ptr [rsp+0C70h],xmm0
00007FFDA8D56EC8 movaps xmm0,xmmword ptr [rsp+0C70h]
00007FFDA8D56ED0 movaps xmmword ptr [rsp+1C00h],xmm0
00007FFDA8D56ED8 lea rax,[rsp+1C00h]
00007FFDA8D56EE0 mov qword ptr [rsp+20h],rax
00007FFDA8D56EE5 lea rax,[rsp+0C90h]
00007FFDA8D56EED lea rcx,[mask]
00007FFDA8D56EF5 mov rdi,rax
00007FFDA8D56EF8 mov rsi,rcx
00007FFDA8D56EFB mov ecx,10h
00007FFDA8D56F00 rep movs byte ptr [rdi],byte ptr [rsi]
00007FFDA8D56F02 lea rax,[rsp+0CA0h]
00007FFDA8D56F0A lea rcx,[rsp+0C90h]
00007FFDA8D56F12 mov rdi,rax
00007FFDA8D56F15 mov rsi,rcx
00007FFDA8D56F18 mov ecx,10h
00007FFDA8D56F1D rep movs byte ptr [rdi],byte ptr [rsi]
00007FFDA8D56F1F mov rax,qword ptr [rsp+308h]
00007FFDA8D56F27 mov qword ptr [rsp+28h],rax
00007FFDA8D56F2C movdqa xmm0,xmmword ptr [rsp+0CA0h]
00007FFDA8D56F35 movdqa xmmword ptr [rsp+0CB0h],xmm0
00007FFDA8D56F3E movdqa xmm0,xmmword ptr [rsp+0CB0h]
00007FFDA8D56F47 movdqa xmmword ptr [rsp+1C10h],xmm0
00007FFDA8D56F50 movaps xmm0,xmmword ptr [rsp+1C10h]
00007FFDA8D56F58 movaps xmmword ptr [rsp+0CD0h],xmm0
00007FFDA8D56F60 movaps xmm0,xmmword ptr [rsp+0CD0h]
00007FFDA8D56F68 movaps xmmword ptr [rsp+0CF0h],xmm0
00007FFDA8D56F70 mov rax,qword ptr [rsp+20h]
00007FFDA8D56F75 movups xmm0,xmmword ptr [rax]
00007FFDA8D56F78 movups xmmword ptr [rsp+0CE0h],xmm0
00007FFDA8D56F80 movaps xmm0,xmmword ptr [rsp+0CE0h]
00007FFDA8D56F88 andps xmm0,xmmword ptr [rsp+0CF0h]
00007FFDA8D56F90 movaps xmmword ptr [rsp+0D00h],xmm0
00007FFDA8D56F98 movaps xmm0,xmmword ptr [rsp+0D00h]
00007FFDA8D56FA0 movaps xmmword ptr [rsp+0D10h],xmm0
00007FFDA8D56FA8 movaps xmm0,xmmword ptr [rsp+0D10h]
00007FFDA8D56FB0 movaps xmmword ptr [rsp+0D20h],xmm0
00007FFDA8D56FB8 movaps xmm0,xmmword ptr [rsp+0D20h]
00007FFDA8D56FC0 movaps xmmword ptr [rsp+1C20h],xmm0
00007FFDA8D56FC8 lea rax,[rsp+1C20h]
00007FFDA8D56FD0 mov qword ptr [rsp+28h],rax
00007FFDA8D56FD5 movdqa xmm0,xmmword ptr [mask]
00007FFDA8D56FDE movdqa xmmword ptr [rsp+0D60h],xmm0
00007FFDA8D56FE7 movdqa xmm0,xmmword ptr [juce::dsp::SIMDNativeOps<unsigned __int64>::kAllBitsSet (07FFDA9B91C90h)]
00007FFDA8D56FEF movdqa xmmword ptr [rsp+0D40h],xmm0
00007FFDA8D56FF8 movdqa xmm0,xmmword ptr [rsp+0D40h]
00007FFDA8D57001 movdqa xmmword ptr [rsp+0D50h],xmm0
00007FFDA8D5700A movdqa xmm0,xmmword ptr [rsp+0D50h]
00007FFDA8D57013 movdqa xmmword ptr [rsp+0D70h],xmm0
00007FFDA8D5701C movdqa xmm0,xmmword ptr [rsp+0D60h]
00007FFDA8D57025 pandn xmm0,xmmword ptr [rsp+0D70h]
00007FFDA8D5702E movdqa xmmword ptr [rsp+0D80h],xmm0
00007FFDA8D57037 movdqa xmm0,xmmword ptr [rsp+0D80h]
00007FFDA8D57040 movdqa xmmword ptr [rsp+0D90h],xmm0
00007FFDA8D57049 movdqa xmm0,xmmword ptr [rsp+0D90h]
00007FFDA8D57052 movdqa xmmword ptr [rsp+0DA0h],xmm0
00007FFDA8D5705B movdqa xmm0,xmmword ptr [rsp+0DA0h]
00007FFDA8D57064 movdqa xmmword ptr [rsp+1C30h],xmm0
00007FFDA8D5706D lea rax,[rsp+1C30h]
00007FFDA8D57075 mov qword ptr [rsp+0F0h],rax
00007FFDA8D5707D mov rax,qword ptr [rsp+0F0h]
00007FFDA8D57085 movups xmm0,xmmword ptr [rax]
00007FFDA8D57088 movdqa xmmword ptr [rsp+0DC0h],xmm0
00007FFDA8D57091 movdqa xmm0,xmmword ptr [rsp+0DC0h]
00007FFDA8D5709A movdqa xmmword ptr [rsp+0DD0h],xmm0
00007FFDA8D570A3 movdqa xmm0,xmmword ptr [rsp+0DD0h]
00007FFDA8D570AC movdqa xmmword ptr [rsp+1E40h],xmm0
00007FFDA8D570B5 movaps xmm0,xmmword ptr [rsp+1E40h]
00007FFDA8D570BD movaps xmmword ptr [rsp+0DF0h],xmm0
00007FFDA8D570C5 movaps xmm0,xmmword ptr [rsp+0DF0h]
00007FFDA8D570CD movaps xmmword ptr [rsp+0E10h],xmm0
00007FFDA8D570D5 movaps xmm0,xmmword ptr [pos]
00007FFDA8D570DD movaps xmmword ptr [rsp+0E00h],xmm0
00007FFDA8D570E5 movaps xmm0,xmmword ptr [rsp+0E00h]
00007FFDA8D570ED andps xmm0,xmmword ptr [rsp+0E10h]
00007FFDA8D570F5 movaps xmmword ptr [rsp+0E20h],xmm0
00007FFDA8D570FD movaps xmm0,xmmword ptr [rsp+0E20h]
00007FFDA8D57105 movaps xmmword ptr [rsp+0E30h],xmm0
00007FFDA8D5710D movaps xmm0,xmmword ptr [rsp+0E30h]
00007FFDA8D57115 movaps xmmword ptr [rsp+0E40h],xmm0
00007FFDA8D5711D movaps xmm0,xmmword ptr [rsp+0E40h]
00007FFDA8D57125 movaps xmmword ptr [rsp+1CC0h],xmm0
00007FFDA8D5712D lea rax,[rsp+1CC0h]
00007FFDA8D57135 mov qword ptr [rsp+0F8h],rax
00007FFDA8D5713D lea rax,[rsp+0E50h]
00007FFDA8D57145 mov rdi,rax
00007FFDA8D57148 mov rsi,qword ptr [rsp+0F8h]
00007FFDA8D57150 mov ecx,10h
00007FFDA8D57155 rep movs byte ptr [rdi],byte ptr [rsi]
00007FFDA8D57157 lea rax,[rsp+0E60h]
00007FFDA8D5715F lea rcx,[rsp+0E50h]
00007FFDA8D57167 mov rdi,rax
00007FFDA8D5716A mov rsi,rcx
00007FFDA8D5716D mov ecx,10h
00007FFDA8D57172 rep movs byte ptr [rdi],byte ptr [rsi]
00007FFDA8D57174 movaps xmm0,xmmword ptr [rsp+0E60h]
00007FFDA8D5717C movaps xmmword ptr [rsp+0E80h],xmm0
00007FFDA8D57184 mov rax,qword ptr [rsp+28h]
00007FFDA8D57189 movups xmm0,xmmword ptr [rax]
00007FFDA8D5718C movups xmmword ptr [rsp+0E70h],xmm0
00007FFDA8D57194 movaps xmm0,xmmword ptr [rsp+0E70h]
00007FFDA8D5719C addpd xmm0,xmmword ptr [rsp+0E80h]
00007FFDA8D571A5 movaps xmmword ptr [rsp+0E90h],xmm0
00007FFDA8D571AD movaps xmm0,xmmword ptr [rsp+0E90h]
00007FFDA8D571B5 movaps xmmword ptr [rsp+0EA0h],xmm0
00007FFDA8D571BD movaps xmm0,xmmword ptr [rsp+0EA0h]
00007FFDA8D571C5 movaps xmmword ptr [rsp+0EB0h],xmm0
00007FFDA8D571CD movaps xmm0,xmmword ptr [rsp+0EB0h]
00007FFDA8D571D5 movaps xmmword ptr [rsp+1E30h],xmm0
00007FFDA8D571DD lea rax,[rsp+1E30h]
00007FFDA8D571E5 mov qword ptr [rsp+100h],rax
00007FFDA8D571ED mov rax,qword ptr [rsp+100h]
00007FFDA8D571F5 movups xmm0,xmmword ptr [rax]
00007FFDA8D571F8 movdqa xmmword ptr [pos],xmm0
[Edit] I just tried optimisation level O1, and with that I could stop at a breakpoint and look at the generated code, although it’s probably an unfair comparison because the optimised version has to be judged together with the surrounding code.
Anyway, I could work with that for now, thanks for the suggestion, @Nitsuj70