• Performance of denormal numbers

    From Bonita Montero@21:1/5 to All on Sat Sep 17 16:35:18 2022
    XPost: comp.lang.c++

    I wanted to check whether denormal numbers are still slower on
    modern CPUs. Intel introduced the DAZ / FTZ bits with SSE1 because
    denormals used to be handled in microcode:

    #include <atomic>
    #include <bit>
    #include <chrono>
    #include <cmath>
    #include <cstdint>
    #include <iostream>
    #include <utility>

    using namespace std;
    using namespace chrono;

    // Benchmark kernel implemented in the MASM listing of this post and made
    // PUBLIC there under the MSVC-mangled name ?denScale@@YA_K_K_N@Z.
    // rounds: number of loop rounds requested; 0 returns 0 immediately.
    // den:    true starts each round from the DEN constant, false from ONE.
    // Returns rounds * 52, the multiplication count the caller divides the
    // elapsed time by.
    uint64_t denScale( uint64_t rounds, bool den );

    int main()
    {
    auto bench = []( bool den ) -> double
    {
    constexpr uint64_t ROUNDS = 25'000'000;
    auto start = high_resolution_clock::now();
    int64_t nScale = denScale( ROUNDS, den );
    return (double)duration_cast<nanoseconds>( high_resolution_clock::now() - start ).count() / nScale;
    };
    double
    tDen = bench( true ),
    tNorm = bench( false ),
    rel = tDen / tNorm - 1;
    cout << tDen << endl;
    cout << tNorm << endl;
    cout << trunc( 100 * 10 * rel + 0.5 ) / 10 << "%" << endl;
    }

    MASM code (x64, implements denScale):

    PUBLIC ?denScale@@YA_K_K_N@Z

    ; Raw IEEE-754 binary64 bit patterns loaded by denScale.
    CONST SEGMENT
    ; exponent field 0 with mantissa bit 51 set: the subnormal value 2^-1023
    DEN DQ 00008000000000000h
    ; 1.0
    ONE DQ 03FF0000000000000h
    ; 0.5, the multiplier applied in the loop
    P5 DQ 03fe0000000000000h
    CONST ENDS

    _TEXT SEGMENT
    ; uint64_t denScale( uint64_t rounds, bool den )
    ;   Reads rounds from rcx and den from dl; the result is returned in rax.
    ;   Each round reloads the start value (DEN if den is non-zero, else ONE)
    ;   and multiplies it by P5 52 times in one dependent chain.
    ;   Returns rounds * 52, or 0 when rounds is 0.
    ?denScale@@YA_K_K_N@Z PROC
    xor rax, rax            ; result stays 0 if there is nothing to do
    test rcx, rcx
    jz byeBye
    mov r8, ONE             ; default start value
    mov r9, DEN
    test dl, dl
    cmovnz r8, r9           ; den set: start from DEN instead
    movq xmm1, P5           ; constant multiplier
    mov rax, rcx            ; keep rounds for the return value
    loopThis:
    movq xmm0, r8           ; reload the start value every round
    REPT 52
    mulsd xmm0, xmm1
    ENDM
    sub rcx, 1
    ; Leave once the counter hits zero so exactly `rounds` passes run.
    ; The former jae only fell through on the borrow of 0 - 1, i.e. after
    ; rounds + 1 passes, which did not match the rounds * 52 reported below.
    jnz loopThis
    mov rdx, 52
    mul rdx                 ; rax = rounds * 52 (rdx is overwritten with the high half)
    byeBye:
    ret
    ?denScale@@YA_K_K_N@Z ENDP
    _TEXT ENDS
    END

    On my PC, normal numbers have a 25% higher throughput.
    Feel free to post your results as well.

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)