• Problem with knucleotide benchmark

    From Branmimir Maksimovic@21:1/5 to All on Fri May 7 00:38:44 2021
    I wrote this some 15 years ago, and the code is
    pretty dirty.
    Benchmark is described at https://benchmarksgame-team.pages.debian.net/benchmarksgame/description/knucleotide.html#knucleotide
    The program is written for fasm (flat assembler).
    Compile and run it like this:
    ~/shootout/knucleotide >>> fasm knucleotidef.asm [1]
    flat assembler version 1.73.27 (16384 kilobytes memory)
    4 passes, 8612 bytes.
    ~/shootout/knucleotide >>> gcc knucleotidef.o -o knucleotidef -no-pie -m32
    ~/shootout/knucleotide >>> time ./knucleotidef 32 < input.txt
    A 30.295
    T 30.151
    C 19.800
    G 19.754

    AA 9.177
    TA 9.132
    AT 9.131
    TT 9.091
    CA 6.002
    AC 6.001
    AG 5.987
    GA 5.984
    CT 5.971
    TC 5.971
    GT 5.957
    TG 5.956
    CC 3.917
    GC 3.911
    CG 3.909
    GG 3.902

    446535 GGTA
    47336 GGTATT
    ./knucleotidef 32 < input.txt 28.72s user 0.22s system 331% cpu 8.730 total

    The problem is that I wrote it for 4 threads, and I cannot for the love of God
    figure out how to increase that. I posted on several forums and got no
    response; it seems the code is very complicated, as I wrote it that way a
    long time ago. Now I would rewrite it much more simply, without those
    macros, but what to do now ;)

    Here it is:

    ; vector - descriptor for a heap-allocated table.
    ; Instantiate as `name vector d,s`; the fields become name.data etc.
    ;   .data     - pointer to the storage, initialised to d
    ;   .size     - size value, initialised to s
    ;   .elements - element counter, starts at 0
    ; The body braces that fasm requires around a struc definition are restored.
    struc vector d,s
    {
    .data dd d
    .size dd s
    .elements dd 0
    }

    ; ccall proc,[arg] - call a CDECL procedure.
    ; Pushes every argument as a dword, right-to-left, calls proc and then
    ; removes the arguments from the stack (caller clean-up).
    ; Return value and clobbered registers are those of proc.
    ; `common` / `reverse` are required with a [group] argument: without them
    ; the whole body (including the call) would be repeated once per argument
    ; and the arguments would be pushed in the wrong order for CDECL.
    macro ccall proc,[arg] ; call CDECL procedure
    {
    common
    local size
    size = 0
    reverse
    pushd arg
    size = size+4
    common
    call proc
    add esp,size
    }

    ; sys_exit rc - terminate the calling task through int 0x80.
    ; In: rc = exit status. Does not return. Loads eax and ebx.
    ; Macro body braces required by fasm are restored.
    macro sys_exit rc
    {
    mov eax,1 ; exit
    mov ebx,rc
    int 0x80
    }

    ; sys_read fd, buf, size - read(2) through int 0x80.
    ; In:  fd = descriptor, buf = destination address, size = byte count.
    ; Out: eax = kernel return value.
    ; Loads eax, ebx, ecx, edx.
    ; Macro body braces required by fasm are restored.
    macro sys_read fd, buf, size
    {
    mov eax, 3 ; sys_read
    mov ebx, fd
    mov ecx, buf
    mov edx, size
    int 0x80
    }
    ; sys_write fd, buf, size - write(2) through int 0x80.
    ; In:  fd = descriptor, buf = source address, size = byte count.
    ; Out: eax = kernel return value.
    ; Loads eax, ebx, ecx, edx.
    ; Macro body braces required by fasm are restored.
    macro sys_write fd, buf, size
    {
    mov eax, 4 ; sys_write
    mov ebx, fd
    mov ecx, buf
    mov edx, size
    int 0x80
    }

    ; Flag bits for the first argument of sys_clone (see the sys_clone macro).
    ; Values match the Linux clone(2) flag definitions of the same names.
    ; CLONE_SIGHAND and CLONE_THREAD are not referenced by any line visible
    ; in this listing.
    CLONE_VM equ 0x00000100
    CLONE_FS equ 0x00000200
    CLONE_FILES equ 0x00000400
    CLONE_SIGHAND equ 0x00000800
    CLONE_THREAD equ 0x00010000

    ; sys_clone stack - create a new task with clone (int 0x80, eax = 120).
    ; In:  stack = top-of-stack address for the new task.
    ; Out: eax = kernel return value; callers test it for zero to detect that
    ;      they are running in the new task.
    ; Loads eax, ebx, ecx, edx.
    ; Fix: the flags line used to end in a `\` continuation whose second half
    ; was missing, which glued `mov ecx,stack` onto the flags expression.
    ; The continuation is removed and the macro braces are restored.
    ; NOTE(review): only the three flags that were visible are used; confirm
    ; whether CLONE_SIGHAND was also meant to be OR-ed in here.
    macro sys_clone stack
    {
    mov eax,120 ; sys_clone
    mov ebx,CLONE_VM or CLONE_FS or CLONE_FILES
    mov ecx,stack ; choose stack
    xor edx,edx ; no struct
    int 0x80
    }

    ; Option bits for the wait call issued by sys_wait.
    ; Values match the Linux wait-option definitions of the same names.
    ; Only __WALL is referenced by the code visible in this listing.
    __WNOTHREAD equ 0x20000000
    __WALL equ 0x40000000
    __WCLONE equ 0x80000000

    ; sys_wait pid - wait for a task created by sys_clone (wait4, int 0x80).
    ; In:  pid = task id to wait for.
    ; Out: eax = kernel return value.
    ; The status pointer (ecx) and the rusage pointer (esi) are passed as zero;
    ; __WALL is passed as the options.
    ; Loads eax, ebx, ecx, edx, esi.
    ; Macro body braces required by fasm are restored.
    macro sys_wait pid
    {
    mov ebx,pid
    mov eax,114 ; sys_wait4
    xor ecx,ecx
    xor esi,esi
    mov edx,__WALL
    int 0x80
    }

    ; read fd, buf, size
    ; Buffered read: copies bytes from the filebuf staging area to buf and
    ; calls sys_read on fd (fsize bytes) when fptr is zero.
    ; Register roles in the visible code:
    ;   edi = destination, ebx = bytes still wanted, ecx = bytes copied so far.
    ; The last visible instruction moves the copied count into eax.
    ; Also overwrites esi/edi/ecx through the strncpy macro.
    ; NOTE(review): l1, l2 and l3 are declared and used as jump targets, but
    ; neither their definitions nor the macro braces are visible in this
    ; copy - restore them from the original source before assembling.
    macro read fd, buf,size
    local l1,l2,l3
    mov edi,buf
    mov ebx,size
    xor ecx,ecx
    mov eax, dword [fptr]
    and eax,eax
    jnz l2 ; fptr is non-zero
    push ebx ecx edx edi
    ; keep a copy of the previous buffer contents in fileresbuf
    strncpy fileresbuf,filebuf,fsize,0
    sys_read fd,filebuf,fsize
    pop edi edx ecx ebx
    and eax,eax
    jz l3 ; sys_read returned 0
    lea eax, [eax+filebuf]
    mov dword [fend], eax ; fend = one past the last byte read
    mov dword [fptr], filebuf
    mov eax, dword [fend]
    sub eax, dword [fptr] ; eax = bytes available in filebuf
    jz l1
    cmp eax,ebx
    cmovg eax,ebx ; eax = min(available, wanted)
    sub ebx, eax
    add ecx,eax
    push ecx
    strncpy edi,dword [fptr], eax, 0
    pop ecx
    and ebx,ebx
    mov dword [fptr],esi ; esi = source pointer advanced by the copy
    jnz l1 ; flags still come from `and ebx,ebx`
    mov eax,ecx

    ; back size - move the buffered-input read pointer (fptr) back by `size`
    ; bytes so they are handed out again by the next read.
    ; Macro body braces required by fasm are restored.
    macro back size
    {
    sub dword[fptr],size
    }

    ; getLine fd, buf, size, hint
    ; Reads from fd into buf through the `read` macro in chunks of
    ; min(hint, size) bytes, searches the chunk for a newline (0xa), replaces
    ; it with a 0 terminator and gives the bytes read past the newline back
    ; with `back`.
    ; Register roles in the visible code:
    ;   ecx = space left, ebx = chunk size, edx = total bytes read,
    ;   edi = write/scan position (edi - buf after a newline was found).
    ; NOTE(review): l1, l2 and l3 are declared and used as jump targets, but
    ; neither their definitions nor the macro braces are visible in this
    ; copy - restore them from the original source before assembling.
    macro getLine fd, buf, size, hint
    local l1,l2,l3
    mov ecx, size
    mov ebx,hint
    cmp ecx,ebx
    cmovl ebx,ecx ; ebx = min(hint, size), signed compare
    xor edx,edx
    mov edi, buf
    cmp ecx,0
    jle l2
    push ebx ecx edx edi
    ; dword[esp] is the edi value pushed just above
    read fd,dword[esp],ebx
    pop edi edx ecx ebx
    add edx,eax
    test eax,eax
    jz l2;
    sub ecx,eax
    push ecx eax
    mov ecx,eax
    strnchr edi,0xa,ecx
    pop eax ecx
    cmp byte [edi-1], 0xa ; did the scan stop on a newline?
    jne l1
    dec edi
    mov byte [edi],0 ; terminate the line
    sub edi,buf ; edi = offset of the terminator within buf
    mov eax,edx
    dec eax
    sub eax,edi ; eax = bytes read beyond the newline
    jle l3
    back eax

    ; strnchr s,c,count - scan at most `count` bytes from s for the byte c.
    ; Out: edi = one past the last byte examined, ecx = bytes left unscanned,
    ;      ZF set when the last byte examined matched c.
    ; Loads eax, ecx, edi. Scans upwards only if the direction flag is clear.
    ; Macro body braces required by fasm are restored.
    macro strnchr s,c,count
    {
    mov edi,s
    mov eax,c
    mov ecx,count
    repne scasb
    }

    ; dwordnset s,c,count - store the dword c `count` times starting at s.
    ; Loads eax, ecx, edi; ecx is 0 afterwards and edi points past the fill.
    ; Fills upwards only if the direction flag is clear.
    ; Macro body braces required by fasm are restored.
    macro dwordnset s, c, count
    {
    mov edi,s
    mov eax,c
    mov ecx,count
    rep stosd
    }

    ; strnset s,c,size - store the low byte of c `size` times starting at s.
    ; Loads eax, ecx, edi; ecx is 0 afterwards and edi points past the fill.
    ; Fills upwards only if the direction flag is clear.
    ; Macro body braces required by fasm are restored.
    macro strnset s,c, size
    {
    mov edi,s
    mov eax,c
    mov ecx,size
    rep stosb
    }

    ; dwordncmp s1, s2, size, dir
    ; Compares up to `size` dwords at s1 (edi) and s2 (esi) with repe cmpsd.
    ; Results are left in the flags, ecx, esi and edi.
    ; NOTE(review): both `if` blocks on `dir` are empty and the macro braces
    ; are missing in this copy; presumably they held direction-flag
    ; instructions - confirm against the original source.
    macro dwordncmp s1, s2, size, dir
    if ~ dir
    end if
    mov esi,s2
    mov edi,s1
    mov ecx,size
    repe cmpsd
    if dir
    end if

    ; strncmp s1, s2, size, dir
    ; Compares up to `size` bytes at s1 (edi) and s2 (esi) with repe cmpsb.
    ; Results are left in the flags, ecx, esi and edi; the visible caller
    ; tests ecx afterwards.
    ; NOTE(review): both `if` blocks on `dir` are empty and the macro braces
    ; are missing in this copy; presumably they held direction-flag
    ; instructions - confirm against the original source.
    macro strncmp s1, s2, size, dir
    if ~ dir
    end if
    mov esi,s2
    mov edi,s1
    mov ecx,size
    repe cmpsb
    if dir
    end if

    ; dwordncpy s1, s2, size, dir
    ; Copies `size` dwords from s2 (esi) to s1 (edi) with rep movsd.
    ; Overwrites esi, edi and ecx.
    ; NOTE(review): both `if` blocks on `dir` are empty and the macro braces
    ; are missing in this copy; presumably they held direction-flag
    ; instructions - confirm against the original source.
    macro dwordncpy s1,s2, size, dir
    if ~ dir
    end if
    mov esi,s2
    mov edi,s1
    mov ecx, size
    rep movsd
    if dir
    end if

    ; strncpy s1, s2, size, dir
    ; Copies `size` bytes from s2 (esi) to s1 (edi) with rep movsb.
    ; Overwrites esi, edi and ecx; callers such as `read` rely on esi being
    ; the advanced source pointer afterwards.
    ; The `find` macro passes dir=1, every other visible caller passes 0.
    ; NOTE(review): both `if` blocks on `dir` are empty and the macro braces
    ; are missing in this copy; presumably they held direction-flag
    ; instructions - confirm against the original source.
    macro strncpy s1,s2, size, dir
    if ~ dir
    end if
    mov esi,s2
    mov edi,s1
    mov ecx, size
    rep movsb
    if dir
    end if

    ; to_num src - translate one nucleotide letter to its 2-bit code.
    ; In:  src = byte operand holding the letter, ebx = address of xtbl
    ;      (every caller loads ebx with xtbl before using this macro).
    ; Out: al = xtbl[src].
    ; Fix: the table lookup (xlatb) and the macro braces were missing, which
    ; left the macro as a plain byte move and made the callers' ebx setup dead.
    macro to_num src
    {
    mov al,src
    xlatb ; al = byte [ebx+al]
    }

    ; to_char src - translate one 2-bit nucleotide code back to its letter.
    ; In:  src = byte operand holding the code, ebx = address of xtbl
    ;      (every caller loads ebx with xtbl before using this macro).
    ; Out: al = xtbl[src].
    ; Fix: the table lookup (xlatb) and the macro braces were missing, which
    ; left the macro as a plain byte move and made the callers' ebx setup dead.
    macro to_char src
    {
    mov al,src
    xlatb ; al = byte [ebx+al]
    }

    ; pack_str dst, src, size, f
    ; Converts `size` bytes from src to dst one byte at a time through
    ; to_num (ebx = xtbl), counting the converted bytes in edx.
    ; Callers read edx afterwards as the number of bytes produced.
    ; Overwrites eax, ebx, ecx, edx, esi, edi.
    ; NOTE(review): the loop label l1 is never defined, the macro braces are
    ; missing, and the trailing strncpy sits inside the same `if ~f` block as
    ; the loop - presumably an `else` line was lost. Confirm against the
    ; original source.
    macro pack_str dst,src,size,f
    if ~f
    local l1
    mov esi,src
    mov edi,dst
    mov ecx,size
    mov edx,0
    mov ebx,xtbl
    to_num byte [esi]
    mov byte [edi], al
    inc edi
    inc esi
    inc edx
    dec ecx
    jnz l1
    strncpy dst,src,size,0
    end if

    ; really_pack_str dst, src, size, f
    ; Packs `size` input bytes from src into dst at 2 bits per input byte:
    ; each output byte is shifted left by 2 and OR-ed with the to_num result,
    ; and the output pointer advances after 4 inputs (ebx counts down from 4).
    ; edx counts output bytes, starting at 1. The parameter f is not used by
    ; any visible line. No visible code invokes this macro.
    ; Overwrites eax, ebx, ecx, edx, esi, edi.
    ; NOTE(review): l1, l2 and e1 are declared and used as jump targets, but
    ; neither their definitions nor the macro braces are visible in this
    ; copy - restore them from the original source before assembling.
    macro really_pack_str dst,src,size,f
    local l1,l2,e1
    mov esi,src
    mov edi,dst
    mov ecx,size
    mov edx,1
    mov ebx,4
    mov byte [edi],0
    push ebx ; keep the per-byte counter while ebx addresses xtbl
    mov ebx,xtbl
    to_num byte [esi]
    pop ebx
    shl byte [edi],2
    or byte [edi], al
    inc esi
    dec ecx
    jz e1
    dec ebx
    jnz l2
    inc edi
    inc edx ; count
    jmp l1

    ; unpack_str dst,src,size - convert `size` code bytes at src back to
    ; letters at dst, one byte at a time through to_char (ebx = xtbl).
    ; size must be at least 1: the count is tested after the first byte.
    ; Overwrites eax, ebx, ecx, esi, edi.
    ; Fix: the loop label l1 (target of the closing jnz) and the macro braces
    ; were missing.
    macro unpack_str dst,src,size
    {
    local l1
    mov esi,src
    mov edi,dst
    mov ecx,size
    mov ebx,xtbl
    l1:
    to_char byte [esi]
    mov byte [edi], al
    inc edi
    inc esi
    dec ecx
    jnz l1
    }

    ; initvector data,oldsize,size,block - (re)allocate a vector's storage.
    ; In:  data    = address of the dword holding the storage pointer
    ;      oldsize = address of the dword that receives the new byte size
    ;      size    = number of elements, block = bytes per element
    ; Calls realloc(dword[data], size*block). On success stores the new
    ; pointer and byte size; on failure reports err1 through perror and exits
    ; with status -1.
    ; Overwrites eax, ebx and whatever realloc clobbers.
    ; Fix: labels e1/e2 and the macro braces were missing, so the success
    ; path had no jump target and the error path was reached by fall-through.
    macro initvector data,oldsize,size,block
    {
    local e1,e2
    mov eax, size
    imul eax, block
    push eax ; keep the byte size across the call
    ccall realloc,dword[data],eax
    pop ebx
    and eax,eax
    jz e1
    mov dword[data],eax
    mov dword[oldsize],ebx
    jmp e2
    e1:
    ccall perror, err1
    sys_exit -1
    e2:
    }

    ; freevector data,size - free every pointer stored in a pointer table.
    ; In:  data = address of the first slot, size = number of dword slots
    ;      (must be at least 1: the count is tested after the first slot).
    ; Each slot is passed to free and then set to 0.
    ; Overwrites ebx, ecx and whatever free clobbers.
    ; Fix: the loop label l1 (target of the closing jnz) and the macro braces
    ; were missing.
    macro freevector data,size
    {
    local l1
    mov ecx,size
    mov ebx,data
    l1:
    push ebx ecx ; free may clobber ecx
    ccall free,dword[ebx]
    pop ecx ebx
    mov dword[ebx],0
    add ebx,4
    dec ecx
    jnz l1
    }

    ; hash str, size
    ; Builds a packed key for the `size` code bytes at str in xmm1: input
    ; bytes are shifted into eax 2 bits at a time, and each filled dword is
    ; shifted into xmm1 (pslldq 4 / por).
    ; Register roles in the visible code:
    ;   ecx = bytes left, ebx = input pointer, esi = bytes left in the current
    ;   dword (starts at 16), edi = dword slots left (starts at 4).
    ; strfind reads xmm1 and edi afterwards.
    ; When size > 4 the macro first tests whether xmm1 is all zero; the
    ; non-zero case takes the branches that shift the existing xmm1 contents
    ; instead of rebuilding them.
    ; Overwrites eax, ebx, ecx, edx, esi, edi, xmm1, xmm2.
    ; NOTE(review): l1, s1, s2, e1 and e2 are declared and used as jump
    ; targets, but neither their definitions nor the macro braces are visible
    ; in this copy, so the exact block boundaries below cannot be confirmed.
    macro hash str,size
    local l1,s1,s2,e1,e2
    mov ecx,size
    mov ebx,str
    mov edi,4
    mov esi,16
    xor eax,eax

    cmp ecx,4
    jle s1
    ; is xmm1 all zero? edx becomes 0 when it is
    pxor xmm2,xmm2
    pcmpeqb xmm2,xmm1
    pmovmskb edx,xmm2
    xor dx,0xffff
    je l1
    cmp ecx,16
    jg s2
    sub ecx,4
    add ebx,ecx
    shl ecx,1
    sub ecx,64
    neg ecx ; ecx = 64 - 2*(size-4)
    movd xmm2,ecx
    psllq xmm1,xmm2
    psrlq xmm1,xmm2 ; clear the top ecx bits of each qword
    movd eax,xmm1
    mov ecx,4
    pxor xmm1,xmm1
    jmp l1
    movd edx,xmm1
    psrldq xmm1,4
    movd eax,xmm1
    cmp ecx,20
    jge s1
    sub ecx,4
    add ebx,ecx
    sub ecx,12
    mov edi,4
    sub edi,ecx
    mov esi,edi
    shl ecx,1
    mov edi,ecx
    sub ecx,32
    neg ecx
    shl edx,cl
    shld eax,edx,8
    sub edi,8
    neg edi
    mov ecx,edi
    shr eax,cl
    mov ecx,4
    mov edi,4
    jmp s1
    ; append one input byte: eax = (eax << 2) | byte[ebx]
    shl eax,2
    movzx edx,byte[ebx]
    or eax,edx
    dec ecx
    jle e1
    inc ebx
    dec esi
    jnz l1
    ; current dword is full: shift it into xmm1 and start a new one
    pslldq xmm1,4
    movdqa xmm2,xmm1
    movd xmm1,eax
    por xmm1,xmm2
    xor eax,eax
    mov esi,16
    dec edi
    jz e2
    jmp l1
    pslldq xmm1,4
    movdqa xmm2,xmm1
    movd xmm1,eax
    por xmm1,xmm2
    dec edi

    ; hashfind data,elements,block,srchstr,srchlen
    ; Looks up (and, through strfind, inserts when absent) the key made from
    ; the srchlen code bytes at srchstr in the bucket table `data`.
    ; In:  data     = bucket table address
    ;      elements = address of the element counter that strfind increments
    ;      block    = forwarded to strfind
    ;      srchstr  = address of the key bytes, srchlen = key length
    ;      xmm1     = state consumed by the hash macro (callers zero it first)
    ; Out: eax as left by strfind; callers use [eax] as the entry's counter.
    ; Overwrites the registers used by hash and strfind, plus xmm0, which
    ; carries the srchstr address into the stored entry.
    ; Macro body braces required by fasm are restored.
    macro hashfind data,elements,block,srchstr,srchlen
    {
    mov eax,srchstr
    movd xmm0,eax
    hash srchstr,srchlen
    mov ebx,data
    strfind elements,block
    }

    ; strfind elements, block
    ; Finds or inserts the key held in xmm1 in the bucket table at ebx.
    ; In:  xmm1 = packed key and edi = slot counter, both left by `hash`;
    ;      xmm0 = dword stored with a new entry; ebx = bucket table address.
    ; The bucket index is the sum of the 16-bit halves of the key dwords,
    ; masked to 16 bits and scaled by 4.
    ; Bucket layout as used by the visible code: a dword entry count followed
    ; by 24-byte entries { dword count, dword from xmm0, 16-byte key }.
    ; Growing a bucket goes through realloc, guarded by a test-and-set on the
    ; global `sema`.
    ; The last visible instruction sets eax to the address ebx+eax.
    ; The `block` parameter is not used by any visible line.
    ; Overwrites eax, ebx, ecx, edx, esi, xmm2, xmm3, xmm4.
    ; NOTE(review): the labels listed in `local` are jump targets, but neither
    ; their definitions nor the macro braces are visible in this copy, and e2
    ; is used without being declared here. Block boundaries below are
    ; therefore unconfirmed.
    macro strfind elements,block
    local l1,l2,l3,l4,l5,s1,s2,s3,e1
    movdqa xmm2,xmm1
    mov edx,4
    sub edx,edi ; edx = number of key dwords to fold
    xor eax,eax
    ; fold one key dword: eax = (eax:ecx << 16 high part) + low 16 bits of ecx
    movd ecx,xmm2
    shld eax,ecx,16
    and ecx,0xffff
    add eax,ecx
    psrldq xmm2,4
    dec edx
    jnz s2
    and eax,0xffff
    xor esi,esi
    shl eax,2 ; eax = byte offset of the bucket slot
    movd xmm2,eax ; keep the offset and table address across calls
    movd xmm3,ebx
    cmp dword[ebx+eax],0
    jne l3
    ; allocate
    mov ebx,1
    xor eax,eax
    lock cmpxchg dword[sema],ebx ; test and set
    and eax,eax
    jnz s1
    add esi,28 ; requested size grows by a 4-byte header plus one 24-byte entry
    movd eax,xmm2
    movd ebx,xmm3
    ccall realloc,dword[ebx+eax],esi ; realloc is not thread safe
    lock and dword[sema],0 ; reset
    mov esi,eax
    and esi,esi
    jz e2
    movd eax,xmm2
    movd ebx,xmm3
    cmp dword[ebx+eax],0
    mov dword[ebx+eax],esi ; mov leaves the flags of the cmp intact
    jne l2
    mov esi, dword[ebx+eax]
    mov dword[esi],0 ; fresh bucket: entry count = 0
    mov ebx,dword[ebx+eax]
    add ebx,4 ; ebx = first entry
    mov eax,dword[ebx-4]
    imul eax,24 ; eax = offset of the first free entry
    mov dword[ebx+eax],0
    movd [ebx+eax+4],xmm0
    movdqu [ebx+eax+8],xmm1
    inc dword[elements]
    inc dword[ebx-4]
    jmp e1
    mov ebx,dword[ebx+eax]
    add ebx,4
    xor eax,eax

    mov esi,dword[ebx-4]
    imul esi,24
    cmp eax,esi
    jge l1 ; we need to reallocate
    ; compare the stored 16-byte key with xmm1; esi becomes 0 on a match
    movdqu xmm4,[ebx+eax+8]
    pcmpeqb xmm4,xmm1
    pmovmskb esi,xmm4
    xor si,0xffff
    jz e1

    add eax,24
    jmp l4
    lea eax,[ebx+eax]

    ; find data, elements, block, srchstr
    ; Binary search over a table of `block`-byte records that starts with a
    ; dword record count, followed by an insert of the record at srchstr
    ; (its first two dwords are copied: count and string pointer).
    ; The insert shifts the tail of the table with strncpy in dir=1 mode,
    ; then increments both the table's own count and dword[elements].
    ; The error path visible at the end prints err2 through printf and exits
    ; with status -1.
    ; Overwrites eax, ebx, ecx, edx, esi, edi.
    ; NOTE(review): l1, l2, e1, e2 and e3 are declared and used as jump
    ; targets, but neither their definitions nor the macro braces are visible
    ; in this copy, so the search loop structure cannot be confirmed.
    macro find data,elements,block,srchstr
    ; binary search and insert

    local l1,l2,e1,e2,e3
    pushd data ; dword[esp] = table address for the rest of the macro
    mov ecx,srchstr
    mov ebx,dword[esp]
    add ebx,4 ; ebx = first record
    mov eax,dword[ebx-4] ; eax = record count
    lea edx,[ebx+eax*block] ; edx = one past the last record

    if 0
    ccall printf,fmt1,dword[ebx-4]
    end if

    and eax,eax
    jz e1
    cmp edx,ebx
    jle e1
    shr eax,1
    mov esi,dword[ecx]
    cmp dword[ebx+eax*block],esi
    jle l1
    and eax,eax
    jnz l2
    inc eax
    lea ebx, [ebx+eax*block]
    jmp l1
    mov eax,dword[esp]
    add eax,4
    lea edx,[eax-4]
    mov edx,dword[edx]
    lea eax, [eax+edx*block]
    mov edx, eax
    add edx,block
    dec edx
    sub eax,ebx ; eax = bytes from the insert position to the table end
    jl e2
    push ecx
    lea ecx, [edx-block]

    if 0
    ccall printf,fmt5,eax,dword[ecx]
    end if

    strncpy edx,ecx,eax,1
    pop ecx

    if 0
    ccall printf,fmt5,eax,ecx
    end if

    mov esi,dword[ecx]
    mov dword [ebx],esi ; count
    mov esi,dword[ecx+4]
    mov dword [ebx+4],esi ;ptrstring
    mov eax,dword[esp]
    inc dword[eax]
    inc dword[elements]
    xor eax,eax
    jmp e3
    ccall printf,fmt,err2
    sys_exit -1
    pop ecx
    lea eax,[ebx+eax*block]

    ; print_strs ptr, size
    ; Prints every record of the table at ptr: the record's string is
    ; converted back to letters in lngbuf (unpack_str, `size` bytes) and
    ; printed through printf with format fmt together with a double computed
    ; on the x87 stack from the record count, 100 and (sdta + 1 - size).
    ; Table layout as used here: dword record count, then 8-byte records
    ; { dword count, dword string pointer }.
    ; Overwrites eax, ebx, ecx, edx, esi, edi and the x87 stack.
    ; NOTE(review): l1 and e1 are declared and used as jump targets, but
    ; neither their definitions nor the macro braces are visible in this
    ; copy. No instruction popping the x87 stack after fstp is visible
    ; either - confirm against the original source.
    macro print_strs ptr,size
    local l1,e1
    mov esi,size
    mov ebx,ptr
    mov ecx,dword[ebx] ; ecx = record count
    cmp ecx,0
    jz e1
    mov edx,dword[sdta]
    inc edx
    sub edx,esi ; edx = sdta + 1 - size
    push edx
    fild dword[esp]
    mov dword[esp],100
    fild dword[esp]
    add ebx,4
    add esp,4
    push ecx ebx esi
    unpack_str lngbuf,dword[ebx+4],size
    pop esi
    mov byte[lngbuf+esi],0 ; terminate the string for printf
    pop ebx
    push ebx
    fild dword[ebx]
    fmul st0,st1
    fdiv st0,st2
    sub esp,8
    fstp qword[esp] ; the double becomes the last printf argument
    ccall printf,fmt, lngbuf
    add esp,8
    pop ebx ecx
    add ebx,8
    dec ecx
    jnz l1

    ; merge data, data1, size, srchlen
    ; Walks the size/4 bucket slots starting at data1 and, for every entry of
    ; every non-empty bucket, looks the entry's key up in the table `data`
    ; with hashfind and adds the entry's count to the count found there.
    ; Also accumulates the per-bucket entry counts in `sum` and the number of
    ; non-empty buckets in `cnt`.
    ; hashtable.elements is decremented after every hashfind.
    ; Overwrites the registers used by hashfind plus ebx, ecx, esi.
    ; NOTE(review): l1, l2, l3, t1 and e1 are declared and some are used as
    ; jump targets, but neither their definitions nor the macro braces are
    ; visible in this copy.
    macro merge data,data1,size,srchlen
    local l1,l2,l3,t1,e1
    mov ecx,size/4
    cmp ecx,0
    jz e1
    mov ebx,data1
    cmp dword[ebx],0 ; empty bucket slot?
    jz l3
    push ebx ecx
    mov ebx,dword[ebx]
    mov ecx,dword[ebx] ; ecx = entries in this bucket

    if 0
    ccall printf,fmt6,ecx,dword[ebx+4]
    end if

    add ebx,4
    add dword[sum],ecx
    inc dword[cnt]
    push ebx ecx
    pxor xmm1,xmm1
    hashfind data,hashtable.elements,8,dword[ebx+4],srchlen
    dec dword[hashtable.elements]

    pop ecx ebx
    mov esi, dword[ebx]
    add dword[eax],esi ; add this entry's count to the entry hashfind returned
    add ebx,24
    dec ecx
    jnz l2
    pop ecx ebx
    add ebx,4
    dec ecx
    jnz l1

    ; frequencies size
    ; Counts all substrings of length `size` in the packed input (dta, sdta)
    ; using four tasks created with sys_clone.
    ; Five dwords are pushed and copied below the top of each task stack
    ; (tstck, tstck1, tstck2, tstck3). From the stack top downwards they are:
    ;   -1*4 hashtable.data   -2*4 hashtable.elements   -3*4 input pointer
    ;   -4*4 loop count       -5*4 size
    ; Tasks 1..3 get their input pointer advanced by 1/2/3, their loop count
    ; reduced by 1/2/3 and their table pointer offset by 0x40000/0x80000/
    ; 0xc0000. Each task steps through the input 4 positions at a time,
    ; runs hashfind and increments the returned counter.
    ; The creating task waits for all four with sys_wait; each created task
    ; adds its private element count to hashtable.elements and exits.
    ; NOTE(review): the task count (4), the stride (4) and the three table
    ; offsets are hard-coded here and in the callers that merge the tables.
    ; NOTE(review): l1, l2, l3 and e1 are declared and used as jump targets,
    ; but neither their definitions nor the macro braces are visible in this
    ; copy.
    macro frequencies size
    local l1,l2,l3,e1
    mov edx,size
    mov ebx,dword[dta]
    mov ecx,dword[sdta]
    inc ecx
    sub ecx,edx ; loop count
    pushd dword[hashtable.data]
    pushd dword[hashtable.elements]
    push ebx ecx edx
    ; seed every task stack with the same five dwords
    strncpy tstck-5*4,esp,5*4,0
    strncpy tstck1-5*4,esp,5*4,0
    strncpy tstck2-5*4,esp,5*4,0
    strncpy tstck3-5*4,esp,5*4,0

    ; per-task adjustments: loop count, input pointer, table pointer
    sub dword[tstck1-4*4],1
    add dword[tstck1-3*4],1
    add dword[tstck1-1*4],0x40000
    sub dword[tstck2-4*4],2
    add dword[tstck2-3*4],2
    add dword[tstck2-1*4],0x80000
    sub dword[tstck3-4*4],3
    add dword[tstck3-3*4],3
    add dword[tstck3-1*4],0xc0000

    sys_clone tstck ;1
    and eax,eax
    jz l1 ; eax = 0: running in the new task
    push eax
    sys_clone tstck1;2
    and eax,eax
    jz l1
    push eax
    sys_clone tstck2;3
    and eax,eax
    jz l1
    push eax
    sys_clone tstck3;4
    and eax,eax
    jz l1
    sys_wait eax ;1
    pop eax
    sys_wait eax ;2
    pop eax
    sys_wait eax ;3
    pop eax
    sys_wait eax ;4
    pop edx ecx ebx esi esi ; drop the five dwords pushed at the top
    jmp e1
    mov ebp,esp
    sub esp,5*4 ; esp = start of the five seeded dwords

    if 0
    ccall printf,fmt1,ecx
    ccall printf,fmt1,edx
    ccall printf,fmt2,ebx
    end if

    pop edx ecx ebx ; size, loop count, input pointer
    push ebp
    mov ebp,esp ; [ebp+4] = element counter copy, [ebp+8] = table pointer
    pxor xmm1,xmm1
    ; push ebx ecx edx
    ; edx/ecx/ebx are parked in xmm5..xmm7 across hashfind
    movd xmm5,edx
    movd xmm6,ecx
    movd xmm7,ebx
    hashfind dword [ebp+8], ebp+4,8,ebx,edx
    ; pop edx ecx ebx
    movd edx,xmm5
    movd ecx,xmm6
    movd ebx,xmm7
    inc dword[eax]
    add ebx,4 ; interleave
    sub ecx,4
    jg l3
    mov esp,ebp
    pop ebp
    pop esi ; esi = this task's element count
    lock add dword[hashtable.elements],esi
    pop esi
    sys_exit 0


    ; frequencies1
    ; Walks the bucket table at hashtable.data and passes every entry of
    ; every non-empty bucket to `find`, which inserts it into sortedtable.
    ; ecx starts at hashtable.elements and is reduced by each bucket's entry
    ; count; the walk ends when it reaches zero.
    ; Overwrites the registers used by find plus ebx, ecx.
    ; NOTE(review): l1, l2, l3 and e1 are declared and used as jump targets,
    ; but neither their definitions nor the macro braces are visible in this
    ; copy. No visible line invokes this macro by name.
    macro frequencies1
    local l1,l2,l3,e1
    mov ecx,dword[hashtable.elements]
    cmp ecx,0
    jz e1
    mov ebx,dword[hashtable.data]
    cmp dword[ebx],0 ; empty bucket slot?
    jz l3
    push ebx ecx
    mov ebx,dword[ebx]
    mov ecx,dword[ebx] ; ecx = entries in this bucket

    if 0
    ccall printf,fmt6,ecx,dword[ebx+4]
    end if

    add ebx,4
    sub dword[esp],ecx ; was bug, ecx wasnt counted
    ; when raw had more than 1
    push ebx ecx
    find dword[sortedtable.data],sortedtable.elements,8,ebx
    pop ecx ebx
    add ebx,24
    dec ecx
    jnz l2
    pop ecx ebx
    and ecx,ecx ; no decrement here
    jz e1
    add ebx,4
    jmp l1

    ; mwrite_frequencies
    ; Body of the "print all frequencies for one length" routine; the length
    ; is read from [ebp+8] after the frame is set up.
    ; Steps: count with calc_frequencies, merge the three extra tables at
    ; hashtable.data + 0x40000/0x80000/0xc0000 into the first one, size and
    ; zero sortedtable, sort, print with print_strs, reset both element
    ; counters, free the buckets and print a newline.
    ; NOTE(review): the three merge calls mirror the per-task table offsets
    ; hard-coded in `frequencies`.
    ; NOTE(review): the macro braces are missing in this copy, and no return
    ; instruction follows the frame teardown - confirm against the original.
    macro mwrite_frequencies ; len
    push ebp
    mov ebp,esp
    calc_frequencies dword[ebp+8]
    mov esi,dword[hashtable.data]
    add esi,0x40000
    merge dword[hashtable.data],esi,SIZE,dword[ebp+8]
    mov esi,dword[hashtable.data]
    add esi,0x80000
    merge dword[hashtable.data],esi,SIZE,dword[ebp+8]
    mov esi,dword[hashtable.data]
    add esi,0xc0000
    merge dword[hashtable.data],esi,SIZE,dword[ebp+8]
    ; ccall printf,fmt1,dword[cnt]
    ; ccall printf,fmt1,dword[sum]
    ; ccall printf, fmt1, dword [hashtable.elements]
    initvector sortedtable.data,sortedtable.size,dword[hashtable.elements],24
    mov ebx,dword[hashtable.elements]
    imul ebx,6 ; 24 bytes per element = 6 dwords
    dwordnset dword[sortedtable.data],0,ebx
    sort_frequencies dword[ebp+8]
    print_strs dword[sortedtable.data],dword[ebp+8]
    ; ccall printf,fmt1,dword[sortedtable.elements]
    mov dword[sortedtable.elements],0
    mov dword [hashtable.elements],0
    freevector dword[hashtable.data],SIZE
    ccall printf,nl
    mov esp,ebp
    pop ebp

    ; mwrite_count
    ; Body of the "print the count of one given string" routine; after the
    ; frame is set up the length is read from [ebp+12] and the string address
    ; from [ebp+8].
    ; Steps: reset sum/cnt, count with calc_frequencies, merge the three
    ; extra tables at hashtable.data + 0x40000/0x80000/0xc0000, convert the
    ; query string into lngbuf with pack_str, look it up with hashfind,
    ; convert the stored key back into the caller's string buffer and print
    ; count and string with fmt5. Finally reset hashtable.elements and free
    ; the buckets.
    ; The hashfind length argument is edx, which pack_str leaves holding the
    ; number of bytes it converted.
    ; NOTE(review): the three merge calls mirror the per-task table offsets
    ; hard-coded in `frequencies`.
    ; NOTE(review): the macro braces are missing in this copy, and no return
    ; instruction follows the frame teardown - confirm against the original.
    macro mwrite_count ; len,str
    push ebp
    mov ebp,esp
    mov dword[sum],0
    mov dword[cnt],0
    calc_frequencies dword[ebp+12]
    ; ccall printf, fmt1, dword [hashtable.elements]
    mov esi,dword[hashtable.data]
    add esi,0x40000
    merge dword[hashtable.data],esi,SIZE,dword[ebp+12]
    mov esi,dword[hashtable.data]
    add esi,0x80000
    merge dword[hashtable.data],esi,SIZE,dword[ebp+12]
    mov esi,dword[hashtable.data]
    add esi,0xc0000
    merge dword[hashtable.data],esi,SIZE,dword[ebp+12]
    ; ccall printf,fmt1,dword[cnt]
    ; ccall printf,fmt1,dword[sum]
    pack_str lngbuf,dword[ebp+8],dword[ebp+12],0
    pxor xmm1,xmm1
    hashfind dword[hashtable.data],hashtable.elements,8,lngbuf,edx
    push eax ; keep the entry address across unpack_str
    unpack_str dword[ebp+8],dword[eax+4],dword[ebp+12]
    pop eax
    ccall printf, fmt5, dword [eax],dword[ebp+8]
    ; ccall printf, fmt1, dword [hashtable.elements]
    mov dword [hashtable.elements],0
    freevector dword[hashtable.data],SIZE
    mov esp,ebp
    pop ebp

    ; calc_frequencies size - call the cfrequencies routine with the
    ; substring length in edx.
    ; Macro body braces required by fasm are restored.
    macro calc_frequencies size
    {
    mov edx,size
    call cfrequencies
    }

    ; sort_frequencies size - call the sfrequencies routine.
    ; The size argument is accepted for symmetry with calc_frequencies but is
    ; not passed on.
    ; Macro body braces required by fasm are restored.
    macro sort_frequencies size
    {
    call sfrequencies
    }

    ; write_frequencies len - CDECL-style call of swrite_fruequencies with
    ; one dword argument (the substring length); the argument is removed
    ; from the stack afterwards.
    ; The callee name is kept exactly as spelled, since its definition is
    ; outside the lines visible here.
    ; Macro body braces required by fasm are restored.
    macro write_frequencies len
    {
    pushd len
    call swrite_fruequencies
    add esp,4
    }

    ; write_count len,str - CDECL-style call of swrite_count.
    ; len is pushed first and str second, so inside the callee's frame str
    ; is at [ebp+8] and len at [ebp+12]; both are removed from the stack
    ; afterwards.
    ; Macro body braces required by fasm are restored.
    macro write_count len,str
    {
    pushd len str
    call swrite_count
    add esp,8
    }

    ; File descriptor numbers passed to sys_read / sys_write.
    STDIN equ 0
    STDOUT equ 1
    STDERR equ 2
    ; Byte size of the filebuf / fileresbuf staging buffers used by `read`.
    fsize equ 4096
    ; Number of dword bucket slots per hash table; also the value passed to
    ; merge and freevector.
    SIZE equ 0x40000 ; play with size and mask in strfind
    ; Output format: ELF object file (the post links it with gcc -m32).
    format ELF

    ; Code section. Exports `main` and imports printf, perror, realloc and
    ; free from libc.
    ; Visible flow: build the xtbl translation table, read lines from STDIN
    ; until one starts with ">THREE", then pack every following line into the
    ; growing dta buffer (length in sdta) until an empty line or a line
    ; starting with '>' is read, allocate and zero the hash table, and print
    ; the 1- and 2-letter frequencies followed by the counts of five fixed
    ; strings.
    ; NOTE(review): every label definition is missing from this copy - the
    ; `main` entry point, the routines called as cfrequencies, sfrequencies,
    ; swrite_fruequencies and swrite_count, and the local targets l1, l2, e1,
    ; e2, e3 - as are any `ret` instructions. The grouping of the lines below
    ; into routines is therefore unconfirmed.
    section '.text' executable

    public main
    extrn printf
    extrn perror
    extrn realloc
    extrn free

    ; expands the counting macro with edx as the substring length
    frequencies edx
    ; NOTE(review): the fill count here is 0, so no bytes are stored by
    ; this strnset as written
    strnset xtbl,'A',0
    mov ebx ,xtbl
    ; letter -> 2-bit code (both cases)
    mov byte[ebx+'a'],0
    mov byte[ebx+'A'],0
    mov byte[ebx+'c'],1
    mov byte[ebx+'C'],1
    mov byte[ebx+'g'],2
    mov byte[ebx+'G'],2
    mov byte[ebx+'t'],3
    mov byte[ebx+'T'],3

    ; 2-bit code -> upper-case letter
    mov byte[ebx+0],'A'
    mov byte[ebx+1],'C'
    mov byte[ebx+2],'G'
    mov byte[ebx+3],'T'

    ; skip input until the ">THREE" header line
    getLine STDIN, buf, 256, 61
    movzx eax, byte [buf]
    and eax,eax
    jz e1 ; empty line
    strncmp buf,three,6,0
    and ecx,ecx ; ecx is the count left over from the compare
    jnz l2
    ; read one sequence line
    getLine STDIN, buf, 256, 61
    movzx eax, byte [buf]
    and eax,eax
    jz e1
    cmp eax,'>'
    je e1
    mov eax,edi ; edi = line length left by getLine
    push eax
    add eax, dword [sdta]
    ccall realloc,dword[dta], eax ; grow dta to hold the new line
    and eax,eax
    jz e2
    mov dword[dta],eax
    pop eax
    mov ebx, dword [sdta]
    add ebx, dword [dta] ; ebx = append position
    push eax
    pack_str ebx,buf,eax,0
    pop eax
    add dword[sdta],edx ; edx = bytes converted by pack_str
    jmp l1
    initvector hashtable.data,hashtable.size,SIZE,4
    dwordnset dword[hashtable.data],0,SIZE

    ; sys_write STDOUT, dword [dta], dword[sdta]

    ; ccall printf,fmt1,dword[sdta]
    cmp dword[sdta],0
    je e3 ; no sequence data was read

    write_frequencies 1
    write_frequencies 2

    write_count 3,lngstr4
    write_count 4,lngstr3
    write_count 6,lngstr2
    write_count 12,lngstr1
    ; write_count 17,lngstr
    write_count 18,lngstr
    ; write_count 19,lngstr
    ; write_count 64,lngstr
    xor eax,eax
    ; allocation-failure path: report err1 and exit with status -1
    ccall perror, err1
    sys_exit -1

    ; Initialised data: printf format strings, error messages, the query
    ; strings passed to write_count, the ">THREE" header text, and the
    ; global state used by the macros.
    section '.data' writeable

    align 4
    ; printf formats; fmt is used by print_strs and by the error path in find
    fmt db "%s %.3f",0xa,0
    fmt1 db "%u",0xa,0
    fmt2 db "%p",0xa,0
    fmt3 db "%c",0xa,0
    fmt4 db "%s %u",0xa,0
    fmt5 db "%u",9,"%s",0xa,0
    fmt6 db "%u %u",0xa,0
    fmt7 db "%x",0xa,0
    err1 db "realloc failed",0
    err2 db "index error",0
    ; query strings for write_count (lengths 18, 12, 6, 4, 3)
    lngstr db "gGtattTtaatttatagt",0
    lngstr1 db "ggtATTttaatt",0
    lngstr2 db "gGtAtt",0
    lngstr3 db "gGta",0
    lngstr4 db "GgT",0
    ; header text compared against each input line; 6 bytes, not
    ; zero-terminated (the nl bytes follow directly)
    three db ">THREE"
    nl db 0xa,0

    align 4
    fptr dd 0 ; read position inside filebuf; 0 makes `read` refill
    fend dd 0 ; one past the last valid byte in filebuf
    dta dd 0 ; pointer to the packed sequence buffer
    sdta dd 0 ; number of bytes stored in dta
    hashtable vector 0,0
    sortedtable vector 0,0
    align 4
    sema dd 0 ; test-and-set flag taken around realloc in strfind

    ; Uninitialised data: line and file buffers, the translation table, a
    ; scratch string buffer, the four task stacks and two counters.
    section '.bss' writeable

    align 4
    buf rb 256 ; line buffer filled by getLine
    align 4
    fileresbuf rb fsize ; copy of the previous filebuf contents (see `read`)
    filebuf rb fsize ; staging buffer filled by sys_read
    align 4
    xtbl rb 256 ; byte translation table used by to_num / to_char
    align 4
    lngbuf rb 128 ; scratch buffer for packed / unpacked strings
    ; Task stacks: each bstckN/tstckN pair reserves 4096 bytes, and tstckN is
    ; the address handed to sys_clone and used as the base for the seeded
    ; arguments in `frequencies`.
    ; NOTE(review): tstckN is the last byte of its area (offset 4095), not a
    ; 4-byte-aligned address one past the end - confirm this is intended.
    align 4
    bstck rb 4095
    tstck rb 1
    align 4
    bstck1 rb 4095
    tstck1 rb 1
    align 4
    bstck2 rb 4095
    tstck2 rb 1
    align 4
    bstck3 rb 4095
    tstck3 rb 1
    sum rd 1 ; running total updated by merge
    cnt rd 1 ; non-empty bucket counter updated by merge

    current job title: senior software engineer
    skills: x86 assembler,c++,c,rust,go,nim,haskell...

    press any key to continue or any other to quit...

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)