• Re: On the ugly Heapsort implementation which becomes beautiful in

    From Kerr-Mudd, John@21:1/5 to siarczek83@gmail.com on Thu Aug 25 19:32:34 2022
    XPost: comp.lang.forth

    On Thu, 25 Aug 2022 10:17:26 -0700 (PDT)
    MichaƂ Kasprzak <siarczek83@gmail.com> wrote:

    Look at the xor r10d, r10d instruction in the heapsort.asm file.
    This known technique of limiting code length by working on the lower 32 bits of 64-bit registers, which works great for rax-rdx registers, unfortunately does not work for extended registers r8-r15. Even the 32-bit xor r10d, r10d requires a prefix byte
    and the entire instruction is also 3 bytes long.
    It's a pity, but we can swap the roles of the r10 and rax registers in the sorting loop part of the code to be able to write xor eax, eax (which is 2 bytes long) instead of xor r10d, r10d and get 1 byte shorter code :)
    And that's exactly what I did in the patched version of heapsort.asm below. Now the revised assembled code is 181 bytes long. It is really short! When you honestly compare this length with the numbers you saw earlier in this thread remember that this code has a sift code inlined in two places for quicker operation.

    Who next of you will shorten this code more or find some improvement to this code? :)


    You might try an asm group
    I'll try a xpost to clax



    ---------- start of file: heapsort.asm, cut below this line ----------
    BITS 64
    ; rdi = array's address
    ; rsi = array's number of elements = n
    heapsort: mov rax, rsi
    shr rax, 1
    ; nothing to sort if there is 0 or 1 element in the array
    jz the_end
    ; now we are sure that always n>=2
    ; r10 = index of the last parent node
    ; r10 is also the counter of the heap building loop
    lea r10, [rax-1]
    ; rbx = index of the left child of the last parent node
    ; rbx is also index of the left child of r10 node in the loop
    lea rbx, [rax+rax-1]
    ; heap building loop
    ; sift the counter node's value stored in r9 to the proper position
    ; r9 = array[r10]
    build_loop: mov r9, qword [rdi+8*r10]
    ; r11 is index of target place for sifted values
    ; we start sifting from the counter node's index
    mov r11, r10
    ; rax is the index of a current child of r11 node
    ; we start r11 with the index of the left child
    mov rax, rbx
    jmp sift1_while
    ; we will find the child with bigger value and store its index in rax
    ; we will load this bigger value to rdx
    ; rcx holds the index of the right child
    sift1: lea rcx, [rax+1]
    mov rdx, qword [rdi+8*rax]
    ; note that the right child may not exist
    ; then we consider the left child to be bigger
    cmp rcx, rsi
    jnb sift1_check
    ; we load the right child node's value to r8
    mov r8, qword [rdi+8*rcx]
    ; if the right child node's value is bigger than the left child's
    cmp r8, rdx
    jle sift1_check
    ; then we will proceed the right child node
    mov rdx, r8
    mov rax, rcx
    ; compare the values of the current child node with the sifting node sift1_check: cmp r9, rdx
    ; if the sifting node's value is bigger then we end sifting
    jnl build_next
    ; we store the child's value into the parent node's place
    mov qword [rdi+8*r11], rdx
    ; we make the child's node the new parent's node
    mov r11, rax
    lea rax, [rax+rax+1]
    ; do sift if there is still at least one child
    sift1_while: cmp rax, rsi
    jb sift1
    ; we store sifted value into the target place and repeat the loop
    build_next: mov qword [rdi+8*r11], r9
    sub rbx, 2
    sub r10, 1
    jnc build_loop
    ; the sorting loop starts with n-1
    ; r10 will hold the counter of the sorting loop
    lea r10, [rsi-1]
    ; the sorting loop
    ; rdx = array[0]
    ; r9 = array[counter]
    sort_loop: mov rdx, qword [rdi]
    mov r9, qword [rdi+8*r10]
    ; we will sift the node rax=0 which value is in r9
    xor eax, eax
    ; we store array[counter]=array[0]
    mov qword [rdi+8*r10], rdx
    ; now we will store in rdx the left child node's index
    mov edx, 1
    jmp sift2_while
    ; sift2 works like sift1 with changed registers
    ; we will find the child with bigger value and store its index in rdx
    ; we will load this bigger value to rsi
    ; rcx holds the index of the right child
    sift2: lea rcx, [rdx+1]
    mov rsi, qword [rdi+8*rdx]
    ; note that the right child may not exist
    ; then we consider the left child to be bigger
    cmp rcx, r10
    jnb sift2_check
    ; we load the right child node's value to r8
    mov r8, qword [rdi+8*rcx]
    ; if the right child node's value is bigger than the left child's
    cmp r8, rsi
    jle sift2_check
    ; then we will proceed the right child node
    mov rsi, r8
    mov rdx, rcx
    ; compare the values of the current child node with the sifting node sift2_check: cmp r9, rsi
    ; if the sifting node's value is bigger then we end sifting
    jnl sort_next
    ; we store the child's value into the parent node's place
    mov qword [rdi+8*rax], rsi
    ; we make the child's node the new parent's node
    mov rax, rdx
    lea rdx, [rdx+rdx+1]
    ; do sift if there is still at least one child
    sift2_while: cmp rdx, r10
    jb sift2
    ; we store sifted value into the target place
    sort_next: mov qword [rdi+8*rax], r9
    ; next step of the sorting loop
    sub r10, 1
    ; the sorting loop ends with 1
    jnz sort_loop
    the_end:
    ---------- end of file: heapsort.asm, cut above this line ----------


    --
    Bah, and indeed Humbug.

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)