diff --git a/src/runtime/runtime.asm b/src/runtime/runtime.asm
index c050e9b..77261c6 100644
--- a/src/runtime/runtime.asm
+++ b/src/runtime/runtime.asm
@@ -3,18 +3,13 @@ extern main
 
 section .text
 _start:
-    ; The args already match our array structure, so we pass the result directly
     mov rdi, rsp
-
-    call main ; main returns int in rax
-
-    ; Exit with main's return value
-    mov rdi, rax ; exit code
-    mov rax, 60 ; syscall: exit
+    call main
+    mov rdi, rax
+    mov rax, 60
     syscall
 
 global nub_strcmp
-
 nub_strcmp:
     xor rdx, rdx
 .loop:
@@ -32,4 +27,74 @@ nub_strcmp:
 .equal:
     mov rax, 1
     ret
-
\ No newline at end of file
+
+; TODO: This is AI-generated and should be re-implemented in the future
+global nub_memset
+nub_memset:
+    ; Save original destination for return value
+    mov rax, rdi
+
+    ; Handle zero length
+    test rdx, rdx
+    jz .done
+
+    ; For small sizes, use a simple byte-by-byte loop
+    cmp rdx, 16
+    jb .byte_loop
+
+    ; Prepare value for bulk setting
+    ; Replicate the byte across all 8 bytes of rsi
+    and rsi, 0xFF       ; Ensure only the low byte is used
+    mov rcx, rsi        ; rcx = byte value
+    shl rsi, 8
+    or rsi, rcx         ; rsi = byte | (byte << 8)
+    mov rcx, rsi
+    shl rsi, 16
+    or rsi, rcx         ; rsi = 4 copies of byte
+    mov rcx, rsi
+    shl rsi, 32
+    or rsi, rcx         ; rsi = 8 copies of byte
+
+    ; Align to 8-byte boundary if needed
+    mov rcx, rdi
+    and rcx, 7          ; rcx = misalignment (rdi mod 8), also sets ZF
+    jz .aligned
+
+    ; Fill bytes until aligned
+    neg rcx
+    add rcx, 8          ; rcx = 8 - misalignment = bytes to fill
+    cmp rcx, rdx
+    jbe .align_loop
+    mov rcx, rdx        ; Don't go past end
+.align_loop:
+    mov [rdi], sil
+    inc rdi
+    dec rdx
+    dec rcx
+    jnz .align_loop
+
+.aligned:
+    ; Fill 8 bytes at a time
+    mov rcx, rdx
+    shr rcx, 3          ; rcx = number of 8-byte chunks
+    jz .remainder
+.quad_loop:
+    mov [rdi], rsi
+    add rdi, 8
+    dec rcx
+    jnz .quad_loop
+
+    ; Handle remainder bytes
+    and rdx, 7          ; rdx = remaining bytes
+.remainder:
+    test rdx, rdx
+    jz .done
+
+.byte_loop:
+    mov [rdi], sil
+    inc rdi
+    dec rdx
+    jnz .byte_loop
+
+.done:
+    ret
\ No newline at end of file
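
Note on the TODO above: one possible re-implementation is built on rep stosb, which is much shorter and fast on CPUs with enhanced string operations (ERMSB). A minimal sketch, assuming nub_memset keeps the register convention visible in this patch (rdi = destination, rsi = fill byte, rdx = count, original destination returned in rax); the label nub_memset_simple is a hypothetical name used for illustration, not part of the patch:

global nub_memset_simple
nub_memset_simple:
    mov r8, rdi         ; save destination for the return value
    mov rcx, rdx        ; rcx = byte count consumed by rep
    mov rax, rsi        ; al = fill byte (rep stosb stores al)
    rep stosb           ; store al to [rdi] rcx times, advancing rdi
    mov rax, r8         ; return the original destination
    ret

The System V ABI guarantees the direction flag is clear on function entry, so no cld is needed. If the current shl/or replication chain is kept instead, the same 8-byte splat can be done with one multiply: zero-extend the byte (movzx rsi, sil), load 0x0101010101010101 into a scratch register, and imul by it.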