-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy path24_UTF8_String_Indirect_Access.asm
More file actions
92 lines (78 loc) · 3.38 KB
/
24_UTF8_String_Indirect_Access.asm
File metadata and controls
92 lines (78 loc) · 3.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
; ==============================================================
; Core features:
; 1. Handles 1-4 byte UTF-8 characters (including emojis).
; 2. Detects malformed UTF-8 (optional error handling).
; ==============================================================
; --------------------------------------------------------------
; Syntax: NASM (Intel).  Target: x86-64 Linux, standalone (no libc).
;
; Walks a table of dword offsets (inbox_addresses). Each offset
; selects a NUL-terminated UTF-8 string inside utf8_data. Every
; character of every string is written to `output` as one dword
; holding the character's encoded bytes packed big-endian:
;     'A'          -> 0x00000041
;     C3 A9        -> 0x0000C3A9
;     E2 82 AC     -> 0x00E282AC
;     F0 9F 98 80  -> 0xF09F9880
; (This is the packed encoding, not the decoded code point.)
;
; Malformed input: a stray continuation byte, a lead byte >= 0xF8,
; or a sequence whose continuation bytes are missing/invalid causes
; the lead byte to be skipped; scanning resumes at the next byte.
; Only the structural form is validated - overlong encodings,
; surrogates and values above U+10FFFF are not rejected.
;
; The offsets in inbox_addresses are trusted (no bounds check).
;
; Register roles inside _start:
;   rsi = next entry in inbox_addresses
;   r10 = one past the last entry in inbox_addresses
;   rdi = base address of utf8_data
;   rbx = lead byte of the character being examined
;   r9  = next unread byte of the current character
;   edx = lead byte, then the packed bytes of the character
;   ecx = continuation bytes still expected
;   eax, r8d = scratch
; --------------------------------------------------------------
default rel

SYS_EXIT        equ 60                  ; Linux x86-64 exit(2)

UTF8_ASCII_END  equ 0x80                ; below this: 0xxxxxxx (1 byte)
UTF8_LEAD2_MIN  equ 0xC0                ; 110xxxxx starts a 2-byte sequence
UTF8_LEAD3_MIN  equ 0xE0                ; 1110xxxx starts a 3-byte sequence
UTF8_LEAD4_MIN  equ 0xF0                ; 11110xxx starts a 4-byte sequence
UTF8_LEAD_END   equ 0xF8                ; 0xF8 and above never start a sequence
UTF8_CONT_MASK  equ 0xC0                ; top two bits of a byte
UTF8_CONT_TAG   equ 0x80                ; continuation bytes are 10xxxxxx

section .text
global _start

_start:
        lea     rsi, [inbox_addresses]
        lea     r10, [inbox_addresses_end]
        lea     rdi, [utf8_data]

.next_string:
        cmp     rsi, r10
        jae     .exit                   ; every listed string has been processed
        mov     eax, [rsi]              ; eax = offset (upper half of rax cleared)
        add     rsi, 4
        lea     rbx, [rdi + rax]        ; rbx -> first byte of the string

.next_char:
        movzx   edx, byte [rbx]         ; edx = lead byte
        test    edx, edx
        jz      .next_string            ; NUL terminator: string finished
        lea     r9, [rbx + 1]           ; r9 -> byte following the lead byte

        cmp     edx, UTF8_ASCII_END
        jb      .emit                   ; single-byte character, nothing to append
        cmp     edx, UTF8_LEAD2_MIN
        jb      .invalid_utf8           ; 10xxxxxx cannot start a character

        mov     ecx, 1
        cmp     edx, UTF8_LEAD3_MIN
        jb      .read_tail
        mov     ecx, 2
        cmp     edx, UTF8_LEAD4_MIN
        jb      .read_tail
        mov     ecx, 3
        cmp     edx, UTF8_LEAD_END
        jae     .invalid_utf8           ; 11111xxx is never valid

.read_tail:
        ; Invariant: ecx > 0 continuation bytes remain, r9 -> the next one.
        movzx   eax, byte [r9]
        mov     r8d, eax
        and     r8d, UTF8_CONT_MASK
        cmp     r8d, UTF8_CONT_TAG
        jne     .invalid_utf8           ; NUL fails this test too, so a truncated
                                        ; sequence never reads past the terminator
        shl     edx, 8
        or      edx, eax                ; append the byte below those already packed
        inc     r9
        dec     ecx
        jnz     .read_tail

.emit:
        mov     [output], edx           ; full dword store: no stale upper bytes
        mov     rbx, r9                 ; continue after the consumed bytes
        jmp     .next_char

.invalid_utf8:
        inc     rbx                     ; drop only the offending lead byte
        jmp     .next_char

.exit:
        mov     eax, SYS_EXIT
        xor     edi, edi                ; exit status 0
        syscall

section .rodata
; --- Example data ---
; Byte offsets, relative to utf8_data, of the start of each string.
inbox_addresses:
        dd      0, 2, 5, 9
inbox_addresses_end:

utf8_data:
        db      'A', 0                          ; offset 0: "A" (1 byte)
        db      0xC3, 0xA9, 0                   ; offset 2: U+00E9 (2 bytes)
        db      0xE2, 0x82, 0xAC, 0             ; offset 5: U+20AC (3 bytes)
        db      0xF0, 0x9F, 0x98, 0x80, 0       ; offset 9: U+1F600 (4 bytes)

section .data
        align   4
output: dd      0                       ; most recently emitted character