Page 1 of 1

Parsing HTML in x64 assembly - Part IV

#1 Martyn.Rae  Icon User is offline

  • The programming dinosaur
  • member icon

Reputation: 545
  • View blog
  • Posts: 1,420
  • Joined: 22-August 09

Posted 09 July 2017 - 09:21 PM

Where are we so far?

So far, the previous three tutorials in this series have concerned themselves with producing a tree structure of nodes representing the HTML we have just parsed. On its own, that is not particularly useful, as we have not yet had the ability to exploit the fruits of our labour! So where do we want to go with this? I don't know about you, the reader, but I hate writing code just for the sake of writing code.

Parsing the tag attributes

As it stands, the tag attributes are simply parsed as a single string. Let's parse the attributes so we end up with a table consisting of 'key=value' pairs. To do this, we need to modify the node structure like so:-

; One parsed HTML element. Every field is a single 8-byte slot.
node                                struct
    next_node                       qword     ?         ; link to the following node (presumably the next sibling - confirm)
    previous_node                   qword     ?         ; link to the preceding node (presumably the previous sibling - confirm)
    first_child                     qword     ?         ; head of this node's child list (presumed from the name)
    last_child                      qword     ?         ; tail of this node's child list (presumed from the name)
    html_tag                        qword     ?         ; index of the tag within the html_tags table
    html_attribute_first            qword     ?         ; first attribute record of this tag, 0 if none
    html_attribute_last             qword     ?         ; last attribute record of this tag
    html_text_string                qword     ?         ; pointer to the text that follows the start tag
node                                ends



We also need to create an attribute structure to describe how we are going to hold the key and value items like so:-

; One key/value attribute of a tag, kept on a doubly linked list per node.
attribute                           struct
    next_attribute                  qword     ?         ; following attribute record, 0 at the end of the list
    previous_attribute              qword     ?         ; preceding attribute record
    attribute_key                   qword     ?         ; pointer to the key text
    attribute_value                 qword     ?         ; pointer to the value text; left unset when there is no '=value'
attribute                           ends



The modified Code

The modified code below works, but only if the HTML being parsed is valid. To check that, I use the W3C.org web site (the link to it did not survive in this copy of the post).

                                    title     Parse_Html
                                    option    casemap:none

; One key/value attribute of a tag. Parse_Html carves these records out of
; the workspace and chains them onto the owning node's
; html_attribute_first / html_attribute_last list.
attribute                           struct
    next_attribute                  qword     ?         ; following attribute record, 0 at the end of the list
    previous_attribute              qword     ?         ; preceding attribute record
    attribute_key                   qword     ?         ; pointer to the key text (the byte after its length counter)
    attribute_value                 qword     ?         ; pointer to the value text; left unset when there is no '=value'
attribute                           ends

; One parsed HTML element. Parse_Html carves these records out of the
; workspace; every field is a single 8-byte slot.
node                                struct
    next_node                       qword     ?         ; link to the following node (presumably the next sibling - confirm)
    previous_node                   qword     ?         ; link to the preceding node (presumably the previous sibling - confirm)
    first_child                     qword     ?         ; head of this node's child list (presumed from the name)
    last_child                      qword     ?         ; tail of this node's child list (presumed from the name)
    html_tag                        qword     ?         ; index of the matching record in html_tags
    html_attribute_first            qword     ?         ; first attribute record of this tag, 0 if none
    html_attribute_last             qword     ?         ; last attribute record of this tag
    html_text_string                qword     ?         ; workspace pointer to the text that follows the start tag
node                                ends

                                    .data
;
; html_tags - table of the HTML tag names recognised by Parse_Html.
;
; Each record is five qwords (40 bytes):
;      +0   length of the tag name in characters
;      +8   mask applied to the first 8 bytes of the candidate tag text
;      +16  mask applied to the following 8 bytes of the candidate tag text
;      +24  first 8 characters of the tag name
;      +32  remaining characters of the tag name (only used by the 10 character tags)
;
; The names are spelled back-to-front in the source so that the assembled
; qword matches the tag text as Parse_Html loads it from memory (first
; character in the lowest byte).
;
; The number in the trailing comment is the record index. That index is
; what Parse_Html stores in node.html_tag and what it compares against for
; the <script>, <style> and self terminating special cases, so records
; must NOT be reordered or inserted in the middle.
;
; The table ends with an all-zero record: the search loops in Parse_Html
; stop when they read a length of zero, so without this sentinel an
; unknown tag would make the search run on into the data that follows.
;
html_tags                           dq        1,  000000000000000FFH, 00000000000000000H, "a",        0          ; 0 = a
                                    dq        1,  000000000000000FFH, 00000000000000000H, "b",        0          ; 1 = b
                                    dq        1,  000000000000000FFH, 00000000000000000H, "i",        0          ; 2 = i
                                    dq        1,  000000000000000FFH, 00000000000000000H, "p",        0          ; 3 = p
                                    dq        1,  000000000000000FFH, 00000000000000000H, "q",        0          ; 4 = q
                                    dq        1,  000000000000000FFH, 00000000000000000H, "s",        0          ; 5 = s
                                    dq        1,  000000000000000FFH, 00000000000000000H, "u",        0          ; 6 = u
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "rb",       0          ; 7 = br
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "dd",       0          ; 8 = dd
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "ld",       0          ; 9 = dl
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "td",       0          ; 10 = dt
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "me",       0          ; 11 = em
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "1h",       0          ; 12 = h1
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "2h",       0          ; 13 = h2
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "3h",       0          ; 14 = h3
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "4h",       0          ; 15 = h4
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "5h",       0          ; 16 = h5
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "6h",       0          ; 17 = h6
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "rh",       0          ; 18 = hr
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "il",       0          ; 19 = li
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "lo",       0          ; 20 = ol
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "br",       0          ; 21 = rb
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "pr",       0          ; 22 = rp
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "tr",       0          ; 23 = rt
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "dt",       0          ; 24 = td
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "ht",       0          ; 25 = th
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "rt",       0          ; 26 = tr
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "lu",       0          ; 27 = ul
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "idb",      0          ; 28 = bdi
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "odb",      0          ; 29 = bdo
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "loc",      0          ; 30 = col
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "led",      0          ; 31 = del
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "nfd",      0          ; 32 = dfn
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "vid",      0          ; 33 = div
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "gmi",      0          ; 34 = img
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "sni",      0          ; 35 = ins
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "dbk",      0          ; 36 = kbd
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "pam",      0          ; 37 = map
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "van",      0          ; 38 = nav
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "erp",      0          ; 39 = pre
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "ctr",      0          ; 40 = rtc
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "bus",      0          ; 41 = sub
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "pus",      0          ; 42 = sup
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "rav",      0          ; 43 = var
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "rbw",      0          ; 44 = wbr
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "gvs",      0          ; 45 = svg
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "rbba",     0          ; 46 = abbr
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "aera",     0          ; 47 = area
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "esab",     0          ; 48 = base
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "ydob",     0          ; 49 = body
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "etic",     0          ; 50 = cite
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "edoc",     0          ; 51 = code
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "atad",     0          ; 52 = data
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "mrof",     0          ; 53 = form
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "daeh",     0          ; 54 = head
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "lmth",     0          ; 55 = html
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "knil",     0          ; 56 = link
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "niam",     0          ; 57 = main
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "kram",     0          ; 58 = mark
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "unem",     0          ; 59 = menu
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "atem",     0          ; 60 = meta
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "ybur",     0          ; 61 = ruby
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "pmas",     0          ; 62 = samp
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "naps",     0          ; 63 = span
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "emit",     0          ; 64 = time
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "htap",     0          ; 65 = path
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "edisa",    0          ; 66 = aside
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "oidua",    0          ; 67 = audio
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "debme",    0          ; 68 = embed
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "tupni",    0          ; 69 = input
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "lebal",    0          ; 70 = label
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "retem",    0          ; 71 = meter
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "marap",    0          ; 72 = param
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "llams",    0          ; 73 = small
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "elyts",    0          ; 74 = style
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "elbat",    0          ; 75 = table
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "ydobt",    0          ; 76 = tbody
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "tooft",    0          ; 77 = tfoot
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "daeht",    0          ; 78 = thead
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "eltit",    0          ; 79 = title
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "kcart",    0          ; 80 = track
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "oediv",    0          ; 81 = video
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "retnec",   0          ; 82 = center
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "nottub",   0          ; 83 = button
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "savnac",   0          ; 84 = canvas
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "golaid",   0          ; 85 = dialog
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "erugif",   0          ; 86 = figure
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "retoof",   0          ; 87 = footer
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "redaeh",   0          ; 88 = header
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "puorgh",   0          ; 89 = hgroup
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "emarfi",   0          ; 90 = iframe
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "negyek",   0          ; 91 = keygen
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "dnegel",   0          ; 92 = legend
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "tcejbo",   0          ; 93 = object
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "noitpo",   0          ; 94 = option
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "tuptuo",   0          ; 95 = output
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "tpircs",   0          ; 96 = script
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "tceles",   0          ; 97 = select
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "ecruos",   0          ; 98 = source
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "gnorts",   0          ; 99 = strong
                                    dq        7,  000FFFFFFFFFFFFFFH, 00000000000000000H, "sserdda",  0          ; 100 = address
                                    dq        7,  000FFFFFFFFFFFFFFH, 00000000000000000H, "elcitra",  0          ; 101 = article
                                    dq        7,  000FFFFFFFFFFFFFFH, 00000000000000000H, "noitpac",  0          ; 102 = caption
                                    dq        7,  000FFFFFFFFFFFFFFH, 00000000000000000H, "sliated",  0          ; 103 = details
                                    dq        7,  000FFFFFFFFFFFFFFH, 00000000000000000H, "noitces",  0          ; 104 = section
                                    dq        7,  000FFFFFFFFFFFFFFH, 00000000000000000H, "yrammus",  0          ; 105 = summary
                                    dq        8,  0FFFFFFFFFFFFFFFFH, 00000000000000000H, "epytcod!", 0          ; 106 = !doctype
                                    dq        8,  0FFFFFFFFFFFFFFFFH, 00000000000000000H, "puorgloc", 0          ; 107 = colgroup
                                    dq        8,  0FFFFFFFFFFFFFFFFH, 00000000000000000H, "tsilatad", 0          ; 108 = datalist
                                    dq        8,  0FFFFFFFFFFFFFFFFH, 00000000000000000H, "tesdleif", 0          ; 109 = fieldset
                                    dq        8,  0FFFFFFFFFFFFFFFFH, 00000000000000000H, "metiunem", 0          ; 110 = menuitem
                                    dq        8,  0FFFFFFFFFFFFFFFFH, 00000000000000000H, "tpircson", 0          ; 111 = noscript
                                    dq        8,  0FFFFFFFFFFFFFFFFH, 00000000000000000H, "puorgtpo", 0          ; 112 = optgroup
                                    dq        8,  0FFFFFFFFFFFFFFFFH, 00000000000000000H, "ssergorp", 0          ; 113 = progress
                                    dq        8,  0FFFFFFFFFFFFFFFFH, 00000000000000000H, "etalpmet", 0          ; 114 = template
                                    dq        8,  0FFFFFFFFFFFFFFFFH, 00000000000000000H, "aeratxet", 0          ; 115 = textarea
                                    dq        10, 0FFFFFFFFFFFFFFFFH, 0000000000000FFFFH, "ouqkcolb", "et"       ; 116 = blockquote
                                    dq        10, 0FFFFFFFFFFFFFFFFH, 0000000000000FFFFH, "itpacgif", "no"       ; 117 = figcaption
                                    dq        0,  00000000000000000H, 00000000000000000H, 0,          0          ; end of table (length 0 stops the search)
                                                                                                                 
;
; Marker strings the parser compares against text loaded straight from the
; source buffer. Each literal is written back-to-front; read in reverse
; they are "<!--", "-->", "</script" and "</style>".
;
start_comment                       dd        '--!<'                 ; "<!--": compared with the dword loaded at a '<'
end_comment                         dd        '>--'                  ; "-->": 3 characters, compared after masking the loaded dword with 0FFFFFFH
end_script_tag                      dq        'tpircs/<'             ; "</script" (8 characters, no '>'); presumably ends <script> content - usage not in view
end_style_tag                       dq        '>elyts/<'             ; "</style>"; presumably ends <style> content - usage not in view

                                    .code

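;
;   r15 points to the source
;       NOTE(review): the parsing code below reads the source through rsi,
;       so rsi must already hold the source pointer on entry - confirm
;   r14 points to the free workspace
;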

Parse_Html                          proc      first_node: qword, last_node: qword
                                    mov       first_node, rcx
                                    mov       last_node, rdx
create_node:                        mov       rdi, r14
                                    add       r14, sizeof node
hunt_for_tag:                       mov       eax, dword ptr [rsi]
                                    cmp       al, 21
                                    je        end_of_parse
                                    cmp       al, '<'
                                    je        found_start_tag
                                    add       rsi, 1
                                    jmp       hunt_for_tag
found_start_tag:                    cmp       eax, start_comment
                                    jne       not_start_comment
                                    
                                    ;
                                    ; Process the comment
                                    ;

hunt_for_end_comment:               add       rsi, 1
                                    mov       eax, dword ptr [rsi]
                                    and       eax, 0FFFFFFH
                                    cmp       eax, end_comment
                                    jne       hunt_for_end_comment
                                    add       rsi, 3
                                    jmp       hunt_for_tag
not_start_comment:                  cmp       ah, '/'
                                    je        start_end_tag
                                    xor       r10, r10
process_tag:                        mov       r13, r14
                                    add       rsi, 1
                                    mov       al, byte ptr [rsi]
copy_html_tag:                      add       rsi, 1
                                    add       r14, 1
                                    mov       byte ptr [r14], al
                                    inc       byte ptr [r13]
                                    mov       al, byte ptr [rsi]
                                    cmp       al, '0'
                                    jb        tag_no_number
                                    cmp       al, '9'
                                    jbe       copy_html_tag
tag_no_number:                      or        al, 020H
                                    cmp       al, 'a'
                                    jb        end_start_html_tag
                                    cmp       al, 'z'
                                    ja        end_start_html_tag
                                    jmp       copy_html_tag
end_start_html_tag:                 add       r14, 1
                                    ;
                                    ;   Find the tag in the table
                                    ;
                                    lea       r12, html_tags
                                    xor       rdx, rdx
                                    xor       rcx, rcx
                                    mov       bl, byte ptr [r13]
find_html_tag_length_loop:          mov       rdx, qword ptr [r12]
                                    or        rdx, rdx
                                    je        tag_not_found
                                    cmp       bl, dl
                                    je        identify_html_tag
                                    lea       r12, 40[r12]
                                    add       rcx, 1
                                    jmp       find_html_tag_length_loop
identify_html_tag:                  mov       r8, qword ptr 1[r13]
                                    mov       r9, qword ptr 9[r13]
                                    and       r8, qword ptr 8[r12]
                                    and       r9, qword ptr 16[r12]
                                    cmp       r8, qword ptr 24[r12]
                                    jne       next_tag_in_table
                                    cmp       r9, qword ptr 32[r12]
                                    jne       next_tag_in_table
                                    mov       node.html_tag[rdi], rcx
                                    jmp       tag_found
next_tag_in_table:                  lea       r12, 40[r12]
                                    add       rcx, 1
                                    mov       rdx, qword ptr [r12]
                                    or        rdx, rdx
                                    je        tag_not_found
                                    jmp       identify_html_tag

                                    ;
                                    ; Find the start of the attributes or end of tag
                                    ;

tag_found:                          xor       r11, r11
                                    mov       al, byte ptr [rsi]
                                    cmp       al, ' '
                                    je        find_non_space
                                    cmp       al, 009H
                                    je        find_non_space
                                    cmp       al, 00DH
                                    je        find_non_space
                                    cmp       al, 00AH
                                    jne       found_non_space
find_non_space:                     add       rsi, 1
                                    jmp       tag_found
found_non_space:                    cmp       al, '>'
                                    je        found_end_attributes
                                    cmp       al, '/'
                                    je        nearly_found_end_attributes

                                    ;
                                    ; We have found the start of the attributes
                                    ;

                                    mov       r13, r14
find_end_key:                       add       r14, 1
                                    mov       al, byte ptr [rsi]
                                    cmp       al, '='
                                    je        found_end_key
                                    cmp       al, ' '
                                    je        found_end_key
                                    cmp       al, 009H
                                    je        found_end_key
                                    cmp       al, 00DH
                                    je        found_end_key
                                    cmp       al, 00AH
                                    je        found_end_key
                                    cmp       al, '>'
                                    je        found_end_key
                                    cmp       al, '/'
                                    je        found_end_key
                                    mov       byte ptr [r14], al
                                    inc       byte ptr [r13]
                                    add       rsi, 1
                                    jmp       find_end_key
found_end_key:                      add       r14, 1
                                    mov       r11, r14
                                    add       r14, sizeof attribute
                                    cmp       al, '='
                                    jne       end_of_attributes
                                    add       rsi, 1
                                    mov       r9, r14
find_end_of_attribute_value:        add       r14, 1
                                    mov       al, byte ptr [rsi]
                                    cmp       al, '"'
                                    je        skip_to_end_double_quotes
                                    cmp       al, "'"
                                    je        skip_to_end_single_quotes
                                    cmp       al, '>'
                                    je        found_end_attributes
                                    cmp       al, '/'
                                    je        nearly_found_end_attributes
                                    jmp       find_end_of_attribute_value
skip_to_end_double_quotes:          mov       byte ptr [r14], al
                                    add       rsi, 1
                                    add       r14, 1
                                    mov       al, byte ptr [rsi]
                                    cmp       al, '"'
                                    je        end_of_quoted_attribute
                                    jmp       skip_to_end_double_quotes
skip_to_end_single_quotes:          mov       byte ptr [r14], al
                                    add       rsi, 1
                                    add       r14, 1
                                    mov       al, byte ptr [rsi]
                                    cmp       al, "'"
                                    je        find_end_of_attribute_value
                                    jmp       skip_to_end_single_quotes
end_of_quoted_attribute:            mov       byte ptr [r14], al
                                    add       rsi, 1
                                    add       r14, 2
                                    mov       al, byte ptr [rsi]
end_of_attributes:                  cmp       al, '>'
                                    je        found_end_attributes
                                    cmp       al, '/'
                                    je        nearly_found_end_attributes
                                    lea       rcx, node.html_attribute_first[rdi]
                                    lea       rdx, node.html_attribute_last[rdi]
                                    mov       rax, qword ptr [rcx]
                                    or        rax, rax
                                    jne       not_attribute_first
                                    mov       qword ptr [rcx], r11
                                    jmp       chain_to_end_attribute
not_attribute_first:                mov       rax, qword ptr [rdx]
                                    mov       attribute.next_attribute[rax], r11
chain_to_end_attribute:             mov       rax, qword ptr [rdx]
                                    mov       attribute.previous_attribute[r11], rax
                                    mov       qword ptr [rdx], r11
                                    lea       rax,  1[r13]
                                    mov       attribute.attribute_key[r11], rax
                                    or        r9, r9
                                    je        tag_found
                                    lea       rax, 1[r9]
                                    mov       attribute.attribute_value[r11], rax
                                    xor       r9, r9
                                    jmp       tag_found
nearly_found_end_attributes:        add       rsi, 1
                                    mov       al, byte ptr [rsi]
                                    cmp       al, '>'
                                    jne       nearly_found_end_attributes
                                    jmp       process_next_tag
found_end_attributes:               lea       rcx, node.html_attribute_first[rdi]
                                    lea       rdx, node.html_attribute_last[rdi]
                                    mov       rax, qword ptr [rcx]
                                    or        rax, rax
                                    jne       not_first_attribute
                                    mov       qword ptr [rcx], r11
                                    jmp       chain_to_attribute_end
not_first_attribute:                mov       rax, qword ptr [rdx]
                                    mov       attribute.next_attribute[rax], r11
chain_to_attribute_end:             or        r11, r11
                                    je        process_next_tag
                                    mov       rax, qword ptr [rdx]
                                    mov       attribute.previous_attribute[r11], rax
                                    mov       qword ptr [rdx], r11
                                    lea       rax,  1[r13]
                                    mov       attribute.attribute_key[r11], rax
                                    or        r9, r9
                                    je        process_next_tag
                                    lea       rax, 1[r9]
                                    mov       attribute.attribute_value[r11], rax
                                    xor       r11, r11
                                    xor       r9, r9
process_next_tag:                   xor       al, al
                                    mov       byte ptr [r14], al 
                                    add       r14, 1                                   
                                    add       rsi, 1
                                    or        r10, r10
                                    je        next_sub_section
                                    mov       rax, node.html_tag[rdi]
                                    ret

                                    ;
                                    ; Need to know if this is a self terminating tag
                                    ;

next_sub_section:                   mov       rbx, node.html_tag[rdi]

                                    ;
                                    ; Special case for <script>
                                    ;

                                    cmp       rbx, 96
                                    je        process_script

                                    ;
                                    ; Special case for <style>
                                    ;

                                    cmp       rbx, 74
                                    je        process_style

                                    cmp       rbx, 7
                                    je        self_terminating
                                    cmp       rbx, 18
                                    je        self_terminating
                                    cmp       rbx, 30
                                    je        self_terminating
                                    cmp       rbx, 34
                                    je        self_terminating
                                    cmp       rbx, 44
                                    je        self_terminating
                                    cmp       rbx, 47
                                    je        self_terminating
                                    cmp       rbx, 48
                                    je        self_terminating
                                    cmp       rbx, 56
                                    je        self_terminating
                                    cmp       rbx, 60
                                    je        self_terminating
                                    cmp       rbx, 68
                                    je        self_terminating
                                    cmp       rbx, 69
                                    je        self_terminating
                                    cmp       rbx, 72
                                    je        self_terminating
                                    cmp       rbx, 80
                                    je        self_terminating
                                    cmp       rbx, 91
                                    je        self_terminating
                                    cmp       rbx, 98
                                    je        self_terminating
                                    cmp       rbx, 106
                                    je        self_terminating

                                    ;
                                    ; Skip spaces etc
                                    ;

find_next_non_space:                mov       al, byte ptr [rsi]
                                    cmp       al, ' '
                                    je        skip_space
                                    cmp       al, 009H
                                    je        skip_space
                                    cmp       al, 00DH
                                    je        skip_space
                                    cmp       al, 00AH
                                    jne       check_for_text
skip_space:                         add       rsi, 1
                                    jmp       find_next_non_space
check_for_text:                     cmp       al, '<'
                                    je        recursive_call

                                    ;
                                    ; Non space characters after the start tag need to be remembered
                                    ;

                                    mov       node.html_text_string[rdi], r14
copy_text_string:                   mov       byte ptr [r14], al
                                    add       r14, 1
                                    add       rsi, 1
                                    mov       al, byte ptr [rsi]
                                    cmp       al, '<'
                                    jne       copy_text_string
                                    add       r14, 1

                                    ;
                                    ; Apply recursive call to Parse_Html
                                    ;

recursive_call:                     add       r14, 1
                                    push      rdi
                                    lea       rdx, node.last_child[rdi]
                                    lea       rcx, node.first_child[rdi]
                                    sub       rsp, 16
                                    call      Parse_Html
                                    add       rsp, 16
                                    pop       rdi
                                    cmp       node.html_tag[rdi], rax
                                    je        self_terminating
                                    int       3

                                    ;
                                    ; Process self terminating tags
                                    ;

self_terminating:                   mov       rcx, first_node
                                    mov       rdx, last_node
                                    mov       rax, qword ptr [rcx]
                                    or        rax, rax
                                    jne       not_first_node
                                    mov       qword ptr [rcx], rdi
                                    jmp       chain_to_end
not_first_node:                     mov       rax, qword ptr [rdx]
                                    mov       node.next_node[rax], rdi
chain_to_end:                       mov       rax, qword ptr [rdx]
                                    mov       node.previous_node[rdi], rax
                                    mov       qword ptr [rdx], rdi
                                    jmp       create_node                           
tag_not_found:                      int       3
start_end_tag:                      add       rsi, 1
                                    xor       r10, r10
                                    inc       r10
                                    jmp       process_tag

                                    ;
                                    ; Search for </script> end tag
                                    ;

process_script:                     mov       node.html_text_string[rdi], r14
process_script_loop:                mov       rax, qword ptr [rsi]
                                    mov       rbx, end_script_tag
                                    cmp       rax, rbx
                                    je        recursive_call
                                    mov       byte ptr [r14], al
                                    add       rsi, 1
                                    add       r14, 1
                                    jmp       process_script_loop

                                    ;
                                    ; Search for </style> end tag
                                    ;

process_style:                      mov       node.html_text_string[rdi], r14
process_style_loop:                 mov       rax, qword ptr [rsi]
                                    mov       rbx, end_style_tag
                                    cmp       rax, rbx
                                    je        recursive_call
                                    mov       byte ptr [r14], al
                                    add       rsi, 1
                                    add       r14, 1
                                    jmp       process_style_loop
end_of_parse:                       ret

Parse_Html                          endp
                                    end




Questions and Answers

If you have any questions pertaining to this code, please don't hesitate to post them here, and I will get back to you as quickly as I can.

This post has been edited by Martyn.Rae: 11 July 2017 - 09:37 AM


Is This A Good Question/Topic? 0
  • +

Page 1 of 1