vis
a vi-like editor based on Plan 9's structural regular expressions
git clone https://9o.is/git/vis.git
html.lua
(6842B)
1 -- Copyright 2006-2025 Mitchell. See LICENSE.
2 -- HTML LPeg lexer.
3
4 local lexer = lexer
5 local word_match = lexer.word_match
6 local P, S = lpeg.P, lpeg.S
7
-- Create the lexer object, passing the `no_user_word_lists` option.
local lex = lexer.new(..., {no_user_word_lists = true})

-- Comments: delimited by '<!--' and '-->'.
do
  local comment_body = lexer.range('<!--', '-->')
  lex:add_rule('comment', lex:tag(lexer.COMMENT, comment_body))
end

-- Doctype: starts with '<!' followed by the word 'doctype' (word_match is called with `true`,
-- i.e. case-insensitively) and is delimited at the other end by '>'.
do
  local doctype_open = '<!' * word_match('doctype', true)
  local doctype_body = lexer.range(doctype_open, '>')
  lex:add_rule('doctype', lex:tag(lexer.TAG .. '.doctype', doctype_body))
end
16
-- Tags.
-- A tag is '<' (optionally followed by '/') and an element name that is not immediately
-- followed by ':'. Names are looked up in the TAG word list first, then in the TAG.single word
-- list; any other run of alphanumerics and dashes is tagged as TAG.unknown.
local tag
do
  local single = lexer.TAG .. '.single'
  local paired_name = lex:tag(lexer.TAG, lex:word_match(lexer.TAG, true))
  local single_name = lex:tag(single, lex:word_match(single, true))
  local other_name = lex:tag(lexer.TAG .. '.unknown', (lexer.alnum + '-')^1)
  local opener = lex:tag(lexer.TAG .. '.chars', '<' * P('/')^-1)
  tag = opener * (paired_name + single_name + other_name) * -P(':')
end
lex:add_rule('tag', tag)

-- Closing tags: '>' optionally preceded by '/'.
local tag_close
do
  local closer = P('/')^-1 * '>'
  tag_close = lex:tag(lexer.TAG .. '.chars', closer)
end
lex:add_rule('tag_close', tag_close)
28
-- Equals.
-- TODO: performance is terrible on large files.

-- Match-time predicate: decides whether position `pos` of `subject` lies inside a tag.
-- `last_open` is the start of a '<' that has no '>' between it and `pos`; `last_close` is the
-- start of a '>' that has no '<' between it and `pos`. Both are searched for in a copy of the
-- text preceding `pos`, which is what makes this expensive on large inputs.
local function inside_tag(subject, pos)
  local prefix = subject:sub(1, pos - 1)
  local last_open = prefix:find('<[^>]-$')
  local last_close = prefix:find('>[^<]-$')
  if last_open then
    if last_close then return last_open > last_close end
    return true
  end
  -- No unterminated '<' before `pos`: fall back to checking whether a '>' follows `pos` with
  -- no '<' in between.
  return subject:find('^[^<]->', pos) ~= nil
end
local in_tag = P(inside_tag)

-- The `in_tag` guard and the standalone 'equals' rule are both currently disabled.
local equals = lex:tag(lexer.OPERATOR, '=') -- * in_tag
-- lex:add_rule('equals', equals)
41
-- Attributes.
-- An attribute name is either known (present in the ATTRIBUTE word list, or starting with
-- 'data-' / 'aria-') or unknown (any other run of alphanumerics and dashes). The rule only
-- matches a name that is followed by optional whitespace and '='.
local attribute_name
do
  local prefixed = (P('data-') + 'aria-') * (lexer.alnum + '-')^1
  local known = lex:tag(lexer.ATTRIBUTE, lex:word_match(lexer.ATTRIBUTE, true) + prefixed)
  local unknown = lex:tag(lexer.ATTRIBUTE .. '.unknown', (lexer.alnum + '-')^1)
  attribute_name = known + unknown
end
local ws = lex:get_rule('whitespace')
local attribute_eq = attribute_name * ws^-1 * equals
lex:add_rule('attribute', attribute_eq)
49
-- Strings.
-- Single- or double-quoted ranges, wrapped with lexer.after_set('=', ...).
-- NOTE: this local shadows the standard `string` library table for the rest of the file; the
-- name is kept because later patterns refer to it.
local quoted = lexer.range("'") + lexer.range('"')
local string = lex:tag(lexer.STRING, lexer.after_set('=', quoted))
lex:add_rule('string', string)
53
-- Numbers.
-- A decimal number with an optional trailing '%', wrapped with lexer.after_set('=', ...).
local decimal_or_percent = lexer.dec_num * P('%')^-1
local number = lex:tag(lexer.NUMBER, decimal_or_percent)
lex:add_rule('number', lexer.after_set('=', number)) -- *in_tag)
57
-- Entities.
-- '&', then one or more characters that are neither whitespace nor ';', then ';'.
do
  local entity_name = (lexer.any - lexer.space - ';')^1
  local entity = '&' * entity_name * ';'
  lex:add_rule('entity', lex:tag(lexer.CONSTANT_BUILTIN .. '.entity', entity))
end
61
62 -- Fold points.
-- Determines how the '<' at position `s` of `line` affects the fold level.
--
-- Only the tag that starts at `s` is examined: the function scans forward to the '>' that ends
-- that tag, skipping over quoted attribute values so a '>' inside quotes is not mistaken for
-- the end of the tag. Other tags later on the same line (e.g. the '<br/>' in '<div><br/>')
-- therefore do not influence the result.
-- @param text Unused.
-- @param pos Unused.
-- @param line The line of text containing the '<'.
-- @param s The position in *line* of the '<'. Defaults to 1 when nil.
-- @return 0 if the tag starting at `s` ends in '/>' on this line, -1 if it starts with '</',
--   and 1 otherwise (including when the tag is not terminated on this line).
local function disambiguate_lt(text, pos, line, s)
  local start = s or 1
  local from = start + 1
  while true do
    -- Next character of interest: a quote that opens an attribute value, or the closing '>'.
    local at = line:find('["\'>]', from)
    if not at then break end
    local ch = line:sub(at, at)
    if ch == '>' then
      if line:sub(at - 1, at - 1) == '/' then return 0 end -- self-closing tag
      break
    end
    -- Skip the quoted value (plain search for the matching quote character).
    local close = line:find(ch, at + 1, true)
    if not close then break end
    from = close + 1
  end
  if line:find('^</', start) then return -1 end
  return 1
end
-- Tag openers fold according to disambiguate_lt; comments fold from '<!--' to '-->'.
local tag_chars = lexer.TAG .. '.chars'
lex:add_fold_point(tag_chars, '<', disambiguate_lt)
lex:add_fold_point(lexer.COMMENT, '<!--', '-->')
74
-- Tags that start embedded languages.
-- Export these patterns for proxy lexers (e.g. ASP) that need them.
-- A start tag is a tag, any number of `name = "value"` attributes, optional whitespace, and a
-- tag close; an end tag is a tag immediately followed by a tag close.
do
  local attribute_with_value = ws * attribute_eq * ws^-1 * string
  lex.embed_start_tag = tag * attribute_with_value^0 * ws^-1 * tag_close
end
lex.embed_end_tag = tag * tag_close
79
-- Embedded CSS (<style type="text/css"> ... </style>).
do
  local css_lexer = lexer.load('css')
  local style_name = word_match('style', true)

  -- Match-time check: succeeds when the text at `index` is whitespace followed by a
  -- type attribute whose quoted value is exactly text/css.
  local function has_css_type(input, index)
    return input:find('^%s+type%s*=%s*(["\'])text/css%1', index) ~= nil
  end

  -- Lookahead for '<style>' or '<style type="text/css"', then consume the whole start tag.
  local open_lookahead = #('<' * style_name * ('>' + P(has_css_type)))
  local element_start = open_lookahead * lex.embed_start_tag
  local element_end = #('</' * style_name * '>') * lex.embed_end_tag
  lex:embed(css_lexer, element_start, element_end)
end

-- Embedded CSS in style="" attribute.
do
  local inline_lexer = lexer.load('css', 'css.style')
  local attr_lookahead = #(P('style') * lexer.space^0 * '=')
  local attr_start = attr_lookahead * attribute_eq * ws^-1 * lex:tag(lexer.STRING, '"')
  local attr_end = lex:tag(lexer.STRING, '"')
  lex:embed(inline_lexer, attr_start, attr_end) -- only double-quotes for now
end
94
-- Embedded JavaScript (<script type="text/javascript"> ... </script>).
-- `script_tag` stays a file-level local because the CoffeeScript section assigns to it.
local script_tag
do
  local js_lexer = lexer.load('javascript')
  script_tag = word_match('script', true)

  -- Match-time check: succeeds when the text at `index` is whitespace followed by a
  -- type attribute whose quoted value is exactly text/javascript.
  local function has_js_type(input, index)
    return input:find('^%s+type%s*=%s*(["\'])text/javascript%1', index) ~= nil
  end

  -- Lookahead for '<script>' or '<script type="text/javascript"', then consume the start tag.
  local open_lookahead = #('<' * script_tag * ('>' + P(has_js_type)))
  local script_start = open_lookahead * lex.embed_start_tag
  local script_end = #('</' * script_tag * '>') * lex.embed_end_tag
  lex:embed(js_lexer, script_start, script_end)
end
103
-- Embedded CoffeeScript (<script type="text/coffeescript"> ... </script>).
do
  local coffee_lexer = lexer.load('coffeescript')
  script_tag = word_match('script', true)

  -- Match-time check: succeeds when, before any '>', the text at `index` contains a type
  -- attribute whose quoted value is exactly text/coffeescript.
  local function has_coffee_type(input, index)
    return input:find('^[^>]+type%s*=%s*(["\'])text/coffeescript%1', index) ~= nil
  end

  local open_lookahead = #('<' * script_tag * P(has_coffee_type))
  local coffee_start = open_lookahead * lex.embed_start_tag
  local coffee_end = #('</' * script_tag * '>') * lex.embed_end_tag
  lex:embed(coffee_lexer, coffee_start, coffee_end)
end
112
-- Word lists.
-- Each list below fills the word list of the same name that the tag and attribute rules look
-- up through lex:word_match().
do
  -- Element names tagged as TAG.single.
  local single_elements = {
    'area', 'base', 'br', 'col', 'command', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta',
    'param', 'source', 'track', 'wbr'
  }

  -- Element names tagged as TAG.
  local paired_elements = {
    'a', 'abbr', 'address', 'article', 'aside', 'audio', 'b', 'bdi', 'bdo', 'blockquote', 'body',
    'button', 'canvas', 'caption', 'cite', 'code', 'colgroup', 'content', 'data', 'datalist', 'dd',
    'decorator', 'del', 'details', 'dfn', 'div', 'dl', 'dt', 'element', 'em', 'fieldset',
    'figcaption', 'figure', 'footer', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'header',
    'html', 'i', 'iframe', 'ins', 'kbd', 'label', 'legend', 'li', 'main', 'map', 'mark', 'menu',
    'menuitem', 'meter', 'nav', 'noscript', 'object', 'ol', 'optgroup', 'option', 'output', 'p',
    'pre', 'progress', 'q', 'rp', 'rt', 'ruby', 's', 'samp', 'script', 'section', 'select', 'shadow',
    'small', 'spacer', 'span', 'strong', 'style', 'sub', 'summary', 'sup', 'table', 'tbody', 'td',
    'template', 'textarea', 'tfoot', 'th', 'thead', 'time', 'title', 'tr', 'u', 'ul', 'var', 'video'
  }

  -- Attribute names tagged as ATTRIBUTE.
  local attributes = {
    'accept', 'accept-charset', 'accesskey', 'action', 'align', 'alt', 'async', 'autocomplete',
    'autofocus', 'autoplay', 'bgcolor', 'border', 'buffered', 'challenge', 'charset', 'checked',
    'cite', 'class', 'code', 'codebase', 'color', 'cols', 'colspan', 'content', 'contenteditable',
    'contextmenu', 'controls', 'coords', 'data', 'data-', 'datetime', 'default', 'defer', 'dir',
    'dirname', 'disabled', 'download', 'draggable', 'dropzone', 'enctype', 'for', 'form', 'headers',
    'height', 'hidden', 'high', 'href', 'hreflang', 'http-equiv', 'icon', 'id', 'ismap', 'itemprop',
    'keytype', 'kind', 'label', 'lang', 'language', 'list', 'loop', 'low', 'manifest', 'max',
    'maxlength', 'media', 'method', 'min', 'multiple', 'name', 'novalidate', 'open', 'optimum',
    'pattern', 'ping', 'placeholder', 'poster', 'preload', 'pubdate', 'radiogroup', 'readonly', 'rel',
    'required', 'reversed', 'role', 'rows', 'rowspan', 'sandbox', 'scope', 'scoped', 'seamless',
    'selected', 'shape', 'size', 'sizes', 'span', 'spellcheck', 'src', 'srcdoc', 'srclang', 'start',
    'step', 'style', 'summary', 'tabindex', 'target', 'title', 'type', 'usemap', 'value', 'width',
    'wrap'
  }

  lex:set_word_list(lexer.TAG .. '.single', single_elements)
  lex:set_word_list(lexer.TAG, paired_elements)
  lex:set_word_list(lexer.ATTRIBUTE, attributes)
end
146
-- Lexer properties: comment delimiters, the angle-brace flag, and the set of characters that
-- make up a word (ASCII letters, digits, and '-').
local props = lexer.property
props['scintillua.comment'] = '<!--|-->'
props['scintillua.angle.braces'] = '1'
props['scintillua.word.chars'] = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-'

return lex