vis

a vi-like editor based on Plan 9's structural regular expressions

git clone https://9o.is/git/vis.git

html.lua

(6842B)


      1 -- Copyright 2006-2025 Mitchell. See LICENSE.
      2 -- HTML LPeg lexer.
      3 
      4 local lexer = lexer
      5 local word_match = lexer.word_match
      6 local P, S = lpeg.P, lpeg.S
      7 
      8 local lex = lexer.new(..., {no_user_word_lists = true})
      9 
     10 -- Comments.
     11 lex:add_rule('comment', lex:tag(lexer.COMMENT, lexer.range('<!--', '-->')))
     12 
     13 -- Doctype.
     14 lex:add_rule('doctype',
     15 	lex:tag(lexer.TAG .. '.doctype', lexer.range('<!' * word_match('doctype', true), '>')))
     16 
     17 -- Tags.
     18 local paired_tag = lex:tag(lexer.TAG, lex:word_match(lexer.TAG, true))
     19 local single_tag = lex:tag(lexer.TAG .. '.single', lex:word_match(lexer.TAG .. '.single', true))
     20 local known_tag = paired_tag + single_tag
     21 local unknown_tag = lex:tag(lexer.TAG .. '.unknown', (lexer.alnum + '-')^1)
     22 local tag = lex:tag(lexer.TAG .. '.chars', '<' * P('/')^-1) * (known_tag + unknown_tag) * -P(':')
     23 lex:add_rule('tag', tag)
     24 
     25 -- Closing tags.
     26 local tag_close = lex:tag(lexer.TAG .. '.chars', P('/')^-1 * '>')
     27 lex:add_rule('tag_close', tag_close)
     28 
     29 -- Equals.
     30 -- TODO: performance is terrible on large files.
     31 local in_tag = P(function(input, index)
     32 	local before = input:sub(1, index - 1)
     33 	local s, e = before:find('<[^>]-$'), before:find('>[^<]-$')
     34 	if s and e then return s > e end
     35 	if s then return true end
     36 	return input:find('^[^<]->', index) ~= nil
     37 end)
     38 
     39 local equals = lex:tag(lexer.OPERATOR, '=') -- * in_tag
     40 -- lex:add_rule('equals', equals)
     41 
     42 -- Attributes.
     43 local known_attribute = lex:tag(lexer.ATTRIBUTE, lex:word_match(lexer.ATTRIBUTE, true) +
     44 	((P('data-') + 'aria-') * (lexer.alnum + '-')^1))
     45 local unknown_attribute = lex:tag(lexer.ATTRIBUTE .. '.unknown', (lexer.alnum + '-')^1)
     46 local ws = lex:get_rule('whitespace')
     47 local attribute_eq = (known_attribute + unknown_attribute) * ws^-1 * equals
     48 lex:add_rule('attribute', attribute_eq)
     49 
     50 -- Strings.
     51 local string = lex:tag(lexer.STRING, lexer.after_set('=', lexer.range("'") + lexer.range('"')))
     52 lex:add_rule('string', string)
     53 
     54 -- Numbers.
     55 local number = lex:tag(lexer.NUMBER, lexer.dec_num * P('%')^-1)
     56 lex:add_rule('number', lexer.after_set('=', number)) -- *in_tag)
     57 
     58 -- Entities.
     59 lex:add_rule('entity', lex:tag(lexer.CONSTANT_BUILTIN .. '.entity',
     60 	'&' * (lexer.any - lexer.space - ';')^1 * ';'))
     61 
     62 -- Fold points.
     63 local function disambiguate_lt(text, pos, line, s)
     64 	if line:find('/>', s) then
     65 		return 0
     66 	elseif line:find('^</', s) then
     67 		return -1
     68 	else
     69 		return 1
     70 	end
     71 end
     72 lex:add_fold_point(lexer.TAG .. '.chars', '<', disambiguate_lt)
     73 lex:add_fold_point(lexer.COMMENT, '<!--', '-->')
     74 
     75 -- Tags that start embedded languages.
     76 -- Export these patterns for proxy lexers (e.g. ASP) that need them.
     77 lex.embed_start_tag = tag * (ws * attribute_eq * ws^-1 * string)^0 * ws^-1 * tag_close
     78 lex.embed_end_tag = tag * tag_close
     79 
     80 -- Embedded CSS (<style type="text/css"> ... </style>).
     81 local css = lexer.load('css')
     82 local style_tag = word_match('style', true)
     83 local css_start_rule = #('<' * style_tag * ('>' + P(function(input, index)
     84 	if input:find('^%s+type%s*=%s*(["\'])text/css%1', index) then return true end
     85 end))) * lex.embed_start_tag
     86 local css_end_rule = #('</' * style_tag * '>') * lex.embed_end_tag
     87 lex:embed(css, css_start_rule, css_end_rule)
     88 -- Embedded CSS in style="" attribute.
     89 local style = lexer.load('css', 'css.style')
     90 css_start_rule = #(P('style') * lexer.space^0 * '=') * attribute_eq * ws^-1 *
     91 	lex:tag(lexer.STRING, '"')
     92 css_end_rule = lex:tag(lexer.STRING, '"')
     93 lex:embed(style, css_start_rule, css_end_rule) -- only double-quotes for now
     94 
     95 -- Embedded JavaScript (<script type="text/javascript"> ... </script>).
     96 local js = lexer.load('javascript')
     97 local script_tag = word_match('script', true)
     98 local js_start_rule = #('<' * script_tag * ('>' + P(function(input, index)
     99 	if input:find('^%s+type%s*=%s*(["\'])text/javascript%1', index) then return true end
    100 end))) * lex.embed_start_tag
    101 local js_end_rule = #('</' * script_tag * '>') * lex.embed_end_tag
    102 lex:embed(js, js_start_rule, js_end_rule)
    103 
    104 -- Embedded CoffeeScript (<script type="text/coffeescript"> ... </script>).
    105 local cs = lexer.load('coffeescript')
    106 script_tag = word_match('script', true)
    107 local cs_start_rule = #('<' * script_tag * P(function(input, index)
    108 	if input:find('^[^>]+type%s*=%s*(["\'])text/coffeescript%1', index) then return true end
    109 end)) * lex.embed_start_tag
    110 local cs_end_rule = #('</' * script_tag * '>') * lex.embed_end_tag
    111 lex:embed(cs, cs_start_rule, cs_end_rule)
    112 
    113 -- Word lists.
    114 lex:set_word_list(lexer.TAG .. '.single', {
    115 	'area', 'base', 'br', 'col', 'command', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta',
    116 	'param', 'source', 'track', 'wbr'
    117 })
    118 
    119 lex:set_word_list(lexer.TAG, {
    120 	'a', 'abbr', 'address', 'article', 'aside', 'audio', 'b', 'bdi', 'bdo', 'blockquote', 'body',
    121 	'button', 'canvas', 'caption', 'cite', 'code', 'colgroup', 'content', 'data', 'datalist', 'dd',
    122 	'decorator', 'del', 'details', 'dfn', 'div', 'dl', 'dt', 'element', 'em', 'fieldset',
    123 	'figcaption', 'figure', 'footer', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'header',
    124 	'html', 'i', 'iframe', 'ins', 'kbd', 'label', 'legend', 'li', 'main', 'map', 'mark', 'menu',
    125 	'menuitem', 'meter', 'nav', 'noscript', 'object', 'ol', 'optgroup', 'option', 'output', 'p',
    126 	'pre', 'progress', 'q', 'rp', 'rt', 'ruby', 's', 'samp', 'script', 'section', 'select', 'shadow',
    127 	'small', 'spacer', 'span', 'strong', 'style', 'sub', 'summary', 'sup', 'table', 'tbody', 'td',
    128 	'template', 'textarea', 'tfoot', 'th', 'thead', 'time', 'title', 'tr', 'u', 'ul', 'var', 'video'
    129 })
    130 
    131 lex:set_word_list(lexer.ATTRIBUTE, {
    132 	'accept', 'accept-charset', 'accesskey', 'action', 'align', 'alt', 'async', 'autocomplete',
    133 	'autofocus', 'autoplay', 'bgcolor', 'border', 'buffered', 'challenge', 'charset', 'checked',
    134 	'cite', 'class', 'code', 'codebase', 'color', 'cols', 'colspan', 'content', 'contenteditable',
    135 	'contextmenu', 'controls', 'coords', 'data', 'data-', 'datetime', 'default', 'defer', 'dir',
    136 	'dirname', 'disabled', 'download', 'draggable', 'dropzone', 'enctype', 'for', 'form', 'headers',
    137 	'height', 'hidden', 'high', 'href', 'hreflang', 'http-equiv', 'icon', 'id', 'ismap', 'itemprop',
    138 	'keytype', 'kind', 'label', 'lang', 'language', 'list', 'loop', 'low', 'manifest', 'max',
    139 	'maxlength', 'media', 'method', 'min', 'multiple', 'name', 'novalidate', 'open', 'optimum',
    140 	'pattern', 'ping', 'placeholder', 'poster', 'preload', 'pubdate', 'radiogroup', 'readonly', 'rel',
    141 	'required', 'reversed', 'role', 'rows', 'rowspan', 'sandbox', 'scope', 'scoped', 'seamless',
    142 	'selected', 'shape', 'size', 'sizes', 'span', 'spellcheck', 'src', 'srcdoc', 'srclang', 'start',
    143 	'step', 'style', 'summary', 'tabindex', 'target', 'title', 'type', 'usemap', 'value', 'width',
    144 	'wrap'
    145 })
    146 
    147 lexer.property['scintillua.comment'] = '<!--|-->'
    148 lexer.property['scintillua.angle.braces'] = '1'
    149 lexer.property['scintillua.word.chars'] =
    150 	'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-'
    151 
    152 return lex