vis

a vi-like editor based on Plan 9's structural regular expressions

git clone https://9o.is/git/vis.git

awk.lua

(8685B)


      1 -- Copyright 2006-2025 Mitchell. See LICENSE.
      2 -- AWK LPeg lexer.
      3 -- Modified by Wolfgang Seeberg 2012, 2013.
      4 
      5 local lexer = lexer
      6 local P, S = lpeg.P, lpeg.S
      7 
      8 local lex = lexer.new(...)
      9 
     10 local LEFTBRACKET = '['
     11 local RIGHTBRACKET = ']'
     12 local SLASH = '/'
     13 local BACKSLASH = '\\'
     14 local CARET = '^'
     15 local CR = '\r'
     16 local LF = '\n'
     17 local CRLF = CR .. LF
     18 local DQUOTE = '"'
     19 local DELIMITER_MATCHES = {['('] = ')', ['['] = ']'}
     20 local COMPANION = {['('] = '[', ['['] = '('}
     21 local CC = {
     22 	alnum = 1, alpha = 1, blank = 1, cntrl = 1, digit = 1, graph = 1, lower = 1, print = 1, punct = 1,
     23 	space = 1, upper = 1, xdigit = 1
     24 }
     25 local LastRegexEnd = 0
     26 local BackslashAtCommentEnd = 0
     27 local KW_BEFORE_RX = {
     28 	case = 1, ['do'] = 1, ['else'] = 1, exit = 1, print = 1, printf = 1, ['return'] = 1
     29 }
     30 
     31 local function findKeyword(input, e)
     32 	local i = e
     33 	while i > 0 and input:find("^[%l]", i) do i = i - 1 end
     34 	local w = input:sub(i + 1, e)
     35 	if i == 0 then
     36 		return KW_BEFORE_RX[w] == 1
     37 	elseif input:find("^[%u%d_]", i) then
     38 		return false
     39 	else
     40 		return KW_BEFORE_RX[w] == 1
     41 	end
     42 end
     43 
     44 local function isRegex(input, i)
     45 	while i >= 1 and input:find('^[ \t]', i) do i = i - 1 end
     46 	if i < 1 then return true end
     47 	if input:find("^[-!%%&(*+,:;<=>?[^{|}~\f]", i) or findKeyword(input, i) then
     48 		return true
     49 	elseif input:sub(i, i) == SLASH then
     50 		return i ~= LastRegexEnd -- deals with /xx/ / /yy/.
     51 	elseif input:find('^[]%w)."]', i) then
     52 		return false
     53 	elseif input:sub(i, i) == LF then
     54 		if i == 1 then return true end
     55 		i = i - 1
     56 		if input:sub(i, i) == CR then
     57 			if i == 1 then return true end
     58 			i = i - 1
     59 		end
     60 	elseif input:sub(i, i) == CR then
     61 		if i == 1 then return true end
     62 		i = i - 1
     63 	else
     64 		return false
     65 	end
     66 	if input:sub(i, i) == BACKSLASH and i ~= BackslashAtCommentEnd then
     67 		return isRegex(input, i - 1)
     68 	else
     69 		return true
     70 	end
     71 end
     72 
     73 local function eatCharacterClass(input, s, e)
     74 	local i = s
     75 	while i <= e do
     76 		if input:find('^[\r\n]', i) then
     77 			return false
     78 		elseif input:sub(i, i + 1) == ':]' then
     79 			local str = input:sub(s, i - 1)
     80 			return CC[str] == 1 and i + 1
     81 		end
     82 		i = i + 1
     83 	end
     84 	return false
     85 end
     86 
     87 local function eatBrackets(input, i, e)
     88 	if input:sub(i, i) == CARET then i = i + 1 end
     89 	if input:sub(i, i) == RIGHTBRACKET then i = i + 1 end
     90 	while i <= e do
     91 		if input:find('^[\r\n]', i) then
     92 			return false
     93 		elseif input:sub(i, i) == RIGHTBRACKET then
     94 			return i
     95 		elseif input:sub(i, i + 1) == '[:' then
     96 			i = eatCharacterClass(input, i + 2, e)
     97 			if not i then return false end
     98 		elseif input:sub(i, i) == BACKSLASH then
     99 			i = i + 1
    100 			if input:sub(i, i + 1) == CRLF then i = i + 1 end
    101 		end
    102 		i = i + 1
    103 	end
    104 	return false
    105 end
    106 
    107 local function eatRegex(input, i)
    108 	local e = #input
    109 	while i <= e do
    110 		if input:find('^[\r\n]', i) then
    111 			return false
    112 		elseif input:sub(i, i) == SLASH then
    113 			LastRegexEnd = i
    114 			return i
    115 		elseif input:sub(i, i) == LEFTBRACKET then
    116 			i = eatBrackets(input, i + 1, e)
    117 			if not i then return false end
    118 		elseif input:sub(i, i) == BACKSLASH then
    119 			i = i + 1
    120 			if input:sub(i, i + 1) == CRLF then i = i + 1 end
    121 		end
    122 		i = i + 1
    123 	end
    124 	return false
    125 end
    126 
    127 local ScanRegexResult
    128 local function scanGawkRegex(input, index)
    129 	if isRegex(input, index - 2) then
    130 		local i = eatRegex(input, index)
    131 		if not i then
    132 			ScanRegexResult = false
    133 			return false
    134 		end
    135 		local rx = input:sub(index - 1, i)
    136 		for bs in rx:gmatch("[^\\](\\+)[BSsWwy<>`']") do
    137 			-- /\S/ is special, but /\\S/ is not.
    138 			if #bs % 2 == 1 then return i + 1 end
    139 		end
    140 		ScanRegexResult = i + 1
    141 	else
    142 		ScanRegexResult = false
    143 	end
    144 	return false
    145 end
    146 -- Is only called immediately after scanGawkRegex().
    147 local function scanRegex() return ScanRegexResult end
    148 
    149 local function scanString(input, index)
    150 	local i = index
    151 	local e = #input
    152 	while i <= e do
    153 		if input:find('^[\r\n]', i) then
    154 			return false
    155 		elseif input:sub(i, i) == DQUOTE then
    156 			return i + 1
    157 		elseif input:sub(i, i) == BACKSLASH then
    158 			i = i + 1
    159 			-- lexer.range() doesn't handle CRLF.
    160 			if input:sub(i, i + 1) == CRLF then i = i + 1 end
    161 		end
    162 		i = i + 1
    163 	end
    164 	return false
    165 end
    166 
    167 -- purpose: prevent isRegex() from entering a comment line that ends with a backslash.
    168 local function scanComment(input, index)
    169 	local _, i = input:find('[^\r\n]*', index)
    170 	if input:sub(i, i) == BACKSLASH then BackslashAtCommentEnd = i end
    171 	return i + 1
    172 end
    173 
    174 local function scanFieldDelimiters(input, index)
    175 	local i = index
    176 	local e = #input
    177 	local left = input:sub(i - 1, i - 1)
    178 	local count = 1
    179 	local right = DELIMITER_MATCHES[left]
    180 	local left2 = COMPANION[left]
    181 	local count2 = 0
    182 	local right2 = DELIMITER_MATCHES[left2]
    183 	while i <= e do
    184 		if input:find('^[#\r\n]', i) then
    185 			return false
    186 		elseif input:sub(i, i) == right then
    187 			count = count - 1
    188 			if count == 0 then return count2 == 0 and i + 1 end
    189 		elseif input:sub(i, i) == left then
    190 			count = count + 1
    191 		elseif input:sub(i, i) == right2 then
    192 			count2 = count2 - 1
    193 			if count2 < 0 then return false end
    194 		elseif input:sub(i, i) == left2 then
    195 			count2 = count2 + 1
    196 		elseif input:sub(i, i) == DQUOTE then
    197 			i = scanString(input, i + 1)
    198 			if not i then return false end
    199 			i = i - 1
    200 		elseif input:sub(i, i) == SLASH then
    201 			if isRegex(input, i - 1) then
    202 				i = eatRegex(input, i + 1)
    203 				if not i then return false end
    204 			end
    205 		elseif input:sub(i, i) == BACKSLASH then
    206 			if input:sub(i + 1, i + 2) == CRLF then
    207 				i = i + 2
    208 			elseif input:find('^[\r\n]', i + 1) then
    209 				i = i + 1
    210 			end
    211 		end
    212 		i = i + 1
    213 	end
    214 	return false
    215 end
    216 
    217 -- Comments.
    218 lex:add_rule('comment', lex:tag(lexer.COMMENT, '#' * P(scanComment)))
    219 
    220 -- Strings.
    221 lex:add_rule('string', lex:tag(lexer.STRING, DQUOTE * P(scanString)))
    222 
    223 -- No leading sign because it might be binary.
    224 local float = ((lexer.digit^1 * ('.' * lexer.digit^0)^-1) + ('.' * lexer.digit^1)) *
    225 	(S('eE') * S('+-')^-1 * lexer.digit^1)^-1
    226 
    227 -- Fields. E.g. $1, $a, $(x), $a(x), $a[x], $"1", $$a, etc.
    228 lex:add_rule('field', lex:tag(lexer.VARIABLE .. '.field', '$' * S('$+-')^0 *
    229 	(float + lexer.word^0 * '(' * P(scanFieldDelimiters) + lexer.word^1 *
    230 		('[' * P(scanFieldDelimiters))^-1 + '"' * P(scanString) + '/' * P(eatRegex) * '/')))
    231 
    232 -- Regular expressions.
    233 -- Slash delimited regular expressions are preceded by most operators or the keywords 'print'
    234 -- and 'case', possibly on a preceding line. They can contain unescaped slashes and brackets
    235 -- in brackets. Some escape sequences like '\S', '\s' have special meanings with Gawk. Tokens
    236 -- that contain them are displayed differently.
    237 lex:add_rule('gawkRegex', lex:tag(lexer.REGEX .. '.gawk', SLASH * P(scanGawkRegex)))
    238 lex:add_rule('regex', lex:tag(lexer.REGEX, SLASH * P(scanRegex)))
    239 
    240 -- Operators.
    241 lex:add_rule('gawkOperator', lex:tag(lexer.OPERATOR .. '.gawk', P("|&") + "@" + "**=" + "**"))
    242 lex:add_rule('operator', lex:tag(lexer.OPERATOR, S('!%&()*+,-/:;<=>?[\\]^{|}~')))
    243 
    244 -- Numbers.
    245 lex:add_rule('gawkNumber', lex:tag(lexer.NUMBER .. '.gawk', lexer.hex_num + lexer.oct_num))
    246 lex:add_rule('number', lex:tag(lexer.NUMBER, float))
    247 
    248 -- Keywords.
    249 lex:add_rule('keyword', lex:tag(lexer.KEYWORD, lex:word_match(lexer.KEYWORD)))
    250 
    251 lex:add_rule('builtInVariable',
    252 	lex:tag(lexer.VARIABLE_BUILTIN, lex:word_match(lexer.VARIABLE_BUILTIN)))
    253 
    254 lex:add_rule('gawkBuiltInVariable', lex:tag(lexer.VARIABLE_BUILTIN .. '.gawk',
    255 	lex:word_match(lexer.VARIABLE_BUILTIN .. '.gawk')))
    256 
    257 -- Functions.
    258 local builtin_func = lex:tag(lexer.FUNCTION_BUILTIN, lex:word_match(lexer.FUNCTION_BUILTIN))
    259 local func = lex:tag(lexer.FUNCTION, lexer.word)
    260 lex:add_rule('function', (builtin_func + func) * #P('('))
    261 
    262 -- Identifiers.
    263 lex:add_rule('identifier', lex:tag(lexer.IDENTIFIER, lexer.word))
    264 
    265 -- Fold points.
    266 lex:add_fold_point(lexer.OPERATOR, '{', '}')
    267 
    268 -- Word lists.
    269 lex:set_word_list(lexer.KEYWORD, {
    270 	'BEGIN', 'END', 'break', 'continue', 'do', 'else', 'for', 'if', 'in', 'while', --
    271 	'delete', -- array
    272 	'print', 'printf', 'getline', 'close', 'fflush', 'system', -- I/O
    273 	'function', 'return', -- functions
    274 	'next', 'nextfile', 'exit' -- program execution
    275 })
    276 
    277 lex:set_word_list(lexer.FUNCTION_BUILTIN, {
    278 	'gsub', 'index', 'length', 'match', 'split', 'sprintf', 'sub', 'substr', 'tolower', 'toupper', -- string
    279 	'mktime', 'strftime', 'systime', -- time
    280 	'atan2', 'cos', 'exp', 'int', 'log', 'rand', 'sin', 'sqrt', 'srand' -- arithmetic
    281 })
    282 
    283 lex:set_word_list(lexer.VARIABLE_BUILTIN, {
    284 	'ARGC', 'ARGV', 'CONVFMT', 'ENVIRON', 'FILENAME', 'FNR', 'FS', 'NF', 'NR', 'OFMT', 'OFS', 'ORS',
    285 	'RLENGTH', 'RS', 'RSTART', 'SUBSEP'
    286 })
    287 
    288 lex:set_word_list(lexer.VARIABLE_BUILTIN .. '.gawk', {
    289 	'ARGIND', 'BINMODE', 'ERRNO', 'FIELDWIDTHS', 'FPAT', 'FUNCTAB', 'IGNORECASE', 'LINT', 'PREC',
    290 	'PROCINFO', 'ROUNDMODE', 'RT', 'SYMTAB', 'TEXTDOMAIN'
    291 })
    292 
    293 lexer.property['scintillua.comment'] = '#'
    294 
    295 return lex