vis
a vi-like editor based on Plan 9's structural regular expressions
git clone https://9o.is/git/vis.git
awk.lua
(8685B)
1 -- Copyright 2006-2025 Mitchell. See LICENSE.
2 -- AWK LPeg lexer.
3 -- Modified by Wolfgang Seeberg 2012, 2013.
4
5 local lexer = lexer
6 local P, S = lpeg.P, lpeg.S
7
8 local lex = lexer.new(...)
9
-- Literal characters and sequences the hand-written scanners below compare against.
local LEFTBRACKET, RIGHTBRACKET = '[', ']'
local SLASH, BACKSLASH = '/', '\\'
local CARET = '^'
local CR, LF = '\r', '\n'
local CRLF = CR .. LF
local DQUOTE = '"'

-- Closing delimiter for each opening delimiter tracked by scanFieldDelimiters().
local DELIMITER_MATCHES = {
	['('] = ')',
	['['] = ']'
}
-- For each opening delimiter, the other kind of opening delimiter.
local COMPANION = {
	['('] = '[',
	['['] = '('
}

-- Names accepted between '[:' and ':]' by eatCharacterClass().
local CC = {
	alnum = 1,
	alpha = 1,
	blank = 1,
	cntrl = 1,
	digit = 1,
	graph = 1,
	lower = 1,
	print = 1,
	punct = 1,
	space = 1,
	upper = 1,
	xdigit = 1
}

-- Mutable scanner state shared by the match-time functions.
-- Position of the closing slash most recently reported by eatRegex().
local LastRegexEnd = 0
-- Position of the most recent backslash found as the last character of a comment
-- (recorded by scanComment()).
local BackslashAtCommentEnd = 0

-- Keywords that findKeyword() accepts as directly preceding a regex.
local KW_BEFORE_RX = {
	['case'] = 1,
	['do'] = 1,
	['else'] = 1,
	['exit'] = 1,
	['print'] = 1,
	['printf'] = 1,
	['return'] = 1
}
30
-- Tells whether the run of lowercase letters ending at position `e` is one of the
-- keywords in KW_BEFORE_RX.
-- @param input The text being scanned.
-- @param e Position of the last character of the candidate word.
-- @return true if the word is in KW_BEFORE_RX and is not preceded by an uppercase
--   letter, digit, or underscore; false otherwise.
local function findKeyword(input, e)
	-- Walk left over the lowercase run; `start` ends up on its first character.
	local start = e + 1
	while start > 1 and input:find("^[%l]", start - 1) do start = start - 1 end
	-- An uppercase letter, digit, or underscore right before the run means the run is
	-- only the tail of a longer name.
	if start > 1 and input:find("^[%u%d_]", start - 1) then return false end
	return KW_BEFORE_RX[input:sub(start, e)] == 1
end
43
-- Decides, by looking backwards from position `i`, whether a slash that follows
-- position `i` starts a regex.
-- @param input The text being scanned.
-- @param i Position at which to start looking backwards.
-- @return true if the slash is taken to start a regex, false otherwise.
local function isRegex(input, i)
	while true do
		-- Skip blanks to reach the nearest significant character.
		while i >= 1 and input:find('^[ \t]', i) do i = i - 1 end
		if i < 1 then return true end

		local ch = input:sub(i, i)
		if input:find("^[-!%%&(*+,:;<=>?[^{|}~\f]", i) or findKeyword(input, i) then return true end
		-- deals with /xx/ / /yy/: a slash that closed a regex is followed by division.
		if ch == SLASH then return i ~= LastRegexEnd end
		if input:find('^[]%w)."]', i) then return false end
		if ch ~= LF and ch ~= CR then return false end

		-- `ch` is a line terminator: step over it (LF, CR, or the CR LF pair).
		if i == 1 then return true end
		i = i - 1
		if ch == LF and input:sub(i, i) == CR then
			if i == 1 then return true end
			i = i - 1
		end

		-- Only a backslash that is not the recorded end of a comment continues the
		-- previous line; in that case keep looking further back.
		if input:sub(i, i) ~= BACKSLASH or i == BackslashAtCommentEnd then return true end
		i = i - 1
	end
end
72
-- Scans a bracket-expression character class name such as the "alpha" in "[:alpha:]".
-- @param input The text being scanned.
-- @param s Position of the first character after '[:'.
-- @param e Last position that may be examined.
-- @return The position of the ']' that ends ':]' when the enclosed name is listed in CC;
--   false if the name is not listed, a CR or LF is met first, or no ':]' is found up to `e`.
local function eatCharacterClass(input, s, e)
	for pos = s, e do
		if input:find('^[\r\n]', pos) then return false end
		if input:sub(pos, pos + 1) == ':]' then
			local name = input:sub(s, pos - 1)
			return CC[name] == 1 and pos + 1
		end
	end
	return false
end
86
-- Scans the inside of a regex bracket expression.
-- @param input The text being scanned.
-- @param i Position of the first character after the opening '['.
-- @param e Last position that may be examined.
-- @return The position of the closing ']', or false if a CR or LF is met first, a
--   '[:' ... ':]' class is rejected by eatCharacterClass(), or `e` is passed.
local function eatBrackets(input, i, e)
	local pos = i
	-- A leading '^' and a ']' directly after it (or directly after '[') are plain members.
	if input:sub(pos, pos) == CARET then pos = pos + 1 end
	if input:sub(pos, pos) == RIGHTBRACKET then pos = pos + 1 end

	while pos <= e do
		if input:find('^[\r\n]', pos) then return false end
		local ch = input:sub(pos, pos)
		if ch == RIGHTBRACKET then return pos end

		local step = 1
		if input:sub(pos, pos + 1) == '[:' then
			local class_end = eatCharacterClass(input, pos + 2, e)
			if not class_end then return false end
			pos = class_end
		elseif ch == BACKSLASH then
			-- Skip the escaped character; an escaped CR LF pair is skipped as a whole.
			step = input:sub(pos + 1, pos + 2) == CRLF and 3 or 2
		end
		pos = pos + step
	end
	return false
end
106
-- Scans the body of a slash-delimited regex.
-- Side effect: on success stores the closing slash position in LastRegexEnd.
-- @param input The text being scanned.
-- @param i Position of the first character after the opening slash.
-- @return The position of the closing slash (not the position after it), or false if a
--   CR or LF is met first, a bracket expression is rejected by eatBrackets(), or the
--   input ends.
local function eatRegex(input, i)
	local last = #input
	local pos = i
	while pos <= last do
		if input:find('^[\r\n]', pos) then return false end
		local ch = input:sub(pos, pos)
		if ch == SLASH then
			LastRegexEnd = pos
			return pos
		end

		local step = 1
		if ch == LEFTBRACKET then
			-- Slashes inside a bracket expression do not end the regex.
			local close = eatBrackets(input, pos + 1, last)
			if not close then return false end
			pos = close
		elseif ch == BACKSLASH then
			-- Skip the escaped character; an escaped CR LF pair is skipped as a whole.
			step = input:sub(pos + 1, pos + 2) == CRLF and 3 or 2
		end
		pos = pos + step
	end
	return false
end
126
-- Result handed from scanGawkRegex() to scanRegex(): the position after a regex that
-- has no Gawk-specific escape, or false.
local ScanRegexResult

-- Match-time function for the 'gawkRegex' rule; called just after the opening slash.
-- @param input The text being scanned.
-- @param index Position of the first character after the opening slash.
-- @return The position after the closing slash when the regex contains one of the
--   escapes B S s W w y < > ` ' preceded by an odd number of backslashes; false
--   otherwise. Whenever false is returned, ScanRegexResult has been set for scanRegex().
local function scanGawkRegex(input, index)
	-- eatRegex() is only attempted when the context before the slash allows a regex.
	local close = isRegex(input, index - 2) and eatRegex(input, index)
	if not close then
		ScanRegexResult = false
		return false
	end

	local rx = input:sub(index - 1, close)
	for bs in rx:gmatch("[^\\](\\+)[BSsWwy<>`']") do
		-- /\S/ is special, but /\\S/ is not.
		if #bs % 2 == 1 then return close + 1 end
	end
	ScanRegexResult = close + 1
	return false
end
-- Match-time function for the plain 'regex' rule.
-- It does no scanning of its own: it is only called immediately after scanGawkRegex()
-- and just reports the result that function stored.
-- @return The current value of ScanRegexResult.
local function scanRegex()
	local stored = ScanRegexResult
	return stored
end
148
-- Scans the body of a double-quoted string.
-- @param input The text being scanned.
-- @param index Position of the first character after the opening quote.
-- @return The position after the closing quote, or false if an unescaped CR or LF is met
--   first or the input ends.
local function scanString(input, index)
	local last = #input
	local pos = index
	while pos <= last do
		if input:find('^[\r\n]', pos) then return false end
		local ch = input:sub(pos, pos)
		if ch == DQUOTE then return pos + 1 end
		if ch == BACKSLASH then
			-- lexer.range() doesn't handle CRLF, so an escaped CR LF pair is skipped as a
			-- whole here; any other escaped character is skipped on its own.
			pos = pos + (input:sub(pos + 1, pos + 2) == CRLF and 3 or 2)
		else
			pos = pos + 1
		end
	end
	return false
end
166
-- Match-time function for the 'comment' rule: consumes everything up to, but not
-- including, the next CR or LF.
-- When the comment's last character is a backslash its position is recorded in
-- BackslashAtCommentEnd, so that isRegex() does not take it for a line continuation and
-- walk back into the comment.
-- @param input The text being scanned.
-- @param index Position of the first character after '#'.
-- @return The position after the last character of the comment.
local function scanComment(input, index)
	local last_char = select(2, input:find('[^\r\n]*', index))
	local next_pos = last_char + 1
	if input:sub(last_char, last_char) ~= BACKSLASH then return next_pos end
	BackslashAtCommentEnd = last_char
	return next_pos
end
173
-- Scans a parenthesized or bracketed field expression up to its matching closing
-- delimiter, e.g. the "x)" of "$(x)" or the "x]" of "$a[x]".
-- Both delimiter kinds are counted: nesting of the opening kind, and balance of the
-- other ("companion") kind. Strings and regexes are skipped as units.
-- @param input The text being scanned.
-- @param index Position of the first character after the opening delimiter; the
--   delimiter itself is read from position index - 1.
-- @return The position after the matching closing delimiter, or false if the companion
--   delimiters are unbalanced, a '#', CR or LF is met outside a backslash continuation,
--   a nested string or regex does not terminate, or the input ends.
local function scanFieldDelimiters(input, index)
	local last = #input
	local open = input:sub(index - 1, index - 1)
	local close = DELIMITER_MATCHES[open]
	local other_open = COMPANION[open]
	local other_close = DELIMITER_MATCHES[other_open]
	local depth, other_depth = 1, 0

	local pos = index
	while pos <= last do
		if input:find('^[#\r\n]', pos) then return false end
		local ch = input:sub(pos, pos)
		if ch == close then
			depth = depth - 1
			-- The outermost delimiter closes: succeed only if the other kind is balanced.
			if depth == 0 then return other_depth == 0 and pos + 1 end
		elseif ch == open then
			depth = depth + 1
		elseif ch == other_close then
			other_depth = other_depth - 1
			if other_depth < 0 then return false end
		elseif ch == other_open then
			other_depth = other_depth + 1
		elseif ch == DQUOTE then
			local after_string = scanString(input, pos + 1)
			if not after_string then return false end
			-- Land on the closing quote; the increment below moves past it.
			pos = after_string - 1
		elseif ch == SLASH then
			-- A slash that is not a regex start is just an operator and is stepped over.
			if isRegex(input, pos - 1) then
				local closing_slash = eatRegex(input, pos + 1)
				if not closing_slash then return false end
				pos = closing_slash
			end
		elseif ch == BACKSLASH then
			-- Backslash before a line terminator: move onto the terminator's last
			-- character so the increment below steps past it.
			if input:sub(pos + 1, pos + 2) == CRLF then
				pos = pos + 2
			elseif input:find('^[\r\n]', pos + 1) then
				pos = pos + 1
			end
		end
		pos = pos + 1
	end
	return false
end
216
-- Comments.
local comment_patt = '#' * P(scanComment)
lex:add_rule('comment', lex:tag(lexer.COMMENT, comment_patt))

-- Strings.
local string_patt = DQUOTE * P(scanString)
lex:add_rule('string', lex:tag(lexer.STRING, string_patt))

-- No leading sign because it might be binary.
local mantissa = lexer.digit^1 * ('.' * lexer.digit^0)^-1 + '.' * lexer.digit^1
local exponent = S('eE') * S('+-')^-1 * lexer.digit^1
local float = mantissa * exponent^-1

-- Fields. E.g. $1, $a, $(x), $a(x), $a[x], $"1", $$a, etc.
local call_field = lexer.word^0 * '(' * P(scanFieldDelimiters)
local index_field = lexer.word^1 * ('[' * P(scanFieldDelimiters))^-1
local string_field = '"' * P(scanString)
local regex_field = '/' * P(eatRegex) * '/'
local field_body = float + call_field + index_field + string_field + regex_field
lex:add_rule('field', lex:tag(lexer.VARIABLE .. '.field', '$' * S('$+-')^0 * field_body))

-- Regular expressions.
-- Slash delimited regular expressions are preceded by most operators or by one of the
-- keywords listed in KW_BEFORE_RX (e.g. 'print' and 'case'), possibly on a preceding line.
-- They can contain unescaped slashes and brackets in brackets. Some escape sequences like
-- '\S', '\s' have special meanings with Gawk. Tokens that contain them are displayed
-- differently.
-- The 'gawkRegex' rule is added before 'regex': scanRegex() only reports what
-- scanGawkRegex() stored.
local gawk_regex_patt = SLASH * P(scanGawkRegex)
lex:add_rule('gawkRegex', lex:tag(lexer.REGEX .. '.gawk', gawk_regex_patt))
local regex_patt = SLASH * P(scanRegex)
lex:add_rule('regex', lex:tag(lexer.REGEX, regex_patt))

-- Operators.
local gawk_operator_patt = P("|&") + "@" + "**=" + "**"
lex:add_rule('gawkOperator', lex:tag(lexer.OPERATOR .. '.gawk', gawk_operator_patt))
local operator_patt = S('!%&()*+,-/:;<=>?[\\]^{|}~')
lex:add_rule('operator', lex:tag(lexer.OPERATOR, operator_patt))

-- Numbers.
local gawk_number_patt = lexer.hex_num + lexer.oct_num
lex:add_rule('gawkNumber', lex:tag(lexer.NUMBER .. '.gawk', gawk_number_patt))
lex:add_rule('number', lex:tag(lexer.NUMBER, float))

-- Keywords.
lex:add_rule('keyword', lex:tag(lexer.KEYWORD, lex:word_match(lexer.KEYWORD)))

-- Built-in variables.
lex:add_rule('builtInVariable',
	lex:tag(lexer.VARIABLE_BUILTIN, lex:word_match(lexer.VARIABLE_BUILTIN)))

lex:add_rule('gawkBuiltInVariable',
	lex:tag(lexer.VARIABLE_BUILTIN .. '.gawk', lex:word_match(lexer.VARIABLE_BUILTIN .. '.gawk')))

-- Functions.
-- A word is tagged as a function only when '(' follows it directly; the '(' itself is
-- looked at but not consumed.
local builtin_func = lex:tag(lexer.FUNCTION_BUILTIN, lex:word_match(lexer.FUNCTION_BUILTIN))
local func = lex:tag(lexer.FUNCTION, lexer.word)
local call_ahead = #P('(')
lex:add_rule('function', (builtin_func + func) * call_ahead)

-- Identifiers.
lex:add_rule('identifier', lex:tag(lexer.IDENTIFIER, lexer.word))

-- Fold points.
lex:add_fold_point(lexer.OPERATOR, '{', '}')
267
-- Word lists.
local keywords = {
	'BEGIN', 'END', 'break', 'continue', 'do', 'else', 'for', 'if', 'in', 'while',
	-- array
	'delete',
	-- I/O
	'print', 'printf', 'getline', 'close', 'fflush', 'system',
	-- functions
	'function', 'return',
	-- program execution
	'next', 'nextfile', 'exit'
}
lex:set_word_list(lexer.KEYWORD, keywords)

local builtin_functions = {
	-- string
	'gsub', 'index', 'length', 'match', 'split', 'sprintf', 'sub', 'substr', 'tolower', 'toupper',
	-- time
	'mktime', 'strftime', 'systime',
	-- arithmetic
	'atan2', 'cos', 'exp', 'int', 'log', 'rand', 'sin', 'sqrt', 'srand'
}
lex:set_word_list(lexer.FUNCTION_BUILTIN, builtin_functions)

local builtin_variables = {
	'ARGC', 'ARGV', 'CONVFMT', 'ENVIRON', 'FILENAME', 'FNR', 'FS', 'NF', 'NR', 'OFMT', 'OFS', 'ORS',
	'RLENGTH', 'RS', 'RSTART', 'SUBSEP'
}
lex:set_word_list(lexer.VARIABLE_BUILTIN, builtin_variables)

-- Variables registered under the separate '.gawk' word list.
local gawk_builtin_variables = {
	'ARGIND', 'BINMODE', 'ERRNO', 'FIELDWIDTHS', 'FPAT', 'FUNCTAB', 'IGNORECASE', 'LINT', 'PREC',
	'PROCINFO', 'ROUNDMODE', 'RT', 'SYMTAB', 'TEXTDOMAIN'
}
lex:set_word_list(lexer.VARIABLE_BUILTIN .. '.gawk', gawk_builtin_variables)

-- Line comment prefix.
lexer.property['scintillua.comment'] = '#'

return lex