"""A lexical analyzer class for simple shell-like syntaxes."""
2
# Module and documentation by Eric S. Raymond, 21 Dec 1998
# Input stacking and error message cleanup added by ESR, March 2000
# push_source() and pop_source() made explicit by ESR, January 2001.
# Posix compliance, split(), string arguments, and
# iterator interface by Gustavo Niemeyer, April 2003.
# changes to tokenize more like Posix shells by Vinay Sajip, July 2016.
9
10import os
11import re
12import sys
13from collections import deque
14
15from io import StringIO
16
# Public names exported by a star-import of this module.
__all__ = ["shlex", "split", "quote", "join"]
18
class shlex:
    """A lexical analyzer class for simple shell-like syntaxes.

    The lexer reads characters one at a time from ``instream`` and groups
    them into tokens.  Its behaviour is controlled by public attributes that
    may be changed after construction:

    * ``commenters``, ``wordchars``, ``whitespace``, ``quotes``, ``escape``,
      ``escapedquotes`` -- strings of characters belonging to each class.
    * ``whitespace_split`` -- when true, tokens are split on whitespace only.
    * ``source`` -- when not ``None``, a token equal to it triggers inclusion
      of the file named by the following token (see ``sourcehook``).
    * ``debug`` -- verbosity of the diagnostic output written with ``print``.
    * ``lineno`` -- one plus the number of newlines consumed so far from the
      current input source.
    * ``eof`` -- the token returned at end of input: ``None`` in POSIX mode,
      ``''`` otherwise.

    Instances are iterators: iterating yields tokens until ``eof`` is read.
    """

    def __init__(self, instream=None, infile=None, posix=False,
                 punctuation_chars=False):
        """Set up the lexer.

        instream -- a string (wrapped in ``StringIO``), an object providing
            ``read()``/``readline()``, or ``None`` to read ``sys.stdin``.
        infile -- name reported for ``instream``; ignored (forced to
            ``None``) when ``instream`` is ``None``.
        posix -- select POSIX-style quote/escape processing.
        punctuation_chars -- false for none, ``True`` for ``'();<>|&'``, or
            a custom string of characters to return as punctuation runs.
        """
        if isinstance(instream, str):
            instream = StringIO(instream)
        if instream is not None:
            self.instream = instream
            self.infile = infile
        else:
            self.instream = sys.stdin
            self.infile = None
        self.posix = posix
        if posix:
            self.eof = None
        else:
            self.eof = ''
        self.commenters = '#'
        self.wordchars = ('abcdfeghijklmnopqrstuvwxyz'
                          'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_')
        if self.posix:
            self.wordchars += ('ßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ'
                               'ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ')
        self.whitespace = ' \t\r\n'
        self.whitespace_split = False
        self.quotes = '\'"'
        self.escape = '\\'
        self.escapedquotes = '"'
        # Current state of the read_token() state machine: ' ' between
        # tokens, 'a' inside a word, 'c' inside a punctuation run, a quote or
        # escape character while inside one, None once EOF has been reached.
        self.state = ' '
        self.pushback = deque()
        self.lineno = 1
        self.debug = 0
        self.token = ''
        # Stack of (infile, instream, lineno) for suspended input sources.
        self.filestack = deque()
        self.source = None
        if not punctuation_chars:
            punctuation_chars = ''
        elif punctuation_chars is True:
            punctuation_chars = '();<>|&'
        self._punctuation_chars = punctuation_chars
        if punctuation_chars:
            # _pushback_chars is a push back queue used by lookahead logic
            self._pushback_chars = deque()
            # these chars added because allowed in file names, args, wildcards
            self.wordchars += '~-./*?='
            # remove any punctuation chars from wordchars
            t = self.wordchars.maketrans(dict.fromkeys(punctuation_chars))
            self.wordchars = self.wordchars.translate(t)

    @property
    def punctuation_chars(self):
        """The punctuation characters chosen at construction (read-only)."""
        return self._punctuation_chars

    def push_token(self, tok):
        """Push a token onto the stack popped by the get_token method."""
        if self.debug >= 1:
            print("shlex: pushing token " + repr(tok))
        self.pushback.appendleft(tok)

    def push_source(self, newstream, newfile=None):
        """Push an input source onto the lexer's input source stack.

        The current (infile, instream, lineno) triple is saved and reading
        continues from *newstream* (a string is wrapped in ``StringIO``)
        with the line counter reset to 1.
        """
        if isinstance(newstream, str):
            newstream = StringIO(newstream)
        self.filestack.appendleft((self.infile, self.instream, self.lineno))
        self.infile = newfile
        self.instream = newstream
        self.lineno = 1
        if self.debug:
            if newfile is not None:
                print('shlex: pushing to file %s' % (self.infile,))
            else:
                print('shlex: pushing to stream %s' % (self.instream,))

    def pop_source(self):
        """Pop the input source stack.

        Closes the current stream, restores the most recently saved source
        and resets the state machine to the between-tokens state.
        """
        self.instream.close()
        (self.infile, self.instream, self.lineno) = self.filestack.popleft()
        if self.debug:
            print('shlex: popping to %s, line %d'
                  % (self.instream, self.lineno))
        self.state = ' '

    def get_token(self):
        """Get a token from the input stream (or from stack if it's nonempty).

        Handles ``source`` inclusions and transparently resumes the previous
        input source when a stacked one is exhausted.  Returns ``self.eof``
        once every source has been consumed.
        """
        if self.pushback:
            tok = self.pushback.popleft()
            if self.debug >= 1:
                print("shlex: popping token " + repr(tok))
            return tok
        # No pushback.  Get a token.
        raw = self.read_token()
        # Handle inclusions
        if self.source is not None:
            while raw == self.source:
                spec = self.sourcehook(self.read_token())
                if spec:
                    (newfile, newstream) = spec
                    self.push_source(newstream, newfile)
                raw = self.get_token()
        # Maybe we got EOF instead?
        while raw == self.eof:
            if not self.filestack:
                return self.eof
            else:
                self.pop_source()
                raw = self.get_token()
        # Neither inclusion nor EOF
        if self.debug >= 1:
            if raw != self.eof:
                print("shlex: token=" + repr(raw))
            else:
                print("shlex: token=EOF")
        return raw

    def _skip_comment(self):
        """Discard the rest of the current input line after a commenter."""
        # readline() swallows the terminating newline, so the per-character
        # counter in read_token() never sees it.  Count it here, but only if
        # one was really consumed: a comment that runs to end-of-input
        # without a newline must not advance lineno.
        if self.instream.readline().endswith('\n'):
            self.lineno += 1

    def read_token(self):
        """Read one raw token from the current input source.

        Runs the character-level state machine, ignoring the token pushback
        stack and the input source stack.  Returns the token text; at end of
        input returns ``''``, or ``None`` in POSIX mode when nothing quoted
        was read.

        Raises ValueError if input ends inside a quoted string or directly
        after an escape character.
        """
        quoted = False
        escapedstate = ' '
        while True:
            if self.punctuation_chars and self._pushback_chars:
                nextchar = self._pushback_chars.pop()
            else:
                nextchar = self.instream.read(1)
            if nextchar == '\n':
                self.lineno += 1
            if self.debug >= 3:
                print("shlex: in state %r I see character: %r" % (self.state,
                                                                  nextchar))
            if self.state is None:
                self.token = ''        # past end of file
                break
            elif self.state == ' ':
                if not nextchar:
                    self.state = None  # end of file
                    break
                elif nextchar in self.whitespace:
                    if self.debug >= 2:
                        print("shlex: I see whitespace in whitespace state")
                    if self.token or (self.posix and quoted):
                        break   # emit current token
                    else:
                        continue
                elif nextchar in self.commenters:
                    self._skip_comment()
                elif self.posix and nextchar in self.escape:
                    escapedstate = 'a'
                    self.state = nextchar
                elif nextchar in self.wordchars:
                    self.token = nextchar
                    self.state = 'a'
                elif nextchar in self.punctuation_chars:
                    self.token = nextchar
                    self.state = 'c'
                elif nextchar in self.quotes:
                    # Non-POSIX mode keeps the quote characters in the token.
                    if not self.posix:
                        self.token = nextchar
                    self.state = nextchar
                elif self.whitespace_split:
                    self.token = nextchar
                    self.state = 'a'
                else:
                    # Any other character is a one-character token.
                    self.token = nextchar
                    if self.token or (self.posix and quoted):
                        break   # emit current token
                    else:
                        continue
            elif self.state in self.quotes:
                quoted = True
                if not nextchar:      # end of file
                    if self.debug >= 2:
                        print("shlex: I see EOF in quotes state")
                    # XXX what error should be raised here?
                    raise ValueError("No closing quotation")
                if nextchar == self.state:
                    if not self.posix:
                        self.token += nextchar
                        self.state = ' '
                        break
                    else:
                        # POSIX: the closing quote ends the quoted part but
                        # the word may continue.
                        self.state = 'a'
                elif (self.posix and nextchar in self.escape and self.state
                      in self.escapedquotes):
                    escapedstate = self.state
                    self.state = nextchar
                else:
                    self.token += nextchar
            elif self.state in self.escape:
                if not nextchar:      # end of file
                    if self.debug >= 2:
                        print("shlex: I see EOF in escape state")
                    # XXX what error should be raised here?
                    raise ValueError("No escaped character")
                # In posix shells, only the quote itself or the escape
                # character may be escaped within quotes.
                if (escapedstate in self.quotes and
                        nextchar != self.state and nextchar != escapedstate):
                    self.token += self.state
                self.token += nextchar
                self.state = escapedstate
            elif self.state in ('a', 'c'):
                if not nextchar:
                    self.state = None   # end of file
                    break
                elif nextchar in self.whitespace:
                    if self.debug >= 2:
                        print("shlex: I see whitespace in word state")
                    self.state = ' '
                    if self.token or (self.posix and quoted):
                        break   # emit current token
                    else:
                        continue
                elif nextchar in self.commenters:
                    self._skip_comment()
                    if self.posix:
                        self.state = ' '
                        if self.token or (self.posix and quoted):
                            break   # emit current token
                        else:
                            continue
                elif self.state == 'c':
                    if nextchar in self.punctuation_chars:
                        self.token += nextchar
                    else:
                        # End of the punctuation run: keep the lookahead
                        # character for the next token.
                        if nextchar not in self.whitespace:
                            self._pushback_chars.append(nextchar)
                        self.state = ' '
                        break
                elif self.posix and nextchar in self.quotes:
                    self.state = nextchar
                elif self.posix and nextchar in self.escape:
                    escapedstate = 'a'
                    self.state = nextchar
                elif (nextchar in self.wordchars or nextchar in self.quotes
                      or (self.whitespace_split and
                          nextchar not in self.punctuation_chars)):
                    self.token += nextchar
                else:
                    # The character ends the word; save it for the next call.
                    if self.punctuation_chars:
                        self._pushback_chars.append(nextchar)
                    else:
                        self.pushback.appendleft(nextchar)
                    if self.debug >= 2:
                        print("shlex: I see punctuation in word state")
                    self.state = ' '
                    if self.token or (self.posix and quoted):
                        break   # emit current token
                    else:
                        continue
        result = self.token
        self.token = ''
        # In POSIX mode an empty unquoted result means EOF, whereas an empty
        # quoted string ('' or "") is a real, empty token.
        if self.posix and not quoted and result == '':
            result = None
        if self.debug > 1:
            if result:
                print("shlex: raw token=" + repr(result))
            else:
                print("shlex: raw token=EOF")
        return result

    def sourcehook(self, newfile):
        """Hook called on a filename to be sourced.

        Strips one pair of surrounding double quotes, resolves a relative
        name against the directory of the current ``infile`` (when that is
        a string) and returns ``(filename, open_file_object)``.
        """
        if newfile[0] == '"':
            newfile = newfile[1:-1]
        # This implements cpp-like semantics for relative-path inclusion.
        if isinstance(self.infile, str) and not os.path.isabs(newfile):
            newfile = os.path.join(os.path.dirname(self.infile), newfile)
        return (newfile, open(newfile, "r"))

    def error_leader(self, infile=None, lineno=None):
        """Emit a C-compiler-like, Emacs-friendly error-message leader.

        *infile* and *lineno* default to the lexer's current values.
        """
        if infile is None:
            infile = self.infile
        if lineno is None:
            lineno = self.lineno
        return "\"%s\", line %d: " % (infile, lineno)

    def __iter__(self):
        """Return the lexer itself; it is its own iterator."""
        return self

    def __next__(self):
        """Return the next token, raising StopIteration at ``eof``."""
        token = self.get_token()
        if token == self.eof:
            raise StopIteration
        return token
304
def split(s, comments=False, posix=True):
    """Break the string *s* into a list of words using shell-like rules.

    A lexer is created over *s* with whitespace-only splitting enabled.
    Comment recognition is switched off unless *comments* is true, and
    *posix* selects the lexer's POSIX mode.  Passing ``None`` for *s* is
    deprecated and emits a DeprecationWarning.
    """
    if s is None:
        import warnings
        message = "Passing None for 's' to shlex.split() is deprecated."
        warnings.warn(message, DeprecationWarning, stacklevel=2)

    lexer = shlex(s, posix=posix)
    lexer.whitespace_split = True
    if not comments:
        # No character starts a comment.
        lexer.commenters = ''
    words = list(lexer)
    return words
316
317
def join(split_command):
    """Build one shell-escaped command string from *split_command*.

    Each element is passed through quote() and the results are joined
    with single spaces.
    """
    quoted_args = map(quote, split_command)
    return ' '.join(quoted_args)
321
322
# Finds the first character that is neither an ASCII word character nor
# one of @ % + = : , . / -
_find_unsafe = re.compile(r'[^\w@%+=:,./-]', re.ASCII).search


def quote(s):
    """Return *s* escaped so that it forms a single shell word.

    The empty string becomes ``''``; a string made only of safe characters
    is returned unchanged; anything else is wrapped in single quotes.
    """
    if s:
        if _find_unsafe(s) is not None:
            # A single quote cannot appear inside single quotes, so close the
            # quoting, emit the quote inside double quotes, and reopen:
            # the string $'b is then quoted as '$'"'"'b'
            escaped = s.replace("'", "'\"'\"'")
            return "'" + escaped + "'"
        return s
    return "''"
335
336
def _print_tokens(lexer):
    """Print each token from *lexer* until it returns a falsy token."""
    token = lexer.get_token()
    while token:
        print("Token: " + repr(token))
        token = lexer.get_token()
343
if __name__ == '__main__':
    # Run as a script: with no arguments tokenize the default input of
    # shlex(); otherwise tokenize the file named by the first argument.
    if len(sys.argv) != 1:
        fn = sys.argv[1]
        with open(fn) as f:
            _print_tokens(shlex(f, fn))
    else:
        _print_tokens(shlex())
350 _print_tokens(shlex(f, fn))