blob: ea897c3032257b290724c21c2e4c20542175b42c [file] [log] [blame]
Olivier Deprezf4ef2d02021-04-20 13:36:24 +02001"""Parse (absolute and relative) URLs.
2
3urlparse module is based upon the following RFC specifications.
4
5RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
6and L. Masinter, January 2005.
7
8RFC 2732: "Format for Literal IPv6 Addresses in URL's" by R. Hinden, B. Carpenter
9and L. Masinter, December 1999.
10
11RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
12Berners-Lee, R. Fielding, and L. Masinter, August 1998.
13
14RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zawinski, July 1998.
15
16RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
171995.
18
19RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
20McCahill, December 1994
21
22RFC 3986 is considered the current standard and any future changes to
23urlparse module should conform with it. The urlparse module is
24currently not entirely compliant with this RFC due to de facto
25scenarios for parsing, and for backward compatibility purposes, some
26parsing quirks from older RFCs are retained. The testcases in
27test_urlparse.py provide a good indicator of parsing behavior.
28"""
29
30import re
31import sys
32import types
33import collections
34import warnings
35
# Public names exported by ``from <module> import *``: the parsing/unparsing
# functions, the quoting helpers, and the structured result classes.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
           "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes",
           "DefragResult", "ParseResult", "SplitResult",
           "DefragResultBytes", "ParseResultBytes", "SplitResultBytes"]
42
# A classification of schemes.
# The empty string classifies URLs with no scheme specified,
# being the default value returned by "urlsplit" and "urlparse".

# Schemes for which urljoin() resolves a relative reference against the base;
# for any other scheme urljoin() returns the second URL unchanged.
uses_relative = ['', 'ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', 'sftp',
                 'svn', 'svn+ssh', 'ws', 'wss']

# Schemes that take a "//netloc" part; consulted by urlunsplit() and urljoin().
uses_netloc = ['', 'ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
               'ws', 'wss']

# Schemes for which urlparse() splits ";params" off the path.
uses_params = ['', 'ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', 'sftp', 'tel']

# These are not actually used anymore, but should stay for backwards
# compatibility.  (They are undocumented, but have a public-looking name.)

non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']

uses_query = ['', 'http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips']

uses_fragment = ['', 'ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero']

# Characters valid in scheme names; urlsplit() only accepts a "scheme:"
# prefix when every character before the colon is in this set.
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')
80
# XXX: Consider replacing with functools.lru_cache
# urlsplit() memoizes its results in _parse_cache and empties the whole
# cache (via clear_cache()) once it holds MAX_CACHE_SIZE entries.
MAX_CACHE_SIZE = 20
_parse_cache = {}
84
def clear_cache():
    """Clear the parse cache and the quoters cache.

    Both module-level dictionaries (``_parse_cache`` and ``_safe_quoters``)
    are emptied in place.
    """
    _parse_cache.clear()
    _safe_quoters.clear()
89
90
# Helpers for bytes handling
# For 3.2, we deliberately require applications that
# handle improperly quoted URLs to do their own
# decoding and encoding. If valid use cases are
# presented, we may relax this by using latin-1
# decoding internally for 3.3
# These two values are the default codec and error handler of
# _encode_result() and _decode_args().
_implicit_encoding = 'ascii'
_implicit_errors = 'strict'
99
def _noop(obj):
    """Return *obj* unchanged.

    _coerce_args() hands this back as the result-coercion function when the
    inputs were already str, so no re-encoding is needed.
    """
    return obj
102
def _encode_result(obj, encoding=_implicit_encoding,
                   errors=_implicit_errors):
    """Encode *obj* back to its bytes form.

    Used as the result-coercion function returned by _coerce_args() when the
    caller supplied non-str arguments.  *obj* only needs an ``encode()``
    method taking ``(encoding, errors)``.
    """
    encoded = obj.encode(encoding, errors)
    return encoded
106
def _decode_args(args, encoding=_implicit_encoding,
                 errors=_implicit_errors):
    """Decode every item of *args* and return the results as a tuple.

    Falsy items (``None``, ``b''``, ``''``) are not decoded; they are
    replaced by the empty string.
    """
    decoded = []
    for item in args:
        decoded.append(item.decode(encoding, errors) if item else '')
    return tuple(decoded)
110
def _coerce_args(*args):
    """Normalize the arguments to str and say how to convert results back.

    Returns the (possibly decoded) arguments followed by one extra element:
    a result-coercion function.  That function is ``_noop`` when the first
    argument is a str and ``_encode_result`` otherwise.

    Raises TypeError if a non-empty later argument disagrees with the first
    argument about being a str.
    """
    first_is_str = isinstance(args[0], str)
    for other in args[1:]:
        # Falsy values are exempt so that defaults such as "scheme=''"
        # can be combined with bytes input.
        if not other:
            continue
        if isinstance(other, str) != first_is_str:
            raise TypeError("Cannot mix str and non-str arguments")
    if not first_is_str:
        return _decode_args(args) + (_encode_result,)
    return args + (_noop,)
126
127# Result objects are more helpful than simple tuples
class _ResultMixinStr(object):
    """Standard approach to encoding parsed results from str to bytes"""
    __slots__ = ()

    def encode(self, encoding='ascii', errors='strict'):
        """Encode every field and build the bytes counterpart of this result.

        ``_encoded_counterpart`` is expected to be provided on the concrete
        class; it is called with the encoded fields as positional arguments.
        """
        encoded_fields = [field.encode(encoding, errors) for field in self]
        return self._encoded_counterpart(*encoded_fields)
134
135
class _ResultMixinBytes(object):
    """Standard approach to decoding parsed results from bytes to str"""
    __slots__ = ()

    def decode(self, encoding='ascii', errors='strict'):
        """Decode every field and build the str counterpart of this result.

        ``_decoded_counterpart`` is expected to be provided on the concrete
        class; it is called with the decoded fields as positional arguments.
        """
        decoded_fields = [field.decode(encoding, errors) for field in self]
        return self._decoded_counterpart(*decoded_fields)
142
143
class _NetlocResultMixinBase(object):
    """Shared methods for the parsed result objects containing a netloc element

    The properties here read ``self._userinfo`` (a ``(username, password)``
    pair) and ``self._hostinfo`` (a ``(hostname, port)`` pair); both are
    expected to be supplied by the str/bytes specific subclasses.
    """
    __slots__ = ()

    @property
    def username(self):
        """First element of ``_userinfo``."""
        return self._userinfo[0]

    @property
    def password(self):
        """Second element of ``_userinfo``."""
        return self._userinfo[1]

    @property
    def hostname(self):
        """Lower-cased host name, or None when the host part is empty."""
        hostname = self._hostinfo[0]
        if not hostname:
            return None
        # Scoped IPv6 address may have zone info, which must not be lowercased
        # like http://[fe80::822a:a8ff:fe49:470c%tESt]:1234/keys
        separator = '%' if isinstance(hostname, str) else b'%'
        hostname, percent, zone = hostname.partition(separator)
        return hostname.lower() + percent + zone

    @property
    def port(self):
        """Port number as an int, or None when no port is present.

        Raises ValueError if the port is not made up solely of ASCII digits
        or lies outside 0-65535.
        """
        port = self._hostinfo[1]
        if port is None:
            return None
        # int() alone is too lenient: it accepts a sign, surrounding
        # whitespace, underscores and non-ASCII digits, none of which are
        # valid in a URL port (RFC 3986: port = *DIGIT).
        if not (port.isascii() and port.isdigit()):
            raise ValueError(
                f'Port could not be cast to integer value as {port!r}')
        port = int(port)
        if not (0 <= port <= 65535):
            raise ValueError("Port out of range 0-65535")
        return port

    __class_getitem__ = classmethod(types.GenericAlias)
181
182
class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    """str flavour of the netloc helpers: splits ``self.netloc`` on demand."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return ``(username, password)`` taken from before the last '@'.

        Both are None when the netloc has no '@'; the password is None when
        the user info has no ':'.
        """
        credentials, at_sign, _ = self.netloc.rpartition('@')
        if not at_sign:
            return None, None
        user, colon, secret = credentials.partition(':')
        return user, (secret if colon else None)

    @property
    def _hostinfo(self):
        """Return ``(hostname, port)`` taken from after the last '@'.

        A bracketed host (``[...]``) is returned without the brackets.  The
        port is the raw text after ':' or None when that text is empty.
        """
        host_and_port = self.netloc.rpartition('@')[2]
        _, open_bracket, after_bracket = host_and_port.partition('[')
        if open_bracket:
            # Bracketed literal: the port, if any, follows the closing ']'.
            host, _, trailer = after_bracket.partition(']')
            port = trailer.partition(':')[2]
        else:
            host, _, port = host_and_port.partition(':')
        return host, (port or None)
211
212
class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    """bytes flavour of the netloc helpers: splits ``self.netloc`` on demand."""
    __slots__ = ()

    @property
    def _userinfo(self):
        """Return ``(username, password)`` taken from before the last b'@'.

        Both are None when the netloc has no b'@'; the password is None when
        the user info has no b':'.
        """
        credentials, at_sign, _ = self.netloc.rpartition(b'@')
        if not at_sign:
            return None, None
        user, colon, secret = credentials.partition(b':')
        return user, (secret if colon else None)

    @property
    def _hostinfo(self):
        """Return ``(hostname, port)`` taken from after the last b'@'.

        A bracketed host (``[...]``) is returned without the brackets.  The
        port is the raw bytes after b':' or None when they are empty.
        """
        host_and_port = self.netloc.rpartition(b'@')[2]
        _, open_bracket, after_bracket = host_and_port.partition(b'[')
        if open_bracket:
            # Bracketed literal: the port, if any, follows the closing b']'.
            host, _, trailer = after_bracket.partition(b']')
            port = trailer.partition(b':')[2]
        else:
            host, _, port = host_and_port.partition(b':')
        return host, (port or None)
241
242
243from collections import namedtuple
244
# Plain namedtuple bases for the structured results.  The public result
# classes defined further down combine each base with a str or bytes mixin.
_DefragResultBase = namedtuple('DefragResult', 'url fragment')
_SplitResultBase = namedtuple(
    'SplitResult', 'scheme netloc path query fragment')
_ParseResultBase = namedtuple(
    'ParseResult', 'scheme netloc path params query fragment')

# The generated namedtuple classes and their field descriptors get explicit
# docstrings below.
_DefragResultBase.__doc__ = """
DefragResult(url, fragment)

A 2-tuple that contains the url without fragment identifier and the fragment
identifier as a separate argument.
"""

_DefragResultBase.url.__doc__ = """The URL with no fragment identifier."""

_DefragResultBase.fragment.__doc__ = """
Fragment identifier separated from URL, that allows indirect identification of a
secondary resource by reference to a primary resource and additional identifying
information.
"""

_SplitResultBase.__doc__ = """
SplitResult(scheme, netloc, path, query, fragment)

A 5-tuple that contains the different components of a URL. Similar to
ParseResult, but does not split params.
"""

_SplitResultBase.scheme.__doc__ = """Specifies URL scheme for the request."""

_SplitResultBase.netloc.__doc__ = """
Network location where the request is made to.
"""

_SplitResultBase.path.__doc__ = """
The hierarchical path, such as the path to a file to download.
"""

_SplitResultBase.query.__doc__ = """
The query component, that contains non-hierarchical data, that along with data
in path component, identifies a resource in the scope of URI's scheme and
network location.
"""

_SplitResultBase.fragment.__doc__ = """
Fragment identifier, that allows indirect identification of a secondary resource
by reference to a primary resource and additional identifying information.
"""

_ParseResultBase.__doc__ = """
ParseResult(scheme, netloc, path, params, query, fragment)

A 6-tuple that contains components of a parsed URL.
"""

# Fields shared with SplitResult reuse its field documentation.
_ParseResultBase.scheme.__doc__ = _SplitResultBase.scheme.__doc__
_ParseResultBase.netloc.__doc__ = _SplitResultBase.netloc.__doc__
_ParseResultBase.path.__doc__ = _SplitResultBase.path.__doc__
_ParseResultBase.params.__doc__ = """
Parameters for last path element used to dereference the URI in order to provide
access to perform some operation on the resource.
"""

_ParseResultBase.query.__doc__ = _SplitResultBase.query.__doc__
_ParseResultBase.fragment.__doc__ = _SplitResultBase.fragment.__doc__


# For backwards compatibility, alias _NetlocResultMixinStr
# ResultBase is no longer part of the documented API, but it is
# retained since deprecating it isn't worth the hassle
ResultBase = _NetlocResultMixinStr
316
317# Structured result objects for string data
# Structured result of urldefrag() for str data.
class DefragResult(_DefragResultBase, _ResultMixinStr):
    __slots__ = ()

    def geturl(self):
        """Return the URL with '#fragment' re-attached when a fragment is set."""
        if not self.fragment:
            return self.url
        return self.url + '#' + self.fragment
325
# Structured result built by urlsplit(); the netloc sub-component
# properties come from _NetlocResultMixinStr.
class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    __slots__ = ()
    def geturl(self):
        """Reassemble the five components into a URL using urlunsplit()."""
        return urlunsplit(self)
330
# Structured result built by urlparse(); the netloc sub-component
# properties come from _NetlocResultMixinStr.
class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    __slots__ = ()
    def geturl(self):
        """Reassemble the six components into a URL using urlunparse()."""
        return urlunparse(self)
335
336# Structured result objects for bytes data
# Bytes counterpart of DefragResult.
class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    __slots__ = ()

    def geturl(self):
        """Return the URL with b'#fragment' re-attached when a fragment is set."""
        if not self.fragment:
            return self.url
        return self.url + b'#' + self.fragment
344
# Bytes counterpart of SplitResult; the netloc sub-component properties
# come from _NetlocResultMixinBytes.
class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    __slots__ = ()
    def geturl(self):
        """Reassemble the five components into a URL using urlunsplit()."""
        return urlunsplit(self)
349
# Bytes counterpart of ParseResult; the netloc sub-component properties
# come from _NetlocResultMixinBytes.
class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    __slots__ = ()
    def geturl(self):
        """Reassemble the six components into a URL using urlunparse()."""
        return urlunparse(self)
354
355# Set up the encode/decode result pairs
def _fix_result_transcoding():
    """Link each str result class with its bytes twin, in both directions.

    Sets ``_encoded_counterpart`` on the str classes and
    ``_decoded_counterpart`` on the bytes classes; these are the attributes
    the ``encode()``/``decode()`` mixin methods call.
    """
    str_classes = (DefragResult, SplitResult, ParseResult)
    bytes_classes = (DefragResultBytes, SplitResultBytes, ParseResultBytes)
    for str_cls, bytes_cls in zip(str_classes, bytes_classes):
        str_cls._encoded_counterpart = bytes_cls
        bytes_cls._decoded_counterpart = str_cls

# Run once at import time, then drop the helper from the module namespace.
_fix_result_transcoding()
del _fix_result_transcoding
368
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    The result is a named 6-tuple whose fields match the layout above: a
    ParseResult for str input, or a ParseResultBytes otherwise.

    The username, password, hostname, and port sub-components of netloc
    are available as attributes of the returned object.

    scheme is the default used for the scheme component when url does not
    contain one.

    With allow_fragments false, the fragment is not separated from the
    component that precedes it (the path or the query).

    Note that % escapes are not expanded.
    """
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    params = ''
    # Only schemes listed in uses_params get ";params" split off the path.
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    parsed = ParseResult(scheme, netloc, path, params, query, fragment)
    return _coerce_result(parsed)
398
def _splitparams(url):
    """Split *url* at a ';' into ``(path, params)``.

    When *url* contains a '/', only a ';' at or after the last '/' counts,
    and ``(url, '')`` is returned if there is none.  Without any '/', the
    split happens at the first ';' (callers are expected to pass a string
    that contains one).
    """
    last_slash = url.rfind('/')
    if last_slash >= 0:
        semicolon = url.find(';', last_slash)
        if semicolon < 0:
            return url, ''
    else:
        semicolon = url.find(';')
    return url[:semicolon], url[semicolon + 1:]
407
def _splitnetloc(url, start=0):
    """Return ``(netloc, rest)`` for *url*, scanning from index *start*.

    The netloc runs from *start* up to the earliest '/', '?' or '#' found
    at or after *start*, or to the end of *url* when none is present.
    *rest* begins at that delimiter.
    """
    candidates = (url.find(delimiter, start) for delimiter in '/?#')
    found = [position for position in candidates if position >= 0]
    end = min(found, default=len(url))
    return url[start:end], url[end:]
415
def _checknetloc(netloc):
    """Reject a netloc that gains URL delimiters under NFKC normalization.

    Empty and pure-ASCII netlocs are accepted without further checks.
    Otherwise the netloc, with '@', ':', '#' and '?' removed, is NFKC
    normalized; if that changes the text and the normalized form contains
    any of '/', '?', '#', '@' or ':', ValueError is raised.
    """
    if not netloc or netloc.isascii():
        return
    # looking for characters like \u2100 that expand to 'a/c'
    # IDNA uses NFKC equivalence, so normalize for this check
    import unicodedata
    # Delimiters already present are ignored; the surrounding text is not.
    stripped = netloc
    for delimiter in '@:#?':
        stripped = stripped.replace(delimiter, '')
    normalized = unicodedata.normalize('NFKC', stripped)
    if stripped == normalized:
        return
    if any(delimiter in normalized for delimiter in '/?#@:'):
        raise ValueError("netloc '" + netloc + "' contains invalid " +
                         "characters under NFKC normalization")
433
# The C0 control characters (U+0000 - U+001F) plus space.  The WHATWG URL
# specification strips these from the ends of its input.
_WHATWG_C0_CONTROL_OR_SPACE = ''.join(map(chr, range(0x21)))

# ASCII tab and newline characters; the WHATWG URL specification removes
# these wherever they occur in the input.
_UNSAFE_URL_BYTES_TO_REMOVE = ('\t', '\r', '\n')


def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>

    The result is a named 5-tuple with fields corresponding to the
    above. It is either a SplitResult or SplitResultBytes object,
    depending on the type of the url parameter.

    The username, password, hostname, and port sub-components of netloc
    can also be accessed as attributes of the returned object.

    The scheme argument provides the default value of the scheme
    component when no scheme is found in url.

    If allow_fragments is False, no attempt is made to separate the
    fragment component from the previous component, which can be either
    path or query.

    Leading C0 control characters and spaces are stripped from url (and
    from both ends of scheme), and ASCII tab, carriage return and line feed
    characters are removed from anywhere in url and scheme before parsing.

    Raises ValueError for a netloc with unbalanced '[' / ']' or one that
    _checknetloc() rejects.

    Note that % escapes are not expanded.
    """

    url, scheme, _coerce_result = _coerce_args(url, scheme)
    # Sanitize before parsing (and before computing the cache key): leading
    # blanks or embedded newlines would otherwise hide the real scheme or
    # host from callers that filter URLs on the parsed components.
    url = url.lstrip(_WHATWG_C0_CONTROL_OR_SPACE)
    if scheme:
        # A falsy scheme (e.g. None or b'' alongside a str url) is passed
        # through untouched, as before.
        scheme = scheme.strip(_WHATWG_C0_CONTROL_OR_SPACE)
    for unsafe in _UNSAFE_URL_BYTES_TO_REMOVE:
        url = url.replace(unsafe, '')
        if scheme:
            scheme = scheme.replace(unsafe, '')

    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return _coerce_result(cached)
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        # Accept "xyz:" as a scheme only if every character is a legal
        # scheme character; otherwise the default scheme stays in effect.
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i+1:]

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        if (('[' in netloc and ']' not in netloc) or
                (']' in netloc and '[' not in netloc)):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    _checknetloc(netloc)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return _coerce_result(v)
485
def urlunparse(components):
    """Reassemble a URL from the six components produced by urlparse().

    The output can differ slightly from the URL that was parsed while
    remaining equivalent, for instance when the original carried a
    redundant delimiter such as a '?' followed by an empty query.
    """
    *parts, _coerce_result = _coerce_args(*components)
    scheme, netloc, url, params, query, fragment = parts
    if params:
        # Fold the params back into the path before delegating.
        url = "%s;%s" % (url, params)
    joined = urlunsplit((scheme, netloc, url, query, fragment))
    return _coerce_result(joined)
496
def urlunsplit(components):
    """Reassemble a URL from the five components produced by urlsplit().

    components may be any five-item iterable.  The output can differ
    slightly from the URL that was parsed while remaining equivalent, for
    instance when the original carried an unnecessary delimiter such as a
    '?' followed by an empty query.
    """
    *parts, _coerce_result = _coerce_args(*components)
    scheme, netloc, url, query, fragment = parts
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        # A netloc is being emitted, so a non-empty path must be rooted.
        if url and not url.startswith('/'):
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url += '?' + query
    if fragment:
        url += '#' + fragment
    return _coerce_result(url)
515
def urljoin(base, url, allow_fragments=True):
    """Resolve *url*, which may be relative, against *base*.

    Returns *url* when *base* is empty and *base* when *url* is empty.
    *url* is also returned as is when its scheme differs from the base
    scheme or is not listed in uses_relative.
    """
    if not base:
        return url
    if not url:
        return base

    base, url, _coerce_result = _coerce_args(base, url)
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = urlparse(
        base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = urlparse(
        url, bscheme, allow_fragments)

    if scheme != bscheme or scheme not in uses_relative:
        return _coerce_result(url)
    if scheme in uses_netloc:
        if netloc:
            # The reference names its own authority: nothing from the base
            # path applies.
            return _coerce_result(urlunparse((scheme, netloc, path,
                                              params, query, fragment)))
        netloc = bnetloc

    if not path and not params:
        # Empty path: inherit the base path/params, and the base query too
        # unless the reference supplies one.
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return _coerce_result(urlunparse((scheme, netloc, path,
                                          params, query, fragment)))

    base_parts = bpath.split('/')
    if base_parts[-1]:
        # the last item is not a directory, so will not be taken into account
        # in resolving the relative path
        base_parts.pop()

    # for rfc3986, ignore all base path should the first character be root.
    if path[:1] == '/':
        segments = path.split('/')
    else:
        segments = base_parts + path.split('/')
        # filter out elements that would cause redundant slashes on re-joining
        # the resolved_path
        segments[1:-1] = filter(None, segments[1:-1])

    resolved_path = []
    for segment in segments:
        if segment == '..':
            # A '..' with nothing left to remove is ignored, as for rfc3986.
            if resolved_path:
                resolved_path.pop()
        elif segment != '.':
            resolved_path.append(segment)

    if segments[-1] in ('.', '..'):
        # do some post-processing here. if the last segment was a relative dir,
        # then we need to append the trailing '/'
        resolved_path.append('')

    joined_path = '/'.join(resolved_path) or '/'
    return _coerce_result(urlunparse((scheme, netloc, joined_path,
                                      params, query, fragment)))
583
584
def urldefrag(url):
    """Split any fragment off *url*.

    Returns a DefragResult (or its bytes counterpart for non-str input)
    holding the URL without its fragment and the fragment itself; the
    fragment is the empty string when the URL has none.
    """
    url, _coerce_result = _coerce_args(url)
    if '#' not in url:
        return _coerce_result(DefragResult(url, ''))
    scheme, netloc, path, params, query, frag = urlparse(url)
    defrag = urlunparse((scheme, netloc, path, params, query, ''))
    return _coerce_result(DefragResult(defrag, frag))
600
# Hex digits in both cases, and the lazily built lookup table mapping each
# two-hex-digit bytes key to the single byte it denotes.  The table is
# filled in on the first unquote_to_bytes() call that needs it.
_hexdig = '0123456789ABCDEFabcdef'
_hextobyte = None
603
def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'."""
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if not string:
        # Attribute probe: fails for falsy objects that are not string-like.
        string.split
        return b''
    if isinstance(string, str):
        string = string.encode('utf-8')
    head, *escaped = string.split(b'%')
    if not escaped:
        # No '%' at all: nothing to decode.
        return string
    # Delay the initialization of the table to not waste memory
    # if the function is never called
    global _hextobyte
    if _hextobyte is None:
        _hextobyte = {(a + b).encode(): bytes.fromhex(a + b)
                      for a in _hexdig for b in _hexdig}
    pieces = [head]
    for chunk in escaped:
        byte = _hextobyte.get(chunk[:2])
        if byte is None:
            # Not a valid %xx escape: keep the text verbatim.
            pieces.append(b'%')
            pieces.append(chunk)
        else:
            pieces.append(byte)
            pieces.append(chunk[2:])
    return b''.join(pieces)
633
# Matches a run of ASCII characters.  The capturing group makes
# _asciire.split() return those runs interleaved with the non-ASCII text
# between them, which is how unquote() consumes it.
_asciire = re.compile('([\x00-\x7f]+)')
635
def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by the characters they encode.

    encoding and errors control how the percent-encoded byte sequences are
    turned into text and are handed to bytes.decode().  Passing None for
    either selects the default when a str has to be decoded: UTF-8, with
    invalid sequences replaced by a placeholder character.

    unquote('abc%20def') -> 'abc def'.
    """
    if isinstance(string, bytes):
        return unquote_to_bytes(string).decode(encoding, errors)
    if '%' not in string:
        # Attribute probe: fails for objects that are not string-like.
        string.split
        return string
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    # Odd positions hold ASCII runs (which may contain escapes); even
    # positions hold the non-ASCII text in between, copied through as is.
    bits = _asciire.split(string)
    pieces = [bits[0]]
    for ascii_run, passthrough in zip(bits[1::2], bits[2::2]):
        pieces.append(unquote_to_bytes(ascii_run).decode(encoding, errors))
        pieces.append(passthrough)
    return ''.join(pieces)
662
663
def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
             encoding='utf-8', errors='replace', max_num_fields=None,
             separator=None):
    """Parse a query given as a string argument.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings.
            A true value indicates that blanks should be retained as
            blank strings.  The default false value indicates that
            blank values are to be ignored and treated as if they were
            not included.

        strict_parsing: flag indicating what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.

        encoding and errors: specify how to decode percent-encoded sequences
            into Unicode characters, as accepted by the bytes.decode() method.

        max_num_fields: int. If set, then throws a ValueError if there
            are more than n fields read by parse_qsl().

        separator: str or bytes. The single separator between fields.
            The default, None, keeps the historical behaviour of
            splitting on both '&' and ';'.

        Returns a dictionary mapping each name to the list of its values,
        in order of appearance.
    """
    parsed_result = {}
    pairs = parse_qsl(qs, keep_blank_values, strict_parsing,
                      encoding=encoding, errors=errors,
                      max_num_fields=max_num_fields, separator=separator)
    for name, value in pairs:
        parsed_result.setdefault(name, []).append(value)
    return parsed_result


def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
              encoding='utf-8', errors='replace', max_num_fields=None,
              separator=None):
    """Parse a query given as a string argument.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings.
            A true value indicates that blanks should be retained as blank
            strings.  The default false value indicates that blank values
            are to be ignored and treated as if they were not included.

        strict_parsing: flag indicating what to do with parsing errors. If
            false (the default), errors are silently ignored. If true,
            errors raise a ValueError exception.

        encoding and errors: specify how to decode percent-encoded sequences
            into Unicode characters, as accepted by the bytes.decode() method.

        max_num_fields: int. If set, then throws a ValueError
            if there are more than n fields read by parse_qsl().

        separator: str or bytes. The single separator between fields.
            The default, None, keeps the historical behaviour of
            splitting on both '&' and ';'.  Treating two characters as
            separators can disagree with how other software splits the
            same query, so passing an explicit separator (usually '&')
            is recommended.

        Raises ValueError if separator is given but empty or not a
        str/bytes object.

        Returns a list of (name, value) pairs.
    """
    qs, _coerce_result = _coerce_args(qs)

    if separator is not None:
        if not separator or not isinstance(separator, (str, bytes)):
            raise ValueError("Separator must be of type string or bytes.")
        # Coerced independently of qs so a str separator may be used with
        # a bytes query and vice versa.
        separator, _ = _coerce_args(separator)

    # If max_num_fields is defined then check that the number of fields
    # is less than max_num_fields. This prevents a memory exhaustion DOS
    # attack via post bodies with many fields.
    if max_num_fields is not None:
        if separator is None:
            num_fields = 1 + qs.count('&') + qs.count(';')
        else:
            num_fields = 1 + qs.count(separator)
        if max_num_fields < num_fields:
            raise ValueError('Max number of fields exceeded')

    if separator is None:
        # Legacy behaviour: both '&' and ';' separate fields.
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    else:
        pairs = qs.split(separator)
    r = []
    for name_value in pairs:
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (name_value,))
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            name = nv[0].replace('+', ' ')
            name = unquote(name, encoding=encoding, errors=errors)
            name = _coerce_result(name)
            value = nv[1].replace('+', ' ')
            value = unquote(value, encoding=encoding, errors=errors)
            value = _coerce_result(value)
            r.append((name, value))
    return r
762
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but plus signs are turned into spaces first, as
    needed when unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    with_spaces = string.replace('+', ' ')
    return unquote(with_spaces, encoding, errors)
771
# Byte values that are never percent-encoded: ASCII letters, digits and
# '_', '.', '-', '~'.  Iterating a bytes literal yields ints, so this is a
# frozenset of ints.
_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                         b'abcdefghijklmnopqrstuvwxyz'
                         b'0123456789'
                         b'_.-~')
# The same values as a bytes object, for use with bytes.rstrip().
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
# Cache of Quoter lookup callables keyed by the normalized 'safe' bytes;
# filled by quote_from_bytes() and emptied by clear_cache().
_safe_quoters = {}
778
class Quoter(collections.defaultdict):
    """A mapping from bytes (in range(0,256)) to strings.

    String values are percent-encoded byte values, unless the key < 128, and
    in the "safe" set (either the specified safe set, or default set).
    """
    # The mapping doubles as its own cache: __missing__ computes and stores
    # an entry on first access, so later lookups of the same key are plain
    # dictionary hits.
    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(safe)

    def __repr__(self):
        # Show the cached entries instead of the default defaultdict repr.
        return "<%s %r>" % (self.__class__.__name__, dict(self))

    def __missing__(self, b):
        # Cache miss: work out the quoted form, remember it, return it.
        if b in self.safe:
            quoted = chr(b)
        else:
            quoted = '%{:02X}'.format(b)
        self[b] = quoted
        return quoted
800
def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL (the path info, the query, and so on) has its own
    set of reserved characters that must be quoted.  This function offers a
    cautious (not minimal) way to quote a string for most of those parts.

    RFC 3986 Uniform Resource Identifier (URI): Generic Syntax lists
    the following (un)reserved characters.

    unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
    reserved      = gen-delims / sub-delims
    gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
    sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
                  / "*" / "+" / "," / ";" / "="

    Every reserved character is reserved in some component of a URL, but
    not necessarily in all of them.

    All characters are %-escaped except the unreserved ("always safe")
    ones and those named in the safe argument.  safe defaults to '/':
    the slash is reserved, but quote() is typically applied to a path
    whose existing slashes should be kept.

    Python 3.7 updates from using RFC 2396 to RFC 3986 to quote URL strings.
    Now, "~" is included in the set of unreserved characters.

    string and safe may each be a str or a bytes object.  encoding and
    errors must be left unset when string is a bytes object; TypeError is
    raised otherwise.

    For a str, encoding and errors say how to turn it into bytes, as
    accepted by str.encode().  The defaults are encoding='utf-8' and
    errors='strict' (unsupported characters raise UnicodeEncodeError).
    An empty str is returned unchanged.
    """
    if not isinstance(string, str):
        # Bytes-like input: there is nothing to encode, so reject the
        # encoding-related arguments.
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
        return quote_from_bytes(string, safe)
    if not string:
        return string
    chosen_encoding = 'utf-8' if encoding is None else encoding
    chosen_errors = 'strict' if errors is None else errors
    return quote_from_bytes(string.encode(chosen_encoding, chosen_errors),
                            safe)
853 return quote_from_bytes(string, safe)
854
def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but spaces become '+', as needed when quoting HTML
    form values.  Plus signs already in the string are escaped unless safe
    includes them.  Unlike quote(), safe does not default to '/'.
    """
    # Without any space in a str/bytes input, plain quote() already gives
    # the right answer.  Inputs of other types always take the long route.
    if isinstance(string, str):
        needs_plus = ' ' in string
    elif isinstance(string, bytes):
        needs_plus = b' ' in string
    else:
        needs_plus = True
    if not needs_plus:
        return quote(string, safe, encoding, errors)
    # Keep spaces unescaped during quoting, then swap them for '+'.
    space = ' ' if isinstance(safe, str) else b' '
    quoted = quote(string, safe + space, encoding, errors)
    return quoted.replace(' ', '+')
871
def quote_from_bytes(bs, safe='/'):
    """Like quote(), but takes a bytes object instead of a str and performs
    no string-to-bytes encoding.  The result is always an ASCII string.
    quote_from_bytes(b'abc def\x3f') -> 'abc%20def%3F'

    Raises TypeError if bs is neither bytes nor bytearray.
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return ''
    # Normalize 'safe' to bytes containing only ASCII values.
    if isinstance(safe, str):
        safe = safe.encode('ascii', 'ignore')
    else:
        safe = bytes(c for c in safe if c < 128)
    # Shortcut: if stripping safe bytes leaves nothing, no quoting is needed.
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()
    quoter = _safe_quoters.get(safe)
    if quoter is None:
        quoter = _safe_quoters[safe] = Quoter(safe).__getitem__
    return ''.join(map(quoter, bs))
893
def urlencode(query, doseq=False, safe='', encoding=None, errors=None,
              quote_via=quote_plus):
    """Encode a dict or sequence of two-element tuples into a URL query string.

    With doseq true, a value that is a sequence (other than str or bytes)
    contributes one parameter per element.

    When query is a sequence of two-element tuples, the parameters appear
    in the output in the same order as in the input.

    Keys and values may each be a str or bytes object; anything else is
    converted with str() before quoting.

    safe, encoding and errors are forwarded to quote_via (encoding and
    errors only for components that are not bytes).

    Raises TypeError if query is neither a mapping nor a sequence whose
    first element is a tuple.
    """

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types will get here and succeed,
            # but that's a minor nit.  Since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError:
            tb = sys.exc_info()[2]
            raise TypeError("not a valid non-string sequence "
                            "or mapping object").with_traceback(tb)

    def quote_component(item):
        # bytes are quoted as is; everything else goes through str().
        if isinstance(item, bytes):
            return quote_via(item, safe)
        return quote_via(str(item), safe, encoding, errors)

    encoded_pairs = []
    for key, value in query:
        key = quote_component(key)
        if not doseq or isinstance(value, bytes):
            encoded_pairs.append(key + '=' + quote_component(value))
        elif isinstance(value, str):
            encoded_pairs.append(
                key + '=' + quote_via(value, safe, encoding, errors))
        else:
            try:
                # Is this a sufficient test for sequence-ness?
                len(value)
            except TypeError:
                # not a sequence
                encoded_pairs.append(key + '=' + quote_component(value))
            else:
                # one parameter per element of the sequence
                for element in value:
                    encoded_pairs.append(
                        key + '=' + quote_component(element))
    return '&'.join(encoded_pairs)
973
974
def to_bytes(url):
    """Deprecated public entry point; delegates to _to_bytes().

    Issues a DeprecationWarning (reported against the caller's frame) and
    then returns the result of _to_bytes(url).
    """
    message = "urllib.parse.to_bytes() is deprecated as of 3.8"
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _to_bytes(url)
979
980
981def _to_bytes(url):
982 """to_bytes(u"URL") --> 'URL'."""
983 # Most URL schemes require ASCII. If that changes, the conversion
984 # can be relaxed.
985 # XXX get rid of to_bytes()
986 if isinstance(url, str):
987 try:
988 url = url.encode("ASCII").decode()
989 except UnicodeError:
990 raise UnicodeError("URL " + repr(url) +
991 " contains non-ASCII characters")
992 return url
993
994
def unwrap(url):
    """Transform a string like '<URL:scheme://host/path>' into 'scheme://host/path'.

    The argument is converted with str() and stripped of surrounding
    whitespace.  A surrounding '<' ... '>' pair and a leading 'URL:' prefix
    are each removed if present (stripping whitespace again after each
    removal).  A string that is not wrapped comes back unchanged apart from
    the whitespace stripping.
    """
    text = str(url).strip()
    if text.startswith('<') and text.endswith('>'):
        text = text[1:-1].strip()
    if text.startswith('URL:'):
        text = text[4:].strip()
    return text
1006
1007
def splittype(url):
    """Deprecated public entry point; delegates to _splittype().

    Issues a DeprecationWarning (reported against the caller's frame) and
    then returns the result of _splittype(url).
    """
    message = ("urllib.parse.splittype() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splittype(url)
1013
1014
1015_typeprog = None
1016def _splittype(url):
1017 """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
1018 global _typeprog
1019 if _typeprog is None:
1020 _typeprog = re.compile('([^/:]+):(.*)', re.DOTALL)
1021
1022 match = _typeprog.match(url)
1023 if match:
1024 scheme, data = match.groups()
1025 return scheme.lower(), data
1026 return None, url
1027
1028
def splithost(url):
    """Deprecated public entry point; delegates to _splithost().

    Issues a DeprecationWarning (reported against the caller's frame) and
    then returns the result of _splithost(url).
    """
    message = ("urllib.parse.splithost() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splithost(url)
1034
1035
1036_hostprog = None
1037def _splithost(url):
1038 """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
1039 global _hostprog
1040 if _hostprog is None:
1041 _hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL)
1042
1043 match = _hostprog.match(url)
1044 if match:
1045 host_port, path = match.groups()
1046 if path and path[0] != '/':
1047 path = '/' + path
1048 return host_port, path
1049 return None, url
1050
1051
def splituser(host):
    """Deprecated public entry point; delegates to _splituser().

    Issues a DeprecationWarning (reported against the caller's frame) and
    then returns the result of _splituser(host).
    """
    message = ("urllib.parse.splituser() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splituser(host)
1057
1058
1059def _splituser(host):
1060 """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
1061 user, delim, host = host.rpartition('@')
1062 return (user if delim else None), host
1063
1064
def splitpasswd(user):
    """Deprecated public entry point; delegates to _splitpasswd().

    Issues a DeprecationWarning (reported against the caller's frame) and
    then returns the result of _splitpasswd(user).
    """
    message = ("urllib.parse.splitpasswd() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splitpasswd(user)
1070
1071
1072def _splitpasswd(user):
1073 """splitpasswd('user:passwd') -> 'user', 'passwd'."""
1074 user, delim, passwd = user.partition(':')
1075 return user, (passwd if delim else None)
1076
1077
def splitport(host):
    """Deprecated public entry point; delegates to _splitport().

    Issues a DeprecationWarning (reported against the caller's frame) and
    then returns the result of _splitport(host).
    """
    message = ("urllib.parse.splitport() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splitport(host)
1083
1084
# Cache for the compiled host:port pattern used by _splitport().
1086_portprog = None
1087def _splitport(host):
1088 """splitport('host:port') --> 'host', 'port'."""
1089 global _portprog
1090 if _portprog is None:
1091 _portprog = re.compile('(.*):([0-9]*)', re.DOTALL)
1092
1093 match = _portprog.fullmatch(host)
1094 if match:
1095 host, port = match.groups()
1096 if port:
1097 return host, port
1098 return host, None
1099
1100
def splitnport(host, defport=-1):
    """Deprecated public entry point; delegates to _splitnport().

    Issues a DeprecationWarning (reported against the caller's frame) and
    then returns the result of _splitnport(host, defport).
    """
    message = ("urllib.parse.splitnport() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splitnport(host, defport)
1106
1107
1108def _splitnport(host, defport=-1):
1109 """Split host and port, returning numeric port.
1110 Return given default port if no ':' found; defaults to -1.
1111 Return numerical port if a valid number are found after ':'.
1112 Return None if ':' but not a valid number."""
1113 host, delim, port = host.rpartition(':')
1114 if not delim:
1115 host = port
1116 elif port:
1117 try:
1118 nport = int(port)
1119 except ValueError:
1120 nport = None
1121 return host, nport
1122 return host, defport
1123
1124
def splitquery(url):
    """Deprecated public entry point; delegates to _splitquery().

    Issues a DeprecationWarning (reported against the caller's frame) and
    then returns the result of _splitquery(url).
    """
    message = ("urllib.parse.splitquery() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splitquery(url)
1130
1131
1132def _splitquery(url):
1133 """splitquery('/path?query') --> '/path', 'query'."""
1134 path, delim, query = url.rpartition('?')
1135 if delim:
1136 return path, query
1137 return url, None
1138
1139
def splittag(url):
    """Deprecated public entry point; delegates to _splittag().

    Issues a DeprecationWarning (reported against the caller's frame) and
    then returns the result of _splittag(url).
    """
    message = ("urllib.parse.splittag() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splittag(url)
1145
1146
1147def _splittag(url):
1148 """splittag('/path#tag') --> '/path', 'tag'."""
1149 path, delim, tag = url.rpartition('#')
1150 if delim:
1151 return path, tag
1152 return url, None
1153
1154
def splitattr(url):
    """Deprecated public entry point; delegates to _splitattr().

    Issues a DeprecationWarning (reported against the caller's frame) and
    then returns the result of _splitattr(url).
    """
    message = ("urllib.parse.splitattr() is deprecated as of 3.8, "
               "use urllib.parse.urlparse() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splitattr(url)
1160
1161
1162def _splitattr(url):
1163 """splitattr('/path;attr1=value1;attr2=value2;...') ->
1164 '/path', ['attr1=value1', 'attr2=value2', ...]."""
1165 words = url.split(';')
1166 return words[0], words[1:]
1167
1168
def splitvalue(attr):
    """Deprecated public entry point; delegates to _splitvalue().

    Issues a DeprecationWarning (reported against the caller's frame) and
    then returns the result of _splitvalue(attr).
    """
    message = ("urllib.parse.splitvalue() is deprecated as of 3.8, "
               "use urllib.parse.parse_qsl() instead")
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return _splitvalue(attr)
1174
1175
1176def _splitvalue(attr):
1177 """splitvalue('attr=value') --> 'attr', 'value'."""
1178 attr, delim, value = attr.partition('=')
1179 return attr, (value if delim else None)