Blame - linux-x64/clang/python3/lib/python3.9/urllib/parse.py - hafnium/prebuilts

blob: ea897c3032257b290724c21c2e4c20542175b42c [file] [log] [blame]

Olivier Deprez	f4ef2d0	2021-04-20 13:36:24 +0200	[diff] [blame^]	1	"""Parse (absolute and relative) URLs.
				2
				3	urlparse module is based upon the following RFC specifications.
				4
				5	RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
				6	and L. Masinter, January 2005.
				7
				8	RFC 2732 : "Format for Literal IPv6 Addresses in URL's" by R.Hinden, B.Carpenter
				9	and L.Masinter, December 1999.
				10
				11	RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
				12	Berners-Lee, R. Fielding, and L. Masinter, August 1998.
				13
				14	RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zawinski, July 1998.
				15
				16	RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
				17	1995.
				18
				19	RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
				20	McCahill, December 1994
				21
				22	RFC 3986 is considered the current standard and any future changes to
				23	urlparse module should conform with it. The urlparse module is
				24	currently not entirely compliant with this RFC due to de facto
				25	scenarios for parsing, and for backward compatibility purposes, some
				26	parsing quirks from older RFCs are retained. The testcases in
				27	test_urlparse.py provide a good indicator of parsing behavior.
				28	"""
				29
				30	import re
				31	import sys
				32	import types
				33	import collections
				34	import warnings
				35
# Names exported by "from urllib.parse import *".
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
           "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes",
           "DefragResult", "ParseResult", "SplitResult",
           "DefragResultBytes", "ParseResultBytes", "SplitResultBytes"]

# A classification of schemes.
# The empty string classifies URLs with no scheme specified,
# being the default value returned by "urlsplit" and "urlparse".

# Schemes for which urljoin() resolves a relative reference against the base.
uses_relative = ['', 'ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', 'sftp',
                 'svn', 'svn+ssh', 'ws', 'wss']

# Schemes consulted by urlunsplit() and urljoin() when deciding whether the
# URL carries a "//netloc" part.
uses_netloc = ['', 'ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
               'ws', 'wss']

# Schemes for which urlparse() splits ";params" off the path.
uses_params = ['', 'ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', 'sftp', 'tel']

# These are not actually used anymore, but should stay for backwards
# compatibility.  (They are undocumented, but have a public-looking name.)

non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']

uses_query = ['', 'http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips']

uses_fragment = ['', 'ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero']

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# XXX: Consider replacing with functools.lru_cache
# urlsplit() empties the whole cache once it holds MAX_CACHE_SIZE entries.
MAX_CACHE_SIZE = 20
# Maps (url, scheme, allow_fragments, type(url), type(scheme)) to the
# SplitResult previously computed by urlsplit().
_parse_cache = {}
				84
				85	def clear_cache():
				86	"""Clear the parse cache and the quoters cache."""
				87	_parse_cache.clear()
				88	_safe_quoters.clear()
				89
				90
# Helpers for bytes handling
# For 3.2, we deliberately require applications that
# handle improperly quoted URLs to do their own
# decoding and encoding. If valid use cases are
# presented, we may relax this by using latin-1
# decoding internally for 3.3
# Default codec and error handler of _encode_result() and _decode_args().
_implicit_encoding = 'ascii'
_implicit_errors = 'strict'
				99
				100	def _noop(obj):
				101	return obj
				102
				103	def _encode_result(obj, encoding=_implicit_encoding,
				104	errors=_implicit_errors):
				105	return obj.encode(encoding, errors)
				106
				107	def _decode_args(args, encoding=_implicit_encoding,
				108	errors=_implicit_errors):
				109	return tuple(x.decode(encoding, errors) if x else '' for x in args)
				110
				111	def _coerce_args(*args):
				112	# Invokes decode if necessary to create str args
				113	# and returns the coerced inputs along with
				114	# an appropriate result coercion function
				115	# - noop for str inputs
				116	# - encoding function otherwise
				117	str_input = isinstance(args[0], str)
				118	for arg in args[1:]:
				119	# We special-case the empty string to support the
				120	# "scheme=''" default argument to some functions
				121	if arg and isinstance(arg, str) != str_input:
				122	raise TypeError("Cannot mix str and non-str arguments")
				123	if str_input:
				124	return args + (_noop,)
				125	return _decode_args(args) + (_encode_result,)
				126
				127	# Result objects are more helpful than simple tuples
				128	class _ResultMixinStr(object):
				129	"""Standard approach to encoding parsed results from str to bytes"""
				130	__slots__ = ()
				131
				132	def encode(self, encoding='ascii', errors='strict'):
				133	return self._encoded_counterpart(*(x.encode(encoding, errors) for x in self))
				134
				135
				136	class _ResultMixinBytes(object):
				137	"""Standard approach to decoding parsed results from bytes to str"""
				138	__slots__ = ()
				139
				140	def decode(self, encoding='ascii', errors='strict'):
				141	return self._decoded_counterpart(*(x.decode(encoding, errors) for x in self))
				142
				143
				144	class _NetlocResultMixinBase(object):
				145	"""Shared methods for the parsed result objects containing a netloc element"""
				146	__slots__ = ()
				147
				148	@property
				149	def username(self):
				150	return self._userinfo[0]
				151
				152	@property
				153	def password(self):
				154	return self._userinfo[1]
				155
				156	@property
				157	def hostname(self):
				158	hostname = self._hostinfo[0]
				159	if not hostname:
				160	return None
				161	# Scoped IPv6 address may have zone info, which must not be lowercased
				162	# like http://[fe80::822a:a8ff:fe49:470c%tESt]:1234/keys
				163	separator = '%' if isinstance(hostname, str) else b'%'
				164	hostname, percent, zone = hostname.partition(separator)
				165	return hostname.lower() + percent + zone
				166
				167	@property
				168	def port(self):
				169	port = self._hostinfo[1]
				170	if port is not None:
				171	try:
				172	port = int(port, 10)
				173	except ValueError:
				174	message = f'Port could not be cast to integer value as {port!r}'
				175	raise ValueError(message) from None
				176	if not ( 0 <= port <= 65535):
				177	raise ValueError("Port out of range 0-65535")
				178	return port
				179
				180	__class_getitem__ = classmethod(types.GenericAlias)
				181
				182
				183	class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
				184	__slots__ = ()
				185
				186	@property
				187	def _userinfo(self):
				188	netloc = self.netloc
				189	userinfo, have_info, hostinfo = netloc.rpartition('@')
				190	if have_info:
				191	username, have_password, password = userinfo.partition(':')
				192	if not have_password:
				193	password = None
				194	else:
				195	username = password = None
				196	return username, password
				197
				198	@property
				199	def _hostinfo(self):
				200	netloc = self.netloc
				201	_, _, hostinfo = netloc.rpartition('@')
				202	_, have_open_br, bracketed = hostinfo.partition('[')
				203	if have_open_br:
				204	hostname, _, port = bracketed.partition(']')
				205	_, _, port = port.partition(':')
				206	else:
				207	hostname, _, port = hostinfo.partition(':')
				208	if not port:
				209	port = None
				210	return hostname, port
				211
				212
				213	class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
				214	__slots__ = ()
				215
				216	@property
				217	def _userinfo(self):
				218	netloc = self.netloc
				219	userinfo, have_info, hostinfo = netloc.rpartition(b'@')
				220	if have_info:
				221	username, have_password, password = userinfo.partition(b':')
				222	if not have_password:
				223	password = None
				224	else:
				225	username = password = None
				226	return username, password
				227
				228	@property
				229	def _hostinfo(self):
				230	netloc = self.netloc
				231	_, _, hostinfo = netloc.rpartition(b'@')
				232	_, have_open_br, bracketed = hostinfo.partition(b'[')
				233	if have_open_br:
				234	hostname, _, port = bracketed.partition(b']')
				235	_, _, port = port.partition(b':')
				236	else:
				237	hostname, _, port = hostinfo.partition(b':')
				238	if not port:
				239	port = None
				240	return hostname, port
				241
				242
				243	from collections import namedtuple
				244
# Plain namedtuple bases; the result classes defined further down combine
# them with the str/bytes mixins.
_DefragResultBase = namedtuple('DefragResult', 'url fragment')
_SplitResultBase = namedtuple(
    'SplitResult', 'scheme netloc path query fragment')
_ParseResultBase = namedtuple(
    'ParseResult', 'scheme netloc path params query fragment')

# Replace the generated docstrings of the tuple classes and of their fields.
_DefragResultBase.__doc__ = """
DefragResult(url, fragment)

A 2-tuple that contains the url without fragment identifier and the fragment
identifier as a separate argument.
"""

_DefragResultBase.url.__doc__ = """The URL with no fragment identifier."""

_DefragResultBase.fragment.__doc__ = """
Fragment identifier separated from URL, that allows indirect identification of a
secondary resource by reference to a primary resource and additional identifying
information.
"""

_SplitResultBase.__doc__ = """
SplitResult(scheme, netloc, path, query, fragment)

A 5-tuple that contains the different components of a URL. Similar to
ParseResult, but does not split params.
"""

_SplitResultBase.scheme.__doc__ = """Specifies URL scheme for the request."""

_SplitResultBase.netloc.__doc__ = """
Network location where the request is made to.
"""

_SplitResultBase.path.__doc__ = """
The hierarchical path, such as the path to a file to download.
"""

_SplitResultBase.query.__doc__ = """
The query component, that contains non-hierarchical data, that along with data
in path component, identifies a resource in the scope of URI's scheme and
network location.
"""

_SplitResultBase.fragment.__doc__ = """
Fragment identifier, that allows indirect identification of a secondary resource
by reference to a primary resource and additional identifying information.
"""

_ParseResultBase.__doc__ = """
ParseResult(scheme, netloc, path, params, query, fragment)

A 6-tuple that contains components of a parsed URL.
"""

# Fields shared with SplitResult reuse its field documentation.
_ParseResultBase.scheme.__doc__ = _SplitResultBase.scheme.__doc__
_ParseResultBase.netloc.__doc__ = _SplitResultBase.netloc.__doc__
_ParseResultBase.path.__doc__ = _SplitResultBase.path.__doc__
_ParseResultBase.params.__doc__ = """
Parameters for last path element used to dereference the URI in order to provide
access to perform some operation on the resource.
"""

_ParseResultBase.query.__doc__ = _SplitResultBase.query.__doc__
_ParseResultBase.fragment.__doc__ = _SplitResultBase.fragment.__doc__


# For backwards compatibility, alias _NetlocResultMixinStr
# ResultBase is no longer part of the documented API, but it is
# retained since deprecating it isn't worth the hassle
ResultBase = _NetlocResultMixinStr
				316
				317	# Structured result objects for string data
				318	class DefragResult(_DefragResultBase, _ResultMixinStr):
				319	__slots__ = ()
				320	def geturl(self):
				321	if self.fragment:
				322	return self.url + '#' + self.fragment
				323	else:
				324	return self.url
				325
				326	class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
				327	__slots__ = ()
				328	def geturl(self):
				329	return urlunsplit(self)
				330
				331	class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
				332	__slots__ = ()
				333	def geturl(self):
				334	return urlunparse(self)
				335
				336	# Structured result objects for bytes data
				337	class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
				338	__slots__ = ()
				339	def geturl(self):
				340	if self.fragment:
				341	return self.url + b'#' + self.fragment
				342	else:
				343	return self.url
				344
				345	class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
				346	__slots__ = ()
				347	def geturl(self):
				348	return urlunsplit(self)
				349
				350	class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
				351	__slots__ = ()
				352	def geturl(self):
				353	return urlunparse(self)
				354
				355	# Set up the encode/decode result pairs
				356	def _fix_result_transcoding():
				357	_result_pairs = (
				358	(DefragResult, DefragResultBytes),
				359	(SplitResult, SplitResultBytes),
				360	(ParseResult, ParseResultBytes),
				361	)
				362	for _decoded, _encoded in _result_pairs:
				363	_decoded._encoded_counterpart = _encoded
				364	_encoded._decoded_counterpart = _decoded
				365
				366	_fix_result_transcoding()
				367	del _fix_result_transcoding
				368
				369	def urlparse(url, scheme='', allow_fragments=True):
				370	"""Parse a URL into 6 components:
				371	<scheme>://<netloc>/<path>;<params>?<query>#<fragment>
				372
				373	The result is a named 6-tuple with fields corresponding to the
				374	above. It is either a ParseResult or ParseResultBytes object,
				375	depending on the type of the url parameter.
				376
				377	The username, password, hostname, and port sub-components of netloc
				378	can also be accessed as attributes of the returned object.
				379
				380	The scheme argument provides the default value of the scheme
				381	component when no scheme is found in url.
				382
				383	If allow_fragments is False, no attempt is made to separate the
				384	fragment component from the previous component, which can be either
				385	path or query.
				386
				387	Note that % escapes are not expanded.
				388	"""
				389	url, scheme, _coerce_result = _coerce_args(url, scheme)
				390	splitresult = urlsplit(url, scheme, allow_fragments)
				391	scheme, netloc, url, query, fragment = splitresult
				392	if scheme in uses_params and ';' in url:
				393	url, params = _splitparams(url)
				394	else:
				395	params = ''
				396	result = ParseResult(scheme, netloc, url, params, query, fragment)
				397	return _coerce_result(result)
				398
				399	def _splitparams(url):
				400	if '/' in url:
				401	i = url.find(';', url.rfind('/'))
				402	if i < 0:
				403	return url, ''
				404	else:
				405	i = url.find(';')
				406	return url[:i], url[i+1:]
				407
				408	def _splitnetloc(url, start=0):
				409	delim = len(url) # position of end of domain part of url, default is end
				410	for c in '/?#': # look for delimiters; the order is NOT important
				411	wdelim = url.find(c, start) # find first of this delim
				412	if wdelim >= 0: # if found
				413	delim = min(delim, wdelim) # use earliest delim position
				414	return url[start:delim], url[delim:] # return (domain, rest)
				415
				416	def _checknetloc(netloc):
				417	if not netloc or netloc.isascii():
				418	return
				419	# looking for characters like \u2100 that expand to 'a/c'
				420	# IDNA uses NFKC equivalence, so normalize for this check
				421	import unicodedata
				422	n = netloc.replace('@', '') # ignore characters already included
				423	n = n.replace(':', '') # but not the surrounding text
				424	n = n.replace('#', '')
				425	n = n.replace('?', '')
				426	netloc2 = unicodedata.normalize('NFKC', n)
				427	if n == netloc2:
				428	return
				429	for c in '/?#@:':
				430	if c in netloc2:
				431	raise ValueError("netloc '" + netloc + "' contains invalid " +
				432	"characters under NFKC normalization")
				433
				434	def urlsplit(url, scheme='', allow_fragments=True):
				435	"""Parse a URL into 5 components:
				436	<scheme>://<netloc>/<path>?<query>#<fragment>
				437
				438	The result is a named 5-tuple with fields corresponding to the
				439	above. It is either a SplitResult or SplitResultBytes object,
				440	depending on the type of the url parameter.
				441
				442	The username, password, hostname, and port sub-components of netloc
				443	can also be accessed as attributes of the returned object.
				444
				445	The scheme argument provides the default value of the scheme
				446	component when no scheme is found in url.
				447
				448	If allow_fragments is False, no attempt is made to separate the
				449	fragment component from the previous component, which can be either
				450	path or query.
				451
				452	Note that % escapes are not expanded.
				453	"""
				454
				455	url, scheme, _coerce_result = _coerce_args(url, scheme)
				456	allow_fragments = bool(allow_fragments)
				457	key = url, scheme, allow_fragments, type(url), type(scheme)
				458	cached = _parse_cache.get(key, None)
				459	if cached:
				460	return _coerce_result(cached)
				461	if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
				462	clear_cache()
				463	netloc = query = fragment = ''
				464	i = url.find(':')
				465	if i > 0:
				466	for c in url[:i]:
				467	if c not in scheme_chars:
				468	break
				469	else:
				470	scheme, url = url[:i].lower(), url[i+1:]
				471
				472	if url[:2] == '//':
				473	netloc, url = _splitnetloc(url, 2)
				474	if (('[' in netloc and ']' not in netloc) or
				475	(']' in netloc and '[' not in netloc)):
				476	raise ValueError("Invalid IPv6 URL")
				477	if allow_fragments and '#' in url:
				478	url, fragment = url.split('#', 1)
				479	if '?' in url:
				480	url, query = url.split('?', 1)
				481	_checknetloc(netloc)
				482	v = SplitResult(scheme, netloc, url, query, fragment)
				483	_parse_cache[key] = v
				484	return _coerce_result(v)
				485
				486	def urlunparse(components):
				487	"""Put a parsed URL back together again. This may result in a
				488	slightly different, but equivalent URL, if the URL that was parsed
				489	originally had redundant delimiters, e.g. a ? with an empty query
				490	(the draft states that these are equivalent)."""
				491	scheme, netloc, url, params, query, fragment, _coerce_result = (
				492	_coerce_args(*components))
				493	if params:
				494	url = "%s;%s" % (url, params)
				495	return _coerce_result(urlunsplit((scheme, netloc, url, query, fragment)))
				496
				497	def urlunsplit(components):
				498	"""Combine the elements of a tuple as returned by urlsplit() into a
				499	complete URL as a string. The data argument can be any five-item iterable.
				500	This may result in a slightly different, but equivalent URL, if the URL that
				501	was parsed originally had unnecessary delimiters (for example, a ? with an
				502	empty query; the RFC states that these are equivalent)."""
				503	scheme, netloc, url, query, fragment, _coerce_result = (
				504	_coerce_args(*components))
				505	if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
				506	if url and url[:1] != '/': url = '/' + url
				507	url = '//' + (netloc or '') + url
				508	if scheme:
				509	url = scheme + ':' + url
				510	if query:
				511	url = url + '?' + query
				512	if fragment:
				513	url = url + '#' + fragment
				514	return _coerce_result(url)
				515
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    An empty base returns url unchanged and an empty url returns base
    unchanged.  url is also returned as-is when its scheme differs from
    the base's or is not listed in uses_relative.  allow_fragments is
    forwarded to urlparse() for both arguments.
    """
    if not base:
        return url
    if not url:
        return base

    base, url, _coerce_result = _coerce_args(base, url)
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    # The base scheme is the default for a url that has none.
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)

    if scheme != bscheme or scheme not in uses_relative:
        return _coerce_result(url)
    if scheme in uses_netloc:
        # A url with its own netloc replaces the base entirely.
        if netloc:
            return _coerce_result(urlunparse((scheme, netloc, path,
                                              params, query, fragment)))
        netloc = bnetloc

    # No path and no params: reuse the base's, and its query too unless
    # url supplies one.
    if not path and not params:
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return _coerce_result(urlunparse((scheme, netloc, path,
                                          params, query, fragment)))

    base_parts = bpath.split('/')
    if base_parts[-1] != '':
        # the last item is not a directory, so will not be taken into account
        # in resolving the relative path
        del base_parts[-1]

    # for rfc3986, ignore all base path should the first character be root.
    if path[:1] == '/':
        segments = path.split('/')
    else:
        segments = base_parts + path.split('/')
        # filter out elements that would cause redundant slashes on re-joining
        # the resolved_path
        segments[1:-1] = filter(None, segments[1:-1])

    resolved_path = []

    # Collapse '.' and '..' segments.
    for seg in segments:
        if seg == '..':
            try:
                resolved_path.pop()
            except IndexError:
                # ignore any .. segments that would otherwise cause an IndexError
                # when popped from resolved_path if resolving for rfc3986
                pass
        elif seg == '.':
            continue
        else:
            resolved_path.append(seg)

    if segments[-1] in ('.', '..'):
        # do some post-processing here. if the last segment was a relative dir,
        # then we need to append the trailing '/'
        resolved_path.append('')

    # An empty resolved path is written as '/'.
    return _coerce_result(urlunparse((scheme, netloc, '/'.join(
        resolved_path) or '/', params, query, fragment)))
				583
				584
				585	def urldefrag(url):
				586	"""Removes any existing fragment from URL.
				587
				588	Returns a tuple of the defragmented URL and the fragment. If
				589	the URL contained no fragments, the second element is the
				590	empty string.
				591	"""
				592	url, _coerce_result = _coerce_args(url)
				593	if '#' in url:
				594	s, n, p, a, q, frag = urlparse(url)
				595	defrag = urlunparse((s, n, p, a, q, ''))
				596	else:
				597	frag = ''
				598	defrag = url
				599	return _coerce_result(DefragResult(defrag, frag))
				600
				601	_hexdig = '0123456789ABCDEFabcdef'
				602	_hextobyte = None
				603
				604	def unquote_to_bytes(string):
				605	"""unquote_to_bytes('abc%20def') -> b'abc def'."""
				606	# Note: strings are encoded as UTF-8. This is only an issue if it contains
				607	# unescaped non-ASCII characters, which URIs should not.
				608	if not string:
				609	# Is it a string-like object?
				610	string.split
				611	return b''
				612	if isinstance(string, str):
				613	string = string.encode('utf-8')
				614	bits = string.split(b'%')
				615	if len(bits) == 1:
				616	return string
				617	res = [bits[0]]
				618	append = res.append
				619	# Delay the initialization of the table to not waste memory
				620	# if the function is never called
				621	global _hextobyte
				622	if _hextobyte is None:
				623	_hextobyte = {(a + b).encode(): bytes.fromhex(a + b)
				624	for a in _hexdig for b in _hexdig}
				625	for item in bits[1:]:
				626	try:
				627	append(_hextobyte[item[:2]])
				628	append(item[2:])
				629	except KeyError:
				630	append(b'%')
				631	append(item)
				632	return b''.join(res)
				633
				634	_asciire = re.compile('([\x00-\x7f]+)')
				635
				636	def unquote(string, encoding='utf-8', errors='replace'):
				637	"""Replace %xx escapes by their single-character equivalent. The optional
				638	encoding and errors parameters specify how to decode percent-encoded
				639	sequences into Unicode characters, as accepted by the bytes.decode()
				640	method.
				641	By default, percent-encoded sequences are decoded with UTF-8, and invalid
				642	sequences are replaced by a placeholder character.
				643
				644	unquote('abc%20def') -> 'abc def'.
				645	"""
				646	if isinstance(string, bytes):
				647	return unquote_to_bytes(string).decode(encoding, errors)
				648	if '%' not in string:
				649	string.split
				650	return string
				651	if encoding is None:
				652	encoding = 'utf-8'
				653	if errors is None:
				654	errors = 'replace'
				655	bits = _asciire.split(string)
				656	res = [bits[0]]
				657	append = res.append
				658	for i in range(1, len(bits), 2):
				659	append(unquote_to_bytes(bits[i]).decode(encoding, errors))
				660	append(bits[i + 1])
				661	return ''.join(res)
				662
				663
				664	def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
				665	encoding='utf-8', errors='replace', max_num_fields=None):
				666	"""Parse a query given as a string argument.
				667
				668	Arguments:
				669
				670	qs: percent-encoded query string to be parsed
				671
				672	keep_blank_values: flag indicating whether blank values in
				673	percent-encoded queries should be treated as blank strings.
				674	A true value indicates that blanks should be retained as
				675	blank strings. The default false value indicates that
				676	blank values are to be ignored and treated as if they were
				677	not included.
				678
				679	strict_parsing: flag indicating what to do with parsing errors.
				680	If false (the default), errors are silently ignored.
				681	If true, errors raise a ValueError exception.
				682
				683	encoding and errors: specify how to decode percent-encoded sequences
				684	into Unicode characters, as accepted by the bytes.decode() method.
				685
				686	max_num_fields: int. If set, then throws a ValueError if there
				687	are more than n fields read by parse_qsl().
				688
				689	Returns a dictionary.
				690	"""
				691	parsed_result = {}
				692	pairs = parse_qsl(qs, keep_blank_values, strict_parsing,
				693	encoding=encoding, errors=errors,
				694	max_num_fields=max_num_fields)
				695	for name, value in pairs:
				696	if name in parsed_result:
				697	parsed_result[name].append(value)
				698	else:
				699	parsed_result[name] = [value]
				700	return parsed_result
				701
				702
				703	def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
				704	encoding='utf-8', errors='replace', max_num_fields=None):
				705	"""Parse a query given as a string argument.
				706
				707	Arguments:
				708
				709	qs: percent-encoded query string to be parsed
				710
				711	keep_blank_values: flag indicating whether blank values in
				712	percent-encoded queries should be treated as blank strings.
				713	A true value indicates that blanks should be retained as blank
				714	strings. The default false value indicates that blank values
				715	are to be ignored and treated as if they were not included.
				716
				717	strict_parsing: flag indicating what to do with parsing errors. If
				718	false (the default), errors are silently ignored. If true,
				719	errors raise a ValueError exception.
				720
				721	encoding and errors: specify how to decode percent-encoded sequences
				722	into Unicode characters, as accepted by the bytes.decode() method.
				723
				724	max_num_fields: int. If set, then throws a ValueError
				725	if there are more than n fields read by parse_qsl().
				726
				727	Returns a list, as G-d intended.
				728	"""
				729	qs, _coerce_result = _coerce_args(qs)
				730
				731	# If max_num_fields is defined then check that the number of fields
				732	# is less than max_num_fields. This prevents a memory exhaustion DOS
				733	# attack via post bodies with many fields.
				734	if max_num_fields is not None:
				735	num_fields = 1 + qs.count('&') + qs.count(';')
				736	if max_num_fields < num_fields:
				737	raise ValueError('Max number of fields exceeded')
				738
				739	pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
				740	r = []
				741	for name_value in pairs:
				742	if not name_value and not strict_parsing:
				743	continue
				744	nv = name_value.split('=', 1)
				745	if len(nv) != 2:
				746	if strict_parsing:
				747	raise ValueError("bad query field: %r" % (name_value,))
				748	# Handle case of a control-name with no equal sign
				749	if keep_blank_values:
				750	nv.append('')
				751	else:
				752	continue
				753	if len(nv[1]) or keep_blank_values:
				754	name = nv[0].replace('+', ' ')
				755	name = unquote(name, encoding=encoding, errors=errors)
				756	name = _coerce_result(name)
				757	value = nv[1].replace('+', ' ')
				758	value = unquote(value, encoding=encoding, errors=errors)
				759	value = _coerce_result(value)
				760	r.append((name, value))
				761	return r
				762
				763	def unquote_plus(string, encoding='utf-8', errors='replace'):
				764	"""Like unquote(), but also replace plus signs by spaces, as required for
				765	unquoting HTML form values.
				766
				767	unquote_plus('%7e/abc+def') -> '~/abc def'
				768	"""
				769	string = string.replace('+', ' ')
				770	return unquote(string, encoding, errors)
				771
				772	_ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
				773	b'abcdefghijklmnopqrstuvwxyz'
				774	b'0123456789'
				775	b'_.-~')
				776	_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
				777	_safe_quoters = {}
				778
				779	class Quoter(collections.defaultdict):
				780	"""A mapping from bytes (in range(0,256)) to strings.
				781
				782	String values are percent-encoded byte values, unless the key < 128, and
				783	in the "safe" set (either the specified safe set, or default set).
				784	"""
				785	# Keeps a cache internally, using defaultdict, for efficiency (lookups
				786	# of cached keys don't call Python code at all).
				787	def __init__(self, safe):
				788	"""safe: bytes object."""
				789	self.safe = _ALWAYS_SAFE.union(safe)
				790
				791	def __repr__(self):
				792	# Without this, will just display as a defaultdict
				793	return "<%s %r>" % (self.__class__.__name__, dict(self))
				794
				795	def __missing__(self, b):
				796	# Handle a cache miss. Store quoted string in cache and return.
				797	res = chr(b) if b in self.safe else '%{:02X}'.format(b)
				798	self[b] = res
				799	return res
				800
				801	def quote(string, safe='/', encoding=None, errors=None):
				802	"""quote('abc def') -> 'abc%20def'
				803
				804	Each part of a URL, e.g. the path info, the query, etc., has a
				805	different set of reserved characters that must be quoted. The
				806	quote function offers a cautious (not minimal) way to quote a
				807	string for most of these parts.
				808
				809	RFC 3986 Uniform Resource Identifier (URI): Generic Syntax lists
				810	the following (un)reserved characters.
				811
				812	unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
				813	reserved = gen-delims / sub-delims
				814	gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
				815	sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
				816	/ "*" / "+" / "," / ";" / "="
				817
				818	Each of the reserved characters is reserved in some component of a URL,
				819	but not necessarily in all of them.
				820
				821	The quote function %-escapes all characters that are neither in the
				822	unreserved chars ("always safe") nor the additional chars set via the
				823	safe arg.
				824
				825	The default for the safe arg is '/'. The character is reserved, but in
				826	typical usage the quote function is being called on a path where the
				827	existing slash characters are to be preserved.
				828
				829	Python 3.7 updates from using RFC 2396 to RFC 3986 to quote URL strings.
				830	Now, "~" is included in the set of unreserved characters.
				831
				832	string and safe may be either str or bytes objects. encoding and errors
				833	must not be specified if string is a bytes object.
				834
				835	The optional encoding and errors parameters specify how to deal with
				836	non-ASCII characters, as accepted by the str.encode method.
				837	By default, encoding='utf-8' (characters are encoded with UTF-8), and
				838	errors='strict' (unsupported characters raise a UnicodeEncodeError).
				839	"""
				840	if isinstance(string, str):
				841	if not string:
				842	return string
				843	if encoding is None:
				844	encoding = 'utf-8'
				845	if errors is None:
				846	errors = 'strict'
				847	string = string.encode(encoding, errors)
				848	else:
				849	if encoding is not None:
				850	raise TypeError("quote() doesn't support 'encoding' for bytes")
				851	if errors is not None:
				852	raise TypeError("quote() doesn't support 'errors' for bytes")
				853	return quote_from_bytes(string, safe)
				854
				855	def quote_plus(string, safe='', encoding=None, errors=None):
				856	"""Like quote(), but also replace ' ' with '+', as required for quoting
				857	HTML form values. Plus signs in the original string are escaped unless
				858	they are included in safe. It also does not have safe default to '/'.
				859	"""
				860	# Check if ' ' in string, where string may either be a str or bytes. If
				861	# there are no spaces, the regular quote will produce the right answer.
				862	if ((isinstance(string, str) and ' ' not in string) or
				863	(isinstance(string, bytes) and b' ' not in string)):
				864	return quote(string, safe, encoding, errors)
				865	if isinstance(safe, str):
				866	space = ' '
				867	else:
				868	space = b' '
				869	string = quote(string, safe + space, encoding, errors)
				870	return string.replace(' ', '+')
				871
				872	def quote_from_bytes(bs, safe='/'):
				873	"""Like quote(), but accepts a bytes object rather than a str, and does
				874	not perform string-to-bytes encoding. It always returns an ASCII string.
				875	quote_from_bytes(b'abc def\x3f') -> 'abc%20def%3f'
				876	"""
				877	if not isinstance(bs, (bytes, bytearray)):
				878	raise TypeError("quote_from_bytes() expected bytes")
				879	if not bs:
				880	return ''
				881	if isinstance(safe, str):
				882	# Normalize 'safe' by converting to bytes and removing non-ASCII chars
				883	safe = safe.encode('ascii', 'ignore')
				884	else:
				885	safe = bytes([c for c in safe if c < 128])
				886	if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
				887	return bs.decode()
				888	try:
				889	quoter = _safe_quoters[safe]
				890	except KeyError:
				891	_safe_quoters[safe] = quoter = Quoter(safe).__getitem__
				892	return ''.join([quoter(char) for char in bs])
				893
				894	def urlencode(query, doseq=False, safe='', encoding=None, errors=None,
				895	quote_via=quote_plus):
				896	"""Encode a dict or sequence of two-element tuples into a URL query string.
				897
				898	If any values in the query arg are sequences and doseq is true, each
				899	sequence element is converted to a separate parameter.
				900
				901	If the query arg is a sequence of two-element tuples, the order of the
				902	parameters in the output will match the order of parameters in the
				903	input.
				904
				905	The components of a query arg may each be either a string or a bytes type.
				906
				907	The safe, encoding, and errors parameters are passed down to the function
				908	specified by quote_via (encoding and errors only if a component is a str).
				909	"""
				910
				911	if hasattr(query, "items"):
				912	query = query.items()
				913	else:
				914	# It's a bother at times that strings and string-like objects are
				915	# sequences.
				916	try:
				917	# non-sequence items should not work with len()
				918	# non-empty strings will fail this
				919	if len(query) and not isinstance(query[0], tuple):
				920	raise TypeError
				921	# Zero-length sequences of all types will get here and succeed,
				922	# but that's a minor nit. Since the original implementation
				923	# allowed empty dicts that type of behavior probably should be
				924	# preserved for consistency
				925	except TypeError:
				926	ty, va, tb = sys.exc_info()
				927	raise TypeError("not a valid non-string sequence "
				928	"or mapping object").with_traceback(tb)
				929
				930	l = []
				931	if not doseq:
				932	for k, v in query:
				933	if isinstance(k, bytes):
				934	k = quote_via(k, safe)
				935	else:
				936	k = quote_via(str(k), safe, encoding, errors)
				937
				938	if isinstance(v, bytes):
				939	v = quote_via(v, safe)
				940	else:
				941	v = quote_via(str(v), safe, encoding, errors)
				942	l.append(k + '=' + v)
				943	else:
				944	for k, v in query:
				945	if isinstance(k, bytes):
				946	k = quote_via(k, safe)
				947	else:
				948	k = quote_via(str(k), safe, encoding, errors)
				949
				950	if isinstance(v, bytes):
				951	v = quote_via(v, safe)
				952	l.append(k + '=' + v)
				953	elif isinstance(v, str):
				954	v = quote_via(v, safe, encoding, errors)
				955	l.append(k + '=' + v)
				956	else:
				957	try:
				958	# Is this a sufficient test for sequence-ness?
				959	x = len(v)
				960	except TypeError:
				961	# not a sequence
				962	v = quote_via(str(v), safe, encoding, errors)
				963	l.append(k + '=' + v)
				964	else:
				965	# loop over the sequence
				966	for elt in v:
				967	if isinstance(elt, bytes):
				968	elt = quote_via(elt, safe)
				969	else:
				970	elt = quote_via(str(elt), safe, encoding, errors)
				971	l.append(k + '=' + elt)
				972	return '&'.join(l)
				973
				974
				975	def to_bytes(url):
				976	warnings.warn("urllib.parse.to_bytes() is deprecated as of 3.8",
				977	DeprecationWarning, stacklevel=2)
				978	return _to_bytes(url)
				979
				980
				981	def _to_bytes(url):
				982	"""to_bytes(u"URL") --> 'URL'."""
				983	# Most URL schemes require ASCII. If that changes, the conversion
				984	# can be relaxed.
				985	# XXX get rid of to_bytes()
				986	if isinstance(url, str):
				987	try:
				988	url = url.encode("ASCII").decode()
				989	except UnicodeError:
				990	raise UnicodeError("URL " + repr(url) +
				991	" contains non-ASCII characters")
				992	return url
				993
				994
				995	def unwrap(url):
				996	"""Transform a string like '<URL:scheme://host/path>' into 'scheme://host/path'.
				997
				998	The string is returned unchanged if it's not a wrapped URL.
				999	"""
				1000	url = str(url).strip()
				1001	if url[:1] == '<' and url[-1:] == '>':
				1002	url = url[1:-1].strip()
				1003	if url[:4] == 'URL:':
				1004	url = url[4:].strip()
				1005	return url
				1006
				1007
				1008	def splittype(url):
				1009	warnings.warn("urllib.parse.splittype() is deprecated as of 3.8, "
				1010	"use urllib.parse.urlparse() instead",
				1011	DeprecationWarning, stacklevel=2)
				1012	return _splittype(url)
				1013
				1014
				1015	_typeprog = None
				1016	def _splittype(url):
				1017	"""splittype('type:opaquestring') --> 'type', 'opaquestring'."""
				1018	global _typeprog
				1019	if _typeprog is None:
				1020	_typeprog = re.compile('([^/:]+):(.*)', re.DOTALL)
				1021
				1022	match = _typeprog.match(url)
				1023	if match:
				1024	scheme, data = match.groups()
				1025	return scheme.lower(), data
				1026	return None, url
				1027
				1028
				1029	def splithost(url):
				1030	warnings.warn("urllib.parse.splithost() is deprecated as of 3.8, "
				1031	"use urllib.parse.urlparse() instead",
				1032	DeprecationWarning, stacklevel=2)
				1033	return _splithost(url)
				1034
				1035
				1036	_hostprog = None
				1037	def _splithost(url):
				1038	"""splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
				1039	global _hostprog
				1040	if _hostprog is None:
				1041	_hostprog = re.compile('//([^/#?])(.)', re.DOTALL)
				1042
				1043	match = _hostprog.match(url)
				1044	if match:
				1045	host_port, path = match.groups()
				1046	if path and path[0] != '/':
				1047	path = '/' + path
				1048	return host_port, path
				1049	return None, url
				1050
				1051
				1052	def splituser(host):
				1053	warnings.warn("urllib.parse.splituser() is deprecated as of 3.8, "
				1054	"use urllib.parse.urlparse() instead",
				1055	DeprecationWarning, stacklevel=2)
				1056	return _splituser(host)
				1057
				1058
				1059	def _splituser(host):
				1060	"""splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
				1061	user, delim, host = host.rpartition('@')
				1062	return (user if delim else None), host
				1063
				1064
				1065	def splitpasswd(user):
				1066	warnings.warn("urllib.parse.splitpasswd() is deprecated as of 3.8, "
				1067	"use urllib.parse.urlparse() instead",
				1068	DeprecationWarning, stacklevel=2)
				1069	return _splitpasswd(user)
				1070
				1071
				1072	def _splitpasswd(user):
				1073	"""splitpasswd('user:passwd') -> 'user', 'passwd'."""
				1074	user, delim, passwd = user.partition(':')
				1075	return user, (passwd if delim else None)
				1076
				1077
				1078	def splitport(host):
				1079	warnings.warn("urllib.parse.splitport() is deprecated as of 3.8, "
				1080	"use urllib.parse.urlparse() instead",
				1081	DeprecationWarning, stacklevel=2)
				1082	return _splitport(host)
				1083
				1084
				1085	# splittag('/path#tag') --> '/path', 'tag'
				1086	_portprog = None
				1087	def _splitport(host):
				1088	"""splitport('host:port') --> 'host', 'port'."""
				1089	global _portprog
				1090	if _portprog is None:
				1091	_portprog = re.compile('(.):([0-9])', re.DOTALL)
				1092
				1093	match = _portprog.fullmatch(host)
				1094	if match:
				1095	host, port = match.groups()
				1096	if port:
				1097	return host, port
				1098	return host, None
				1099
				1100
				1101	def splitnport(host, defport=-1):
				1102	warnings.warn("urllib.parse.splitnport() is deprecated as of 3.8, "
				1103	"use urllib.parse.urlparse() instead",
				1104	DeprecationWarning, stacklevel=2)
				1105	return _splitnport(host, defport)
				1106
				1107
				1108	def _splitnport(host, defport=-1):
				1109	"""Split host and port, returning numeric port.
				1110	Return given default port if no ':' found; defaults to -1.
				1111	Return numerical port if a valid number are found after ':'.
				1112	Return None if ':' but not a valid number."""
				1113	host, delim, port = host.rpartition(':')
				1114	if not delim:
				1115	host = port
				1116	elif port:
				1117	try:
				1118	nport = int(port)
				1119	except ValueError:
				1120	nport = None
				1121	return host, nport
				1122	return host, defport
				1123
				1124
				1125	def splitquery(url):
				1126	warnings.warn("urllib.parse.splitquery() is deprecated as of 3.8, "
				1127	"use urllib.parse.urlparse() instead",
				1128	DeprecationWarning, stacklevel=2)
				1129	return _splitquery(url)
				1130
				1131
				1132	def _splitquery(url):
				1133	"""splitquery('/path?query') --> '/path', 'query'."""
				1134	path, delim, query = url.rpartition('?')
				1135	if delim:
				1136	return path, query
				1137	return url, None
				1138
				1139
				1140	def splittag(url):
				1141	warnings.warn("urllib.parse.splittag() is deprecated as of 3.8, "
				1142	"use urllib.parse.urlparse() instead",
				1143	DeprecationWarning, stacklevel=2)
				1144	return _splittag(url)
				1145
				1146
				1147	def _splittag(url):
				1148	"""splittag('/path#tag') --> '/path', 'tag'."""
				1149	path, delim, tag = url.rpartition('#')
				1150	if delim:
				1151	return path, tag
				1152	return url, None
				1153
				1154
				1155	def splitattr(url):
				1156	warnings.warn("urllib.parse.splitattr() is deprecated as of 3.8, "
				1157	"use urllib.parse.urlparse() instead",
				1158	DeprecationWarning, stacklevel=2)
				1159	return _splitattr(url)
				1160
				1161
				1162	def _splitattr(url):
				1163	"""splitattr('/path;attr1=value1;attr2=value2;...') ->
				1164	'/path', ['attr1=value1', 'attr2=value2', ...]."""
				1165	words = url.split(';')
				1166	return words[0], words[1:]
				1167
				1168
				1169	def splitvalue(attr):
				1170	warnings.warn("urllib.parse.splitvalue() is deprecated as of 3.8, "
				1171	"use urllib.parse.parse_qsl() instead",
				1172	DeprecationWarning, stacklevel=2)
				1173	return _splitvalue(attr)
				1174
				1175
				1176	def _splitvalue(attr):
				1177	"""splitvalue('attr=value') --> 'attr', 'value'."""
				1178	attr, delim, value = attr.partition('=')
				1179	return attr, (value if delim else None)