richardblythman
/

as_chat

Model card Files Files and versions Community

as_chat / .venv /lib /python3.12 /site-packages /httpx /_urlparse.py

richardblythman

Upload folder using huggingface_hub

dd2bdcb verified 8 months ago

raw

history blame contribute delete

17.7 kB

	"""
	An implementation of `urlparse` that provides URL validation and normalization
	as described by RFC3986.

	We rely on this implementation rather than the one in Python's stdlib, because:

	* It provides more complete URL validation.
	* It properly differentiates between an empty querystring and an absent querystring,
	to distinguish URLs with a trailing '?'.
	* It handles scheme, hostname, port, and path normalization.
	* It supports IDNA hostnames, normalizing them to their encoded form.
	* The API supports passing individual components, as well as the complete URL string.

	Previously we relied on the excellent `rfc3986` package to handle URL parsing and
	validation, but this module provides a simpler alternative, with less indirection
	required.
	"""
	from __future__ import annotations

	import ipaddress
	import re
	import typing

	import idna

	from ._exceptions import InvalidURL

	# Upper bound applied to the length of a URL and of each keyword component.
	MAX_URL_LENGTH = 65536

	# Characters that never need to be percent-encoded.
	# https://datatracker.ietf.org/doc/html/rfc3986.html#section-2.3
	UNRESERVED_CHARACTERS = (
	    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
	    "abcdefghijklmnopqrstuvwxyz"
	    "0123456789"
	    "-._~"
	)
	SUB_DELIMS = "!$&'()*+,;="

	# Matches one existing '%xx' escape sequence (two hexadecimal digits).
	PERCENT_ENCODED_REGEX = re.compile(r"%[A-Fa-f0-9]{2}")


	# Splits a URL reference into its five top-level components. Every part
	# except the path is optional, so this pattern matches any input string.
	URL_REGEX = re.compile(
	    r"(?:(?P<scheme>([a-zA-Z][a-zA-Z0-9+.-]*)?):)?"  # {scheme}: (optional)
	    r"(?://(?P<authority>[^/?#]*))?"  # //{authority} (optional)
	    r"(?P<path>[^?#]*)"  # {path}
	    r"(?:\?(?P<query>[^#]*))?"  # ?{query} (optional)
	    r"(?:#(?P<fragment>.*))?"  # #{fragment} (optional)
	)

	# {userinfo}@ (optional)
	# {host}
	# :{port} (optional)
	#
	# The host is either an IPv6 literal enclosed within square brackets, or any
	# character sequence excluding ':' and '@'. The userinfo and port portions
	# are deliberately permissive here; they are validated after matching.
	AUTHORITY_REGEX = re.compile(
	    (
	        r"(?:(?P<userinfo>{userinfo})@)?"
	        r"(?P<host>{host})"
	        r":?(?P<port>{port})?"
	    ).format(
	        userinfo=".*",  # Any character sequence.
	        host=r"(\[.*\]|[^:@]*)",  # Bracketed IPv6 literal, or no ':' / '@'.
	        port=".*",  # Any character sequence.
	    )
	)


	# If we call urlparse with an individual component, then we need to regex
	# validate that component individually. Each pattern is applied with
	# `fullmatch`, so it must describe the entire component.
	# Note that we're duplicating the same strings as above. Shock! Horror!!
	COMPONENT_REGEX = {
	    "scheme": re.compile("([a-zA-Z][a-zA-Z0-9+.-]*)?"),
	    "authority": re.compile("[^/?#]*"),
	    "path": re.compile("[^?#]*"),
	    "query": re.compile("[^#]*"),
	    "fragment": re.compile(".*"),
	    "userinfo": re.compile("[^@]*"),
	    # Either a bracketed IPv6 literal, or any sequence without ':'.
	    "host": re.compile(r"(\[.*\]|[^:]*)"),
	    "port": re.compile(".*"),
	}


	# Cheap syntactic pre-checks that decide whether a host *looks like* an IP
	# address. Hosts that match are then validated properly by the stdlib
	# 'ipaddress' module.
	IPv4_STYLE_HOSTNAME = re.compile(r"^" + r"\.".join([r"[0-9]+"] * 4) + r"$")
	IPv6_STYLE_HOSTNAME = re.compile(r"^\[.*\]$")


	class ParseResult(typing.NamedTuple):
	scheme: str
	userinfo: str
	host: str
	port: int \| None
	path: str
	query: str \| None
	fragment: str \| None

	@property
	def authority(self) -> str:
	return "".join(
	[
	f"{self.userinfo}@" if self.userinfo else "",
	f"[{self.host}]" if ":" in self.host else self.host,
	f":{self.port}" if self.port is not None else "",
	]
	)

	@property
	def netloc(self) -> str:
	return "".join(
	[
	f"[{self.host}]" if ":" in self.host else self.host,
	f":{self.port}" if self.port is not None else "",
	]
	)

	def copy_with(self, **kwargs: str \| None) -> ParseResult:
	if not kwargs:
	return self

	defaults = {
	"scheme": self.scheme,
	"authority": self.authority,
	"path": self.path,
	"query": self.query,
	"fragment": self.fragment,
	}
	defaults.update(kwargs)
	return urlparse("", **defaults)

	def __str__(self) -> str:
	authority = self.authority
	return "".join(
	[
	f"{self.scheme}:" if self.scheme else "",
	f"//{authority}" if authority else "",
	self.path,
	f"?{self.query}" if self.query is not None else "",
	f"#{self.fragment}" if self.fragment is not None else "",
	]
	)


	def urlparse(url: str = "", **kwargs: str | None) -> ParseResult:
	    """
	    Parse, validate and normalize a URL, returning its components.

	    * `url`: The URL string to parse. May be empty when the URL is described
	      by keyword arguments instead.
	    * `kwargs`: Individual components that override the matching portion of
	      `url`. The keys "scheme", "authority", "path", "query", "fragment",
	      "userinfo", "host" and "port" are validated directly. The convenience
	      keys "netloc", "username", "password" and "raw_path" are translated
	      into those first.

	    Raises `InvalidURL` if the URL or a component is too long, contains
	    non-printable ASCII characters, or fails validation.
	    """
	    # Initial basic checks on allowable URLs.
	    # ---------------------------------------

	    # Hard limit the maximum allowable URL length.
	    if len(url) > MAX_URL_LENGTH:
	        raise InvalidURL("URL too long")

	    # If a URL includes any ASCII control characters including \t, \r, \n,
	    # then treat it as invalid.
	    if any(char.isascii() and not char.isprintable() for char in url):
	        raise InvalidURL("Invalid non-printable ASCII character in URL")

	    # Some keyword arguments require special handling.
	    # ------------------------------------------------

	    # Coerce "port" to a string, if it is provided as an integer.
	    if "port" in kwargs:
	        port = kwargs["port"]
	        kwargs["port"] = str(port) if isinstance(port, int) else port

	    # Replace "netloc" with "host" and "port".
	    if "netloc" in kwargs:
	        netloc = kwargs.pop("netloc") or ""
	        closing_bracket = netloc.find("]") if netloc.startswith("[") else -1
	        if closing_bracket == -1:
	            kwargs["host"], _, kwargs["port"] = netloc.partition(":")
	        else:
	            # A bracketed IPv6 literal contains ':' characters itself, so the
	            # port delimiter is the ':' that follows the closing ']'.
	            remainder = netloc[closing_bracket + 1 :]
	            if remainder and not remainder.startswith(":"):
	                raise InvalidURL("Invalid URL component 'netloc'")
	            kwargs["host"] = netloc[: closing_bracket + 1]
	            kwargs["port"] = remainder[1:]

	    # Replace "username" and/or "password" with "userinfo".
	    if "username" in kwargs or "password" in kwargs:
	        username = quote(kwargs.pop("username", "") or "")
	        password = quote(kwargs.pop("password", "") or "")
	        kwargs["userinfo"] = f"{username}:{password}" if password else username

	    # Replace "raw_path" with "path" and "query".
	    if "raw_path" in kwargs:
	        raw_path = kwargs.pop("raw_path") or ""
	        kwargs["path"], separator, kwargs["query"] = raw_path.partition("?")
	        if not separator:
	            # No '?' at all: the query is absent rather than empty.
	            kwargs["query"] = None

	    # Ensure that IPv6 "host" addresses are always escaped with "[...]".
	    if "host" in kwargs:
	        host = kwargs.get("host") or ""
	        if ":" in host and not (host.startswith("[") and host.endswith("]")):
	            kwargs["host"] = f"[{host}]"

	    # If any keyword arguments are provided, ensure they are valid.
	    # -------------------------------------------------------------

	    for key, value in kwargs.items():
	        if value is not None:
	            if len(value) > MAX_URL_LENGTH:
	                raise InvalidURL(f"URL component '{key}' too long")

	            # If a component includes any ASCII control characters including
	            # \t, \r, \n, then treat it as invalid.
	            if any(char.isascii() and not char.isprintable() for char in value):
	                raise InvalidURL(
	                    f"Invalid non-printable ASCII character in URL component '{key}'"
	                )

	            # Ensure that keyword arguments match as a valid regex.
	            if not COMPONENT_REGEX[key].fullmatch(value):
	                raise InvalidURL(f"Invalid URL component '{key}'")

	    # The URL_REGEX will always match, but may have empty components.
	    url_match = URL_REGEX.match(url)
	    assert url_match is not None
	    url_dict = url_match.groupdict()

	    # * 'scheme', 'authority', and 'path' may be empty strings.
	    # * 'query' may be 'None', indicating no trailing "?" portion.
	    #   Any string including the empty string, indicates a trailing "?".
	    # * 'fragment' may be 'None', indicating no trailing "#" portion.
	    #   Any string including the empty string, indicates a trailing "#".
	    scheme = kwargs.get("scheme", url_dict["scheme"]) or ""
	    authority = kwargs.get("authority", url_dict["authority"]) or ""
	    path = kwargs.get("path", url_dict["path"]) or ""
	    query = kwargs.get("query", url_dict["query"])
	    fragment = kwargs.get("fragment", url_dict["fragment"])

	    # The AUTHORITY_REGEX will always match, but may have empty components.
	    authority_match = AUTHORITY_REGEX.match(authority)
	    assert authority_match is not None
	    authority_dict = authority_match.groupdict()

	    # * 'userinfo' and 'host' may be empty strings.
	    # * 'port' may be 'None'.
	    userinfo = kwargs.get("userinfo", authority_dict["userinfo"]) or ""
	    host = kwargs.get("host", authority_dict["host"]) or ""
	    port = kwargs.get("port", authority_dict["port"])

	    # Normalize and validate each component.
	    # We end up with a parsed representation of the URL,
	    # with components that are plain ASCII bytestrings.
	    parsed_scheme: str = scheme.lower()
	    parsed_userinfo: str = quote(userinfo, safe=SUB_DELIMS + ":")
	    parsed_host: str = encode_host(host)
	    # Use the lowercased scheme, so that the default port is also dropped
	    # for URLs such as "HTTP://example.com:80".
	    parsed_port: int | None = normalize_port(port, parsed_scheme)

	    has_scheme = parsed_scheme != ""
	    has_authority = (
	        parsed_userinfo != "" or parsed_host != "" or parsed_port is not None
	    )
	    validate_path(path, has_scheme=has_scheme, has_authority=has_authority)
	    if has_authority:
	        path = normalize_path(path)

	    # The GEN_DELIMS set is... : / ? # [ ] @
	    # These do not need to be percent-quoted unless they serve as delimiters
	    # for the specific component.

	    # For 'path' we need to drop ? and # from the GEN_DELIMS set.
	    parsed_path: str = quote(path, safe=SUB_DELIMS + ":/[]@")
	    # For 'query' we need to drop '#' from the GEN_DELIMS set.
	    parsed_query: str | None = (
	        None if query is None else quote(query, safe=SUB_DELIMS + ":/?[]@")
	    )
	    # For 'fragment' we can include all of the GEN_DELIMS set.
	    parsed_fragment: str | None = (
	        None if fragment is None else quote(fragment, safe=SUB_DELIMS + ":/?#[]@")
	    )

	    # The parsed ASCII bytestrings are our canonical form.
	    # All properties of the URL are derived from these.
	    return ParseResult(
	        parsed_scheme,
	        parsed_userinfo,
	        parsed_host,
	        parsed_port,
	        parsed_path,
	        parsed_query,
	        parsed_fragment,
	    )


	def encode_host(host: str) -> str:
	    """
	    Validate a host component and return its normalized ASCII form.

	    * An empty host is returned as the empty string.
	    * A host shaped like '#.#.#.#' must be a valid IPv4 address, and is
	      returned unchanged.
	    * A host shaped like '[...]' must wrap a valid IPv6 address, and is
	      returned without the square brackets.
	    * Any other ASCII host is lowercased and percent-quoted.
	    * Any other host is lowercased and IDNA encoded.

	    Raises `InvalidURL` if an IP address or IDNA hostname is not valid.
	    """
	    if not host:
	        return ""

	    if IPv4_STYLE_HOSTNAME.match(host):
	        # From https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2
	        #
	        # IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
	        try:
	            ipaddress.IPv4Address(host)
	        except ipaddress.AddressValueError:
	            raise InvalidURL(f"Invalid IPv4 address: {host!r}")
	        return host

	    if IPv6_STYLE_HOSTNAME.match(host):
	        # From https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2
	        #
	        # "A host identified by an Internet Protocol literal address, version 6
	        # [RFC3513] or later, is distinguished by enclosing the IP literal
	        # within square brackets ("[" and "]"). This is the only place where
	        # square bracket characters are allowed in the URI syntax."
	        unbracketed = host[1:-1]
	        try:
	            ipaddress.IPv6Address(unbracketed)
	        except ipaddress.AddressValueError:
	            raise InvalidURL(f"Invalid IPv6 address: {host!r}")
	        return unbracketed

	    lowered = host.lower()

	    if host.isascii():
	        # From https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2
	        #
	        # reg-name = *( unreserved / pct-encoded / sub-delims )
	        return quote(lowered, safe=SUB_DELIMS)

	    # Anything left contains non-ASCII characters: an IDNA hostname.
	    try:
	        return idna.encode(lowered).decode("ascii")
	    except idna.IDNAError:
	        raise InvalidURL(f"Invalid IDNA hostname: {host!r}")


	def normalize_port(port: str \| int \| None, scheme: str) -> int \| None:
	# From https://tools.ietf.org/html/rfc3986#section-3.2.3
	#
	# "A scheme may define a default port. For example, the "http" scheme
	# defines a default port of "80", corresponding to its reserved TCP
	# port number. The type of port designated by the port number (e.g.,
	# TCP, UDP, SCTP) is defined by the URI scheme. URI producers and
	# normalizers should omit the port component and its ":" delimiter if
	# port is empty or if its value would be the same as that of the
	# scheme's default."
	if port is None or port == "":
	return None

	try:
	port_as_int = int(port)
	except ValueError:
	raise InvalidURL(f"Invalid port: {port!r}")

	# See https://url.spec.whatwg.org/#url-miscellaneous
	default_port = {"ftp": 21, "http": 80, "https": 443, "ws": 80, "wss": 443}.get(
	scheme
	)
	if port_as_int == default_port:
	return None
	return port_as_int


	def validate_path(path: str, has_scheme: bool, has_authority: bool) -> None:
	    """
	    Check the path against the rules that depend on whether the URL has a
	    scheme or an authority component. Raises `InvalidURL` on a violation.

	    See https://datatracker.ietf.org/doc/html/rfc3986.html#section-3.3
	    """
	    if has_authority:
	        # With an authority component, the path must either be empty or
	        # begin with a slash ("/") character.
	        if path and path[0] != "/":
	            raise InvalidURL("For absolute URLs, path must be empty or begin with '/'")
	        return

	    # Without an authority component, the path cannot begin with two slash
	    # characters ("//").
	    if path.startswith("//"):
	        raise InvalidURL(
	            "URLs with no authority component cannot have a path starting with '//'"
	        )

	    # Without a scheme either, this is a relative-path reference, and a
	    # leading colon (":") is rejected.
	    if not has_scheme and path.startswith(":"):
	        raise InvalidURL(
	            "URLs with no scheme component cannot have a path starting with ':'"
	        )


	def normalize_path(path: str) -> str:
	    """
	    Remove "." and ".." segments from a URL path.

	    A "." segment is dropped. A ".." segment also removes the segment before
	    it, except that the leading empty segment of an absolute path is kept.

	    For example:

	        normalize_path("/path/./to/somewhere/..") == "/path/to"
	    """
	    # https://datatracker.ietf.org/doc/html/rfc3986#section-5.2.4
	    kept: list[str] = []
	    for segment in path.split("/"):
	        if segment == ".":
	            continue
	        if segment != "..":
	            kept.append(segment)
	            continue
	        # "..": step back one segment, but never above the root.
	        if kept and kept != [""]:
	            kept.pop()
	    return "/".join(kept)


	def percent_encode(char: str) -> str:
	    """
	    Return the percent-encoded representation of a single character.

	    The character is encoded as UTF-8, and each resulting byte is written as
	    '%' followed by two uppercase hexadecimal digits. Characters outside the
	    ASCII range therefore produce several '%XX' groups.

	    For example:

	        percent_encode(" ") == "%20"
	    """
	    return "".join(f"%{octet:02X}" for octet in char.encode("utf-8"))


	def is_safe(string: str, safe: str = "/") -> bool:
	    """
	    Return True if every character in `string` is an unreserved character,
	    one of the characters in `safe`, or '%'.
	    """
	    allowed = UNRESERVED_CHARACTERS + safe + "%"
	    return all(char in allowed for char in string)


	def percent_encoded(string: str, safe: str = "/") -> str:
	    """
	    Use percent-encoding to quote a string.

	    Every character that is neither an unreserved character nor listed in
	    `safe` is replaced by its percent-encoded UTF-8 byte sequence. That
	    includes '%' itself, which becomes '%25'. Use `quote` to keep existing
	    '%xx' escape sequences intact.

	    * `string`: The string to be percent-escaped.
	    * `safe`: Additional characters that are left unescaped.
	    """
	    NON_ESCAPED_CHARS = UNRESERVED_CHARACTERS + safe

	    # Fast path for strings that need no escaping at all. A literal '%' does
	    # not count as safe here: it has to become '%25' whatever the other
	    # characters in the string are, or the output would be ambiguous.
	    if not string.rstrip(NON_ESCAPED_CHARS):
	        return string

	    return "".join(
	        [char if char in NON_ESCAPED_CHARS else percent_encode(char) for char in string]
	    )


	def quote(string: str, safe: str = "/") -> str:
	    """
	    Percent-encode a string, leaving existing '%xx' escape sequences as-is.

	    See: https://www.rfc-editor.org/rfc/rfc3986#section-2.1

	    * `string`: The string to be percent-escaped.
	    * `safe`: A string containing characters that may be treated as safe, and
	      do not need to be escaped. Unreserved characters are always treated as
	      safe. See: https://www.rfc-editor.org/rfc/rfc3986#section-2.3
	    """
	    chunks = []
	    cursor = 0
	    for escape in PERCENT_ENCODED_REGEX.finditer(string):
	        begin, end = escape.span()
	        # Encode whatever lies between the previous escape and this one.
	        if begin > cursor:
	            chunks.append(percent_encoded(string[cursor:begin], safe=safe))
	        # The '%xx' escape sequence itself is copied through unchanged.
	        chunks.append(escape.group(0))
	        cursor = end

	    # Encode whatever follows the final escape sequence.
	    if cursor < len(string):
	        chunks.append(percent_encoded(string[cursor:], safe=safe))

	    return "".join(chunks)


	def urlencode(items: list[tuple[str, str]]) -> str:
	    """
	    Encode a list of (key, value) string pairs as a query string.

	    We can use a much simpler version of the stdlib urlencode here because
	    we don't need to handle a bunch of different typing cases, such as
	    bytes vs str.

	    https://github.com/python/cpython/blob/b2f7b2ef0b5421e01efb8c7bee2ef95d3bab77eb/Lib/urllib/parse.py#L926

	    Note that we use '%20' encoding for spaces, and '%2F' for '/'.
	    This is slightly different than `requests`, but is the behaviour that
	    browsers use.

	    See
	    - https://github.com/encode/httpx/issues/2536
	    - https://github.com/encode/httpx/issues/2721
	    - https://docs.python.org/3/library/urllib.parse.html#urllib.parse.urlencode
	    """
	    return "&".join(
	        "=".join((percent_encoded(key, safe=""), percent_encoded(value, safe="")))
	        for key, value in items
	    )