Source code for apeye.url

#!/usr/bin/env python
#
#  url.py
"""
:mod:`pathlib`-like approach to URLs.

.. versionchanged:: 1.0.0

	:class:`~apeye.slumber_url.SlumberURL` and :class:`~apeye.requests_url.RequestsURL`
	moved to :mod:`apeye.slumber_url` and :mod:`apeye.requests_url` respectively.
"""
#
#  Copyright © 2020-2021 Dominic Davis-Foster <dominic@davis-foster.co.uk>
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU Lesser General Public License as published by
#  the Free Software Foundation; either version 3 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#  GNU Lesser General Public License for more details.
#
#  You should have received a copy of the GNU Lesser General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
#  MA 02110-1301, USA.
#
#  Based on the "pathlib" module from CPython.
#  Licensed under the Python Software Foundation License Version 2.
#  Copyright © 2001-2020 Python Software Foundation. All rights reserved.
#  Copyright © 2000 BeOpen.com. All rights reserved.
#  Copyright © 1995-2000 Corporation for National Research Initiatives. All rights reserved.
#  Copyright © 1991-1995 Stichting Mathematisch Centrum. All rights reserved.
#
#  Based on Slumber <https://slumber.readthedocs.io>
#  Copyright (c) 2011 Donald Stufft
#  Licensed under the 2-clause BSD License
#
#  Some docstrings from Requests <https://requests.readthedocs.io>
#  Copyright 2019 Kenneth Reitz
#  Licensed under the Apache License, Version 2.0
#

# stdlib
import ipaddress
import os
import pathlib
import re
from operator import attrgetter
from typing import (
		TYPE_CHECKING,
		Any,
		Dict,
		Iterable,
		List,
		Mapping,
		NamedTuple,
		Optional,
		Tuple,
		Type,
		TypeVar,
		Union
		)
from urllib.parse import ParseResult, parse_qs, urlencode, urlparse, urlunparse

# 3rd party
from domdf_python_tools.doctools import prettify_docstrings
from domdf_python_tools.typing import PathLike

# this package
from apeye import _tld

if TYPE_CHECKING:
	# stdlib
	from typing import NoReturn

# Public names exported by this module.
__all__ = ["URL", "URLPath", "Domain", "URLType", "URLPathType"]

# Type variable bound to ``URL``. Used in method signatures below
# (``self: URLType`` / ``cls: Type[URLType]``) so the annotated return type
# follows the class the method is called on.
URLType = TypeVar("URLType", bound="URL")

# Type variable bound to ``URLPath``, used in the same way for the
# ``URLPath`` methods that return a new path.
URLPathType = TypeVar("URLPathType", bound="URLPath")
"""
.. versionadded:: 1.1.0
"""


@prettify_docstrings
class URLPath(pathlib.PurePosixPath):
	"""
	Represents the path part of a URL.

	Subclass of :class:`pathlib.PurePosixPath` that provides a subset of its methods.

	The only behavioural difference from :class:`pathlib.PurePosixPath` implemented here is that
	an empty path is rendered as ``''`` rather than ``'.'``, and that :meth:`~.URLPath.as_uri`
	is not supported.

	.. versionchanged:: 1.1.0

		Implemented :meth:`~.URLPath.is_absolute`, :meth:`~.URLPath.joinpath`,
		:meth:`~.URLPath.relative_to`, :meth:`~.pathlib.PurePath.match`,
		``anchor``, ``drive``, and support for rich comparisons (``<``, ``<=``, ``>`` and ``>=``),
		which previously raised :exc:`NotImplementedError`.
	"""

	def __str__(self) -> str:
		"""
		Return the string representation of the path, suitable for passing to system calls.

		An empty path is returned as ``''`` (:class:`pathlib.PurePosixPath` would give ``'.'``).
		"""

		# Delegate to the parent class instead of reading pathlib's private attributes
		# (``_drv``, ``_root``, ``_parts``, ``pathlib._posix_flavour``), whose names and
		# layout are implementation details that differ between Python versions.
		# The parent only produces '.' for an empty path (a literal '.' component is
		# collapsed during parsing), so mapping '.' to '' is unambiguous.
		text = super().__str__()
		return '' if text == '.' else text

	def __repr__(self):
		"""
		Return the string representation of the :class:`~.URLPath`.

		Defers to the parent class, which formats the class name together with the path string.
		"""

		return super().__repr__()

	def is_absolute(self) -> bool:
		"""
		Returns whether the path is absolute (i.e. starts with ``/``).

		.. versionadded:: 1.1.0  previously raised :exc:`NotImplementedError`.
		"""

		return self.root == '/'

	def joinpath(self: URLPathType, *args) -> URLPathType:
		"""
		Combine this :class:`~.URLPath` with one or several arguments.

		.. versionadded:: 1.1.0  previously raised :exc:`NotImplementedError`.

		:param args: The path segments to append.

		:returns: A new :class:`~.URLPath` representing either a subpath
			(if all arguments are relative paths) or a totally different path
			(if one of the arguments is absolute).
		"""

		return super().joinpath(*args)

	def relative_to(self: URLPathType, *other: PathLike) -> URLPathType:
		r"""
		Returns the relative path to another path identified by the passed arguments.

		The arguments are joined together to form a single path, and therefore the following behave identically:

		.. code-block:: pycon

			>>> URLPath("/news/sport").relative_to("/", "news")
			URLPath('sport')
			>>> URLPath("/news/sport").relative_to("/news")
			URLPath('sport')

		.. versionadded:: 1.1.0  previously raised :exc:`NotImplementedError`.

		:param \*other:

		:raises ValueError: if the operation is not possible (because this is not a subpath of the other path)

		.. seealso::

			:meth:`~.URL.relative_to`, which is recommended when constructing a relative path from a :class:`~URL`.
			This method cannot correctly handle some cases, such as:

			.. code-block:: pycon

				>>> URL("https://github.com/domdfcoding").path.relative_to(URL("https://github.com").path)
				Traceback (most recent call last):
				ValueError: '/domdfcoding' does not start with ''

			Since ``URL("https://github.com").path`` is ``URLPath('')``.

			Instead, use:

				>>> URL("https://github.com/domdfcoding").relative_to(URL("https://github.com"))
				URLPath('domdfcoding')
		"""

		return super().relative_to(*other)

	def as_uri(self, *args, **kwargs) -> "NoReturn":  # noqa: D102
		# Not supported for URL paths; always raises.
		raise NotImplementedError


class URL(os.PathLike):
	r"""
	:mod:`pathlib`-like class for URLs.

	:param url: The URL to construct the :class:`~apeye.url.URL` object from.

	.. versionchanged:: 0.3.0  The ``url`` parameter can now be a string or a :class:`~.URL`.

	.. versionchanged:: 1.1.0

		Added support for sorting and rich comparisons (``<``, ``<=``, ``>`` and ``>=``).

	.. autoclasssumm:: URL
		:autosummary-sections: Methods

	.. autosummary-widths:: 1/5

	.. autoclasssumm:: URL
		:autosummary-sections: Attributes
	"""

	#: URL scheme specifier
	scheme: str

	#: Network location part of the URL
	netloc: str

	#: The hierarchical path of the URL
	path: URLPath

	query: Dict[str, List[str]]
	"""
	The query parameters of the URL, if present.

	.. versionadded:: 0.7.0
	"""

	fragment: Optional[str]
	"""
	The URL fragment, used to identify a part of the document. :py:obj:`None` if absent from the URL.

	.. versionadded:: 0.7.0
	"""

	def __init__(self, url: Union[str, "URL"] = ''):
		if isinstance(url, URL):
			url = str(url)

		# ``urlparse`` only treats text as the netloc when it is introduced by "//",
		# so prepend it unless the string already starts with "//" or "<scheme>://".
		# The scheme pattern follows RFC 3986 (a letter, then letters, digits, '+', '-' or '.')
		# so that schemes such as "s3" or "git+https" are not mistaken for part of the netloc.
		if not re.match(r"([A-Za-z][A-Za-z0-9+.-]*:)?//", url):
			url = "//" + str(url)

		# ``params`` (the ";parameters" of the last path element) is parsed but not stored.
		scheme, netloc, parts, params, query, fragment = urlparse(url)

		self.scheme: str = scheme
		self.netloc: str = netloc
		self.path = URLPath(parts)
		self.query = parse_qs(query or '')
		self.fragment = fragment or None

	@property
	def port(self) -> Optional[int]:
		"""
		The port number of the URL as an integer, if present. Default :py:obj:`None`.

		Any ``user:password@`` prefix in the netloc is ignored, as are the colons inside
		a bracketed IPv6 address such as ``[::1]``.

		:raises ValueError: if the text after the final colon of the host is not an integer.

		.. versionadded:: 0.7.0
		"""

		# Discard any userinfo so that the colon in "user:password@host" isn't mistaken for a port separator.
		host_and_port = self.netloc.rpartition('@')[2]

		# For a bracketed IPv6 literal only the text after the closing bracket can contain the port.
		if host_and_port.startswith('['):
			host_and_port = host_and_port.rpartition(']')[2]

		_, colon, port = host_and_port.rpartition(':')

		if not colon or not port:
			# No colon at all, or a trailing colon with no port number after it.
			return None

		return int(port)

	@classmethod
	def from_parts(
			cls: Type[URLType],
			scheme: str,
			netloc: str,
			path: PathLike,
			query: Optional[Mapping[Any, List]] = None,
			fragment: Optional[str] = None,
			) -> URLType:
		"""
		Construct a :class:`~apeye.url.URL` from a scheme, netloc and path.

		:param scheme: The scheme of the URL, e.g ``'http'``.
		:param netloc: The netloc of the URL, e.g. ``'bbc.co.uk:80'``.
		:param path: The path of the URL, e.g. ``'/news'``.
		:param query: The query parameters of the URL, if present.
		:param fragment: The URL fragment, used to identify a part of the document.
			:py:obj:`None` if absent from the URL.

		Put together, the resulting path would be ``'http://bbc.co.uk:80/news'``

		:rtype:

		.. versionchanged:: 0.7.0  Added the ``query`` and ``fragment`` arguments.
		"""

		obj = cls('')
		obj.scheme = scheme
		obj.netloc = netloc
		# Copy the mapping so later changes to the caller's object don't affect the URL.
		obj.query = dict(query or {})
		obj.fragment = fragment or None

		path = URLPath(path)

		# The stored path is always absolute; relative paths gain a leading slash.
		if path.root == '/':
			obj.path = path
		else:
			obj.path = URLPath('/' + str(path))

		return obj

	def __str__(self) -> str:
		"""
		Returns the :class:`~apeye.url.URL` as a string.
		"""

		query = urlencode(self.query, doseq=True)
		url = urlunparse([self.scheme, self.netloc, str(self.path), None, query, self.fragment])

		# Without a scheme ``urlunparse`` produces "//netloc/path"; drop the leading slashes.
		if url.startswith("//"):
			return url[2:]
		else:
			return url

	def __repr__(self) -> str:
		"""
		Returns the string representation of the :class:`~apeye.url.URL`.
		"""

		return f"{self.__class__.__name__}({str(self)!r})"

	def __truediv__(self: URLType, key: Union[PathLike, int]) -> URLType:
		"""
		Construct a new :class:`~apeye.url.URL` object for the given child of this :class:`~apeye.url.URL`.

		:rtype:

		.. versionchanged:: 0.7.0

			* Added support for division by integers.

			* Now officially supports the new path having a URL fragment and/or query parameters.
			  Any URL fragment or query parameters from the parent URL are not inherited by its children.
		"""

		try:
			return self._make_child((key, ))
		except TypeError:
			# Let Python try the reflected operation or raise its own TypeError.
			return NotImplemented

	def _make_child(self: URLType, args: Iterable[Union[PathLike, int]]) -> URLType:
		"""
		Construct a new :class:`~apeye.url.URL` object by combining the given arguments with this instance's path part.

		.. versionadded:: 1.1.0  (private)

		Except for the final path element any queries and fragments are ignored.

		:param args: The path elements to join to this URL's path.

		:returns: A new :class:`~.URL` representing either a subpath
			(if all arguments are relative paths) or a totally different path
			(if one of the arguments is absolute).

		:raises TypeError: if an argument cannot be parsed as a URL.
		"""

		parsed_args: List[ParseResult] = []

		for arg in args:

			# Keep the unconverted value for use in the error message.
			raw_arg = arg

			if isinstance(arg, pathlib.PurePath):
				arg = arg.as_posix()
			elif isinstance(arg, os.PathLike):
				arg = os.fspath(arg)
			elif isinstance(arg, int):
				arg = str(arg)

			try:
				parse_result = urlparse(arg)
			except AttributeError as e:
				# ``urlparse`` attempts to call ``.decode`` on objects which aren't strings;
				# report that as a TypeError, and leave any other AttributeError untouched.
				if str(e).endswith("'decode'"):
					msg = f"Cannot join {type(raw_arg).__name__!r} to a {type(self.path).__name__!r}"
					raise TypeError(msg) from None
				else:
					raise

			parsed_args.append(parse_result)

		try:
			new_path = self.from_parts(
					self.scheme,
					self.netloc,
					self.path.joinpath(*map(attrgetter("path"), parsed_args)),
					)
		except TypeError:
			# NOTE(review): this value is also returned to callers of ``joinurl``,
			# where an exception would be more conventional. Behaviour kept as-is.
			return NotImplemented

		# Only the final element contributes its query string and fragment.
		if parsed_args:
			new_path.query = parse_qs(parsed_args[-1].query)
			new_path.fragment = parsed_args[-1].fragment or None

		return new_path

	def joinurl(self: URLType, *args) -> URLType:
		"""
		Construct a new :class:`~apeye.url.URL` object by combining the given arguments with this instance's path part.

		.. versionadded:: 1.1.0

		Except for the final path element any queries and fragments are ignored.

		:param args: The path elements to join to this URL's path.

		:returns: A new :class:`~.URL` representing either a subpath
			(if all arguments are relative paths) or a totally different path
			(if one of the arguments is absolute).
		"""

		return self._make_child(args)

	def __fspath__(self) -> str:
		"""
		Returns the file system path representation of the :class:`~.URL`.

		This is comprised of the ``netloc`` and ``path`` attributes.
		"""

		return f"{self.netloc}{self.path}"

	def __eq__(self, other) -> bool:
		"""
		Return ``self == other``.

		.. latex:vspace:: -10px

		.. attention::

			URL fragments and query parameters are not compared.

			.. seealso:: :meth:`.URL.strict_compare`, which *does* consider those attributes.

		.. latex:vspace:: -20px
		"""

		if isinstance(other, URL):
			return self.netloc == other.netloc and self.scheme == other.scheme and self.path == other.path
		else:
			return NotImplemented

	def __lt__(self, other) -> bool:
		"""
		Return ``self < other``, comparing the scheme, domain parts, port and path parts in that order.
		"""

		if isinstance(other, URL):
			return self._parts_port < other._parts_port
		else:
			return NotImplemented

	def __le__(self, other) -> bool:
		"""
		Return ``self <= other``, comparing the scheme, domain parts, port and path parts in that order.
		"""

		if isinstance(other, URL):
			return self._parts_port <= other._parts_port
		else:
			return NotImplemented

	def __gt__(self, other) -> bool:
		"""
		Return ``self > other``, comparing the scheme, domain parts, port and path parts in that order.
		"""

		if isinstance(other, URL):
			return self._parts_port > other._parts_port
		else:
			return NotImplemented

	def __ge__(self, other) -> bool:
		"""
		Return ``self >= other``, comparing the scheme, domain parts, port and path parts in that order.
		"""

		if isinstance(other, URL):
			return self._parts_port >= other._parts_port
		else:
			return NotImplemented

	def strict_compare(self, other) -> bool:
		"""
		Return ``self ≡ other``, comparing the scheme, netloc, path, fragment and query parameters.

		.. versionadded:: 0.7.0
		"""

		if isinstance(other, URL):
			return (
					self.netloc == other.netloc and self.scheme == other.scheme and self.path == other.path
					and self.query == other.query and self.fragment == other.fragment
					)
		else:
			# NOTE(review): returns the ``NotImplemented`` singleton (not ``False``)
			# for objects which aren't URLs. Behaviour kept as-is.
			return NotImplemented

	def __hash__(self) -> int:
		"""
		Returns the hash of the :class:`~apeye.url.URL`.

		Uses the same attributes as ``==`` (scheme, netloc and path).
		"""

		return hash((self.scheme, self.netloc, self.path))

	@property
	def name(self) -> str:
		"""
		The final path component, if any.
		"""

		return self.path.name

	@property
	def suffix(self) -> str:
		"""
		The final component's last suffix, if any.

		This includes the leading period. For example: ``'.txt'``.
		"""
		return self.path.suffix

	@property
	def suffixes(self) -> List[str]:
		"""
		A list of the final component's suffixes, if any.

		These include the leading periods. For example: ``['.tar', '.gz']``.
		"""
		return self.path.suffixes

	@property
	def stem(self) -> str:
		"""
		The final path component, minus its last suffix.
		"""

		return self.path.stem

	def with_name(self: URLType, name: str, inherit: bool = True) -> URLType:
		"""
		Return a new :class:`~apeye.url.URL` with the file name changed.

		:param name:
		:param inherit: Whether the new :class:`~apeye.url.URL` should inherit the query string
			and fragment from this :class:`~apeye.url.URL`.

		:rtype:

		.. versionchanged:: 0.7.0  Added the ``inherit`` parameter.
		"""

		if inherit:
			kwargs = {"query": self.query, "fragment": self.fragment}
		else:
			kwargs = {}

		return self.from_parts(
				self.scheme,
				self.netloc,
				self.path.with_name(name),
				**kwargs,  # type: ignore
				)

	def with_suffix(self: URLType, suffix: str, inherit: bool = True) -> URLType:
		"""
		Returns a new :class:`~apeye.url.URL` with the file suffix changed.

		If the :class:`~apeye.url.URL` has no suffix, add the given suffix.

		If the given suffix is an empty string, remove the suffix from the :class:`~apeye.url.URL`.

		:param suffix:
		:param inherit: Whether the new :class:`~apeye.url.URL` should inherit the query string
			and fragment from this :class:`~apeye.url.URL`.

		:rtype:

		.. versionchanged:: 0.7.0  Added the ``inherit`` parameter.
		"""

		if inherit:
			kwargs = {"query": self.query, "fragment": self.fragment}
		else:
			kwargs = {}

		return self.from_parts(
				self.scheme,
				self.netloc,
				self.path.with_suffix(suffix),
				**kwargs,  # type: ignore
				)

	@property
	def parts(self) -> Tuple[str, ...]:
		"""
		An object providing sequence-like access to the components in the URL.

		To retrieve only the parts of the path, use :meth:`URL.path.parts <URLPath.parts>`.
		"""

		# Look the domain up once rather than once per field.
		domain = self.domain

		return (
				self.scheme,
				domain.subdomain,
				domain.domain,
				domain.suffix,
				# Make the path absolute, then drop the leading '/' element.
				*('/' / self.path).parts[1:],
				)

	@property
	def _parts_port(self) -> Tuple:
		"""
		An object providing sequence-like access to the components in the URL.

		Unlike ``.parts`` this includes the port.

		To retrieve only the parts of the path, use :meth:`URL.path.parts <URLPath.parts>`.

		.. versionadded:: 1.1.0  (private)
		"""

		# Look the domain up once rather than once per field.
		domain = self.domain

		return (
				self.scheme,
				domain.subdomain,
				domain.domain,
				domain.suffix,
				# A missing port is represented as 0 so that the tuples remain comparable.
				self.port or 0,
				*('/' / self.path).parts[1:],
				)

	@property
	def parent(self: URLType) -> URLType:
		"""
		The logical parent of the :class:`~apeye.url.URL`.

		The query parameters and fragment are not carried over.
		"""

		return self.from_parts(self.scheme, self.netloc, self.path.parent)

	@property
	def parents(self: URLType) -> Tuple[URLType, ...]:
		"""
		An immutable sequence providing access to the logical ancestors of the :class:`~apeye.url.URL`.
		"""

		return tuple(self.from_parts(self.scheme, self.netloc, path) for path in self.path.parents)

	@property
	def fqdn(self) -> str:
		"""
		Returns the Fully Qualified Domain Name of the :class:`~apeye.url.URL`.
		"""

		return self.domain.fqdn

	@property
	def domain(self) -> "Domain":
		"""
		Returns a :class:`apeye.url.Domain` object representing the domain part of the URL.
		"""

		return Domain._make(_tld.extract_tld(self.netloc))

	@property
	def base_url(self: URLType) -> URLType:
		"""
		Returns a :class:`apeye.url.URL` object representing the URL without query strings or URL fragments.

		.. versionadded:: 0.7.0
		"""

		return self.from_parts(
				self.scheme,
				self.netloc,
				self.path,
				)

	def relative_to(self, other: Union[str, "URL", URLPath]) -> URLPath:
		"""
		Returns a version of this URL's path relative to ``other``.

		.. versionadded:: 1.1.0

		:param other: Either a :class:`~.URL`, or a string or :class:`~.URLPath` representing an *absolute* path.
			If a :class:`~.URL`, the :attr:`~.URL.netloc` must match this URL's.

		:raises ValueError: if the operation is not possible
			(i.e. because this URL's path is not a subpath of the other path)
		"""

		if isinstance(other, URLPath):
			if not other.is_absolute():
				raise ValueError("'URL.relative_to' cannot be used with relative URLPath objects")
			else:
				other = URL('/') / other
		elif not isinstance(other, URL):
			# Parse other as a URL
			other = URL(other)

		# Compare netloc, if both have one
		if self.netloc and other.netloc and self.netloc.lower() != other.netloc.lower():
			raise ValueError(f"{self!r} does not start with {other!r}")

		# Make the paths absolute
		# If coming from a URL they must always be absolute
		our_path = '/' / self.path
		other_path = '/' / other.path

		relative_path = our_path.relative_to(other_path)

		return relative_path


class Domain(NamedTuple):
	"""
	:class:`typing.NamedTuple` holding the subdomain, domain and suffix of a URL.
	"""

	subdomain: str
	domain: str
	suffix: str

	@property
	def registered_domain(self):
		"""
		The domain and suffix joined with a dot, or an empty string if either is missing.

		.. code-block:: python

			>>> URL('https://forums.bbc.co.uk').domain.registered_domain
			'bbc.co.uk'
			>>> URL('https://localhost:8080').domain.registered_domain
			''
		"""

		if not (self.domain and self.suffix):
			return ''

		return '.'.join((self.domain, self.suffix))

	@property
	def fqdn(self):
		"""
		The Fully Qualified Domain Name, or an empty string if the domain or suffix is missing.

		.. code-block:: python

			>>> URL('https://forums.bbc.co.uk/path/to/file').domain.fqdn
			'forums.bbc.co.uk'
			>>> URL('https://localhost:8080').domain.fqdn
			''
		"""

		if not (self.domain and self.suffix):
			return ''

		# Iterating over the namedtuple yields (subdomain, domain, suffix); empty fields are skipped.
		return '.'.join(filter(None, self))

	@property
	def ipv4(self) -> Optional[ipaddress.IPv4Address]:
		"""
		The IPv4 address, if that is what the domain represents, otherwise :py:obj:`None`.

		.. code-block:: python

			>>> URL('https://127.0.0.1/path/to/file').domain.ipv4
			IPv4Address('127.0.0.1')
			>>> URL('https://127.0.0.1.1/path/to/file').domain.ipv4
			>>> URL('https://256.1.1.1').domain.ipv4
		"""

		# Only considered when the "domain" field is the sole populated field.
		if self.suffix or self.subdomain:
			return None

		if not _tld.IP_RE.match(self.domain):
			return None

		return ipaddress.ip_address(self.domain)

	def __repr__(self) -> str:
		"""
		Return a string representation of the :class:`~.Domain`.
		"""

		# Defined explicitly so the method carries the custom docstring above.

		pairs = ", ".join(f"{field}={value!r}" for field, value in zip(self._fields, self))
		return f"{type(self).__name__}({pairs})"