# Source code for lars.datatypes.url

# vim: set et sw=4 sts=4 fileencoding=utf-8:
#
# Copyright (c) 2013-2017 Dave Jones <dave@waveform.org.uk>
# Copyright (c) 2013 Mime Consulting Ltd. <info@mimeconsulting.co.uk>
# All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

"""
Defines the URL parsing specific parts of :mod:`lars.datatypes`.
"""

from __future__ import (
    unicode_literals,
    absolute_import,
    print_function,
    division,
    )

from collections import namedtuple
try:
    from urllib import parse
except ImportError:
    import urlparse as parse

from .ipaddress import hostname

# Rebind ``str`` to the type of a plain string literal. Because this module
# enables ``unicode_literals``, that is the text type on both Python 2
# (``unicode``) and Python 3 (``str``), so the ``isinstance(..., str)`` and
# ``str(...)`` calls below behave the same on either version.
str = type('')  # pylint: disable=redefined-builtin,invalid-name


def path(s):
    """
    Parse a POSIX-style path string into a :class:`Path` object.

    Everything up to and including the final forward slash forms the
    directory part; whatever follows is the basename. Trailing slashes are
    trimmed from the directory part unless it consists solely of slashes
    (for example the root ``/``). The extension is the portion of the
    basename from its last dot onwards, provided that dot is not the first
    character of the basename.

    :param str s:
        The string containing the path to parse

    :returns:
        A :class:`Path` object representing the path
    """
    head, slash, basename = s.rpartition('/')
    dirname = head + slash
    # Only trim when something other than slashes is present, so that a
    # dirname like "/" or "//" is kept as-is
    if dirname.strip('/'):
        dirname = dirname.rstrip('/')
    dot = basename.rfind('.')
    # A dot at index 0 (or no dot at all) does not start an extension
    ext = basename[dot:] if dot > 0 else ''
    return Path(dirname, basename, ext)
def url(s):
    """
    Parse a URL string into a :class:`Url` tuple.

    The string is split by the standard library's ``urlparse`` and the six
    resulting components are used, in order, as the fields of the returned
    :class:`Url`.

    :param str s:
        The string containing the URL to parse

    :returns:
        A :class:`Url` tuple representing the URL
    """
    components = parse.urlparse(s)
    return Url(*components)
def request(s):
    """
    Parse an HTTP request line into a :class:`Request` tuple.

    The method is taken from before the first space and the protocol from
    after the last space; whatever lies between them (with surrounding
    whitespace removed) is the request URL. A URL of ``*`` is represented as
    None in the result, any other value is parsed with :func:`url`.

    :param str s:
        The string containing the request line to parse

    :returns:
        A :class:`Request` tuple representing the request line

    :raises ValueError:
        If the line has no space separated method, no space separated
        protocol, or a blank URL
    """
    try:
        method, remainder = s.split(' ', 1)
    except ValueError:
        raise ValueError('Request line is missing a space separated method')
    try:
        target, protocol = remainder.rsplit(' ', 1)
    except ValueError:
        raise ValueError('Request line is missing a space separated protocol')
    target = target.strip()
    if not target:
        raise ValueError('Request line URL cannot be blank')
    if target == '*':
        # The asterisk form carries no URL to parse
        parsed = None
    else:
        parsed = url(target)
    return Request(method, parsed, protocol)
class Path(namedtuple('Path', 'dirname basename ext')):
    """
    A POSIX-format path split into its directory, basename and extension.

    Instances are returned by the :func:`path` function. Paths use forward
    slash separators and have no drive portion; the type is used for the path
    portion of URLs and offers attributes for picking that path apart.
    Converting an instance to a string yields the path text again::

        p = datatypes.path('/foo/bar/baz.ext')
        assert p.dirname == '/foo/bar'
        assert p.basename == 'baz.ext'
        assert str(p) == '/foo/bar/baz.ext'

    .. attribute:: dirname

        A string containing all of the path except the basename at the end

    .. attribute:: basename

        A string containing the basename (filename and extension) at the end
        of the path

    .. attribute:: ext

        A string containing the filename's extension (including the leading
        dot)
    """

    __slots__ = ()

    @property
    def dirs(self):
        """
        Returns a list of the non-empty directory names that make up
        :attr:`dirname`.
        """
        return list(filter(None, self.dirname.split('/')))

    @property
    def basename_no_ext(self):
        """
        Returns :attr:`basename` with the extension (and the dot that
        introduces it) removed.
        """
        return self.basename[:-len(self.ext)] if self.ext else self.basename

    @property
    def isabs(self):
        """
        Returns True if the path is absolute, i.e. :attr:`dirname` starts
        with a forward slash.
        """
        return self.dirname[:1] == '/'

    def join(self, *paths):
        """
        Appends the given parts to this path and returns the outcome as a
        new :class:`Path` object.

        Parts that are not strings are converted to strings first. A part
        that starts with a forward slash replaces everything accumulated
        before it; any other part is appended, with a single separating
        slash inserted when the text so far is non-empty and does not
        already end in one.

        :param \\*paths:
            The parts to append to this path

        :returns:
            A new :class:`Path` object representing the extended path
        """
        combined = str(self)
        for part in paths:
            part = part if isinstance(part, str) else str(part)
            if part.startswith('/'):
                # An absolute part discards whatever came before it
                combined = part
                continue
            if combined and not combined.endswith('/'):
                combined += '/'
            combined += part
        return path(combined)

    def __str__(self):
        head = self.dirname
        # No separator is needed after an empty dirname or one that already
        # ends with a slash (such as the root)
        separator = '/' if head and not head.endswith('/') else ''
        return head + separator + self.basename
# This is rather hackish; in Python 2.x, urlparse.ResultMixin provides
# functionality for extracting username, password, hostname and port from a
# parsed URL. In Python 3 this changed to ResultBase, then to a whole bunch of
# undocumented classes (split between strings and bytes) with ResultBase as an
# alias
try:
    _ResultMixin = parse.ResultBase  # pylint: disable=invalid-name
except AttributeError:
    _ResultMixin = parse.ResultMixin  # pylint: disable=invalid-name


class Url(namedtuple('Url', ('scheme', 'netloc', 'path_str', 'params',
                             'query_str', 'fragment')), _ResultMixin):
    """
    Represents a URL.

    This type is returned by the :func:`url` function and represents the parts
    of the URL. You can obtain the original URL as a string by requesting the
    string conversion of this class, for example::

        >>> u = datatypes.url('http://foo/bar/baz')
        >>> print(u.scheme)
        http
        >>> print(u.hostname)
        foo
        >>> print(str(u))
        http://foo/bar/baz

    .. attribute:: scheme

        The scheme of the URL, before the first ``:``

    .. attribute:: netloc

        The "network location" of the URL, comprising the hostname and port
        (separated by a colon), and historically the username and password
        (prefixed to the hostname and separated from it with an at-sign)

    .. attribute:: path_str

        The path of the URL from the first slash after the network location

    .. attribute:: path

        The path of the URL, parsed into a tuple which splits out the
        directory, filename, and extension::

            >>> u = datatypes.url('foo/bar/baz.html')
            >>> u.path
            Path(dirname='foo/bar', basename='baz.html', ext='.html')
            >>> u.path.isabs
            False

    .. attribute:: params

        The parameters of the URL

    .. attribute:: query_str

        The query string of the URL from the first question-mark in the path

    .. attribute:: query

        The query string, parsed into a mapping of keys to lists of values.
        Keys with blank values are kept. For example::

            >>> u = datatypes.url('foo/bar?a=1&a=2&b=3&c=')
            >>> print(u.query)
            {'a': ['1', '2'], 'b': ['3'], 'c': ['']}
            >>> print('a' in u.query)
            True

    .. attribute:: fragment

        The fragment of the URL from the first hash-mark to the end of the
        URL

    Additionally, the following attributes can be used to separate out the
    various parts of the :attr:`netloc` attribute:

    .. attribute:: username

        The username (historical, rare to see this used on the modern web)

    .. attribute:: password

        The password (historical, almost unheard of on the modern web as it's
        extremely insecure to include credentials in the URL)

    .. attribute:: hostname

        The hostname from the network location. This attribute returns a
        :class:`Hostname` object which can be used to resolve the hostname
        into an IP address if required.

    .. attribute:: port

        The optional network port
    """

    __slots__ = ()

    def geturl(self):
        """
        Return the URL as a string.
        """
        return parse.urlunparse(self)

    def __str__(self):
        return self.geturl()

    @property
    def hostname(self):
        """
        The hostname extracted from :attr:`netloc` by the standard library
        mixin, passed through the :func:`hostname` conversion function.
        """
        # NOTE(review): the standard library reports a hostname of None for
        # URLs without a network location (e.g. a bare path); confirm that
        # the hostname() conversion function copes with None
        return hostname(super(Url, self).hostname)

    @property
    def query(self):
        """
        :attr:`query_str` parsed into a mapping of keys to lists of values
        (blank values are kept).
        """
        return parse.parse_qs(self.query_str, keep_blank_values=True)

    @property
    def path(self):
        """
        :attr:`path_str` parsed by the :func:`path` function.
        """
        return path(self.path_str)
class Request(namedtuple('Request', 'method url protocol')):
    """
    Represents an HTTP request line.

    This type is returned by the :func:`request` function and represents the
    three parts of an HTTP request line: the method, the URL (optional, can be
    None in the case of methods like OPTIONS), and the protocol. Converting an
    instance to a string yields the request line again, with a URL of None
    written as ``*``. The following attributes exist:

    .. attribute:: method

        The method of the request (typically GET, POST, or PUT but can
        technically be any valid HTTP token)

    .. attribute:: url

        The requested URL. May be an absolute URL, an absolute path, an
        authority token, or None in the case that the request line contained
        "*" for the URL.

    .. attribute:: protocol

        The HTTP protocol version requested. A string of the format 'HTTP/x.y'
        where x.y is the version number. At the time of writing only HTTP/1.0
        and HTTP/1.1 are defined.
    """

    # Like the other tuple types in this module, instances carry no
    # per-instance __dict__
    __slots__ = ()

    def __str__(self):
        # The request() function stores None when the request line contained
        # "*"; write the asterisk back out rather than the text "None" so the
        # string form is a valid request line again
        target = '*' if self.url is None else self.url
        return '%s %s %s' % (self.method, target, self.protocol)