# Source code for lars.iis

# vim: set et sw=4 sts=4 fileencoding=utf-8:
#
# Copyright (c) 2013-2017 Dave Jones <dave@waveform.org.uk>
# Copyright (c) 2013 Mime Consulting Ltd. <info@mimeconsulting.co.uk>
# All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

"""
This module provides a wrapper for W3C extended log files, typically used by
the Microsoft IIS web-server.

The :class:`IISSource` class is the major element that this module provides;
this is the class which wraps a file-like object containing a W3C formatted log
file and yields rows from it as tuples.


Classes
=======

.. autoclass:: IISSource
    :members:

    .. attribute:: count

        Returns the number of rows successfully read from the source

    .. attribute:: date

        The timestamp specified by the last encountered ``#Date`` directive (if
        any), as a :class:`~lars.datatypes.DateTime` instance

    .. attribute:: fields

        A sequence of field names found in the ``#Fields`` directive in the
        file header

    .. attribute:: finish

        The timestamp found in the ``#End-Date`` directive (if any), as a
        :class:`~lars.datatypes.DateTime` instance

    .. attribute:: remark

        The remarks recorded in the ``#Remark`` directive (if any)

    .. attribute:: software

        The name of the software which produced the source file as given by
        the ``#Software`` directive (if any)

    .. attribute:: start

        The timestamp found in the ``#Start-Date`` directive (if any), as a
        :class:`~lars.datatypes.DateTime` instance

    .. attribute:: version

        The version of the source file, as given by the ``#Version`` directive
        in the header


Exceptions
==========

.. autoclass:: IISError
   :members:

.. autoexception:: IISDirectiveError

.. autoexception:: IISFieldsError

.. autoexception:: IISVersionError

.. autoexception:: IISWarning


Examples
========

A typical usage of this class is as follows::

    import io
    from lars import iis, csv

    with io.open('logs\\iis.txt', 'rb') as infile:
        with io.open('iis.csv', 'wb') as outfile:
            with iis.IISSource(infile) as source:
                with csv.CSVTarget(outfile) as target:
                    for row in source:
                        target.write(row)


Note for maintainers
====================

The draft standard for the `W3C Extended Log File Format`_ is not well written
(see the various notes and comments in the code); actual practice deviates from
the draft in several areas, and the draft is deficient in describing what is
potentially permitted in other areas.

Examples of the format as produced by IIS (the major user of the draft) can be
found on `MSDN`_. When maintaining the code below, please refer to both the
draft (for information on what *could* be included in W3C log files) as well as
the examples (for information on what typically *is* included in W3C log files,
even when it outright violates the draft), and bear in mind `Postel's Law`_.


.. _W3C Extended Log File Format: http://www.w3.org/TR/WD-logfile.html
.. _MSDN: http://bit.ly/2lPjHfz
.. _Postel's Law: http://en.wikipedia.org/wiki/Robustness_principle
"""

from __future__ import (
    unicode_literals,
    absolute_import,
    print_function,
    division,
    )

import re
import warnings
import logging
try:
    from urllib.parse import unquote_plus
except ImportError:
    from urllib import unquote_plus  # pylint: disable=wrong-import-order

from . import parsers, datatypes as dt
from .exc import LarsError, LarsWarning

str = type('')  # pylint: disable=redefined-builtin,invalid-name


def _string_parse(s):
    """
    Parse a string in a IIS extended log format file.

    Quoted strings have the external quotes stripped off and internal quotes,
    which are doubled for escaping purposes, halved. Unquoted strings are
    assumed to be URI %-encoded and are decoded as such.

    :param str s: The string to parse
    :returns: The decoded string
    """
    if s == '-':
        return None
    if s[:1] == '"':
        return s[1:-1].replace('""', '"')
    return unquote_plus(s)


class IISError(LarsError):
    """
    Base class for IISSource errors.

    Exceptions of this class take the optional arguments line_number and line
    for specifying the index and content of the line that caused the error
    respectively. If a line number is specified, the :meth:`__str__` method
    prefixes the error message with it (``Line N: message``).

    :param str message: The error message
    :param int line_number: The 1-based index of the line that caused the error
    :param str line: The content of the line that caused the error
    """
    def __init__(self, message, line_number=None, line=None):
        # Only the message is passed to the base class; the location details
        # are kept as plain attributes
        self.line_number = line_number
        self.line = line
        super(IISError, self).__init__(message)

    def __str__(self):
        result = super(IISError, self).__str__()
        # Line numbers are 1-based so a falsy value means "not specified"
        if self.line_number:
            result = 'Line %d: %s' % (self.line_number, result)
        return result


class IISDirectiveError(IISError):
    """
    Raised when an error is encountered in any ``#Directive``, including a
    ``#`` line which does not match any known directive.
    """


class IISFieldsError(IISDirectiveError):
    """
    Raised when an error is encountered in a ``#Fields`` directive: a second
    ``#Fields`` directive, a duplicated field name, or data appearing before
    any ``#Fields`` directive.
    """


class IISVersionError(IISDirectiveError):
    """
    Raised when a ``#Version`` directive with an unknown version is found,
    when a second ``#Version`` directive is found, or when data appears before
    any ``#Version`` directive.
    """


class IISWarning(LarsWarning):
    """
    Warning category used when an error is encountered in parsing a log row.

    :class:`IISSource` reports rows which cannot be parsed via
    :func:`warnings.warn` with this category, prefixing the message with the
    number of the offending line, and then carries on with the next line.
    """


class IISSource(object):
    """
    Wraps a stream containing a IIS formatted log file.

    This wrapper converts a stream containing a IIS formatted log file into an
    iterable which yields tuples. Each tuple is a namedtuple instance with the
    fieldnames of the tuple being the sanitized versions of the field names in
    the original log file (as specified in the ``#Fields`` directive).

    The directives contained in the file can be obtained from attributes of the
    wrapper itself (useful in the case that relative timestamps, e.g. with the
    ``#Date`` directive, are being used) in which case the attribute will be
    the lower-cased version of the directive name without the ``#`` prefix
    (with the exception of ``#Start-Date`` and ``#End-Date`` which are stored
    in the ``start`` and ``finish`` attributes respectively).

    :param source: A file-like object containing the source stream
    """
    # pylint: disable=too-many-instance-attributes,too-few-public-methods

    def __init__(self, source):
        self.source = source
        # Values of the directives seen so far (None until encountered)
        self.version = None
        self.software = None
        self.remark = None
        self.start = None
        self.finish = None
        self.date = None
        # Original field names from #Fields, and count of rows yielded
        self.fields = []
        self.count = 0
        # Row parsing machinery; built by _process_fields
        self._row_pattern = None
        self._row_funcs = None
        self._row_type = None

    # The following regexes are used to identify directives within IIS log
    # files. Contrary to popular opinion these can occur anywhere within the
    # log file; the draft places no limitations on where they can occur except
    # that #Version and #Fields directives must precede the first line of data.
    # This implementation assumes that a second #Fields directive is an error
    # but technically the draft does permit this (although we've never observed
    # it in practice).

    VERSION_RE = re.compile(
        r'^#\s*Version\s*:\s*(?P<text>\d+\.\d+)\s*$', flags=re.IGNORECASE)
    START_DATE_RE = re.compile(
        r'^#\s*Start-Date\s*:\s*(?P<date>\d{4}-\d{2}-\d{2})\s*'
        r'(?P<time>\d{2}:\d{2}:\d{2})\s*$', flags=re.IGNORECASE)
    END_DATE_RE = re.compile(
        r'^#\s*End-Date\s*:\s*(?P<date>\d{4}-\d{2}-\d{2})\s*'
        r'(?P<time>\d{2}:\d{2}:\d{2})\s*$', flags=re.IGNORECASE)
    DATE_RE = re.compile(
        r'^#\s*Date\s*:\s*(?P<date>\d{4}-\d{2}-\d{2})\s*'
        r'(?P<time>\d{2}:\d{2}:\d{2})\s*$', flags=re.IGNORECASE)
    SOFTWARE_RE = re.compile(
        r'^#\s*Software\s*:\s*(?P<text>.*)$', flags=re.IGNORECASE)
    REMARK_RE = re.compile(
        r'^#\s*Remark\s*:\s*(?P<text>.*)$', flags=re.IGNORECASE)
    FIELDS_RE = re.compile(
        r'^#\s*Fields\s*:\s*(?P<text>.*)$', flags=re.IGNORECASE)

    # This is, apparently, the date format used by IIS log files. At least,
    # it's the format the draft dictates in the Date and Time sections, but
    # bizarrely the example in the Example section uses something quite
    # different (D-MMM-YYYY HH:MM:SS). However, every real-life example we've
    # seen to date follows the ISO(ish) format, so that's what we specify here.

    DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S'

    def _process_directive(self, line):
        """
        Processes a ``#Directive`` in a IIS log file.

        This method is called by the :meth:`__iter__` method when a
        ``#Directive`` line is encountered anywhere in a IIS log file
        (``#Directives`` can occur beyond the header, although it's rare to
        find them in practice). The method parses the ``#Directive`` and sets
        various instance attributes in response, the most important probably
        being ``#Version`` and ``#Fields`` which must occur before any data is
        encountered.

        :param str line: The directive line to process
        :raises IISDirectiveError: if the line matches no known directive
        :raises IISVersionError: if a second ``#Version`` directive is found,
            or the version is anything other than ``1.0``
        :raises IISFieldsError: if the ``#Fields`` directive is invalid (see
            :meth:`_process_fields`)
        """
        logging.debug('Parsing directive: %s', line)
        # The names in this table must agree exactly with those tested in the
        # if-chain below, otherwise a matched directive is silently ignored
        for directive, regex in (
                ('Version', self.VERSION_RE),
                ('Software', self.SOFTWARE_RE),
                ('Remark', self.REMARK_RE),
                ('Fields', self.FIELDS_RE),
                ('Start-Date', self.START_DATE_RE),
                ('End-Date', self.END_DATE_RE),
                ('Date', self.DATE_RE),
        ):
            match = regex.match(line)
            if match:
                break
        else:
            raise IISDirectiveError('Unrecognized directive %s' %
                                    line.rstrip())

        if directive == 'Version':
            if self.version is not None:
                raise IISVersionError('Found a second #Version directive')
            self.version = match.group('text')
            if self.version != '1.0':
                raise IISVersionError('Unknown IIS log version %s' %
                                      self.version)
        elif directive == 'Software':
            self.software = match.group('text')
        elif directive == 'Remark':
            self.remark = match.group('text')
        elif directive == 'Fields':
            self._process_fields(match.group('text'))
        elif directive == 'Start-Date':
            self.start = dt.datetime(
                '%s %s' % (match.group('date'), match.group('time')),
                self.DATETIME_FORMAT
                )
        elif directive == 'End-Date':
            self.finish = dt.datetime(
                '%s %s' % (match.group('date'), match.group('time')),
                self.DATETIME_FORMAT
                )
        elif directive == 'Date':
            self.date = dt.datetime(
                '%s %s' % (match.group('date'), match.group('time')),
                self.DATETIME_FORMAT
                )

    # The FIELD_RE regex is intended to match a single header name within the
    # #Fields specification of a IIS log file. Basically headers come in one of
    # three varieties:
    #
    #   * unprefixed, "identifier"
    #   * prefixed which take the form "prefix-ident"
    #   * HTTP header which take the form "prefix(header)"
    #
    # We limit the possible prefixes as the draft defines them, but we don't
    # place any limits on what characters can occur in the identifier as the
    # draft doesn't either (however, we do disallow space as otherwise there'd
    # be no way of differentiating a delimiter and a space in an identifier ...
    # sadly the draft doesn't even explicitly forbid this pathological case).

    FIELD_RE = re.compile(
        r'(?:(?P<prefix>[rc]s?|s[rc]?|x)'
        r'(?:-|(?P<header>\()))?'
        r'(?P<identifier>[^ ]+)(?(header)\))')

    # FIELD_TYPES maps a field's identifier (sans prefix) to a data-type
    # defined in the W3C draft. Any fields which are not mapped are assumed to
    # be type <string> (like all header fields which the draft explicitly
    # defines as having type <string>). Every value used here must be a key of
    # TYPES below.
    #
    # The "extended IIS definitions" come from the IIS log definition, and from
    # MS KB909264 which details naming restrictions in Windows (the IIS log
    # definition isn't explicit about the types for things like site name and
    # computer name, aka NetBIOS name).

    FIELD_TYPES = {
        # Specified in the W3C draft standard
        'bytes':         'integer',
        'cached':        'integer',
        # The draft types this field as <text>, but TYPES has no entry for
        # that (looking it up raised KeyError), so it is handled as <string>
        # like any unmapped field
        'comment':       'string',
        'count':         'integer',
        'date':          'date_iso',
        'dns':           'hostname',
        'interval':      'integer',
        'ip':            'address_port',
        'method':        'hostname',  # No really, that's what the draft says!
        'status':        'integer',
        'time-from':     'time_iso',
        'time-taken':    'fixed',
        'time':          'time_iso',
        'time-to':       'time_iso',
        'uri-query':     'url',
        'uri-stem':      'url',
        'uri':           'url',
        # Extended IIS definitions
        'computername':  'string',
        'host':          'hostname',
        'port':          'integer',
        'sitename':      'string',
        'substatus':     'integer',
        'username':      'string',
        'version':       'string',
        'win32-status':  'integer',
        }

    # TYPES defines conversion functions and regexes for each of the datatypes
    # used in the W3C draft. Each regex is a template containing a %(name)s
    # placeholder which is filled in with the field's group name.

    TYPES = {
        'integer':      (parsers.int_parse, parsers.INTEGER),
        'fixed':        (parsers.fixed_parse, parsers.FIXED),
        'date_iso':     (parsers.date_parse, parsers.DATE_ISO),
        'time_iso':     (parsers.time_parse, parsers.TIME_ISO),
        'url':          (parsers.url_parse, parsers.URL),
        # This regex deviates from the draft's specifications; in practice IIS
        # always URI encodes the content of prefix(header) fields but the draft
        # demands a "quoted string" format instead. The draft also demands that
        # the usual empty-field notation of a dash ("-") is not used for
        # "string" type fields (presumably an empty pair of quotes should be
        # used, although the draft doesn't explicitly state this), but, again,
        # practice deviates from this. This is very specific to the W3C format
        # so this isn't one of the standard regexes
        'string':       (_string_parse,
                         r'(?P<%(name)s>"([^"]|"")*"|[^"\s]\S*|-)'),
        # The draft dictates <alpha> for names, but firstly doesn't define what
        # <alpha> actually means; furthermore if we assume it means alphabetic
        # chars only (as seems reasonable) that's not even slightly sufficient
        # for validating DNS names (which is what this type is for), and
        # generally one expects that in the case of DNS resolution failure, an
        # IP address might be recorded in such fields too. Here we simply use
        # our default hostname regex
        'hostname':     (parsers.hostname_parse, parsers.HOSTNAME),
        # Again, the draft's BNF for an IP address is deficient (e.g. doesn't
        # specify a limit on octets, and isn't compatible with IPv6 which will
        # presumably start appearing in logs at some point), so we use our
        # generic address+port regex
        'address_port': (parsers.address_parse, parsers.ADDRESS_PORT),
        }

    def _process_fields(self, line):
        """
        Processes a ``#Fields`` directive.

        This method is responsible for configuring a regex for matching data
        rows, and a namedtuple to organize the content of data rows, based on
        the fields defined in the ``#Fields`` header directive.

        :param str line: The content of the ``#Fields`` directive
        :raises IISFieldsError: if fields have already been defined by an
            earlier ``#Fields`` directive, or if a field name is duplicated
        """
        logging.debug('Parsing #Fields: %s', line)
        if self.fields:
            raise IISFieldsError('Second #Fields directive found')
        # findall yields one (prefix, header, identifier) tuple per field, in
        # the order the groups appear in FIELD_RE
        fields = self.FIELD_RE.findall(line)
        pattern = ''
        tuple_fields = []
        tuple_funcs = []
        for prefix, header, identifier in fields:
            # Figure out the original field name, a Python-ified version of
            # this name, and what type the field has
            if header:
                original_name = '%s(%s)' % (prefix, identifier)
                python_name = dt.sanitize_name('%s_%s' % (prefix, identifier))
                # According to the draft, all header fields are type <string>
                # but for user-friendliness we special-case Referr?er here
                if identifier.lower() in ('referer', 'referrer'):
                    field_type = 'url'
                else:
                    field_type = 'string'
            elif prefix:
                original_name = '%s-%s' % (prefix, identifier)
                python_name = dt.sanitize_name('%s_%s' % (prefix, identifier))
                # Default to <string> if we don't know the field identifier
                field_type = self.FIELD_TYPES.get(identifier, 'string')
            else:
                original_name = identifier
                python_name = dt.sanitize_name(identifier)
                field_type = self.FIELD_TYPES.get(identifier, 'string')
            # Columns in a data row are separated by whitespace
            if pattern:
                pattern += r'\s+'
            logging.debug('Field %s has type %s', original_name, field_type)
            field_fn, field_re = self.TYPES[field_type]
            pattern += field_re % {'name': python_name}
            tuple_funcs.append(field_fn)
            if original_name in self.fields:
                raise IISFieldsError('Duplicate field name %s' % original_name)
            self.fields.append(original_name)
            tuple_fields.append(python_name)
        logging.debug('Constructing row regex: %s', pattern)
        self._row_pattern = re.compile('^' + pattern + '$')
        logging.debug('Constructing row tuple with fields: %s',
                      ','.join(tuple_fields))
        self._row_type = dt.row(*tuple_fields)
        logging.debug('Constructing row parser functions')
        self._row_funcs = tuple_funcs

    def __enter__(self):
        logging.debug('Entering IIS context')
        # Restart the row count each time the context is entered
        self.count = 0
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        # Nothing to release (the source stream is left open), and returning
        # None means exceptions raised in the with-block propagate
        logging.debug('Exiting IIS context')

    def __iter__(self):
        """
        Yields a row tuple for each line in the file-like source object.

        This method is the main body of the class and is responsible for
        transforming lines from the source file-like object into row tuples.
        However, the main work of transforming strings into tuples is actually
        performed by the regular expressions and tuple class set up in response
        to encountering the ``#Fields`` directive in :meth:`_process_directive`
        above.

        Lines starting with ``#`` are handled as directives. Data rows which
        do not match the expected format, or which contain a value that cannot
        be converted, are skipped and reported through :func:`warnings.warn`
        with the :class:`IISWarning` category. Directive problems, and data
        appearing before the ``#Version`` and ``#Fields`` directives, are
        raised as :class:`IISError` descendents carrying the number and
        content of the offending line.
        """
        for num, line in enumerate(self.source):
            try:
                if line.startswith('#'):
                    self._process_directive(line.rstrip())
                elif self.version is None:
                    raise IISVersionError(
                        'Missing #Version directive before data')
                elif not self.fields:
                    raise IISFieldsError(
                        'Missing #Fields directive before data')
                else:
                    match = self._row_pattern.match(line.rstrip())
                    if match:
                        # Fetch the groups one at a time; match.group() given
                        # a single name returns a bare string rather than a
                        # tuple, which would break logs with only one field
                        values = [match.group(name)
                                  for name in self._row_type._fields]
                        try:
                            values = [f(v) for (f, v) in zip(self._row_funcs,
                                                             values)]
                        except ValueError as exc:
                            raise IISWarning(str(exc))
                        self.count += 1
                        yield self._row_type(*values)
                    else:
                        raise IISWarning('Line contains invalid data')
            except IISWarning as exc:
                # Add line number to the warning and report with warn()
                warnings.warn('Line %d: %s' % (num + 1, str(exc)), IISWarning)
            except IISError as exc:
                # Add line content and number to the exception and re-raise
                if not exc.line_number:
                    raise type(exc)(exc.args[0], line_number=num + 1,
                                    line=line)
                raise  # pragma: no cover