# Source code for cheshire3.preParser

from __future__ import absolute_import

import os
import re
import time
import string
import glob
import httplib
import mimetypes
import tempfile
import hashlib
import subprocess
import lz4

try:
    import cStringIO as StringIO
except ImportError:
    import StringIO

try:
    import cPickle as pickle
except ImportError:
    import pickle

from xml.sax.saxutils import escape
from warnings import warn
from lxml import etree
from lxml import html
from lxml.builder import ElementMaker
from base64 import b64encode, b64decode
from zipfile import ZipFile
from docutils.core import publish_string

# Intra-package imports
from cheshire3.baseObjects import PreParser
from cheshire3.document import StringDocument
from cheshire3.internal import CONFIG_NS
from cheshire3.marc_utils import MARC
from cheshire3.utils import getShellResult, gen_uuid
from cheshire3.exceptions import ConfigFileException, ExternalSystemException,\
    MissingDependencyException


# TODO: All PreParsers should set mimetype, and record in/out mimetype

class TypedPreParser(PreParser):
    """PreParser that records expected incoming and outgoing MIME types."""

    _possibleSettings = {
        "inMimeType": {
            'docs': "The mimetype expected for incoming documents"
        },
        "outMimeType": {
            'docs': "The mimetype set on outgoing documents"
        }
    }

    def __init__(self, session, config, parent):
        PreParser.__init__(self, session, config, parent)
        # Cache both configured MIME types; empty string when unconfigured
        for attr in ('inMimeType', 'outMimeType'):
            setattr(self, attr, self.get_setting(session, attr, ''))


class NormalizerPreParser(PreParser):
    """Delegate document conversion to a configured Normalizer object."""

    _possiblePaths = {
        'normalizer': {
            'docs': "Normalizer identifier to call to do the transformation",
            'required': True
        }
    }

    def __init__(self, session, config, parent):
        PreParser.__init__(self, session, config, parent)
        self.normalizer = self.get_path(session, 'normalizer', None)
        if self.normalizer is None:
            msg = "Normalizer for {0} does not exist.".format(self.id)
            raise ConfigFileException(msg)

    def process_document(self, session, doc):
        """Return a new StringDocument holding the normalized content."""
        raw = doc.get_raw(session)
        normalized = self.normalizer.process_string(session, raw)
        return StringDocument(normalized, self.id, doc.processHistory,
                              mimeType=doc.mimeType, parent=doc.parent,
                              filename=doc.filename)
class UnicodeDecodePreParser(PreParser):
    """PreParser to turn non-unicode into Unicode Documents.

    A UnicodeDecodePreParser should accept a Document with content encoded
    in a non-unicode character encoding scheme and return a Document with
    the same content decoded to Python's Unicode implementation.
    """

    _possibleSettings = {
        'codec': {
            'docs': 'Codec to use to decode to unicode. Defaults to UTF-8'
        }
    }

    def __init__(self, session, config, parent):
        PreParser.__init__(self, session, config, parent)
        # Codec used to decode incoming bytes; configurable, UTF-8 default
        self.codec = self.get_setting(session, 'codec', 'utf-8')

    def process_document(self, session, doc):
        """Return a Document decoded with the configured codec.

        Raises UnicodeDecodeError if the content is invalid in that codec.
        (The original wrapped the decode in try/except only to re-raise the
        identical exception; the no-op wrapper has been removed.)
        """
        data = doc.get_raw(session).decode(self.codec)
        return StringDocument(data, self.id, doc.processHistory,
                              mimeType=doc.mimeType, parent=doc.parent,
                              filename=doc.filename)
class CmdLinePreParser(TypedPreParser):
    """Run an external command line tool over the Document content.

    %INDOC% / %OUTDOC% placeholders in the configured command line are
    replaced by temporary file paths; when absent, the process's stdin and
    stdout are used instead.
    """

    _possiblePaths = {
        'executable': {'docs': "Name of the executable to run"},
        'executablePath': {'docs': "Path to the executable"},
        'workingPath': {'docs': 'Path to be in when executing command'}
    }
    _possibleSettings = {
        'commandLine': {
            'docs': """\
Command line to use. %INDOC% is substituted to create a temporary file \
to read, and %OUTDOC% is substituted for a temporary file for the process \
to write to"""
        }
    }

    def __init__(self, session, config, parent):
        TypedPreParser.__init__(self, session, config, parent)
        exe = self.get_path(session, 'executable', '')
        if not exe:
            msg = "Missing mandatory 'executable' path in {0}".format(self.id)
            raise ConfigFileException(msg)
        tp = self.get_path(session, 'executablePath', '')
        if tp:
            exe = os.path.join(tp, exe)
        cl = self.get_setting(session, 'commandLine', '')
        # Full command template, placeholders substituted per document
        self.cmd = exe + ' ' + cl
        self.working = self.get_path(session, 'workingPath', '')

    def process_document(self, session, doc):
        """Run the command over doc's data and return the resulting Document."""
        cmd = self.cmd
        # stdIn/stdOut are True when the respective placeholder is ABSENT
        stdIn = cmd.find('%INDOC%') == -1
        stdOut = cmd.find('%OUTDOC%') == -1
        if not stdIn:
            # Create temp file for incoming data
            if doc.mimeType or doc.filename:
                # Guess a file extension from the MIME type or filename
                try:
                    suff = mimetypes.guess_extension(doc.mimeType)
                except:
                    suff = ''
                if not suff:
                    suff = mimetypes.guess_extension(doc.filename)
                if not suff:
                    (foofn, suff) = os.path.splitext(doc.filename)
                if suff:
                    (qq, infn) = tempfile.mkstemp(suff)
                else:
                    (qq, infn) = tempfile.mkstemp()
            else:
                (qq, infn) = tempfile.mkstemp()
            os.close(qq)
            fh = open(infn, 'w')
            fh.write(doc.get_raw(session))
            fh.close()
            cmd = cmd.replace("%INDOC%", infn)
        if not stdOut:
            # Create temp file for outgoing data
            if self.outMimeType:
                # Guess extension from the configured outgoing MIME type
                suff = mimetypes.guess_extension(self.outMimeType)
                (qq, outfn) = tempfile.mkstemp(suff)
            else:
                (qq, outfn) = tempfile.mkstemp()
            cmd = cmd.replace("%OUTDOC%", outfn)
            os.close(qq)
        if self.working:
            # Run the tool from its configured working directory
            old = os.getcwd()
            os.chdir(self.working)
        else:
            old = ''
        if stdIn:
            # Feed document content via the process's stdin
            pipe = subprocess.Popen(cmd, bufsize=0, shell=True,
                                    stdin=subprocess.PIPE,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)
            pipe.stdin.write(doc.get_raw(session))
            pipe.stdin.close()
            result = pipe.stdout.read()
            pipe.stdout.close()
            pipe.stderr.close()
            del pipe
        else:
            # Result will read stdout+err regardless
            result = getShellResult(cmd)
            os.remove(infn)
        if not stdOut:
            if os.path.exists(outfn) and os.path.getsize(outfn) > 0:
                fh = open(outfn)
            else:
                # Command probably appended something to the filename
                # Annoying! Have to glob for it
                matches = glob.glob(outfn + "*")
                # Or maybe ignored absolute path and put it in pwd...
                matches2 = glob.glob(os.path.split(outfn)[-1] + '*')
                # NOTE(review): if no candidate file is non-empty, `fh`
                # stays unbound and fh.read() below raises NameError —
                # confirm whether that path can occur in practice
                for m in matches + matches2:
                    if os.path.getsize(m) > 0:
                        fh = open(m)
                        break
            try:
                try:
                    result = fh.read()
                except:
                    # Surface the command and captured output on failure
                    msg = '{0}: {1}'.format(cmd, result)
                    raise ExternalSystemException(msg)
                else:
                    fh.close()
            finally:
                os.remove(outfn)
                try:
                    # Clean up when data written elsewhere
                    os.remove(fh.name)
                except OSError:
                    pass
        if old:
            os.chdir(old)
        # Prefer the configured outgoing MIME type, else keep the original
        mt = self.outMimeType
        if not mt:
            mt = doc.mimeType
        return StringDocument(result, self.id, doc.processHistory,
                              mimeType=mt, parent=doc.parent,
                              filename=doc.filename)
class FileUtilPreParser(TypedPreParser):
    """Call 'file' util to find out the current type of file.

    Writes the Document content to a temporary file, runs ``file -i -b``
    on it and stores the reported MIME type (plus any parameters such as
    charset) on the Document.
    """

    def __init__(self, session, config, parent):
        TypedPreParser.__init__(self, session, config, parent)
        warn(
            # BUGFIX: the adjacent string literals were missing a space,
            # producing "...from thecheshire3.formats package."
            '{0} is deprecated in favour of objects available from the '
            'cheshire3.formats package.'.format(self.__class__.__name__),
            DeprecationWarning,
            stacklevel=6
        )

    def process_document(self, session, doc):
        """Set doc.mimeType from the 'file' utility's report and return doc."""
        cmd = "file -i -b %INDOC%"
        (qq, infn) = tempfile.mkstemp()
        os.close(qq)
        fh = open(infn, 'w')
        fh.write(doc.get_raw(session))
        fh.close()
        cmd = cmd.replace("%INDOC%", infn)
        res = getShellResult(cmd)
        mt = res.strip()
        if mt.find(';') > -1:
            # e.g. "text/plain; charset=us-ascii" - keep first part as the
            # MIME type and stash the parameters on the Document
            bits = mt.split(';')
            mt = bits[0]
            for b in bits[1:]:
                # just stuff them on doc for now
                (type, value) = b.split('=')
                setattr(doc, type, value)
        if mt == "text/plain":
            # Might be sgml, xml, text etc. - ask 'file' again without -i
            res = getShellResult("file -b {0}".format(infn))
            mt2 = res.strip()
            if mt2 == "exported SGML document text":
                mt = "text/sgml"
            elif mt2 == "XML document text":
                mt = "text/xml"
            # Others include java, etc. but not very useful to us
        # BUGFIX: the temporary file was never removed (resource leak)
        os.remove(infn)
        doc.mimeType = mt
        doc.processHistory.append(self.id)
        return doc
class MagicRedirectPreParser(TypedPreParser):
    """Map to appropriate PreParser based on incoming MIME type."""

    def _handleLxmlConfigNode(self, session, node):
        # Handle config in the form:
        # <hash>
        #   <object mimeType="" ref=""/>
        #   ...
        # </hash>
        if node.tag in ['hash', '{%s}hash' % CONFIG_NS]:
            for c in node.iterchildren(tag=etree.Element):
                if c.tag in ['object', '{%s}object' % CONFIG_NS]:
                    mt = c.attrib['mimeType']
                    ref = c.attrib['ref']
                    self.mimeTypeHash[mt] = ref

    def _handleConfigNode(self, session, node):
        # Handle config in the form (DOM variant):
        # <hash>
        #   <object mimeType="" ref=""/>
        #   ...
        # </hash>
        # NOTE(review): `elementType` is not defined anywhere in this view
        # of the module — presumably imported in the full file; verify.
        if node.localName == "hash":
            for c in node.childNodes:
                if c.nodeType == elementType and c.localName == "object":
                    mt = c.getAttributeNS(None, 'mimeType')
                    ref = c.getAttributeNS(None, 'ref')
                    self.mimeTypeHash[mt] = ref

    def __init__(self, session, config, parent):
        # Default MIME type -> object-identifier mapping; config entries
        # override or extend these during base-class init below
        self.mimeTypeHash = {
            "application/x-gzip": "GunzipPreParser",
            "application/postscript": "PsPdfPreParser",
            "application/pdf": "PdfXmlPreParser",
            "text/html": "HtmlSmashPreParser",
            "text/plain": "TxtToXmlPreParser",
            "text/prs.fallenstein.rst": "RstToXmlPreParser",
            "text/sgml": "SgmlPreParser",
            "application/x-bzip2": "BzipPreParser",
            "application/zip": "ZIPToMETSPreParser",
            ("application/vnd.openxmlformats-officedocument."
             "wordprocessingml.document"): "ZIPToMETSPreParser",  # Word
            ("application/vnd.openxmlformats-officedocument."
             "presentationml.presentation"): "ZIPToMETSPreParser",  # PPT
            ("application/vnd.openxmlformats-officedocument."
             "spreadsheetml.sheet"): "ZIPToMETSPreParser",  # Excel
            ("application/vnd.oasis.opendocument."
             "text"): "ZIPToMETSPreParser",  # ODF Text
            ("application/vnd.oasis.opendocument."
             "presentation"): "ZIPToMETSPreParser",  # ODF Presentation
            ("application/vnd.oasis.opendocument."
             "spreadsheet"): "ZIPToMETSPreParser",  # ODF Spreadsheet(s)
            ("application/vnd.oasis.opendocument."
             "graphics"): "ZIPToMETSPreParser"  # ODF Graphic
            # "application/x-zip": "single zip preparser ?"
        }
        # Now override from config in init:
        TypedPreParser.__init__(self, session, config, parent)

    def process_document(self, session, doc):
        """Dispatch doc to the PreParser/Workflow mapped to its MIME type."""
        mt = doc.mimeType
        # Need Database from which to fetch potentially custom PreParsers
        db = session.server.get_object(session, session.database)
        if not mt:
            # Nasty kludge - use FileUtilPreParser to determine MIME type
            fu = db.get_object(session, 'FileUtilPreParser')
            doc2 = fu.process_document(session, doc)
            mt = doc2.mimeType
            if not mt and doc.filename:
                # Try and guess from filename
                mts = mimetypes.guess_type(doc.filename)
                if mts and mts[0]:
                    mt = mts[0]
        if mt in self.mimeTypeHash or "*" in self.mimeTypeHash:
            if mt not in self.mimeTypeHash:
                # There is a * mime-type
                # Something to be done for any unmatched type
                mt = '*'
            redirect = db.get_object(session, self.mimeTypeHash[mt])
            if isinstance(redirect, PreParser):
                return redirect.process_document(session, doc)
            else:
                # Only other thing it could legitimately be is workflow
                return redirect.process(session, doc)
        else:
            # Return unaltered Document
            # It may be that it is already the desired mime-type (e.g. XML)
            return doc


# --- HTML PreParsers ---
class HtmlSmashPreParser(PreParser):
    """Reduce an HTML document to the raw text of its title and body."""

    def __init__(self, session, config, parent):
        PreParser.__init__(self, session, config, parent)
        self.body = re.compile('<body(.*?)</body>', re.S | re.I)
        self.tagstrip = re.compile('<[^>]+>')
        self.title = re.compile('<title[^>]*>(.+?)</title>', re.S | re.I)
        self.script = re.compile('<script(.*?)</script>', re.S | re.I)
        self.style = re.compile('<style(.*?)</style>', re.S | re.I)
        self.comment = re.compile('<!--(.*?)-->', re.S | re.I)

    def process_document(self, session, doc):
        """Return a minimal HTML document containing only title and text."""
        # Drop scripts, stylesheets and comments before extracting text
        markup = self.script.sub('', doc.get_raw(session))
        markup = self.style.sub('', markup)
        markup = self.comment.sub('', markup)
        title_match = self.title.search(markup)
        title = title_match.group(0) if title_match else ""
        body_match = self.body.search(markup)
        body = body_match.group(0) if body_match else markup
        # Strip remaining tags, then re-escape stray angle brackets
        text = self.tagstrip.sub(' ', body)
        text = text.replace('<', '&lt;').replace('>', '&gt;')
        text = text.replace("&nbsp;", ' ').replace("&nbsp", ' ')
        # Collapse all runs of whitespace to single spaces
        text = ' '.join(text.split())
        smashed = "<html><head>%s</head><body>%s</body></html>" % (title, text)
        return StringDocument(smashed, self.id, doc.processHistory,
                              mimeType=doc.mimeType, parent=doc.parent,
                              filename=doc.filename)
class HtmlFixupPreParser(PreParser):
    """Attempt to fix up HTML to make it complete and parseable XML.

    Uses the lxml.html package so as to preserve as much of the intended
    structure as possible.
    """

    def process_document(self, session, doc):
        tree = html.document_fromstring(doc.get_raw(session))
        # Remove any xmlns to avoid duplication, and hence failed parsing
        tree.attrib.pop('xmlns', None)
        serialized = etree.tostring(tree)
        return StringDocument(serialized, self.id, doc.processHistory,
                              mimeType=doc.mimeType, parent=doc.parent,
                              filename=doc.filename)
class RegexpSmashPreParser(PreParser):
    """Strip, replace or keep only data which matches a given regex."""

    _possibleSettings = {
        'char': {
            'docs': """\
Character(s) to replace matches in the regular expression with. Defaults \
to empty string (i.e. strip matches)"""
        },
        'regexp': {
            'docs': "Regular expression to match in the data.",
            'required': True
        },
        'keep': {
            'docs': """\
Should instead keep only the matches. Boolean, defaults to False""",
            'type': int,
            'options': "0|1"
        }
    }

    def __init__(self, session, config, parent):
        PreParser.__init__(self, session, config, parent)
        replacement = self.get_setting(session, 'char')
        pattern = self.get_setting(session, 'regexp')
        self.keep = self.get_setting(session, 'keep')
        if pattern:
            self.regexp = re.compile(pattern, re.S)
        self.char = replacement if replacement else ''

    def process_document(self, session, doc):
        """Return a Document with matches stripped/replaced or kept."""
        data = doc.get_raw(session)
        if self.keep:
            found = self.regexp.findall(data)
            # findall yields tuples when the pattern has several groups;
            # in that case keep only each match's first group
            if found and found[0] and type(found[0]) == tuple:
                found = [grp[0] for grp in found]
            output = self.char.join(found)
        else:
            output = self.regexp.sub(self.char, data)
        return StringDocument(output, self.id, doc.processHistory,
                              mimeType=doc.mimeType, parent=doc.parent,
                              filename=doc.filename)
try:
    import tidy
except ImportError:
    # Gracefully degrade functionality when TidyLib bindings are absent
    class HtmlTidyPreParser(PreParser):
        """Placeholder that reports the missing 'tidy' dependency."""

        def __init__(self, session, config, parent):
            raise MissingDependencyException(self.__class__.__name__, "tidy")
else:
    class HtmlTidyPreParser(PreParser):
        """Uses TidyLib to turn HTML into XHTML for parsing."""

        def process_document(self, session, doc):
            tidied = tidy.parseString(doc.get_raw(session),
                                      output_xhtml=1,
                                      add_xml_decl=0,
                                      tidy_mark=0,
                                      indent=0)
            return StringDocument(str(tidied), self.id, doc.processHistory,
                                  mimeType=doc.mimeType, parent=doc.parent,
                                  filename=doc.filename)


# --- Not Quite Xml PreParsers ---
class SgmlPreParser(PreParser):
    """ Convert SGML into XML """

    # Class-level defaults; instances override in __init__
    entities = {}
    emptyTags = []
    doctype_re = None
    attr_re = None
    elem_re = None
    amp_re = None
    inMimeType = "text/sgml"
    outMimeType = "text/xml"

    _possibleSettings = {
        'emptyElements': {
            'docs': '''\
Space separated list of empty elements in the SGML to turn into empty XML \
elements.'''
        }
    }

    def __init__(self, session, config, parent):
        PreParser.__init__(self, session, config, parent)
        self.doctype_re = (re.compile('<!DOCTYPE\s+?(.+?)["\'](.+?)["\']>'))
        self.attr_re = re.compile(
            ' ([a-zA-Z0-9_]+)[ ]*=[ ]*([-:_.a-zA-Z0-9]+)([ >])'
        )
        self.pi_re = re.compile("<\?(.*?)\?>")
        self.elem_re = re.compile('(<[/]?)([a-zA-Z0-9_]+)')
        self.amp_re = re.compile('&(\s)')
        taglist = self.get_setting(session, 'emptyElements')
        if taglist:
            self.emptyTags = taglist.split()

    def _loneAmpersand(self, match):
        # Fix unencoded ampersands
        return '&amp;%s' % match.group(1)

    def _lowerElement(self, match):
        # Make all tags lowercase
        return "%s%s" % (match.group(1), match.group(2).lower())

    def _attributeFix(self, match):
        # Fix messy attribute values:
        # - lowercase attribute names
        # - remove spurious whitespace
        # - quote unquoted values
        return ' %s="%s"%s' % (match.group(1).lower(), match.group(2),
                               match.group(3))

    def _emptyElement(self, match):
        # Make empty elements self-closing
        return "<%s/>" % (match.group(1))

    def process_document(self, session, doc):
        """Apply the regex fix-ups in order and return an XML Document.

        Order matters: entity expansion must precede ampersand escaping,
        and attribute fixing must precede element lowercasing.
        """
        txt = doc.get_raw(session)
        txt = txt.replace('\n', ' ')
        txt = txt.replace('\r', ' ')
        # Replace character references for tab/LF/VT/FF/CR with spaces
        for x in range(9, 14):
            txt = txt.replace('&#%d;' % (x), ' ')
        txt = self.doctype_re.sub('', txt)
        # Expand configured entities to their literal replacements
        for e in self.entities.keys():
            txt = txt.replace("&%s;" % (e), self.entities[e])
        txt = self.amp_re.sub(self._loneAmpersand, txt)
        txt = txt.replace('&<', '&amp;<')
        txt = self.attr_re.sub(self._attributeFix, txt)
        txt = self.elem_re.sub(self._lowerElement, txt)
        # Self-close each configured empty element
        for t in self.emptyTags:
            empty_re = re.compile('<(%s( [^>/]+)?)[\s/]*>' % t)
            txt = empty_re.sub(self._emptyElement, txt)
        # strip processing instructions.
        txt = self.pi_re.sub('', txt)
        return StringDocument(txt, self.id, doc.processHistory,
                              mimeType=doc.mimeType, parent=doc.parent,
                              filename=doc.filename)
class AmpPreParser(PreParser):
    """Escape lone ampersands in otherwise XML text."""

    entities = {}

    def __init__(self, session, config, parent):
        PreParser.__init__(self, session, config, parent)
        # An '&' followed by a run of non-entity characters and whitespace/EOL
        self.amp_re = re.compile('&([^\s;]*)(\s|$)')
        self.entities = {}

    def _loneAmpersand(self, match):
        # Rewrite the bare '&' as '&amp;', keeping the following token
        return '&amp;%s ' % match.group(1)

    def process_document(self, session, doc):
        text = doc.get_raw(session)
        # Expand known entities first so they are not double-escaped
        for name, value in self.entities.items():
            text = text.replace("&%s;" % name, value)
        text = self.amp_re.sub(self._loneAmpersand, text)
        return StringDocument(text, self.id, doc.processHistory,
                              mimeType=doc.mimeType, parent=doc.parent,
                              filename=doc.filename)


# --- MARC PreParsers ---
class MarcToXmlPreParser(PreParser):
    """Convert MARC into MARCXML."""

    inMimeType = "application/marc"
    outMimeType = "text/xml"

    def process_document(self, session, doc):
        record = MARC(doc.get_raw(session))
        return StringDocument(record.toMARCXML(), self.id,
                              doc.processHistory, mimeType='text/xml',
                              parent=doc.parent, filename=doc.filename)
class MarcToSgmlPreParser(PreParser):
    """Convert MARC into Cheshire2's MarcSgml."""

    inMimeType = "application/marc"
    outMimeType = "text/sgml"

    def process_document(self, session, doc):
        record = MARC(doc.get_raw(session))
        return StringDocument(record.toSGML(), self.id,
                              doc.processHistory, mimeType='text/sgml',
                              parent=doc.parent, filename=doc.filename)


# --- Raw Text PreParsers ---
class TxtToXmlPreParser(PreParser):
    """Minimally wrap text in <data> XML tags."""

    inMimeType = "text/plain"
    outMimeType = "text/xml"

    def process_document(self, session, doc):
        # XML-escape the content, then wrap it in a single root element
        escaped = escape(doc.get_raw(session))
        wrapped = "<data>{0}</data>".format(escaped)
        return StringDocument(wrapped, self.id, doc.processHistory,
                              mimeType='text/xml', parent=doc.parent,
                              filename=doc.filename)
class RstToXmlPreParser(PreParser):
    """Convert reStructuredText into Docutils-native XML."""

    inMimeType = "text/prs.fallenstein.rst"
    outMimeType = "application/xml"

    def process_document(self, session, doc):
        converted = publish_string(doc.get_raw(session), writer_name="xml")
        return StringDocument(converted, self.id, doc.processHistory,
                              mimeType=self.outMimeType, parent=doc.parent,
                              filename=doc.filename)


# --- Compression PreParsers ---
class PicklePreParser(PreParser):
    """Serialize Document content to a Python pickle string."""

    def process_document(self, session, doc):
        serialized = pickle.dumps(doc.get_raw(session))
        return StringDocument(serialized, self.id, doc.processHistory,
                              mimeType='text/pickle', parent=doc.parent,
                              filename=doc.filename)
class UnpicklePreParser(PreParser):
    """Deserialize pickled Document content back to its original object."""

    def process_document(self, session, doc):
        # NOTE: unpickling untrusted data is unsafe; content is assumed to
        # come from a trusted PicklePreParser stage
        restored = pickle.loads(doc.get_raw(session))
        return StringDocument(restored, self.id, doc.processHistory,
                              mimeType='text/pickle', parent=doc.parent,
                              filename=doc.filename)
try:
    import gzip
except ImportError:
    # Gracefully degrade functionality when gzip is unavailable
    class GzipPreParser(PreParser):
        """Gzip a not-gzipped document."""

        def __init__(self, session, config, parent):
            raise MissingDependencyException(self.__class__.__name__, "gzip")

    class GunzipPreParser(PreParser):
        """Gunzip a gzipped document."""

        def __init__(self, session, config, parent):
            raise MissingDependencyException(self.__class__.__name__, "gzip")
else:
    class GzipPreParser(PreParser):
        """Gzip a not-gzipped document."""

        inMimeType = ""
        outMimeType = ""

        def __init__(self, session, config, parent):
            PreParser.__init__(self, session, config, parent)
            # Level 1 favours speed over compression ratio by default
            self.compressLevel = self.get_setting(session, "compressLevel", 1)

        def process_document(self, session, doc):
            buff = StringIO.StringIO()
            zfile = gzip.GzipFile(mode='wb', fileobj=buff,
                                  compresslevel=self.compressLevel)
            zfile.write(doc.get_raw(session))
            zfile.close()
            compressed = buff.getvalue()
            buff.close()
            return StringDocument(compressed, self.id, doc.processHistory,
                                  parent=doc.parent, filename=doc.filename)

    class GunzipPreParser(PreParser):
        """Gunzip a gzipped document."""

        inMimeType = ""
        outMimeType = ""

        def process_document(self, session, doc):
            buff = StringIO.StringIO(doc.get_raw(session))
            zfile = gzip.GzipFile(mode='rb', fileobj=buff)
            expanded = zfile.read()
            zfile.close()
            buff.close()
            return StringDocument(expanded, self.id, doc.processHistory,
                                  parent=doc.parent, filename=doc.filename)
try:
    import bz2
except ImportError:
    # Gracefully degrade functionality when bz2 is unavailable
    class Bzip2PreParser(PreParser):
        """Unzip a bz2 zipped document."""

        def __init__(self, session, config, parent):
            raise MissingDependencyException(self.__class__.__name__,
                                             "bzip2")
else:
    class Bzip2PreParser(PreParser):
        """Unzip a bz2 zipped document."""

        def process_document(self, session, doc):
            expanded = bz2.decompress(doc.get_raw(session))
            return StringDocument(expanded, self.id, doc.processHistory,
                                  parent=doc.parent, filename=doc.filename)
class B64EncodePreParser(PreParser):
    """Encode document in Base64."""

    def process_document(self, session, doc):
        encoded = b64encode(doc.get_raw(session))
        return StringDocument(encoded, self.id, doc.processHistory,
                              parent=doc.parent, filename=doc.filename)
class B64DecodePreParser(PreParser):
    """Decode document from Base64."""

    def process_document(self, session, doc):
        decoded = b64decode(doc.get_raw(session))
        return StringDocument(decoded, self.id, doc.processHistory,
                              parent=doc.parent, filename=doc.filename)
class LZ4CompressPreParser(PreParser):
    """Compress data using the lz4 algorithm."""

    def process_document(self, session, doc):
        data = doc.get_raw(session)
        new = lz4.compress(data)
        return StringDocument(new, self.id, doc.processHistory,
                              parent=doc.parent, filename=doc.filename)


class LZ4DecompressPreParser(PreParser):
    """Decompress lz4 compressed data."""

    def process_document(self, session, doc):
        data = doc.get_raw(session)
        new = lz4.decompress(data)
        return StringDocument(new, self.id, doc.processHistory,
                              parent=doc.parent, filename=doc.filename)


# --- Nasty OpenOffice PreParser ---


class UrlPreParser(PreParser):
    """Abstract Base Class for PreParsers that use OpenOffice.

    DEPRECATED: see cheshire3.formats sub-package instead
    """

    _possiblePaths = {
        'remoteUrl': {
            'docs': 'URL at which the OpenOffice handler is listening'
        }
    }

    def _post_multipart(self, host, selector, fields, files):
        # POST a multipart/form-data request and return the response body
        content_type, body = self._encode_multipart_formdata(fields, files)
        h = httplib.HTTPConnection(host)
        headers = {'content-type': content_type}
        h.request('POST', selector, body, headers)
        resp = h.getresponse()
        return resp.read()

    def _encode_multipart_formdata(self, fields, files):
        # Build a multipart/form-data body by hand with a fixed boundary
        BOUNDARY = '----------ThIs_Is_tHe_bouNdaRY_$'
        CRLF = '\r\n'
        L = []
        for (key, value) in fields:
            L.append('--' + BOUNDARY)
            L.append('Content-Disposition: form-data; name="%s"' % key)
            L.append('')
            L.append(value)
        for (key, filename, value) in files:
            L.append('--' + BOUNDARY)
            L.append(
                'Content-Disposition: form-data; name="%s"; filename="%s"'
                % (key, filename)
            )
            L.append('Content-Type: %s' % self._get_content_type(filename))
            L.append('')
            L.append(value)
        L.append('--' + BOUNDARY + '--')
        L.append('')
        body = CRLF.join(L)
        content_type = 'multipart/form-data; boundary=%s' % BOUNDARY
        return content_type, body

    def _get_content_type(self, filename):
        # Guess a MIME type from the filename, default to octet-stream
        return mimetypes.guess_type(filename)[0] or 'application/octet-stream'

    def _send_request(self, session, data=None):
        # Split the configured URL into host and selector, then POST the
        # data as a fake .doc upload
        url = self.get_path(session, 'remoteUrl')
        if (url[:7] == "http://"):
            url = url[7:]
        hlist = url.split('/', 1)
        host = hlist[0]
        if (len(hlist) == 2):
            selector = hlist[1]
        else:
            selector = ""
        # TODO: Remove dependency
        fields = ()
        files = [("file", "foo.doc", data)]
        return self._post_multipart(host, selector, fields, files)


class OpenOfficePreParser(UrlPreParser):
    """Use OpenOffice server to convert documents into OpenDocument XML """

    inMimeType = ""
    outMimeType = "text/xml"

    def process_document(self, session, doc):
        data = doc.get_raw(session)
        try:
            xml = self._send_request(session, data)
        except:
            # Deliberate best-effort: any failure yields an empty error doc
            xml = "<error/>"
        return StringDocument(xml, self.id, doc.processHistory,
                              mimeType='text/xml', parent=doc.parent,
                              filename=doc.filename)
class PrintableOnlyPreParser(PreParser):
    """Replace or Strip non printable characters."""

    inMimeType = "text/*"
    outMimeType = "text/plain"

    _possibleSettings = {
        'strip': {
            'docs': """\
Should the preParser strip the characters or replace with numeric character \
entities (default)""",
            'type': int,
            'options': "0|1"
        }
    }

    def __init__(self, session, config, parent):
        PreParser.__init__(self, session, config, parent)
        # Bytes above the printable ASCII range
        self.asciiRe = re.compile('([\x7b-\xff])')
        # Control characters that are invalid in XML
        self.nonxmlRe = re.compile('([\x00-\x08]|[\x0E-\x1F]|[\x0B\x0C\x1F])')
        self.strip = self.get_setting(session, 'strip', 0)

    def process_document(self, session, doc):
        """Strip any non printable characters."""
        data = doc.get_raw(session)
        # This is bizarre, but otherwise:
        # UnicodeDecodeError: 'ascii' codec can't decode byte ...
        # NOTE(review): the sequences replaced here look like UTF-8 byte
        # runs (smart quotes, dashes) even in the unicode branch — confirm
        # this matches the producers' actual encoding
        if isinstance(data, unicode):
            data = data.replace(u"\xe2\x80\x9c", u'&quot;')
            data = data.replace(u"\xe2\x80\x9d", u'&quot;')
            data = data.replace(u"\xe2\x80\x9e", u'&quot;')
            data = data.replace(u"\xe2\x80\x93", u'-')
            data = data.replace(u"\xe2\x80\x98", u"'")
            data = data.replace(u"\xe2\x80\x99", u"'")
            data = data.replace(u"\xe2\x80\x9a", u",")
            data = data.replace(u"\x99", u"'")
            data = data.replace(u'\xa0', u' ')
        else:
            data = data.replace("\xe2\x80\x9c", '&quot;')
            data = data.replace("\xe2\x80\x9d", '&quot;')
            data = data.replace("\xe2\x80\x9e", '&quot;')
            data = data.replace("\xe2\x80\x93", '-')
            data = data.replace("\xe2\x80\x98", "'")
            data = data.replace("\xe2\x80\x99", "'")
            data = data.replace("\xe2\x80\x9a", ",")
            data = data.replace("\x99", "'")
            data = data.replace('\xa0', ' ')
        # Drop XML-invalid control characters outright
        data = self.nonxmlRe.sub(' ', data)
        if self.strip:
            new = self.asciiRe.sub('', data)
        else:
            # Replace each high byte with a numeric character entity
            fn = lambda x: "&#%s;" % ord(x.group(1))
            new = self.asciiRe.sub(fn, data)
        return StringDocument(new, self.id, doc.processHistory,
                              mimeType=doc.mimeType, parent=doc.parent,
                              filename=doc.filename)
class CharacterEntityPreParser(PreParser):
    """Change named and broken entities to numbered.

    Transform latin-1 and broken character entities into numeric character
    entities. eg &amp;something; --> &amp;#123;
    """

    def __init__(self, session, config, parent):
        PreParser.__init__(self, session, config, parent)
        # '&123;' missing its '#'
        self.numericalEntRe = re.compile('&(\d+);')
        # Made-up '&fracNM;' entities
        self.fractionRe = re.compile('&frac(\d)(\d);')
        # Numeric references to XML-invalid control characters
        self.invalidRe = re.compile('&#(\d|[0-2]\d|3[01]);')
        # Code point of the first entry in self.entities ('nbsp')
        self.start = 160
        # Named entity -> numeric form.
        # NOTE(review): "rdquo", "lsquo" and "rsquo" appear twice; the
        # later '#34' values win in the dict literal — confirm intended.
        self.otherEntities = {
            "quot": '#34', "amp": '#38', "lt": '#60', "gt": '#62',
            "trade": '#8482', "OElig": '#338', "oelig": '#339',
            "Scaron": '#352', "scaron": '#353', "Yuml": '#376',
            "circ": '#710', "tilde": '#732', "ensp": '#8194',
            "emsp": '#8195', "thinsp": '#8201', "zwnj": '#8204',
            "zwj": '#8205', "lrm": '#8206', "rlm": '#8207',
            "ndash": '#8211', "mdash": '#8212', "lsquo": '#8216',
            "rsquo": '#8217', "sbquo": '#8218', "ldquo": '#8220',
            "rdquo": '#8221', "bdquo": '#8222', "dagger": '#8224',
            "Dagger": '#8225', "permil": '#8240', "lsaquo": '#8249',
            "rsaquo": '#8250', "euro": '#8364', "rdquo": '#34',
            "lsquo": '#34', "rsquo": '#34', "half": '#189', "ast": '#8727'
        }
        # Entities that can be replaced with plain characters
        self.inane = {
            "apos": "'", "hellip": '...', "ldquo": '', "lsqb": '[',
            "rsqb": ']', "sol": '\\', "commat": '@', "plus": '+',
            "percnt": '%'
        }
        # Common misspellings/miscasings fixed before the main pass
        self.preEntities = {
            "OUML;": "Ouml", "UUML": "Uuml", "AELIG": "AElig",
            "Aelig": "AElig"
        }
        # Latin-1 entity names in code-point order starting at 160
        self.entities = [
            'nbsp', 'iexcl', 'cent', 'pound', 'curren', 'yen', 'brvbar',
            'sect', 'uml', 'copy', 'ordf', 'laquo', 'not', 'shy', 'reg',
            'macr', 'deg', 'plusmn', 'sup2', 'sup3', 'acute', 'micro',
            'para', 'middot', 'cedil', 'sup1', 'ordm', 'raquo', 'frac14',
            'frac12', 'frac34', 'iquest', 'Agrave', 'Aacute', 'Acirc',
            'Atilde', 'Auml', 'Aring', 'AElig', 'Ccedil', 'Egrave',
            'Eacute', 'Ecirc', 'Euml', 'Igrave', 'Iacute', 'Icirc',
            'Iuml', 'ETH', 'Ntilde', 'Ograve', 'Oacute', 'Ocirc',
            'Otilde', 'Ouml', 'times', 'Oslash', 'Ugrave', 'Uacute',
            'Ucirc', 'Uuml', 'Yacute', 'THORN', 'szlig', 'agrave',
            'aacute', 'acirc', 'atilde', 'auml', 'aring', 'aelig',
            'ccedil', 'egrave', 'eacute', 'ecirc', 'euml', 'igrave',
            'iacute', 'icirc', 'iuml', 'eth', 'ntilde', 'ograve',
            'oacute', 'ocirc', 'otilde', 'ouml', 'divide', 'oslash',
            'ugrave', 'uacute', 'ucirc', 'uuml', 'yacute', 'thorn',
            'yuml'
        ]

    def process_document(self, session, doc):
        """Return a Document with entities normalized to numeric form."""
        txt = doc.get_raw(session)
        # Replace entities that can be represented with simple chars
        for (fromEnt, toEnt) in self.inane.iteritems():
            txt = txt.replace("&%s;" % fromEnt, toEnt)
        # Fix some common mistakes
        for (fromEnt, toEnt) in self.preEntities.iteritems():
            txt = txt.replace("&%s;" % fromEnt, "&%s;" % toEnt)
        # Fix straight forward entites
        for (s, enty) in enumerate(self.entities):
            txt = txt.replace("&%s;" % enty, "&#%s;" % (160 + s))
        # Fix additional random entities
        for (fent, totxt) in self.otherEntities.iteritems():
            txt = txt.replace("&%s;" % fent, "&%s;" % totxt)

        # Add missing # in &123;
        def hashed(mo):
            return '&#%s;' % mo.group(1)

        txt = self.numericalEntRe.sub(hashed, txt)

        # Fix made up fraction entities. (?)
        def fraction(mo):
            return '%s&#8260;%s' % (mo.group(1), mo.group(2))

        txt = self.fractionRe.sub(fraction, txt)
        # Kill remaining invalid character entities
        txt = self.invalidRe.sub('', txt)
        return StringDocument(txt, self.id, doc.processHistory,
                              mimeType=doc.mimeType, parent=doc.parent,
                              filename=doc.filename)
class DataChecksumPreParser(PreParser):
    """Checksum Document data and add to Document metadata."""

    _possibleSettings = {
        'sumType': {
            'docs': "Type of checkSum to carry out.",
            'type': str,
            'default': 'md5'
        }
    }

    def __init__(self, session, config, parent):
        PreParser.__init__(self, session, config, parent)
        self.sumType = self.get_setting(session, 'sumType', 'md5')
        # Fail fast at config time if the algorithm name is unknown
        try:
            hashlib.new(self.sumType)
        except ValueError as e:
            raise ConfigFileException(str(e))

    def process_document(self, session, doc):
        """Record the digest in doc.metadata['checksum'] and return doc."""
        data = doc.get_raw(session)
        h = hashlib.new(self.sumType)
        h.update(data)
        md = {
            self.sumType: {
                'hexdigest': h.hexdigest(),
                'analysisDateTime': time.strftime('%Y-%m-%dT%H:%M:%S%Z')
            }
        }
        # Merge with any existing checksum metadata
        try:
            doc.metadata['checksum'].update(md)
        except KeyError:
            doc.metadata['checksum'] = md
        doc.processHistory.append(self.id)
        return doc


class METSWrappingPreParser(TypedPreParser):
    """PreParser to wrap any Document content in METS XML."""

    def __init__(self, session, config, parent):
        TypedPreParser.__init__(self, session, config, parent)
        # Over-ride if missing outgoing mime-type
        if not self.outMimeType:
            self.outMimeType = 'application/xml'

    def _get_metsWrapper(self, doc):
        # Get a generic METS wrapper for the given Document
        # Find/Generate identifiers and labels
        objid = gen_uuid()
        # Set up METS root and header
        mets = METS.mets(
            {'ID': '/'.join([objid, 'mets']),
             'OBJID': objid,
             'TYPE': 'ZIPFILE'
             },
            METS.metsHdr(
                {'ID': '/'.join([objid, 'metsHdr']),
                 'CREATEDDATE': time.strftime('%Y-%m-%dT%H:%M:%S%Z')
                 },
                METS.agent(
                    {'ROLE': "CREATOR",
                     'TYPE': "OTHER",
                     'OTHERTYPE': 'SOFTWARE'
                     },
                    METS.name("Cheshire3"),
                    METS.note(
                        "METS instance was created by a Cheshire3 object"
                        " of type {0} identified as {1}"
                        "".format(type(self).__name__, self.id)
                    )
                )
            ),
            METS.dmdSec(),
            METS.amdSec(),
            METS.fileSec(
                METS.fileGrp({'ID': '/'.join([objid, 'fileGrp', '0001'])})
            )
        )
        # Set a human readable label if possible
        if doc.filename:
            mets.set("LABEL", os.path.abspath(doc.filename))
        elif doc.id:
            mets.set("LABEL", doc.id)
        return mets

    def _get_metsFile(self, identifier, rawdata, size=0, mimeType=""):
        # Get a METS file element for the given data
        file_ = METS.file({'ID': identifier, })
        # Create a METS FContent element
        FContent = METS.FContent()
        file_.append(FContent)
        # Try to set size; fall back to the raw data length
        if size:
            file_.attrib["SIZE"] = str(size)
        else:
            file_.attrib["SIZE"] = str(len(rawdata))
        # Attempt to add the MIME-Type
        if mimeType == "text/xml":
            # Fix broken MIME-Type
            file_.attrib['MIMETYPE'] = 'application/xml'
        elif mimeType:
            file_.attrib['MIMETYPE'] = mimeType
        # Add the content as either XML or binary (Base 64) data
        try:
            # Attempt to parse file content as XML
            xmldata = etree.fromstring(rawdata)
        except etree.XMLSyntaxError:
            # Encode as Base64
            FContent.append(METS.binData(b64encode(rawdata)))
        else:
            FContent.append(METS.xmlData(xmldata))
        return file_

    def process_document(self, session, doc):
        """Return a Document whose content is the METS-wrapped original."""
        global METS_NAMESPACES
        mets = self._get_metsWrapper(doc)
        objid = mets.get("OBJID")
        # Get the fileSec element
        fileGrp = mets.xpath('/mets:mets/mets:fileSec/mets:fileGrp[1]',
                             namespaces=METS_NAMESPACES)[0]
        file_ = self._get_metsFile(
            '/'.join([objid,
                      mets.attrib.get("LABEL", "file0001")
                      ]),
            doc.get_raw(session),
            doc.byteCount,
            doc.mimeType
        )
        # Append the file element to fileGrp
        fileGrp.append(file_)
        # Update last modification date
        mets.attrib['LASTMODDATE'] = time.strftime('%Y-%m-%dT%H:%M:%S%Z')
        # Serialize METS
        data = etree.tostring(mets, pretty_print=True)
        # Return a Document
        return StringDocument(
            data,
            self.id,
            doc.processHistory,
            self.outMimeType,
            parent=doc.parent,
            filename=doc.filename,
            byteCount=len(data),
            byteOffset=0
        )


class ZIPToMETSPreParser(METSWrappingPreParser):
    """PreParser to process a ZIP file to METS XML.

    As Office Open XML format and OpenDocument format Documents are based
    on ZIP files, this PreParser can also be used to unpack them, and wrap
    their component parts in METS.

    Office Open XML (a.k.a. OpenXML, OOXML) is the name for ECMA 376
    office file formats used by default in Microsoft Office 2007 onwards
    (.docx, .xlsx, .pptx etc.) It is available as an import/export format
    in LibreOffice, OpenOffice >= 3.2, Google Docs and more.
    """

    def process_document(self, session, doc):
        """Return a METS Document describing each member of the ZIP."""
        global METS_NAMESPACES
        mets = self._get_metsWrapper(doc)
        objid = mets.get("OBJID")
        # Get the fileSec element
        fileGrp = mets.xpath('/mets:mets/mets:fileSec/mets:fileGrp[1]',
                             namespaces=METS_NAMESPACES)[0]
        # Make raw data of incoming document file-like
        stringio = StringIO.StringIO(doc.get_raw(session))
        # Read file-like object as a ZIP file
        with ZipFile(stringio, 'r') as zf:
            # Iterate through the zipped files
            for zipinfo in zf.infolist():
                # Attempt to get the MIME-Type
                mts = mimetypes.guess_type(zipinfo.filename)
                if mts and mts[0]:
                    mimeType = mts[0]
                else:
                    mimeType = ""
                # NOTE(review): size is passed as str(zipinfo.file_size)
                # while METSWrappingPreParser passes an int — confirm the
                # string form is intentional (a "0" string is still truthy)
                file_ = self._get_metsFile(
                    '/'.join([objid, zipinfo.filename]),
                    zf.read(zipinfo),
                    str(zipinfo.file_size),
                    mimeType
                )
                # Append the file element to fileGrp
                fileGrp.append(file_)
        # Update last modification date
        mets.attrib['LASTMODDATE'] = time.strftime('%Y-%m-%dT%H:%M:%S%Z')
        # Serialize METS
        data = etree.tostring(mets, pretty_print=True)
        # Return a Document
        return StringDocument(
            data,
            self.id,
            doc.processHistory,
            self.outMimeType,
            parent=doc.parent,
            filename=doc.filename,
            byteCount=len(data),
            byteOffset=0
        )


# Set up ElementMaker for METS and XLink namespaces
# (module-level; referenced by the METS-producing classes above at call time)
METS_NAMESPACES = {'mets': "http://www.loc.gov/METS/",
                   'xlink': "http://www.w3.org/1999/xlink"
                   }
METS = ElementMaker(namespace=METS_NAMESPACES['mets'],
                    nsmap=METS_NAMESPACES
                    )