from __future__ import absolute_import
import os
import re
import time
import string
import glob
import httplib
import mimetypes
import tempfile
import hashlib
import subprocess
try:
import cStringIO as StringIO
except ImportError:
import StringIO
try:
import cPickle as pickle
except ImportError:
import pickle
from xml.sax.saxutils import escape
from warnings import warn
from lxml import etree
from base64 import b64encode, b64decode
# Intra-package imports
from cheshire3.baseObjects import PreParser
from cheshire3.document import StringDocument
from cheshire3.internal import CONFIG_NS
from cheshire3.marc_utils import MARC
from cheshire3.utils import getShellResult
from cheshire3.exceptions import ConfigFileException, ExternalSystemException
# TODO: All PreParsers should set mimetype, and record in/out mimetype
class TypedPreParser(PreParser):
    """Abstract PreParser that declares its input and output MIME types.

    Sub-classes inherit the ``inMimeType``/``outMimeType`` settings so a
    conversion can advertise what it consumes and what it produces.
    """

    _possibleSettings = {
        "inMimeType": {
            'docs': "The mimetype expected for incoming documents"
        },
        "outMimeType": {
            'docs': "The mimetype set on outgoing documents"
        }
    }

    def __init__(self, session, config, parent):
        PreParser.__init__(self, session, config, parent)
        # Cache the configured MIME types; empty string when unset
        self.inMimeType = self.get_setting(session, 'inMimeType', '')
        self.outMimeType = self.get_setting(session, 'outMimeType', '')
class NormalizerPreParser(PreParser):
    """Call a named Normalizer object to carry out the conversion."""

    _possiblePaths = {
        'normalizer': {
            'docs': "Normalizer identifier to call to do the transformation",
            'required': True
        }
    }

    def __init__(self, session, config, parent):
        PreParser.__init__(self, session, config, parent)
        self.normalizer = self.get_path(session, 'normalizer', None)
        if self.normalizer is None:
            raise ConfigFileException(
                "Normalizer for {0} does not exist.".format(self.id)
            )

    def process_document(self, session, doc):
        """Return a new Document whose content is the normalized data."""
        raw = doc.get_raw(session)
        normalized = self.normalizer.process_string(session, raw)
        return StringDocument(normalized, self.id, doc.processHistory,
                              mimeType=doc.mimeType, parent=doc.parent,
                              filename=doc.filename)
class UnicodeDecodePreParser(PreParser):
    """PreParser to turn non-unicode into Unicode Documents.

    A UnicodeDecodePreParser should accept a Document with content encoded
    in a non-unicode character encoding scheme and return a Document with
    the same content decoded to Python's Unicode implementation.
    """

    _possibleSettings = {
        'codec': {
            'docs': 'Codec to use to decode to unicode. Defaults to UTF-8'
        }
    }

    def __init__(self, session, config, parent):
        PreParser.__init__(self, session, config, parent)
        self.codec = self.get_setting(session, 'codec', 'utf-8')

    def process_document(self, session, doc):
        """Return a Document with the raw data decoded via self.codec.

        Raises UnicodeDecodeError when the data is not valid in the
        configured codec.
        """
        # Fix: removed a try/except that immediately re-raised the same
        # UnicodeDecodeError ('raise e' was a no-op that only truncated
        # the traceback); the exception now propagates untouched.
        data = doc.get_raw(session).decode(self.codec)
        return StringDocument(data, self.id, doc.processHistory,
                              mimeType=doc.mimeType, parent=doc.parent,
                              filename=doc.filename)
class CmdLinePreParser(TypedPreParser):
    """Use an external command line tool to transform a Document.

    The command template comes from the 'commandLine' setting.  %INDOC% and
    %OUTDOC% placeholders are substituted with temporary file paths for the
    incoming data and the tool's output; when a placeholder is absent the
    process' stdin/stdout is used instead.
    """
    _possiblePaths = {
        'executable': {'docs': "Name of the executable to run"},
        'executablePath': {'docs': "Path to the executable"},
        'workingPath': {'docs': 'Path to be in when executing command'}
    }

    _possibleSettings = {
        'commandLine': {
            'docs': """\
Command line to use. %INDOC% is substituted to create a temporary file to
read, and %OUTDOC% is substituted for a temporary file for the process to
write to"""
        }
    }

    def __init__(self, session, config, parent):
        TypedPreParser.__init__(self, session, config, parent)
        exe = self.get_path(session, 'executable', '')
        if not exe:
            msg = "Missing mandatory 'executable' path in {0}".format(self.id)
            raise ConfigFileException(msg)
        tp = self.get_path(session, 'executablePath', '')
        if tp:
            exe = os.path.join(tp, exe)
        cl = self.get_setting(session, 'commandLine', '')
        # Full command template; placeholders substituted per document
        self.cmd = exe + ' ' + cl
        self.working = self.get_path(session, 'workingPath', '')

    def process_document(self, session, doc):
        """Run the external command on doc and return the resulting Document.

        Raises ExternalSystemException when an expected output file cannot
        be read.
        """
        cmd = self.cmd
        # True when the placeholder is NOT in the template,
        # i.e. communicate via stdin / stdout respectively
        stdIn = cmd.find('%INDOC%') == -1
        stdOut = cmd.find('%OUTDOC%') == -1
        if not stdIn:
            # Create temp file for incoming data
            if doc.mimeType or doc.filename:
                # Guess our extn~n
                try:
                    suff = mimetypes.guess_extension(doc.mimeType)
                except:
                    # NOTE(review): guess_extension raises TypeError when
                    # doc.mimeType is None; bare except also hides others
                    suff = ''
                if not suff:
                    # NOTE(review): guess_extension expects a MIME type,
                    # not a filename -- likely always None here; confirm
                    suff = mimetypes.guess_extension(doc.filename)
                if not suff:
                    (foofn, suff) = os.path.splitext(doc.filename)
                if suff:
                    (qq, infn) = tempfile.mkstemp(suff)
                else:
                    (qq, infn) = tempfile.mkstemp()
            else:
                (qq, infn) = tempfile.mkstemp()
            os.close(qq)
            fh = open(infn, 'w')
            fh.write(doc.get_raw(session))
            fh.close()
            cmd = cmd.replace("%INDOC%", infn)
        if not stdOut:
            # Create temp file to outgoing data
            if self.outMimeType:
                # Guess our extn~n
                suff = mimetypes.guess_extension(self.outMimeType)
                (qq, outfn) = tempfile.mkstemp(suff)
            else:
                (qq, outfn) = tempfile.mkstemp()
            cmd = cmd.replace("%OUTDOC%", outfn)
            os.close(qq)
        if self.working:
            # Run the tool from its configured working directory
            old = os.getcwd()
            os.chdir(self.working)
        else:
            old = ''
        if stdIn:
            # Feed raw data through the child process' stdin
            pipe = subprocess.Popen(cmd, bufsize=0, shell=True,
                                    stdin=subprocess.PIPE,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)
            pipe.stdin.write(doc.get_raw(session))
            pipe.stdin.close()
            result = pipe.stdout.read()
            pipe.stdout.close()
            pipe.stderr.close()
            del pipe
        else:
            # Result will read stdout+err regardless
            result = getShellResult(cmd)
            os.remove(infn)
        if not stdOut:
            if os.path.exists(outfn) and os.path.getsize(outfn) > 0:
                fh = open(outfn)
            else:
                # Command probably appended something to the filename
                # Annoying! Have to glob for it
                matches = glob.glob(outfn + "*")
                # Or maybe ignored absolute path and put it in pwd...
                matches2 = glob.glob(os.path.split(outfn)[-1] + '*')
                for m in matches + matches2:
                    if os.path.getsize(m) > 0:
                        fh = open(m)
                        break
                # NOTE(review): if no candidate matches, fh is undefined
                # (or stale from the %INDOC% branch) -- confirm intent
            try:
                try:
                    result = fh.read()
                except:
                    msg = '{0}: {1}'.format(cmd, result)
                    raise ExternalSystemException(msg)
                else:
                    fh.close()
            finally:
                os.remove(outfn)
            try:
                # Clean up when data written elsewhere
                os.remove(fh.name)
            except OSError:
                pass
        if old:
            os.chdir(old)
        # Outgoing MIME type: configured value, else inherit from input
        mt = self.outMimeType
        if not mt:
            mt = doc.mimeType
        return StringDocument(result, self.id, doc.processHistory,
                              mimeType=mt, parent=doc.parent,
                              filename=doc.filename)
class FileUtilPreParser(TypedPreParser):
    """Call 'file' util to find out the current type of file."""

    def __init__(self, session, config, parent):
        TypedPreParser.__init__(self, session, config, parent)
        warn('''\
{0} is deprecated in favour of objects available from the
cheshire3.formats package.'''.format(self.__class__.__name__),
             DeprecationWarning,
             stacklevel=6)

    def process_document(self, session, doc):
        """Set doc.mimeType by shelling out to the 'file' utility.

        Returns the same Document instance.  Any extra "key=value" fields
        reported by 'file -i' (e.g. charset) are set as attributes on doc.
        """
        cmd = "file -i -b %INDOC%"
        (qq, infn) = tempfile.mkstemp()
        os.close(qq)
        fh = open(infn, 'w')
        fh.write(doc.get_raw(session))
        fh.close()
        cmd = cmd.replace("%INDOC%", infn)
        try:
            res = getShellResult(cmd)
            mt = res.strip()
            if mt.find(';') > -1:
                # e.g. "text/plain; charset=us-ascii"
                bits = mt.split(';')
                mt = bits[0]
                for b in bits[1:]:
                    # just stuff them on doc for now
                    # (renamed local: 'type' shadowed the builtin)
                    (attr, value) = b.split('=')
                    setattr(doc, attr, value)
            if mt == "text/plain":
                # Might be sgml, xml, text etc
                res = getShellResult("file -b {0}".format(infn))
                mt2 = res.strip()
                if mt2 == "exported SGML document text":
                    mt = "text/sgml"
                elif mt2 == "XML document text":
                    mt = "text/xml"
                # Others include java, etc. but not very useful to us
        finally:
            # Fix: the temporary file was previously never deleted (leak)
            os.remove(infn)
        doc.mimeType = mt
        doc.processHistory.append(self.id)
        return doc
class MagicRedirectPreParser(TypedPreParser):
    """Map to appropriate PreParser based on incoming MIME type."""

    def _handleLxmlConfigNode(self, session, node):
        # Handle config in the form:
        # <hash>
        #   <object mimeType="" ref=""/>
        #   ...
        # </hash>
        if node.tag in ['hash', '{%s}hash' % CONFIG_NS]:
            for c in node.iterchildren(tag=etree.Element):
                if c.tag in ['object', '{%s}object' % CONFIG_NS]:
                    mt = c.attrib['mimeType']
                    ref = c.attrib['ref']
                    self.mimeTypeHash[mt] = ref

    def _handleConfigNode(self, session, node):
        # DOM variant of the same <hash>/<object> config handling.
        # NOTE(review): 'elementType' is not defined in this chunk;
        # presumably imported at the top of the file -- confirm.
        if node.localName == "hash":
            for c in node.childNodes:
                if c.nodeType == elementType and c.localName == "object":
                    mt = c.getAttributeNS(None, 'mimeType')
                    ref = c.getAttributeNS(None, 'ref')
                    self.mimeTypeHash[mt] = ref

    def __init__(self, session, config, parent):
        # Default MIME type -> preParser object identifier mapping;
        # entries may be overridden by the object's configuration.
        self.mimeTypeHash = {"application/x-gzip": "GunzipPreParser",
                             "application/postscript": "PsPdfPreParser",
                             "application/pdf": "PdfXmlPreParser",
                             "text/html": "HtmlSmashPreParser",
                             "text/plain": "TxtToXmlPreParser",
                             "text/sgml": "SgmlPreParser",
                             "application/x-bzip2": "BzipPreParser"
                             # "application/x-zip": "single zip preparser ?"
                             }
        # Now override from config in init:
        TypedPreParser.__init__(self, session, config, parent)

    def process_document(self, session, doc):
        """Dispatch doc to the PreParser registered for its MIME type.

        Returns doc unchanged when no mapping exists for the type.
        """
        mt = doc.mimeType
        db = session.server.get_object(session, session.database)
        if not mt:
            # Nasty kludge - use FileUtilPreParser to determine MIME type
            fu = db.get_object(session, 'FileUtilPreParser')
            doc2 = fu.process_document(session, doc)
            mt = doc2.mimeType
            if not mt and doc.filename:
                # Try and guess from filename
                mts = mimetypes.guess_type(doc.filename)
                if mts and mts[0]:
                    mt = mts[0]
        if mt in self.mimeTypeHash:
            # Fix: reuse the database object fetched above instead of
            # performing an identical second lookup
            redirect = db.get_object(session, self.mimeTypeHash[mt])
            if isinstance(redirect, PreParser):
                return redirect.process_document(session, doc)
            else:
                # Only other thing is workflow
                return redirect.process(session, doc)
        else:
            # XXX: Should we return or raise?
            return doc
# --- HTML PreParsers ---
class HtmlSmashPreParser(PreParser):
    """Attempt to reduce HTML to its raw text."""

    def __init__(self, session, config, parent):
        PreParser.__init__(self, session, config, parent)
        self.body = re.compile('<body(.*?)</body>', re.S | re.I)
        self.tagstrip = re.compile('<[^>]+>')
        self.title = re.compile('<title[^>]*>(.+?)</title>', re.S | re.I)
        self.script = re.compile('<script(.*?)</script>', re.S | re.I)
        self.style = re.compile('<style(.*?)</style>', re.S | re.I)
        self.comment = re.compile('<!--(.*?)-->', re.S | re.I)

    def process_document(self, session, doc):
        """Return a minimal XHTML document wrapping the title and body text."""
        # Remove script, style and comment sections entirely
        data = self.script.sub('', doc.get_raw(session))
        data = self.style.sub('', data)
        data = self.comment.sub('', data)
        tm = self.title.search(data)
        if tm:
            title = data[tm.start():tm.end()]
        else:
            title = ""
        m = self.body.search(data)
        if m:
            body = data[m.start():m.end()]
        else:
            body = data
        text = self.tagstrip.sub(' ', body)
        # Fix: decode common character entities.  These replacements had
        # been corrupted into no-ops (e.g. replacing '<' with '<') when the
        # entity references in the literals were themselves decoded.
        text = text.replace('&lt;', '<')
        text = text.replace('&gt;', '>')
        text = text.replace('&nbsp;', ' ')
        text = text.replace('&#160;', ' ')
        # Collapse all runs of whitespace into single spaces
        l = text.split()
        text = ' '.join(l)
        data = "<html><head>%s</head><body>%s</body></html>" % (title, text)
        return StringDocument(data, self.id, doc.processHistory,
                              mimeType=doc.mimeType, parent=doc.parent,
                              filename=doc.filename)
class RegexpSmashPreParser(PreParser):
    """Strip, replace or keep only data which matches a given regex."""

    _possibleSettings = {
        'char': {
            'docs': """\
Character(s) to replace matches in the regular expression with. Defaults to
empty string (i.e. strip matches)"""
        },
        'regexp': {
            'docs': "Regular expression to match in the data.",
            'required': True
        },
        'keep': {
            'docs': """\
Should instead keep only the matches. Boolean, defaults to False""",
            'type': int,
            'options': "0|1"
        }
    }

    def __init__(self, session, config, parent):
        PreParser.__init__(self, session, config, parent)
        replacement = self.get_setting(session, 'char')
        pattern = self.get_setting(session, 'regexp')
        self.keep = self.get_setting(session, 'keep')
        if pattern:
            # Compile once; DOTALL so '.' spans newlines
            self.regexp = re.compile(pattern, re.S)
        self.char = replacement if replacement else ''

    def process_document(self, session, doc):
        """Return a Document with the regex applied to the raw data."""
        data = doc.get_raw(session)
        if self.keep:
            found = self.regexp.findall(data)
            if found and found[0] and isinstance(found[0], tuple):
                # Pattern contains groups: keep first group of each match
                found = [item[0] for item in found]
            result = self.char.join(found)
        else:
            result = self.regexp.sub(self.char, data)
        return StringDocument(result, self.id, doc.processHistory,
                              mimeType=doc.mimeType, parent=doc.parent,
                              filename=doc.filename)
try:
    import tidy
except ImportError:
    # Graceful degradation when the TidyLib bindings are not installed
    class HtmlTidyPreParser(PreParser):
        def __init__(self, session, config, parent):
            raise NotImplementedError("""\
HtmlTidyPreParser not supported due to a missing library on your system.""")
else:
    class HtmlTidyPreParser(PreParser):
        """Uses TidyLib to turn HTML into XHTML for parsing."""

        def process_document(self, session, doc):
            """Return the raw HTML tidied into XHTML as a new Document."""
            tidied = tidy.parseString(doc.get_raw(session),
                                      output_xhtml=1,
                                      add_xml_decl=0,
                                      tidy_mark=0,
                                      indent=0)
            return StringDocument(str(tidied), self.id, doc.processHistory,
                                  mimeType=doc.mimeType, parent=doc.parent,
                                  filename=doc.filename)
# --- Not Quite Xml PreParsers ---
class SgmlPreParser(PreParser):
    """Convert SGML into XML."""
    entities = {}
    emptyTags = []
    doctype_re = None
    attr_re = None
    elem_re = None
    amp_re = None
    inMimeType = "text/sgml"
    outMimeType = "text/xml"

    _possibleSettings = {
        'emptyElements': {
            'docs': '''\
Space separated list of empty elements in the SGML to turn into empty XML
elements.'''
        }
    }

    def __init__(self, session, config, parent):
        PreParser.__init__(self, session, config, parent)
        self.doctype_re = (re.compile('<!DOCTYPE\s+?(.+?)["\'](.+?)["\']>'))
        self.attr_re = re.compile(
            ' ([a-zA-Z0-9_]+)[ ]*=[ ]*([-:_.a-zA-Z0-9]+)([ >])'
        )
        self.pi_re = re.compile("<\?(.*?)\?>")
        self.elem_re = re.compile('(<[/]?)([a-zA-Z0-9_]+)')
        self.amp_re = re.compile('&(\s)')
        taglist = self.get_setting(session, 'emptyElements')
        if taglist:
            self.emptyTags = taglist.split()

    def _loneAmpersand(self, match):
        # Escape an unencoded ampersand followed by whitespace.
        # Fix: the replacement had been corrupted to a literal '&' (making
        # this a no-op); restore the '&amp;' entity reference.
        return '&amp;%s' % match.group(1)

    def _lowerElement(self, match):
        # Make all tag names lowercase
        return "%s%s" % (match.group(1), match.group(2).lower())

    def _attributeFix(self, match):
        # Fix messy attribute values:
        # - lowercase attribute names
        # - remove spurious whitespace
        # - quote unquoted values
        return ' %s="%s"%s' % (match.group(1).lower(),
                               match.group(2),
                               match.group(3))

    def _emptyElement(self, match):
        # Make empty elements self-closing
        return "<%s/>" % (match.group(1))

    def process_document(self, session, doc):
        """Return a Document with the SGML data rewritten as XML."""
        txt = doc.get_raw(session)
        txt = txt.replace('\n', ' ')
        txt = txt.replace('\r', ' ')
        # Replace whitespace-ish character references (tab..CR) with spaces
        for x in range(9, 14):
            txt = txt.replace('&#%d;' % (x), ' ')
        txt = self.doctype_re.sub('', txt)
        for e in self.entities.keys():
            txt = txt.replace("&%s;" % (e), self.entities[e])
        txt = self.amp_re.sub(self._loneAmpersand, txt)
        # Fix: escape '&' appearing directly before '<'.  This replacement
        # had been corrupted into a no-op by entity decoding.
        txt = txt.replace('&<', '&amp;<')
        txt = self.attr_re.sub(self._attributeFix, txt)
        txt = self.elem_re.sub(self._lowerElement, txt)
        for t in self.emptyTags:
            empty_re = re.compile('<(%s( [^>/]+)?)[\s/]*>' % t)
            txt = empty_re.sub(self._emptyElement, txt)
        # strip processing instructions.
        txt = self.pi_re.sub('', txt)
        return StringDocument(txt, self.id, doc.processHistory,
                              mimeType=doc.mimeType, parent=doc.parent,
                              filename=doc.filename)
class AmpPreParser(PreParser):
    """Escape lone ampersands in otherwise XML text."""
    entities = {}

    def __init__(self, session, config, parent):
        PreParser.__init__(self, session, config, parent)
        # An '&' followed by text with no ';' before whitespace/end,
        # i.e. an ampersand that is not part of an entity reference
        self.amp_re = re.compile('&([^\s;]*)(\s|$)')
        self.entities = {}

    def _loneAmpersand(self, match):
        # Fix: the replacement had been corrupted to a literal '&' (making
        # this a no-op); restore the '&amp;' entity reference.
        return '&amp;%s ' % match.group(1)

    def process_document(self, session, doc):
        """Return a Document with known entities substituted and lone
        ampersands escaped as '&amp;'."""
        txt = doc.get_raw(session)
        for e in self.entities.keys():
            txt = txt.replace("&%s;" % (e), self.entities[e])
        txt = self.amp_re.sub(self._loneAmpersand, txt)
        return StringDocument(txt, self.id, doc.processHistory,
                              mimeType=doc.mimeType, parent=doc.parent,
                              filename=doc.filename)
# --- MARC PreParsers ---
class MarcToXmlPreParser(PreParser):
    """Convert MARC into MARCXML."""
    inMimeType = "application/marc"
    outMimeType = "text/xml"

    def process_document(self, session, doc):
        """Return a MARCXML Document built from the raw MARC data."""
        record = MARC(doc.get_raw(session))
        return StringDocument(record.toMARCXML(), self.id,
                              doc.processHistory, mimeType='text/xml',
                              parent=doc.parent, filename=doc.filename)
class MarcToSgmlPreParser(PreParser):
    """Convert MARC into Cheshire2's MarcSgml."""
    inMimeType = "application/marc"
    outMimeType = "text/sgml"

    def process_document(self, session, doc):
        """Return an SGML Document built from the raw MARC data."""
        record = MARC(doc.get_raw(session))
        return StringDocument(record.toSGML(), self.id,
                              doc.processHistory, mimeType='text/sgml',
                              parent=doc.parent, filename=doc.filename)
# --- Raw Text PreParsers ---
class TxtToXmlPreParser(PreParser):
    """Minimally wrap text in <data> XML tags."""
    inMimeType = "text/plain"
    outMimeType = "text/xml"

    def process_document(self, session, doc):
        """Return the text XML-escaped and wrapped in a <data> element."""
        escaped = escape(doc.get_raw(session))
        data = "<data>{0}</data>".format(escaped)
        return StringDocument(data, self.id, doc.processHistory,
                              mimeType='text/xml', parent=doc.parent,
                              filename=doc.filename)
# --- Compression PreParsers ---
class PicklePreParser(PreParser):
    """Compress Document content using Python pickle."""

    def process_document(self, session, doc):
        """Return a Document whose content is the pickled raw data."""
        # Renamed local: the original 'string' shadowed the stdlib module
        serialized = pickle.dumps(doc.get_raw(session))
        return StringDocument(serialized, self.id, doc.processHistory,
                              mimeType='text/pickle', parent=doc.parent,
                              filename=doc.filename)
class UnpicklePreParser(PreParser):
    """Decompress Document content using Python pickle."""

    def process_document(self, session, doc):
        """Return a Document whose content is the unpickled data."""
        data = doc.get_raw(session)
        # NOTE: pickle.loads can execute arbitrary code; only feed this
        # documents from trusted sources.
        content = pickle.loads(data)
        return StringDocument(content, self.id, doc.processHistory,
                              mimeType='text/pickle', parent=doc.parent,
                              filename=doc.filename)
try:
    import gzip
except ImportError:
    # Gracefully degrade functionality
    class GzipPreParser(PreParser):
        """Gzip a not-gzipped document."""

        def __init__(self, session, config, parent):
            raise NotImplementedError('''\
Compression by gzip is not supported due to a missing library in your system.\
''')

    class GunzipPreParser(PreParser):
        """Gunzip a gzipped document."""

        def __init__(self, session, config, parent):
            raise NotImplementedError('''\
Decompression by gzip is not supported due to a missing library in your \
system.''')
else:
    class GzipPreParser(PreParser):
        """Gzip a not-gzipped document."""
        inMimeType = ""
        outMimeType = ""

        def __init__(self, session, config, parent):
            PreParser.__init__(self, session, config, parent)
            self.compressLevel = self.get_setting(session, "compressLevel", 1)

        def process_document(self, session, doc):
            """Return a Document containing the gzip-compressed raw data."""
            buf = StringIO.StringIO()
            zfile = gzip.GzipFile(mode='wb', fileobj=buf,
                                  compresslevel=self.compressLevel)
            zfile.write(doc.get_raw(session))
            zfile.close()
            # Read back exactly the bytes written to the in-memory buffer
            length = buf.tell()
            buf.seek(0)
            data = buf.read(length)
            buf.close()
            return StringDocument(data, self.id, doc.processHistory,
                                  parent=doc.parent, filename=doc.filename)

    # This comment needed for validation by PEP8 validator
    class GunzipPreParser(PreParser):
        """Gunzip a gzipped document."""
        inMimeType = ""
        outMimeType = ""

        def process_document(self, session, doc):
            """Return a Document containing the decompressed raw data."""
            buff = StringIO.StringIO(doc.get_raw(session))
            zfile = gzip.GzipFile(mode='rb', fileobj=buff)
            data = zfile.read()
            zfile.close()
            buff.close()
            del zfile
            del buff
            return StringDocument(data, self.id, doc.processHistory,
                                  parent=doc.parent, filename=doc.filename)
try:
    import bz2
except ImportError:
    # Gracefully degrade functionality
    class Bzip2PreParser(PreParser):
        """Unzip a bz2 zipped document."""

        def __init__(self, session, config, parent):
            raise NotImplementedError('''\
Decompression by bzip2 is not supported due to a missing library in your \
system.''')
else:
    class Bzip2PreParser(PreParser):
        """Unzip a bz2 zipped document."""

        def process_document(self, session, doc):
            """Return a Document containing the bz2-decompressed data."""
            compressed = doc.get_raw(session)
            data = bz2.decompress(compressed)
            return StringDocument(data, self.id, doc.processHistory,
                                  parent=doc.parent, filename=doc.filename)
class B64EncodePreParser(PreParser):
    """Encode document in Base64."""

    def process_document(self, session, doc):
        """Return a Document whose content is the Base64-encoded data."""
        encoded = b64encode(doc.get_raw(session))
        return StringDocument(encoded, self.id, doc.processHistory,
                              parent=doc.parent, filename=doc.filename)
class B64DecodePreParser(PreParser):
    """Decode document from Base64."""

    def process_document(self, session, doc):
        """Return a Document whose content is the Base64-decoded data."""
        decoded = b64decode(doc.get_raw(session))
        return StringDocument(decoded, self.id, doc.processHistory,
                              parent=doc.parent, filename=doc.filename)
# --- Nasty OpenOffice PreParser ---
class UrlPreParser(PreParser):
    """Abstract Base Class for PreParsers that use OpenOffice.

    DEPRECATED: see cheshire3.formats sub-package instead
    """

    _possiblePaths = {
        'remoteUrl': {
            'docs': 'URL at which the OpenOffice handler is listening'
        }
    }

    def _post_multipart(self, host, selector, fields, files):
        # POST a multipart/form-data request; return the response body
        content_type, body = self._encode_multipart_formdata(fields, files)
        conn = httplib.HTTPConnection(host)
        conn.request('POST', selector, body, {'content-type': content_type})
        return conn.getresponse().read()

    def _encode_multipart_formdata(self, fields, files):
        # Build a multipart/form-data body by hand from (key, value)
        # fields and (key, filename, value) file tuples
        BOUNDARY = '----------ThIs_Is_tHe_bouNdaRY_$'
        CRLF = '\r\n'
        parts = []
        for (key, value) in fields:
            parts.append('--' + BOUNDARY)
            parts.append('Content-Disposition: form-data; name="%s"' % key)
            parts.append('')
            parts.append(value)
        for (key, filename, value) in files:
            parts.append('--' + BOUNDARY)
            parts.append(
                'Content-Disposition: form-data; name="%s"; filename="%s"' %
                (key, filename)
            )
            parts.append('Content-Type: %s' %
                         self._get_content_type(filename))
            parts.append('')
            parts.append(value)
        parts.append('--' + BOUNDARY + '--')
        parts.append('')
        body = CRLF.join(parts)
        content_type = 'multipart/form-data; boundary=%s' % BOUNDARY
        return content_type, body

    def _get_content_type(self, filename):
        # Fall back to a generic binary type for unknown extensions
        return (mimetypes.guess_type(filename)[0] or
                'application/octet-stream')

    def _send_request(self, session, data=None):
        # Split the configured URL into host and selector, then POST
        url = self.get_path(session, 'remoteUrl')
        if url.startswith("http://"):
            url = url[7:]
        host, _slash, selector = url.partition('/')
        # TODO: Remove dependency
        fields = ()
        files = [("file", "foo.doc", data)]
        return self._post_multipart(host, selector, fields, files)
class OpenOfficePreParser(UrlPreParser):
    """Use OpenOffice server to convert documents into OpenDocument XML."""
    inMimeType = ""
    outMimeType = "text/xml"

    def process_document(self, session, doc):
        """POST the raw data to the OpenOffice server; return its XML.

        On any request failure an "<error/>" document is returned rather
        than propagating the exception (best-effort conversion).
        """
        data = doc.get_raw(session)
        try:
            xml = self._send_request(session, data)
        except Exception:
            # Fix: was a bare 'except:', which also swallowed SystemExit
            # and KeyboardInterrupt; keep the best-effort fallback for
            # ordinary errors only
            xml = "<error/>"
        return StringDocument(xml, self.id, doc.processHistory,
                              mimeType='text/xml', parent=doc.parent,
                              filename=doc.filename)
class PrintableOnlyPreParser(PreParser):
    """Replace or Strip non printable characters."""
    inMimeType = "text/*"
    outMimeType = "text/plain"

    _possibleSettings = {
        'strip': {
            'docs': """\
Should the preParser strip the characters or replace with numeric character \
entities (default)""",
            'type': int,
            'options': "0|1"
        }
    }

    def __init__(self, session, config, parent):
        PreParser.__init__(self, session, config, parent)
        # Characters from 0x7b upward.  NOTE(review): this also catches
        # printable ASCII '{', '|', '}', '~' -- confirm the lower bound
        # is intentional.
        self.asciiRe = re.compile('([\x7b-\xff])')
        # Control characters that are illegal in XML
        self.nonxmlRe = re.compile('([\x00-\x08]|[\x0E-\x1F]|[\x0B\x0C\x1F])')
        # strip=1 removes matches; strip=0 (default) emits &#NNN; entities
        self.strip = self.get_setting(session, 'strip', 0)

    def process_document(self, session, doc):
        """Strip any non printable characters."""
        data = doc.get_raw(session)
        # Normalize common 'smart punctuation'.  NOTE(review): the escapes
        # look like UTF-8 byte sequences; in the unicode branch they are
        # matched as individual codepoints instead -- confirm intent.
        # This is bizarre, but otherwise:
        # UnicodeDecodeError: 'ascii' codec can't decode byte ...
        if type(data) == unicode:
            data = data.replace(u"\xe2\x80\x9c", u'"')
            data = data.replace(u"\xe2\x80\x9d", u'"')
            data = data.replace(u"\xe2\x80\x9e", u'"')
            data = data.replace(u"\xe2\x80\x93", u'-')
            data = data.replace(u"\xe2\x80\x98", u"'")
            data = data.replace(u"\xe2\x80\x99", u"'")
            data = data.replace(u"\xe2\x80\x9a", u",")
            data = data.replace(u"\x99", u"'")
            data = data.replace(u'\xa0', u' ')
        else:
            data = data.replace("\xe2\x80\x9c", '"')
            data = data.replace("\xe2\x80\x9d", '"')
            data = data.replace("\xe2\x80\x9e", '"')
            data = data.replace("\xe2\x80\x93", '-')
            data = data.replace("\xe2\x80\x98", "'")
            data = data.replace("\xe2\x80\x99", "'")
            data = data.replace("\xe2\x80\x9a", ",")
            data = data.replace("\x99", "'")
            data = data.replace('\xa0', ' ')
        # Replace XML-illegal control characters with spaces
        data = self.nonxmlRe.sub(' ', data)
        if self.strip:
            new = self.asciiRe.sub('', data)
        else:
            # Replace each high character with its numeric entity
            fn = lambda x: "&#%s;" % ord(x.group(1))
            new = self.asciiRe.sub(fn, data)
        return StringDocument(new, self.id, doc.processHistory,
                              mimeType=doc.mimeType, parent=doc.parent,
                              filename=doc.filename)
class CharacterEntityPreParser(PreParser):
    """Change named and broken entities to numbered.

    Transform latin-1 and broken character entities into numeric character
    entities. eg
    &something; --> &#123;
    """

    def __init__(self, session, config, parent):
        PreParser.__init__(self, session, config, parent)
        # Numeric entity missing its '#', e.g. &123;
        self.numericalEntRe = re.compile('&(\d+);')
        # Made-up fraction entities, e.g. &frac34;
        self.fractionRe = re.compile('&frac(\d)(\d);')
        # Character entities for control codes 0-31 (invalid in XML)
        self.invalidRe = re.compile('&#(\d|[0-2]\d|3[01]);')
        # Codepoint of the first name in self.entities ('nbsp')
        self.start = 160
        # NOTE(review): 'lsquo', 'rsquo' and 'rdquo' were each defined
        # twice in this dict; Python keeps the later value, so the
        # effective '#34' mappings are preserved here (duplicates removed).
        self.otherEntities = {
            "quot": '#34',
            "amp": '#38',
            "lt": '#60',
            "gt": '#62',
            "trade": '#8482',
            "OElig": '#338',
            "oelig": '#339',
            "Scaron": '#352',
            "scaron": '#353',
            "Yuml": '#376',
            "circ": '#710',
            "tilde": '#732',
            "ensp": '#8194',
            "emsp": '#8195',
            "thinsp": '#8201',
            "zwnj": '#8204',
            "zwj": '#8205',
            "lrm": '#8206',
            "rlm": '#8207',
            "ndash": '#8211',
            "mdash": '#8212',
            "lsquo": '#34',
            "rsquo": '#34',
            "sbquo": '#8218',
            "ldquo": '#8220',
            "rdquo": '#34',
            "bdquo": '#8222',
            "dagger": '#8224',
            "Dagger": '#8225',
            "permil": '#8240',
            "lsaquo": '#8249',
            "rsaquo": '#8250',
            "euro": '#8364',
            "half": '#189',
            "ast": '#8727'
        }
        # Entities replaced with simple plain-text equivalents.
        # NOTE(review): 'ldquo' here (applied first) shadows the
        # otherEntities mapping above -- confirm intent.
        self.inane = {
            "apos": "'",
            "hellip": '...',
            "ldquo": '',
            "lsqb": '[',
            "rsqb": ']',
            "sol": '\\',
            "commat": '@',
            "plus": '+',
            "percnt": '%'
        }
        # Common mis-spellings mapped to the correct entity name
        self.preEntities = {
            "OUML;": "Ouml",
            "UUML": "Uuml",
            "AELIG": "AElig",
            "Aelig": "AElig"
        }
        # Latin-1 entity names in codepoint order, starting at 160
        self.entities = ['nbsp', 'iexcl', 'cent', 'pound', 'curren', 'yen',
                         'brvbar', 'sect', 'uml', 'copy', 'ordf', 'laquo',
                         'not', 'shy', 'reg', 'macr', 'deg', 'plusmn',
                         'sup2', 'sup3', 'acute', 'micro', 'para', 'middot',
                         'cedil', 'sup1', 'ordm', 'raquo', 'frac14', 'frac12',
                         'frac34', 'iquest', 'Agrave', 'Aacute', 'Acirc',
                         'Atilde', 'Auml', 'Aring', 'AElig', 'Ccedil',
                         'Egrave', 'Eacute', 'Ecirc', 'Euml', 'Igrave',
                         'Iacute', 'Icirc', 'Iuml', 'ETH', 'Ntilde', 'Ograve',
                         'Oacute', 'Ocirc', 'Otilde', 'Ouml', 'times',
                         'Oslash', 'Ugrave', 'Uacute', 'Ucirc', 'Uuml',
                         'Yacute', 'THORN', 'szlig', 'agrave', 'aacute',
                         'acirc', 'atilde', 'auml', 'aring', 'aelig',
                         'ccedil', 'egrave', 'eacute', 'ecirc', 'euml',
                         'igrave', 'iacute', 'icirc', 'iuml', 'eth', 'ntilde',
                         'ograve', 'oacute', 'ocirc', 'otilde', 'ouml',
                         'divide', 'oslash', 'ugrave', 'uacute', 'ucirc',
                         'uuml', 'yacute', 'thorn', 'yuml']

    def process_document(self, session, doc):
        """Return a Document with entities normalized to numeric form."""
        txt = doc.get_raw(session)
        # Replace entities that can be represented with simple chars
        for (fromEnt, toEnt) in self.inane.iteritems():
            txt = txt.replace("&%s;" % fromEnt, toEnt)
        # Fix some common mistakes
        for (fromEnt, toEnt) in self.preEntities.iteritems():
            txt = txt.replace("&%s;" % fromEnt, "&%s;" % toEnt)
        # Fix straight forward entites
        for (s, enty) in enumerate(self.entities):
            txt = txt.replace("&%s;" % enty, "&#%s;" % (160 + s))
        # Fix additional random entities
        for (fent, totxt) in self.otherEntities.iteritems():
            txt = txt.replace("&%s;" % fent, "&%s;" % totxt)

        # Add missing # in &123;
        def hashed(mo):
            return '&#%s;' % mo.group(1)
        txt = self.numericalEntRe.sub(hashed, txt)

        # Fix made up fraction entities, e.g. &frac34; -> 3&#8260;4.
        # Fix: the '&#8260;' (fraction slash) in the replacement had been
        # corrupted into a literal U+2044 character; restore the entity.
        def fraction(mo):
            return '%s&#8260;%s' % (mo.group(1), mo.group(2))
        txt = self.fractionRe.sub(fraction, txt)

        # Kill remaining invalid character entities
        txt = self.invalidRe.sub('', txt)
        return StringDocument(txt, self.id, doc.processHistory,
                              mimeType=doc.mimeType, parent=doc.parent,
                              filename=doc.filename)
class DataChecksumPreParser(PreParser):
    """Checksum Document data and add to Document metadata."""

    _possibleSettings = {
        'sumType': {
            'docs': "Type of checkSum to carry out.",
            'type': str,
            'default': 'md5'
        }
    }

    def __init__(self, session, config, parent):
        PreParser.__init__(self, session, config, parent)
        self.sumType = self.get_setting(session, 'sumType', 'md5')
        # Fail fast at init time if the algorithm name is unknown
        try:
            hashlib.new(self.sumType)
        except ValueError as e:
            raise ConfigFileException(str(e))

    def process_document(self, session, doc):
        """Attach a checksum of the raw data to doc.metadata['checksum'].

        Returns the same Document instance with the digest and analysis
        timestamp recorded under the configured algorithm name.
        """
        hasher = hashlib.new(self.sumType)
        hasher.update(doc.get_raw(session))
        entry = {
            self.sumType: {
                'hexdigest': hasher.hexdigest(),
                'analysisDateTime': time.strftime('%Y-%m-%dT%H:%M:%S%Z')
            }
        }
        if 'checksum' in doc.metadata:
            doc.metadata['checksum'].update(entry)
        else:
            doc.metadata['checksum'] = entry
        doc.processHistory.append(self.id)
        return doc