from __future__ import absolute_import
import os
import re
import time
import string
import glob
import httplib
import mimetypes
import tempfile
import hashlib
import subprocess
try:
import cStringIO as StringIO
except ImportError:
import StringIO
try:
import cPickle as pickle
except ImportError:
import pickle
from xml.sax.saxutils import escape
from warnings import warn
from lxml import etree
from base64 import b64encode, b64decode
# Intra-package imports
from cheshire3.baseObjects import PreParser
from cheshire3.document import StringDocument
from cheshire3.internal import CONFIG_NS
from cheshire3.marc_utils import MARC
from cheshire3.utils import getShellResult
from cheshire3.exceptions import ConfigFileException, ExternalSystemException
# TODO: All PreParsers should set mimetype, and record in/out mimetype
class TypedPreParser(PreParser):
    """Abstract PreParser that declares its input and output MIME types.

    Sub-classes inherit the ``inMimeType``/``outMimeType`` settings so a
    conversion can advertise what it consumes and what it produces.
    """

    _possibleSettings = {
        "inMimeType": {
            'docs': "The mimetype expected for incoming documents"
        },
        "outMimeType": {
            'docs': "The mimetype set on outgoing documents"
        }
    }

    def __init__(self, session, config, parent):
        PreParser.__init__(self, session, config, parent)
        # Cache the configured MIME types; empty string when unset
        self.inMimeType = self.get_setting(session, 'inMimeType', '')
        self.outMimeType = self.get_setting(session, 'outMimeType', '')
class NormalizerPreParser(PreParser):
    """Call a named Normalizer object to carry out the conversion."""

    _possiblePaths = {
        'normalizer': {
            'docs': "Normalizer identifier to call to do the transformation",
            'required': True
        }
    }

    def __init__(self, session, config, parent):
        PreParser.__init__(self, session, config, parent)
        self.normalizer = self.get_path(session, 'normalizer', None)
        if self.normalizer is None:
            raise ConfigFileException(
                "Normalizer for {0} does not exist.".format(self.id)
            )

    def process_document(self, session, doc):
        """Return a new Document whose content is the normalized data."""
        raw = doc.get_raw(session)
        normalized = self.normalizer.process_string(session, raw)
        return StringDocument(normalized, self.id, doc.processHistory,
                              mimeType=doc.mimeType, parent=doc.parent,
                              filename=doc.filename)
class UnicodeDecodePreParser(PreParser):
    """PreParser to turn non-unicode into Unicode Documents.

    A UnicodeDecodePreParser should accept a Document with content encoded
    in a non-unicode character encoding scheme and return a Document with
    the same content decoded to Python's Unicode implementation.
    """

    _possibleSettings = {
        'codec': {
            'docs': 'Codec to use to decode to unicode. Defaults to UTF-8'
        }
    }

    def __init__(self, session, config, parent):
        PreParser.__init__(self, session, config, parent)
        self.codec = self.get_setting(session, 'codec', 'utf-8')

    def process_document(self, session, doc):
        """Return a Document with the raw data decoded via self.codec.

        Raises UnicodeDecodeError when the data is not valid in the
        configured codec.
        """
        # Fix: removed a try/except that immediately re-raised the same
        # UnicodeDecodeError ('raise e' was a no-op that only truncated
        # the traceback); the exception now propagates untouched.
        data = doc.get_raw(session).decode(self.codec)
        return StringDocument(data, self.id, doc.processHistory,
                              mimeType=doc.mimeType, parent=doc.parent,
                              filename=doc.filename)
class CmdLinePreParser(TypedPreParser):
    """Use an external command line tool to transform a Document.

    The command template comes from the 'commandLine' setting.  %INDOC% and
    %OUTDOC% placeholders are substituted with temporary file paths for the
    incoming data and the tool's output; when a placeholder is absent the
    process' stdin/stdout is used instead.
    """
    _possiblePaths = {
        'executable': {'docs': "Name of the executable to run"},
        'executablePath': {'docs': "Path to the executable"},
        'workingPath': {'docs': 'Path to be in when executing command'}
    }

    _possibleSettings = {
        'commandLine': {
            'docs': """\
Command line to use. %INDOC% is substituted to create a temporary file to
read, and %OUTDOC% is substituted for a temporary file for the process to
write to"""
        }
    }

    def __init__(self, session, config, parent):
        TypedPreParser.__init__(self, session, config, parent)
        exe = self.get_path(session, 'executable', '')
        if not exe:
            msg = "Missing mandatory 'executable' path in {0}".format(self.id)
            raise ConfigFileException(msg)
        tp = self.get_path(session, 'executablePath', '')
        if tp:
            exe = os.path.join(tp, exe)
        cl = self.get_setting(session, 'commandLine', '')
        # Full command template; placeholders substituted per document
        self.cmd = exe + ' ' + cl
        self.working = self.get_path(session, 'workingPath', '')

    def process_document(self, session, doc):
        """Run the external command on doc and return the resulting Document.

        Raises ExternalSystemException when an expected output file cannot
        be read.
        """
        cmd = self.cmd
        # True when the placeholder is NOT in the template,
        # i.e. communicate via stdin / stdout respectively
        stdIn = cmd.find('%INDOC%') == -1
        stdOut = cmd.find('%OUTDOC%') == -1
        if not stdIn:
            # Create temp file for incoming data
            if doc.mimeType or doc.filename:
                # Guess our extn~n
                try:
                    suff = mimetypes.guess_extension(doc.mimeType)
                except:
                    # NOTE(review): guess_extension raises TypeError when
                    # doc.mimeType is None; bare except also hides others
                    suff = ''
                if not suff:
                    # NOTE(review): guess_extension expects a MIME type,
                    # not a filename -- likely always None here; confirm
                    suff = mimetypes.guess_extension(doc.filename)
                if not suff:
                    (foofn, suff) = os.path.splitext(doc.filename)
                if suff:
                    (qq, infn) = tempfile.mkstemp(suff)
                else:
                    (qq, infn) = tempfile.mkstemp()
            else:
                (qq, infn) = tempfile.mkstemp()
            os.close(qq)
            fh = open(infn, 'w')
            fh.write(doc.get_raw(session))
            fh.close()
            cmd = cmd.replace("%INDOC%", infn)
        if not stdOut:
            # Create temp file to outgoing data
            if self.outMimeType:
                # Guess our extn~n
                suff = mimetypes.guess_extension(self.outMimeType)
                (qq, outfn) = tempfile.mkstemp(suff)
            else:
                (qq, outfn) = tempfile.mkstemp()
            cmd = cmd.replace("%OUTDOC%", outfn)
            os.close(qq)
        if self.working:
            # Run the tool from its configured working directory
            old = os.getcwd()
            os.chdir(self.working)
        else:
            old = ''
        if stdIn:
            # Feed raw data through the child process' stdin
            pipe = subprocess.Popen(cmd, bufsize=0, shell=True,
                                    stdin=subprocess.PIPE,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)
            pipe.stdin.write(doc.get_raw(session))
            pipe.stdin.close()
            result = pipe.stdout.read()
            pipe.stdout.close()
            pipe.stderr.close()
            del pipe
        else:
            # Result will read stdout+err regardless
            result = getShellResult(cmd)
            os.remove(infn)
        if not stdOut:
            if os.path.exists(outfn) and os.path.getsize(outfn) > 0:
                fh = open(outfn)
            else:
                # Command probably appended something to the filename
                # Annoying! Have to glob for it
                matches = glob.glob(outfn + "*")
                # Or maybe ignored absolute path and put it in pwd...
                matches2 = glob.glob(os.path.split(outfn)[-1] + '*')
                for m in matches + matches2:
                    if os.path.getsize(m) > 0:
                        fh = open(m)
                        break
                # NOTE(review): if no candidate matches, fh is undefined
                # (or stale from the %INDOC% branch) -- confirm intent
            try:
                try:
                    result = fh.read()
                except:
                    msg = '{0}: {1}'.format(cmd, result)
                    raise ExternalSystemException(msg)
                else:
                    fh.close()
            finally:
                os.remove(outfn)
            try:
                # Clean up when data written elsewhere
                os.remove(fh.name)
            except OSError:
                pass
        if old:
            os.chdir(old)
        # Outgoing MIME type: configured value, else inherit from input
        mt = self.outMimeType
        if not mt:
            mt = doc.mimeType
        return StringDocument(result, self.id, doc.processHistory,
                              mimeType=mt, parent=doc.parent,
                              filename=doc.filename)
class FileUtilPreParser(TypedPreParser):
    """Call 'file' util to find out the current type of file."""

    def __init__(self, session, config, parent):
        TypedPreParser.__init__(self, session, config, parent)
        warn('''\
{0} is deprecated in favour of objects available from the
cheshire3.formats package.'''.format(self.__class__.__name__),
             DeprecationWarning,
             stacklevel=6)

    def process_document(self, session, doc):
        """Set doc.mimeType by shelling out to the 'file' utility.

        Returns the same Document instance.  Any extra "key=value" fields
        reported by 'file -i' (e.g. charset) are set as attributes on doc.
        """
        cmd = "file -i -b %INDOC%"
        (qq, infn) = tempfile.mkstemp()
        os.close(qq)
        fh = open(infn, 'w')
        fh.write(doc.get_raw(session))
        fh.close()
        cmd = cmd.replace("%INDOC%", infn)
        try:
            res = getShellResult(cmd)
            mt = res.strip()
            if mt.find(';') > -1:
                # e.g. "text/plain; charset=us-ascii"
                bits = mt.split(';')
                mt = bits[0]
                for b in bits[1:]:
                    # just stuff them on doc for now
                    # (renamed local: 'type' shadowed the builtin)
                    (attr, value) = b.split('=')
                    setattr(doc, attr, value)
            if mt == "text/plain":
                # Might be sgml, xml, text etc
                res = getShellResult("file -b {0}".format(infn))
                mt2 = res.strip()
                if mt2 == "exported SGML document text":
                    mt = "text/sgml"
                elif mt2 == "XML document text":
                    mt = "text/xml"
                # Others include java, etc. but not very useful to us
        finally:
            # Fix: the temporary file was previously never deleted (leak)
            os.remove(infn)
        doc.mimeType = mt
        doc.processHistory.append(self.id)
        return doc
class MagicRedirectPreParser(TypedPreParser):
    """Map to appropriate PreParser based on incoming MIME type."""

    def _handleLxmlConfigNode(self, session, node):
        # Handle config in the form:
        # <hash>
        #   <object mimeType="" ref=""/>
        #   ...
        # </hash>
        if node.tag in ['hash', '{%s}hash' % CONFIG_NS]:
            for c in node.iterchildren(tag=etree.Element):
                if c.tag in ['object', '{%s}object' % CONFIG_NS]:
                    mt = c.attrib['mimeType']
                    ref = c.attrib['ref']
                    self.mimeTypeHash[mt] = ref

    def _handleConfigNode(self, session, node):
        # DOM variant of the same <hash>/<object> config handling.
        # NOTE(review): 'elementType' is not defined in this chunk;
        # presumably imported at the top of the file -- confirm.
        if node.localName == "hash":
            for c in node.childNodes:
                if c.nodeType == elementType and c.localName == "object":
                    mt = c.getAttributeNS(None, 'mimeType')
                    ref = c.getAttributeNS(None, 'ref')
                    self.mimeTypeHash[mt] = ref

    def __init__(self, session, config, parent):
        # Default MIME type -> preParser object identifier mapping;
        # entries may be overridden by the object's configuration.
        self.mimeTypeHash = {"application/x-gzip": "GunzipPreParser",
                             "application/postscript": "PsPdfPreParser",
                             "application/pdf": "PdfXmlPreParser",
                             "text/html": "HtmlSmashPreParser",
                             "text/plain": "TxtToXmlPreParser",
                             "text/sgml": "SgmlPreParser",
                             "application/x-bzip2": "BzipPreParser"
                             # "application/x-zip": "single zip preparser ?"
                             }
        # Now override from config in init:
        TypedPreParser.__init__(self, session, config, parent)

    def process_document(self, session, doc):
        """Dispatch doc to the PreParser registered for its MIME type.

        Returns doc unchanged when no mapping exists for the type.
        """
        mt = doc.mimeType
        db = session.server.get_object(session, session.database)
        if not mt:
            # Nasty kludge - use FileUtilPreParser to determine MIME type
            fu = db.get_object(session, 'FileUtilPreParser')
            doc2 = fu.process_document(session, doc)
            mt = doc2.mimeType
            if not mt and doc.filename:
                # Try and guess from filename
                mts = mimetypes.guess_type(doc.filename)
                if mts and mts[0]:
                    mt = mts[0]
        if mt in self.mimeTypeHash:
            # Fix: reuse the database object fetched above instead of
            # performing an identical second lookup
            redirect = db.get_object(session, self.mimeTypeHash[mt])
            if isinstance(redirect, PreParser):
                return redirect.process_document(session, doc)
            else:
                # Only other thing is workflow
                return redirect.process(session, doc)
        else:
            # XXX: Should we return or raise?
            return doc
# --- HTML PreParsers ---
class HtmlSmashPreParser(PreParser):
    """Attempt to reduce HTML to its raw text."""

    def __init__(self, session, config, parent):
        PreParser.__init__(self, session, config, parent)
        self.body = re.compile('<body(.*?)</body>', re.S | re.I)
        self.tagstrip = re.compile('<[^>]+>')
        self.title = re.compile('<title[^>]*>(.+?)</title>', re.S | re.I)
        self.script = re.compile('<script(.*?)</script>', re.S | re.I)
        self.style = re.compile('<style(.*?)</style>', re.S | re.I)
        self.comment = re.compile('<!--(.*?)-->', re.S | re.I)

    def process_document(self, session, doc):
        """Return a minimal XHTML document wrapping the title and body text."""
        # Remove script, style and comment sections entirely
        data = self.script.sub('', doc.get_raw(session))
        data = self.style.sub('', data)
        data = self.comment.sub('', data)
        tm = self.title.search(data)
        if tm:
            title = data[tm.start():tm.end()]
        else:
            title = ""
        m = self.body.search(data)
        if m:
            body = data[m.start():m.end()]
        else:
            body = data
        text = self.tagstrip.sub(' ', body)
        # Fix: decode common character entities.  These replacements had
        # been corrupted into no-ops (e.g. replacing '<' with '<') when the
        # entity references in the literals were themselves decoded.
        text = text.replace('&lt;', '<')
        text = text.replace('&gt;', '>')
        text = text.replace('&nbsp;', ' ')
        text = text.replace('&#160;', ' ')
        # Collapse all runs of whitespace into single spaces
        l = text.split()
        text = ' '.join(l)
        data = "<html><head>%s</head><body>%s</body></html>" % (title, text)
        return StringDocument(data, self.id, doc.processHistory,
                              mimeType=doc.mimeType, parent=doc.parent,
                              filename=doc.filename)
class RegexpSmashPreParser(PreParser):
    """Strip, replace or keep only data which matches a given regex."""

    _possibleSettings = {
        'char': {
            'docs': """\
Character(s) to replace matches in the regular expression with. Defaults to
empty string (i.e. strip matches)"""
        },
        'regexp': {
            'docs': "Regular expression to match in the data.",
            'required': True
        },
        'keep': {
            'docs': """\
Should instead keep only the matches. Boolean, defaults to False""",
            'type': int,
            'options': "0|1"
        }
    }

    def __init__(self, session, config, parent):
        PreParser.__init__(self, session, config, parent)
        replacement = self.get_setting(session, 'char')
        pattern = self.get_setting(session, 'regexp')
        self.keep = self.get_setting(session, 'keep')
        if pattern:
            # Compile once; DOTALL so '.' spans newlines
            self.regexp = re.compile(pattern, re.S)
        self.char = replacement if replacement else ''

    def process_document(self, session, doc):
        """Return a Document with the regex applied to the raw data."""
        data = doc.get_raw(session)
        if self.keep:
            found = self.regexp.findall(data)
            if found and found[0] and isinstance(found[0], tuple):
                # Pattern contains groups: keep first group of each match
                found = [item[0] for item in found]
            result = self.char.join(found)
        else:
            result = self.regexp.sub(self.char, data)
        return StringDocument(result, self.id, doc.processHistory,
                              mimeType=doc.mimeType, parent=doc.parent,
                              filename=doc.filename)
try:
    import tidy
except ImportError:
    # Graceful degradation when the TidyLib bindings are not installed
    class HtmlTidyPreParser(PreParser):
        def __init__(self, session, config, parent):
            raise NotImplementedError("""\
HtmlTidyPreParser not supported due to a missing library on your system.""")
else:
    class HtmlTidyPreParser(PreParser):
        """Uses TidyLib to turn HTML into XHTML for parsing."""

        def process_document(self, session, doc):
            """Return the raw HTML tidied into XHTML as a new Document."""
            tidied = tidy.parseString(doc.get_raw(session),
                                      output_xhtml=1,
                                      add_xml_decl=0,
                                      tidy_mark=0,
                                      indent=0)
            return StringDocument(str(tidied), self.id, doc.processHistory,
                                  mimeType=doc.mimeType, parent=doc.parent,
                                  filename=doc.filename)
# --- Not Quite Xml PreParsers ---
class SgmlPreParser(PreParser):
    """Convert SGML into XML."""
    entities = {}
    emptyTags = []
    doctype_re = None
    attr_re = None
    elem_re = None
    amp_re = None
    inMimeType = "text/sgml"
    outMimeType = "text/xml"

    _possibleSettings = {
        'emptyElements': {
            'docs': '''\
Space separated list of empty elements in the SGML to turn into empty XML
elements.'''
        }
    }

    def __init__(self, session, config, parent):
        PreParser.__init__(self, session, config, parent)
        self.doctype_re = (re.compile('<!DOCTYPE\s+?(.+?)["\'](.+?)["\']>'))
        self.attr_re = re.compile(
            ' ([a-zA-Z0-9_]+)[ ]*=[ ]*([-:_.a-zA-Z0-9]+)([ >])'
        )
        self.pi_re = re.compile("<\?(.*?)\?>")
        self.elem_re = re.compile('(<[/]?)([a-zA-Z0-9_]+)')
        self.amp_re = re.compile('&(\s)')
        taglist = self.get_setting(session, 'emptyElements')
        if taglist:
            self.emptyTags = taglist.split()

    def _loneAmpersand(self, match):
        # Escape an unencoded ampersand followed by whitespace.
        # Fix: the replacement had been corrupted to a literal '&' (making
        # this a no-op); restore the '&amp;' entity reference.
        return '&amp;%s' % match.group(1)

    def _lowerElement(self, match):
        # Make all tag names lowercase
        return "%s%s" % (match.group(1), match.group(2).lower())

    def _attributeFix(self, match):
        # Fix messy attribute values:
        # - lowercase attribute names
        # - remove spurious whitespace
        # - quote unquoted values
        return ' %s="%s"%s' % (match.group(1).lower(),
                               match.group(2),
                               match.group(3))

    def _emptyElement(self, match):
        # Make empty elements self-closing
        return "<%s/>" % (match.group(1))

    def process_document(self, session, doc):
        """Return a Document with the SGML data rewritten as XML."""
        txt = doc.get_raw(session)
        txt = txt.replace('\n', ' ')
        txt = txt.replace('\r', ' ')
        # Replace whitespace-ish character references (tab..CR) with spaces
        for x in range(9, 14):
            txt = txt.replace('&#%d;' % (x), ' ')
        txt = self.doctype_re.sub('', txt)
        for e in self.entities.keys():
            txt = txt.replace("&%s;" % (e), self.entities[e])
        txt = self.amp_re.sub(self._loneAmpersand, txt)
        # Fix: escape '&' appearing directly before '<'.  This replacement
        # had been corrupted into a no-op by entity decoding.
        txt = txt.replace('&<', '&amp;<')
        txt = self.attr_re.sub(self._attributeFix, txt)
        txt = self.elem_re.sub(self._lowerElement, txt)
        for t in self.emptyTags:
            empty_re = re.compile('<(%s( [^>/]+)?)[\s/]*>' % t)
            txt = empty_re.sub(self._emptyElement, txt)
        # strip processing instructions.
        txt = self.pi_re.sub('', txt)
        return StringDocument(txt, self.id, doc.processHistory,
                              mimeType=doc.mimeType, parent=doc.parent,
                              filename=doc.filename)
class AmpPreParser(PreParser):
    """Escape lone ampersands in otherwise XML text."""
    entities = {}

    def __init__(self, session, config, parent):
        PreParser.__init__(self, session, config, parent)
        # An '&' followed by text with no ';' before whitespace/end,
        # i.e. an ampersand that is not part of an entity reference
        self.amp_re = re.compile('&([^\s;]*)(\s|$)')
        self.entities = {}

    def _loneAmpersand(self, match):
        # Fix: the replacement had been corrupted to a literal '&' (making
        # this a no-op); restore the '&amp;' entity reference.
        return '&amp;%s ' % match.group(1)

    def process_document(self, session, doc):
        """Return a Document with known entities substituted and lone
        ampersands escaped as '&amp;'."""
        txt = doc.get_raw(session)
        for e in self.entities.keys():
            txt = txt.replace("&%s;" % (e), self.entities[e])
        txt = self.amp_re.sub(self._loneAmpersand, txt)
        return StringDocument(txt, self.id, doc.processHistory,
                              mimeType=doc.mimeType, parent=doc.parent,
                              filename=doc.filename)
# --- MARC PreParsers ---
class MarcToXmlPreParser(PreParser):
    """Convert MARC into MARCXML."""
    inMimeType = "application/marc"
    outMimeType = "text/xml"

    def process_document(self, session, doc):
        """Return a MARCXML Document built from the raw MARC data."""
        record = MARC(doc.get_raw(session))
        return StringDocument(record.toMARCXML(), self.id,
                              doc.processHistory, mimeType='text/xml',
                              parent=doc.parent, filename=doc.filename)
class MarcToSgmlPreParser(PreParser):
    """Convert MARC into Cheshire2's MarcSgml."""
    inMimeType = "application/marc"
    outMimeType = "text/sgml"

    def process_document(self, session, doc):
        """Return an SGML Document built from the raw MARC data."""
        record = MARC(doc.get_raw(session))
        return StringDocument(record.toSGML(), self.id,
                              doc.processHistory, mimeType='text/sgml',
                              parent=doc.parent, filename=doc.filename)
# --- Raw Text PreParsers ---
class TxtToXmlPreParser(PreParser):
    """Minimally wrap text in <data> XML tags."""
    inMimeType = "text/plain"
    outMimeType = "text/xml"

    def process_document(self, session, doc):
        """Return the text XML-escaped and wrapped in a <data> element."""
        escaped = escape(doc.get_raw(session))
        data = "<data>{0}</data>".format(escaped)
        return StringDocument(data, self.id, doc.processHistory,
                              mimeType='text/xml', parent=doc.parent,
                              filename=doc.filename)
# --- Compression PreParsers ---
class PicklePreParser(PreParser):
    """Compress Document content using Python pickle."""

    def process_document(self, session, doc):
        """Return a Document whose content is the pickled raw data."""
        # Renamed local: the original 'string' shadowed the stdlib module
        serialized = pickle.dumps(doc.get_raw(session))
        return StringDocument(serialized, self.id, doc.processHistory,
                              mimeType='text/pickle', parent=doc.parent,
                              filename=doc.filename)
class UnpicklePreParser(PreParser):
    """Decompress Document content using Python pickle."""

    def process_document(self, session, doc):
        """Return a Document whose content is the unpickled data."""
        data = doc.get_raw(session)
        # NOTE: pickle.loads can execute arbitrary code; only feed this
        # documents from trusted sources.
        content = pickle.loads(data)
        return StringDocument(content, self.id, doc.processHistory,
                              mimeType='text/pickle', parent=doc.parent,
                              filename=doc.filename)
try:
    import gzip
except ImportError:
    # Gracefully degrade functionality
    class GzipPreParser(PreParser):
        """Gzip a not-gzipped document."""

        def __init__(self, session, config, parent):
            raise NotImplementedError('''\
Compression by gzip is not supported due to a missing library in your system.\
''')

    class GunzipPreParser(PreParser):
        """Gunzip a gzipped document."""

        def __init__(self, session, config, parent):
            raise NotImplementedError('''\
Decompression by gzip is not supported due to a missing library in your \
system.''')
else:
    class GzipPreParser(PreParser):
        """Gzip a not-gzipped document."""
        inMimeType = ""
        outMimeType = ""

        def __init__(self, session, config, parent):
            PreParser.__init__(self, session, config, parent)
            self.compressLevel = self.get_setting(session, "compressLevel", 1)

        def process_document(self, session, doc):
            """Return a Document containing the gzip-compressed raw data."""
            buf = StringIO.StringIO()
            zfile = gzip.GzipFile(mode='wb', fileobj=buf,
                                  compresslevel=self.compressLevel)
            zfile.write(doc.get_raw(session))
            zfile.close()
            # Read back exactly the bytes written to the in-memory buffer
            length = buf.tell()
            buf.seek(0)
            data = buf.read(length)
            buf.close()
            return StringDocument(data, self.id, doc.processHistory,
                                  parent=doc.parent, filename=doc.filename)

    # This comment needed for validation by PEP8 validator
    class GunzipPreParser(PreParser):
        """Gunzip a gzipped document."""
        inMimeType = ""
        outMimeType = ""

        def process_document(self, session, doc):
            """Return a Document containing the decompressed raw data."""
            buff = StringIO.StringIO(doc.get_raw(session))
            zfile = gzip.GzipFile(mode='rb', fileobj=buff)
            data = zfile.read()
            zfile.close()
            buff.close()
            del zfile
            del buff
            return StringDocument(data, self.id, doc.processHistory,
                                  parent=doc.parent, filename=doc.filename)
try:
    import bz2
except ImportError:
    # Gracefully degrade functionality
    class Bzip2PreParser(PreParser):
        """Unzip a bz2 zipped document."""

        def __init__(self, session, config, parent):
            raise NotImplementedError('''\
Decompression by bzip2 is not supported due to a missing library in your \
system.''')
else:
    class Bzip2PreParser(PreParser):
        """Unzip a bz2 zipped document."""

        def process_document(self, session, doc):
            """Return a Document containing the bz2-decompressed data."""
            compressed = doc.get_raw(session)
            data = bz2.decompress(compressed)
            return StringDocument(data, self.id, doc.processHistory,
                                  parent=doc.parent, filename=doc.filename)
class B64EncodePreParser(PreParser):
    """Encode document in Base64."""

    def process_document(self, session, doc):
        """Return a Document whose content is the Base64-encoded data."""
        encoded = b64encode(doc.get_raw(session))
        return StringDocument(encoded, self.id, doc.processHistory,
                              parent=doc.parent, filename=doc.filename)
class B64DecodePreParser(PreParser):
    """Decode document from Base64."""

    def process_document(self, session, doc):
        """Return a Document whose content is the Base64-decoded data."""
        decoded = b64decode(doc.get_raw(session))
        return StringDocument(decoded, self.id, doc.processHistory,
                              parent=doc.parent, filename=doc.filename)
# --- Nasty OpenOffice PreParser ---
class UrlPreParser(PreParser):
    """Abstract Base Class for PreParsers that use OpenOffice.

    DEPRECATED: see cheshire3.formats sub-package instead
    """

    _possiblePaths = {
        'remoteUrl': {
            'docs': 'URL at which the OpenOffice handler is listening'
        }
    }

    def _post_multipart(self, host, selector, fields, files):
        # POST a multipart/form-data request; return the response body
        content_type, body = self._encode_multipart_formdata(fields, files)
        conn = httplib.HTTPConnection(host)
        conn.request('POST', selector, body, {'content-type': content_type})
        return conn.getresponse().read()

    def _encode_multipart_formdata(self, fields, files):
        # Build a multipart/form-data body by hand from (key, value)
        # fields and (key, filename, value) file tuples
        BOUNDARY = '----------ThIs_Is_tHe_bouNdaRY_$'
        CRLF = '\r\n'
        parts = []
        for (key, value) in fields:
            parts.append('--' + BOUNDARY)
            parts.append('Content-Disposition: form-data; name="%s"' % key)
            parts.append('')
            parts.append(value)
        for (key, filename, value) in files:
            parts.append('--' + BOUNDARY)
            parts.append(
                'Content-Disposition: form-data; name="%s"; filename="%s"' %
                (key, filename)
            )
            parts.append('Content-Type: %s' %
                         self._get_content_type(filename))
            parts.append('')
            parts.append(value)
        parts.append('--' + BOUNDARY + '--')
        parts.append('')
        body = CRLF.join(parts)
        content_type = 'multipart/form-data; boundary=%s' % BOUNDARY
        return content_type, body

    def _get_content_type(self, filename):
        # Fall back to a generic binary type for unknown extensions
        return (mimetypes.guess_type(filename)[0] or
                'application/octet-stream')

    def _send_request(self, session, data=None):
        # Split the configured URL into host and selector, then POST
        url = self.get_path(session, 'remoteUrl')
        if url.startswith("http://"):
            url = url[7:]
        host, _slash, selector = url.partition('/')
        # TODO: Remove dependency
        fields = ()
        files = [("file", "foo.doc", data)]
        return self._post_multipart(host, selector, fields, files)
class OpenOfficePreParser(UrlPreParser):
    """Use OpenOffice server to convert documents into OpenDocument XML."""
    inMimeType = ""
    outMimeType = "text/xml"

    def process_document(self, session, doc):
        """POST the raw data to the OpenOffice server; return its XML.

        On any request failure an "<error/>" document is returned rather
        than propagating the exception (best-effort conversion).
        """
        data = doc.get_raw(session)
        try:
            xml = self._send_request(session, data)
        except Exception:
            # Fix: was a bare 'except:', which also swallowed SystemExit
            # and KeyboardInterrupt; keep the best-effort fallback for
            # ordinary errors only
            xml = "<error/>"
        return StringDocument(xml, self.id, doc.processHistory,
                              mimeType='text/xml', parent=doc.parent,
                              filename=doc.filename)
class PrintableOnlyPreParser(PreParser):
    """Replace or Strip non printable characters."""
    inMimeType = "text/*"
    outMimeType = "text/plain"

    _possibleSettings = {
        'strip': {
            'docs': """\
Should the preParser strip the characters or replace with numeric character \
entities (default)""",
            'type': int,
            'options': "0|1"
        }
    }

    def __init__(self, session, config, parent):
        PreParser.__init__(self, session, config, parent)
        # Characters from 0x7b upward.  NOTE(review): this also catches
        # printable ASCII '{', '|', '}', '~' -- confirm the lower bound
        # is intentional.
        self.asciiRe = re.compile('([\x7b-\xff])')
        # Control characters that are illegal in XML
        self.nonxmlRe = re.compile('([\x00-\x08]|[\x0E-\x1F]|[\x0B\x0C\x1F])')
        # strip=1 removes matches; strip=0 (default) emits &#NNN; entities
        self.strip = self.get_setting(session, 'strip', 0)

    def process_document(self, session, doc):
        """Strip any non printable characters."""
        data = doc.get_raw(session)
        # Normalize common 'smart punctuation'.  NOTE(review): the escapes
        # look like UTF-8 byte sequences; in the unicode branch they are
        # matched as individual codepoints instead -- confirm intent.
        # This is bizarre, but otherwise:
        # UnicodeDecodeError: 'ascii' codec can't decode byte ...
        if type(data) == unicode:
            data = data.replace(u"\xe2\x80\x9c", u'"')
            data = data.replace(u"\xe2\x80\x9d", u'"')
            data = data.replace(u"\xe2\x80\x9e", u'"')
            data = data.replace(u"\xe2\x80\x93", u'-')
            data = data.replace(u"\xe2\x80\x98", u"'")
            data = data.replace(u"\xe2\x80\x99", u"'")
            data = data.replace(u"\xe2\x80\x9a", u",")
            data = data.replace(u"\x99", u"'")
            data = data.replace(u'\xa0', u' ')
        else:
            data = data.replace("\xe2\x80\x9c", '"')
            data = data.replace("\xe2\x80\x9d", '"')
            data = data.replace("\xe2\x80\x9e", '"')
            data = data.replace("\xe2\x80\x93", '-')
            data = data.replace("\xe2\x80\x98", "'")
            data = data.replace("\xe2\x80\x99", "'")
            data = data.replace("\xe2\x80\x9a", ",")
            data = data.replace("\x99", "'")
            data = data.replace('\xa0', ' ')
        # Replace XML-illegal control characters with spaces
        data = self.nonxmlRe.sub(' ', data)
        if self.strip:
            new = self.asciiRe.sub('', data)
        else:
            # Replace each high character with its numeric entity
            fn = lambda x: "&#%s;" % ord(x.group(1))
            new = self.asciiRe.sub(fn, data)
        return StringDocument(new, self.id, doc.processHistory,
                              mimeType=doc.mimeType, parent=doc.parent,
                              filename=doc.filename)
class CharacterEntityPreParser(PreParser):
    """Change named and broken entities to numbered.

    Transform latin-1 and broken character entities into numeric character
    entities. eg
    &something; --> &#123;
    """

    def __init__(self, session, config, parent):
        PreParser.__init__(self, session, config, parent)
        # Numeric entity missing its '#', e.g. &123;
        self.numericalEntRe = re.compile('&(\d+);')
        # Made-up fraction entities, e.g. &frac34;
        self.fractionRe = re.compile('&frac(\d)(\d);')
        # Character entities for control codes 0-31 (invalid in XML)
        self.invalidRe = re.compile('&#(\d|[0-2]\d|3[01]);')
        # Codepoint of the first name in self.entities ('nbsp')
        self.start = 160
        # NOTE(review): 'lsquo', 'rsquo' and 'rdquo' were each defined
        # twice in this dict; Python keeps the later value, so the
        # effective '#34' mappings are preserved here (duplicates removed).
        self.otherEntities = {
            "quot": '#34',
            "amp": '#38',
            "lt": '#60',
            "gt": '#62',
            "trade": '#8482',
            "OElig": '#338',
            "oelig": '#339',
            "Scaron": '#352',
            "scaron": '#353',
            "Yuml": '#376',
            "circ": '#710',
            "tilde": '#732',
            "ensp": '#8194',
            "emsp": '#8195',
            "thinsp": '#8201',
            "zwnj": '#8204',
            "zwj": '#8205',
            "lrm": '#8206',
            "rlm": '#8207',
            "ndash": '#8211',
            "mdash": '#8212',
            "lsquo": '#34',
            "rsquo": '#34',
            "sbquo": '#8218',
            "ldquo": '#8220',
            "rdquo": '#34',
            "bdquo": '#8222',
            "dagger": '#8224',
            "Dagger": '#8225',
            "permil": '#8240',
            "lsaquo": '#8249',
            "rsaquo": '#8250',
            "euro": '#8364',
            "half": '#189',
            "ast": '#8727'
        }
        # Entities replaced with simple plain-text equivalents.
        # NOTE(review): 'ldquo' here (applied first) shadows the
        # otherEntities mapping above -- confirm intent.
        self.inane = {
            "apos": "'",
            "hellip": '...',
            "ldquo": '',
            "lsqb": '[',
            "rsqb": ']',
            "sol": '\\',
            "commat": '@',
            "plus": '+',
            "percnt": '%'
        }
        # Common mis-spellings mapped to the correct entity name
        self.preEntities = {
            "OUML;": "Ouml",
            "UUML": "Uuml",
            "AELIG": "AElig",
            "Aelig": "AElig"
        }
        # Latin-1 entity names in codepoint order, starting at 160
        self.entities = ['nbsp', 'iexcl', 'cent', 'pound', 'curren', 'yen',
                         'brvbar', 'sect', 'uml', 'copy', 'ordf', 'laquo',
                         'not', 'shy', 'reg', 'macr', 'deg', 'plusmn',
                         'sup2', 'sup3', 'acute', 'micro', 'para', 'middot',
                         'cedil', 'sup1', 'ordm', 'raquo', 'frac14', 'frac12',
                         'frac34', 'iquest', 'Agrave', 'Aacute', 'Acirc',
                         'Atilde', 'Auml', 'Aring', 'AElig', 'Ccedil',
                         'Egrave', 'Eacute', 'Ecirc', 'Euml', 'Igrave',
                         'Iacute', 'Icirc', 'Iuml', 'ETH', 'Ntilde', 'Ograve',
                         'Oacute', 'Ocirc', 'Otilde', 'Ouml', 'times',
                         'Oslash', 'Ugrave', 'Uacute', 'Ucirc', 'Uuml',
                         'Yacute', 'THORN', 'szlig', 'agrave', 'aacute',
                         'acirc', 'atilde', 'auml', 'aring', 'aelig',
                         'ccedil', 'egrave', 'eacute', 'ecirc', 'euml',
                         'igrave', 'iacute', 'icirc', 'iuml', 'eth', 'ntilde',
                         'ograve', 'oacute', 'ocirc', 'otilde', 'ouml',
                         'divide', 'oslash', 'ugrave', 'uacute', 'ucirc',
                         'uuml', 'yacute', 'thorn', 'yuml']

    def process_document(self, session, doc):
        """Return a Document with entities normalized to numeric form."""
        txt = doc.get_raw(session)
        # Replace entities that can be represented with simple chars
        for (fromEnt, toEnt) in self.inane.iteritems():
            txt = txt.replace("&%s;" % fromEnt, toEnt)
        # Fix some common mistakes
        for (fromEnt, toEnt) in self.preEntities.iteritems():
            txt = txt.replace("&%s;" % fromEnt, "&%s;" % toEnt)
        # Fix straight forward entites
        for (s, enty) in enumerate(self.entities):
            txt = txt.replace("&%s;" % enty, "&#%s;" % (160 + s))
        # Fix additional random entities
        for (fent, totxt) in self.otherEntities.iteritems():
            txt = txt.replace("&%s;" % fent, "&%s;" % totxt)

        # Add missing # in &123;
        def hashed(mo):
            return '&#%s;' % mo.group(1)
        txt = self.numericalEntRe.sub(hashed, txt)

        # Fix made up fraction entities, e.g. &frac34; -> 3&#8260;4.
        # Fix: the '&#8260;' (fraction slash) in the replacement had been
        # corrupted into a literal U+2044 character; restore the entity.
        def fraction(mo):
            return '%s&#8260;%s' % (mo.group(1), mo.group(2))
        txt = self.fractionRe.sub(fraction, txt)

        # Kill remaining invalid character entities
        txt = self.invalidRe.sub('', txt)
        return StringDocument(txt, self.id, doc.processHistory,
                              mimeType=doc.mimeType, parent=doc.parent,
                              filename=doc.filename)
class DataChecksumPreParser(PreParser):
    """Checksum Document data and add to Document metadata."""

    _possibleSettings = {
        'sumType': {
            'docs': "Type of checkSum to carry out.",
            'type': str,
            'default': 'md5'
        }
    }

    def __init__(self, session, config, parent):
        PreParser.__init__(self, session, config, parent)
        self.sumType = self.get_setting(session, 'sumType', 'md5')
        # Fail fast at init time if the algorithm name is unknown
        try:
            hashlib.new(self.sumType)
        except ValueError as e:
            raise ConfigFileException(str(e))

    def process_document(self, session, doc):
        """Attach a checksum of the raw data to doc.metadata['checksum'].

        Returns the same Document instance with the digest and analysis
        timestamp recorded under the configured algorithm name.
        """
        hasher = hashlib.new(self.sumType)
        hasher.update(doc.get_raw(session))
        entry = {
            self.sumType: {
                'hexdigest': hasher.hexdigest(),
                'analysisDateTime': time.strftime('%Y-%m-%dT%H:%M:%S%Z')
            }
        }
        if 'checksum' in doc.metadata:
            doc.metadata['checksum'].update(entry)
        else:
            doc.metadata['checksum'] = entry
        doc.processHistory.append(self.id)
        return doc