import cStringIO
import StringIO
from xml.sax import make_parser, ErrorHandler, SAXParseException
from xml.sax import InputSource as SaxInput
from xml.dom.minidom import parseString as domParseString
from xml.parsers.expat import ExpatError
from lxml import etree
from cheshire3.baseObjects import Parser
from cheshire3.record import SaxRecord, SaxContentHandler, MinidomRecord, \
MarcRecord
from cheshire3.record import LxmlRecord
from cheshire3.utils import nonTextToken
from exceptions import XMLSyntaxError
class BaseParser(Parser):
def _copyData(self, doc, rec):
# Utility function to update data on record from document
rec.filename = doc.filename
rec.tagName = doc.tagName
rec.processHistory = doc.processHistory
rec.processHistory.append(self.id)
if doc.documentStore:
rec.parent = ('document', doc.documentStore, doc.id)
elif doc.parent:
rec.parent = doc.parent
class MinidomParser(BaseParser):
[docs] """Use default Python Minidom implementation to parse document."""
def process_document(self, session, doc):
xml = doc.get_raw(session)
try:
dom = domParseString(xml)
except ExpatError as e:
raise XMLSyntaxError(e.message)
rec = MinidomRecord(dom, xml)
self._copyData(doc, rec)
return rec
class SaxParser(BaseParser):
[docs] """Default SAX based parser. Creates SaxRecord."""
_possibleSettings = {
'namespaces': {
'docs': "Enable namespace processing in SAX"
},
'stripWhitespace': {
'docs': "Strip additional whitespace when processing."
},
'attrHash': {
'docs': "Tag/Attribute combinations to include in hash."
}
}
def __init__(self, session, config, parent):
Parser.__init__(self, session, config, parent)
self.parser = make_parser()
self.errorHandler = ErrorHandler()
self.parser.setErrorHandler(self.errorHandler)
self.inputSource = SaxInput()
ch = SaxContentHandler()
self.contentHandler = ch
self.parser.setContentHandler(ch)
self.keepError = 1
if (self.get_setting(session, 'namespaces')):
self.parser.setFeature('http://xml.org/sax/features/namespaces',
1)
p = self.get_setting(session, 'attrHash')
if (p):
l = p.split()
for i in l:
(a, b) = i.split("@")
try:
ch.hashAttributesNames[a].append(b)
except:
ch.hashAttributesNames[a] = [b]
if self.get_setting(session, 'stripWhitespace'):
ch.stripWS = 1
def process_document(self, session, doc):
xml = doc.get_raw(session)
if type(xml) == unicode:
# SAX parser cannot deal with unicode
xml = xml.encode('utf-8')
self.inputSource.setByteStream(cStringIO.StringIO(xml))
ch = self.contentHandler
ch.reinit()
try:
self.parser.parse(self.inputSource)
except SAXParseException as e:
# Splat. Reset self and reraise
if self.keepError:
# Work out path
path = []
for l in ch.pathLines:
line = ch.currentText[l]
elemName = line[2:line.index('{') - 1]
path.append("%s[@SAXID='%s']" % (elemName, l))
self.errorPath = '/'.join(path)
else:
ch.reinit()
raise XMLSyntaxError(str(e))
rec = SaxRecord(ch.currentText, xml, wordCount=ch.recordWordCount)
rec.elementHash = ch.elementHash
rec.byteCount = len(xml)
self._copyData(doc, rec)
ch.reinit()
return rec
class StoredSaxParser(BaseParser):
[docs]
def process_document(self, session, doc):
data = doc.get_raw(session)
data = unicode(data, 'utf-8')
sax = data.split(nonTextToken)
if sax[-1][0] == "9":
line = sax.pop()
elemHash = pickle.loads(str(line[2:]))
else:
elemHash = {}
rec = SaxRecord(sax)
rec.elementHash = elemHash
return rec
class LxmlParser(BaseParser):
[docs] """ lxml based Parser. Creates LxmlRecords """
_possibleSettings = {
'validateDTD': {
'docs': ("Validate to DTD while parsing (if a DTD was "
"referenced by the Document.)"),
'type': int,
'options': "0|1"
},
'allowNetwork': {
'docs': ("Allow network access to look up external documents "
"(DTDs etc.)"),
'type': int,
'options': "0|1"
}
}
def __init__(self, session, config, parent):
BaseParser.__init__(self, session, config, parent)
dtdVal = bool(self.get_setting(session, 'validateDTD', 0))
noNetwork = not self.get_setting(session, 'allowNetwork', 0)
self.parser = etree.XMLParser(dtd_validation=dtdVal,
no_network=noNetwork)
def process_document(self, session, doc):
# Input must be string or stream
data = doc.get_raw(session)
try:
try:
et = etree.parse(StringIO.StringIO(data), self.parser)
except AssertionError:
data = data.decode('utf8')
et = etree.parse(StringIO.StringIO(data), self.parser)
except etree.XMLSyntaxError as e:
raise XMLSyntaxError(e.message)
rec = LxmlRecord(et)
rec.byteCount = len(data)
self._copyData(doc, rec)
return rec
class LxmlSchemaParser(Parser):
pass
class LxmlRelaxNGParser(Parser):
pass
class LxmlHtmlParser(BaseParser):
[docs] """lxml based parser for HTML documents."""
def __init__(self, session, config, parent):
BaseParser.__init__(self, session, config, parent)
self.parser = etree.HTMLParser()
def process_document(self, session, doc):
data = doc.get_raw(session)
et = etree.parse(StringIO.StringIO(data), self.parser)
rec = LxmlRecord(et)
rec.byteCount = len(data)
self._copyData(doc, rec)
return rec
class PassThroughParser(BaseParser):
[docs] """Take a Document that already contains parsed data and return a Record.
Copy the data from a document (eg list of sax events or a dom tree) into
an appropriate record object.
"""
def process_document(self, session, doc):
# Simply copy data into a record of appropriate type
data = doc.get_raw(session)
if (typeof(data) == types.ListType):
rec = SaxRecord(data)
else:
rec = DomRecord(data)
self._copyData(doc, rec)
return rec
class MarcParser(BaseParser):
[docs] """Creates MarcRecords which fake the Record API for Marc."""
def process_document(self, session, doc):
return MarcRecord(doc)