Source code for cheshire3.parser

import cStringIO
import StringIO

from xml.sax import make_parser, ErrorHandler, SAXParseException
from xml.sax import InputSource as SaxInput
from xml.dom.minidom import parseString as domParseString
from xml.parsers.expat import ExpatError
from lxml import etree

from cheshire3.baseObjects import Parser
from cheshire3.record import (
    SaxRecord,
    SaxContentHandler,
    DomRecord,
    MinidomRecord,
    MarcRecord
)
from cheshire3.record import LxmlRecord
from cheshire3.utils import nonTextToken
from exceptions import XMLSyntaxError


class BaseParser(Parser):
    """Parser base class holding bookkeeping shared by concrete parsers."""

    def _copyData(self, doc, rec):
        """Carry identifying metadata over from ``doc`` onto ``rec``.

        Copies ``id``, ``filename``, ``tagName`` and ``processHistory`` and
        then appends this parser's own ``id`` to the history.  No copy of
        the history is made here, so ``rec`` and ``doc`` refer to the same
        object after the call.

        ``rec.parent`` is set to a ``('document', store, id)`` tuple when the
        document has a truthy ``documentStore``; otherwise a truthy
        ``doc.parent`` is inherited.  If neither is truthy, ``rec.parent`` is
        left untouched.
        """
        for attr in ('id', 'filename', 'tagName', 'processHistory'):
            setattr(rec, attr, getattr(doc, attr))
        # Appends to the very object that was just taken from doc
        rec.processHistory.append(self.id)

        store = doc.documentStore
        if store:
            rec.parent = ('document', store, doc.id)
            return
        inherited = doc.parent
        if inherited:
            rec.parent = inherited


class MinidomParser(BaseParser):
    """Use default Python Minidom implementation to parse document."""

    def process_document(self, session, doc):
        """Parse the raw data of ``doc`` with minidom into a MinidomRecord.

        Raises XMLSyntaxError when expat rejects the data.
        """
        xml = doc.get_raw(session)
        try:
            dom = domParseString(xml)
        except ExpatError as e:
            raise XMLSyntaxError(e.message)
        rec = MinidomRecord(dom, xml)
        self._copyData(doc, rec)
        return rec


class SaxParser(BaseParser):
    """Default SAX based parser. Creates SaxRecord."""

    _possibleSettings = {
        'namespaces': {
            'docs': "Enable namespace processing in SAX"
        },
        'stripWhitespace': {
            'docs': "Strip additional whitespace when processing."
        },
        'attrHash': {
            'docs': "Tag/Attribute combinations to include in hash."
        }
    }

    def __init__(self, session, config, parent):
        """Build the SAX parser and content handler and apply settings."""
        # BaseParser defines no __init__ of its own, so this resolves to the
        # same initializer as before while matching the sibling parsers.
        BaseParser.__init__(self, session, config, parent)
        self.parser = make_parser()
        self.errorHandler = ErrorHandler()
        self.parser.setErrorHandler(self.errorHandler)
        self.inputSource = SaxInput()
        ch = SaxContentHandler()
        self.contentHandler = ch
        self.parser.setContentHandler(ch)
        self.keepError = 1

        if self.get_setting(session, 'namespaces'):
            self.parser.setFeature('http://xml.org/sax/features/namespaces',
                                   1)
        attrHash = self.get_setting(session, 'attrHash')
        if attrHash:
            # Whitespace separated list of "tag@attribute" pairs
            for pair in attrHash.split():
                (tag, attr) = pair.split("@")
                try:
                    ch.hashAttributesNames[tag].append(attr)
                except KeyError:
                    # First attribute registered for this tag
                    ch.hashAttributesNames[tag] = [attr]
        if self.get_setting(session, 'stripWhitespace'):
            ch.stripWS = 1

    def process_document(self, session, doc):
        """Parse the raw data of ``doc`` with SAX and return a SaxRecord.

        Raises XMLSyntaxError when the SAX parser reports a parse error.
        When ``self.keepError`` is true the content handler state is left
        in place and ``self.errorPath`` is set to a '/'-joined path built
        from the handler's ``pathLines``; otherwise the handler is reset
        before the error is raised.
        """
        xml = doc.get_raw(session)
        if isinstance(xml, unicode):
            # SAX parser cannot deal with unicode
            xml = xml.encode('utf-8')
        self.inputSource.setByteStream(cStringIO.StringIO(xml))
        ch = self.contentHandler
        ch.reinit()
        try:
            self.parser.parse(self.inputSource)
        except SAXParseException as e:
            # Splat. Reset self and reraise
            if self.keepError:
                # Work out path
                path = []
                for lineNo in ch.pathLines:
                    line = ch.currentText[lineNo]
                    elemName = line[2:line.index('{') - 1]
                    path.append("%s[@SAXID='%s']" % (elemName, lineNo))
                self.errorPath = '/'.join(path)
            else:
                ch.reinit()
            raise XMLSyntaxError(str(e))
        rec = SaxRecord(ch.currentText, xml, wordCount=ch.recordWordCount)
        rec.elementHash = ch.elementHash
        rec.byteCount = len(xml)
        self._copyData(doc, rec)
        ch.reinit()
        return rec


class StoredSaxParser(BaseParser):
    """Create a SaxRecord from document data split on ``nonTextToken``."""

    def process_document(self, session, doc):
        """Decode, split and wrap the raw data of ``doc`` in a SaxRecord.

        The raw data is decoded as UTF-8 and split on ``nonTextToken``.  If
        the final piece starts with "9" it is removed and everything after
        its first two characters is unpickled to become the record's
        ``elementHash``; otherwise an empty dict is used.
        """
        # The module never imports pickle at file level, so bring it into
        # scope here rather than fail with a NameError.
        import pickle

        data = doc.get_raw(session)
        data = unicode(data, 'utf-8')
        sax = data.split(nonTextToken)
        # Slice rather than index: the final piece is an empty string when
        # the data is empty or ends with the separator.
        if sax[-1][:1] == "9":
            line = sax.pop()
            # NOTE(review): unpickling can execute arbitrary code; this is
            # only safe when the document data comes from a trusted source.
            elemHash = pickle.loads(str(line[2:]))
        else:
            elemHash = {}
        rec = SaxRecord(sax)
        rec.elementHash = elemHash
        return rec


class LxmlParser(BaseParser):
    """ lxml based Parser.  Creates LxmlRecords """

    _possibleSettings = {
        'validateDTD': {
            'docs': ("Validate to DTD while parsing (if a DTD was "
                     "referenced by the Document.)"),
            'type': int,
            'options': "0|1"
        },
        'allowNetwork': {
            'docs': ("Allow network access to look up external documents "
                     "(DTDs etc.)"),
            'type': int,
            'options': "0|1"
        }
    }

    def __init__(self, session, config, parent):
        """Configure an lxml XMLParser from the object's settings."""
        BaseParser.__init__(self, session, config, parent)
        dtdVal = bool(self.get_setting(session, 'validateDTD', 0))
        noNetwork = not self.get_setting(session, 'allowNetwork', 0)
        self.parser = etree.XMLParser(dtd_validation=dtdVal,
                                      no_network=noNetwork)

    def process_document(self, session, doc):
        """Parse the raw data of ``doc`` with lxml and return an LxmlRecord.

        Raises XMLSyntaxError when lxml reports a syntax error.
        """
        # Input must be string or stream
        data = doc.get_raw(session)
        try:
            try:
                et = etree.parse(StringIO.StringIO(data), self.parser)
            except AssertionError:
                # Retry once with the data decoded from UTF-8
                data = data.decode('utf8')
                et = etree.parse(StringIO.StringIO(data), self.parser)
        except etree.XMLSyntaxError as e:
            raise XMLSyntaxError(e.message)
        rec = LxmlRecord(et)
        # Length of whichever form of the data was finally parsed
        rec.byteCount = len(data)
        self._copyData(doc, rec)
        return rec


class LxmlSchemaParser(Parser):
    pass


class LxmlRelaxNGParser(Parser):
    pass


class LxmlHtmlParser(BaseParser):
    """lxml based parser for HTML documents."""

    def __init__(self, session, config, parent):
        """Create the lxml HTMLParser used for every document."""
        BaseParser.__init__(self, session, config, parent)
        self.parser = etree.HTMLParser()

    def process_document(self, session, doc):
        """Parse the raw data of ``doc`` as HTML and return an LxmlRecord."""
        data = doc.get_raw(session)
        et = etree.parse(StringIO.StringIO(data), self.parser)
        rec = LxmlRecord(et)
        rec.byteCount = len(data)
        self._copyData(doc, rec)
        return rec


class PassThroughParser(BaseParser):
    """Take a Document that already contains parsed data and return a Record.

    Copy the data from a document (eg list of sax events or a dom tree) into
    an appropriate record object.
    """

    def process_document(self, session, doc):
        """Wrap the document's data in a SaxRecord (list) or a DomRecord."""
        # Simply copy data into a record of appropriate type
        data = doc.get_raw(session)
        if isinstance(data, list):
            rec = SaxRecord(data)
        else:
            rec = DomRecord(data)
        self._copyData(doc, rec)
        return rec


class MarcParser(BaseParser):
    """Creates MarcRecords which fake the Record API for Marc."""

    def process_document(self, session, doc):
        """Return a MarcRecord constructed directly from ``doc``."""
        return MarcRecord(doc)