# Source code for cheshire3.record

"""Record implementations.

Split to separate object to allow for DOM->SAX direct conversion by throwing
events from DOM tree to handler.
"""

import types
import unicodedata
import re

from xml.sax.saxutils import escape
from xml.sax import ContentHandler

from cheshire3.baseObjects import Record
from cheshire3.exceptions import C3Exception
import cheshire3.utils
from cheshire3.utils import flattenTexts, elementType, textType
from cheshire3.marc_utils import MARC, MARC8_to_Unicode
from __builtin__ import isinstance


class SaxContentHandler(ContentHandler):
    """Cheshire3 SAX implementation.

    Serializes a SAX event stream into ``currentText``, a list holding one
    string per event.  The first character of each string is the event type:

    1 <name> <attrHash> parent predicate end
    Element
    4 <as 1>
    Namespaced Element
    2 <name> <startLine>
    End Element
    5 <as 2>
    End Namespaced
    3 <text>
    Characters
    6 <prefix>, <uri>
    Prefix mapping
    9 <element hash>
    pickled hash of locations

    ``elementHash`` maps an element name (or a ``name[@attr='value']`` key
    for attributes listed in ``hashAttributesNames``) to a list of
    ``[startLine, endLine]`` pairs.
    """

    # Class-level defaults; every instance gets its own copies in __init__.
    currentText = []
    currentPath = []
    pathLines = []
    currentLine = -1
    recordWordCount = 0
    elementHash = {}
    namespaces = []
    hashAttributesNames = {}
    hashAttributes = []
    stripWS = 0
    saveElementIndexes = 1

    def __init__(self):
        self.saveElementIndexes = 1
        self.hashAttributesNames = {}
        self.hashAttributes = []
        self.stripWS = 0
        self.reinit()

    def reinit(self):
        """Reset the per-record parse state so the handler can be reused."""
        self.currentText = []
        self.currentPath = []
        self.pathLines = []
        self.currentLine = -1
        self.recordWordCount = 0
        self.elementHash = {}
        self.elementIndexes = []
        self.namespaces = []

    # -- private helpers shared by the plain and namespaced handlers --

    def _openElement(self):
        """Advance the line counter, push it, and return the parent's line.

        Returns -1 when the element being opened has no open ancestor.
        """
        self.currentLine += 1
        self.pathLines.append(self.currentLine)
        if len(self.pathLines) > 1:
            return self.pathLines[-2]
        return -1

    def _nextPredicate(self, name):
        """Return the 1-based position of ``name`` among its same-named
        siblings, and open a fresh sibling counter for its children."""
        if not self.elementIndexes:
            self.elementIndexes = [{}]
        siblings = self.elementIndexes[-1]
        npred = siblings.get(name, 0) + 1
        siblings[name] = npred
        self.elementIndexes.append({})
        return npred

    def _saveHashAttributes(self, name, attrHash):
        """Remember ``name[@attr='value']`` keys to be hashed at element end.

        Attributes are selected by ``hashAttributesNames[name]`` and by the
        wildcard entry ``hashAttributesNames['*']``.
        """
        saveAttrs = []
        for key in (name, '*'):
            for a in self.hashAttributesNames.get(key) or ():
                try:
                    value = attrHash[a]
                except (KeyError, TypeError):
                    # Attribute not present on this element
                    continue
                saveAttrs.append("%s[@%s='%s']" % (key, a, value))
        if saveAttrs:
            self.hashAttributes.append((self.currentLine, saveAttrs))

    def _closeElement(self, name, start):
        """Patch the end line onto the start event and update elementHash."""
        self.currentText[start] = ("%s %d" %
                                   (self.currentText[start], self.currentLine))
        self.elementIndexes.pop()
        self.elementHash.setdefault(name, []).append(
            [start, self.currentLine]
        )
        if self.hashAttributes and self.hashAttributes[-1][0] == start:
            for sa in self.hashAttributes.pop()[1]:
                self.elementHash.setdefault(sa, []).append(
                    [start, self.currentLine]
                )

    # -- SAX ContentHandler interface --

    def startPrefixMapping(self, pfx, uri):
        self.currentLine += 1
        if pfx is None:
            # Default namespace is stored with an empty prefix
            pfx = ''
        self.currentText.append("6 %r, %r" % (pfx, uri))

    # We want to fwd elems to NS elem handlers with default NS?
    def startElement(self, name, attrs):
        parent = self._openElement()
        attrHash = {}
        if attrs:
            for k in attrs.keys():
                # Values are stored XML-escaped for non-namespaced elements
                attrHash[k] = escape(attrs[k])
        npred = self._nextPredicate(name)
        self.currentText.append("1 %s %s %d %d" %
                                (name, repr(attrHash), parent, npred))
        self._saveHashAttributes(name, attrHash)

    def endElement(self, name):
        self.currentLine += 1
        start = self.pathLines.pop()
        self.currentText.append("2 %s %d" % (name, start))
        self._closeElement(name, start)

    def startElementNS(self, name, qname, attrs):
        parent = self._openElement()
        attrHash = {}
        # Convert from the SAX attributes object to a plain dict
        if attrs:
            for k in attrs.keys():
                attrHash[k] = attrs[k]
        simpleName = name[1]
        npred = self._nextPredicate(simpleName)
        self.currentText.append(
            "4 %r, %r, %r, %r %d %d" %
            (name[0], simpleName, qname, attrHash, parent, npred)
        )
        self._saveHashAttributes(simpleName, attrHash)

    def endElementNS(self, name, qname):
        self.currentLine += 1
        start = self.pathLines.pop()
        self.currentText.append("5 %r, %r, %r %d" %
                                (name[0], name[1], qname, start))
        self._closeElement(name[1], start)

    def characters(self, text, start=0, length=-1):
        if self.stripWS and text.isspace():
            return
        # No previous event exists when text arrives before any element;
        # treat that as "previous event was not text".
        prev = self.currentText[-1] if self.currentText else ""
        self.currentLine += 1
        if (
            len(text) != 1 and len(prev) != 3 and
            prev[:1] == "3" and prev[-1:] not in (' ', '-')
        ):
            # Adjacent lines of text, ensure spaces
            text = ' ' + text
        self.currentText.append("3 %s" % (text))
        self.recordWordCount += len(text.split())

    def ignorableWhitespace(self, ws):
        # ... ignore! :D
        pass

    def processingInstruction(self, target, data):
        pass

    def skippedEntity(self, name):
        pass


class SaxToDomHandler:
    """Build a DOM document from SAX-style events.

    The document is created lazily from the first element event using the
    standard library's minidom implementation.  Call ``initState()`` before
    each new event stream and ``getRootNode()`` afterwards.
    """

    # Class-level defaults; instances get their own state via initState().
    nodeStack = []
    document = None
    currText = ""

    def __init__(self):
        # Without this, instances would append to the shared class-level
        # nodeStack list until initState() happened to be called.
        self.initState()

    def initState(self):
        """Discard any document under construction."""
        self.nodeStack = []
        self.document = None
        self.top = None

    def _createDocument(self, nsUri, name):
        """Create the document with its root element; return that element."""
        # Local import: the module never bound the name 'implementation'
        # that this code originally relied on.
        from xml.dom.minidom import getDOMImplementation
        self.document = getDOMImplementation().createDocument(nsUri, name,
                                                              None)
        return self.document.childNodes[0]

    def _attach(self, elem):
        """Append elem under the currently open element and make it current."""
        if self.nodeStack:
            self.nodeStack[-1].appendChild(elem)
        else:
            self.document.appendChild(elem)
        self.nodeStack.append(elem)

    def startElement(self, name, attribs={}):
        if not self.document:
            elem = self._createDocument(None, name)
        else:
            elem = self.document.createElementNS(None, name)
        for a in attribs:
            elem.setAttributeNS(None, a, attribs[a])
        self._attach(elem)

    def endElement(self, foo):
        self.nodeStack.pop()

    def characters(self, text, zero=0, length=0):
        # Text arriving outside any open element is dropped.
        if self.nodeStack:
            if text.isspace():
                # Collapse whitespace-only runs to a single space
                text = " "
            # NOTE(review): original comment asks "Is this escape necessary?"
            # Kept for behavioural compatibility; a DOM serializer may escape
            # again on output - confirm before removing.
            text = escape(text)
            d = self.document.createTextNode(text)
            self.nodeStack[-1].appendChild(d)

    def startElementNS(self, name, qname, attribs):
        # name is a (namespaceURI, localName) pair, as are attribute keys
        if not self.document:
            elem = self._createDocument(name[0], name[1])
        else:
            elem = self.document.createElementNS(name[0], name[1])
        for a in attribs:
            elem.setAttributeNS(a[0], a[1], attribs[a])
        self._attach(elem)

    def endElementNS(self, name, qname):
        self.nodeStack.pop()

    def startPrefixMapping(self, pref, uri):
        pass

    def getRootNode(self):
        """Return the document built so far (None if no element was seen)."""
        return self.document


class SaxToXmlHandler:
    """Serialize SAX-style events to an XML string.

    Markup fragments accumulate in ``self.xml``; ``get_xmlString()`` joins
    them.  Call ``initState()`` before each new event stream.
    """

    # Class-level defaults; instances get their own state via initState().
    xml = []
    currNs = 0
    newNamespaces = {}

    def __init__(self):
        # Gives each instance its own buffers and makes self.namespaces
        # exist even if the caller forgets initState().
        self.initState()

    def initState(self):
        """Discard accumulated output and namespace bookkeeping."""
        self.xml = []
        # uri -> prefix for every namespace seen so far
        self.namespaces = {}
        # counter used to generate 'nsN' prefixes for undeclared namespaces
        self.currNs = 0
        # prefix -> uri still to be declared on the next namespaced start tag
        self.newNamespaces = {}

    def startPrefixMapping(self, pref, uri):
        if pref is None:
            # Normalise the default-namespace prefix to '' (as
            # SaxContentHandler does) so _getPrefix() does not mistake it
            # for an unknown namespace and invent an 'nsN' prefix.
            pref = ''
        self.namespaces[uri] = pref
        self.newNamespaces[pref] = uri

    def startElement(self, name, attribs={}):
        # NOTE(review): attribute values are written verbatim; they are
        # assumed to be XML-escaped already - confirm against callers.
        attrs = ['%s="%s"' % (a, attribs[a]) for a in attribs]
        attribtxt = ' '.join(attrs)
        if attribtxt:
            attribtxt = " " + attribtxt
        self.xml.append("<%s%s>" % (name, attribtxt))

    def endElement(self, name):
        self.xml.append("</%s>" % (name))

    def _getPrefix(self, ns):
        """Return the prefix for namespace URI ``ns``.

        Returns '' for no namespace or the default namespace.  An unknown
        URI is assigned a generated 'nsN' prefix which is queued for
        declaration.
        """
        if not ns:
            return ""
        pref = self.namespaces.get(ns, None)
        if pref is None:
            self.currNs += 1
            pref = "ns%d" % (self.currNs)
            self.namespaces[ns] = pref
            self.newNamespaces[pref] = ns
        return pref

    def _qualify(self, n):
        """Return the serialized tag/attribute name for a (uri, local) pair."""
        pref = self._getPrefix(n[0])
        if pref:
            return "%s:%s" % (pref, n[1])
        return n[1]

    def startElementNS(self, n, qn=None, attrs={}):
        name = self._qualify(n)
        attrlist = []
        for ns, aname in attrs:
            attrlist.append('%s="%s"' % (self._qualify((ns, aname)),
                                         attrs[(ns, aname)]))
        # Emit declarations for namespaces introduced since the last start
        # tag.  items() rather than iteritems() so this runs on Python 3 too.
        for pref, uri in self.newNamespaces.items():
            if pref:
                attrlist.append('xmlns:%s="%s"' % (pref, uri))
            else:
                attrlist.append('xmlns="%s"' % (uri))
        self.newNamespaces = {}
        attribtxt = ' '.join(attrlist)
        if attribtxt:
            attribtxt = " " + attribtxt
        self.xml.append("<%s%s>" % (name, attribtxt))

    def endElementNS(self, n, qn=None):
        self.xml.append("</%s>" % (self._qualify(n)))

    def characters(self, text, zero=0, length=0):
        self.xml.append(escape(text))

    def get_xmlString(self):
        """Return everything serialized since the last initState()."""
        return ''.join(self.xml)


class NumericPredicateException(C3Exception):
    """C3Exception subclass; not raised anywhere in the visible module.

    NOTE(review): the name suggests it is meant for numeric XPath predicates
    that cannot be evaluated - confirm against callers.
    """
    pass


class DomRecord(Record):
    """Record whose primary representation is a DOM tree (``self.dom``).

    Subclasses supply ``_walkTop``/``_walk`` to replay the tree as SAX
    events for ``get_sax``.
    """

    context = None
    size = 0

    def __init__(self, data, xml="", docId=None, wordCount=0, byteCount=0):
        """Store the DOM ``data`` and derive a word count if none is given."""
        self.dom = data
        self.xml = xml
        self.id = docId
        self.parent = ('', '', -1)
        self.context = None
        self.metadata = {}
        if not wordCount:
            # No count supplied: estimate from the flattened text
            try:
                # Sometimes this blows up
                wordCount = len(flattenTexts(data).split())
            except:
                wordCount = 0
        self.wordCount = wordCount
        self.byteCount = byteCount

    def _walk(self, node):
        # Overridden by subclasses that can traverse their DOM flavour
        pass

    def get_sax(self, session):
        """Return the SAX event list for this record, building it on demand."""
        if self.sax:
            return self.sax
        self.handler = SaxContentHandler()
        for child in self.dom.childNodes:
            self._walkTop(child)
        events = self.handler.currentText
        self.sax = events
        # Final event carries the element location hash
        events.append("9 %r" % self.handler.elementHash)
        self.handler = None
        return self.sax

    def get_dom(self, session):
        """Return the DOM tree this record wraps."""
        return self.dom

    def fetch_vector(self, session, index, summary=False):
        """Delegate to the index's store to fetch this record's vector."""
        store = index.indexStore
        return store.fetch_vector(session, index, self, summary)

    def fetch_proxVector(self, session, index, elem=-1):
        """Delegate to the index's store to fetch the proximity vector."""
        store = index.indexStore
        return store.fetch_proxVector(session, index, self, elem)


class MinidomRecord(DomRecord):
    """DomRecord whose DOM nodes follow the minidom-style node API
    (``nodeType``, ``localName``, ``namespaceURI``, ``attributes``)."""

    useNamespace = 1

    def get_xml(self, session):
        """Return the serialized XML, caching ``self.dom.toxml()``."""
        if not self.xml:
            self.xml = self.dom.toxml()
        return self.xml

    def _walkTop(self, node):
        """Walk a top-level node, deciding once whether to emit namespaced
        events based on whether that node has a namespace URI."""
        if node.nodeType == elementType:
            self.namespaces = node.namespaceURI is not None
            self._walk(node)

    def _walk(self, node):
        """Recursively replay ``node`` as SAX events on ``self.handler``."""
        if node.nodeType == elementType:
            attrHash = {}
            for ai in range(node.attributes.length):
                attr = node.attributes.item(ai)
                if self.namespaces:
                    if attr.namespaceURI == 'http://www.w3.org/2000/xmlns/':
                        # Namespace declarations become prefix mappings,
                        # not attributes
                        self.handler.startPrefixMapping(attr.localName,
                                                        attr.value)
                    else:
                        key = (attr.namespaceURI, attr.localName)
                        attrHash[key] = attr.value
                else:
                    attrHash[attr.localName] = attr.value
            if self.namespaces:
                self.handler.startElementNS(
                    (node.namespaceURI, node.localName), None, attrHash
                )
            else:
                self.handler.startElement(node.localName, attrHash)
            for c in node.childNodes:
                self._walk(c)
            if self.namespaces:
                self.handler.endElementNS(
                    (node.namespaceURI, node.localName), None
                )
            else:
                self.handler.endElement(node.localName)
        elif node.nodeType == textType:
            # 'textType' is imported directly from cheshire3.utils; the bare
            # name 'utils' is never bound in this module.
            self.handler.characters(node.data)

    def process_xpath(self, session, xpath, maps={}):
        """XPath evaluation is not supported on this record type."""
        raise NotImplementedError
try:
    from lxml import etree, sax
except:
    # lxml unavailable: keep the name importable as a plain DomRecord
    class LxmlRecord(DomRecord):
        pass
else:
    class LxmlRecord(DomRecord):
        """DomRecord backed by an lxml tree."""

        def process_xpath(self, session, xpath, maps={}):
            """Evaluate ``xpath`` against the tree; always return a list.

            ``xpath`` may be a string or a list whose first item's repr()
            is the expression.  Expressions that neither start with '/'
            nor end with ')' are searched for anywhere in the tree.
            """
            if isinstance(xpath, list):
                xpath = repr(xpath[0])
            anchored = xpath.startswith('/') or xpath.endswith(')')
            if not anchored:
                xpath = "//" + xpath
            if maps:
                result = self.dom.xpath(xpath, namespaces=maps)
            else:
                result = self.dom.xpath(xpath)
            # Scalar results are wrapped so callers always get a list
            return result if isinstance(result, list) else [result]

        def get_xml(self, session):
            """Serialize the tree with ``etree.tostring``."""
            return etree.tostring(self.dom)

        def get_sax(self, session):
            """Return the SAX event list, generating it via lxml on demand."""
            if self.sax:
                return self.sax
            handler = SaxContentHandler()
            sax.saxify(self.dom, handler)
            events = handler.currentText
            self.sax = events
            events.append("9 %r" % handler.elementHash)
            return self.sax

        def get_dom(self, session):
            """Return the root element (or the stored object if it has no
            ``getroot``)."""
            try:
                return self.dom.getroot()
            except AttributeError:
                return self.dom
try:
    from xpath import (
        ParsedRelativeLocationPath as PRLP,
        ParsedAbsoluteLocationPath as PALP,
        ParsedStep,
        ParsedNodeTest,
        ParsedExpr,
        Compile,
        ParsedAbbreviatedAbsoluteLocationPath as PAALP,
        ParsedAbbreviatedRelativeLocationPath as PARLP,
    )
except:
    # This means we can't do xpaths on SaxRecords...
    # making them a bit pointless, but not fatal as we likely don't need them
    pass


def _appendStep(left, right):
    """Combine an already-traversed left path with a right-hand step."""
    if (left == []):
        # self::node()
        return [right]
    if (type(left[0]) in types.StringTypes):
        # left is a single step, not yet a list of steps
        return [left, right]
    left.append(right)
    return left


def traversePath(node):
    """Convert a compiled XPath parse tree into nested lists.

    Steps become ``[axis, nodeTest, predicates]``; comparisons become
    ``[left, op, right]``; supported function calls become
    ``['FUNCTION', name, ...]``.  Raises NotImplementedError for anything
    this simplified form cannot express.
    """
    if (isinstance(node, PRLP.ParsedRelativeLocationPath)):
        head = traversePath(node._left)
        tail = traversePath(node._right)
        return _appendStep(head, tail)

    if (isinstance(node, PALP.ParsedAbsoluteLocationPath)):
        root = ['/']
        if (not node._child):
            return root
        rest = traversePath(node._child)
        if isinstance(rest[0], basestring):
            return [root, rest]
        root.extend(rest)
        return root

    if (isinstance(node, PARLP.ParsedAbbreviatedRelativeLocationPath)):
        head = traversePath(node._left)
        tail = traversePath(node._right)
        # a//b : the right-hand step is reached via the descendant axis
        tail[0] = 'descendant'
        return _appendStep(head, tail)

    if (isinstance(node, ParsedStep.ParsedStep)):
        # TODO: Check that axis is something we can parse
        axis = node._axis._axis
        if (axis == 'self'):
            return []
        test = node._nodeTest
        if (isinstance(test, ParsedNodeTest.NodeNameTest)):
            test = test._nodeName
        elif (isinstance(test, ParsedNodeTest.TextNodeTest)):
            test = "__text()"
        elif (isinstance(test, ParsedNodeTest.QualifiedNameTest)):
            test = test._prefix + ":" + test._localName
        elif (isinstance(test, ParsedNodeTest.PrincipalTypeTest)):
            test = "*"
        else:
            raise NotImplementedError
        preds = node._predicates
        parsedPreds = []
        if (preds):
            parsedPreds = [traversePath(pred) for pred in preds]
        return [axis, test, parsedPreds]

    if (isinstance(node, ParsedExpr.ParsedEqualityExpr) or
            isinstance(node, ParsedExpr.ParsedRelationalExpr)):
        # @id="fish"
        op = node._op
        # Override check for common: [position()=int]
        if (
            op == '=' and
            isinstance(node._left, ParsedExpr.FunctionCall) and
            node._left._name == 'position' and
            isinstance(node._right, ParsedExpr.ParsedNLiteralExpr)
        ):
            return node._right._literal
        lhs = traversePath(node._left)
        if (isinstance(lhs, list) and lhs[0] == "attribute"):
            # Keep just the attribute name
            lhs = lhs[1]
        rhs = traversePath(node._right)
        if op not in ('=', '!='):
            # Relational operators are indexed into this table
            op = ['<', '<=', '>', '>='][op]
        return [lhs, op, rhs]

    if (isinstance(node, ParsedExpr.ParsedNLiteralExpr) or
            isinstance(node, ParsedExpr.ParsedLiteralExpr)):
        # 7 or "fish"
        return node._literal

    if (isinstance(node, ParsedExpr.FunctionCall)):
        fname = node._name
        if (fname == 'last'):
            # Override for last using Pythonic expr
            return -1
        if fname == 'name':
            return ['FUNCTION', '__name()']
        if fname == 'starts-with':
            # only for foo[starts-with(@bar, 'baz')]
            return ['FUNCTION', 'starts-with',
                    traversePath(node._arg0)[1], node._arg1._literal]
        if fname == 'regexp':
            return ['FUNCTION', 'regexp',
                    traversePath(node._arg0)[1],
                    re.compile(node._arg1._literal)]
        if fname == 'count':
            return ['FUNCTION', 'count', traversePath(node._arg0)]
        raise(NotImplementedError)

    if (isinstance(node, ParsedExpr.ParsedAndExpr)):
        return [traversePath(node._left), 'and', traversePath(node._right)]

    if (isinstance(node, ParsedExpr.ParsedOrExpr)):
        return [traversePath(node._left), 'or', traversePath(node._right)]

    # We'll need to do full XPath vs DOM
    raise NotImplementedError


def parseOldXPath(p):
    """Compile XPath string ``p``; return ``[compiledObject, stepList]``."""
    compiled = Compile(p)
    steps = traversePath(compiled)
    if (steps[0] != '/' and type(steps[0]) in types.StringTypes):
        # A lone step: wrap it so the result is always a list of steps
        steps = [steps]
    return [compiled, steps]
class SaxRecord(Record):
    """Record stored as a list of serialized SAX event strings.

    Each entry of ``self.sax`` starts with an event-type character (see
    SaxContentHandler): '1'/'4' element start, '2'/'5' element end,
    '3' characters, '6' prefix mapping, '9' element hash.
    """

    def __init__(self, data, xml="", docId=None, wordCount=0, byteCount=0):
        self.sax = data
        self.id = docId
        self.xml = xml
        self.history = []
        self.rights = []
        self.elementHash = {}
        self.wordCount = wordCount
        self.byteCount = byteCount
        self.parent = ('', '', -1)
        # Pulls key/value pairs out of the repr() of a unicode attribute dict
        self.attrRe = re.compile("u['\"](.+?)['\"]: u['\"](.*?)['\"](, |})")
        self.recordStore = ""

    def process_xpath(self, session, xpath, maps={}):
        """Evaluate an XPath against the SAX events.

        ``xpath`` is either a raw XPath string or a ``[compiled, steps]``
        pair as produced by ``parseOldXPath``.  Returns a list of event
        chunks, attribute values or strings (or an int for ``count()``).
        Expressions the SAX matcher cannot handle fall back to DOM
        evaluation.
        """
        if (not isinstance(xpath, list)):
            # Raw XPath
            xpath = parseOldXPath(xpath)
        xp = xpath[1]
        try:
            flatten = 0
            # Bound up front so the '/' branch cannot leave it undefined
            data = []
            if xp[0][0] == "FUNCTION" and xp[0][1] == 'count':
                # process xpath and return number of matches
                if isinstance(xp[0][2][0], str) and xp[0][2][0] != '/':
                    data = self.process_xpath(session,
                                              [None, [xp[0][2]]], maps)
                else:
                    data = self.process_xpath(session,
                                              [None, xp[0][2]], maps)
                return len(data)
            if (xp[-1][0] == 'child' and xp[-1][1] == "__text()"):
                flatten = 1
                xp = xp[:-1]
            if (xp[-1][0] == 'attribute'):
                return self._handleAttribute(xp, maps)
            elif (xp[-1][0] == "/"):
                # Return top level element
                for x in xrange(len(self.sax)):
                    if self.sax[x][0] in ['1', '4']:
                        return self.sax[x:]
            elif (xp[-1][0] in ['child', 'descendant']):
                # Extracting element
                elemName = xp[-1][1]
                nselem = elemName.split(":")
                if (len(nselem) == 2):
                    # Namespaced.
                    nsUri = maps[nselem[0]]
                    elemName = nselem[1]
                else:
                    nsUri = ""
                attr = xp[-1][2]
                elemLines = []
                if elemName == '*' and attr:
                    # *[starts-with(name(), ...)] / *[regexp(name(), ...)]
                    for p in attr:
                        if p[0] == 'FUNCTION' and p[2] == '__name()':
                            names = self.elementHash.keys()
                            if p[1] == 'starts-with' and p[2] == '__name()':
                                for x in names:
                                    if x.find(p[3]) == 0:
                                        elemLines.extend(self.elementHash[x])
                            elif p[1] == 'regexp' and p[2] == '__name()':
                                for x in names:
                                    if p[3].search(x):
                                        elemLines.extend(self.elementHash[x])
                elif (not elemName in self.elementHash):
                    return []
                if (
                    len(attr) == 1 and
                    isinstance(attr[0], list) and
                    attr[0][1] == "="
                ):
                    # Try the pre-hashed name[@attr='value'] shortcut
                    n = u"%s[@%s='%s']" % (elemName, attr[0][0], attr[0][2])
                    elemLines = self.elementHash.get(n, [])
                if elemLines == []:
                    # might really be empty
                    elemLines = self.elementHash.get(elemName, [])
                for e in elemLines:
                    if (
                        not nsUri or
                        self.sax[e[0]][4:4 + len(nsUri)] == nsUri
                    ):
                        match = self._checkSaxXPathLine(xp, e[0])
                        if (match):
                            # Return event chunk
                            l = self.sax[e[0]]
                            end = int(l[l.rfind(' ') + 1:])
                            data.append(self.sax[e[0]:end + 1])
            else:
                # Unsupported final axis
                raise(NotImplementedError)

            if flatten and data:
                # Flatten to text nodes
                ndata = []
                for match in data:
                    txt = []
                    for ev in match:
                        if ev[0] == '3':
                            txt.append(ev[2:])
                    ndata.append(''.join(txt))
                return ndata
            else:
                return data

        except NotImplementedError:
            # Convert to DOM (slow) and reapply (slower still)
            dom = self.get_dom(session)
            # The compiled expression is the first item of the xpath pair
            # (this previously referenced an undefined name).
            xp = xpath[0]
            try:
                # 'import cheshire3.utils' binds 'cheshire3', not 'utils'.
                # NOTE(review): assumes cheshire3.utils provides
                # evaluateXPath - confirm.
                return cheshire3.utils.evaluateXPath(xp, dom)
            except Exception:
                self.log_critical("Buggy Xpath: %r" % xp)
                return []
        # Otherwise just fall over as we've hit a real bug

    def _handleAttribute(self, xp, maps={}):
        """Return attribute values selected by a path ending on the
        attribute axis."""
        attrName = xp[-1][1]
        nselem = attrName.split(":")
        if (len(nselem) == 2):
            # Namespaced attribute
            nsUri = maps[nselem[0]]
            attrName = nselem[1]
        else:
            nsUri = None

        data = []
        if (len(xp) == 1):
            # Extracting all occs of attribute anywhere!?
            # Check predicates... (only support one numeric predicate)
            if (
                len(xp[0][2]) == 1 and
                isinstance(xp[0][2][0], float)
            ):
                nth = int(xp[0][2][0])
            elif (len(xp[0][2])):
                # Non index or multiple predicates??
                raise(NotImplementedError)
            else:
                nth = 0

            currn = 0
            for l in self.sax:
                if (l[0] == "1"):
                    (name, attrs) = self._convert_elem(l)
                    if (attrName in attrs):
                        currn += 1
                        content = attrs[attrName]
                        if (currn == nth):
                            data.append(content)
                            break
                        elif (not nth):
                            data.append(content)
        else:
            elemName = xp[-2][1]
            if (elemName == "*"):
                # Let DOM code handle this monstrosity :P
                raise(NotImplementedError)

            nselem = elemName.split(":")
            if (len(nselem) == 2):
                # Namespaced.
                elemNsUri = maps[nselem[0]]
                elemName = nselem[1]
            else:
                elemNsUri = ""

            if (elemName in self.elementHash):
                elemLines = self.elementHash[elemName]
                for e in elemLines:
                    if (
                        not elemNsUri or
                        self.sax[e[0]][4:4 + len(elemNsUri)] == elemNsUri
                    ):
                        line = self.sax[e[0]]
                        (name, attrs) = self._convert_elem(line)
                        if (attrName == '*'):
                            # All attributes' values
                            match = self._checkSaxXPathLine(xp[:-1], e[0])
                            if (match):
                                for k in attrs.keys():
                                    data.append(attrs[k])
                        else:
                            # Use a per-element lookup key: rebinding
                            # attrName here would corrupt the lookup for
                            # every later element.
                            lookup = attrName
                            if (not lookup in attrs):
                                lookup = (nsUri, attrName)
                            if (not lookup in attrs and not nsUri):
                                # No namespace requested: accept the local
                                # name in any namespace
                                content = None
                                for key in attrs:
                                    if (isinstance(key, tuple) and
                                            key[1] == attrName):
                                        content = attrs[key]
                            else:
                                content = attrs.get(lookup, None)
                            if (content):
                                # Now check rest of path
                                match = self._checkSaxXPathLine(xp[:-1],
                                                                e[0])
                                if (match):
                                    data.append(content)
        return data

    def _parentLineOf(self, elem):
        """Return the parent line number recorded in a start-element event
        (the first integer after the attribute dict)."""
        start = elem.rfind("}") + 2
        end = elem.find(" ", start)
        return int(elem[start:end])

    def _checkSaxXPathLine(self, xp, line):
        """Return 1 if the element at ``line`` matches ``xp`` when checked
        upwards through its ancestors, else 0."""
        # Work on a copy: steps are popped from the end
        xpath = xp[:]
        climb = False
        while (xpath):
            posn = len(xpath)
            node = xpath.pop()
            if (line == -1):
                # Ran off the top of the tree: only a root step can match
                if node != "/" and node != ['/']:
                    return 0
            else:
                elem = self.sax[line]
                (name, attrs) = self._convert_elem(elem)
                match = self._checkSaxXPathNode(node, name, attrs,
                                                line, posn)
                if not match:
                    if not climb:
                        return 0
                    else:
                        # Previous was a descendant, keep looking
                        while not match:
                            line = self._parentLineOf(elem)
                            if line != -1:
                                elem = self.sax[line]
                                (name, attrs) = self._convert_elem(elem)
                                match = self._checkSaxXPathNode(node, name,
                                                                attrs, line,
                                                                posn)
                            else:
                                return 0
                if xpath:
                    line = self._parentLineOf(elem)
            climb = (node and node[0] == "descendant")
        return 1

    def _checkSaxXPathNode(self, step, name, attrs, line, posn):
        """Return 1 if one step matches the element, else 0."""
        # name already checked, strip
        if step in ['/', ['/']] and name:
            return 0
        if (
            step[1] != name and
            step[1] != '*' and
            step[1][step[1].find(":") + 1:] != name
        ):
            return 0
        elif (not step[0] in ['child', 'descendant']):
            # Unsupported axis
            raise(NotImplementedError)
        elif (step[2]):
            # Check predicates
            predPosn = 0
            for pred in (step[2]):
                predPosn += 1
                m = self._checkSaxXPathPredicate(pred, name, attrs,
                                                 line, posn, predPosn)
                if (not m):
                    return 0
        return 1

    def _checkSaxXPathPredicate(self, pred, name, attrs, line, posn,
                                predPosn):
        """Return a truthy value if predicate ``pred`` holds for the
        element at ``line``."""
        if not isinstance(pred, list):
            # Numeric Predicate. (eg /foo/bar[1])
            if (predPosn != 1):
                # Can't do numeric predicate on already predicated nodeset
                # eg: text[@type='main'][2]
                raise(NotImplementedError)
            if (posn == 1):
                # First position in relative path.
                # Check against position in elementHash
                spans = self.elementHash.get(name)
                if not spans:
                    return 0
                p = int(pred)
                if p == 0 or abs(p) > len(spans):
                    return 0
                # Positive positions are 1-based; negative ones (last() is
                # parsed as -1) count from the end.
                idx = p - 1 if p > 0 else p
                return spans[idx][0] == line
            else:
                # Not first position, so it applies to parent elem
                # Which we record during parsing
                elem = self.sax[line]
                end = elem.rfind("}") + 2
                start = elem.find(' ', end) + 1
                end = elem.find(' ', start)
                npred = float(elem[start:end])
                return npred == pred
        elif (pred[1] in ['=', '!=', '<', '>', '<=', '>=']):
            # Single attribute
            return self._checkSaxXPathAttr(pred, attrs)
        elif (pred[1] in ['and', 'or']):
            # Attribute combinations
            left = self._checkSaxXPathPredicate(pred[0], name, attrs,
                                                line, posn, predPosn)
            right = self._checkSaxXPathPredicate(pred[2], name, attrs,
                                                 line, posn, predPosn)
            if (pred[1] == 'and' and left and right):
                return 1
            elif (pred[1] == 'or' and (left or right)):
                return 1
            return 0
        elif (pred[0] == 'attribute'):
            # Attribute exists test
            return pred[1] in attrs
        elif (pred[0] == 'FUNCTION'):
            if pred[2] == "__name()":
                # Name-based functions were applied when selecting lines
                return True
            if pred[1] == 'starts-with':
                if pred[2] in attrs:
                    val = attrs[pred[2]]
                    return not val.find(pred[3])
                else:
                    return False
            elif pred[1] == 'regexp':
                if pred[2] in attrs:
                    return pred[3].search(attrs[pred[2]]) is not None
                else:
                    return False
            raise NotImplementedError
        else:
            # No idea!!
            raise(NotImplementedError)
        return 1

    def _checkSaxXPathAttr(self, pred, attrs):
        """Evaluate ``[attrName, op, value]`` against an attribute dict."""
        # Resolve the key locally; writing it back into pred would change
        # the shared parsed XPath for all later elements.
        key = pred[0]
        if (not key in attrs):
            # Namespacey
            if ((None, key) in attrs):
                key = (None, key)
            else:
                return 0
        rel = pred[1]

        # -Much- faster than eval
        if isinstance(pred[2], float):
            attrValue = float(attrs[key])
        else:
            attrValue = attrs[key]

        # Direct comparisons instead of cmp(), which Python 3 lacks
        if rel == "=":
            return attrValue == pred[2]
        elif rel == ">":
            return attrValue > pred[2]
        elif rel == "<":
            return attrValue < pred[2]
        elif rel == "<=":
            return attrValue <= pred[2]
        elif rel == ">=":
            return attrValue >= pred[2]
        elif rel == "!=":
            return attrValue != pred[2]
        else:
            raise(NotImplementedError)

    def _convert_elem(self, line):
        """Parse a start-element event into ``[name, attrs]``.

        Raises ValueError if ``line`` is not a '1' or '4' event.
        """
        # Currently: 1 name {attrs} parent npred end
        if (line[0] == '1'):
            start = line.find("{")
            name = line[2:start - 1]
            attrs = {}
            if line[start + 1] != '}':
                for m in self.attrRe.findall(line):
                    attrs[unicode(m[0])] = unicode(m[1])
            return [name, attrs]
        elif (line[0] == '4'):
            end = line.rfind("}")
            # SECURITY NOTE(review): eval() of stored event text; only safe
            # while the SAX events come from a trusted store.
            stuff = eval(line[2:end + 1])
            return [stuff[1], stuff[3]]
        else:
            raise ValueError("Called convert on non element.")

    def saxify(self, session, handler=None, sax=[]):
        """Replay SAX events (``sax``, or this record's own) to ``handler``.

        Raises ValueError on an event line of unknown type.
        """
        if handler is None:
            handler = self
        if not sax:
            sax = self.get_sax(session)

        for line in sax:
            if line[0] == "1":
                # String manipulation method
                (name, attrs) = self._convert_elem(line)
                handler.startElement(name, attrs)
            elif line[0] == "3":
                handler.characters(line[2:], 0, len(line) - 2)
            elif line[0] == "2":
                end = line.rfind(' ')
                handler.endElement(line[2:end])
            elif line[0] == "9":
                pass
            elif line[0] == '4':
                # 4 ns,name,qname, {}
                # Strip the three trailing integers (parent, npred, end)
                idx = line.rfind(' ')
                idx = line[:idx].rfind(' ')
                idx = line[:idx].rfind(' ')
                line = line[:idx]
                # SECURITY NOTE(review): eval() of stored event text
                (ns, name, qname, attrs) = eval(line[2:])
                handler.startElementNS((ns, name), qname, attrs)
            elif line[0] == '5':
                # 5 ns,name,qname parent pred end
                idx = line.rfind(' ')
                line = line[:idx]
                # SECURITY NOTE(review): eval() of stored event text
                (ns, name, qname) = eval(line[2:])
                handler.endElementNS((ns, name), qname)
            elif line[0] == '6':
                # 6 pref, uri
                # SECURITY NOTE(review): eval() of stored event text
                pref, uri = eval(line[2:])
                handler.startPrefixMapping(pref, uri)
            else:
                # Unknown type
                raise ValueError(line)

    def get_dom(self, session):
        """Return a DOM for this record, building and caching it if needed."""
        if (self.dom):
            return self.dom
        else:
            # Turn SAX into DOM and cache
            s2dhandler.initState()
            self.saxify(session, s2dhandler)
            self.dom = s2dhandler.getRootNode()
            return self.dom

    def get_xml(self, session, events=[]):
        """Return XML for ``events``, or for the whole record (cached) when
        no events are given."""
        if (not events and self.xml):
            return self.xml
        else:
            # Turn SAX into XML and cache
            if not events:
                process = self.sax
            else:
                process = events
            s2xhandler.initState()
            self.saxify(session, s2xhandler, process)
            if not events:
                self.xml = s2xhandler.get_xmlString()
                return self.xml
            else:
                return s2xhandler.get_xmlString()

    def get_sax(self, session):
        """Return the stored SAX event list."""
        return self.sax

    def fetch_vector(self, session, index, summary=False):
        """Delegate to the index's store to fetch this record's vector."""
        return index.indexStore.fetch_vector(session, index, self, summary)
class MarcRecord(Record):
    """For dealing with Library MARC Records."""

    def __init__(self, data, xml="", docId=0, wordCount=0, byteCount=0):
        """Parse ``data`` as MARC.

        The original body read from undefined names (``doc``, ``session``),
        so construction always failed; the raw MARC is now taken from
        ``data``.
        """
        if hasattr(data, 'get_raw'):
            # NOTE(review): document-like input; no session is available
            # in this signature, so None is passed - confirm that get_raw
            # tolerates this.
            txt = data.get_raw(None)
        else:
            txt = data
        self.marc = MARC(txt)
        self.id = docId
        # Estimate number of words...
        display = str(self.marc)
        if not wordCount:
            wordCount = len(display.split()) - (len(display.split('\n')) * 2)
        self.wordCount = wordCount
        if byteCount:
            self.byteCount = byteCount
        else:
            self.byteCount = len(display)
        self.decoder = MARC8_to_Unicode()
        self.asciiRe = re.compile('([\x0e-\x1f]|[\x7b-\xff])')

    def process_xpath(self, session, xpath, maps={}):
        """Return values for a ``fldNNN`` or ``fldNNN/subfield`` path.

        ``xpath`` is a raw string or a ``[compiled, steps]`` pair.  Returns
        a list of values, or [] when the path is not of the expected form
        or the field is absent.
        """
        if (not isinstance(xpath, list)):
            # Raw XPath.  The original referenced an undefined result of a
            # commented-out verifier call; parse it the same way
            # SaxRecord.process_xpath does.
            try:
                xpath = parseOldXPath(xpath)
            except NotImplementedError:
                return []
            if not xpath[1]:
                return []
        xp = xpath[1]
        # format: fldNNN/a
        try:
            fld = int(xp[0][1][3:])
        except ValueError:
            # not a NNN not an int
            return []
        if fld in self.marc.fields:
            data = self.marc.fields[fld]
        else:
            return []
        if len(xp) > 1:
            subfield = xp[1][1]
        else:
            subfield = ""

        vals = []
        if fld in [0, 1]:
            vals = data
        else:
            for d in data:
                if fld == 8 and not subfield:
                    # Whole fixed-length field.  This case used to sit
                    # unreachable behind the generic no-subfield branch.
                    vals.append(d)
                elif not subfield:
                    vals.append(' '.join([x[1] for x in d[2]]))
                elif subfield == 'ind1':
                    vals.append(d[0])
                elif subfield == 'ind2':
                    vals.append(d[1])
                elif fld == 8:
                    # Named slices of the fixed-length field
                    if subfield == 'lang':
                        vals.append(d[35:38])
                    elif subfield == 'date':
                        vals.append(d[:6])
                    elif subfield == 'pubStatus':
                        vals.append(d[6])
                    elif subfield == 'date1':
                        vals.append(d[7:11])
                    elif subfield == 'date2':
                        vals.append(d[11:15])
                    elif subfield == 'pubPlace':
                        vals.append(d[15:18])
                else:
                    for x in d[2]:
                        try:
                            if x[0] == subfield:
                                vals.append(x[1])
                        except Exception:
                            # broken
                            pass

        nvals = []
        for v in vals:
            try:
                nvals.append(v.decode('utf-8'))
            except Exception:
                # Not UTF-8: try the MARC-8 decoder next
                try:
                    convtd = self.decoder.translate(v)
                    nvals.append(unicodedata.normalize('NFC', convtd))
                except Exception:
                    # strip out any totally @^%(ed characters
                    v = self.asciiRe.sub('?', v)
                    nvals.append(v)
        return nvals

    def get_dom(self, session):
        raise(NotImplementedError)

    def get_sax(self, session):
        raise(NotImplementedError)

    def get_xml(self, session):
        """Return the record serialized by ``MARC.toMARCXML()``."""
        return self.marc.toMARCXML()

    def fetch_vector(self, session, index, summary=False):
        """Delegate to the index's store to fetch this record's vector."""
        return index.indexStore.fetch_vector(session, index, self, summary)
# Module-level handler singletons, reset with initState() before each use by
# SaxRecord.get_dom and SaxRecord.get_xml.
s2dhandler = SaxToDomHandler()
s2xhandler = SaxToXmlHandler()