# Source code for cheshire3.selector

"""Cheshire3 Selector Implementations.

possible location types:  'xpath', 'attribute', 'function', 'sparql' (in graph)
"""

import time

from lxml import etree

from cheshire3.baseObjects import Selector
from cheshire3.record import LxmlRecord
from cheshire3.exceptions import ConfigFileException
from cheshire3.internal import CONFIG_NS
from cheshire3.utils import getFirstData, elementType


class SimpleSelector(Selector):
    """Base Selector that collects location settings from ``<source>`` nodes.

    Every ``<source>`` element handled by this class contributes one list to
    ``self.sources``.  Each ``<xpath>`` / ``<location>`` child of that source
    contributes one dictionary to the list, with at least these keys:

    ``'string'``
        the text content of the location element
    ``'type'``
        the location type (``'xpath'`` for ``<xpath>`` elements, otherwise
        the lower-cased ``type`` attribute)
    ``'maps'``
        mapping of namespace prefix to namespace URI

    Any other attribute found on the element is copied into the dictionary
    under its own name.
    """

    # Keys of the location dictionary that attributes must never overwrite.
    _RESERVED_KEYS = ('type', 'maps', 'string')

    def __init__(self, session, config, parent):
        # self.sources has to exist before the base initializer runs.
        # NOTE(review): presumably Selector.__init__ parses the config and
        # invokes the _handle*ConfigNode hooks, which append to it - confirm.
        self.sources = []
        Selector.__init__(self, session, config, parent)

    def _handleLocationNode(self, session, child):
        """Build a location dictionary from a DOM xpath/location node.

        Returns the dictionary described in the class docstring.

        Raises ConfigFileException if the ``type`` attribute of a
        non-``xpath`` element cannot be read.
        """
        data = {'maps': {}, 'string': getFirstData(child), 'type': ''}

        if child.localName == 'xpath':
            data['type'] = 'xpath'
        else:
            try:
                data['type'] = child.getAttribute('type').lower()
            except Exception:
                # Narrowed from a bare except so that KeyboardInterrupt and
                # SystemExit are no longer turned into a config error.
                raise ConfigFileException("Location element in {0} must have "
                                          "'type' attribute".format(self.id))

        if data['type'] == 'xpath':
            for a in child.attributes.keys():
                # ConfigStore using 4Suite: keys may be tuples, in which
                # case the attribute node carries the usable name.
                if isinstance(a, tuple):
                    a = child.attributes[a].name
                if a.startswith("xmlns:"):
                    # Namespace declaration: record prefix -> URI
                    pref = a[6:]
                    uri = child.getAttributeNS('http://www.w3.org/2000/xmlns/',
                                               pref)
                    if not uri:
                        uri = child.getAttribute(a)
                    data['maps'][pref] = uri
                elif a not in self._RESERVED_KEYS:
                    # Same guard as the lxml handler: never let an attribute
                    # clobber the computed 'type', 'maps' or 'string' values.
                    data[a] = child.getAttributeNS(None, a)
        return data

    def _handleLxmlLocationNode(self, session, child):
        """Build a location dictionary from an lxml xpath/location element.

        Returns the dictionary described in the class docstring.

        Raises ConfigFileException if a non-``xpath`` element has no
        ``type`` attribute.
        """
        data = {'maps': {}, 'string': child.text, 'type': ''}

        if child.tag in ('xpath', '{%s}xpath' % CONFIG_NS):
            data['type'] = 'xpath'
        else:
            try:
                data['type'] = child.attrib['type'].lower()
            except KeyError:
                raise ConfigFileException("Location element in {0} must have "
                                          "'type' attribute".format(self.id))

        if data['type'] in ('xpath', 'sparql'):
            # Copy every prefixed namespace in scope; the default namespace
            # (prefix None) is skipped.
            for prefix, uri in child.nsmap.items():
                if prefix is not None:
                    data['maps'][prefix] = uri

        for name in child.attrib:
            if name not in self._RESERVED_KEYS:
                # Look up by the loop variable (was the literal string 'a',
                # which raised KeyError or copied the wrong attribute).
                data[name] = child.attrib[name]
        return data

    def _handleConfigNode(self, session, node):
        """Collect the locations of a DOM ``<source>`` node.

        Appends one list of location dictionaries to ``self.sources``.
        Nodes with any other local name are ignored.
        """
        if node.localName == "source":
            paths = []
            for child in node.childNodes:
                if (child.nodeType == elementType and
                        child.localName in ("xpath", "location")):
                    paths.append(self._handleLocationNode(session, child))
            self.sources.append(paths)

    def _handleLxmlConfigNode(self, session, node):
        """Collect the locations of an lxml ``<source>`` element.

        Appends one list of location dictionaries to ``self.sources``.
        Elements with any other tag are ignored.
        """
        if node.tag in ("source", '{%s}source' % CONFIG_NS):
            locationTags = ("xpath", '{%s}xpath' % CONFIG_NS,
                            "location", '{%s}location' % CONFIG_NS)
            xpaths = []
            for child in node.iterchildren(tag=etree.Element):
                if child.tag in locationTags:
                    xpaths.append(self._handleLxmlLocationNode(session, child))
            self.sources.append(xpaths)


class TransformerSelector(SimpleSelector):
    u"""Selector that applies a Transformer to the Record to select data."""

    def __init__(self, session, config, parent):
        SimpleSelector.__init__(self, session, config, parent)
        # The Transformer to apply is looked up under the 'transformer' path.
        self.transformer = self.get_path(session, 'transformer')

    def process_record(self, session, record):
        u"""Apply Transformer to the Record, return the resulting data.

        The text of the transformer's output is returned wrapped in a
        nested list (``[[text]]``).  The text is decoded from UTF-8 when
        possible; if decoding fails for any reason it is returned as-is.
        """
        doc = self.transformer.process_record(session, record)
        text = doc.text
        try:
            return [[text.decode('utf-8')]]
        except Exception:
            # Best-effort decode: fall back to the undecoded text.  Narrowed
            # from a bare except so KeyboardInterrupt / SystemExit propagate.
            return [[text]]


class MetadataSelector(SimpleSelector):
    u"""Selector specifying an attribute or function.

    Selector that specifies an attribute or function to use to select data
    from Records.
    """

    def process_record(self, session, record):
        u"""Extract the attribute, or run the specified function, return data.

        Walks every configured location in order and returns a list holding
        one entry per selected value:

        * type ``'xpath'`` (old style) - ``[value]`` if the record has an
          attribute of that name, ``[timestamp]`` if the name is ``'now'``,
          otherwise ``None``.
        * type ``'attribute'`` - ``[value]`` if the record has the attribute;
          nothing is added when it does not.
        * type ``'function'`` - ``[timestamp]`` for ``'now'`` / ``'now()'``.

        Raises ConfigFileException for an unknown function name or an
        unknown location type.
        """
        timestampFormat = "%Y-%m-%d %H:%M:%S"
        selected = []
        for source in self.sources:
            # Each source is a list of location dictionaries
            for location in source:
                name = location['string']
                kind = location['type']
                if kind == 'function':
                    if name not in ('now', 'now()'):
                        # No other functions are defined
                        raise ConfigFileException("Unknown function: %s"
                                                  % name)
                    selected.append([time.strftime(timestampFormat)])
                elif kind in ('xpath', 'attribute'):
                    # Check name against record metadata
                    if hasattr(record, name):
                        selected.append([getattr(record, name)])
                    elif kind == 'xpath':
                        # Old style handling only
                        if name == 'now':
                            # e.g. for lastModified/created etc
                            selected.append([time.strftime(timestampFormat)])
                        else:
                            selected.append(None)
                else:
                    raise ConfigFileException("Unknown metadata selector "
                                              "type: %s" % kind)
        return selected


class XPathSelector(SimpleSelector):
    u"""Selects data specified by XPath(s) from Records."""

    def __init__(self, session, config, parent):
        self.sources = []
        SimpleSelector.__init__(self, session, config, parent)

    def process_record(self, session, record):
        u"""Select and return data from elements matching configured XPaths.

        Returns a list with one entry per configured XPath, in configuration
        order; each entry is whatever ``record.process_xpath`` returns for
        that XPath and its namespace mappings.

        Raises TypeError if ``record`` is not an LxmlRecord.
        """
        if not isinstance(record, LxmlRecord):
            # The message previously left its '{0}' placeholder unformatted.
            raise TypeError("XPathSelector '{0}' only supports selection from "
                            "LxmlRecords".format(self.id))
        vals = []
        for src in self.sources:
            # Each source is a list of location dictionaries
            for xp in src:
                vals.append(record.process_xpath(session,
                                                 xp['string'],
                                                 xp['maps']))
        return vals


class SpanXPathSelector(SimpleSelector):
    u"""Selects data from between two given XPaths.

    Requires exactly two XPaths. The span starts at first configured XPath
    and ends at the second. The same XPath may be given as both start and
    end point, in which case each matching element acts as a start and stop
    point (e.g. an XPath for a page break).
    """

    def __init__(self, session, config, parent):
        self.sources = []
        SimpleSelector.__init__(self, session, config, parent)
        try:
            firstSource = self.sources[0]
        except IndexError:
            raise ConfigFileException("SpanXPathSelector '{0}' requires "
                                      "exactly 1 <source>".format(self.id))
        if len(firstSource) != 2:
            raise ConfigFileException("SpanXPathSelector '{0}' requires "
                                      "exactly two XPaths".format(self.id))

    def process_record(self, session, record):
        u"""Return a list of (start element, end element) pairs.

        Elements are visited in ``tree.iter()`` order.  Paths that do not
        start with ``/`` are treated as relative and prefixed with ``//``.
        """

        def anchored(path):
            # Not an absolute path -> prepend //
            return path if path.startswith('/') else '//{0}'.format(path)

        startConf = self.sources[0][0]
        beginPath = anchored(startConf['string'])
        beginMaps = startConf['maps']
        endConf = self.sources[0][1]
        finishPath = anchored(endConf['string'])
        finishMaps = endConf['maps']

        if isinstance(record, LxmlRecord):
            # Avoid unnecessary re-parsing
            root = record.get_dom(session)
        else:
            # Parse to an lxml.etree
            root = etree.fromstring(record.get_xml(session))

        # Find all of the start nodes
        openers = root.xpath(beginPath, namespaces=beginMaps)
        spans = []
        # Start element of the span currently being built, if any.
        # Compared with "is None" throughout: childless lxml elements are
        # falsy, so truthiness must not be used here.
        pending = None

        if beginPath == finishPath:
            # Start path and end path are the same, treat as break points:
            # every match closes the span opened by the previous match and
            # immediately opens the next one.
            for node in root.iter():
                if node not in openers:
                    continue
                if pending is not None:
                    spans.append((pending, node))
                pending = node
        else:
            # Start path and end path are different
            #
            # N.B. this algorithm is non-greedy.
            # The shortest span is always selected. If another start node is
            # hit before an end node occurs it will overwrite the first
            #
            # N.B. developers: this works slightly differently from the
            # previous SAX base version which treated the end of the record
            # as an end tag, this does not
            #
            # Find all the end nodes
            closers = root.xpath(finishPath, namespaces=finishMaps)
            for node in root.iter():
                if node in openers:
                    # A start node always (re)starts the current span
                    pending = node
                elif node in closers and pending is not None:
                    # An end node only counts once a span has been opened
                    spans.append((pending, node))
                    pending = None
        return spans