# Source code for cheshire3.transformer

import bz2
import os.path
import pickle
import re
import time
import types

from lxml import etree

from cheshire3.configParser import C3Object
from cheshire3.baseObjects import Transformer, Record
from cheshire3.document import StringDocument
from cheshire3.utils import nonTextToken
from cheshire3.marc_utils import MARC
from cheshire3.exceptions import ConfigFileException


class FilepathTransformer(Transformer):
    """Emit the Record's id as an ``identifier`` element in raw SAX events.

    Intended for use as the inTransformer of a recordStore.
    """

    def process_record(self, session, rec):
        """Return a StringDocument holding the SAX events for ``rec.id``."""
        # start-element, character-data and end-element events, in order
        events = (
            '1 identifier {}',
            '3 ' + str(rec.id),
            '2 identifier',
        )
        return StringDocument(nonTextToken.join(events))


# Simplest transformation ...
class XmlTransformer(Transformer):
    """Return a Document containing the raw XML string of the record."""

    def process_record(self, session, rec):
        """Wrap the result of ``rec.get_xml(session)`` in a StringDocument."""
        return StringDocument(rec.get_xml(session))


class Bzip2XmlTransformer(Transformer):
    """Return a Document containing the raw XML string of the record,
    compressed using the bzip2 algorithm.
    """

    def process_record(self, session, rec):
        """Compress the Record's XML with bz2 and return it as a Document.

        The second argument passed to StringDocument is this transformer's
        own id.
        """
        # NOTE(review): bz2.compress needs a byte string; this presumably
        # relies on get_xml() returning encoded bytes -- confirm.
        data = rec.get_xml(session)
        bzdata = bz2.compress(data)
        return StringDocument(bzdata, self.id)
    

class SaxTransformer(Transformer):
    """Return a Document containing the Record's SAX events.

    The events are UTF-8 encoded, followed by one extra "9 " event that
    carries the pickled ``elementHash`` of the Record, all joined with
    ``nonTextToken``.
    """

    def process_record(self, session, rec):
        """Serialize the Record's SAX events and elementHash to a Document."""
        sax = [x.encode('utf8') for x in rec.get_sax(session)]
        # pickle is imported at module level; without that import this line
        # raised NameError.
        sax.append("9 " + pickle.dumps(rec.elementHash))
        data = nonTextToken.join(sax)
        return StringDocument(data)


class WorkflowTransformer(Transformer):
    """Transformer to execute a workflow."""

    def __init__(self, session, config, parent):
        """Resolve the configured 'workflow' path once, at construction."""
        Transformer.__init__(self, session, config, parent)
        self.workflow = self.get_path(session, 'workflow')

    def process_record(self, session, record):
        """Apply Workflow to the Record, return the resulting Document.

        A string result is wrapped in a StringDocument, and a Record result
        is serialized with ``get_xml`` and wrapped likewise. Any other
        result is returned unchanged.
        """
        output = self.workflow.process(session, record)
        if isinstance(output, basestring):
            output = StringDocument(output)
        elif isinstance(output, Record):
            output = StringDocument(output.get_xml(session))

        return output

# --- XSLT Transformers ---


def myTimeFn(dummy):
    """Return the current UTC time formatted as ``YYYY-MM-DDTHH:MM:SSZ``.

    The trailing ``Z`` designates UTC, so the timestamp is built from
    ``time.gmtime()`` rather than the local time zone.

    ``dummy`` is unused; it presumably receives the XPath evaluation
    context that lxml passes to extension functions -- confirm.
    """
    # call as <xsl:value-of select="c3fn:now()"/>
    # with c3fn defined as http://www.cheshire3.org/ns/function/xsl/
    return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
 

class LxmlXsltTransformer(Transformer):
    """XSLT transformer using Lxml implementation. Requires LxmlRecord.

    The stylesheet is read from the 'xsltPath' path, resolved against
    'defaultPath' when it is relative. The optional 'parameter' setting is
    a whitespace separated list of ``name:value`` pairs; each value is
    passed to the stylesheet as a double-quoted XPath string literal.
    """

    _possiblePaths = {'xsltPath' : {'docs' : "Path to the XSLT file to use."}}

    _possibleSettings = {'parameter' : {'docs' : "Parameters to be passed to the transformer."}}

    def __init__(self, session, config, parent):
        """Parse the stylesheet and the stylesheet parameters.

        Raises ConfigFileException when no 'xsltPath' is configured.
        """
        Transformer.__init__(self, session, config, parent)
        xfrPath = self.get_path(session, "xsltPath")
        if xfrPath is None:
            raise ConfigFileException("Missing path 'xsltPath' for "
                                      "{0}.".format(self.id))

        if os.path.isabs(xfrPath):
            path = xfrPath
        else:
            dfp = self.get_path(session, "defaultPath")
            path = os.path.join(dfp, xfrPath)

        # Expose myTimeFn to stylesheets as c3fn:now()
        ns = etree.FunctionNamespace('http://www.cheshire3.org/ns/function/xsl/')
        ns['now'] = myTimeFn
        self.functionNamespace = ns
        self.parsedXslt = etree.parse(path)
        self.txr = etree.XSLT(self.parsedXslt)
        self.params = None
        parameter = self.get_setting(session, 'parameter', None)
        if parameter:
            self.params = {}
            # split() with no argument tolerates repeated/leading/trailing
            # whitespace; splitting on the first ':' only lets values
            # (e.g. URLs) contain colons themselves.
            for pair in parameter.split():
                (k, v) = pair.split(':', 1)
                # NOTE(review): a value containing '"' would yield an
                # invalid XPath literal; etree.XSLT.strparam may be safer.
                self.params[k] = '"%s"' % v

    def process_record(self, session, rec):
        """Apply the stylesheet to the Record's DOM; return a StringDocument."""
        dom = rec.get_dom(session)
        if (session.environment == 'apache'):
            # Rebuild the XSLT object for every call under apache
            # (presumably because it cannot be shared across threads --
            # confirm).
            self.txr = etree.XSLT(self.parsedXslt)

        if self.params:
            result = self.txr(dom, **self.params)
        else:
            result = self.txr(dom)
        return StringDocument(str(result))


class LxmlQueryTermHighlightingTransformer(Transformer):
    """Abstract base for Lxml based query term highlighting transformers.

    Reads the highlighting configuration and provides the helper that
    wraps a slice of element text in the highlight element.
    """

    _possibleSettings = {
        'highlightTag': {
            'docs' : 'Tag to indicate highlighted section (will be inserted into output document as: <highlightTag>blah blah</highlightTag>)'
        },
        'tagAttrList': {
            'docs': 'Space separated list of attribute name="value" pairs (will be inserted into output document as: <highlightTag name="value">blah blah</highlightTag>)'
        },
        'breakElementsList': {
            'docs': 'Space separated list of element names to break at when tagging Query Terms. This can be useful when a speedy response is more important than complete tagging.'
        },
    }

    def __init__(self, session, config, parent):
        """Read highlightTag, tagAttrList and breakElementsList settings."""
        Transformer.__init__(self, session, config, parent)
        configuredTag = self.get_setting(session, 'highlightTag', None)
        if configuredTag is not None:
            self.highlightTag = configuredTag
            self.attrs = {}
        else:
            # No tag configured: use the default tag plus the namespace
            # declaration attribute for its c3 prefix
            self.highlightTag = 'c3:highlight'
            self.attrs = {'xmlns:c3': "http://www.cheshire3.org/schemas/highlight/"}

        attrList = self.get_setting(session, 'tagAttrList', None)
        if attrList is not None:
            for pair in attrList.split(' '):
                parts = pair.split('=', 1)
                # drop the first and last character (the surrounding quotes)
                self.attrs[parts[0]] = parts[1][1:-1]

        rawBreaks = self.get_setting(session, 'breakElementsList', '')
        self.breakElements = rawBreaks.split(' ')

    def _insertHighlightElement(self, element, located, start, end):
        """Build a highlight element for ``[start:end]`` of an element's text.

        ``located`` names the attribute holding the text ('text' or 'tail').
        That attribute is truncated to the part before ``start``; the
        returned element carries the highlighted slice as its text and the
        remainder as its tail. The caller attaches it to the tree.
        """
        original = getattr(element, located)
        before = original[:start]
        setattr(element, located, before)
        marker = etree.Element(self.highlightTag)
        marker.attrib.update(self.attrs)
        marker.text = original[start:end]
        marker.tail = original[end:]
        return marker

class LxmlPositionQueryTermHighlightingTransformer(LxmlQueryTermHighlightingTransformer):
    """Highlight query term matches by word position.

    Word positions would come from the proximity information of the
    Record's resultSetItem.

    Note Well: this can be unreliable when used in conjunction with
    stoplists.
    """

    def __init__(self, session, config, parent):
        """Not implemented: constructing this class always raises."""
        raise NotImplementedError()


class LxmlOffsetQueryTermHighlightingTransformer(LxmlQueryTermHighlightingTransformer):
    """Use character offsets from Record's resultSetItem's proximity information to highlight query term matches."""

    def __init__(self, session, config, parent):
        """Set ``self.wordRe``, the regular expression used to find word ends.

        The regexp of the database's own 'RegexpFindOffsetTokenizer' is
        used when the database object can be fetched; otherwise a built-in
        default pattern is compiled.
        """
        LxmlQueryTermHighlightingTransformer.__init__(self, session, config, parent)
        try:
            # try to get database's own version of RegexpFindOffsetTokenizer in case config is non-default
            db = session.server.get_object(session, session.database)
        except Exception:
            # Best effort: any failure to fetch the database falls back to
            # the default pattern (KeyboardInterrupt/SystemExit are no
            # longer swallowed).
            self.wordRe = re.compile(u"""
              (?xu)                                            #verbose, unicode
              (?:
                [a-zA-Z0-9!#$%*/?|^{}`~&'+-=_]+@[0-9a-zA-Z.-]+ #email
               |(?:[\w+-]+)?[+-]/[+-]                          #alleles
               |\w+(?:-\w+)+(?:'(?:t|ll've|ll|ve|s|d've|d|re))?  #hypenated word (maybe 'xx on the end)
               |[$\xa3\xa5\u20AC]?[0-9]+(?:[.,:-][0-9]+)+[%]?  #date/num/money/time
               |[$\xa3\xa5\u20AC][0-9]+                        #single money
               |[0-9]+(?=[a-zA-Z]+)                            #split: 8am 1Million
               |[0-9]+%                                        #single percentage 
               |(?:[A-Z]\.)+[A-Z\.]                            #acronym
               |[oOd]'[a-zA-Z]+                                #o'clock, O'brien, d'Artagnan   
               |[a-zA-Z]+://[^\s]+                             #URI
               |\w+'(?:d've|d|t|ll've|ll|ve|s|re)              #don't, we've
               |(?:[hH]allowe'en|[mM]a'am|[Ii]'m|[fF]o'c's'le|[eE]'en|[sS]'pose)
               |[\w+]+                                         #basic words, including +
              )""")
        else:
            self.wordRe = db.get_object(session, 'RegexpFindOffsetTokenizer').regexp

    def process_record(self, session, rec):
        """Return the Record's XML with query term matches wrapped in tags.

        When the Record has no resultSetItem, or no proximity information,
        the DOM is serialized unchanged.
        """
        recDom = rec.get_dom(session)
        if (rec.resultSetItem is not None) and (rec.resultSetItem.proxInfo is not None) and (len(rec.resultSetItem.proxInfo) > 0):
            # munge proxInfo into more useable form: a de-duplicated set of
            # (nodeIdx, offset) pairs.
            # each item of proxInfo: [nodeIdx, wordIdx, offset, termId(?)] NB termId from spoke indexes so useless to us :(
            pairs = set((int(pi[0]), int(pi[2]))
                        for pig in rec.resultSetItem.proxInfo  # one group per query clause
                        for pi in pig)
            nodeIdxs = []
            wordOffsets = []
            # sort so that nodeIdxs (and offsets within a node) are descending (so that offsets don't get upset when modifying text)
            for nodeIdx, wordOffset in sorted(pairs, reverse=True):
                nodeIdxs.append(nodeIdx)
                wordOffsets.append(wordOffset)

            # Map each wanted node index to its XPath in one walk of the tree
            wantedIdxs = set(nodeIdxs)
            xps = {}
            tree = recDom.getroottree()
            for x, n in enumerate(recDom.iter()):
                if n.tag in self.breakElements:
                    # stop tagging entirely at the first break element
                    break
                if x in wantedIdxs:
                    xps[x] = tree.getpath(n)
            xpathfn = recDom.xpath
            for ni, offset in zip(nodeIdxs, wordOffsets):
                try:
                    xp = xps[ni]
                except KeyError:
                    continue # no XPath
                el = xpathfn(xp)[0]
                located = None
                # NOTE(review): iter() also yields comments and processing
                # instructions; they are not filtered out here.
                for c in el.iter():
                    if c.text:
                        text = c.text
                        if len(text) > offset:
                            start = offset
                            match = self.wordRe.search(text, start)
                            # no match: fall through to the tail check below
                            if match is not None:
                                located = 'text'
                                if not (c.tag == self.highlightTag):
                                    hel = self._insertHighlightElement(c, located, start, match.end())
                                    try:
                                        c.insert(0, hel)
                                    except TypeError:
                                        # immutable element (comment!?)
                                        break
                                break
                        else:
                            # adjust offset accordingly
                            offset -= len(text)
                    if c != el and c.tail and located is None:
                        text = c.tail
                        if len(text) > offset:
                            start = offset
                            match = self.wordRe.search(text, start)
                            if match is not None:
                                located = 'tail'
                                if not (c.tag == self.highlightTag):
                                    hel = self._insertHighlightElement(c, located, start, match.end())
                                    p = c.getparent()
                                    try:
                                        # place the highlight directly after c
                                        p.insert(p.index(c)+1, hel)
                                    except TypeError:
                                        # immutable element (comment!?)
                                        break
                                break
                        else:
                            # adjust offset accordingly
                            offset -= len(text)
        return StringDocument(etree.tostring(recDom))


class TemplatedTransformer(Transformer):
    """Transformer to insert the output of a Selector into a template string containing place-holders.
    
    Template can be specified directly in the configuration using the 
    template setting (whitespace is respected), or in a file using the 
    templatePath path. If the template is specified in the configuration, 
    XML reserved characters (<, >, & etc.) must be escaped.
    
    This can be useful for Record types that are not easily transformed using
    more standard mechanism (e.g. XSLT), a prime example being GraphRecords
    
    Example
    
    config:
    
    <subConfig type="transformer" id="myTemplatedTransformer">
        <objectType>cheshire3.transformer.TemplatedTransformer</objectType>
        <paths>
            <object type="selector" ref="mySelector"/>
            <object type="extractor" ref="SimpleExtractor"/>
        </paths>
        <options>
            <setting type="template">
                This is my document. The title is {0}. The author is {1}
            </setting>
        </options>
    </subConfig>
    
    selector config:
    
    <subConfig type="selector" id="mySelector">
        <objectType>cheshire3.selector.XpathSelector</objectType>
        <source>
            <location type="xpath">//title</location>
            <location type="xpath">//author</location>
        </source>
    </subConfig>
    
    """

    _possiblePaths = {'selector': {'docs': "Selector to use to get data from the record."},
                      'extractor': {'docs': "An Extractor to use on each data item returned by the Selector. The Extractor used must be able to handle the output from the Selector (e.g. A SPARQL Selector would require an RDF Extractor). Default is SimpleExtractor"},
                      'templatePath': {'docs': "Path to the file containing the template for the output Document with place-holders for the selected data items."}
                     }

    _possibleSettings = {'template': {'docs': "A string representing the template for the output Document with place-holders for selected data items."}}

    def __init__(self, session, config, parent):
        """Resolve the selector and extractor and load the template.

        The template is read from 'templatePath' (joined onto
        'defaultPath') when that path is configured, otherwise taken from
        the 'template' setting.

        Raises ConfigFileException when neither is provided.
        """
        Transformer.__init__(self, session, config, parent)
        self.selector = self.get_path(session, 'selector')
        # NOTE(review): the path docs promise a SimpleExtractor default but
        # none is applied here -- confirm get_path supplies one.
        self.extractor = self.get_path(session, 'extractor')
        tmplPath = self.get_path(session, "templatePath")
        if tmplPath is not None:
            dfp = self.get_path(session, "defaultPath")
            path = os.path.join(dfp, tmplPath)
            with open(path, 'r') as fh:
                self.template = unicode(fh.read())
        else:
            tmpl = self.get_setting(session, 'template', '')
            if not tmpl:
                raise ConfigFileException("{0} requires either a "
                                          "'templatePath' path or a "
                                          "'template' setting."
                                          "".format(self.id))
            self.template = unicode(tmpl)

    def process_record(self, session, rec):
        """Fill the template with data selected from the Record.

        Each location returned by the selector supplies one positional
        template argument: a list with one extracted value per match. For
        tuple matches the entry is itself a list, with None kept for None
        items.

        Raises ConfigFileException when the template has a place-holder
        for which no data was selected.
        """
        process_eventList = self.extractor.process_eventList
        process_string = self.extractor.process_string
        process_node = self.extractor.process_node
        data = self.selector.process_record(session, rec)
        vals = []
        for location in data:
            vals2 = []
            for match in location:
                # isinstance (not an exact type comparison) so that
                # subclasses such as lxml's string results from XPath take
                # the string branch rather than falling through to the
                # DOM-node branch.
                if isinstance(match, list):
                    # SAX event
                    vals2.append(process_eventList(session, match).keys()[0])
                elif isinstance(match, (basestring, int, long, float)):
                    # Attribute content or function result (e.g. count())
                    # (bool is covered as a subclass of int)
                    vals2.append(process_string(session, match).keys()[0])
                elif isinstance(match, tuple):
                    # RDF graph results (?)
                    vals3 = []
                    for item in match:
                        if item is not None:
                            vals3.append(process_node(session, item).keys()[0])
                        else:
                            vals3.append(None)
                    vals2.append(vals3)
                else:
                    # DOM nodes
                    vals2.append(process_node(session, match).keys()[0])
            vals.append(vals2)
        tmpl = self.template
        try:
            return StringDocument(tmpl.format(*vals))
        except IndexError:
            # Log the values and the template to help diagnose the
            # mismatch; a session without a logger is tolerated.
            try:
                session.logger.log_error(session, repr(vals))
                session.logger.log_error(session, tmpl)
            except AttributeError:
                pass
            raise ConfigFileException('Template contained a place-holder for which data was not selected by the selector.')


class MarcTransformer(Transformer):
    """Transformer to convert records in marc21xml to marc records."""

    def __init__(self, session, config, parent):
        """Initialize the transformer and keep a reference to the session."""
        Transformer.__init__(self, session, config, parent)
        self.session = session

    def _process_tagName(self, tagname):
        """Return ``tagname`` as an int, ignoring leading zeros.

        Returns None when ``tagname`` is empty or consists only of zeros.
        """
        stripped = tagname.lstrip('0')
        if stripped:
            return int(stripped)
        return None

    @staticmethod
    def _iter_elements(node, tagname):
        """Iterate over descendants of ``node`` named ``tagname``.

        Tries ``getiterator`` first and falls back to ``iter`` when the
        node has no such method.
        """
        try:
            return node.getiterator(tagname)
        except AttributeError:
            # lxml 1.3 or later
            return node.iter(tagname)

    def process_record(self, session, rec):
        """Build a MARC object from the Record's DOM; return it as a Document.

        Control field text and data field ``(ind1, ind2, subfields)``
        tuples are collected per numeric tag. Entry 0 is then set from
        slices [5:10] and [17:20] of the text of the first ``leader``
        element.
        """
        fields = {}
        tree = rec.get_dom(session)
        for element in self._iter_elements(tree, "controlfield"):
            tag = self._process_tagName(element.get('tag'))
            fields.setdefault(tag, []).append(element.text)

        for element in self._iter_elements(tree, "datafield"):
            tag = self._process_tagName(element.get('tag'))
            # The fallback used to assign to `walker`, leaving `children`
            # undefined (or stale from the previous datafield).
            children = self._iter_elements(element, 'subfield')
            subelements = [(c.get('code'), c.text) for c in children]
            contents = (element.get('ind1'), element.get('ind2'), subelements)
            fields.setdefault(tag, []).append(contents)

        leader = tree.xpath('//leader')[0]
        leaderText = leader.text
        fields[0] = [''.join([leaderText[5:10], leaderText[17:20]])]
        marcObject = MARC()
        marcObject.fields = fields
        return StringDocument(marcObject.get_MARC())