Source code for cheshire3.transformer

import os.path
import types
import time
import re
import bz2

from lxml import etree

from cheshire3.configParser import C3Object
from cheshire3.baseObjects import Transformer, Record, Database, Server
from cheshire3.document import StringDocument
from cheshire3.utils import nonTextToken
from cheshire3.marc_utils import MARC
from cheshire3.exceptions import ConfigFileException


class FilepathTransformer(Transformer):
    """Returns record.id as an identifier, in raw SAX events.

    For use as the inTransformer of a recordStore.
    """
    def process_record(self, session, rec):
        sax = ['1 identifier {}', '3 ' + str(rec.id), '2 identifier']
        data = nonTextToken.join(sax)
        return StringDocument(data)


# Simplest transformation ...
class XmlTransformer(Transformer):
[docs] """ Return a Document containing the raw XML string of the record """ def process_record(self, session, rec): return StringDocument(rec.get_xml(session)) class Bzip2XmlTransformer(Transformer):
[docs] """Return a Document containing bzip2 compressed XML. Return a Document containing the raw XML string of the record, compressed using the bzip2 algorithm. """ def process_record(self, session, rec): data = rec.get_xml(session) bzdata = bz2.compress(data) return StringDocument(bzdata, self.id) class SaxTransformer(Transformer):
[docs] def process_record(self, session, rec): sax = [x.encode('utf8') for x in rec.get_sax(session)] sax.append("9 " + pickle.dumps(rec.elementHash)) data = nonTextToken.join(sax) return StringDocument(data) class WorkflowTransformer(Transformer):
[docs] """Transformer to execute a workflow.""" def __init__(self, session, config, parent): Transformer.__init__(self, session, config, parent) self.workflow = self.get_path(session, 'workflow') def process_record(self, session, record): u"""Apply Workflow to the Record, return the resulting Document.""" output = self.workflow.process(session, record) if isinstance(output, basestring): output = StringDocument(output) elif isinstance(output, Record): output = StringDocument(output.get_xml(session)) return output class ComponentParentFetchingTransformer(Transformer):
"""Given a Cheshire3 component, fetch and return the parent Document. Given a Cheshire3 component Record, fetch and return the data for its parent in a new Document. """ def process_record(self, session, record): # Get RecordStore and identifier of parent record try: parentId = record.process_xpath(session, '/c3component/@parent')[0] except IndexError: parentId = record.process_xpath( session, '/c3:component/@c3:parent', maps={'c3': "http://www.cheshire3.org/schemas/component/"} )[0] recStoreId, parentId = parentId.split('/', 1) # Get RecordStore object if isinstance(self.parent, Database): db = self.parent elif isinstance(self.parent, Server) and session.database: db = self.parent.get_object(session, session.database) elif ( session.server and isinstance(session.server, Server) and session.database ): db = session.server.get_object(session, session.database) elif not session.server: raise ValueError("No session.server") else: raise ValueError("No session.database") recStore = db.get_object(session, recStoreId) # Fetch parent record parentRec = recStore.fetch_record(session, parentId) # Return a new Document with parent data and identifier data = parentRec.get_xml(session) doc = StringDocument( data, self.id, byteCount=len(data), byteOffset=0 ) doc.id = parentId return doc # --- XSLT Transformers --- def myTimeFn(dummy): # call as <xsl:value-of select="c3fn:now()"/> # with c3fn defined as http://www.cheshire3.org/ns/function/xsl/ return time.strftime("%Y-%m-%dT%H:%M:%SZ") class LxmlXsltTransformer(Transformer):
[docs] """XSLT transformer using Lxml implementation. Requires LxmlRecord. Use Record's resultSetItem's proximity information to highlight query term matches. """ _possiblePaths = { 'xsltPath': { 'docs': "Path to the XSLT file to use." } } _possibleSettings = { 'parameter': { 'docs': "Parameters to be passed to the transformer." } } def __init__(self, session, config, parent): Transformer.__init__(self, session, config, parent) xfrPath = self.get_path(session, "xsltPath") if xfrPath is None: raise ConfigFileException("Missing path 'xsltPath' for " "{0}.".format(self.id)) if os.path.isabs(xfrPath): path = xfrPath else: dfp = self.get_path(session, "defaultPath") path = os.path.join(dfp, xfrPath) ns = etree.FunctionNamespace( 'http://www.cheshire3.org/ns/function/xsl/' ) ns['now'] = myTimeFn self.functionNamespace = ns self.parsedXslt = etree.parse(path) self.txr = etree.XSLT(self.parsedXslt) self.params = None parameter = self.get_setting(session, 'parameter', None) if (parameter): self.params = {} kv = parameter.split(' ') for pair in kv: (k, v) = pair.split(':') self.params[k] = '"%s"' % v def process_record(self, session, rec): # return StringDocument dom = rec.get_dom(session) if (session.environment == 'apache'): self.txr = etree.XSLT(self.parsedXslt) if self.params: result = self.txr(dom, **self.params) else: result = self.txr(dom) return StringDocument(str(result)) class LxmlQueryTermHighlightingTransformer(Transformer):
"Abstract Class for query term highlighting Transformers for LxmlRecords." HIGHLIGHT_NS = "http://www.cheshire3.org/schemas/highlight/" _possibleSettings = { 'highlightTag': { 'docs': ("Tag to indicate highlighted section (will be inserted " "into output document as: " "<highlightTag>blah blah</highlightTag>)") }, 'tagAttrList': { 'docs': ('Space separated list of attribute name="value" pairs ' '(will be inserted into output document as: ' '<highlightTag name="value">blah blah</highlightTag>)') }, 'breakElementsList': { 'docs': ('Space separated list of element names to break at when ' 'tagging Query Terms. This can be useful when a speedy ' 'response is more important than complete tagging.') } } def __init__(self, session, config, parent): Transformer.__init__(self, session, config, parent) htag = self.get_setting(session, 'highlightTag', None) if htag is None: self.highlightTag = 'c3:highlight' self.attrs = {'xmlns:c3': self.HIGHLIGHT_NS} else: self.highlightTag = htag self.attrs = {} tagAttrs = self.get_setting(session, 'tagAttrList', None) if tagAttrs is not None: for attr in tagAttrs.split(' '): bits = attr.split('=', 1) k = bits[0] v = bits[1][1:-1] # strip off "s self.attrs[k] = v self.breakElements = self.get_setting(session, 'breakElementsList', '').split(' ') def _insertHighlightElement(self, element, located, start, end): text = getattr(element, located) setattr(element, located, text[:start]) hel = etree.Element(self.highlightTag) hel.attrib.update(self.attrs) hel.text = text[start:end] hel.tail = text[end:] return hel LxmlHighlighTxr = LxmlQueryTermHighlightingTransformer class LxmlPositionQueryTermHighlightingTransformer(LxmlHighlighTxr): """Return Document with search hits higlighted based on word position. Use word position from Record's resultSetItem's proximity information to highlight query term matches. Note Well: this can be unreliable when used in conjunction with stoplists. """ def __init__(self, session, config, parent): raise NotImplementedError class LxmlOffsetQueryTermHighlightingTransformer(LxmlHighlighTxr):
[docs] """Return Document with search hits higlighted based on character offsets. Use character offsets from Record's resultSetItem's proximity information to highlight query term matches. """ def __init__(self, session, config, parent): LxmlHighlighTxr.__init__(self, session, config, parent) try: # Try to get database's own version of RegexpFindOffsetTokenizer in # case config is non-default db = session.server.get_object(session, session.database) except: self.wordRe = re.compile(u""" (?xu) #verbose, unicode (?: [a-zA-Z0-9!#$%*/?|^{}`~&'+-=_]+@[0-9a-zA-Z.-]+ #email |(?:[\w+-]+)?[+-]/[+-] #alleles #hypenated word (maybe 'xx on the end) |\w+(?:-\w+)+(?:'(?:t|ll've|ll|ve|s|d've|d|re))? #date/num/money/time |[$\xa3\xa5\u20AC]?[0-9]+(?:[.,:-][0-9]+)+[%]? |[$\xa3\xa5\u20AC][0-9]+ #single money #split: 8am 1Million |[0-9]+(?=[a-zA-Z]+) #single percentage |[0-9]+% |(?:[A-Z]\.)+[A-Z\.] #acronym #o'clock, O'brien, d'Artagnan |[oOd]'[a-zA-Z]+ |[a-zA-Z]+://[^\s]+ #URI |\w+'(?:d've|d|t|ll've|ll|ve|s|re) #don't, we've |(?:[hH]allowe'en|[mM]a'am|[Ii]'m|[fF]o'c's'le|[eE]'en|[sS]'pose) #basic words, including + |[\w+]+ )""") else: self.wordRe = db.get_object(session, 'RegexpFindOffsetTokenizer').regexp def process_record(self, session, rec): recDom = rec.get_dom(session) if ( (rec.resultSetItem is not None) and (rec.resultSetItem.proxInfo is not None) and (len(rec.resultSetItem.proxInfo) > 0) ): # munge proxInfo into more useable form proxInfo = rec.resultSetItem.proxInfo proxInfo2 = set() # for each group of proxInfo (i.e. from each query clause) for pig in proxInfo: # for each item of proxInfo: # [nodeIdx, wordIdx, offset, termId(?)] for pi in pig: # values must be strings for sets to work proxInfo2.add('%d %d' % (pi[0], pi[2])) proxInfo = [map(int, pis.split(' ')) for pis in proxInfo2] nodeIdxs = [] wordOffsets = [] # sort proxInfo so that nodeIdxs are sorted descending (so that # offsets don't get upset when modifying text) for x in sorted(proxInfo, reverse=True): nodeIdxs.append(x[0]) wordOffsets.append(x[1]) xps = {} tree = recDom.getroottree() walker = recDom.getiterator() for x, n in enumerate(walker): if n.tag in self.breakElements: break if x in nodeIdxs: xps[x] = tree.getpath(n) xpathfn = recDom.xpath for ni, offset in zip(nodeIdxs, wordOffsets): try: xp = xps[ni] except KeyError: # No XPath continue el = xpathfn(xp)[0] located = None for ci, c in enumerate(el.iter()): # Ignore comments processing instructions etc. if c.text: text = c.text if len(c.text) > offset: start = offset try: end = self.wordRe.search(text, start).end() except: # Well I still... # haven't found... # what I'm looking for! pass else: located = 'text' if not (c.tag == self.highlightTag): hel = self._insertHighlightElement(c, located, start, end) try: c.insert(0, hel) except TypeError: # Immutable element (?) break break else: # Adjust offset accordingly offset -= len(text) if c != el and c.tail and located is None: text = c.tail if len(c.tail) > offset: start = offset try: end = self.wordRe.search(text, start).end() except: # Well I still... # haven't found... # what I'm looking for! pass else: if end == -1: end = len(text) located = 'tail' if not (c.tag == self.highlightTag): hel = self._insertHighlightElement(c, located, start, end) p = c.getparent() try: p.insert(p.index(c) + 1, hel) except TypeError: # Immutable element (?) break break else: # Adjust offset accordingly offset -= len(text) return StringDocument(etree.tostring(recDom)) class TemplatedTransformer(Transformer):
[docs] """Trasnform a Record using a Selector and a Python string.Template. Transformer to insert the output of a Selector into a template string containing place-holders. Template can be specified directly in the configuration using the template setting (whitespace is respected), or in a file using the templatePath path. If the template is specified in the configuration, XML reserved characters (<, >, & etc.) must be escaped. This can be useful for Record types that are not easily transformed using more standard mechanism (e.g. XSLT), a prime example being GraphRecords Example config: <subConfig type="transformer" id="myTemplatedTransformer"> <objectType>cheshire3.transformer.TemplatedTransformer</objectType> <paths> <object type="selector" ref="mySelector"/> <object type="extractor" ref="SimpleExtractor"/> </paths> <options> <setting type="template"> This is my document. The title is {0}. The author is {1} </setting> </options> </subConfig> selector config: <subConfig type="selector" id="mySelector"> <objectType>cheshire3.selector.XpathSelector</objectType> <source> <location type="xpath">//title</location> <location type="xpath">//author</location> </source> </subConfig> """ _possiblePaths = { 'selector': { 'docs': "Selector to use to get data from the record." }, 'extractor': { 'docs': ("An Extractor to use on each data item returned by the " "Selector. The Extractor used must be able to handle the " "output from the Selector (e.g. A SPARQL Selector would " "require an RDF Extractor). Default is SimpleExtractor") }, 'templatePath': { 'docs': ("Path to the file containing the template for the output " "Document with place-holders for the selected data items." ) } } _possibleSettings = { 'template': { 'docs': ("A string representing the template for the output " "Document with place-holders for selected data items.") } } def __init__(self, session, config, parent): Transformer.__init__(self, session, config, parent) self.selector = self.get_path(session, 'selector') self.extractor = self.get_path(session, 'extractor') tmplPath = self.get_path(session, "templatePath") if tmplPath is not None: dfp = self.get_path(session, "defaultPath") path = os.path.join(dfp, tmplPath) with open(path, 'r') as fh: self.template = unicode(fh.read()) else: tmpl = self.get_setting(session, 'template', '') if not tmpl: raise ConfigFileException("{0} requires either a " "'templatePath' path or a " "'template' setting." "".format(self.id)) self.template = unicode(tmpl) def process_record(self, session, rec): process_eventList = self.extractor.process_eventList process_string = self.extractor.process_string process_node = self.extractor.process_node data = self.selector.process_record(session, rec) vals = [] for location in data: vals2 = [] for match in location: if isinstance(match, types.ListType): # SAX event vals2.append(process_eventList(session, match).keys()[0]) elif ( type(match) in types.StringTypes or type(match) in [int, long, float, bool] ): # Attribute content or function result (e.g. count()) vals2.append(process_string(session, match).keys()[0]) elif isinstance(match, types.TupleType): # RDF graph results (?) vals3 = [] for item in match: if item is not None: vals3.append(process_node(session, item).keys()[0]) else: vals3.append(None) vals2.append(vals3) else: # DOM nodes vals2.append(process_node(session, match).keys()[0]) vals.append(vals2) tmpl = self.template try: return StringDocument(tmpl.format(*vals)) except IndexError as e: try: session.logger.log_error(session, repr(vals)) session.logger.log_error(session, tmpl) except AttributeError: pass raise ConfigFileException('Template contained a place-holder for ' 'which data was not selected by the ' 'selector.') class MarcTransformer(Transformer):
[docs] """Transformer to converts records in marc21xml to marc records.""" def __init__(self, session, config, parent): Transformer.__init__(self, session, config, parent) self.session = session def _process_tagName(self, tagname): for i, c in enumerate(tagname): if c != '0': return int(tagname[i:]) def process_record(self, session, rec): fields = {} tree = rec.get_dom(session) try: walker = tree.getiterator("controlfield") except AttributeError: # lxml 1.3 or later walker = tree.iter("controlfield") for element in walker: tag = self._process_tagName(element.get('tag')) contents = element.text if tag in fields: fields[tag].append(contents) else: fields[tag] = [contents] try: walker = tree.getiterator("datafield") except AttributeError: # lxml 1.3 or later walker = tree.iter("datafield") for element in walker: tag = self._process_tagName(element.get('tag')) try: children = element.getiterator('subfield') except AttributeError: # lxml 1.3 or later walker = element.iter('subfield') subelements = [(c.get('code'), c.text) for c in children] contents = (element.get('ind1'), element.get('ind2'), subelements) if tag in fields: fields[tag].append(contents) else: fields[tag] = [contents] leader = tree.xpath('//leader')[0] l = leader.text fields[0] = [''.join([l[5:10], l[17:20]])] marcObject = MARC() marcObject.fields = fields return StringDocument(marcObject.get_MARC())