# Source code for cheshire3.transformer
import os.path
import types
import time
import re
import bz2
from lxml import etree
from cheshire3.configParser import C3Object
from cheshire3.baseObjects import Transformer, Record, Database, Server
from cheshire3.document import StringDocument
from cheshire3.utils import nonTextToken
from cheshire3.marc_utils import MARC
from cheshire3.exceptions import ConfigFileException
class FilepathTransformer(Transformer):
    """Serialize a Record's identifier as raw SAX event strings.

    Intended for use as the inTransformer of a recordStore.
    """

    def process_record(self, session, rec):
        """Return a StringDocument describing ``rec.id``.

        The Document data is three event strings (an 'identifier' element
        wrapping ``str(rec.id)``) joined with ``nonTextToken``.
        """
        events = [
            '1 identifier {}',
            '3 ' + str(rec.id),
            '2 identifier',
        ]
        return StringDocument(nonTextToken.join(events))
# Simplest transformation ...
class XmlTransformer(Transformer):
    """Wrap the raw XML string of a Record in a Document."""

    def process_record(self, session, rec):
        """Return a StringDocument built from ``rec.get_xml(session)``."""
        xml = rec.get_xml(session)
        return StringDocument(xml)
class Bzip2XmlTransformer(Transformer):
    """Return a Document containing bzip2 compressed XML.

    The raw XML string of the Record is compressed with the bzip2 algorithm
    and the compressed data becomes the content of the returned Document.
    """

    def process_record(self, session, rec):
        """Return a StringDocument of the Record's bzip2-compressed XML.

        This Transformer's own id is passed as the second argument to
        StringDocument.
        """
        compressed = bz2.compress(rec.get_xml(session))
        return StringDocument(compressed, self.id)
class SaxTransformer(Transformer):
    """Return a Document containing the Record's SAX events.

    The events returned by ``rec.get_sax(session)`` are UTF-8 encoded,
    followed by one extra event ("9 " + the pickled ``rec.elementHash``), and
    the whole list is joined with ``nonTextToken``.
    """

    def process_record(self, session, rec):
        """Return a StringDocument of the Record's serialized SAX events."""
        # Imported here because the module's import block does not provide
        # `pickle`; without it this method fails with a NameError.
        import pickle

        sax = [event.encode('utf8') for event in rec.get_sax(session)]
        sax.append("9 " + pickle.dumps(rec.elementHash))
        return StringDocument(nonTextToken.join(sax))
class WorkflowTransformer(Transformer):
    """Transformer that delegates its work to a configured Workflow."""

    def __init__(self, session, config, parent):
        """Resolve the 'workflow' path and keep it as ``self.workflow``."""
        Transformer.__init__(self, session, config, parent)
        self.workflow = self.get_path(session, 'workflow')

    def process_record(self, session, record):
        """Run the Workflow on the Record and return the resulting Document.

        A string result is wrapped in a StringDocument; a Record result is
        serialized with ``get_xml`` and wrapped likewise; any other result is
        returned exactly as the Workflow produced it.
        """
        result = self.workflow.process(session, record)
        if isinstance(result, basestring):
            return StringDocument(result)
        if isinstance(result, Record):
            return StringDocument(result.get_xml(session))
        return result
class ComponentParentFetchingTransformer(Transformer):
    """Fetch the parent of a Cheshire3 component and return it as a Document.

    Given a Cheshire3 component Record, look up the Record it was taken from
    and return that parent's data in a new Document.
    """

    def _resolveDatabase(self, session):
        """Return the Database object used to look up the RecordStore.

        Candidates are tried in order: this object's parent when it is a
        Database; the database named by ``session.database`` fetched from the
        parent when the parent is a Server; the same database fetched from
        ``session.server``.  Raises ValueError when none of these apply.
        """
        owner = self.parent
        if isinstance(owner, Database):
            return owner
        if isinstance(owner, Server) and session.database:
            return owner.get_object(session, session.database)
        if (
            session.server and
            isinstance(session.server, Server) and
            session.database
        ):
            return session.server.get_object(session, session.database)
        if not session.server:
            raise ValueError("No session.server")
        raise ValueError("No session.database")

    def process_record(self, session, record):
        """Return a StringDocument holding the parent Record's XML.

        The parent reference is read from the component's ``parent``
        attribute, first without a namespace and then, if that yields
        nothing, in the Cheshire3 component namespace.  The reference has the
        form ``<recordStoreId>/<parentId>``.
        """
        # Locate the parent reference on the component wrapper element
        try:
            parentRef = record.process_xpath(
                session,
                '/c3component/@parent'
            )[0]
        except IndexError:
            parentRef = record.process_xpath(
                session,
                '/c3:component/@c3:parent',
                maps={'c3': "http://www.cheshire3.org/schemas/component/"}
            )[0]
        recStoreId, parentId = parentRef.split('/', 1)
        # Find the RecordStore and pull the parent Record out of it
        database = self._resolveDatabase(session)
        store = database.get_object(session, recStoreId)
        parentXml = store.fetch_record(session, parentId).get_xml(session)
        # Build the Document and stamp it with the parent's identifier
        doc = StringDocument(
            parentXml,
            self.id,
            byteCount=len(parentXml),
            byteOffset=0
        )
        doc.id = parentId
        return doc
# --- XSLT Transformers ---
def myTimeFn(dummy):
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ``.

    Registered as an XSLT extension function; call it from a stylesheet as
    <xsl:value-of select="c3fn:now()"/> with c3fn bound to
    http://www.cheshire3.org/ns/function/xsl/

    ``dummy`` receives the context argument supplied by the XSLT engine and
    is ignored.
    """
    # The trailing "Z" designates UTC, so format from gmtime(); strftime()
    # without an explicit time tuple would format the *local* time.
    return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
class LxmlXsltTransformer(Transformer):
    """XSLT transformer using Lxml implementation. Requires LxmlRecord.

    The stylesheet named by the 'xsltPath' path is parsed and compiled when
    the object is constructed.  The optional 'parameter' setting is a
    whitespace separated list of ``name:value`` pairs which are passed to the
    stylesheet as string parameters on every transformation.
    """

    _possiblePaths = {
        'xsltPath': {
            'docs': "Path to the XSLT file to use."
        }
    }

    _possibleSettings = {
        'parameter': {
            'docs': "Parameters to be passed to the transformer."
        }
    }

    def __init__(self, session, config, parent):
        """Load the stylesheet and parse any configured parameters.

        Raises ConfigFileException if the 'xsltPath' path is not configured.
        A relative 'xsltPath' is resolved against the 'defaultPath' path.
        """
        Transformer.__init__(self, session, config, parent)
        xfrPath = self.get_path(session, "xsltPath")
        if xfrPath is None:
            raise ConfigFileException("Missing path 'xsltPath' for "
                                      "{0}.".format(self.id))
        if os.path.isabs(xfrPath):
            path = xfrPath
        else:
            dfp = self.get_path(session, "defaultPath")
            path = os.path.join(dfp, xfrPath)
        # Expose myTimeFn to stylesheets as c3fn:now()
        ns = etree.FunctionNamespace(
            'http://www.cheshire3.org/ns/function/xsl/'
        )
        ns['now'] = myTimeFn
        self.functionNamespace = ns
        self.parsedXslt = etree.parse(path)
        self.txr = etree.XSLT(self.parsedXslt)
        self.params = None
        parameter = self.get_setting(session, 'parameter', None)
        if parameter:
            self.params = {}
            # split() tolerates repeated/leading/trailing whitespace, and
            # splitting on the first ':' only lets values contain colons
            # (e.g. URLs); both previously raised ValueError.
            for pair in parameter.split():
                (k, v) = pair.split(':', 1)
                # Values are wrapped in double quotes so that the XSLT engine
                # reads them as string literals rather than XPath expressions
                self.params[k] = '"%s"' % v

    def process_record(self, session, rec):
        """Apply the stylesheet to the Record and return a StringDocument."""
        dom = rec.get_dom(session)
        if session.environment == 'apache':
            # A fresh XSLT object is compiled for each call in the 'apache'
            # environment.
            # NOTE(review): presumably because the compiled transformer
            # cannot be shared between threads - confirm.
            self.txr = etree.XSLT(self.parsedXslt)
        if self.params:
            result = self.txr(dom, **self.params)
        else:
            result = self.txr(dom)
        return StringDocument(str(result))
class LxmlQueryTermHighlightingTransformer(Transformer):
    """Abstract base for query term highlighting Transformers (LxmlRecords).

    Holds the shared configuration (highlight tag, its attributes, and the
    list of element names at which to stop) plus the helper that builds a
    highlight element around a slice of an element's text or tail.
    """

    HIGHLIGHT_NS = "http://www.cheshire3.org/schemas/highlight/"

    _possibleSettings = {
        'highlightTag': {
            'docs': ("Tag to indicate highlighted section (will be inserted "
                     "into output document as: "
                     "<highlightTag>blah blah</highlightTag>)")
        },
        'tagAttrList': {
            'docs': ('Space separated list of attribute name="value" pairs '
                     '(will be inserted into output document as: '
                     '<highlightTag name="value">blah blah</highlightTag>)')
        },
        'breakElementsList': {
            'docs': ('Space separated list of element names to break at when '
                     'tagging Query Terms. This can be useful when a speedy '
                     'response is more important than complete tagging.')
        }
    }

    def __init__(self, session, config, parent):
        """Read the highlighting settings into instance attributes.

        Sets ``highlightTag``, ``attrs`` (attributes copied onto every
        highlight element) and ``breakElements`` (list of element names).
        """
        Transformer.__init__(self, session, config, parent)
        configuredTag = self.get_setting(session, 'highlightTag', None)
        if configuredTag is not None:
            self.highlightTag = configuredTag
            self.attrs = {}
        else:
            # No tag configured: fall back to the c3 highlight element and
            # carry its namespace declaration as an attribute
            self.highlightTag = 'c3:highlight'
            self.attrs = {'xmlns:c3': self.HIGHLIGHT_NS}
        attrSpec = self.get_setting(session, 'tagAttrList', None)
        if attrSpec is not None:
            for pair in attrSpec.split(' '):
                pieces = pair.split('=', 1)
                # Drop the first and last character of the value, i.e. the
                # quotes surrounding it in the setting
                self.attrs[pieces[0]] = pieces[1][1:-1]
        breakSpec = self.get_setting(session, 'breakElementsList', '')
        self.breakElements = breakSpec.split(' ')

    def _insertHighlightElement(self, element, located, start, end):
        """Cut ``[start:end]`` out of the element's text/tail into a new node.

        ``located`` names the attribute of ``element`` to operate on ('text'
        or 'tail' in the callers visible in this module).  That attribute is
        truncated to the characters before ``start``; the returned highlight
        element holds the ``[start:end]`` slice as its text and the remainder
        as its tail.  The caller is responsible for inserting the returned
        element into the tree.
        """
        original = getattr(element, located)
        setattr(element, located, original[:start])
        highlight = etree.Element(self.highlightTag)
        highlight.attrib.update(self.attrs)
        highlight.text = original[start:end]
        highlight.tail = original[end:]
        return highlight
# Short alias for the abstract highlighting base class; the concrete
# highlighting Transformers in this module use it as their base class name.
LxmlHighlighTxr = LxmlQueryTermHighlightingTransformer
class LxmlPositionQueryTermHighlightingTransformer(LxmlHighlighTxr):
    """Return Document with search hits highlighted based on word position.

    Intended to use the word position from the Record's resultSetItem's
    proximity information to highlight query term matches.

    Note Well: this can be unreliable when used in conjunction with stoplists.

    Not implemented: constructing an instance always fails.
    """

    def __init__(self, session, config, parent):
        """Always raise NotImplementedError."""
        raise NotImplementedError()
class LxmlOffsetQueryTermHighlightingTransformer(LxmlHighlighTxr):
    """Return Document with search hits highlighted based on character offsets.

    Use character offsets from Record's resultSetItem's proximity information
    to highlight query term matches.
    """

    def __init__(self, session, config, parent):
        """Initialise the base highlighter and choose the word regexp.

        ``self.wordRe`` is taken from the current database's
        'RegexpFindOffsetTokenizer' object when the database can be fetched
        from ``session.server``; otherwise a built-in pattern is compiled.
        """
        LxmlHighlighTxr.__init__(self, session, config, parent)
        try:
            # Try to get database's own version of RegexpFindOffsetTokenizer in
            # case config is non-default
            db = session.server.get_object(session, session.database)
        except:
            # Any failure to fetch the database falls back to this built-in
            # word pattern.
            # NOTE(review): the bare except also hides unrelated errors.
            # The pattern text below is kept flush left because it is part
            # of the string literal.
            self.wordRe = re.compile(u"""
(?xu) #verbose, unicode
(?:
[a-zA-Z0-9!#$%*/?|^{}`~&'+-=_]+@[0-9a-zA-Z.-]+ #email
|(?:[\w+-]+)?[+-]/[+-] #alleles
#hypenated word (maybe 'xx on the end)
|\w+(?:-\w+)+(?:'(?:t|ll've|ll|ve|s|d've|d|re))?
#date/num/money/time
|[$\xa3\xa5\u20AC]?[0-9]+(?:[.,:-][0-9]+)+[%]?
|[$\xa3\xa5\u20AC][0-9]+ #single money
#split: 8am 1Million
|[0-9]+(?=[a-zA-Z]+)
#single percentage
|[0-9]+%
|(?:[A-Z]\.)+[A-Z\.] #acronym
#o'clock, O'brien, d'Artagnan
|[oOd]'[a-zA-Z]+
|[a-zA-Z]+://[^\s]+ #URI
|\w+'(?:d've|d|t|ll've|ll|ve|s|re) #don't, we've
|(?:[hH]allowe'en|[mM]a'am|[Ii]'m|[fF]o'c's'le|[eE]'en|[sS]'pose)
#basic words, including +
|[\w+]+
)""")
        else:
            self.wordRe = db.get_object(session,
                                        'RegexpFindOffsetTokenizer').regexp

    def process_record(self, session, rec):
        """Return a StringDocument of the Record with query terms highlighted.

        When the Record has no resultSetItem, or its proxInfo is None or
        empty, the serialized DOM is returned without modification.
        """
        recDom = rec.get_dom(session)
        if (
            (rec.resultSetItem is not None) and
            (rec.resultSetItem.proxInfo is not None) and
            (len(rec.resultSetItem.proxInfo) > 0)
        ):
            # munge proxInfo into more useable form
            proxInfo = rec.resultSetItem.proxInfo
            proxInfo2 = set()
            # for each group of proxInfo (i.e. from each query clause)
            for pig in proxInfo:
                # for each item of proxInfo:
                # [nodeIdx, wordIdx, offset, termId(?)]
                for pi in pig:
                    # values must be strings for sets to work
                    # (only nodeIdx and offset are kept, which also removes
                    # duplicate node/offset pairs)
                    proxInfo2.add('%d %d' % (pi[0], pi[2]))
            # NOTE(review): relies on map() returning a list (Python 2) so
            # that the pairs can be sorted below
            proxInfo = [map(int, pis.split(' ')) for pis in proxInfo2]
            nodeIdxs = []
            wordOffsets = []
            # sort proxInfo so that nodeIdxs are sorted descending (so that
            # offsets don't get upset when modifying text)
            for x in sorted(proxInfo, reverse=True):
                nodeIdxs.append(x[0])
                wordOffsets.append(x[1])
            # Map each wanted node index to the XPath of that node; the walk
            # stops at the first node whose tag is in breakElements
            xps = {}
            tree = recDom.getroottree()
            walker = recDom.getiterator()
            for x, n in enumerate(walker):
                if n.tag in self.breakElements:
                    break
                if x in nodeIdxs:
                    xps[x] = tree.getpath(n)
            xpathfn = recDom.xpath
            for ni, offset in zip(nodeIdxs, wordOffsets):
                try:
                    xp = xps[ni]
                except KeyError:
                    # No XPath
                    continue
                el = xpathfn(xp)[0]
                # Which attribute ('text' or 'tail') the term was found in
                located = None
                # Walk el and its descendants, reducing offset by the length
                # of each text/tail passed over until it falls inside one
                for ci, c in enumerate(el.iter()):
                    # Ignore comments processing instructions etc.
                    if c.text:
                        text = c.text
                        if len(c.text) > offset:
                            start = offset
                            try:
                                end = self.wordRe.search(text, start).end()
                            except:
                                # Well I still...
                                # haven't found...
                                # what I'm looking for!
                                # (search() returned None, so .end() failed)
                                pass
                            else:
                                located = 'text'
                                if not (c.tag == self.highlightTag):
                                    hel = self._insertHighlightElement(c,
                                                                       located,
                                                                       start,
                                                                       end)
                                    try:
                                        # Highlight becomes c's first child,
                                        # directly after the truncated text
                                        c.insert(0, hel)
                                    except TypeError:
                                        # Immutable element (?)
                                        break
                                break
                        else:
                            # Adjust offset accordingly
                            offset -= len(text)
                    # The tail of el itself is excluded by the c != el test
                    if c != el and c.tail and located is None:
                        text = c.tail
                        if len(c.tail) > offset:
                            start = offset
                            try:
                                end = self.wordRe.search(text, start).end()
                            except:
                                # Well I still...
                                # haven't found...
                                # what I'm looking for!
                                pass
                            else:
                                # NOTE(review): a successful match's end()
                                # is presumably never -1, so this guard
                                # looks unreachable - confirm
                                if end == -1:
                                    end = len(text)
                                located = 'tail'
                                if not (c.tag == self.highlightTag):
                                    hel = self._insertHighlightElement(c,
                                                                       located,
                                                                       start,
                                                                       end)
                                    p = c.getparent()
                                    try:
                                        # Highlight becomes the sibling
                                        # immediately following c
                                        p.insert(p.index(c) + 1, hel)
                                    except TypeError:
                                        # Immutable element (?)
                                        break
                                break
                        else:
                            # Adjust offset accordingly
                            offset -= len(text)
        return StringDocument(etree.tostring(recDom))
class TemplatedTransformer(Transformer):
    """Transform a Record using a Selector and a Python format string.

    Transformer to insert the output of a Selector into a template string
    containing place-holders.

    Template can be specified directly in the configuration using the
    template setting (whitespace is respected), or in a file using the
    templatePath path. If the template is specified in the configuration,
    XML reserved characters (<, >, & etc.) must be escaped. A template file
    is read as UTF-8.

    This can be useful for Record types that are not easily transformed using
    more standard mechanism (e.g. XSLT), a prime example being GraphRecords

    Example

    config:

        <subConfig type="transformer" id="myTemplatedTransformer">
            <objectType>cheshire3.transformer.TemplatedTransformer</objectType>
            <paths>
                <object type="selector" ref="mySelector"/>
                <object type="extractor" ref="SimpleExtractor"/>
            </paths>
            <options>
                <setting type="template">
                    This is my document. The title is {0}. The author is {1}
                </setting>
            </options>
        </subConfig>

    selector config:

        <subConfig type="selector" id="mySelector">
            <objectType>cheshire3.selector.XpathSelector</objectType>
            <source>
                <location type="xpath">//title</location>
                <location type="xpath">//author</location>
            </source>
        </subConfig>

    """

    _possiblePaths = {
        'selector': {
            'docs': "Selector to use to get data from the record."
        },
        'extractor': {
            'docs': ("An Extractor to use on each data item returned by the "
                     "Selector. The Extractor used must be able to handle the "
                     "output from the Selector (e.g. A SPARQL Selector would "
                     "require an RDF Extractor). Default is SimpleExtractor")
        },
        'templatePath': {
            'docs': ("Path to the file containing the template for the output "
                     "Document with place-holders for the selected data items."
                     )
        }
    }

    _possibleSettings = {
        'template': {
            'docs': ("A string representing the template for the output "
                     "Document with place-holders for selected data items.")
        }
    }

    def __init__(self, session, config, parent):
        """Resolve the selector and extractor and load the template.

        The template is read from the 'templatePath' file when that path is
        configured, otherwise taken from the 'template' setting.  Raises
        ConfigFileException when neither provides a template.
        """
        Transformer.__init__(self, session, config, parent)
        self.selector = self.get_path(session, 'selector')
        # NOTE(review): the path documentation mentions a SimpleExtractor
        # default, but no fallback is applied here - confirm that get_path
        # supplies one when 'extractor' is not configured.
        self.extractor = self.get_path(session, 'extractor')
        tmplPath = self.get_path(session, "templatePath")
        if tmplPath is not None:
            dfp = self.get_path(session, "defaultPath")
            path = os.path.join(dfp, tmplPath)
            with open(path, 'r') as fh:
                # Decode explicitly: unicode(bytes) uses the ASCII codec and
                # fails on any non-ASCII character in the template file.
                self.template = fh.read().decode('utf-8')
        else:
            tmpl = self.get_setting(session, 'template', '')
            if not tmpl:
                raise ConfigFileException("{0} requires either a "
                                          "'templatePath' path or a "
                                          "'template' setting."
                                          "".format(self.id))
            self.template = unicode(tmpl)

    def process_record(self, session, rec):
        """Return a StringDocument of the template filled with selected data.

        Each location returned by the Selector supplies one positional
        argument to the template: the list of values extracted from that
        location's matches.  Raises ConfigFileException when the template
        refers to a position for which nothing was selected.
        """
        process_eventList = self.extractor.process_eventList
        process_string = self.extractor.process_string
        process_node = self.extractor.process_node
        data = self.selector.process_record(session, rec)
        vals = []
        for location in data:
            vals2 = []
            for match in location:
                if isinstance(match, types.ListType):
                    # SAX event
                    vals2.append(process_eventList(session, match).keys()[0])
                elif (
                    type(match) in types.StringTypes or
                    type(match) in [int, long, float, bool]
                ):
                    # Attribute content or function result (e.g. count())
                    # The exact type() test is deliberate: instances of
                    # subclasses fall through to the branches below.
                    vals2.append(process_string(session, match).keys()[0])
                elif isinstance(match, types.TupleType):
                    # RDF graph results (?)
                    vals3 = []
                    for item in match:
                        if item is not None:
                            vals3.append(
                                process_node(session, item).keys()[0]
                            )
                        else:
                            vals3.append(None)
                    vals2.append(vals3)
                else:
                    # DOM nodes
                    vals2.append(process_node(session, match).keys()[0])
            vals.append(vals2)
        tmpl = self.template
        try:
            return StringDocument(tmpl.format(*vals))
        except IndexError:
            # Log the selected values and the template (when the session has
            # a logger) to help diagnose the mismatch
            try:
                session.logger.log_error(session, repr(vals))
                session.logger.log_error(session, tmpl)
            except AttributeError:
                pass
            raise ConfigFileException('Template contained a place-holder for '
                                      'which data was not selected by the '
                                      'selector.')
class MarcTransformer(Transformer):
    """Transformer to convert Records in marc21xml to MARC records."""

    def __init__(self, session, config, parent):
        """Initialise the Transformer and keep a reference to the session."""
        Transformer.__init__(self, session, config, parent)
        self.session = session

    def _process_tagName(self, tagname):
        """Return the tag as an int, ignoring leading zeros.

        An all-zero tag (e.g. '000') yields 0; an empty tag yields None.
        """
        for i, c in enumerate(tagname):
            if c != '0':
                return int(tagname[i:])
        # Only reached when every character was '0' (or there were none);
        # previously this fell through and returned None for '000' as well.
        return 0 if tagname else None

    def _iterElements(self, node, tagname):
        """Return an iterator over node and its descendants named tagname.

        Uses ``getiterator`` when the node provides it and falls back to
        ``iter`` otherwise.
        """
        try:
            return node.getiterator(tagname)
        except AttributeError:
            # lxml 1.3 or later
            return node.iter(tagname)

    def process_record(self, session, rec):
        """Return a StringDocument containing the Record as MARC.

        Control fields are collected as their text; data fields as
        ``(ind1, ind2, [(code, text), ...])`` tuples.  Both are grouped in
        lists keyed by numeric tag, and key 0 is built from the leader.
        """
        fields = {}
        tree = rec.get_dom(session)
        for element in self._iterElements(tree, "controlfield"):
            tag = self._process_tagName(element.get('tag'))
            fields.setdefault(tag, []).append(element.text)
        for element in self._iterElements(tree, "datafield"):
            tag = self._process_tagName(element.get('tag'))
            # The fallback branch used to assign the subfield iterator to
            # the wrong name, leaving `children` unset or stale.
            children = self._iterElements(element, 'subfield')
            subelements = [(c.get('code'), c.text) for c in children]
            contents = (element.get('ind1'), element.get('ind2'), subelements)
            fields.setdefault(tag, []).append(contents)
        leader = tree.xpath('//leader')[0]
        leaderText = leader.text
        # Key 0 holds characters 5-9 and 17-19 of the leader text
        fields[0] = [''.join([leaderText[5:10], leaderText[17:20]])]
        marcObject = MARC()
        marcObject.fields = fields
        return StringDocument(marcObject.get_MARC())