"""TokenMerger implementations for Cheshire3."""

import os
try:
import cPickle as pickle
except ImportError:
import pickle
from cheshire3.baseObjects import TokenMerger
from cheshire3.exceptions import ConfigFileException, FileDoesNotExistException
class SimpleTokenMerger(TokenMerger):
    """Merge identical tokens into a single entry each, summing occurrences.

    Keeps only the first proximity location seen for a token; use
    ProximityTokenMerger if all locations are needed.
    """
def process_string(self, session, data):
return data
def process_hash(self, session, data):
new = {}
for d, val in data.iteritems():
if d:
for t in val['text']:
if t in new:
new[t]['occurences'] += val['occurences']
else:
                        # This keeps only the first proximity location of
                        # each token, discarding any further locations - a
                        # very minor edge case where merging loses
                        # information. If locations matter, use
                        # ProximityTokenMerger.
try:
new[t] = {
'text': t,
'occurences': val['occurences'],
'proxLoc': val['proxLoc']
}
except KeyError:
# May already have been tokenized and merged
new[t] = {
'text': t,
'occurences': val['occurences'],
'positions': val['positions']
}
return new
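
# Illustrative sketch (hypothetical data, not from the Cheshire3 test
# suite): given a tokenized hash such as
#
#   {'spam spam': {'text': ['spam', 'spam'],
#                  'occurences': 1,
#                  'proxLoc': [0]}}
#
# SimpleTokenMerger.process_hash returns one entry per distinct token,
# summing 'occurences' and keeping only the first 'proxLoc':
#
#   {'spam': {'text': 'spam', 'occurences': 2, 'proxLoc': [0]}}
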
class ProximityTokenMerger(SimpleTokenMerger):
    """Merge identical tokens, keeping a position for every occurrence.

    Each entry's 'positions' list holds flattened (proxLoc, wordPosition)
    pairs.
    """
def process_hash(self, session, data):
new = {}
for d, val in data.iteritems():
if d:
x = 0
for t in val['text']:
if t in new:
new[t]['occurences'] += val['occurences']
try:
pls = [(pl, x) for pl in val['proxLoc']]
for p in pls:
new[t]['positions'].extend(p)
except KeyError:
new[t]['positions'].extend(val['positions'])
else:
try:
pls = [(pl, x) for pl in val['proxLoc']]
new[t] = {
'text': t,
'occurences': len(pls),
'positions': []
}
for p in pls:
new[t]['positions'].extend(p)
except KeyError:
new[t] = {
'text': t,
'occurences': val['occurences'],
'positions': val['positions'][:]
}
x += 1
return new
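
# Illustrative sketch (hypothetical data): for the same input, but with a
# proximity location,
#
#   {'spam spam': {'text': ['spam', 'spam'],
#                  'occurences': 1,
#                  'proxLoc': [7]}}
#
# ProximityTokenMerger.process_hash records flattened (proxLoc,
# wordPosition) pairs in 'positions':
#
#   {'spam': {'text': 'spam', 'occurences': 2, 'positions': [7, 0, 7, 1]}}
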
class OffsetProximityTokenMerger(ProximityTokenMerger):
    """Merge identical tokens, keeping word and character offset positions.

    Each entry's 'positions' list holds flattened (proxLoc, wordPosition,
    charOffset) triples.
    """
def process_hash(self, session, data):
new = {}
for d, val in data.iteritems():
if d:
                x = 0
                posns = val.get('charOffsets', [])
                # 'wordOffs' may be absent if word offsets were not recorded
                wordOffs = val.get('wordOffs', [])
                for t in val['text']:
if t in new:
new[t]['occurences'] += val['occurences']
else:
new[t] = {
'text': t,
'occurences': val['occurences'],
'positions': []
}
try:
                        if wordOffs:
pls = [(pl, wordOffs[x], posns[x])
for pl
in val['proxLoc']
]
else:
pls = [(pl, x, posns[x]) for pl in val['proxLoc']]
for p in pls:
new[t]['positions'].extend(p)
except KeyError:
new[t]['positions'].extend(val['positions'])
x += 1
return new
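
# Illustrative sketch (hypothetical data): with character offsets present,
#
#   {'spam spam': {'text': ['spam', 'spam'],
#                  'occurences': 1,
#                  'proxLoc': [7],
#                  'charOffsets': [0, 5]}}
#
# OffsetProximityTokenMerger.process_hash flattens (proxLoc, wordPosition,
# charOffset) triples into 'positions':
#
#   {'spam': {'text': 'spam',
#             'occurences': 2,
#             'positions': [7, 0, 0, 7, 1, 5]}}
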
class RangeTokenMerger(SimpleTokenMerger):
    """Base class for TokenMergers that merge tokens into ranges."""
_possibleSettings = {
'char': {
'docs': ('Character to use as the interval designator. Defaults '
'to forward slash (/) after ISO 8601.'),
'type': str
}
}
def __init__(self, session, config, parent):
SimpleTokenMerger.__init__(self, session, config, parent)
self.char = self.get_setting(session, 'char', '/')
class SequenceRangeTokenMerger(RangeTokenMerger):
    """Merge tokens into ranges for use in RangeIndexes.

    Assumes that a single value has been tokenized into pairs of points,
    which need to be concatenated into ranges.
    """
def process_hash(self, session, data):
new = {}
for d, val in data.iteritems():
            pts = val['text']
            for x in range(0, len(pts), 2):
                try:
                    newkey = "{0}{1}{2}".format(pts[x], self.char, pts[x + 1])
                except IndexError:
                    # Odd number of points; duplicate the last one to make
                    # a valid range
                    newkey = "{0}{1}{0}".format(pts[x], self.char)
if newkey in new:
new[newkey]['occurences'] += 1
else:
nval = val.copy()
nval['text'] = newkey
new[newkey] = nval
return new
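
# Illustrative sketch (hypothetical data): pairs of tokenized points such as
#
#   {'1066 1087': {'text': ['1066', '1087'], 'occurences': 1}}
#
# become single range keys, here '1066/1087' with the default designator;
# an odd number of points duplicates the last one, e.g. ['2000'] becomes
# '2000/2000'.
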
class MinMaxRangeTokenMerger(RangeTokenMerger):
    """Merge all tokens into a single range for use in RangeIndexes.

    Joins the minimum and maximum keys with the configured interval
    designator (a forward slash (/) by default, after ISO 8601).
    """
def process_hash(self, session, data):
        keys = data.keys()
        if not keys:
            return {}
startK = str(min(keys))
endK = str(max(keys))
newK = '{0}{1}{2}'.format(startK, self.char, endK)
val = data[startK]
val['text'] = newK
return {newK: val}
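
# Illustrative sketch (hypothetical data): given three separate keys,
#
#   {'1066': {'text': '1066', 'occurences': 1},
#    '1087': {'text': '1087', 'occurences': 1},
#    '1100': {'text': '1100', 'occurences': 1}}
#
# MinMaxRangeTokenMerger.process_hash returns a single entry spanning the
# minimum and maximum keys (compared as given, i.e. as strings here):
#
#   {'1066/1100': {'text': '1066/1100', 'occurences': 1}}
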
class NGramTokenMerger(SimpleTokenMerger):
    """Merge runs of adjacent tokens into n-grams of configurable length."""
_possibleSettings = {
'nValue': {
            'docs': ('Number of adjacent tokens to merge into each n-gram. '
                     'Defaults to 2 (bigrams).'),
'type': int
}
}
def __init__(self, session, config, parent):
SimpleTokenMerger.__init__(self, session, config, parent)
self.n = self.get_setting(session, 'nValue', 2)
def process_hash(self, session, data):
kw = {}
n = self.n
for k, val in data.iteritems():
split = val['text']
for i in range(len(split) - (n - 1)):
nGram = split[i:(i + n)]
nGramStr = ' '.join(nGram)
if nGramStr in kw:
kw[nGramStr]['occurences'] += val['occurences']
else:
kw[nGramStr] = {
'text': nGramStr,
'occurences': val['occurences']
}
return kw
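
# Illustrative sketch (hypothetical data): with the default nValue of 2,
#
#   {'the cat sat': {'text': ['the', 'cat', 'sat'], 'occurences': 1}}
#
# NGramTokenMerger.process_hash yields overlapping bigrams:
#
#   {'the cat': {'text': 'the cat', 'occurences': 1},
#    'cat sat': {'text': 'cat sat', 'occurences': 1}}
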
class ReconstructTokenMerger(SimpleTokenMerger):
    """Merge tokens back into a single string approximating the original."""
def process_hash(self, session, data):
kw = {}
for (k, val) in data.iteritems():
            pl = 'charOffsets' in val
            # FIXME: offset-based spacing disabled (charOffsets may be
            # faked); always fall back to joining with single spaces
            pl = 0
currLen = 0
new = []
for (w, word) in enumerate(val['text']):
if pl:
space = ' ' * (val['charOffsets'][w] - currLen)
new.append('%s%s' % (space, word))
currLen = val['charOffsets'][w] + len(word)
else:
new.append('%s' % (word))
if w < len(val['text']) - 1:
new.append(' ')
txt = ''.join(new)
kval = val.copy()
kval['text'] = txt
kw[k] = kval
return kw
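
# Illustrative sketch (hypothetical data): with offset-based spacing
# disabled (pl forced to 0 above), tokens are simply re-joined with single
# spaces:
#
#   {'k': {'text': ['the', 'cat', 'sat']}}  ->  {'k': {'text': 'the cat sat'}}
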
class PhraseTokenMerger(ProximityTokenMerger):
    """Merge configured sequences of tokens into single phrase tokens.

    Consecutive tokens listed in the pickled merge hash are joined with
    underscores, e.g. 'new york' -> 'new_york'.
    """
_possiblePaths = {
'mergeHashPickle': {
'docs': 'Pickled hash of words to merge'
}
}
def __init__(self, session, config, parent):
ProximityTokenMerger.__init__(self, session, config, parent)
mp = self.get_path(session, 'mergeHashPickle', '')
if not mp:
msg = "%s needs path: mergeHashPickle" % self.id
raise ConfigFileException(msg)
elif not os.path.exists(mp):
            msg = "mergeHashPickle path on %s does not exist" % self.id
raise FileDoesNotExistException(msg)
        with open(mp, 'rb') as inh:
            self.mergeHash = pickle.load(inh)
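
    # A minimal sketch of building the mergeHashPickle file (the path name
    # and contents here are hypothetical): the pickled hash maps a token to
    # the list of tokens that may follow it within a phrase, e.g.
    #
    #   import pickle
    #   with open('phrases.pickle', 'wb') as fh:
    #       pickle.dump({'new': ['york'], 'los': ['angeles']}, fh)
    #
    # The object's mergeHashPickle path should then point at that file.
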
def process_hash(self, session, data):
new = {}
for d, val in data.iteritems():
if d:
x = 0
merging = []
for t in val['text']:
                    # If the next token may follow t in a phrase (according
                    # to self.mergeHash), buffer t and keep merging
                    if t in self.mergeHash and len(val['text']) > x + 1:
                        nexts = self.mergeHash[t]
                        nxt = val['text'][x + 1]
                        if nxt in nexts:
                            merging.append(t)
                            # Keep the manual position counter in step
                            x += 1
                            continue
elif merging:
merging.append(t)
t = "_".join(merging)
merging = []
if t in new:
new[t]['occurences'] += val['occurences']
try:
pls = [(pl, x) for pl in val['proxLoc']]
for p in pls:
new[t]['positions'].extend(p)
except KeyError:
new[t]['positions'].extend(val['positions'])
else:
try:
pls = [(pl, x) for pl in val['proxLoc']]
new[t] = {
'text': t,
'occurences': len(pls),
'positions': []
}
for p in pls:
new[t]['positions'].extend(p)
except KeyError:
new[t] = {
'text': t,
'occurences': val['occurences'],
'positions': val['positions'][:]
}
x += 1
return new
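
# Illustrative sketch (hypothetical data and merge hash): with
# self.mergeHash == {'new': ['york']}, the input
#
#   {'new york city': {'text': ['new', 'york', 'city'],
#                      'occurences': 1,
#                      'proxLoc': [3]}}
#
# merges the phrase into a single underscore-joined token:
#
#   {'new_york': {'text': 'new_york', 'occurences': 1, 'positions': [3, 1]},
#    'city': {'text': 'city', 'occurences': 1, 'positions': [3, 2]}}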