import sys
import types
import math
import operator
import time
import cStringIO as StringIO
try:
import cPickle as pickle
except ImportError:
import pickle
from itertools import combinations
from xml.sax.saxutils import escape, unescape
from lxml import etree
from cheshire3.baseObjects import ResultSet, ResultSetItem, Index, Workflow
from cheshire3.utils import SimpleBitfield
from cheshire3 import cqlParser
def ucescape(data):
    """XML-escape a byte string, then decode it to unicode via latin-1."""
    escaped = escape(data)
    return unicode(escaped, 'latin-1')
# Map serializable primitive types to the type names written into the
# 't' attribute of serialized <d> elements.
srlz_typehash = {
    int: 'int',
    long: 'long',
    str: 'str',
    unicode: 'unicode',
    bool: 'bool',
    type(None): 'None',
    float: 'float',
}
# Inverse mapping: type name -> constructor, used during deserialization.
dsrlz_typehash = dict((v, k) for k, v in srlz_typehash.iteritems())
class RankedResultSet(ResultSet):
    """Abstract ResultSet providing strategies for merging relevance weights.

    Each ``_*Weights`` method reduces the weights of duplicate items (the
    same record found in several result sets) to a single representative
    item, or rescales a pair of items in place.
    """

    def _sumWeights(self, items, n):
        """Sum the values."""
        item = items[0]
        item.weight = sum([x.weight for x in items])
        return item

    def _meanWeights(self, items, n):
        """Mean average the values over n result sets."""
        item = items[0]
        item.weight = sum([x.weight for x in items])
        item.weight = item.weight / n
        return item

    def _normWeights(self, items, n):
        """Normalize the values by each source set's weight range, then average."""
        for i in items:
            i.weight = (i.weight *
                        (i.resultSet.minWeight / i.resultSet.maxWeight)
                        )
        return self._meanWeights(items, n)

    def _cmbzWeights(self, a, b):
        """Normalise and rescale values (boost items found in both sets)."""
        a.weight = a.weight * (self.minWeight / self.maxWeight)
        if b:
            b.weight = b.weight * (self.minWeight / self.maxWeight)
            a.weight = (a.weight + b.weight) * 2.0
        else:
            a.weight = a.weight / 2.0

    def _nprvWeights(self, a, b):
        """Normalise values and privilege high ranked documents."""
        a.weight = a.weight * (self.minWeight / self.maxWeight)
        if b:
            b.weight = b.weight * (self.minWeight / self.maxWeight)
            a.weight = (a.weight + b.weight) * 2.0
        else:
            # Leave high ranking ones high
            # BUGFIX: previously referenced an undefined name 'item'
            # (marked FIXME); 'a' is the item being weighted.
            rlen = len(a.resultSet._list)
            if (
                (rlen > 150 and a.resultSetPosition > 100) or
                (rlen < 150 and a.resultSetPosition > rlen / 2)):
                a.weight = a.weight / 2.0

    def _pivotWeights(self, a, b):
        """Pivot weight of components if the document also occurs in the set.

        Determine which item is component set, and which item is from document
        set. If the component's parent document's id is the same as the one in
        the full document list, then adjust.
        Normalize min/max as above
        Pivot default is 0.7, but allow override
        (Pivot * documentScore) + ((1-pivot) * componentScore)
        If not in the list then just ((1-pivot) * componentScore)
        """
        raise NotImplementedError
class SimpleResultSet(RankedResultSet):
    """List-like ResultSet holding SimpleResultSetItems in memory.

    Supports XML (de)serialization, relevance ranking (CORI, TF-IDF,
    Okapi BM-25, logistic regression) and boolean/proximity combination
    of several result sets.
    """
    # Class-level defaults; __init__ rebinds each of these per instance,
    # so the mutable defaults ([], {}) are never shared in practice.
    _list = []
    id = ""
    termid = -1
    totalOccs = 0
    totalRecs = 0
    expires = 0
    index = None
    queryTerm = ""
    queryFreq = 0
    queryPositions = []
    queryTime = 0
    query = None
    relevancy = 0
    maxWeight = 0
    minWeight = 0
    termWeight = 0.0
    recordStore = ""
    rsiConstructor = None
    attributesToSerialize = []
    recordStoreSizes = 0
    termIdHash = {}
    fromStore = 0
    def __init__(self, session, data=None, id="", recordStore=""):
        """Create a ResultSet, optionally populated from ``data``.

        :param data: iterable of ResultSetItems to populate with
        :param id: identifier for this result set
        :param recordStore: default recordStore identifier for items
        """
        self.rsiConstructor = SimpleResultSetItem
        # (attributeName, defaultValue) pairs; serialize() only writes
        # attributes whose value differs from the default given here.
        self.attributesToSerialize = [('id', ''),
                                      ('termid', -1),
                                      ('totalOccs', 0),
                                      ('totalRecs', 0),
                                      ('expires', 0),
                                      ('queryTerm', ''),
                                      ('queryFreq', 0),
                                      ('queryPositions', []),
                                      ('relevancy', 0),
                                      ('maxWeight', 0),
                                      ('minWeight', 0),
                                      ('termWeight', 0.0),
                                      ('recordStore', ''),
                                      ('recordStoreSizes', 0),
                                      ('index', None),
                                      ('queryTime', 0.0),
                                      ('query', '')
                                      ]
        if data is not None:
            self._list = list(data)
        else:
            self._list = []
        self.id = id
        self.recordStore = recordStore
        # CQL context sets that may carry relevance modifiers
        self.relevanceContextSets = {
            "info:srw/cql-context-set/2/relevance-1.0": 1.0,
            "info:srw/cql-context-set/2/relevance-1.1": 1.1,
            "info:srw/cql-context-set/2/relevance-1.2": 1.2
        }
        self.termid = -1
        self.totalOccs = 0
        self.totalRecs = 0
        self.expires = 0
        self.index = None
        self.queryTerm = ""
        self.queryFreq = 0
        self.queryPositions = []
        self.queryTime = 0.0
        self.query = None
        self.relevancy = 0
        self.maxWeight = 0
        self.minWeight = 0
        self.termWeight = 0.0
        self.recordStoreSizes = 0
        self.termIdHash = {}
        self.fromStore = 0
def __getitem__(self, k):
return self._list[k]
def __len__(self):
return len(self._list)
def fromList(self, data):
self._list = data
def serialise(self, session, pickleOk=1):
"""Serialize and this ResultSet as XML, return a string (utf-8).
DEPRECATED by ``resultSet.serialize(session, pickleOk)``
"""
return self.serialize(session, pickleOk)
    def serialize(self, session, pickleOk=1):
        """Serialize and this ResultSet as XML, return a string (utf-8)."""
        # This is pretty fast, and generates better XML than previous
        xml = [u'<resultSet>']
        rsetattrs = self.attributesToSerialize
        for (a, deft) in rsetattrs:
            val = getattr(self, a)
            # Only serialize attributes that differ from their defaults
            if val != deft:
                if type(val) in [dict, list, tuple]:
                    # Use later version of pickle protocol to deal with
                    # new-style classes, unicode etc.
                    valstr = pickle.dumps(val)  # , pickle.HIGHEST_PROTOCOL)
                    xml.append(u'<d n="%s" t="pickle">%s</d>' %
                               (a, ucescape(valstr)))
                elif isinstance(val, Index):
                    # Indexes are serialized by reference (their id)
                    xml.append(u'<d n="%s" t="object">%s</d>' %
                               (a, escape(val.id)))
                elif a == 'query' and val:
                    # Store the query as its CQL string form
                    xml.append(u'<d n="%s" t="cql">%s</d>' %
                               (a, escape(val.toCQL())))
                else:
                    # Primitive value: record its type name for round-tripping
                    xml.append(u'<d n="%s" t="%s">' %
                               (a, srlz_typehash.get(type(val), '')))
                    if type(val) in [int, long, float, bool, type(None)]:
                        xml.append(escape(unicode(val)))
                    else:
                        xml.append(escape(val))
                    xml.append(u'</d>')
        # Serialize each item as an <item> element
        for item in self:
            xml.append(item.serialize(session, pickleOk))
        xml.append(u'</resultSet>')
        all = u''.join(xml)
        return all.encode('utf-8')
def deserialise(self, session, data):
"""Deserialize XML in ``data`` to return the populated ResultSet.
DEPRECATED by ``resultSet.deserialize(session, data)``
"""
return self.deserialize(session, data)
    def deserialize(self, session, data):
        """Deserialize XML in ``data`` to return the populated ResultSet."""
        # This is blindingly fast compared to old version!
        def value_of(elem):
            # Convert a <d> element back to a Python value using its
            # 't' (type) attribute.
            t = elem.attrib['t']
            if not elem.text:
                return elem.text
            txt = unescape(elem.text)
            if t == 'pickle':
                val = pickle.loads(txt.encode('utf-8'))
            elif t == 'None':
                val = None
            elif t == 'object':
                # dereference id
                db = session.server.get_object(session, session.database)
                val = db.get_object(session, txt)
            elif t == 'cql':
                try:
                    val = cqlParser.parse(txt)
                except:
                    raise
            elif t in dsrlz_typehash:
                # Primitive: re-encode to utf-8 bytes unless unicode wanted
                if type(txt) == unicode and t != 'unicode':
                    val = dsrlz_typehash[t](txt.encode('utf-8'))
                else:
                    val = dsrlz_typehash[t](txt)
            else:
                val = txt
            return val
        root = etree.fromstring(data)
        rsiConstructor = self.rsiConstructor
        rsi = None   # item currently being built (None while reading set attrs)
        pi = []      # accumulated proxInfo hits for the current item
        hit = []     # word positions accumulated for the current hit
        for e in root.iter(tag=etree.Element):
            e2 = e
            if e.tag == 'd':
                # Data element: attribute of the current item, or of the
                # set itself if no <item> has been seen yet
                name = e.attrib['n']
                val = value_of(e)
                if rsi:
                    setattr(rsi, name, val)
                else:
                    setattr(self, name, val)
            elif e2.tag == 'item':
                # Flush the previous item before starting a new one
                if rsi:
                    if hit:
                        pi.append(hit)
                    rsi.proxInfo = pi
                    self.append(rsi)
                rsi = rsiConstructor(session)
                pi = []
                hit = []
            elif e2.tag == 'hit':
                if hit:
                    pi.append(hit)
                    hit = []
            elif e2.tag == 'w':
                # NOTE(review): relies on attribute order of the <w>
                # element (lxml preserves document order) -- confirm it
                # matches serialize()'s e/w/o/t ordering
                hit.append([int(x) for x in e2.attrib.values()])
        # Flush the final item
        if rsi:
            if hit:
                pi.append(hit)
            rsi.proxInfo = pi
            self.append(rsi)
        return self
def append(self, item):
item.resultSet = self
item.resultSetPosition = len(self._list)
self._list.append(item)
def extend(self, itemList):
for i in itemList:
self.append(i)
def _lrAssign(self, session, others, clause, cql, db):
"""Assign Logistic Regression weights and combine items in others.
Assign Logistic Regression weights and merge items in resultSets in
others into self in a single method.
"""
if (db):
totalDocs = db.totalItems
if totalDocs == 0:
raise ValueError("0 documents in database")
else:
# Uhoh
raise NameError("Database not supplied to relevancy algorithm")
# William S Cooper proposes:
constants = [-3.7, 1.269, -0.31, 0.679, -0.0674, 0.223, 2.01]
# Ray R Larson proposes:
constants = [-3.7, 1.269, -0.31, 0.679, -0.021, 0.223, 4.01]
# Index Configuration proposes:
pm = db.get_path(session, 'protocolMap')
if not pm:
db._cacheProtocolMaps(session)
pm = db.protocolMaps.get('http://www.loc.gov/zing/srw/')
db.paths['protocolMap'] = pm
idx = pm.resolveIndex(session, clause)
if (idx):
for x in range(7):
temp = idx.get_setting(session, 'lr_constant%d' % x)
if (temp):
constants[x] = float(temp)
# Query proposes:
for m in cql.modifiers:
# Already been pinged for resolve()
if (m.type.prefixURI in self.relevanceContextSets):
if m.type.value.startswith("const"):
try:
constants[int(m.type.value[5])] = float(m.value)
except ValueError:
# Invalid literal for float()
pass
except IndexError:
# list index out of range
pass
sumLogQueryFreq = 0.0
sumQueryFreq = 0
sumIDF = 0.0
# Sort rss by length
# Each rs represents one unique word in query
for rs in others:
sumLogQueryFreq += math.log(rs.queryFreq)
sumQueryFreq += rs.queryFreq
n = len(rs)
if n:
rs.idf = math.log(totalDocs / float(n))
x2 = math.sqrt(sumQueryFreq)
# ResultSets will be sorted by item already
# Step through all concurrently
tmplist = []
recStores = {}
nors = len(others)
lens = [len(o) for o in others]
oidxs = range(1, nors)
positions = [0] * nors
all = cql.value in ['all', 'and', '=', 'prox', 'adj']
maxWeight = -1
minWeight = 9999999999
cont = 1
while cont:
items = [others[0][positions[0]]]
rspos = [0]
for o in oidxs:
try:
nitem = others[o][positions[o]]
except IndexError:
# There are no more items in this rs
continue
if nitem == items[0]:
items.append(nitem)
rspos.append(o)
elif nitem < items[0]:
if all:
# skip until equal or greater
positions[o] += 1
while (positions[o] < lens[o] and
others[o][positions[o]] < items[0]):
positions[o] += 1
else:
items = [nitem]
rspos = [o]
for r in rspos:
positions[r] += 1
while others and positions[0] == len(others[0]) - 1:
others.pop(0)
positions.pop(0)
if not others:
cont = 0
if all and len(items) < nors:
continue
# sumLogDAF = sum(map(math.log, [x.occurences for x in items]))
sumLogDAF = sum([math.log(x)
for x
in [y.occurences
for y
in items
]
])
sumIdx = sum([x.resultSet.idf for x in items])
x1 = sumLogQueryFreq / float(n)
x3 = sumLogDAF / float(n)
x5 = sumIDF / float(n)
x6 = math.log(float(n))
# FIXME: item undefined
try:
recStore = recStores[item.recordStore]
except KeyError:
db = session.server.get_object(session, session.database)
recStore = db.get_object(session, item.recordStore)
recStores[item.recordStore] = recStore
doclen = recStore.fetch_recordMetadata(session,
item.id,
'wordCount')
x4 = math.sqrt(doclen)
logodds = (constants[0] +
(constants[1] * x1) +
(constants[2] * x2) +
(constants[3] * x3) +
(constants[4] * x4) +
(constants[5] * x5) +
(constants[6] * x6)
)
item.weight = 0.75 * (math.exp(logodds) / (1 + math.exp(logodds)))
tmplist.append(item)
if item.weight > maxWeight:
maxWeight = item.weight
elif item.weight < minWeight:
minWeight = item.weight
self._list = tmplist
self.minWeight = minWeight
self.maxWeight = maxWeight
self.relevancy = 1
return 1
    def _coriAssign(self, session, others, clause, cql, db):
        """Assign CORI weighting to each item in each resultSet in others."""
        if (db):
            totalDocs = float(db.totalItems)
            avgSize = float(db.meanWordCount)
            if not totalDocs or not avgSize:
                raise ValueError("0 documents in database")
        else:
            raise NameError("Database not supplied to relevancy algorithm")
        # Per-recordStore average document sizes requested?
        rsizes = clause.relation['recstoresizes']
        if not rsizes:
            rsizes = self.recordStoreSizes
        recStoreSizes = {}
        recStores = {}
        for rs in others:
            matches = float(len(rs))
            if not matches:
                # Empty set: mark weight range as unset and skip
                rs.minWeight = 1.0
                rs.maxWeight = -1.0
                continue
            # Scaled inverse document frequency component
            I = (math.log((totalDocs + 0.5) / matches) /
                 math.log(totalDocs + 1.0)
                 )
            rs.minWeight = 1000000.0
            rs.maxWeight = -1.0
            for item in rs:
                df = float(item.occurences)
                # Cache recordStore objects across items
                recStore = recStores.get(item.recordStore, None)
                if not recStore:
                    recStore = db.get_object(session, item.recordStore)
                    recStores[item.recordStore] = recStore
                size = recStore.fetch_recordMetadata(session,
                                                     item.id,
                                                     'wordCount')
                if rsizes:
                    avgSize = recStore.meanWordCount
                if size is None:
                    # Record deleted? Assume average size
                    size = avgSize
                # Term frequency component, damped by document length
                T = df / (df + 50.0 + ((150.0 * size) / avgSize))
                item.weight = 0.4 + (0.6 * T * I)
                if item.weight > rs.maxWeight:
                    rs.maxWeight = item.weight
                if item.weight < rs.minWeight:
                    rs.minWeight = item.weight
        # 0: items not merged here; combine() must still merge the sets
        return 0
def _tfidfAssign(self, session, others, clause, cql, db):
"""Assign TF-IDF weighting to each item in each resultSet in others."""
# each rs in others represents records matching a single term
# w(i,j) = tf(i,j) * (log ( N / df(i)))
if (db):
totalDocs = float(db.totalItems)
if not totalDocs:
raise ValueError("0 documents in database")
else:
raise NameError("Database not supplied to relevancy algorithm")
for rs in others:
matches = float(len(rs))
rs.minWeight = 10000000.0
rs.maxWeight = -1.0
for item in rs:
weight = item.occurences * math.log(totalDocs / matches)
item.weight = weight
if rs.maxWeight < weight:
rs.maxWeight = weight
if rs.minWeight > weight:
rs.minWeight = weight
return 0
    def _okapiAssign(self, session, others, clause, cql, db):
        """Assign Okapi BM-25 weighting to items in resultSets in others."""
        if (db):
            totalDocs = float(db.totalItems)
            avgSize = float(db.meanWordCount)
            if not totalDocs or not avgSize:
                raise ValueError("0 documents in database")
        else:
            raise NameError("Database not supplied to relevancy algorithm")
        # Tuning parameters [b, k1, k3]
        # default
        constants = [0.75, 1.5, 1.5]
        # Index Configuration proposes:
        pm = db.get_path(session, 'protocolMap')
        if not pm:
            db._cacheProtocolMaps(session)
            pm = db.protocolMaps.get('http://www.loc.gov/zing/srw/')
            db.paths['protocolMap'] = pm
        idx = pm.resolveIndex(session, clause)
        if (idx):
            for i, const in enumerate(['b', 'k1', 'k3']):
                temp = idx.get_setting(session, 'okapi_constant_' + const)
                if (temp):
                    constants[i] = float(temp)
        # Query proposes:
        for m in cql.modifiers:
            # Already been pinged for resolve()
            if (m.type.prefixURI in self.relevanceContextSets):
                if m.type.value.startswith("const"):
                    try:
                        constants[int(m.type.value[5])] = float(m.value)
                    except ValueError:
                        # Invalid literal for float()
                        pass
                    except IndexError:
                        # list index out of range
                        pass
        # Per-recordStore average document sizes requested?
        rsizes = clause.relation['recstoresizes']
        if not rsizes:
            rsizes = self.recordStoreSizes
        recStoreSizes = {}
        recStores = {}
        b, k1, k3 = constants
        for rs in others:
            matches = float(len(rs))
            if not matches:
                rs.minWeight = 1.0
                rs.maxWeight = -1.0
                continue
            # Inverse document frequency for this term
            idf = math.log(totalDocs / matches)
            # idf = max(0.0,
            #           math.log(totalDocs - matches + 0.5 / matches + 0.5)
            #           )  # give it a floor of 0
            # Query term weight (k3-scaled query frequency)
            qtw = ((k3 + 1) * rs.queryFreq) / (k3 + rs.queryFreq)
            rs.minWeight = 1000000.0
            rs.maxWeight = -1.0
            for item in rs:
                docFreq = float(item.occurences)
                # Cache recordStore objects across items
                recStore = recStores.get(item.recordStore, None)
                if recStore is None:
                    recStore = db.get_object(session, item.recordStore)
                    recStores[item.recordStore] = recStore
                size = recStore.fetch_recordMetadata(session,
                                                     item.id,
                                                     'wordCount')
                if rsizes:
                    avgSize = recStore.meanWordCount
                if size is None:
                    # Record deleted? Assume average size
                    size = avgSize
                # Length-normalized term frequency component
                T = (((k1 + 1) * docFreq) /
                     ((k1 * ((1 - b) + b * (size / avgSize))) + docFreq)
                     )
                item.weight = idf * T * qtw
                if item.weight > rs.maxWeight:
                    rs.maxWeight = item.weight
                if item.weight < rs.minWeight:
                    rs.minWeight = item.weight
        # 0: combine() must still merge the sets itself
        return 0
    def combine(self, session, others, clause, db=None):
        """Combine resultSets in others into self and return.

        clause is the CQL clause (boolean Triple or Relation) governing
        how the sets merge; db is needed for relevance algorithms.
        NOTE(review): 'others' is consumed destructively (sorted and
        popped) during the merge.
        """
        try:
            cql = clause.boolean
        except AttributeError:
            cql = clause.relation
        self.query = clause
        # 'all': conjunctive operators -- every set must contain the item
        all = cql.value in ['all', 'and', '=', 'prox', 'adj', 'window']
        # XXX: To Configuration. How?
        relSets = self.relevanceContextSets
        cqlSets = ["info:srw/cql-context-set/1/cql-v1.1",
                   "info:srw/cql-context-set/1/cql-v1.2"]
        relevancy = 0
        pi = 0
        algorithm = "cori"
        combine = "mean"    # NOTE(review): local shadows the method name
        modType = ""
        # Inspect query modifiers for relevance/proxinfo requests
        for m in cql.modifiers:
            m.type.parent = clause
            m.type.resolvePrefix()
            if (m.type.prefixURI in relSets):
                # Relevancy info
                relevancy = 1
                if m.type.value == "algorithm":
                    algorithm = m.value.lower()
                elif m.type.value == "combine":
                    combine = m.value.lower()
            elif (m.type.prefixURI in cqlSets and m.type.value == "relevant"):
                # Generic 'relevancy please' request
                relevancy = 1
            elif m.type.value == 'proxinfo':
                pi = 1
        # Check if any others are relevance ranked already and preserve
        if (not relevancy):
            for x in others:
                if (x.relevancy):
                    relevancy = 1
                    break
        # Sort result sets by length (shortest first for conjunctions)
        if not cql.value in ['not', 'prox']:
            others.sort(key=lambda x: len(x), reverse=not all)
        if (relevancy):
            self.relevancy = 1
            if (isinstance(cql, cqlParser.Relation)):
                fname = "_%sAssign" % algorithm
                if (hasattr(self, fname)):
                    fn = getattr(self, fname)
                else:
                    # We /could/ self inspect to say what relevance
                    # algorithms are supported...
                    raise NotImplementedError("Relevance algorithm '{0}' not "
                                              "implemented".format(algorithm))
                finish = fn(session, others, clause, cql, db)
                if finish:
                    # Algorithm already merged the sets (e.g. _lrAssign)
                    return self
        # Single-set shortcut: nothing to merge
        if len(others) == 1 and len(others[0].queryPositions) < 2:
            if relevancy:
                # Just adding relevance to items?
                others[0].relevancy = 1
            if pi:
                # Tag each proxInfo hit with the set's termid
                o = others[0]
                for i in o:
                    for pii in i.proxInfo:
                        [x.append(o.termid) for x in pii]
            return others[0]
        if relevancy:
            maxWeight = -1
            minWeight = 9999999999
            fname = "_%sWeights" % combine
            if (hasattr(self, fname)):
                fn = getattr(self, fname)
            else:
                raise NotImplementedError
        tmplist = []
        oidxs = range(1, len(others))
        lens = [len(x) for x in others]
        nors = len(others)
        # Fast escapes
        if all and 0 in lens:
            return self
        elif sum(lens) == 0:
            return self
        elif nors == 2 and cql.value in ['or', 'any'] and 0 in lens:
            # A or (empty) == A
            return others[int(lens[0] == 0)]
        positions = [0] * nors
        # Map distance comparison operator to acceptable cmp() results
        cmpHash = {'<': [-1],
                   '<=': [-1, 0],
                   '=': [0],
                   '>=': [0, 1],
                   '>': [1]
                   }
        distance = 1
        unit = "word"
        comparison = "="
        ordered = 0
        if (cql.value in ['prox', 'window'] and cql.modifiers):
            if (cql['unit']):
                unit = cql['unit'].value
            if (cql['distance']):
                distance = int(cql['distance'].value)
                comparison = cql['distance'].comparison
            if cql['ordered']:
                ordered = 1
        else:
            # for adj/=
            ordered = 1
        for o in others:
            self.termIdHash[o.termid] = o.queryTerm
            if o.fromStore:
                # Re-sort before combining as likely out of order
                if o[0].numericId is not None:
                    o.order(session, 'numericId')
                else:
                    o.order(session, 'id')
        chitem = cmpHash[comparison]
        if unit == "word":
            proxtype = 1
        elif unit == "element" and distance == 0 and comparison == "=":
            proxtype = 2
        elif unit == "character":
            # Can do this with offsets :)
            proxtype = 3
        else:
            raise NotImplementedError()
        hasGetItemList = [hasattr(o, 'get_item') for o in others]
        cont = 1
        # Step through all sorted sets concurrently, merging matching items
        while cont:
            items = [others[0][positions[0]]]
            rspos = [0]
            for o in oidxs:
                if o != -1:
                    if hasGetItemList[o]:
                        # e.g. BitmapResultSet: direct membership test
                        nitem = others[o].get_item(items[0])
                        if not nitem:
                            continue
                    else:
                        try:
                            nitem = others[o][positions[o]]
                        except IndexError:
                            # Set exhausted; mark its slot disabled
                            # (oidxs holds value o at index o-1)
                            oidxs[o - 1] = -1
                            continue
                        if nitem < items[0]:
                            if all or cql.value == 'not':
                                # skip until equal or greater
                                while True:
                                    positions[o] += 1
                                    if (
                                        positions[o] >= lens[o] or
                                        others[o][positions[o]] >= items[0]
                                    ):
                                        break
                                if positions[o] != lens[o]:
                                    nitem = others[o][positions[o]]
                            else:
                                # Disjunction: restart group at lower item
                                items = [nitem]
                                rspos = [o]
                                continue
                    if nitem == items[0]:
                        items.append(nitem)
                        rspos.append(o)
            for r in rspos:
                positions[r] += 1
            # Drop leading sets that are exhausted
            while others and positions[0] > len(others[0]) - 1:
                others.pop(0)
                positions.pop(0)
                lens.pop(0)
            if (
                not others or
                ((cql.value == 'not' or all) and len(others) != nors)
            ):
                cont = 0
            if (all and len(items) < nors):
                # Conjunction unsatisfied for this record
                continue
            elif cql.value == 'not' and len(items) != 1:
                # Item present in the excluded set
                continue
            elif cql.value in ["prox", 'adj', '=', 'window']:
                # proxInfo is hash of (docid, recStore) to list of locations
                # in record
                # Sort items by query position. Repeat set at each posn
                if cql.value != "prox":
                    newItemHash = {}
                    rsiConstructor = self.rsiConstructor
                    for i in items:
                        i.queryTerm = i.resultSet.queryTerm
                        i.queryPositions = i.resultSet.queryPositions
                        newItemHash[i.queryPositions[0]] = i
                        if len(i.queryPositions) > 1:
                            # Term occurs at several query positions:
                            # clone the item for each extra position
                            for qpi in i.queryPositions[1:]:
                                # construct new rsi
                                newi = rsiConstructor(session,
                                                      id=i.id,
                                                      recStore=i.recordStore,
                                                      occs=i.occurences,
                                                      database=i.database,
                                                      weight=i.weight,
                                                      resultSet=i.resultSet
                                                      )
                                newi.queryPositions = [qpi]
                                newi.queryTerm = i.queryTerm
                                newi.proxInfo = i.proxInfo
                                newItemHash[qpi] = newi
                    ni = newItemHash.items()
                    ni.sort()
                    newitems = [x[1] for x in ni]
                    items = newitems[:]
                else:
                    # Create a copy of items
                    newitems = items[:]
                # Walk term pairs left-to-right, keeping only locations
                # that satisfy the proximity constraint
                litem = items.pop(0)
                ltermid = litem.resultSet.termid
                nomatch = 0
                while len(items):
                    ritem = items.pop(0)
                    rtermid = ritem.resultSet.termid
                    matchlocs = []
                    for rpiFull in ritem.proxInfo:
                        rpi = list(rpiFull[-1])
                        (relem, rwpos) = rpi[:2]
                        for lpiFull in litem.proxInfo:
                            lpi = list(lpiFull[-1])
                            (lelem, lwpos) = lpi[:2]
                            if lelem == relem:
                                if proxtype == 2:
                                    # same-element proximity: keep both
                                    d = lpiFull[:]
                                    for r in rpiFull:
                                        if d[-1] != r:
                                            r.append(rtermid)
                                            d.append(r)
                                    matchlocs.append(d)
                                else:
                                    if proxtype == 3:
                                        # character distance
                                        try:
                                            loff = lpi[2]
                                            roff = rpi[2]
                                        except IndexError:
                                            # no offset in index
                                            msg = ("Cannot do character "
                                                   "proximity without offset "
                                                   "information")
                                            raise ConfigFileException(msg)
                                        piDistance = roff - loff
                                    else:
                                        # word proximity
                                        piDistance = rwpos - lwpos
                                    if ordered and piDistance < 0:
                                        # B is before A
                                        pass
                                    else:
                                        piDistance = abs(piDistance)
                                        c = cmp(piDistance, distance)
                                        if (c in chitem):
                                            # copy as we're in two deep
                                            anyOkay = 0
                                            d = lpiFull[:]
                                            # Check we're not the same word
                                            for r in rpiFull:
                                                if (
                                                    cql.value == 'window' and
                                                    len(d) > 1
                                                ):
                                                    wokay = 1
                                                    # Check that ALL in
                                                    # distance
                                                    for wd in d:
                                                        if proxtype == 3:
                                                            wpiDistance = (
                                                                roff - wd[2]
                                                            )
                                                        else:
                                                            wpiDistance = (
                                                                rwpos - wd[1]
                                                            )
                                                        if (
                                                            ordered and
                                                            wpiDistance < 0
                                                        ):
                                                            wokay = 0
                                                            break
                                                        else:
                                                            wpiDistance = abs(
                                                                wpiDistance
                                                            )
                                                            c = cmp(
                                                                wpiDistance,
                                                                distance
                                                            )
                                                            if not c in chitem:
                                                                wokay = 0
                                                                break
                                                    anyOkay = 1
                                                    if wokay and d[-1] != r:
                                                        r.append(rtermid)
                                                        d.append(r)
                                                else:
                                                    anyOkay = 1
                                                    r.append(rtermid)
                                                    if d[-1] != r:
                                                        d.append(r)
                                            if anyOkay:
                                                matchlocs.append(d)
                    if matchlocs:
                        ritem.proxInfo = matchlocs
                        litem = ritem
                    else:
                        # no match, break to next set of items
                        nomatch = 1
                        break
                if nomatch:
                    continue
                # Tag the leftmost term's locations with its termid
                for m in matchlocs:
                    m[0].append(ltermid)
                litem.proxInfo = matchlocs
                items = [litem]
            # Do stuff on items to reduce to single representative
            if relevancy:
                item = fn(items, nors)
                if item.weight > maxWeight:
                    maxWeight = item.weight
                if item.weight < minWeight:
                    minWeight = item.weight
            else:
                item = items[0]
                if pi and cql.value != "window":
                    # copy proxInfo around
                    if items[0].resultSet.termid != -1:
                        for pii in items[0].proxInfo:
                            for x in pii:
                                x.append(items[0].resultSet.termid)
                    for o in items[1:]:
                        if o.resultSet.termid != -1:
                            for pii in o.proxInfo:
                                for x in pii:
                                    x.append(o.resultSet.termid)
                        item.proxInfo.extend(o.proxInfo)
            item.resultSet = self
            tmplist.append(item)
        self._list = tmplist
        if relevancy:
            self.relevancy = 1
            self.minWeight = minWeight
            self.maxWeight = maxWeight
        return self
    def order(self, session, spec,
              ascending=None, missing=None, case=None, accents=None):
        """Re-order based on the given specification and arguments.

        :param spec: specification on which to order the ResultSet
        :type spec: Index, xpath, Workflow, attribute of ResultSetItem
        :param ascending: sort in ascending order
        :type ascending: True, False or None (best guess)
        :param missing: behaviour when sort value is missing
        :type missing: integer (-1: low, 0: omit, 1: high) or string (default)
        :param case: case sensitive? (assuming spec permits it)
        :type case: True or False
        :param accents: exclude accented characters
        :type accents: True or False
        :rtype: None

        Not handling yet:
        * locale=VALUE
        * unicodeCollate[=VALUE]
        Clause is a CQL clause with sort attributes on the relation
        """
        l = self._list
        if not l:
            # don't try to sort empty set
            return
        # Build tmplist of (sortKey, item) pairs, depending on spec type
        if (
            isinstance(spec, Index) and
            spec.get_setting(session, 'sortStore')
        ):
            # Check pre-processed db
            tmplist = [(spec.fetch_sortValue(session, x, ascending), x)
                       for x
                       in l
                       ]
        elif isinstance(spec, Index) and spec.get_setting(session, 'vectors'):
            # This assumes termid is ordered properly
            # if it isn't write a normalizer, see pyuca normalizer
            miss = lambda x: x[2][0][0] if x[2] else None
            tmplist = [(miss(spec.fetch_vector(session, x)), x)
                       for x
                       in l
                       ]
        elif isinstance(spec, Index):
            # Extract data as per indexing, MUCH slower
            recs = []
            storeHash = {}
            for r in l:
                store = r.recordStore
                # NOTE(review): dict.get's default argument is always
                # evaluated, so get_object runs even on a cache hit
                o = storeHash.get(store, spec.get_object(session, store))
                storeHash[store] = o
                recs.append(o.fetch_record(session, r.id))
            tmplist = [(spec.extract_data(session, recs[x]), l[x])
                       for x
                       in range(len(l))
                       ]
        elif isinstance(spec, Workflow):
            # Process a workflow on records
            tmplist = []
            for r in l:
                rec = r.fetch_record(session)
                tmplist.append((spec.process(session, rec), r))
        elif (isinstance(spec, basestring) and hasattr(self[0], spec)):
            # Sort by attribute of item
            tmplist = [(getattr(x, spec), x) for x in l]
            if ascending is None:
                # Check if default sort order should be ascending
                # Allow for str vs unicode
                if str(spec) in ['id', 'numericId']:
                    ascending = True
                else:
                    ascending = False
        elif isinstance(spec, basestring):
            # XPath
            tmplist = []
            for r in l:
                rec = r.fetch_record(session)
                tmplist.append((rec.process_xpath(session, spec), r))
        else:
            # Don't know what?
            raise NotImplementedError
        if missing is not None:
            if missing == -1:
                # Sort low
                val = '\x00'
            elif missing == 0:
                # Omit items with a falsy sort key
                tmplist = [x for x in tmplist if x[0]]
            elif missing == 1:
                # Sort high
                val = '\xff'
            else:
                # Use the supplied value for missing keys
                val = missing
            # NOTE(review): when missing == 0, 'val' is unbound, but fill()
            # never evaluates it because falsy keys were filtered out above
            fill = lambda x: x if x else val
            tmplist = [(fill(x[0]), x[1]) for x in tmplist]
        if not case and case is not None:
            # Case-insensitive: fold keys to lower case
            tmplist = [(x[0].lower(), x[1]) for x in tmplist]
        if not accents and accents is not None:
            # Strip diacritics from the sort keys before comparing
            db = session.server.get_object(session, session.database)
            n = db.get_object(session, 'DiacriticNormalizer')
            unaccent = n.process_string
            tmplist = [(unaccent(session, x[0]), x[1]) for x in tmplist]
        if ascending is None:
            # If ascending not set, assume ascending unless over-ridden
            # due to spec later...
            ascending = True
        tmplist.sort(reverse=not(ascending))
        self._list = [x for (key, x) in tmplist]
def reverse(self, session):
self._list.reverse()
def scale_weights(self):
minw = self.minWeight
if self.maxWeight != minw:
r = 1 / (self.maxWeight - minw)
else:
r = 1
# faster than equivalent list comprehension!
for rsi in self._list:
rsi.scaledWeight = (rsi.weight - minw) * r
class SimpleResultSetItem(ResultSetItem):
    """Pointer to a single record (recordStore + id) with ranking state."""
    # Class-level defaults; __init__ rebinds these per instance
    id = 0
    numericId = None
    recordStore = ""
    database = ""
    occurences = 0          # NOTE: spelling 'occurences' is the API name
    weight = 0.5            # 0.5 = unweighted default
    scaledWeight = 0.5
    diagnostic = None
    proxInfo = []
    attributesToSerialize = []
    def __init__(self, session, id=0, recStore="", occs=0, database="",
                 diagnostic=None, weight=0.5, resultSet=None, numeric=None):
        """Create a pointer to record ``id`` in store ``recStore``.

        occs: occurrences of the matched term in the record
        weight: relevance weight (0.5 = unweighted default)
        numeric: optional numeric form of the id, for faster sorting
        """
        # NOTE(review): the 'diagnostic' parameter is accepted but never
        # stored here; the class-level default remains in effect
        # (attributeName, defaultValue) pairs written out by serialize()
        self.attributesToSerialize = [('id', 0),
                                      ('numericId', None),
                                      ('recordStore', ''),
                                      ('database', ''),
                                      ('occurences', 0),
                                      ('weight', 0.5),
                                      ('scaledWeight', 0.5)
                                      ]
        self.id = id
        self.recordStore = recStore
        self.occurences = occs
        self.weight = weight
        self.scaledWeight = 0.5
        self.database = database
        self.resultSet = resultSet
        self.proxInfo = []
        self.numericId = numeric
    def serialize(self, session, pickleOk=1):
        """Serialize this item to an XML unicode string (<item>...</item>)."""
        xml = [u'<item>']
        itemattrs = self.attributesToSerialize
        for (a, deft) in itemattrs:
            val = getattr(self, a)
            # Only serialize attributes that differ from their defaults
            if val != deft:
                if type(val) in [dict, list, tuple]:
                    if pickleOk:
                        # Use latest version of pickle protocol to deal with
                        # new-style classes, unicode etc.
                        # valstr = pickle.dumps(val, pickle.HIGHEST_PROTOCOL)
                        valstr = pickle.dumps(val)
                        escaped_valstr = ucescape(valstr)
                        xml.append(u'<d n="{0}" t="pickle">{1}</d>'
                                   u''.format(a, escaped_valstr))
                else:
                    try:
                        valstr = unicode(val, 'utf-8')
                    except TypeError:
                        # Not a byte string (e.g. int/float/unicode)
                        valstr = unicode(val)
                    escaped_valstr = escape(valstr)
                    xml.append(u'<d n="{0}" t="{1}">{2}</d>'
                               u''.format(a,
                                          srlz_typehash.get(type(val), ''),
                                          escaped_valstr)
                               )
        val = getattr(self, 'proxInfo')
        if val:
            # Serialize to XML
            xml.append(u'<proxInfo>')
            for hit in val:
                xml.append(u'<hit>')
                for w in hit:
                    # w is [element, wordPos(, charOffset(, termid))]
                    if len(w) == 4:
                        xml.append(u'<w e="%s" w="%s" o="%s" t="%s"/>' %
                                   tuple(w))
                    elif len(w) == 3:
                        xml.append(u'<w e="%s" w="%s" o="%s"/>' %
                                   tuple(w))
                    else:
                        try:
                            xml.append(u'<w e="%s" w="%s"/>' %
                                       tuple(w))
                        except:
                            # Should really error!
                            xml.append(u'<w e="%s" w="%s" o="%s" t="%s"/>' %
                                       tuple(w[:4]))
                xml.append(u'</hit>')
            xml.append(u'</proxInfo>')
        xml.append(u'</item>')
        return u''.join(xml)
def fetch_record(self, session):
# Return record from store
if (session.server):
# db = session.server.get_object(session, self.database)
db = session.server.get_object(session, session.database)
recStore = db.get_object(session, self.recordStore)
rec = recStore.fetch_record(session, self.id)
rec.resultSetItem = self
return rec
def __eq__(self, other):
try:
return (self.id == other.id and
self.recordStore == other.recordStore)
except:
# Not comparing two RSIs
return False
def __str__(self):
return "%s/%s" % (self.recordStore, self.id)
def __repr__(self):
return "Ptr:%s/%s" % (self.recordStore, self.id)
    def __cmp__(self, other):
        # Default sort by docid (Python 2 three-way comparison)
        if self.numericId is not None:
            if other.numericId is not None:
                oid = other.numericId
            else:
                # NOTE(review): falls back to comparing a numeric id with
                # a (possibly string) id -- assumes sets are consistent
                oid = other.id
            c = cmp(self.numericId, oid)
        else:
            c = cmp(self.id, other.id)
        if not c:
            # Same id: break ties on recordStore name
            return cmp(self.recordStore, other.recordStore)
        else:
            return c
def __hash__(self):
# Hash of recordstore + id
return hash(str(self))
class BitmapResultSet(ResultSet):
    """ResultSet backed by a bitfield: one bit per record id.

    Compact representation for large boolean result sets from a single
    recordStore; carries no per-item weights or proximity information.
    """
    bitfield = None
    currItems = None
    recordStore = None
    fromStore = 0
    relevancy = 0
    termid = -1
    totalOccs = 0
    totalRecs = 0
    id = ""
    index = None
    queryTerm = ""
    queryFreq = 0
    queryPositions = []
    relevancy = 0    # NOTE(review): duplicate of the assignment above
    maxWeight = 0
    minWeight = 0
def __init__(self, session, data=0, recordStore=None):
if isinstance(data, SimpleBitfield):
self.bitfield = data
else:
self.bitfield = SimpleBitfield(data)
self.currItems = None
self.recordStore = recordStore
self.relevancy = 0
def __getitem__(self, k):
if self.currItems is None:
self.currItems = self.bitfield.trueItems()
return SimpleResultSetItem(None,
self.currItems[k],
self.recordStore,
1)
def __len__(self):
return self.bitfield.lenTrueItems()
def serialise(self, session):
return self.serialize(session)
def serialize(self, session):
return str(self.bitfield)
def deserialise(self, data):
return self.deserialize(data)
def deserialize(self, data):
self.bitfield = SimpleBitfield(data)
def get_item(self, item):
try:
if self.bitfield[item.id]:
return item
except IndexError:
pass
return None
    def combine(self, session, others, clause, db=None):
        """Combine the bitmap resultSets in ``others`` into self and return."""
        if (isinstance(clause, cqlParser.Triple)):
            cql = clause.boolean
        else:
            cql = clause.relation
        v = cql.value
        # Term-level relations: nothing to merge unless multiple sets
        if v in ['=', 'exact', 'prox']:
            if len(others) == 1:
                return others[0]
            else:
                raise NotImplementedError()
        # Check if all are bitmaps
        allbits = 1
        for o in others:
            if not hasattr(o, 'bitfield'):
                allbits = 0
                break
        if allbits:
            # NOTE(review): these calls appear to mutate others[0]'s
            # bitfield in place -- confirm against SimpleBitfield's API
            if (v in ['all', 'and']):
                s = others[0].bitfield
                for o in others[1:]:
                    s.intersection(o.bitfield)
            elif (v in ['any', 'or', '>', '>=', '<', '<=']):
                s = others[0].bitfield
                for o in others[1:]:
                    s.union(o.bitfield)
            elif (v == 'not'):
                s = others[0].bitfield
                for o in others[1:]:
                    s.difference(o.bitfield)
            else:
                raise NotImplementedError()
            self.bitfield = s
        else:
            # XXX Merging Bitmap with non bitmap
            pass
        return self
def order(self, spec):
# Reorder a bitmap?!
raise NotImplementedError()
def retrieve(self, numReq, start, cache=0):
end = min(start + numReq + 1, len(self))
recs = []
# XXX This should cache server, db and resultSet
for r in range(start, end):
recs.append(self[r].fetch_record(session))
return recs