"""Source code for cheshire3.resultSet."""

import sys
import types
import math
import operator
import time
import cStringIO as StringIO
try:
    import cPickle as pickle
except ImportError:
    import pickle

from itertools import combinations
from xml.sax.saxutils import escape, unescape
from lxml import etree

from cheshire3.baseObjects import ResultSet, ResultSetItem, Index, Workflow
from cheshire3.utils import SimpleBitfield
from cheshire3 import cqlParser


def ucescape(data):
    return unicode(escape(data), 'latin-1')


srlz_typehash = {int: 'int',
                 long: 'long',
                 str: 'str',
                 unicode: 'unicode',
                 bool: 'bool',
                 type(None): 'None',
                 float: 'float'
                 }

dsrlz_typehash = {}
for k, v in srlz_typehash.iteritems():
    dsrlz_typehash[v] = k
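
# The two hashes above map Python types to and from the 't' attribute used
# when a ResultSet or ResultSetItem serializes itself to XML later in this
# module, e.g. (hypothetical field values):
#
#     <d n="totalRecs" t="int">42</d>
#     <d n="queryTerm" t="unicode">bread</d>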


class RankedResultSet(ResultSet):

    def _sumWeights(self, items, n):
        """Sum the values."""
        item = items[0]
        item.weight = sum([x.weight for x in items])
        return item
        #item.weight = sum([x.weight for x in items if (x.weight != 0.5)])

    def _meanWeights(self, items, n):
        """Mean average the values."""
        item = items[0]
        item.weight = sum([x.weight for x in items])
        item.weight = item.weight / n
        return item
        #trueWeightedItems = [x.weight for x in items if (x.weight != 0.5)]
        #item.weight = sum(trueWeightedItems)
        #item.weight = item.weight / len(trueWeightedItems)

    def _normWeights(self, items, n):
        """Normalize the values and average them."""
        for i in items:
            i.weight = (i.weight *
                        (i.resultSet.minWeight / i.resultSet.maxWeight)
                        )
        return self._meanWeights(items, n)

    def _cmbzWeights(self, a, b):
        """Normalise and rescale values."""
        a.weight = a.weight * (self.minWeight / self.maxWeight)
        if b:
            b.weight = b.weight * (self.minWeight / self.maxWeight)
            a.weight = (a.weight + b.weight) * 2.0
        else:
            a.weight = a.weight / 2.0

    def _nprvWeights(self, a, b):
        """Normalise values and privilege high ranked documents."""
        a.weight = a.weight * (self.minWeight / self.maxWeight)
        if b:
            b.weight = b.weight * (self.minWeight / self.maxWeight)
            a.weight = (a.weight + b.weight) * 2.0
        else:
            # Leave high ranking ones high
            rlen = len(a.resultSet._list)
            if (
                    (rlen > 150 and a.resultSetPosition > 100) or
                    (rlen < 150 and a.resultSetPosition > rlen / 2)):
                a.weight = a.weight / 2.0

    def _pivotWeights(self, a, b):
        """Pivot weight of components if the document also occurs in the set.

        Determine which item is component set, and which item is from document
        set. If the component's parent document's id is the same as the one in
        the full document list, then adjust.

        Normalize min/max as above
        Pivot default is 0.7, but allow override
        (Pivot * documentScore) + ((1-pivot) * componentScore)

        If not in the list then just ((1-pivot) * componentScore)
        """
        raise NotImplementedError

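# Illustrative numbers (not part of the Cheshire3 sources): the combine
# strategies above reduce the per-term weights gathered for one record to a
# single score.  _sumWeights simply adds them, while _meanWeights divides by
# the number of result sets being merged, so a record matching only some of
# the query terms is penalised.
#
#     weights = [0.8, 0.3, 0.5]        # one weight per matched query term
#     n = 3                            # number of result sets being merged
#     sum(weights)                     # 1.6   (cf. _sumWeights)
#     sum(weights) / n                 # ~0.53 (cf. _meanWeights)
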

[docs]class SimpleResultSet(RankedResultSet): _list = [] id = "" termid = -1 totalOccs = 0 totalRecs = 0 expires = 0 index = None queryTerm = "" queryFreq = 0 queryPositions = [] queryTime = 0 query = None relevancy = 0 maxWeight = 0 minWeight = 0 termWeight = 0.0 recordStore = "" rsiConstructor = None attributesToSerialize = [] recordStoreSizes = 0 termIdHash = {} fromStore = 0 def __init__(self, session, data=None, id="", recordStore=""): self.rsiConstructor = SimpleResultSetItem self.attributesToSerialize = [('id', ''), ('termid', -1), ('totalOccs', 0), ('totalRecs', 0), ('expires', 0), ('queryTerm', ''), ('queryFreq', 0), ('queryPositions', []), ('relevancy', 0), ('maxWeight', 0), ('minWeight', 0), ('termWeight', 0.0), ('recordStore', ''), ('recordStoreSizes', 0), ('index', None), ('queryTime', 0.0), ('query', '') ] if data is not None: self._list = list(data) else: self._list = [] self.id = id self.recordStore = recordStore self.relevanceContextSets = { "info:srw/cql-context-set/2/relevance-1.0": 1.0, "info:srw/cql-context-set/2/relevance-1.1": 1.1, "info:srw/cql-context-set/2/relevance-1.2": 1.2 } self.termid = -1 self.totalOccs = 0 self.totalRecs = 0 self.expires = 0 self.index = None self.queryTerm = "" self.queryFreq = 0 self.queryPositions = [] self.queryTime = 0.0 self.query = None self.relevancy = 0 self.maxWeight = 0 self.minWeight = 0 self.termWeight = 0.0 self.recordStoreSizes = 0 self.termIdHash = {} self.fromStore = 0 def __getitem__(self, k): return self._list[k] def __len__(self): return len(self._list) def fromList(self, data): self._list = data def serialise(self, session, pickleOk=1): """Serialize and this ResultSet as XML, return a string (utf-8). DEPRECATED by ``resultSet.serialize(session, pickleOk)`` """ return self.serialize(session, pickleOk) def serialize(self, session, pickleOk=1): """Serialize and this ResultSet as XML, return a string (utf-8).""" # This is pretty fast, and generates better XML than previous xml = [u'<resultSet>'] rsetattrs = self.attributesToSerialize for (a, deft) in rsetattrs: val = getattr(self, a) if val != deft: if type(val) in [dict, list, tuple]: # Use later version of pickle protocol to deal with # new-style classes, unicode etc. valstr = pickle.dumps(val) # , pickle.HIGHEST_PROTOCOL) xml.append(u'<d n="%s" t="pickle">%s</d>' % (a, ucescape(valstr))) elif isinstance(val, Index): xml.append(u'<d n="%s" t="object">%s</d>' % (a, escape(val.id))) elif a == 'query' and val: xml.append(u'<d n="%s" t="cql">%s</d>' % (a, escape(val.toCQL()))) else: xml.append(u'<d n="%s" t="%s">' % (a, srlz_typehash.get(type(val), ''))) if type(val) in [int, long, float, bool, type(None)]: xml.append(escape(unicode(val))) else: xml.append(escape(val)) xml.append(u'</d>') for item in self: xml.append(item.serialize(session, pickleOk)) xml.append(u'</resultSet>') all = u''.join(xml) return all.encode('utf-8') def deserialise(self, session, data): """Deserialize XML in ``data`` to return the populated ResultSet. DEPRECATED by ``resultSet.deserialize(session, data)`` """ return self.deserialize(session, data) def deserialize(self, session, data): """Deserialize XML in ``data`` to return the populated ResultSet.""" # This is blindingly fast compared to old version! 
def value_of(elem): # typehash = {'int': int, # 'long': long, # 'bool': bool, # 'float': float # } t = elem.attrib['t'] if not elem.text: return elem.text txt = unescape(elem.text) if t == 'pickle': val = pickle.loads(txt.encode('utf-8')) elif t == 'None': val = None elif t == 'object': # dereference id db = session.server.get_object(session, session.database) val = db.get_object(session, txt) elif t == 'cql': try: val = cqlParser.parse(txt) except: raise elif t in dsrlz_typehash: if type(txt) == unicode and t != 'unicode': val = dsrlz_typehash[t](txt.encode('utf-8')) else: val = dsrlz_typehash[t](txt) else: val = txt return val root = etree.fromstring(data) rsiConstructor = self.rsiConstructor rsi = None pi = [] hit = [] for e in root.iter(tag=etree.Element): e2 = e if e.tag == 'd': name = e.attrib['n'] val = value_of(e) if rsi: setattr(rsi, name, val) else: setattr(self, name, val) elif e2.tag == 'item': if rsi: if hit: pi.append(hit) rsi.proxInfo = pi self.append(rsi) rsi = rsiConstructor(session) pi = [] hit = [] elif e2.tag == 'hit': if hit: pi.append(hit) hit = [] elif e2.tag == 'w': hit.append([int(x) for x in e2.attrib.values()]) if rsi: if hit: pi.append(hit) rsi.proxInfo = pi self.append(rsi) return self def append(self, item): item.resultSet = self item.resultSetPosition = len(self._list) self._list.append(item) def extend(self, itemList): for i in itemList: self.append(i) def _lrAssign(self, session, others, clause, cql, db): """Assign Logistic Regression weights and combine items in others. Assign Logistic Regression weights and merge items in resultSets in others into self in a single method. """ if (db): totalDocs = db.totalItems if totalDocs == 0: raise ValueError("0 documents in database") else: # Uhoh raise NameError("Database not supplied to relevancy algorithm") # William S Cooper proposes: constants = [-3.7, 1.269, -0.31, 0.679, -0.0674, 0.223, 2.01] # Ray R Larson proposes: constants = [-3.7, 1.269, -0.31, 0.679, -0.021, 0.223, 4.01] # Index Configuration proposes: pm = db.get_path(session, 'protocolMap') if not pm: db._cacheProtocolMaps(session) pm = db.protocolMaps.get('http://www.loc.gov/zing/srw/') db.paths['protocolMap'] = pm idx = pm.resolveIndex(session, clause) if (idx): for x in range(7): temp = idx.get_setting(session, 'lr_constant%d' % x) if (temp): constants[x] = float(temp) # Query proposes: for m in cql.modifiers: # Already been pinged for resolve() if (m.type.prefixURI in self.relevanceContextSets): if m.type.value.startswith("const"): try: constants[int(m.type.value[5])] = float(m.value) except ValueError: # Invalid literal for float() pass except IndexError: # list index out of range pass sumLogQueryFreq = 0.0 sumQueryFreq = 0 sumIDF = 0.0 # Sort rss by length # Each rs represents one unique word in query for rs in others: sumLogQueryFreq += math.log(rs.queryFreq) sumQueryFreq += rs.queryFreq n = len(rs) if n: rs.idf = math.log(totalDocs / float(n)) x2 = math.sqrt(sumQueryFreq) # ResultSets will be sorted by item already # Step through all concurrently tmplist = [] recStores = {} nors = len(others) lens = [len(o) for o in others] oidxs = range(1, nors) positions = [0] * nors all = cql.value in ['all', 'and', '=', 'prox', 'adj'] maxWeight = -1 minWeight = 9999999999 cont = 1 while cont: items = [others[0][positions[0]]] rspos = [0] for o in oidxs: try: nitem = others[o][positions[o]] except IndexError: # There are no more items in this rs continue if nitem == items[0]: items.append(nitem) rspos.append(o) elif nitem < items[0]: if all: # skip until 
equal or greater positions[o] += 1 while (positions[o] < lens[o] and others[o][positions[o]] < items[0]): positions[o] += 1 else: items = [nitem] rspos = [o] for r in rspos: positions[r] += 1 while others and positions[0] == len(others[0]) - 1: others.pop(0) positions.pop(0) if not others: cont = 0 if all and len(items) < nors: continue # sumLogDAF = sum(map(math.log, [x.occurences for x in items])) sumLogDAF = sum([math.log(x) for x in [y.occurences for y in items ] ]) sumIdx = sum([x.resultSet.idf for x in items]) x1 = sumLogQueryFreq / float(n) x3 = sumLogDAF / float(n) x5 = sumIDF / float(n) x6 = math.log(float(n)) # FIXME: item undefined try: recStore = recStores[item.recordStore] except KeyError: db = session.server.get_object(session, session.database) recStore = db.get_object(session, item.recordStore) recStores[item.recordStore] = recStore doclen = recStore.fetch_recordMetadata(session, item.id, 'wordCount') x4 = math.sqrt(doclen) logodds = (constants[0] + (constants[1] * x1) + (constants[2] * x2) + (constants[3] * x3) + (constants[4] * x4) + (constants[5] * x5) + (constants[6] * x6) ) item.weight = 0.75 * (math.exp(logodds) / (1 + math.exp(logodds))) tmplist.append(item) if item.weight > maxWeight: maxWeight = item.weight elif item.weight < minWeight: minWeight = item.weight self._list = tmplist self.minWeight = minWeight self.maxWeight = maxWeight self.relevancy = 1 return 1 def _coriAssign(self, session, others, clause, cql, db): """Assign CORI weighting to each item in each resultSet in others.""" if (db): totalDocs = float(db.totalItems) avgSize = float(db.meanWordCount) if not totalDocs or not avgSize: raise ValueError("0 documents in database") else: raise NameError("Database not supplied to relevancy algorithm") rsizes = clause.relation['recstoresizes'] if not rsizes: rsizes = self.recordStoreSizes recStoreSizes = {} recStores = {} for rs in others: matches = float(len(rs)) if not matches: rs.minWeight = 1.0 rs.maxWeight = -1.0 continue I = (math.log((totalDocs + 0.5) / matches) / math.log(totalDocs + 1.0) ) rs.minWeight = 1000000.0 rs.maxWeight = -1.0 for item in rs: df = float(item.occurences) recStore = recStores.get(item.recordStore, None) if not recStore: recStore = db.get_object(session, item.recordStore) recStores[item.recordStore] = recStore size = recStore.fetch_recordMetadata(session, item.id, 'wordCount') if rsizes: avgSize = recStore.meanWordCount if size is None: # Record deleted? 
Assume average size size = avgSize T = df / (df + 50.0 + ((150.0 * size) / avgSize)) item.weight = 0.4 + (0.6 * T * I) if item.weight > rs.maxWeight: rs.maxWeight = item.weight if item.weight < rs.minWeight: rs.minWeight = item.weight return 0 def _tfidfAssign(self, session, others, clause, cql, db): """Assign TF-IDF weighting to each item in each resultSet in others.""" # each rs in others represents records matching a single term # w(i,j) = tf(i,j) * (log ( N / df(i))) if (db): totalDocs = float(db.totalItems) if not totalDocs: raise ValueError("0 documents in database") else: raise NameError("Database not supplied to relevancy algorithm") for rs in others: matches = float(len(rs)) rs.minWeight = 10000000.0 rs.maxWeight = -1.0 for item in rs: weight = item.occurences * math.log(totalDocs / matches) item.weight = weight if rs.maxWeight < weight: rs.maxWeight = weight if rs.minWeight > weight: rs.minWeight = weight return 0 def _okapiAssign(self, session, others, clause, cql, db): """Assign Okapi BM-25 weighting to items in resultSets in others.""" if (db): totalDocs = float(db.totalItems) avgSize = float(db.meanWordCount) if not totalDocs or not avgSize: raise ValueError("0 documents in database") else: raise NameError("Database not supplied to relevancy algorithm") # Tuning parameters [b, k1, k3] # default constants = [0.75, 1.5, 1.5] # Index Configuration proposes: pm = db.get_path(session, 'protocolMap') if not pm: db._cacheProtocolMaps(session) pm = db.protocolMaps.get('http://www.loc.gov/zing/srw/') db.paths['protocolMap'] = pm idx = pm.resolveIndex(session, clause) if (idx): for i, const in enumerate(['b', 'k1', 'k3']): temp = idx.get_setting(session, 'okapi_constant_' + const) if (temp): constants[i] = float(temp) # Query proposes: for m in cql.modifiers: # Already been pinged for resolve() if (m.type.prefixURI in self.relevanceContextSets): if m.type.value.startswith("const"): try: constants[int(m.type.value[5])] = float(m.value) except ValueError: # Invalid literal for float() pass except IndexError: # list index out of range pass rsizes = clause.relation['recstoresizes'] if not rsizes: rsizes = self.recordStoreSizes recStoreSizes = {} recStores = {} b, k1, k3 = constants for rs in others: matches = float(len(rs)) if not matches: rs.minWeight = 1.0 rs.maxWeight = -1.0 continue idf = math.log(totalDocs / matches) # idf = max(0.0, # math.log(totalDocs - matches + 0.5 / matches + 0.5) # ) # give it a floor of 0 qtw = ((k3 + 1) * rs.queryFreq) / (k3 + rs.queryFreq) rs.minWeight = 1000000.0 rs.maxWeight = -1.0 for item in rs: docFreq = float(item.occurences) recStore = recStores.get(item.recordStore, None) if recStore is None: recStore = db.get_object(session, item.recordStore) recStores[item.recordStore] = recStore size = recStore.fetch_recordMetadata(session, item.id, 'wordCount') if rsizes: avgSize = recStore.meanWordCount if size is None: # Record deleted? Assume average size size = avgSize T = (((k1 + 1) * docFreq) / ((k1 * ((1 - b) + b * (size / avgSize))) + docFreq) ) item.weight = idf * T * qtw if item.weight > rs.maxWeight: rs.maxWeight = item.weight if item.weight < rs.minWeight: rs.minWeight = item.weight return 0 def combine(self, session, others, clause, db=None): """Combine resultSets in others into self and return.""" try: cql = clause.boolean except AttributeError: cql = clause.relation self.query = clause all = cql.value in ['all', 'and', '=', 'prox', 'adj', 'window'] # XXX: To Configuration. How? 
relSets = self.relevanceContextSets cqlSets = ["info:srw/cql-context-set/1/cql-v1.1", "info:srw/cql-context-set/1/cql-v1.2"] relevancy = 0 pi = 0 algorithm = "cori" combine = "mean" modType = "" for m in cql.modifiers: m.type.parent = clause m.type.resolvePrefix() if (m.type.prefixURI in relSets): # Relevancy info relevancy = 1 if m.type.value == "algorithm": algorithm = m.value.lower() elif m.type.value == "combine": combine = m.value.lower() elif (m.type.prefixURI in cqlSets and m.type.value == "relevant"): # Generic 'relevancy please' request relevancy = 1 elif m.type.value == 'proxinfo': pi = 1 # Check if any others are relevance ranked already and preserve if (not relevancy): for x in others: if (x.relevancy): relevancy = 1 break # Sort result sets by length if not cql.value in ['not', 'prox']: others.sort(key=lambda x: len(x), reverse=not all) if (relevancy): self.relevancy = 1 if (isinstance(cql, cqlParser.Relation)): fname = "_%sAssign" % algorithm if (hasattr(self, fname)): fn = getattr(self, fname) else: # We /could/ self inspect to sat what relevance algorithms # are supported... raise NotImplementedError("Relevance algorithm '{0}' not " "implemented".format(algorithm)) finish = fn(session, others, clause, cql, db) if finish: return self if len(others) == 1 and len(others[0].queryPositions) < 2: if relevancy: # Just adding relevance to items? others[0].relevancy = 1 if pi: o = others[0] for i in o: for pii in i.proxInfo: [x.append(o.termid) for x in pii] return others[0] if relevancy: maxWeight = -1 minWeight = 9999999999 fname = "_%sWeights" % combine if (hasattr(self, fname)): fn = getattr(self, fname) else: raise NotImplementedError tmplist = [] oidxs = range(1, len(others)) lens = [len(x) for x in others] nors = len(others) # Fast escapes if all and 0 in lens: return self elif sum(lens) == 0: return self elif nors == 2 and cql.value in ['or', 'any'] and 0 in lens: # A or (empty) == A return others[int(lens[0] == 0)] positions = [0] * nors cmpHash = {'<': [-1], '<=': [-1, 0], '=': [0], '>=': [0, 1], '>': [1] } distance = 1 unit = "word" comparison = "=" ordered = 0 if (cql.value in ['prox', 'window'] and cql.modifiers): if (cql['unit']): unit = cql['unit'].value if (cql['distance']): distance = int(cql['distance'].value) comparison = cql['distance'].comparison if cql['ordered']: ordered = 1 else: # for adj/= ordered = 1 for o in others: self.termIdHash[o.termid] = o.queryTerm if o.fromStore: # Re-sort before combining as likely out of order if o[0].numericId is not None: o.order(session, 'numericId') else: o.order(session, 'id') chitem = cmpHash[comparison] if unit == "word": proxtype = 1 elif unit == "element" and distance == 0 and comparison == "=": proxtype = 2 elif unit == "character": # Can do this with offsets :) proxtype = 3 else: raise NotImplementedError() hasGetItemList = [hasattr(o, 'get_item') for o in others] cont = 1 while cont: items = [others[0][positions[0]]] rspos = [0] for o in oidxs: if o != -1: if hasGetItemList[o]: nitem = others[o].get_item(items[0]) if not nitem: continue else: try: nitem = others[o][positions[o]] except IndexError: oidxs[o - 1] = -1 continue if nitem < items[0]: if all or cql.value == 'not': # skip until equal or greater while True: positions[o] += 1 if ( positions[o] >= lens[o] or others[o][positions[o]] >= items[0] ): break if positions[o] != lens[o]: nitem = others[o][positions[o]] else: items = [nitem] rspos = [o] continue if nitem == items[0]: items.append(nitem) rspos.append(o) for r in rspos: positions[r] += 1 while others and 
positions[0] > len(others[0]) - 1: others.pop(0) positions.pop(0) lens.pop(0) if ( not others or ((cql.value == 'not' or all) and len(others) != nors) ): cont = 0 if (all and len(items) < nors): continue elif cql.value == 'not' and len(items) != 1: continue elif cql.value in ["prox", 'adj', '=', 'window']: # proxInfo is hash of (docid, recStore) to list of locations in # record # Sort items by query position. Repeat set at each posn if cql.value != "prox": newItemHash = {} rsiConstructor = self.rsiConstructor for i in items: i.queryTerm = i.resultSet.queryTerm i.queryPositions = i.resultSet.queryPositions newItemHash[i.queryPositions[0]] = i if len(i.queryPositions) > 1: for qpi in i.queryPositions[1:]: # construct new rsi newi = rsiConstructor(session, id=i.id, recStore=i.recordStore, occs=i.occurences, database=i.database, weight=i.weight, resultSet=i.resultSet ) newi.queryPositions = [qpi] newi.queryTerm = i.queryTerm newi.proxInfo = i.proxInfo newItemHash[qpi] = newi ni = newItemHash.items() ni.sort() newitems = [x[1] for x in ni] items = newitems[:] else: # Create a copy of items newitems = items[:] litem = items.pop(0) ltermid = litem.resultSet.termid nomatch = 0 while len(items): ritem = items.pop(0) rtermid = ritem.resultSet.termid matchlocs = [] for rpiFull in ritem.proxInfo: rpi = list(rpiFull[-1]) (relem, rwpos) = rpi[:2] for lpiFull in litem.proxInfo: lpi = list(lpiFull[-1]) (lelem, lwpos) = lpi[:2] if lelem == relem: if proxtype == 2: d = lpiFull[:] for r in rpiFull: if d[-1] != r: r.append(rtermid) d.append(r) matchlocs.append(d) else: if proxtype == 3: # character distance try: loff = lpi[2] roff = rpi[2] except IndexError: # no offset in index msg = ("Cannot do character " "proximity without offset " "information") raise ConfigFileException(msg) piDistance = roff - loff else: # word proximity piDistance = rwpos - lwpos if ordered and piDistance < 0: # B is before A pass else: piDistance = abs(piDistance) c = cmp(piDistance, distance) if (c in chitem): # copy as we're in two deep anyOkay = 0 d = lpiFull[:] # Check we're not the same word for r in rpiFull: if ( cql.value == 'window' and len(d) > 1 ): wokay = 1 # Check that ALL in # distance for wd in d: if proxtype == 3: wpiDistance = ( roff - wd[2] ) else: wpiDistance = ( rwpos - wd[1] ) if ( ordered and wpiDistance < 0 ): wokay = 0 break else: wpiDistance = abs( wpiDistance ) c = cmp( wpiDistance, distance ) if not c in chitem: wokay = 0 break anyOkay = 1 if wokay and d[-1] != r: r.append(rtermid) d.append(r) else: anyOkay = 1 r.append(rtermid) if d[-1] != r: d.append(r) if anyOkay: matchlocs.append(d) if matchlocs: ritem.proxInfo = matchlocs litem = ritem else: # no match, break to next set of items nomatch = 1 break if nomatch: continue for m in matchlocs: m[0].append(ltermid) litem.proxInfo = matchlocs items = [litem] # Do stuff on items to reduce to single representative if relevancy: item = fn(items, nors) if item.weight > maxWeight: maxWeight = item.weight if item.weight < minWeight: minWeight = item.weight else: item = items[0] if pi and cql.value != "window": # copy proxInfo around if items[0].resultSet.termid != -1: for pii in items[0].proxInfo: for x in pii: x.append(items[0].resultSet.termid) for o in items[1:]: if o.resultSet.termid != -1: for pii in o.proxInfo: for x in pii: x.append(o.resultSet.termid) item.proxInfo.extend(o.proxInfo) item.resultSet = self tmplist.append(item) self._list = tmplist if relevancy: self.relevancy = 1 self.minWeight = minWeight self.maxWeight = maxWeight return self def order(self, 
session, spec, ascending=None, missing=None, case=None, accents=None): """Re-order based on the given specification and arguments. :param spec: specification on which to order the ResultSet :type spec: Index, xpath, Workflow, attribute of ResultSetItem :param ascending: sort in ascending order :type ascending: True, False or None (best guess) :param missing: behaviour when sort value is missing :type missing: integer (-1: low, 0: omit, 1: high) or string (default) :param case: case sensitive? (assuming spec permits it) :type case: True or False :param accents: exclude accented characters :type accents: True or False :rtype: None Not handling yet: * locale=VALUE * unicodeCollate[=VALUE] Clause is a CQL clause with sort attributes on the relation """ l = self._list if not l: # don't try to sort empty set return if ( isinstance(spec, Index) and spec.get_setting(session, 'sortStore') ): # Check pre-processed db tmplist = [(spec.fetch_sortValue(session, x, ascending), x) for x in l ] elif isinstance(spec, Index) and spec.get_setting(session, 'vectors'): # This assumes termid is ordered properly # if it isn't write a normalizer, see pyuca normalizer miss = lambda x: x[2][0][0] if x[2] else None tmplist = [(miss(spec.fetch_vector(session, x)), x) for x in l ] elif isinstance(spec, Index): # Extract data as per indexing, MUCH slower recs = [] storeHash = {} for r in l: store = r.recordStore o = storeHash.get(store, spec.get_object(session, store)) storeHash[store] = o recs.append(o.fetch_record(session, r.id)) tmplist = [(spec.extract_data(session, recs[x]), l[x]) for x in range(len(l)) ] elif isinstance(spec, Workflow): # Process a workflow on records tmplist = [] for r in l: rec = r.fetch_record(session) tmplist.append((spec.process(session, rec), r)) elif (isinstance(spec, basestring) and hasattr(self[0], spec)): # Sort by attribute of item tmplist = [(getattr(x, spec), x) for x in l] if ascending is None: # Check if default sort order should be ascending # Allow for str vs unicode if str(spec) in ['id', 'numericId']: ascending = True else: ascending = False elif isinstance(spec, basestring): # XPath tmplist = [] for r in l: rec = r.fetch_record(session) tmplist.append((rec.process_xpath(session, spec), r)) else: # Don't know what? raise NotImplementedError if missing is not None: if missing == -1: # Sort low val = '\x00' elif missing == 0: # Omit tmplist = [x for x in tmplist if x[0]] elif missing == 1: # Sort high val = '\xff' else: val = missing fill = lambda x: x if x else val tmplist = [(fill(x[0]), x[1]) for x in tmplist] if not case and case is not None: tmplist = [(x[0].lower(), x[1]) for x in tmplist] if not accents and accents is not None: db = session.server.get_object(session, session.database) n = db.get_object(session, 'DiacriticNormalizer') unaccent = n.process_string tmplist = [(unaccent(session, x[0]), x[1]) for x in tmplist] if ascending is None: # If ascending not set, assume ascending unless over-ridden # due to spec later... ascending = True tmplist.sort(reverse=not(ascending)) self._list = [x for (key, x) in tmplist] def reverse(self, session): self._list.reverse() def scale_weights(self): minw = self.minWeight if self.maxWeight != minw: r = 1 / (self.maxWeight - minw) else: r = 1 # faster than equivalent list comprehension! for rsi in self._list: rsi.scaledWeight = (rsi.weight - minw) * r
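
# Illustrative numbers (hypothetical): scale_weights() above maps each item's
# raw weight into the 0..1 range relative to the set's own minimum and
# maximum, i.e. scaledWeight = (weight - minWeight) / (maxWeight - minWeight).
#
#     minWeight, maxWeight = 0.5, 2.0
#     r = 1 / (maxWeight - minWeight)      # 0.666...
#     (2.0 - minWeight) * r                # 1.0, the top-ranked item
#     (1.25 - minWeight) * r               # 0.5
#     (0.5 - minWeight) * r                # 0.0, the bottom-ranked item
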
[docs]class SimpleResultSetItem(ResultSetItem): id = 0 numericId = None recordStore = "" database = "" occurences = 0 weight = 0.5 scaledWeight = 0.5 diagnostic = None proxInfo = [] attributesToSerialize = [] def __init__(self, session, id=0, recStore="", occs=0, database="", diagnostic=None, weight=0.5, resultSet=None, numeric=None): self.attributesToSerialize = [('id', 0), ('numericId', None), ('recordStore', ''), ('database', ''), ('occurences', 0), ('weight', 0.5), ('scaledWeight', 0.5) ] self.id = id self.recordStore = recStore self.occurences = occs self.weight = weight self.scaledWeight = 0.5 self.database = database self.resultSet = resultSet self.proxInfo = [] self.numericId = numeric def serialize(self, session, pickleOk=1): xml = [u'<item>'] itemattrs = self.attributesToSerialize for (a, deft) in itemattrs: val = getattr(self, a) if val != deft: if type(val) in [dict, list, tuple]: if pickleOk: # Use latest version of pickle protocol to deal with # new-style classes, unicode etc. # valstr = pickle.dumps(val, pickle.HIGHEST_PROTOCOL) valstr = pickle.dumps(val) escaped_valstr = ucescape(valstr) xml.append(u'<d n="{0}" t="pickle">{1}</d>' u''.format(a, escaped_valstr)) else: try: valstr = unicode(val, 'utf-8') except TypeError: valstr = unicode(val) escaped_valstr = escape(valstr) xml.append(u'<d n="{0}" t="{1}">{2}</d>' u''.format(a, srlz_typehash.get(type(val), ''), escaped_valstr) ) val = getattr(self, 'proxInfo') if val: # Serialize to XML xml.append(u'<proxInfo>') for hit in val: xml.append(u'<hit>') for w in hit: if len(w) == 4: xml.append(u'<w e="%s" w="%s" o="%s" t="%s"/>' % tuple(w)) elif len(w) == 3: xml.append(u'<w e="%s" w="%s" o="%s"/>' % tuple(w)) else: try: xml.append(u'<w e="%s" w="%s"/>' % tuple(w)) except: # Should really error! xml.append(u'<w e="%s" w="%s" o="%s" t="%s"/>' % tuple(w[:4])) xml.append(u'</hit>') xml.append(u'</proxInfo>') xml.append(u'</item>') return u''.join(xml) def fetch_record(self, session): # Return record from store if (session.server): # db = session.server.get_object(session, self.database) db = session.server.get_object(session, session.database) recStore = db.get_object(session, self.recordStore) rec = recStore.fetch_record(session, self.id) rec.resultSetItem = self return rec def __eq__(self, other): try: return (self.id == other.id and self.recordStore == other.recordStore) except: # Not comparing two RSIs return False def __str__(self): return "%s/%s" % (self.recordStore, self.id) def __repr__(self): return "Ptr:%s/%s" % (self.recordStore, self.id) def __cmp__(self, other): # Default sort by docid if self.numericId is not None: if other.numericId is not None: oid = other.numericId else: oid = other.id c = cmp(self.numericId, oid) else: c = cmp(self.id, other.id) if not c: return cmp(self.recordStore, other.recordStore) else: return c def __hash__(self): # Hash of recordstore + id return hash(str(self))
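
# Illustrative sketch (hypothetical record ids and store name): items compare
# and hash on (recordStore, id), which is what lets SimpleResultSet.combine()
# step through several result sets in parallel and recognise the same record
# in each of them.
#
#     a = SimpleResultSetItem(None, id=7, recStore='recordStore')
#     b = SimpleResultSetItem(None, id=7, recStore='recordStore')
#     c = SimpleResultSetItem(None, id=8, recStore='recordStore')
#     a == b          # True: same record store and id
#     a == c          # False
#     sorted([c, a])  # [a, c]; __cmp__ orders by id, then record store
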
class BitmapResultSet(ResultSet): bitfield = None currItems = None recordStore = None fromStore = 0 relevancy = 0 termid = -1 totalOccs = 0 totalRecs = 0 id = "" index = None queryTerm = "" queryFreq = 0 queryPositions = [] relevancy = 0 maxWeight = 0 minWeight = 0 def __init__(self, session, data=0, recordStore=None): if isinstance(data, SimpleBitfield): self.bitfield = data else: self.bitfield = SimpleBitfield(data) self.currItems = None self.recordStore = recordStore self.relevancy = 0 def __getitem__(self, k): if self.currItems is None: self.currItems = self.bitfield.trueItems() return SimpleResultSetItem(None, self.currItems[k], self.recordStore, 1) def __len__(self): return self.bitfield.lenTrueItems() def serialise(self, session): return self.serialize(session) def serialize(self, session): return str(self.bitfield) def deserialise(self, data): return self.deserialize(data) def deserialize(self, data): self.bitfield = SimpleBitfield(data) def get_item(self, item): try: if self.bitfield[item.id]: return item except IndexError: pass return None def combine(self, session, others, clause, db=None): if (isinstance(clause, cqlParser.Triple)): cql = clause.boolean else: cql = clause.relation v = cql.value # Check if all are bitmaps if v in ['=', 'exact', 'prox']: if len(others) == 1: return others[0] else: raise NotImplementedError() allbits = 1 for o in others: if not hasattr(o, 'bitfield'): allbits = 0 break if allbits: if (v in ['all', 'and']): s = others[0].bitfield for o in others[1:]: s.intersection(o.bitfield) elif (v in ['any', 'or', '>', '>=', '<', '<=']): s = others[0].bitfield for o in others[1:]: s.union(o.bitfield) elif (v == 'not'): s = others[0].bitfield for o in others[1:]: s.difference(o.bitfield) else: raise NotImplementedError() self.bitfield = s else: # XXX Merging Bitmap with non bitmap pass return self def order(self, spec): # Reorder a bitmap?! raise NotImplementedError() def retrieve(self, numReq, start, cache=0): end = min(start + numReq + 1, len(self)) recs = [] # XXX This should cache server, db and resultSet for r in range(start, end): recs.append(self[r].fetch_record(session)) return recs
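
# Illustrative sketch (assuming SimpleBitfield accepts an integer bit pattern,
# as its use above suggests; the exact behaviour lives in cheshire3.utils):
# BitmapResultSet.combine() reduces boolean operators to in-place set
# operations on the underlying bitfields, with bit positions standing in for
# record identifiers.
#
#     a = BitmapResultSet(None, data=0b01010)   # records {1, 3}
#     b = BitmapResultSet(None, data=0b11000)   # records {3, 4}
#     a.bitfield.intersection(b.bitfield)   # 'and' / 'all': a now holds {3}
#     # a.bitfield.union(b.bitfield)        # 'or' / 'any' would give {1, 3, 4}
#     # a.bitfield.difference(b.bitfield)   # 'not' would give {1}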