"""Cheshire3 Index Implementations."""

import os
import re
import math
import struct
import codecs
import gzip

try:
    import cStringIO as StringIO
except ImportError:
    import StringIO

from lxml import etree

import cheshire3.cqlParser as cql

from cheshire3.baseObjects import Index, Session
from cheshire3.utils import (
    elementType,
    flattenTexts,
    vectorSimilarity,
    SimpleBitfield
)
from cheshire3.exceptions import (
    ConfigFileException,
    QueryException,
    C3ObjectTypeError,
    PermissionException
)
from cheshire3.internal import CONFIG_NS
from cheshire3.resultSet import (
    SimpleResultSet,
    SimpleResultSetItem,
    BitmapResultSet
)
from cheshire3.workflow import CachingWorkflow
from cheshire3.xpathProcessor import SimpleXPathProcessor


class IndexIter(object):
    """Object to facilitate iterating over an Index."""

    index = None
    session = None

    def __init__(self, index):
        self.index = index
        self.indexStore = index.indexStore
        self.session = Session()
        self.summary = 0
        # populate with first term
        try:
            self.nextData = self.indexStore.fetch_termList(self.session,
                                                           self.index,
                                                           "",
                                                           1)[0]
        except IndexError:
            self.nextData = None

    def __iter__(self):
        return self

    def next(self):
        """Return the next item from the iterator."""
        try:
            d = self.nextData
            if not d:
                raise StopIteration()
            if d[-1] == 'last':
                self.nextData = ""
            else:
                try:
                    nd = self.indexStore.fetch_termList(self.session,
                                                        self.index,
                                                        d[0], 2)[1]
                except IndexError:
                    self.nextData = ""
                else:
                    self.nextData = nd

            return self.index.construct_resultSet(self.session, d[1],
                                                  queryHash={'text': d[0],
                                                             'occurences': 1,
                                                             'positions': []})
        except:
            # Fail safely
            raise StopIteration()

    def jump(self, position):
        """Jump to a position in the sequence."""
        self.nextData = self.indexStore.fetch_termList(self.session,
                                                       self.index,
                                                       position, 1)[0]
        return self.index.construct_resultSet(self.session,
                                              self.nextData[1],
                                              queryHash={
                                                  'text': self.nextData[0],
                                                  'occurences': 1,
                                                  'positions': []
                                              })
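
# Usage sketch (illustrative, not part of the original module): iterating a
# SimpleIndex (or subclass) yields one ResultSet per term via IndexIter above.
# The `index` argument is assumed to be any configured Index instance with a
# working indexStore; only attributes set by construct_resultSet() are used.
def _example_iterate_index(index):
    summaries = []
    for rs in index:
        # Each step fetches the next term from the indexStore and wraps it in
        # a ResultSet; queryTerm holds the term, the totals its counts.
        summaries.append((rs.queryTerm, rs.totalRecs, rs.totalOccs))
    return summaries
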


[docs]class SimpleIndex(Index): sources = {} xPathAllAbsolute = 1 xPathAttributesRequired = [] xPathsNormalized = {} currentFullPath = [] currentPath = [] storeOrig = 0 canExtractSection = 1 indexingTerm = "" indexingData = [] _possiblePaths = { 'indexStore': { "docs": "IndexStore identifier for where this index is stored" }, 'termIdIndex': { "docs": ("Alternative index object to use for termId for terms " "in this index.") }, 'tempPath': { "docs": ("Path to a directory where temporary files will be " "stored during batch mode indexing") } } _possibleSettings = { 'lr_constant0': { "docs": ("Value for 1st constant in logistic regression relevance" "assignments. default: -3.7"), 'type': float }, 'lr_constant1': { "docs": ("Value for 2nd constant in logistic regression relevance" "assignments. default: 1.269"), 'type': float }, 'lr_constant2': { "docs": ("Value for 3rd constant in logistic regression relevance" "assignments. default: -0.31"), 'type': float }, 'lr_constant3': { "docs": ("Value for 4th constant in logistic regression relevance" "assignments. default: 0.679"), 'type': float }, 'lr_constant4': { "docs": ("Value for 5th constant in logistic regression relevance" "assignments. default: -0.021"), 'type': float }, 'lr_constant5': { "docs": ("Value for 6th constant in logistic regression relevance" "assignments. default: 0.223"), 'type': float }, 'lr_constant6': { "docs": ("Value for 7th constant in logistic regression relevance" "assignments. default: 4.01"), 'type': float }, 'okapi_constant_b': { "docs": ("Constant to use for tuning parameter 'b' in the " "OKAPI BM-25 algorithm. 0 <= b <= 1 determines effect " "of document length on term weight scaling. " "0 -> no effect, 1 -> full scaling."), 'type': float }, 'okapi_constant_k1': { "docs": "", 'type': float }, 'okapi_constant_k3': { "docs": "", 'type': float }, 'noIndexDefault': { "docs": ("If true, the index should not be called from " "db.index_record()"), "type": int, "options": "0|1" }, 'noUnindexDefault': { "docs": ("If true, the index should not be called from " "db.unindex_record()"), "type": int, "options": "0|1" }, 'sortStore': { "docs": "Should the index build a store to support sorting", 'type': int, 'options': '0|1' }, 'termIds': { "docs": "Should the index store termId -> term", 'type': int, 'options': '0|1' }, 'vectors': { "docs": "Should the index store vectors (doc -> list of termIds.", 'type': int, 'options': '0|1' }, 'proxVectors': { "docs": ("Should the index store vectors that also maintain " "proximity for their terms"), 'type': int, 'options': '0|1' }, 'minimumSupport': { "docs": ("Minimum number of records in which the term must appear " "for it to be indexed at all"), 'type': int }, 'vectorMinGlobalFreq': { "docs": "Minimum global records for term to appear in a vector", 'type': int }, 'vectorMaxGlobalFreq': { "docs": "Maximum global records for term to appear in a vector", 'type': int }, 'vectorMinGlobalOccs': { "docs": "Minimum global occurences", 'type': int }, 'vectorMaxGlobalOccs': { "docs": "Maximum global occurences", 'type': int }, 'vectorMinLocalFreq': { "docs": "Minimum records in selected", 'type': int }, 'vectorMaxLocalFreq': { "docs": "Maximum records in selected", 'type': int }, 'freqList': { 'docs': ("Store a frequency sorted list to step through " "of records, occurrences or both"), 'options': 'rec|occ|rec occ|occ rec' }, 'longSize': { "docs": ("Size of a long integer in this index's underlying data " "structure (eg to migrate between 32 and 64 bit " "platforms)"), 'type': int }, 
'recordStoreSizes': { "docs": ("Should we use recordStore sizes instead of database " "sizes"), 'type': int }, 'maxVectorCacheSize': { 'docs': "Number of terms to cache when building vectors", 'type': int }, 'bucketType': { 'docs': ("Type of 'bucket' to use when splitting an index over " "multiple files."), 'options': 'term1|term2|hash' }, 'maxBuckets': { 'docs': "Maximum number of 'buckets' to split an index into", 'type': int }, 'maxItemsPerBucket': { 'docs': "Maximum number of items to put into each 'bucket'", 'type': int }, } def _handleConfigNode(self, session, node): # Source if (node.localName == "source"): modes = node.getAttributeNS(None, 'mode') if not modes: modes = [u'data'] else: modes = modes.split('|') process = None preprocess = None xp = None for child in node.childNodes: if child.nodeType == elementType: if child.localName in ["xpath", "selector"]: if xp is None: ref = child.getAttributeNS(None, 'ref') if ref: xp = self.get_object(session, ref) else: xp = SimpleXPathProcessor(session, node, self) xp._handleConfigNode(session, node) elif child.localName == "preprocess": # turn preprocess chain to workflow ref = child.getAttributeNS(None, 'ref') if ref: preprocess = self.get_object(session, ref) else: child.localName = 'workflow' preprocess = CachingWorkflow(session, child, self) preprocess._handleConfigNode(session, child) elif child.localName == "process": # turn xpath chain to workflow ref = child.getAttributeNS(None, 'ref') if ref: process = self.get_object(session, ref) else: try: child.localName = 'workflow' except: # 4suite dom sets read only # Shortcut to method for shorter lines! cefn = child.ownerDocument.createElementNS newTop = cefn(None, 'workflow') for kid in child.childNodes: newTop.appendChild(kid) child = newTop process = CachingWorkflow(session, child, self) process._handleConfigNode(session, child) for m in modes: mysrc = self.sources.setdefault(m, []) mysrc.append((xp, process, preprocess)) def _handleLxmlConfigNode(self, session, node): # Source if node.tag in ["source", '{%s}source' % CONFIG_NS]: modes = node.attrib.get('{%s}mode' % CONFIG_NS, node.attrib.get('mode', 'data')) modes = modes.split('|') process = None preprocess = None xp = None for child in node.iterchildren(tag=etree.Element): if child.tag in ['xpath', '{%s}xpath' % CONFIG_NS, 'selector', '{%s}selector' % CONFIG_NS]: if xp is None: ref = child.attrib.get('{%s}ref' % CONFIG_NS, child.attrib.get('ref', '')) if ref: xp = self.get_object(session, ref) else: node.set('id', self.id + '-xpath') xp = SimpleXPathProcessor(session, node, self) xp._handleLxmlConfigNode(session, node) elif child.tag in ['preprocess', '{%s}preprocess' % CONFIG_NS]: # turn preprocess chain to workflow ref = child.attrib.get('{%s}ref' % CONFIG_NS, child.attrib.get('ref', '')) if ref: preprocess = self.get_object(session, ref) else: # create new element e = etree.XML(etree.tostring(child)) e.tag = 'workflow' e.set('id', self.id + "-preworkflow") preprocess = CachingWorkflow(session, e, self) preprocess._handleLxmlConfigNode(session, e) elif child.tag in ['process', '{%s}process' % CONFIG_NS]: # turn xpath chain to workflow ref = child.attrib.get('{%s}ref' % CONFIG_NS, child.attrib.get('ref', '')) if ref: process = self.get_object(session, ref) else: # create new element e = etree.XML(etree.tostring(child)) e.tag = 'workflow' e.set('id', self.id + "-workflow") process = CachingWorkflow(session, e, self) process._handleLxmlConfigNode(session, e) for m in modes: mysrc = self.sources.setdefault(m, []) mysrc.append((xp, 
process, preprocess)) def __init__(self, session, config, parent): self.sources = {} self.xPathAttributesRequired = [] self.xPathsNormalized = {} self.xPathAllAbsolute = 1 self.indexingTerm = "" self.indexingData = [] self.maskList = ['*', '?', '^'] self.caretRe = re.compile(r'(?<!\\)\^') self.qmarkRe = re.compile(r'(?<!\\)\?') self.astxRe = re.compile(r'(?<!\\)\*') Index.__init__(self, session, config, parent) lss = self.get_setting(session, 'longSize') if lss: self.longStructSize = int(lss) else: #self.longStructSize = len(struct.pack('L', 1)) self.longStructSize = struct.calcsize('<l') self.recordStoreSizes = self.get_setting(session, 'recordStoreSizes', 0) # We need a Store object iStore = self.get_path(session, 'indexStore', None) self.indexStore = iStore if (iStore is None): raise ConfigFileException("Index (%s) does not have an " "indexStore." % (self.id)) else: iStore.create_index(session, self) self.resultSetClass = SimpleResultSet self.recordStore = "" def __iter__(self): return IndexIter(self) def _locate_firstMask(self, term, start=0): try: return min([term.index(x, start) for x in self.maskList]) except ValueError: # one or more are not found (i.e. == -1) firstMaskList = [term.find(x, start) for x in self.maskList] firstMaskList.sort() firstMask = firstMaskList.pop(0) while len(firstMaskList) and firstMask < 0: firstMask = firstMaskList.pop(0) return firstMask def _regexify_wildcards(self, term): # Escape existing special regex chars term = term.replace('.', r'\.') term = term[0] + self.caretRe.sub(r'\^', term[1:-1]) + term[-1] term = self.qmarkRe.sub('.', term) term = self.astxRe.sub('.*', term) if (term[-1] == '^') and (term[-2] != '\\'): term = term[:-1] return term + '$' def _processRecord(self, session, record, source): (xpath, process, preprocess) = source if preprocess: record = preprocess.process(session, record) if xpath: try: rawlist = xpath.process_record(session, record) except C3ObjectTypeError: rawlist = [[]] processed = process.process(session, rawlist) else: processed = process.process(session, record) return processed def extract_data(self, session, rec): processed = self._processRecord(session, rec, self.sources[u'data'][0]) if processed: keys = processed.keys() keys.sort() return keys[0] else: return None def index_record(self, session, rec): p = self.permissionHandlers.get('info:srw/operation/2/index', None) if p: if not session.user: raise PermissionException("Authenticated user required to add " "to index %s" % self.id) okay = p.hasPermission(session, session.user) if not okay: raise PermissionException("Permission required to add to " "index %s" % self.id) if 'sort' in self.sources: sortHash = self._processRecord(session, rec, self.sources[u'sort'][0]) if sortHash: sortVal = sortHash.keys()[0] else: sortVal = '' else: sortVal = '' for src in self.sources[u'data']: processed = self._processRecord(session, rec, src) if sortVal: # Don't blank sortVal, or will be overwritten # in subsequent iterations k = processed.keys()[0] processed[k]['sortValue'] = sortVal self.indexStore.store_terms(session, self, processed, rec) return rec def delete_record(self, session, rec): # Extract terms, and remove from store p = self.permissionHandlers.get('info:srw/operation/2/unindex', None) if p: if not session.user: raise PermissionException("Authenticated user required to " "remove from index %s" % self.id) okay = p.hasPermission(session, session.user) if not okay: raise PermissionException("Permission required to remove from " "index %s" % self.id) istore = 
self.get_path(session, 'indexStore') if self.get_setting(session, 'vectors', 0): # use vectors to unindex instead of reprocessing # faster, only way for 'now' metadata. vec = self.fetch_vector(session, rec) # [totalUniqueTerms, totalFreq, [(tid, freq)+]] processed = {} for (t, f) in vec[2]: term = self.fetch_termById(session, t) processed[term] = {'occurences': f} if istore is not None: istore.delete_terms(session, self, processed, rec) else: for src in self.sources[u'data']: processed = self._processRecord(session, rec, src) if (istore is not None): istore.delete_terms(session, self, processed, rec) def begin_indexing(self, session): # Find all indexStores p = self.permissionHandlers.get('info:srw/operation/2/index', None) if p: if not session.user: raise PermissionException("Authenticated user required to add " "to index %s" % self.id) okay = p.hasPermission(session, session.user) if not okay: raise PermissionException("Permission required to add to " "index %s" % self.id) stores = [] istore = self.get_path(session, 'indexStore') if (istore is not None and not istore in stores): stores.append(istore) for s in stores: s.begin_indexing(session, self) def commit_indexing(self, session): p = self.permissionHandlers.get('info:srw/operation/2/index', None) if p: if not session.user: raise PermissionException("Authenticated user required to add " "to index %s" % self.id) okay = p.hasPermission(session, session.user) if not okay: raise PermissionException("Permission required to add to " "index %s" % self.id) stores = [] istore = self.get_path(session, 'indexStore') if (istore is not None and not istore in stores): stores.append(istore) for s in stores: s.commit_indexing(session, self) def commit_parallelIndexing(self, session): istore = self.get_path(session, 'indexStore') istore.commit_parallelIndexing(session, self) def search(self, session, clause, db): # Final destination. Process Term. 
p = self.permissionHandlers.get('info:srw/operation/2/search', None) if p: if not session.user: raise PermissionException("Authenticated user required to " "search index %s" % self.id) okay = p.hasPermission(session, session.user) if not okay: raise PermissionException("Permission required to search " "index %s" % self.id) res = {} # src = (xp, processwf, preprocesswf) # Try to get process for relation/modifier, failing that relation, # fall back to that used for data for src in self.sources.get(clause.relation.toCQL(), self.sources.get(clause.relation.value, self.sources[u'data'])): res.update(src[1].process(session, [[clause.term.value]])) store = self.get_path(session, 'indexStore') matches = [] rel = clause.relation if ( rel.prefix == 'cql' or rel.prefixURI == 'info:srw/cql-context-set/1/cql-v1.1' ): if (rel.value == 'scr'): pm = db.get_path(session, 'protocolMap') try: rel.value = pm.defaultRelation except AttributeError: pass # While we're looking at the query, check if we should do blind # relevance feedback on this clause feedback = 0 for m in rel.modifiers: m.type.parent = clause m.type.resolvePrefix() if ( m.type.prefixURI.startswith( 'info:srw/cql-context-set/2/relevance' ) ): if m.type.value == "feedback": try: feedback = int(m.value) except ValueError: feedback = 1 construct_resultSet = self.construct_resultSet if ( rel.value in ['any', 'all', '=', 'exact', 'window'] and ( rel.prefix == 'cql' or rel.prefixURI == 'info:srw/cql-context-set/1/cql-v1.1' ) ): for k, qHash in res.iteritems(): if k[0] == '^': k = k[1:] firstMask = self._locate_firstMask(k) while (firstMask > 0) and (k[firstMask - 1] == '\\'): firstMask = self._locate_firstMask(k, firstMask + 1) # TODO: slow regex e.g. if first char is * if (firstMask > -1): startK = k[:firstMask] try: nextK = startK[:-1] + unichr(ord(startK[-1]) + 1) except IndexError: # Left truncation, all terms from the index # TODO: we should check if there's a inversion of # index keys termList = store.fetch_termList(session, self, startK, 0, '>=') else: termList = store.fetch_termList(session, self, startK, 0, '>=', end=nextK) if len(k) > 1: # Filter terms by regex # FIXME: need to do something cleverer than this if # first character is masked. This implementation will # be incredibly slow for these cases... 
if ( (firstMask < len(k) - 1) or (k[firstMask] in ['?', '^']) ): # not simply right hand truncation kRe = re.compile(self._regexify_wildcards(k)) mymatch = kRe.match termList = filter(lambda t: mymatch(t[0]), termList) maskBase = self.resultSetClass( session, [], recordStore=self.recordStore ) maskClause = cql.parse(clause.toCQL()) maskClause.relation.value = u'any' if (clause.relation.value == u'='): # tell combine to keep proxInfo pass try: maskResultSets = [ construct_resultSet(session, t[1], qHash) for t in termList ] maskBase = maskBase.combine(session, maskResultSets, maskClause, db) maskBase.queryTerm = qHash['text'] try: maskBase.queryPositions = qHash['positions'] except KeyError: pass except: pass else: matches.append(maskBase) else: term = store.fetch_term(session, self, k) s = construct_resultSet(session, term, qHash) matches.append(s) elif (clause.relation.value in ['>=', '>', '<', '<=']): if (len(res) != 1): raise QueryException("%s %s" % (clause.relation.toCQL(), clause.term.value), 24) else: termList = store.fetch_termList(session, self, res.keys()[0], 0, clause.relation.value) for t in termList: matches.append(construct_resultSet(session, t[1])) elif (clause.relation.value == "within"): if (len(res) != 2): raise QueryException('%s "%s"' % (clause.relation.toCQL(), clause.term.value), 24) else: termList = store.fetch_termList(session, self, res.keys()[0], end=res.keys()[1]) matches.extend([construct_resultSet(session, t[1]) for t in termList]) else: raise QueryException('%s "%s"' % (clause.relation.toCQL(), clause.term.value), 24) base = self.resultSetClass(session, [], recordStore=self.recordStore) base.recordStoreSizes = self.recordStoreSizes base.index = self if not matches: return base else: if ( clause.relation.value == "=" and not isinstance(self, ProximityIndex) ): # Can't do proximity, treat as a search for 'all' clause.relation.value = "all" rs = base.combine(session, matches, clause, db) if len(rs) and feedback: rs = self._blindFeedback(session, rs, clause, db) return rs def scan(self, session, clause, nTerms, direction=">=", summary=1): # Process term. p = self.permissionHandlers.get('info:srw/operation/2/scan', None) if p: if not session.user: raise PermissionException("Authenticated user required to " "scan index %s" % self.id) okay = p.hasPermission(session, session.user) if not okay: raise PermissionException("Permission required to scan index " "%s" % self.id) res = {} for src in self.sources.get(clause.relation.toCQL(), self.sources.get(clause.relation.value, self.sources[u'data'])): res.update(src[1].process(session, [[clause.term.value]])) if len(res) == 0: # No term, so start at the beginning res = {'': ''} elif (len(res) != 1): raise QueryException("%s" % (clause.term.value), 24) store = self.get_path(session, 'indexStore') if direction == "=": k = res.keys()[0] if not k: k2 = "!" else: k2 = k[:-1] + unichr(ord(k[-1]) + 1) tList = store.fetch_termList(session, self, k, nTerms=nTerms, end=k2, summary=summary, relation='>=') else: tList = store.fetch_termList(session, self, res.keys()[0], nTerms=nTerms, relation=direction, summary=summary) # List of (term, occs) return tList def facets(self, session, resultSet, nTerms=0): """Return a list of term facets for the resultSet. Return a list of (term, termdata) tuples from this index which occur within the records in resultSet. Terms are returned in descending frequency (number of records) order. 
""" termFreqs = {} recordFreqs = {} for r in resultSet: # Use vectors to identify terms vec = self.fetch_vector(session, r) if vec[2]: # store / increment freq for t in vec[2]: try: termFreqs[t[0]] += t[1] recordFreqs[t[0]] += 1 except: termFreqs[t[0]] = t[1] recordFreqs[t[0]] = 1 # Sort list by descending frequency (decorate-sort-undecorate) # Use 1 / freq - keeps terms with same freq in alpha order sortList = [(1.0 / v, k) for k, v in recordFreqs.iteritems()] sortList.sort() tids = [x[1] for x in sortList] if nTerms: tids = tids[:min(len(tids), nTerms)] terms = [] for termId in tids: term = self.fetch_termById(session, termId) # (term, (termId, nRecs, freq)) terms.append((term.decode('utf-8'), (termId, recordFreqs[termId], termFreqs[termId]))) return terms def searchByExamples(self, session, examples, clause, db, nTerms=20): """Return a ResultSet of items 'similar' to the given examples. Identify most common terms in examples, create a new resultSet of results that contains any of these terms. examples := iterable of example objects (e.g. ResultSet, list of Records) to be used to search the index (i.e. by identifying common terms) clause := CQL clause (if examples is a resultSet this would the query used to generate it) nTerms := No. of most common (no. of records) terms to use to create new resultSet """ base = self.resultSetClass(session, [], recordStore=self.recordStore) base.recordStoreSizes = self.recordStoreSizes base.index = self try: terms = [t[0] for t in self.facets(session, examples, nTerms)] except: raise ConfigFileException("Index {0.id} does not support " "searchByExample; requires vector " "setting.".format(self)) # Construct a CQL clause for combining the resultSets from the # discovered terms # Use modifiers from main clause so that we get scores from the same # relevance algorithm combined in the same way exClause = cql.SearchClause(clause.index, cql.Relation("any", clause.relation.modifiers), cql.Term(" ".join(terms))) construct_rs = self.construct_resultSet store = self.indexStore matches = [construct_rs(session, store.fetch_term(session, self, t), {'text': t, 'proxLoc': [-1], 'occurences': 1}) for t in terms] self.log_debug(session, "searchByExamples ResultSets constructed") rs = base.combine(session, matches, exClause, db) self.log_debug(session, "searchByExamples ResultSets combined") return rs def _blindFeedback(self, session, rs, clause, db, nRecs=0, nTerms=20): """Expand ResultSet using blind/pseudo relevance feedback. Carry out blind/pseudo relevance feedback on the resultSet, merge and return. rs := resultSet clause := CQL clause that generated rs nRecs := No. of top results to extract terms from nTerms := No. 
of top terms to use to expand query """ if not rs.relevancy: raise TypeError("Unable to carry out blind relevance feedback on " "a resultSet with no relevance information") if not nRecs: nRecs = max(int(math.sqrt(len(rs))), 5) self.log_debug(session, "Feedback requested; top {0} Terms from top {1} of " "{2} Records".format(nTerms, nRecs, len(rs))) # Need to sort before slicing to get the 'good' results # Use the built-in sorted() to create a sorted iterable slice of top # results without changing original (otherwise combining with new # results will end up with duplicates) try: fbrs = self.searchByExamples( session, sorted(rs, key=lambda x: x.weight)[:nRecs], clause, db, nTerms ) except ConfigFileException as e: self.log_warning(session, "Unable to complete blind relevance feedback " "loop: " + e.reason) return rs # Both resultSets now have scores assigned, so need to combine them # using a boolean fooQ = cql.parse(">rel=info:srw/cql-context-set/2/relevance-1.1 {0}" " or/rel.combine=sum {1}".format(clause.toCQL(), fbrs.query.toCQL())) base = self.resultSetClass(session, [], recordStore=self.recordStore) base.recordStoreSizes = self.recordStoreSizes base.index = self return base.combine(session, [rs, fbrs], fooQ, db) def similarity(self, session, record1, record2): """Calculate and return cosine similarity of records. Calculate and return cosine similarity of vector representations of the two record arguments. >>> self.similarity(session, rec, rec) 1.0 """ if self.get_setting(session, 'vectors', 0): # We can fetch stored vectors vector1 = self.fetch_vector(session, record1) vector2 = self.fetch_vector(session, record2) else: # We could regenerate on the fly... raise NotImplementedError # ...but not yet return vectorSimilarity(dict(vector1[2]), dict(vector2[2])) # Internal API for stores def serialize_term(self, session, termId, data, nRecs=0, nOccs=0): """Return a string serialization representing the term. Return a string serialization representing the term for storage purposes. Used as a callback from IndexStore to serialize a list of terms and document references to be stored. termId := numeric ID of term being serialized data := list of longs nRecs := number of Records containing the term, if known nOccs := total occurrences of the term, if known """ # in: list of longs if not nRecs: nRecs = len(data) / 3 if not nOccs: nOccs = sum(data[2::3]) fmt = '<' + 'lll' * (nRecs + 1) params = [fmt, termId, nRecs, nOccs] + data try: return struct.pack(*params) except: self.log_critical(session, 'Error while serializing index term.\n' 'HINT: are you trying to put proximity ' 'information into a SimpleIndex?\n') raise def deserialize_term(self, session, data, nRecs=-1, prox=1): """Deserialize and return the internal representation of a term. Return the internal representation of a term as recreated from a string serialization from storage. Used as a callback from IndexStore to take serialized data and produce list of terms and document references. 
data := string (usually retrieved from indexStore) nRecs := number of Records to deserialize (all by default) prox := boolean flag to include proximity information """ if nRecs == -1: fmt = '<' + 'lll' * (len(data) / (3 * self.longStructSize)) return struct.unpack(fmt, data) else: fmt = '<' + "lll" * (nRecs + 1) endpoint = (nRecs + 1) * 3 * self.longStructSize return struct.unpack(fmt, data[:endpoint]) def calc_sectionOffsets(self, session, start, nRecs, dataLen=0): #tid, recs, occs, (store, rec, freq)+ a = (self.longStructSize * 3) + (self.longStructSize * start * 3) b = (self.longStructSize * 3 * nRecs) return [(a, b)] def merge_term(self, session, currentData, newData, op="replace", nRecs=0, nOccs=0): """Merge newData into currentData and return the result. Merging takes the currentData and can add, replace or delete the data found in newData, and then returns the result. Used as a callback from IndexStore to take two sets of terms and merge them together. currentData := output of deserialize_terms newData := flat list op := replace | add | delete nRecs := total records in newData nOccs := total occurrences in newdata """ (termid, oldTotalRecs, oldTotalOccs) = currentData[0:3] currentData = list(currentData[3:]) if op == 'add': currentData.extend(newData) if nRecs: trecs = oldTotalRecs + nRecs toccs = oldTotalOccs + nOccs else: trecs = oldTotalRecs + len(newData) / 3 toccs = oldTotalOccs + sum(newData[2::3]) elif op == 'replace': for n in range(0, len(newData), 3): docid = newData[n] storeid = newData[n + 1] replaced = 0 for x in range(3, len(currentData), 3): if ( currentData[x] == docid and currentData[x + 1] == storeid ): currentData[x + 2] = newData[n + 2] replaced = 1 break if not replaced: currentData.extend([docid, storeid, newData[n + 2]]) trecs = len(currentData) / 3 toccs = sum(currentData[2::3]) elif op == 'delete': for n in range(0, len(newData), 3): docid = newData[n] storeid = newData[n + 1] for x in range(0, len(currentData), 3): if ( currentData[x] == docid and currentData[x + 1] == storeid ): del currentData[x:(x + 3)] break trecs = len(currentData) / 3 toccs = sum(currentData[2::3]) merged = [termid, trecs, toccs] + currentData return merged def construct_resultSet(self, session, terms, queryHash={}): """Create and return a ResultSet. Take a list of the internal representation of terms, as stored in this Index, create and return an appropriate ResultSet object. 
""" # in: unpacked # out: resultSet l = len(terms) ci = self.indexStore.construct_resultSetItem s = self.resultSetClass(session, []) # rsilist = [] # for t in range(3,len(terms),3): # item = ci(session, terms[t], terms[t+1], terms[t+2]) # item.resultSet = s # rsilist.append(item) # # s.fromList(rsilist) # Filter out duplicates rsis = {} for t in range(3, len(terms), 3): if terms[t] not in rsis: item = ci(session, terms[t], terms[t + 1], terms[t + 2]) item.resultSet = s rsis[item.id] = (t, item) # Keep them in order s.fromList([r[1] for r in sorted(rsis.values())]) s.index = self if queryHash: s.queryTerm = queryHash['text'] s.queryFreq = queryHash['occurences'] if (terms): s.termid = terms[0] s.totalRecs = terms[1] s.totalOccs = terms[2] else: s.totalRecs = 0 s.totalOccs = 0 return s # pass-throughs to indexStore def construct_resultSetItem(self, session, term, rsiType="SimpleResultSetItem"): return self.indexStore.construct_resultSetItem(session, term[0], term[1], term[2], rsiType) def clear(self, session): self.indexStore.clear_index(session, self) def store_terms(self, session, data, rec): self.indexStore.store_terms(session, self, data, rec) def fetch_term(self, session, term, summary=False, prox=True): return self.indexStore.fetch_term(session, self, term, summary, prox) def fetch_termList(self, session, term, nTerms=0, relation="", end="", summary=0): return self.indexStore.fetch_termList(session, self, term, nTerms, relation, end, summary) def fetch_termById(self, session, termId): return self.indexStore.fetch_termById(session, self, termId) def fetch_vector(self, session, rec, summary=False): return self.indexStore.fetch_vector(session, self, rec, summary) def fetch_proxVector(self, session, rec, elemId=-1): return self.indexStore.fetch_proxVector(session, self, rec, elemId) def fetch_summary(self, session): return self.indexStore.fetch_summary(session, self) def fetch_termFrequencies(self, session, mType='occ', start=0, nTerms=100, direction=">"): return self.indexStore.fetch_termFrequencies(session, self, mType, start, nTerms, direction) def fetch_metadata(self, session): return self.indexStore.fetch_indexMetadata(session, self) def fetch_sortValue(self, session, rec, ascending=True): return self.indexStore.fetch_sortValue(session, self, rec, ascending) def merge_tempFiles(self, session): return self.indexStore.merge_tempFiles(session, self) def commit_centralIndexing(self, session, filename=""): return self.indexStore.commit_centralIndexing(session, self, filename)

class SingleRecordStoreIndex(SimpleIndex):
    """Index implementation that assumes there is only 1 RecordStore.

    For single RecordStore cases this makes the index smaller and hence
    faster. Also enables compatibility with basic (non-proximity) Cheshire 2
    index files.
    """

    def serialize_term(self, session, termId, data, nRecs=0, nOccs=0):
        """Return a string serialization representing the term.

        Return a string serialization representing the term for storage
        purposes.

        Used as a callback from IndexStore to serialize a list of terms and
        document references to be stored.

        termId := numeric ID of term being serialized
        data := list of longs
        nRecs := number of Records containing the term, if known
        nOccs := total occurrences of the term, if known
        """
        # in: list of longs
        if not nRecs:
            nRecs = len(data) / 3
        if not nOccs:
            nOccs = sum(data[2::3])
        # strip out RecordStore pointer
        del data[1::3]
        fmt = '<' + 'lll' + ('ll' * nRecs)
        params = [fmt, termId, nRecs, nOccs] + data
        try:
            return struct.pack(*params)
        except:
            self.log_critical(session,
                              'Error while serializing index term.\n'
                              'HINT: are you trying to put proximity '
                              'information into a SimpleIndex?\n')
            raise

    def deserialize_term(self, session, data, nRecs=-1, prox=1):
        """Deserialize and return the internal representation of a term.

        Return the internal representation of a term as recreated from a
        string serialization from storage.

        Used as a callback from IndexStore to take serialized data and
        produce list of terms and document references.

        data := string (usually retrieved from indexStore)
        nRecs := number of Records to deserialize (all by default)
        prox := boolean flag to include proximity information
        """
        lss = self.longStructSize
        if nRecs == -1:
            fmt = '<' + 'l' * (len(data) / lss)
            out = list(struct.unpack(fmt, data))
        else:
            fmt = '<' + "lll" + "ll" * nRecs
            endpoint = (3 * lss) + (nRecs * 2 * lss)
            out = list(struct.unpack(fmt, data[:endpoint]))
        # Insert assumed RecordStore pointers
        for x in range(4, 3 + (3 * (len(out[3:]) / 2)), 3):
            out.insert(x, 0)
        return out

    def calc_sectionOffsets(self, session, start, nRecs, dataLen=0):
        # tid, recs, occs, (rec, freq)+
        a = (self.longStructSize * 3) + (self.longStructSize * start * 2)
        b = (self.longStructSize * 2 * nRecs)
        return [(a, b)]
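
# Illustrative sketch (invented values, stdlib only): the on-disk term layouts
# used above. SimpleIndex.serialize_term() writes a little-endian
# (termId, nRecs, nOccs) header followed by one (recordId, storeId, frequency)
# triple per record; SingleRecordStoreIndex strips the storeId on write and
# re-inserts it as 0 on read, as its deserialize_term() does.
def _example_term_layouts():
    import struct
    lss = struct.calcsize('<l')
    termId = 7
    data = [12, 0, 3,    # record 12 in store 0, frequency 3
            45, 0, 1]    # record 45 in store 0, frequency 1
    nRecs = len(data) / 3
    nOccs = sum(data[2::3])
    # SimpleIndex layout: header + (recordId, storeId, freq) triples
    simple = struct.pack('<' + 'lll' * (nRecs + 1),
                         *([termId, nRecs, nOccs] + data))
    flat = list(struct.unpack('<' + 'l' * (len(simple) / lss), simple))
    assert flat == [termId, nRecs, nOccs] + data
    # SingleRecordStoreIndex layout: header + (recordId, freq) pairs only
    pairs = data[:]
    del pairs[1::3]
    single = struct.pack('<lll' + 'll' * nRecs,
                         *([termId, nRecs, nOccs] + pairs))
    out = list(struct.unpack('<' + 'l' * (len(single) / lss), single))
    for x in range(4, 3 + (3 * (len(out[3:]) / 2)), 3):
        out.insert(x, 0)   # re-insert the assumed RecordStore pointer
    assert out == [termId, nRecs, nOccs] + data
    return simple, single
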
[docs]class ProximityIndex(SimpleIndex): """Index that can store term locations to enable proximity search. An Index that can store element, word and character offset location information for term entries, enabling phrase, adjacency searches etc. Need to use an Extractor with prox setting and a ProximityTokenMerger """ canExtractSection = 0 _possibleSettings = { 'nProxInts': { 'docs': ("Number of integers per occurence in this index for " "proximity information, typically 2 " "(elementId, wordPosition) or " "3 (elementId, wordPosition, byteOffset)"), 'type': int } } def __init__(self, session, config, parent): SimpleIndex.__init__(self, session, config, parent) self.nProxInts = self.get_setting(session, 'nProxInts', 2) def serialize_term(self, session, termId, data, nRecs=0, nOccs=0): # in: list of longs fmt = '<' + 'l' * (len(data) + 3) params = [fmt, termId, nRecs, nOccs] + data try: val = struct.pack(*params) except: self.log_critical(session, "%s failed to pack: %r" % (self.id, params)) raise return val def deserialize_term(self, session, data, nRecs=-1, prox=1): fmt = '<' + 'l' * (len(data) / self.longStructSize) flat = struct.unpack(fmt, data) (termid, totalRecs, totalOccs) = flat[:3] idx = 3 docs = [termid, totalRecs, totalOccs] while idx < len(flat): doc = list(flat[idx:idx + 3]) nidx = idx + 3 + (doc[2] * self.nProxInts) doc.extend(flat[idx + 3:nidx]) idx = nidx docs.append(doc) return docs def merge_term(self, session, currentData, newData, op="replace", nRecs=0, nOccs=0): # in: struct: deserialised, new: flag # out: flat (termid, oldTotalRecs, oldTotalOccs) = currentData[0:3] currentData = list(currentData[3:]) if op == 'add': # flatten terms = [] for t in currentData: terms.extend(t) terms.extend(newData) currentData = terms if nRecs != 0: trecs = oldTotalRecs + nRecs toccs = oldTotalOccs + nOccs else: # ... 
trecs = oldTotalRecs + len(newData) toccs = oldTotalOccs for t in newData: toccs = toccs + t[2] raise ValueError("FIXME: mergeTerms needs recs/occs params") elif op == 'replace': recs = [(x[0], x[1]) for x in currentData] newOccs = 0 idx = 0 while idx < len(newData): end = idx + 3 + (newData[idx + 2] * self.nProxInts) new = list(newData[idx:end]) idx = end docid = new[0] storeid = new[1] if (docid, storeid) in recs: loc = recs.index((docid, storeid)) # subtract old occs occs = currentData[loc][2] newOccs -= occs currentData[loc] = new else: currentData.append(new) newOccs += new[2] trecs = len(currentData) toccs = oldTotalOccs + newOccs # now flatten currentData n = [] for s in currentData: n.extend(s) currentData = n elif op == 'delete': delOccs = 0 idx = 0 while idx < len(newData): doc = list(newData[idx:idx + 3]) idx = idx + 3 + (doc[2] * self.nProxInts) for x in range(len(currentData)): old = currentData[x] if old[0] == doc[0] and old[1] == doc[1]: delOccs = delOccs + old[2] del currentData[x] break trecs = len(currentData) - 3 toccs = oldTotalOccs - delOccs # now flatten terms = [] for t in currentData: terms.extend(t) currentData = terms merged = [termid, trecs, toccs] merged.extend(currentData) return merged def construct_resultSetItem(self, session, term, rsiType=""): # in: single triple # out: resultSetItem # Need to map recordStore and docid at indexStore item = self.indexStore.construct_resultSetItem(session, term[0], term[1], term[2]) item.proxInfo = term[3:] return item def construct_resultSet(self, session, terms, queryHash={}): # in: unpacked # out: resultSet rsilist = [] ci = self.indexStore.construct_resultSetItem s = self.resultSetClass(session, []) for t in terms[3:]: item = ci(session, t[0], t[1], t[2]) pi = t[3:] item.proxInfo = [ [pi[x:(x + self.nProxInts)]] for x in range(0, len(pi), self.nProxInts) ] item.resultSet = s rsilist.append(item) s.fromList(rsilist) s.index = self if queryHash: s.queryTerm = queryHash['text'] s.queryFreq = queryHash['occurences'] s.queryPositions = [] # not sure about this nProxInts?? try: for x in queryHash['positions'][1::self.nProxInts]: s.queryPositions.append(x) except: # no queryPos? pass if (terms): s.termid = terms[0] s.totalRecs = terms[1] s.totalOccs = terms[2] else: s.totalRecs = 0 s.totalOccs = 0 return s
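
# Illustrative sketch (invented values): how ProximityIndex.construct_resultSet()
# groups the flat proximity integers that follow each (recordId, storeId,
# nOccs) triple into proxInfo entries of nProxInts integers each (assumed
# here to be 2: elementId, wordPosition).
def _example_group_proxInfo(flat=None, nProxInts=2):
    if flat is None:
        # record 3 in store 0, 2 occurrences: (elem 1, word 5), (elem 1, word 9)
        flat = [3, 0, 2, 1, 5, 1, 9]
    docid, storeid, occs = flat[:3]
    pi = flat[3:3 + occs * nProxInts]
    proxInfo = [[pi[x:x + nProxInts]] for x in range(0, len(pi), nProxInts)]
    return (docid, storeid, occs, proxInfo)
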
[docs]class XmlIndex(SimpleIndex): """Index to store terms as XML structure. e.g.:: <rs tid="" recs="" occs=""> <r i="DOCID" s="STORE" o="OCCS"/> </rs> """ def __init__(self, session, config, parent): SimpleIndex.__init__(self, session, config, parent) # ping etree to initialize nothing = etree.fromstring("<xml/>") def _maybeCompress(self, xmlstr): compress = "0" if len(xmlstr) > 1000000: # compress compress = "1" outDoc = StringIO.StringIO() zfile = gzip.GzipFile(mode='wb', fileobj=outDoc, compresslevel=1) zfile.write(xmlstr) zfile.close() l = outDoc.tell() outDoc.seek(0) xmlstr = outDoc.read(l) outDoc.close() return compress + xmlstr def _maybeUncompress(self, data): compress = int(data[0]) xmlstr = data[1:] if compress: # uncompress buff = StringIO.StringIO(xmlstr) zfile = gzip.GzipFile(mode='rb', fileobj=buff) xmlstr = zfile.read() zfile.close() buff.close() return xmlstr def serialize_term(self, session, termId, data, nRecs=0, nOccs=0): # in: list of longs val = struct.pack('<lll', termId, nRecs, nOccs) xml = ['<rs tid="%s" recs="%s" occs="%s">' % (termId, nRecs, nOccs)] idx = 0 for i in range(0, len(data), 3): xml.append('<r i="%s" s="%s" o="%s"/>' % data[i:i + 3]) xml.append("</rs>") xmlstr = ''.join(xml) data = self._maybeCompress(xmlstr) final = val + data return final def deserialize_term(self, session, data, nRecs=-1, prox=1): lss3 = 3 * self.longStructSize fmt = '<lll' (termid, totalRecs, totalOccs) = struct.unpack(fmt, data[:lss3]) xmlstr = self._maybeUncompress(data[lss3:]) return [termid, totalRecs, totalOccs, xmlstr] def construct_resultSet(self, session, terms, queryHash={}): # in: [termid, recs, occs, XML] # out: resultSet rs = SimpleResultSet(session, []) if len(terms) < 3: # no data return rs rsilist = [] #ci = self.indexStore.construct_resultSetItem # Parse xml doc = etree.fromstring(terms[3]) rsi = None for elem in doc.iter(): if elem.tag == 'rs': # extract any further rs info here pass elif elem.tag == 'r': # process a hit: i, s, o if rsi: rsi.proxInfo = pi rsilist.append(rsi) vals = [int(x) for x in elem.attrib.values()] rsi = SimpleResultSetItem(session, *vals) # rsi = ci(session, *vals) rsi.resultSet = rs pi = [] elif elem.tag == 'p': # process prox info pi.append([[int(x) for x in elem.attrib.values()]]) if rsi: rsi.proxInfo = pi rsilist.append(rsi) rs.fromList(rsilist) rs.index = self if queryHash: rs.queryTerm = queryHash['text'] rs.queryFreq = queryHash['occurences'] rs.queryPositions = [] # Not sure about this nProxInts?? try: for x in queryHash['positions'][1::self.nProxInts]: rs.queryPositions.append(x) except: # No queryPos? pass if (terms): rs.termid = terms[0] rs.totalRecs = terms[1] rs.totalOccs = terms[2] else: rs.totalRecs = 0 rs.totalOccs = 0 return rs
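
# Illustrative sketch (stdlib only, sample string invented): the
# "compression flag byte + payload" convention used by XmlIndex's
# _maybeCompress()/_maybeUncompress(); payloads over ~1MB are gzipped and
# prefixed with "1", smaller ones are stored verbatim behind a "0".
def _example_xmlIndex_compression(xmlstr='<rs tid="1" recs="0" occs="0"/>'):
    import gzip
    try:
        import cStringIO as StringIO
    except ImportError:
        import StringIO
    if len(xmlstr) > 1000000:
        buff = StringIO.StringIO()
        zfile = gzip.GzipFile(mode='wb', fileobj=buff, compresslevel=1)
        zfile.write(xmlstr)
        zfile.close()
        data = "1" + buff.getvalue()
    else:
        data = "0" + xmlstr
    # Reading back: the first byte says whether the remainder is gzipped
    if int(data[0]):
        zfile = gzip.GzipFile(mode='rb', fileobj=StringIO.StringIO(data[1:]))
        restored = zfile.read()
        zfile.close()
    else:
        restored = data[1:]
    assert restored == xmlstr
    return data
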
[docs]class XmlProximityIndex(XmlIndex): """ProximityIndex to store terms as XML structure. e.g.:: <rs tid="" recs="" occs=""> <r i="DOCID" s="STORE" o="OCCS"> <p e="ELEM" w="WORDNUM" c="CHAROFFSET"/> </r> </rs> """ _possibleSettings = { 'nProxInts': { 'docs': ("Number of integers per occurence in this index for " "proximity information, typically 2 " "(elementId, wordPosition) or " "3 (elementId, wordPosition, byteOffset)"), 'type': int } } def __init__(self, session, config, parent): XmlIndex.__init__(self, session, config, parent) self.nProxInts = self.get_setting(session, 'nProxInts', 2) def serialize_term(self, session, termId, data, nRecs=0, nOccs=0): # in: list of longs npi = self.get_setting(session, 'nProxInts', 2) val = struct.pack('<lll', termId, nRecs, nOccs) xml = ['<rs tid="%s" recs="%s" occs="%s">' % (termId, nRecs, nOccs)] idx = 0 while idx < len(data): xml.append('<r i="%s" s="%s" o="%s">' % tuple(data[idx:idx + 3])) if npi == 3: for x in range(data[idx + 2]): xml.append('<p e="%s" w="%s" c="%s"/>' % tuple(data[idx + 3 + (x * 3):idx + 6 + (x * 3)]) ) idx = idx + idx + 6 + (x * 3) else: for x in range(data[idx + 2]): p = tuple(data[idx + 3 + (x * 2):idx + 5 + (x * 2)]) xml.append('<p e="%s" w="%s"/>' % p) idx = idx + 5 + (x * 2) xml.append('</r>') xml.append("</rs>") xmlstr = ''.join(xml) data = self._maybeCompress(xmlstr) final = val + data return final
[docs]class RangeIndex(SimpleIndex): """Index to enable searching over one-dimensional range (e.g. time). Need to use a RangeTokenMerger """ # 1 3 should match 1, 2, 3 # a c should match a* b* c # unsure about this - RangeIndex only necessary for 'encloses' queries # Also appropriate for 'within', so implememnted - John def search(self, session, clause, db): # Check if we can just use SimpleIndex.search if (clause.relation.value not in ['encloses', 'overlaps', 'within', '>', '>=', '<', '<=', '>=<']): return SimpleIndex.search(self, session, clause, db) else: p = self.permissionHandlers.get('info:srw/operation/2/search', None) if p: if not session.user: raise PermissionException("Authenticated user required to " "search index %s" % self.id) okay = p.hasPermission(session, session.user) if not okay: raise PermissionException("Permission required to search " "index %s" % self.id) # Final destination. Process Term. res = {} # Try to get process for relation/modifier, failing that relation, # fall back to that used for data for src in self.sources.get( clause.relation.toCQL(), self.sources.get(clause.relation.value, self.sources[u'data']) ): res.update(src[1].process(session, [[clause.term.value]])) store = self.get_path(session, 'indexStore') matches = [] rel = clause.relation if (len(res) != 1): msg = "%s %s" % (clause.relation.toCQL(), clause.term.value) raise QueryException(msg, 24) keys = res.keys()[0].split('/', 1) startK = keys[0] endK = keys[1] rel = clause.relation.value if rel in ['encloses', '<', '<=']: # RangeExtractor should already return the range in ascending # order termList = store.fetch_termList(session, self, startK, relation='<') if rel == 'encloses': # list comprehension is easier to understand #termList = filter(lambda t: endK < t[0].split('/', 1)[1], # termList) termList = [t for t in termList if (t[0].split('/', 1)[1] > endK)] elif rel == '<': termList = [t for t in termList if (t[0].split('/', 1)[1] < endK)] elif rel == '<=': termList = [t for t in termList if (t[0].split('/', 1)[1] <= endK)] elif rel == 'within': termList = store.fetch_termList(session, self, startK, end=endK) # List comprehension is easier to understand #termList = filter(lambda t: endK > t[0].split('/', 1)[1], # termList) termList = [t for t in termList if (endK > t[0].split('/', 1)[1])] elif rel in ['overlaps', '>=<']: # Fetch all which start before the end point termList = store.fetch_termList(session, self, endK, relation='<=') # Filter for only those that end after start point termList = [t for t in termList if (startK <= t[0].split('/', 1)[1])] elif rel.startswith('>'): termList = store.fetch_termList(session, self, endK, relation=rel) else: # This just SHOULD NOT have happened!... msg = '%s "%s"' % (clause.relation.toCQL(), clause.term.value) raise QueryException(msg, 24) matches.extend([self.construct_resultSet(session, t[1]) for t in termList]) base = self.resultSetClass(session, [], recordStore=self.recordStore) base.recordStoreSizes = self.recordStoreSizes base.index = self if not len(matches): return base else: rs = base.combine(session, matches, clause, db) return rs
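
# Simplified stand-alone sketch (invented terms): RangeIndex stores each
# extracted range as a single "start/end" term, so range relations reduce to
# string comparisons on the two halves, much like the 'within' branch of
# search() above (which additionally bounds the fetch at the indexStore).
def _example_range_within(termList=None, startK='1990', endK='2000'):
    if termList is None:
        termList = [('1991/1995', None), ('1985/1999', None),
                    ('1995/2010', None)]
    return [t for t in termList
            if (startK <= t[0].split('/', 1)[0] and
                t[0].split('/', 1)[1] <= endK)]
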
[docs]class BitmapIndex(SimpleIndex): # store as hex -- fast to generate, 1 byte per 4 bits. # eval to go from hex to long for bit manipulation _possiblePaths = { 'recordStore': { "docs": ("The recordStore in which the records are kept " "(as this info not maintained in the index)") } } def __init__(self, session, config, parent): SimpleIndex.__init__(self, session, config, parent) self.indexingData = SimpleBitfield() self.indexingTerm = "" self.recordStore = self.get_setting(session, 'recordStore', None) if not self.recordStore: rs = self.get_path(session, 'recordStore', None) if rs: self.recordStore = rs.id self.resultSetClass = BitmapResultSet def serialize_term(self, session, termId, data, nRecs=0, nOccs=0): # in: list of longs if len(data) == 1 and isinstance(data[0], SimpleBitfield): # HACK. Accept bitfield from mergeTerms bf = data[0] else: bf = SimpleBitfield() for item in data[::3]: bf[item] = 1 pack = struct.pack('<lll', termId, nRecs, nOccs) val = pack + str(bf) return val def calc_sectionOffsets(self, session, start, nRecs, dataLen): # order is (of course) backwards # so we need length of data etc etc. start = (dataLen - (start / 4) + 1) - (nRecs / 4) packing = dataLen - (start + (nRecs / 4) + 1) return [(start, (nRecs / 4) + 1, '0x', '0' * packing)] def deserialize_term(self, session, data, nRecs=-1, prox=0): lsize = 3 * self.longStructSize longs = data[:lsize] terms = list(struct.unpack('<lll', longs)) if len(data) > lsize: bf = SimpleBitfield(data[lsize:]) terms.append(bf) return terms def merge_term(self, session, currentData, newData, op="replace", nRecs=0, nOccs=0): (termid, oldTotalRecs, oldTotalOccs, oldBf) = currentData if op in['add', 'replace']: for t in newData[1::3]: oldBf[t] = 1 elif op == 'delete': for t in newData[1::3]: oldBf[t] = 0 trecs = oldBf.lenTrueItems() toccs = trecs merged = [termid, trecs, toccs, oldBf] return merged def construct_resultSetItem(self, session, term, rsiType=""): # in: single triple # out: resultSetItem # Need to map recordStore and docid at indexStore return self.indexStore.construct_resultSetItem(session, term[0], term[1], term[2]) def construct_resultSet(self, session, terms, queryHash={}): # in: unpacked # out: resultSet if len(terms) > 3: data = terms[3] s = BitmapResultSet(session, data, recordStore=self.recordStore) else: bmp = SimpleBitfield(0) s = BitmapResultSet(session, bmp, recordStore=self.recordStore) s.index = self if queryHash: s.queryTerm = queryHash['text'] s.queryFreq = queryHash['occurences'] if (terms): s.termid = terms[0] s.totalRecs = terms[1] s.totalOccs = terms[2] else: s.totalRecs = 0 s.totalOccs = 0 return s
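
# Illustrative sketch (record ids invented; relies only on the SimpleBitfield
# calls used above): how BitmapIndex represents the records containing a term,
# one bit per record id, serialized as hex after a <lll (termId, nRecs, nOccs)
# header.
def _example_bitmapIndex_term():
    import struct
    from cheshire3.utils import SimpleBitfield
    termId = 7
    recordIds = [2, 5, 11]
    bf = SimpleBitfield()
    for rid in recordIds:
        bf[rid] = 1                        # set the bit for each record
    nRecs = bf.lenTrueItems()
    # For a bitmap, occurrences == records (see merge_term above)
    header = struct.pack('<lll', termId, nRecs, nRecs)
    return header + str(bf)
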

class RecordIdentifierIndex(Index):

    _possibleSettings = {
        'recordStore': {
            "docs": ("The recordStore in which the records are kept "
                     "(as this info not maintained in the index)")
        }
    }

    def begin_indexing(self, session):
        pass

    def commit_indexing(self, session):
        pass

    def index_record(self, session, rec):
        return rec

    def delete_record(self, session, rec):
        pass

    def clear(self, session):
        pass

    def scan(self, session, clause, nTerms, direction):
        raise NotImplementedError()

    def search(self, session, clause, db):
        # Copy data from clause to resultSetItem after checking exists
        recordStore = self.get_path(session, 'recordStore')
        base = SimpleResultSet(session)
        if clause.relation.value in ['=', 'exact']:
            t = clause.term.value
            if t.isdigit():
                t = long(t)
            if recordStore.fetch_metadata(session, t, 'wordCount') > -1:
                item = SimpleResultSetItem(session)
                item.id = t
                item.recordStore = recordStore.id
                item.database = db.id
                items = [item]
            else:
                items = []
        elif clause.relation.value == 'any':
            # split on whitespace
            terms = clause.term.value.split()
            items = []
            for t in terms:
                if t.isdigit():
                    t = long(t)
                if recordStore.fetch_metadata(session, t, 'wordCount') > -1:
                    item = SimpleResultSetItem(session)
                    item.id = t
                    item.database = db.id
                    item.recordStore = recordStore.id
                    items.append(item)
        elif (clause.relation.value in ['<', '<='] and
              clause.term.value.isdigit()):
            t = long(clause.term.value)
            if clause.relation.value == '<':
                terms = range(t)
            else:
                terms = range(t + 1)
            items = []
            for t in terms:
                if recordStore.fetch_metadata(session, t, 'wordCount') > -1:
                    item = SimpleResultSetItem(session)
                    item.id = t
                    item.database = db.id
                    item.recordStore = recordStore.id
                    items.append(item)
        else:
            msg = '%s "%s"' % (clause.relation.toCQL(), clause.term.value)
            raise QueryException(msg, 24)
        base.fromList(items)
        base.index = self
        return base
        # rec.checksumValue
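
# Usage sketch (the 'rec-identifier' index name and record ids are
# hypothetical): RecordIdentifierIndex answers identifier look-ups directly
# against the recordStore, so a query like this never touches term files.
def _example_search_by_identifier(session, db, identifierIndex):
    clause = cql.parse('c3.rec-identifier any "12 45"')
    return identifierIndex.search(session, clause, db)
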

class ReverseMetadataIndex(Index):

    _possiblePaths = {
        'recordStore': {
            "docs": ("The recordStore in which the records are kept "
                     "(as this info not maintained in the index)")
        }
    }

    _possibleSettings = {
        'metadataType': {
            "docs": ("The type of metadata to provide an 'index' for. "
                     "Defaults to digestReverse.")
        }
    }

    def begin_indexing(self, session):
        pass

    def commit_indexing(self, session):
        pass

    def index_record(self, session, rec):
        return rec

    def delete_record(self, session, rec):
        pass

    def clear(self, session):
        pass

    def scan(self, session, clause, nTerms, direction):
        raise NotImplementedError()

    def search(self, session, clause, db):
        mtype = self.get_setting(session, 'metadataType', 'digestReverse')
        recordStore = self.get_path(session, 'recordStore')
        base = SimpleResultSet(session)
        if clause.relation.value in ['=', 'exact']:
            t = clause.term.value
            rid = recordStore.fetch_metadata(session, t, mtype)
            if rid:
                item = SimpleResultSetItem(session)
                item.id = rid
                item.recordStore = recordStore.id
                item.database = db.id
                items = [item]
            else:
                items = []
        elif clause.relation.value == 'any':
            # Split on whitespace
            terms = clause.term.value.split()
            items = []
            for t in terms:
                rid = recordStore.fetch_metadata(session, t, mtype)
                if rid:
                    item = SimpleResultSetItem(session)
                    item.id = rid
                    item.database = db.id
                    item.recordStore = recordStore.id
                    items.append(item)
        base.fromList(items)
        base.index = self
        return base
[docs]class PassThroughIndex(SimpleIndex): """Special Index pull in search terms from another Database.""" def _handleLxmlConfigNode(self, session, node): # Source if node.tag in ['xpath', '{%s}xpath' % CONFIG_NS, 'selector', '{%s}selector' % CONFIG_NS ]: ref = node.attrib.get('{%s}ref' % CONFIG_NS, node.attrib.get('ref', '')) if ref: xp = self.get_object(session, ref) else: xp = SimpleXPathProcessor(session, node, self) xp.sources = [[xp._handleLxmlLocationNode(session, node)]] self.xpath = xp def _handleConfigNode(self, session, node): # Source if (node.localname in ["xpath", "selector"]): ref = node.getAttributeNS('ref') if ref: xp = self.get_object(session, ref) else: xp = SimpleXPathProcessor(session, node, self) xp.sources = [[xp._handleLocationNode(session, node)]] self.xpath = xp def __init__(self, session, config, parent): self.xpath = None SimpleIndex.__init__(self, session, config, parent) dbStr = self.get_path(session, 'database', '') if not dbStr: raise ConfigFileException("No remote database given in " "%s" % self.id) db = session.server.get_object(session, dbStr) if not db: raise ConfigFileException("Unknown remote database given in " "%s" % self.id) self.database = db idxStr = self.get_path(session, 'remoteIndex', "") if not idxStr: raise ConfigFileException("No remote index given in %s" % self.id) idx = db.get_object(session, idxStr) if not idx: msg = ("Unknown index %s in remote database %s for %s" % (idxStr, db.id, self.id)) raise ConfigFileException(msg) self.remoteIndex = idx idxStr = self.get_path(session, 'remoteKeyIndex', "") if idxStr: idx = db.get_object(session, idxStr) if not idx: msg = ("Unknown index %s in remote database %s for %s" % (idxStr, db.id, self.id)) raise ConfigFileException(msg) self.remoteKeyIndex = idx else: self.remoteKeyIndex = None idx = self.get_path(session, 'localIndex', None) if not idx: raise ConfigFileException("No local index given in %s" % self.id) self.localIndex = idx def search(self, session, clause, db): # First do search on remote index currDb = session.database session.database = self.database.id rs = self.remoteIndex.search(session, clause, self.database) # Fetch all matched records values = {} for rsi in rs: rec = rsi.fetch_record(session) # Process xpath try: value = self.xpath.process_record(session, rec)[0][0] except: # No data where we expect it continue if value: values[value] = 1 # Construct search from keys and return local search localq = cql.parse('c3.%s any "%s"' % (self.localIndex.id, ' '.join(values.keys()))) session.database = currDb return self.localIndex.search(session, localq, db) def scan(self, session, clause, nTerms, direction=">="): """Scan remote index. Note Well: If term in remote doesn't appear, it's still included with trecs and toccs of 0. 
termid is always -1 as it's meaningless in the local context -- could be multiple or 0 """ currDb = session.database session.database = self.database.id scans = self.remoteIndex.scan(session, clause, nTerms, direction, summary=0) if not scans: return [] newscans = [] storeHash = {} end = 0 endMarker = '' while True: for info in scans: if len(info) == 3: end = 1 endMarker = info[2] term = info[0] termInfo = info[1] trecs = 0 toccs = 0 termid = -1 # Construct result set is more portable but slower rs = self.remoteIndex.construct_resultSet(session, termInfo) for rsi in rs: rec = rsi.fetch_record(session) # process xpath try: value = self.xpath.process_record(session, rec)[0][0] except: # no data where we expect it continue info = self.localIndex.fetch_term(session, value, summary=1, prox=0) if info: trecs += info[1] toccs += info[2] if trecs: newscans.append([term, [termid, trecs, toccs]]) if endMarker != '': newscans[-1].append(endMarker) endMarker = '' if (not end) and len(newscans) < nTerms + 1: # fetch new scans clause.term.value = scans[-1][0] if direction == "<=": direction = "<" elif direction == ">=": direction = ">" scans = self.remoteIndex.scan(session, clause, 10, direction, summary=0) else: break if endMarker != '' and len(newscans): newscans[-1].append(endMarker) return newscans[:nTerms] def fetch_sortValue(self, session, rec, ascending=True): if not self.remoteKeyIndex: return '' key = self.localIndex.fetch_sortValue(session, rec, ascending) if not key: return '' currDb = session.database session.database = self.database.id q = cql.parse('c3.%s exact "%s"' % (self.remoteKeyIndex.id, key)) rs = self.remoteKeyIndex.search(session, q, self.database) if rs: sv = self.remoteIndex.fetch_sortValue(session, rs[0], ascending) else: sv = '' session.database = currDb return sv # No need to do anything during indexing def begin_indexing(self, session): pass def commit_indexing(self, session): pass def index_record(self, session, rec): return rec def delete_record(self, session, rec): pass