# Source code for cheshire3.database

import sys
import os
import re
import time

from lxml import etree
try:
    # Name when installed by hand
    import bsddb3 as bdb
except:
    # Name that comes in Python 2.3
    # though Python 2.3 no longer supported
    import bsddb as bdb

from cheshire3.configParser import C3Object, CaselessDictionary
from cheshire3.baseObjects import Database, Index, ProtocolMap, Record
from cheshire3.baseStore import SummaryObject
from cheshire3.exceptions import (
    ConfigFileException,
    ObjectDoesNotExistException,
    QueryException
)
from cheshire3.internal import CONFIG_NS
from cheshire3.bootstrap import BSParser, BootstrapDocument
from cheshire3.resultSet import SimpleResultSet, BitmapResultSet
import cheshire3.cqlParser as cql


class SimpleDatabase(SummaryObject, Database):
    """ Default database implementation"""

    # Settings recognised for this class.  Each entry maps a setting name to
    # a description ('docs'), a Python type ('type') and an 'options' string
    # (every entry here uses int and "0|1").
    _possibleSettings = {
        'srw': {
            'docs': 'Should the database be available via the SRW protocol',
            'type': int,
            'options': "0|1"
        },
        'sru': {
            'docs': 'Should the database be available via the SRU protocol',
            'type': int,
            'options': "0|1"
        },
        'z3950': {
            'docs': 'Should the database be available via the Z39.50 protocol',
            'type': int,
            'options': "0|1"
        },
        'remoteWorkflow': {
            'docs': ('Should the database be available via the remote '
                     'workflow protocol for Cheshire3. This MUST be secured, '
                     'so it is not recommended without fully understanding '
                     'the implications'),
            'type': int,
            'options': "0|1"
        },
        'oai-pmh': {
            'docs': 'Should the database be available via the OAI protocol',
            'type': int,
            'options': "0|1"
        },
        'www': {
            'docs': ("Should the database be available via Cheshire3's "
                     "introspective web search interface."),
            'type': int,
            'options': "0|1"
        }
    }

    # Paths recognised for this class; each entry carries a description
    # under 'docs'.  The methods below read them with self.get_path().
    _possiblePaths = {
        'indexStoreList': {
            'docs': ("Space separated list of indexStore identifiers for this "
                     "database.")
        },
        'indexStore': {
            'docs': "Single indexStore identifier for this database"
        },
        'recordStore': {
            'docs': "Single (default) recordStore identifier"
        },
        'protocolMap': {
            'docs': "Single (default) protocolMap identifier"
        }
    }

    # Class-level placeholders only: __init__ rebinds each of these names on
    # the instance, so these shared dicts are not the ones instances use.
    indexes = {}
    protocolMaps = {}
    indexConfigs = {}
    protocolMapConfigs = {}
    records = {}

    def __init__(self, session, config, parent):
        """Create per-instance caches, then run both base initialisers.

        The four lookup tables are created as empty CaselessDictionary
        objects and ``records`` as an empty dict before
        ``Database.__init__`` and ``SummaryObject.__init__`` are called (in
        that order).  If the session has no current database afterwards, it
        is pointed at this one.
        """
        for cacheName in ('indexes', 'protocolMaps',
                          'indexConfigs', 'protocolMapConfigs'):
            setattr(self, cacheName, CaselessDictionary())
        self.records = {}
        for base in (Database, SummaryObject):
            base.__init__(self, session, config, parent)
        if not session.database:
            session.database = self.id

    def _cacheIndexes(self, session):
        """Fill ``self.indexes`` from the configured index definitions.

        The set of index store identifiers comes from the 'indexStoreList'
        path (split on single spaces) or, failing that, from the ``id`` of
        the object at the 'indexStore' path.  Every entry of
        ``self.indexConfigs`` is then inspected: when one of its
        ``paths/object`` children has a ``ref`` naming one of those stores,
        the index is fetched with ``self.get_object`` and cached under its
        identifier.  Configs exposing ``childNodes`` are walked as DOM
        nodes; anything else is walked through the lxml element API.

        Raises ConfigFileException when neither path is configured.
        """
        storeIds = self.get_path(session, 'indexStoreList')
        if storeIds:
            storeIds = storeIds.split(' ')
        else:
            soleStore = self.get_path(session, 'indexStore')
            if not soleStore:
                raise ConfigFileException(
                    "No indexStore/indexStoreList associated with "
                    "database: %s" % self.id
                )
            storeIds = [soleStore.id]

        # Tag and attribute spellings accepted on the lxml side, with and
        # without the config namespace
        pathsTags = ['paths', '{%s}paths' % CONFIG_NS]
        objectTags = ['object', '{%s}object' % CONFIG_NS]
        qualifiedRef = '{%s}ref' % CONFIG_NS

        for (indexId, node) in self.indexConfigs.iteritems():
            # See if index should be built
            if hasattr(node, 'childNodes'):
                for child in node.childNodes:
                    if child.nodeType != 1 or child.localName != 'paths':
                        continue
                    for entry in child.childNodes:
                        if entry.nodeType != 1 or entry.localName != 'object':
                            continue
                        if entry.getAttributeNS(None, 'ref') in storeIds:
                            self.indexes[indexId] = self.get_object(session,
                                                                    indexId)
                continue

            for child in node.iterchildren(tag=etree.Element):
                if child.tag not in pathsTags:
                    continue
                for entry in child.iterchildren(tag=etree.Element):
                    if entry.tag not in objectTags:
                        continue
                    attrs = entry.attrib
                    ref = attrs.get('ref', attrs.get(qualifiedRef, ''))
                    if ref in storeIds:
                        self.indexes[indexId] = self.get_object(session,
                                                                indexId)

    def _cacheProtocolMaps(self, session):
        """Fetch every configured protocol map and cache it by protocol.

        Each identifier in ``self.protocolMapConfigs`` is resolved with
        ``self.get_object``; the result is stored in ``self.protocolMaps``
        under its ``protocol`` attribute.
        """
        for mapId in self.protocolMapConfigs.iterkeys():
            protocolMap = self.get_object(session, mapId)
            self.protocolMaps[protocolMap.protocol] = protocolMap

    def get_indexes(self, session):
        """Return the values of ``self.indexes`` after refreshing the cache.

        ``_cacheIndexes`` is run on every call, whether or not the cache is
        already populated.
        """
        self._cacheIndexes(session)
        cached = self.indexes
        return cached.values()

    def add_record(self, session, rec):
        """Register ``rec`` with this database and return it.

        ``self.records`` maps a record store to a list of ``[first, last]``
        identifier ranges.  The record's identifier either extends the last
        range (when it is exactly one more than the range's end) or starts
        a new range.  This bookkeeping is best effort: identifiers that do
        not support ``id - 1`` are ignored.  The record's metadata is
        accumulated via ``self.accumulate_metadata`` in every case.
        """
        (storeid, recId) = (rec.recordStore, rec.id)
        try:
            full = self.records.get(storeid, [[]])
            last = full[-1]
            if len(last) > 1 and last[1] == recId - 1:
                # Contiguous with a closed range: move its end forward
                last[1] = recId
            elif (len(last) == 1 and last[0] == recId - 1) or not last:
                # Either closes a one-element range or seeds an empty one
                last.append(recId)
            else:
                full.append([recId])
            self.records[storeid] = full
        except Exception:
            # Deliberately best effort (e.g. non-numeric identifiers), but
            # no longer swallows KeyboardInterrupt / SystemExit as the bare
            # ``except:`` did.
            pass
        self.accumulate_metadata(session, rec)
        return rec

    def index_record(self, session, rec):
        """Index ``rec`` in every cached index and return it.

        The index cache is populated first if it is empty.  Indexes whose
        'noIndexDefault' setting is truthy are skipped.
        """
        if not self.indexes:
            self._cacheIndexes(session)
        for index in self.indexes.itervalues():
            if index.get_setting(session, 'noIndexDefault', 0):
                continue
            index.index_record(session, rec)
        return rec

    def remove_record(self, session, rec):
        """Subtract ``rec`` from this database's summary totals.

        ``totalItems`` drops by one; ``totalWordCount`` and
        ``totalByteCount`` drop by the record's counts when those are
        truthy.  ``self.records`` is not updated.
        """
        self.totalItems -= 1
        # Read but not yet used -- see the XXX notes below
        storeId, recId = rec.recordStore, rec.id
        # XXX remove from self.records
        # XXX this should be SummaryObject.unaccumulate_metadata() ?
        words = rec.wordCount
        if words:
            self.totalWordCount -= words
        size = rec.byteCount
        if size:
            self.totalByteCount -= size

    def unindex_record(self, session, rec):
        """Delete ``rec`` from every cached index.  Returns None.

        The index cache is populated first if it is empty.  Indexes whose
        'noUnindexDefault' setting is truthy are skipped.
        """
        if not self.indexes:
            self._cacheIndexes(session)
        for index in self.indexes.itervalues():
            if index.get_setting(session, 'noUnindexDefault', 0):
                continue
            index.delete_record(session, rec)

    def begin_indexing(self, session):
        """Call ``begin_indexing`` on every cached index.  Returns None.

        The index cache is populated first if it is empty.
        """
        if not self.indexes:
            self._cacheIndexes(session)
        for index in self.indexes.itervalues():
            index.begin_indexing(session)

    def commit_indexing(self, session):
        """Call ``commit_indexing`` on every cached index.  Returns None.

        Unlike ``begin_indexing`` this does not populate the index cache;
        it works on whatever is currently in ``self.indexes``.
        """
        for index in self.indexes.itervalues():
            index.commit_indexing(session)

    def clear_indexes(self, session):
        """Call ``clear`` on every cached index.  Returns None.

        The index cache is populated first if it holds no entries.
        """
        if len(self.indexes) == 0:
            self._cacheIndexes(session)
        for index in self.indexes.itervalues():
            index.clear(session)

    def _search(self, session, query):
        """Evaluate a parsed query tree recursively and return a result set.

        A node with a ``leftOperand`` is treated as a boolean triple: both
        operands are searched and the two result sets are combined.  Any
        other node is a single clause, which is either a reference to a
        stored result set or a search against the index that the protocol
        map resolves for it.

        Raises ObjectDoesNotExistException when no index resolves for a
        clause, and QueryException when 'prox' is applied with a bitmap
        result set on the left.
        """
        if hasattr(query, 'leftOperand'):
            lhs = self._search(session, query.leftOperand)
            rhs = self._search(session, query.rightOperand)
            lhsType = lhs.__class__
            rhsType = rhs.__class__
            if lhsType == rhsType:
                combiner = lhsType(session, [], recordStore=lhs.recordStore)
            elif lhsType == BitmapResultSet:
                # Want to switch the left/right,
                # but rset assumes list[0] is same type
                combiner = rhsType(session, [], recordStore=rhs.recordStore)
                operator = query.boolean.value
                if operator == 'prox':
                    # Bitmaps can't do prox, so just raise
                    raise QueryException(
                        "Cannot use Prox with %s" % lhs.index.toCQL(),
                        18
                    )
                if operator == 'not':
                    # Can't reorder without changing query
                    operands = [lhs, rhs]
                else:
                    operands = [rhs, lhs]
                # This path returns without assigning a .query here
                return combiner.combine(session, operands, query, self)
            elif rhsType == BitmapResultSet:
                combiner = lhsType(session, [], recordStore=lhs.recordStore)
            else:
                combiner = SimpleResultSet(session, [])
            merged = combiner.combine(session, [lhs, rhs], query, self)
            # Record the query that produced the merged set
            triple = cql.Triple()
            triple.leftOperand = lhs.query
            triple.rightOperand = rhs.query
            triple.boolean = query.boolean
            merged.query = triple
            return merged

        # Single clause: check for a stored result set reference first
        rsid = query.getResultSetId()
        if rsid:
            if rsid.find('/') > -1:
                # "<store id>/<result set id>"
                (storeId, rsid) = rsid.split('/', 1)
            else:
                storeId = "defaultResultSetStore"
            store = self.get_object(session, storeId)
            stored = store.fetch_resultSet(session, rsid)
            stored.fromStore = 1
            return stored

        pmap = self.get_path(session, 'protocolMap')
        if not pmap:
            self._cacheProtocolMaps(session)
            pmap = self.protocolMaps.get('http://www.loc.gov/zing/srw/')
            self.paths['protocolMap'] = pmap
        index = pmap.resolveIndex(session, query)
        if index is None:
            # Unsupported index
            raise ObjectDoesNotExistException(query.index.toCQL())
        # The protocol map is attached to the query only while searching
        query.config = pmap
        found = index.search(session, query, self)
        query.config = None
        found.query = query
        return found

    def search(self, session, query):
        """Run ``query`` against this database and return the result set.

        The first configured index store performs the search when it has a
        ``search`` attribute; otherwise the query tree is evaluated by
        ``self._search``.  A result set flagged ``relevancy`` is
        weight-scaled and ordered by weight.  Any other result set is
        ordered by the query's CQL 1.2 sort keys, when it has any.

        Sort keys are applied in reverse order (least significant first);
        this assumes ``rs.order`` is a stable sort -- TODO confirm.  The
        reversal is done on a copy, so ``query.sortKeys`` keeps its order
        and the same query object can be searched again.

        Side effects: sets ``query.resultSet`` and ``rs.queryTime``.

        Raises ConfigFileException when neither an 'indexStoreList' nor an
        'indexStore' path is configured.
        """
        # Check for optimized indexStore based search (eg SQL translation)
        storeList = self.get_path(session, 'indexStoreList')
        if not storeList:
            indexStore = self.get_path(session, 'indexStore')
            if not indexStore:
                msg = ("No indexStore/indexStoreList associated with "
                       "database: %s" % self.id)
                raise ConfigFileException(msg)
            storeList = [indexStore.id]
        else:
            storeList = storeList.split(' ')

        # FIXME: Should respect multiple index stores somehow?
        idxStore = self.get_object(session, storeList[0])
        # Check if there's an indexStore specific search function
        start = time.time()
        if hasattr(idxStore, 'search'):
            rs = idxStore.search(session, query, self)
        else:
            rs = self._search(session, query)

        # Now do top level stuff, like sort
        if rs.relevancy:
            rs.scale_weights()
            rs.order(session, "weight")
        else:
            # CQL 1.2 sort definition
            # URI: info:srw/cql-context-set/1/sort-v1.0
            try:
                sortKeys = query.sortKeys
            except AttributeError:
                # pre CQL 1.2: nothing to sort by
                sortKeys = None
            if sortKeys:
                # Stable sort = keys in reverse order.  Reverse a copy so
                # the caller's query is left untouched.
                orderedKeys = list(sortKeys)
                orderedKeys.reverse()
                pm = self.get_path(session, 'protocolMap')
                if not pm:
                    self._cacheProtocolMaps(session)
                    pm = self.protocolMaps.get('http://www.loc.gov/zing/srw/')
                    self.paths['protocolMap'] = pm
                exact = cql.Relation('exact')
                term = cql.Term('')
                for idx in orderedKeys:
                    # Resolve the sort key to an index through a throwaway
                    # search clause
                    sc = cql.SearchClause(idx, exact, term)
                    index = pm.resolveIndex(session, sc)

                    # Direction: key modifier, then protocol map default
                    if idx['ascending']:
                        ascending = True
                    elif idx['descending']:
                        ascending = False
                    elif hasattr(pm, 'defaultSortDirection'):
                        direction = pm.defaultSortDirection[:3].lower()
                        ascending = direction == 'asc'
                    else:
                        ascending = True

                    # Missing values: -1 low, 0 omit, 1 high, a Diagnostic
                    # for 'fail', or a literal substitute value
                    if idx['missingomit']:
                        miss = 0
                    elif idx['missinghigh']:
                        miss = 1
                    elif idx['missinglow']:
                        miss = -1
                    elif idx['missingfail']:
                        miss = cql.Diagnostic()
                    elif idx['missingvalue']:
                        miss = idx['missingvalue'].value
                    elif hasattr(pm, 'defaultSortMissing'):
                        m = pm.defaultSortMissing
                        vals = ['low', 'omit', 'high']
                        if m in vals:
                            miss = int(vals.index(m)) - 1
                        elif m == 'fail':
                            miss = cql.Diagnostic()
                        else:
                            miss = m
                    else:
                        miss = [-1, 1][int(ascending)]

                    # Case sensitivity; None when nothing specifies it
                    if idx['respectcase']:
                        case = 1
                    elif idx['ignorecase']:
                        case = 0
                    elif hasattr(pm, 'defaultSortCase'):
                        if pm.defaultSortCase.lower() in ['1', 'true']:
                            case = 1
                        else:
                            case = 0
                    else:
                        case = None

                    # Accent sensitivity; None when nothing specifies it
                    if idx['respectaccents']:
                        accents = 1
                    elif idx['ignoreaccents']:
                        accents = 0
                    elif hasattr(pm, 'defaultSortAccents'):
                        if pm.defaultSortAccents.lower() in ['1', 'true']:
                            accents = 1
                        else:
                            accents = 0
                    else:
                        accents = None

                    # Now, finally, order resultSet
                    rs.order(session, index, ascending=ascending,
                             missing=miss, case=case, accents=accents)
        query.resultSet = rs
        rs.queryTime = time.time() - start
        return rs

    def scan(self, session, clause, nTerms=25, direction=">="):
        """Scan the index that ``clause`` resolves to and return the result.

        The clause is resolved through the 'protocolMap' path or, when that
        is unset, through the cached SRW protocol map.  The arguments are
        passed on to the index's own ``scan``.

        Raises QueryException when ``clause`` is a boolean triple, and
        ObjectDoesNotExistException when no index resolves for it.
        """
        if hasattr(clause, 'leftOperand'):
            raise QueryException("Cannot use boolean in scan", 38)
        pmap = self.get_path(session, 'protocolMap')
        if not pmap:
            self._cacheProtocolMaps(session)
            pmap = self.protocolMaps.get('http://www.loc.gov/zing/srw/')
            self.paths['protocolMap'] = pmap
        index = pmap.resolveIndex(session, clause)
        if index is None:
            raise ObjectDoesNotExistException(clause.index.toCQL())
        return index.scan(session, clause, nTerms, direction)


class OptimisingDatabase(SimpleDatabase):
    """ Experimental query optimising database """

    def __init__(self, session, config, parent):
        """Initialise as a SimpleDatabase, then compile the masking regex.

        ``self.maskRe`` matches a '*' or '?' that is not immediately
        preceded by a backslash.
        """
        SimpleDatabase.__init__(self, session, config, parent)
        unescapedMask = r'(?<!\\)[*?]'
        self.maskRe = re.compile(unescapedMask)

    def _rewriteQuery(self, session, query):
        """Split multi-word clauses into boolean combinations of one-word
        clauses.

        For a single clause, returns a newly parsed query in which every
        space-separated word of the term becomes its own clause, joined by
        'and' (relation "all"), 'or' (relation "any") or 'prox' (relation
        "=" on a non-numeric term containing a space).  Returns None when
        the clause cannot or need not be rewritten.

        For a boolean triple, the operands are rewritten in place and None
        is returned.
        """
        if hasattr(query, 'leftOperand'):
            # Triple: replace whichever operands can be rewritten
            n = self._rewriteQuery(session, query.leftOperand)
            if n:
                query.leftOperand = n
            n = self._rewriteQuery(session, query.rightOperand)
            if n:
                query.rightOperand = n
            return None

        relation = query.relation.value
        if relation == "all":
            # Rewrite to AND triples
            nbool = " and "
        elif relation == "any":
            nbool = " or "
        elif (relation == "=" and
              not query.term.value.isnumeric() and
              ' ' in query.term.value):
            # Membership test rather than str.index(), which raises
            # ValueError for a term with no space in it.
            nbool = " prox "
        else:
            # Can't rewrite
            return None

        # Now split on spaces
        terms = query.term.value.split(' ')
        if len(terms) == 1:
            return None
        indexCQL = query.index.toCQL()
        relationCQL = query.relation.toCQL()
        # NOTE(review): words are wrapped in double quotes without any
        # escaping -- confirm terms cannot contain a '"' at this point.
        nq = [' '.join([indexCQL, relationCQL, '"' + t + '"'])
              for t in terms]
        return cql.parse(nbool.join(nq))

    def _attachResultCount(self, session, query):
        """Annotate a query tree with estimated ``resultCount`` values.

        Single clauses: a term containing an unescaped '*' or '?' gets a
        nominal count of 100; otherwise the resolved index is scanned for
        one term, and the count is taken from the scan entry when that
        entry's term equals the query term, else 0.

        Triples: operands are annotated first and then combined -- the
        minimum for 'and'/'prox', the sum for 'or', and the left-hand count
        for anything else.  For 'and' and 'or' the operands are swapped
        when the right-hand count is smaller / larger respectively.  When
        the left operand of an 'and'/'prox' has a count of 0, the triple
        gets 0 and the right operand is not visited at all.

        Returns None.  Raises ObjectDoesNotExistException when no index
        resolves for a clause.
        """
        if not hasattr(query, 'leftOperand'):
            # If have masking chrs, assign positive number
            if self.maskRe.search(query.term.value):
                query.resultCount = 100
                return None
            pm = self.get_path(session, 'protocolMap')
            if not pm:
                self._cacheProtocolMaps(session)
                pm = self.protocolMaps.get('http://www.loc.gov/zing/srw/')
                self.paths['protocolMap'] = pm
            idx = pm.resolveIndex(session, query)
            if idx is None:
                # Unsupported index: same error the actual search raises
                raise ObjectDoesNotExistException(query.index.toCQL())
            # Terms should be atomic now.
            scandata = idx.scan(session, query, 1)
            if not scandata or scandata[0][0] != query.term.value:
                # No matches (or nothing came back from the scan at all)
                query.resultCount = 0
            else:
                query.resultCount = scandata[0][1][1]
            return None

        boolean = query.boolean.value
        # Counts must be read only AFTER the operand has been annotated;
        # before that the attribute is missing or stale.
        self._attachResultCount(session, query.leftOperand)
        leftResultCount = query.leftOperand.resultCount
        if boolean in ['and', 'prox'] and leftResultCount == 0:
            query.resultCount = 0
            return None

        self._attachResultCount(session, query.rightOperand)
        rightResultCount = query.rightOperand.resultCount
        if boolean in ['and', 'prox']:
            query.resultCount = min(leftResultCount, rightResultCount)
            if boolean == "and" and rightResultCount < leftResultCount:
                # Can't reorder prox
                query.leftOperand, query.rightOperand = (
                    query.rightOperand, query.leftOperand
                )
        elif boolean == 'or':
            query.resultCount = leftResultCount + rightResultCount
            if rightResultCount > leftResultCount:
                query.leftOperand, query.rightOperand = (
                    query.rightOperand, query.leftOperand
                )
        else:
            # Can't really predict not and can't reorder. just take LHS
            query.resultCount = leftResultCount
        return None

    def _search(self, session, query):
        """Evaluate ``query``, skipping subtrees already known to be empty.

        A node whose ``resultCount`` is 0 yields an empty SimpleResultSet
        without touching any index; every other node (including one with no
        ``resultCount`` at all) is passed to ``SimpleDatabase._search``.
        """
        if getattr(query, 'resultCount', None) == 0:
            # No matches in this full subtree.  The session is passed
            # first, as in the SimpleResultSet(session, []) call in
            # SimpleDatabase._search.
            return SimpleResultSet(session, [])
        return SimpleDatabase._search(self, session, query)

    def search(self, session, query):
        """Run ``query`` with rewriting and empty-subtree pruning.

        When the first configured index store has a ``search`` attribute,
        its result is returned directly.  Otherwise multi-word clauses are
        rewritten into boolean triples, result counts are attached, and the
        search is skipped entirely when the whole query is estimated at 0.
        A result set flagged ``relevancy`` is weight-scaled and ordered by
        weight; any other is ordered by the CQL 1.2 sort keys, if any.

        Sort keys are taken from the query as passed in, before rewriting
        (a rewritten query is freshly parsed), and are applied in reverse
        order from a copy so the caller's ``sortKeys`` is not modified.

        Side effects: sets ``resultSet`` on the query that was actually
        searched and on the query passed in.

        Raises ConfigFileException when neither an 'indexStoreList' nor an
        'indexStore' path is configured.
        """
        # Check for optimized indexStore based search (eg SQL translation)
        storeList = self.get_path(session, 'indexStoreList')
        if not storeList:
            indexStore = self.get_path(session, 'indexStore')
            if not indexStore:
                msg = ("No indexStore/indexStoreList associated with "
                       "database: %s" % self.id)
                raise ConfigFileException(msg)
            storeList = [indexStore.id]
        else:
            storeList = storeList.split(' ')
        # FIXME: Should respect multiple index stores somehow?
        idxStore = self.get_object(session, storeList[0])
        # Check if there's an indexStore specific search function
        if hasattr(idxStore, 'search'):
            return idxStore.search(session, query, self)

        original = query
        try:
            sortKeys = query.sortKeys
        except AttributeError:
            # pre CQL 1.2: nothing to sort by
            sortKeys = None

        # A lone "any" clause is not worth rewriting
        if hasattr(query, 'leftOperand') or query.relation.value != "any":
            n = self._rewriteQuery(session, query)
            if n:
                query = n

        if not hasattr(query, 'leftOperand'):
            # Single term or any in single clause
            query.resultCount = 1
            rs = self._search(session, query)
        else:
            # Triples... walk and look for ANDs that have a 0 length rs
            # Attach resultsets with counts
            self._attachResultCount(session, query)
            if query.resultCount == 0:
                # No matches
                return SimpleResultSet(session, [])
            rs = self._search(session, query)

        # Now do top level stuff, like sort
        if rs.relevancy:
            rs.scale_weights()
            rs.order(session, "weight")
        elif sortKeys:
            # CQL 1.2 sort definition
            # URI: info:srw/cql-context-set/1/sort-v1.0
            # Stable sort = keys in reverse order; reverse a copy only.
            orderedKeys = list(sortKeys)
            orderedKeys.reverse()
            pm = self.get_path(session, 'protocolMap')
            if not pm:
                self._cacheProtocolMaps(session)
                pm = self.protocolMaps.get('http://www.loc.gov/zing/srw/')
                self.paths['protocolMap'] = pm
            exact = cql.Relation('exact')
            term = cql.Term('')
            for idx in orderedKeys:
                # Resolve the sort key itself (not the query) to an index,
                # through a throwaway clause as SimpleDatabase.search does
                sc = cql.SearchClause(idx, exact, term)
                index = pm.resolveIndex(session, sc)

                # Direction: key modifier, then protocol map default
                if idx['ascending']:
                    ascending = True
                elif idx['descending']:
                    ascending = False
                elif hasattr(pm, 'defaultSortDirection'):
                    ascending = pm.defaultSortDirection[:3].lower() == 'asc'
                else:
                    ascending = True

                # Missing values: -1 low, 0 omit, 1 high, a Diagnostic for
                # 'fail', or a literal substitute value
                if idx['missingomit']:
                    miss = 0
                elif idx['missinghigh']:
                    miss = 1
                elif idx['missinglow']:
                    miss = -1
                elif idx['missingfail']:
                    miss = cql.Diagnostic()
                elif idx['missingvalue']:
                    miss = idx['missingvalue'].value
                elif hasattr(pm, 'defaultSortMissing'):
                    m = pm.defaultSortMissing
                    vals = ['low', 'omit', 'high']
                    if m in vals:
                        miss = int(vals.index(m)) - 1
                    elif m == 'fail':
                        miss = cql.Diagnostic()
                    else:
                        miss = m
                else:
                    miss = [-1, 1][int(ascending)]

                # Case sensitivity; None when nothing specifies it
                if idx['respectcase']:
                    case = 1
                elif idx['ignorecase']:
                    case = 0
                elif hasattr(pm, 'defaultSortCase'):
                    if pm.defaultSortCase.lower() in ['1', 'true']:
                        case = 1
                    else:
                        case = 0
                else:
                    case = None

                # Accent sensitivity; None when nothing specifies it
                if idx['respectaccents']:
                    accents = 1
                elif idx['ignoreaccents']:
                    accents = 0
                elif hasattr(pm, 'defaultSortAccents'):
                    if pm.defaultSortAccents.lower() in ['1', 'true']:
                        accents = 1
                    else:
                        accents = 0
                else:
                    accents = None

                # Now, finally, order resultSet
                rs.order(session, index, ascending=ascending,
                         missing=miss, case=case, accents=accents)

        query.resultSet = rs
        if original is not query:
            # The caller holds the un-rewritten query; give it the result
            # set too, as SimpleDatabase.search does.
            original.resultSet = rs
        return rs