"""Source code for cheshire3.database."""

import sys
import os
import re
import time

from lxml import etree
try:
    # Name when installed by hand
    import bsddb3 as bdb
except:
    # Name that comes in Python 2.3
    # though Python 2.3 no longer supported
    import bsddb as bdb

from cheshire3.configParser import C3Object, CaselessDictionary
from cheshire3.baseObjects import Database, Index, ProtocolMap, Record
from cheshire3.baseStore import SummaryObject
from cheshire3.exceptions import (
    ConfigFileException,
    ObjectDoesNotExistException,
    QueryException
)
from cheshire3.internal import CONFIG_NS
from cheshire3.bootstrap import BSParser, BootstrapDocument
from cheshire3.resultSet import SimpleResultSet, BitmapResultSet
import cheshire3.cqlParser as cql


class SimpleDatabase(SummaryObject, Database):
    """Default database implementation.

    Keeps per-instance caches of the index and protocolMap objects
    configured for the database, records which record identifiers have
    been added per recordStore, and resolves CQL queries against the
    indexes via the configured protocolMap.
    """

    _possibleSettings = {
        'srw': {
            'docs': 'Should the database be available via the SRW protocol',
            'type': int,
            'options': "0|1"
        },
        'sru': {
            'docs': 'Should the database be available via the SRU protocol',
            'type': int,
            'options': "0|1"
        },
        'z3950': {
            'docs': 'Should the database be available via the Z39.50 protocol',
            'type': int,
            'options': "0|1"
        },
        'remoteWorkflow': {
            'docs': ('Should the database be available via the remote '
                     'workflow protocol for Cheshire3. This MUST be secured, '
                     'so it is not recommended without fully understanding '
                     'the implications'),
            'type': int,
            'options': "0|1"
        },
        'oai-pmh': {
            'docs': 'Should the database be available via the OAI protocol',
            'type': int,
            'options': "0|1"
        },
        'www': {
            'docs': ("Should the database be available via Cheshire3's "
                     "introspective web search interface."),
            'type': int,
            'options': "0|1"
        }
    }

    _possiblePaths = {
        'indexStoreList': {
            'docs': ("Space separated list of indexStore identifiers for this "
                     "database.")
        },
        'indexStore': {
            'docs': "Single indexStore identifier for this database"
        },
        'recordStore': {
            'docs': "Single (default) recordStore identifier"
        },
        'protocolMap': {
            'docs': "Single (default) protocolMap identifier"
        }
    }

    # Class-level placeholders; __init__ replaces each with a fresh
    # per-instance container so state is never shared between databases.
    indexes = {}
    protocolMaps = {}
    indexConfigs = {}
    protocolMapConfigs = {}
    records = {}

    def __init__(self, session, config, parent):
        """Create the per-instance caches, then run both base initialisers.

        If the session has no current database, this database's id is
        recorded on it.
        """
        self.indexes = CaselessDictionary()
        self.protocolMaps = CaselessDictionary()
        self.indexConfigs = CaselessDictionary()
        self.protocolMapConfigs = CaselessDictionary()
        self.records = {}
        Database.__init__(self, session, config, parent)
        SummaryObject.__init__(self, session, config, parent)
        if not session.database:
            session.database = self.id

    def _getIndexStoreIds(self, session):
        """Return the list of indexStore identifiers for this database.

        Uses the space separated 'indexStoreList' path when it is set,
        otherwise the id of the single 'indexStore' path object.

        Raises ConfigFileException when neither path is configured.
        """
        storeList = self.get_path(session, 'indexStoreList')
        if storeList:
            return storeList.split(' ')
        indexStore = self.get_path(session, 'indexStore')
        if not indexStore:
            msg = ("No indexStore/indexStoreList associated with "
                   "database: %s" % self.id)
            raise ConfigFileException(msg)
        return [indexStore.id]

    def _getProtocolMap(self, session):
        """Return the protocolMap used to resolve CQL indexes.

        Prefers the configured 'protocolMap' path.  When that is not set,
        the protocolMaps are cached, the SRW map is looked up and the
        result (which may be None) is stored in self.paths for next time.
        """
        pm = self.get_path(session, 'protocolMap')
        if not pm:
            self._cacheProtocolMaps(session)
            pm = self.protocolMaps.get('http://www.loc.gov/zing/srw/')
            self.paths['protocolMap'] = pm
        return pm

    def _orderBySortKeys(self, session, rs, sortKeys):
        """Order `rs` by each CQL 1.2 sort key in `sortKeys`.

        Sort definition URI: info:srw/cql-context-set/1/sort-v1.0

        Each key's modifiers select direction, missing-value handling,
        case and accent sensitivity, falling back to the protocolMap's
        defaultSort* attributes when the modifier is absent.
        """
        pm = self._getProtocolMap(session)
        exact = cql.Relation('exact')
        term = cql.Term('')
        # Stable sort = apply keys in reverse order.  Iterate over a
        # reversed copy so the caller's sortKeys list is not mutated
        # (an in-place reverse flips key priority on a repeated search).
        for key in reversed(list(sortKeys)):
            # Resolve the index named by the sort key itself
            sc = cql.SearchClause(key, exact, term)
            index = pm.resolveIndex(session, sc)

            # Direction
            if key['ascending']:
                ascending = True
            elif key['descending']:
                ascending = False
            elif hasattr(pm, 'defaultSortDirection'):
                ascending = pm.defaultSortDirection[:3].lower() == 'asc'
            else:
                ascending = True

            # Missing values
            if key['missingomit']:
                miss = 0
            elif key['missinghigh']:
                miss = 1
            elif key['missinglow']:
                miss = -1
            elif key['missingfail']:
                miss = cql.Diagnostic()
            elif key['missingvalue']:
                miss = key['missingvalue'].value
            elif hasattr(pm, 'defaultSortMissing'):
                m = pm.defaultSortMissing
                vals = ['low', 'omit', 'high']
                if m in vals:
                    # low -> -1, omit -> 0, high -> 1
                    miss = int(vals.index(m)) - 1
                elif m == 'fail':
                    miss = cql.Diagnostic()
                else:
                    miss = m
            else:
                miss = [-1, 1][int(ascending)]

            # Case sensitivity
            if key['respectcase']:
                case = 1
            elif key['ignorecase']:
                case = 0
            elif hasattr(pm, 'defaultSortCase'):
                if pm.defaultSortCase.lower() in ['1', 'true']:
                    case = 1
                else:
                    case = 0
            else:
                case = None

            # Accent sensitivity
            if key['respectaccents']:
                accents = 1
            elif key['ignoreaccents']:
                accents = 0
            elif hasattr(pm, 'defaultSortAccents'):
                if pm.defaultSortAccents.lower() in ['1', 'true']:
                    accents = 1
                else:
                    accents = 0
            else:
                accents = None

            # Now, finally, order resultSet
            rs.order(session, index, ascending=ascending, missing=miss,
                     case=case, accents=accents)

    def _cacheIndexes(self, session):
        """Populate self.indexes with indexes that use our indexStore(s).

        An index is included when one of the <object> children of its
        <paths> configuration element has a 'ref' naming one of this
        database's indexStores.  Both DOM nodes and lxml elements are
        handled.
        """
        storeList = self._getIndexStoreIds(session)
        for (idxId, dom) in self.indexConfigs.iteritems():
            # see if index should be built
            if hasattr(dom, 'childNodes'):
                # DOM configuration node
                for c in dom.childNodes:
                    if c.nodeType == 1 and c.localName == 'paths':
                        for c2 in c.childNodes:
                            if c2.nodeType == 1 and c2.localName == 'object':
                                istore = c2.getAttributeNS(None, 'ref')
                                if istore in storeList:
                                    o = self.get_object(session, idxId)
                                    self.indexes[idxId] = o
            else:
                # lxml configuration element
                for c in dom.iterchildren(tag=etree.Element):
                    if c.tag in ['paths', '{%s}paths' % CONFIG_NS]:
                        for c2 in c.iterchildren(tag=etree.Element):
                            if c2.tag in ['object', '{%s}object' % CONFIG_NS]:
                                istore = c2.attrib.get(
                                    'ref',
                                    c2.attrib.get(
                                        '{%s}ref' % CONFIG_NS,
                                        ''
                                    )
                                )
                                if istore in storeList:
                                    o = self.get_object(session, idxId)
                                    self.indexes[idxId] = o

    def _cacheProtocolMaps(self, session):
        """Load every configured protocolMap, keyed by its protocol."""
        for pmId in self.protocolMapConfigs.iterkeys():
            pm = self.get_object(session, pmId)
            self.protocolMaps[pm.protocol] = pm

    def get_indexes(self, session):
        """Refresh the index cache and return the cached index objects."""
        self._cacheIndexes(session)
        return self.indexes.values()

    def add_record(self, session, rec):
        """Register `rec` with the database and accumulate its metadata.

        self.records maps each recordStore to a list of [first, last]
        identifier runs; consecutive identifiers extend the last run.
        Returns `rec`.
        """
        (storeid, recId) = (rec.recordStore, rec.id)
        try:
            full = self.records.get(storeid, [[]])
            k = full[-1]
            if (len(k) > 1 and k[1] == recId - 1):
                # Extends an existing [first, last] run
                k[1] = recId
            elif ((len(k) == 1 and k[0] == recId - 1) or not k):
                # Second member of a run, or the very first identifier
                k.append(recId)
            else:
                # Not consecutive; start a new run
                full.append([recId])
            self.records[storeid] = full
        except Exception:
            # Range tracking is best effort (e.g. identifiers that do not
            # support arithmetic); never let it block adding the record.
            # Narrowed from a bare except so that KeyboardInterrupt and
            # SystemExit still propagate.
            pass
        self.accumulate_metadata(session, rec)
        return rec

    def index_record(self, session, rec):
        """Index `rec` in every index not flagged 'noIndexDefault'.

        Returns `rec`.
        """
        if not self.indexes:
            self._cacheIndexes(session)
        for idx in self.indexes.itervalues():
            if not idx.get_setting(session, 'noIndexDefault', 0):
                idx.index_record(session, rec)
        return rec

    def remove_record(self, session, rec):
        """Subtract `rec` from the database's summary totals."""
        self.totalItems -= 1
        # XXX remove from self.records
        # XXX this should be SummaryObject.unaccumulate_metadata() ?
        if (rec.wordCount):
            self.totalWordCount -= rec.wordCount
        if (rec.byteCount):
            self.totalByteCount -= rec.byteCount

    def unindex_record(self, session, rec):
        """Delete `rec` from every index not flagged 'noUnindexDefault'."""
        if not self.indexes:
            self._cacheIndexes(session)
        for idx in self.indexes.itervalues():
            if not idx.get_setting(session, 'noUnindexDefault', 0):
                idx.delete_record(session, rec)
        return None

    def begin_indexing(self, session):
        """Call begin_indexing on every cached index."""
        if not self.indexes:
            self._cacheIndexes(session)
        for idx in self.indexes.itervalues():
            idx.begin_indexing(session)
        return None

    def commit_indexing(self, session):
        """Call commit_indexing on every cached index."""
        for idx in self.indexes.itervalues():
            idx.commit_indexing(session)
        return None

    def clear_indexes(self, session):
        """Call clear on every cached index."""
        if not self.indexes:
            self._cacheIndexes(session)
        for idx in self.indexes.itervalues():
            idx.clear(session)
        return None

    def _search(self, session, query):
        """Recursively evaluate `query` and return the resulting set.

        A single clause is either fetched from a resultSetStore (when it
        names an existing result set) or searched in the index resolved
        by the protocolMap.  A boolean triple evaluates both operands and
        combines them.

        Raises ObjectDoesNotExistException for an unresolvable index and
        QueryException when prox is applied to a bitmap result set.
        """
        if not hasattr(query, 'leftOperand'):
            # Check resultset
            rsid = query.getResultSetId()
            if (rsid):
                # Get existing result set
                if '/' in rsid:
                    (rssid, rsid) = rsid.split('/', 1)
                    rss = self.get_object(session, rssid)
                else:
                    rss = self.get_object(session, "defaultResultSetStore")
                rset = rss.fetch_resultSet(session, rsid)
                rset.fromStore = 1
                return rset

            pm = self._getProtocolMap(session)
            idx = pm.resolveIndex(session, query)
            if idx is None:
                # unsupported index
                raise ObjectDoesNotExistException(query.index.toCQL())
            # query.config is only set for the duration of the search
            query.config = pm
            rs = idx.search(session, query, self)
            query.config = None
            rs.query = query
            return rs

        left = self._search(session, query.leftOperand)
        right = self._search(session, query.rightOperand)
        if left.__class__ == right.__class__:
            new = left.__class__(session, [], recordStore=left.recordStore)
        elif left.__class__ == BitmapResultSet:
            # Want to switch the left/right,
            # but rset assumes list[0] is same type
            new = right.__class__(session, [], recordStore=right.recordStore)
            if query.boolean.value == 'prox':
                # bitmaps can't do prox, so just raise
                msg = "Cannot use Prox with %s" % left.index.toCQL()
                raise QueryException(msg, 18)
            elif query.boolean.value == 'not':
                # can't reorder without changing query
                return new.combine(session, [left, right], query, self)
            else:
                return new.combine(session, [right, left], query, self)
        elif right.__class__ == BitmapResultSet:
            new = left.__class__(session, [], recordStore=left.recordStore)
        else:
            new = SimpleResultSet(session, [])
        rs = new.combine(session, [left, right], query, self)
        trip = cql.Triple()
        trip.leftOperand = left.query
        trip.rightOperand = right.query
        trip.boolean = query.boolean
        rs.query = trip
        return rs

    def search(self, session, query):
        """Search for `query`, order the results and return the result set.

        Delegates to the first indexStore's own search() when it has one
        (eg SQL translation), otherwise evaluates the query with
        _search().  Relevance ranked sets are ordered by weight; other
        sets are ordered by the query's sort keys, if any.  The result
        set is attached to the query and stamped with the elapsed time.

        Raises ConfigFileException when no indexStore is configured.
        """
        storeList = self._getIndexStoreIds(session)
        # FIXME: Should respect multiple index stores somehow?
        idxStore = self.get_object(session, storeList[0])
        start = time.time()
        # Check if there's an indexStore specific search function
        if hasattr(idxStore, 'search'):
            rs = idxStore.search(session, query, self)
        else:
            rs = self._search(session, query)
        # Now do top level stuff, like sort
        if rs.relevancy:
            rs.scale_weights()
            rs.order(session, "weight")
        else:
            # pre CQL 1.2 queries have no sortKeys attribute
            sk = getattr(query, 'sortKeys', None)
            if sk:
                self._orderBySortKeys(session, rs, sk)
        query.resultSet = rs
        rs.queryTime = time.time() - start
        return rs

    def scan(self, session, clause, nTerms=25, direction=">="):
        """Scan the index resolved for `clause` and return its result.

        Raises QueryException when `clause` is a boolean triple, and
        ObjectDoesNotExistException when no index can be resolved.
        """
        if (hasattr(clause, 'leftOperand')):
            raise QueryException("Cannot use boolean in scan", 38)
        pm = self._getProtocolMap(session)
        idx = pm.resolveIndex(session, clause)
        if idx is None:
            raise ObjectDoesNotExistException(clause.index.toCQL())
        return idx.scan(session, clause, nTerms, direction)


class OptimisingDatabase(SimpleDatabase):
    """Experimental query optimising database.

    Rewrites multi-word clauses into boolean triples of single terms,
    estimates a result count for every node using index scans, reorders
    operands so the cheaper side is evaluated first, and short-circuits
    subtrees that cannot match.
    """

    def __init__(self, session, config, parent):
        SimpleDatabase.__init__(self, session, config, parent)
        # Matches unescaped CQL masking characters
        self.maskRe = re.compile(r'(?<!\\)[*?]')

    def _rewriteQuery(self, session, query):
        """Split a multi-word clause into a triple of single-term clauses.

        For a single clause, returns the newly parsed query, or None when
        the clause cannot be rewritten.  For a triple, the operands are
        rewritten in place and None is returned.
        """
        if hasattr(query, 'leftOperand'):
            n = self._rewriteQuery(session, query.leftOperand)
            if n:
                query.leftOperand = n
            n = self._rewriteQuery(session, query.rightOperand)
            if n:
                query.rightOperand = n
            return None

        value = query.term.value
        if query.relation.value == "all":
            # Rewrite to AND triples
            nbool = " and "
        elif query.relation.value == "any":
            nbool = " or "
        elif (
            query.relation.value == "=" and
            not value.isnumeric() and
            # Membership test: str.index() raises ValueError when the
            # term has no space rather than returning -1
            ' ' in value
        ):
            nbool = " prox "
        else:
            # Can't rewrite
            return None

        # Now split on spaces
        terms = value.split(' ')
        if len(terms) == 1:
            return None
        index = query.index.toCQL()
        relation = query.relation.toCQL()
        nq = [' '.join([index, relation, '"' + t + '"']) for t in terms]
        return cql.parse(nbool.join(nq))

    def _attachResultCount(self, session, query):
        """Annotate `query` and its sub-clauses with `resultCount`.

        Leaf counts come from a one-term index scan; a term containing
        masking characters is assigned a nominal count of 100.  Operands
        of 'and' / 'or' triples are swapped so that the smaller ('and')
        or larger ('or') estimate comes first.  When the left side of an
        'and' / 'prox' has no matches the right side is not visited.

        Raises ObjectDoesNotExistException for an unresolvable index.
        """
        if not hasattr(query, 'leftOperand'):
            # If have masking chrs, assign positive number
            if self.maskRe.search(query.term.value):
                query.resultCount = 100
                return None
            pm = self._getProtocolMap(session)
            idx = pm.resolveIndex(session, query)
            if idx is None:
                # unsupported index
                raise ObjectDoesNotExistException(query.index.toCQL())
            # terms should be atomic now.
            scandata = idx.scan(session, query, 1)
            if not scandata or scandata[0][0] != query.term.value:
                # No matches
                query.resultCount = 0
            else:
                query.resultCount = scandata[0][1][1]
            return None

        # Counts must be read only after the recursive call has set them
        self._attachResultCount(session, query.leftOperand)
        leftResultCount = query.leftOperand.resultCount
        boolean = query.boolean.value
        if boolean in ['and', 'prox'] and leftResultCount == 0:
            # Whole subtree cannot match; no need to look at the RHS
            query.resultCount = 0
            return None
        self._attachResultCount(session, query.rightOperand)
        rightResultCount = query.rightOperand.resultCount

        if boolean in ['and', 'prox']:
            query.resultCount = min(leftResultCount, rightResultCount)
            if boolean == "and" and rightResultCount < leftResultCount:
                # Can't reorder prox
                query.leftOperand, query.rightOperand = (
                    query.rightOperand, query.leftOperand
                )
        elif boolean == 'or':
            query.resultCount = leftResultCount + rightResultCount
            if rightResultCount > leftResultCount:
                query.leftOperand, query.rightOperand = (
                    query.rightOperand, query.leftOperand
                )
        else:
            # Can't really predict not and can't reorder. just take LHS
            query.resultCount = leftResultCount
        return None

    def _search(self, session, query):
        """Evaluate `query`, skipping subtrees known to have no matches."""
        # Nodes that were never visited by _attachResultCount have no
        # resultCount attribute; treat them as "unknown", not as empty.
        if getattr(query, 'resultCount', None) == 0:
            # No matches in this full subtree
            rs = SimpleResultSet(session, [])
            rs.query = query
            return rs
        return SimpleDatabase._search(self, session, query)

    def search(self, session, query):
        """Optimise `query`, search, order the results and return them.

        Delegates to the first indexStore's own search() when it has one
        (eg SQL translation).  Otherwise the query is rewritten where
        possible, result counts are attached, and an empty result set is
        returned straight away when nothing can match.

        Raises ConfigFileException when no indexStore is configured.
        """
        storeList = self._getIndexStoreIds(session)
        # FIXME: Should respect multiple index stores somehow?
        idxStore = self.get_object(session, storeList[0])
        # Check if there's an indexStore specific search function
        if hasattr(idxStore, 'search'):
            return idxStore.search(session, query, self)

        # Rewriting replaces the query object, so remember the caller's
        # query: it carries the sort keys and should get the result set.
        original = query
        sortKeys = getattr(original, 'sortKeys', None)

        if (
            (not hasattr(query, 'leftOperand')) and
            query.relation.value == "any"
        ):
            # Don't try to rewrite, futile.
            pass
        else:
            n = self._rewriteQuery(session, query)
            if n:
                query = n

        if not hasattr(query, 'leftOperand'):
            # Single term or any in single clause
            query.resultCount = 1
            rs = self._search(session, query)
        else:
            # Triples... walk and look for ANDs that have a 0 length rs
            # Attach resultsets with counts
            self._attachResultCount(session, query)
            if query.resultCount == 0:
                # no matches
                return SimpleResultSet(session, [])
            rs = self._search(session, query)

        # now do top level stuff, like sort
        if rs.relevancy:
            rs.scale_weights()
            rs.order(session, "weight")
        elif sortKeys:
            self._orderBySortKeys(session, rs, sortKeys)
        query.resultSet = rs
        if original is not query:
            original.resultSet = rs
        return rs