# Source code for cheshire3.database

import sys
import os
import re
import time

from lxml import etree
try:
    # Name when installed by hand
    import bsddb3 as bdb
except:
    # Name that comes in Python 2.3
    # though Python 2.3 no longer supported
    import bsddb as bdb

from cheshire3.configParser import C3Object, CaselessDictionary
from cheshire3.baseObjects import Database, Index, ProtocolMap, Record
from cheshire3.baseStore import SummaryObject
from cheshire3.exceptions import ConfigFileException,\
                                 ObjectDoesNotExistException, QueryException
from cheshire3.internal import CONFIG_NS
from cheshire3.bootstrap import BSParser, BootstrapDocument
from cheshire3.resultSet import SimpleResultSet, BitmapResultSet
import cheshire3.cqlParser as cql


class SimpleDatabase(SummaryObject, Database):
    """Default database implementation.

    Holds caches of the index and protocolMap objects configured for the
    database and implements record bookkeeping, indexing, searching
    (including CQL sort keys) and scanning on top of them.
    """

    _possibleSettings = {
        'srw': {
            'docs': 'Should the database be available via the SRW protocol',
            'type': int,
            'options': "0|1"
        },
        'sru': {
            'docs': 'Should the database be available via the SRU protocol',
            'type': int,
            'options': "0|1"
        },
        'z3950': {
            'docs': 'Should the database be available via the Z39.50 protocol',
            'type': int,
            'options': "0|1"
        },
        'remoteWorkflow': {
            'docs': ('Should the database be available via the remote '
                     'workflow protocol for Cheshire3. This MUST be secured, '
                     'so it is not recommended without fully understanding '
                     'the implications'),
            'type': int,
            'options': "0|1"
        },
        'oai-pmh': {
            'docs': 'Should the database be available via the OAI protocol',
            'type': int,
            'options': "0|1"
        },
        'www': {
            'docs': ("Should the database be available via Cheshire3's "
                     "introspective web search interface."),
            'type': int,
            'options': "0|1"
        }
    }

    _possiblePaths = {
        'indexStoreList': {
            'docs': ("Space separated list of indexStore identifiers for this "
                     "database.")
        },
        'indexStore': {
            'docs': "Single indexStore identifier for this database"
        },
        'recordStore': {
            'docs': "Single (default) recordStore identifier"
        },
        'protocolMap': {
            'docs': "Single (default) protocolMap identifier"
        }
    }

    # Class-level placeholders; every instance replaces them in __init__.
    indexes = {}
    protocolMaps = {}
    indexConfigs = {}
    protocolMapConfigs = {}
    records = {}

    def __init__(self, session, config, parent):
        """Set up per-instance caches, then initialise both base classes.

        If the session has no current database yet, this database's id is
        stored on it.
        """
        self.indexes = CaselessDictionary()
        self.protocolMaps = CaselessDictionary()
        self.indexConfigs = CaselessDictionary()
        self.protocolMapConfigs = CaselessDictionary()
        self.records = {}
        Database.__init__(self, session, config, parent)
        SummaryObject.__init__(self, session, config, parent)
        if not session.database:
            session.database = self.id

    # -- private helpers ---------------------------------------------------

    def _getIndexStoreIds(self, session):
        """Return the list of indexStore identifiers for this database.

        Uses the space separated 'indexStoreList' path when set, otherwise
        the id of the single 'indexStore' path object.

        Raises ConfigFileException when neither path is configured.
        """
        storeList = self.get_path(session, 'indexStoreList')
        if storeList:
            return storeList.split(' ')
        indexStore = self.get_path(session, 'indexStore')
        if not indexStore:
            msg = ("No indexStore/indexStoreList associated with "
                   "database: %s" % self.id)
            raise ConfigFileException(msg)
        return [indexStore.id]

    def _getProtocolMap(self, session):
        """Return the protocolMap used to resolve CQL indexes.

        Prefers the configured 'protocolMap' path.  Otherwise the
        protocolMaps are cached, the one registered for the SRW protocol
        URI is looked up and the result (possibly None) is remembered in
        self.paths for subsequent calls.
        """
        pm = self.get_path(session, 'protocolMap')
        if not pm:
            self._cacheProtocolMaps(session)
            pm = self.protocolMaps.get('http://www.loc.gov/zing/srw/')
            self.paths['protocolMap'] = pm
        return pm

    def _iterIndexStoreRefs(self, dom):
        """Yield the 'ref' of each <object> inside <paths> of an index config.

        Handles both configuration nodes exposing ``childNodes`` (DOM style)
        and lxml elements.
        """
        if hasattr(dom, 'childNodes'):
            for c in dom.childNodes:
                if c.nodeType == 1 and c.localName == 'paths':
                    for c2 in c.childNodes:
                        if c2.nodeType == 1 and c2.localName == 'object':
                            yield c2.getAttributeNS(None, 'ref')
        else:
            pathsTags = ['paths', '{%s}paths' % CONFIG_NS]
            objectTags = ['object', '{%s}object' % CONFIG_NS]
            nsRef = '{%s}ref' % CONFIG_NS
            for c in dom.iterchildren(tag=etree.Element):
                if c.tag in pathsTags:
                    for c2 in c.iterchildren(tag=etree.Element):
                        if c2.tag in objectTags:
                            yield c2.attrib.get('ref',
                                                c2.attrib.get(nsRef, ''))

    def _getSortParameters(self, key, pm):
        """Work out the ordering parameters for one CQL sort key.

        Modifiers on the sort key take precedence, then the protocolMap's
        defaultSort* attributes (when present), then built-in defaults.

        Returns a tuple (ascending, missing, case, accents).
        """
        # Direction
        if key['ascending']:
            ascending = True
        elif key['descending']:
            ascending = False
        elif hasattr(pm, 'defaultSortDirection'):
            ascending = pm.defaultSortDirection[:3].lower() == 'asc'
        else:
            ascending = True

        # Treatment of records that have no value for the key
        if key['missingomit']:
            miss = 0
        elif key['missinghigh']:
            miss = 1
        elif key['missinglow']:
            miss = -1
        elif key['missingfail']:
            miss = cql.Diagnostic()
        elif key['missingvalue']:
            miss = key['missingvalue'].value
        elif hasattr(pm, 'defaultSortMissing'):
            m = pm.defaultSortMissing
            vals = ['low', 'omit', 'high']
            if m in vals:
                # position in vals maps onto -1 / 0 / 1
                miss = int(vals.index(m)) - 1
            elif m == 'fail':
                miss = cql.Diagnostic()
            else:
                miss = m
        else:
            # low (-1) when descending, high (1) when ascending
            miss = [-1, 1][int(ascending)]

        # Case sensitivity
        if key['respectcase']:
            case = 1
        elif key['ignorecase']:
            case = 0
        elif hasattr(pm, 'defaultSortCase'):
            if pm.defaultSortCase.lower() in ['1', 'true']:
                case = 1
            else:
                case = 0
        else:
            case = None

        # Accent sensitivity
        if key['respectaccents']:
            accents = 1
        elif key['ignoreaccents']:
            accents = 0
        elif hasattr(pm, 'defaultSortAccents'):
            if pm.defaultSortAccents.lower() in ['1', 'true']:
                accents = 1
            else:
                accents = 0
        else:
            accents = None

        return (ascending, miss, case, accents)

    def _orderBySortKeys(self, session, rs, sortKeys):
        """Order resultSet `rs` according to CQL 1.2 sort keys.

        Sort definition URI: info:srw/cql-context-set/1/sort-v1.0

        The keys are applied in reverse order (stable sort, so the first
        key ends up most significant).  A reversed copy is iterated so that
        the caller's list is left untouched.
        """
        if not sortKeys:
            return None
        pm = self._getProtocolMap(session)
        exact = cql.Relation('exact')
        term = cql.Term('')
        for key in list(sortKeys)[::-1]:
            # Resolve the index named by the sort key itself
            sc = cql.SearchClause(key, exact, term)
            index = pm.resolveIndex(session, sc)
            (ascending, miss, case, accents) = self._getSortParameters(key,
                                                                       pm)
            rs.order(session, index, ascending=ascending, missing=miss,
                     case=case, accents=accents)
        return None

    def _cacheIndexes(self, session):
        """Populate self.indexes with indexes stored in our indexStore(s).

        An index is fetched (via get_object) when its configuration refers
        to one of the indexStore identifiers of this database.
        """
        storeList = self._getIndexStoreIds(session)
        for (idxId, dom) in self.indexConfigs.iteritems():
            # see if index should be built
            for istore in self._iterIndexStoreRefs(dom):
                if istore in storeList:
                    self.indexes[idxId] = self.get_object(session, idxId)

    def _cacheProtocolMaps(self, session):
        """Fetch every configured protocolMap, keyed by its protocol."""
        for pmId in self.protocolMapConfigs.iterkeys():
            pm = self.get_object(session, pmId)
            self.protocolMaps[pm.protocol] = pm

    # -- public API --------------------------------------------------------

    def get_indexes(self, session):
        """Refresh the index cache and return the cached index objects."""
        self._cacheIndexes(session)
        return self.indexes.values()

    def add_record(self, session, rec):
        """Register `rec` with the database and accumulate its metadata.

        self.records maps a recordStore to a list of runs of consecutive
        record ids, each run being [first] or [first, last].  Returns rec.
        """
        (storeid, recid) = (rec.recordStore, rec.id)
        try:
            full = self.records.get(storeid, [[]])
            k = full[-1]
            if (len(k) > 1 and k[1] == recid - 1):
                # extend the current run
                k[1] = recid
            elif ((len(k) == 1 and k[0] == recid - 1) or not k):
                k.append(recid)
            else:
                # not consecutive: start a new run
                full.append([recid])
            self.records[storeid] = full
        except Exception:
            # Range bookkeeping is best effort (presumably ids that do not
            # support `id - 1` end up here); never block adding the record.
            pass
        self.accumulate_metadata(session, rec)
        return rec

    def index_record(self, session, rec):
        """Index `rec` in every index not flagged 'noIndexDefault'.

        Returns rec.
        """
        if not self.indexes:
            self._cacheIndexes(session)
        for idx in self.indexes.itervalues():
            if not idx.get_setting(session, 'noIndexDefault', 0):
                idx.index_record(session, rec)
        return rec

    def remove_record(self, session, rec):
        """Subtract `rec` from the item, word and byte totals."""
        self.totalItems -= 1
        # XXX remove from self.records
        # XXX this should be SummaryObject.unaccumulate_metadata() ?
        if (rec.wordCount):
            self.totalWordCount -= rec.wordCount
        if (rec.byteCount):
            self.totalByteCount -= rec.byteCount

    def unindex_record(self, session, rec):
        """Delete `rec` from every index not flagged 'noUnindexDefault'."""
        if not self.indexes:
            self._cacheIndexes(session)
        for idx in self.indexes.itervalues():
            if not idx.get_setting(session, 'noUnindexDefault', 0):
                idx.delete_record(session, rec)
        return None

    def begin_indexing(self, session):
        """Call begin_indexing on every cached index."""
        if not self.indexes:
            self._cacheIndexes(session)
        for idx in self.indexes.itervalues():
            idx.begin_indexing(session)
        return None

    def commit_indexing(self, session):
        """Call commit_indexing on every cached index."""
        for idx in self.indexes.itervalues():
            idx.commit_indexing(session)
        return None

    def clear_indexes(self, session):
        """Call clear on every cached index."""
        if not self.indexes:
            self._cacheIndexes(session)
        for idx in self.indexes.itervalues():
            idx.clear(session)
        return None

    def _search(self, session, query):
        """Recursively evaluate a parsed CQL query and return a resultSet.

        A single clause is either a reference to a stored resultSet or is
        searched in the index the protocolMap resolves it to.  A triple
        (anything with a leftOperand) is evaluated by combining the
        resultSets of its two operands.

        Raises ObjectDoesNotExistException for a clause whose index cannot
        be resolved and QueryException when 'prox' is applied to a bitmap
        resultSet on the left.
        """
        if not hasattr(query, 'leftOperand'):
            # Check resultset
            rsid = query.getResultSetId()
            if (rsid):
                # Get existing result set
                if rsid.find('/') > -1:
                    (rssid, rsid) = rsid.split('/', 1)
                    rss = self.get_object(session, rssid)
                else:
                    rss = self.get_object(session, "defaultResultSetStore")
                rset = rss.fetch_resultSet(session, rsid)
                rset.fromStore = 1
                return rset

            pm = self._getProtocolMap(session)
            idx = pm.resolveIndex(session, query)
            if idx is None:
                # unsupported index
                raise ObjectDoesNotExistException(query.index.toCQL())
            query.config = pm
            try:
                rs = idx.search(session, query, self)
            finally:
                # always detach the protocolMap again, even on failure
                query.config = None
            rs.query = query
            return rs

        left = self._search(session, query.leftOperand)
        right = self._search(session, query.rightOperand)
        if left.__class__ == right.__class__:
            new = left.__class__(session, [], recordStore=left.recordStore)
        elif left.__class__ == BitmapResultSet:
            # Want to switch the left/right,
            # but rset assumes list[0] is same type
            new = right.__class__(session, [], recordStore=right.recordStore)
            if query.boolean.value == 'prox':
                # bitmaps can't do prox, so just raise
                msg = "Cannot use Prox with %s" % left.index.toCQL()
                raise QueryException(msg, 18)
            elif query.boolean.value == 'not':
                # can't reorder without changing query
                return new.combine(session, [left, right], query, self)
            else:
                return new.combine(session, [right, left], query, self)
        elif right.__class__ == BitmapResultSet:
            new = left.__class__(session, [], recordStore=left.recordStore)
        else:
            new = SimpleResultSet(session, [])
        rs = new.combine(session, [left, right], query, self)
        # Record the query that was actually evaluated
        trip = cql.Triple()
        trip.leftOperand = left.query
        trip.rightOperand = right.query
        trip.boolean = query.boolean
        rs.query = trip
        return rs

    def search(self, session, query):
        """Search the database with a parsed CQL query; return a resultSet.

        Delegates to the first indexStore when it provides its own `search`
        (e.g. SQL translation), otherwise evaluates the query with _search.
        Relevance ranked results are ordered by weight, anything else by
        the query's sort keys (if it has any).  The resultSet is attached
        to query.resultSet and the elapsed time stored in rs.queryTime.

        Raises ConfigFileException when no indexStore is configured.
        """
        storeList = self._getIndexStoreIds(session)
        # FIXME: Should respect multiple index stores somehow?
        idxStore = self.get_object(session, storeList[0])
        start = time.time()
        # Check if there's an indexStore specific search function
        if hasattr(idxStore, 'search'):
            rs = idxStore.search(session, query, self)
        else:
            rs = self._search(session, query)
        # Now do top level stuff, like sort
        if rs.relevancy:
            rs.scale_weights()
            rs.order(session, "weight")
        else:
            # Pre CQL 1.2 queries have no sortKeys attribute at all
            self._orderBySortKeys(session, rs,
                                  getattr(query, 'sortKeys', None))
        query.resultSet = rs
        rs.queryTime = time.time() - start
        return rs

    def scan(self, session, clause, nTerms=25, direction=">="):
        """Scan the index that `clause` resolves to.

        Returns whatever the index's scan returns for (clause, nTerms,
        direction).

        Raises QueryException for a boolean (triple) clause and
        ObjectDoesNotExistException when the index cannot be resolved.
        """
        if (hasattr(clause, 'leftOperand')):
            raise QueryException("Cannot use boolean in scan", 38)
        pm = self._getProtocolMap(session)
        idx = pm.resolveIndex(session, clause)
        if idx is None:
            raise ObjectDoesNotExistException(clause.index.toCQL())
        return idx.scan(session, clause, nTerms, direction)


class OptimisingDatabase(SimpleDatabase):
    """Experimental query optimising database.

    Rewrites multi-word clauses into boolean triples, estimates a result
    count for every node of the query from index scans and uses those
    counts to skip subtrees that cannot match and to reorder operands.
    """

    def __init__(self, session, config, parent):
        SimpleDatabase.__init__(self, session, config, parent)
        # Unescaped masking (wildcard) characters in a term
        self.maskRe = re.compile(r'(?<!\\)[*?]')

    def _rewriteQuery(self, session, query):
        """Split multi-word clauses into one clause per word.

        'all' becomes AND, 'any' becomes OR and '=' on a non-numeric term
        containing a space becomes PROX.  For a single clause the rewritten
        query is returned, or None when it cannot be rewritten.  A triple
        is rewritten in place (its operands are replaced) and None is
        returned.
        """
        if hasattr(query, 'leftOperand'):
            n = self._rewriteQuery(session, query.leftOperand)
            if n:
                query.leftOperand = n
            n = self._rewriteQuery(session, query.rightOperand)
            if n:
                query.rightOperand = n
            return None

        relation = query.relation.value
        termValue = query.term.value
        if relation == "all":
            # Rewrite to AND triples
            nbool = " and "
        elif relation == "any":
            nbool = " or "
        elif (relation == "=" and
              # NOTE(review): Python 2 byte strings have no isnumeric();
              # this assumes term values are unicode - confirm.
              not termValue.isnumeric() and
              # membership test: str.index() raises ValueError (it never
              # returns -1) when there is no space
              ' ' in termValue):
            nbool = " prox "
        else:
            # Can't rewrite
            return None
        # Now split on spaces
        terms = termValue.split(' ')
        if len(terms) == 1:
            return None
        nq = [' '.join([query.index.toCQL(),
                        query.relation.toCQL(),
                        '"' + t + '"'])
              for t in terms]
        return cql.parse(nbool.join(nq))

    def _attachResultCount(self, session, query):
        """Annotate `query` (and its operands) with an estimated resultCount.

        Clauses with masking characters get a nominal count of 100; other
        clauses are looked up with a one term scan of their index.  For
        triples the operands are annotated first, then combined; operands
        of AND / OR are swapped so that the smaller (AND) or larger (OR)
        one comes first.  When the left operand of an AND / PROX has no
        matches the right operand is not examined at all.

        Raises ObjectDoesNotExistException when a clause's index cannot be
        resolved.
        """
        if not hasattr(query, 'leftOperand'):
            # If have masking chrs, assign positive number
            if self.maskRe.search(query.term.value):
                query.resultCount = 100
                return None
            pm = self._getProtocolMap(session)
            idx = pm.resolveIndex(session, query)
            if idx is None:
                # unsupported index
                raise ObjectDoesNotExistException(query.index.toCQL())
            # terms should be atomic now.
            scandata = idx.scan(session, query, 1)
            if not scandata or scandata[0][0] != query.term.value:
                # No matches
                query.resultCount = 0
            else:
                query.resultCount = scandata[0][1][1]
            return None

        left = query.leftOperand
        right = query.rightOperand
        boolean = query.boolean.value

        # Counts only exist once the operand has been annotated, so read
        # them after (never before) the recursive calls.
        self._attachResultCount(session, left)
        if boolean in ['and', 'prox'] and left.resultCount == 0:
            query.resultCount = 0
            return None
        self._attachResultCount(session, right)
        leftCount = left.resultCount
        rightCount = right.resultCount

        if boolean in ['and', 'prox']:
            query.resultCount = min(leftCount, rightCount)
            if boolean == "and" and rightCount < leftCount:
                # Can't reorder prox
                query.leftOperand, query.rightOperand = right, left
        elif boolean == 'or':
            query.resultCount = leftCount + rightCount
            if rightCount > leftCount:
                query.leftOperand, query.rightOperand = right, left
        else:
            # Can't really predict not and can't reorder. just take LHS
            query.resultCount = leftCount
        return None

    def _search(self, session, query):
        """Evaluate `query`, short-circuiting subtrees known to be empty."""
        # Nodes that were never annotated are searched normally
        if getattr(query, 'resultCount', None) == 0:
            # No matches in this full subtree
            return SimpleResultSet(session, [])
        return SimpleDatabase._search(self, session, query)

    def search(self, session, query):
        """Search with query rewriting and result count based pruning.

        Delegates to the first indexStore when it provides its own `search`
        (its result is returned as is).  Otherwise the query is rewritten,
        annotated with result counts and evaluated; an empty resultSet is
        returned straight away when the counts show there are no matches.
        Results are then ordered by weight or by the query's sort keys,
        attached to query.resultSet and timed in rs.queryTime.

        Raises ConfigFileException when no indexStore is configured.
        """
        storeList = self._getIndexStoreIds(session)
        # FIXME: Should respect multiple index stores somehow?
        idxStore = self.get_object(session, storeList[0])
        # Check if there's an indexStore specific search function
        if hasattr(idxStore, 'search'):
            return idxStore.search(session, query, self)

        start = time.time()
        # A single 'any' clause is futile to rewrite; try everything else
        if hasattr(query, 'leftOperand') or query.relation.value != "any":
            n = self._rewriteQuery(session, query)
            if n:
                query = n

        if not hasattr(query, 'leftOperand'):
            # Single term or any in single clause
            query.resultCount = 1
            rs = self._search(session, query)
        else:
            # Triples... walk and look for ANDs that have a 0 length rs
            # Attach resultsets with counts
            self._attachResultCount(session, query)
            if query.resultCount == 0:
                # no matches
                return SimpleResultSet(session, [])
            rs = self._search(session, query)

        # now do top level stuff, like sort
        if rs.relevancy:
            rs.scale_weights()
            rs.order(session, "weight")
        else:
            # Pre CQL 1.2 queries have no sortKeys attribute at all
            self._orderBySortKeys(session, rs,
                                  getattr(query, 'sortKeys', None))
        query.resultSet = rs
        rs.queryTime = time.time() - start
        return rs