import sys
import os
import re
import time
from lxml import etree
try:
# Name when installed by hand
import bsddb3 as bdb
except:
# Name that comes in Python 2.3
# though Python 2.3 no longer supported
import bsddb as bdb
from cheshire3.configParser import C3Object, CaselessDictionary
from cheshire3.baseObjects import Database, Index, ProtocolMap, Record
from cheshire3.baseStore import SummaryObject
from cheshire3.exceptions import ConfigFileException,\
ObjectDoesNotExistException, QueryException
from cheshire3.internal import CONFIG_NS
from cheshire3.bootstrap import BSParser, BootstrapDocument
from cheshire3.resultSet import SimpleResultSet, BitmapResultSet
import cheshire3.cqlParser as cql
class SimpleDatabase(SummaryObject, Database):
    """Default database implementation.

    Caches the Index and ProtocolMap objects configured for the database,
    keeps per-recordStore ranges of the record identifiers added to it and
    turns CQL queries (single clauses or boolean triples) into result sets.
    """

    _possibleSettings = {
        'srw': {
            'docs': 'Should the database be available via the SRW protocol',
            'type': int,
            'options': "0|1"
        },
        'sru': {
            'docs': 'Should the database be available via the SRU protocol',
            'type': int,
            'options': "0|1"
        },
        'z3950': {
            'docs': 'Should the database be available via the Z39.50 protocol',
            'type': int,
            'options': "0|1"
        },
        'remoteWorkflow': {
            'docs': ('Should the database be available via the remote '
                     'workflow protocol for Cheshire3. This MUST be secured, '
                     'so it is not recommended without fully understanding '
                     'the implications'),
            'type': int,
            'options': "0|1"
        },
        'oai-pmh': {
            'docs': 'Should the database be available via the OAI protocol',
            'type': int,
            'options': "0|1"
        },
        'www': {
            'docs': ("Should the database be available via Cheshire3's "
                     "introspective web search interface."),
            'type': int,
            'options': "0|1"
        }
    }

    _possiblePaths = {
        'indexStoreList': {
            'docs': ("Space separated list of indexStore identifiers for this "
                     "database.")
        },
        'indexStore': {
            'docs': "Single indexStore identifier for this database"
        },
        'recordStore': {
            'docs': "Single (default) recordStore identifier"
        },
        'protocolMap': {
            'docs': "Single (default) protocolMap identifier"
        }
    }

    # Class-level placeholders only: __init__ rebinds every one of these on
    # the instance, so the mutable containers are never shared.
    indexes = {}
    protocolMaps = {}
    indexConfigs = {}
    protocolMapConfigs = {}
    records = {}

    def __init__(self, session, config, parent):
        """Create per-instance caches, then run both base initialisers.

        If the session has no current database yet, this database's id is
        recorded on it.
        """
        # The caches must exist before the base initialisers run.
        # NOTE(review): presumably config parsing fills indexConfigs and
        # protocolMapConfigs -- confirm against the base classes.
        self.indexes = CaselessDictionary()
        self.protocolMaps = CaselessDictionary()
        self.indexConfigs = CaselessDictionary()
        self.protocolMapConfigs = CaselessDictionary()
        # recordStore -> list of [id] / [firstId, lastId] runs (add_record)
        self.records = {}
        Database.__init__(self, session, config, parent)
        SummaryObject.__init__(self, session, config, parent)
        if not session.database:
            session.database = self.id

    def _getIndexStoreIds(self, session):
        """Return the list of indexStore identifiers for this database.

        Uses the space separated 'indexStoreList' path when it is set,
        otherwise the id of the single 'indexStore' path object.

        Raises ConfigFileException when neither path is configured.
        """
        storeList = self.get_path(session, 'indexStoreList')
        if storeList:
            return storeList.split(' ')
        indexStore = self.get_path(session, 'indexStore')
        if not indexStore:
            msg = ("No indexStore/indexStoreList associated with "
                   "database: %s" % self.id)
            raise ConfigFileException(msg)
        return [indexStore.id]

    def _getProtocolMap(self, session):
        """Return the protocolMap used to resolve indexes.

        Prefers the configured 'protocolMap' path.  Otherwise the
        protocolMaps are cached, the one registered for the SRW protocol
        URI is looked up and the result is stored in self.paths so that
        later calls find it via get_path.
        """
        pm = self.get_path(session, 'protocolMap')
        if not pm:
            self._cacheProtocolMaps(session)
            pm = self.protocolMaps.get('http://www.loc.gov/zing/srw/')
            self.paths['protocolMap'] = pm
        return pm

    def _iterIndexStoreRefs(self, dom):
        """Yield the 'ref' of each <object> inside <paths> of a config node.

        Handles both node flavours seen in indexConfigs: DOM style nodes
        (recognised by a 'childNodes' attribute) and lxml elements.
        """
        if hasattr(dom, 'childNodes'):
            for child in dom.childNodes:
                # nodeType 1 is an element node
                if child.nodeType == 1 and child.localName == 'paths':
                    for obj in child.childNodes:
                        if obj.nodeType == 1 and obj.localName == 'object':
                            yield obj.getAttributeNS(None, 'ref')
        else:
            pathsTags = ['paths', '{%s}paths' % CONFIG_NS]
            objectTags = ['object', '{%s}object' % CONFIG_NS]
            for child in dom.iterchildren(tag=etree.Element):
                if child.tag in pathsTags:
                    for obj in child.iterchildren(tag=etree.Element):
                        if obj.tag in objectTags:
                            # Plain attribute first, then the namespaced
                            # one, finally the empty string.
                            yield obj.attrib.get(
                                'ref',
                                obj.attrib.get('{%s}ref' % CONFIG_NS, '')
                            )

    def _cacheIndexes(self, session):
        """Load every configured index that refers to one of our indexStores.

        Raises ConfigFileException when no indexStore is configured.
        """
        storeList = self._getIndexStoreIds(session)
        for (indexId, dom) in self.indexConfigs.iteritems():
            # see if index should be built
            for storeRef in self._iterIndexStoreRefs(dom):
                if storeRef in storeList:
                    self.indexes[indexId] = self.get_object(session, indexId)

    def _cacheProtocolMaps(self, session):
        """Load every configured protocolMap, keyed by its protocol."""
        for pmId in self.protocolMapConfigs.iterkeys():
            pm = self.get_object(session, pmId)
            self.protocolMaps[pm.protocol] = pm

    def get_indexes(self, session):
        """Refresh the index cache and return the cached Index objects."""
        self._cacheIndexes(session)
        return self.indexes.values()

    def add_record(self, session, rec):
        """Register rec with the database and accumulate its metadata.

        The record id is folded into self.records[rec.recordStore], a list
        of runs: consecutive ids extend the last run ([first, last]), any
        other id starts a new run.  Returns rec.
        """
        (storeid, recId) = (rec.recordStore, rec.id)
        try:
            runs = self.records.get(storeid, [[]])
            last = runs[-1]
            if (len(last) > 1 and last[1] == recId - 1):
                # extends an existing [first, last] run
                last[1] = recId
            elif ((len(last) == 1 and last[0] == recId - 1) or not last):
                # second id of a run, or very first id for this store
                last.append(recId)
            else:
                runs.append([recId])
            self.records[storeid] = runs
        except Exception:
            # Best effort only: e.g. ids that do not support 'id - 1' cannot
            # be tracked as runs.  Unlike a bare except this no longer hides
            # KeyboardInterrupt / SystemExit.
            pass
        self.accumulate_metadata(session, rec)
        return rec

    def index_record(self, session, rec):
        """Index rec in every cached index not flagged 'noIndexDefault'.

        Returns rec.
        """
        if not self.indexes:
            self._cacheIndexes(session)
        for idx in self.indexes.itervalues():
            if not idx.get_setting(session, 'noIndexDefault', 0):
                idx.index_record(session, rec)
        return rec

    def remove_record(self, session, rec):
        """Subtract rec from the database totals.

        Decrements totalItems and, when rec carries them, its wordCount and
        byteCount from the corresponding totals.
        """
        self.totalItems -= 1
        # XXX remove from self.records
        # XXX this should be SummaryObject.unaccumulate_metadata() ?
        if (rec.wordCount):
            self.totalWordCount -= rec.wordCount
        if (rec.byteCount):
            self.totalByteCount -= rec.byteCount

    def unindex_record(self, session, rec):
        """Delete rec from every cached index not flagged 'noUnindexDefault'."""
        if not self.indexes:
            self._cacheIndexes(session)
        for idx in self.indexes.itervalues():
            if not idx.get_setting(session, 'noUnindexDefault', 0):
                idx.delete_record(session, rec)
        return None

    def begin_indexing(self, session):
        """Call begin_indexing on every cached index (caching them first)."""
        if not self.indexes:
            self._cacheIndexes(session)
        for idx in self.indexes.itervalues():
            idx.begin_indexing(session)
        return None

    def commit_indexing(self, session):
        """Call commit_indexing on every index currently in the cache."""
        for idx in self.indexes.itervalues():
            idx.commit_indexing(session)
        return None

    def clear_indexes(self, session):
        """Call clear on every cached index (caching them first)."""
        if not self.indexes:
            self._cacheIndexes(session)
        for idx in self.indexes.itervalues():
            idx.clear(session)
        return None

    def _search(self, session, query):
        """Recursively resolve a parsed CQL query into a result set.

        A search clause (no 'leftOperand') either names a stored result
        set, which is fetched from its resultSetStore, or is resolved to an
        index through the protocolMap and searched there.  A triple has
        both operands searched and the two result sets combined.

        Raises ObjectDoesNotExistException for a clause whose index cannot
        be resolved and QueryException when 'prox' is requested with a
        BitmapResultSet on the left.
        """
        if not hasattr(query, 'leftOperand'):
            # Check resultset
            rsid = query.getResultSetId()
            if rsid:
                # Get existing result set; "storeId/rsId" names the store
                if rsid.find('/') > -1:
                    (rssid, rsid) = rsid.split('/', 1)
                    rss = self.get_object(session, rssid)
                else:
                    rss = self.get_object(session, "defaultResultSetStore")
                rset = rss.fetch_resultSet(session, rsid)
                rset.fromStore = 1
                return rset
            pm = self._getProtocolMap(session)
            idx = pm.resolveIndex(session, query)
            if idx is None:
                # unsupported index
                raise ObjectDoesNotExistException(query.index.toCQL())
            # The protocolMap is attached to the query only for the
            # duration of the index search; reset it even on failure.
            query.config = pm
            try:
                rs = idx.search(session, query, self)
            finally:
                query.config = None
            rs.query = query
            return rs

        left = self._search(session, query.leftOperand)
        right = self._search(session, query.rightOperand)
        if left.__class__ == right.__class__:
            new = left.__class__(session, [], recordStore=left.recordStore)
        elif left.__class__ == BitmapResultSet:
            # Want to switch the left/right,
            # but rset assumes list[0] is same type
            new = right.__class__(session, [],
                                  recordStore=right.recordStore)
            if query.boolean.value == 'prox':
                # bitmaps can't do prox, so just raise
                msg = "Cannot use Prox with %s" % left.index.toCQL()
                raise QueryException(msg, 18)
            elif query.boolean.value == 'not':
                # can't reorder without changing query
                return new.combine(session, [left, right], query, self)
            else:
                return new.combine(session, [right, left], query, self)
        elif right.__class__ == BitmapResultSet:
            new = left.__class__(session, [], recordStore=left.recordStore)
        else:
            new = SimpleResultSet(session, [])
        rs = new.combine(session, [left, right], query, self)
        # Record the query that produced the combined set as a fresh triple
        # built from the operands' own queries.
        trip = cql.Triple()
        trip.leftOperand = left.query
        trip.rightOperand = right.query
        trip.boolean = query.boolean
        rs.query = trip
        return rs

    def _sortOptions(self, pm, sortKey):
        """Work out the ordering options for a single CQL sort key.

        Explicit modifiers on sortKey win; otherwise the defaultSort*
        attributes of the protocolMap pm are used when present; otherwise
        built-in defaults apply.

        Returns the tuple (ascending, missing, case, accents).
        """
        if sortKey['ascending']:
            ascending = True
        elif sortKey['descending']:
            ascending = False
        elif hasattr(pm, 'defaultSortDirection'):
            ascending = pm.defaultSortDirection[:3].lower() == 'asc'
        else:
            ascending = True

        if sortKey['missingomit']:
            miss = 0
        elif sortKey['missinghigh']:
            miss = 1
        elif sortKey['missinglow']:
            miss = -1
        elif sortKey['missingfail']:
            miss = cql.Diagnostic()
        elif sortKey['missingvalue']:
            miss = sortKey['missingvalue'].value
        elif hasattr(pm, 'defaultSortMissing'):
            m = pm.defaultSortMissing
            vals = ['low', 'omit', 'high']
            if m in vals:
                # position in vals maps to -1 / 0 / 1
                miss = int(vals.index(m)) - 1
            elif m == 'fail':
                miss = cql.Diagnostic()
            else:
                miss = m
        else:
            # -1 when descending, 1 when ascending
            miss = [-1, 1][int(ascending)]

        if sortKey['respectcase']:
            case = 1
        elif sortKey['ignorecase']:
            case = 0
        elif hasattr(pm, 'defaultSortCase'):
            if pm.defaultSortCase.lower() in ['1', 'true']:
                case = 1
            else:
                case = 0
        else:
            case = None

        if sortKey['respectaccents']:
            accents = 1
        elif sortKey['ignoreaccents']:
            accents = 0
        elif hasattr(pm, 'defaultSortAccents'):
            if pm.defaultSortAccents.lower() in ['1', 'true']:
                accents = 1
            else:
                accents = 0
        else:
            accents = None
        return (ascending, miss, case, accents)

    def search(self, session, query):
        """Search the database for query and return the ordered result set.

        The first configured indexStore is asked to run the whole query
        when it provides a 'search' method; otherwise the query is resolved
        by _search.  Result sets flagged as relevancy ranked are scaled and
        ordered by weight, others are ordered by the query's CQL 1.2 sort
        keys, if any.  The result set is also stored on query.resultSet and
        the elapsed time on its queryTime attribute.

        Raises ConfigFileException when no indexStore is configured.
        """
        # Check for optimized indexStore based search (eg SQL translation)
        storeList = self._getIndexStoreIds(session)
        # FIXME: Should respect multiple index stores somehow?
        idxStore = self.get_object(session, storeList[0])
        # Check if there's an indexStore specific search function
        start = time.time()
        if hasattr(idxStore, 'search'):
            rs = idxStore.search(session, query, self)
        else:
            rs = self._search(session, query)
        # Now do top level stuff, like sort
        if rs.relevancy:
            rs.scale_weights()
            rs.order(session, "weight")
        else:
            # CQL 1.2 sort definition
            # URI: info:srw/cql-context-set/1/sort-v1.0
            # (pre CQL 1.2 query objects have no sortKeys attribute)
            sortKeys = getattr(query, 'sortKeys', None)
            if sortKeys:
                pm = self._getProtocolMap(session)
                exact = cql.Relation('exact')
                term = cql.Term('')
                # stable sort = keys in reverse order.  Iterate a reversed
                # view instead of reversing in place, so that the caller's
                # query is not modified by searching with it.
                for sortKey in reversed(sortKeys):
                    # resolve index
                    sc = cql.SearchClause(sortKey, exact, term)
                    index = pm.resolveIndex(session, sc)
                    # and find params from modifiers
                    (ascending, miss, case, accents) = self._sortOptions(
                        pm, sortKey
                    )
                    # Now, finally, order resultSet
                    rs.order(session, index, ascending=ascending,
                             missing=miss, case=case, accents=accents)
        query.resultSet = rs
        rs.queryTime = time.time() - start
        return rs

    def scan(self, session, clause, nTerms=25, direction=">="):
        """Scan the index named by clause and return that index's scan data.

        Raises QueryException when clause is a boolean triple and
        ObjectDoesNotExistException when its index cannot be resolved.
        """
        if (hasattr(clause, 'leftOperand')):
            raise QueryException("Cannot use boolean in scan", 38)
        pm = self._getProtocolMap(session)
        idx = pm.resolveIndex(session, clause)
        if idx is None:
            raise ObjectDoesNotExistException(clause.index.toCQL())
        return idx.scan(session, clause, nTerms, direction)
class OptimisingDatabase(SimpleDatabase):
    """Experimental query optimising database.

    Before searching, multi-word clauses are rewritten into boolean
    triples and every node of the query is annotated with an estimated
    result count, so that operands can be reordered and sub-trees that
    cannot match are skipped.
    """

    def __init__(self, session, config, parent):
        """Initialise as a SimpleDatabase and compile the masking regex."""
        SimpleDatabase.__init__(self, session, config, parent)
        # Matches '*' or '?' unless preceded by a backslash
        self.maskRe = re.compile(r'(?<!\\)[*?]')

    def _rewriteQuery(self, session, query):
        """Rewrite multi-word clauses into boolean triples.

        For a search clause: 'all' becomes an AND of single-word clauses,
        'any' an OR, and '=' on a non-numeric multi-word term a PROX.  The
        newly parsed query is returned, or None when the clause cannot be
        rewritten.  For a triple both operands are rewritten in place and
        None is returned.
        """
        if hasattr(query, 'leftOperand'):
            n = self._rewriteQuery(session, query.leftOperand)
            if n:
                query.leftOperand = n
            n = self._rewriteQuery(session, query.rightOperand)
            if n:
                query.rightOperand = n
            return None

        relation = query.relation.value
        termValue = query.term.value
        if relation == "all":
            # Rewrite to AND triples
            nbool = " and "
        elif relation == "any":
            nbool = " or "
        elif (relation == "=" and not termValue.isnumeric() and
              ' ' in termValue):
            # Membership test rather than str.index(), which raises
            # ValueError for single-word terms.
            nbool = " prox "
        else:
            # Can't rewrite
            return None
        # Now split on spaces
        terms = termValue.split(' ')
        if len(terms) == 1:
            return None
        indexCql = query.index.toCQL()
        relationCql = query.relation.toCQL()
        clauses = [' '.join([indexCql, relationCql, '"' + t + '"'])
                   for t in terms]
        return cql.parse(nbool.join(clauses))

    def _attachResultCount(self, session, query):
        """Annotate query (recursively) with a 'resultCount' estimate.

        Clauses containing unescaped masking characters get a fixed
        positive count (100); other clauses get the count found by scanning
        their index for the term, or 0 when the scan does not return the
        term.  Triples derive their count from their operands, and the
        operands of 'and' / 'or' are swapped so that the smaller ('and') or
        larger ('or') one comes first.  When the left operand of an
        'and' / 'prox' has no matches the right operand is not visited.

        Raises ObjectDoesNotExistException when a clause's index cannot be
        resolved.
        """
        if not hasattr(query, 'leftOperand'):
            # If have masking chrs, assign positive number
            if self.maskRe.search(query.term.value):
                query.resultCount = 100
                return None
            pm = self.get_path(session, 'protocolMap')
            if not pm:
                self._cacheProtocolMaps(session)
                pm = self.protocolMaps.get('http://www.loc.gov/zing/srw/')
                self.paths['protocolMap'] = pm
            idx = pm.resolveIndex(session, query)
            if idx is None:
                # unsupported index; same error the actual search raises
                raise ObjectDoesNotExistException(query.index.toCQL())
            # terms should be atomic now.
            scandata = idx.scan(session, query, 1)
            if not scandata or scandata[0][0] != query.term.value:
                # No matches
                query.resultCount = 0
            else:
                query.resultCount = scandata[0][1][1]
            return None

        boolean = query.boolean.value
        # Operand counts only exist once the operand has been visited, so
        # each one is read after its own recursive call.
        self._attachResultCount(session, query.leftOperand)
        leftResultCount = query.leftOperand.resultCount
        if boolean in ['and', 'prox'] and leftResultCount == 0:
            query.resultCount = 0
            return None
        self._attachResultCount(session, query.rightOperand)
        rightResultCount = query.rightOperand.resultCount
        if boolean in ['and', 'prox']:
            query.resultCount = min(leftResultCount, rightResultCount)
            if boolean == "and" and rightResultCount < leftResultCount:
                # Can't reorder prox
                (query.leftOperand, query.rightOperand) = (
                    query.rightOperand, query.leftOperand
                )
        elif boolean == 'or':
            query.resultCount = leftResultCount + rightResultCount
            if rightResultCount > leftResultCount:
                (query.leftOperand, query.rightOperand) = (
                    query.rightOperand, query.leftOperand
                )
        else:
            # Can't really predict not and can't reorder. just take LHS
            query.resultCount = leftResultCount
        return None

    def _search(self, session, query):
        """Resolve query, short-circuiting sub-trees known to be empty."""
        # getattr: nodes that were never annotated are searched normally
        if getattr(query, 'resultCount', None) == 0:
            # No matches in this full subtree
            return SimpleResultSet(session, [])
        return SimpleDatabase._search(self, session, query)

    def search(self, session, query):
        """Search the database for query using the optimising strategy.

        The first configured indexStore is asked to run the whole query
        when it provides a 'search' method, and its result is returned
        directly.  Otherwise the query is rewritten, annotated with result
        counts and searched; relevancy ranked result sets are ordered by
        weight, others by the CQL 1.2 sort keys of the query passed in.
        The result set is also stored on the 'resultSet' attribute of the
        query.

        Raises ConfigFileException when no indexStore is configured.
        """
        # Check for optimized indexStore based search (eg SQL translation)
        storeList = self.get_path(session, 'indexStoreList')
        if not storeList:
            indexStore = self.get_path(session, 'indexStore')
            if not indexStore:
                msg = ("No indexStore/indexStoreList associated with "
                       "database: %s" % self.id)
                raise ConfigFileException(msg)
            storeList = [indexStore.id]
        else:
            storeList = storeList.split(' ')
        # FIXME: Should respect multiple index stores somehow?
        idxStore = self.get_object(session, storeList[0])
        # Check if there's an indexStore specific search function
        if hasattr(idxStore, 'search'):
            return idxStore.search(session, query, self)

        # Rewriting may replace the query object, so remember the caller's
        # query and its sort keys first.
        callerQuery = query
        sortKeys = getattr(query, 'sortKeys', None)
        if ((not hasattr(query, 'leftOperand')) and
                query.relation.value == "any"):
            # Don't try to rewrite, futile.
            pass
        else:
            n = self._rewriteQuery(session, query)
            if n:
                query = n

        if not hasattr(query, 'leftOperand'):
            # Single term or any in single clause
            query.resultCount = 1
            rs = self._search(session, query)
        else:
            # Triples... walk and look for ANDs that have a 0 length rs
            # Attach resultsets with counts
            self._attachResultCount(session, query)
            if query.resultCount == 0:
                # no matches
                rs = SimpleResultSet(session, [])
                callerQuery.resultSet = rs
                return rs
            rs = self._search(session, query)

        # now do top level stuff, like sort
        if rs.relevancy:
            rs.scale_weights()
            rs.order(session, "weight")
        elif sortKeys:
            # CQL 1.2 sort definition
            # URI: info:srw/cql-context-set/1/sort-v1.0
            pm = self.get_path(session, 'protocolMap')
            if not pm:
                self._cacheProtocolMaps(session)
                pm = self.protocolMaps.get('http://www.loc.gov/zing/srw/')
                self.paths['protocolMap'] = pm
            exact = cql.Relation('exact')
            term = cql.Term('')
            # stable sort = keys in reverse order; reversed() leaves the
            # caller's list of sort keys untouched.
            for sortKey in reversed(sortKeys):
                # resolve the index named by the sort key itself
                sc = cql.SearchClause(sortKey, exact, term)
                index = pm.resolveIndex(session, sc)
                # and find params from modifiers
                if sortKey['ascending']:
                    ascending = True
                elif sortKey['descending']:
                    ascending = False
                elif hasattr(pm, 'defaultSortDirection'):
                    ascending = pm.defaultSortDirection[:3].lower() == 'asc'
                else:
                    ascending = True

                if sortKey['missingomit']:
                    miss = 0
                elif sortKey['missinghigh']:
                    miss = 1
                elif sortKey['missinglow']:
                    miss = -1
                elif sortKey['missingfail']:
                    miss = cql.Diagnostic()
                elif sortKey['missingvalue']:
                    miss = sortKey['missingvalue'].value
                elif hasattr(pm, 'defaultSortMissing'):
                    m = pm.defaultSortMissing
                    vals = ['low', 'omit', 'high']
                    if m in vals:
                        # position in vals maps to -1 / 0 / 1
                        miss = int(vals.index(m)) - 1
                    elif m == 'fail':
                        miss = cql.Diagnostic()
                    else:
                        miss = m
                else:
                    # -1 when descending, 1 when ascending
                    miss = [-1, 1][int(ascending)]

                if sortKey['respectcase']:
                    case = 1
                elif sortKey['ignorecase']:
                    case = 0
                elif hasattr(pm, 'defaultSortCase'):
                    if pm.defaultSortCase.lower() in ['1', 'true']:
                        case = 1
                    else:
                        case = 0
                else:
                    case = None

                if sortKey['respectaccents']:
                    accents = 1
                elif sortKey['ignoreaccents']:
                    accents = 0
                elif hasattr(pm, 'defaultSortAccents'):
                    if pm.defaultSortAccents.lower() in ['1', 'true']:
                        accents = 1
                    else:
                        accents = 0
                else:
                    accents = None
                # now, finally, order resultSet
                rs.order(session, index, ascending=ascending,
                         missing=miss, case=case, accents=accents)
        query.resultSet = rs
        if callerQuery is not query:
            # The query was rewritten: expose the result on the object the
            # caller actually holds as well.
            callerQuery.resultSet = rs
        return rs