import sys
import os
import re
import time
from lxml import etree
try:
# Name when installed by hand
import bsddb3 as bdb
except:
# Name that comes in Python 2.3
# though Python 2.3 no longer supported
import bsddb as bdb
from cheshire3.configParser import C3Object, CaselessDictionary
from cheshire3.baseObjects import Database, Index, ProtocolMap, Record
from cheshire3.baseStore import SummaryObject
from cheshire3.exceptions import (
ConfigFileException,
ObjectDoesNotExistException,
QueryException
)
from cheshire3.internal import CONFIG_NS
from cheshire3.bootstrap import BSParser, BootstrapDocument
from cheshire3.resultSet import SimpleResultSet, BitmapResultSet
import cheshire3.cqlParser as cql
class SimpleDatabase(SummaryObject, Database):
    """Default database implementation.

    Keeps caseless caches of the index and protocolMap objects configured
    for the database, records which record identifiers have been added per
    recordStore, and answers CQL ``search`` and ``scan`` requests by
    resolving indexes through the database's protocolMap.
    """

    _possibleSettings = {
        'srw': {
            'docs': 'Should the database be available via the SRW protocol',
            'type': int,
            'options': "0|1"
        },
        'sru': {
            'docs': 'Should the database be available via the SRU protocol',
            'type': int,
            'options': "0|1"
        },
        'z3950': {
            'docs': 'Should the database be available via the Z39.50 protocol',
            'type': int,
            'options': "0|1"
        },
        'remoteWorkflow': {
            'docs': ('Should the database be available via the remote '
                     'workflow protocol for Cheshire3. This MUST be secured, '
                     'so it is not recommended without fully understanding '
                     'the implications'),
            'type': int,
            'options': "0|1"
        },
        'oai-pmh': {
            'docs': 'Should the database be available via the OAI protocol',
            'type': int,
            'options': "0|1"
        },
        'www': {
            'docs': ("Should the database be available via Cheshire3's "
                     "introspective web search interface."),
            'type': int,
            'options': "0|1"
        }
    }

    _possiblePaths = {
        'indexStoreList': {
            'docs': ("Space separated list of indexStore identifiers for this "
                     "database.")
        },
        'indexStore': {
            'docs': "Single indexStore identifier for this database"
        },
        'recordStore': {
            'docs': "Single (default) recordStore identifier"
        },
        'protocolMap': {
            'docs': "Single (default) protocolMap identifier"
        }
    }

    # Class-level placeholders only; __init__ replaces each of these with a
    # fresh per-instance container.
    indexes = {}
    protocolMaps = {}
    indexConfigs = {}
    protocolMapConfigs = {}
    records = {}

    def __init__(self, session, config, parent):
        """Create the per-instance caches, then run both base constructors.

        If the session has no current database yet, this database's id is
        stored on it.
        """
        # The containers are created before the base constructors run
        # (presumably config handling fills them -- TODO confirm).
        self.indexes = CaselessDictionary()
        self.protocolMaps = CaselessDictionary()
        self.indexConfigs = CaselessDictionary()
        self.protocolMapConfigs = CaselessDictionary()
        self.records = {}
        Database.__init__(self, session, config, parent)
        SummaryObject.__init__(self, session, config, parent)
        if not session.database:
            session.database = self.id

    # -- private helpers ---------------------------------------------------

    def _getIndexStoreIds(self, session):
        """Return the list of indexStore identifiers for this database.

        The 'indexStoreList' path (space separated) wins; otherwise the
        single 'indexStore' path is used.

        Raises ConfigFileException when neither path is configured.
        """
        storeList = self.get_path(session, 'indexStoreList')
        if storeList:
            return storeList.split(' ')
        indexStore = self.get_path(session, 'indexStore')
        if not indexStore:
            msg = ("No indexStore/indexStoreList associated with "
                   "database: %s" % self.id)
            raise ConfigFileException(msg)
        return [indexStore.id]

    def _getProtocolMap(self, session):
        """Return the protocolMap to resolve indexes with.

        Uses the 'protocolMap' path when set; otherwise caches all
        configured protocolMaps, picks the SRW one and remembers it in
        ``self.paths`` for later calls.
        """
        pm = self.get_path(session, 'protocolMap')
        if not pm:
            self._cacheProtocolMaps(session)
            pm = self.protocolMaps.get('http://www.loc.gov/zing/srw/')
            self.paths['protocolMap'] = pm
        return pm

    @staticmethod
    def _iterStoreRefs(dom):
        """Yield the 'ref' of every <object> inside a <paths> child of dom.

        Handles both node flavours seen in configs: objects exposing
        ``childNodes`` (DOM style) and lxml elements.
        """
        if hasattr(dom, 'childNodes'):
            for pathsNode in dom.childNodes:
                if pathsNode.nodeType == 1 and pathsNode.localName == 'paths':
                    for objNode in pathsNode.childNodes:
                        if (objNode.nodeType == 1 and
                                objNode.localName == 'object'):
                            yield objNode.getAttributeNS(None, 'ref')
        else:
            pathsTags = ['paths', '{%s}paths' % CONFIG_NS]
            objectTags = ['object', '{%s}object' % CONFIG_NS]
            for pathsNode in dom.iterchildren(tag=etree.Element):
                if pathsNode.tag in pathsTags:
                    for objNode in pathsNode.iterchildren(tag=etree.Element):
                        if objNode.tag in objectTags:
                            # Plain attribute first, namespaced as fallback
                            yield objNode.attrib.get(
                                'ref',
                                objNode.attrib.get(
                                    '{%s}ref' % CONFIG_NS,
                                    ''
                                )
                            )

    @staticmethod
    def _sortOptions(pm, sortKey):
        """Work out the ordering parameters for one CQL sort key.

        Explicit modifiers on the sort key win, then the protocolMap's
        ``defaultSort*`` attributes (when present), then built-in defaults.

        Returns a tuple ``(ascending, missing, case, accents)``.
        """
        if sortKey['ascending']:
            ascending = True
        elif sortKey['descending']:
            ascending = False
        elif hasattr(pm, 'defaultSortDirection'):
            ascending = pm.defaultSortDirection[:3].lower() == 'asc'
        else:
            ascending = True

        if sortKey['missingomit']:
            miss = 0
        elif sortKey['missinghigh']:
            miss = 1
        elif sortKey['missinglow']:
            miss = -1
        elif sortKey['missingfail']:
            miss = cql.Diagnostic()
        elif sortKey['missingvalue']:
            miss = sortKey['missingvalue'].value
        elif hasattr(pm, 'defaultSortMissing'):
            default = pm.defaultSortMissing
            vals = ['low', 'omit', 'high']
            if default in vals:
                # low -> -1, omit -> 0, high -> 1
                miss = vals.index(default) - 1
            elif default == 'fail':
                miss = cql.Diagnostic()
            else:
                miss = default
        else:
            # No instruction at all: 1 when ascending, -1 when descending
            miss = [-1, 1][int(ascending)]

        def toggle(respectName, ignoreName, defaultAttr):
            # 1 = respect, 0 = ignore, None = no preference expressed
            if sortKey[respectName]:
                return 1
            if sortKey[ignoreName]:
                return 0
            if hasattr(pm, defaultAttr):
                if getattr(pm, defaultAttr).lower() in ['1', 'true']:
                    return 1
                return 0
            return None

        case = toggle('respectcase', 'ignorecase', 'defaultSortCase')
        accents = toggle('respectaccents', 'ignoreaccents',
                         'defaultSortAccents')
        return (ascending, miss, case, accents)

    def _cacheIndexes(self, session):
        """Fill ``self.indexes`` with indexes that belong to our indexStores.

        An index qualifies when its config has an <object> under <paths>
        whose 'ref' is one of this database's indexStore identifiers.

        Raises ConfigFileException when no indexStore is configured.
        """
        storeList = self._getIndexStoreIds(session)
        for (idxId, dom) in self.indexConfigs.iteritems():
            for storeRef in self._iterStoreRefs(dom):
                if storeRef in storeList:
                    self.indexes[idxId] = self.get_object(session, idxId)

    def _cacheProtocolMaps(self, session):
        """Instantiate every configured protocolMap, keyed by its protocol."""
        for pmId in self.protocolMapConfigs.iterkeys():
            pm = self.get_object(session, pmId)
            self.protocolMaps[pm.protocol] = pm

    # -- public API --------------------------------------------------------

    def get_indexes(self, session):
        """Return the cached index objects, refreshing the cache first."""
        self._cacheIndexes(session)
        return self.indexes.values()

    def add_record(self, session, rec):
        """Register rec with the database and accumulate its metadata.

        Record identifiers are kept per recordStore in ``self.records`` as
        a list of ``[first, last]`` runs; an identifier that directly
        follows the last run extends it, anything else opens a new run.

        Returns rec.
        """
        (storeid, recId) = (rec.recordStore, rec.id)
        try:
            full = self.records.get(storeid, [[]])
            k = full[-1]
            if len(k) > 1 and k[1] == recId - 1:
                k[1] = recId
            elif (len(k) == 1 and k[0] == recId - 1) or not k:
                k.append(recId)
            else:
                full.append([recId])
            self.records[storeid] = full
        except TypeError:
            # Best effort only: identifiers that do not support ``- 1``
            # (e.g. strings) cannot be tracked as runs, so skip tracking.
            pass
        self.accumulate_metadata(session, rec)
        return rec

    def index_record(self, session, rec):
        """Index rec in every index not flagged 'noIndexDefault'.

        Returns rec.
        """
        if not self.indexes:
            self._cacheIndexes(session)
        for idx in self.indexes.itervalues():
            if not idx.get_setting(session, 'noIndexDefault', 0):
                idx.index_record(session, rec)
        return rec

    def remove_record(self, session, rec):
        """Subtract rec from the database totals (items, words, bytes)."""
        self.totalItems -= 1
        # XXX remove from self.records
        # XXX this should be SummaryObject.unaccumulate_metadata() ?
        if rec.wordCount:
            self.totalWordCount -= rec.wordCount
        if rec.byteCount:
            self.totalByteCount -= rec.byteCount

    def unindex_record(self, session, rec):
        """Delete rec from every index not flagged 'noUnindexDefault'."""
        if not self.indexes:
            self._cacheIndexes(session)
        for idx in self.indexes.itervalues():
            if not idx.get_setting(session, 'noUnindexDefault', 0):
                idx.delete_record(session, rec)
        return None

    def begin_indexing(self, session):
        """Call begin_indexing on every cached index."""
        if not self.indexes:
            self._cacheIndexes(session)
        for idx in self.indexes.itervalues():
            idx.begin_indexing(session)
        return None

    def commit_indexing(self, session):
        """Call commit_indexing on every index currently in the cache."""
        for idx in self.indexes.itervalues():
            idx.commit_indexing(session)
        return None

    def clear_indexes(self, session):
        """Call clear on every cached index."""
        if not self.indexes:
            self._cacheIndexes(session)
        for idx in self.indexes.itervalues():
            idx.clear(session)
        return None

    def _search(self, session, query):
        """Recursively evaluate a CQL query tree and return a result set.

        A single clause is answered either from a stored result set (when
        it names one) or by the index the protocolMap resolves it to.
        A boolean triple evaluates both operands through ``self._search``
        and combines them.

        Raises ObjectDoesNotExistException for an unresolvable index and
        QueryException (code 18) for prox against a bitmap result set.
        """
        if not hasattr(query, 'leftOperand'):
            rsid = query.getResultSetId()
            if rsid:
                # Clause refers to an existing result set, optionally as
                # "<resultSetStore id>/<result set id>"
                if '/' in rsid:
                    (rssid, rsid) = rsid.split('/', 1)
                    rss = self.get_object(session, rssid)
                else:
                    rss = self.get_object(session, "defaultResultSetStore")
                rset = rss.fetch_resultSet(session, rsid)
                rset.fromStore = 1
                return rset

            pm = self._getProtocolMap(session)
            idx = pm.resolveIndex(session, query)
            if idx is None:
                # unsupported index
                raise ObjectDoesNotExistException(query.index.toCQL())
            # The protocolMap is attached only for the duration of the
            # index search
            query.config = pm
            rs = idx.search(session, query, self)
            query.config = None
            rs.query = query
            return rs

        left = self._search(session, query.leftOperand)
        right = self._search(session, query.rightOperand)
        if left.__class__ == right.__class__:
            new = left.__class__(session, [], recordStore=left.recordStore)
        elif left.__class__ == BitmapResultSet:
            # Want to switch the left/right,
            # but rset assumes list[0] is same type
            new = right.__class__(session, [],
                                  recordStore=right.recordStore)
            boolean = query.boolean.value
            if boolean == 'prox':
                # bitmaps can't do prox, so just raise
                # NOTE(review): relies on the bitmap result set exposing
                # an ``index`` attribute -- confirm.
                msg = "Cannot use Prox with %s" % left.index.toCQL()
                raise QueryException(msg, 18)
            if boolean == 'not':
                # can't reorder without changing query
                return new.combine(session, [left, right], query, self)
            return new.combine(session, [right, left], query, self)
        elif right.__class__ == BitmapResultSet:
            new = left.__class__(session, [], recordStore=left.recordStore)
        else:
            new = SimpleResultSet(session, [])
        rs = new.combine(session, [left, right], query, self)
        # Record the query that produced the combined set as a new triple
        # built from the operands' own queries
        trip = cql.Triple()
        trip.leftOperand = left.query
        trip.rightOperand = right.query
        trip.boolean = query.boolean
        rs.query = trip
        return rs

    def search(self, session, query):
        """Run query against the database and return an ordered result set.

        If the first indexStore has its own ``search`` it is used,
        otherwise the query tree is evaluated via ``_search``. Relevance
        ranked results are scaled and ordered by weight; other results are
        ordered by the query's CQL 1.2 sort keys, if it has any.

        The result set is stored on ``query.resultSet`` and its
        ``queryTime`` is set to the elapsed seconds.

        Raises ConfigFileException when no indexStore is configured.
        """
        # Check for optimized indexStore based search (eg SQL translation)
        storeList = self._getIndexStoreIds(session)
        # FIXME: Should respect multiple index stores somehow?
        idxStore = self.get_object(session, storeList[0])
        start = time.time()
        if hasattr(idxStore, 'search'):
            rs = idxStore.search(session, query, self)
        else:
            rs = self._search(session, query)

        # Now do top level stuff, like sort
        if rs.relevancy:
            rs.scale_weights()
            rs.order(session, "weight")
        else:
            # CQL 1.2 sort definition
            # URI: info:srw/cql-context-set/1/sort-v1.0
            try:
                sortKeys = query.sortKeys
            except AttributeError:
                # pre CQL 1.2
                sortKeys = None
            if sortKeys is not None:
                pm = self._getProtocolMap(session)
                exact = cql.Relation('exact')
                term = cql.Term('')
                # Stable sort = apply keys in reverse order. Iterate over
                # a reversed copy so the query's own sortKeys list is not
                # flipped as a side effect of searching.
                for sortKey in sortKeys[::-1]:
                    # resolve index
                    sc = cql.SearchClause(sortKey, exact, term)
                    index = pm.resolveIndex(session, sc)
                    # and find params from modifiers
                    (ascending, miss, case, accents) = self._sortOptions(
                        pm, sortKey
                    )
                    # Now, finally, order resultSet
                    rs.order(session, index, ascending=ascending,
                             missing=miss, case=case, accents=accents)
        query.resultSet = rs
        rs.queryTime = time.time() - start
        return rs

    def scan(self, session, clause, nTerms=25, direction=">="):
        """Scan the index that clause resolves to and return its result.

        Raises QueryException (code 38) when clause is a boolean triple
        and ObjectDoesNotExistException when its index cannot be resolved.
        """
        if hasattr(clause, 'leftOperand'):
            raise QueryException("Cannot use boolean in scan", 38)
        pm = self._getProtocolMap(session)
        idx = pm.resolveIndex(session, clause)
        if idx is None:
            raise ObjectDoesNotExistException(clause.index.toCQL())
        return idx.scan(session, clause, nTerms, direction)
class OptimisingDatabase(SimpleDatabase):
    """Experimental query optimising database.

    Before searching, multi-word clauses are rewritten into boolean
    triples of single-term clauses, every node of the query tree gets an
    estimated ``resultCount`` from an index scan, operands are reordered
    by that estimate and subtrees estimated at zero are not searched.
    """

    def __init__(self, session, config, parent):
        """Initialise as a SimpleDatabase and compile the masking regex."""
        SimpleDatabase.__init__(self, session, config, parent)
        # Matches an unescaped CQL masking character (* or ?)
        self.maskRe = re.compile(r'(?<!\\)[*?]')

    def _srwProtocolMap(self, session):
        """Return the protocolMap to resolve indexes with.

        Uses the 'protocolMap' path when set; otherwise caches all
        configured protocolMaps, picks the SRW one and remembers it in
        ``self.paths`` for later calls.
        """
        pm = self.get_path(session, 'protocolMap')
        if not pm:
            self._cacheProtocolMaps(session)
            pm = self.protocolMaps.get('http://www.loc.gov/zing/srw/')
            self.paths['protocolMap'] = pm
        return pm

    @staticmethod
    def _sortKeyOptions(pm, sortKey):
        """Work out the ordering parameters for one CQL sort key.

        Explicit modifiers on the sort key win, then the protocolMap's
        ``defaultSort*`` attributes (when present), then built-in defaults.

        Returns a tuple ``(ascending, missing, case, accents)``.
        """
        if sortKey['ascending']:
            ascending = True
        elif sortKey['descending']:
            ascending = False
        elif hasattr(pm, 'defaultSortDirection'):
            ascending = pm.defaultSortDirection[:3].lower() == 'asc'
        else:
            ascending = True

        if sortKey['missingomit']:
            miss = 0
        elif sortKey['missinghigh']:
            miss = 1
        elif sortKey['missinglow']:
            miss = -1
        elif sortKey['missingfail']:
            miss = cql.Diagnostic()
        elif sortKey['missingvalue']:
            miss = sortKey['missingvalue'].value
        elif hasattr(pm, 'defaultSortMissing'):
            default = pm.defaultSortMissing
            vals = ['low', 'omit', 'high']
            if default in vals:
                # low -> -1, omit -> 0, high -> 1
                miss = vals.index(default) - 1
            elif default == 'fail':
                miss = cql.Diagnostic()
            else:
                miss = default
        else:
            # No instruction at all: 1 when ascending, -1 when descending
            miss = [-1, 1][int(ascending)]

        def toggle(respectName, ignoreName, defaultAttr):
            # 1 = respect, 0 = ignore, None = no preference expressed
            if sortKey[respectName]:
                return 1
            if sortKey[ignoreName]:
                return 0
            if hasattr(pm, defaultAttr):
                if getattr(pm, defaultAttr).lower() in ['1', 'true']:
                    return 1
                return 0
            return None

        case = toggle('respectcase', 'ignorecase', 'defaultSortCase')
        accents = toggle('respectaccents', 'ignoreaccents',
                         'defaultSortAccents')
        return (ascending, miss, case, accents)

    def _rewriteQuery(self, session, query):
        """Rewrite multi-word clauses into triples of one-word clauses.

        For a single clause: 'all' becomes AND, 'any' becomes OR and '='
        with a space in the term becomes PROX over the individual words.
        Returns the newly parsed query, or None when the clause cannot or
        need not be rewritten.

        For a triple: both operands are rewritten in place and None is
        returned.
        """
        if hasattr(query, 'leftOperand'):
            n = self._rewriteQuery(session, query.leftOperand)
            if n:
                query.leftOperand = n
            n = self._rewriteQuery(session, query.rightOperand)
            if n:
                query.rightOperand = n
            return None

        relation = query.relation.value
        value = query.term.value
        if relation == "all":
            # Rewrite to AND triples
            nbool = " and "
        elif relation == "any":
            nbool = " or "
        elif relation == "=" and ' ' in value:
            # ``in`` rather than str.index(), which raises ValueError
            # when the term has no space. A term containing a space can
            # never be numeric, so no separate isnumeric() test is needed.
            nbool = " prox "
        else:
            # Can't rewrite
            return None

        # Now split on spaces; repeated spaces must not produce clauses
        # with an empty term
        terms = [t for t in value.split(' ') if t]
        if len(terms) < 2:
            return None
        indexCQL = query.index.toCQL()
        relationCQL = query.relation.toCQL()
        nq = [' '.join([indexCQL, relationCQL, '"' + t + '"'])
              for t in terms]
        return cql.parse(nbool.join(nq))

    def _attachResultCount(self, session, query):
        """Annotate every node of query with an estimated ``resultCount``.

        Clauses with masking characters get a fixed positive estimate of
        100; other clauses take their count from a one-term index scan
        (0 when the scanned term is not the clause's term). Triples derive
        their estimate from their operands and may swap them:
        AND puts the smaller operand first, OR the larger; PROX and NOT
        are never reordered. When the left operand of an AND/PROX is
        estimated at 0 the right operand is not examined at all.

        Raises ObjectDoesNotExistException for an unresolvable index.
        """
        if not hasattr(query, 'leftOperand'):
            # If have masking chrs, assign positive number
            if self.maskRe.search(query.term.value):
                query.resultCount = 100
                return None
            pm = self._srwProtocolMap(session)
            idx = pm.resolveIndex(session, query)
            if idx is None:
                # unsupported index; same error SimpleDatabase._search
                # raises for this situation
                raise ObjectDoesNotExistException(query.index.toCQL())
            # terms should be atomic now.
            scandata = idx.scan(session, query, 1)
            if scandata and scandata[0][0] == query.term.value:
                query.resultCount = scandata[0][1][1]
            else:
                # No matches (or nothing came back from the scan)
                query.resultCount = 0
            return None

        boolean = query.boolean.value
        # Operand counts only exist once the operand has been visited,
        # so each one is read after its own recursive call.
        self._attachResultCount(session, query.leftOperand)
        leftResultCount = query.leftOperand.resultCount
        if boolean in ['and', 'prox'] and leftResultCount == 0:
            query.resultCount = 0
            return None
        self._attachResultCount(session, query.rightOperand)
        rightResultCount = query.rightOperand.resultCount

        if boolean in ['and', 'prox']:
            query.resultCount = min(leftResultCount, rightResultCount)
            # Can't reorder prox
            swap = (boolean == 'and' and
                    rightResultCount < leftResultCount)
        elif boolean == 'or':
            query.resultCount = leftResultCount + rightResultCount
            swap = rightResultCount > leftResultCount
        else:
            # Can't really predict not and can't reorder. just take LHS
            query.resultCount = leftResultCount
            swap = False
        if swap:
            (query.leftOperand, query.rightOperand) = (query.rightOperand,
                                                       query.leftOperand)
        return None

    def _search(self, session, query):
        """Evaluate query, short-circuiting subtrees estimated at zero.

        Nodes without a ``resultCount`` annotation are searched normally.
        """
        if getattr(query, 'resultCount', None) == 0:
            # No matches in this full subtree
            rs = SimpleResultSet(session, [])
            # The parent triple reads ``.query`` off each operand's result
            rs.query = query
            return rs
        return SimpleDatabase._search(self, session, query)

    def search(self, session, query):
        """Optimise and run query; return an ordered result set.

        If the first indexStore has its own ``search`` its result is
        returned untouched. Otherwise the query is rewritten, annotated
        with result-count estimates and evaluated. Relevance ranked
        results are scaled and ordered by weight; other results are
        ordered by the sort keys of the query as it was passed in.

        Raises ConfigFileException when no indexStore is configured.
        """
        # Check for optimized indexStore based search (eg SQL translation)
        storeList = self.get_path(session, 'indexStoreList')
        if storeList:
            storeList = storeList.split(' ')
        else:
            indexStore = self.get_path(session, 'indexStore')
            if not indexStore:
                msg = ("No indexStore/indexStoreList associated with "
                       "database: %s" % self.id)
                raise ConfigFileException(msg)
            storeList = [indexStore.id]
        # FIXME: Should respect multiple index stores somehow?
        idxStore = self.get_object(session, storeList[0])
        # Check if there's an indexStore specific search function
        if hasattr(idxStore, 'search'):
            return idxStore.search(session, query, self)

        # Take the sort keys now: rewriting a single clause replaces
        # ``query`` with a freshly parsed object that does not carry them.
        # Pre CQL 1.2 queries have no sortKeys attribute at all.
        sortKeys = getattr(query, 'sortKeys', None)

        isSingleAny = (not hasattr(query, 'leftOperand') and
                       query.relation.value == "any")
        if not isSingleAny:
            # (Rewriting a lone 'any' clause would be futile.)
            n = self._rewriteQuery(session, query)
            if n:
                query = n

        if not hasattr(query, 'leftOperand'):
            # Single term or any in single clause
            query.resultCount = 1
            rs = self._search(session, query)
        else:
            # Triples... walk and look for ANDs that have a 0 length rs
            # Attach resultsets with counts
            self._attachResultCount(session, query)
            if query.resultCount == 0:
                # no matches
                return SimpleResultSet(session, [])
            rs = self._search(session, query)

        # now do top level stuff, like sort
        if rs.relevancy:
            rs.scale_weights()
            rs.order(session, "weight")
        elif sortKeys:
            # CQL 1.2 sort definition
            # URI: info:srw/cql-context-set/1/sort-v1.0
            pm = self._srwProtocolMap(session)
            exact = cql.Relation('exact')
            term = cql.Term('')
            # Stable sort = apply keys in reverse order; use a reversed
            # copy so the caller's sortKeys list is left as it was.
            for sortKey in sortKeys[::-1]:
                # Resolve the sort key itself (not the search query) to
                # the index to order by, as SimpleDatabase.search does
                sc = cql.SearchClause(sortKey, exact, term)
                index = pm.resolveIndex(session, sc)
                # and find params from modifiers
                (ascending, miss, case, accents) = self._sortKeyOptions(
                    pm, sortKey
                )
                # now, finally, order resultSet
                rs.order(session, index, ascending=ascending,
                         missing=miss, case=case, accents=accents)
        query.resultSet = rs
        return rs