Source code for cheshire3.documentStore

from cheshire3.baseObjects import DocumentStore, Session
from cheshire3.exceptions import *
from cheshire3.document import StringDocument
from cheshire3.baseStore import BdbStore, BdbIter, FileSystemStore
from cheshire3.baseStore import DeletedObject
from cheshire3.baseStore import DirectoryStore, directoryStoreIter
try:
    # Name when installed by hand
    import bsddb3 as bdb
except:
    # Name that comes in Python 2.3, though Python >= 2.6 required
    import bsddb as bdb


class SimpleDocumentStore(DocumentStore):
    """Document store with optional processing on ingest and on retrieval.

    Documents being stored are passed through ``inPreParser`` or, failing
    that, ``inWorkflow`` when one is configured. Documents being fetched
    are passed through ``outPreParser`` or, failing that, ``outWorkflow``.

    The raw storage primitives used here (``store_data``, ``fetch_data``,
    ``delete_data``, ``generate_id``, ``generate_expires``) are not defined
    in this class; they are expected to come from another base class.
    """

    inPreParser = None
    outPreParser = None
    inWorkflow = None
    outWorkflow = None

    _possiblePaths = {
        'inPreParser': {
            'docs': ("Identifier for a preParser through which to pass the "
                     "documents being ingested."
                     )
        },
        'outPreParser': {
            'docs': ("Identifier for a preParser through which to pass the "
                     "documents being requested"
                     )
        }
    }

    def __init__(self, session, config, parent):
        """Resolve the configured pre-parsers and workflows.

        The ``DocumentStore`` initialiser is only invoked while
        ``self.paths`` is empty.
        """
        # NOTE(review): presumably ``paths`` is already populated when a
        # sibling base class has run the shared initialisation - confirm.
        if (not self.paths):
            DocumentStore.__init__(self, session, config, parent)
        self.inPreParser = self.get_path(session, 'inPreParser', None)
        self.outPreParser = self.get_path(session, 'outPreParser', None)
        self.inWorkflow = self.get_path(session, 'inWorkflow', None)
        self.outWorkflow = self.get_path(session, 'outWorkflow', None)

    def _check_permission(self, session, operation, action):
        """Raise PermissionException if ``operation`` is not allowed.

        Nothing is checked when no handler is registered for ``operation``
        in ``self.permissionHandlers``. ``action`` is the phrase inserted
        into the error message, e.g. ``"create an object in"``.
        """
        handler = self.permissionHandlers.get(operation, None)
        if not handler:
            return
        if not session.user:
            msg = "Authenticated user required to %s %s" % (action, self.id)
            raise PermissionException(msg)
        if not handler.hasPermission(session, session.user):
            msg = "Permission required to %s %s" % (action, self.id)
            raise PermissionException(msg)

    def create_document(self, session, doc=None):
        """Store ``doc`` under a newly generated id and return it.

        When ``doc`` is None an empty placeholder ``StringDocument`` is
        created. In both cases the document is given the generated id.

        Raises PermissionException when a create-permission handler denies
        the session's user. ObjectAlreadyExistsException raised while
        storing is propagated after the id counter has been rolled back.
        """
        self._check_permission(session, 'info:srw/operation/1/create',
                               "create an object in")
        new_id = self.generate_id(session)
        if doc is None:
            # Create a placeholder
            doc = StringDocument("")
        # The placeholder needs the generated id as well; otherwise the
        # caller gets back a document it cannot refer to.
        doc.id = new_id
        doc.documentStore = self.id
        try:
            self.store_document(session, doc)
        except ObjectAlreadyExistsException:
            # Back out id change. ``long`` is the Python 2 builtin; this
            # module targets Python 2.
            if isinstance(new_id, long):
                self.currentId -= 1
            raise
        return doc

    def store_document(self, session, doc):
        """Process ``doc`` for ingest and write it to the store.

        ``inPreParser`` takes precedence over ``inWorkflow``; at most one
        of them is applied. Metadata entries are only recorded for
        attributes of the document that are truthy.
        """
        doc.documentStore = self.id
        if (self.inPreParser is not None):
            doc = self.inPreParser.process_document(session, doc)
        elif self.inWorkflow is not None:
            doc = self.inWorkflow.process(session, doc)
        data = doc.get_raw(session)
        md = doc.metadata
        if doc.wordCount:
            md['wordCount'] = doc.wordCount
        if doc.byteCount:
            md['byteCount'] = doc.byteCount
        if self.expires or doc.expires:
            md['expires'] = self.generate_expires(session, doc)
        if doc.byteOffset:
            md['byteOffset'] = doc.byteOffset
        if doc.filename:
            md['filename'] = doc.filename
        self.store_data(session, doc.id, data, md)

    def fetch_document(self, session, id):
        """Return the document stored under ``id``.

        Raises PermissionException when a retrieve-permission handler
        denies the session's user, ObjectDeletedException when the stored
        value is a ``DeletedObject``, and ObjectDoesNotExistException when
        no data is found.
        """
        self._check_permission(session, 'info:srw/operation/2/retrieve',
                               "retrieve an object from")
        data = self.fetch_data(session, id)
        # Test for the deletion marker before truthiness, so that a truthy
        # DeletedObject is never wrapped up as if it were document data.
        if isinstance(data, DeletedObject):
            raise ObjectDeletedException(data)
        if not data:
            raise ObjectDoesNotExistException(id)
        doc = StringDocument(data)
        if (self.outPreParser is not None):
            doc = self.outPreParser.process_document(session, doc)
        elif (self.outWorkflow is not None):
            doc = self.outWorkflow.process(session, doc)
        doc.id = id
        doc.documentStore = self.id
        doc.parent = ('document', self.id, id)
        return doc

    def delete_document(self, session, id):
        """Delete the document identified by ``id``.

        ``id`` may also be a ``StringDocument``, in which case its ``id``
        attribute is used. Raises PermissionException when a
        delete-permission handler denies the session's user.
        """
        self._check_permission(session, 'info:srw/operation/1/delete',
                               "delete an object from")
        if isinstance(id, StringDocument):
            id = id.id
        self.delete_data(session, id)

    def _process_data(self, session, id, data, preParser=None):
        """Wrap raw ``data`` in a document and apply outgoing processing.

        An explicit ``preParser`` takes precedence over ``outPreParser``,
        which takes precedence over ``outWorkflow``; at most one is
        applied.
        """
        # Split from fetch record for Iterators
        doc = StringDocument(data)
        if (preParser is not None):
            doc = preParser.process_document(session, doc)
        elif (self.outPreParser is not None):
            doc = self.outPreParser.process_document(session, doc)
        elif (self.outWorkflow is not None):
            doc = self.outWorkflow.process(session, doc)
        # Ensure basic required info
        doc.id = id
        doc.documentStore = self.id
        return doc


class BdbDocIter(BdbIter):
    """Iterator that turns items from ``BdbIter`` into documents."""

    def next(self):
        """Return the next item as a ``StringDocument``.

        The item obtained from ``BdbIter.next`` is indexed positionally:
        element 1 supplies the document data and element 0 its id.
        """
        record = BdbIter.next(self)
        data = record[1]
        document = StringDocument(data)
        document.id = record[0]
        return document


class BdbDocumentStore(BdbStore, SimpleDocumentStore):
    """Document store combining ``BdbStore`` with ``SimpleDocumentStore``."""

    # Instantiate some type of simple doc store
    def __init__(self, session, config, parent):
        """Initialise both base classes, ``BdbStore`` first."""
        BdbStore.__init__(self, session, config, parent)
        SimpleDocumentStore.__init__(self, session, config, parent)

    def __iter__(self):
        """Return a ``BdbDocIter`` over this store."""
        return BdbDocIter(self.session, self)


class FileSystemDocumentStore(FileSystemStore, SimpleDocumentStore):
    """Document store combining ``FileSystemStore`` with
    ``SimpleDocumentStore``.
    """

    def __init__(self, session, config, parent):
        """Initialise both base classes, ``FileSystemStore`` first."""
        FileSystemStore.__init__(self, session, config, parent)
        SimpleDocumentStore.__init__(self, session, config, parent)

    def get_storageTypes(self, session):
        """Return the list of storage type names for this store.

        'filename', 'byteCount' and 'byteOffset' are always included;
        'digest' and 'expires' are appended when the setting of the same
        name is truthy.
        """
        types = ['filename', 'byteCount', 'byteOffset']
        if self.get_setting(session, 'digest'):
            types.append('digest')
        if self.get_setting(session, 'expires'):
            types.append('expires')
        return types


class DirectoryDocumentStore(DirectoryStore, SimpleDocumentStore):
    """Document store combining ``DirectoryStore`` with
    ``SimpleDocumentStore``.
    """

    # Instantiate some type of simple doc store
    def __init__(self, session, config, parent):
        """Initialise both base classes, ``DirectoryStore`` first."""
        DirectoryStore.__init__(self, session, config, parent)
        SimpleDocumentStore.__init__(self, session, config, parent)

    def __iter__(self):
        """Return a generator over the documents in this store."""
        return directoryDocumentStoreIter(self)

    def fetch_document(self, session, id_):
        """Fetch the document for ``id_`` and set its ``filename``.

        The filename is the path returned by ``_getFilePath`` for the
        normalized form of ``id_``.
        """
        # Fetch the document
        doc = SimpleDocumentStore.fetch_document(self, session, id_)
        # Assign the filename attribute
        internalId = self._normalizeIdentifier(session, id_)
        doc.filename = self._getFilePath(session, internalId)
        return doc


def directoryDocumentStoreIter(store):
    """Yield a ``StringDocument`` for each item in ``store``.

    Items come from ``directoryStoreIter(store)`` as (id, data) pairs.
    Each document is given its id, the file path for that id and the id of
    ``store``. A new ``Session`` is created for the identifier and path
    lookups.
    """
    session = Session()
    for id_, data in directoryStoreIter(store):
        doc = StringDocument(data)
        doc.id = id_
        internalId = store._normalizeIdentifier(session, id_)
        doc.filename = store._getFilePath(session, internalId)
        doc.documentStore = store.id
        yield doc