# Source code for cheshire3.baseObjects

"""Abstract Base Classes for Cheshire3 Objects.

Defines the base classes of object in the Cheshire3 Object model, their
API method and documentation.

Functional implementations are contained in the module for each class e.g.
Server in cheshire3.server etc.
"""

from cheshire3.session import Session
from cheshire3.configParser import C3Object


class Server(C3Object):
    """A Server object is a collection point for other objects.

    A Server is a collection point for other objects and an initial entry
    into the system for requests from a ProtocolHandler. A server might know
    about several Databases, RecordStores and so forth, but its main function
    is to check whether the request should be accepted or not and create an
    environment in which the request can be processed.

    It will likely have access to a UserStore database which maintains
    authentication and authorization information. The exact nature of this
    information is not defined, allowing many possible backend
    implementations.

    Servers are the top level of configuration for the system and hence their
    constructor requires the path to a local XML configuration file, however
    from then on configuration information may be retrieved from other
    locations such as a remote datastore to enable distributed environments
    to maintain synchronicity.
    """

    # Class-level defaults. The dict object is shared by every instance
    # until an instance rebinds the attribute.
    databases = {}
    authStore = None
    resultSetStore = None
    queryStore = None

    def __init__(self, session, configFile="serverConfig.xml"):
        """Construct a Server from a Session and a configuration file path.

        The constructor takes a Session object and a file path to
        the configuration file to be parsed and processed.

        Always raises NotImplementedError in this base class.
        """
        raise NotImplementedError


class Database(C3Object):
    """A Database is a collection of Records and Indexes.

    It is responsible for maintaining and allowing access to its components,
    as well as metadata associated with the collections. It must be able to
    interpret a request, splitting it amongst its known resources and then
    recombine the values into a single response.

    Every method in this base class raises NotImplementedError.
    """

    # Class-level defaults. The dict objects are shared by every instance
    # until an instance rebinds the attribute.
    indexes = {}
    protocolMaps = {}
    recordStore = None

    def add_record(self, session, rec):
        """Ensure that a Record is registered with the database.

        This method does not ensure persistence of the Record, nor index it,
        it just performs registration and accumulates its metadata.
        """
        raise NotImplementedError

    def remove_record(self, session, rec):
        """Unregister the Record.

        This method does not delete the Record, nor unindex it, it just
        de-registers the Record and subtracts its metadata from the whole.
        """
        raise NotImplementedError

    def index_record(self, session, rec):
        """Index a Record, return the Record.

        Send the Record to all Indexes registered with the Database to be
        indexed and then return the Record (for the sake of Workflows).
        """
        raise NotImplementedError

    def unindex_record(self, session, rec):
        """Unindex a Record, return the Record.

        Send the Record to all Indexes registered with the Database to be
        removed/unindexed.
        """
        raise NotImplementedError

    def begin_indexing(self, session):
        """Prepare to index Records.

        Perform tasks before Records are to be indexed.
        """
        raise NotImplementedError

    def commit_indexing(self, session):
        """Finalize indexing, commit data to persistent storage.

        Perform tasks after Records have been sent to all Indexes. For
        example, commit any temporary data to IndexStores.
        """
        raise NotImplementedError

    def reindex(self, session):
        """Reindex all Records registered with the database."""
        raise NotImplementedError

    def scan(self, session, clause, nTerms, direction=">="):
        """Scan (browse) through an Index to return a list of terms.

        Given a single clause CQL query, resolve to the appropriate Index and
        return an ordered term list with document frequencies and total
        occurrences with a maximum of nTerms items. Direction specifies
        whether to move backwards or forwards from the term given in clause.
        """
        raise NotImplementedError

    def search(self, session, query):
        """Search the database, return a ResultSet.

        Given a CQL query, execute the query and return a ResultSet object.
        """
        raise NotImplementedError

    def sort(self, session, resultSets, sortKeys):
        """Merge, sort and return one or more ResultSets.

        Take one or more resultSets, merge them and sort based on sortKeys.
        """
        raise NotImplementedError

    def commit_metadata(self, session):
        """Ensure persistence of database metadata."""
        raise NotImplementedError

    def accumulate_metadata(self, session, obj):
        """Accumulate metadata (e.g. size) from an object."""
        raise NotImplementedError


class Index(C3Object):
    """An Index defines an access point into the Records.

    An Index is an object which defines an access point into Records and is
    responsible for extracting that information from them. It can then store
    the information extracted in an IndexStore.

    The entry point can be defined using one or more Selectors (e.g. an XPath
    expression), and the extraction process can be defined using a Workflow
    chain of standard objects. These chains must start with an Extractor, but
    from there might then include Tokenizers, PreParsers, Parsers,
    Transformers, Normalizers, even other Indexes. A processing chain usually
    finishes with a TokenMerger to merge identical tokens into the appropriate
    data structure (a dictionary/hash/associative array).

    An Index can also be the last object in a regular Workflow, so long as a
    Selector object is used to find the data in the Record immediately before
    an Extractor.

    Every method in this base class raises NotImplementedError.
    """

    indexStore = None

    def begin_indexing(self, session):
        """Prepare to index Records.

        Perform tasks before indexing any Records.
        """
        raise NotImplementedError

    def commit_indexing(self, session):
        """Finalize indexing.

        Perform tasks after Records have been indexed.
        """
        raise NotImplementedError

    def index_record(self, session, rec):
        """Index and return a Record.

        Accept a Record to index. If begin_indexing has been called, the
        index might not commit any data until commit_indexing is called.  If
        it is not in batch mode, then index_record will also commit the terms
        to the indexStore.
        """
        raise NotImplementedError

    def delete_record(self, session, rec):
        """Delete a Record from the Index.

        Identify terms from the Record and delete them from IndexStore.
        Depending on the configuration of the Index, it may be necessary to do
        this by repeating the extraction of the terms from the Record, finding
        and removing them. Hence the Record must be the same as the one that
        was indexed.
        """
        raise NotImplementedError

    def store_terms(self, session, data, rec):
        """Store the indexed Terms in the configured IndexStore."""
        raise NotImplementedError

    def extract_data(self, session, rec):
        """Extract data from the Record.

        Deprecated?
        """
        raise NotImplementedError

    def fetch_term(self, session, term, summary, prox):
        """Fetch and return the data for the given term."""
        raise NotImplementedError

    def fetch_termById(self, session, termId):
        """Fetch and return the data for the given term id."""
        raise NotImplementedError

    def fetch_termList(self, session, term, nTerms, relation, end, summary):
        """Fetch and return a list of terms from the index."""
        raise NotImplementedError

    def fetch_vector(self, session, rec, summary):
        """Fetch and return a vector for the given Record."""
        raise NotImplementedError

    def fetch_proxVector(self, session, rec, elemId=-1):
        """Fetch and return a proximity vector for the given Record."""
        raise NotImplementedError

    def fetch_summary(self, session):
        """Fetch and return summary data for all terms in the Index.

        e.g. for sorting, then iterating.
        USE WITH CAUTION! Everything done here for speed.
        """
        raise NotImplementedError

    def fetch_termFrequencies(self, session, mType, start, nTerms, direction):
        """Fetch and return a list of term frequency tuples."""
        raise NotImplementedError

    def clear(self, session):
        """Clear all data from Index."""
        raise NotImplementedError

    def scan(self, session, clause, nTerms, direction=">="):
        """Scan (browse) through an Index to return a list of terms.

        Given a single clause CQL query, return an ordered term list with
        document frequencies and total occurrences with a maximum of nTerms
        items. Direction specifies whether to move backwards or forwards from
        the term given in clause.
        """
        raise NotImplementedError

    def search(self, session, clause, db):
        """Search this Index, return a ResultSet.

        Given a CQL query, execute the query and return a ResultSet object.
        """
        raise NotImplementedError

    def sort(self, session, rset):
        """Sort and return a ResultSet object.

        Sort and return a ResultSet object based on the values extracted
        according to this index.
        """
        raise NotImplementedError

    def serialize_term(self, session, termId, data, nRecs=0, nOccs=0):
        """Return a string serialization representing the term.

        Return a string serialization representing the term for storage
        purposes. Used as a callback from IndexStore to serialize a list of
        terms and document references to be stored.

        termId  := numeric ID of term being serialized
        data    := list of longs
        nRecs   := number of Records containing the term, if known
        nOccs   := total occurrences of the term, if known
        """
        raise NotImplementedError

    def deserialize_term(self, session, data, nRecs=-1, prox=1):
        """Deserialize and return the internal representation of a term.

        Return the internal representation of a term as recreated from a
        string serialization from storage. Used as a callback from IndexStore
        to take serialized data and produce list of terms and document
        references.

        data  := string (usually retrieved from indexStore)
        nRecs := number of Records to deserialize (all by default)
        prox  := boolean flag to include proximity information
        """
        raise NotImplementedError

    def merge_term(self, session, currentData, newData,
                   op="replace", nRecs=0, nOccs=0):
        """Merge newData into currentData and return the result.

        Merging takes the currentData and can add, replace or delete the data
        found in newData, and then returns the result. Used as a callback from
        IndexStore to take two sets of terms and merge them together.

        currentData := output of deserialize_term
        newData     := flat list
        op          := replace | add | delete
        nRecs       := total records in newData
        nOccs       := total occurrences in newData
        """
        raise NotImplementedError

    def construct_resultSetItem(self, session, term, rsiType=""):
        """Create and return a ResultSetItem.

        Take the internal representation of a term, as stored in this Index,
        create and return a ResultSetItem from it.
        """
        raise NotImplementedError

    # The mutable default below is kept so the published signature does not
    # change; this base implementation never touches it.
    def construct_resultSet(self, session, terms, queryHash={}):
        """Create and return a ResultSet.

        Take a list of the internal representation of terms, as stored in this
        Index, create and return an appropriate ResultSet object.
        """
        raise NotImplementedError

    def calc_sectionOffsets(self, session, start, nRecs, dataLen=0):
        """Calculate section offsets.

        NOTE(review): the meaning of ``start``, ``nRecs`` and ``dataLen`` and
        the return value are not documented here -- confirm against a
        concrete implementation.
        """
        raise NotImplementedError


class Selector(C3Object):
    """A Selector is a simple wrapper around a means of selecting data.

    This could be an XPath or some other means of selecting data from the
    parsed structure in a Record.
    """

    def process_record(self, session, record):
        """Process the given Record and return the results.

        Always raises NotImplementedError in this base class.
        """
        raise NotImplementedError


class Extractor(C3Object):
    """An Extractor takes selected data and returns extracted values.

    An Extractor is a processing object called by an Index with the value
    returned by a Selector, and extracts the values into an appropriate data
    structure (a dictionary/hash/associative array).

    Example Extractors might extract all text from within a DOM node / etree
    Element, or select all text that occurs between a pair of selected DOM
    nodes / etree Elements.

    Extractors must also be used on the query terms to apply the same keyword
    processing rules, for example.

    Every method in this base class raises NotImplementedError.
    """

    def process_string(self, session, data):
        """Process and return the value of a raw string.

        e.g. from an attribute value or the query.
        """
        raise NotImplementedError

    def process_node(self, session, data):
        """Process a DOM node."""
        raise NotImplementedError

    def process_eventList(self, session, data):
        """Process a list of SAX events serialized in C3 internal format."""
        raise NotImplementedError

    def process_xpathResult(self, session, data):
        """Process the result of an XPath expression.

        Convenience function to wrap the other process_* functions and do type
        checking.
        """
        raise NotImplementedError


class Tokenizer(C3Object):
    u"""A Tokenizer takes a string and returns an ordered list of tokens.

    A Tokenizer takes a string of language and processes it to produce an
    ordered list of tokens.

    Example Tokenizers might extract keywords by splitting on whitespace, or
    by identifying common word forms using a regular expression.

    The incoming string is often in a data structure (dictionary / hash /
    associative array), as per output from Extractor.

    Every method in this base class raises NotImplementedError.
    """

    def process_string(self, session, data):
        """Process and return tokens found in a raw string."""
        raise NotImplementedError

    def process_hash(self, session, data):
        """Process and return tokens found in the keys of a hash."""
        raise NotImplementedError


class TokenMerger(C3Object):
    u"""A TokenMerger merges identical tokens and returns a hash.

    A TokenMerger takes an ordered list of tokens (i.e. as produced by a
    Tokenizer) and merges them into a hash. This might involve merging
    multiple tokens per key, while maintaining frequency, proximity
    information etc.

    One or more Normalizers may occur in the processing chain between a
    Tokenizer and TokenMerger in order to reduce dimensionality of terms.

    Every method in this base class raises NotImplementedError.
    """

    def process_string(self, session, data):
        """Merge and return tokens found in a raw string."""
        raise NotImplementedError

    def process_hash(self, session, data):
        """Merge and return tokens found in a hash."""
        raise NotImplementedError


# Takes a string, returns a list of normalised values
class Normalizer(C3Object):
    """A Normalizer rewrites terms so that they can be compared effectively.

    Normalizers are chained after Extractors and transform the data that came
    from the Record or from the query.

    Typical examples: standardizing case, stemming, or converting a date to
    ISO8601 format.

    The same Normalizers must be applied to the terms of a request so that
    they end up in the form stored in the Index. A date index searched with a
    free text date, for instance, needs that date parsed into the normalized
    form before it can be compared with the stored data.

    Both methods in this base class raise NotImplementedError.
    """

    def process_string(self, session, data):
        """Turn a single string into its alternative form."""
        raise NotImplementedError()

    def process_hash(self, session, data):
        """Turn a hash of values into their alternative forms."""
        raise NotImplementedError()


class DocumentFactory(C3Object):
    """A DocumentFactory takes raw data, returns one or more Documents.

    A DocumentFactory can be used to return Documents from e.g. a file, a
    directory containing many files, archive files, a URL, or a web-based API.

    Every method in this base class raises NotImplementedError.
    """

    def load(self, session, data,
             cache=None, format=None, tagName=None, codec=""):
        """Load documents into the document factory from data.

        Returns the DocumentFactory itself, which acts as an iterator.
        DocumentFactory's load function takes session, plus:

        data     := the data to load. Could be a filename, a directory name,
                    the data as a string, a URL to the data etc.

        cache    := setting for how to cache documents in memory when reading
                    them in.

        format   := format of the data parameter. Many options, most common:
                    * xml  -- XML file. May contain multiple records
                    * dir  -- a directory containing files to load
                    * tar  -- a tar file containing files to load
                    * zip  -- a zip file containing files to load
                    * marc -- a file with MARC records (library catalogue data)
                    * http -- a base HTTP URL to retrieve

        tagName  := name of the tag which starts (and ends!) a Record.

        codec    := name of the codec in which the data is encoded.
        """
        raise NotImplementedError

    def get_document(self, session, n=-1):
        """Return the Document at index n."""
        raise NotImplementedError

    # As a classmethod, the first parameter (named ``self`` here) receives
    # the class itself. The name is kept because ``cls`` is already taken by
    # the DocumentStream argument.
    @classmethod
    def register_stream(self, session, format, cls):
        """Register a new format, handled by given DocumentStream (cls).

        Class method to register an implementation of a DocumentStream (cls)
        against a name for the format parameter (format) in future calls to
        load().
        """
        raise NotImplementedError


class QueryFactory(C3Object):
    """A QueryFactory turns incoming data into a CQL Query.

    This base class declares no methods of its own.
    """


class Parser(C3Object):
    """A Parser takes a Document and parses it to a Record.

    Parsers could be viewed as Record Factories. They take a Document
    containing some data and produce the equivalent Record.

    Often a simple wrapper around an XML parser, however implementations also
    exist for various types of RDF data.
    """

    def process_document(self, session, doc):
        """Take a Document, parse it and return a Record object.

        Always raises NotImplementedError in this base class.
        """
        raise NotImplementedError


class PreParser(C3Object):
    """A PreParser takes a Document and returns a modified Document.

    For example, the input document might consist of SGML data. The
    output would be a Document containing XML data.

    This functionality allows for Workflow chains to be strung together in
    many ways, and perhaps in ways which the original implementation had not
    foreseen.
    """

    def process_document(self, session, doc):
        """Take a Document, transform it and return a new Document object.

        Always raises NotImplementedError in this base class.
        """
        raise NotImplementedError


# Takes a Record, returns a Document
class Transformer(C3Object):
    """A Transformer transforms a Record into a Document.

    A Transformer may be seen as the opposite of a Parser. It takes a Record
    and produces a Document. In many cases this can be handled by an XSLT
    stylesheet, but other instances might include one that returns a binary
    file based on the information in the Record.

    Transformers may be used in the processing chain of an Index, but are more
    likely to be used to render a Record in a format or schema for delivery to
    the end user.
    """

    def process_record(self, session, rec):
        """Take a Record, transform it and return a new Document object.

        Always raises NotImplementedError in this base class.
        """
        raise NotImplementedError


class User(C3Object):
    """A User represents a user of the system.

    An object representing a user of the system to allow for convenient access
    to properties such as username, password, rights and permissions metadata.

    Users may be stored and retrieved from an ObjectStore like any other
    configured or created C3Object.
    """

    # Class-level defaults. The list objects are shared by every instance
    # until an instance rebinds the attribute.
    username = ''
    password = ''
    rights = []
    email = ''
    realName = ''
    flags = []

    def has_flag(self, session, flag, object=None):
        """Does the User have the specified flag?

        Check whether or not the User has the specified flag.  This flag may
        be set regarding a particular object, for example write access to a
        particular ObjectStore.

        Always raises NotImplementedError in this base class.
        """
        raise NotImplementedError


class ProtocolMap(C3Object):
    """A ProtocolMap maps incoming queries to internal capabilities.

    A ProtocolMap maps from an incoming query type to internal Indexes based
    on some specification.
    """

    protocol = ""

    def resolve_index(self, session, data):
        """Given a query, resolve it and return the index object to be used.

        Always raises NotImplementedError in this base class.
        """
        raise NotImplementedError

# --- Store APIs ---


class IndexStore(C3Object):
    """A persistent storage mechanism for terms organized by Indexes.

    Not an ObjectStore, just looks after Indexes and their terms.

    Every method in this base class raises NotImplementedError.
    """

    def begin_indexing(self, session, index):
        """Prepare to index Records.

        Perform tasks as required before indexing begins, for example creating
        batch files.
        """
        raise NotImplementedError

    def commit_indexing(self, session, index):
        """Finalize indexing for the given Index.

        Perform tasks after all Records have been sent to given Index. For
        example, commit any temporary data to disk.
        """
        raise NotImplementedError

    def commit_centralIndexing(self, session, index, filePath):
        """Finalize indexing for given index in single process context.

        Commit data from the indexing process to persistent storage. Called
        automatically unless indexing is being carried out in distributed
        context. In this case, must be called in only one of the processes.
        """
        raise NotImplementedError

    def contains_index(self, session, index):
        """Does the IndexStore currently store the given Index?"""
        raise NotImplementedError

    def create_index(self, session, index):
        """Create an index in the store."""
        raise NotImplementedError

    def clean_index(self, session, index):
        """Remove all the terms from an Index, but keep the specification."""
        raise NotImplementedError

    def delete_index(self, session, index):
        """Completely delete an index from the store."""
        raise NotImplementedError

    def delete_terms(self, session, index, terms, rec=None):
        """Delete the given terms from Index.

        Optionally only delete terms for a particular Record.
        """
        raise NotImplementedError

    def store_terms(self, session, index, terms, rec):
        """Store terms in the index for a given Record."""
        raise NotImplementedError

    def create_term(self, session, index, termId, resultSet):
        """Take resultset and munge to Index format, serialise, store."""
        raise NotImplementedError

    def fetch_term(self, session, index, term, summary=0, prox=0):
        """Fetch and return data for a single term."""
        raise NotImplementedError

    def fetch_termById(self, session, index, termId):
        """Fetch and return data for a single term based on term identifier."""
        raise NotImplementedError

    def fetch_termList(self, session, index, term,
                       nTerms=0, relation="", end="", summary=0, reverse=0):
        """Fetch and return a list of terms for an Index.

        :param nTerms: how many terms are wanted.
        :type nTerms: integer
        :param relation: which order to scan through the index.
        :param end: a point to end at (e.g. between A and B)
        :param summary: only return frequency info, not the pointers to
            matching records.
        :type summary: boolean (or something that can be evaluated as True or
            False)
        :param reverse: use the reversed index if available (eg 'xedni' not
            'index').
        :rtype: list
        """
        raise NotImplementedError

    def fetch_sortValue(self, session, index, item):
        """Fetch a stored value for the given Record to use for sorting."""
        raise NotImplementedError

    def fetch_vector(self, session, index, rec, summary=0):
        """Fetch and return a vector for the given Record."""
        raise NotImplementedError

    def fetch_proxVector(self, session, index, rec, elemId=-1):
        """Fetch and return a proximity vector for the given Record."""
        raise NotImplementedError

    def fetch_summary(self, session, index):
        """Fetch and return summary data for all terms in the Index.

        e.g. for sorting, then iterating.
        USE WITH CAUTION! Everything done here for speed.
        """
        raise NotImplementedError

    def fetch_termFrequencies(self, session, index, mType,
                              start, nTerms, direction):
        """Fetch and return a list of term frequency tuples."""
        raise NotImplementedError

    def construct_resultSetItem(self, session, recId,
                                recStoreId, nOccs, rsiType=None):
        """Create and return a ResultSetItem.

        Take the internal representation of a term, as stored in this Index,
        create and return a ResultSetItem from it.
        """
        raise NotImplementedError


class ObjectStore(C3Object):
    """A persistent storage mechanism for configured Cheshire3 objects.

    Every method in this base class raises NotImplementedError.
    """

    def create_object(self, session, obj=None):
        """Create a slot for and store a serialized Cheshire3 Object.

        Given a Cheshire3 object, create a serialized form of it in the
        database.
        Note: You should use create_record() as per RecordStore to create an
        object from a configuration.
        """
        raise NotImplementedError

    def delete_object(self, session, id):
        """Delete an object."""
        raise NotImplementedError

    def fetch_object(self, session, id):
        """Fetch and return an object."""
        raise NotImplementedError

    def store_object(self, session, obj):
        """Store an object, potentially overwriting an existing copy."""
        raise NotImplementedError


class QueryStore(ObjectStore):
    """Persistent storage interface for CQL Queries.

    Every method in this base class raises NotImplementedError.
    """

    def create_query(self, session, query=None):
        """Add a new query to the store."""
        raise NotImplementedError()

    def delete_query(self, session, id):
        """Remove a query from the store."""
        raise NotImplementedError()

    def fetch_query(self, session, id):
        """Retrieve a query from the store."""
        raise NotImplementedError()

    def store_query(self, session, query):
        """Save a query; an existing copy may be overwritten."""
        raise NotImplementedError()


class RecordStore(ObjectStore):
    """A persistent storage mechanism for Records.

    A RecordStore allows such operations as create, update, fetch and delete.
    It also allows fast retrieval of important Record metadata, for use in
    computing relevance rankings for example.

    Every method in this base class raises NotImplementedError.
    """

    def create_record(self, session, rec=None):
        """Create an identifier, store and return a Record.

        Generate a new identifier. If a Record is given, assign the identifier
        to the Record and store it using store_record. If Record not given
        create a placeholder Record. Return the Record.
        """
        raise NotImplementedError

    def replace_record(self, session, rec):
        """Check for permission, replace stored copy of an existing Record.

        Carry out permission checking before calling store_record.
        """
        raise NotImplementedError

    def store_record(self, session, rec, transformer=None):
        """Store a Record that already has an identifier assigned.

        If a Transformer is given, use it to serialize the Record data.
        """
        raise NotImplementedError

    def fetch_record(self, session, id, parser=None):
        """Fetch and return the Record with the given identifier."""
        raise NotImplementedError

    def delete_record(self, session, id):
        """Delete the Record with the given identifier from storage."""
        raise NotImplementedError

    def fetch_recordMetadata(self, session, id, mType):
        """Return the size of the Record, according to its metadata.

        NOTE(review): ``mType`` presumably selects which kind of metadata is
        returned, so this may cover more than size -- confirm against a
        concrete implementation.
        """
        raise NotImplementedError


class DocumentStore(ObjectStore):
    """A persistent storage mechanism for Documents and their metadata.

    Every method in this base class raises NotImplementedError.
    """

    def create_document(self, session, doc=None):
        """Create an identifier, store and return a Document.

        Generate a new identifier. If a Document is given, assign the
        identifier to the Document and store it using store_document. If
        Document not given create a placeholder Document. Return the Document.
        """
        raise NotImplementedError

    def delete_document(self, session, id):
        """Delete the Document with the given identifier from storage."""
        raise NotImplementedError

    def fetch_document(self, session, id):
        """Fetch and return Document with the given identifier."""
        raise NotImplementedError

    def store_document(self, session, doc):
        """Store a Document that already has an identifier assigned."""
        raise NotImplementedError


# And store result sets
class ResultSetStore(ObjectStore):
    """A persistent storage mechanism for ResultSet objects.

    Every method in this base class raises NotImplementedError.
    """

    def create_resultSet(self, session, rset=None):
        """Create an identifier, store and return a ResultSet.

        Generate a new identifier. If a ResultSet is given, assign the
        identifier and store it using store_resultSet. If ResultSet is not
        given create a placeholder ResultSet. Return the ResultSet.
        """
        raise NotImplementedError

    def delete_resultSet(self, session, id):
        """Delete a ResultSet with the given identifier from storage."""
        raise NotImplementedError

    def fetch_resultSet(self, session, id):
        """Fetch and return ResultSet with the given identifier."""
        raise NotImplementedError

    def store_resultSet(self, session, rset):
        """Store a ResultSet that already has an identifier assigned."""
        raise NotImplementedError

# --- Code Instantiated Objects ---


class Document(object):
    """A Document is a wrapper for raw data and its metadata.

    A Document is the raw data which will become a Record. It may be processed
    into a Record by a Parser, or into another Document type by a PreParser.
    Documents might be stored in a DocumentStore, if necessary, but can
    generally be discarded. Documents may be anything from a JPG file, to an
    unparsed XML file, to a string containing a URL. This allows for future
    compatibility with new formats, as they may be incorporated into the
    system by implementing a Document type and a PreParser.
    """

    # Class-level defaults. The list object is shared by every instance
    # until an instance rebinds the attribute.
    id = -1
    documentStore = ""
    text = ""
    mimeType = ""
    processHistory = []
    parent = ('', '', -1)

    # The mutable ``history`` default is kept so the published signature
    # does not change; this base implementation never touches it.
    def __init__(self, data, creator="", history=[], mimeType="",
                 parent=None, filename="", tagName="",
                 byteCount=0, byteOffset=0, wordCount=0):
        """Construct a Document from data, with given metadata attributes.

        The constructor takes the data which should be used to construct the
        document. This is implementation dependent. It also optionally may
        take a creator object, process history information and a mimetype.
        The parent option is for documents which have been extracted from
        another document, for example pages from a book.

        Always raises NotImplementedError in this base class.
        """
        raise NotImplementedError

    def get_raw(self, session):
        """Return the raw data associated with this document.

        Always raises NotImplementedError in this base class.
        """
        raise NotImplementedError


class ResultSet(object):
    """A collection of results, commonly pointers to Records.

    Typically created in response to a search on a Database. ResultSets are
    also the return value when searching an IndexStore or Index and are merged
    internally to combine results when searching multiple Indexes combined
    with boolean operators.

    Every method in this base class raises NotImplementedError.
    """

    # Class-level defaults. The list object is shared by every instance
    # until an instance rebinds the attribute.
    termid = -1
    totalOccs = 0
    totalRecs = 0
    id = ""
    expires = ""
    index = None
    queryTerm = ""
    queryFreq = 0
    queryPositions = []
    relevancy = 0
    maxWeight = 0
    minWeight = 0

    def combine(self, session, others, clause):
        """Combine the ResultSets in 'others' into this ResultSet."""
        raise NotImplementedError

    def retrieve(self, session, nRecs, start=0):
        """Return an iterable of ``nRecs`` Records starting at ``start``."""
        raise NotImplementedError

    def order(self, session, spec,
              ascending=None, missing=None, case=None, accents=None):
        """Re-order in-place based on the given spec and arguments."""
        raise NotImplementedError

    def serialize(self, session):
        """Return a string serialization of the ResultSet."""
        raise NotImplementedError

    def deserialize(self, session, data):
        """Deserialize string in ``data`` to return the populated ResultSet."""
        raise NotImplementedError


class ResultSetItem(object):
    """A single result within a ResultSet, typically a pointer to a Record.

    Carries the pointer to the Record together with metadata that is
    specific to the ResultSet. Both methods are abstract in this base class
    and raise NotImplementedError.
    """

    # Class-level default values.
    id = 0
    recordStore = ""
    occurences = 0

    def fetch_record(self, session):
        """Fetch and return the Record that this ResultSetItem represents."""
        raise NotImplementedError()

    def serialize(self, session):
        """Return the ResultSetItem serialized as a string."""
        raise NotImplementedError()


class Record(object):
    """A Record is a wrapper for parsed data and its metadata.

    Records in the system are commonly stored in an XML form. Attached to the
    record is various configurable metadata, such as the time it was inserted
    into the database and by which user. Records are stored in a RecordStore
    and retrieved via a persistent and unique identifier. The record data may
    be retrieved as a list of SAX events, as regularised XML, as a DOM tree or
    ElementTree.

    This base class only declares the API: apart from ``__repr__`` every
    method defined here raises NotImplementedError.
    """

    # NOTE: the list and dict defaults below are class-level objects shared
    # by every instance that does not rebind them. Implementations should
    # assign fresh containers per instance rather than mutate these in place.
    tagName = ''
    status = ''
    baseUri = ''
    history = []
    rights = []
    recordStore = None    # RecordStore in which the Record is stored
    elementHash = {}
    resultSetItem = None  # ResultSetItem from which the Record was fetched

    wordCount = -1
    byteCount = -1
    metadata = {}        # Arbitrary metadata

    parent = ('', None, 0)
    processHistory = []

    dom = None
    xml = ""
    sax = []

    def __init__(self, data, xml="", docId=None, wordCount=0, byteCount=0):
        """Construct a Record; not implemented in this base class."""
        raise NotImplementedError

    def __repr__(self):
        """Return ``<recordStore>/<id>``, or ``<ClassName>-<id>`` if unstored.

        The identifier is rendered as ``None`` when no ``id`` attribute has
        been set on the Record.
        """
        # ``id`` is not assigned anywhere in this base class, so look it up
        # defensively: repr() must not raise AttributeError on a Record whose
        # identifier has not been set yet.
        identifier = getattr(self, 'id', None)
        if self.recordStore is not None:
            return "%s/%s" % (self.recordStore, identifier)
        return "%s-%s" % (self.__class__.__name__, identifier)

    def get_dom(self, session):
        """Return the DOM document node for the record."""
        raise NotImplementedError

    def get_sax(self, session):
        """Return the list of SAX events for the record.

        SAX events are serialized according to the internal Cheshire3 format.
        """
        raise NotImplementedError

    def get_xml(self, session):
        """Return the XML for the record as a serialized string."""
        raise NotImplementedError

    # NOTE: the mutable ``maps={}`` default is kept so that the declared
    # signature stays unchanged; implementations must not mutate it.
    def process_xpath(self, session, xpath, maps={}):
        """Process and return the result of the given XPath.

        XPath may be either a string or a configured XPath, perhaps with some
        supplied namespace mappings.
        """
        raise NotImplementedError

    def fetch_vector(self, session, index, summary=False):
        """Fetch and return a vector for the Record from the given Index."""
        raise NotImplementedError


class Workflow(C3Object):
    """A Workflow defines a series of processing steps.

    A Workflow is similar to the process chain concept of an index, but acts
    at a more global level. It will allow the configuration of a Workflow
    using Cheshire3 objects and simple code to be defined and executed for
    input objects.

    For example, one might define a common Workflow pattern of PreParsers,
    a Parser and then indexing routines in the XML configuration, and then
    run each Document in a DocumentFactory through it. This allows users who
    are not familiar with Python, but who are familiar with XML and available
    Cheshire3 processing objects to implement tasks as required, by changing
    only configuration files. It thus also allows a user to configure personal
    workflows in a Cheshire3 system the code for which they don't have
    permission to modify.
    """

    # Class-level default; empty string in this base class.
    code = ""

    def process(self, session, *args, **kw):
        """Execute the code as constructed from the XML configuration.

        Executes the generated code on the given input arguments. The return
        value is the last object to be produced by the execution.
        This function is automatically written and compiled when the object
        is instantiated.

        In this base class the method is a placeholder that raises
        NotImplementedError.
        """
        raise NotImplementedError


class Logger(C3Object):
    """Base class for objects that log messages for system events."""

    def log(self, session, *args, **kw):
        """Log a message based on the given args.

        Not implemented in this base class: always raises
        NotImplementedError.
        """
        raise NotImplementedError