"""Abstract Base Classes for Cheshire3 Objects.
Defines the base classes of objects in the Cheshire3 Object model, their
API methods and documentation.
Functional implementations are contained in the module for each class e.g.
Server in cheshire3.server etc.
"""
from cheshire3.session import Session
from cheshire3.configParser import C3Object
class Server(C3Object):
    """Top-level collection point and entry point for requests.

    A Server gathers together the other objects in the system and is the
    initial entry point for requests arriving from a ProtocolHandler. It may
    know about several Databases, RecordStores and so forth, but its main
    job is to decide whether a request should be accepted and, if so, to
    create an environment in which that request can be processed.

    It will likely have access to a UserStore database that maintains
    authentication and authorization information. The exact nature of that
    information is left undefined so that many backend implementations are
    possible.

    Servers are the top level of configuration for the system, which is why
    the constructor requires the path to a local XML configuration file.
    From then on, configuration may be retrieved from other locations (such
    as a remote datastore) so that distributed environments can maintain
    synchronicity.
    """

    # Class-level defaults. NOTE: the dict below is a single object shared
    # by every instance that does not assign its own.
    databases = {}
    authStore = None
    resultSetStore = None
    queryStore = None

    def __init__(self, session, configFile="serverConfig.xml"):
        """Construct a Server from a Session and a configuration file.

        session    := Session object
        configFile := path to the configuration file to be parsed and
                      processed
        """
        raise NotImplementedError()
class Database(C3Object):
    """A collection of Records and Indexes.

    A Database maintains, and gives access to, its components as well as
    the metadata associated with the collections. It must be able to
    interpret a request, split it amongst its known resources and then
    recombine the values into a single response.
    """

    # Class-level defaults. NOTE: the dicts are shared between instances
    # until an instance assigns its own.
    indexes = {}
    protocolMaps = {}
    recordStore = None

    def add_record(self, session, rec):
        """Register a Record with the database.

        Only performs registration and accumulates the Record's metadata;
        it neither ensures persistence of the Record nor indexes it.
        """
        raise NotImplementedError()

    def remove_record(self, session, rec):
        """De-register a Record from the database.

        Only de-registers the Record and subtracts its metadata from the
        whole; it neither deletes nor unindexes the Record.
        """
        raise NotImplementedError()

    def index_record(self, session, rec):
        """Index a Record and return it.

        The Record is sent to all Indexes registered with the Database to
        be indexed, and is then returned (for the sake of Workflows).
        """
        raise NotImplementedError()

    def unindex_record(self, session, rec):
        """Unindex a Record and return it.

        The Record is sent to all Indexes registered with the Database to
        be removed/unindexed.
        """
        raise NotImplementedError()

    def begin_indexing(self, session):
        """Prepare to index Records.

        Carry out any tasks needed before Records are indexed.
        """
        raise NotImplementedError()

    def commit_indexing(self, session):
        """Finalize indexing and commit data to persistent storage.

        Carry out tasks after Records have been sent to all Indexes, for
        example committing any temporary data to IndexStores.
        """
        raise NotImplementedError()

    def reindex(self, session):
        """Reindex every Record registered with the database."""
        raise NotImplementedError()

    def scan(self, session, clause, nTerms, direction=">="):
        """Scan (browse) an Index and return a list of terms.

        Given a single clause CQL query, resolve it to the appropriate
        Index and return an ordered term list, with document frequencies
        and total occurrences, of at most nTerms items. Direction says
        whether to move backwards or forwards from the term in clause.
        """
        raise NotImplementedError()

    def search(self, session, query):
        """Search the database and return a ResultSet.

        Execute the given CQL query and return a ResultSet object.
        """
        raise NotImplementedError()

    def sort(self, session, resultSets, sortKeys):
        """Merge, sort and return one or more ResultSets.

        The given resultSets are merged and then sorted based on sortKeys.
        """
        raise NotImplementedError()
class Index(C3Object):
    """An access point into the Records.

    An Index defines an access point into Records and is responsible for
    extracting that information from them. The extracted information can
    then be stored in an IndexStore.

    The entry point can be defined using one or more Selectors (e.g. an
    XPath expression), and the extraction process using a Workflow chain of
    standard objects. Such chains must start with an Extractor, and may then
    include Tokenizers, PreParsers, Parsers, Transformers, Normalizers, even
    other Indexes. A processing chain usually finishes with a TokenMerger
    that merges identical tokens into the appropriate data structure (a
    dictionary/hash/associative array).

    An Index can also be the last object in a regular Workflow, so long as
    a Selector object is used to find the data in the Record immediately
    before an Extractor.
    """

    indexStore = None

    def begin_indexing(self, session):
        """Prepare to index Records.

        Carry out any tasks needed before any Records are indexed.
        """
        raise NotImplementedError()

    def commit_indexing(self, session):
        """Finalize indexing.

        Carry out any tasks needed after Records have been indexed.
        """
        raise NotImplementedError()

    def index_record(self, session, rec):
        """Index a Record and return it.

        If begin_indexing has been called, the index might not commit any
        data until commit_indexing is called. When not in batch mode,
        index_record also commits the terms to the indexStore.
        """
        raise NotImplementedError()

    def delete_record(self, session, rec):
        """Delete a Record from the Index.

        Terms are identified from the Record and deleted from the
        IndexStore. Depending on how the Index is configured, this may have
        to be done by extracting the terms from the Record again, then
        finding and removing them; hence the Record must be the same as the
        one that was indexed.
        """
        raise NotImplementedError()

    def store_terms(self, session, data, rec):
        """Store the indexed terms in the configured IndexStore."""
        raise NotImplementedError()

    def fetch_term(self, session, term, summary, prox):
        """Fetch and return the data for the given term."""
        raise NotImplementedError()

    def fetch_termById(self, session, termId):
        """Fetch and return the data for the given term id."""
        raise NotImplementedError()

    def fetch_termList(self, session, term, nTerms, relation, end, summary):
        """Fetch and return a list of terms from the index."""
        raise NotImplementedError()

    def fetch_vector(self, session, rec, summary):
        """Fetch and return a vector for the given Record."""
        raise NotImplementedError()

    def fetch_proxVector(self, session, rec, elemId=-1):
        """Fetch and return a proximity vector for the given Record."""
        raise NotImplementedError()

    def fetch_summary(self, session):
        """Fetch and return summary data for all terms in the Index.

        e.g. for sorting, then iterating.
        USE WITH CAUTION! Everything done here for speed.
        """
        raise NotImplementedError()

    def fetch_termFrequencies(self, session, mType, start, nTerms, direction):
        """Fetch and return a list of term frequency tuples."""
        raise NotImplementedError()

    def clear(self, session):
        """Clear all data from the Index."""
        raise NotImplementedError()

    def scan(self, session, clause, nTerms, direction=">="):
        """Scan (browse) the Index and return a list of terms.

        Given a single clause CQL query, return an ordered term list, with
        document frequencies and total occurrences, of at most nTerms
        items. Direction says whether to move backwards or forwards from
        the term given in clause.
        """
        raise NotImplementedError()

    def search(self, session, clause, db):
        """Search this Index and return a ResultSet.

        Execute the given CQL query and return a ResultSet object.
        """
        raise NotImplementedError()

    def sort(self, session, rset):
        """Sort and return a ResultSet object.

        The ResultSet is sorted on the values extracted according to this
        index.
        """
        raise NotImplementedError()

    def serialize_term(self, session, termId, data, nRecs=0, nOccs=0):
        """Return a string serialization of the term, for storage.

        Used as a callback from IndexStore to serialize a list of terms and
        document references to be stored.

        termId := numeric ID of term being serialized
        data   := list of longs
        nRecs  := number of Records containing the term, if known
        nOccs  := total occurrences of the term, if known
        """
        raise NotImplementedError()

    def deserialize_term(self, session, data, nRecs=-1, prox=1):
        """Deserialize and return the internal representation of a term.

        The representation is recreated from a string serialization taken
        from storage. Used as a callback from IndexStore to turn serialized
        data into a list of terms and document references.

        data  := string (usually retrieved from indexStore)
        nRecs := number of Records to deserialize (all by default)
        prox  := boolean flag to include proximity information
        """
        raise NotImplementedError()

    def merge_term(self, session, currentData, newData,
                   op="replace", nRecs=0, nOccs=0):
        """Merge newData into currentData and return the result.

        Merging takes currentData and can add, replace or delete the data
        found in newData. Used as a callback from IndexStore to take two
        sets of terms and merge them together.

        currentData := output of deserialize_term
        newData     := flat list
        op          := replace | add | delete
        nRecs       := total records in newData
        nOccs       := total occurrences in newData
        """
        raise NotImplementedError()

    def construct_resultSetItem(self, session, term, rsiType=""):
        """Create and return a ResultSetItem.

        The item is built from the internal representation of a term, as
        stored in this Index.
        """
        raise NotImplementedError()

    def construct_resultSet(self, session, terms, queryHash={}):
        """Create and return a ResultSet.

        The ResultSet is built from a list of the internal representations
        of terms, as stored in this Index.
        """
        raise NotImplementedError()

    def calc_sectionOffsets(self, session, start, nRecs, dataLen=0):
        # NOTE(review): undocumented in the original. Judging by the name
        # alone this computes offsets of sections of stored term data --
        # confirm against a concrete implementation before relying on it.
        raise NotImplementedError()
class Selector(C3Object):
    """A simple wrapper around a means of selecting data.

    The means of selection could be an XPath, or some other way of picking
    data out of the parsed structure in a Record.
    """

    def process_record(self, session, record):
        """Process the given Record and return the results."""
        raise NotImplementedError()
class Tokenizer(C3Object):
    u"""Turns a string into an ordered list of tokens.

    A Tokenizer takes a string of language and processes it to produce an
    ordered list of tokens.

    Example Tokenizers might extract keywords by splitting on whitespace,
    or by identifying common word forms with a regular expression.

    The incoming string is often held in a data structure (dictionary /
    hash / associative array), as per the output from an Extractor.
    """

    def process_string(self, session, data):
        """Process a raw string and return the tokens found in it."""
        raise NotImplementedError()

    def process_hash(self, session, data):
        """Process a hash and return the tokens found in its keys."""
        raise NotImplementedError()
class TokenMerger(C3Object):
    u"""Merges identical tokens and returns a hash.

    A TokenMerger takes an ordered list of tokens (i.e. as produced by a
    Tokenizer) and merges them into a hash. This might involve merging
    multiple tokens per key while maintaining frequency, proximity
    information etc.

    One or more Normalizers may occur in the processing chain between a
    Tokenizer and a TokenMerger in order to reduce dimensionality of terms.
    """

    def process_string(self, session, data):
        """Merge and return the tokens found in a raw string."""
        raise NotImplementedError()

    def process_hash(self, session, data):
        """Merge and return the tokens found in a hash."""
        raise NotImplementedError()
# Takes a string, returns a list of normalised values
class Normalizer(C3Object):
    """Modifies terms so that they can be compared effectively.

    Normalizer objects are chained after Extractors in order to transform
    the data from the Record or query.

    Example Normalizers might standardize the case, perform stemming or
    transform a date into ISO8601 format.

    Normalizers are also needed to transform the terms in a request into
    the same format as the terms stored in the Index. For example, a date
    index might be searched using a free text date, which would need to be
    parsed into the normalized form before it can be compared with the
    stored data.
    """

    def process_string(self, session, data):
        """Process a string into an alternative form."""
        raise NotImplementedError()

    def process_hash(self, session, data):
        """Process a hash of values into alternative forms."""
        raise NotImplementedError()
class DocumentFactory(C3Object):
    """Takes raw data and returns one or more Documents.

    A DocumentFactory can be used to return Documents from e.g. a file, a
    directory containing many files, archive files, a URL, or a web-based
    API.
    """

    def load(self, session, data,
             cache=None, format=None, tagName=None, codec=""):
        """Load documents into the document factory from data.

        Returns the DocumentFactory itself, which acts as an iterator.
        In addition to session, load takes:

        data    := the data to load. Could be a filename, a directory
                   name, the data as a string, a URL to the data etc.
        cache   := setting for how to cache documents in memory when
                   reading them in.
        format  := format of the data parameter. Many options, most
                   common:
                   * xml  -- XML file. May contain multiple records
                   * dir  -- a directory containing files to load
                   * tar  -- a tar file containing files to load
                   * zip  -- a zip file containing files to load
                   * marc -- a file with MARC records (library catalogue
                             data)
                   * http -- a base HTTP URL to retrieve
        tagName := name of the tag which starts (and ends!) a Record.
        codec   := name of the codec in which the data is encoded.
        """
        raise NotImplementedError()

    def get_document(self, session, n=-1):
        """Return the Document at index n."""
        raise NotImplementedError()

    @classmethod
    def register_stream(self, session, format, cls):
        """Register a new format, handled by the given DocumentStream.

        Class method that registers an implementation of a DocumentStream
        (cls) against a name for the format parameter (format), for use in
        future calls to load().
        """
        # Because of @classmethod the first argument, although spelled
        # ``self``, is bound to the class rather than to an instance.
        raise NotImplementedError()
class QueryFactory(C3Object):
    """Takes data and returns a CQL Query.

    No API methods are declared at this level.
    """
class Parser(C3Object):
    """Takes a Document and parses it into a Record.

    Parsers can be thought of as Record factories: they take a Document
    containing some data and produce the equivalent Record.

    Often a simple wrapper around an XML parser, although implementations
    also exist for various types of RDF data.
    """

    def process_document(self, session, doc):
        """Parse the given Document and return a Record object."""
        raise NotImplementedError()
class PreParser(C3Object):
    """Takes a Document and returns a modified Document.

    For example, the input Document might consist of SGML data and the
    output would be a Document containing XML data.

    This allows Workflow chains to be strung together in many ways, perhaps
    including ways that the original implementation had not foreseen.
    """

    def process_document(self, session, doc):
        """Transform the given Document and return a new Document object."""
        raise NotImplementedError()
# Takes a Record, returns a Document
class User(C3Object):
    """Represents a user of the system.

    Gives convenient access to properties such as username, password,
    rights and permissions metadata. Users may be stored in, and retrieved
    from, an ObjectStore like any other configured or created C3Object.
    """

    # Class-level defaults. NOTE: the two lists are shared between
    # instances until an instance assigns its own.
    username = ""
    password = ""
    rights = []
    email = ""
    realName = ""
    flags = []

    def has_flag(self, session, flag, object=None):
        """Does the User have the specified flag?

        The flag may be set with regard to a particular object, for example
        write access to a particular ObjectStore.
        """
        raise NotImplementedError()
class ProtocolMap(C3Object):
    """Maps incoming queries to internal capabilities.

    A ProtocolMap maps from an incoming query type to internal Indexes,
    based on some specification.
    """

    protocol = ""

    def resolve_index(self, session, data):
        """Resolve the given query and return the index object to use."""
        raise NotImplementedError()
# --- Store APIs ---
class IndexStore(C3Object):
    """A persistent storage mechanism for terms, organized by Index.

    Not an ObjectStore; it just looks after Indexes and their terms.
    """

    def begin_indexing(self, session, index):
        """Prepare to index Records.

        Carry out whatever is required before indexing begins, for example
        creating batch files.
        """
        raise NotImplementedError()

    def commit_indexing(self, session, index):
        """Finalize indexing for the given Index.

        Carry out tasks after all Records have been sent to the given
        Index, for example committing any temporary data to disk.
        """
        raise NotImplementedError()

    def commit_centralIndexing(self, session, index, filePath):
        """Finalize indexing for the given index in single process context.

        Commits data from the indexing process to persistent storage.
        Called automatically unless indexing is being carried out in a
        distributed context, in which case it must be called in only one of
        the processes.
        """
        raise NotImplementedError()

    def contains_index(self, session, index):
        """Does the IndexStore currently store the given Index?"""
        raise NotImplementedError()

    def create_index(self, session, index):
        """Create an index in the store."""
        raise NotImplementedError()

    def clean_index(self, session, index):
        """Remove all terms from an Index, but keep its specification."""
        raise NotImplementedError()

    def delete_index(self, session, index):
        """Completely delete an index from the store."""
        raise NotImplementedError()

    def delete_terms(self, session, index, terms, rec=None):
        """Delete the given terms from the Index.

        Optionally delete the terms for one particular Record only.
        """
        raise NotImplementedError()

    def store_terms(self, session, index, terms, rec):
        """Store terms in the index for a given Record."""
        raise NotImplementedError()

    def create_term(self, session, index, termId, resultSet):
        """Take a ResultSet, munge it to Index format, serialize, store."""
        raise NotImplementedError()

    def fetch_term(self, session, index, term, summary=0, prox=0):
        """Fetch and return data for a single term."""
        raise NotImplementedError()

    def fetch_termById(self, session, index, termId):
        """Fetch and return data for a single term, by term identifier."""
        raise NotImplementedError()

    def fetch_termList(self, session, index, term,
                       nTerms=0, relation="", end="", summary=0, reverse=0):
        """Fetch and return a list of terms for an Index.

        :param nTerms: how many terms are wanted.
        :type nTerms: integer
        :param relation: which order to scan through the index.
        :param end: a point to end at (e.g. between A and B)
        :param summary: only return frequency info, not the pointers to
            matching records.
        :type summary: boolean (or something that can be evaluated as True
            or False)
        :param reverse: use the reversed index if available (eg 'xedni' not
            'index').
        :rtype: list
        """
        raise NotImplementedError()

    def fetch_sortValue(self, session, index, item):
        """Fetch a stored value for the given Record, for use in sorting."""
        raise NotImplementedError()

    def fetch_vector(self, session, index, rec, summary=0):
        """Fetch and return a vector for the given Record."""
        raise NotImplementedError()

    def fetch_proxVector(self, session, index, rec, elemId=-1):
        """Fetch and return a proximity vector for the given Record."""
        raise NotImplementedError()

    def fetch_summary(self, session, index):
        """Fetch and return summary data for all terms in the Index.

        e.g. for sorting, then iterating.
        USE WITH CAUTION! Everything done here for speed.
        """
        raise NotImplementedError()

    def fetch_termFrequencies(self, session, index, mType,
                              start, nTerms, direction):
        """Fetch and return a list of term frequency tuples."""
        raise NotImplementedError()

    def construct_resultSetItem(self, session, recId,
                                recStoreId, nOccs, rsiType=None):
        """Create and return a ResultSetItem.

        The item is created from the internal representation of a term, as
        stored in this Index.
        """
        # NOTE(review): the arguments (recId, recStoreId, nOccs) look like
        # the already-unpacked parts of that representation -- confirm
        # against a concrete implementation.
        raise NotImplementedError()
class ObjectStore(C3Object):
    """A persistent storage mechanism for configured Cheshire3 objects."""

    def create_object(self, session, obj=None):
        """Create a slot for, and store, a serialized Cheshire3 object.

        Given a Cheshire3 object, a serialized form of it is created in the
        database.

        Note: You should use create_record() as per RecordStore to create
        an object from a configuration.
        """
        raise NotImplementedError()

    def delete_object(self, session, id):
        """Delete an object."""
        raise NotImplementedError()

    def fetch_object(self, session, id):
        """Fetch and return an object."""
        raise NotImplementedError()

    def store_object(self, session, obj):
        """Store an object, potentially overwriting an existing copy."""
        raise NotImplementedError()
class QueryStore(ObjectStore):
    """An interface to persistent storage for CQL Queries."""

    def create_query(self, session, query=None):
        """Create a new query in the store."""
        raise NotImplementedError()

    def delete_query(self, session, id):
        """Delete a query from the store."""
        raise NotImplementedError()

    def fetch_query(self, session, id):
        """Fetch a query from the store."""
        raise NotImplementedError()

    def store_query(self, session, query):
        """Store a query, potentially overwriting an existing copy."""
        raise NotImplementedError()
class RecordStore(ObjectStore):
    """A persistent storage mechanism for Records.

    A RecordStore supports operations such as create, update, fetch and
    delete. It also allows fast retrieval of important Record metadata, for
    use in computing relevance rankings for example.
    """

    def create_record(self, session, rec=None):
        """Create an identifier, then store and return a Record.

        A new identifier is generated. If a Record is given, the identifier
        is assigned to it and it is stored using store_record; otherwise a
        placeholder Record is created. The Record is returned.
        """
        raise NotImplementedError()

    def replace_record(self, session, rec):
        """Check permission, then replace the stored copy of a Record.

        Permission checking is carried out before calling store_record.
        """
        raise NotImplementedError()

    def store_record(self, session, rec, transformer=None):
        """Store a Record that already has an identifier assigned.

        If a Transformer is given, it is used to serialize the Record data.
        """
        raise NotImplementedError()

    def fetch_record(self, session, id, parser=None):
        """Fetch and return the Record with the given identifier."""
        raise NotImplementedError()

    def delete_record(self, session, id):
        """Delete the Record with the given identifier from storage."""
        raise NotImplementedError()
class DocumentStore(ObjectStore):
    """A persistent storage mechanism for Documents and their metadata."""

    def create_document(self, session, doc=None):
        """Create an identifier, then store and return a Document.

        A new identifier is generated. If a Document is given, the
        identifier is assigned to it and it is stored using store_document;
        otherwise a placeholder Document is created. The Document is
        returned.
        """
        raise NotImplementedError()

    def delete_document(self, session, id):
        """Delete the Document with the given identifier from storage."""
        raise NotImplementedError()

    def fetch_document(self, session, id):
        """Fetch and return the Document with the given identifier."""
        raise NotImplementedError()

    def store_document(self, session, doc):
        """Store a Document that already has an identifier assigned."""
        raise NotImplementedError()
# And store result sets
class ResultSetStore(ObjectStore):
    """A persistent storage mechanism for ResultSet objects."""

    def create_resultSet(self, session, rset=None):
        """Create an identifier, then store and return a ResultSet.

        A new identifier is generated. If a ResultSet is given, the
        identifier is assigned to it and it is stored using
        store_resultSet; otherwise a placeholder ResultSet is created. The
        ResultSet is returned.
        """
        raise NotImplementedError()

    def delete_resultSet(self, session, id):
        """Delete the ResultSet with the given identifier from storage."""
        raise NotImplementedError()

    def fetch_resultSet(self, session, id):
        """Fetch and return the ResultSet with the given identifier."""
        raise NotImplementedError()

    def store_resultSet(self, session, rset):
        """Store a ResultSet that already has an identifier assigned."""
        raise NotImplementedError()
# --- Code Instantiated Objects ---
class Document(object):
    """A wrapper for raw data and its metadata.

    A Document is the raw data that will become a Record. It may be
    processed into a Record by a Parser, or into another Document type by a
    PreParser. Documents might be stored in a DocumentStore if necessary,
    but can generally be discarded. A Document may be anything from a JPG
    file, to an unparsed XML file, to a string containing a URL. This
    allows for future compatibility with new formats, since they can be
    incorporated into the system by implementing a Document type and a
    PreParser.
    """

    # Class-level defaults. NOTE: the processHistory list is shared between
    # instances until an instance assigns its own.
    id = -1
    documentStore = ""
    text = ""
    mimeType = ""
    processHistory = []
    parent = ("", "", -1)

    def __init__(self, data, creator="", history=[], mimeType="",
                 parent=None, filename="", tagName="",
                 byteCount=0, byteOffset=0, wordCount=0):
        """Construct a Document from data, with given metadata attributes.

        The constructor takes the data from which the document should be
        built; what that data is depends on the implementation. It may
        optionally also take a creator object, process history information
        and a mimetype. The parent option is for documents that have been
        extracted from another document, for example pages from a book.
        """
        raise NotImplementedError()

    def get_raw(self, session):
        """Return the raw data associated with this document."""
        raise NotImplementedError()
class ResultSet(object):
    """A collection of results, commonly pointers to Records.

    Typically created in response to a search on a Database. ResultSets are
    also what is returned when searching an IndexStore or Index, and are
    merged internally to combine results when searching multiple Indexes
    combined with boolean operators.
    """

    # Class-level defaults. NOTE: the queryPositions list is shared between
    # instances until an instance assigns its own.
    termid = -1
    totalOccs = 0
    totalRecs = 0
    id = ""
    expires = ""
    index = None
    queryTerm = ""
    queryFreq = 0
    queryPositions = []
    relevancy = 0
    maxWeight = 0
    minWeight = 0

    def combine(self, session, others, clause):
        """Combine the ResultSets in 'others' into this ResultSet."""
        raise NotImplementedError()

    def retrieve(self, session, nRecs, start=0):
        """Return an iterable of ``nRecs`` Records starting at ``start``."""
        raise NotImplementedError()

    def order(self, session, spec,
              ascending=None, missing=None, case=None, accents=None):
        """Re-order in place, based on the given spec and arguments."""
        raise NotImplementedError()

    def serialize(self, session):
        """Return a string serialization of the ResultSet."""
        raise NotImplementedError()

    def deserialize(self, session, data):
        """Deserialize the string ``data``; return the populated ResultSet."""
        raise NotImplementedError()
class ResultSetItem(object):
    """A single result, typically a pointer to a Record.

    Represents a pointer to a Record, together with ResultSet specific
    metadata.
    """

    id = 0
    recordStore = ""
    occurences = 0  # (sic) the spelling is part of the public attribute name

    def fetch_record(self, session):
        """Fetch and return the Record this ResultSetItem represents."""
        raise NotImplementedError()

    def serialize(self, session):
        """Return a string serialization of the ResultSetItem."""
        raise NotImplementedError()
class Record(object):
    """A Record is a wrapper for parsed data and its metadata.

    Records in the system are commonly stored in an XML form. Attached to
    the record is various configurable metadata, such as the time it was
    inserted into the database and by which user. Records are stored in a
    RecordStore and retrieved via a persistent and unique identifier. The
    record data may be retrieved as a list of SAX events, as regularised
    XML, as a DOM tree or ElementTree.
    """

    # Class-level defaults. NOTE: the mutable values (lists / dicts) are
    # shared between instances until an instance assigns its own.
    tagName = ''
    status = ''
    baseUri = ''
    history = []
    rights = []
    recordStore = None  # RecordStore in which the Record is stored
    elementHash = {}
    resultSetItem = None  # ResultSetItem from which the Record was fetched
    wordCount = -1
    byteCount = -1
    metadata = {}  # Arbitrary metadata
    parent = ('', None, 0)
    processHistory = []
    dom = None
    xml = ""
    sax = []

    def __init__(self, data, xml="", docId=None, wordCount=0, byteCount=0):
        """Abstract constructor; concrete Record types must override it.

        How the arguments are interpreted is left to the implementation.
        """
        raise NotImplementedError

    def __repr__(self):
        """Return ``<recordStore>/<id>``, or ``<ClassName>-<id>`` if unstored.

        Unlike the other attributes, ``id`` has no class-level default, so
        it is looked up defensively: a repr must not raise AttributeError
        for a Record that has not been given an identifier yet. In that
        case the identifier is rendered as ``None``.
        """
        identifier = getattr(self, 'id', None)
        if self.recordStore is not None:
            return "%s/%s" % (self.recordStore, identifier)
        return "%s-%s" % (self.__class__.__name__, identifier)

    def get_dom(self, session):
        """Return the DOM document node for the record."""
        raise NotImplementedError

    def get_sax(self, session):
        """Return the list of SAX events for the record.

        SAX events are serialized according to the internal Cheshire3
        format.
        """
        raise NotImplementedError

    def get_xml(self, session):
        """Return the XML for the record as a serialized string."""
        raise NotImplementedError

    def process_xpath(self, session, xpath, maps={}):
        """Process and return the result of the given XPath.

        XPath may be either a string or a configured XPath, perhaps with
        some supplied namespace mappings.
        """
        raise NotImplementedError

    def fetch_vector(self, session, index, summary=False):
        """Fetch and return a vector for the Record from the given Index."""
        raise NotImplementedError
class Workflow(C3Object):
    """Defines a series of processing steps.

    A Workflow is similar to the process chain concept of an index, but
    acts at a more global level. It allows a Workflow to be configured
    using Cheshire3 objects and simple code, and then executed for input
    objects.

    For example, one might define a common Workflow pattern of PreParsers,
    a Parser and then indexing routines in the XML configuration, and then
    run each Document in a DocumentFactory through it. This lets users who
    are not familiar with Python, but who are familiar with XML and the
    available Cheshire3 processing objects, implement tasks as required by
    changing only configuration files. It therefore also lets a user
    configure personal workflows in a Cheshire3 system whose code they do
    not have permission to modify.
    """

    code = ""

    def process(self, session, *args, **kw):
        """Execute the code as constructed from the XML configuration.

        The generated code is executed on the given input arguments. The
        return value is the last object produced by the execution.

        This function is automatically written and compiled when the
        object is instantiated.
        """
        raise NotImplementedError()
class Logger(C3Object):
    """Logs messages for system events."""

    def log(self, session, *args, **kw):
        """Log a message based on the given args."""
        raise NotImplementedError()