"""Cheshire3 Tokenizer Implementations.
A Tokenizer converts a string to a list of tokens. Lists aren't hashable so we
maintain string key. Also we're very unlikely to duplicate at this point, and
even if we do it's not important.
A Tokenizer MUST be followed by a TokenMerger merge, however, as Normalizers
won't know what to do with a list as data.
"""
import re
import string
# Python source code tokenizer from base libs
import tokenize
import keyword
try:
import cStringIO as StringIO
except ImportError:
import StringIO
from dateutil import parser as dateparser
from datetime import timedelta
from cheshire3.baseObjects import Tokenizer
class SimpleTokenizer(Tokenizer):
_possibleSettings = {
'char': {
'docs': ('character to split with, or empty for default of '
'whitespace'
)
}
}
def __init__(self, session, config, parent):
Tokenizer.__init__(self, session, config, parent)
self.char = self.get_setting(session, 'char', None)
def process_string(self, session, data):
if self.char:
return data.split(self.char)
else:
return data.split()
def process_hash(self, session, data):
kw = {}
for (key, val) in data.iteritems():
nval = val.copy()
nval['text'] = self.process_string(session, val['text'])
kw[key] = nval
return kw
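# Illustrative sketch (added commentary, not from the original source): given
# a configured session/config/parent, SimpleTokenizer splits on whitespace by
# default, or on the 'char' setting when one is supplied, e.g.
#
#     tokenizer = SimpleTokenizer(session, config, parent)
#     tokenizer.process_string(session, u"red cats purr")
#     # -> [u'red', u'cats', u'purr']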
class OffsetTokenizer(Tokenizer):
def process_hash(self, session, data):
kw = {}
for (key, val) in data.iteritems():
nval = val.copy()
(tokens, positions) = self.process_string(session, val['text'])
nval['text'] = tokens
nval['charOffsets'] = positions
kw[key] = nval
return kw
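# Note (added commentary): OffsetTokenizer.process_hash assumes that the
# subclass's process_string returns a (tokens, charOffsets) pair; the offsets
# are stored alongside the tokens under the 'charOffsets' key of each entry.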
class RegexpSubTokenizer(SimpleTokenizer):
u"""Substitute regex matches with a character, then split on whitespace.
A Tokenizer that replaces regular expression matches in the data with a
configurable character (defaults to whitespace), then splits the result at
whitespace.
"""
_possibleSettings = {
'regexp': {
'docs': ("Regular expression to match and replace with instances "
"of 'char' before spltting on whitespace")
},
'char': {
            'docs': ("Character with which to replace regular expression "
                     "matches. Defaults to a single space.")
}
}
def __init__(self, session, config, parent):
SimpleTokenizer.__init__(self, session, config, parent)
pre = self.get_setting(session,
'regexp',
u"""(?x)([-.,'\")}\]]+((?=\s)|$)|(^|(?<=\s))
[-.,']+|[`~!@+=\#\&\^*()\[\]{}\\\|\":;<>?
/\u2026\u2013\u2014\u2018\u2019\u201c
\u201d]|\.\.\.)"""
)
# all strings should be treated as unicode internally
# this is default for lxml - primary Record implementation
self.regexp = re.compile(pre, re.UNICODE)
self.char = self.get_setting(session, 'char', ' ')
def process_string(self, session, data):
txt = self.regexp.sub(self.char, data) # kill unwanted characters
return txt.split() # split at whitespace
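# Illustrative sketch (added commentary, not from the original source): with
# the default regexp and 'char' settings, surrounding punctuation is replaced
# with a space before the whitespace split, so (roughly):
#
#     tokenizer = RegexpSubTokenizer(session, config, parent)
#     tokenizer.process_string(session, u"Hello, world!")
#     # -> [u'Hello', u'world']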
class RegexpSplitTokenizer(SimpleTokenizer):
"""A Tokenizer that simply splits at the regex matches."""
_possibleSettings = {
'regexp': {
'docs': 'Regular expression used to split string'
}
}
def __init__(self, session, config, parent):
SimpleTokenizer.__init__(self, session, config, parent)
pre = self.get_setting(session,
'regexp',
u"""(?x)([-.,'\")}\]]+((?=\s)|$)|(^|(?<=\s))
[-.,']+|[`~!@+=\#\&\^*()\[\]{}\\\|\":;<>?
/\u2026\u2013\u2014\u2018\u2019\u201c
\u201d]|\.\.\.)"""
)
self.regexp = re.compile(pre, re.UNICODE)
def process_string(self, session, data):
return self.regexp.split(data)
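# Note (added commentary): because the default pattern contains capturing
# groups, re.split() also returns the captured separators (and None for groups
# that did not participate in a match). A custom 'regexp' setting without
# groups yields a plain split on the matches.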
class RegexpFindTokenizer(SimpleTokenizer):
"""A tokenizer that returns all words that match the regex."""
# Some ideas thanks to NLTK's RegexpTokenizer
# Some more ' words:
# cat-o'-nine-tails, ne'er-do-well, will-o'-the-wisp
# --- ignoring
# 'tis, 'twas, 'til, 'phone
# --- IMO should be indexed with leading '
# --- eg 'phone == phone
#
# XXX: Should come up with better solution
# l'il ? y'all ?
#
# XXX: Decide what to do with 8am 8:00am 1.2M $1.2 $1.2M
# As related to 8 am, 8:00 am, 1.2 Million, $ 1.2, $1.2 Million
# vs $1200000 vs $ 1200000 vs four million dollars
    # Require acronyms to have at least TWO letters, e.g. U.S not just J.
_possibleSettings = {
'regexp': {
'docs': 'Regular expression to match when finding tokens.'
},
'gaps': {
'docs': ('Does the regular expression specify the gaps between '
'desired tokens. Defaults to 0 i.e. No, it specifies '
'tokens to keep'),
'type': int,
'options': "0|1"
}
}
def __init__(self, session, config, parent):
SimpleTokenizer.__init__(self, session, config, parent)
pre = self.get_setting(session, 'regexp', u"""
(?xu) # verbose, unicode
(?:
[a-zA-Z0-9!#$%*/?|^{}`~&'+-=_]+@[0-9a-zA-Z.-]+ # email
|(?:[\w+-]+)?[+-]/[+-] # genetic alleles
        |\w+(?:-\w+)+ # hyphenated word
(?:'(?:t|ll've|ll|ve|s|d've|d|re))? # with/without 'suffix
|[$\xa3\xa5\u20AC]?[0-9]+(?:[.,:-][0-9]+)+[%]? # date/num/money/time
|[$\xa3\xa5\u20AC][0-9]+ # single money
|[0-9]+(?=[a-zA-Z]+) # split: 8am 1Million
|[0-9]+% # single percentage
|(?:[A-Z]\.)+[A-Z\.] # abbreviation
|[oOd]'[a-zA-Z]+ # o'clock, O'Brien, d'Artagnan
|[a-zA-Z]+://[^\s]+ # URI
|\w+'(?:d've|d|t|ll've|ll|ve|s|re) # don't, we've
|(?:[hH]allowe'en|[mM]a'am|[Ii]'m|[fF]o'c's'le|[eE]'en|[sS]'pose)
|[\w+]+ # basic words, including +
)""")
self.regexp = re.compile(pre, re.UNICODE)
self.gaps = self.get_setting(session, 'gaps', 0)
def process_string(self, session, data):
if self.gaps:
return [tok for tok in self.regexp.split(data) if tok]
else:
return self.regexp.findall(data)
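# Illustrative sketch (added commentary, not from the original source): with
# the default pattern and gaps=0, findall keeps contractions, e-mail addresses
# etc. as single tokens, e.g. (roughly):
#
#     tokenizer = RegexpFindTokenizer(session, config, parent)
#     tokenizer.process_string(session, u"don't email foo@example.com")
#     # -> [u"don't", u'email', u'foo@example.com']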
class RegexpFindOffsetTokenizer(OffsetTokenizer, RegexpFindTokenizer):
"""Find tokens that match regex with character offsets.
A Tokenizer that returns all words that match the regex, and also the
character offset at which each word occurs.
"""
def __init__(self, session, config, parent):
# Only init once!
RegexpFindTokenizer.__init__(self, session, config, parent)
def process_string(self, session, data):
tokens = []
positions = []
for m in self.regexp.finditer(data):
tokens.append(m.group())
positions.append(m.start())
return (tokens, positions)
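# Illustrative sketch (added commentary, not from the original source): tokens
# are returned together with the character offset at which each match starts:
#
#     tokenizer = RegexpFindOffsetTokenizer(session, config, parent)
#     tokenizer.process_string(session, u"spam and eggs")
#     # -> ([u'spam', u'and', u'eggs'], [0, 5, 9])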
class RegexpFindPunctuationOffsetTokenizer(RegexpFindOffsetTokenizer):
def process_string(self, session, data):
tokens = []
positions = []
for m in self.regexp.finditer(data):
tokens.append(m.group())
i = m.start()
while i > 0 and data[i - 1] in string.punctuation:
i = i - 1
positions.append(i)
return (tokens, positions)
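# Illustrative sketch (added commentary, not from the original source): the
# offsets are walked back over any punctuation immediately preceding a token,
# so a quoted token reports the offset of the opening quote, e.g.
#
#     tokenizer.process_string(session, u'"spam" and eggs')
#     # -> ([u'spam', u'and', u'eggs'], [0, 7, 11])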
# Was a text mining util, now should reformulate workflows
class SentenceTokenizer(SimpleTokenizer):
    def __init__(self, session, config, parent):
        SimpleTokenizer.__init__(self, session, config, parent)
        self.paraRe = re.compile('\n\n+', re.UNICODE)
self.sentenceRe = re.compile(
'.+?(?<!\.\.)[\.!?:]["\'\)]?(?=\s+|$)(?![a-z])',
re.UNICODE | re.DOTALL
)
self.abbrMashRe = re.compile(
'''
(?xu) # verbose, unicode
(^|\s) # leading spaces
([^\s]+?\.[a-zA-Z]+|
Prof|Dr|Sr|Mr|Mrs|Ms|Jr|Capt|Gen|Col|Sgt| # common abbrevs
[ivxjCcl]+|[A-Z]
)\. # Acronyms?
(\s|$) # trailing space
''',
re.UNICODE
)
def process_string(self, session, data):
ps = self.paraRe.split(data)
sents = []
for p in ps:
            # Protect abbreviation full-stops with a sentinel entity so the
            # sentence regex does not break on them
            s = self.abbrMashRe.sub('\\1\\2&#46;\\3', p)
sl = self.sentenceRe.findall(s)
if not sl:
s += '.'
sl = self.sentenceRe.findall(s)
sents.extend(sl)
        # Restore the protected full-stops
        ns = []
        for s in sents:
            ns.append(s.replace("&#46;", '.'))
return ns
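# Illustrative sketch (added commentary, not from the original source):
# blank-line separated paragraphs are split into sentences, with abbreviation
# full-stops protected so they don't end a sentence, e.g. (roughly, leading
# whitespace may vary):
#
#     tokenizer.process_string(session, "It was dark. The rain fell.")
#     # -> ['It was dark.', ' The rain fell.']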
class LineTokenizer(SimpleTokenizer):
"Trivial but potentially useful Tokenizer to split data on whitespace."
def process_string(self, session, data):
return data.split('\n')
class DateTokenizer(SimpleTokenizer):
"""Tokenizer to identify date tokens, and return only these.
Capable of extracting multiple dates, but slowly and less reliably than
single ones.
"""
_possibleDefaults = {
'datetime': {
"docs": ("Default datetime to use for values not supplied in the "
"data")
}
}
_possibleSettings = {
'fuzzy': {
"docs": "Should the parser use fuzzy matching.",
'type': int,
'options': '0|1'
},
'dayfirst': {
"docs": ("Is the day before the month (when ambiguous). "
"1 = Yes, 0 = No (default)"),
'type': int,
'options': '0|1'
}
}
def __init__(self, session, config, parent):
SimpleTokenizer.__init__(self, session, config, parent)
default = self.get_default(session, 'datetime')
self.fuzzy = self.get_setting(session, 'fuzzy')
self.dayfirst = self.get_setting(session, 'dayfirst')
self.normalisedDateRe = re.compile('(?<=\d)xx+', re.UNICODE)
self.isoDateRe = re.compile('''
([0-2]\d\d\d) # match any year up to 2999
(0[1-9]|1[0-2]|xx)? # match any month 01-12 or xx
        (0[1-9]|[1-2][0-9]|3[0-1]|xx)? # match any day 01-31 or xx
''', re.VERBOSE | re.IGNORECASE | re.UNICODE)
if default:
self.default = dateparser.parse(default.encode('utf-8'),
dayfirst=self.dayfirst,
fuzzy=self.fuzzy)
else:
self.default = dateparser.parse('2000-01-01', fuzzy=True)
def _convertIsoDates(self, mo):
dateparts = [mo.group(1)]
for x in range(2, 4):
if mo.group(x):
dateparts.append(mo.group(x))
return '-'.join(dateparts)
def _tokenize(self, data, default=None):
if default is None:
default = self.default
# Deconstruct data word by word and feed to parser until success.
# Must be a better way to do this..., but for now...
tks = []
wds = data.split()
while (len(wds)):
for x in range(len(wds), 0, -1):
txt = ' '.join(wds[:x]).encode('utf-8')
try:
t = dateparser.parse(txt,
default=default,
dayfirst=self.dayfirst,
fuzzy=self.fuzzy
).isoformat()
except:
continue
else:
tks.append(t)
break
wds = wds[x:]
return tks
def process_string(self, session, data):
# Convert ISO 8601 date elements to extended format (YYYY-MM-DD) for
# better recognition by date parser
data = self.isoDateRe.sub(self._convertIsoDates, data)
if len(data):
# a range?
bits = []
if data.count('/') == 1:
bits = data.split('/')
# ISO allows YYYY-MM and YYYY-Www
elif data.count('-') == 1 and (data.find('-') < len(data) - 4):
bits = data.split('-')
if len(bits):
# Use a new default, just under a year on for the end of the
# range
td = timedelta(days=365, hours=23, minutes=59, seconds=59,
microseconds=999999)
tks = self._tokenize(bits[0]) + \
self._tokenize(bits[1], self.default + td)
else:
tks = []
if len(tks):
return tks
return self._tokenize(data)
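# Illustrative sketch (added commentary, not from the original source):
# assuming the default datetime of 2000-01-01, _tokenize feeds progressively
# shorter word sequences to the dateutil parser until one parses, e.g.
# (roughly):
#
#     tokenizer = DateTokenizer(session, config, parent)
#     tokenizer.process_string(session, u"March 1983")
#     # -> ['1983-03-01T00:00:00']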
class DateRangeTokenizer(DateTokenizer):
"""Tokenizer to identify ranges of date tokens, and return only these.
e.g.
>>> self.process_string(session, '2003/2004')
['2003-01-01T00:00:00', '2004-12-31T23:59:59.999999']
>>> self.process_string(session, '2003-2004')
['2003-01-01T00:00:00', '2004-12-31T23:59:59.999999']
>>> self.process_string(session, '2003 2004')
['2003-01-01T00:00:00', '2004-12-31T23:59:59.999999']
>>> self.process_string(session, '2003 to 2004')
['2003-01-01T00:00:00', '2004-12-31T23:59:59.999999']
For single dates, attempts to expand this into the largest possible range
that the data could specify. e.g. 1902-04 means the whole of April 1902.
>>> self.process_string(session, "1902-04")
['1902-04-01T00:00:00', '1902-04-30T23:59:59.999999']
"""
def process_string(self, session, data):
# Convert ISO 8601 date elements to extended format (YYYY-MM-DD)
# for better recognition by date parser
data = self.isoDateRe.sub(self._convertIsoDates, data)
if not data:
return []
midpoint = len(data) / 2
if data[midpoint] in ['/', '-', ' ']:
startK = data[:midpoint]
endK = data[midpoint + 1:]
elif len(data.split(' to ')) == 2:
startK, endK = data.split(' to ')
elif data.count('/') == 1:
startK, endK = data.split('/')
# ISO allows YYYY-MM and YYYY-Www
elif data.count('-') == 1 and (data.find('-') < len(data) - 4):
startK, endK = data.split('-')
else:
startK = endK = data
starts = self._tokenize(startK)
ends = []
days = 365
# For end point use a new default, just under a year on for the end
# of the range. Also account for varying month lengths.
while not ends and days > 361:
td = timedelta(days=days,
hours=23,
minutes=59,
seconds=59,
microseconds=999999
)
ends = self._tokenize(endK, self.default + td)
days -= 1
return starts + ends
class PythonTokenizer(OffsetTokenizer):
""" Tokenize python source code into token/TYPE with offsets """
def __init__(self, session, config, parent):
OffsetTokenizer.__init__(self, session, config, parent)
self.ignoreTypes = [tokenize.INDENT, tokenize.DEDENT, tokenize.NEWLINE,
tokenize.NL, tokenize.ENDMARKER]
def process_string(self, session, data):
io = StringIO.StringIO(data)
toks = []
posns = []
totalChrs = 0
currLine = 0
prevLineLen = 0
for tok in tokenize.generate_tokens(io.readline):
(ttype, txt, start, end, lineTxt) = tok
if start[0] != currLine:
totalChrs += prevLineLen
prevLineLen = len(lineTxt)
currLine = start[0]
# maybe store token
            if ttype not in self.ignoreTypes:
                tname = tokenize.tok_name[ttype]
                if tname == "NAME" and keyword.iskeyword(txt):
                    toks.append("%s/KEYWORD" % (txt))
                else:
                    toks.append("%s/%s" % (txt, tname))
                posns.append(totalChrs + start[1])
return (toks, posns)
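# Illustrative sketch (added commentary, not from the original source): each
# retained token is returned as "text/TYPE" with its character offset into
# the source, e.g.
#
#     tokenizer = PythonTokenizer(session, config, parent)
#     tokenizer.process_string(session, "x = 1\n")
#     # -> (['x/NAME', '=/OP', '1/NUMBER'], [0, 2, 4])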