“Natural Language Processing with Python” (read my review) has lots of motivating examples for natural language processing. I quickly found it valuable to build indices ahead of time: I have a corpus of legal texts, and I built a set of n-gram indices from it.
The first index is just the tokenized text, with the contents of all files combined. You may wish to add a special token that marks where each file ends (a sketch of that follows the code below).
import nltk   # requires the punkt tokenizer models: nltk.download('punkt')
import os
import re

all_tokens = []
for root, directories, filenames in os.walk('.'):
    for filename in filenames:
        if filename.endswith('.txt'):
            # Join with root so files inside subdirectories open correctly.
            with open(os.path.join(root, filename)) as contents:
                raw = contents.read()
            lc_raw = raw.lower()
            new_tokens = nltk.word_tokenize(lc_raw)
            # Keep short alphabetic tokens plus single-character tokens
            # (punctuation); drop numbers, long strings, and mixed junk.
            new_tokens_filtered = [w for w in new_tokens
                                   if len(w) < 20 and
                                   (re.search('^[a-zA-Z]+$', w) or len(w) == 1)]
            all_tokens.extend(new_tokens_filtered)

# One token per line on stdout; redirect to a file for the next step.
for token in all_tokens:
    print(token)
The list comprehension filters the tokens; depending on the situation you may wish to modify it (e.g. whether you want punctuation or not). The final loop writes one token per line, so the output can be redirected into the file the next script reads (all_tokens5).
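As for the file-end marker suggested above, here is a minimal sketch of one way to add it, collecting tokens per file and appending a sentinel after each. The '*eof*' string and the tokens_with_boundaries helper are names I made up for illustration, not anything from NLTK.

# Hypothetical sketch: append a sentinel after each file's tokens.
# '*eof*' can never collide with a real token, because the filter
# above only passes alphabetic or single-character tokens.
END_OF_FILE = '*eof*'

def tokens_with_boundaries(per_file_tokens):
    """per_file_tokens: one list of filtered tokens per file."""
    combined = []
    for tokens in per_file_tokens:
        combined.extend(tokens)
        combined.append(END_OF_FILE)   # mark where this file ends
    return combined

# tokens_with_boundaries([['a', 'b'], ['c']])
# -> ['a', 'b', '*eof*', 'c', '*eof*']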
The following turns the token index into an n-gram index. It runs quickly as long as the counts fit in memory; it could easily be extended to work on parts of the document set and to combine the results by addition, map-reduce style, especially if the output were sorted (a sketch of that combine step follows the code). One surprising result is that the 5-gram index can be quite a bit larger than the original data.
contents = open('all_tokens5')

# A sliding window over the last five words, most recent in n1_gram_word.
n1_gram_word = ' '
n2_gram_word = ' '
n3_gram_word = ' '
n4_gram_word = ' '
n5_gram_word = ' '
n1_counts = {}
n2_counts = {}
n3_counts = {}
n4_counts = {}
n5_counts = {}
index = 0

def incr(c, w):
    # Count an occurrence; dict.get avoids the bare try/except.
    c[w] = c.get(w, 0) + 1

for word in contents:
    index += 1
    if index % 10000 == 0:
        print("Processed %d words" % index)
    # Shift the window. rstrip('\n') (rather than word[:-1]) keeps the
    # final character if the last line has no trailing newline.
    (n5_gram_word, n4_gram_word, n3_gram_word, n2_gram_word, n1_gram_word) = \
        (n4_gram_word, n3_gram_word, n2_gram_word, n1_gram_word, word.rstrip('\n'))
    n1_gram = n1_gram_word
    n2_gram = n2_gram_word + ' ' + n1_gram
    n3_gram = n3_gram_word + ' ' + n2_gram
    n4_gram = n4_gram_word + ' ' + n3_gram
    n5_gram = n5_gram_word + ' ' + n4_gram
    incr(n1_counts, n1_gram)
    # Only count an n-gram once the window holds n real words, so the
    # initial padding never produces spurious n-grams at the start.
    if index >= 2:
        incr(n2_counts, n2_gram)
    if index >= 3:
        incr(n3_counts, n3_gram)
    if index >= 4:
        incr(n4_counts, n4_gram)
    if index >= 5:
        incr(n5_counts, n5_gram)
contents.close()
def save_ngram(c, f):
    # Write "n-gram count" lines in ascending order of count.
    with open(f, 'w') as output:
        for gram, count in sorted(c.items(), key=lambda kv: kv[1]):
            output.write('%s %d\n' % (gram, count))
save_ngram(n1_counts, '1gram')
save_ngram(n2_counts, '2gram')
save_ngram(n3_counts, '3gram')
save_ngram(n4_counts, '4gram')
save_ngram(n5_counts, '5gram')
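To make the combine step concrete, here is a minimal sketch of adding together the counts from two separately processed parts of the corpus. It assumes each part's file was written sorted by n-gram (rather than by count, as save_ngram above does), and the part file names are hypothetical. Because the inputs are sorted, the merge runs in a single pass with essentially no memory.

import heapq

def read_counts(path):
    # Yield (n-gram, count) pairs from a file of "n-gram count" lines.
    with open(path) as f:
        for line in f:
            gram, _, count = line.rpartition(' ')
            yield gram, int(count)

def merge_counts(paths, out_path):
    # heapq.merge interleaves the sorted streams, so equal n-grams
    # arrive adjacently and their counts can simply be added.
    merged = heapq.merge(*(read_counts(p) for p in paths))
    with open(out_path, 'w') as out:
        prev, total = None, 0
        for gram, count in merged:
            if prev is not None and gram != prev:
                out.write('%s %d\n' % (prev, total))
                total = 0
            prev = gram
            total += count
        if prev is not None:
            out.write('%s %d\n' % (prev, total))

# e.g. merge_counts(['5gram.part1', '5gram.part2'], '5gram')

The same one-pass merge handles any number of parts, since heapq.merge accepts arbitrarily many sorted inputs.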