12.6. From text to data in Python#

First, we’ll further explore the vocabulary terms from the previous section by implementing them in Python. Then, we’ll conduct our cutting-edge and all-important analysis on Shakespeare vs. Seinfeld.

import matplotlib.pyplot as plt                               # ah, friendship. #grateful
import seaborn as sns
import pandas as pd

import nltk                                                   # some new friends here!
# nltk.download('stopwords')                                  # run these three downloads once,
# nltk.download('wordnet')                                     # then re-comment them
# nltk.download('punkt')
from nltk.tokenize import word_tokenize                       
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer   # new resources from an old friend!
from sklearn.metrics.pairwise import cosine_similarity

1. Tokenization & frequency distributions#

# simple tokenization

text = "Those aren't for New Year's. Those are my everyday balloons." 
print(word_tokenize(text))
['Those', 'are', "n't", 'for', 'New', 'Year', "'s", '.', 'Those', 'are', 'my', 'everyday', 'balloons', '.']
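Notice that word_tokenize splits contractions ("aren't" becomes 'are' + "n't") and peels punctuation off into its own tokens. For comparison, here's a quick sketch of a naive whitespace split on the same string, which leaves punctuation glued to the words:

# naive whitespace splitting, for comparison: punctuation and
# contractions stay attached ("aren't", "Year's.", "balloons.")
print(text.split())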
# tokenize, then inspect how many types and tokens there are

text2 = "I don't really want to go to congress again. I'm kind of too young to be in congress so much, you know?"
tokens = word_tokenize(text2)
fdist = FreqDist(tokens)
print(fdist)
<FreqDist with 23 samples and 27 outcomes>
fdist.most_common(2)
[('to', 3), ('I', 2)]
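The FreqDist summary above is really counting the types and tokens from the previous section: the 23 samples are the distinct types, the 27 outcomes are the total tokens. A minimal sketch to pull those two numbers out directly:

# types = distinct word forms; tokens = total word occurrences
print("types:", len(fdist))     # 23 samples
print("tokens:", fdist.N())     # 27 outcomes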
fdist.plot()     # plot tokens by frequency
plt.show()
[Plot: frequency distribution of the tokens in text2]

2. Pre-processing example#

# Have a look at one set of stop words

print(stopwords.words('english'))
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
parrot_text = """This parrot is no more! It has ceased to be! 
                It’s expired and gone to meet its maker! 
                This is a late parrot! It’s a stiff! Bereft of life, it rests in peace! 
                If you hadn’t nailed it to the perch, it would be pushing up the daisies! 
                It’s run down the curtain and joined the choir invisible! 
                This is an ex-parrot!"""
stop_words = set(stopwords.words('english'))
 
parrot_tokens = word_tokenize(parrot_text)

parrot_tokens = [w for w in parrot_tokens if w.isalnum()]    # keep only alphanumeric tokens, dropping punctuation (and the hyphenated 'ex-parrot')

# remove stopwords; the comparison is case-sensitive, so capitalised
# words such as 'This', 'It' and 'If' survive
parrot_tokens_nostopwords = [w for w in parrot_tokens if w not in stop_words]

print("Tokenized parrot text:", parrot_tokens)
print()
print("Tokenized parrot text with stopwords removed:", parrot_tokens_nostopwords)
Tokenized parrot text: ['This', 'parrot', 'is', 'no', 'more', 'It', 'has', 'ceased', 'to', 'be', 'It', 's', 'expired', 'and', 'gone', 'to', 'meet', 'its', 'maker', 'This', 'is', 'a', 'late', 'parrot', 'It', 's', 'a', 'stiff', 'Bereft', 'of', 'life', 'it', 'rests', 'in', 'peace', 'If', 'you', 'hadn', 't', 'nailed', 'it', 'to', 'the', 'perch', 'it', 'would', 'be', 'pushing', 'up', 'the', 'daisies', 'It', 's', 'run', 'down', 'the', 'curtain', 'and', 'joined', 'the', 'choir', 'invisible', 'This', 'is', 'an']

Tokenized parrot text with stopwords removed: ['This', 'parrot', 'It', 'ceased', 'It', 'expired', 'gone', 'meet', 'maker', 'This', 'late', 'parrot', 'It', 'stiff', 'Bereft', 'life', 'rests', 'peace', 'If', 'nailed', 'perch', 'would', 'pushing', 'daisies', 'It', 'run', 'curtain', 'joined', 'choir', 'invisible', 'This']
fdist = FreqDist(parrot_tokens)
print(fdist)
<FreqDist with 45 samples and 65 outcomes>
fdist.plot()
plt.show()
[Plot: frequency distribution of the (punctuation-free) parrot tokens]
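As a quick follow-up (a sketch not in the original run), we can build the same distribution from the stopword-free tokens to see how much of the parrot speech was just function words:

# frequency distribution after stopword removal
fdist_nostop = FreqDist(parrot_tokens_nostopwords)
print(fdist_nostop)
print(fdist_nostop.most_common(3))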

3. Stemming vs. lemmatization#

ps = PorterStemmer()
text = "waits waited hurries hurrying tries try"
tokenization = word_tokenize(text)
for w in tokenization:
    print("Stemming for {} is {}".format(w,ps.stem(w))) 
Stemming for waits is wait
Stemming for waited is wait
Stemming for hurries is hurri
Stemming for hurrying is hurri
Stemming for tries is tri
Stemming for try is tri
wnl = WordNetLemmatizer()
text = "waits waited hurries hurrying tries try"
tokenization = word_tokenize(text)
for w in tokenization:
    print("Lemma for {} is {}".format(w, wnl.lemmatize(w)))  
Lemma for waits is wait
Lemma for waited is waited
Lemma for hurries is hurry
Lemma for hurrying is hurrying
Lemma for tries is try
Lemma for try is try
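One reason the lemmatizer leaves "waited" and "hurrying" untouched is that WordNetLemmatizer treats every word as a noun unless you tell it otherwise. A minimal sketch passing a part-of-speech hint:

# the default part of speech is noun; pos='v' lemmatizes as a verb
for w in tokenization:
    print("Verb lemma for {} is {}".format(w, wnl.lemmatize(w, pos='v')))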

4. Document-Term Matrix (DTM)#

docs = ["I looked into the eyes of the giant fish. Mammal. Whatever.", 
        "It's not the size of the opponents, Elaine, it's the ferocity", 
        'You kept making all the stops?! They kept ringing the bell!']
vec = CountVectorizer()
X = vec.fit_transform(docs)
df = pd.DataFrame(X.toarray(), columns=vec.get_feature_names_out())
print(df)
   all  bell  elaine  eyes  ferocity  fish  giant  into  it  kept  ...  not  \
0    0     0       0     1         0     1      1     1   0     0  ...    0   
1    0     0       1     0         1     0      0     0   2     0  ...    1   
2    1     1       0     0         0     0      0     0   0     2  ...    0   

   of  opponents  ringing  size  stops  the  they  whatever  you  
0   1          0        0     0      0    2     0         1    0  
1   1          1        0     1      0    3     0         0    0  
2   0          0        1     0      1    2     1         0    1  

[3 rows x 23 columns]
pd.set_option('display.max_columns', None)
df
   all  bell  elaine  eyes  ferocity  fish  giant  into  it  kept  looked  making  mammal  not  of  opponents  ringing  size  stops  the  they  whatever  you
0    0     0       0     1         0     1      1     1   0     0       1       0       1    0   1          0        0     0      0    2     0         1    0
1    0     0       1     0         1     0      0     0   2     0       0       0       0    1   1          1        0     1      0    3     0         0    0
2    1     1       0     0         0     0      0     0   0     2       0       1       0    0   0          0        1     0      1    2     1         0    1
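Each row of the DTM is a document vector, so individual counts can be looked up by document index and term (a quick sketch using the DataFrame above):

# row = document, column = vocabulary term
print(df.loc[1, "the"])        # "the" appears 3 times in the second document
print(df.shape)                # (number of documents, vocabulary size)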

5. Cosine similarity#

print(cosine_similarity(df, df))
[[1.         0.46358632 0.2981424 ]
 [0.46358632 1.         0.35540933]
 [0.2981424  0.35540933 1.        ]]
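To see what cosine_similarity is actually computing, here's a by-hand check of the first pair of document vectors (a sketch assuming numpy is available): the dot product divided by the product of the vector lengths, which should reproduce the 0.4636 above.

import numpy as np

# cosine similarity = dot(v0, v1) / (||v0|| * ||v1||)
v0 = df.iloc[0].to_numpy()
v1 = df.iloc[1].to_numpy()
print(np.dot(v0, v1) / (np.linalg.norm(v0) * np.linalg.norm(v1)))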

6. A real(ish) example!#

doc1 = """To be, or not to be, that is the question:
        Whether 'tis nobler in the mind to suffer
        The slings and arrows of outrageous fortune,
        Or to take arms against a sea of troubles
        And by opposing end them. To die—to sleep,
        No more; and by a sleep to say we end
        The heart-ache and the thousand natural shocks
        That flesh is heir to: 'tis a consummation
        Devoutly to be wish'd. To die, to sleep;
        To sleep, perchance to dream—ay, there's the rub:
        For in that sleep of death what dreams may come,
        When we have shuffled off this mortal coil,
        Must give us pause—there's the respect
        That makes calamity of so long life."""
    
doc2 =  """To-morrow, and to-morrow, and to-morrow,
        Creeps in this petty pace from day to day
        To the last syllable of recorded time,
        And all our yesterdays have lighted fools
        The way to dusty death. Out, out, brief candle!
        Life’s but a walking shadow, a poor player
        That struts and frets his hour upon the stage
        And then is heard no more: it is a tale
        Told by an idiot, full of sound and fury,
        Signifying nothing."""

doc3 = """Betting on the horses: you can’t possibly win. 
        I don’t understand what we’re betting on.
        Do the horses know that it’s a race? Are they aware? 
        What is going on here? After the race are the horses 
        walking back to the stable: “I was third, I was first, 
        I was ninth.” I think they’re thinking: “Oat bag, 
        I get my oat bag now!” “Oat bag time for me.” I gotta 
        bet on this idiot? I mean, I’m sure the horses have 
        some idea that the jockey is in a hurry. He’s on him, 
        he’s hitting him with this thing. He’s going: 
        “Come on, come on”. Obviously the jockey’s in a hurry. 
        But the horse must get to the end and go: 
        “We were just here! 
        What was the point of that?”"""

corpus = [doc1, doc2, doc3]
X = vec.fit_transform(corpus)
real_df = pd.DataFrame(X.toarray(), columns=vec.get_feature_names_out())
real_df
[DataFrame output: the full document-term matrix for the three speeches, one row per document and one column per vocabulary term]
from sklearn.metrics.pairwise import cosine_similarity
print(cosine_similarity(real_df, real_df))
[[1.         0.60370413 0.39027206]
 [0.60370413 1.         0.29592291]
 [0.39027206 0.29592291 1.        ]]
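So the two Shakespeare soliloquies score about 0.60 with each other, versus roughly 0.3 to 0.4 with the Seinfeld horse routine, and a good chunk of those scores comes from shared function words like "the" and "to". A hedged variant (not in the original run, so the numbers will differ) that drops English stopwords before building the DTM:

# rebuild the DTM without English stopwords so function words
# don't dominate the similarity scores
vec_nostop = CountVectorizer(stop_words='english')
X_nostop = vec_nostop.fit_transform(corpus)
print(cosine_similarity(X_nostop, X_nostop))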
# for fun, let's plot the freqdist for doc1

tokens3 = word_tokenize(doc1)
fdist = FreqDist(tokens3)
print(fdist)
<FreqDist with 84 samples and 136 outcomes>
# and make it look slightly nicer!

sns.set_style('darkgrid')

plt.figure(figsize=(15, 6))
fdist.plot()
plt.show()                              # Zipf's law!!
[Plot: frequency distribution of the tokens in doc1]