12.6. From text to data in Python#
First, we’ll further explore the vocabulary terms from the previous section by implementing them in Python. Then, we’ll conduct our cutting-edge and all-important analysis on Shakespeare vs. Seinfeld.
import matplotlib.pyplot as plt # ah, friendship. #grateful
import seaborn as sns
import pandas as pd
import nltk # some new friends here!
# uncomment and run these once if you haven't already downloaded the NLTK data:
#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer # new resources from an old friend!
from sklearn.metrics.pairwise import cosine_similarity
1. Tokenization & frequency distributions#
# simple tokenization
text = "Those aren't for New Year's. Those are my everyday balloons."
print(word_tokenize(text))
['Those', 'are', "n't", 'for', 'New', 'Year', "'s", '.', 'Those', 'are', 'my', 'everyday', 'balloons', '.']
# tokenize, then inspect how many types and tokens we have
text2 = "I don't really want to go to congress again. I'm kind of too young to be in congress so much, you know?"
tokens = word_tokenize(text2)
fdist = FreqDist(tokens)
print(fdist)
<FreqDist with 23 samples and 27 outcomes>
fdist.most_common(2)
[('to', 3), ('I', 2)]
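The FreqDist summary above reports 23 samples (distinct types) and 27 outcomes (total tokens). If we want those numbers directly, here's a minimal sketch reusing the fdist object from above:
print("tokens:", fdist.N())                 # total outcomes (27)
print("types:", len(fdist))                 # distinct samples (23)
print("type-token ratio:", len(fdist) / fdist.N())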
fdist.plot() # plot tokens by frequency
plt.show()
2. Pre-processing example#
# Have a look at one set of stop words
print(stopwords.words('english'))
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
parrot_text = """This parrot is no more! It has ceased to be!
It’s expired and gone to meet its maker!
This is a late parrot! It’s a stiff! Bereft of life, it rests in peace!
If you hadn’t nailed it to the perch, it would be pushing up the daisies!
It’s run down the curtain and joined the choir invisible!
This is an ex-parrot!"""
stop_words = set(stopwords.words('english'))
parrot_tokens = word_tokenize(parrot_text)
parrot_tokens = [w for w in parrot_tokens if w.isalnum()] # keep only alphanumeric tokens, i.e. drop punctuation
# remove stop words (note: this check is case-sensitive, so capitalized
# words like 'This' and 'It' will survive)
parrot_tokens_nostopwords = []
for w in parrot_tokens:
    if w not in stop_words:
        parrot_tokens_nostopwords.append(w)
print("Tokenized parrot text:", parrot_tokens)
print()
print("Tokenized parrot text with stopwords removed:", parrot_tokens_nostopwords)
Tokenized parrot text: ['This', 'parrot', 'is', 'no', 'more', 'It', 'has', 'ceased', 'to', 'be', 'It', 's', 'expired', 'and', 'gone', 'to', 'meet', 'its', 'maker', 'This', 'is', 'a', 'late', 'parrot', 'It', 's', 'a', 'stiff', 'Bereft', 'of', 'life', 'it', 'rests', 'in', 'peace', 'If', 'you', 'hadn', 't', 'nailed', 'it', 'to', 'the', 'perch', 'it', 'would', 'be', 'pushing', 'up', 'the', 'daisies', 'It', 's', 'run', 'down', 'the', 'curtain', 'and', 'joined', 'the', 'choir', 'invisible', 'This', 'is', 'an']
Tokenized parrot text with stopwords removed: ['This', 'parrot', 'It', 'ceased', 'It', 'expired', 'gone', 'meet', 'maker', 'This', 'late', 'parrot', 'It', 'stiff', 'Bereft', 'life', 'rests', 'peace', 'If', 'nailed', 'perch', 'would', 'pushing', 'daisies', 'It', 'run', 'curtain', 'joined', 'choir', 'invisible', 'This']
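Notice that capitalized stop words like 'This', 'It', and 'If' slipped through: the membership test above is case-sensitive, and the stop-word list is all lowercase. A one-line variant that lowercases each token before checking it (a sketch reusing parrot_tokens and stop_words from above):
parrot_tokens_nostopwords = [w for w in parrot_tokens if w.lower() not in stop_words]
print(parrot_tokens_nostopwords)
With this version, 'This', 'It', and 'If' are removed as well, leaving mostly content words.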
fdist = FreqDist(parrot_tokens)
print(fdist)
<FreqDist with 45 samples and 65 outcomes>
fdist.plot()
plt.show()
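For comparison, here is a sketch of the same plot for the stop-word-free tokens; with the grammatical glue gone, content words like 'parrot' move toward the top of the chart:
fdist_nostop = FreqDist(parrot_tokens_nostopwords)
print(fdist_nostop)
fdist_nostop.plot()
plt.show()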
3. Stemming vs. lemmatization#
ps = PorterStemmer()
text = "waits waited hurries hurrying tries try"
tokenization = word_tokenize(text)
for w in tokenization:
    print("Stemming for {} is {}".format(w, ps.stem(w)))
Stemming for waits is wait
Stemming for waited is wait
Stemming for hurries is hurri
Stemming for hurrying is hurri
Stemming for tries is tri
Stemming for try is tri
wnl = WordNetLemmatizer()
text = "waits waited hurries hurrying tries try"
tokenization = word_tokenize(text)
for w in tokenization:
    print("Lemma for {} is {}".format(w, wnl.lemmatize(w)))
Lemma for waits is wait
Lemma for waited is waited
Lemma for hurries is hurry
Lemma for hurrying is hurrying
Lemma for tries is try
Lemma for try is try
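Why did the lemmatizer leave 'waited' and 'hurrying' alone? By default, WordNetLemmatizer treats every word as a noun. If we tell it we're dealing with verbs by passing pos='v', it should return 'wait', 'hurry', and 'try' across the board. A quick sketch, reusing the tokenization list from above:
for w in tokenization:
    print("Verb lemma for {} is {}".format(w, wnl.lemmatize(w, pos='v')))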
4. Document-Term Matrix (DTM)#
docs = ["I looked into the eyes of the giant fish. Mammal. Whatever.",
"It's not the size of the opponents, Elaine, it's the ferocity",
'You kept making all the stops?! They kept ringing the bell!']
vec = CountVectorizer()
X = vec.fit_transform(docs)
df = pd.DataFrame(X.toarray(), columns=vec.get_feature_names_out())
print(df)
all bell elaine eyes ferocity fish giant into it kept ... not \
0 0 0 0 1 0 1 1 1 0 0 ... 0
1 0 0 1 0 1 0 0 0 2 0 ... 1
2 1 1 0 0 0 0 0 0 0 2 ... 0
of opponents ringing size stops the they whatever you
0 1 0 0 0 0 2 0 1 0
1 1 1 0 1 0 3 0 0 0
2 0 0 1 0 1 2 1 0 1
[3 rows x 23 columns]
pd.set_option('display.max_columns', None)
df
|   | all | bell | elaine | eyes | ferocity | fish | giant | into | it | kept | looked | making | mammal | not | of | opponents | ringing | size | stops | the | they | whatever | you |
|---|-----|------|--------|------|----------|------|-------|------|----|------|--------|--------|--------|-----|----|-----------|---------|------|-------|-----|------|----------|-----|
| 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 2 | 0 | 1 | 0 |
| 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 1 | 0 | 3 | 0 | 0 | 0 |
| 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 2 | 1 | 0 | 1 |
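One thing to notice: CountVectorizer does its own preprocessing. By default it lowercases the text and keeps only tokens of two or more alphanumeric characters, which is why there is no column for 'I' or for the 's' in "it's". It can also drop English stop words for us; here's a sketch with the same docs list (the vec_nostop name is just for illustration):
vec_nostop = CountVectorizer(stop_words='english')
X_nostop = vec_nostop.fit_transform(docs)
print(vec_nostop.get_feature_names_out())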
5. Cosine similarity#
print(cosine_similarity(df, df))
[[1. 0.46358632 0.2981424 ]
[0.46358632 1. 0.35540933]
[0.2981424 0.35540933 1. ]]
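Under the hood, cosine similarity is just the dot product of two count vectors divided by the product of their lengths (Euclidean norms). A quick sanity check on the first pair of documents, using numpy (an extra import here just for the check):
import numpy as np
v0 = df.iloc[0].to_numpy()
v1 = df.iloc[1].to_numpy()
manual = np.dot(v0, v1) / (np.linalg.norm(v0) * np.linalg.norm(v1))
print(manual)  # should match the 0.4635... entry in the matrix above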
6. A real(ish) example!#
doc1 = """To be, or not to be, that is the question:
Whether 'tis nobler in the mind to suffer
The slings and arrows of outrageous fortune,
Or to take arms against a sea of troubles
And by opposing end them. To die—to sleep,
No more; and by a sleep to say we end
The heart-ache and the thousand natural shocks
That flesh is heir to: 'tis a consummation
Devoutly to be wish'd. To die, to sleep;
To sleep, perchance to dream—ay, there's the rub:
For in that sleep of death what dreams may come,
When we have shuffled off this mortal coil,
Must give us pause—there's the respect
That makes calamity of so long life."""
doc2 = """To-morrow, and to-morrow, and to-morrow,
Creeps in this petty pace from day to day
To the last syllable of recorded time,
And all our yesterdays have lighted fools
The way to dusty death. Out, out, brief candle!
Life’s but a walking shadow, a poor player
That struts and frets his hour upon the stage
And then is heard no more: it is a tale
Told by an idiot, full of sound and fury,
Signifying nothing."""
doc3 = """Betting on the horses: you can’t possibly win.
I don’t understand what we’re betting on.
Do the horses know that it’s a race? Are they aware?
What is going on here? After the race are the horses
walking back to the stable: “I was third, I was first,
I was ninth.” I think they’re thinking: “Oat bag,
I get my oat bag now!” “Oat bag time for me.” I gotta
bet on this idiot? I mean, I’m sure the horses have
some idea that the jockey is in a hurry. He’s on him,
he’s hitting him with this thing. He’s going:
“Come on, come on”. Obviously the jockey’s in a hurry.
But the horse must get to the end and go:
“We were just here!
What was the point of that?"""
corpus = [doc1, doc2, doc3]
X = vec.fit_transform(corpus)
real_df = pd.DataFrame(X.toarray(), columns=vec.get_feature_names_out())
real_df
[Output: the full document-term matrix, a DataFrame with one row per document and one column for every vocabulary term from 'ache' through 'you'.]
from sklearn.metrics.pairwise import cosine_similarity
print(cosine_similarity(real_df, real_df))
[[1. 0.60370413 0.39027206]
[0.60370413 1. 0.29592291]
[0.39027206 0.29592291 1. ]]
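To make the matrix easier to read, we can wrap it in a DataFrame with row and column labels (the labels below are just our shorthand for the three passages):
labels = ['Hamlet', 'Macbeth', 'Seinfeld']  # shorthand labels for doc1, doc2, doc3
sim_df = pd.DataFrame(cosine_similarity(real_df, real_df), index=labels, columns=labels)
print(sim_df)
The two Shakespeare soliloquies are noticeably more similar to each other (about 0.60) than either is to the horse-racing bit (0.39 and 0.30), which is roughly what we'd hope a bag-of-words comparison would show.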
# for fun, let's plot the freqdist for doc1
tokens3 = word_tokenize(doc1)
fdist = FreqDist(tokens3)
print(fdist)
<FreqDist with 84 samples and 136 outcomes>
# and make it look slightly nicer!
sns.set_style('darkgrid')
plt.figure(figsize=(15, 6))
fdist.plot()
plt.show() # Zipf's law!!
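That steep drop-off is Zipf's law at work: a handful of high-frequency words account for most of the tokens, with a long tail of words that appear only once. The classic way to see it is to plot frequency against rank on log-log axes; a sketch, reusing fdist from the cell above:
freqs = sorted(fdist.values(), reverse=True)   # frequencies, highest first
plt.figure(figsize=(6, 4))
plt.loglog(range(1, len(freqs) + 1), freqs, marker='o')
plt.xlabel('rank (log scale)')
plt.ylabel('frequency (log scale)')
plt.show()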