-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathNLP Pre-Processing.py
More file actions
104 lines (76 loc) · 3.89 KB
/
NLP Pre-Processing.py
File metadata and controls
104 lines (76 loc) · 3.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# Make a new column recording the length (character count) of each text message.
# Assumes a DataFrame called `df` whose "message" column contains the raw text.
df['length'] = df['message'].apply(len)
# Summary statistics (count/mean/std/min/quartiles/max) of the message lengths.
df.length.describe()

# Visualize the distribution of df.length with a histogram.
import matplotlib.pyplot as plt
import seaborn as sns
# BUG FIX: '%matplotlib inline' is an IPython/Jupyter magic command and is a
# SyntaxError in a plain .py file -- run it only inside a notebook cell.
# %matplotlib inline
df['length'].plot(bins=50, kind='hist')
# --- Delete/remove punctuation from text ---
import string  # string.punctuation == '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
mess = 'Hello. Sample message! Does this text has punctuation? Answer: no \ yes'  # Example
# Keep only the characters that are NOT punctuation.
# BUG FIX: the original mixed the names `nopunct` and `nopunc`, so the very
# next `print(nopunc)` raised NameError; use one consistent name throughout.
nopunc = [char for char in mess if char not in string.punctuation]
print(nopunc)
# ['H', 'e', 'l', 'l', 'o', ' ', 'S', 'a', 'm', 'p', 'l', 'e', ' ', 'm', 'e', 's', 's',
# 'a', 'g', 'e', ' ', 'D', 'o', 'e', 's', ' ', 't', 'h', 'i', 's', ' ', 't', 'e', 'x',
# 't', ' ', 'h', 'a', 's', ' ', 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n',
# ' ', 'A', 'n', 's', 'w', 'e', 'r', ' ', 'n', 'o', ' ', ' ', 'y', 'e', 's']
# Join the characters again to form the string.
nopunc = ''.join(nopunc)
print(nopunc)
# Output: Hello Sample message Does this text has punctuation Answer no  yes
# --- Removing stop words with NLTK in Python ---
# A "stop word" is a very common word (such as "the", "a", "an", "in") that a
# search engine is programmed to ignore, both when indexing entries for
# searching and when retrieving them as the result of a search query.
from nltk.corpus import stopwords

# Peek at the first ten English stop words.
stopwords.words('english')[0:10]
# Output: ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your']

# Application: drop every stop word from the punctuation-free example text
# (`nopunc` was built in the punctuation-removal step above).
english_stops = stopwords.words('english')
clean_message = [w for w in nopunc.split() if w.lower() not in english_stops]
clean_message
# Output: ['Hello', 'Sample', 'message', 'text', 'punctuation', 'Answer', 'yes']
# Let's create one function that applies the whole pipeline above.
def text_process(message):
    """Clean a raw text message for bag-of-words modelling.

    Steps:
      1. Remove all punctuation characters.
      2. Remove all English stop words (case-insensitive).
      3. Return the remaining words as a list.

    Parameters
    ----------
    message : str
        The raw text to clean.

    Returns
    -------
    list of str
        The cleaned, tokenized words.
    """
    # 1. Keep only the characters that are not punctuation, then rebuild the string.
    nopunc = ''.join(char for char in message if char not in string.punctuation)
    # PERF FIX: build the stop-word set ONCE. The original evaluated
    # stopwords.words('english') inside the comprehension condition, i.e. a
    # fresh O(n) list scan (and corpus lookup) for every single word.
    stops = set(stopwords.words('english'))
    # 2./3. Drop stop words and return the cleaned token list.
    return [word for word in nopunc.split() if word.lower() not in stops]
# Application: add a column holding the fully cleaned/tokenized messages.
# BUG FIX: the original used mismatched quotes -- df['clean_mess"] -- which is
# a SyntaxError; both quotes must match.
df['clean_mess'] = df['message'].apply(text_process)

# We can imagine a 2-dimensional matrix where one dimension is the entire
# vocabulary (1 row per word) and the other dimension is the messages,
# in this case a column per text message. For example:
#            Message1  Message2  ...  MessageN
# Word1Count    0         1      ...     0
# Word2Count    0         0      ...     0
#   ...         1         2      ...     0
# WordNCount    0         1      ...     1
#
# Since there are so many messages, we can expect a lot of zero counts for the
# presence of any given word in a text (i.e. the matrix is sparse).
# APPLICATION: bag-of-words with scikit-learn
from sklearn.feature_extraction.text import CountVectorizer

# Many arguments and parameters can be passed to CountVectorizer; here we
# specify the analyzer to be our own previously defined function text_process.
mess_vect = CountVectorizer(analyzer=text_process).fit(df['message'])

# Print total number of vocab words.
print(len(mess_vect.vocabulary_))

# Get the word in row K (K is a number).
# BUG FIX: K was never defined (NameError) -- give it an example value first.
K = 0  # example row index; change as needed
# NOTE: get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is the current equivalent (available since 1.0).
print(mess_vect.get_feature_names_out()[K])

# Transform our DataFrame of messages into a sparse document-term matrix.
mess_word = mess_vect.transform(df['message'])

# Shape of the matrix.
print('Shape of Sparse Matrix: ', mess_word.shape)
# Count of positive (stored, non-zero) occurrences.
print('Amount of Non-Zero occurences: ', mess_word.nnz)
# Next step: sparsity and TF-IDF weighting.