-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtext_cleaning.py
More file actions
110 lines (87 loc) · 3.33 KB
/
text_cleaning.py
File metadata and controls
110 lines (87 loc) · 3.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import os
import time
import nltk
import string
import numpy as np
import pandas as pd
from textblob import TextBlob

# Download only the NLTK resources this module actually uses, instead of
# nltk.download('all'), which fetches every NLTK corpus (gigabytes) at
# import time. quiet=True suppresses the per-resource progress output.
for _resource in ('punkt', 'stopwords', 'wordnet', 'omw-1.4',
                  'averaged_perceptron_tagger'):
    nltk.download(_resource, quiet=True)

from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize

porter = PorterStemmer()                      # stemmer (kept for API parity; not used by clean())
stop_words = set(stopwords.words('english'))  # English stopword lookup set (O(1) membership)
lemmatizer = WordNetLemmatizer()              # WordNet-based lemmatizer used by clean()
def spellcheck(df, column):
    """
    Correct the spelling of every word in the given text column.

    Arguments:
        df - pandas dataframe with textual data
        column - name of the column to spell-correct
    Returns:
        df - the same dataframe with spellings corrected in place
             (shape same as original)
    Example:
        >>> df = spellcheck(df, col_name)
    """
    for i in df.index:
        # df.at[i, column] replaces the original chained indexing
        # (df[column][i] = ...), which raises SettingWithCopyWarning and may
        # silently write to a temporary copy instead of the dataframe.
        df.at[i, column] = TextBlob(df.at[i, column]).correct().raw
    return df
def clean(df, tasks, columns):
    """
    Perform text-cleaning steps on the given dataframe columns.

    Every column is lowercased and tokenized; the optional tasks below are
    applied on top of that, in the order listed.

    Arguments:
        df - pandas dataframe with textual data to clean
        tasks - list of tasks to perform:
            '- '    replace '-' with spaces
            '-_'    replace '-' with underscores
            'punct' remove all punctuation
            'num'   replace all digits with '#'
            'lemma' lemmatize words (verb POS)
            'stop'  remove English stopwords
            'spell' run TextBlob spellcheck
        columns - list of column names in df to clean
    Returns:
        df - cleaned dataframe (same shape as original)
    Example:
        >>> df1 = clean(df, task_list, col_list)
    """
    for column in columns:
        # Lowercase conversion (vectorized; equivalent to apply(str.lower))
        df[column] = df[column].str.lower()
        print(column + ": Converted to lowercase")
        # Tokenization: split on word boundaries and re-join with single
        # spaces, normalizing whitespace and punctuation spacing in one pass.
        df[column] = df[column].apply(lambda x: " ".join(word_tokenize(x)))
        print(column + ": Tokenized")
        if '- ' in tasks:
            # Split a-b into a and b; regex=False makes the literal intent
            # explicit (the pandas >= 2.0 default).
            df[column] = df[column].str.replace('-', ' ', regex=False)
            print(column + ": - Replaced")
        elif '-_' in tasks:
            # Join a-b into a_b
            df[column] = df[column].str.replace('-', '_', regex=False)
            print(column + ": - Replaced")
        if 'punct' in tasks:
            # Remove punctuation. regex=True is required: since pandas 2.0
            # str.replace defaults to literal matching, so the original
            # pattern-based call silently stopped removing anything. Raw
            # strings avoid invalid-escape warnings for \w and \s.
            df[column] = df[column].str.replace(r'[^\w\s]', '', regex=True)
            print(column + ": Removed punctions ")
        if 'num' in tasks:
            # Replace every digit with '#'
            df[column] = df[column].str.replace(r'[0-9]', '#', regex=True)
            print(column + ": Replaced Numbers ")
        if 'stop' in tasks:
            # Remove English stopwords (set membership is O(1))
            df[column] = df[column].apply(
                lambda x: " ".join(w for w in x.split() if w not in stop_words))
            print(column + ": StopWords Removed")
        if 'lemma' in tasks:
            # Lemmatize each word to its verb root form
            df[column] = df[column].apply(
                lambda x: " ".join(lemmatizer.lemmatize(w, pos='v')
                                   for w in x.split()))
            print(column + ": Root words Lemmatized")
        if 'spell' in tasks:
            # Correct word spellings via TextBlob (slow; row-by-row)
            df = spellcheck(df, column)
            print(column + ": Word spellings corrected")
    print("Null values:", df.isnull().values.any())
    return df