-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathremove_html.py
More file actions
113 lines (58 loc) · 2.24 KB
/
remove_html.py
File metadata and controls
113 lines (58 loc) · 2.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# coding: utf-8
# # Dependancies
# In[1]:
import pandas as pd
from bs4 import BeautifulSoup
import re
import html
def cleanhtml(raw_html):
cleanr = re.compile('<.*?>|&.{4};')
cleantext = re.sub(cleanr, '', str(raw_html))
# replacing the special characters
# cleanr = re.compile ('\\n')
# cleantext = re.sub(cleanr, ' ', cleantext)
clean = re.sub('\s+',' ',cleantext)
return html.unescape(clean) # replaces the special characters
#same using beautifulsoup
def remove_html_escape(html):
return BeautifulSoup(str(html), "lxml").text
# In[9]:
file = input("Enter CSV File name (without '.csv' at the end): ")
# In[10]:
# reading the file
try:
d = pd.read_csv("%s.csv" % file )
except IOError:
print ("Error: can\'t find file or read data")
else:
print ("File read successfully")
# In[11]:
a = pd.DataFrame(d)
print("File preview: \n",a.head(5))
# In[15]:
col = input("Enter column name: ")
try:
a[col][0:5]
except:
print("Error in fetching column. Please check the name '%s' from the table preview above" %col)
else:
print("Column read successfully: \n", a[col][0:5])
# In[16]:
a['clean'] = a[col].apply(cleanhtml)
# In[17]:
a['clean_bs'] = a[col].apply(remove_html_escape)
# In[18]:
# a.head(5)
# In[19]:
a['parity'] = a[col].str.len() - a['clean'].str.len() #using regex
a['parity_bs'] = a[col].str.len() - a['clean_bs'].str.len() #using beautifulsoup
# In[24]:
# a.tail(5)
print ("------------------------------------------------- \n HTML has been removed from your column contents \n------------------------------------------------- \n ")
print ("column 'clean' contins regex replacement of anything in between < > or &; or \\* \nin otherwords, it removes any html with the space character, no conversion of special characters to respective ASCII values.")
print ("column 'clean_bs' contains html removed with special characters replaced with their respective ASCII characters.")
print ("Parity columns show the difference in number of characters from the original html")
print ("Output table: \n %s" %a.head(5))
# In[13]:
a.to_csv("%shtml_cleaned_output.csv"%file)
print("New file '%shtml_cleaned_output.csv' generated with cleaned columns. Check in the same direcotry"%file)