# remove_html.py - from the manghat/python-remove-html-from-csv repository on GitHub (master branch)

# coding: utf-8

# # Dependencies

# In[1]:


import pandas as pd
from bs4 import BeautifulSoup
import re
import html

def cleanhtml(raw_html):
    """Strip HTML markup from *raw_html* using regular expressions.

    The argument is passed through ``str()`` first, so non-string values
    (numbers, ``None``, ...) are processed as their text representation.

    Steps, in order:

    1. Delete every ``<...>`` span (shortest match, not crossing a newline)
       and every ``&`` + exactly four characters + ``;`` sequence, which
       covers entities such as ``&nbsp;`` and ``&quot;``.
    2. Collapse each run of whitespace (including newlines) into one space.
    3. Decode whatever character references remain with ``html.unescape``
       (for example ``&amp;`` becomes ``&``).

    Returns:
        The cleaned text as a ``str``.
    """
    # Raw strings keep the backslashes for the regex engine; a plain '\s'
    # literal is an invalid string escape and triggers a warning.
    markup_pattern = re.compile(r'<.*?>|&.{4};')
    without_markup = markup_pattern.sub('', str(raw_html))
    single_spaced = re.sub(r'\s+', ' ', without_markup)
    return html.unescape(single_spaced)  # decodes the remaining entities


# Same HTML removal, this time using BeautifulSoup

def remove_html_escape(html):
    """Return the text content of *html* as parsed by BeautifulSoup.

    The value is converted with ``str()`` and parsed with the "lxml"
    parser; the ``.text`` of the resulting document is returned.

    Note: inside this function the parameter name ``html`` shadows the
    module-level ``html`` import.
    """
    markup = str(html)
    document = BeautifulSoup(markup, features="lxml")
    return document.text


# In[9]:


# Ask for the CSV's base name; the ".csv" extension is appended when reading.
file = input("Enter CSV File name (without '.csv' at the end): ")


# In[10]:


# Read the file into `d`. Besides a missing or unreadable file (OSError),
# pandas reports empty or malformed CSV content through its own exceptions.
try:
    d = pd.read_csv("%s.csv" % file)
except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError):
    print ("Error: can\'t find file or read data")
    # Stop here: `d` was never assigned, so every later step would only
    # fail with a confusing NameError.
    raise SystemExit(1)
else:
    print ("File read successfully")


# In[11]:


# Build the working DataFrame `a` from the parsed CSV and show its first
# five rows so the user can see the available column names.
a = pd.DataFrame(data=d)
first_rows = a.head(n=5)
print("File preview: \n", first_rows)


# In[15]:


# Ask which column holds the HTML and confirm it exists before processing.
col = input("Enter column name: ")

try:
    # Slice once and reuse the result for the success message below.
    col_preview = a[col][0:5]
except KeyError:
    # An unknown column label raises KeyError. Catching only that keeps
    # Ctrl-C and unrelated errors visible instead of swallowing them.
    print("Error in fetching column. Please check the name '%s' from the table preview above" %col)
    # Nothing below can work without a valid column, so stop here.
    raise SystemExit(1)
else:
    print("Column read successfully: \n", col_preview)


# In[16]:


# Take one private copy of the source column up front. Reading `a[col]`
# again after the assignments below would pick up already-overwritten data
# if the chosen column happens to be named 'clean', 'clean_bs' or 'parity'.
source = a[col].copy()

# Length of the original text, computed once and reused for both parity
# columns. NOTE: the `.str` accessor requires a string-like column.
source_len = source.str.len()


# In[16]:


a['clean'] = source.apply(cleanhtml)  # regex-based cleaning


# In[17]:


a['clean_bs'] = source.apply(remove_html_escape)  # BeautifulSoup-based cleaning


# In[19]:


# Parity = number of characters removed relative to the original cell.
a['parity'] = source_len - a['clean'].str.len() #using regex
a['parity_bs'] = source_len - a['clean_bs'].str.len() #using beautifulsoup


# In[24]:


# a.tail(5)

# Explain the generated columns to the user, one notice per line of output.
summary_notices = (
    "------------------------------------------------- \n HTML has been removed from your column contents \n------------------------------------------------- \n ",
    "column 'clean' contins regex replacement of anything in between < > or &; or \\* \nin otherwords, it removes any html with the space character, no conversion of special characters to respective ASCII values.",
    "column 'clean_bs' contains html removed with special characters replaced with their respective ASCII characters.",
    "Parity columns show the difference in number of characters from the original html",
)
for notice in summary_notices:
    print(notice)

# Show the first five rows of the result, including the new columns.
output_preview = a.head(5)
print("Output table: \n %s" % output_preview)


# In[13]:


# Write the augmented table next to the input, reusing the base name the
# user entered as the prefix of the output file name.
output_name = "%shtml_cleaned_output.csv" % file
a.to_csv(output_name)

completion_notice = "New file '%shtml_cleaned_output.csv' generated with cleaned columns. Check in the same direcotry" % file
print(completion_notice)