-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathPreprocessing_Table.py
More file actions
121 lines (104 loc) · 4.02 KB
/
Preprocessing_Table.py
File metadata and controls
121 lines (104 loc) · 4.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# This module is currently used only for the embedding part.
# TODO: merge it with the equivalent steps in the Fullphase code into a single preprocessing file.
import html
import string
import re
import pandas as pd
import LabelColumn as LC
import ReferenceColumn as RC
from dateutil.parser import parse
def get_col_dtype(col):
    """
    Infer the datatype of a pandas column.

    Only columns whose dtype is ``object`` are inspected; for any other
    column the existing dtype is returned unchanged.

    For an object column the unique non-null values are passed, in this
    order, to ``pd.to_datetime``, ``pd.to_numeric`` and ``pd.to_timedelta``.
    The dtype of the first conversion that succeeds is returned.

    input:   col: a pandas Series representing a df column.
    output:  the inferred dtype, or the string "object" when none of the
             conversions succeeds.
    """
    if col.dtype != "object":
        return col.dtype

    # Order matters: datetime is attempted before numeric, so values that
    # both parsers accept are reported as datetime.
    for convert in (pd.to_datetime, pd.to_numeric, pd.to_timedelta):
        try:
            return convert(col.dropna().unique()).dtype
        except Exception:
            # A failed conversion only means "not this type". Catching
            # Exception (rather than a bare except) lets KeyboardInterrupt
            # and SystemExit propagate instead of being swallowed.
            continue
    return "object"
def Preprocessing_Table(T):
    """
    Clean the cells of table T and locate its label and reference columns.

    Steps, in order:
      1. Per column: unescape HTML character references, strip leading and
         trailing whitespace, then remove the date tokens recognised by
         ``dateutil.parser.parse`` (fuzzy mode).
      2. Per column: remove every character found in ``string.punctuation``,
         then strip leading and trailing whitespace again.
      3. Drop int64 / float64 / complex128 columns, blank out percentage
         values and cells made only of digits and spaces, and drop every
         column that ``get_col_dtype`` then reports as int64, float64 or
         datetime64[ns].
      4. Renumber the remaining columns 0..n-1 and ask
         ``LC.getLabelColumn`` / ``RC.getReferenceColumns`` for the label
         and reference columns.

    input:   T: a pandas DataFrame. Steps 1 and 2 assign into it, so the
             caller's object is modified.
    output:  tuple (T, labelColumn, referenceColumns), where T is the
             cleaned table with renumbered columns.
    """
    # Labels of the columns dropped in step 3.
    removed_col_index = []

    # Step 1: HTML references, surrounding whitespace, dates.
    for col in T:
        try:
            T[col] = [html.unescape(elem) for elem in T[col]]
            # remove leading whitespace
            T[col] = [re.sub(r"^\s+", "", elem) for elem in T[col]]
            # remove trailing whitespace
            T[col] = [re.sub(r"\s+$", "", elem) for elem in T[col]]
            try:
                # With fuzzy_with_tokens=True, parse() returns
                # (datetime, skipped_tokens); joining the skipped tokens
                # keeps the text that was not part of the date.
                T[col] = [
                    ''.join(parse(elem, fuzzy_with_tokens=True)[1])
                    for elem in T[col]
                ]
            except ValueError:
                # Raised as soon as one cell holds no parsable date; the
                # assignment never happens, so the whole column is left
                # as it was.
                print('dates handled!')
            except OverflowError:
                # Leave the column unchanged. Retrying the same expression
                # would only fail the same way.
                pass
        except TypeError:
            # A cell is not a string (html.unescape / re.sub rejected it).
            print("Type not suitable for iteration")

    # Step 2a: remove punctuation.
    for col in T:
        try:
            T[col] = [
                ''.join([ch for ch in elem if ch not in string.punctuation])
                for elem in T[col]
            ]
        except ValueError:
            print('punctuation not removed!')
        except Exception:
            # Deliberately best-effort: report and move to the next column.
            print('punctuation cant be fixed!')

    # Step 2b: punctuation removal can expose new leading/trailing spaces.
    for col in T:
        try:
            T[col] = [re.sub(r"^\s+", "", elem) for elem in T[col]]
            T[col] = [re.sub(r"\s+$", "", elem) for elem in T[col]]
        except TypeError:
            print("Type not suitable for iteration")

    # Step 3: remove numeric content.
    # Explicit dtype names are used because the 'float_' and 'complex_'
    # aliases were removed in NumPy 2.0.
    T = T.select_dtypes(exclude=['int64', 'float64', 'complex128'])
    # Blank out percentages such as "12.5%". This works on the whole frame,
    # so it is done once rather than once per column.
    T = T.replace(r'[-+]?[0-9]*\.?[0-9]*%', '', regex=True)
    # Iterate over a snapshot of the labels: T is rebound when a column is
    # dropped inside the loop.
    for col in list(T.columns):
        try:
            # blank out cells that consist only of digits and spaces
            T[col] = [re.sub(r'^[0-9 ]+$', '', elem) for elem in T[col]]
        except TypeError:
            print('cannot replace numeric values!')
        col_dtype = get_col_dtype(T[col])
        print("This is dtype:", col_dtype)
        # remove the column if its content is numeric or a date
        if col_dtype in ['int64', 'float64', 'datetime64[ns]']:
            # col is the label of the column that has to be removed
            T = T.drop(col, axis=1)
            removed_col_index.append(col)

    # Step 4: renumber the columns 0..n-1 by resetting the index of the
    # transposed frame and transposing back.
    T = T.T.reset_index(drop=True).T

    #----------------------------------Sample phase----------------------------------------
    # labelColumn <- getLabelColumn(T)
    labelColumn = LC.getLabelColumn(T)
    print("-" * 50)
    print(" " * 50)
    print("-" * 50)
    #TODO: change this index keeping to a sustainable method
    #you can make annotations dictionary of dictionaries instead of dictionary of lists that they are right now
    # NOTE(review): this compares labelColumn with the labels of the removed
    # columns, so it presumably expects integer column labels - confirm
    # against the callers.
    for rci in removed_col_index:
        if labelColumn > rci:
            labelColumn = labelColumn - 1
            print("RCI:", rci)
            print("label column after drop index substraction:", labelColumn)

    # referenceColumns <- getReferenceColumns(T)
    referenceColumns = RC.getReferenceColumns(T, labelColumn)
    return (T, labelColumn, referenceColumns)