-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathimport_cleaning.py
More file actions
118 lines (85 loc) · 4.64 KB
/
import_cleaning.py
File metadata and controls
118 lines (85 loc) · 4.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import numpy as np
import pandas as pd
import pickle
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 50)
############# IMPORT AND CLEAN DATA ######################################
# import data
data = pd.read_csv('perronkante.csv', delimiter=';')
# remove columns with more than 50% missing values
data = data.loc[:, data.isnull().mean() < 0.5]
# separate single coordinates
data[['Start_platform_N_coord', 'Start_platform_E_coord']] = data['1_coord'].str.split(',', expand=True)
data[['End_platform_N_coord', 'End_platform_E_coord']] = data['2_coord'].str.split(',', expand=True)
# convert into float64
data['Start_platform_N_coord'] = data['Start_platform_N_coord'].astype(np.float64)
data['Start_platform_E_coord'] = data['Start_platform_E_coord'].astype(np.float64)
data['End_platform_N_coord'] = data['End_platform_N_coord'].astype(np.float64)
data['End_platform_E_coord'] = data['End_platform_E_coord'].astype(np.float64)
# calculate mean of the starting and ending point of the edge
data['N_coord'] = data[['Start_platform_N_coord', 'End_platform_N_coord']].mean(axis=1).fillna('unknown')
data['E_coord'] = data[['Start_platform_E_coord', 'End_platform_E_coord']].mean(axis=1).fillna('unknown')
# Replacing all NaN values with the same declaration pd.NA
data= data.fillna(pd.NA)
# exclude the platforms that are of type 'Dienstperron'
data = data[data['Platform type'] != 'Dienstperron']
# remove 'P' prefix from platform edge height and any non-digit characters
data['Platform edge height'] = data['Platform edge height'].str.replace(r'\D', '', regex=True)
data['Platform edge height'] = pd.to_numeric(data['Platform edge height'])
data = data.dropna(subset=['Platform edge height'])
# calculate overall length of platform
# get location of column length of platform edge
Length_index = data.columns.get_loc('Length of platform edge')
# Drop rows where 'Platform number' is NA
data_filtered = data.dropna(subset=['Platform number'])
# Group by 'Stop name' and 'customer track number' and calculate the sum of platform lengths
platform_length_sum = data_filtered.groupby(['Stop name', 'Platform number'])['Length of platform edge'].sum()
# Map the summed platform lengths back to the original DataFrame
new_col = data.apply(lambda row: platform_length_sum.get((row['Stop name'], row['Platform number']), pd.NA), axis=1)
#reorder new column and change name toLength platform total
data.insert(Length_index + 1, "Length platform total", new_col)
na_platform_rows = data_filtered[data_filtered['Platform number'].isna()]
not_na_platform_rows = (data_filtered['Platform number'] !='<NA>')
print("***********************************welche platform number sind na********************************")
# Print or work with the filtered rows
print(na_platform_rows)
print(data)
# delete columns which aren't relevant to SBB Platform Manager
data = data.drop(['Start_platform_N_coord', 'Start_platform_E_coord', 'End_platform_N_coord', 'End_platform_E_coord',
'Length of platform edge', 'BO_IPID', 'Didok number','IPID', 'FID', 'OPUIC', 'lod',
'1_coord', '2_coord'], axis = 1)
# checking data types
types =data.dtypes
print(types)
# correcting data types
print("******************** HERE ARE BEING TYPES CONVERTED***************************")
def convert_column_types(data, string_columns): #category_columns,
"""
Convert specified columns in a DataFrame to ('category' &) 'string' types,
and replace 'Not Applicable' with pd.NA in string columns.
Parameters:
data (pd.DataFrame): The DataFrame whose columns need to be converted.
category_columns (list): List of column names to be converted to 'category' type.
string_columns (list): List of column names to be processed for 'Not Applicable' and converted to 'string' type.
Returns:
pd.DataFrame: The DataFrame with converted column types.
"""
# Convert specified columns to 'category' type
#for col in category_columns:
# data[col] = data[col].astype('category')
# Convert specified columns to 'string' type
for col in string_columns:
# Replace 'Not Applicable' with pd.NA (essential for testing)
data.loc[(data[col] == 'NaN', col)] = pd.NA
data[col] = data[col].astype('string')
return data
# category_columns = ['Station abbreviation', 'Station name']
string_columns = ['Platform type', 'Platform gap filler', 'Material', 'Edge type', 'Step', 'Height']
# apply function on dataframe
data = convert_column_types(data, string_columns) #category_columns,
# checking data
print(data.dtypes)
# saving data
data.to_pickle('dataframe.pkl')
print(data)
print("Above you see the dataframe tidied up.")