SBB-Platform-Manager/import_cleaning.py at main · PythonDataScience24/SBB-Platform-Manager · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import numpy as np
import pandas as pd
import pickle

pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 50)

############# IMPORT AND CLEAN DATA ######################################

# import data
data = pd.read_csv('perronkante.csv', delimiter=';')

# remove columns with more than 50% missing values
data = data.loc[:, data.isnull().mean() < 0.5]

# separate single coordinates
data[['Start_platform_N_coord', 'Start_platform_E_coord']] = data['1_coord'].str.split(',', expand=True)
data[['End_platform_N_coord', 'End_platform_E_coord']] = data['2_coord'].str.split(',', expand=True)

# convert into float64
data['Start_platform_N_coord'] = data['Start_platform_N_coord'].astype(np.float64)
data['Start_platform_E_coord'] = data['Start_platform_E_coord'].astype(np.float64)
data['End_platform_N_coord'] = data['End_platform_N_coord'].astype(np.float64)
data['End_platform_E_coord'] = data['End_platform_E_coord'].astype(np.float64)

# calculate mean of the starting and ending point of the edge
data['N_coord'] = data[['Start_platform_N_coord', 'End_platform_N_coord']].mean(axis=1).fillna('unknown')
data['E_coord'] = data[['Start_platform_E_coord', 'End_platform_E_coord']].mean(axis=1).fillna('unknown')

# Replacing all NaN values with the same declaration pd.NA
data= data.fillna(pd.NA)

# exclude the platforms that are of type 'Dienstperron'
data = data[data['Platform type'] != 'Dienstperron']

# remove 'P' prefix from platform edge height and any non-digit characters
data['Platform edge height'] = data['Platform edge height'].str.replace(r'\D', '', regex=True)

data['Platform edge height'] = pd.to_numeric(data['Platform edge height'])

data = data.dropna(subset=['Platform edge height'])

# calculate overall length of platform
# get location of column length of platform edge
Length_index = data.columns.get_loc('Length of platform edge')

# Drop rows where 'Platform number' is NA
data_filtered = data.dropna(subset=['Platform number'])


# Group by 'Stop name' and 'customer track number' and calculate the sum of platform lengths
platform_length_sum = data_filtered.groupby(['Stop name', 'Platform number'])['Length of platform edge'].sum()

# Map the summed platform lengths back to the original DataFrame
new_col = data.apply(lambda row: platform_length_sum.get((row['Stop name'], row['Platform number']), pd.NA), axis=1)

#reorder new column and change name toLength platform total
data.insert(Length_index + 1, "Length platform total", new_col)

na_platform_rows = data_filtered[data_filtered['Platform number'].isna()]
not_na_platform_rows = (data_filtered['Platform number'] !='<NA>')
print("***********************************welche platform number sind na********************************")
# Print or work with the filtered rows
print(na_platform_rows)
print(data)

# delete columns which aren't relevant to SBB Platform Manager
data = data.drop(['Start_platform_N_coord', 'Start_platform_E_coord', 'End_platform_N_coord', 'End_platform_E_coord',
                  'Length of platform edge', 'BO_IPID', 'Didok number','IPID', 'FID', 'OPUIC', 'lod',
                  '1_coord', '2_coord'], axis = 1)

# checking data types
types =data.dtypes
print(types)

# correcting data types
print("******************** HERE ARE BEING TYPES CONVERTED***************************")

def convert_column_types(data, string_columns): #category_columns,
    """
    Convert specified columns in a DataFrame to ('category' &) 'string' types,
    and replace 'Not Applicable' with pd.NA in string columns.

    Parameters:
    data (pd.DataFrame): The DataFrame whose columns need to be converted.
    category_columns (list): List of column names to be converted to 'category' type.
    string_columns (list): List of column names to be processed for 'Not Applicable' and converted to 'string' type.


    Returns:
    pd.DataFrame: The DataFrame with converted column types.
    """
    # Convert specified columns to 'category' type
    #for col in category_columns:
    #    data[col] = data[col].astype('category')

    # Convert specified columns to 'string' type
    for col in string_columns:
        # Replace 'Not Applicable' with pd.NA (essential for testing)
        data.loc[(data[col] == 'NaN', col)] = pd.NA
        data[col] = data[col].astype('string')

    return data


# category_columns = ['Station abbreviation', 'Station name']
string_columns = ['Platform type', 'Platform gap filler', 'Material', 'Edge type', 'Step', 'Height']

# apply function on dataframe
data = convert_column_types(data, string_columns) #category_columns,

# checking data
print(data.dtypes)

# saving data
data.to_pickle('dataframe.pkl')
print(data)
print("Above you see the dataframe tidied up.")