Synthesis-Project-I/DATASET.py at main · laurabolta/Synthesis-Project-I · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# Import necessary libraries
import pandas as pd
from Preparing_Data import functions
# PATHS TO THE CSV FILES -----------------------------------------------------------------

student_success_csv_path = './Students/Estudiants_èxit_accés_anònim.csv'
student_marks_csv_path = './Students/Estudiants_notes_assignatures_anònim.csv'
student_abandonment_csv_path = './Students/Estudiants_abandonament_anònim.csv'

# LOAD DATA -----------------------------------------------------------------

background_df = pd.read_csv(student_success_csv_path)
grades_df = pd.read_csv(student_marks_csv_path)
abandonment_df = pd.read_csv(student_abandonment_csv_path)

# Preprocess Background Data -----------------------------------------------------------------

# Rename for simplicity
background_df.rename(columns={'Nota d\'accés (preinscripció)': 'Nota d\'accés'}, inplace=True)
background_df.rename(columns={'S/N Discapacitat': 'Discapacitat'}, inplace=True)

# Preprocess Grades Data -----------------------------------------------------------------

# Rename columns with students ids for the name to match
grades_df.rename(columns={'Alumne': 'Id Anonim'}, inplace=True)
# Ensure grades are numeric
grades_df['Nota_assignatura'] = pd.to_numeric(grades_df['Nota_assignatura'], errors='coerce')

# Preprocess Abandonment Data -----------------------------------------------------------------

# Choose relevant abandonment features
abandonment_features = [
    'Id Anonim',
    'Nombre Abandonaments Universitat Reals'
]

# Keep only relevant features
abandonment_df = abandonment_df[abandonment_features]
# Replace name for simplicity
abandonment_df.rename(columns={'Nombre Abandonaments Universitat Reals': 'Abandonament'}, inplace=True)

# MERGE THE DATA -----------------------------------------------------------------

# First, merge background info with grades
merged_df = pd.merge(
    grades_df,
    background_df.drop(columns=['Taxa èxit']),  # drop 'Taxa èxit' from the students side
    on=['Id Anonim', 'Estudi', 'Curs acadèmic', 'Sexe'],
    how='left'
)

# Add 'Taxa èxit' as a separate field
taxa_exit_map = background_df.set_index('Id Anonim')['Taxa èxit'].to_dict()
merged_df['Taxa èxit'] = merged_df['Id Anonim'].map(taxa_exit_map)

# Ensure only one row per student in abandonment_df
abandonment_df = abandonment_df.drop_duplicates(subset='Id Anonim')

# Then, merge abandonment info
merged_df = merged_df.merge(abandonment_df, on='Id Anonim', how='left')

# Visualize merged dataframe
print(merged_df.head())
print(f"Number of entries: {len(merged_df)}")
print("--------------------------------------")
# Visualize features present in our dataset
print(merged_df.columns)
print("--------------------------------------")
# Visualize an example of an entry in our dataset
print(merged_df.iloc[10])
print("--------------------------------------")

# PREPROCESS AND CLEAN DATA -----------------------------------------------------------------

# Search for missing values
print('Missing values per column:\n', merged_df.isnull().sum())
print("--------------------------------------")
print(merged_df[merged_df.isnull().any(axis=1)]) # print rows where values are missing
print("--------------------------------------")

"""
There are 56 missing values in total, all related to information about the parents' studies.
Upon inspecting the rows, we found that all missing values correspond to the same student (identified by the same anonymous ID).
Since this is a single, specific case, we can safely remove this student from the dataset without significantly impacting the model's training.
"""

merged_df.dropna(subset=['Estudis Mare', 'Estudis Pare'], inplace=True) # remove those entries

# Now, check for missing values again and count the number of entries
print('Missing values per column:\n', merged_df.isnull().sum())
print(f"Number of entries: {len(merged_df)}")
print("--------------------------------------")

# Clean the resulting dataframe
clean_df = functions.load_and_clean_data(merged_df)
clean_df = clean_df.drop_duplicates()
clean_df = clean_df.drop(columns='curs_academic_acces_estudi') # remove not necessary features
clean_df.to_csv("CLEANDATASET.csv", index=False)

# Convert the dataframe to only numerical values
numerical_df = functions.df_to_numerical(clean_df)

# Save as a csv file if needed
numerical_df.to_csv("NUMDATASET.csv", index=False)

# Visualize an example of an entry in our dataset
print('Example of an entry of the cleaned DataFrame:\n')
print(numerical_df.iloc[10])
print(clean_df['abandonament'].value_counts())