-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathDatapreprocessing.py
More file actions
126 lines (95 loc) · 4.35 KB
/
Copy pathDatapreprocessing.py
File metadata and controls
126 lines (95 loc) · 4.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# -*- coding: utf-8 -*-
"""AIProject_DataPreprocessing.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1fOG1ixys5sj11nfTqN-8_oztKuLeeYIq
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
!apt-get -qq install fonts-nanum
import matplotlib.font_manager as fm
import matplotlib as mpl
fontpath = '/usr/share/fonts/truetype/nanum/NanumGothic.ttf'
fm.fontManager.addfont(fontpath)
mpl.rc('font', family='NanumGothic')
mpl.rcParams['axes.unicode_minus'] = False
#from sklearn.model_selection import train_test_split
from google.colab import drive
drive.mount('/content/drive')
data = pd.read_csv('/content/drive/MyDrive/data_2024_3.csv', encoding='euc-kr')
# Shuffle the data
data = data.sample(frac=1).reset_index(drop=True)
print(data.head())
# Define split ratios
#train_ratio = 0.7
#validation_ratio = 0.15
#test_ratio = 0.15
# Split the data into training, validation, and testing sets
#train_data, temp_data = train_test_split(data, test_size=(1 - train_ratio))
#val_data, test_data = train_test_split(temp_data, test_size=test_ratio/(test_ratio + validation_ratio))
# Save the splits to CSV files if needed
#train_data.to_csv("train_data.csv", index=False)
#val_data.to_csv("val_data.csv", index=False)
#test_data.to_csv("test_data.csv", index=False)
data.dropna(subset=["관측시간", "관측지점"], inplace=True)
# 관측시간을 datetime 타입으로 변환
data["관측시간"] = pd.to_datetime(data["관측시간"])
data[["관측지점", "관측지점세부"]] = data["관측지점"].str.split('_', expand=True)
columns_to_fill = ["관측최대풍속"]
columns_to_fill = ["관측최대풍속", "(AVOC)관측최대풍속", "(BVOC)관측최대풍속"]
for column in columns_to_fill:
data[column] = data.groupby(["관측시간", "관측지점"])[column].transform(
lambda x: x.fillna(x.mean())
)
# 그래도 남아있는 결측치는 관측지점으로 채우기
data[column] = data.groupby('관측지점')[column].transform(
lambda x: x.fillna(x.mean())
)
# 그래도 남아있는 결측치는 관측시간으로 채우기
data[column] = data.groupby('관측시간')[column].transform(
lambda x: x.fillna(x.mean())
)
# 여전히 남아있는 결측치는 전체 평균으로 채우기
overall_mean = data[column].mean()
data[column].fillna(overall_mean, inplace=True)
# 결측치 처리 전 결측치 확인
if data.isnull().values.any():
print("데이터프레임에 결측치가 있습니다.")
else:
print("데이터프레임에 결측치가 없습니다.")
# 각 열의 결측치 수 확인
missing_values = data.isnull().sum()
print("각 열의 결측치 수:")
print(missing_values)
data.sort_values(by="관측시간", inplace=True)
data = data.drop(columns=['관측지점', '관측지점세부'])
data = data.drop(columns=['(AVOC)관측온도', '(AVOC)관측습도', '(AVOC)관측기압', '(AVOC)관측풍속', '(AVOC)관측풍향', '(AVOC)관측최대풍속', '(AVOC)관측미세먼지', '(AVOC)관측초미세먼지', '(AVOC)관측극초미세먼지', '(AVOC)배관관측온도', '(BVOC)관측온도', '(BVOC)관측습도', '(BVOC)관측기압', '(BVOC)관측풍속', '(BVOC)관측풍향', '(BVOC)관측최대풍속', '(BVOC)관측미세먼지', '(BVOC)관측초미세먼지', '(BVOC)관측극초미세먼지' ])
columns_to_visualize = [col for col in data.columns if col != '관측시간']
data[columns_to_visualize].hist(bins=15, figsize=(15, 10))
plt.show()
# Plot boxplots
plt.figure(figsize=(15, 10))
sns.boxplot(data=data)
plt.xticks(rotation=90)
plt.show()
# Compute the correlation matrix
corr = data[columns_to_visualize].corr()
# Generate a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm')
plt.show()
sns.pairplot(data, vars=columns_to_visualize, hue='관측미세먼지')
plt.show()
features = [col for col in columns_to_visualize if col not in ['관측미세먼지', '관측초미세먼지']]
target_vars = ['관측미세먼지', '관측초미세먼지']
for feature in features:
for target in target_vars:
plt.figure(figsize=(8, 6))
sns.scatterplot(x=data[feature], y=data[target])
plt.title(f'Scatter plot between {feature} and {target}')
plt.xlabel(feature)
plt.ylabel(target)
plt.legend()
plt.show()