Skip to content

Commit fd660c3

Browse files
committed
Data Transformation part completed
1 parent da4317c commit fd660c3

6 files changed

Lines changed: 215 additions & 0 deletions

File tree

final_model/preprocessor.pkl

2.28 MB
Binary file not shown.
Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
import os
2+
import sys
3+
4+
import numpy as np
5+
import pandas as pd
6+
from sklearn.impute import KNNImputer
7+
from sklearn.pipeline import Pipeline
8+
9+
from network_security.constant.training_pipeline import (
10+
DATA_TRANSFORMATION_IMPUTER_PARAMS,
11+
TARGET_COLUMN,
12+
)
13+
from network_security.entity.artifact_entity import (
14+
DataTransformationArtifact,
15+
DataValidationArtifact,
16+
)
17+
from network_security.entity.config_entity import DataTransformationConfig
18+
from network_security.exception.exception import NetworkSecurityException
19+
from network_security.logging.logger import logging
20+
from network_security.utils.main_utils.utils import save_numpy_array_data, save_object
21+
22+
23+
class DataTransformation:
    """Impute missing feature values and persist the transformed datasets.

    The stage reads the validated train/test CSV files, fits a KNN-imputer
    pipeline on the training features only, applies it to both splits, and
    saves the resulting arrays together with the fitted preprocessor.
    """

    def __init__(
        self,
        data_validation_artifact: DataValidationArtifact,
        data_transformation_config: DataTransformationConfig,
    ) -> None:
        """Store the upstream artifact and this stage's configuration.

        Args:
            data_validation_artifact: Output of the validation stage; its
                ``valid_train_file_path`` and ``valid_test_file_path`` are read.
            data_transformation_config: Destination paths for the transformed
                arrays and the fitted preprocessor.
        """
        # Plain attribute assignment cannot fail, so no try/except is needed.
        self.data_validation_artifact: DataValidationArtifact = (
            data_validation_artifact
        )
        self.data_transformation_config: DataTransformationConfig = (
            data_transformation_config
        )

    @staticmethod
    def read_data(file_path) -> pd.DataFrame:
        """Load a CSV file into a DataFrame.

        Args:
            file_path: Location of the CSV file.

        Returns:
            The parsed DataFrame.

        Raises:
            NetworkSecurityException: If the file cannot be read or parsed.
        """
        try:
            return pd.read_csv(file_path)
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    @staticmethod
    def _split_features_and_target(df: pd.DataFrame):
        """Split *df* into (features, target), mapping target value -1 to 0."""
        features = df.drop(columns=[TARGET_COLUMN])
        # Presumably the raw labels are -1/1; only the -1 -> 0 mapping is
        # performed here, every other value is left untouched.
        target = df[TARGET_COLUMN].replace(-1, 0)
        return features, target

    @classmethod
    def get_data_transformer_object(cls) -> Pipeline:
        """Build the (unfitted) preprocessing pipeline.

        A ``KNNImputer`` is created from ``DATA_TRANSFORMATION_IMPUTER_PARAMS``
        and wrapped as the single ``"imputer"`` step of a ``Pipeline``.

        Returns:
            A Pipeline object whose only step is the KNN imputer.

        Raises:
            NetworkSecurityException: If the imputer or pipeline cannot be built.
        """
        logging.info(
            "Entered get_data_transformer_object method of DataTransformation class"
        )
        try:
            imputer: KNNImputer = KNNImputer(**DATA_TRANSFORMATION_IMPUTER_PARAMS)
            logging.info(
                f"Initialise KNNImputer with {DATA_TRANSFORMATION_IMPUTER_PARAMS}"
            )
            processor: Pipeline = Pipeline([("imputer", imputer)])
            return processor
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def initiate_data_transformation(self) -> DataTransformationArtifact:
        """Run the transformation stage end to end.

        Returns:
            A DataTransformationArtifact holding the paths of the saved
            preprocessor and of the transformed train/test arrays.

        Raises:
            NetworkSecurityException: If any step of the stage fails.
        """
        logging.info(
            "Entered initiate_data_transformation method of DataTransformation class"
        )
        try:
            logging.info("Starting data transformation")
            train_df = DataTransformation.read_data(
                self.data_validation_artifact.valid_train_file_path
            )
            test_df = DataTransformation.read_data(
                self.data_validation_artifact.valid_test_file_path
            )

            input_feature_train_df, target_feature_train_df = (
                self._split_features_and_target(train_df)
            )
            input_feature_test_df, target_feature_test_df = (
                self._split_features_and_target(test_df)
            )

            preprocessor = self.get_data_transformer_object()

            # Fit on the training features only; the test split is merely
            # transformed with the already-fitted pipeline.
            preprocessor_object = preprocessor.fit(input_feature_train_df)
            transformed_input_train_feature = preprocessor_object.transform(
                input_feature_train_df
            )
            transformed_input_test_feature = preprocessor_object.transform(
                input_feature_test_df
            )

            # Append the target as the last column of each array.
            train_arr = np.c_[
                transformed_input_train_feature, np.array(target_feature_train_df)
            ]
            test_arr = np.c_[
                transformed_input_test_feature, np.array(target_feature_test_df)
            ]

            config = self.data_transformation_config

            # save numpy array data
            save_numpy_array_data(config.transformed_train_file_path, array=train_arr)
            save_numpy_array_data(config.transformed_test_file_path, array=test_arr)
            save_object(config.transformed_object_file_path, preprocessor_object)

            # A second copy of the preprocessor is written to a fixed relative
            # path, i.e. resolved against the current working directory.
            save_object(
                "final_model/preprocessor.pkl",
                preprocessor_object,
            )

            # preparing artifacts
            data_transformation_artifact = DataTransformationArtifact(
                transformed_object_file_path=config.transformed_object_file_path,
                transformed_train_file_path=config.transformed_train_file_path,
                transformed_test_file_path=config.transformed_test_file_path,
            )
            return data_transformation_artifact

        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

network_security/constant/training_pipeline/__init__.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,3 +40,21 @@
4040
DATA_VALIDATION_DRIFT_REPORT_DIR: str = "drift_report"
4141
DATA_VALIDATION_DRIFT_REPORT_FILE_NAME: str = "report.yaml"
4242
PREPROCESSING_OBJECT_FILE_NAME = "preprocessing.pkl"
43+
44+
45+
"""
46+
Data Transformation related constant start with DATA_TRANSFORMATION VAR NAME
47+
"""
48+
# Directory names used by the data-transformation stage.
DATA_TRANSFORMATION_DIR_NAME: str = "data_transformation"
DATA_TRANSFORMATION_TRANSFORMED_DATA_DIR: str = "transformed"
DATA_TRANSFORMATION_TRANSFORMED_OBJECT_DIR: str = "transformed_object"

# Keyword arguments for the KNN imputer that replaces NaN values.
DATA_TRANSFORMATION_IMPUTER_PARAMS: dict = {
    "missing_values": np.nan,
    "n_neighbors": 3,
    "weights": "uniform",
}

# File names for the transformed train/test arrays.
DATA_TRANSFORMATION_TRAIN_FILE_PATH: str = "train.npy"
DATA_TRANSFORMATION_TEST_FILE_PATH: str = "test.npy"

network_security/entity/artifact_entity.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,10 @@ class DataValidationArtifact:
1515
invalid_train_file_path: str
1616
invalid_test_file_path: str
1717
drift_report_file_path: str
18+
19+
20+
@dataclass
class DataTransformationArtifact:
    """Paths of the files produced by the data-transformation stage."""

    # Where the fitted preprocessing object was saved.
    transformed_object_file_path: str
    # Where the transformed training array was saved.
    transformed_train_file_path: str
    # Where the transformed test array was saved.
    transformed_test_file_path: str

network_security/entity/config_entity.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,3 +76,26 @@ def __init__(self, training_pipeline_config: TrainingPipelineConfig) -> None:
7676
/ training_pipeline.DATA_VALIDATION_DRIFT_REPORT_DIR
7777
/ training_pipeline.DATA_VALIDATION_DRIFT_REPORT_FILE_NAME
7878
)
79+
80+
81+
class DataTransformationConfig:
    """Filesystem layout for the data-transformation stage.

    Every path is rooted at ``<artifact_dir>/<DATA_TRANSFORMATION_DIR_NAME>``.
    """

    def __init__(self, training_pipeline_config: TrainingPipelineConfig) -> None:
        """Derive all output paths from the pipeline's artifact directory.

        Args:
            training_pipeline_config: Supplies ``artifact_dir``, the root under
                which this stage writes its outputs.
        """
        self.data_transformation_dir: Path = (
            Path(training_pipeline_config.artifact_dir)
            / training_pipeline.DATA_TRANSFORMATION_DIR_NAME
        )
        transformed_data_dir: Path = (
            self.data_transformation_dir
            / training_pipeline.DATA_TRANSFORMATION_TRANSFORMED_DATA_DIR
        )
        self.transformed_train_file_path: Path = transformed_data_dir / self._npy_name(
            training_pipeline.TRAIN_FILE_NAME
        )
        self.transformed_test_file_path: Path = transformed_data_dir / self._npy_name(
            training_pipeline.TEST_FILE_NAME
        )
        self.transformed_object_file_path: Path = (
            self.data_transformation_dir
            / training_pipeline.DATA_TRANSFORMATION_TRANSFORMED_OBJECT_DIR
            / training_pipeline.PREPROCESSING_OBJECT_FILE_NAME
        )

    @staticmethod
    def _npy_name(file_name: str) -> str:
        """Return *file_name* with its extension replaced by ``.npy``."""
        # Swap only the suffix: str.replace("csv", "npy") would also rewrite
        # any "csv" occurring inside the stem of the file name.
        return Path(file_name).with_suffix(".npy").name

network_security/utils/main_utils/utils.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
import os
22

33
# import dill
4+
import pickle
45
import sys
56
from pathlib import Path
67

8+
import numpy as np
79
import yaml
810

911
from network_security.exception.exception import NetworkSecurityException
@@ -27,3 +29,29 @@ def write_yaml_file(file_path: str, content: object, replace: bool = False) -> N
2729
yaml.dump(content, file)
2830
except Exception as e:
2931
raise NetworkSecurityException(e, sys)
32+
33+
34+
def save_numpy_array_data(file_path: str, array: np.ndarray) -> None:
    """Save a numpy array to *file_path* in ``.npy`` format.

    Missing parent directories are created first.

    Args:
        file_path: Location of the file to write.
        array: Data to save.

    Raises:
        NetworkSecurityException: If the directory cannot be created or the
            file cannot be written.
    """
    try:
        dir_path = os.path.dirname(file_path)
        # A bare file name has no directory part; os.makedirs("") would raise.
        if dir_path:
            os.makedirs(dir_path, exist_ok=True)
        with open(file_path, "wb") as file_obj:
            np.save(file_obj, array)
    except Exception as e:
        raise NetworkSecurityException(e, sys) from e
47+
48+
49+
def save_object(file_path: str, obj: object) -> None:
    """Pickle *obj* to *file_path*, creating missing parent directories.

    Args:
        file_path: Location of the file to write.
        obj: Any picklable object.

    Raises:
        NetworkSecurityException: If the directory cannot be created or the
            object cannot be pickled or written.
    """
    try:
        logging.info("Entered the save_object method of MainUtils class")
        dir_path = os.path.dirname(file_path)
        # A bare file name has no directory part; os.makedirs("") would raise.
        if dir_path:
            os.makedirs(dir_path, exist_ok=True)
        with open(file_path, "wb") as file_obj:
            pickle.dump(obj, file_obj)
        logging.info("Exited the save_object method of MainUtils class")
    except Exception as e:
        raise NetworkSecurityException(e, sys) from e

0 commit comments

Comments
 (0)