-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathsql_script.sql
More file actions
321 lines (268 loc) · 9.93 KB
/
sql_script.sql
File metadata and controls
321 lines (268 loc) · 9.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
/* =====================================================================================
PROJECT: Jiji Cars – Data Cleaning, Normalization & Fact Table Build
DATABASE: jiji_cars
AUTHOR: Andy Ababio
DESCRIPTION:
Full ETL workflow including:
- Data type corrections & cleaning
- Null / empty value treatment
- Date conversion
- Dimension table creation
- Fact table creation
- Dimension population
- Fact table population
===================================================================================== */
USE jiji_cars;
SELECT * FROM jiji_cars_data;
/* =====================================================================================
STEP 1: CHECK FOR DUPLICATES
===================================================================================== */
SELECT
id, make, model, year_of_manufacture, price_value, date_created,
COUNT(*) AS duplicated_count
FROM jiji_cars_data
GROUP BY id, make, model, year_of_manufacture, price_value, date_created
HAVING COUNT(*) > 1;
/* =====================================================================================
STEP 2: FIX COLUMN TYPES
Ensuring correct datatypes for analysis and referential integrity.
===================================================================================== */
-- ID column
ALTER TABLE jiji_cars_data
MODIFY id BIGINT NOT NULL;
-- Boolean-like fields (stored as text)
ALTER TABLE jiji_cars_data
MODIFY is_boost VARCHAR(10),
MODIFY registered_car VARCHAR(30),
MODIFY exchange_possible VARCHAR(30);
-- Text fields
ALTER TABLE jiji_cars_data
MODIFY make VARCHAR(50),
MODIFY model VARCHAR(100),
MODIFY `condition` VARCHAR(30),
MODIFY second_condition VARCHAR(100),
MODIFY color VARCHAR(30),
MODIFY interior_color VARCHAR(30);
-- Status / code fields
ALTER TABLE jiji_cars_data
MODIFY icon_transmission VARCHAR(20),
MODIFY paid_info_text VARCHAR(50);
-- Location fields
ALTER TABLE jiji_cars_data
MODIFY region_name VARCHAR(100);
-- URL
ALTER TABLE jiji_cars_data
MODIFY url VARCHAR(500);
-- Numeric fields
ALTER TABLE jiji_cars_data
MODIFY year_of_manufacture INT,
MODIFY mileage INT;
/* =====================================================================================
STEP 3: CLEAN DATE FIELD (date_created)
Convert from text → DATETIME format
===================================================================================== */
-- Inspect invalid or empty dates
SELECT date_created
FROM jiji_cars_data
WHERE date_created IS NULL OR date_created = '';
-- Add a clean date column
ALTER TABLE jiji_cars_data
ADD COLUMN date_created_clean DATETIME;
-- Convert date format
START TRANSACTION;
UPDATE jiji_cars_data
SET date_created_clean = STR_TO_DATE(date_created, '%a, %d %b %Y %H:%i:%s GMT')
WHERE date_created IS NOT NULL;
COMMIT;
-- Replace old date column with cleaned version
ALTER TABLE jiji_cars_data DROP COLUMN date_created;
ALTER TABLE jiji_cars_data
CHANGE date_created_clean date_created DATETIME;
/* =====================================================================================
STEP 4: HANDLE NULL OR EMPTY VALUES
Replace missing text fields with meaningful defaults.
===================================================================================== */
SELECT
COUNT(CASE WHEN is_boost = '' OR is_boost IS NULL THEN 1 END) AS empty_is_boost,
COUNT(CASE WHEN paid_info_text = '' OR paid_info_text IS NULL THEN 1 END) AS empty_paid_info_text,
COUNT(CASE WHEN model = '' OR model IS NULL THEN 1 END) AS empty_model,
COUNT(CASE WHEN year_of_manufacture = '' OR year_of_manufacture IS NULL THEN 1 END) AS empty_yof,
COUNT(CASE WHEN `condition` = '' OR `condition` IS NULL THEN 1 END) AS empty_condition,
COUNT(CASE WHEN second_condition = '' OR second_condition IS NULL THEN 1 END) AS empty_second_condition,
COUNT(CASE WHEN interior_color = '' OR interior_color IS NULL THEN 1 END) AS empty_interior_color,
COUNT(CASE WHEN registered_car = '' OR registered_car IS NULL THEN 1 END) AS empty_registered_car,
COUNT(CASE WHEN exchange_possible = '' OR exchange_possible IS NULL THEN 1 END) AS empty_exchange_possible,
COUNT(CASE WHEN icon_transmission = '' OR icon_transmission IS NULL THEN 1 END) AS empty_transmission,
COUNT(CASE WHEN region_name = '' OR region_name IS NULL THEN 1 END) AS empty_region,
COUNT(CASE WHEN price_value = '' OR price_value IS NULL THEN 1 END) AS empty_price_value,
COUNT(CASE WHEN url = '' OR url IS NULL THEN 1 END) AS empty_url
FROM jiji_cars_data;
-- Replace empty values with defaults
START TRANSACTION;
UPDATE jiji_cars_data
SET paid_info_text = 'No Promo'
WHERE paid_info_text = '' OR paid_info_text IS NULL;
UPDATE jiji_cars_data
SET second_condition = 'No second condition'
WHERE second_condition = '' OR second_condition IS NULL;
UPDATE jiji_cars_data
SET interior_color = 'No color indicated'
WHERE interior_color = '' OR interior_color IS NULL;
UPDATE jiji_cars_data
SET registered_car = 'Not indicated'
WHERE registered_car = '' OR registered_car IS NULL;
UPDATE jiji_cars_data
SET exchange_possible = 'Not indicated'
WHERE exchange_possible = '' OR exchange_possible IS NULL;
COMMIT;
/* =====================================================================================
STEP 5: CREATE DIMENSION & FACT TABLES
Star-schema for analytics-ready structure.
===================================================================================== */
-- DIM_MAKE
CREATE TABLE dim_make (
make_id INT AUTO_INCREMENT PRIMARY KEY,
make_name VARCHAR(50) NOT NULL UNIQUE
);
-- DIM_MODEL
CREATE TABLE dim_model (
model_id INT AUTO_INCREMENT PRIMARY KEY,
make_id INT NOT NULL,
model_name VARCHAR(100) NOT NULL,
UNIQUE KEY unique_make_model (make_id, model_name),
FOREIGN KEY (make_id) REFERENCES dim_make(make_id)
);
-- DIM_REGION
CREATE TABLE dim_region (
region_id INT AUTO_INCREMENT PRIMARY KEY,
region_name VARCHAR(100) NOT NULL UNIQUE
);
-- DIM_COLOR
CREATE TABLE dim_color (
color_id INT AUTO_INCREMENT PRIMARY KEY,
color_name VARCHAR(30) NOT NULL UNIQUE
);
-- DIM_CONDITION
CREATE TABLE dim_condition (
condition_id INT AUTO_INCREMENT PRIMARY KEY,
condition_name VARCHAR(30) NOT NULL UNIQUE
);
-- DIM_TRANSMISSION
CREATE TABLE dim_transmission (
transmission_id INT AUTO_INCREMENT PRIMARY KEY,
transmission_type VARCHAR(30) NOT NULL UNIQUE
);
-- FACT TABLE
CREATE TABLE fact_car_listings (
listing_id BIGINT PRIMARY KEY,
make_id INT,
model_id INT,
region_id INT,
color_id INT,
condition_id INT,
transmission_id INT,
is_boost VARCHAR(20),
paid_info_text VARCHAR(20),
year_of_manufacture INT,
second_condition VARCHAR(250),
interior_color VARCHAR(50),
mileage INT,
registered_car VARCHAR(30),
exchange_possible VARCHAR(30),
price_value INT,
url VARCHAR(500),
date_created DATETIME,
FOREIGN KEY (make_id) REFERENCES dim_make(make_id),
FOREIGN KEY (model_id) REFERENCES dim_model(model_id),
FOREIGN KEY (region_id) REFERENCES dim_region(region_id),
FOREIGN KEY (color_id) REFERENCES dim_color(color_id),
FOREIGN KEY (condition_id) REFERENCES dim_condition(condition_id),
FOREIGN KEY (transmission_id) REFERENCES dim_transmission(transmission_id),
INDEX idx_year_price (year_of_manufacture, price_value),
INDEX idx_date_created (date_created),
INDEX idx_make_region (make_id, region_id)
);
/* =====================================================================================
STEP 6: POPULATE DIMENSION TABLES
===================================================================================== */
-- DIM_MAKE
INSERT INTO dim_make (make_name)
SELECT DISTINCT LOWER(TRIM(make))
FROM jiji_cars_data
WHERE make IS NOT NULL AND make != '';
-- DIM_MODEL
INSERT INTO dim_model (make_id, model_name)
SELECT DISTINCT m.make_id, r.model
FROM jiji_cars_data r
JOIN dim_make m ON r.make = m.make_name
WHERE r.model IS NOT NULL;
-- DIM_REGION
INSERT IGNORE INTO dim_region (region_name)
SELECT DISTINCT region_name
FROM jiji_cars_data
WHERE region_name IS NOT NULL;
-- DIM_COLOR (combines exterior + interior colors)
INSERT IGNORE INTO dim_color (color_name)
SELECT DISTINCT color FROM jiji_cars_data WHERE color IS NOT NULL
UNION
SELECT DISTINCT interior_color FROM jiji_cars_data WHERE interior_color IS NOT NULL;
-- DIM_CONDITION
INSERT IGNORE INTO dim_condition (condition_name)
SELECT DISTINCT `condition`
FROM jiji_cars_data
WHERE `condition` IS NOT NULL;
-- DIM_TRANSMISSION
INSERT IGNORE INTO dim_transmission (transmission_type)
SELECT DISTINCT icon_transmission
FROM jiji_cars_data
WHERE icon_transmission IS NOT NULL;
/* =====================================================================================
STEP 7: POPULATE FACT TABLE
===================================================================================== */
INSERT INTO fact_car_listings (
listing_id,
make_id,
model_id,
region_id,
color_id,
condition_id,
transmission_id,
year_of_manufacture,
mileage,
price_value,
is_boost,
registered_car,
exchange_possible,
date_created,
url
)
SELECT
r.id AS listing_id,
m.make_id,
mo.model_id,
rg.region_id,
cc.color_id,
c.condition_id,
t.transmission_id,
r.year_of_manufacture,
r.mileage,
r.price_value,
r.is_boost,
r.registered_car,
r.exchange_possible,
r.date_created,
r.url
FROM jiji_cars_data r
JOIN dim_make m ON r.make = m.make_name
JOIN dim_model mo ON mo.make_id = m.make_id AND r.model = mo.model_name
JOIN dim_region rg ON r.region_name = rg.region_name
JOIN dim_color cc ON r.color = cc.color_name
JOIN dim_condition c ON r.`condition` = c.condition_name
JOIN dim_transmission t ON r.icon_transmission = t.transmission_type;
-- Update additional attributes (added later)
UPDATE fact_car_listings f
JOIN jiji_cars_data r ON f.listing_id = r.id
SET
f.paid_info_text = r.paid_info_text,
f.second_condition = r.second_condition,
f.interior_color = r.interior_color;