-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclean_csv.py
More file actions
29 lines (22 loc) · 1013 Bytes
/
clean_csv.py
File metadata and controls
29 lines (22 loc) · 1013 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import shutil
import pandas as pd
# Default file locations used when the script is run directly.
INPUT_CSV = 'off_india_products.csv'
BACKUP_CSV = 'off_india_products_backup.csv'
CLEANED_CSV = 'off_india_products_cleaned.csv'

# Nutrition columns checked for negative values; columns that are absent
# from the data are skipped.
NUTRIENT_FIELDS = (
    'energy_kcal_100g', 'proteins_100g', 'fat_100g', 'carbohydrates_100g',
    'sugars_100g', 'fiber_100g', 'salt_100g', 'sodium_100g',
)


def clean_products(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the product table; *df* is not modified.

    Cleaning steps, in order:

    1. Drop duplicate rows based on 'code' (the first occurrence is kept).
    2. Drop rows where 'product_name' or 'image_url' is missing.
    3. Drop rows with a negative value in any nutrition column listed in
       NUTRIENT_FIELDS that is present in *df*.

    Raises:
        KeyError: if 'code', 'product_name' or 'image_url' is not a column.
    """
    df = df.drop_duplicates(subset=['code'])
    df = df.dropna(subset=['product_name', 'image_url'])

    for field in NUTRIENT_FIELDS:
        if field not in df.columns:
            continue
        # Coerce for the comparison only: a column loaded as text would
        # otherwise raise TypeError on the numeric comparison. Values that
        # cannot be parsed become NaN here; the stored data is left as-is.
        values = pd.to_numeric(df[field], errors='coerce')
        # NaN < 0 is False, so missing/unparseable values are kept and only
        # strictly negative values are removed.
        df = df[~(values < 0)]

    return df


def main(
    input_csv: str = INPUT_CSV,
    backup_csv: str = BACKUP_CSV,
    cleaned_csv: str = CLEANED_CSV,
) -> pd.DataFrame:
    """Back up *input_csv*, clean it, and write the result to *cleaned_csv*.

    Returns the cleaned DataFrame that was written.
    """
    # Make a backup copy of the original CSV file before reading it.
    shutil.copy(input_csv, backup_csv)
    print(f"Backup of original CSV saved as '{backup_csv}'.")

    # Read 'code' as text so identifiers with leading zeros are not altered
    # by numeric type inference.
    df = pd.read_csv(input_csv, dtype={'code': str})
    df = clean_products(df)

    # Save the cleaned CSV
    df.to_csv(cleaned_csv, index=False)
    print(f"Cleaned CSV saved as '{cleaned_csv}'. Rows: {len(df)}")
    return df


if __name__ == '__main__':
    main()