# clean_csv.py -- from the NutriVision-AI repository (thansen7/NutriVision-AI, main branch).

import shutil
import pandas as pd

# One-shot cleaning script: back up the raw product CSV, drop duplicate and
# incomplete rows, discard rows with negative nutrition values, and write the
# result to a separate "cleaned" file. The source CSV itself is never modified.

# File locations (relative to the current working directory).
SOURCE_CSV = 'off_india_products.csv'
BACKUP_CSV = 'off_india_products_backup.csv'
CLEANED_CSV = 'off_india_products_cleaned.csv'

# Make a backup copy of the original CSV file
shutil.copy(SOURCE_CSV, BACKUP_CSV)
print(f"Backup of original CSV saved as '{BACKUP_CSV}'.")

# Load the CSV file.
# 'code' is read as text: it is an identifier, not a quantity. Letting pandas
# infer a numeric dtype would strip leading zeros from all-digit codes, which
# both corrupts the value written to the cleaned file and makes distinct codes
# such as "0123" and "123" compare equal during de-duplication.
df = pd.read_csv(SOURCE_CSV, dtype={'code': str})

# Remove duplicate rows based on 'code' (product identifier); the first
# occurrence of each code is kept.
df = df.drop_duplicates(subset=['code'])

# Remove rows with missing critical fields (e.g., product_name, image_url)
df = df.dropna(subset=['product_name', 'image_url'])

# Remove rows with negative nutrition values. Missing values are allowed;
# columns that are absent from the file are skipped.
nutrient_fields = [
    'energy_kcal_100g', 'proteins_100g', 'fat_100g', 'carbohydrates_100g',
    'sugars_100g', 'fiber_100g', 'salt_100g', 'sodium_100g'
]
# Build a single row mask instead of re-slicing the frame once per column.
keep = pd.Series(True, index=df.index)
for field in nutrient_fields:
    if field not in df.columns:
        continue
    # Compare on a numeric view of the column so that a column read as text
    # (because it contains stray non-numeric entries) does not raise a
    # TypeError. Unparseable entries become NaN and are treated like missing
    # values, i.e. the row is kept. The stored data itself is left untouched.
    values = pd.to_numeric(df[field], errors='coerce')
    keep &= values.isna() | (values >= 0)
df = df[keep]

# Save the cleaned CSV
df.to_csv(CLEANED_CSV, index=False)

print(f"Cleaned CSV saved as '{CLEANED_CSV}'. Rows: {len(df)}")