DC2CSV/DC2CSV.py at main · kstateome/DC2CSV · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# AI UTILIZED IN THE MAKING OF THIS SOFTWARE. AI UTILIZED WAS CHATGPT-4o
# DC2CSV.py
# Created by:
#   Garrett Greathouse
#   Erin Tallman

import pandas as pd

# Input and output file names. TODO: add user input for input file name and path
# Input NEEDS to be the direct OWASP Dependency Checker's CSV output (make sure to add "-f 'CSV'" to your dependency checker command to do so)
input_file = "dc_out.csv"
output_file = "final.csv"

# List of columns that will be removed towards the end (this was what we didn't need, may differ to what you need)
# You can add in future columns that you create in here, too, as long as they're made and utilized before deletion
# Names listed here that are not present in the data are silently skipped.
columns_to_remove = ["CVSSv2", "CVSSv3", "Identifiers", "CPE", "DuplicateCheck", "Project", "ScanDate", "DependencyPath", "Description", "License", "Md5", "Sha1", "CPE Confidence", "VendorProject", "Product", "Name", "DateAdded", "ShortDescription"]

# Columns that together identify one finding; rows equal on both are duplicates
# (i.e. if multiple pom.xml files reference the same dependency, it only shows up once)
_DEDUP_KEYS = ["Identifiers", "CVE"]


def clean_report(report: pd.DataFrame, drop_columns=None) -> pd.DataFrame:
    """Return a de-duplicated, trimmed and score-sorted copy of *report*.

    Steps, in order:
      1. Round both CVSS score columns to one decimal and add a "CVSS Score"
         column that holds the v3 base score, falling back to the v2 score
         where v3 is missing.
      2. Keep only the first row for every (Identifiers, CVE) pair.
      3. Drop rows whose CVE value contains "GHSA".
      4. Remove the columns named in *drop_columns*.
      5. Sort by "CVSS Score", highest first.

    Args:
        report: Frame with at least the columns "Identifiers", "CVE",
            "CVSSv3_BaseScore" and "CVSSv2_Score".
        drop_columns: Column names to remove from the result. Defaults to the
            module-level ``columns_to_remove``. Unknown names are ignored.

    Returns:
        A new DataFrame; *report* itself is not modified.

    Raises:
        KeyError: If one of the required columns is missing from *report*.
    """
    if drop_columns is None:
        drop_columns = columns_to_remove

    # Work on a copy so the caller's frame is left untouched
    cleaned = report.copy()

    # Cleaning up scores to avoid long doubles
    cleaned["CVSSv3_BaseScore"] = cleaned["CVSSv3_BaseScore"].round(1)
    cleaned["CVSSv2_Score"] = cleaned["CVSSv2_Score"].round(1)
    # Single score column: prefer v3, use v2 only where v3 is missing (not an average)
    cleaned["CVSS Score"] = cleaned["CVSSv3_BaseScore"].combine_first(cleaned["CVSSv2_Score"])

    # Removing duplicates. Compare the two key columns directly instead of a
    # concatenated string: string + NaN is NaN, which would make every row
    # with a missing identifier look like the same finding and discard all
    # but one of them.
    cleaned = cleaned.drop_duplicates(subset=_DEDUP_KEYS, keep="first")  # Keeps first instance
    cleaned = cleaned[~cleaned["CVE"].str.contains("GHSA", na=False)]  # na=False keeps rows with a missing CVE

    # Remove unneeded columns (MAKE SURE ANY NEWLY CREATED COLUMNS LISTED IN columns_to_remove HAVE BEEN PROCESSED ACCORDINGLY)
    cleaned = cleaned.drop(columns=drop_columns, errors="ignore")  # errors="ignore" avoids crashes if column missing

    # Sorting by CVSS Score
    return cleaned.sort_values(by=["CVSS Score"], ascending=False)


def main() -> None:
    """Read ``input_file``, clean it and write the result to ``output_file``."""
    # Load input CSV
    report = pd.read_csv(input_file)

    # Output as CSV with name listed in output_file
    clean_report(report).to_csv(output_file, index=False)

    # Notifies user of completion w/ file name
    print(f"Cleaned CSV saved as: {output_file}")


if __name__ == "__main__":
    main()