-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_loader.py
More file actions
52 lines (40 loc) · 1.6 KB
/
data_loader.py
File metadata and controls
52 lines (40 loc) · 1.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import io
import pandas as pd
from config import DATASET_PATH, MAX_TRANSACTIONS
# IBM HI-Small_Trans.csv has two columns both named "Account"
# pandas auto-renames the second one to "Account.1"
IBM_COLUMN_MAP = {
"Timestamp": "timestamp",
"Account": "from_account",
"Account.1": "to_account",
"Amount Paid": "amount",
"Amount Received": "amount_received",
"Payment Currency": "payment_currency",
"Receiving Currency": "receiving_currency",
"Payment Format": "payment_format",
"From Bank": "from_bank",
"To Bank": "to_bank",
"Is Laundering": "is_laundering",
}
REQUIRED_COLUMNS = {"from_account", "to_account", "amount", "timestamp"}
def _process(df: pd.DataFrame) -> pd.DataFrame:
df = df.rename(columns=IBM_COLUMN_MAP)
missing = REQUIRED_COLUMNS - set(df.columns)
if missing:
raise ValueError(
f"CSV is missing required columns after rename: {missing}. "
f"Columns found: {list(df.columns)}"
)
df["timestamp"] = pd.to_datetime(df["timestamp"], format="%Y/%m/%d %H:%M", errors="coerce")
df["amount"] = pd.to_numeric(df["amount"], errors="coerce").fillna(0.0)
if "is_laundering" in df.columns:
df["is_laundering"] = df["is_laundering"].fillna(0).astype(int)
else:
df["is_laundering"] = 0
return df.head(MAX_TRANSACTIONS)
def load_dataset() -> pd.DataFrame:
df = pd.read_csv(DATASET_PATH)
return _process(df)
def load_from_bytes(data: bytes) -> pd.DataFrame:
df = pd.read_csv(io.BytesIO(data))
return _process(df)