diff --git a/configs/examples/hf/audit_p2p.yaml b/configs/examples/hf/audit_p2p.yaml new file mode 100644 index 00000000..81fcc19b --- /dev/null +++ b/configs/examples/hf/audit_p2p.yaml @@ -0,0 +1,144 @@ +# VynFi Audit P2P (vynfi-audit-p2p) — HF dataset regeneration recipe +# +# Document Flow with Fraud Labels. Showcases the P2P (purchase +# order → goods receipt → vendor invoice → payment) and O2C +# (sales order → delivery → customer invoice → customer receipt) +# document chains, with line-level fraud labels and the v5.8.0+ +# `predecessor_line_id` chain context surfaced both on JE lines +# and via the `je_network.csv` flat edge list. +# +# Volumes target ≈230 documents across the four core P2P document +# types — small enough for quick prototyping, large enough to +# show realistic fraud-injection patterns at the document level. +# Manufacturing industry, 10 entities, 6 fiscal periods. +# +# Reproducibility: +# - global.seed pinned for byte-stable regeneration on the +# v5.9.0 release binary; +# - graph_export.je_network.method: a (Method A from +# Ivertowski 2024 — exactly one edge per 2-line journal entry, +# no Cartesian-product blow-up on multi-line consolidations); +# - cost-centers / profit-centers / employees masters generated so +# `cost_center` / `profit_center` / `created_by` join cleanly to +# their respective master tables (closes the v5.9.0 linkage gap). 
+ +global: + industry: manufacturing + start_date: "2024-01-01" + period_months: 6 + seed: 20260509 + group_currency: USD + parallel: true + worker_threads: 4 + memory_limit_mb: 4096 + +companies: + - { code: "1000", name: "VynFi Mfg HQ", currency: USD, country: US, annual_transaction_volume: ten_k, volume_weight: 0.18, fiscal_year_variant: K4 } + - { code: "1100", name: "VynFi Mfg Plant 1", currency: USD, country: US, annual_transaction_volume: ten_k, volume_weight: 0.14, fiscal_year_variant: K4 } + - { code: "1200", name: "VynFi Mfg Plant 2", currency: USD, country: US, annual_transaction_volume: ten_k, volume_weight: 0.12, fiscal_year_variant: K4 } + - { code: "1300", name: "VynFi Mfg Plant 3", currency: USD, country: US, annual_transaction_volume: ten_k, volume_weight: 0.10, fiscal_year_variant: K4 } + - { code: "2000", name: "VynFi Mfg EU GmbH", currency: EUR, country: DE, annual_transaction_volume: ten_k, volume_weight: 0.10, fiscal_year_variant: K4 } + - { code: "2100", name: "VynFi Mfg UK Ltd", currency: GBP, country: GB, annual_transaction_volume: ten_k, volume_weight: 0.08, fiscal_year_variant: K4 } + - { code: "2200", name: "VynFi Mfg FR SAS", currency: EUR, country: FR, annual_transaction_volume: ten_k, volume_weight: 0.08, fiscal_year_variant: K4 } + - { code: "3000", name: "VynFi Mfg APAC SG", currency: SGD, country: SG, annual_transaction_volume: ten_k, volume_weight: 0.08, fiscal_year_variant: K4 } + - { code: "3100", name: "VynFi Mfg AU Pty", currency: AUD, country: AU, annual_transaction_volume: ten_k, volume_weight: 0.06, fiscal_year_variant: K4 } + - { code: "4000", name: "VynFi Mfg CA Inc", currency: CAD, country: CA, annual_transaction_volume: ten_k, volume_weight: 0.06, fiscal_year_variant: K4 } + +chart_of_accounts: + complexity: small + industry_specific: true + +master_data: + vendors: { count: 50, intercompany_percent: 0.10 } + customers: { count: 60, intercompany_percent: 0.05 } + materials: { count: 100, bom_enabled: true, 
average_bom_depth: 2 } + fixed_assets: { count: 30 } + employees: { count: 50 } + +# Document flows are the headline content for this dataset. +# Volumes tuned so each of PO / GR / Invoice / Payment lands in the +# 50-80 range across the 6-month / 10-company horizon (≈ 1 chain +# per company per month for both P2P and O2C). +document_flows: + p2p: + enabled: true + chains_per_month: 1 + three_way_match_rate: 0.92 + partial_delivery_rate: 0.12 + gr_ir_clearing_enabled: true + o2c: + enabled: true + chains_per_month: 1 + credit_check_failure_rate: 0.04 + partial_shipment_rate: 0.10 + return_rate: 0.025 + +balance: + generate_opening_balances: true + generate_trial_balances: true + reconcile_subledgers: true + validate_balance_equation: true + target_gross_margin: 0.32 + target_dso_days: 45 + target_dpo_days: 30 + +period_close: + enabled: true + +financial_reporting: + enabled: true + +# 3 % document-level fraud (matches the published 3 % rate). +fraud: + enabled: true + fraud_rate: 0.02 + document_fraud_rate: 0.03 + propagate_to_lines: true + +internal_controls: + enabled: true + coso_enabled: true + include_entity_level_controls: true + target_maturity_level: managed + exception_rate: 0.02 + sod_violation_rate: 0.01 + +distributions: + enabled: true + industry_profile: manufacturing + amounts: + enabled: true + distribution_type: log_normal + components: + - { weight: 0.65, mu: 5.5, sigma: 1.3, label: "routine" } + - { weight: 0.28, mu: 7.8, sigma: 0.9, label: "significant" } + - { weight: 0.07, mu: 9.5, sigma: 0.6, label: "major" } + benford_compliance: true + +temporal_patterns: + enabled: true + business_days: + enabled: true + half_day_policy: half_day + month_end_convention: modified_following + calendars: + regions: [US, DE, GB, SG] + period_end: + model: exponential + month_end: { start_day: -10, base_multiplier: 1.0, peak_multiplier: 3.5, decay_rate: 0.3 } + quarter_end: { inherit_from: month_end, additional_multiplier: 1.5 } + +audit: + enabled: false # 
Audit data has its own dataset + +# Method A flat edge-list for the accounting network. Provides a +# bonus artefact alongside the document-flow tables — every +# 2-line JE in this dataset gets one edge, joinable back to the +# row-level JE table via from_line_id / to_line_id. +graph_export: + je_network: + method: a + +output: + output_directory: "./output" + formats: [csv, json] diff --git a/configs/examples/hf/supply_chain_ocel.yaml b/configs/examples/hf/supply_chain_ocel.yaml new file mode 100644 index 00000000..2674d404 --- /dev/null +++ b/configs/examples/hf/supply_chain_ocel.yaml @@ -0,0 +1,159 @@ +# VynFi Supply Chain OCEL (vynfi-supply-chain-ocel) — HF dataset +# regeneration recipe. +# +# Native OCEL 2.0 event log from a manufacturing supply chain. +# Process types covered: P2P, O2C, manufacturing operations. Event +# log is the primary artefact, with object-level data +# (orders / invoices / shipments / receipts) and anomaly labels +# alongside. +# +# Volumes (post-v5.9.0 refresh): targeting ~100k events for +# usability — the v0.x dataset shipped 20 010 events which was +# quick to prototype against but tight for any serious process +# mining. Scaling: 5 companies × 12 months × ~120 chains for +# each of P2P and O2C (≈14 k chains × ~7 events per chain) puts +# the event log in the 90 k–110 k range, with proportional +# growth in objects and anomaly labels. Per-company JE volume +# kept at `ten_k` to bound the in-memory orchestrator footprint +# during OCPM event-log construction (`hundred_k` × 5 × 12 +# producing ~700 k JEs alongside ~14 k chains pushed peak heap +# past 8 GB during native-OCEL graph assembly). +# +# Reproducibility: +# - global.seed pinned; +# - 3 % document-level fraud injection (matches the published +# 3 % rate); +# - ChaCha8 PRNG output is platform-stable. 
+ +global: + industry: manufacturing + start_date: "2024-01-01" + period_months: 12 + seed: 20260509 + group_currency: USD + parallel: true + worker_threads: 4 + memory_limit_mb: 8192 + +# Five subsidiaries: one HQ, three plants, one international +# entity. Big enough to exercise multi-company joins on objects; +# small enough to keep event counts focused on supply-chain +# behaviour rather than entity proliferation. +companies: + - { code: "1000", name: "VynFi Mfg HQ", currency: USD, country: US, annual_transaction_volume: ten_k, volume_weight: 0.30, fiscal_year_variant: K4 } + - { code: "1100", name: "VynFi Mfg Plant 1", currency: USD, country: US, annual_transaction_volume: ten_k, volume_weight: 0.22, fiscal_year_variant: K4 } + - { code: "1200", name: "VynFi Mfg Plant 2", currency: USD, country: US, annual_transaction_volume: ten_k, volume_weight: 0.18, fiscal_year_variant: K4 } + - { code: "2000", name: "VynFi Mfg EU GmbH", currency: EUR, country: DE, annual_transaction_volume: ten_k, volume_weight: 0.18, fiscal_year_variant: K4 } + - { code: "3000", name: "VynFi Mfg APAC SG", currency: SGD, country: SG, annual_transaction_volume: ten_k, volume_weight: 0.12, fiscal_year_variant: K4 } + +chart_of_accounts: + complexity: medium + industry_specific: true + +master_data: + vendors: { count: 100, intercompany_percent: 0.10 } + customers: { count: 120, intercompany_percent: 0.05 } + materials: { count: 400, bom_enabled: true, average_bom_depth: 3 } + fixed_assets: { count: 50 } + employees: { count: 100 } + +# Chain volume is the primary driver of event count — each +# P2P / O2C chain produces ~6-8 OCEL events (PO created → +# released → GR posted → invoice received → invoice posted → +# payment posted → payment cleared). 14 k chains × ~7 events +# ≈ 100 k events. 
+document_flows: + p2p: + enabled: true + chains_per_month: 360 + three_way_match_rate: 0.92 + partial_delivery_rate: 0.12 + gr_ir_clearing_enabled: true + o2c: + enabled: true + chains_per_month: 360 + credit_check_failure_rate: 0.04 + partial_shipment_rate: 0.10 + return_rate: 0.025 + +balance: + generate_opening_balances: true + generate_trial_balances: true + reconcile_subledgers: true + validate_balance_equation: true + +period_close: + enabled: true + +# 3 % document-level fraud (matches the published rate of the +# previous dataset version). Anomaly labels are produced for +# every fraudulent + selected non-fraudulent events in the +# OCEL log, joinable to events via `event_id`. +fraud: + enabled: true + fraud_rate: 0.02 + document_fraud_rate: 0.03 + propagate_to_lines: true + +internal_controls: + enabled: true + coso_enabled: true + include_entity_level_controls: true + target_maturity_level: managed + exception_rate: 0.02 + sod_violation_rate: 0.01 + +distributions: + enabled: true + industry_profile: manufacturing + amounts: + enabled: true + distribution_type: log_normal + components: + - { weight: 0.65, mu: 5.5, sigma: 1.3, label: "routine" } + - { weight: 0.28, mu: 7.8, sigma: 0.9, label: "significant" } + - { weight: 0.07, mu: 9.5, sigma: 0.6, label: "major" } + benford_compliance: true + +temporal_patterns: + enabled: true + business_days: + enabled: true + half_day_policy: half_day + month_end_convention: modified_following + calendars: + regions: [US, DE, SG] + period_end: + model: exponential + month_end: { start_day: -10, base_multiplier: 1.0, peak_multiplier: 3.5, decay_rate: 0.3 } + quarter_end: { inherit_from: month_end, additional_multiplier: 1.5 } + +# OCPM event log: native OCEL 2.0 generation with all relevant +# output formats. `compute_variants: true` plus +# `include_object_relationships: true` mirror the v0.x dataset's +# `events`, `objects`, `anomaly_labels`, `document_events` tables. 
+ocpm: + enabled: true + generate_lifecycle_events: true + include_object_relationships: true + compute_variants: true + max_variants: 0 # unlimited + output: + ocel_json: true + flattened_csv: true + event_object_csv: true + object_relationship_csv: true + variants_csv: true + +audit: + enabled: false # Audit data has its own dataset + +# Method-A flat edge-list provided for completeness — joinable to +# the OCEL events via shared document_id + entry_date. +graph_export: + je_network: + method: a + +output: + output_directory: "./output" + formats: [csv, json] diff --git a/scripts/hf_audit_p2p_to_parquet.py b/scripts/hf_audit_p2p_to_parquet.py new file mode 100755 index 00000000..fb719f66 --- /dev/null +++ b/scripts/hf_audit_p2p_to_parquet.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python3 +"""Convert generation outputs into the HF-ready layout for the +`vynfi-audit-p2p` dataset. + +Each document-flow JSON file is flattened — header fields are +hoisted to the top level — and written as a single-shard parquet +under its own subdirectory so HF Datasets surfaces each document +type as a separate `config`: + + purchase_orders/train-00000-of-00001.parquet + goods_receipts/train-00000-of-00001.parquet + vendor_invoices/train-00000-of-00001.parquet + payments/train-00000-of-00001.parquet + +Items / line-level arrays are dropped from the top-level table — +they are available in the row-level `journal_entries.csv` (joinable +via `document_id`) and on the corresponding `je_network.csv` flat +edge-list (v5.8.0+). 
+""" + +from __future__ import annotations + +import argparse +import json +import sys +from pathlib import Path + +import pandas as pd +import pyarrow as pa +import pyarrow.parquet as pq + + +def flatten_document(doc: dict) -> dict: + """Hoist `header.*` and discard nested arrays so the row is flat.""" + out: dict = {} + for k, v in doc.items(): + if k == "header" and isinstance(v, dict): + for hk, hv in v.items(): + out[hk] = hv + elif k == "items": + out["item_count"] = len(v) if isinstance(v, list) else 0 + elif k == "lines": + out["line_count"] = len(v) if isinstance(v, list) else 0 + elif isinstance(v, (dict, list)): + # Skip remaining nested structures — they don't fit cleanly + # into a flat schema. The corresponding JSON file remains + # the source of truth. + continue + else: + out[k] = v + return out + + +def doc_flow_json_to_parquet(json_path: Path, out_path: Path, label: str) -> int: + with open(json_path) as f: + data = json.load(f) + if isinstance(data, dict): + for key in ( + "purchase_orders", + "goods_receipts", + "vendor_invoices", + "payments", + "sales_orders", + "deliveries", + "customer_invoices", + "customer_receipts", + "records", + "items", + ): + if key in data and isinstance(data[key], list): + data = data[key] + break + if not isinstance(data, list) or not data: + print(f" {label}: empty, skipping") + return 0 + + rows = [flatten_document(doc) for doc in data] + df = pd.json_normalize(rows, max_level=0) + + out_path.parent.mkdir(parents=True, exist_ok=True) + table = pa.Table.from_pandas(df, preserve_index=False) + pq.write_table(table, out_path, compression="zstd", compression_level=9) + size_kb = out_path.stat().st_size / 1024 + print( + f" wrote {out_path.parent.name}/{out_path.name}: " + f"{len(df):,} rows x {len(df.columns)} cols, {size_kb:.1f} KB ({label})" + ) + return len(df) + + +def main() -> int: + ap = argparse.ArgumentParser() + ap.add_argument("--output-dir", required=True) + ap.add_argument("--hf-dir", required=True) + 
args = ap.parse_args() + + out = Path(args.output_dir) / "document_flows" + hf = Path(args.hf_dir) + hf.mkdir(parents=True, exist_ok=True) + + if not out.exists(): + print( + f"ERROR: {out} not found (regen with document_flows enabled)", + file=sys.stderr, + ) + return 1 + + targets = [ + ("purchase_orders.json", "purchase_orders", "Purchase Orders"), + ("goods_receipts.json", "goods_receipts", "Goods Receipts"), + ("vendor_invoices.json", "vendor_invoices", "Vendor Invoices"), + ("payments.json", "payments", "Payments"), + ] + for fname, subdir, label in targets: + src = out / fname + if not src.exists(): + print(f" {label}: source {src} missing, skipping") + continue + dst = hf / subdir / "train-00000-of-00001.parquet" + doc_flow_json_to_parquet(src, dst, label) + + print(f"\nAll artefacts written to {hf}/") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/hf_supply_chain_ocel_to_parquet.py b/scripts/hf_supply_chain_ocel_to_parquet.py new file mode 100755 index 00000000..2930bb73 --- /dev/null +++ b/scripts/hf_supply_chain_ocel_to_parquet.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python3 +"""Convert generation outputs into the HF-ready layout for the +`vynfi-supply-chain-ocel` dataset. + +Each table goes under its own subdirectory so HF Datasets surfaces +it as a separate `config`: + + events/train-00000-of-00001.parquet + objects/train-00000-of-00001.parquet + anomaly_labels/train-00000-of-00001.parquet + document_events/train-00000-of-00001.parquet + +`events.json` and `objects.json` come from `process_mining/`; +`anomaly_labels.json` from `labels/`; `document_events` is +synthesised from the document-flow headers (one row per +P2P/O2C document with the OCEL-style fields needed for joins). 
+""" + +from __future__ import annotations + +import argparse +import json +import sys +from pathlib import Path + +import pandas as pd +import pyarrow as pa +import pyarrow.parquet as pq + + +def unwrap_list(data, *keys): + if isinstance(data, dict): + for k in keys: + if k in data and isinstance(data[k], list): + return data[k] + if isinstance(data, list): + return data + return [] + + +def write_parquet(rows: list[dict], out_path: Path, label: str) -> int: + if not rows: + print(f" {label}: empty, skipping") + return 0 + df = pd.json_normalize(rows, max_level=1) + out_path.parent.mkdir(parents=True, exist_ok=True) + table = pa.Table.from_pandas(df, preserve_index=False) + pq.write_table(table, out_path, compression="zstd", compression_level=9) + size_kb = out_path.stat().st_size / 1024 + print( + f" wrote {out_path.parent.name}/{out_path.name}: " + f"{len(df):,} rows x {len(df.columns)} cols, " + f"{size_kb:.1f} KB ({label})" + ) + return len(df) + + +def synthesize_document_events(out_root: Path) -> list[dict]: + """Build a per-document summary table from the four P2P/O2C flows. + + Mirrors the v0.x `document_events` config: one row per (P2P or + O2C) document with the OCEL-style fields needed to join events + back to source documents. 
+ """ + rows: list[dict] = [] + flows = [ + ("purchase_orders", "PO", "P2P"), + ("goods_receipts", "GR", "P2P"), + ("vendor_invoices", "VI", "P2P"), + ("payments", "PAY", "P2P"), + ("sales_orders", "SO", "O2C"), + ("deliveries", "DLV", "O2C"), + ("customer_invoices", "CI", "O2C"), + ("customer_receipts", "CR", "O2C"), + ] + df_dir = out_root / "document_flows" + if not df_dir.exists(): + return rows + for fname, doc_kind, process in flows: + fp = df_dir / f"{fname}.json" + if not fp.exists(): + continue + with open(fp) as f: + data = json.load(f) + for doc in unwrap_list( + data, + "purchase_orders", + "goods_receipts", + "vendor_invoices", + "payments", + "sales_orders", + "deliveries", + "customer_invoices", + "customer_receipts", + "records", + ): + header = doc.get("header", {}) + rows.append( + { + "document_id": header.get("document_id"), + "document_kind": doc_kind, + "process": process, + "company_code": header.get("company_code"), + "posting_date": header.get("posting_date"), + } + ) + return rows + + +def main() -> int: + ap = argparse.ArgumentParser() + ap.add_argument("--output-dir", required=True) + ap.add_argument("--hf-dir", required=True) + args = ap.parse_args() + + out = Path(args.output_dir) + hf = Path(args.hf_dir) + hf.mkdir(parents=True, exist_ok=True) + + if not (out / "process_mining").exists(): + print( + f"ERROR: {out / 'process_mining'} not found " + "(regen with ocpm.enabled: true)", + file=sys.stderr, + ) + return 1 + + # 1. events + with open(out / "process_mining" / "events.json") as f: + events = unwrap_list(json.load(f), "events", "records") + write_parquet( + events, + hf / "events" / "train-00000-of-00001.parquet", + "OCEL Events", + ) + + # 2. objects + with open(out / "process_mining" / "objects.json") as f: + objects = unwrap_list(json.load(f), "objects", "records") + write_parquet( + objects, + hf / "objects" / "train-00000-of-00001.parquet", + "OCEL Objects", + ) + + # 3. 
anomaly_labels (from labels/anomaly_labels.json) + al_path = out / "labels" / "anomaly_labels.json" + if al_path.exists(): + with open(al_path) as f: + labels = unwrap_list(json.load(f), "labels", "anomaly_labels", "records") + write_parquet( + labels, + hf / "anomaly_labels" / "train-00000-of-00001.parquet", + "Anomaly Labels", + ) + + # 4. document_events (synthesised from the four flow headers) + doc_events = synthesize_document_events(out) + write_parquet( + doc_events, + hf / "document_events" / "train-00000-of-00001.parquet", + "Document Events", + ) + + print(f"\nAll artefacts written to {hf}/") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/spaces/accounting-network-explorer/.gitignore b/spaces/accounting-network-explorer/.gitignore new file mode 100644 index 00000000..f395a16e --- /dev/null +++ b/spaces/accounting-network-explorer/.gitignore @@ -0,0 +1,4 @@ +__pycache__/ +*.pyc +.streamlit/secrets.toml +.venv/ diff --git a/spaces/accounting-network-explorer/README.md b/spaces/accounting-network-explorer/README.md new file mode 100644 index 00000000..aa4be46a --- /dev/null +++ b/spaces/accounting-network-explorer/README.md @@ -0,0 +1,61 @@ +--- +title: VynFi Accounting Network Explorer +emoji: 🔗 +colorFrom: blue +colorTo: indigo +sdk: streamlit +sdk_version: 1.39.0 +python_version: '3.11' +app_file: app.py +pinned: true +license: apache-2.0 +short_description: Interactive ISO 21378 account-class flow graph (v5.9.0) +tags: + - vynfi + - accounting + - graph + - iso-21378 + - synthetic-data + - financial-network +--- + +# 🔗 VynFi Accounting Network Explorer + +Interactive view of the v5.9.0 Method-A accounting network published in +[`VynFi/vynfi-journal-entries-1m`](https://huggingface.co/datasets/VynFi/vynfi-journal-entries-1m), +aggregated to **ISO 21378 Level-2** account classes (~30 nodes). 
+ +## What you can do + +* **Filter** the underlying 61 656 line-level edges by business process + (P2P / O2C / R2R / H2R / A2R), `is_fraud`, `is_anomaly`, + minimum edge amount, and top-N. +* **Inspect** any class node to see total flow, fraud %, and the top + in/out class pairs. +* **Drill in** to the Level-3 sub-class breakdown + (`A.A.A — Operating Cash`, `A.A.B — Petty Cash`, …). +* **Toggle** force-directed vs hierarchical layout. + +## Method A vs Cartesian + +In v5.9.0 the JE-network defaults to *Method A* from Ivertowski 2024: +exactly **one edge per 2-line journal entry**, confidence = 1.0. +This avoids the Cartesian explosion (225 M edges on 1 M JEs) that the +legacy `cartesian` method produces, and gives a clean topology for +graph-ML training. + +## Tech + +Streamlit + `streamlit-agraph` (vis-network) · pandas/pyarrow · +loads parquet directly from the HF dataset on cold-start, then +caches in-memory. + +## Source + +* App code: [github.com/mivertowski/SyntheticData/tree/main/spaces/accounting-network-explorer](https://github.com/mivertowski/SyntheticData/tree/main/spaces/accounting-network-explorer) +* Generation engine: [github.com/mivertowski/SyntheticData](https://github.com/mivertowski/SyntheticData) +* Companion paper: [SSRN abstract 6538639](https://ssrn.com/abstract=6538639) + +## License + +Apache-2.0. diff --git a/spaces/accounting-network-explorer/app.py b/spaces/accounting-network-explorer/app.py new file mode 100644 index 00000000..6062ced0 --- /dev/null +++ b/spaces/accounting-network-explorer/app.py @@ -0,0 +1,446 @@ +"""VynFi Accounting Network Explorer. + +Interactive ISO 21378 Level-2 account-class network from +`VynFi/vynfi-journal-entries-1m`. One node per account class, +one edge per (from_class, to_class) pair aggregated from the +v5.9.0 Method-A `je_network.parquet` (2-line JEs only, +confidence = 1.0). 
+""" +from __future__ import annotations + +import math +from typing import Tuple + +import pandas as pd +import streamlit as st +from huggingface_hub import snapshot_download +from streamlit_agraph import Config, Edge, Node, agraph + +DATASET_REPO = "VynFi/vynfi-journal-entries-1m" + +ACCOUNT_TYPE_COLORS = { + "asset": "#2563eb", # blue + "liability": "#ea580c", # orange + "equity": "#16a34a", # green + "revenue": "#9333ea", # purple + "expense": "#dc2626", # red + "other": "#6b7280", # grey +} + +st.set_page_config( + page_title="VynFi Accounting Network Explorer", + page_icon="🔗", + layout="wide", + initial_sidebar_state="expanded", +) + + +# ─── Data loading ──────────────────────────────────────────────────────────── + + +@st.cache_resource(show_spinner="Downloading je_network + chart_of_accounts from HF Hub…") +def load_data() -> Tuple[pd.DataFrame, pd.DataFrame]: + base = snapshot_download( + repo_id=DATASET_REPO, + repo_type="dataset", + allow_patterns=["je_network.parquet", "chart_of_accounts.parquet"], + ) + edges = pd.read_parquet(f"{base}/je_network.parquet") + coa = pd.read_parquet(f"{base}/chart_of_accounts.parquet") + + # Normalise dtypes + edges["from_account"] = edges["from_account"].astype(str) + edges["to_account"] = edges["to_account"].astype(str) + coa["account_number"] = coa["account_number"].astype(str) + coa["account_type"] = coa["account_type"].astype(str).str.lower() + + # 4 account numbers in the published COA (1510, 1600, 4900, 7100) appear + # in two rows with conflicting class mappings — keep the first deterministically + # so the join doesn't inflate the edge count. 
+ coa = coa.drop_duplicates(subset=["account_number"], keep="first").reset_index(drop=True) + + return edges, coa + + +# ─── Aggregation ───────────────────────────────────────────────────────────── + + +def aggregate_to_class(edges: pd.DataFrame, coa: pd.DataFrame): + """Join edges with COA on gl_account and aggregate by (from_class, to_class).""" + coa_slim = coa[ + ["account_number", "account_class", "account_class_name", "account_type"] + ].copy() + + e = ( + edges.merge( + coa_slim.rename( + columns={ + "account_number": "from_account", + "account_class": "from_class", + "account_class_name": "from_class_name", + "account_type": "from_type", + } + ), + on="from_account", + how="left", + ) + .merge( + coa_slim.rename( + columns={ + "account_number": "to_account", + "account_class": "to_class", + "account_class_name": "to_class_name", + "account_type": "to_type", + } + ), + on="to_account", + how="left", + ) + .dropna(subset=["from_class", "to_class"]) + ) + + class_edges = ( + e.groupby(["from_class", "to_class"], as_index=False) + .agg( + total_amount=("amount", "sum"), + edge_count=("edge_id", "count"), + fraud_count=("is_fraud", "sum"), + anomaly_count=("is_anomaly", "sum"), + ) + ) + + out = ( + e.groupby("from_class", as_index=False) + .agg(out_amount=("amount", "sum"), out_count=("edge_id", "count")) + .rename(columns={"from_class": "account_class"}) + ) + inn = ( + e.groupby("to_class", as_index=False) + .agg(in_amount=("amount", "sum"), in_count=("edge_id", "count")) + .rename(columns={"to_class": "account_class"}) + ) + nodes = pd.merge(out, inn, on="account_class", how="outer").fillna(0) + + meta = ( + coa.groupby("account_class", as_index=False) + .agg( + account_class_name=("account_class_name", "first"), + account_type=("account_type", "first"), + ) + ) + nodes = nodes.merge(meta, on="account_class", how="left") + nodes["account_class_name"] = nodes["account_class_name"].fillna(nodes["account_class"]) + nodes["account_type"] = 
nodes["account_type"].fillna("other") + nodes["total_flow"] = nodes["in_amount"] + nodes["out_amount"] + nodes["total_count"] = nodes["in_count"] + nodes["out_count"] + + return nodes, class_edges + + +# ─── Formatters ────────────────────────────────────────────────────────────── + + +def fmt_money(x: float) -> str: + sign = "-" if x < 0 else "" + x = abs(float(x)) + if x >= 1e12: + return f"{sign}${x / 1e12:.2f}T" + if x >= 1e9: + return f"{sign}${x / 1e9:.2f}B" + if x >= 1e6: + return f"{sign}${x / 1e6:.2f}M" + if x >= 1e3: + return f"{sign}${x / 1e3:.1f}K" + return f"{sign}${x:.0f}" + + +def node_size(amount: float, max_amount: float) -> int: + if amount <= 0 or max_amount <= 0: + return 18 + ratio = math.log10(amount + 1.0) / max(math.log10(max_amount + 1.0), 1.0) + return int(18 + ratio * 42) + + +def edge_width(amount: float, max_amount: float) -> int: + if amount <= 0 or max_amount <= 0: + return 1 + ratio = math.log10(amount + 1.0) / max(math.log10(max_amount + 1.0), 1.0) + return max(1, int(ratio * 8)) + + +# ─── Sidebar — filters ─────────────────────────────────────────────────────── + + +edges_raw, coa_raw = load_data() + +st.title("🔗 VynFi Accounting Network Explorer") +st.caption( + "ISO 21378 Level-2 account-class flows from " + "[`VynFi/vynfi-journal-entries-1m`](https://huggingface.co/datasets/VynFi/vynfi-journal-entries-1m) · " + "Method-A edge list (one edge per 2-line JE) · v5.9.0" +) + +with st.sidebar: + st.header("Filters") + + processes = sorted(edges_raw["business_process"].dropna().unique().tolist()) + selected_processes = st.multiselect( + "Business process", + processes, + default=processes, + help="P2P = procure-to-pay · O2C = order-to-cash · R2R = record-to-report · " + "H2R = hire-to-retire · A2R = adjust-to-report", + ) + + col_a, col_b = st.columns(2) + with col_a: + fraud_only = st.checkbox("Fraud only", value=False) + with col_b: + anomaly_only = st.checkbox("Anomaly only", value=False) + + st.divider() + + min_amount_log = 
st.slider( + "Min edge total (10ⁿ)", + min_value=0, + max_value=12, + value=0, + step=1, + help="Hide class-pairs whose summed flow is below 10ⁿ.", + ) + top_n = st.slider("Top N edges", min_value=20, max_value=400, value=120, step=20) + + st.divider() + + layout_mode = st.radio( + "Layout", + ["force-directed", "hierarchical"], + horizontal=True, + ) + + st.divider() + st.caption( + f"**Source rows:** {len(edges_raw):,} edges · {len(coa_raw):,} accounts \n" + f"_v5.9.0 · ChaCha8 seed `20260509`_" + ) + + +# ─── Filter the raw edges ──────────────────────────────────────────────────── + + +filt = edges_raw[edges_raw["business_process"].isin(selected_processes)] +if fraud_only: + filt = filt[filt["is_fraud"]] +if anomaly_only: + filt = filt[filt["is_anomaly"]] + +if filt.empty: + st.warning("No edges match the current filter combination — relax the filters.") + st.stop() + +nodes_df, class_edges_df = aggregate_to_class(filt, coa_raw) + +class_edges_df = class_edges_df[class_edges_df["total_amount"] >= 10**min_amount_log] +class_edges_df = class_edges_df.nlargest(top_n, "total_amount") + +keep_classes = set(class_edges_df["from_class"]) | set(class_edges_df["to_class"]) +nodes_df = nodes_df[nodes_df["account_class"].isin(keep_classes)].copy() + +if class_edges_df.empty or nodes_df.empty: + st.warning("Filters produced an empty graph — relax the min-amount cutoff.") + st.stop() + + +# ─── Build agraph nodes/edges ──────────────────────────────────────────────── + + +max_node = nodes_df["total_flow"].max() +max_edge = class_edges_df["total_amount"].max() + +agraph_nodes = [] +for _, n in nodes_df.iterrows(): + color = ACCOUNT_TYPE_COLORS.get(str(n["account_type"]).lower(), ACCOUNT_TYPE_COLORS["other"]) + label = f"{n['account_class']}\n{str(n['account_class_name'])[:24]}" + title = ( + f"Class {n['account_class']} ({n['account_type']})\n" + f"{n['account_class_name']}\n" + f"Total flow: {fmt_money(n['total_flow'])}\n" + f"Edges: {int(n['total_count'])}\n" + f"In: 
{fmt_money(n['in_amount'])} ({int(n['in_count'])})\n" + f"Out: {fmt_money(n['out_amount'])} ({int(n['out_count'])})" + ) + agraph_nodes.append( + Node( + id=str(n["account_class"]), + label=label, + title=title, + size=node_size(n["total_flow"], max_node), + color=color, + font={"color": "#ffffff", "size": 11, "face": "monospace"}, + shape="dot", + ) + ) + +agraph_edges = [] +for _, e in class_edges_df.iterrows(): + fraud_pct = (e["fraud_count"] / e["edge_count"] * 100) if e["edge_count"] else 0.0 + title = ( + f"{e['from_class']} → {e['to_class']}\n" + f"Total: {fmt_money(e['total_amount'])}\n" + f"Edges: {int(e['edge_count'])}\n" + f"Fraud: {int(e['fraud_count'])} ({fraud_pct:.1f}%)\n" + f"Anomaly: {int(e['anomaly_count'])}" + ) + color = "#dc2626" if e["fraud_count"] > 0 else "#94a3b8" + agraph_edges.append( + Edge( + source=str(e["from_class"]), + target=str(e["to_class"]), + title=title, + color=color, + type="CURVE_SMOOTH", + width=edge_width(e["total_amount"], max_edge), + ) + ) + + +# ─── Layout ────────────────────────────────────────────────────────────────── + + +config = Config( + width=900, + height=650, + directed=True, + physics=(layout_mode == "force-directed"), + hierarchical=(layout_mode == "hierarchical"), +) + +graph_col, side_col = st.columns([3, 1]) +with graph_col: + selected = agraph(nodes=agraph_nodes, edges=agraph_edges, config=config) + +with side_col: + st.subheader("Summary") + sm1, sm2 = st.columns(2) + sm1.metric("Classes", len(nodes_df)) + sm2.metric("Edges", len(class_edges_df)) + st.metric("Total flow", fmt_money(class_edges_df["total_amount"].sum())) + st.metric("Fraud edges", int(class_edges_df["fraud_count"].sum())) + st.metric("Anomaly edges", int(class_edges_df["anomaly_count"].sum())) + + st.divider() + + if selected: + n_match = nodes_df[nodes_df["account_class"] == selected] + if not n_match.empty: + n = n_match.iloc[0] + color = ACCOUNT_TYPE_COLORS.get( + str(n["account_type"]).lower(), ACCOUNT_TYPE_COLORS["other"] + ) + 
st.markdown( + f"

" + f" " + f"{n['account_class']}

", + unsafe_allow_html=True, + ) + st.markdown(f"**{n['account_class_name']}** \n_{n['account_type']}_") + st.markdown( + f"- Total flow: **{fmt_money(n['total_flow'])}** \n" + f"- Out: {fmt_money(n['out_amount'])} ({int(n['out_count'])}) \n" + f"- In: {fmt_money(n['in_amount'])} ({int(n['in_count'])})" + ) + + outs = class_edges_df[class_edges_df["from_class"] == selected].nlargest( + 5, "total_amount" + ) + if not outs.empty: + st.markdown("**Top outgoing**") + for _, oe in outs.iterrows(): + st.markdown( + f"→ `{oe['to_class']}` · {fmt_money(oe['total_amount'])} " + f"({int(oe['edge_count'])} edges)" + ) + + ins = class_edges_df[class_edges_df["to_class"] == selected].nlargest( + 5, "total_amount" + ) + if not ins.empty: + st.markdown("**Top incoming**") + for _, ie in ins.iterrows(): + st.markdown( + f"← `{ie['from_class']}` · {fmt_money(ie['total_amount'])} " + f"({int(ie['edge_count'])} edges)" + ) + + subs = ( + coa_raw[coa_raw["account_class"] == selected] + .groupby(["account_sub_class", "account_sub_class_name"], as_index=False) + .size() + ) + if not subs.empty: + with st.expander(f"Level-3 sub-classes ({len(subs)})"): + for _, s in subs.iterrows(): + st.markdown( + f"`{s['account_sub_class']}` — {s['account_sub_class_name']}" + ) + else: + st.info("Selected class is not currently visible — relax filters.") + else: + st.info("Click a node in the graph to drill in.") + +st.divider() + +with st.expander("Top edges (table view)", expanded=False): + table = class_edges_df.assign( + total=class_edges_df["total_amount"].apply(fmt_money), + fraud_pct=(class_edges_df["fraud_count"] / class_edges_df["edge_count"] * 100).round(2), + )[ + [ + "from_class", + "to_class", + "total", + "edge_count", + "fraud_count", + "anomaly_count", + "fraud_pct", + ] + ].rename( + columns={ + "from_class": "From", + "to_class": "To", + "total": "Total $", + "edge_count": "Edges", + "fraud_count": "Fraud", + "anomaly_count": "Anomaly", + "fraud_pct": "Fraud %", + } + ) + 
    # Render the edge table built above (continues the expander opened earlier).
    st.dataframe(table, use_container_width=True, hide_index=True)

# Static explanatory footer describing the dataset and methodology.
with st.expander("About this Space", expanded=False):
    st.markdown(
        """
**What this is.** An interactive view of the v5.9.0 Method-A
accounting network published in
[`VynFi/vynfi-journal-entries-1m`](https://huggingface.co/datasets/VynFi/vynfi-journal-entries-1m).
The 61 656 line-level edges are aggregated to ISO 21378 Level-2
account classes (~30 nodes), so you can see the macro money-flow
structure at a glance.

**Method-A.** In v5.9.0 the JE network defaults to "Method A"
from Ivertowski 2024: exactly **one edge per 2-line journal entry**,
confidence = 1.0. This avoids the Cartesian explosion (225 M edges
on 1 M JEs) that the legacy `cartesian` method produced, and gives
a clean topology for graph-ML training.

**Edge attributes.** `business_process` (P2P / O2C / R2R / H2R / A2R),
`is_fraud`, `is_anomaly`, `posting_date`, `amount`, `confidence`,
`predecessor_edge_id` (chains 2-line JEs into longer document flows).

**Drill-down.** Click any class node to see the underlying Level-3
sub-classes (`A.A.A` / `A.A.B` / …) and the top in/out flows.

**Source.** [GitHub: mivertowski/SyntheticData](https://github.com/mivertowski/SyntheticData) ·
[Companion paper (SSRN)](https://ssrn.com/abstract=6538639)
 """
    )
diff --git a/spaces/accounting-network-explorer/preview.png b/spaces/accounting-network-explorer/preview.png
new file mode 100644
index 00000000..06430555
Binary files /dev/null and b/spaces/accounting-network-explorer/preview.png differ
diff --git a/spaces/accounting-network-explorer/requirements.txt b/spaces/accounting-network-explorer/requirements.txt
new file mode 100644
index 00000000..f75ee7bf
--- /dev/null
+++ b/spaces/accounting-network-explorer/requirements.txt
@@ -0,0 +1,5 @@
+streamlit==1.39.0
+streamlit-agraph==0.0.45
+pandas==2.2.3
+pyarrow==17.0.0
+huggingface_hub==0.26.2