Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
144 changes: 144 additions & 0 deletions configs/examples/hf/audit_p2p.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
# VynFi Audit P2P (vynfi-audit-p2p) — HF dataset regeneration recipe
#
# Document Flow with Fraud Labels. Showcases the P2P (purchase
# order → goods receipt → vendor invoice → payment) and O2C
# (sales order → delivery → customer invoice → customer receipt)
# document chains, with line-level fraud labels and the v5.8.0+
# `predecessor_line_id` chain context surfaced both on JE lines
# and via the `je_network.csv` flat edge list.
#
# Volumes target ≈230 documents across the four core P2P document
# types — small enough for quick prototyping, large enough to
# show realistic fraud-injection patterns at the document level.
# Manufacturing industry, 10 entities, 6 fiscal periods.
#
# Reproducibility:
#   - global.seed pinned for byte-stable regeneration on the
#     v5.9.0 release binary;
#   - graph_export.je_network.method: a (Method A from
#     Ivertowski 2024 — exactly one edge per 2-line journal entry,
#     no Cartesian-product blow-up on multi-line consolidations);
#   - cost-centers / profit-centers / employees masters generated so
#     `cost_center` / `profit_center` / `created_by` join cleanly to
#     their respective master tables (closes the v5.9.0 linkage gap).

# Run-wide generator settings. `seed` is pinned for reproducible
# regeneration; the horizon is six months starting 2024-01-01.
global:
  industry: manufacturing
  start_date: "2024-01-01"
  period_months: 6
  seed: 20260509
  group_currency: USD
  parallel: true
  worker_threads: 4
  memory_limit_mb: 4096

# Ten legal entities; the `volume_weight` values sum to 1.00.
companies:
  - code: "1000"
    name: "VynFi Mfg HQ"
    currency: USD
    country: US
    annual_transaction_volume: ten_k
    volume_weight: 0.18
    fiscal_year_variant: K4
  - code: "1100"
    name: "VynFi Mfg Plant 1"
    currency: USD
    country: US
    annual_transaction_volume: ten_k
    volume_weight: 0.14
    fiscal_year_variant: K4
  - code: "1200"
    name: "VynFi Mfg Plant 2"
    currency: USD
    country: US
    annual_transaction_volume: ten_k
    volume_weight: 0.12
    fiscal_year_variant: K4
  - code: "1300"
    name: "VynFi Mfg Plant 3"
    currency: USD
    country: US
    annual_transaction_volume: ten_k
    volume_weight: 0.10
    fiscal_year_variant: K4
  - code: "2000"
    name: "VynFi Mfg EU GmbH"
    currency: EUR
    country: DE
    annual_transaction_volume: ten_k
    volume_weight: 0.10
    fiscal_year_variant: K4
  - code: "2100"
    name: "VynFi Mfg UK Ltd"
    currency: GBP
    country: GB
    annual_transaction_volume: ten_k
    volume_weight: 0.08
    fiscal_year_variant: K4
  - code: "2200"
    name: "VynFi Mfg FR SAS"
    currency: EUR
    country: FR
    annual_transaction_volume: ten_k
    volume_weight: 0.08
    fiscal_year_variant: K4
  - code: "3000"
    name: "VynFi Mfg APAC SG"
    currency: SGD
    country: SG
    annual_transaction_volume: ten_k
    volume_weight: 0.08
    fiscal_year_variant: K4
  - code: "3100"
    name: "VynFi Mfg AU Pty"
    currency: AUD
    country: AU
    annual_transaction_volume: ten_k
    volume_weight: 0.06
    fiscal_year_variant: K4
  - code: "4000"
    name: "VynFi Mfg CA Inc"
    currency: CAD
    country: CA
    annual_transaction_volume: ten_k
    volume_weight: 0.06
    fiscal_year_variant: K4

# Chart-of-accounts size and industry tailoring.
chart_of_accounts:
  complexity: small
  industry_specific: true

# Master-data record counts per entity type. The employees master is
# the one the file header ties to the `created_by` join.
master_data:
  vendors: { count: 50, intercompany_percent: 0.10 }
  customers: { count: 60, intercompany_percent: 0.05 }
  materials: { count: 100, bom_enabled: true, average_bom_depth: 2 }
  fixed_assets: { count: 30 }
  employees: { count: 50 }

# Document flows are the headline content for this dataset.
# Volumes tuned so each of PO / GR / Invoice / Payment lands in the
# 50-80 range across the 6-month / 10-company horizon (≈ 1 chain
# per company per month for both P2P and O2C).
document_flows:
  # Purchase-to-pay chain settings.
  p2p:
    enabled: true
    chains_per_month: 1
    three_way_match_rate: 0.92
    partial_delivery_rate: 0.12
    gr_ir_clearing_enabled: true
  # Order-to-cash chain settings.
  o2c:
    enabled: true
    chains_per_month: 1
    credit_check_failure_rate: 0.04
    partial_shipment_rate: 0.10
    return_rate: 0.025

# Balance generation and validation switches, plus target ratios
# (32 % gross margin, DSO 45 days, DPO 30 days).
balance:
  generate_opening_balances: true
  generate_trial_balances: true
  reconcile_subledgers: true
  validate_balance_equation: true
  target_gross_margin: 0.32
  target_dso_days: 45
  target_dpo_days: 30

# Period-close processing is switched on.
period_close:
  enabled: true

# Financial-reporting output is switched on.
financial_reporting:
  enabled: true

# 3 % document-level fraud (matches the published 3 % rate); that
# figure is `document_fraud_rate`. `fraud_rate` is a separate, lower
# setting (2 %).
fraud:
  enabled: true
  fraud_rate: 0.02
  document_fraud_rate: 0.03
  propagate_to_lines: true

# Internal controls with COSO and entity-level controls switched on;
# exception rate 2 %, segregation-of-duties violation rate 1 %.
internal_controls:
  enabled: true
  coso_enabled: true
  include_entity_level_controls: true
  target_maturity_level: managed
  exception_rate: 0.02
  sod_violation_rate: 0.01

# Statistical distributions. Amounts use three weighted log_normal
# components (weights 0.65 / 0.28 / 0.07, summing to 1.00).
distributions:
  enabled: true
  industry_profile: manufacturing
  amounts:
    enabled: true
    distribution_type: log_normal
    components:
      - { weight: 0.65, mu: 5.5, sigma: 1.3, label: "routine" }
      - { weight: 0.28, mu: 7.8, sigma: 0.9, label: "significant" }
      - { weight: 0.07, mu: 9.5, sigma: 0.6, label: "major" }
    benford_compliance: true

# Temporal behaviour: business-day handling, holiday calendars and
# period-end volume multipliers.
temporal_patterns:
  enabled: true
  business_days:
    enabled: true
    half_day_policy: half_day
    month_end_convention: modified_following
  # NOTE(review): regions cover US, DE, GB and SG, while the companies
  # in this recipe also use FR, AU and CA — confirm the omission is
  # intentional.
  calendars:
    regions: [US, DE, GB, SG]
  period_end:
    model: exponential
    month_end: { start_day: -10, base_multiplier: 1.0, peak_multiplier: 3.5, decay_rate: 0.3 }
    quarter_end: { inherit_from: month_end, additional_multiplier: 1.5 }

# Audit generation is switched off for this recipe.
audit:
  enabled: false  # Audit data has its own dataset

# Method A flat edge-list for the accounting network. Provides a
# bonus artefact alongside the document-flow tables — every
# 2-line JE in this dataset gets one edge, joinable back to the
# row-level JE table via from_line_id / to_line_id.
graph_export:
  je_network:
    method: a

# Output location and formats (CSV and JSON).
output:
  output_directory: "./output"
  formats: [csv, json]
159 changes: 159 additions & 0 deletions configs/examples/hf/supply_chain_ocel.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
# VynFi Supply Chain OCEL (vynfi-supply-chain-ocel) — HF dataset
# regeneration recipe.
#
# Native OCEL 2.0 event log from a manufacturing supply chain.
# Process types covered: P2P, O2C, manufacturing operations. Event
# log is the primary artefact, with object-level data
# (orders / invoices / shipments / receipts) and anomaly labels
# alongside.
#
# Volumes (post-v5.9.0 refresh): targeting ~100k events for
# usability — the v0.x dataset shipped 20 010 events, which was
# quick to prototype against but tight for any serious process
# mining. Scaling: 5 companies × 12 months × ~120 chains for
# each of P2P and O2C (≈14 k chains × ~7 events per chain) puts
# the event log in the 90 k–110 k range, with proportional
# growth in objects and anomaly labels. Per-company JE volume is
# kept at `ten_k` to bound the in-memory orchestrator footprint
# during OCPM event-log construction: with `hundred_k` × 5 × 12,
# the ~700 k JEs produced alongside ~14 k chains pushed peak heap
# past 8 GB during native-OCEL graph assembly.
#
# Reproducibility:
#   - global.seed pinned;
#   - 3 % document-level fraud injection (matches the published
#     3 % rate);
#   - ChaCha8 PRNG output is platform-stable.

# Run-wide generator settings. `seed` is pinned for reproducible
# regeneration; the horizon is twelve months starting 2024-01-01.
global:
  industry: manufacturing
  start_date: "2024-01-01"
  period_months: 12
  seed: 20260509
  group_currency: USD
  parallel: true
  worker_threads: 4
  memory_limit_mb: 8192

# Five subsidiaries: one HQ, three plants, one international
# entity. Big enough to exercise multi-company joins on objects;
# small enough to keep event counts focused on supply-chain
# behaviour rather than entity proliferation.
# The `volume_weight` values sum to 1.00.
companies:
  - code: "1000"
    name: "VynFi Mfg HQ"
    currency: USD
    country: US
    annual_transaction_volume: ten_k
    volume_weight: 0.30
    fiscal_year_variant: K4
  - code: "1100"
    name: "VynFi Mfg Plant 1"
    currency: USD
    country: US
    annual_transaction_volume: ten_k
    volume_weight: 0.22
    fiscal_year_variant: K4
  - code: "1200"
    name: "VynFi Mfg Plant 2"
    currency: USD
    country: US
    annual_transaction_volume: ten_k
    volume_weight: 0.18
    fiscal_year_variant: K4
  - code: "2000"
    name: "VynFi Mfg EU GmbH"
    currency: EUR
    country: DE
    annual_transaction_volume: ten_k
    volume_weight: 0.18
    fiscal_year_variant: K4
  - code: "3000"
    name: "VynFi Mfg APAC SG"
    currency: SGD
    country: SG
    annual_transaction_volume: ten_k
    volume_weight: 0.12
    fiscal_year_variant: K4

# Chart-of-accounts size and industry tailoring.
chart_of_accounts:
  complexity: medium
  industry_specific: true

# Master-data record counts per entity type.
master_data:
  vendors: { count: 100, intercompany_percent: 0.10 }
  customers: { count: 120, intercompany_percent: 0.05 }
  materials: { count: 400, bom_enabled: true, average_bom_depth: 3 }
  fixed_assets: { count: 50 }
  employees: { count: 100 }

# Chain volume is the primary driver of event count — each
# P2P / O2C chain produces ~6-8 OCEL events (PO created →
# released → GR posted → invoice received → invoice posted →
# payment posted → payment cleared). 14 k chains × ~7 events
# ≈ 100 k events.
#
# NOTE(review): `chains_per_month: 360` does not obviously reconcile
# with the "~120 chains" per company per month quoted in the file
# header (5 × 12 × 120 × 2 ≈ 14 k chains) — confirm the unit of
# this setting before relying on the event-count estimate.
document_flows:
  # Purchase-to-pay chain settings.
  p2p:
    enabled: true
    chains_per_month: 360
    three_way_match_rate: 0.92
    partial_delivery_rate: 0.12
    gr_ir_clearing_enabled: true
  # Order-to-cash chain settings.
  o2c:
    enabled: true
    chains_per_month: 360
    credit_check_failure_rate: 0.04
    partial_shipment_rate: 0.10
    return_rate: 0.025

# Balance generation and validation switches.
balance:
  generate_opening_balances: true
  generate_trial_balances: true
  reconcile_subledgers: true
  validate_balance_equation: true

# Period-close processing is switched on.
period_close:
  enabled: true

# 3 % document-level fraud (matches the published rate of the
# previous dataset version); that figure is `document_fraud_rate`.
# Anomaly labels are produced for every fraudulent event and for
# selected non-fraudulent events in the OCEL log, joinable to
# events via `event_id`.
fraud:
  enabled: true
  fraud_rate: 0.02
  document_fraud_rate: 0.03
  propagate_to_lines: true

# Internal controls with COSO and entity-level controls switched on;
# exception rate 2 %, segregation-of-duties violation rate 1 %.
internal_controls:
  enabled: true
  coso_enabled: true
  include_entity_level_controls: true
  target_maturity_level: managed
  exception_rate: 0.02
  sod_violation_rate: 0.01

# Statistical distributions. Amounts use three weighted log_normal
# components (weights 0.65 / 0.28 / 0.07, summing to 1.00).
distributions:
  enabled: true
  industry_profile: manufacturing
  amounts:
    enabled: true
    distribution_type: log_normal
    components:
      - { weight: 0.65, mu: 5.5, sigma: 1.3, label: "routine" }
      - { weight: 0.28, mu: 7.8, sigma: 0.9, label: "significant" }
      - { weight: 0.07, mu: 9.5, sigma: 0.6, label: "major" }
    benford_compliance: true

# Temporal behaviour: business-day handling, holiday calendars and
# period-end volume multipliers. The calendar regions (US, DE, SG)
# match the countries of the five companies in this recipe.
temporal_patterns:
  enabled: true
  business_days:
    enabled: true
    half_day_policy: half_day
    month_end_convention: modified_following
  calendars:
    regions: [US, DE, SG]
  period_end:
    model: exponential
    month_end: { start_day: -10, base_multiplier: 1.0, peak_multiplier: 3.5, decay_rate: 0.3 }
    quarter_end: { inherit_from: month_end, additional_multiplier: 1.5 }

# OCPM event log: native OCEL 2.0 generation with all relevant
# output formats. `compute_variants: true` plus
# `include_object_relationships: true` mirror the v0.x dataset's
# `events`, `objects`, `anomaly_labels`, `document_events` tables.
ocpm:
  enabled: true
  generate_lifecycle_events: true
  include_object_relationships: true
  compute_variants: true
  max_variants: 0  # unlimited
  # OCPM-specific output toggles; distinct from the top-level
  # `output:` section of this file.
  output:
    ocel_json: true
    flattened_csv: true
    event_object_csv: true
    object_relationship_csv: true
    variants_csv: true

# Audit generation is switched off for this recipe.
audit:
  enabled: false  # Audit data has its own dataset

# Method-A flat edge-list provided for completeness — joinable to
# the OCEL events via shared document_id + entry_date.
graph_export:
  je_network:
    method: a

# Output location and formats (CSV and JSON).
output:
  output_directory: "./output"
  formats: [csv, json]
Loading
Loading