From cca9f43088ed4407fb3cb3dca9de5b1c716e39cf Mon Sep 17 00:00:00 2001 From: user Date: Wed, 26 Nov 2025 02:37:02 +0530 Subject: [PATCH 1/7] updated network_infrastructure queries --- ...e_failure_impact_radius_visualization.gsql | 20 + ..._visualization_with_subgraph_topology.gsql | 19 + ...nstream_device_topology_visualization.gsql | 23 + .../explore_topology_from_all_router.gsql | 14 + ...xplore_topology_from_multiple_routers.gsql | 23 + .../explore_topology_from_one_router.gsql | 24 + ...nts_by_impacted_device_and_time_range.gsql | 29 + ...d_events_by_time_range_and_event_type.gsql | 24 + ...tial_incident_source_of_event_by_time.gsql | 25 + ..._related_events_from_incident_by_time.gsql | 29 + .../find_unsecured_servers_visualization.gsql | 25 +- .../incident_impact_by_max_radius.gsql | 25 + .../top_k_devices_with_most_alerts.gsql | 19 + .../top_k_devices_with_most_incidents.gsql | 19 + .../create_network_infrastructure_graph.gsql | 2 +- .../meta/application.json | 1262 +++++++++++++++++ .../queries/queries.gsql | 304 +++- .../supply_chain_management/readme.md | 82 +- 18 files changed, 1914 insertions(+), 54 deletions(-) create mode 100644 agile_operations/supply_chain_management/meta/application.json diff --git a/agile_operations/network_infrastructure/queries/device_failure_impact_radius_visualization.gsql b/agile_operations/network_infrastructure/queries/device_failure_impact_radius_visualization.gsql index c22df26b..ef65600c 100644 --- a/agile_operations/network_infrastructure/queries/device_failure_impact_radius_visualization.gsql +++ b/agile_operations/network_infrastructure/queries/device_failure_impact_radius_visualization.gsql @@ -1,6 +1,26 @@ CREATE OR REPLACE QUERY device_failure_impact_radius_visualization ( VERTEX device ) { + /* + Query Name: + device_failure_impact_radius_visualization + + Purpose: + 1. Find devices that do NOT have an alternative communication path. + 2. Determine devices that will be impacted if the input device fails. + 3. 
Display impacted devices and their interconnections. + + Concept: + - If a device has an alternative path (redundant connectivity), it won't fail. + - Devices without alternative paths are marked as 'impacted'. + + Inputs: + - device: The device whose failure we are analyzing. + + Outputs: + - impacted_devices: Devices that will fail if the input device fails. + - @@edges_to_display: Edges between impacted devices for visual impact analysis. + */ SetAccum @@edges_to_display; OrAccum @has_alternative_path; diff --git a/agile_operations/network_infrastructure/queries/device_failure_impact_radius_visualization_with_subgraph_topology.gsql b/agile_operations/network_infrastructure/queries/device_failure_impact_radius_visualization_with_subgraph_topology.gsql index 207200de..d8754b6e 100644 --- a/agile_operations/network_infrastructure/queries/device_failure_impact_radius_visualization_with_subgraph_topology.gsql +++ b/agile_operations/network_infrastructure/queries/device_failure_impact_radius_visualization_with_subgraph_topology.gsql @@ -1,6 +1,25 @@ CREATE OR REPLACE QUERY device_failure_impact_radius_visualization_with_subgraph_topology ( VERTEX device ) { + /* + Query Name: + device_failure_impact_radius_visualization_with_subgraph_topology + + Purpose: + 1. Build the subgraph that contains the input device (based on connectivity). + 2. Explore and detect devices that have alternative network paths (redundancy). + 3. Identify devices that will be impacted (fail) if the given device fails. + 4. Collect and return visualization data: subgraph topology and failure impact edges. + + Input: + - device: The starting device whose failure impact we want to analyze. + + Output: + - all_vertices_in_subgraph: Devices in the connectivity region of the input device. + - @@edges_to_display_in_subgraph: Edges representing overall subgraph structure. + - impacted_devices: Devices that do not have an alternative path (will fail). 
+ - @@edges_to_display: Edges among impacted devices (failure impact radius). + */ SetAccum @@edges_to_display; SetAccum @@edges_to_display_in_subgraph; OrAccum @has_alternative_path; diff --git a/agile_operations/network_infrastructure/queries/downstream_device_topology_visualization.gsql b/agile_operations/network_infrastructure/queries/downstream_device_topology_visualization.gsql index d39e7c98..2d52a4f0 100644 --- a/agile_operations/network_infrastructure/queries/downstream_device_topology_visualization.gsql +++ b/agile_operations/network_infrastructure/queries/downstream_device_topology_visualization.gsql @@ -2,6 +2,29 @@ CREATE OR REPLACE QUERY downstream_device_topology_visualization ( VERTEX device, UINT k_hop_switch_limit = 3 ) { + /* + Query Name: + downstream_device_topology_visualization + + Purpose: + Visualize the downstream topology from a given device, following the device hierarchy: + Router → Firewall → Switch → Server + Also explores multiple downstream Switch layers (k-hop depth traversal). + + Key Features: + ✔ Identifies all downstream devices classified by device type. + ✔ Follows device hierarchy dynamically based on input device type. + ✔ Limits multi-hop Switch traversal using k_hop_switch_limit. + ✔ Returns devices and connecting edges for visualization. + + Inputs: + - device: Starting device. + - k_hop_switch_limit: Maximum depth for switch-to-switch iterations (default = 3). + + Outputs: + - impacted_devices: All discovered downstream devices. + - @@edges_to_display: Edges to visualize the downstream network path. 
+*/ SetAccum @@impacted_devices; SetAccum @@edges_to_display; diff --git a/agile_operations/network_infrastructure/queries/explore_topology_from_all_router.gsql b/agile_operations/network_infrastructure/queries/explore_topology_from_all_router.gsql index 4f8862e5..4a8700e5 100644 --- a/agile_operations/network_infrastructure/queries/explore_topology_from_all_router.gsql +++ b/agile_operations/network_infrastructure/queries/explore_topology_from_all_router.gsql @@ -1,4 +1,18 @@ CREATE OR REPLACE QUERY explore_topology_from_all_router () { + /* + Query Name: + explore_topology_from_all_router + + Purpose: + Visualize the entire network topology by: + - Retrieving all devices in the graph + - Exploring all connected edges using Connect_To + - Displaying both devices and their connections + + Outputs: + - all_devices_with_connections: List of devices connected via Connect_To edges + - @@edges_to_display: All edges among connected devices for visualization +*/ SetAccum @@edges_to_display; all_devices = {Device.*}; diff --git a/agile_operations/network_infrastructure/queries/explore_topology_from_multiple_routers.gsql b/agile_operations/network_infrastructure/queries/explore_topology_from_multiple_routers.gsql index 2f7f345a..651c97aa 100644 --- a/agile_operations/network_infrastructure/queries/explore_topology_from_multiple_routers.gsql +++ b/agile_operations/network_infrastructure/queries/explore_topology_from_multiple_routers.gsql @@ -1,4 +1,27 @@ CREATE OR REPLACE QUERY explore_topology_from_multiple_routers (SET> starter_router_set) { + /* + Query Name: + explore_topology_from_multiple_routers + + Purpose: + Visualizes the downstream network topology starting from one or more routers. + Traverses devices in this structured order: + Router → Firewall → Switch → Server + + Key Features: + ✔ Uses BFS to exhaustively discover downstream switches. + + Inputs: + starter_router_set — Set of router vertices to start the topology exploration. 
+ If empty, all routers in the graph are selected automatically. + + Outputs: + - all_visited_router_devices — Routers connected to input routers via Device_Has_Type + - all_visited_firewalls — First downstream stage (Firewalls) + - all_visited_switches — All connected Switches (multi-hop via BFS) + - all_visited_servers — Downstream Servers + - @@edges_to_display — All edges to display full topology +*/ OrAccum @visited; SetAccum @@edges_to_display; diff --git a/agile_operations/network_infrastructure/queries/explore_topology_from_one_router.gsql b/agile_operations/network_infrastructure/queries/explore_topology_from_one_router.gsql index 38f7e251..f916eed0 100644 --- a/agile_operations/network_infrastructure/queries/explore_topology_from_one_router.gsql +++ b/agile_operations/network_infrastructure/queries/explore_topology_from_one_router.gsql @@ -1,4 +1,28 @@ CREATE OR REPLACE QUERY explore_topology_from_one_router (VERTEX starter_router) { + /* + Query Name: + explore_topology_from_one_router + + Purpose: + Visualizes the downstream network topology starting from a single router. + Traverses devices in this structured order: + Router → Firewall → Switch → Server + + Key Features: + ✔ Discovers all downstream Firewalls and Switches (including bypass paths) + ✔ Uses BFS to find all connected Switches (multi-hop exploration) + ✔ Captures all edges forming the full topology view + + Input: + starter_router — A single Router vertex that acts as the exploration starting point. 
+ + Outputs: + - all_visited_router_devices — Devices directly connected to starter router via Device_Has_Type + - all_visited_firewalls — Firewalls downstream of the router + - all_visited_switches — All discovered Switches (including BFS expansion) + - all_visited_servers — Servers connected downstream of switches + - @@edges_to_display — Complete collection of edges forming the explored topology +*/ OrAccum @visited; SetAccum @@edges_to_display; diff --git a/agile_operations/network_infrastructure/queries/find_events_by_impacted_device_and_time_range.gsql b/agile_operations/network_infrastructure/queries/find_events_by_impacted_device_and_time_range.gsql index 924cfcf9..9945fbd3 100644 --- a/agile_operations/network_infrastructure/queries/find_events_by_impacted_device_and_time_range.gsql +++ b/agile_operations/network_infrastructure/queries/find_events_by_impacted_device_and_time_range.gsql @@ -4,6 +4,35 @@ CREATE OR REPLACE QUERY find_events_by_impacted_device_and_time_range ( DATETIME end_time, BOOL show_event_types_vis = FALSE ) { + /* + Query Name: + find_events_by_impacted_device_and_time_range + + Purpose: + Finds and visualizes events linked to a specific device within a given time range. + Also traces time hierarchy (Minute → Hour → Date → Month → Year) for chronological visualization. + Optionally includes detailed event type information (Alert, Incident, and their classifications). + + Key Features: + ✔ Filters events by impacted device and time window. + ✔ Visualizes full time-based hierarchy for each event. + ✔ Option to include related event type, alert, and incident classifications. + ✔ Collects all traversal edges for easy visualization. + + Inputs: + input_device — The impacted device to search events for. + start_time — Minimum timestamp of events to include. + end_time — Maximum timestamp of events to include. + show_event_types_vis — (Optional) If TRUE, includes event/alert/incident types. 
+ + Outputs: + - linked_events_within_time — Events impacting the input device within the time range. + - linked_time_date_minute / hour / date / month / year — Chronologically related time vertices. + - linked_event_types — Event classification (if enabled). + - linked_alerts_within_time, linked_incidents_within_time — Associated alerts and incidents. + - linked_alert_types, linked_incident_types — Alert/Incident categories. + - @@edges_to_display — All edges used to visualize event and time relationships. +*/ SetAccum @@edges_to_display; input_device_set = {input_device}; diff --git a/agile_operations/network_infrastructure/queries/find_events_by_time_range_and_event_type.gsql b/agile_operations/network_infrastructure/queries/find_events_by_time_range_and_event_type.gsql index 99d6ade8..5be011a7 100644 --- a/agile_operations/network_infrastructure/queries/find_events_by_time_range_and_event_type.gsql +++ b/agile_operations/network_infrastructure/queries/find_events_by_time_range_and_event_type.gsql @@ -3,6 +3,30 @@ CREATE OR REPLACE QUERY find_events_by_time_range_and_event_type ( DATETIME end_time, STRING input_event_type_filter = "" ) { + /* + Query Name: + find_events_by_time_range_and_event_type + + Purpose: + Retrieves events occurring within a specific time range and optionally filters by event type. + Additionally, collects related metadata such as impacted devices, alert types, + and incident types for comprehensive event analysis. + + Inputs: + start_time — Minimum datetime filter for event retrieval. + end_time — Maximum datetime filter for event retrieval. + input_event_type_filter — (Optional) Filters by event type. If empty, all types are included. + + Outputs: + - selected_events_with_info: + • event_id — Event identifier + • event_time — Timestamp of the event + • event_type — Type classification (Security, System, Network, etc.) 
+ • event_alert_type — Enriched alert type data (if any) + • event_incident_type — Enriched incident type data (if any) + • impacted_devices_list — Devices affected by this event + +*/ MaxAccum @event_type; MaxAccum @incident_type; MaxAccum @alert_type; diff --git a/agile_operations/network_infrastructure/queries/find_potential_incident_source_of_event_by_time.gsql b/agile_operations/network_infrastructure/queries/find_potential_incident_source_of_event_by_time.gsql index c88da0ca..92d42c70 100644 --- a/agile_operations/network_infrastructure/queries/find_potential_incident_source_of_event_by_time.gsql +++ b/agile_operations/network_infrastructure/queries/find_potential_incident_source_of_event_by_time.gsql @@ -3,6 +3,31 @@ CREATE OR REPLACE QUERY find_potential_incident_source_of_event_by_time ( INT max_radius = 3, INT num_seconds_before_event_start = 3600 ) { + /* + Query Name: + find_potential_incident_source_of_event_by_time + + Purpose: + Identifies potential root-cause incidents for a given input event by: + • Retrieving impacted devices of the event + • Exploring connected devices within a specified hop radius + • Scanning for past incident events within a defined timeframe + • Linking discovered incidents with their incident types + + Inputs: + input_event — The event vertex to investigate potential source incidents for + max_radius — Max number of hops allowed for device connectivity exploration (default: 3) + num_seconds_before_event_start — Time window (in seconds) before input_event.timestamp to search for related incidents (default: 3600) + + Outputs: + • input_event_set — Original event input + • related_devices_within_radius — Connected devices marked by event radius + • incident_events_from_related_devices — Incident events found in time range + • incidents_from_related_devices — Linked incident entities + • incident_types_of_related_devices — Final categorized incident types + • @@edges_to_display — All traversal edges for visualization/UI mapping 
+ +*/ SetAccum @@related_devices_set; SetAccum @@edges_to_display; MinAccum @@end_time_accum; diff --git a/agile_operations/network_infrastructure/queries/find_potential_related_events_from_incident_by_time.gsql b/agile_operations/network_infrastructure/queries/find_potential_related_events_from_incident_by_time.gsql index e27fbee7..c69c369d 100644 --- a/agile_operations/network_infrastructure/queries/find_potential_related_events_from_incident_by_time.gsql +++ b/agile_operations/network_infrastructure/queries/find_potential_related_events_from_incident_by_time.gsql @@ -3,6 +3,35 @@ CREATE OR REPLACE QUERY find_potential_related_events_from_incident_by_time ( INT max_radius = 3, INT num_seconds_from_incident_start = 3600 ) { + /* + Query Name: + find_potential_related_events_from_incident_by_time + + Purpose: + Identifies events that could be potentially related to a given input incident. + It does so by: + • Discovering all devices impacted by the incident + • Expanding outward through connected devices within max_radius hops + • Searching for events (Alerts, Incidents) that occurred within a time window + starting from the incident occurrence time + • Classifying and linking detected related alerts and incidents with types + + Inputs: + input_incident — Starting Incident vertex for event correlation analysis + max_radius — Max number of hops to discover connected impacted devices (default: 3) + num_seconds_from_incident_start — Time window (in seconds) after incident start to look for related events (default: 3600) + + Outputs: + • input_incident_set — Original input incident + • linked_event — Event directly linked to input incident + • impacted_devices_within_radius — All devices reached via radius traversal + • alerts_from_impacted_devices — Related Alerts discovered in time range + • incidents_from_impacted_devices — Related Incidents discovered + • alert_types_of_impacted_devices — Enriched alert category details + • incident_types_of_impacted_devices — 
Enriched incident category details + • @@edges_to_display — All collected edges for UI / Graph visualization + +*/ SetAccum @@impacted_devices_set; SetAccum @@edges_to_display; MinAccum @@start_time_accum; diff --git a/agile_operations/network_infrastructure/queries/find_unsecured_servers_visualization.gsql b/agile_operations/network_infrastructure/queries/find_unsecured_servers_visualization.gsql index be31a400..926d0dad 100644 --- a/agile_operations/network_infrastructure/queries/find_unsecured_servers_visualization.gsql +++ b/agile_operations/network_infrastructure/queries/find_unsecured_servers_visualization.gsql @@ -1,4 +1,27 @@ -CREATE OR REPLACE QUERY find_unsecured_servers_visualization (UINT k_hop_switch_limit = 3) { +CREATE OR REPLACE QUERY find_unsecured_servers_visualization (UINT k_hop_switch_limit = 3) { + /* + Query: find_unsecured_servers_visualization + + Purpose: + Visualizes unsecured network paths from Routers to Servers through Switches. + It identifies Servers that are reachable via Switches without passing through security devices (like Firewalls). + + What It Does: + • Finds all Routers, Switches, and Servers. + • Traverses paths from Routers → Switches → Servers using Connect_To edges. + • Expands through Switch-to-Switch connections up to 'k_hop_switch_limit' hops. + • Collects all involved vertices and edges for visualization. 
+ + Key Outputs: + - routers_to_display → Starting routers + - switches_to_display → Switches on the unsecured path + - servers_to_display → Potentially unsecured servers + - edges_to_display → All traversal edges for graph visualization + + Parameter: + k_hop_switch_limit → Maximum number of Switch-to-Switch traversal hops (default: 3) + +*/ SetAccum @@routers_to_display; SetAccum @@switches_to_display; diff --git a/agile_operations/network_infrastructure/queries/incident_impact_by_max_radius.gsql b/agile_operations/network_infrastructure/queries/incident_impact_by_max_radius.gsql index ad6e3a9d..855d50b9 100644 --- a/agile_operations/network_infrastructure/queries/incident_impact_by_max_radius.gsql +++ b/agile_operations/network_infrastructure/queries/incident_impact_by_max_radius.gsql @@ -2,6 +2,31 @@ CREATE OR REPLACE QUERY incident_impact_by_max_radius ( VERTEX input_incident, INT max_radius = 4 ) { + /* + Query: incident_impact_by_max_radius + + Purpose: + Determines how far an incident can propagate through the network. + It finds all devices that may be impacted by a given incident, + up to a specified hop limit (max_radius). + + What It Does: + 1. Gets the Event linked to the input Incident. + 2. Finds devices directly impacted by the incident (radius 0). + 3. Iteratively explores additional devices connected via Connect_To edges, + marking each with its hop distance (incident_radius). + 4. Collects all impacted devices and edges for visualization. 
+ + Key Outputs: + - input_incident_set → The provided incident + - linked_event → Event associated with the incident + - impacted_devices_within_radius → All potentially impacted devices + - @@edges_to_display → Edges used during traversal + + Parameters: + max_radius → Max number of hops to explore propagation (default: 4) + +*/ SetAccum @@impacted_devices_set; SetAccum @@edges_to_display; OrAccum @visited; diff --git a/agile_operations/network_infrastructure/queries/top_k_devices_with_most_alerts.gsql b/agile_operations/network_infrastructure/queries/top_k_devices_with_most_alerts.gsql index 16b2a08f..c8794393 100644 --- a/agile_operations/network_infrastructure/queries/top_k_devices_with_most_alerts.gsql +++ b/agile_operations/network_infrastructure/queries/top_k_devices_with_most_alerts.gsql @@ -1,4 +1,23 @@ CREATE OR REPLACE QUERY top_k_devices_with_most_alerts (INT k) { + /* + Query: top_k_devices_with_most_alerts + + Purpose: + Identifies the devices that are associated with the highest number of alerts. + Useful for prioritizing monitoring, troubleshooting, and risk assessment. + + What It Does: + 1. Counts how many alerts are linked to each Event. + 2. Aggregates alert counts from Events to connected Devices. + 3. Sorts devices by total alert frequency in descending order. + 4. Returns the top K most alert-prone devices. 
+ + Key Outputs: + - devices → Top K devices with highest aggregated alert_count + + Parameter: + k → Number of devices to return (top K) +*/ SumAccum @alert_count; diff --git a/agile_operations/network_infrastructure/queries/top_k_devices_with_most_incidents.gsql b/agile_operations/network_infrastructure/queries/top_k_devices_with_most_incidents.gsql index 80721d69..44104d40 100644 --- a/agile_operations/network_infrastructure/queries/top_k_devices_with_most_incidents.gsql +++ b/agile_operations/network_infrastructure/queries/top_k_devices_with_most_incidents.gsql @@ -1,4 +1,23 @@ CREATE OR REPLACE QUERY top_k_devices_with_most_incidents (INT k) { + /* + Query: top_k_devices_with_most_incidents + + Purpose: + Identifies devices that are associated with the highest number of incidents. + Helps in recognizing critical devices prone to failures, security breaches, or outages. + + What It Does: + 1. Counts the number of incidents linked to each Event. + 2. Propagates incident counts from Events to connected Devices. + 3. Sorts devices by total incident frequency in descending order. + 4. Returns the top K devices with the most incidents. + + Key Output: + - devices → List of top K devices ranked by incident_count + + Parameter: + k → Number of top devices to return. 
+*/ SumAccum @incident_count; diff --git a/agile_operations/network_infrastructure/schema/create_network_infrastructure_graph.gsql b/agile_operations/network_infrastructure/schema/create_network_infrastructure_graph.gsql index 147c41b0..112fa328 100644 --- a/agile_operations/network_infrastructure/schema/create_network_infrastructure_graph.gsql +++ b/agile_operations/network_infrastructure/schema/create_network_infrastructure_graph.gsql @@ -13,7 +13,7 @@ CREATE SCHEMA_CHANGE JOB init_network_infrastructure_schema FOR GRAPH Network_In ADD VERTEX Event_Type(PRIMARY_ID id STRING) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; ADD VERTEX Time_Date_Hour(PRIMARY_ID id DATETIME) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; ADD VERTEX Time_Date_Month(PRIMARY_ID id STRING) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; - ADD VERTEX Time_Date_Minute(PRIMARY_ID id DATETIME) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; + ADD VERTEX Time_Date_Minute(PRIMARY_ID id STRING) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; ADD VERTEX Router(PRIMARY_ID id STRING) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; ADD VERTEX Firewall(PRIMARY_ID id STRING) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; ADD VERTEX Switch(PRIMARY_ID id STRING) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"; diff --git a/agile_operations/supply_chain_management/meta/application.json b/agile_operations/supply_chain_management/meta/application.json new file mode 100644 index 00000000..955d55df --- /dev/null +++ b/agile_operations/supply_chain_management/meta/application.json @@ -0,0 +1,1262 @@ +{ + "defaultGraph": "Supply_Chain_Management", + "globalVariables": [], + "iconURL": "/insights/static/media/factory.4354dc741f4585127af6b70c9821a211.svg", + "id": "3fjAZmRYTeABxn4ncYY7M3", + "owner": "tigergraph", + "pageConfigSeparated": true, + 
"pages": [ + { + "chartMap": { + "6wa4sFXLXHJmWQzGJeDojT": { + "chartSettings": { + "category": [ + { + "id": "v_id", + "type": "string" + } + ], + "showColumns": [ + { + "isChecked": true, + "name": "v_id" + }, + { + "isChecked": false, + "name": "v_type" + }, + { + "isChecked": true, + "name": "customers.@total_quantity" + } + ], + "sortedColumns": [ + "v_id", + "customers.@total_quantity" + ], + "tableColumns": [ + { + "id": "Matched pattern", + "isChecked": "true", + "label": "" + }, + { + "children": [], + "id": "Vertices", + "isChecked": "true", + "isExpanded": true, + "label": "Vertices" + }, + { + "children": [], + "id": "Edges", + "isChecked": "true", + "isExpanded": false, + "label": "Edges" + } + ], + "tableHeaders": [ + "v_id", + "v_type", + "customers.@total_quantity" + ], + "value": [ + { + "id": "customers.@total_quantity", + "type": "number" + } + ] + }, + "graphName": "Supply_Chain_Management", + "hideWidgetName": false, + "id": "6wa4sFXLXHJmWQzGJeDojT", + "patternLimit": 5, + "query": "", + "queryType": "pattern", + "refreshRate": 0, + "searchPattern": [ + { + "data": "get_biggest_customers", + "id": "get_biggest_customers", + "type": "QUERY" + }, + { + "data": "Top_K", + "id": "d01266d0-d7b3-451d-bb0b-71969d3dac88", + "paramGlobalInput": "Top_K", + "paramName": "top_k", + "paramType": "INT", + "paramTypeReadonly": true, + "type": "PARAM" + }, + { + "data": "", + "id": "e644912d-4d39-4544-8b97-afa36fbfcbdc", + "paramGlobalInput": "", + "paramName": "start_date", + "paramType": "DATETIME", + "paramTypeReadonly": true, + "type": "PARAM" + }, + { + "data": "", + "id": "ad9bd726-e56d-4edf-b21e-7c1ef2bb66aa", + "paramGlobalInput": "", + "paramName": "end_date", + "paramType": "DATETIME", + "paramTypeReadonly": true, + "type": "PARAM" + }, + { + "data": "", + "elementType": "VERTEX", + "id": "b91d25e5-d27a-43d5-9b2b-0e2ef77bc969", + "paramGlobalInput": "", + "paramName": "input_nations", + "paramType": "SET", + "paramTypeReadonly": true, + "type": 
"PARAM" + } + ], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "Customers", + "type": "bar", + "version": "1763931181376086659" + }, + "aepZYhHtdrKMZscobrGixq": { + "chartSettings": { + "showColumns": [ + { + "isChecked": true, + "name": "p" + }, + { + "isChecked": true, + "name": "quantity" + } + ], + "sortedColumns": [ + "p", + "quantity" + ], + "tableColumns": [ + { + "id": "Matched pattern", + "isChecked": "true", + "label": "" + }, + { + "children": [], + "id": "Vertices", + "isChecked": "true", + "isExpanded": true, + "label": "Vertices" + }, + { + "children": [], + "id": "Edges", + "isChecked": "true", + "isExpanded": false, + "label": "Edges" + } + ], + "tableHeaders": [ + "p", + "quantity" + ] + }, + "graphName": "Supply_Chain_Management", + "hideWidgetName": false, + "id": "aepZYhHtdrKMZscobrGixq", + "patternLimit": 5, + "query": "", + "queryType": "pattern", + "refreshRate": 0, + "searchPattern": [ + { + "data": "top_k_products_sales_order", + "id": "top_k_products_sales_order", + "type": "QUERY" + }, + { + "data": "Top_K", + "id": "f0c011d4-d5d2-49fa-ba8c-3dd430d405af", + "paramGlobalInput": "Top_K", + "paramName": "k", + "paramType": "INT", + "paramTypeReadonly": true, + "type": "PARAM" + } + ], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "Top Products Sales Order", + "type": "table", + "version": "1763931489536489074" + }, + "d1F1DV6GoqU8eBNjYT53b6": { + "chartSettings": { + "borderRadius": 26, + "borderWidth": 16, + "category": [ + { + "id": "v_id", + "type": "string" + } + ], + "radius": [ + 0, + 74 + ], + "roseType": true, + "showColumns": [ + { + "isChecked": true, + "name": "v_id" + }, + { + "isChecked": false, + "name": "v_type" + }, + { + "isChecked": true, + "name": "suppliers.@total_quantity" + } + ], + "showLegend": false, + "sortedColumns": [ + "v_id", + "suppliers.@total_quantity" + ], + "tableColumns": [ + { + "id": "Matched pattern", + "isChecked": "true", + "label": "" + }, + { + "children": [], + 
"id": "Vertices", + "isChecked": "true", + "isExpanded": true, + "label": "Vertices" + }, + { + "children": [], + "id": "Edges", + "isChecked": "true", + "isExpanded": false, + "label": "Edges" + } + ], + "tableHeaders": [ + "v_id", + "v_type", + "suppliers.@total_quantity" + ], + "value": [ + { + "id": "suppliers.@total_quantity", + "type": "number" + } + ] + }, + "graphName": "Supply_Chain_Management", + "hideWidgetName": false, + "id": "d1F1DV6GoqU8eBNjYT53b6", + "patternLimit": 5, + "query": "", + "queryType": "pattern", + "refreshRate": 0, + "searchPattern": [ + { + "data": "get_biggest_suppliers", + "id": "get_biggest_suppliers", + "type": "QUERY" + }, + { + "data": "Top_K", + "id": "c04ee73c-eedf-46a5-bd4a-ff8135c33bde", + "paramGlobalInput": "Top_K", + "paramName": "top_k", + "paramType": "INT", + "paramTypeReadonly": true, + "type": "PARAM" + }, + { + "data": "", + "id": "b4447a62-707f-4e06-839f-c7f95a624fc9", + "paramGlobalInput": "", + "paramName": "start_date", + "paramType": "DATETIME", + "paramTypeReadonly": true, + "type": "PARAM" + }, + { + "data": "", + "id": "6646046a-e5fb-4700-9e15-0c04ec289e36", + "paramGlobalInput": "", + "paramName": "end_date", + "paramType": "DATETIME", + "paramTypeReadonly": true, + "type": "PARAM" + }, + { + "data": "", + "elementType": "VERTEX", + "id": "0ed291e9-16c4-427f-983b-1ab45829fe70", + "paramGlobalInput": "", + "paramName": "input_nations", + "paramType": "SET", + "paramTypeReadonly": true, + "type": "PARAM" + } + ], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "Suppliers", + "type": "pie", + "version": "1763931246459106033" + }, + "h1DS1HwMbhESTmLuSNseGF": { + "chartSettings": { + "borderRadius": 32, + "borderWidth": 21, + "category": [ + { + "id": "IsProduct", + "type": "boolean" + } + ], + "radius": [ + 0, + 81 + ], + "roseType": false, + "tableHeaders": [ + "IsProduct", + "cnt" + ], + "tableIndex": 0, + "value": [ + { + "id": "cnt", + "type": "number" + } + ] + }, + "graphName": 
"Supply_Chain_Management", + "hideWidgetName": false, + "id": "h1DS1HwMbhESTmLuSNseGF", + "patternLimit": 5, + "query": "INTERPRET QUERY() FOR GRAPH Supply_Chain_Management {\n GroupByAccum cnt> @@grp;\n res = \n SELECT s from Material:s \n ACCUM CASE when s.is_product then @@grp += (\"Product\"->1) else @@grp += (\"Material\"->1) end\n ;\n PRINT @@grp;\n}", + "queryType": "interactive", + "refreshRate": 0, + "searchPattern": [], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "Material Product", + "type": "pie", + "version": "1764011824285635441" + }, + "iM7aR5tBHUSDNPuLkqCoeK": { + "chartSettings": { + "showColumns": [ + { + "isChecked": true, + "name": "p" + }, + { + "isChecked": true, + "name": "quantity" + } + ], + "sortedColumns": [ + "p", + "quantity" + ], + "tableColumns": [ + { + "id": "Matched pattern", + "isChecked": "true", + "label": "" + }, + { + "children": [], + "id": "Vertices", + "isChecked": "true", + "isExpanded": true, + "label": "Vertices" + }, + { + "children": [], + "id": "Edges", + "isChecked": "true", + "isExpanded": false, + "label": "Edges" + } + ], + "tableHeaders": [ + "p", + "quantity" + ] + }, + "graphName": "Supply_Chain_Management", + "hideWidgetName": false, + "id": "iM7aR5tBHUSDNPuLkqCoeK", + "patternLimit": 5, + "query": "", + "queryType": "pattern", + "refreshRate": 0, + "searchPattern": [ + { + "data": "top_k_products_purchase_order", + "id": "top_k_products_purchase_order", + "type": "QUERY" + }, + { + "data": "Top_K", + "id": "27f34beb-d818-44e9-b997-9538f52f37ac", + "paramGlobalInput": "Top_K", + "paramName": "k", + "paramType": "INT", + "paramTypeReadonly": true, + "type": "PARAM" + } + ], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "Top Products Purchase Order", + "type": "table", + "version": "1763931519243173565" + }, + "jiFArchKghkbBhGC4d548q": { + "chartSettings": { + "inputStates": [ + { + "dataType": "number", + "id": "input_jVfdmk2x1Ajk4kjWZAzFmK", + "name": "Top_K", + 
"settings": { + "max": "50", + "min": "1" + }, + "widgetType": "Input" + } + ] + }, + "graphName": "AntiFraud", + "hideWidgetName": false, + "id": "jiFArchKghkbBhGC4d548q", + "patternLimit": 5, + "query": "", + "queryType": "pattern", + "refreshRate": 0, + "searchPattern": [], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "", + "type": "Inputs", + "version": "1763933359536882568" + }, + "vYoJnjMMFWLQWr6r1hThzL": { + "chartSettings": { + "values": [ + { + "fontColor": "#000000", + "fontSize": 36, + "iconColor": "#000000", + "iconPostion": "", + "iconSize": 24, + "iconURL": "/insights/static/media/layout-template.4aadad8f62567d7e8735dd875cf9d307.svg", + "id": "7pTWXkngZm4jQ68oDa8i8d", + "key": "Plants", + "label": "Plant", + "labelColor": "#000000", + "labelPostion": "", + "labelSize": 24, + "styleRule": [] + }, + { + "fontColor": "#000000", + "fontSize": 36, + "iconColor": "#000000", + "iconPostion": "", + "iconSize": 24, + "iconURL": "/insights/static/media/user-round.a4c78e138aaee380f0d4d18c1e5de9a2.svg", + "id": "jNjXkiSwzVtWQGPHSkVYyA", + "key": "Customers", + "label": "Customer", + "labelColor": "#000000", + "labelPostion": "", + "labelSize": 24, + "styleRule": [] + }, + { + "fontColor": "#000000", + "fontSize": 36, + "iconColor": "#000000", + "iconPostion": "", + "iconSize": 24, + "iconURL": "/insights/static/media/user-pen.e57c9feb5a07ce1f9b9c05d8ee2d0786.svg", + "id": "7a15tQkEd4KzMo8kJXWn9e", + "key": "Suppliers", + "label": "Supplier", + "labelColor": "#000000", + "labelPostion": "", + "labelSize": 24, + "styleRule": [] + }, + { + "fontColor": "#000000", + "fontSize": 36, + "iconColor": "#000000", + "iconPostion": "", + "iconSize": 24, + "iconURL": "/insights/static/media/flag.ce538663ace2f1f8f1ef1eb29d46f4bf.svg", + "id": "5Nwa7ruXMyTRNz7J226TDh", + "key": "Nations", + "label": "Nation", + "labelColor": "#000000", + "labelPostion": "", + "labelSize": 24, + "styleRule": [] + } + ] + }, + "graphName": "Supply_Chain_Management", + 
"hideWidgetName": false, + "id": "vYoJnjMMFWLQWr6r1hThzL", + "patternLimit": 5, + "query": "INTERPRET QUERY() FOR GRAPH Supply_Chain_Management {\n res = SELECT s from Plant:s \n ;\n PRINT res.size() as Plants;\n res = SELECT s FROM Customer:s; \n PRINT res.size() as Customers;\n res = SELECT s from Nation:s;\n PRINT res.size() as Nations;\n res = SELECT s FROM Supplier:s;\n PRINT res.size() as Suppliers;\n}", + "queryType": "interactive", + "refreshRate": 0, + "searchPattern": [], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "", + "type": "value", + "version": "1763933189824585386" + } + }, + "globalParameters": { + "Top_K": { + "id": "input_jVfdmk2x1Ajk4kjWZAzFmK", + "name": "Top_K", + "type": "NUMBER", + "value": 10 + } + }, + "iconURL": "/insights/static/media/chart-scatter.c83829978fdeba96c343f7cc07ee2bf9.svg", + "id": "mfgR3jZc9v78jGjp3Vn8jf", + "isDetail": true, + "isNew": false, + "layouts": { + "md": [ + { + "h": 20, + "i": "6wa4sFXLXHJmWQzGJeDojT", + "moved": false, + "static": false, + "w": 8, + "x": 4, + "y": 32 + }, + { + "h": 23, + "i": "aepZYhHtdrKMZscobrGixq", + "moved": false, + "static": false, + "w": 4, + "x": 4, + "y": 9 + }, + { + "h": 24, + "i": "d1F1DV6GoqU8eBNjYT53b6", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 28 + }, + { + "h": 19, + "i": "h1DS1HwMbhESTmLuSNseGF", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 9 + }, + { + "h": 23, + "i": "iM7aR5tBHUSDNPuLkqCoeK", + "moved": false, + "static": false, + "w": 4, + "x": 8, + "y": 9 + }, + { + "h": 9, + "i": "jiFArchKghkbBhGC4d548q", + "moved": false, + "static": false, + "w": 2, + "x": 0, + "y": 0 + }, + { + "h": 9, + "i": "vYoJnjMMFWLQWr6r1hThzL", + "moved": false, + "static": false, + "w": 10, + "x": 2, + "y": 0 + } + ], + "xs": [ + { + "h": 9, + "i": "jiFArchKghkbBhGC4d548q", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 0 + }, + { + "h": 9, + "i": "vYoJnjMMFWLQWr6r1hThzL", + "moved": false, + "static": false, + 
"w": 4, + "x": 0, + "y": 9 + }, + { + "h": 19, + "i": "h1DS1HwMbhESTmLuSNseGF", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 18 + }, + { + "h": 23, + "i": "aepZYhHtdrKMZscobrGixq", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 37 + }, + { + "h": 23, + "i": "iM7aR5tBHUSDNPuLkqCoeK", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 60 + }, + { + "h": 24, + "i": "d1F1DV6GoqU8eBNjYT53b6", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 83 + }, + { + "h": 20, + "i": "6wa4sFXLXHJmWQzGJeDojT", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 107 + } + ] + }, + "title": "Analytics", + "version": "1764009629639742855", + "weight": -10 + }, + { + "chartMap": { + "5ZQJVEuiq5cAxVxZWVqoLS": { + "chartSettings": { + "edgeLength": 175, + "rulesByType": { + "Line_Number": [ + { + "condition": "=", + "conditionEndValue": 0, + "conditionStartValue": 0, + "conditionValue": true, + "fieldName": "@isSrc", + "fieldType": "boolean", + "palateName": "", + "styleEndLabel": "", + "styleEndValue": "", + "styleKey": "background-color", + "styleLabel": "Vertex color", + "styleStartLabel": "", + "styleStartValue": "", + "styleType": "color", + "styleValue": "#1a7c0e" + } + ] + } + }, + "graphName": "Supply_Chain_Management", + "hideWidgetName": false, + "id": "5ZQJVEuiq5cAxVxZWVqoLS", + "patternLimit": 5, + "query": "", + "queryType": "pattern", + "refreshRate": 0, + "searchPattern": [ + { + "data": "explore_BOM_line_Insights", + "id": "explore_BOM_line_Insights", + "type": "QUERY" + }, + { + "data": "Line_Number", + "id": "9eed304b-a520-48ce-8da6-40203562c238", + "paramGlobalInput": "", + "paramName": "vertType", + "paramType": "STRING", + "paramTypeReadonly": true, + "type": "PARAM" + }, + { + "data": "LineVertex", + "id": "ec0f667f-e6d9-42e9-9d6b-98ad1be0bb33", + "paramGlobalInput": "LineVertex", + "paramName": "id", + "paramType": "STRING", + "paramTypeReadonly": true, + "type": "PARAM" + }, + { + "data": 
"LineDepth", + "id": "cb2c8a11-87d4-410f-980d-bd08fa44aa1e", + "paramGlobalInput": "LineDepth", + "paramName": "depth", + "paramType": "INT", + "paramTypeReadonly": true, + "type": "PARAM" + }, + { + "data": "LineUpstream", + "id": "02e56b6f-bcf7-40ab-9c0f-2508fc886654", + "paramGlobalInput": "LineUpstream", + "paramName": "upstream", + "paramType": "BOOL", + "paramTypeReadonly": true, + "type": "PARAM" + }, + { + "data": "", + "id": "2ea2db6f-3568-467b-a6bf-fcb78f19b1a5", + "paramGlobalInput": "", + "paramName": "use_date_range", + "paramType": "BOOL", + "paramTypeReadonly": true, + "type": "PARAM" + }, + { + "data": "", + "id": "323ec303-484d-4ed0-a21e-74dc8de82cbe", + "paramGlobalInput": "", + "paramName": "start_date", + "paramType": "DATETIME", + "paramTypeReadonly": true, + "type": "PARAM" + }, + { + "data": "", + "id": "b5fea6eb-6b3c-47df-bddd-e1aadbc6ee9e", + "paramGlobalInput": "", + "paramName": "end_date", + "paramType": "DATETIME", + "paramTypeReadonly": true, + "type": "PARAM" + } + ], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "", + "type": "internal-graph", + "version": "1764021018679482546" + }, + "7GwzFLnQ5dDDVtUtVx7mPv": { + "chartSettings": { + "inputStates": [ + { + "dataType": "string", + "id": "input_rAjS1bcFu9NBPSX8wdUK4U", + "name": "LineVertex", + "settings": { + "graphName": "Supply_Chain_Management", + "labelKey": "Line_Number.id", + "open": false, + "options": [], + "patternLimit": 300, + "query": "", + "searchPattern": [ + { + "alias": "Line_Number", + "data": "Line_Number", + "id": "93d5c9b3-f987-4cfa-9d4d-369211e92af2", + "orderBy": [ + { + "asc": true, + "expression": { + "type": "AttrVariable", + "value": "alias_schema_Line_Number_0.id" + }, + "label": "Line_Number.id" + } + ], + "type": "vertex" + } + ], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "tableHeaders": [ + "Matched pattern", + "Line_Number" + ], + "useQuery": true, + "valueKey": "Line_Number.id" + }, + "widgetType": "Dropdown" + }, + { + 
"dataType": "number", + "id": "input_1cRTwt6EKyjHGDuhpK7XUT", + "name": "LineDepth", + "settings": { + "max": "20", + "min": "2" + }, + "widgetType": "Input" + }, + { + "dataType": "bool", + "id": "input_p7HJXg2VhZhgeF1pK8hv4S", + "name": "LineUpstream", + "settings": { + "options": [] + }, + "widgetType": "Dropdown" + } + ] + }, + "graphName": "AntiFraud", + "hideWidgetName": false, + "id": "7GwzFLnQ5dDDVtUtVx7mPv", + "patternLimit": 5, + "query": "", + "queryType": "pattern", + "refreshRate": 0, + "searchPattern": [], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "BOM Line", + "type": "Inputs", + "version": "1763928107928372418" + }, + "hx3rVgRRGfLEwkeabZwKKi": { + "chartSettings": { + "edgeLength": 285, + "rulesByType": { + "BOM": [ + { + "condition": "always", + "conditionEndValue": 0, + "conditionStartValue": 0, + "conditionValue": "", + "fieldName": "id", + "fieldType": "string", + "palateName": "", + "styleEndLabel": "", + "styleEndValue": "", + "styleKey": "node-radius", + "styleLabel": "0.25x", + "styleStartLabel": "", + "styleStartValue": "", + "styleType": "numeric", + "styleValue": 0.25 + }, + { + "condition": "=", + "conditionEndValue": 0, + "conditionStartValue": 0, + "conditionValue": true, + "fieldName": "@isSrc", + "fieldType": "boolean", + "palateName": "", + "styleEndLabel": "", + "styleEndValue": "", + "styleKey": "background-color", + "styleLabel": "Vertex color", + "styleStartLabel": "", + "styleStartValue": "", + "styleType": "color", + "styleValue": "#0571f0" + }, + { + "condition": "=", + "conditionEndValue": 0, + "conditionStartValue": 0, + "conditionValue": true, + "fieldName": "@isSrc", + "fieldType": "boolean", + "palateName": "", + "styleEndLabel": "", + "styleEndValue": "", + "styleKey": "node-radius", + "styleLabel": "1x", + "styleStartLabel": "", + "styleStartValue": "", + "styleType": "numeric", + "styleValue": 1 + }, + { + "condition": "=", + "conditionEndValue": 20, + "conditionStartValue": 1, + 
"conditionValue": "0", + "fieldName": "base_quantity", + "fieldType": "number", + "palateName": "", + "styleEndLabel": "", + "styleEndValue": "", + "styleKey": "background-color", + "styleLabel": "Vertex color", + "styleStartLabel": "", + "styleStartValue": "", + "styleType": "color", + "styleValue": "#ff0000" + } + ], + "Material": [ + { + "condition": "=", + "conditionEndValue": 0, + "conditionStartValue": 0, + "conditionValue": true, + "fieldName": "@isSrc", + "fieldType": "boolean", + "palateName": "", + "styleEndLabel": "", + "styleEndValue": "", + "styleKey": "background-color", + "styleLabel": "Vertex color", + "styleStartLabel": "", + "styleStartValue": "", + "styleType": "color", + "styleValue": "#4973f3" + }, + { + "condition": "=", + "conditionEndValue": 0, + "conditionStartValue": 0, + "conditionValue": true, + "fieldName": "is_product", + "fieldType": "boolean", + "palateName": "", + "styleEndLabel": "", + "styleEndValue": "", + "styleKey": "node-radius", + "styleLabel": "2x", + "styleStartLabel": "", + "styleStartValue": "", + "styleType": "numeric", + "styleValue": 2 + } + ] + } + }, + "graphName": "Supply_Chain_Management", + "hideWidgetName": false, + "id": "hx3rVgRRGfLEwkeabZwKKi", + "patternLimit": 5, + "query": "", + "queryType": "pattern", + "refreshRate": 0, + "searchPattern": [ + { + "data": "explore_BOM_insights", + "id": "explore_BOM_insights", + "type": "QUERY" + }, + { + "data": "vertType", + "id": "2e9b80fd-030e-49fb-9c68-754bf2efca37", + "paramGlobalInput": "vertType", + "paramName": "vertType", + "paramType": "STRING", + "paramTypeReadonly": true, + "type": "PARAM" + }, + { + "data": "vertexId", + "id": "70e1db0f-1025-4a86-8fc3-03da2d5c8338", + "paramGlobalInput": "vertexId", + "paramName": "id", + "paramType": "STRING", + "paramTypeReadonly": true, + "type": "PARAM" + }, + { + "data": "depth", + "id": "23681b01-c596-4f2a-8208-212236a966d4", + "paramGlobalInput": "depth", + "paramName": "depth", + "paramType": "INT", + 
"paramTypeReadonly": true, + "type": "PARAM" + }, + { + "data": "upstream", + "id": "f622d4ae-0db7-4da1-af9b-54317cb0bbb3", + "paramGlobalInput": "upstream", + "paramName": "upstream", + "paramType": "BOOL", + "paramTypeReadonly": true, + "type": "PARAM" + } + ], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "", + "type": "internal-graph", + "version": "1763928813910151148" + }, + "wGYxHtNz9dmfUBHvh8vi8a": { + "chartSettings": { + "inputStates": [ + { + "dataType": "string", + "id": "input_35dZipma5w14zcPa9srhP7", + "name": "vertType", + "settings": { + "graphName": "AntiFraud", + "open": false, + "options": [ + { + "isCreatable": true, + "label": "BOM", + "value": "BOM" + }, + { + "isCreatable": true, + "label": "Material", + "value": "Material" + } + ], + "patternLimit": 300, + "query": "", + "searchPattern": [], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "useQuery": false + }, + "widgetType": "Dropdown" + }, + { + "dataType": "string", + "id": "input_rGNXpbuD8xm2Jp2jqYA3TR", + "name": "vertexId", + "settings": { + "graphName": "Supply_Chain_Management", + "open": false, + "options": [], + "patternLimit": 300, + "query": "INTERPRET QUERY(STRING vertType) FOR GRAPH Supply_Chain_Management {\n ListAccum @@ids;\n res = SELECT s from (BOM|Material):s \n where s.type == vertType\n ACCUM @@ids += s.id \n ;\n PRINT @@ids;\n}", + "queryType": "interactive", + "searchPattern": [], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "useQuery": true + }, + "widgetType": "Dropdown" + }, + { + "dataType": "number", + "id": "input_rHJAj5bR83viw6oAHPtzh4", + "name": "depth", + "settings": { + "max": "20" + }, + "widgetType": "Input" + }, + { + "dataType": "bool", + "id": "input_gvFDPVUBoEs7WCmUUYvKiA", + "name": "upstream", + "settings": { + "options": [ + { + "isCreatable": true, + "label": "TRUE", + "value": true + }, + { + "isCreatable": true, + "label": "FALSE", + "value": false + } + ] + }, + "widgetType": "Dropdown" + } + ] + }, + 
"graphName": "AntiFraud", + "hideWidgetName": false, + "id": "wGYxHtNz9dmfUBHvh8vi8a", + "patternLimit": 5, + "query": "", + "queryType": "pattern", + "refreshRate": 0, + "searchPattern": [], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "BOM", + "type": "Inputs", + "version": "1763928099165046718" + } + }, + "globalParameters": { + "LineDepth": { + "id": "input_1cRTwt6EKyjHGDuhpK7XUT", + "name": "LineDepth", + "type": "NUMBER", + "value": 5 + }, + "LineUpstream": { + "id": "input_p7HJXg2VhZhgeF1pK8hv4S", + "name": "LineUpstream", + "type": "BOOL", + "value": "true" + }, + "LineVertex": { + "id": "input_rAjS1bcFu9NBPSX8wdUK4U", + "name": "LineVertex", + "type": "STRING", + "value": "mat_101_China_74_3931_3634" + }, + "depth": { + "id": "input_rHJAj5bR83viw6oAHPtzh4", + "name": "depth", + "type": "NUMBER", + "value": 3 + }, + "upstream": { + "id": "input_gvFDPVUBoEs7WCmUUYvKiA", + "name": "upstream", + "type": "BOOL", + "value": "true" + }, + "vertType": { + "id": "input_35dZipma5w14zcPa9srhP7", + "name": "vertType", + "type": "STRING", + "value": "BOM" + }, + "vertexId": { + "id": "input_rGNXpbuD8xm2Jp2jqYA3TR", + "name": "vertexId", + "type": "STRING", + "value": "mat_1330_China_3665" + } + }, + "iconURL": "/insights/static/media/library.d3f7f207c6bb1d7be8e64045a19991b2.svg", + "id": "xB74jXwxq3KbSxpMEXvnQH", + "isDetail": true, + "isNew": false, + "layouts": { + "md": [ + { + "h": 43, + "i": "5ZQJVEuiq5cAxVxZWVqoLS", + "moved": false, + "static": false, + "w": 6, + "x": 6, + "y": 10 + }, + { + "h": 10, + "i": "7GwzFLnQ5dDDVtUtVx7mPv", + "moved": false, + "static": false, + "w": 6, + "x": 6, + "y": 0 + }, + { + "h": 43, + "i": "hx3rVgRRGfLEwkeabZwKKi", + "moved": false, + "static": false, + "w": 6, + "x": 0, + "y": 10 + }, + { + "h": 10, + "i": "wGYxHtNz9dmfUBHvh8vi8a", + "moved": false, + "static": false, + "w": 6, + "x": 0, + "y": 0 + } + ], + "xs": [ + { + "h": 10, + "i": "wGYxHtNz9dmfUBHvh8vi8a", + "moved": false, + "static": false, + "w": 4, 
+ "x": 0, + "y": 0 + }, + { + "h": 12, + "i": "7GwzFLnQ5dDDVtUtVx7mPv", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 10 + }, + { + "h": 43, + "i": "hx3rVgRRGfLEwkeabZwKKi", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 22 + }, + { + "h": 43, + "i": "5ZQJVEuiq5cAxVxZWVqoLS", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 65 + } + ] + }, + "title": "BOM", + "version": "1763928024253252036", + "weight": 10 + } + ], + "title": "Supply Chain Management", + "userRoleForApp": "owner", + "version": "1764009595079839773" +} \ No newline at end of file diff --git a/agile_operations/supply_chain_management/queries/queries.gsql b/agile_operations/supply_chain_management/queries/queries.gsql index dccf3ad1..632aca16 100644 --- a/agile_operations/supply_chain_management/queries/queries.gsql +++ b/agile_operations/supply_chain_management/queries/queries.gsql @@ -1,4 +1,21 @@ -CREATE DISTRIBUTED QUERY explore_BOM(VERTEX input_vert, INT depth, BOOL upstream) FOR GRAPH Supply_Chain_Management { +CREATE OR REPLACE DISTRIBUTED QUERY explore_BOM(VERTEX input_vert, INT depth, BOOL upstream) FOR GRAPH Supply_Chain_Management { + /* + Query Name: explore_BOM + Full multi-level BOM explosion (downstream) or where-used analysis (upstream) + from any Material, BOM header, or Supplier. + + Key Use Cases: + • Explode a finished good into all components (any depth, even 20–30 levels) + • Reverse trace: "Which finished goods or BOMs use this raw material?" + • Critical for demand planning, costing, engineering change impact, + single-source risk analysis, and compliance reporting. + + Parameters: + input_vert : Starting vertex (Material Or BOM) + depth : Maximum traversal depth (set to 30+ for full explosion) + upstream : TRUE → Where-Used (who consumes this material?) + FALSE → BOM Explosion (what does this material consist of?) 
+ */ SetAccum @@edges; OrAccum @visited; @@ -30,13 +47,33 @@ CREATE DISTRIBUTED QUERY explore_BOM(VERTEX input_vert, INT depth, BOOL upstream PRINT @@edges; } -CREATE DISTRIBUTED QUERY explore_BOM_line( - VERTEX input_vert, - INT depth, - BOOL upstream, - BOOL use_date_range, - DATETIME start_date, - DATETIME end_date) FOR GRAPH Supply_Chain_Management { +CREATE OR REPLACE DISTRIBUTED QUERY explore_BOM_line(VERTEX input_vert, INT depth, BOOL upstream, BOOL use_date_range, DATETIME start_date, DATETIME end_date) FOR GRAPH Supply_Chain_Management { + /* + Description: + End-to-end transactional lineage traversal (order-to-cash & procure-to-pay) + across actual production instances (SFC), sales orders, purchase orders, + shipments, and customers/suppliers. + + This is the "digital thread" query — it connects the physical flow + (what was actually built and shipped) with the commercial flow + (who ordered it and who supplied the components). + + Parameters: + input_vert (VERTEX): + Starting vertex — typically SFC_Material (actual batch), Sales_Order, + Purchase_Order, Customer, or Supplier + depth (INT): + Maximum traversal depth (usually 10–20 is sufficient for full lineage) + upstream (BOOL): + FALSE → Follow the flow downstream (e.g. raw material → finished good → customer) + TRUE → Follow the flow upstream (e.g. 
finished good → raw materials → supplier) + use_date_range (BOOL): + If TRUE, only traverse through dated vertices (Sales_Order, Purchase_Order, + SFC_Assembly) that fall within the specified window + start_date / end_date (DATETIME): + Optional time fence for filtering transactional vertices + + */ SetAccum @@edges; OrAccum @visited; @@ -77,7 +114,29 @@ CREATE DISTRIBUTED QUERY explore_BOM_line( PRINT "explore_BOMLine works!"; } -CREATE DISTRIBUTED QUERY get_biggest_customers(INT top_k, DATETIME start_date, DATETIME end_date, SET input_nations) FOR GRAPH Supply_Chain_Management { +CREATE OR REPLACE DISTRIBUTED QUERY get_biggest_customers(INT top_k, DATETIME start_date, DATETIME end_date, SET input_nations) FOR GRAPH Supply_Chain_Management { + /* + Description: + Top-K Customers by Delivered Quantity (or Revenue Proxy) in a selected period + with optional geographic filtering (by Nation / Region). + + Instantly ranks customers based on actual shipped/delivered volume from + Sales Orders — far more accurate than booked revenue or invoice data, + because it reflects real physical fulfillment. + + Parameters: + top_k (INT): + Number of top customers to return (e.g., 10, 20, 50) + start_date / end_date (DATETIME): + Time window for analysis (e.g., last 12 months, current FY) + input_nations (SET>): + Optional filter — only include customers from these countries. + Pass empty set {} to include all nations globally. + + Output: + Ranked list of Customer vertices with accumulated delivered quantity + Ready for visualization in GraphStudio, BI tools, or REST API exposure. 
+ */ SumAccum @total_quantity; @@ -92,7 +151,8 @@ CREATE DISTRIBUTED QUERY get_biggest_customers(INT top_k, DATETIME start_date, D // filter their sales orders by date range sales_orders = SELECT t FROM customers:s -(Has_Sales_Order)- Sales_Order:t - WHERE t.date BETWEEN start_date AND end_date; + //WHERE t.date BETWEEN start_date AND end_date + ; // sum up the quantities of sales order items sales_orders = SELECT s FROM sales_orders:s -(Has_Sales_Order_Item)- Sales_Order_Item:t @@ -106,7 +166,28 @@ CREATE DISTRIBUTED QUERY get_biggest_customers(INT top_k, DATETIME start_date, D PRINT customers[customers.@total_quantity]; } -CREATE DISTRIBUTED QUERY get_biggest_suppliers(INT top_k, DATETIME start_date, DATETIME end_date, SET input_nations) FOR GRAPH Supply_Chain_Management { +CREATE OR REPLACE DISTRIBUTED QUERY get_biggest_suppliers(INT top_k, DATETIME start_date, DATETIME end_date, SET input_nations) FOR GRAPH Supply_Chain_Management { + /* + Description: + Top-K Suppliers by Purchased/Inbound Volume in a selected time period + with optional geographic filtering (by Nation / Region). + + Ranks suppliers based on actual received or ordered quantity from + Purchase Orders — the most accurate measure of supplier spend and + strategic importance in manufacturing and distribution environments. + Parameters: + top_k (INT): + Number of top suppliers to return (e.g., 10, 25, 50) + start_date / end_date (DATETIME): + Analysis period (e.g., trailing 12 months, current fiscal year) + input_nations (SET>): + Optional country/region filter. + Pass empty set {} to include all suppliers globally. + + Output: + Ranked list of Supplier vertices with total purchased quantity + Ready for GraphStudio visualization, Power BI, Tableau, or REST API. 
+ */ SumAccum @total_quantity; @@ -121,7 +202,8 @@ CREATE DISTRIBUTED QUERY get_biggest_suppliers(INT top_k, DATETIME start_date, D // filter their purchase orders by date range purchase_orders = SELECT t FROM suppliers:s -(Has_Purchase_Order)- Purchase_Order:t - WHERE t.date BETWEEN start_date AND end_date; + //WHERE t.date BETWEEN start_date AND end_date + ; // sum up the quantities of line numbers purchase_orders = SELECT s FROM purchase_orders:s -(Has_Line_Number)- Line_Number:t @@ -135,7 +217,28 @@ CREATE DISTRIBUTED QUERY get_biggest_suppliers(INT top_k, DATETIME start_date, D PRINT suppliers[suppliers.@total_quantity]; } -CREATE DISTRIBUTED QUERY trace_BOM_line_nations(VERTEX input_fg) FOR GRAPH Supply_Chain_Management { +CREATE OR REPLACE DISTRIBUTED QUERY trace_BOM_line_nations(VERTEX input_fg) FOR GRAPH Supply_Chain_Management { + /* + Description: + Domestic vs. International Sourcing Analysis for a Specific Finished Good Batch. + + Starting from a real produced instance (SFC_Material) of a finished good, + this query traces the full transactional BOM lineage upstream through + actual production and procurement events to determine: + • What quantity of input materials was sourced domestically + • What quantity was sourced internationally + + This is a true "Country of Origin" and supply chain sovereignty query — + critical for compliance, risk, ESG, trade policy, and cost analysis. 
+ + Parameters: + input_fg (VERTEX): + A specific produced batch/instance of a finished good (actual SFC_Material vertex) + + Output: + @@domestic_quantity → Total quantity sourced from the same nation as the final assembly plant + @@international_quantity → Total quantity sourced from all other nations + */ OrAccum @visited; MaxAccum @@src_nation, @rm_nation; SumAccum @@domestic_quantity, @@international_quantity; @@ -201,7 +304,28 @@ CREATE DISTRIBUTED QUERY trace_BOM_line_nations(VERTEX input_fg) F } CREATE OR REPLACE QUERY add_purchase_to_inventory(VERTEX purchase_order, SET invs){ - /* This query updates the inventory for the product supplied by the supplier */ + /* + Description: + Simulates the goods-receipt (GR) posting of a Purchase Order and instantly + shows the resulting inventory impact for the affected materials in the + specified Inventory_Held locations (storage locations / bins). + + This is a "what-if" inventory projection query — ideal for MRP simulation, + ATP/CTP validation, and real-time inventory planning without touching + the actual ERP system. + + Parameters: + purchase_order (VERTEX): + The Purchase Order vertex representing the inbound receipt + invs (SET): + Set of Inventory_Held primary IDs (storage locations) to check/update. + Use empty set {} to include all relevant locations. + + Output: + • Previous inventory quantity + last_updated timestamp per material/location + • Projected new inventory quantity after GR posting + • Clear message if no inventory exists in the specified locations + */ SumAccum @quantity; MinAccum @date; @@ -243,7 +367,23 @@ CREATE OR REPLACE QUERY add_purchase_to_inventory(VERTEX purchas } CREATE OR REPLACE QUERY check_shipment_capacity(VERTEX so){ - /* This query checks shipment capacity and sales order quantity and to check if the order has been fulfilled */ + /* + Description: + Instant Order Fulfillment Status & Shipment Completion Check. 
+ Compares the ordered quantity on a Sales Order against the quantity + actually shipped (via Shipment_Item vertices) and immediately reports: + • Total ordered quantity per material + • Quantity already fulfilled (shipped) + • Remaining unfulfilled quantity + Parameters: + so (VERTEX): + The Sales Order vertex to analyze + + Output: + @@prod_total_quantity → Total ordered quantity per material + @@prod_fulfillment → Quantity already shipped/fulfilled + @@prod_unfulfillment → Remaining open quantity (backorder) + */ MapAccum @@prod_fulfillment, @@prod_unfulfillment, @@prod_total_quantity; @@ -271,10 +411,30 @@ CREATE OR REPLACE QUERY check_shipment_capacity(VERTEX so){ PRINT @@prod_unfulfillment; } - CREATE OR REPLACE QUERY datewise_product_availability(VERTEX sales, INT inventory_interval_threshold){ - /* This query gets the product quantity required in a sales order and checks inventory for the product and checks the date when the - inventory was last updated. If inventory was updated more than inventory_interval_threshold ago, order will not be fulfilled */ +/* + Description: + Real-time Available-to-Promise (ATP) with Inventory Freshness Control. + Answers the critical business question: + "Can we fulfill this Sales Order using only inventory that has been + physically counted or received within the last X months?" + + This is a strict, audit-ready ATP check that prevents promising + customers based on stale or "ghost" inventory — a major cause of + delivery failures in traditional ERP systems. + + Parameters: + sales (VERTEX): + The Sales Order to validate + inventory_interval_threshold (INT): + Maximum allowed age of inventory in months.
+ Example: 3 → only use inventory updated in the last 3 months + + Output: + • Clear fulfillment decision message + • Detailed map of which materials pass/fail the freshness + quantity check + • Ready for integration into order management UIs or ATP engines + */ TYPEDEF TUPLE myTuple; @@ -317,10 +477,27 @@ CREATE OR REPLACE QUERY datewise_product_availability(VERTEX sales, } CREATE OR REPLACE QUERY plant_failure_impact_nodes (VERTEX start_inv,INT max_hops = 8) { - // BFS Shortest Path - base version that can traverse through all these edges: - // Stores, Contains, SO_Requests, Includes_SO, Places - // Adapted from the following BFS algorithm: - // https://github.com/tigergraph/gsql-graph-algorithms/blob/master/algorithms/Path/bfs/tg_bfs.gsql + /* + Description: + Critical Business Continuity & Risk Impact Analysis Query. + Answers the mission-critical question: + "If this manufacturing plant goes down tomorrow (fire, strike, natural disaster), + which finished goods, customer orders, and end customers will be directly impacted?" + + Performs a constrained BFS from a Plant → through current inventory → required materials + → open sales order items → customers. Returns the full downstream impact path + within seconds — impossible in traditional ERP or BI systems. + Parameters: + start_inv (VERTEX): + The manufacturing or distribution plant to simulate failure of + max_hops (INT, default 8): + Maximum graph distance to traverse. 8 is sufficient for full global impact.
+ + Output: + • All reachable Customer vertices (affected end customers) + • Full path of edges showing exactly how disruption propagates + • Visualizable in GraphStudio as impact network + */ TYPEDEF TUPLE Edge_Info; SumAccum @@num_hops_traversed; // current step @@ -393,6 +570,7 @@ CREATE OR REPLACE QUERY datewise_product_availability(VERTEX sales, end_traverse_back = visited_end; // go from target to source to collect the edges + vertices visited + print end_traverse_back; WHILE (end_traverse_back.size() > 0) DO end_traverse_back = SELECT s FROM end_traverse_back:s @@ -431,7 +609,26 @@ CREATE OR REPLACE QUERY datewise_product_availability(VERTEX sales, } CREATE OR REPLACE QUERY product_quantity_sales_order(VERTEX sales){ - /* This query gets the product quantity required in a sales order and checks inventory for the product */ + /* + Description: + Real-time Available-to-Promise (ATP) Check – Basic Edition. + Answers the fundamental order fulfillment question: + "Do we have enough on-hand inventory (any location, any batch) to fulfill this sales order today?" + + Compares required quantity per material on a Sales Order against current + global inventory (Inventory_Held) and instantly reports whether the order + can be fully satisfied. 
+ + Parameters: + sales (VERTEX): + The Sales Order vertex to validate + + Output: + • Clear fulfillment decision message + • Detailed map of materials with sufficient inventory (location + quantity) + • Immediate visibility into which items block full fulfillment + + */ TYPEDEF TUPLE myTuple; @@ -468,8 +665,23 @@ CREATE OR REPLACE QUERY product_quantity_sales_order(VERTEX sales){ PRINT @@product_inventory; } - CREATE OR REPLACE QUERY top_k_products_purchase_order(INT k = 10) { - /* Get the top purchased products from purchase orders */ + CREATE OR REPLACE QUERY top_k_products_purchase_order(INT k = 10) { + /* + Description: + Global Top-K Most Purchased Materials (by total ordered quantity) + across all Purchase Orders in the entire graph. + + Reveals true consumption drivers — the raw, semi-finished, or finished + materials that dominate procurement volume. This is the real "voice of + the supply chain" for demand and spend patterns. + + Parameters: + k (INT, default 10): + Number of top materials to return (e.g., 10, 25, 50, 100) + + Output: + Ranked heap of SFC_Material vertices with total purchased quantity + */ TYPEDEF TUPLE prod_tuple; @@ -487,7 +699,21 @@ CREATE OR REPLACE QUERY product_quantity_sales_order(VERTEX sales){ } CREATE OR REPLACE QUERY top_k_products_sales_order(INT k = 10) { - /* Get the top purchased products from sales orders */ + /* + Description: + Global Top-K Best-Selling / Most Demanded Materials (by total ordered quantity) + across all Sales Orders in the entire graph. + + This is the true "voice of the customer" — reveals which finished goods, + semi-finished items, or configurable materials drive real revenue and + demand volume. Essential for S&OP, demand planning, and product strategy. 
+ Parameters: + k (INT, default 10): + Number of top-selling materials to return (e.g., 10, 20, 50, 100) + Output: + Ranked heap of SFC_Material vertices with total sold/ordered quantity + + */ TYPEDEF TUPLE prod_tuple; @@ -505,8 +731,30 @@ CREATE OR REPLACE QUERY top_k_products_sales_order(INT k = 10) { } CREATE OR REPLACE QUERY unfulfilled_orders(Set> sales_set) { - /* This query gets multiple sales orders as parameters and checks inventory for - products in the sales order. It fulfills order and keeps track of inventory level. */ + /* + Description: + Batch Available-to-Promise (ATP) with Real-Time Inventory Allocation. + + Takes a set of Sales Orders (any size) and performs a true multi-order, + multi-material ATP check using current global inventory. + Automatically allocates inventory to fulfill orders in the order they + appear in the input set and reports: + • Which orders can be fully fulfilled + • Which orders remain unfulfilled (or partially fulfilled) + • Updated inventory levels after allocation + + This is the production-grade "order promising engine" used by advanced + order management systems — but running natively on the graph in real time. 
+ + Parameters: + sales_set (SET>): + Set of Sales Order vertices to evaluate and allocate (FIFO order) + + Output: + • Previous_Product_Inventory → inventory before allocation + • Current_Product_Inventory → inventory after allocation + • Orders_that_cannot_be_fulfilled → list of fully/partially unfulfilled orders + */ MapAccum> @prod_per_sales; MapAccum>> @@prod_per_inv; diff --git a/agile_operations/supply_chain_management/readme.md b/agile_operations/supply_chain_management/readme.md index c97ea0ef..23d5beb8 100644 --- a/agile_operations/supply_chain_management/readme.md +++ b/agile_operations/supply_chain_management/readme.md @@ -1,39 +1,73 @@ # Narratives +The Supply Chain Management solution for TigerGraph delivers a complete, enterprise-ready property graph that models an end-to-end manufacturing and distribution supply chain. Built for manufacturers and distributors, it accurately reflects real-world ERP structures — Material Masters, multi-level Bills of Materials (BOMs), Purchase Orders, Sales Orders, Inventory, Production Batches (SFC), Plants, Shipments and full genealogy. -Inventory Management is the process of ordering, storing, and using a company's inventory. In retail, manufacturing, and other inventory-intensive sectors, a company's raw materials and finished goods are the core of its business. A shortage of inventory when and where needed can be detrimental and at the same time a large inventory is at risk of spoilage, theft, damage, or shifts in demand. To manage inventory and transport logistics, it is important to understand the quantity of materials or products available in inventory. - -Products in inventory management have intricate relationships with various components of the supply chain also having a network of dependencies for each calculation. TigerGraph's ability to model and navigate complex relationships makes it the ideal choice. Querying is also more intuitive and faster with TigerGraph leading to more efficient querying. 
The large volumes of products, their inventories, along with the other components in a supply chain require a scalable database platform which has the ability to maintain the large data like TigerGraph. +Unlike traditional relational data warehouses that struggle with deep recursive questions, this graph enables real-time, highly connected analytics that were previously slow or impossible: impact analysis of plant/supplier failures, full forward/backward traceability of batches, real-time Available-to-Promise (ATP) with freshness rules, country-of-origin compliance, demand-supply matching, and bottleneck identification. By leveraging TigerGraph’s native parallel graph engine and bi-directional edges, analysts, planners, sourcing managers, and risk teams gain instant answers to complex multi-hop questions, dramatically improving agility, resilience, and decision quality across the supply chain. # Components - This repository includes multiple components: -- `data` - Sample data. -- `load_jobs` - Scripts for data loading tasks. -- `meta` - Solution Kit metadata. -- `queries` - Collection of GSQL queries. -- `schema` - Definition of database schema. -- `readme.md` - This usage guide. -- `setup.sh` - Automated setup script. +* `data` - Realistic sample dataset (CSV files) with master and transactional data. +* `load_jobs` - Data loading jobs and scripts (supports offline batch and optional AWS S3 loading). +* `meta` - Solution Kit metadata. +* `queries` - Complete set of 13 production-ready GSQL analytical queries. +* `schema` - Full graph schema definition (vertices, edges, indexes). +* `readme.md` - This usage guide. +* `setup.sh` - One-click automated installation script. # Instructions - The `setup.sh` script is designed to streamline the initial setup process by sequentially executing the following steps: -1. **Schema Creation**: Initiates the schema creation process with the `schema/create_inventory_management_graph.gsql` script. -2. 
**Data Loading**: Load data into the schema by running the data loading jobs with the scripts in the `loading_job` folder. -3. **Query Installation**: Completes the setup by installing necessary queries through the `queries/install_queries.sh` script. +1. **Schema Creation**: Creates the global graph `Supply_Chain_Management` and runs all schema definition files located in the `schema/` folder. -## Query Explanations +2. **Data Loading**: Executes all loading jobs defined in `load_jobs/` to populate master data (Materials, Plants, BOMs) and transactional data (Purchase Orders, Sales Orders, Inventory, Shipments, SFC batches, etc.). -We have different queries to perform the following tasks: +3. **Query Installation**: Automatically installs the full library of 13 analytical queries from the `queries/` folder. -1. **Available Inventory Measure:** This query shows the available inventory for every product. It also tells us how many inventories a particular product is stored at and the time at which the inventory stock value was last updated. With this we can get a general idea of inventory and make decisions regarding re-stocking the inventory or putting a halt on the manufacturing of products. -2. **Determine Fulfillment of Sales Orders :** This query determines the stock of product required in each sales order placed by a customer. It then determines the quantity of available stock of the product in the inventory which is used to realise fulfillment of sales orders. If a sales order can be fulfilled, it tells us the result stock in the inventory after fulfillment of sales order. -3. **Impact of Plant Failure on the Components :** This query gives us the impact that the failure of a plant has. With this query we can keep a track of affected products and inventories and make analysis based on this information. -4. 
**Top Products in a Sales Order/Purchase Order :** To maintain inventory levels, it is necessary to know the product that is most required and its required quantity. Among all the sales orders, certain products can most popular and important which should also require maintainance of a certain quantity of these products. -5. --- +After the script finishes (typically within minutes), the graph is fully loaded and ready for immediate use in GraphStudio or via REST API. -## Mock Data +```bash +git clone +cd Supply_Chain_Management_TigerGraph +chmod +x setup.sh +./setup.sh +``` + +## Query Execution Order and Explanations +The queries are independent and can be executed in any order for day-to-day analysis. There is no mandatory sequential workflow except when performing inventory simulation or batch allocation. + +### Core Operational Queries (run on-demand) +- `product_quantity_sales_order` – Standard Available-to-Promise (ATP): checks if sufficient quantity exists for a Sales Order. +- `datewise_product_availability` – Smart ATP with freshness: only considers inventory updated within the last X months. +- `unfulfilled_orders` – Batch allocation engine: processes a set of Sales Orders and virtually reserves inventory. +- `check_shipment_capacity` – Compares ordered vs shipped quantities to find partially fulfilled items. +- `add_purchase_to_inventory` – Simulates the effect of receiving a Purchase Order on current stock levels. + +### Traceability & Risk Queries (recursive – run anytime) +- `explore_BOM` – Explodes a Bill of Materials downstream or traces where-used upstream to any depth. +- `explore_BOM_line` – Traces the actual physical genealogy of a specific batch (SFC) across production and consumption. +- `plant_failure_impact_nodes` – BFS impact analysis: returns all customers, orders, and finished goods affected if a Plant fails. +- `trace_BOM_line_nations` – Calculates the exact percentage of domestic vs international raw materials in a finished good. 
-The `data` folder is populated with sample data files. These files are crafted for testing and demonstration purposes. +### Analytics & Reporting Queries (aggregation – run on-demand) +- `get_biggest_customers` – Top-K customers by ordered quantity (optional date range and nation filter). +- `get_biggest_suppliers` – Top-K suppliers by supplied quantity (optional date range and nation filter). +- `top_k_products_sales_order` – Most demanded finished goods across all Sales Orders. +- `top_k_products_purchase_order` – Most purchased raw/semi-finished materials. + +All queries are distributed, highly parameterized (depth, date ranges, top-K, nation filters, etc.), and complete in seconds even on datasets with millions of orders and multi-level BOMs. + +## Scalability +The heaviest queries (`explore_BOM`, `explore_BOM_line`, `plant_failure_impact_nodes`, `unfulfilled_orders`) use TigerGraph’s native parallel traversal and recursion engines and scale linearly across the cluster. + +For extremely large supply chains (hundreds of millions of SFC instances or deep BOMs): +- Depth/hop limits are built into queries to guarantee predictable runtime. +- When an enormous number of temporary edges is expected (e.g., massive impact analysis), configure the engine to spill edges to disk instead of RAM: + +```bash +gadmin config set GPE.EdgeDataMemoryLimit 0 +gadmin config apply -y +gadmin restart -y +``` + +## Mock Data +The data folder is populated with sample data files. These files are crafted for testing and demonstration purposes. 
From e65878915a9c7a25cc9c3af2e820110dae1e537d Mon Sep 17 00:00:00 2001 From: user Date: Thu, 27 Nov 2025 01:54:49 +0530 Subject: [PATCH 2/7] renamed the insight app --- .../meta/{application.json => Insights_application.json} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename agile_operations/supply_chain_management/meta/{application.json => Insights_application.json} (100%) diff --git a/agile_operations/supply_chain_management/meta/application.json b/agile_operations/supply_chain_management/meta/Insights_application.json similarity index 100% rename from agile_operations/supply_chain_management/meta/application.json rename to agile_operations/supply_chain_management/meta/Insights_application.json From 48929ae095fa786fcfc1879d3cf400c50b488612 Mon Sep 17 00:00:00 2001 From: user Date: Fri, 28 Nov 2025 00:33:05 +0530 Subject: [PATCH 3/7] modified readme --- agile_operations/supply_chain_management/readme.md | 6 ------ 1 file changed, 6 deletions(-) diff --git a/agile_operations/supply_chain_management/readme.md b/agile_operations/supply_chain_management/readme.md index 23d5beb8..2d6ec76c 100644 --- a/agile_operations/supply_chain_management/readme.md +++ b/agile_operations/supply_chain_management/readme.md @@ -25,12 +25,6 @@ The `setup.sh` script is designed to streamline the initial setup process by seq After the script finishes (typically within minutes), the graph is fully loaded and ready for immediate use in GraphStudio or via REST API. -```bash -git clone -cd Supply_Chain_Management_TigerGraph -chmod +x setup.sh -./setup.sh -``` ## Query Execution Order and Explanations The queries are independent and can be executed in any order for day-to-day analysis. There is no mandatory sequential workflow except when performing inventory simulation or batch allocation. 
From f36ce65447f336a3e6be4a4627a30b9378b2e581 Mon Sep 17 00:00:00 2001 From: user Date: Tue, 2 Dec 2025 23:39:29 +0530 Subject: [PATCH 4/7] added the Insight query and readme file --- financial_crime/application_fraud/readme.md | 434 +++++++++++++++++- .../queries/top_connected_component.gsql | 16 + .../entity_resolution_kyc/readme.md | 110 +++++ .../loading_job/load_data.gsql | 1 - 4 files changed, 540 insertions(+), 21 deletions(-) create mode 100644 financial_crime/entity_resolution_kyc/queries/top_connected_component.gsql create mode 100644 financial_crime/entity_resolution_kyc/readme.md diff --git a/financial_crime/application_fraud/readme.md b/financial_crime/application_fraud/readme.md index e776e5be..97f7f726 100644 --- a/financial_crime/application_fraud/readme.md +++ b/financial_crime/application_fraud/readme.md @@ -1,24 +1,169 @@ -# Narratives -Application fraud detection identifies and prevents unauthorized or deceptive financial applications in real-time. It analyzes shared PII attributes between Applications to detect anomalies and suspicious activity indicative of fraud. TigerGraph models complex relationships and patterns among entities such as shared Name, DOB, Email, Phone, Address, IP, ID, Device, Party, Account, and Card. This enables detection of fraudulent networks and patterns that may be difficult to uncover using traditional relational databases. Using graph algorithms enable organizations to detect and respond to fraudulent transactions quickly and efficiently, ultimately reducing financial losses and protecting consumers from fraudulent activity. +# Application Fraud (TigerGraph Solution Kit) -# Components -This repository includes multiple components: +A TigerGraph solution kit for detecting **application fraud** and uncovering +fraud rings using shared Personally Identifiable Information (PII) and **entity resolution**. -- `data` - Sample data. -- `load_jobs` - Scripts for data loading tasks. -- `meta` - Solution Kit metadata. 
-- `queries` - Collection of GSQL queries. -- `schema` - Definition of database schema. -- `readme.md` - This usage guide. -- `setup.sh` - Automated setup script. +This project provisions a complete graph environment — schema, sample data, +and GSQL queries — to analyze how applications are connected via devices, IPs, +addresses, and other identity signals. -# Instructions +With this kit you can: -The `setup.sh` script is designed to streamline the initial setup process by sequentially executing the following steps: +- Link applications into fraud rings based on shared PII. +- Judge a new application's proximity to known fraud. +- Quantify the impact of graph features with uplift metrics and other insights. +- Generate rich graph features for downstream fraud ML models. + +--- + +## Contents + +- [Overview](#overview) +- [Features](#features) +- [Prerequisites](#prerequisites) +- [Setup Instructions](#setup-instructions) +- [Installation Note for Queries](#installation-note-for-queries) +- [Working-with-the-graph](#working-with-the-graph) + - [Query Execution Order and Explanations](#query-execution-order-and-explanations) + - [Run an Example Query](#run-an-example-query) +- [Using-your-own-data](#using-your-own-data) +- [Resetting-the-environment](#resetting-the-environment) +- [ML-model-and-insights-application](#ml-model-and-insights-application) + +--- + +## Overview + +Application fraud detection focuses on spotting suspicious credit applications +before they are approved, when: + +- Multiple applications share the same PII. +- Applications are part of a larger, tightly connected ring. +- New applications appear “close” in the graph to known fraud. + +Relational databases struggle to track these many-to-many, multi-hop +relationships at scale. TigerGraph’s native graph engine is well suited +because it can: + +**Fast traversals.** Move from an application through its PII and back to other +applications in just a few hops.
+ +**Community detection.** Group related applications into communities that +represent fraud rings or clusters of highly related applications. + +**Real-time signals.** Compute graph-based fraud features on demand for +scoring new applications as they arrive. + +In this solution kit, we model a fraud detection environment with: + +1. Application vertices +2. PII vertices such as name, DOB, email, phone, address, IP, ID, device, etc. +3. Location vertices +4. Connected_Component vertices: representing communities +5. Product vertices linked to applications. + +You can use the included queries as building blocks for fraud analytics, +operational alerting, and model feature generation. + +> **Graph name:** This kit creates and uses a graph named `Application_Fraud`. + +--- + +## Features + +- **Application Fraud Graph Environment** + +This kit includes a ready-to-use `Application_Fraud` graph schema, sample data, +and a loading job that pulls this data from a public S3 bucket into TigerGraph. +The loaded data is identical to the data found in the `data/` folder of the +kit. + +- **Entity resolution pipeline** + +Use queries to determine if different records with possibly different and +overlapping details actually refer to the same real-world entity -> the same +person, account or application. + +We do this by creating a pipeline that computes similarity between +applications based on shared PII, groups related applications and then +supports incremental matching of new applications in real time. + +- **Graph feature and proximity queries** + +Queries that generate: + - Component-level features (ring size, number of fraud apps, distinct PII). + - Distances and paths from a given application to known fraud. + - CSV feature files for ML training. + +- **Insights analysis** + +Queries that take confusion-matrix counts from your ML model and calculate: + - Precision, recall, specificity, and NPV. + - Detected and missed fraud amounts. 
+ - Net financial gain and relative performance uplift. + +- **Automated setup script** + `setup.sh` creates the graph, loads the data, and installs all queries in one go. + +--- +## Prerequisites + +Before you run this solution kit, make sure you have: + +- **A running TigerGraph instance** + - TigerGraph installed and running, or use the prebuilt kit on TG cloud. + - You must have permission to create graphs and run GSQL commands. + +- **GSQL client access** + - The `gsql` command-line tool available on the same machine/container where you cloned this repo. + - Ability to connect to your TigerGraph service. + +- **Network access for sample data** + - Outbound internet access from the TigerGraph machine to read the sample + CSV files from the public S3 bucket used in `loading_job/load_data.gsql`. + +- **Shell environment** + - A Unix-like shell (Linux, macOS, or WSL) to run `setup.sh` and `queries/install_queries.sh`. + - Executable permissions for the scripts: + ```bash + chmod +x setup.sh + chmod +x queries/install_queries.sh + ``` + +> **Graph name:** This kit creates and uses a graph named `Application_Fraud`. + +# Setup Instructions +The following instructions assume that the `gsql` command is installed on the +machine where you run the scripts below.
+ +If you don't yet have the `gsql` command available, see the TigerGraph documentation: + +- **Local GSQL shell on the server** + [The GSQL Shell](https://docs.tigergraph.com/tigergraph-server/4.2/gsql-shell/) + (explains how to run `gsql` directly on a TigerGraph server) + +- **Remote GSQL client (from your laptop or another machine)** + [Using a Remote GSQL Client](https://docs.tigergraph.com/tigergraph-server/4.2/gsql-shell/using-a-remote-gsql-client) + (explains how to download the GSQL client JAR, configure SSL, and create a `gsql` alias) +Ensure that the script is executable with: +```bash + +chmod +x setup.sh + +``` +Then, run the automated script using: +```bash +./setup.sh +``` +This script will create and run the schema, load all the mock data into the graph +and install all available queries. + +To check that all the queries were installed successfully, please run the following + command: + ```bash + gsql -g Application_Fraud "SHOW QUERY *" + ``` + -1. **Schema Creation**: Initiates the schema creation process using the `schema/create_schema.gsql` script. This schema is a subset of the Super Schema in financial_crime/library. -2. **Data Loading**: Load data into the schema by running the data loading job with the `loading_job/load_data.gsql` script. -3. **Query Installation**: Completes the setup by installing necessary queries through the `queries/install_queries.sh` script. ## Installation Note for Queries @@ -42,7 +187,6 @@ These queries have weights for each PII attribute and threshold parameters used - FLOAT threshold = 1.0 When an Application matches another Application through shared PII attribute(s) the respective weight(s) will accumulate and any accumulated weights equal or over threshold is considered a match. For example if device id has a weight of 1.0 and the threshold is 1.0 then any single matching device id between any Applications will cause those respective Applications to be put into the same Connected_Component community.
If device id has a weight of 0.5 and threshold is 1.0 then it would require at least two shared device ids between any Applications to match etc. Default weights and thresholds are assigned to these queries but should be adapted if there are different business rules for matching required to satisfy. - ## Query Execution Order and Explanations ### Step 1: Insert Edges between Matching Applications @@ -93,9 +237,259 @@ If the entity_resolution response is True the incoming Application was successfu If entity_resolution is False the incoming Application was not able to be matched with the historical Application dataset and the same respective Application should be sent to `incremental_unify_entity` query so it's eligible for matching against future Applications in near real time. -## Mock Data -The `data` folder is populated with sample data files. These files are crafted for testing and demonstration purposes. + Here is a write-up of +some of the queries along with their uses: + +--- + +### 1. Entity Resolution & Fraud Ring Construction + +**match_application_entities(...)** +Creates a `Same_Application` similarity edge if the computed score of +application similarity exceeds a given threshold by comparing applications +using weighted matches on shared PII. + +--- + +**unify_application_entities()** +Scans the applications that are linked by `Same_Application` similarity edges, +and groups them into a shared `Connected_Component` vertex, and connects each +member via `Application_In_Ring`. Pairwise similarity links are turned into +"fraud ring" clusters. + +--- + +**incremental_application_match(input JSON, weights..., threshold)** +Scores a *single* new or updated application in near real time. It parses a +JSON payload containing application attributes + PII, upserts it into the graph, and +compares it against existing applications to see if it is part of the +`Connected_Component` or an existing fraud ring. 
+ +--- + +**incremental_application_unify(SET\)** +Given a set of Application vertices, this query walks their existing +Same_Application (if any) links to determine which applications belong together, +ensures there is a Connected_Component vertex for each group, and creates the +corresponding Application_In_Ring edges. + +--- + +**delete_all_application_cc_connections(num_of_batches, batch_id)** +Deletes all `Application_In_Ring` edges in batches. Use this before re-running +a full entity-resolution cycle so that you can rebuild communities from +scratch without dropping the whole graph. + +--- + +**delete_unused_cc_nodes(num_of_batches, batch_id)** +Cleans up `Connected_Component` vertices that no longer have any edges. +This keeps the graph lean and avoids clutter from obsolete components. + +--- + +**output_application_cc_to_file(output_file_path)** +Exports a simple mapping of `Application` → `Connected_Component` into a CSV +file. This is useful for joining graph-based communities back into downstream +systems such as your warehouse, feature store, or BI dashboards. + +--- + +### 2. Graph Features for Machine Learning + +**batch_application_cc_features(connections, output_file_path)** +Generates a CSV of connected-component features for every `Application`. +For each application’s community, it counts distinct PII nodes and how many +applications are connected via each PII type. Use this to build offline +training datasets capturing the structure and density of each fraud ring. + +--- + +**batch_application_distance_and_path(depth, output_file_path)** +For every `Application`, finds the closest fraud application reachable via +shared PII within a maximum hop distance, and writes out features such as: +degree of connection (number of hops), the specific fraud counterpart, and a +path description. These features can further be used for ML training. 
+ +--- + +**get_application_cc_features(application, connections)** +Computes graph features for the connected component of a single Application +Returns a structured result you can use directly at scoring time, with +`connections` acting as a cap to filter out very high-degree PII hubs. + +--- + +**get_application_fraud_status(SET\)** +Given a set of application vertices, returns the `is_fraud` boolean label for +each one. + +--- + +**set_application_fraud_status(application, fraud_status)** +Updates the `is_fraud` attribute for a specific `Application`. Use this to push +investigation outcomes or model decisions back into the graph so that future +graph features and proximity queries are based on the latest fraud labels. + +--- + +### 3. Fraud Proximity & Case Investigation + +**distance_and_path_to_fraud_application(application, depth)** +Starting from a single `Application`, this query traverses through shared PII +to find any reachable fraud applications within a given hop limit. It returns +the distance in hops, the matching fraud applications, their communities, and +the path of connection. + +--- + +**distance_and_path_to_fraud_application_vis(application, min_depth, max_depth)** +Finds fraud applications reachable from the input Application between min_depth +and max_depth hops, then collects the vertices and edges on those paths. +Returns a subgraph that can be rendered in TigerGraph visualization. + +--- + +**find_shared_piis_of_two_applications(...)** +Highlights the PII elements shared between two selected applications, +such as common device, IP, phone, or address. + +--- + +**get_top_k_connected_components_by_num_applications(top_k)** +Returns the `Connected_Component` communities with the most applications attached, +along with their member applications. Use this to find your largest rings. + +--- + +### 4. 
Portfolio & Product Insights + +**get_num_applications_by_app_status()** +Counts applications by their status (e.g., PENDING, APPROVED, DECLINED). +View your onboarding pipeline and how many applications are sitting in each decision bucket. + +--- + +**get_num_applications_by_fraud_status()** +Returns the total number of fraudulent versus legitimate applications. + +--- + +**get_top_k_products_by_num_applications(top_k, input_application_fraud_status, input_application_status)** +Returns the top products ranked by number of connected applications, +optionally filtered by application fraud status and/or application status. + +--- + +**get_top_k_products_by_num_applications_with_other(top_k, input_application_fraud_status, input_application_status)** +Provides a “top-k + Other” breakdown: the top products by number of applications +plus an aggregated “Other Products” bucket. + +--- + +### 5. Business Impact & Uplift Analytics + +These queries work together to quantify how much value you get from using graph +features (TigerGraph) versus your baseline ML model or rules. They take +confusion-matrix counts *with* and *without* graph features and turn them +into uplift and dollar amounts. + +--- + +**insights_get_application_count_by_binary_classification(...)** +Takes confusion-matrix counts for two model runs—typically a baseline model +and a model augmented with graph features—and returns a side-by-side table +plus totals. + +--- + +**insights_get_binary_classification_ratios(...)** +Computes standard model performance metrics (precision, recall, specificity, +negative predictive value) for both scenarios and expresses them as percentages. + +--- + +**insights_get_detected_fraud_amount(no_tg_true_positive, tg_true_positive, avg_loss_per_app_fraud)** +Given an average monetary loss per fraudulent application, this query converts +true positives into “detected fraud dollars” with and without graph features. 
+ +--- + +**insights_get_missed_fraud_amount(no_tg_false_negative, tg_false_negative, avg_loss_per_app_fraud)** +Calculates how much fraud slips through undetected in each scenario. This +highlights the reduction in missed fraud when graph features improve recall. + +--- + +**insights_get_net_gain_numbers_and_percentages(...)** +Computes the *counts* and *percentages* of: additional fraud blocked, net +reduction in missed fraud, and net reduction in false positives. + +--- + +### 6. Metric Utility Functions + +**utils_get_precision(true_positive, false_positive)** +**utils_get_recall(true_positive, false_negative)** +**utils_get_specificity(true_negative, false_positive)** +**utils_get_negative_predictive_value(true_negative, false_negative)** + +--- + + +## Run an Example Query + +Once the schema and data are loaded, you can inspect graph-based features for a single application using `get_application_cc_features`. + +1. Pick an `Application` ID from `data/application_fraud.csv` (for example `7da7048b-16f3-46f0-bdf4-a1ddd5c8005a`, the ID used in the command below). +2. From a terminal where the `gsql` command is available, run: + +```bash +gsql 'USE GRAPH Application_Fraud RUN QUERY get_application_cc_features("7da7048b-16f3-46f0-bdf4-a1ddd5c8005a", 25000)' +``` + + +This returns a JSON-style record with the application’s connected-component +ID, fraud count in its ring, and PII-based graph features that you can feed +directly into an ML model or rules engine. + +## Using Your Own Data + +By default, this solution kit loads **mock application and product data** from a +TigerGraph-hosted public S3 bucket, using the loading job defined in +`loading_job/load_data.gsql`. The two CSV files it loads are: + +- `application_fraud.csv` +- `products.csv` + +TigerGraph supports multiple loading options.
For detailed, up-to-date examples, +please refer to the official documentation: + +- **Data loading overview** – supported sources and general workflow + https://docs.tigergraph.com/tigergraph-server/4.2/data-loading/data-loading-overview + +- **Data loading entry page** – index of all loading methods (local files, cloud storage, warehouses, Spark, etc.) + https://docs.tigergraph.com/tigergraph-server/4.2/data-loading/ + +- **Load data from cloud storage** – step-by-step guide for Amazon S3, Google Cloud Storage, and Azure Blob Storage + https://docs.tigergraph.com/tigergraph-server/4.2/data-loading/load-from-cloud + + + +## Resetting the Environment + +Sometimes you’ll want to wipe the graph data and start fresh — for example, when +switching to a new dataset or after updating the schema and loading logic. + +This repository provides a utility query to delete all data, and a separate +reset script for more advanced workflows. + +To run it: + +```bash +gsql reset/reset.gsql +``` ## ML Model and Insights Application -You can find the instructions for training the ML model and its performance metrics within the documentation located in the model folder. The Insights Applications are available as JSON files in the meta folder. + The Insights Applications are available as JSON files in the meta folder. 
diff --git a/financial_crime/entity_resolution_kyc/queries/top_connected_component.gsql b/financial_crime/entity_resolution_kyc/queries/top_connected_component.gsql new file mode 100644 index 00000000..50eb2b95 --- /dev/null +++ b/financial_crime/entity_resolution_kyc/queries/top_connected_component.gsql @@ -0,0 +1,16 @@ +CREATE OR REPLACE DISTRIBUTED QUERY Top_Connected_Component(/* Parameters here */) FOR GRAPH Entity_Resolution_KYC { + SumAccum<INT> @parties,@address,@phone,@risk; + res = + SELECT s FROM + Connected_Component:s - () - Party:p - () - (Address|Phone):t + ACCUM + CASE WHEN t.type == "Address" THEN s.@address += 1 ELSE s.@phone += 1 END + // Both tallies live on the component (s): PRINT reads res.@address / res.@phone from s, never from t + POST-ACCUM s.@parties += s.outdegree("Entity_In_Ring"),s.@risk += sum(s.neighborAttribute( "Entity_In_Ring", "Party", "is_fraud" )) + // @parties = Entity_In_Ring degree of the component; @risk = sum of is_fraud over its Entity_In_Ring Party neighbors + + + ORDER BY s.@parties DESC + ; + PRINT res[res.id as ComponentId,res.@parties as Parties,res.@address as Address,res.@phone as Phone,res.@risk as Risk]; +} \ No newline at end of file diff --git a/financial_crime/entity_resolution_kyc/readme.md b/financial_crime/entity_resolution_kyc/readme.md new file mode 100644 index 00000000..0742200d --- /dev/null +++ b/financial_crime/entity_resolution_kyc/readme.md @@ -0,0 +1,110 @@ + + +## Narratives: Enhanced KYC and Fraud Linkage + +Accurate **Entity Resolution (ER)** is the foundation of effective Know Your Customer (KYC) compliance and financial crime detection. Traditional systems often fail to link scattered customer records, leading to gaps in risk profiles. This TigerGraph solution addresses this by: + +* **Establishing a Single Customer View (SCV):** By clustering all PII-linked records (via the **`Same_As`** edge) into a single **`Connected_Component`**, the solution creates an authoritative view of the customer, crucial for compliance.
+* **Weighted Weakly Connected Components (WCC):** We use an approximate **weighted matching algorithm** where shared attributes (Email, Phone, Name, etc.) contribute a customizable **score** to the total match. This provides a high-fidelity, tunable mechanism for linking parties. +* **Real-Time Fraud Linkage:** Any new Party is instantly matched against existing entities using `incremental_party_match`. If a match is found and the existing entity belongs to a component linked to a known fraudulent party, the new entity is immediately flagged, drastically reducing risk exposure. +* **Feature Engineering for Downstream ML:** The resulting `Connected_Component` structure allows for powerful feature extraction, providing machine learning models with metrics like "Count of Parties in Component" or "Shortest Path to Known Fraudulent Party." + +*** + +## Components + +This repository is structured to provide a comprehensive, ready-to-deploy graph solution. + +| Component | Description | +| :--- | :--- | +| `Schema` | Definition of the database schema, focusing on the **`Party`** vertex, PII, and the `Connected_Component` structure (18 core vertices). | +| `Queries` | A collection of **GSQL queries** for matching, unification, feature engineering, and real-time lookup. | +| `Mock Data` | Sample data for Party, PII, and relationship loading. | +| `Loading Jobs` | Two loading jobs: one for cloud integration and the other for on-prem local deployment. | +| `README.md` | This usage guide. | +| `setup.sh` | Automated setup script for schema creation, loading, and query installation. | + +*** + +## ML Features: ER-Derived Features for Risk Scoring + +The structure created by the Entity Resolution process generates powerful features for risk scoring and identifying high-risk entities. + +1. **Community Size :** + * **Feature:** The total count of **`Party`** vertices belonging to the same **`Connected_Component`**. 
+ * **Insight:** Larger components may indicate potential fraud rings, as numerous distinct records share PII/attributes. + * **Query Source:** `batch_party_cc_features` or `get_party_cc_features`. + +2. **Shortest Path Length to Fraud:** + * **Feature:** The shortest path length from a given **`Party`** or **`Connected_Component`** to any known **fraudulent Party** (`Party.is_fraud = 1`). + * **Insight:** Direct proximity (short path) is a primary risk metric for linkage analysis. + * **Query Source:** `batch_party_distance_and_path` or `distance_and_path_to_fraud_party`. + +3. **PII Overlap Count:** + * **Feature:** The accumulated weighted score of matching PII attributes (Full Name, DOB, Email, etc.) between a target Party and other Parties in the graph. + * **Insight:** Quantifies the **strength** and **type** of common links that resulted in the entity match. + * **Query Source:** Calculated within `incremental_party_match` and `match_party_entities`. + +4. **PII Multi-Party Degree:** + * **Feature:** The out-degree of a critical PII vertex (e.g., `Email`, `Phone`) that is shared by multiple distinct parties. + * **Insight:** High-degree PIIs (like an IP address used by many accounts) are strong indicators of shared risk or a single fraud operation. + +5. **Linked Asset Count:** + * **Feature:** The number of unique `Account`, `Card`, and `Application` vertices linked to the Party's **`Connected_Component`**. + * **Insight:** Assesses the total scope and exposure of the consolidated customer view. + +*** + +## Instructions + +### Setup and Installation + +1. **Schema Creation:** Run the general schema creation script, followed by the specific ER/KYC schema script. +2. **Data Loading:** Load data into the schema by running the data loading job script (`local_loading_job.gsql`). +3. **Query Installation:** Install all **15 GSQL queries** by executing the `install_queries.sh` script. 
+ +### Query Execution Order and Explanations + +The solution requires a sequential execution of the batch queries to populate the core ER structure. + +#### Step 1: Graph Cleanup (Maintenance) + +*(Optional, but highly recommended before a fresh batch run)* + +| Query | Purpose | +| :--- | :--- | +| **`delete_all_party_cc_connections`** | Deletes old `Entity_In_Ring` edges to clear previous clustering results. | +| **`delete_unused_cc_nodes`** | Removes `Connected_Component` vertices that no longer link to any `Party`. | + +#### Step 2: Batch Entity Resolution (Matching and Unification) + +This is the core process to establish the links and the clusters. + +| Query | Purpose | Resulting Edges/Vertices | +| :--- | :--- | :--- | +| **`match_party_entities`** | Calculates the weighted similarity score between all potential Party pairs and inserts the **`Same_As`** edge if the score exceeds the `threshold`. | Inserts **`Same_As`** edges | +| **`unify_party_entities`** | Runs the Connected Components algorithm based on all existing `Same_As` edges. | Creates **`Connected_Component`** vertices; Inserts **`Entity_In_Ring`** edges | + +#### Step 3: Feature Engineering (Batch Calculation) + +These queries run after Step 2 is complete. They calculate the ML features and output the results. + +| Query | Purpose | Output/Target | +| :--- | :--- | :--- | +| **`batch_party_cc_features`** | Calculates structural features (e.g., CC size) for every Party in the graph. | Writes results to output file. | +| **`batch_party_distance_and_path`** | Finds the shortest path and minimum distance from all parties to known fraud (`Party.is_fraud = 1`). | Writes results to output file. | +| **`output_party_cc_to_file`** | Exports the final Party-to-Connected\_Component mapping. | Writes mapping to CSV file. | + +#### Step 4: Real-Time / Incremental Queries + +These queries are used for real-time lookups and transactional updates. 
+ +| Query | Purpose | Type | +| :--- | :--- | :--- | +| **`incremental_party_match`** | Inserts a new Party and attempts to match it to an existing `Connected_Component`. | Real-Time Insert/Match | +| **`incremental_party_unify`** | Creates a new `Connected_Component` for a Party that did not match any existing component. | Real-Time Unification | +| **`set_party_fraud_status`** | Updates the `is_fraud` attribute for a given Party. | Real-Time Update | +| **`get_party_cc_features`** | Retrieves pre-computed features for a single Party. | Real-Time Lookup | +| **`distance_and_path_to_fraud_party`** | Retrieves the path to fraud for a single Party. | Investigation/Lookup | +| **`find_shared_piis_of_two_parties`** | Finds the specific PII attributes that connect any two parties. | Investigation/Lookup | + diff --git a/financial_crime/transaction_fraud/loading_job/load_data.gsql b/financial_crime/transaction_fraud/loading_job/load_data.gsql index da65368d..f74b1134 100644 --- a/financial_crime/transaction_fraud/loading_job/load_data.gsql +++ b/financial_crime/transaction_fraud/loading_job/load_data.gsql @@ -9,7 +9,6 @@ CREATE DATA_SOURCE s1 = """{ CREATE LOADING JOB loading_data FOR GRAPH Transaction_Fraud { DEFINE FILENAME MyDataSource="$s1:s3://tigergraph-solution-kits/financial_crime/transaction_fraud/data/"; - LOAD MyDataSource TO VERTEX Merchant_Category VALUES($22) USING SEPARATOR="|", HEADER="true", EOL="\n", QUOTE="double"; LOAD MyDataSource TO VERTEX Merchant VALUES($25, _, _, _) USING SEPARATOR="|", HEADER="true", EOL="\n", QUOTE="double"; LOAD MyDataSource TO VERTEX Payment_Transaction VALUES($20, $28, $23, $24, $21, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _) USING SEPARATOR="|", HEADER="true", EOL="\n", QUOTE="double"; From b51250d1a32748900b4fc436b2a594839e4bcc0e Mon Sep 17 00:00:00 2001 From: user Date: Tue, 2 Dec 2025 23:41:30 +0530 Subject: [PATCH 5/7] added insight file --- 
.../meta/Insights_dashboard.json | 488 ++++++++++++++++++ 1 file changed, 488 insertions(+) create mode 100644 financial_crime/entity_resolution_kyc/meta/Insights_dashboard.json diff --git a/financial_crime/entity_resolution_kyc/meta/Insights_dashboard.json b/financial_crime/entity_resolution_kyc/meta/Insights_dashboard.json new file mode 100644 index 00000000..c1740344 --- /dev/null +++ b/financial_crime/entity_resolution_kyc/meta/Insights_dashboard.json @@ -0,0 +1,488 @@ +{ + "defaultGraph": "AntiFraud", + "iconURL": "/insights/static/media/atom.14f5dd297b1a450cae3413a44f69a75b.svg", + "id": "arDCiwHPNbgkqU51x1tBNY", + "pageConfigSeparated": true, + "pages": [ + { + "chartMap": { + "2g1ThadQg96eQw9vWvjWKc": { + "chartSettings": { + "rulesByType": { + "Party": [ + { + "condition": "=", + "conditionEndValue": 1, + "conditionStartValue": 0, + "conditionValue": "1", + "fieldName": "is_fraud", + "fieldType": "number", + "palateName": "", + "styleEndLabel": "", + "styleEndValue": "", + "styleKey": "background-color", + "styleLabel": "Vertex color", + "styleStartLabel": "", + "styleStartValue": "", + "styleType": "color", + "styleValue": "#ff0000" + } + ] + } + }, + "graphName": "Entity_Resolution_KYC", + "hideWidgetName": false, + "id": "2g1ThadQg96eQw9vWvjWKc", + "patternLimit": 5, + "query": "INTERPRET QUERY(VERTEX componentId) FOR GRAPH Entity_Resolution_KYC {\n src = {componentId};\n SetAccum<EDGE> @@edge;\n SetAccum<VERTEX> @@node;\n res = \n SELECT s FROM\n src:s - (:e1) - Party:p - (:e2) - (Address|Phone):t \n ACCUM @@edge += e1,@@edge += e2,@@node += s,@@node += p,@@node += t\n ;\n PRINT @@edge;\n \n nodes = {@@node};\n PRINT nodes;\n}", + "queryType": "interactive", + "refreshRate": 0, + "searchPattern": [], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "Connected Component Graph", + "type": "internal-graph", + "version": "1764696139513871285" + }, + "2zfNGnX5qw4YvjKuguE776": { + "chartSettings": { + "values": [ + { + "fontColor": "#000000", + "fontSize":
32, + "iconColor": "#000000", + "iconPostion": "", + "iconSize": 24, + "id": "h48iV6XssVA3mGqRSn3dVa", + "key": "party", + "label": "Total Parties", + "labelColor": "#000000", + "labelPostion": "", + "labelSize": 20, + "styleRule": [] + }, + { + "fontColor": "#000000", + "fontSize": 32, + "iconColor": "#000000", + "iconPostion": "", + "iconSize": 24, + "id": "1D9rXhSo9qv3jjqj6S7Z59", + "key": "active_address", + "label": "Active Address", + "labelColor": "#000000", + "labelPostion": "", + "labelSize": 20, + "styleRule": [] + }, + { + "fontColor": "#000000", + "fontSize": 32, + "iconColor": "#000000", + "iconPostion": "", + "iconSize": 24, + "id": "jJhtv54FHCFSaoUZD7Bcy7", + "key": "connected_component", + "label": "Connected Component", + "labelColor": "#000000", + "labelPostion": "", + "labelSize": 20, + "styleRule": [] + }, + { + "fontColor": "#000000", + "fontSize": 32, + "iconColor": "#000000", + "iconPostion": "", + "iconSize": 24, + "id": "uh2uw457meuyBNT7TnVhyW", + "key": "risk_alert", + "label": "Risk Alert", + "labelColor": "#000000", + "labelPostion": "", + "labelSize": 20, + "styleRule": [] + } + ] + }, + "graphName": "Entity_Resolution_KYC", + "hideWidgetName": false, + "id": "2zfNGnX5qw4YvjKuguE776", + "patternLimit": 5, + "query": "INTERPRET QUERY() FOR GRAPH Entity_Resolution_KYC {\n res = \n SELECT s from Party:s\n ;\n PRINT res.size() as party;\n \n res = SELECT s from\n Address:s - () - Party:p;\n PRINT res.size() as active_address;\n \n res = SELECT s FROM Connected_Component:s - () - Party:p;\n PRINT res.size() as connected_component;\n \n res = SELECT s FROM Connected_Component:s - () - Party:p WHERE p.is_fraud > 0;\n PRINT res.size() as risk_alert;\n}", + "queryType": "interactive", + "refreshRate": 0, + "searchPattern": [], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "", + "type": "value", + "version": "1764693452951940468" + }, + "6Yj4WGLfhs5iobubJJAgza": { + "chartSettings": { + "category": [ + { + "id": "typ", + "type": 
"string" + } + ], + "radius": [ + 0, + 71 + ], + "tableHeaders": [ + "cnt", + "typ" + ], + "tableIndex": 0, + "value": [ + { + "id": "cnt", + "type": "number" + } + ] + }, + "graphName": "Entity_Resolution_KYC", + "hideWidgetName": false, + "id": "6Yj4WGLfhs5iobubJJAgza", + "patternLimit": 5, + "query": "INTERPRET QUERY() FOR GRAPH Entity_Resolution_KYC {\n GroupByAccum cnt> @@grp;\n SumAccum @connCount;\n \n parties = SELECT p FROM Party:p\n ACCUM p.@connCount = p.outdegree()\n POST-ACCUM\n CASE \n WHEN p.@connCount <= 1 THEN @@grp += (\"Single Entity\"->1)\n WHEN p.@connCount <= 5 THEN @@grp += (\"2-5 Connections\"->1)\n WHEN p.@connCount <= 10 THEN @@grp += (\"6-10 Connections\"->1)\n ELSE @@grp += (\"11+ Connections\"->1)\n END;\n \n PRINT @@grp;\n}", + "queryType": "interactive", + "refreshRate": 0, + "searchPattern": [], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "Connection Distribution", + "type": "pie", + "version": "1764696420610182024" + }, + "aZsdFDfkYXbLqore5CeS77": { + "chartSettings": { + "category": [ + { + "id": "entity", + "type": "string" + } + ], + "tableHeaders": [ + "cnt", + "entity" + ], + "tableIndex": 0, + "value": [ + { + "id": "cnt", + "type": "number" + } + ] + }, + "graphName": "Entity_Resolution_KYC", + "hideWidgetName": false, + "id": "aZsdFDfkYXbLqore5CeS77", + "patternLimit": 5, + "query": "INTERPRET QUERY() FOR GRAPH Entity_Resolution_KYC {\n GroupByAccum cnt> @@grp;\n \n partySrc = \n SELECT s FROM Party:s ACCUM @@grp += (\"Party\"->1)\n ;\n res = \n SELECT t from partySrc:s - () - (Address|Phone|Email|Device|IP):t \n POST-ACCUM @@grp += (t.type->1)\n ;\n PRINT @@grp;\n}", + "queryType": "interactive", + "refreshRate": 0, + "searchPattern": [], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "Entity Type Distribution", + "type": "bar", + "version": "1764695406058758862" + }, + "kSo7qKu14XSMP6mQCjryv1": { + "chartSettings": { + "showColumns": [ + { + "isChecked": false, + "name": "v_id" + }, + { 
+ "isChecked": false, + "name": "v_type" + }, + { + "isChecked": true, + "name": "Address" + }, + { + "isChecked": true, + "name": "ComponentId" + }, + { + "isChecked": true, + "name": "Parties" + }, + { + "isChecked": true, + "name": "Phone" + }, + { + "isChecked": true, + "name": "Risk" + } + ], + "sortedColumns": [ + "ComponentId", + "Parties", + "Address", + "Phone", + "Risk", + "Action" + ], + "tableActions": [ + { + "pageID": "viZkpMCa5wFYgruGR9JSWy", + "params": [ + { + "isCreatable": false, + "name": "componentId", + "paramGlobalInput": "", + "value": "ComponentId" + } + ], + "text": "click", + "url": "", + "urlName": "New Page" + } + ], + "tableColumns": [ + { + "id": "Matched pattern", + "isChecked": "true", + "label": "" + }, + { + "children": [ + { + "children": [ + { + "id": "phone_number", + "isChecked": "true", + "type": "vertex" + } + ], + "id": "Phone", + "isChecked": "true", + "type": "vertex" + }, + { + "children": [ + { + "id": "address", + "isChecked": "true", + "type": "vertex" + } + ], + "id": "Address", + "isChecked": "true", + "type": "vertex" + } + ], + "id": "Vertices", + "isChecked": "true", + "isExpanded": true, + "label": "Vertices" + }, + { + "children": [], + "id": "Edges", + "isChecked": "true", + "isExpanded": false, + "label": "Edges" + } + ], + "tableHeaders": [ + "v_id", + "v_type", + "Address", + "ComponentId", + "Parties", + "Phone", + "Risk" + ] + }, + "graphName": "Entity_Resolution_KYC", + "hideWidgetName": false, + "id": "kSo7qKu14XSMP6mQCjryv1", + "patternLimit": 5, + "query": "INTERPRET QUERY() FOR GRAPH Entity_Resolution_KYC {\n SumAccum<INT> @parties,@address,@phone,@risk;\n res = \n SELECT s FROM\n Connected_Component:s - () - Party:p - () - (Address|Phone):t\n ACCUM\n CASE WHEN t.type == \"Address\" THEN s.@address += 1 ELSE s.@phone += 1 END\n //CASE WHEN p.is_fraud > 0 THEN s.@risk += 1 END\n POST-ACCUM s.@parties += s.outdegree(\"Entity_In_Ring\"),s.@risk += sum(s.neighborAttribute( \"Entity_In_Ring\", \"Party\",
\"is_fraud\" ))\n\n\n \n ORDER BY s.@parties DESC\n ;\n PRINT res[res.id as ComponentId,res.@parties as Parties,res.@address as Address,res.@phone as Phone,res.@risk as Risk];\n}", + "queryType": "pattern", + "refreshRate": 0, + "searchPattern": [ + { + "data": "Top_Connected_Component", + "id": "Top_Connected_Component", + "type": "QUERY" + } + ], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "Top Connected Component", + "type": "table", + "version": "1764697101443443105" + }, + "pWVjbPx54GYQyYYTHe8Qpj": { + "chartSettings": { + "inputStates": [ + { + "dataType": "vertex", + "id": "input_rjFCLCFNBMSX1tGBx9rHuK", + "name": "componentId", + "selectedGraph": "Entity_Resolution_KYC", + "settings": { + "fixVertexType": true + }, + "vertexType": "Connected_Component", + "widgetType": "Input" + } + ] + }, + "graphName": "AntiFraud", + "hideWidgetName": false, + "id": "pWVjbPx54GYQyYYTHe8Qpj", + "patternLimit": 5, + "query": "", + "queryType": "pattern", + "refreshRate": 0, + "searchPattern": [], + "staticData": "[\n {\n \"key\": \"value\"\n }\n]", + "title": "New Widget", + "type": "Inputs", + "version": "1764696049224336494" + } + }, + "globalParameters": { + "componentId": { + "id": "input_rjFCLCFNBMSX1tGBx9rHuK", + "name": "componentId", + "type": "VERTEX", + "value": { + "vertexID": "1678770176", + "vertexType": "Connected_Component" + } + } + }, + "iconURL": "/insights/static/media/library.d3f7f207c6bb1d7be8e64045a19991b2.svg", + "id": "viZkpMCa5wFYgruGR9JSWy", + "isDetail": true, + "isNew": false, + "layouts": { + "md": [ + { + "h": 33, + "i": "2g1ThadQg96eQw9vWvjWKc", + "moved": false, + "static": false, + "w": 7, + "x": 5, + "y": 28 + }, + { + "h": 8, + "i": "2zfNGnX5qw4YvjKuguE776", + "moved": false, + "static": false, + "w": 12, + "x": 0, + "y": 0 + }, + { + "h": 20, + "i": "6Yj4WGLfhs5iobubJJAgza", + "moved": false, + "static": false, + "w": 5, + "x": 0, + "y": 8 + }, + { + "h": 20, + "i": "aZsdFDfkYXbLqore5CeS77", + "moved": false, + 
"static": false, + "w": 7, + "x": 5, + "y": 8 + }, + { + "h": 33, + "i": "kSo7qKu14XSMP6mQCjryv1", + "moved": false, + "static": false, + "w": 5, + "x": 0, + "y": 28 + }, + { + "h": 12, + "i": "pWVjbPx54GYQyYYTHe8Qpj", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 61 + } + ], + "xs": [ + { + "h": 8, + "i": "2zfNGnX5qw4YvjKuguE776", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 0 + }, + { + "h": 20, + "i": "6Yj4WGLfhs5iobubJJAgza", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 8 + }, + { + "h": 20, + "i": "aZsdFDfkYXbLqore5CeS77", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 28 + }, + { + "h": 33, + "i": "kSo7qKu14XSMP6mQCjryv1", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 48 + }, + { + "h": 25, + "i": "2g1ThadQg96eQw9vWvjWKc", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 81 + }, + { + "h": 12, + "i": "pWVjbPx54GYQyYYTHe8Qpj", + "moved": false, + "static": false, + "w": 4, + "x": 0, + "y": 106 + } + ] + }, + "title": "Entity Network Dashboard", + "version": "1764696199602489941", + "weight": 20 + } + ], + "title": "Entity_Resolution_KYC", + "userRoleForApp": "owner", + "version": "1764673492229672678" +} \ No newline at end of file From f7425c0a1f7611f927a6a1ca2315744cede4fd08 Mon Sep 17 00:00:00 2001 From: user Date: Sat, 6 Dec 2025 16:17:35 +0530 Subject: [PATCH 6/7] added extra query recomended by Jim --- ...ected_component_entities_same_as_edge.gsql | 51 ++++++++++++++ ..._connected_components_to_entities_vis.gsql | 64 ++++++++++++++++++ .../queries/explore_er_steps.gsql | 67 +++++++++++++++++++ 3 files changed, 182 insertions(+) create mode 100644 financial_crime/entity_resolution_kyc/queries/expand_connected_component_entities_same_as_edge.gsql create mode 100644 financial_crime/entity_resolution_kyc/queries/expand_connected_components_to_entities_vis.gsql create mode 100644 financial_crime/entity_resolution_kyc/queries/explore_er_steps.gsql diff 
--git a/financial_crime/entity_resolution_kyc/queries/expand_connected_component_entities_same_as_edge.gsql b/financial_crime/entity_resolution_kyc/queries/expand_connected_component_entities_same_as_edge.gsql new file mode 100644 index 00000000..9dbae1c3 --- /dev/null +++ b/financial_crime/entity_resolution_kyc/queries/expand_connected_component_entities_same_as_edge.gsql @@ -0,0 +1,51 @@ +CREATE OR REPLACE DISTRIBUTED QUERY expand_connected_component_entities_same_as_edge ( + VERTEX input_connected_component +) { + /* + Query: expand_connected_component_entities_same_as_edge + Description: + Gives a holistic view of the input Connected_Component: starts at the member Party whose internal vertex id equals the component id and expands breadth-first over Same_As edges + + input: Connected_Component : `1678770178` + + output: the input component, every Party reached, and the Entity_In_Ring / Same_As edges traversed + */ + OrAccum @visited; + SetAccum<EDGE> @@edges_to_display; + input_connected_component_set = {input_connected_component}; + + curr_linked_parties = SELECT t + FROM input_connected_component_set:s -(Entity_In_Ring:e)- Party:t + WHERE getvid(t) == s.id + ACCUM @@edges_to_display += e + POST-ACCUM t.@visited += True // mark the seed so it is reported even when it has no Same_As edge + ; + + WHILE curr_linked_parties.size() > 0 DO + curr_linked_parties = SELECT t + FROM curr_linked_parties:s -(Same_As:e)- Party:t + ACCUM @@edges_to_display += e + HAVING NOT t.@visited + ; + + curr_linked_parties = SELECT s + FROM curr_linked_parties:s + POST-ACCUM s.@visited += True + ; + END; + + all_related_parties = SELECT s + FROM Party:s + WHERE s.@visited + ; + + PRINT + input_connected_component + ; + + PRINT + input_connected_component_set, + all_related_parties + ; + PRINT @@edges_to_display; +} \ No newline at end of file diff --git a/financial_crime/entity_resolution_kyc/queries/expand_connected_components_to_entities_vis.gsql b/financial_crime/entity_resolution_kyc/queries/expand_connected_components_to_entities_vis.gsql new file mode 100644 index 00000000..1695e1b0 --- /dev/null +++ b/financial_crime/entity_resolution_kyc/queries/expand_connected_components_to_entities_vis.gsql @@ -0,0 +1,64 @@
+CREATE OR REPLACE DISTRIBUTED QUERY expand_connected_components_to_entities_vis ( + VERTEX input_connected_component, + BOOL show_only_shared_features = False +) { + + /* + Query: expand_connected_components_to_entities_vis + Description: + Gives a holistic view of the Connected_Component: its member parties plus the attribute/asset vertices linked to them + input: Connected_Component : `1678770178` + show_only_shared_features: when True, keep only linked vertices whose link count from member parties is > 1 + + output: connected edges + */ + SumAccum<INT> @num_parties_linked_to_features; + SetAccum<EDGE> @@edges_to_display; + input_connected_component_set = {input_connected_component}; + + SetAccum<STRING> @@edge_type_set = ( + "Has_Full_Name", + "Has_DOB", + "Has_Email", + "Has_Phone", + "Has_Address", + "Has_IP", + "Has_ID", + "Has_Device", + "Application_Has_Party", + "Party_Has_Account", + "Party_Has_Card" + ); + + linked_parties = SELECT t + FROM input_connected_component_set:s -(Entity_In_Ring:e)- Party:t + ACCUM @@edges_to_display += e + ; + + selected_features_to_display = SELECT t + FROM linked_parties:s -(@@edge_type_set:e)- :t + ACCUM t.@num_parties_linked_to_features += 1 + ; + + IF show_only_shared_features THEN + selected_features_to_display = SELECT s + FROM selected_features_to_display:s + WHERE s.@num_parties_linked_to_features > 1 + ; + END; + + selected_features_to_display = SELECT s + FROM selected_features_to_display:s -(@@edge_type_set:e)- linked_parties:t + ACCUM @@edges_to_display += e + ; + + PRINT + input_connected_component, + show_only_shared_features + ; + PRINT + input_connected_component_set, + linked_parties, + selected_features_to_display + ; + PRINT @@edges_to_display; +} \ No newline at end of file diff --git a/financial_crime/entity_resolution_kyc/queries/explore_er_steps.gsql b/financial_crime/entity_resolution_kyc/queries/explore_er_steps.gsql new file mode 100644 index 00000000..e75ded08 --- /dev/null +++ b/financial_crime/entity_resolution_kyc/queries/explore_er_steps.gsql @@ -0,0 +1,67 @@ +CREATE OR REPLACE DISTRIBUTED QUERY explore_er_steps ( + SET<VERTEX<Party>>
party_to_explore, + BOOL explore_step_01, + BOOL explore_step_02, + BOOL explore_step_03 +) { + // The visualization for this query is more intuitive + // when all vertices in party_to_explore belong to the same vertex community + + // Step 01 = explore attribute edges - before match_party_entities + // Step 02 = status after match_party_entities was run + // Step 03 = status after unify_party_entities was run + SetAccum<EDGE> @@edges_to_display; + SetAccum<STRING> @@attr_edges; + OrAccum @is_start_set; + + @@attr_edges += ( + "Has_Full_Name", + "Has_DOB", + "Has_Email", + "Has_Phone", + "Has_Address", + "Has_IP", + "Has_ID", + "Has_Device", + "Application_Has_Party", + "Party_Has_Account", + "Party_Has_Card" + ); + + party_to_explore_set = {party_to_explore}; + + party_to_explore_set = SELECT s + FROM party_to_explore_set:s + POST-ACCUM s.@is_start_set += TRUE + ; + + IF explore_step_01 THEN + attributes_explore_step_01 = SELECT t + FROM party_to_explore_set:s -(@@attr_edges:e)- :t + ACCUM @@edges_to_display += e + ; + + PRINT attributes_explore_step_01; + END; + + IF explore_step_02 THEN + party_with_same_as_edges = SELECT s + FROM party_to_explore_set:s -(Same_As:e)- Party:t + WHERE t.@is_start_set == TRUE + ACCUM @@edges_to_display += e + ; + END; + + IF explore_step_03 THEN + curr_party_cc_step_03 = SELECT t + FROM party_to_explore_set:s -(Entity_In_Ring:e)- Connected_Component:t + ACCUM @@edges_to_display += e + ; + + PRINT curr_party_cc_step_03; + END; + + + PRINT party_to_explore_set; + PRINT @@edges_to_display; +} \ No newline at end of file From 854aad1802cded3969102a0a87459fc6a9f9918e Mon Sep 17 00:00:00 2001 From: user Date: Mon, 8 Dec 2025 11:39:57 +0530 Subject: [PATCH 7/7] added comments to the code --- .../queries/batch_party_cc_features.gsql | 19 ++++++++++++++++ .../batch_party_distance_and_path.gsql | 20 +++++++++++++++++ .../delete_all_party_cc_connections.gsql | 20 +++++++++++++++++ .../queries/delete_unused_cc_nodes.gsql | 20 +++++++++++++++++
.../distance_and_path_to_fraud_party.gsql | 19 ++++++++++++++++ .../find_shared_piis_of_two_parties.gsql | 21 ++++++++++++++++++ .../queries/get_party_cc_features.gsql | 19 ++++++++++++++++ .../queries/get_party_fraud_status.gsql | 19 ++++++++++++++++ .../queries/incremental_party_match.gsql | 19 ++++++++++++++++ .../queries/incremental_party_unify.gsql | 22 +++++++++++++++++++ .../queries/match_party_entities.gsql | 22 +++++++++++++++++++ .../queries/output_party_cc_to_file.gsql | 21 +++++++++++++++++- .../queries/top_connected_component.gsql | 20 ++++++++++++++++- .../queries/unify_party_entities.gsql | 21 ++++++++++++++++++ 14 files changed, 280 insertions(+), 2 deletions(-) diff --git a/financial_crime/entity_resolution_kyc/queries/batch_party_cc_features.gsql b/financial_crime/entity_resolution_kyc/queries/batch_party_cc_features.gsql index f4acba91..e1e37768 100644 --- a/financial_crime/entity_resolution_kyc/queries/batch_party_cc_features.gsql +++ b/financial_crime/entity_resolution_kyc/queries/batch_party_cc_features.gsql @@ -1,5 +1,24 @@ CREATE OR REPLACE DISTRIBUTED QUERY batch_party_cc_features(INT connections=25000, STRING output_file_path = "/home/tigergraph/gsql_output/batch_party_cc_features.csv") { + /* + Query Name: batch_party_cc_features + + Purpose: Generate Machine Learning Features for Fraud Detection + This query traverses the graph to extract aggregated features from the Connected Component (CC) + that each 'Party' belongs to, focusing on shared Personally Identifiable Information (PII) + and other related entities (Accounts, Cards, etc.). + + Key Use Cases: + • Feature Engineering: Create robust, graph-based variables to train a fraud prediction model. + • Ring Detection: Identify 'Parties' that are linked via shared PII, often indicative of fraud rings. + • Risk Profiling: Assess the risk of a Party based on the size and fraudulence of its community. 
+ + Parameters: + connections: Upper limit for the out-degree of PII/Entity nodes (e.g., limit a shared IP address + to prevent overly broad, non-specific connections, often called 'Super-Node' filtering). + output_file_path: Location to save the resulting CSV file with all features. + */ + MapAccum @@connect_component, @@distinct_name, @@distinct_dob, @@distinct_email, @@distinct_phone, @@distinct_address, @@distinct_ip, @@distinct_id, @@distinct_device, @@distinct_party, @@distinct_account, @@distinct_card, @@connected_via_name, @@connected_via_dob, @@connected_via_email, @@connected_via_phone, @@connected_via_address, @@connected_via_ip, @@connected_via_id, @@connected_via_device, @@connected_via_party, @@connected_via_account, @@connected_via_card, @@only_connected_via_name, @@only_connected_via_dob, @@only_connected_via_email, @@only_connected_via_phone, @@only_connected_via_address, @@only_connected_via_ip, @@only_connected_via_id, @@only_connected_via_device, @@only_connected_via_party, @@only_connected_via_account, @@only_connected_via_card; OrAccum @connected_via_name, @connected_via_dob, @connected_via_email, @connected_via_phone, @connected_via_address, @connected_via_ip, @connected_via_id, @connected_via_device, @connected_via_party, @connected_via_account, @connected_via_card; SetAccum @middle_type; diff --git a/financial_crime/entity_resolution_kyc/queries/batch_party_distance_and_path.gsql b/financial_crime/entity_resolution_kyc/queries/batch_party_distance_and_path.gsql index 2c1ff471..bdd684db 100644 --- a/financial_crime/entity_resolution_kyc/queries/batch_party_distance_and_path.gsql +++ b/financial_crime/entity_resolution_kyc/queries/batch_party_distance_and_path.gsql @@ -1,5 +1,25 @@ CREATE DISTRIBUTED QUERY batch_party_distance_and_path(INT depth=5, STRING output_file_path = "/home/tigergraph/gsql_output/batch_party_distance_and_path_features.csv") { + /* + Query Name: batch_party_distance_and_path + Fraudulent Party Linkage Analysis (Shortest 
Path & Distance) + + Purpose: Graph Linkage Feature Generation + This query is run for *every* 'Party' in the graph (a batch process) to: + 1. Find the shortest path and distance (in hops) to any other 'Party' that is marked as **fraudulent**. + 2. Record the precise path of shared PII/Entities that connects the two. + 3. Generate a feature set that shows the degree of separation between a Party and known fraud. + + Key Use Cases: + • Feature Engineering: Create "Distance-to-Fraud" and "Path-Type" features for ML models. + • Risk Prioritization: Identify non-fraudulent parties (is_fraud=0) that are only 1-2 hops away from known fraud. + • Investigation: Quickly trace the PII/Entity linkage for high-risk individuals. + + Parameters: + depth: Maximum traversal depth (number of hops) for the search. Limits search time and focuses on close connections. + output_file_path: Location to save the resulting CSV file. + */ + MapAccum, VERTEX> @@cc_map; MaxAccum @dis; MaxAccum @visited; diff --git a/financial_crime/entity_resolution_kyc/queries/delete_all_party_cc_connections.gsql b/financial_crime/entity_resolution_kyc/queries/delete_all_party_cc_connections.gsql index 8c9cb643..0139514d 100644 --- a/financial_crime/entity_resolution_kyc/queries/delete_all_party_cc_connections.gsql +++ b/financial_crime/entity_resolution_kyc/queries/delete_all_party_cc_connections.gsql @@ -1,5 +1,25 @@ CREATE OR REPLACE DISTRIBUTED QUERY delete_all_party_cc_connections(INT num_of_batches = 1, INT batch_id = 0) { + /* + Query Name: delete_all_party_cc_connections + Graph Maintenance: Clear Connected Component (CC) Assignments + + Purpose: Preparation for Community Detection + This query's sole purpose is to efficiently **delete** all existing `Entity_In_Ring` edges + that link `Party` vertices to their associated `Connected_Component` vertices. 
This action is + a prerequisite for running a fresh community detection algorithm (like a new Connected + Component run) to ensure the graph features are calculated based on the latest community structure. + + Key Concept: Batch Processing + The query is designed to run in distributed, manageable batches (`num_of_batches` and `batch_id`) + to avoid the memory/resource exhaustion that deleting a massive number of edges (millions or billions) + in a single transaction could cause. + + Parameters: + num_of_batches: Total number of batches into which the `Party` vertices are split for processing. + batch_id: The specific partition of `Party` vertices to process in the current run (0 to num_of_batches - 1). + */ + SumAccum @@count; temp = SELECT t diff --git a/financial_crime/entity_resolution_kyc/queries/delete_unused_cc_nodes.gsql b/financial_crime/entity_resolution_kyc/queries/delete_unused_cc_nodes.gsql index 98705745..35d56255 100644 --- a/financial_crime/entity_resolution_kyc/queries/delete_unused_cc_nodes.gsql +++ b/financial_crime/entity_resolution_kyc/queries/delete_unused_cc_nodes.gsql @@ -1,4 +1,24 @@ CREATE OR REPLACE DISTRIBUTED QUERY delete_unused_cc_nodes(INT num_of_batches = 1, INT batch_id = 0) { + /* + Query Name: delete_unused_cc_nodes + Graph Maintenance: Prune Unused Connected Component (CC) Vertices + + Purpose: Cleanup and Optimization + This query is designed to clean up the graph by **deleting** any `Connected_Component` (CC) + vertex that no longer serves a purpose. After running a community detection algorithm and then + deleting the assignment edges (e.g., using `delete_all_party_cc_connections`), some CC nodes + might remain but have no connections to any `Party` vertices. Deleting these "orphaned" nodes + reduces graph size, improves query performance, and maintains graph integrity. + + Key Concept: Orphaned Node Removal + A CC node is considered 'unused' or 'orphaned' if its **out-degree is zero** (it has no outgoing edges).
+ Since the CC node is only supposed to have outgoing edges to `Party` vertices, an out-degree of zero + means it is not linked to any entities and can be safely removed. + + Parameters: + num_of_batches: Total number of jobs required to process all `Connected_Component` vertices. + batch_id: The specific partition of `Connected_Component` vertices to process in the current run. + */ SumAccum @@count; diff --git a/financial_crime/entity_resolution_kyc/queries/distance_and_path_to_fraud_party.gsql b/financial_crime/entity_resolution_kyc/queries/distance_and_path_to_fraud_party.gsql index 5dee6779..5dcae6df 100644 --- a/financial_crime/entity_resolution_kyc/queries/distance_and_path_to_fraud_party.gsql +++ b/financial_crime/entity_resolution_kyc/queries/distance_and_path_to_fraud_party.gsql @@ -1,4 +1,23 @@ CREATE OR REPLACE QUERY distance_and_path_to_fraud_party(VERTEX input, INT depth=5) { + /* + Query Name: distance_and_path_to_fraud_party + Real-Time Shortest Path to Fraudulent Parties + + Purpose: Real-Time Risk Assessment + This query is designed for near **real-time execution**, performing a single Breadth-First Search (BFS) + starting from a specific `Party` (e.g., a new applicant). Its goal is to quickly find the shortest + distance and the precise chain of shared PII/Entities linking the input Party to any known + fraudulent `Party` within a specified hop `depth`. + + Key Use Cases: + • Transaction Monitoring: Identify whether a customer is closely linked to fraud before approving a transaction. + • New Application Scoring: Immediately flag new applicants connected to high-risk communities. + • Investigation: Quickly map the shortest connection path between a suspect and known fraud entities. + + Parameters: + input: The specific Party vertex to start the search from (the subject of the risk assessment). + depth: Maximum number of hops allowed. Keeps the search fast and focused on close-range risk.
+ */ MinAccum @dis; OrAccum @visited; diff --git a/financial_crime/entity_resolution_kyc/queries/find_shared_piis_of_two_parties.gsql b/financial_crime/entity_resolution_kyc/queries/find_shared_piis_of_two_parties.gsql index 80b69d73..24d9277d 100644 --- a/financial_crime/entity_resolution_kyc/queries/find_shared_piis_of_two_parties.gsql +++ b/financial_crime/entity_resolution_kyc/queries/find_shared_piis_of_two_parties.gsql @@ -1,5 +1,26 @@ CREATE OR REPLACE QUERY find_shared_piis_of_two_parties(VERTEX party_1, VERTEX party_2) { + /* + Query Name: find_shared_piis_of_two_parties + Real-Time PII Linkage Analysis between Two Parties + + Purpose: Investigative Link Discovery + The query's goal is to identify all **Personally Identifiable Information (PII)** nodes + (like Name, Address, IP, Device, etc.) that are **directly shared** by the two input + `Party` vertices. This immediately reveals the underlying connection, which is a critical + step in verifying potential fraud rings or suspicious collusion. + + Key Output Features: + • PII Type (e.g., 'Full_Name'): What type of entity is shared. + • PII Value (Vertex ID): The ID of the shared entity itself. + • Degree (Out-degree): How many *other* parties are also connected to this shared PII. + A high degree indicates a high-risk 'hub' entity. + + Parameters: + party_1: The first Party vertex ID. + party_2: The second Party vertex ID. 
+ */ + TYPEDEF TUPLE pii_info; ListAccum @@degrees_of_shared_piis; diff --git a/financial_crime/entity_resolution_kyc/queries/get_party_cc_features.gsql b/financial_crime/entity_resolution_kyc/queries/get_party_cc_features.gsql index 8c99f0a7..d6e5a7cf 100644 --- a/financial_crime/entity_resolution_kyc/queries/get_party_cc_features.gsql +++ b/financial_crime/entity_resolution_kyc/queries/get_party_cc_features.gsql @@ -1,5 +1,24 @@ CREATE OR REPLACE QUERY get_party_cc_features(VERTEX party, INT connections=25000) { + /* + Query Name: get_party_cc_features + Real-Time Connected Component (CC) Feature Extraction for a Single Party + + Purpose: Real-Time Feature Engineering for Single Entity Risk + This query quickly calculates detailed, graph-based features for a specific `Party` by analyzing + the aggregated properties of the entire `Connected_Component` (CC) it belongs to. This is ideal + for calculating a risk score for a single entity (e.g., a new loan application or login attempt). + + Key Features Calculated (for the CC): + • Size and Composition: Total Party nodes, distinct PII nodes (Name, IP, Device, etc.). + • Linkage Strength: Number of Parties connected via shared PII. + • Specificity: Number of Parties connected *only* by a single type of shared PII (e.g., only by DOB). + + Parameters: + party: The specific Party vertex ID to analyze. + connections: PII Super-Node filter. Maximum out-degree allowed for PII nodes to be considered valid connectors. 
+ */ + MapAccum @@connect_component, @@distinct_name, @@distinct_dob, @@distinct_email, @@distinct_phone, @@distinct_address, @@distinct_ip, @@distinct_id, @@distinct_device, @@distinct_application, @@distinct_account, @@distinct_card, @@connected_via_name, @@connected_via_dob, @@connected_via_email, @@connected_via_phone, @@connected_via_address, @@connected_via_ip, @@connected_via_id, @@connected_via_device, @@connected_via_application, @@connected_via_account, @@connected_via_card, @@only_connected_via_name, @@only_connected_via_dob, @@only_connected_via_email, @@only_connected_via_phone, @@only_connected_via_address, @@only_connected_via_ip, @@only_connected_via_id, @@only_connected_via_device, @@only_connected_via_application, @@only_connected_via_account, @@only_connected_via_card; OrAccum @connected_via_name, @connected_via_dob, @connected_via_email, @connected_via_phone, @connected_via_address, @connected_via_ip, @connected_via_id, @connected_via_device, @connected_via_application, @connected_via_account, @connected_via_card; SetAccum @middle_type; diff --git a/financial_crime/entity_resolution_kyc/queries/get_party_fraud_status.gsql b/financial_crime/entity_resolution_kyc/queries/get_party_fraud_status.gsql index 5b75a5a0..07ab2e10 100644 --- a/financial_crime/entity_resolution_kyc/queries/get_party_fraud_status.gsql +++ b/financial_crime/entity_resolution_kyc/queries/get_party_fraud_status.gsql @@ -1,4 +1,23 @@ CREATE DISTRIBUTED QUERY get_party_fraud_status(SET> parties) { + + /* + Query Name: get_party_fraud_status + Bulk Retrieval of Party Fraud Status + + Purpose: Data Lookup and Status Verification + This query takes a set of specific `Party` vertices as input and quickly returns + the value of their `is_fraud` attribute. It is typically used for external systems + to retrieve known labels for downstream analysis, model evaluation, or auditing. + It performs no graph traversal, making it extremely fast. 
+ + Key Use Cases: + • Model Evaluation: Retrieve ground truth labels for a set of Parties for model performance testing. + • System Integration: Fetch current fraud flags for multiple entities efficiently. + • Audit and Reporting: Generate a quick report on the current status of specific accounts. + + Parameters: + parties: A set of Party vertex IDs whose fraud status needs to be retrieved. + */ MapAccum, INT> @@fraud_status; diff --git a/financial_crime/entity_resolution_kyc/queries/incremental_party_match.gsql b/financial_crime/entity_resolution_kyc/queries/incremental_party_match.gsql index 67bfc9bc..cf5b44c6 100644 --- a/financial_crime/entity_resolution_kyc/queries/incremental_party_match.gsql +++ b/financial_crime/entity_resolution_kyc/queries/incremental_party_match.gsql @@ -14,6 +14,25 @@ CREATE OR REPLACE QUERY incremental_party_match( FLOAT threshold = 1.0 // if a total matching score between two entities exceed the threshold, a similarity edge will be created between the matching entities ) { + /* + Query Name: incremental_party_match + Real-Time Weighted Entity Resolution and Matching + + Purpose: Incremental Graph Update and De-duplication + This query handles the full lifecycle of a new (or updated) Party record: + 1. **Data Ingestion:** Parses a JSON payload and inserts the new Party and all associated PII/Entity nodes (like Address, IP, Account) into the graph. + 2. **Scoring:** Calculates a **weighted match score** between the new Party and *all existing Parties* by summing the weights of their shared PII/Entities. + 3. **Resolution:** If the score to an existing Party's community exceeds a defined `threshold`, the new Party is linked to that highest-scoring `Connected_Component` (CC), effectively merging it into an existing group. + + Key Concept: Weighted Matching + Different PII/Entity types carry different evidential weight for matching (e.g., a shared Device ID is a strong match, a shared Name is weaker). 
This is controlled by the input `weight` parameters. + + Parameters: + input: The JSON string containing the new Party and PII data. + Customer_[Entity]_weight: Weights defining the contribution of each shared PII/Entity type to the total score. + threshold: The minimum cumulative score required to declare a match and perform the merge. + */ + TYPEDEF TUPLE ver> Connected_Component_heap_tuple; HeapAccum(1, score DESC) @@Connected_Component_ID_heap; SetAccum @@edge_type_set, @@party_vertex, @@name, @@dob, @@email, @@phone_numbers, @@addresses, @@ip, @@ids, @@device, @@applications, @@accounts, @@cards; diff --git a/financial_crime/entity_resolution_kyc/queries/incremental_party_unify.gsql b/financial_crime/entity_resolution_kyc/queries/incremental_party_unify.gsql index ab029e0b..b775970e 100644 --- a/financial_crime/entity_resolution_kyc/queries/incremental_party_unify.gsql +++ b/financial_crime/entity_resolution_kyc/queries/incremental_party_unify.gsql @@ -1,5 +1,27 @@ CREATE OR REPLACE DISTRIBUTED QUERY incremental_party_unify(SET> parties) { + /* + Query Name: incremental_party_unify + Incremental Community Initialization (Connected Component Assignment) + + Purpose: Graph Structure Completion + This query takes a set of newly inserted `Party` vertices that were not successfully + matched to an existing `Connected_Component` (CC) community by the matching logic + (e.g., `incremental_party_match`). It then assigns each of these unmatched Parties + to a **new, unique CC** of its own. + + Core Logic: + 1. **Self-Assignment:** Initially treat each Party (or small group linked by `Same_As` edges) + as its own CC. + 2. **Propagation (Optional):** If the input Parties are linked by `Same_As` edges (a rare scenario + in a strictly incremental flow, but accounted for), this step uses the standard Connected + Component algorithm (specifically, label propagation) to group them. + 3. **Creation:** Inserts a new `Connected_Component` vertex and links the Party to it. 
+ + Parameters: + parties: The set of new Party vertices that need to be assigned to a CC. + */ + MinAccum @cc_id; // Each vertex's tentative component id DATETIME start_time = now(); diff --git a/financial_crime/entity_resolution_kyc/queries/match_party_entities.gsql b/financial_crime/entity_resolution_kyc/queries/match_party_entities.gsql index 3f69c364..2436e483 100644 --- a/financial_crime/entity_resolution_kyc/queries/match_party_entities.gsql +++ b/financial_crime/entity_resolution_kyc/queries/match_party_entities.gsql @@ -18,6 +18,28 @@ CREATE OR REPLACE DISTRIBUTED QUERY match_party_entities( DATETIME compute_entities_after_date = to_datetime("1970-01-01 00:00:00") // All entities that were created in TG after this date, will be computed to find similarity edges related to these new entities. The default value is the earliest time possible (great for the first time running this query) ) { + /* + Query Name: match_party_entities + Batch-Based Weighted Entity Resolution (All-Pairs Similarity) + + Purpose: Full Graph De-Duplication + This query is an **offline batch job** designed to find and link similar `Party` vertices + across the entire graph based on shared PII/Entities. It calculates a weighted score + for every pair of parties connected by a shared PII, and if the score exceeds the + `threshold`, it creates a `Same_As` edge between them. + + Key Concepts: + • **Weighted Similarity:** Each shared PII type contributes a different weight to the final score. + • **Nested Batching:** The graph is divided into `source` and `target` batches to manage memory and parallelize the all-pairs comparison efficiently. + • **Super-Node Filtering:** PII nodes (e.g., a common IP address shared by millions of users) are filtered out using `pii_low_connections_limit` and `pii_high_connections_limit` to prevent false positives and poor performance. + + Parameters: + Customer_[Entity]_weight: Weights for each PII type defining match strength. 
+ num_of_source_batches/num_of_target_batches: Control the size and number of batches for distributed processing. + threshold: Score required to create a `Same_As` link. + compute_entities_after_date: Incremental processing date filter, useful for re-running the job on recent data only. + */ + ListAccum> @entity_list; MapAccum,FLOAT> @entity_map; SumAccum @@count; diff --git a/financial_crime/entity_resolution_kyc/queries/output_party_cc_to_file.gsql b/financial_crime/entity_resolution_kyc/queries/output_party_cc_to_file.gsql index 32543849..65ef9038 100644 --- a/financial_crime/entity_resolution_kyc/queries/output_party_cc_to_file.gsql +++ b/financial_crime/entity_resolution_kyc/queries/output_party_cc_to_file.gsql @@ -1,5 +1,24 @@ CREATE OR REPLACE QUERY output_party_cc_to_file(STRING output_file_path = "/home/tigergraph/gsql_output/party_cc_output.txt") { - + /* + Query Name: output_party_cc_to_file + Export Party-to-Community Mapping to File + + Purpose: Data Export and Downstream Integration + The primary goal of this query is to **generate a plain text or CSV file** that maps + every `Party` vertex in the graph to the `Connected_Component` community vertex it + currently belongs to. This output is critical for: + • **Model Training:** Providing ground truth labels (community IDs) for machine learning models. + • **Reporting/Auditing:** Creating a comprehensive record of the current entity resolution state. + • **External System Integration:** Loading the community assignments into a data warehouse or external database. + + Key Concepts: + • FILE Object: TigerGraph's utility to write data directly to a file on the server. + • Simple Traversal: A single hop is used to link the Party to its assigned community. + + Parameters: + output_file_path: The full path where the output file (e.g., CSV) should be created on the TigerGraph server. 
+ */ + FILE f (output_file_path); f.println("Party", "Connected_Component"); diff --git a/financial_crime/entity_resolution_kyc/queries/top_connected_component.gsql b/financial_crime/entity_resolution_kyc/queries/top_connected_component.gsql index 50eb2b95..f193c843 100644 --- a/financial_crime/entity_resolution_kyc/queries/top_connected_component.gsql +++ b/financial_crime/entity_resolution_kyc/queries/top_connected_component.gsql @@ -1,5 +1,23 @@ CREATE OR REPLACE DISTRIBUTED QUERY Top_Connected_Component(/* Parameters here */) FOR GRAPH Entity_Resolution_KYC { - SumAccum @parties,@address,@phone,@risk; + + /* + Query Name: Top_Connected_Component + Top Connected Component Summary Analysis + + Purpose: Community Profiling and Risk Prioritization + This query traverses the graph to aggregate key statistics for each `Connected_Component` (CC) + vertex. It focuses on gathering the total count of associated `Party` vertices, the number of + unique PII entities (specifically `Address` and `Phone`), and a cumulative risk score. The + results are then sorted to show the largest/most active communities first. + + Key Metrics Calculated per Connected_Component: + • Parties: Total number of Party vertices belonging to the CC. + • Address: Total number of unique Address PII vertices linked to the CC. + • Phone: Total number of unique Phone PII vertices linked to the CC. + • Risk: The sum of the `is_fraud` attribute from all associated Party vertices (cumulative risk score). 
+ */ + + SumAccum @parties,@address,@phone,@risk; res = SELECT s FROM Connected_Component:s - () - Party:p - () - (Address|Phone):t diff --git a/financial_crime/entity_resolution_kyc/queries/unify_party_entities.gsql b/financial_crime/entity_resolution_kyc/queries/unify_party_entities.gsql index c469f5b9..169ddb0b 100644 --- a/financial_crime/entity_resolution_kyc/queries/unify_party_entities.gsql +++ b/financial_crime/entity_resolution_kyc/queries/unify_party_entities.gsql @@ -1,5 +1,26 @@ CREATE OR REPLACE DISTRIBUTED QUERY unify_party_entities() { + /* + Query Name: unify_party_entities + Finalization of Entity Resolution: Connected Component Assignment + + Purpose: Clustering and Community Creation + This query is the final, essential step in a batch Entity Resolution pipeline. It performs + the Connected Components (CC) algorithm, using the transitive links established by the + `Same_As` similarity edges (created by `match_party_entities`), to **group all connected + `Party` vertices** into single, distinct communities. + + Core Logic: Label Propagation (MinAccum) + 1. **Initialization:** Every Party vertex is initially labeled with its own unique ID. + 2. **Propagation:** Smaller IDs are iteratively passed across `Same_As` edges until every + Party in a connected cluster has adopted the smallest ID among them. This smallest ID + becomes the unique identifier for the entire community (the CC ID). + 3. **Creation:** New `Connected_Component` vertices are created using these final CC IDs, + and linking edges (`Entity_In_Ring`) are established. + + Key Outcome: Graph structure is finalized with explicit community assignments. + */ + MinAccum @cc_id; // Each vertex's tentative component id DATETIME start_time = now();